diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31008 @@ +{ + "best_global_step": 2886, + "best_metric": 0.14812766015529633, + "best_model_checkpoint": "saves_stability/lora/llama-3-8b-instruct/train_cola_1757340261/checkpoint-2886", + "epoch": 10.0, + "eval_steps": 962, + "global_step": 19240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002598752598752599, + "grad_norm": 10.797879219055176, + "learning_rate": 1.0395010395010396e-07, + "loss": 1.4212, + "num_input_tokens_seen": 928, + "step": 5 + }, + { + "epoch": 0.005197505197505198, + "grad_norm": 12.04483699798584, + "learning_rate": 2.338877338877339e-07, + "loss": 1.528, + "num_input_tokens_seen": 1920, + "step": 10 + }, + { + "epoch": 0.007796257796257797, + "grad_norm": 9.93985366821289, + "learning_rate": 3.6382536382536384e-07, + "loss": 1.1381, + "num_input_tokens_seen": 2944, + "step": 15 + }, + { + "epoch": 0.010395010395010396, + "grad_norm": 10.36928653717041, + "learning_rate": 4.937629937629938e-07, + "loss": 1.2104, + "num_input_tokens_seen": 3904, + "step": 20 + }, + { + "epoch": 0.012993762993762994, + "grad_norm": 11.67911434173584, + "learning_rate": 6.237006237006237e-07, + "loss": 2.157, + "num_input_tokens_seen": 4832, + "step": 25 + }, + { + "epoch": 0.015592515592515593, + "grad_norm": 10.75258731842041, + "learning_rate": 7.536382536382538e-07, + "loss": 1.0584, + "num_input_tokens_seen": 5696, + "step": 30 + }, + { + "epoch": 0.018191268191268192, + "grad_norm": 12.7691650390625, + "learning_rate": 8.835758835758837e-07, + "loss": 1.1809, + "num_input_tokens_seen": 6592, + "step": 35 + }, + { + "epoch": 0.02079002079002079, + "grad_norm": 10.994141578674316, + "learning_rate": 1.0135135135135136e-06, + "loss": 1.6686, + "num_input_tokens_seen": 7552, + "step": 40 + }, + { + "epoch": 0.02338877338877339, + "grad_norm": 12.541343688964844, + "learning_rate": 1.1434511434511436e-06, + "loss": 1.4303, + "num_input_tokens_seen": 8512, + "step": 45 + }, + { + "epoch": 0.02598752598752599, + "grad_norm": 12.73262882232666, + "learning_rate": 1.2733887733887735e-06, + "loss": 1.1836, + "num_input_tokens_seen": 9504, + "step": 50 + }, + { + "epoch": 0.028586278586278588, + "grad_norm": 14.551464080810547, + "learning_rate": 1.4033264033264034e-06, + "loss": 1.3069, + "num_input_tokens_seen": 10464, + "step": 55 + }, + { + "epoch": 0.031185031185031187, + "grad_norm": 13.126277923583984, + "learning_rate": 1.5332640332640334e-06, + "loss": 1.2555, + "num_input_tokens_seen": 11456, + "step": 60 + }, + { + "epoch": 0.033783783783783786, + "grad_norm": 15.6592435836792, + "learning_rate": 1.6632016632016633e-06, + "loss": 1.0894, + "num_input_tokens_seen": 12416, + "step": 65 + }, + { + "epoch": 0.036382536382536385, + "grad_norm": 12.663433074951172, + "learning_rate": 1.7931392931392932e-06, + "loss": 1.1897, + "num_input_tokens_seen": 13312, + "step": 70 + }, + { + "epoch": 0.03898128898128898, + "grad_norm": 14.091818809509277, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.7981, + "num_input_tokens_seen": 14240, + "step": 75 + }, + { + "epoch": 0.04158004158004158, + "grad_norm": 15.13443374633789, + "learning_rate": 2.053014553014553e-06, + "loss": 0.9471, + "num_input_tokens_seen": 15136, + "step": 80 + }, + { + "epoch": 0.04417879417879418, + "grad_norm": 14.75004768371582, + "learning_rate": 2.1829521829521833e-06, + "loss": 0.8899, + "num_input_tokens_seen": 16064, + "step": 85 + }, + { + "epoch": 0.04677754677754678, + "grad_norm": 10.66485595703125, + "learning_rate": 2.312889812889813e-06, + "loss": 1.1528, + "num_input_tokens_seen": 17024, + "step": 90 + }, + { + "epoch": 0.04937629937629938, + "grad_norm": 4.884730339050293, + "learning_rate": 2.442827442827443e-06, + "loss": 0.3734, + "num_input_tokens_seen": 18016, + "step": 95 + }, + { + "epoch": 0.05197505197505198, + "grad_norm": 5.4147443771362305, + "learning_rate": 2.572765072765073e-06, + "loss": 0.6476, + "num_input_tokens_seen": 18976, + "step": 100 + }, + { + "epoch": 0.05457380457380458, + "grad_norm": 2.619866132736206, + "learning_rate": 2.702702702702703e-06, + "loss": 0.3548, + "num_input_tokens_seen": 19936, + "step": 105 + }, + { + "epoch": 0.057172557172557176, + "grad_norm": 6.0840535163879395, + "learning_rate": 2.8326403326403327e-06, + "loss": 0.0627, + "num_input_tokens_seen": 20896, + "step": 110 + }, + { + "epoch": 0.059771309771309775, + "grad_norm": 5.345730781555176, + "learning_rate": 2.962577962577963e-06, + "loss": 0.8282, + "num_input_tokens_seen": 21792, + "step": 115 + }, + { + "epoch": 0.062370062370062374, + "grad_norm": 5.031264781951904, + "learning_rate": 3.092515592515593e-06, + "loss": 0.3342, + "num_input_tokens_seen": 22784, + "step": 120 + }, + { + "epoch": 0.06496881496881497, + "grad_norm": 0.18347986042499542, + "learning_rate": 3.2224532224532228e-06, + "loss": 0.255, + "num_input_tokens_seen": 23744, + "step": 125 + }, + { + "epoch": 0.06756756756756757, + "grad_norm": 0.0801958292722702, + "learning_rate": 3.352390852390853e-06, + "loss": 0.6622, + "num_input_tokens_seen": 24704, + "step": 130 + }, + { + "epoch": 0.07016632016632017, + "grad_norm": 8.656383514404297, + "learning_rate": 3.4823284823284826e-06, + "loss": 0.3474, + "num_input_tokens_seen": 25632, + "step": 135 + }, + { + "epoch": 0.07276507276507277, + "grad_norm": 0.39732009172439575, + "learning_rate": 3.6122661122661128e-06, + "loss": 0.097, + "num_input_tokens_seen": 26560, + "step": 140 + }, + { + "epoch": 0.07536382536382537, + "grad_norm": 10.89357852935791, + "learning_rate": 3.7422037422037425e-06, + "loss": 0.4092, + "num_input_tokens_seen": 27488, + "step": 145 + }, + { + "epoch": 0.07796257796257797, + "grad_norm": 2.110738515853882, + "learning_rate": 3.872141372141373e-06, + "loss": 0.1116, + "num_input_tokens_seen": 28576, + "step": 150 + }, + { + "epoch": 0.08056133056133057, + "grad_norm": 4.46411657333374, + "learning_rate": 4.002079002079003e-06, + "loss": 0.3416, + "num_input_tokens_seen": 29600, + "step": 155 + }, + { + "epoch": 0.08316008316008316, + "grad_norm": 8.576583862304688, + "learning_rate": 4.132016632016632e-06, + "loss": 0.2896, + "num_input_tokens_seen": 30592, + "step": 160 + }, + { + "epoch": 0.08575883575883576, + "grad_norm": 9.217649459838867, + "learning_rate": 4.261954261954262e-06, + "loss": 0.3386, + "num_input_tokens_seen": 31520, + "step": 165 + }, + { + "epoch": 0.08835758835758836, + "grad_norm": 8.814844131469727, + "learning_rate": 4.391891891891892e-06, + "loss": 0.2305, + "num_input_tokens_seen": 32480, + "step": 170 + }, + { + "epoch": 0.09095634095634096, + "grad_norm": 0.3654564619064331, + "learning_rate": 4.5218295218295225e-06, + "loss": 0.1269, + "num_input_tokens_seen": 33376, + "step": 175 + }, + { + "epoch": 0.09355509355509356, + "grad_norm": 0.048520997166633606, + "learning_rate": 4.651767151767152e-06, + "loss": 0.1878, + "num_input_tokens_seen": 34304, + "step": 180 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 3.1699090003967285, + "learning_rate": 4.781704781704782e-06, + "loss": 0.1612, + "num_input_tokens_seen": 35328, + "step": 185 + }, + { + "epoch": 0.09875259875259876, + "grad_norm": 0.3153856098651886, + "learning_rate": 4.911642411642412e-06, + "loss": 0.3382, + "num_input_tokens_seen": 36224, + "step": 190 + }, + { + "epoch": 0.10135135135135136, + "grad_norm": 2.5747358798980713, + "learning_rate": 5.041580041580042e-06, + "loss": 0.2496, + "num_input_tokens_seen": 37152, + "step": 195 + }, + { + "epoch": 0.10395010395010396, + "grad_norm": 9.183514595031738, + "learning_rate": 5.1715176715176724e-06, + "loss": 0.5506, + "num_input_tokens_seen": 38080, + "step": 200 + }, + { + "epoch": 0.10654885654885655, + "grad_norm": 3.4848363399505615, + "learning_rate": 5.301455301455302e-06, + "loss": 0.0735, + "num_input_tokens_seen": 39008, + "step": 205 + }, + { + "epoch": 0.10914760914760915, + "grad_norm": 10.360175132751465, + "learning_rate": 5.431392931392932e-06, + "loss": 0.3155, + "num_input_tokens_seen": 39968, + "step": 210 + }, + { + "epoch": 0.11174636174636175, + "grad_norm": 6.048911094665527, + "learning_rate": 5.561330561330562e-06, + "loss": 0.314, + "num_input_tokens_seen": 40928, + "step": 215 + }, + { + "epoch": 0.11434511434511435, + "grad_norm": 8.826172828674316, + "learning_rate": 5.691268191268192e-06, + "loss": 0.2848, + "num_input_tokens_seen": 41920, + "step": 220 + }, + { + "epoch": 0.11694386694386695, + "grad_norm": 2.067065715789795, + "learning_rate": 5.8212058212058215e-06, + "loss": 0.2065, + "num_input_tokens_seen": 42816, + "step": 225 + }, + { + "epoch": 0.11954261954261955, + "grad_norm": 12.903179168701172, + "learning_rate": 5.951143451143452e-06, + "loss": 0.4159, + "num_input_tokens_seen": 43776, + "step": 230 + }, + { + "epoch": 0.12214137214137215, + "grad_norm": 4.859861850738525, + "learning_rate": 6.081081081081082e-06, + "loss": 0.1729, + "num_input_tokens_seen": 44736, + "step": 235 + }, + { + "epoch": 0.12474012474012475, + "grad_norm": 2.7638816833496094, + "learning_rate": 6.211018711018712e-06, + "loss": 0.376, + "num_input_tokens_seen": 45664, + "step": 240 + }, + { + "epoch": 0.12733887733887733, + "grad_norm": 4.834848880767822, + "learning_rate": 6.340956340956341e-06, + "loss": 0.2568, + "num_input_tokens_seen": 46688, + "step": 245 + }, + { + "epoch": 0.12993762993762994, + "grad_norm": 4.0606818199157715, + "learning_rate": 6.4708939708939705e-06, + "loss": 0.232, + "num_input_tokens_seen": 47680, + "step": 250 + }, + { + "epoch": 0.13253638253638253, + "grad_norm": 3.970198154449463, + "learning_rate": 6.6008316008316015e-06, + "loss": 0.284, + "num_input_tokens_seen": 48672, + "step": 255 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 4.558541774749756, + "learning_rate": 6.730769230769231e-06, + "loss": 0.2468, + "num_input_tokens_seen": 49600, + "step": 260 + }, + { + "epoch": 0.13773388773388773, + "grad_norm": 2.0267982482910156, + "learning_rate": 6.860706860706862e-06, + "loss": 0.0837, + "num_input_tokens_seen": 50592, + "step": 265 + }, + { + "epoch": 0.14033264033264034, + "grad_norm": 5.778964996337891, + "learning_rate": 6.99064449064449e-06, + "loss": 0.3142, + "num_input_tokens_seen": 51616, + "step": 270 + }, + { + "epoch": 0.14293139293139293, + "grad_norm": 0.9606799483299255, + "learning_rate": 7.120582120582121e-06, + "loss": 0.1627, + "num_input_tokens_seen": 52576, + "step": 275 + }, + { + "epoch": 0.14553014553014554, + "grad_norm": 5.596894264221191, + "learning_rate": 7.250519750519751e-06, + "loss": 0.1959, + "num_input_tokens_seen": 53472, + "step": 280 + }, + { + "epoch": 0.14812889812889812, + "grad_norm": 0.9875503778457642, + "learning_rate": 7.3804573804573816e-06, + "loss": 0.2626, + "num_input_tokens_seen": 54400, + "step": 285 + }, + { + "epoch": 0.15072765072765074, + "grad_norm": 1.436509132385254, + "learning_rate": 7.510395010395011e-06, + "loss": 0.1054, + "num_input_tokens_seen": 55328, + "step": 290 + }, + { + "epoch": 0.15332640332640332, + "grad_norm": 0.374382346868515, + "learning_rate": 7.640332640332642e-06, + "loss": 0.194, + "num_input_tokens_seen": 56288, + "step": 295 + }, + { + "epoch": 0.15592515592515593, + "grad_norm": 4.240149021148682, + "learning_rate": 7.77027027027027e-06, + "loss": 0.1238, + "num_input_tokens_seen": 57216, + "step": 300 + }, + { + "epoch": 0.15852390852390852, + "grad_norm": 0.9088580012321472, + "learning_rate": 7.9002079002079e-06, + "loss": 0.2046, + "num_input_tokens_seen": 58176, + "step": 305 + }, + { + "epoch": 0.16112266112266113, + "grad_norm": 3.705383062362671, + "learning_rate": 8.03014553014553e-06, + "loss": 0.1327, + "num_input_tokens_seen": 59104, + "step": 310 + }, + { + "epoch": 0.16372141372141372, + "grad_norm": 3.659804582595825, + "learning_rate": 8.16008316008316e-06, + "loss": 0.2587, + "num_input_tokens_seen": 60032, + "step": 315 + }, + { + "epoch": 0.16632016632016633, + "grad_norm": 8.355545043945312, + "learning_rate": 8.290020790020791e-06, + "loss": 0.0973, + "num_input_tokens_seen": 60960, + "step": 320 + }, + { + "epoch": 0.16891891891891891, + "grad_norm": 0.5468668937683105, + "learning_rate": 8.419958419958421e-06, + "loss": 0.0114, + "num_input_tokens_seen": 61888, + "step": 325 + }, + { + "epoch": 0.17151767151767153, + "grad_norm": 6.177222728729248, + "learning_rate": 8.54989604989605e-06, + "loss": 0.4449, + "num_input_tokens_seen": 62912, + "step": 330 + }, + { + "epoch": 0.1741164241164241, + "grad_norm": 9.328977584838867, + "learning_rate": 8.679833679833681e-06, + "loss": 0.1714, + "num_input_tokens_seen": 63872, + "step": 335 + }, + { + "epoch": 0.17671517671517672, + "grad_norm": 4.654053211212158, + "learning_rate": 8.80977130977131e-06, + "loss": 0.0828, + "num_input_tokens_seen": 64832, + "step": 340 + }, + { + "epoch": 0.1793139293139293, + "grad_norm": 0.9110220074653625, + "learning_rate": 8.93970893970894e-06, + "loss": 0.2364, + "num_input_tokens_seen": 65760, + "step": 345 + }, + { + "epoch": 0.18191268191268192, + "grad_norm": 1.4395866394042969, + "learning_rate": 9.06964656964657e-06, + "loss": 0.12, + "num_input_tokens_seen": 66752, + "step": 350 + }, + { + "epoch": 0.1845114345114345, + "grad_norm": 2.536559581756592, + "learning_rate": 9.1995841995842e-06, + "loss": 0.5629, + "num_input_tokens_seen": 67744, + "step": 355 + }, + { + "epoch": 0.18711018711018712, + "grad_norm": 1.1524341106414795, + "learning_rate": 9.32952182952183e-06, + "loss": 0.3175, + "num_input_tokens_seen": 68704, + "step": 360 + }, + { + "epoch": 0.1897089397089397, + "grad_norm": 0.8308989405632019, + "learning_rate": 9.45945945945946e-06, + "loss": 0.0554, + "num_input_tokens_seen": 69600, + "step": 365 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 1.8888778686523438, + "learning_rate": 9.589397089397089e-06, + "loss": 0.2197, + "num_input_tokens_seen": 70592, + "step": 370 + }, + { + "epoch": 0.1949064449064449, + "grad_norm": 7.310072422027588, + "learning_rate": 9.719334719334721e-06, + "loss": 0.4207, + "num_input_tokens_seen": 71584, + "step": 375 + }, + { + "epoch": 0.19750519750519752, + "grad_norm": 0.9461462497711182, + "learning_rate": 9.84927234927235e-06, + "loss": 0.1995, + "num_input_tokens_seen": 72544, + "step": 380 + }, + { + "epoch": 0.2001039501039501, + "grad_norm": 8.112709999084473, + "learning_rate": 9.97920997920998e-06, + "loss": 0.3014, + "num_input_tokens_seen": 73440, + "step": 385 + }, + { + "epoch": 0.20270270270270271, + "grad_norm": 1.9596649408340454, + "learning_rate": 1.010914760914761e-05, + "loss": 0.2582, + "num_input_tokens_seen": 74400, + "step": 390 + }, + { + "epoch": 0.2053014553014553, + "grad_norm": 1.324997901916504, + "learning_rate": 1.023908523908524e-05, + "loss": 0.0412, + "num_input_tokens_seen": 75360, + "step": 395 + }, + { + "epoch": 0.2079002079002079, + "grad_norm": 4.496888160705566, + "learning_rate": 1.036902286902287e-05, + "loss": 0.2745, + "num_input_tokens_seen": 76384, + "step": 400 + }, + { + "epoch": 0.2104989604989605, + "grad_norm": 2.1316301822662354, + "learning_rate": 1.04989604989605e-05, + "loss": 0.0546, + "num_input_tokens_seen": 77344, + "step": 405 + }, + { + "epoch": 0.2130977130977131, + "grad_norm": 5.755380630493164, + "learning_rate": 1.0628898128898128e-05, + "loss": 0.1993, + "num_input_tokens_seen": 78304, + "step": 410 + }, + { + "epoch": 0.2156964656964657, + "grad_norm": 4.059833526611328, + "learning_rate": 1.075883575883576e-05, + "loss": 0.1328, + "num_input_tokens_seen": 79200, + "step": 415 + }, + { + "epoch": 0.2182952182952183, + "grad_norm": 9.914511680603027, + "learning_rate": 1.0888773388773389e-05, + "loss": 0.1936, + "num_input_tokens_seen": 80128, + "step": 420 + }, + { + "epoch": 0.2208939708939709, + "grad_norm": 5.6224236488342285, + "learning_rate": 1.101871101871102e-05, + "loss": 0.2576, + "num_input_tokens_seen": 81056, + "step": 425 + }, + { + "epoch": 0.2234927234927235, + "grad_norm": 8.0014009475708, + "learning_rate": 1.1148648648648649e-05, + "loss": 0.3465, + "num_input_tokens_seen": 82144, + "step": 430 + }, + { + "epoch": 0.2260914760914761, + "grad_norm": 5.992046356201172, + "learning_rate": 1.127858627858628e-05, + "loss": 0.1774, + "num_input_tokens_seen": 83072, + "step": 435 + }, + { + "epoch": 0.2286902286902287, + "grad_norm": 3.5691049098968506, + "learning_rate": 1.140852390852391e-05, + "loss": 0.093, + "num_input_tokens_seen": 83968, + "step": 440 + }, + { + "epoch": 0.2312889812889813, + "grad_norm": 3.1470236778259277, + "learning_rate": 1.153846153846154e-05, + "loss": 0.3076, + "num_input_tokens_seen": 84992, + "step": 445 + }, + { + "epoch": 0.2338877338877339, + "grad_norm": 2.48262095451355, + "learning_rate": 1.166839916839917e-05, + "loss": 0.0788, + "num_input_tokens_seen": 85984, + "step": 450 + }, + { + "epoch": 0.23648648648648649, + "grad_norm": 0.5901393890380859, + "learning_rate": 1.17983367983368e-05, + "loss": 0.1513, + "num_input_tokens_seen": 86976, + "step": 455 + }, + { + "epoch": 0.2390852390852391, + "grad_norm": 0.5095131397247314, + "learning_rate": 1.1928274428274428e-05, + "loss": 0.0906, + "num_input_tokens_seen": 87872, + "step": 460 + }, + { + "epoch": 0.24168399168399168, + "grad_norm": 0.7085443139076233, + "learning_rate": 1.205821205821206e-05, + "loss": 0.2173, + "num_input_tokens_seen": 88864, + "step": 465 + }, + { + "epoch": 0.2442827442827443, + "grad_norm": 1.461985468864441, + "learning_rate": 1.2188149688149689e-05, + "loss": 0.1625, + "num_input_tokens_seen": 89760, + "step": 470 + }, + { + "epoch": 0.24688149688149688, + "grad_norm": 0.030371973291039467, + "learning_rate": 1.2318087318087319e-05, + "loss": 0.1819, + "num_input_tokens_seen": 90656, + "step": 475 + }, + { + "epoch": 0.2494802494802495, + "grad_norm": 8.70595932006836, + "learning_rate": 1.2448024948024949e-05, + "loss": 0.3166, + "num_input_tokens_seen": 91584, + "step": 480 + }, + { + "epoch": 0.2520790020790021, + "grad_norm": 0.4940551519393921, + "learning_rate": 1.2577962577962579e-05, + "loss": 0.2588, + "num_input_tokens_seen": 92512, + "step": 485 + }, + { + "epoch": 0.25467775467775466, + "grad_norm": 1.3902393579483032, + "learning_rate": 1.270790020790021e-05, + "loss": 0.0449, + "num_input_tokens_seen": 93408, + "step": 490 + }, + { + "epoch": 0.25727650727650725, + "grad_norm": 0.13413594663143158, + "learning_rate": 1.2837837837837838e-05, + "loss": 0.026, + "num_input_tokens_seen": 94336, + "step": 495 + }, + { + "epoch": 0.2598752598752599, + "grad_norm": 4.23362398147583, + "learning_rate": 1.2967775467775468e-05, + "loss": 0.3547, + "num_input_tokens_seen": 95264, + "step": 500 + }, + { + "epoch": 0.2624740124740125, + "grad_norm": 6.70444393157959, + "learning_rate": 1.30977130977131e-05, + "loss": 0.4126, + "num_input_tokens_seen": 96352, + "step": 505 + }, + { + "epoch": 0.26507276507276506, + "grad_norm": 12.028645515441895, + "learning_rate": 1.3227650727650728e-05, + "loss": 0.4118, + "num_input_tokens_seen": 97312, + "step": 510 + }, + { + "epoch": 0.26767151767151764, + "grad_norm": 0.8770215511322021, + "learning_rate": 1.3357588357588358e-05, + "loss": 0.1151, + "num_input_tokens_seen": 98208, + "step": 515 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.296202540397644, + "learning_rate": 1.3487525987525987e-05, + "loss": 0.1666, + "num_input_tokens_seen": 99136, + "step": 520 + }, + { + "epoch": 0.27286902286902287, + "grad_norm": 3.1100761890411377, + "learning_rate": 1.3617463617463619e-05, + "loss": 0.133, + "num_input_tokens_seen": 100128, + "step": 525 + }, + { + "epoch": 0.27546777546777546, + "grad_norm": 5.468357563018799, + "learning_rate": 1.3747401247401249e-05, + "loss": 0.3766, + "num_input_tokens_seen": 101120, + "step": 530 + }, + { + "epoch": 0.27806652806652804, + "grad_norm": 4.146188735961914, + "learning_rate": 1.3877338877338877e-05, + "loss": 0.3615, + "num_input_tokens_seen": 102016, + "step": 535 + }, + { + "epoch": 0.2806652806652807, + "grad_norm": 0.944405198097229, + "learning_rate": 1.4007276507276507e-05, + "loss": 0.2577, + "num_input_tokens_seen": 102944, + "step": 540 + }, + { + "epoch": 0.28326403326403327, + "grad_norm": 0.8486806154251099, + "learning_rate": 1.4137214137214139e-05, + "loss": 0.2295, + "num_input_tokens_seen": 103936, + "step": 545 + }, + { + "epoch": 0.28586278586278585, + "grad_norm": 3.6915392875671387, + "learning_rate": 1.4267151767151768e-05, + "loss": 0.0656, + "num_input_tokens_seen": 104960, + "step": 550 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 0.14888593554496765, + "learning_rate": 1.4397089397089398e-05, + "loss": 0.2842, + "num_input_tokens_seen": 105920, + "step": 555 + }, + { + "epoch": 0.2910602910602911, + "grad_norm": 4.261628150939941, + "learning_rate": 1.4527027027027026e-05, + "loss": 0.2684, + "num_input_tokens_seen": 106880, + "step": 560 + }, + { + "epoch": 0.29365904365904366, + "grad_norm": 2.67976975440979, + "learning_rate": 1.4656964656964658e-05, + "loss": 0.3255, + "num_input_tokens_seen": 107776, + "step": 565 + }, + { + "epoch": 0.29625779625779625, + "grad_norm": 4.9160614013671875, + "learning_rate": 1.4786902286902288e-05, + "loss": 0.2156, + "num_input_tokens_seen": 108704, + "step": 570 + }, + { + "epoch": 0.29885654885654883, + "grad_norm": 1.9751431941986084, + "learning_rate": 1.4916839916839917e-05, + "loss": 0.216, + "num_input_tokens_seen": 109696, + "step": 575 + }, + { + "epoch": 0.30145530145530147, + "grad_norm": 1.7181288003921509, + "learning_rate": 1.5046777546777547e-05, + "loss": 0.1436, + "num_input_tokens_seen": 110688, + "step": 580 + }, + { + "epoch": 0.30405405405405406, + "grad_norm": 2.6051230430603027, + "learning_rate": 1.5176715176715179e-05, + "loss": 0.2538, + "num_input_tokens_seen": 111680, + "step": 585 + }, + { + "epoch": 0.30665280665280664, + "grad_norm": 6.27025842666626, + "learning_rate": 1.530665280665281e-05, + "loss": 0.3005, + "num_input_tokens_seen": 112608, + "step": 590 + }, + { + "epoch": 0.3092515592515592, + "grad_norm": 3.7761621475219727, + "learning_rate": 1.5436590436590437e-05, + "loss": 0.1308, + "num_input_tokens_seen": 113568, + "step": 595 + }, + { + "epoch": 0.31185031185031187, + "grad_norm": 2.88999605178833, + "learning_rate": 1.5566528066528066e-05, + "loss": 0.0587, + "num_input_tokens_seen": 114496, + "step": 600 + }, + { + "epoch": 0.31444906444906445, + "grad_norm": 6.5235748291015625, + "learning_rate": 1.5696465696465697e-05, + "loss": 0.1098, + "num_input_tokens_seen": 115424, + "step": 605 + }, + { + "epoch": 0.31704781704781704, + "grad_norm": 8.863351821899414, + "learning_rate": 1.5826403326403326e-05, + "loss": 0.3275, + "num_input_tokens_seen": 116384, + "step": 610 + }, + { + "epoch": 0.3196465696465696, + "grad_norm": 3.5306642055511475, + "learning_rate": 1.5956340956340958e-05, + "loss": 0.3648, + "num_input_tokens_seen": 117408, + "step": 615 + }, + { + "epoch": 0.32224532224532226, + "grad_norm": 11.74785041809082, + "learning_rate": 1.6086278586278586e-05, + "loss": 0.3346, + "num_input_tokens_seen": 118368, + "step": 620 + }, + { + "epoch": 0.32484407484407485, + "grad_norm": 3.3469231128692627, + "learning_rate": 1.6216216216216218e-05, + "loss": 0.1127, + "num_input_tokens_seen": 119392, + "step": 625 + }, + { + "epoch": 0.32744282744282743, + "grad_norm": 4.157953262329102, + "learning_rate": 1.6346153846153847e-05, + "loss": 0.1188, + "num_input_tokens_seen": 120352, + "step": 630 + }, + { + "epoch": 0.33004158004158, + "grad_norm": 7.210325717926025, + "learning_rate": 1.6476091476091475e-05, + "loss": 0.1566, + "num_input_tokens_seen": 121344, + "step": 635 + }, + { + "epoch": 0.33264033264033266, + "grad_norm": 4.921578884124756, + "learning_rate": 1.6606029106029107e-05, + "loss": 0.2473, + "num_input_tokens_seen": 122272, + "step": 640 + }, + { + "epoch": 0.33523908523908524, + "grad_norm": 4.405200481414795, + "learning_rate": 1.673596673596674e-05, + "loss": 0.1627, + "num_input_tokens_seen": 123232, + "step": 645 + }, + { + "epoch": 0.33783783783783783, + "grad_norm": 8.054732322692871, + "learning_rate": 1.6865904365904367e-05, + "loss": 0.2284, + "num_input_tokens_seen": 124128, + "step": 650 + }, + { + "epoch": 0.3404365904365904, + "grad_norm": 3.837114095687866, + "learning_rate": 1.6995841995841996e-05, + "loss": 0.2034, + "num_input_tokens_seen": 125088, + "step": 655 + }, + { + "epoch": 0.34303534303534305, + "grad_norm": 0.14586788415908813, + "learning_rate": 1.7125779625779624e-05, + "loss": 0.059, + "num_input_tokens_seen": 126048, + "step": 660 + }, + { + "epoch": 0.34563409563409564, + "grad_norm": 0.32496875524520874, + "learning_rate": 1.7255717255717256e-05, + "loss": 0.1872, + "num_input_tokens_seen": 126912, + "step": 665 + }, + { + "epoch": 0.3482328482328482, + "grad_norm": 0.4759271442890167, + "learning_rate": 1.7385654885654888e-05, + "loss": 0.1567, + "num_input_tokens_seen": 127936, + "step": 670 + }, + { + "epoch": 0.3508316008316008, + "grad_norm": 1.4321528673171997, + "learning_rate": 1.7515592515592516e-05, + "loss": 0.2227, + "num_input_tokens_seen": 128928, + "step": 675 + }, + { + "epoch": 0.35343035343035345, + "grad_norm": 0.22649715840816498, + "learning_rate": 1.7645530145530145e-05, + "loss": 0.3065, + "num_input_tokens_seen": 129888, + "step": 680 + }, + { + "epoch": 0.35602910602910603, + "grad_norm": 0.22863104939460754, + "learning_rate": 1.7775467775467776e-05, + "loss": 0.239, + "num_input_tokens_seen": 130784, + "step": 685 + }, + { + "epoch": 0.3586278586278586, + "grad_norm": 2.306990146636963, + "learning_rate": 1.7905405405405405e-05, + "loss": 0.1784, + "num_input_tokens_seen": 131744, + "step": 690 + }, + { + "epoch": 0.3612266112266112, + "grad_norm": 2.4579503536224365, + "learning_rate": 1.8035343035343037e-05, + "loss": 0.2352, + "num_input_tokens_seen": 132704, + "step": 695 + }, + { + "epoch": 0.36382536382536385, + "grad_norm": 2.646613359451294, + "learning_rate": 1.8165280665280665e-05, + "loss": 0.1295, + "num_input_tokens_seen": 133600, + "step": 700 + }, + { + "epoch": 0.36642411642411643, + "grad_norm": 6.813455581665039, + "learning_rate": 1.8295218295218297e-05, + "loss": 0.2494, + "num_input_tokens_seen": 134464, + "step": 705 + }, + { + "epoch": 0.369022869022869, + "grad_norm": 5.2612504959106445, + "learning_rate": 1.8425155925155926e-05, + "loss": 0.1801, + "num_input_tokens_seen": 135392, + "step": 710 + }, + { + "epoch": 0.3716216216216216, + "grad_norm": 3.4093146324157715, + "learning_rate": 1.8555093555093554e-05, + "loss": 0.1414, + "num_input_tokens_seen": 136352, + "step": 715 + }, + { + "epoch": 0.37422037422037424, + "grad_norm": 8.247339248657227, + "learning_rate": 1.8685031185031186e-05, + "loss": 0.2066, + "num_input_tokens_seen": 137248, + "step": 720 + }, + { + "epoch": 0.3768191268191268, + "grad_norm": 5.519759654998779, + "learning_rate": 1.8814968814968818e-05, + "loss": 0.272, + "num_input_tokens_seen": 138240, + "step": 725 + }, + { + "epoch": 0.3794178794178794, + "grad_norm": 0.997715175151825, + "learning_rate": 1.8944906444906446e-05, + "loss": 0.2197, + "num_input_tokens_seen": 139136, + "step": 730 + }, + { + "epoch": 0.382016632016632, + "grad_norm": 7.29288387298584, + "learning_rate": 1.9074844074844075e-05, + "loss": 0.2813, + "num_input_tokens_seen": 140096, + "step": 735 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 2.7974727153778076, + "learning_rate": 1.9204781704781703e-05, + "loss": 0.1798, + "num_input_tokens_seen": 141120, + "step": 740 + }, + { + "epoch": 0.3872141372141372, + "grad_norm": 7.896302700042725, + "learning_rate": 1.9334719334719338e-05, + "loss": 0.478, + "num_input_tokens_seen": 142080, + "step": 745 + }, + { + "epoch": 0.3898128898128898, + "grad_norm": 1.4198888540267944, + "learning_rate": 1.9464656964656967e-05, + "loss": 0.0407, + "num_input_tokens_seen": 143040, + "step": 750 + }, + { + "epoch": 0.3924116424116424, + "grad_norm": 3.699277877807617, + "learning_rate": 1.9594594594594595e-05, + "loss": 0.1091, + "num_input_tokens_seen": 143904, + "step": 755 + }, + { + "epoch": 0.39501039501039503, + "grad_norm": 3.7977182865142822, + "learning_rate": 1.9724532224532224e-05, + "loss": 0.1627, + "num_input_tokens_seen": 144832, + "step": 760 + }, + { + "epoch": 0.3976091476091476, + "grad_norm": 8.086742401123047, + "learning_rate": 1.9854469854469855e-05, + "loss": 0.3631, + "num_input_tokens_seen": 145824, + "step": 765 + }, + { + "epoch": 0.4002079002079002, + "grad_norm": 6.851086616516113, + "learning_rate": 1.9984407484407487e-05, + "loss": 0.1511, + "num_input_tokens_seen": 146752, + "step": 770 + }, + { + "epoch": 0.4028066528066528, + "grad_norm": 1.373323917388916, + "learning_rate": 2.0114345114345116e-05, + "loss": 0.4304, + "num_input_tokens_seen": 147616, + "step": 775 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 4.910678386688232, + "learning_rate": 2.0244282744282744e-05, + "loss": 0.3002, + "num_input_tokens_seen": 148480, + "step": 780 + }, + { + "epoch": 0.408004158004158, + "grad_norm": 0.6879134178161621, + "learning_rate": 2.0374220374220376e-05, + "loss": 0.1909, + "num_input_tokens_seen": 149440, + "step": 785 + }, + { + "epoch": 0.4106029106029106, + "grad_norm": 4.1584367752075195, + "learning_rate": 2.0504158004158005e-05, + "loss": 0.2285, + "num_input_tokens_seen": 150400, + "step": 790 + }, + { + "epoch": 0.4132016632016632, + "grad_norm": 3.6741654872894287, + "learning_rate": 2.0634095634095636e-05, + "loss": 0.0926, + "num_input_tokens_seen": 151296, + "step": 795 + }, + { + "epoch": 0.4158004158004158, + "grad_norm": 3.6285412311553955, + "learning_rate": 2.0764033264033265e-05, + "loss": 0.0929, + "num_input_tokens_seen": 152288, + "step": 800 + }, + { + "epoch": 0.4183991683991684, + "grad_norm": 2.1170246601104736, + "learning_rate": 2.0893970893970897e-05, + "loss": 0.3121, + "num_input_tokens_seen": 153248, + "step": 805 + }, + { + "epoch": 0.420997920997921, + "grad_norm": 5.045691013336182, + "learning_rate": 2.1023908523908525e-05, + "loss": 0.1149, + "num_input_tokens_seen": 154240, + "step": 810 + }, + { + "epoch": 0.4235966735966736, + "grad_norm": 1.0434365272521973, + "learning_rate": 2.1153846153846154e-05, + "loss": 0.1756, + "num_input_tokens_seen": 155168, + "step": 815 + }, + { + "epoch": 0.4261954261954262, + "grad_norm": 1.4100193977355957, + "learning_rate": 2.1283783783783785e-05, + "loss": 0.2448, + "num_input_tokens_seen": 156128, + "step": 820 + }, + { + "epoch": 0.4287941787941788, + "grad_norm": 0.1917978823184967, + "learning_rate": 2.1413721413721417e-05, + "loss": 0.0615, + "num_input_tokens_seen": 156992, + "step": 825 + }, + { + "epoch": 0.4313929313929314, + "grad_norm": 5.138338088989258, + "learning_rate": 2.1543659043659046e-05, + "loss": 0.1302, + "num_input_tokens_seen": 157952, + "step": 830 + }, + { + "epoch": 0.433991683991684, + "grad_norm": 5.642187118530273, + "learning_rate": 2.1673596673596674e-05, + "loss": 0.2915, + "num_input_tokens_seen": 158848, + "step": 835 + }, + { + "epoch": 0.4365904365904366, + "grad_norm": 6.5511579513549805, + "learning_rate": 2.1803534303534303e-05, + "loss": 0.3055, + "num_input_tokens_seen": 159808, + "step": 840 + }, + { + "epoch": 0.4391891891891892, + "grad_norm": 4.5656208992004395, + "learning_rate": 2.1933471933471934e-05, + "loss": 0.1871, + "num_input_tokens_seen": 160736, + "step": 845 + }, + { + "epoch": 0.4417879417879418, + "grad_norm": 5.352968692779541, + "learning_rate": 2.2063409563409566e-05, + "loss": 0.2562, + "num_input_tokens_seen": 161728, + "step": 850 + }, + { + "epoch": 0.44438669438669437, + "grad_norm": 0.49375244975090027, + "learning_rate": 2.2193347193347195e-05, + "loss": 0.1426, + "num_input_tokens_seen": 162752, + "step": 855 + }, + { + "epoch": 0.446985446985447, + "grad_norm": 9.136771202087402, + "learning_rate": 2.2323284823284823e-05, + "loss": 0.2643, + "num_input_tokens_seen": 163776, + "step": 860 + }, + { + "epoch": 0.4495841995841996, + "grad_norm": 0.7375821471214294, + "learning_rate": 2.2453222453222455e-05, + "loss": 0.031, + "num_input_tokens_seen": 164640, + "step": 865 + }, + { + "epoch": 0.4521829521829522, + "grad_norm": 2.237992286682129, + "learning_rate": 2.2583160083160083e-05, + "loss": 0.0455, + "num_input_tokens_seen": 165568, + "step": 870 + }, + { + "epoch": 0.45478170478170477, + "grad_norm": 5.463961124420166, + "learning_rate": 2.2713097713097715e-05, + "loss": 0.2532, + "num_input_tokens_seen": 166592, + "step": 875 + }, + { + "epoch": 0.4573804573804574, + "grad_norm": 2.3336877822875977, + "learning_rate": 2.2843035343035344e-05, + "loss": 0.223, + "num_input_tokens_seen": 167488, + "step": 880 + }, + { + "epoch": 0.45997920997921, + "grad_norm": 5.617924213409424, + "learning_rate": 2.2972972972972976e-05, + "loss": 0.294, + "num_input_tokens_seen": 168480, + "step": 885 + }, + { + "epoch": 0.4625779625779626, + "grad_norm": 0.05454741045832634, + "learning_rate": 2.3102910602910604e-05, + "loss": 0.0836, + "num_input_tokens_seen": 169408, + "step": 890 + }, + { + "epoch": 0.46517671517671516, + "grad_norm": 0.14178627729415894, + "learning_rate": 2.3232848232848233e-05, + "loss": 0.6473, + "num_input_tokens_seen": 170304, + "step": 895 + }, + { + "epoch": 0.4677754677754678, + "grad_norm": 5.6856231689453125, + "learning_rate": 2.3362785862785864e-05, + "loss": 0.2255, + "num_input_tokens_seen": 171264, + "step": 900 + }, + { + "epoch": 0.4703742203742204, + "grad_norm": 4.022686004638672, + "learning_rate": 2.3492723492723496e-05, + "loss": 0.1838, + "num_input_tokens_seen": 172224, + "step": 905 + }, + { + "epoch": 0.47297297297297297, + "grad_norm": 4.159492015838623, + "learning_rate": 2.3622661122661125e-05, + "loss": 0.2451, + "num_input_tokens_seen": 173120, + "step": 910 + }, + { + "epoch": 0.47557172557172556, + "grad_norm": 2.931636095046997, + "learning_rate": 2.3752598752598753e-05, + "loss": 0.29, + "num_input_tokens_seen": 174048, + "step": 915 + }, + { + "epoch": 0.4781704781704782, + "grad_norm": 4.011681079864502, + "learning_rate": 2.388253638253638e-05, + "loss": 0.2709, + "num_input_tokens_seen": 174912, + "step": 920 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 5.765507698059082, + "learning_rate": 2.4012474012474013e-05, + "loss": 0.1388, + "num_input_tokens_seen": 175872, + "step": 925 + }, + { + "epoch": 0.48336798336798337, + "grad_norm": 0.7468072772026062, + "learning_rate": 2.4142411642411645e-05, + "loss": 0.1576, + "num_input_tokens_seen": 176896, + "step": 930 + }, + { + "epoch": 0.48596673596673595, + "grad_norm": 1.714655876159668, + "learning_rate": 2.4272349272349274e-05, + "loss": 0.1512, + "num_input_tokens_seen": 177856, + "step": 935 + }, + { + "epoch": 0.4885654885654886, + "grad_norm": 0.17838601768016815, + "learning_rate": 2.4402286902286902e-05, + "loss": 0.1126, + "num_input_tokens_seen": 178848, + "step": 940 + }, + { + "epoch": 0.4911642411642412, + "grad_norm": 5.282505035400391, + "learning_rate": 2.4532224532224534e-05, + "loss": 0.1599, + "num_input_tokens_seen": 179776, + "step": 945 + }, + { + "epoch": 0.49376299376299376, + "grad_norm": 5.782370567321777, + "learning_rate": 2.4662162162162162e-05, + "loss": 0.0948, + "num_input_tokens_seen": 180736, + "step": 950 + }, + { + "epoch": 0.49636174636174635, + "grad_norm": 4.200228691101074, + "learning_rate": 2.4792099792099794e-05, + "loss": 0.4796, + "num_input_tokens_seen": 181664, + "step": 955 + }, + { + "epoch": 0.498960498960499, + "grad_norm": 4.424343109130859, + "learning_rate": 2.4922037422037423e-05, + "loss": 0.3114, + "num_input_tokens_seen": 182656, + "step": 960 + }, + { + "epoch": 0.5, + "eval_loss": 0.1727323681116104, + "eval_runtime": 9.2319, + "eval_samples_per_second": 92.722, + "eval_steps_per_second": 23.181, + "num_input_tokens_seen": 183040, + "step": 962 + }, + { + "epoch": 0.5015592515592515, + "grad_norm": 4.957394599914551, + "learning_rate": 2.505197505197505e-05, + "loss": 0.1817, + "num_input_tokens_seen": 183616, + "step": 965 + }, + { + "epoch": 0.5041580041580042, + "grad_norm": 0.4428081214427948, + "learning_rate": 2.5181912681912683e-05, + "loss": 0.1189, + "num_input_tokens_seen": 184576, + "step": 970 + }, + { + "epoch": 0.5067567567567568, + "grad_norm": 0.6303672790527344, + "learning_rate": 2.531185031185031e-05, + "loss": 0.0933, + "num_input_tokens_seen": 185568, + "step": 975 + }, + { + "epoch": 0.5093555093555093, + "grad_norm": 11.509081840515137, + "learning_rate": 2.5441787941787943e-05, + "loss": 0.4073, + "num_input_tokens_seen": 186560, + "step": 980 + }, + { + "epoch": 0.511954261954262, + "grad_norm": 0.5707141757011414, + "learning_rate": 2.5571725571725575e-05, + "loss": 0.2337, + "num_input_tokens_seen": 187488, + "step": 985 + }, + { + "epoch": 0.5145530145530145, + "grad_norm": 0.6125525832176208, + "learning_rate": 2.57016632016632e-05, + "loss": 0.1028, + "num_input_tokens_seen": 188384, + "step": 990 + }, + { + "epoch": 0.5171517671517671, + "grad_norm": 4.390307426452637, + "learning_rate": 2.5831600831600832e-05, + "loss": 0.2176, + "num_input_tokens_seen": 189280, + "step": 995 + }, + { + "epoch": 0.5197505197505198, + "grad_norm": 3.691336154937744, + "learning_rate": 2.5961538461538464e-05, + "loss": 0.1476, + "num_input_tokens_seen": 190240, + "step": 1000 + }, + { + "epoch": 0.5223492723492723, + "grad_norm": 6.269742965698242, + "learning_rate": 2.6091476091476092e-05, + "loss": 0.3596, + "num_input_tokens_seen": 191296, + "step": 1005 + }, + { + "epoch": 0.524948024948025, + "grad_norm": 0.15469954907894135, + "learning_rate": 2.6221413721413724e-05, + "loss": 0.1402, + "num_input_tokens_seen": 192224, + "step": 1010 + }, + { + "epoch": 0.5275467775467776, + "grad_norm": 12.283392906188965, + "learning_rate": 2.635135135135135e-05, + "loss": 0.2107, + "num_input_tokens_seen": 193088, + "step": 1015 + }, + { + "epoch": 0.5301455301455301, + "grad_norm": 6.677470684051514, + "learning_rate": 2.648128898128898e-05, + "loss": 0.3656, + "num_input_tokens_seen": 194048, + "step": 1020 + }, + { + "epoch": 0.5327442827442828, + "grad_norm": 3.8346846103668213, + "learning_rate": 2.6611226611226613e-05, + "loss": 0.2927, + "num_input_tokens_seen": 194976, + "step": 1025 + }, + { + "epoch": 0.5353430353430353, + "grad_norm": 4.9780426025390625, + "learning_rate": 2.674116424116424e-05, + "loss": 0.2694, + "num_input_tokens_seen": 195904, + "step": 1030 + }, + { + "epoch": 0.5379417879417879, + "grad_norm": 4.611321449279785, + "learning_rate": 2.6871101871101873e-05, + "loss": 0.1016, + "num_input_tokens_seen": 196800, + "step": 1035 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 1.8792976140975952, + "learning_rate": 2.7001039501039505e-05, + "loss": 0.2483, + "num_input_tokens_seen": 197760, + "step": 1040 + }, + { + "epoch": 0.5431392931392931, + "grad_norm": 4.705212593078613, + "learning_rate": 2.713097713097713e-05, + "loss": 0.1207, + "num_input_tokens_seen": 198720, + "step": 1045 + }, + { + "epoch": 0.5457380457380457, + "grad_norm": 3.8734920024871826, + "learning_rate": 2.7260914760914762e-05, + "loss": 0.1955, + "num_input_tokens_seen": 199680, + "step": 1050 + }, + { + "epoch": 0.5483367983367984, + "grad_norm": 1.4791784286499023, + "learning_rate": 2.739085239085239e-05, + "loss": 0.1778, + "num_input_tokens_seen": 200576, + "step": 1055 + }, + { + "epoch": 0.5509355509355509, + "grad_norm": 4.842630863189697, + "learning_rate": 2.7520790020790022e-05, + "loss": 0.3516, + "num_input_tokens_seen": 201504, + "step": 1060 + }, + { + "epoch": 0.5535343035343036, + "grad_norm": 2.5369958877563477, + "learning_rate": 2.7650727650727654e-05, + "loss": 0.1803, + "num_input_tokens_seen": 202464, + "step": 1065 + }, + { + "epoch": 0.5561330561330561, + "grad_norm": 0.8226618766784668, + "learning_rate": 2.778066528066528e-05, + "loss": 0.18, + "num_input_tokens_seen": 203328, + "step": 1070 + }, + { + "epoch": 0.5587318087318087, + "grad_norm": 3.162135362625122, + "learning_rate": 2.791060291060291e-05, + "loss": 0.0838, + "num_input_tokens_seen": 204256, + "step": 1075 + }, + { + "epoch": 0.5613305613305614, + "grad_norm": 1.0489662885665894, + "learning_rate": 2.8040540540540543e-05, + "loss": 0.1403, + "num_input_tokens_seen": 205184, + "step": 1080 + }, + { + "epoch": 0.5639293139293139, + "grad_norm": 3.4505069255828857, + "learning_rate": 2.817047817047817e-05, + "loss": 0.2029, + "num_input_tokens_seen": 206176, + "step": 1085 + }, + { + "epoch": 0.5665280665280665, + "grad_norm": 6.663577079772949, + "learning_rate": 2.8300415800415803e-05, + "loss": 0.2138, + "num_input_tokens_seen": 207168, + "step": 1090 + }, + { + "epoch": 0.5691268191268192, + "grad_norm": 0.5722617506980896, + "learning_rate": 2.8430353430353428e-05, + "loss": 0.1379, + "num_input_tokens_seen": 208192, + "step": 1095 + }, + { + "epoch": 0.5717255717255717, + "grad_norm": 3.6413824558258057, + "learning_rate": 2.856029106029106e-05, + "loss": 0.1931, + "num_input_tokens_seen": 209152, + "step": 1100 + }, + { + "epoch": 0.5743243243243243, + "grad_norm": 0.8144516348838806, + "learning_rate": 2.8690228690228692e-05, + "loss": 0.0536, + "num_input_tokens_seen": 210112, + "step": 1105 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 8.647539138793945, + "learning_rate": 2.882016632016632e-05, + "loss": 0.3208, + "num_input_tokens_seen": 211040, + "step": 1110 + }, + { + "epoch": 0.5795218295218295, + "grad_norm": 9.10535717010498, + "learning_rate": 2.8950103950103952e-05, + "loss": 0.0361, + "num_input_tokens_seen": 211936, + "step": 1115 + }, + { + "epoch": 0.5821205821205822, + "grad_norm": 0.10410264134407043, + "learning_rate": 2.9080041580041584e-05, + "loss": 0.0731, + "num_input_tokens_seen": 212928, + "step": 1120 + }, + { + "epoch": 0.5847193347193347, + "grad_norm": 0.13444888591766357, + "learning_rate": 2.920997920997921e-05, + "loss": 0.1993, + "num_input_tokens_seen": 213856, + "step": 1125 + }, + { + "epoch": 0.5873180873180873, + "grad_norm": 6.65608549118042, + "learning_rate": 2.933991683991684e-05, + "loss": 0.3077, + "num_input_tokens_seen": 214816, + "step": 1130 + }, + { + "epoch": 0.58991683991684, + "grad_norm": 1.6085166931152344, + "learning_rate": 2.946985446985447e-05, + "loss": 0.0378, + "num_input_tokens_seen": 215744, + "step": 1135 + }, + { + "epoch": 0.5925155925155925, + "grad_norm": 0.09550956636667252, + "learning_rate": 2.95997920997921e-05, + "loss": 0.0219, + "num_input_tokens_seen": 216768, + "step": 1140 + }, + { + "epoch": 0.5951143451143451, + "grad_norm": 0.4984482228755951, + "learning_rate": 2.9729729729729733e-05, + "loss": 0.3197, + "num_input_tokens_seen": 217696, + "step": 1145 + }, + { + "epoch": 0.5977130977130977, + "grad_norm": 4.560796737670898, + "learning_rate": 2.9859667359667358e-05, + "loss": 0.1741, + "num_input_tokens_seen": 218688, + "step": 1150 + }, + { + "epoch": 0.6003118503118503, + "grad_norm": 0.8130283951759338, + "learning_rate": 2.998960498960499e-05, + "loss": 0.3396, + "num_input_tokens_seen": 219616, + "step": 1155 + }, + { + "epoch": 0.6029106029106029, + "grad_norm": 0.3763618469238281, + "learning_rate": 3.0119542619542622e-05, + "loss": 0.0868, + "num_input_tokens_seen": 220576, + "step": 1160 + }, + { + "epoch": 0.6055093555093555, + "grad_norm": 0.7856326699256897, + "learning_rate": 3.024948024948025e-05, + "loss": 0.1729, + "num_input_tokens_seen": 221472, + "step": 1165 + }, + { + "epoch": 0.6081081081081081, + "grad_norm": 0.5760618448257446, + "learning_rate": 3.0379417879417882e-05, + "loss": 0.1975, + "num_input_tokens_seen": 222464, + "step": 1170 + }, + { + "epoch": 0.6107068607068608, + "grad_norm": 0.8184646368026733, + "learning_rate": 3.0509355509355507e-05, + "loss": 0.182, + "num_input_tokens_seen": 223392, + "step": 1175 + }, + { + "epoch": 0.6133056133056133, + "grad_norm": 6.310567855834961, + "learning_rate": 3.063929313929314e-05, + "loss": 0.2524, + "num_input_tokens_seen": 224416, + "step": 1180 + }, + { + "epoch": 0.6159043659043659, + "grad_norm": 7.584100723266602, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.2352, + "num_input_tokens_seen": 225376, + "step": 1185 + }, + { + "epoch": 0.6185031185031185, + "grad_norm": 5.612671852111816, + "learning_rate": 3.08991683991684e-05, + "loss": 0.1559, + "num_input_tokens_seen": 226336, + "step": 1190 + }, + { + "epoch": 0.6211018711018711, + "grad_norm": 3.8247427940368652, + "learning_rate": 3.102910602910603e-05, + "loss": 0.2103, + "num_input_tokens_seen": 227200, + "step": 1195 + }, + { + "epoch": 0.6237006237006237, + "grad_norm": 4.536149501800537, + "learning_rate": 3.115904365904366e-05, + "loss": 0.0987, + "num_input_tokens_seen": 228128, + "step": 1200 + }, + { + "epoch": 0.6262993762993763, + "grad_norm": 8.00773811340332, + "learning_rate": 3.128898128898129e-05, + "loss": 0.3656, + "num_input_tokens_seen": 229088, + "step": 1205 + }, + { + "epoch": 0.6288981288981289, + "grad_norm": 3.6946041584014893, + "learning_rate": 3.141891891891892e-05, + "loss": 0.1484, + "num_input_tokens_seen": 230016, + "step": 1210 + }, + { + "epoch": 0.6314968814968815, + "grad_norm": 6.182285308837891, + "learning_rate": 3.1548856548856545e-05, + "loss": 0.2232, + "num_input_tokens_seen": 230944, + "step": 1215 + }, + { + "epoch": 0.6340956340956341, + "grad_norm": 0.2105250358581543, + "learning_rate": 3.167879417879418e-05, + "loss": 0.0922, + "num_input_tokens_seen": 231936, + "step": 1220 + }, + { + "epoch": 0.6366943866943867, + "grad_norm": 1.1177043914794922, + "learning_rate": 3.180873180873181e-05, + "loss": 0.0306, + "num_input_tokens_seen": 232896, + "step": 1225 + }, + { + "epoch": 0.6392931392931392, + "grad_norm": 2.9920780658721924, + "learning_rate": 3.193866943866944e-05, + "loss": 0.1071, + "num_input_tokens_seen": 233824, + "step": 1230 + }, + { + "epoch": 0.6418918918918919, + "grad_norm": 15.865307807922363, + "learning_rate": 3.206860706860707e-05, + "loss": 0.4069, + "num_input_tokens_seen": 234784, + "step": 1235 + }, + { + "epoch": 0.6444906444906445, + "grad_norm": 5.413437843322754, + "learning_rate": 3.2198544698544704e-05, + "loss": 0.2828, + "num_input_tokens_seen": 235712, + "step": 1240 + }, + { + "epoch": 0.6470893970893971, + "grad_norm": 3.361135482788086, + "learning_rate": 3.232848232848233e-05, + "loss": 0.3413, + "num_input_tokens_seen": 236704, + "step": 1245 + }, + { + "epoch": 0.6496881496881497, + "grad_norm": 3.3478143215179443, + "learning_rate": 3.245841995841996e-05, + "loss": 0.1968, + "num_input_tokens_seen": 237600, + "step": 1250 + }, + { + "epoch": 0.6522869022869023, + "grad_norm": 0.07949750870466232, + "learning_rate": 3.2588357588357586e-05, + "loss": 0.0287, + "num_input_tokens_seen": 238560, + "step": 1255 + }, + { + "epoch": 0.6548856548856549, + "grad_norm": 3.520878791809082, + "learning_rate": 3.271829521829522e-05, + "loss": 0.2076, + "num_input_tokens_seen": 239488, + "step": 1260 + }, + { + "epoch": 0.6574844074844075, + "grad_norm": 5.17156982421875, + "learning_rate": 3.284823284823285e-05, + "loss": 0.3055, + "num_input_tokens_seen": 240512, + "step": 1265 + }, + { + "epoch": 0.66008316008316, + "grad_norm": 5.363323211669922, + "learning_rate": 3.2978170478170475e-05, + "loss": 0.1557, + "num_input_tokens_seen": 241440, + "step": 1270 + }, + { + "epoch": 0.6626819126819127, + "grad_norm": 2.6796813011169434, + "learning_rate": 3.310810810810811e-05, + "loss": 0.1836, + "num_input_tokens_seen": 242464, + "step": 1275 + }, + { + "epoch": 0.6652806652806653, + "grad_norm": 0.659529447555542, + "learning_rate": 3.3238045738045745e-05, + "loss": 0.0959, + "num_input_tokens_seen": 243424, + "step": 1280 + }, + { + "epoch": 0.6678794178794178, + "grad_norm": 1.7245473861694336, + "learning_rate": 3.336798336798337e-05, + "loss": 0.1488, + "num_input_tokens_seen": 244320, + "step": 1285 + }, + { + "epoch": 0.6704781704781705, + "grad_norm": 0.5394813418388367, + "learning_rate": 3.3497920997921e-05, + "loss": 0.0993, + "num_input_tokens_seen": 245312, + "step": 1290 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 3.9466652870178223, + "learning_rate": 3.362785862785863e-05, + "loss": 0.0531, + "num_input_tokens_seen": 246304, + "step": 1295 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.296638548374176, + "learning_rate": 3.375779625779626e-05, + "loss": 0.1725, + "num_input_tokens_seen": 247200, + "step": 1300 + }, + { + "epoch": 0.6782744282744283, + "grad_norm": 4.3726325035095215, + "learning_rate": 3.388773388773389e-05, + "loss": 0.1496, + "num_input_tokens_seen": 248160, + "step": 1305 + }, + { + "epoch": 0.6808731808731808, + "grad_norm": 4.092756748199463, + "learning_rate": 3.4017671517671516e-05, + "loss": 0.2716, + "num_input_tokens_seen": 249056, + "step": 1310 + }, + { + "epoch": 0.6834719334719335, + "grad_norm": 8.214967727661133, + "learning_rate": 3.414760914760915e-05, + "loss": 0.4535, + "num_input_tokens_seen": 250080, + "step": 1315 + }, + { + "epoch": 0.6860706860706861, + "grad_norm": 0.18670150637626648, + "learning_rate": 3.427754677754678e-05, + "loss": 0.1999, + "num_input_tokens_seen": 251040, + "step": 1320 + }, + { + "epoch": 0.6886694386694386, + "grad_norm": 2.7802348136901855, + "learning_rate": 3.4407484407484405e-05, + "loss": 0.0823, + "num_input_tokens_seen": 251936, + "step": 1325 + }, + { + "epoch": 0.6912681912681913, + "grad_norm": 0.24137216806411743, + "learning_rate": 3.4537422037422044e-05, + "loss": 0.1078, + "num_input_tokens_seen": 252960, + "step": 1330 + }, + { + "epoch": 0.6938669438669439, + "grad_norm": 3.3526206016540527, + "learning_rate": 3.466735966735967e-05, + "loss": 0.2779, + "num_input_tokens_seen": 253952, + "step": 1335 + }, + { + "epoch": 0.6964656964656964, + "grad_norm": 2.492011547088623, + "learning_rate": 3.47972972972973e-05, + "loss": 0.3672, + "num_input_tokens_seen": 254848, + "step": 1340 + }, + { + "epoch": 0.6990644490644491, + "grad_norm": 1.222192645072937, + "learning_rate": 3.492723492723493e-05, + "loss": 0.153, + "num_input_tokens_seen": 255840, + "step": 1345 + }, + { + "epoch": 0.7016632016632016, + "grad_norm": 1.43758225440979, + "learning_rate": 3.505717255717256e-05, + "loss": 0.1675, + "num_input_tokens_seen": 256768, + "step": 1350 + }, + { + "epoch": 0.7042619542619543, + "grad_norm": 0.3269171118736267, + "learning_rate": 3.518711018711019e-05, + "loss": 0.0689, + "num_input_tokens_seen": 257664, + "step": 1355 + }, + { + "epoch": 0.7068607068607069, + "grad_norm": 8.647032737731934, + "learning_rate": 3.531704781704782e-05, + "loss": 0.4194, + "num_input_tokens_seen": 258624, + "step": 1360 + }, + { + "epoch": 0.7094594594594594, + "grad_norm": 1.9798884391784668, + "learning_rate": 3.5446985446985446e-05, + "loss": 0.3003, + "num_input_tokens_seen": 259552, + "step": 1365 + }, + { + "epoch": 0.7120582120582121, + "grad_norm": 11.203836441040039, + "learning_rate": 3.557692307692308e-05, + "loss": 0.0997, + "num_input_tokens_seen": 260576, + "step": 1370 + }, + { + "epoch": 0.7146569646569647, + "grad_norm": 1.945120930671692, + "learning_rate": 3.57068607068607e-05, + "loss": 0.1381, + "num_input_tokens_seen": 261472, + "step": 1375 + }, + { + "epoch": 0.7172557172557172, + "grad_norm": 3.4772982597351074, + "learning_rate": 3.583679833679834e-05, + "loss": 0.1417, + "num_input_tokens_seen": 262464, + "step": 1380 + }, + { + "epoch": 0.7198544698544699, + "grad_norm": 4.528722763061523, + "learning_rate": 3.5966735966735974e-05, + "loss": 0.1374, + "num_input_tokens_seen": 263424, + "step": 1385 + }, + { + "epoch": 0.7224532224532224, + "grad_norm": 0.4743356704711914, + "learning_rate": 3.60966735966736e-05, + "loss": 0.0577, + "num_input_tokens_seen": 264384, + "step": 1390 + }, + { + "epoch": 0.725051975051975, + "grad_norm": 0.16007116436958313, + "learning_rate": 3.622661122661123e-05, + "loss": 0.1792, + "num_input_tokens_seen": 265312, + "step": 1395 + }, + { + "epoch": 0.7276507276507277, + "grad_norm": 4.696006774902344, + "learning_rate": 3.635654885654886e-05, + "loss": 0.3344, + "num_input_tokens_seen": 266176, + "step": 1400 + }, + { + "epoch": 0.7302494802494802, + "grad_norm": 3.1616263389587402, + "learning_rate": 3.648648648648649e-05, + "loss": 0.1817, + "num_input_tokens_seen": 267136, + "step": 1405 + }, + { + "epoch": 0.7328482328482329, + "grad_norm": 1.7303450107574463, + "learning_rate": 3.661642411642412e-05, + "loss": 0.0448, + "num_input_tokens_seen": 268064, + "step": 1410 + }, + { + "epoch": 0.7354469854469855, + "grad_norm": 3.115104913711548, + "learning_rate": 3.6746361746361744e-05, + "loss": 0.0265, + "num_input_tokens_seen": 268992, + "step": 1415 + }, + { + "epoch": 0.738045738045738, + "grad_norm": 0.07434652000665665, + "learning_rate": 3.6876299376299376e-05, + "loss": 0.1349, + "num_input_tokens_seen": 269920, + "step": 1420 + }, + { + "epoch": 0.7406444906444907, + "grad_norm": 11.382824897766113, + "learning_rate": 3.700623700623701e-05, + "loss": 0.4515, + "num_input_tokens_seen": 270976, + "step": 1425 + }, + { + "epoch": 0.7432432432432432, + "grad_norm": 7.808342456817627, + "learning_rate": 3.713617463617464e-05, + "loss": 0.1865, + "num_input_tokens_seen": 271904, + "step": 1430 + }, + { + "epoch": 0.7458419958419958, + "grad_norm": 2.555190086364746, + "learning_rate": 3.726611226611227e-05, + "loss": 0.1875, + "num_input_tokens_seen": 272896, + "step": 1435 + }, + { + "epoch": 0.7484407484407485, + "grad_norm": 7.875151634216309, + "learning_rate": 3.7396049896049903e-05, + "loss": 0.2008, + "num_input_tokens_seen": 273760, + "step": 1440 + }, + { + "epoch": 0.751039501039501, + "grad_norm": 1.114668369293213, + "learning_rate": 3.752598752598753e-05, + "loss": 0.1251, + "num_input_tokens_seen": 274752, + "step": 1445 + }, + { + "epoch": 0.7536382536382537, + "grad_norm": 0.18020467460155487, + "learning_rate": 3.765592515592516e-05, + "loss": 0.1038, + "num_input_tokens_seen": 275776, + "step": 1450 + }, + { + "epoch": 0.7562370062370062, + "grad_norm": 0.10137228667736053, + "learning_rate": 3.7785862785862785e-05, + "loss": 0.1038, + "num_input_tokens_seen": 276672, + "step": 1455 + }, + { + "epoch": 0.7588357588357588, + "grad_norm": 0.2291146069765091, + "learning_rate": 3.791580041580042e-05, + "loss": 0.2005, + "num_input_tokens_seen": 277536, + "step": 1460 + }, + { + "epoch": 0.7614345114345115, + "grad_norm": 2.7403924465179443, + "learning_rate": 3.804573804573805e-05, + "loss": 0.1036, + "num_input_tokens_seen": 278432, + "step": 1465 + }, + { + "epoch": 0.764033264033264, + "grad_norm": 1.1882911920547485, + "learning_rate": 3.8175675675675674e-05, + "loss": 0.2079, + "num_input_tokens_seen": 279424, + "step": 1470 + }, + { + "epoch": 0.7666320166320166, + "grad_norm": 0.5326305031776428, + "learning_rate": 3.8305613305613306e-05, + "loss": 0.159, + "num_input_tokens_seen": 280384, + "step": 1475 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 5.642427921295166, + "learning_rate": 3.843555093555094e-05, + "loss": 0.1341, + "num_input_tokens_seen": 281344, + "step": 1480 + }, + { + "epoch": 0.7718295218295218, + "grad_norm": 0.1242862120270729, + "learning_rate": 3.856548856548857e-05, + "loss": 0.4355, + "num_input_tokens_seen": 282272, + "step": 1485 + }, + { + "epoch": 0.7744282744282744, + "grad_norm": 6.798549652099609, + "learning_rate": 3.86954261954262e-05, + "loss": 0.1929, + "num_input_tokens_seen": 283232, + "step": 1490 + }, + { + "epoch": 0.777027027027027, + "grad_norm": 2.993637800216675, + "learning_rate": 3.8825363825363827e-05, + "loss": 0.2548, + "num_input_tokens_seen": 284192, + "step": 1495 + }, + { + "epoch": 0.7796257796257796, + "grad_norm": 0.6244673132896423, + "learning_rate": 3.895530145530146e-05, + "loss": 0.1612, + "num_input_tokens_seen": 285120, + "step": 1500 + }, + { + "epoch": 0.7822245322245323, + "grad_norm": 1.6014714241027832, + "learning_rate": 3.908523908523909e-05, + "loss": 0.0572, + "num_input_tokens_seen": 286144, + "step": 1505 + }, + { + "epoch": 0.7848232848232848, + "grad_norm": 6.547388076782227, + "learning_rate": 3.9215176715176715e-05, + "loss": 0.2091, + "num_input_tokens_seen": 287072, + "step": 1510 + }, + { + "epoch": 0.7874220374220374, + "grad_norm": 0.7334463000297546, + "learning_rate": 3.934511434511435e-05, + "loss": 0.057, + "num_input_tokens_seen": 288064, + "step": 1515 + }, + { + "epoch": 0.7900207900207901, + "grad_norm": 1.202533483505249, + "learning_rate": 3.947505197505197e-05, + "loss": 0.1348, + "num_input_tokens_seen": 289024, + "step": 1520 + }, + { + "epoch": 0.7926195426195426, + "grad_norm": 11.333463668823242, + "learning_rate": 3.9604989604989604e-05, + "loss": 0.2766, + "num_input_tokens_seen": 289984, + "step": 1525 + }, + { + "epoch": 0.7952182952182952, + "grad_norm": 0.19741080701351166, + "learning_rate": 3.9734927234927236e-05, + "loss": 0.1596, + "num_input_tokens_seen": 290912, + "step": 1530 + }, + { + "epoch": 0.7978170478170478, + "grad_norm": 0.2034258395433426, + "learning_rate": 3.986486486486487e-05, + "loss": 0.1976, + "num_input_tokens_seen": 291840, + "step": 1535 + }, + { + "epoch": 0.8004158004158004, + "grad_norm": 0.10752521455287933, + "learning_rate": 3.99948024948025e-05, + "loss": 0.3031, + "num_input_tokens_seen": 292768, + "step": 1540 + }, + { + "epoch": 0.803014553014553, + "grad_norm": 2.540839672088623, + "learning_rate": 4.012474012474013e-05, + "loss": 0.2502, + "num_input_tokens_seen": 293824, + "step": 1545 + }, + { + "epoch": 0.8056133056133056, + "grad_norm": 3.6929779052734375, + "learning_rate": 4.0254677754677757e-05, + "loss": 0.1626, + "num_input_tokens_seen": 294784, + "step": 1550 + }, + { + "epoch": 0.8082120582120582, + "grad_norm": 0.3556975722312927, + "learning_rate": 4.038461538461539e-05, + "loss": 0.1243, + "num_input_tokens_seen": 295712, + "step": 1555 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.062391497194767, + "learning_rate": 4.0514553014553013e-05, + "loss": 0.0889, + "num_input_tokens_seen": 296640, + "step": 1560 + }, + { + "epoch": 0.8134095634095634, + "grad_norm": 3.5602664947509766, + "learning_rate": 4.0644490644490645e-05, + "loss": 0.2251, + "num_input_tokens_seen": 297568, + "step": 1565 + }, + { + "epoch": 0.816008316008316, + "grad_norm": 8.4227933883667, + "learning_rate": 4.077442827442828e-05, + "loss": 0.0573, + "num_input_tokens_seen": 298528, + "step": 1570 + }, + { + "epoch": 0.8186070686070686, + "grad_norm": 2.7176103591918945, + "learning_rate": 4.09043659043659e-05, + "loss": 0.1621, + "num_input_tokens_seen": 299488, + "step": 1575 + }, + { + "epoch": 0.8212058212058212, + "grad_norm": 33.77507400512695, + "learning_rate": 4.1034303534303534e-05, + "loss": 0.119, + "num_input_tokens_seen": 300448, + "step": 1580 + }, + { + "epoch": 0.8238045738045738, + "grad_norm": 0.6572396159172058, + "learning_rate": 4.1164241164241166e-05, + "loss": 0.1754, + "num_input_tokens_seen": 301472, + "step": 1585 + }, + { + "epoch": 0.8264033264033264, + "grad_norm": 0.3106120526790619, + "learning_rate": 4.12941787941788e-05, + "loss": 0.3325, + "num_input_tokens_seen": 302368, + "step": 1590 + }, + { + "epoch": 0.829002079002079, + "grad_norm": 0.3693045377731323, + "learning_rate": 4.142411642411643e-05, + "loss": 0.0446, + "num_input_tokens_seen": 303264, + "step": 1595 + }, + { + "epoch": 0.8316008316008316, + "grad_norm": 5.053155422210693, + "learning_rate": 4.1554054054054055e-05, + "loss": 0.2354, + "num_input_tokens_seen": 304256, + "step": 1600 + }, + { + "epoch": 0.8341995841995842, + "grad_norm": 1.7727495431900024, + "learning_rate": 4.1683991683991686e-05, + "loss": 0.1419, + "num_input_tokens_seen": 305184, + "step": 1605 + }, + { + "epoch": 0.8367983367983368, + "grad_norm": 0.26505419611930847, + "learning_rate": 4.181392931392932e-05, + "loss": 0.0851, + "num_input_tokens_seen": 306144, + "step": 1610 + }, + { + "epoch": 0.8393970893970893, + "grad_norm": 5.681260108947754, + "learning_rate": 4.194386694386694e-05, + "loss": 0.2472, + "num_input_tokens_seen": 307104, + "step": 1615 + }, + { + "epoch": 0.841995841995842, + "grad_norm": 3.116065502166748, + "learning_rate": 4.2073804573804575e-05, + "loss": 0.4016, + "num_input_tokens_seen": 308096, + "step": 1620 + }, + { + "epoch": 0.8445945945945946, + "grad_norm": 0.040946558117866516, + "learning_rate": 4.220374220374221e-05, + "loss": 0.0567, + "num_input_tokens_seen": 309088, + "step": 1625 + }, + { + "epoch": 0.8471933471933472, + "grad_norm": 2.5303564071655273, + "learning_rate": 4.233367983367983e-05, + "loss": 0.311, + "num_input_tokens_seen": 310048, + "step": 1630 + }, + { + "epoch": 0.8497920997920998, + "grad_norm": 7.206785202026367, + "learning_rate": 4.2463617463617464e-05, + "loss": 0.1359, + "num_input_tokens_seen": 311008, + "step": 1635 + }, + { + "epoch": 0.8523908523908524, + "grad_norm": 2.1711583137512207, + "learning_rate": 4.2593555093555096e-05, + "loss": 0.2044, + "num_input_tokens_seen": 312000, + "step": 1640 + }, + { + "epoch": 0.854989604989605, + "grad_norm": 3.7275736331939697, + "learning_rate": 4.272349272349273e-05, + "loss": 0.2731, + "num_input_tokens_seen": 312864, + "step": 1645 + }, + { + "epoch": 0.8575883575883576, + "grad_norm": 2.6774582862854004, + "learning_rate": 4.285343035343036e-05, + "loss": 0.3732, + "num_input_tokens_seen": 313792, + "step": 1650 + }, + { + "epoch": 0.8601871101871101, + "grad_norm": 0.7275611758232117, + "learning_rate": 4.2983367983367985e-05, + "loss": 0.1846, + "num_input_tokens_seen": 314688, + "step": 1655 + }, + { + "epoch": 0.8627858627858628, + "grad_norm": 0.4194641411304474, + "learning_rate": 4.3113305613305616e-05, + "loss": 0.0869, + "num_input_tokens_seen": 315648, + "step": 1660 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.4645836055278778, + "learning_rate": 4.324324324324325e-05, + "loss": 0.0974, + "num_input_tokens_seen": 316704, + "step": 1665 + }, + { + "epoch": 0.867983367983368, + "grad_norm": 0.7178572416305542, + "learning_rate": 4.337318087318087e-05, + "loss": 0.2355, + "num_input_tokens_seen": 317664, + "step": 1670 + }, + { + "epoch": 0.8705821205821206, + "grad_norm": 0.3808295726776123, + "learning_rate": 4.3503118503118505e-05, + "loss": 0.0251, + "num_input_tokens_seen": 318720, + "step": 1675 + }, + { + "epoch": 0.8731808731808732, + "grad_norm": 0.7866677641868591, + "learning_rate": 4.363305613305613e-05, + "loss": 0.0614, + "num_input_tokens_seen": 319744, + "step": 1680 + }, + { + "epoch": 0.8757796257796258, + "grad_norm": 0.11243905872106552, + "learning_rate": 4.376299376299376e-05, + "loss": 0.1124, + "num_input_tokens_seen": 320736, + "step": 1685 + }, + { + "epoch": 0.8783783783783784, + "grad_norm": 6.844594955444336, + "learning_rate": 4.3892931392931394e-05, + "loss": 0.1368, + "num_input_tokens_seen": 321600, + "step": 1690 + }, + { + "epoch": 0.8809771309771309, + "grad_norm": 0.4273189604282379, + "learning_rate": 4.4022869022869026e-05, + "loss": 0.2002, + "num_input_tokens_seen": 322528, + "step": 1695 + }, + { + "epoch": 0.8835758835758836, + "grad_norm": 4.5362019538879395, + "learning_rate": 4.415280665280666e-05, + "loss": 0.4392, + "num_input_tokens_seen": 323488, + "step": 1700 + }, + { + "epoch": 0.8861746361746362, + "grad_norm": 5.475741863250732, + "learning_rate": 4.428274428274429e-05, + "loss": 0.1288, + "num_input_tokens_seen": 324480, + "step": 1705 + }, + { + "epoch": 0.8887733887733887, + "grad_norm": 3.6263465881347656, + "learning_rate": 4.4412681912681914e-05, + "loss": 0.0234, + "num_input_tokens_seen": 325440, + "step": 1710 + }, + { + "epoch": 0.8913721413721414, + "grad_norm": 11.91124153137207, + "learning_rate": 4.4542619542619546e-05, + "loss": 0.3944, + "num_input_tokens_seen": 326368, + "step": 1715 + }, + { + "epoch": 0.893970893970894, + "grad_norm": 4.1417012214660645, + "learning_rate": 4.467255717255717e-05, + "loss": 0.1121, + "num_input_tokens_seen": 327264, + "step": 1720 + }, + { + "epoch": 0.8965696465696466, + "grad_norm": 13.180960655212402, + "learning_rate": 4.48024948024948e-05, + "loss": 0.213, + "num_input_tokens_seen": 328192, + "step": 1725 + }, + { + "epoch": 0.8991683991683992, + "grad_norm": 5.138045310974121, + "learning_rate": 4.4932432432432435e-05, + "loss": 0.1923, + "num_input_tokens_seen": 329184, + "step": 1730 + }, + { + "epoch": 0.9017671517671517, + "grad_norm": 10.493078231811523, + "learning_rate": 4.506237006237006e-05, + "loss": 0.1896, + "num_input_tokens_seen": 330112, + "step": 1735 + }, + { + "epoch": 0.9043659043659044, + "grad_norm": 5.3083624839782715, + "learning_rate": 4.519230769230769e-05, + "loss": 0.0628, + "num_input_tokens_seen": 331072, + "step": 1740 + }, + { + "epoch": 0.906964656964657, + "grad_norm": 5.574632167816162, + "learning_rate": 4.5322245322245324e-05, + "loss": 0.2654, + "num_input_tokens_seen": 332032, + "step": 1745 + }, + { + "epoch": 0.9095634095634095, + "grad_norm": 5.535085678100586, + "learning_rate": 4.5452182952182956e-05, + "loss": 0.3008, + "num_input_tokens_seen": 332960, + "step": 1750 + }, + { + "epoch": 0.9121621621621622, + "grad_norm": 3.6637542247772217, + "learning_rate": 4.558212058212059e-05, + "loss": 0.2826, + "num_input_tokens_seen": 333952, + "step": 1755 + }, + { + "epoch": 0.9147609147609148, + "grad_norm": 1.7246201038360596, + "learning_rate": 4.571205821205821e-05, + "loss": 0.1752, + "num_input_tokens_seen": 334848, + "step": 1760 + }, + { + "epoch": 0.9173596673596673, + "grad_norm": 2.1769750118255615, + "learning_rate": 4.5841995841995844e-05, + "loss": 0.3416, + "num_input_tokens_seen": 335904, + "step": 1765 + }, + { + "epoch": 0.91995841995842, + "grad_norm": 1.9967843294143677, + "learning_rate": 4.5971933471933476e-05, + "loss": 0.1793, + "num_input_tokens_seen": 336864, + "step": 1770 + }, + { + "epoch": 0.9225571725571725, + "grad_norm": 0.6303365230560303, + "learning_rate": 4.61018711018711e-05, + "loss": 0.1615, + "num_input_tokens_seen": 337824, + "step": 1775 + }, + { + "epoch": 0.9251559251559252, + "grad_norm": 4.288788795471191, + "learning_rate": 4.623180873180873e-05, + "loss": 0.0667, + "num_input_tokens_seen": 338784, + "step": 1780 + }, + { + "epoch": 0.9277546777546778, + "grad_norm": 4.233643054962158, + "learning_rate": 4.6361746361746365e-05, + "loss": 0.3675, + "num_input_tokens_seen": 339744, + "step": 1785 + }, + { + "epoch": 0.9303534303534303, + "grad_norm": 5.827807426452637, + "learning_rate": 4.649168399168399e-05, + "loss": 0.1994, + "num_input_tokens_seen": 340672, + "step": 1790 + }, + { + "epoch": 0.932952182952183, + "grad_norm": 3.717507839202881, + "learning_rate": 4.662162162162162e-05, + "loss": 0.3475, + "num_input_tokens_seen": 341632, + "step": 1795 + }, + { + "epoch": 0.9355509355509356, + "grad_norm": 0.35698720812797546, + "learning_rate": 4.6751559251559254e-05, + "loss": 0.0856, + "num_input_tokens_seen": 342656, + "step": 1800 + }, + { + "epoch": 0.9381496881496881, + "grad_norm": 0.13404658436775208, + "learning_rate": 4.6881496881496886e-05, + "loss": 0.3308, + "num_input_tokens_seen": 343584, + "step": 1805 + }, + { + "epoch": 0.9407484407484408, + "grad_norm": 0.44807690382003784, + "learning_rate": 4.701143451143452e-05, + "loss": 0.0391, + "num_input_tokens_seen": 344544, + "step": 1810 + }, + { + "epoch": 0.9433471933471933, + "grad_norm": 7.824325084686279, + "learning_rate": 4.714137214137214e-05, + "loss": 0.1294, + "num_input_tokens_seen": 345504, + "step": 1815 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 7.580188274383545, + "learning_rate": 4.7271309771309774e-05, + "loss": 0.16, + "num_input_tokens_seen": 346528, + "step": 1820 + }, + { + "epoch": 0.9485446985446986, + "grad_norm": 4.874233245849609, + "learning_rate": 4.7401247401247406e-05, + "loss": 0.1994, + "num_input_tokens_seen": 347456, + "step": 1825 + }, + { + "epoch": 0.9511434511434511, + "grad_norm": 0.11127970367670059, + "learning_rate": 4.753118503118503e-05, + "loss": 0.2464, + "num_input_tokens_seen": 348416, + "step": 1830 + }, + { + "epoch": 0.9537422037422038, + "grad_norm": 0.058995865285396576, + "learning_rate": 4.766112266112266e-05, + "loss": 0.2215, + "num_input_tokens_seen": 349408, + "step": 1835 + }, + { + "epoch": 0.9563409563409564, + "grad_norm": 0.9919191598892212, + "learning_rate": 4.779106029106029e-05, + "loss": 0.4095, + "num_input_tokens_seen": 350368, + "step": 1840 + }, + { + "epoch": 0.9589397089397089, + "grad_norm": 0.3371467590332031, + "learning_rate": 4.792099792099792e-05, + "loss": 0.0903, + "num_input_tokens_seen": 351296, + "step": 1845 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 5.537634372711182, + "learning_rate": 4.805093555093555e-05, + "loss": 0.1994, + "num_input_tokens_seen": 352192, + "step": 1850 + }, + { + "epoch": 0.9641372141372141, + "grad_norm": 5.821361541748047, + "learning_rate": 4.8180873180873184e-05, + "loss": 0.0471, + "num_input_tokens_seen": 353120, + "step": 1855 + }, + { + "epoch": 0.9667359667359667, + "grad_norm": 0.28735753893852234, + "learning_rate": 4.8310810810810816e-05, + "loss": 0.0842, + "num_input_tokens_seen": 354048, + "step": 1860 + }, + { + "epoch": 0.9693347193347194, + "grad_norm": 0.2810559868812561, + "learning_rate": 4.844074844074845e-05, + "loss": 0.1477, + "num_input_tokens_seen": 355040, + "step": 1865 + }, + { + "epoch": 0.9719334719334719, + "grad_norm": 0.15896418690681458, + "learning_rate": 4.857068607068607e-05, + "loss": 0.121, + "num_input_tokens_seen": 356064, + "step": 1870 + }, + { + "epoch": 0.9745322245322245, + "grad_norm": 1.5869849920272827, + "learning_rate": 4.8700623700623704e-05, + "loss": 0.54, + "num_input_tokens_seen": 357056, + "step": 1875 + }, + { + "epoch": 0.9771309771309772, + "grad_norm": 2.6653594970703125, + "learning_rate": 4.883056133056133e-05, + "loss": 0.4766, + "num_input_tokens_seen": 358048, + "step": 1880 + }, + { + "epoch": 0.9797297297297297, + "grad_norm": 1.8523311614990234, + "learning_rate": 4.896049896049896e-05, + "loss": 0.3022, + "num_input_tokens_seen": 358976, + "step": 1885 + }, + { + "epoch": 0.9823284823284824, + "grad_norm": 1.6993740797042847, + "learning_rate": 4.909043659043659e-05, + "loss": 0.1386, + "num_input_tokens_seen": 359968, + "step": 1890 + }, + { + "epoch": 0.9849272349272349, + "grad_norm": 0.4063449800014496, + "learning_rate": 4.922037422037422e-05, + "loss": 0.1175, + "num_input_tokens_seen": 360928, + "step": 1895 + }, + { + "epoch": 0.9875259875259875, + "grad_norm": 1.844796061515808, + "learning_rate": 4.935031185031185e-05, + "loss": 0.2262, + "num_input_tokens_seen": 361856, + "step": 1900 + }, + { + "epoch": 0.9901247401247402, + "grad_norm": 4.463516712188721, + "learning_rate": 4.948024948024949e-05, + "loss": 0.2513, + "num_input_tokens_seen": 362720, + "step": 1905 + }, + { + "epoch": 0.9927234927234927, + "grad_norm": 5.740899085998535, + "learning_rate": 4.9610187110187114e-05, + "loss": 0.2697, + "num_input_tokens_seen": 363648, + "step": 1910 + }, + { + "epoch": 0.9953222453222453, + "grad_norm": 0.28223568201065063, + "learning_rate": 4.9740124740124745e-05, + "loss": 0.0421, + "num_input_tokens_seen": 364512, + "step": 1915 + }, + { + "epoch": 0.997920997920998, + "grad_norm": 6.107125282287598, + "learning_rate": 4.987006237006237e-05, + "loss": 0.2949, + "num_input_tokens_seen": 365440, + "step": 1920 + }, + { + "epoch": 1.0, + "eval_loss": 0.21663005650043488, + "eval_runtime": 9.3004, + "eval_samples_per_second": 92.039, + "eval_steps_per_second": 23.01, + "num_input_tokens_seen": 366136, + "step": 1924 + }, + { + "epoch": 1.0005197505197505, + "grad_norm": 0.5737208724021912, + "learning_rate": 5e-05, + "loss": 0.0997, + "num_input_tokens_seen": 366296, + "step": 1925 + }, + { + "epoch": 1.003118503118503, + "grad_norm": 0.4207681119441986, + "learning_rate": 4.9999989713809036e-05, + "loss": 0.2099, + "num_input_tokens_seen": 367256, + "step": 1930 + }, + { + "epoch": 1.0057172557172558, + "grad_norm": 6.063812255859375, + "learning_rate": 4.999995885524459e-05, + "loss": 0.1116, + "num_input_tokens_seen": 368184, + "step": 1935 + }, + { + "epoch": 1.0083160083160083, + "grad_norm": 0.057391028851270676, + "learning_rate": 4.999990742433206e-05, + "loss": 0.0811, + "num_input_tokens_seen": 369112, + "step": 1940 + }, + { + "epoch": 1.0109147609147608, + "grad_norm": 12.48550033569336, + "learning_rate": 4.9999835421113784e-05, + "loss": 0.0597, + "num_input_tokens_seen": 370072, + "step": 1945 + }, + { + "epoch": 1.0135135135135136, + "grad_norm": 12.174312591552734, + "learning_rate": 4.999974284564899e-05, + "loss": 0.4563, + "num_input_tokens_seen": 371064, + "step": 1950 + }, + { + "epoch": 1.0161122661122661, + "grad_norm": 3.2687714099884033, + "learning_rate": 4.999962969801387e-05, + "loss": 0.2698, + "num_input_tokens_seen": 372024, + "step": 1955 + }, + { + "epoch": 1.0187110187110187, + "grad_norm": 12.878811836242676, + "learning_rate": 4.9999495978301534e-05, + "loss": 0.0682, + "num_input_tokens_seen": 372952, + "step": 1960 + }, + { + "epoch": 1.0213097713097714, + "grad_norm": 5.353172302246094, + "learning_rate": 4.999934168662201e-05, + "loss": 0.098, + "num_input_tokens_seen": 373976, + "step": 1965 + }, + { + "epoch": 1.023908523908524, + "grad_norm": 1.034735083580017, + "learning_rate": 4.9999166823102275e-05, + "loss": 0.0899, + "num_input_tokens_seen": 374904, + "step": 1970 + }, + { + "epoch": 1.0265072765072765, + "grad_norm": 0.2394150048494339, + "learning_rate": 4.9998971387886217e-05, + "loss": 0.0132, + "num_input_tokens_seen": 375832, + "step": 1975 + }, + { + "epoch": 1.0291060291060292, + "grad_norm": 16.492202758789062, + "learning_rate": 4.9998755381134655e-05, + "loss": 0.0804, + "num_input_tokens_seen": 376792, + "step": 1980 + }, + { + "epoch": 1.0317047817047817, + "grad_norm": 0.043886128813028336, + "learning_rate": 4.999851880302535e-05, + "loss": 0.117, + "num_input_tokens_seen": 377720, + "step": 1985 + }, + { + "epoch": 1.0343035343035343, + "grad_norm": 0.06320493668317795, + "learning_rate": 4.999826165375298e-05, + "loss": 0.1536, + "num_input_tokens_seen": 378712, + "step": 1990 + }, + { + "epoch": 1.0369022869022868, + "grad_norm": 0.04894839599728584, + "learning_rate": 4.999798393352914e-05, + "loss": 0.1003, + "num_input_tokens_seen": 379576, + "step": 1995 + }, + { + "epoch": 1.0395010395010396, + "grad_norm": 2.8507583141326904, + "learning_rate": 4.999768564258238e-05, + "loss": 0.1201, + "num_input_tokens_seen": 380536, + "step": 2000 + }, + { + "epoch": 1.042099792099792, + "grad_norm": 10.885866165161133, + "learning_rate": 4.999736678115815e-05, + "loss": 0.3307, + "num_input_tokens_seen": 381464, + "step": 2005 + }, + { + "epoch": 1.0446985446985446, + "grad_norm": 0.13783155381679535, + "learning_rate": 4.9997027349518845e-05, + "loss": 0.0975, + "num_input_tokens_seen": 382488, + "step": 2010 + }, + { + "epoch": 1.0472972972972974, + "grad_norm": 0.22044877707958221, + "learning_rate": 4.999666734794378e-05, + "loss": 0.0128, + "num_input_tokens_seen": 383448, + "step": 2015 + }, + { + "epoch": 1.04989604989605, + "grad_norm": 0.26998767256736755, + "learning_rate": 4.999628677672921e-05, + "loss": 0.1632, + "num_input_tokens_seen": 384408, + "step": 2020 + }, + { + "epoch": 1.0524948024948024, + "grad_norm": 0.10617353767156601, + "learning_rate": 4.999588563618828e-05, + "loss": 0.1252, + "num_input_tokens_seen": 385336, + "step": 2025 + }, + { + "epoch": 1.0550935550935552, + "grad_norm": 5.767865180969238, + "learning_rate": 4.999546392665111e-05, + "loss": 0.2205, + "num_input_tokens_seen": 386264, + "step": 2030 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 0.10179793834686279, + "learning_rate": 4.999502164846471e-05, + "loss": 0.0193, + "num_input_tokens_seen": 387128, + "step": 2035 + }, + { + "epoch": 1.0602910602910602, + "grad_norm": 4.219815254211426, + "learning_rate": 4.9994558801993043e-05, + "loss": 0.1747, + "num_input_tokens_seen": 388056, + "step": 2040 + }, + { + "epoch": 1.062889812889813, + "grad_norm": 2.1148221492767334, + "learning_rate": 4.999407538761696e-05, + "loss": 0.191, + "num_input_tokens_seen": 388984, + "step": 2045 + }, + { + "epoch": 1.0654885654885655, + "grad_norm": 15.598138809204102, + "learning_rate": 4.999357140573428e-05, + "loss": 0.2084, + "num_input_tokens_seen": 389912, + "step": 2050 + }, + { + "epoch": 1.068087318087318, + "grad_norm": 3.2878177165985107, + "learning_rate": 4.999304685675972e-05, + "loss": 0.1413, + "num_input_tokens_seen": 390808, + "step": 2055 + }, + { + "epoch": 1.0706860706860706, + "grad_norm": 2.676959276199341, + "learning_rate": 4.999250174112493e-05, + "loss": 0.1155, + "num_input_tokens_seen": 391768, + "step": 2060 + }, + { + "epoch": 1.0732848232848233, + "grad_norm": 0.3590862452983856, + "learning_rate": 4.999193605927848e-05, + "loss": 0.091, + "num_input_tokens_seen": 392664, + "step": 2065 + }, + { + "epoch": 1.0758835758835759, + "grad_norm": 0.9155418872833252, + "learning_rate": 4.9991349811685874e-05, + "loss": 0.122, + "num_input_tokens_seen": 393592, + "step": 2070 + }, + { + "epoch": 1.0784823284823284, + "grad_norm": 4.971704006195068, + "learning_rate": 4.999074299882953e-05, + "loss": 0.1053, + "num_input_tokens_seen": 394520, + "step": 2075 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 8.07427978515625, + "learning_rate": 4.999011562120879e-05, + "loss": 0.1542, + "num_input_tokens_seen": 395512, + "step": 2080 + }, + { + "epoch": 1.0836798336798337, + "grad_norm": 0.12560723721981049, + "learning_rate": 4.9989467679339915e-05, + "loss": 0.0153, + "num_input_tokens_seen": 396440, + "step": 2085 + }, + { + "epoch": 1.0862785862785862, + "grad_norm": 3.2973124980926514, + "learning_rate": 4.99887991737561e-05, + "loss": 0.1366, + "num_input_tokens_seen": 397304, + "step": 2090 + }, + { + "epoch": 1.088877338877339, + "grad_norm": 12.760185241699219, + "learning_rate": 4.9988110105007444e-05, + "loss": 0.3899, + "num_input_tokens_seen": 398296, + "step": 2095 + }, + { + "epoch": 1.0914760914760915, + "grad_norm": 8.680214881896973, + "learning_rate": 4.9987400473661e-05, + "loss": 0.2413, + "num_input_tokens_seen": 399384, + "step": 2100 + }, + { + "epoch": 1.094074844074844, + "grad_norm": 3.300733804702759, + "learning_rate": 4.998667028030071e-05, + "loss": 0.2108, + "num_input_tokens_seen": 400376, + "step": 2105 + }, + { + "epoch": 1.0966735966735968, + "grad_norm": 0.5848206877708435, + "learning_rate": 4.9985919525527434e-05, + "loss": 0.1024, + "num_input_tokens_seen": 401336, + "step": 2110 + }, + { + "epoch": 1.0992723492723493, + "grad_norm": 0.5929356813430786, + "learning_rate": 4.998514820995898e-05, + "loss": 0.098, + "num_input_tokens_seen": 402328, + "step": 2115 + }, + { + "epoch": 1.1018711018711018, + "grad_norm": 0.16337716579437256, + "learning_rate": 4.9984356334230055e-05, + "loss": 0.1037, + "num_input_tokens_seen": 403224, + "step": 2120 + }, + { + "epoch": 1.1044698544698546, + "grad_norm": 0.3427627980709076, + "learning_rate": 4.9983543898992284e-05, + "loss": 0.1094, + "num_input_tokens_seen": 404152, + "step": 2125 + }, + { + "epoch": 1.107068607068607, + "grad_norm": 0.11696132272481918, + "learning_rate": 4.9982710904914224e-05, + "loss": 0.1402, + "num_input_tokens_seen": 405080, + "step": 2130 + }, + { + "epoch": 1.1096673596673596, + "grad_norm": 0.34097060561180115, + "learning_rate": 4.998185735268135e-05, + "loss": 0.0281, + "num_input_tokens_seen": 406072, + "step": 2135 + }, + { + "epoch": 1.1122661122661124, + "grad_norm": 0.05038857460021973, + "learning_rate": 4.998098324299603e-05, + "loss": 0.1931, + "num_input_tokens_seen": 407096, + "step": 2140 + }, + { + "epoch": 1.114864864864865, + "grad_norm": 0.5693551301956177, + "learning_rate": 4.998008857657756e-05, + "loss": 0.1994, + "num_input_tokens_seen": 408056, + "step": 2145 + }, + { + "epoch": 1.1174636174636174, + "grad_norm": 0.7331157326698303, + "learning_rate": 4.997917335416218e-05, + "loss": 0.0529, + "num_input_tokens_seen": 408984, + "step": 2150 + }, + { + "epoch": 1.12006237006237, + "grad_norm": 8.841506958007812, + "learning_rate": 4.997823757650301e-05, + "loss": 0.1108, + "num_input_tokens_seen": 409944, + "step": 2155 + }, + { + "epoch": 1.1226611226611227, + "grad_norm": 1.6888242959976196, + "learning_rate": 4.997728124437009e-05, + "loss": 0.2188, + "num_input_tokens_seen": 410968, + "step": 2160 + }, + { + "epoch": 1.1252598752598753, + "grad_norm": 5.692882537841797, + "learning_rate": 4.9976304358550384e-05, + "loss": 0.2045, + "num_input_tokens_seen": 411864, + "step": 2165 + }, + { + "epoch": 1.1278586278586278, + "grad_norm": 2.4053163528442383, + "learning_rate": 4.9975306919847774e-05, + "loss": 0.0964, + "num_input_tokens_seen": 412856, + "step": 2170 + }, + { + "epoch": 1.1304573804573805, + "grad_norm": 12.006715774536133, + "learning_rate": 4.997428892908305e-05, + "loss": 0.0742, + "num_input_tokens_seen": 413816, + "step": 2175 + }, + { + "epoch": 1.133056133056133, + "grad_norm": 0.07755673676729202, + "learning_rate": 4.997325038709391e-05, + "loss": 0.1543, + "num_input_tokens_seen": 414744, + "step": 2180 + }, + { + "epoch": 1.1356548856548856, + "grad_norm": 3.3030519485473633, + "learning_rate": 4.997219129473495e-05, + "loss": 0.1565, + "num_input_tokens_seen": 415704, + "step": 2185 + }, + { + "epoch": 1.1382536382536383, + "grad_norm": 3.7185351848602295, + "learning_rate": 4.9971111652877705e-05, + "loss": 0.0297, + "num_input_tokens_seen": 416760, + "step": 2190 + }, + { + "epoch": 1.1408523908523909, + "grad_norm": 11.81783390045166, + "learning_rate": 4.99700114624106e-05, + "loss": 0.1495, + "num_input_tokens_seen": 417688, + "step": 2195 + }, + { + "epoch": 1.1434511434511434, + "grad_norm": 12.580696105957031, + "learning_rate": 4.9968890724238996e-05, + "loss": 0.299, + "num_input_tokens_seen": 418648, + "step": 2200 + }, + { + "epoch": 1.1460498960498962, + "grad_norm": 0.9728527665138245, + "learning_rate": 4.996774943928513e-05, + "loss": 0.0191, + "num_input_tokens_seen": 419640, + "step": 2205 + }, + { + "epoch": 1.1486486486486487, + "grad_norm": 0.1705656349658966, + "learning_rate": 4.996658760848815e-05, + "loss": 0.0748, + "num_input_tokens_seen": 420600, + "step": 2210 + }, + { + "epoch": 1.1512474012474012, + "grad_norm": 10.267090797424316, + "learning_rate": 4.996540523280413e-05, + "loss": 0.2653, + "num_input_tokens_seen": 421464, + "step": 2215 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.454512357711792, + "learning_rate": 4.996420231320604e-05, + "loss": 0.1524, + "num_input_tokens_seen": 422488, + "step": 2220 + }, + { + "epoch": 1.1564449064449065, + "grad_norm": 0.11541032791137695, + "learning_rate": 4.996297885068376e-05, + "loss": 0.1115, + "num_input_tokens_seen": 423480, + "step": 2225 + }, + { + "epoch": 1.159043659043659, + "grad_norm": 12.748964309692383, + "learning_rate": 4.996173484624408e-05, + "loss": 0.0232, + "num_input_tokens_seen": 424376, + "step": 2230 + }, + { + "epoch": 1.1616424116424116, + "grad_norm": 2.4986157417297363, + "learning_rate": 4.9960470300910665e-05, + "loss": 0.2456, + "num_input_tokens_seen": 425368, + "step": 2235 + }, + { + "epoch": 1.1642411642411643, + "grad_norm": 0.1304699331521988, + "learning_rate": 4.995918521572411e-05, + "loss": 0.0288, + "num_input_tokens_seen": 426296, + "step": 2240 + }, + { + "epoch": 1.1668399168399168, + "grad_norm": 3.047248125076294, + "learning_rate": 4.995787959174192e-05, + "loss": 0.2563, + "num_input_tokens_seen": 427256, + "step": 2245 + }, + { + "epoch": 1.1694386694386694, + "grad_norm": 3.289433240890503, + "learning_rate": 4.995655343003847e-05, + "loss": 0.1166, + "num_input_tokens_seen": 428216, + "step": 2250 + }, + { + "epoch": 1.1720374220374221, + "grad_norm": 1.6936733722686768, + "learning_rate": 4.995520673170506e-05, + "loss": 0.2159, + "num_input_tokens_seen": 429208, + "step": 2255 + }, + { + "epoch": 1.1746361746361746, + "grad_norm": 0.5056337118148804, + "learning_rate": 4.9953839497849886e-05, + "loss": 0.0273, + "num_input_tokens_seen": 430168, + "step": 2260 + }, + { + "epoch": 1.1772349272349272, + "grad_norm": 0.21145427227020264, + "learning_rate": 4.995245172959802e-05, + "loss": 0.1883, + "num_input_tokens_seen": 431064, + "step": 2265 + }, + { + "epoch": 1.17983367983368, + "grad_norm": 1.7288169860839844, + "learning_rate": 4.995104342809147e-05, + "loss": 0.1045, + "num_input_tokens_seen": 431992, + "step": 2270 + }, + { + "epoch": 1.1824324324324325, + "grad_norm": 2.599848985671997, + "learning_rate": 4.994961459448911e-05, + "loss": 0.1702, + "num_input_tokens_seen": 432984, + "step": 2275 + }, + { + "epoch": 1.185031185031185, + "grad_norm": 0.12377243489027023, + "learning_rate": 4.994816522996672e-05, + "loss": 0.1086, + "num_input_tokens_seen": 433848, + "step": 2280 + }, + { + "epoch": 1.1876299376299375, + "grad_norm": 0.6522893905639648, + "learning_rate": 4.994669533571699e-05, + "loss": 0.013, + "num_input_tokens_seen": 434776, + "step": 2285 + }, + { + "epoch": 1.1902286902286903, + "grad_norm": 0.24099931120872498, + "learning_rate": 4.994520491294947e-05, + "loss": 0.162, + "num_input_tokens_seen": 435704, + "step": 2290 + }, + { + "epoch": 1.1928274428274428, + "grad_norm": 4.567376136779785, + "learning_rate": 4.994369396289063e-05, + "loss": 0.1281, + "num_input_tokens_seen": 436632, + "step": 2295 + }, + { + "epoch": 1.1954261954261955, + "grad_norm": 7.695158004760742, + "learning_rate": 4.9942162486783825e-05, + "loss": 0.0889, + "num_input_tokens_seen": 437592, + "step": 2300 + }, + { + "epoch": 1.198024948024948, + "grad_norm": 0.16209858655929565, + "learning_rate": 4.994061048588929e-05, + "loss": 0.2365, + "num_input_tokens_seen": 438648, + "step": 2305 + }, + { + "epoch": 1.2006237006237006, + "grad_norm": 0.23352453112602234, + "learning_rate": 4.993903796148418e-05, + "loss": 0.1139, + "num_input_tokens_seen": 439576, + "step": 2310 + }, + { + "epoch": 1.2032224532224531, + "grad_norm": 4.344804763793945, + "learning_rate": 4.99374449148625e-05, + "loss": 0.0256, + "num_input_tokens_seen": 440504, + "step": 2315 + }, + { + "epoch": 1.2058212058212059, + "grad_norm": 3.9432373046875, + "learning_rate": 4.993583134733516e-05, + "loss": 0.1694, + "num_input_tokens_seen": 441432, + "step": 2320 + }, + { + "epoch": 1.2084199584199584, + "grad_norm": 0.4991118907928467, + "learning_rate": 4.993419726022997e-05, + "loss": 0.1373, + "num_input_tokens_seen": 442424, + "step": 2325 + }, + { + "epoch": 1.211018711018711, + "grad_norm": 1.4793727397918701, + "learning_rate": 4.993254265489159e-05, + "loss": 0.3125, + "num_input_tokens_seen": 443352, + "step": 2330 + }, + { + "epoch": 1.2136174636174637, + "grad_norm": 2.992570161819458, + "learning_rate": 4.9930867532681615e-05, + "loss": 0.3337, + "num_input_tokens_seen": 444312, + "step": 2335 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.21057476103305817, + "learning_rate": 4.992917189497848e-05, + "loss": 0.2016, + "num_input_tokens_seen": 445304, + "step": 2340 + }, + { + "epoch": 1.2188149688149688, + "grad_norm": 7.163358688354492, + "learning_rate": 4.9927455743177515e-05, + "loss": 0.1943, + "num_input_tokens_seen": 446264, + "step": 2345 + }, + { + "epoch": 1.2214137214137215, + "grad_norm": 0.2587575316429138, + "learning_rate": 4.9925719078690934e-05, + "loss": 0.101, + "num_input_tokens_seen": 447192, + "step": 2350 + }, + { + "epoch": 1.224012474012474, + "grad_norm": 9.924954414367676, + "learning_rate": 4.992396190294785e-05, + "loss": 0.1883, + "num_input_tokens_seen": 448248, + "step": 2355 + }, + { + "epoch": 1.2266112266112266, + "grad_norm": 0.4764508903026581, + "learning_rate": 4.99221842173942e-05, + "loss": 0.1581, + "num_input_tokens_seen": 449144, + "step": 2360 + }, + { + "epoch": 1.2292099792099793, + "grad_norm": 2.083777666091919, + "learning_rate": 4.992038602349286e-05, + "loss": 0.0764, + "num_input_tokens_seen": 450040, + "step": 2365 + }, + { + "epoch": 1.2318087318087318, + "grad_norm": 6.4147233963012695, + "learning_rate": 4.991856732272354e-05, + "loss": 0.1621, + "num_input_tokens_seen": 451064, + "step": 2370 + }, + { + "epoch": 1.2344074844074844, + "grad_norm": 0.41417691111564636, + "learning_rate": 4.9916728116582856e-05, + "loss": 0.1532, + "num_input_tokens_seen": 452024, + "step": 2375 + }, + { + "epoch": 1.237006237006237, + "grad_norm": 1.8500008583068848, + "learning_rate": 4.991486840658427e-05, + "loss": 0.0993, + "num_input_tokens_seen": 452952, + "step": 2380 + }, + { + "epoch": 1.2396049896049897, + "grad_norm": 2.9168806076049805, + "learning_rate": 4.9912988194258125e-05, + "loss": 0.0753, + "num_input_tokens_seen": 454008, + "step": 2385 + }, + { + "epoch": 1.2422037422037422, + "grad_norm": 0.8565680384635925, + "learning_rate": 4.991108748115165e-05, + "loss": 0.0943, + "num_input_tokens_seen": 454968, + "step": 2390 + }, + { + "epoch": 1.2448024948024947, + "grad_norm": 13.560432434082031, + "learning_rate": 4.990916626882893e-05, + "loss": 0.0637, + "num_input_tokens_seen": 455896, + "step": 2395 + }, + { + "epoch": 1.2474012474012475, + "grad_norm": 0.12258061021566391, + "learning_rate": 4.990722455887091e-05, + "loss": 0.1847, + "num_input_tokens_seen": 456760, + "step": 2400 + }, + { + "epoch": 1.25, + "grad_norm": 0.041579850018024445, + "learning_rate": 4.990526235287544e-05, + "loss": 0.0612, + "num_input_tokens_seen": 457752, + "step": 2405 + }, + { + "epoch": 1.2525987525987525, + "grad_norm": 1.1942895650863647, + "learning_rate": 4.9903279652457177e-05, + "loss": 0.4186, + "num_input_tokens_seen": 458744, + "step": 2410 + }, + { + "epoch": 1.255197505197505, + "grad_norm": 0.6040869951248169, + "learning_rate": 4.99012764592477e-05, + "loss": 0.0943, + "num_input_tokens_seen": 459640, + "step": 2415 + }, + { + "epoch": 1.2577962577962578, + "grad_norm": 2.5543293952941895, + "learning_rate": 4.989925277489542e-05, + "loss": 0.4007, + "num_input_tokens_seen": 460536, + "step": 2420 + }, + { + "epoch": 1.2603950103950103, + "grad_norm": 0.02161867544054985, + "learning_rate": 4.9897208601065614e-05, + "loss": 0.048, + "num_input_tokens_seen": 461464, + "step": 2425 + }, + { + "epoch": 1.262993762993763, + "grad_norm": 5.1713128089904785, + "learning_rate": 4.9895143939440434e-05, + "loss": 0.4015, + "num_input_tokens_seen": 462392, + "step": 2430 + }, + { + "epoch": 1.2655925155925156, + "grad_norm": 0.21651725471019745, + "learning_rate": 4.989305879171886e-05, + "loss": 0.061, + "num_input_tokens_seen": 463256, + "step": 2435 + }, + { + "epoch": 1.2681912681912682, + "grad_norm": 0.19471856951713562, + "learning_rate": 4.989095315961677e-05, + "loss": 0.0874, + "num_input_tokens_seen": 464184, + "step": 2440 + }, + { + "epoch": 1.2707900207900207, + "grad_norm": 5.337538719177246, + "learning_rate": 4.988882704486687e-05, + "loss": 0.0984, + "num_input_tokens_seen": 465112, + "step": 2445 + }, + { + "epoch": 1.2733887733887734, + "grad_norm": 2.197202682495117, + "learning_rate": 4.988668044921872e-05, + "loss": 0.1697, + "num_input_tokens_seen": 466040, + "step": 2450 + }, + { + "epoch": 1.275987525987526, + "grad_norm": 5.471217632293701, + "learning_rate": 4.988451337443877e-05, + "loss": 0.1748, + "num_input_tokens_seen": 467000, + "step": 2455 + }, + { + "epoch": 1.2785862785862787, + "grad_norm": 0.6844409704208374, + "learning_rate": 4.9882325822310275e-05, + "loss": 0.2569, + "num_input_tokens_seen": 467896, + "step": 2460 + }, + { + "epoch": 1.2811850311850312, + "grad_norm": 5.9083404541015625, + "learning_rate": 4.9880117794633365e-05, + "loss": 0.0994, + "num_input_tokens_seen": 468824, + "step": 2465 + }, + { + "epoch": 1.2837837837837838, + "grad_norm": 0.07726985216140747, + "learning_rate": 4.9877889293225014e-05, + "loss": 0.1316, + "num_input_tokens_seen": 469752, + "step": 2470 + }, + { + "epoch": 1.2863825363825363, + "grad_norm": 5.619214057922363, + "learning_rate": 4.987564031991905e-05, + "loss": 0.1738, + "num_input_tokens_seen": 470680, + "step": 2475 + }, + { + "epoch": 1.288981288981289, + "grad_norm": 5.460494041442871, + "learning_rate": 4.987337087656614e-05, + "loss": 0.2672, + "num_input_tokens_seen": 471576, + "step": 2480 + }, + { + "epoch": 1.2915800415800416, + "grad_norm": 7.81404972076416, + "learning_rate": 4.98710809650338e-05, + "loss": 0.1718, + "num_input_tokens_seen": 472504, + "step": 2485 + }, + { + "epoch": 1.2941787941787941, + "grad_norm": 7.6812920570373535, + "learning_rate": 4.9868770587206394e-05, + "loss": 0.0992, + "num_input_tokens_seen": 473560, + "step": 2490 + }, + { + "epoch": 1.2967775467775469, + "grad_norm": 4.5357346534729, + "learning_rate": 4.98664397449851e-05, + "loss": 0.0714, + "num_input_tokens_seen": 474488, + "step": 2495 + }, + { + "epoch": 1.2993762993762994, + "grad_norm": 1.793876051902771, + "learning_rate": 4.986408844028797e-05, + "loss": 0.2289, + "num_input_tokens_seen": 475448, + "step": 2500 + }, + { + "epoch": 1.301975051975052, + "grad_norm": 4.0109124183654785, + "learning_rate": 4.986171667504989e-05, + "loss": 0.2399, + "num_input_tokens_seen": 476440, + "step": 2505 + }, + { + "epoch": 1.3045738045738045, + "grad_norm": 0.8392947912216187, + "learning_rate": 4.985932445122257e-05, + "loss": 0.0871, + "num_input_tokens_seen": 477368, + "step": 2510 + }, + { + "epoch": 1.3071725571725572, + "grad_norm": 0.1802162230014801, + "learning_rate": 4.985691177077454e-05, + "loss": 0.0555, + "num_input_tokens_seen": 478360, + "step": 2515 + }, + { + "epoch": 1.3097713097713097, + "grad_norm": 0.09898947179317474, + "learning_rate": 4.9854478635691215e-05, + "loss": 0.0863, + "num_input_tokens_seen": 479288, + "step": 2520 + }, + { + "epoch": 1.3123700623700625, + "grad_norm": 0.3707248270511627, + "learning_rate": 4.985202504797478e-05, + "loss": 0.0239, + "num_input_tokens_seen": 480216, + "step": 2525 + }, + { + "epoch": 1.314968814968815, + "grad_norm": 2.2293717861175537, + "learning_rate": 4.984955100964431e-05, + "loss": 0.2173, + "num_input_tokens_seen": 481208, + "step": 2530 + }, + { + "epoch": 1.3175675675675675, + "grad_norm": 0.1923990100622177, + "learning_rate": 4.9847056522735655e-05, + "loss": 0.0135, + "num_input_tokens_seen": 482136, + "step": 2535 + }, + { + "epoch": 1.32016632016632, + "grad_norm": 6.844296932220459, + "learning_rate": 4.984454158930153e-05, + "loss": 0.1841, + "num_input_tokens_seen": 483064, + "step": 2540 + }, + { + "epoch": 1.3227650727650728, + "grad_norm": 0.09884180873632431, + "learning_rate": 4.984200621141145e-05, + "loss": 0.2016, + "num_input_tokens_seen": 484056, + "step": 2545 + }, + { + "epoch": 1.3253638253638254, + "grad_norm": 7.6985249519348145, + "learning_rate": 4.9839450391151785e-05, + "loss": 0.2717, + "num_input_tokens_seen": 484984, + "step": 2550 + }, + { + "epoch": 1.3279625779625779, + "grad_norm": 1.0604478120803833, + "learning_rate": 4.983687413062569e-05, + "loss": 0.1325, + "num_input_tokens_seen": 485912, + "step": 2555 + }, + { + "epoch": 1.3305613305613306, + "grad_norm": 0.19466906785964966, + "learning_rate": 4.983427743195317e-05, + "loss": 0.1054, + "num_input_tokens_seen": 486872, + "step": 2560 + }, + { + "epoch": 1.3331600831600832, + "grad_norm": 11.625582695007324, + "learning_rate": 4.983166029727102e-05, + "loss": 0.3506, + "num_input_tokens_seen": 487800, + "step": 2565 + }, + { + "epoch": 1.3357588357588357, + "grad_norm": 3.7489709854125977, + "learning_rate": 4.9829022728732884e-05, + "loss": 0.1569, + "num_input_tokens_seen": 488760, + "step": 2570 + }, + { + "epoch": 1.3383575883575882, + "grad_norm": 0.18098382651805878, + "learning_rate": 4.9826364728509195e-05, + "loss": 0.1518, + "num_input_tokens_seen": 489720, + "step": 2575 + }, + { + "epoch": 1.340956340956341, + "grad_norm": 2.1386280059814453, + "learning_rate": 4.982368629878722e-05, + "loss": 0.1502, + "num_input_tokens_seen": 490616, + "step": 2580 + }, + { + "epoch": 1.3435550935550935, + "grad_norm": 0.46780478954315186, + "learning_rate": 4.9820987441771e-05, + "loss": 0.112, + "num_input_tokens_seen": 491576, + "step": 2585 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.20615607500076294, + "learning_rate": 4.981826815968145e-05, + "loss": 0.0651, + "num_input_tokens_seen": 492536, + "step": 2590 + }, + { + "epoch": 1.3487525987525988, + "grad_norm": 0.0680459514260292, + "learning_rate": 4.981552845475622e-05, + "loss": 0.0513, + "num_input_tokens_seen": 493464, + "step": 2595 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.5723167061805725, + "learning_rate": 4.981276832924982e-05, + "loss": 0.1845, + "num_input_tokens_seen": 494392, + "step": 2600 + }, + { + "epoch": 1.3539501039501038, + "grad_norm": 6.534383296966553, + "learning_rate": 4.9809987785433544e-05, + "loss": 0.3353, + "num_input_tokens_seen": 495352, + "step": 2605 + }, + { + "epoch": 1.3565488565488566, + "grad_norm": 2.602442979812622, + "learning_rate": 4.980718682559547e-05, + "loss": 0.1375, + "num_input_tokens_seen": 496344, + "step": 2610 + }, + { + "epoch": 1.3591476091476091, + "grad_norm": 6.3038129806518555, + "learning_rate": 4.9804365452040516e-05, + "loss": 0.2393, + "num_input_tokens_seen": 497336, + "step": 2615 + }, + { + "epoch": 1.3617463617463619, + "grad_norm": 4.734416484832764, + "learning_rate": 4.980152366709037e-05, + "loss": 0.1114, + "num_input_tokens_seen": 498328, + "step": 2620 + }, + { + "epoch": 1.3643451143451144, + "grad_norm": 7.0635986328125, + "learning_rate": 4.979866147308352e-05, + "loss": 0.0854, + "num_input_tokens_seen": 499224, + "step": 2625 + }, + { + "epoch": 1.366943866943867, + "grad_norm": 0.2856924533843994, + "learning_rate": 4.979577887237525e-05, + "loss": 0.2556, + "num_input_tokens_seen": 500152, + "step": 2630 + }, + { + "epoch": 1.3695426195426195, + "grad_norm": 0.329127699136734, + "learning_rate": 4.979287586733765e-05, + "loss": 0.2595, + "num_input_tokens_seen": 501112, + "step": 2635 + }, + { + "epoch": 1.3721413721413722, + "grad_norm": 0.3200262784957886, + "learning_rate": 4.978995246035958e-05, + "loss": 0.2719, + "num_input_tokens_seen": 502072, + "step": 2640 + }, + { + "epoch": 1.3747401247401247, + "grad_norm": 6.850763320922852, + "learning_rate": 4.97870086538467e-05, + "loss": 0.1442, + "num_input_tokens_seen": 503096, + "step": 2645 + }, + { + "epoch": 1.3773388773388773, + "grad_norm": 0.18471364676952362, + "learning_rate": 4.9784044450221454e-05, + "loss": 0.1061, + "num_input_tokens_seen": 504056, + "step": 2650 + }, + { + "epoch": 1.37993762993763, + "grad_norm": 2.2016282081604004, + "learning_rate": 4.978105985192306e-05, + "loss": 0.1321, + "num_input_tokens_seen": 504984, + "step": 2655 + }, + { + "epoch": 1.3825363825363826, + "grad_norm": 0.7327699065208435, + "learning_rate": 4.9778054861407555e-05, + "loss": 0.1462, + "num_input_tokens_seen": 505912, + "step": 2660 + }, + { + "epoch": 1.385135135135135, + "grad_norm": 3.9842629432678223, + "learning_rate": 4.977502948114772e-05, + "loss": 0.0473, + "num_input_tokens_seen": 506808, + "step": 2665 + }, + { + "epoch": 1.3877338877338876, + "grad_norm": 0.3201759159564972, + "learning_rate": 4.977198371363311e-05, + "loss": 0.1714, + "num_input_tokens_seen": 507736, + "step": 2670 + }, + { + "epoch": 1.3903326403326404, + "grad_norm": 3.3773438930511475, + "learning_rate": 4.9768917561370093e-05, + "loss": 0.0394, + "num_input_tokens_seen": 508664, + "step": 2675 + }, + { + "epoch": 1.392931392931393, + "grad_norm": 0.13943618535995483, + "learning_rate": 4.9765831026881785e-05, + "loss": 0.064, + "num_input_tokens_seen": 509688, + "step": 2680 + }, + { + "epoch": 1.3955301455301456, + "grad_norm": 0.14033262431621552, + "learning_rate": 4.9762724112708084e-05, + "loss": 0.0153, + "num_input_tokens_seen": 510648, + "step": 2685 + }, + { + "epoch": 1.3981288981288982, + "grad_norm": 0.5169442892074585, + "learning_rate": 4.975959682140564e-05, + "loss": 0.0767, + "num_input_tokens_seen": 511544, + "step": 2690 + }, + { + "epoch": 1.4007276507276507, + "grad_norm": 10.133838653564453, + "learning_rate": 4.97564491555479e-05, + "loss": 0.0702, + "num_input_tokens_seen": 512536, + "step": 2695 + }, + { + "epoch": 1.4033264033264032, + "grad_norm": 0.08114776015281677, + "learning_rate": 4.975328111772507e-05, + "loss": 0.2524, + "num_input_tokens_seen": 513528, + "step": 2700 + }, + { + "epoch": 1.405925155925156, + "grad_norm": 10.507990837097168, + "learning_rate": 4.975009271054409e-05, + "loss": 0.2004, + "num_input_tokens_seen": 514456, + "step": 2705 + }, + { + "epoch": 1.4085239085239085, + "grad_norm": 12.546305656433105, + "learning_rate": 4.974688393662872e-05, + "loss": 0.2314, + "num_input_tokens_seen": 515448, + "step": 2710 + }, + { + "epoch": 1.411122661122661, + "grad_norm": 11.22164249420166, + "learning_rate": 4.974365479861941e-05, + "loss": 0.4544, + "num_input_tokens_seen": 516408, + "step": 2715 + }, + { + "epoch": 1.4137214137214138, + "grad_norm": 0.7762060761451721, + "learning_rate": 4.974040529917342e-05, + "loss": 0.0164, + "num_input_tokens_seen": 517528, + "step": 2720 + }, + { + "epoch": 1.4163201663201663, + "grad_norm": 1.9530563354492188, + "learning_rate": 4.973713544096475e-05, + "loss": 0.0959, + "num_input_tokens_seen": 518488, + "step": 2725 + }, + { + "epoch": 1.4189189189189189, + "grad_norm": 4.4322004318237305, + "learning_rate": 4.973384522668413e-05, + "loss": 0.0214, + "num_input_tokens_seen": 519448, + "step": 2730 + }, + { + "epoch": 1.4215176715176714, + "grad_norm": 0.6432997584342957, + "learning_rate": 4.973053465903909e-05, + "loss": 0.0319, + "num_input_tokens_seen": 520408, + "step": 2735 + }, + { + "epoch": 1.4241164241164241, + "grad_norm": 0.1384652554988861, + "learning_rate": 4.9727203740753855e-05, + "loss": 0.0956, + "num_input_tokens_seen": 521336, + "step": 2740 + }, + { + "epoch": 1.4267151767151767, + "grad_norm": 11.919386863708496, + "learning_rate": 4.972385247456945e-05, + "loss": 0.1721, + "num_input_tokens_seen": 522360, + "step": 2745 + }, + { + "epoch": 1.4293139293139294, + "grad_norm": 0.22422918677330017, + "learning_rate": 4.972048086324359e-05, + "loss": 0.0476, + "num_input_tokens_seen": 523352, + "step": 2750 + }, + { + "epoch": 1.431912681912682, + "grad_norm": 0.12081655114889145, + "learning_rate": 4.9717088909550775e-05, + "loss": 0.1033, + "num_input_tokens_seen": 524184, + "step": 2755 + }, + { + "epoch": 1.4345114345114345, + "grad_norm": 12.157293319702148, + "learning_rate": 4.971367661628222e-05, + "loss": 0.2925, + "num_input_tokens_seen": 525144, + "step": 2760 + }, + { + "epoch": 1.437110187110187, + "grad_norm": 0.07584191113710403, + "learning_rate": 4.971024398624588e-05, + "loss": 0.0924, + "num_input_tokens_seen": 526168, + "step": 2765 + }, + { + "epoch": 1.4397089397089398, + "grad_norm": 1.8403583765029907, + "learning_rate": 4.970679102226646e-05, + "loss": 0.1979, + "num_input_tokens_seen": 527064, + "step": 2770 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 0.0792529284954071, + "learning_rate": 4.97033177271854e-05, + "loss": 0.0959, + "num_input_tokens_seen": 528056, + "step": 2775 + }, + { + "epoch": 1.444906444906445, + "grad_norm": 2.7480971813201904, + "learning_rate": 4.9699824103860815e-05, + "loss": 0.283, + "num_input_tokens_seen": 528984, + "step": 2780 + }, + { + "epoch": 1.4475051975051976, + "grad_norm": 4.0055742263793945, + "learning_rate": 4.9696310155167635e-05, + "loss": 0.2164, + "num_input_tokens_seen": 529944, + "step": 2785 + }, + { + "epoch": 1.45010395010395, + "grad_norm": 3.3742170333862305, + "learning_rate": 4.9692775883997456e-05, + "loss": 0.0854, + "num_input_tokens_seen": 530904, + "step": 2790 + }, + { + "epoch": 1.4527027027027026, + "grad_norm": 2.847804546356201, + "learning_rate": 4.9689221293258605e-05, + "loss": 0.2415, + "num_input_tokens_seen": 531864, + "step": 2795 + }, + { + "epoch": 1.4553014553014554, + "grad_norm": 0.9062843918800354, + "learning_rate": 4.968564638587615e-05, + "loss": 0.0427, + "num_input_tokens_seen": 532792, + "step": 2800 + }, + { + "epoch": 1.457900207900208, + "grad_norm": 12.696125030517578, + "learning_rate": 4.9682051164791855e-05, + "loss": 0.0579, + "num_input_tokens_seen": 533784, + "step": 2805 + }, + { + "epoch": 1.4604989604989604, + "grad_norm": 6.7022271156311035, + "learning_rate": 4.967843563296422e-05, + "loss": 0.2133, + "num_input_tokens_seen": 534712, + "step": 2810 + }, + { + "epoch": 1.4630977130977132, + "grad_norm": 3.2481203079223633, + "learning_rate": 4.967479979336844e-05, + "loss": 0.324, + "num_input_tokens_seen": 535640, + "step": 2815 + }, + { + "epoch": 1.4656964656964657, + "grad_norm": 5.454586029052734, + "learning_rate": 4.9671143648996445e-05, + "loss": 0.0848, + "num_input_tokens_seen": 536600, + "step": 2820 + }, + { + "epoch": 1.4682952182952183, + "grad_norm": 0.3649411201477051, + "learning_rate": 4.9667467202856844e-05, + "loss": 0.1519, + "num_input_tokens_seen": 537464, + "step": 2825 + }, + { + "epoch": 1.4708939708939708, + "grad_norm": 1.1293423175811768, + "learning_rate": 4.966377045797498e-05, + "loss": 0.127, + "num_input_tokens_seen": 538392, + "step": 2830 + }, + { + "epoch": 1.4734927234927235, + "grad_norm": 0.2086305171251297, + "learning_rate": 4.9660053417392866e-05, + "loss": 0.2306, + "num_input_tokens_seen": 539288, + "step": 2835 + }, + { + "epoch": 1.476091476091476, + "grad_norm": 9.138508796691895, + "learning_rate": 4.9656316084169255e-05, + "loss": 0.3247, + "num_input_tokens_seen": 540248, + "step": 2840 + }, + { + "epoch": 1.4786902286902288, + "grad_norm": 0.9677737355232239, + "learning_rate": 4.965255846137958e-05, + "loss": 0.2584, + "num_input_tokens_seen": 541176, + "step": 2845 + }, + { + "epoch": 1.4812889812889813, + "grad_norm": 1.9537217617034912, + "learning_rate": 4.964878055211597e-05, + "loss": 0.1768, + "num_input_tokens_seen": 542104, + "step": 2850 + }, + { + "epoch": 1.4838877338877339, + "grad_norm": 2.4597880840301514, + "learning_rate": 4.9644982359487234e-05, + "loss": 0.1413, + "num_input_tokens_seen": 543064, + "step": 2855 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 3.3736534118652344, + "learning_rate": 4.964116388661891e-05, + "loss": 0.231, + "num_input_tokens_seen": 543992, + "step": 2860 + }, + { + "epoch": 1.4890852390852392, + "grad_norm": 6.532998085021973, + "learning_rate": 4.963732513665319e-05, + "loss": 0.0815, + "num_input_tokens_seen": 544920, + "step": 2865 + }, + { + "epoch": 1.4916839916839917, + "grad_norm": 0.6306436657905579, + "learning_rate": 4.963346611274896e-05, + "loss": 0.158, + "num_input_tokens_seen": 545848, + "step": 2870 + }, + { + "epoch": 1.4942827442827442, + "grad_norm": 5.479037284851074, + "learning_rate": 4.96295868180818e-05, + "loss": 0.1888, + "num_input_tokens_seen": 546776, + "step": 2875 + }, + { + "epoch": 1.496881496881497, + "grad_norm": 0.14485441148281097, + "learning_rate": 4.962568725584395e-05, + "loss": 0.1682, + "num_input_tokens_seen": 547672, + "step": 2880 + }, + { + "epoch": 1.4994802494802495, + "grad_norm": 1.912073016166687, + "learning_rate": 4.962176742924436e-05, + "loss": 0.1907, + "num_input_tokens_seen": 548632, + "step": 2885 + }, + { + "epoch": 1.5, + "eval_loss": 0.14812766015529633, + "eval_runtime": 9.2836, + "eval_samples_per_second": 92.205, + "eval_steps_per_second": 23.051, + "num_input_tokens_seen": 548856, + "step": 2886 + }, + { + "epoch": 1.502079002079002, + "grad_norm": 3.057159662246704, + "learning_rate": 4.961782734150862e-05, + "loss": 0.1793, + "num_input_tokens_seen": 549560, + "step": 2890 + }, + { + "epoch": 1.5046777546777546, + "grad_norm": 0.11375726014375687, + "learning_rate": 4.961386699587902e-05, + "loss": 0.0179, + "num_input_tokens_seen": 550584, + "step": 2895 + }, + { + "epoch": 1.5072765072765073, + "grad_norm": 14.288124084472656, + "learning_rate": 4.96098863956145e-05, + "loss": 0.0762, + "num_input_tokens_seen": 551608, + "step": 2900 + }, + { + "epoch": 1.5098752598752598, + "grad_norm": 2.3569605350494385, + "learning_rate": 4.960588554399069e-05, + "loss": 0.2372, + "num_input_tokens_seen": 552536, + "step": 2905 + }, + { + "epoch": 1.5124740124740126, + "grad_norm": 0.04723771661520004, + "learning_rate": 4.9601864444299875e-05, + "loss": 0.2435, + "num_input_tokens_seen": 553496, + "step": 2910 + }, + { + "epoch": 1.5150727650727651, + "grad_norm": 0.162594273686409, + "learning_rate": 4.959782309985098e-05, + "loss": 0.0218, + "num_input_tokens_seen": 554392, + "step": 2915 + }, + { + "epoch": 1.5176715176715176, + "grad_norm": 10.015912055969238, + "learning_rate": 4.959376151396962e-05, + "loss": 0.1946, + "num_input_tokens_seen": 555384, + "step": 2920 + }, + { + "epoch": 1.5202702702702702, + "grad_norm": 1.9031535387039185, + "learning_rate": 4.9589679689998046e-05, + "loss": 0.1657, + "num_input_tokens_seen": 556312, + "step": 2925 + }, + { + "epoch": 1.5228690228690227, + "grad_norm": 0.585745632648468, + "learning_rate": 4.9585577631295186e-05, + "loss": 0.3456, + "num_input_tokens_seen": 557272, + "step": 2930 + }, + { + "epoch": 1.5254677754677755, + "grad_norm": 0.07436993718147278, + "learning_rate": 4.958145534123659e-05, + "loss": 0.0722, + "num_input_tokens_seen": 558168, + "step": 2935 + }, + { + "epoch": 1.5280665280665282, + "grad_norm": 0.10185002535581589, + "learning_rate": 4.957731282321449e-05, + "loss": 0.0373, + "num_input_tokens_seen": 559128, + "step": 2940 + }, + { + "epoch": 1.5306652806652807, + "grad_norm": 1.342566728591919, + "learning_rate": 4.957315008063773e-05, + "loss": 0.0583, + "num_input_tokens_seen": 560056, + "step": 2945 + }, + { + "epoch": 1.5332640332640333, + "grad_norm": 10.525630950927734, + "learning_rate": 4.956896711693181e-05, + "loss": 0.138, + "num_input_tokens_seen": 560984, + "step": 2950 + }, + { + "epoch": 1.5358627858627858, + "grad_norm": 0.12478490173816681, + "learning_rate": 4.956476393553887e-05, + "loss": 0.1325, + "num_input_tokens_seen": 561880, + "step": 2955 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.10444329679012299, + "learning_rate": 4.9560540539917697e-05, + "loss": 0.3955, + "num_input_tokens_seen": 562904, + "step": 2960 + }, + { + "epoch": 1.541060291060291, + "grad_norm": 0.062495406717061996, + "learning_rate": 4.95562969335437e-05, + "loss": 0.0596, + "num_input_tokens_seen": 563896, + "step": 2965 + }, + { + "epoch": 1.5436590436590436, + "grad_norm": 0.1310296356678009, + "learning_rate": 4.9552033119908924e-05, + "loss": 0.136, + "num_input_tokens_seen": 564760, + "step": 2970 + }, + { + "epoch": 1.5462577962577964, + "grad_norm": 4.02937650680542, + "learning_rate": 4.954774910252204e-05, + "loss": 0.1321, + "num_input_tokens_seen": 565688, + "step": 2975 + }, + { + "epoch": 1.5488565488565489, + "grad_norm": 6.759334087371826, + "learning_rate": 4.954344488490834e-05, + "loss": 0.2449, + "num_input_tokens_seen": 566648, + "step": 2980 + }, + { + "epoch": 1.5514553014553014, + "grad_norm": 2.1862399578094482, + "learning_rate": 4.953912047060976e-05, + "loss": 0.2099, + "num_input_tokens_seen": 567576, + "step": 2985 + }, + { + "epoch": 1.554054054054054, + "grad_norm": 12.51477336883545, + "learning_rate": 4.953477586318482e-05, + "loss": 0.0722, + "num_input_tokens_seen": 568568, + "step": 2990 + }, + { + "epoch": 1.5566528066528067, + "grad_norm": 1.179866909980774, + "learning_rate": 4.953041106620869e-05, + "loss": 0.0791, + "num_input_tokens_seen": 569560, + "step": 2995 + }, + { + "epoch": 1.5592515592515592, + "grad_norm": 11.733736038208008, + "learning_rate": 4.952602608327313e-05, + "loss": 0.1335, + "num_input_tokens_seen": 570552, + "step": 3000 + }, + { + "epoch": 1.561850311850312, + "grad_norm": 4.526607036590576, + "learning_rate": 4.952162091798653e-05, + "loss": 0.1409, + "num_input_tokens_seen": 571544, + "step": 3005 + }, + { + "epoch": 1.5644490644490645, + "grad_norm": 0.10873514413833618, + "learning_rate": 4.9517195573973886e-05, + "loss": 0.1177, + "num_input_tokens_seen": 572408, + "step": 3010 + }, + { + "epoch": 1.567047817047817, + "grad_norm": 0.12282353639602661, + "learning_rate": 4.9512750054876786e-05, + "loss": 0.1419, + "num_input_tokens_seen": 573400, + "step": 3015 + }, + { + "epoch": 1.5696465696465696, + "grad_norm": 0.21290786564350128, + "learning_rate": 4.9508284364353416e-05, + "loss": 0.0875, + "num_input_tokens_seen": 574392, + "step": 3020 + }, + { + "epoch": 1.572245322245322, + "grad_norm": 1.6710559129714966, + "learning_rate": 4.950379850607859e-05, + "loss": 0.1247, + "num_input_tokens_seen": 575256, + "step": 3025 + }, + { + "epoch": 1.5748440748440748, + "grad_norm": 0.19620409607887268, + "learning_rate": 4.949929248374369e-05, + "loss": 0.1217, + "num_input_tokens_seen": 576184, + "step": 3030 + }, + { + "epoch": 1.5774428274428276, + "grad_norm": 2.675269603729248, + "learning_rate": 4.949476630105669e-05, + "loss": 0.3705, + "num_input_tokens_seen": 577080, + "step": 3035 + }, + { + "epoch": 1.5800415800415801, + "grad_norm": 0.13620440661907196, + "learning_rate": 4.949021996174219e-05, + "loss": 0.1281, + "num_input_tokens_seen": 578072, + "step": 3040 + }, + { + "epoch": 1.5826403326403327, + "grad_norm": 0.0923699364066124, + "learning_rate": 4.9485653469541335e-05, + "loss": 0.2027, + "num_input_tokens_seen": 579032, + "step": 3045 + }, + { + "epoch": 1.5852390852390852, + "grad_norm": 2.7578139305114746, + "learning_rate": 4.9481066828211865e-05, + "loss": 0.1715, + "num_input_tokens_seen": 579928, + "step": 3050 + }, + { + "epoch": 1.5878378378378377, + "grad_norm": 0.2767934501171112, + "learning_rate": 4.947646004152812e-05, + "loss": 0.0301, + "num_input_tokens_seen": 580888, + "step": 3055 + }, + { + "epoch": 1.5904365904365905, + "grad_norm": 8.004318237304688, + "learning_rate": 4.9471833113280994e-05, + "loss": 0.143, + "num_input_tokens_seen": 581816, + "step": 3060 + }, + { + "epoch": 1.593035343035343, + "grad_norm": 4.864521503448486, + "learning_rate": 4.9467186047277965e-05, + "loss": 0.2579, + "num_input_tokens_seen": 582808, + "step": 3065 + }, + { + "epoch": 1.5956340956340958, + "grad_norm": 0.15452036261558533, + "learning_rate": 4.9462518847343075e-05, + "loss": 0.1501, + "num_input_tokens_seen": 583736, + "step": 3070 + }, + { + "epoch": 1.5982328482328483, + "grad_norm": 4.936932563781738, + "learning_rate": 4.945783151731696e-05, + "loss": 0.1768, + "num_input_tokens_seen": 584696, + "step": 3075 + }, + { + "epoch": 1.6008316008316008, + "grad_norm": 0.6116353273391724, + "learning_rate": 4.9453124061056786e-05, + "loss": 0.1008, + "num_input_tokens_seen": 585656, + "step": 3080 + }, + { + "epoch": 1.6034303534303533, + "grad_norm": 1.2126575708389282, + "learning_rate": 4.94483964824363e-05, + "loss": 0.1535, + "num_input_tokens_seen": 586680, + "step": 3085 + }, + { + "epoch": 1.6060291060291059, + "grad_norm": 4.008216381072998, + "learning_rate": 4.94436487853458e-05, + "loss": 0.2372, + "num_input_tokens_seen": 587736, + "step": 3090 + }, + { + "epoch": 1.6086278586278586, + "grad_norm": 0.09116732329130173, + "learning_rate": 4.943888097369216e-05, + "loss": 0.1337, + "num_input_tokens_seen": 588728, + "step": 3095 + }, + { + "epoch": 1.6112266112266114, + "grad_norm": 13.859234809875488, + "learning_rate": 4.943409305139877e-05, + "loss": 0.0691, + "num_input_tokens_seen": 589688, + "step": 3100 + }, + { + "epoch": 1.613825363825364, + "grad_norm": 0.7724465131759644, + "learning_rate": 4.94292850224056e-05, + "loss": 0.0098, + "num_input_tokens_seen": 590648, + "step": 3105 + }, + { + "epoch": 1.6164241164241164, + "grad_norm": 0.0519450381398201, + "learning_rate": 4.9424456890669144e-05, + "loss": 0.0073, + "num_input_tokens_seen": 591608, + "step": 3110 + }, + { + "epoch": 1.619022869022869, + "grad_norm": 10.521476745605469, + "learning_rate": 4.941960866016246e-05, + "loss": 0.1093, + "num_input_tokens_seen": 592504, + "step": 3115 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.3513982892036438, + "learning_rate": 4.941474033487513e-05, + "loss": 0.0033, + "num_input_tokens_seen": 593432, + "step": 3120 + }, + { + "epoch": 1.6242203742203742, + "grad_norm": 0.4358207881450653, + "learning_rate": 4.940985191881328e-05, + "loss": 0.0895, + "num_input_tokens_seen": 594360, + "step": 3125 + }, + { + "epoch": 1.6268191268191268, + "grad_norm": 6.919356346130371, + "learning_rate": 4.940494341599955e-05, + "loss": 0.3013, + "num_input_tokens_seen": 595352, + "step": 3130 + }, + { + "epoch": 1.6294178794178795, + "grad_norm": 11.928922653198242, + "learning_rate": 4.940001483047314e-05, + "loss": 0.1964, + "num_input_tokens_seen": 596280, + "step": 3135 + }, + { + "epoch": 1.632016632016632, + "grad_norm": 13.327188491821289, + "learning_rate": 4.939506616628976e-05, + "loss": 0.2051, + "num_input_tokens_seen": 597272, + "step": 3140 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 7.170097827911377, + "learning_rate": 4.939009742752162e-05, + "loss": 0.2641, + "num_input_tokens_seen": 598232, + "step": 3145 + }, + { + "epoch": 1.637214137214137, + "grad_norm": 2.904573917388916, + "learning_rate": 4.9385108618257505e-05, + "loss": 0.1215, + "num_input_tokens_seen": 599128, + "step": 3150 + }, + { + "epoch": 1.6398128898128899, + "grad_norm": 17.041709899902344, + "learning_rate": 4.938009974260265e-05, + "loss": 0.3182, + "num_input_tokens_seen": 600152, + "step": 3155 + }, + { + "epoch": 1.6424116424116424, + "grad_norm": 0.37602314352989197, + "learning_rate": 4.9375070804678866e-05, + "loss": 0.2038, + "num_input_tokens_seen": 601080, + "step": 3160 + }, + { + "epoch": 1.6450103950103951, + "grad_norm": 0.46394574642181396, + "learning_rate": 4.937002180862441e-05, + "loss": 0.2098, + "num_input_tokens_seen": 602008, + "step": 3165 + }, + { + "epoch": 1.6476091476091477, + "grad_norm": 1.2476688623428345, + "learning_rate": 4.936495275859411e-05, + "loss": 0.2132, + "num_input_tokens_seen": 603000, + "step": 3170 + }, + { + "epoch": 1.6502079002079002, + "grad_norm": 9.30827808380127, + "learning_rate": 4.9359863658759235e-05, + "loss": 0.1537, + "num_input_tokens_seen": 603896, + "step": 3175 + }, + { + "epoch": 1.6528066528066527, + "grad_norm": 0.3761821985244751, + "learning_rate": 4.93547545133076e-05, + "loss": 0.1179, + "num_input_tokens_seen": 604792, + "step": 3180 + }, + { + "epoch": 1.6554054054054053, + "grad_norm": 0.28813081979751587, + "learning_rate": 4.9349625326443483e-05, + "loss": 0.1143, + "num_input_tokens_seen": 605720, + "step": 3185 + }, + { + "epoch": 1.658004158004158, + "grad_norm": 6.5833330154418945, + "learning_rate": 4.9344476102387685e-05, + "loss": 0.1773, + "num_input_tokens_seen": 606616, + "step": 3190 + }, + { + "epoch": 1.6606029106029108, + "grad_norm": 0.2309989184141159, + "learning_rate": 4.933930684537746e-05, + "loss": 0.2485, + "num_input_tokens_seen": 607576, + "step": 3195 + }, + { + "epoch": 1.6632016632016633, + "grad_norm": 1.32839035987854, + "learning_rate": 4.933411755966657e-05, + "loss": 0.0938, + "num_input_tokens_seen": 608568, + "step": 3200 + }, + { + "epoch": 1.6658004158004158, + "grad_norm": 1.2458553314208984, + "learning_rate": 4.9328908249525264e-05, + "loss": 0.1342, + "num_input_tokens_seen": 609560, + "step": 3205 + }, + { + "epoch": 1.6683991683991684, + "grad_norm": 0.4635586142539978, + "learning_rate": 4.9323678919240246e-05, + "loss": 0.0402, + "num_input_tokens_seen": 610488, + "step": 3210 + }, + { + "epoch": 1.6709979209979209, + "grad_norm": 1.4518990516662598, + "learning_rate": 4.931842957311472e-05, + "loss": 0.14, + "num_input_tokens_seen": 611416, + "step": 3215 + }, + { + "epoch": 1.6735966735966736, + "grad_norm": 0.5109272599220276, + "learning_rate": 4.9313160215468334e-05, + "loss": 0.1707, + "num_input_tokens_seen": 612376, + "step": 3220 + }, + { + "epoch": 1.6761954261954262, + "grad_norm": 0.6940157413482666, + "learning_rate": 4.930787085063723e-05, + "loss": 0.0862, + "num_input_tokens_seen": 613272, + "step": 3225 + }, + { + "epoch": 1.678794178794179, + "grad_norm": 0.15723542869091034, + "learning_rate": 4.930256148297398e-05, + "loss": 0.1354, + "num_input_tokens_seen": 614232, + "step": 3230 + }, + { + "epoch": 1.6813929313929314, + "grad_norm": 3.731238842010498, + "learning_rate": 4.929723211684767e-05, + "loss": 0.1786, + "num_input_tokens_seen": 615256, + "step": 3235 + }, + { + "epoch": 1.683991683991684, + "grad_norm": 6.937473297119141, + "learning_rate": 4.929188275664379e-05, + "loss": 0.0197, + "num_input_tokens_seen": 616120, + "step": 3240 + }, + { + "epoch": 1.6865904365904365, + "grad_norm": 0.055758073925971985, + "learning_rate": 4.928651340676431e-05, + "loss": 0.2772, + "num_input_tokens_seen": 617048, + "step": 3245 + }, + { + "epoch": 1.689189189189189, + "grad_norm": 0.07353943586349487, + "learning_rate": 4.9281124071627624e-05, + "loss": 0.2181, + "num_input_tokens_seen": 618040, + "step": 3250 + }, + { + "epoch": 1.6917879417879418, + "grad_norm": 1.951053500175476, + "learning_rate": 4.9275714755668624e-05, + "loss": 0.1012, + "num_input_tokens_seen": 619064, + "step": 3255 + }, + { + "epoch": 1.6943866943866945, + "grad_norm": 0.2096008062362671, + "learning_rate": 4.927028546333858e-05, + "loss": 0.2047, + "num_input_tokens_seen": 619928, + "step": 3260 + }, + { + "epoch": 1.696985446985447, + "grad_norm": 0.41380828619003296, + "learning_rate": 4.926483619910525e-05, + "loss": 0.0145, + "num_input_tokens_seen": 620888, + "step": 3265 + }, + { + "epoch": 1.6995841995841996, + "grad_norm": 0.10737881064414978, + "learning_rate": 4.9259366967452794e-05, + "loss": 0.2086, + "num_input_tokens_seen": 621912, + "step": 3270 + }, + { + "epoch": 1.7021829521829521, + "grad_norm": 2.210157871246338, + "learning_rate": 4.925387777288183e-05, + "loss": 0.1436, + "num_input_tokens_seen": 622936, + "step": 3275 + }, + { + "epoch": 1.7047817047817047, + "grad_norm": 0.21670138835906982, + "learning_rate": 4.924836861990938e-05, + "loss": 0.0169, + "num_input_tokens_seen": 623864, + "step": 3280 + }, + { + "epoch": 1.7073804573804574, + "grad_norm": 0.2564888596534729, + "learning_rate": 4.9242839513068906e-05, + "loss": 0.2157, + "num_input_tokens_seen": 624824, + "step": 3285 + }, + { + "epoch": 1.70997920997921, + "grad_norm": 0.21928395330905914, + "learning_rate": 4.923729045691028e-05, + "loss": 0.0314, + "num_input_tokens_seen": 625752, + "step": 3290 + }, + { + "epoch": 1.7125779625779627, + "grad_norm": 2.1400363445281982, + "learning_rate": 4.92317214559998e-05, + "loss": 0.3709, + "num_input_tokens_seen": 626712, + "step": 3295 + }, + { + "epoch": 1.7151767151767152, + "grad_norm": 18.150615692138672, + "learning_rate": 4.9226132514920165e-05, + "loss": 0.1879, + "num_input_tokens_seen": 627672, + "step": 3300 + }, + { + "epoch": 1.7177754677754677, + "grad_norm": 1.753395438194275, + "learning_rate": 4.9220523638270494e-05, + "loss": 0.1343, + "num_input_tokens_seen": 628664, + "step": 3305 + }, + { + "epoch": 1.7203742203742203, + "grad_norm": 0.7776670455932617, + "learning_rate": 4.92148948306663e-05, + "loss": 0.1613, + "num_input_tokens_seen": 629592, + "step": 3310 + }, + { + "epoch": 1.722972972972973, + "grad_norm": 0.12207344174385071, + "learning_rate": 4.92092460967395e-05, + "loss": 0.1135, + "num_input_tokens_seen": 630488, + "step": 3315 + }, + { + "epoch": 1.7255717255717256, + "grad_norm": 0.17170245945453644, + "learning_rate": 4.920357744113841e-05, + "loss": 0.0929, + "num_input_tokens_seen": 631512, + "step": 3320 + }, + { + "epoch": 1.7281704781704783, + "grad_norm": 11.369165420532227, + "learning_rate": 4.9197888868527756e-05, + "loss": 0.134, + "num_input_tokens_seen": 632472, + "step": 3325 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 8.855207443237305, + "learning_rate": 4.919218038358861e-05, + "loss": 0.1907, + "num_input_tokens_seen": 633400, + "step": 3330 + }, + { + "epoch": 1.7333679833679834, + "grad_norm": 0.5387296080589294, + "learning_rate": 4.918645199101848e-05, + "loss": 0.1747, + "num_input_tokens_seen": 634296, + "step": 3335 + }, + { + "epoch": 1.735966735966736, + "grad_norm": 0.1908143162727356, + "learning_rate": 4.918070369553123e-05, + "loss": 0.0146, + "num_input_tokens_seen": 635256, + "step": 3340 + }, + { + "epoch": 1.7385654885654884, + "grad_norm": 0.07730240374803543, + "learning_rate": 4.917493550185709e-05, + "loss": 0.1791, + "num_input_tokens_seen": 636216, + "step": 3345 + }, + { + "epoch": 1.7411642411642412, + "grad_norm": 2.746412754058838, + "learning_rate": 4.91691474147427e-05, + "loss": 0.1516, + "num_input_tokens_seen": 637176, + "step": 3350 + }, + { + "epoch": 1.743762993762994, + "grad_norm": 2.6831328868865967, + "learning_rate": 4.916333943895104e-05, + "loss": 0.0365, + "num_input_tokens_seen": 638168, + "step": 3355 + }, + { + "epoch": 1.7463617463617465, + "grad_norm": 0.26590338349342346, + "learning_rate": 4.915751157926146e-05, + "loss": 0.0128, + "num_input_tokens_seen": 639224, + "step": 3360 + }, + { + "epoch": 1.748960498960499, + "grad_norm": 4.024911403656006, + "learning_rate": 4.9151663840469687e-05, + "loss": 0.1165, + "num_input_tokens_seen": 640184, + "step": 3365 + }, + { + "epoch": 1.7515592515592515, + "grad_norm": 0.08551501482725143, + "learning_rate": 4.914579622738779e-05, + "loss": 0.1289, + "num_input_tokens_seen": 641048, + "step": 3370 + }, + { + "epoch": 1.754158004158004, + "grad_norm": 0.16041919589042664, + "learning_rate": 4.913990874484421e-05, + "loss": 0.1501, + "num_input_tokens_seen": 642008, + "step": 3375 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 2.0687503814697266, + "learning_rate": 4.913400139768372e-05, + "loss": 0.1105, + "num_input_tokens_seen": 642936, + "step": 3380 + }, + { + "epoch": 1.7593555093555093, + "grad_norm": 5.469308853149414, + "learning_rate": 4.9128074190767456e-05, + "loss": 0.0829, + "num_input_tokens_seen": 643864, + "step": 3385 + }, + { + "epoch": 1.761954261954262, + "grad_norm": 0.14466820657253265, + "learning_rate": 4.912212712897288e-05, + "loss": 0.0097, + "num_input_tokens_seen": 644792, + "step": 3390 + }, + { + "epoch": 1.7645530145530146, + "grad_norm": 4.654240608215332, + "learning_rate": 4.911616021719381e-05, + "loss": 0.0836, + "num_input_tokens_seen": 645752, + "step": 3395 + }, + { + "epoch": 1.7671517671517671, + "grad_norm": 0.07861494272947311, + "learning_rate": 4.911017346034037e-05, + "loss": 0.1067, + "num_input_tokens_seen": 646712, + "step": 3400 + }, + { + "epoch": 1.7697505197505197, + "grad_norm": 9.160551071166992, + "learning_rate": 4.910416686333906e-05, + "loss": 0.0832, + "num_input_tokens_seen": 647672, + "step": 3405 + }, + { + "epoch": 1.7723492723492722, + "grad_norm": 0.32693085074424744, + "learning_rate": 4.909814043113267e-05, + "loss": 0.1426, + "num_input_tokens_seen": 648664, + "step": 3410 + }, + { + "epoch": 1.774948024948025, + "grad_norm": 0.03604090213775635, + "learning_rate": 4.909209416868032e-05, + "loss": 0.181, + "num_input_tokens_seen": 649592, + "step": 3415 + }, + { + "epoch": 1.7775467775467777, + "grad_norm": 0.16975361108779907, + "learning_rate": 4.9086028080957445e-05, + "loss": 0.2519, + "num_input_tokens_seen": 650584, + "step": 3420 + }, + { + "epoch": 1.7801455301455302, + "grad_norm": 10.847650527954102, + "learning_rate": 4.907994217295582e-05, + "loss": 0.0803, + "num_input_tokens_seen": 651576, + "step": 3425 + }, + { + "epoch": 1.7827442827442828, + "grad_norm": 4.719595432281494, + "learning_rate": 4.9073836449683486e-05, + "loss": 0.1439, + "num_input_tokens_seen": 652600, + "step": 3430 + }, + { + "epoch": 1.7853430353430353, + "grad_norm": 4.4536452293396, + "learning_rate": 4.906771091616483e-05, + "loss": 0.1505, + "num_input_tokens_seen": 653624, + "step": 3435 + }, + { + "epoch": 1.7879417879417878, + "grad_norm": 0.2321763038635254, + "learning_rate": 4.9061565577440516e-05, + "loss": 0.0852, + "num_input_tokens_seen": 654552, + "step": 3440 + }, + { + "epoch": 1.7905405405405406, + "grad_norm": 2.6092469692230225, + "learning_rate": 4.9055400438567515e-05, + "loss": 0.1358, + "num_input_tokens_seen": 655512, + "step": 3445 + }, + { + "epoch": 1.793139293139293, + "grad_norm": 3.6528477668762207, + "learning_rate": 4.90492155046191e-05, + "loss": 0.1553, + "num_input_tokens_seen": 656472, + "step": 3450 + }, + { + "epoch": 1.7957380457380459, + "grad_norm": 4.714333534240723, + "learning_rate": 4.9043010780684814e-05, + "loss": 0.2736, + "num_input_tokens_seen": 657432, + "step": 3455 + }, + { + "epoch": 1.7983367983367984, + "grad_norm": 0.9140357971191406, + "learning_rate": 4.9036786271870504e-05, + "loss": 0.1815, + "num_input_tokens_seen": 658392, + "step": 3460 + }, + { + "epoch": 1.800935550935551, + "grad_norm": 4.027628421783447, + "learning_rate": 4.903054198329827e-05, + "loss": 0.1402, + "num_input_tokens_seen": 659352, + "step": 3465 + }, + { + "epoch": 1.8035343035343034, + "grad_norm": 2.84417986869812, + "learning_rate": 4.902427792010653e-05, + "loss": 0.1875, + "num_input_tokens_seen": 660344, + "step": 3470 + }, + { + "epoch": 1.806133056133056, + "grad_norm": 0.6711820363998413, + "learning_rate": 4.9017994087449946e-05, + "loss": 0.1327, + "num_input_tokens_seen": 661304, + "step": 3475 + }, + { + "epoch": 1.8087318087318087, + "grad_norm": 0.16694746911525726, + "learning_rate": 4.901169049049945e-05, + "loss": 0.1115, + "num_input_tokens_seen": 662232, + "step": 3480 + }, + { + "epoch": 1.8113305613305615, + "grad_norm": 0.11201293021440506, + "learning_rate": 4.9005367134442235e-05, + "loss": 0.0995, + "num_input_tokens_seen": 663160, + "step": 3485 + }, + { + "epoch": 1.813929313929314, + "grad_norm": 1.2534080743789673, + "learning_rate": 4.8999024024481775e-05, + "loss": 0.2795, + "num_input_tokens_seen": 664152, + "step": 3490 + }, + { + "epoch": 1.8165280665280665, + "grad_norm": 1.8904814720153809, + "learning_rate": 4.8992661165837785e-05, + "loss": 0.153, + "num_input_tokens_seen": 665080, + "step": 3495 + }, + { + "epoch": 1.819126819126819, + "grad_norm": 17.24232292175293, + "learning_rate": 4.8986278563746216e-05, + "loss": 0.139, + "num_input_tokens_seen": 666072, + "step": 3500 + }, + { + "epoch": 1.8217255717255716, + "grad_norm": 2.6118977069854736, + "learning_rate": 4.8979876223459295e-05, + "loss": 0.3218, + "num_input_tokens_seen": 667032, + "step": 3505 + }, + { + "epoch": 1.8243243243243243, + "grad_norm": 2.497342348098755, + "learning_rate": 4.8973454150245466e-05, + "loss": 0.1004, + "num_input_tokens_seen": 667896, + "step": 3510 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 4.368038654327393, + "learning_rate": 4.896701234938944e-05, + "loss": 0.3354, + "num_input_tokens_seen": 668856, + "step": 3515 + }, + { + "epoch": 1.8295218295218296, + "grad_norm": 0.34653088450431824, + "learning_rate": 4.896055082619213e-05, + "loss": 0.0622, + "num_input_tokens_seen": 669880, + "step": 3520 + }, + { + "epoch": 1.8321205821205822, + "grad_norm": 0.3649754822254181, + "learning_rate": 4.89540695859707e-05, + "loss": 0.1653, + "num_input_tokens_seen": 670872, + "step": 3525 + }, + { + "epoch": 1.8347193347193347, + "grad_norm": 5.38226318359375, + "learning_rate": 4.8947568634058525e-05, + "loss": 0.0482, + "num_input_tokens_seen": 671832, + "step": 3530 + }, + { + "epoch": 1.8373180873180872, + "grad_norm": 11.421675682067871, + "learning_rate": 4.894104797580522e-05, + "loss": 0.1887, + "num_input_tokens_seen": 672728, + "step": 3535 + }, + { + "epoch": 1.83991683991684, + "grad_norm": 4.569623947143555, + "learning_rate": 4.893450761657658e-05, + "loss": 0.212, + "num_input_tokens_seen": 673656, + "step": 3540 + }, + { + "epoch": 1.8425155925155925, + "grad_norm": 1.1866710186004639, + "learning_rate": 4.8927947561754675e-05, + "loss": 0.0252, + "num_input_tokens_seen": 674648, + "step": 3545 + }, + { + "epoch": 1.8451143451143452, + "grad_norm": 2.517565965652466, + "learning_rate": 4.892136781673771e-05, + "loss": 0.0541, + "num_input_tokens_seen": 675640, + "step": 3550 + }, + { + "epoch": 1.8477130977130978, + "grad_norm": 0.43812358379364014, + "learning_rate": 4.891476838694012e-05, + "loss": 0.1309, + "num_input_tokens_seen": 676696, + "step": 3555 + }, + { + "epoch": 1.8503118503118503, + "grad_norm": 6.468728542327881, + "learning_rate": 4.890814927779258e-05, + "loss": 0.1029, + "num_input_tokens_seen": 677688, + "step": 3560 + }, + { + "epoch": 1.8529106029106028, + "grad_norm": 0.14587758481502533, + "learning_rate": 4.8901510494741895e-05, + "loss": 0.0123, + "num_input_tokens_seen": 678680, + "step": 3565 + }, + { + "epoch": 1.8555093555093554, + "grad_norm": 0.03490854799747467, + "learning_rate": 4.88948520432511e-05, + "loss": 0.1276, + "num_input_tokens_seen": 679576, + "step": 3570 + }, + { + "epoch": 1.8581081081081081, + "grad_norm": 3.3612709045410156, + "learning_rate": 4.88881739287994e-05, + "loss": 0.2248, + "num_input_tokens_seen": 680504, + "step": 3575 + }, + { + "epoch": 1.8607068607068609, + "grad_norm": 0.16136318445205688, + "learning_rate": 4.888147615688219e-05, + "loss": 0.1554, + "num_input_tokens_seen": 681432, + "step": 3580 + }, + { + "epoch": 1.8633056133056134, + "grad_norm": 2.4569952487945557, + "learning_rate": 4.8874758733011023e-05, + "loss": 0.1958, + "num_input_tokens_seen": 682360, + "step": 3585 + }, + { + "epoch": 1.865904365904366, + "grad_norm": 0.291191428899765, + "learning_rate": 4.886802166271364e-05, + "loss": 0.1505, + "num_input_tokens_seen": 683352, + "step": 3590 + }, + { + "epoch": 1.8685031185031185, + "grad_norm": 10.252853393554688, + "learning_rate": 4.886126495153395e-05, + "loss": 0.0955, + "num_input_tokens_seen": 684344, + "step": 3595 + }, + { + "epoch": 1.871101871101871, + "grad_norm": 5.887052536010742, + "learning_rate": 4.8854488605032014e-05, + "loss": 0.1223, + "num_input_tokens_seen": 685336, + "step": 3600 + }, + { + "epoch": 1.8737006237006237, + "grad_norm": 1.9058266878128052, + "learning_rate": 4.884769262878406e-05, + "loss": 0.1134, + "num_input_tokens_seen": 686328, + "step": 3605 + }, + { + "epoch": 1.8762993762993763, + "grad_norm": 5.985334396362305, + "learning_rate": 4.884087702838246e-05, + "loss": 0.1633, + "num_input_tokens_seen": 687288, + "step": 3610 + }, + { + "epoch": 1.878898128898129, + "grad_norm": 0.11510640382766724, + "learning_rate": 4.8834041809435736e-05, + "loss": 0.185, + "num_input_tokens_seen": 688216, + "step": 3615 + }, + { + "epoch": 1.8814968814968815, + "grad_norm": 4.8731913566589355, + "learning_rate": 4.8827186977568565e-05, + "loss": 0.221, + "num_input_tokens_seen": 689304, + "step": 3620 + }, + { + "epoch": 1.884095634095634, + "grad_norm": 3.274081230163574, + "learning_rate": 4.8820312538421755e-05, + "loss": 0.2875, + "num_input_tokens_seen": 690328, + "step": 3625 + }, + { + "epoch": 1.8866943866943866, + "grad_norm": 2.8272488117218018, + "learning_rate": 4.881341849765224e-05, + "loss": 0.2019, + "num_input_tokens_seen": 691288, + "step": 3630 + }, + { + "epoch": 1.8892931392931391, + "grad_norm": 2.2349021434783936, + "learning_rate": 4.88065048609331e-05, + "loss": 0.2246, + "num_input_tokens_seen": 692280, + "step": 3635 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.568015992641449, + "learning_rate": 4.879957163395354e-05, + "loss": 0.0766, + "num_input_tokens_seen": 693304, + "step": 3640 + }, + { + "epoch": 1.8944906444906446, + "grad_norm": 0.22624172270298004, + "learning_rate": 4.879261882241888e-05, + "loss": 0.0972, + "num_input_tokens_seen": 694296, + "step": 3645 + }, + { + "epoch": 1.8970893970893972, + "grad_norm": 1.715725302696228, + "learning_rate": 4.878564643205054e-05, + "loss": 0.0325, + "num_input_tokens_seen": 695256, + "step": 3650 + }, + { + "epoch": 1.8996881496881497, + "grad_norm": 0.06914017349481583, + "learning_rate": 4.877865446858608e-05, + "loss": 0.1257, + "num_input_tokens_seen": 696280, + "step": 3655 + }, + { + "epoch": 1.9022869022869022, + "grad_norm": 8.57241153717041, + "learning_rate": 4.877164293777916e-05, + "loss": 0.1731, + "num_input_tokens_seen": 697272, + "step": 3660 + }, + { + "epoch": 1.9048856548856548, + "grad_norm": 0.3856169581413269, + "learning_rate": 4.8764611845399516e-05, + "loss": 0.2511, + "num_input_tokens_seen": 698232, + "step": 3665 + }, + { + "epoch": 1.9074844074844075, + "grad_norm": 0.33796781301498413, + "learning_rate": 4.875756119723301e-05, + "loss": 0.2008, + "num_input_tokens_seen": 699192, + "step": 3670 + }, + { + "epoch": 1.91008316008316, + "grad_norm": 0.8781201839447021, + "learning_rate": 4.87504909990816e-05, + "loss": 0.1292, + "num_input_tokens_seen": 700184, + "step": 3675 + }, + { + "epoch": 1.9126819126819128, + "grad_norm": 3.2996108531951904, + "learning_rate": 4.87434012567633e-05, + "loss": 0.284, + "num_input_tokens_seen": 701144, + "step": 3680 + }, + { + "epoch": 1.9152806652806653, + "grad_norm": 0.3169112503528595, + "learning_rate": 4.8736291976112235e-05, + "loss": 0.0215, + "num_input_tokens_seen": 702072, + "step": 3685 + }, + { + "epoch": 1.9178794178794178, + "grad_norm": 2.9433369636535645, + "learning_rate": 4.872916316297859e-05, + "loss": 0.0905, + "num_input_tokens_seen": 703000, + "step": 3690 + }, + { + "epoch": 1.9204781704781704, + "grad_norm": 8.875313758850098, + "learning_rate": 4.872201482322865e-05, + "loss": 0.2424, + "num_input_tokens_seen": 703960, + "step": 3695 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.17832987010478973, + "learning_rate": 4.8714846962744725e-05, + "loss": 0.1791, + "num_input_tokens_seen": 704952, + "step": 3700 + }, + { + "epoch": 1.9256756756756757, + "grad_norm": 0.536946177482605, + "learning_rate": 4.870765958742523e-05, + "loss": 0.1051, + "num_input_tokens_seen": 705944, + "step": 3705 + }, + { + "epoch": 1.9282744282744284, + "grad_norm": 5.879810810089111, + "learning_rate": 4.8700452703184616e-05, + "loss": 0.1829, + "num_input_tokens_seen": 706904, + "step": 3710 + }, + { + "epoch": 1.930873180873181, + "grad_norm": 2.0197179317474365, + "learning_rate": 4.869322631595341e-05, + "loss": 0.2318, + "num_input_tokens_seen": 707864, + "step": 3715 + }, + { + "epoch": 1.9334719334719335, + "grad_norm": 6.991998195648193, + "learning_rate": 4.8685980431678144e-05, + "loss": 0.1391, + "num_input_tokens_seen": 708856, + "step": 3720 + }, + { + "epoch": 1.936070686070686, + "grad_norm": 2.549004554748535, + "learning_rate": 4.867871505632144e-05, + "loss": 0.2244, + "num_input_tokens_seen": 709816, + "step": 3725 + }, + { + "epoch": 1.9386694386694385, + "grad_norm": 1.5303864479064941, + "learning_rate": 4.867143019586195e-05, + "loss": 0.0693, + "num_input_tokens_seen": 710808, + "step": 3730 + }, + { + "epoch": 1.9412681912681913, + "grad_norm": 7.513998031616211, + "learning_rate": 4.866412585629432e-05, + "loss": 0.1567, + "num_input_tokens_seen": 711704, + "step": 3735 + }, + { + "epoch": 1.943866943866944, + "grad_norm": 0.24542401731014252, + "learning_rate": 4.865680204362928e-05, + "loss": 0.0867, + "num_input_tokens_seen": 712632, + "step": 3740 + }, + { + "epoch": 1.9464656964656966, + "grad_norm": 6.179004669189453, + "learning_rate": 4.864945876389356e-05, + "loss": 0.1709, + "num_input_tokens_seen": 713528, + "step": 3745 + }, + { + "epoch": 1.949064449064449, + "grad_norm": 0.19274067878723145, + "learning_rate": 4.864209602312991e-05, + "loss": 0.0586, + "num_input_tokens_seen": 714456, + "step": 3750 + }, + { + "epoch": 1.9516632016632016, + "grad_norm": 14.267871856689453, + "learning_rate": 4.863471382739708e-05, + "loss": 0.0533, + "num_input_tokens_seen": 715480, + "step": 3755 + }, + { + "epoch": 1.9542619542619541, + "grad_norm": 5.673616886138916, + "learning_rate": 4.862731218276987e-05, + "loss": 0.3101, + "num_input_tokens_seen": 716440, + "step": 3760 + }, + { + "epoch": 1.956860706860707, + "grad_norm": 2.2166833877563477, + "learning_rate": 4.8619891095339034e-05, + "loss": 0.3829, + "num_input_tokens_seen": 717368, + "step": 3765 + }, + { + "epoch": 1.9594594594594594, + "grad_norm": 2.7381908893585205, + "learning_rate": 4.861245057121135e-05, + "loss": 0.1573, + "num_input_tokens_seen": 718264, + "step": 3770 + }, + { + "epoch": 1.9620582120582122, + "grad_norm": 1.8117918968200684, + "learning_rate": 4.8604990616509616e-05, + "loss": 0.1611, + "num_input_tokens_seen": 719224, + "step": 3775 + }, + { + "epoch": 1.9646569646569647, + "grad_norm": 0.08607964217662811, + "learning_rate": 4.8597511237372574e-05, + "loss": 0.1259, + "num_input_tokens_seen": 720184, + "step": 3780 + }, + { + "epoch": 1.9672557172557172, + "grad_norm": 4.703255653381348, + "learning_rate": 4.859001243995497e-05, + "loss": 0.2013, + "num_input_tokens_seen": 721176, + "step": 3785 + }, + { + "epoch": 1.9698544698544698, + "grad_norm": 2.1905527114868164, + "learning_rate": 4.858249423042753e-05, + "loss": 0.0569, + "num_input_tokens_seen": 722168, + "step": 3790 + }, + { + "epoch": 1.9724532224532223, + "grad_norm": 8.464709281921387, + "learning_rate": 4.857495661497695e-05, + "loss": 0.2841, + "num_input_tokens_seen": 723128, + "step": 3795 + }, + { + "epoch": 1.975051975051975, + "grad_norm": 3.5031650066375732, + "learning_rate": 4.856739959980591e-05, + "loss": 0.2434, + "num_input_tokens_seen": 724088, + "step": 3800 + }, + { + "epoch": 1.9776507276507278, + "grad_norm": 0.27248045802116394, + "learning_rate": 4.855982319113304e-05, + "loss": 0.0234, + "num_input_tokens_seen": 724984, + "step": 3805 + }, + { + "epoch": 1.9802494802494803, + "grad_norm": 3.8534281253814697, + "learning_rate": 4.855222739519292e-05, + "loss": 0.2211, + "num_input_tokens_seen": 725880, + "step": 3810 + }, + { + "epoch": 1.9828482328482329, + "grad_norm": 7.252470016479492, + "learning_rate": 4.8544612218236096e-05, + "loss": 0.2062, + "num_input_tokens_seen": 726776, + "step": 3815 + }, + { + "epoch": 1.9854469854469854, + "grad_norm": 5.728098392486572, + "learning_rate": 4.853697766652907e-05, + "loss": 0.2375, + "num_input_tokens_seen": 727736, + "step": 3820 + }, + { + "epoch": 1.988045738045738, + "grad_norm": 0.9526638388633728, + "learning_rate": 4.852932374635427e-05, + "loss": 0.0519, + "num_input_tokens_seen": 728664, + "step": 3825 + }, + { + "epoch": 1.9906444906444907, + "grad_norm": 9.425487518310547, + "learning_rate": 4.852165046401008e-05, + "loss": 0.0424, + "num_input_tokens_seen": 729624, + "step": 3830 + }, + { + "epoch": 1.9932432432432432, + "grad_norm": 1.1424057483673096, + "learning_rate": 4.85139578258108e-05, + "loss": 0.2432, + "num_input_tokens_seen": 730488, + "step": 3835 + }, + { + "epoch": 1.995841995841996, + "grad_norm": 0.09837708622217178, + "learning_rate": 4.850624583808667e-05, + "loss": 0.0317, + "num_input_tokens_seen": 731416, + "step": 3840 + }, + { + "epoch": 1.9984407484407485, + "grad_norm": 0.22573138773441315, + "learning_rate": 4.849851450718385e-05, + "loss": 0.2194, + "num_input_tokens_seen": 732376, + "step": 3845 + }, + { + "epoch": 2.0, + "eval_loss": 0.1639394462108612, + "eval_runtime": 9.2617, + "eval_samples_per_second": 92.424, + "eval_steps_per_second": 23.106, + "num_input_tokens_seen": 732880, + "step": 3848 + }, + { + "epoch": 2.001039501039501, + "grad_norm": 0.0663289725780487, + "learning_rate": 4.849076383946441e-05, + "loss": 0.0685, + "num_input_tokens_seen": 733200, + "step": 3850 + }, + { + "epoch": 2.0036382536382535, + "grad_norm": 4.4837188720703125, + "learning_rate": 4.848299384130634e-05, + "loss": 0.0654, + "num_input_tokens_seen": 734160, + "step": 3855 + }, + { + "epoch": 2.006237006237006, + "grad_norm": 0.18451960384845734, + "learning_rate": 4.8475204519103536e-05, + "loss": 0.0257, + "num_input_tokens_seen": 735184, + "step": 3860 + }, + { + "epoch": 2.008835758835759, + "grad_norm": 0.038852594792842865, + "learning_rate": 4.8467395879265786e-05, + "loss": 0.1129, + "num_input_tokens_seen": 736112, + "step": 3865 + }, + { + "epoch": 2.0114345114345116, + "grad_norm": 0.09794275462627411, + "learning_rate": 4.8459567928218794e-05, + "loss": 0.0064, + "num_input_tokens_seen": 737104, + "step": 3870 + }, + { + "epoch": 2.014033264033264, + "grad_norm": 3.784363031387329, + "learning_rate": 4.845172067240415e-05, + "loss": 0.0738, + "num_input_tokens_seen": 738032, + "step": 3875 + }, + { + "epoch": 2.0166320166320166, + "grad_norm": 0.030317693948745728, + "learning_rate": 4.844385411827931e-05, + "loss": 0.0913, + "num_input_tokens_seen": 738960, + "step": 3880 + }, + { + "epoch": 2.019230769230769, + "grad_norm": 0.20975559949874878, + "learning_rate": 4.8435968272317624e-05, + "loss": 0.2352, + "num_input_tokens_seen": 739888, + "step": 3885 + }, + { + "epoch": 2.0218295218295217, + "grad_norm": 0.10406780242919922, + "learning_rate": 4.8428063141008316e-05, + "loss": 0.1123, + "num_input_tokens_seen": 740784, + "step": 3890 + }, + { + "epoch": 2.024428274428274, + "grad_norm": 0.06625548750162125, + "learning_rate": 4.8420138730856495e-05, + "loss": 0.0434, + "num_input_tokens_seen": 741776, + "step": 3895 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.1788565218448639, + "learning_rate": 4.8412195048383115e-05, + "loss": 0.0123, + "num_input_tokens_seen": 742768, + "step": 3900 + }, + { + "epoch": 2.0296257796257797, + "grad_norm": 2.2150087356567383, + "learning_rate": 4.8404232100124994e-05, + "loss": 0.2445, + "num_input_tokens_seen": 743664, + "step": 3905 + }, + { + "epoch": 2.0322245322245323, + "grad_norm": 3.890641689300537, + "learning_rate": 4.839624989263479e-05, + "loss": 0.1309, + "num_input_tokens_seen": 744624, + "step": 3910 + }, + { + "epoch": 2.034823284823285, + "grad_norm": 0.05436832085251808, + "learning_rate": 4.838824843248104e-05, + "loss": 0.0719, + "num_input_tokens_seen": 745584, + "step": 3915 + }, + { + "epoch": 2.0374220374220373, + "grad_norm": 2.923321008682251, + "learning_rate": 4.838022772624812e-05, + "loss": 0.2431, + "num_input_tokens_seen": 746544, + "step": 3920 + }, + { + "epoch": 2.04002079002079, + "grad_norm": 0.5578654408454895, + "learning_rate": 4.837218778053621e-05, + "loss": 0.0883, + "num_input_tokens_seen": 747472, + "step": 3925 + }, + { + "epoch": 2.042619542619543, + "grad_norm": 0.09248201549053192, + "learning_rate": 4.8364128601961335e-05, + "loss": 0.0777, + "num_input_tokens_seen": 748432, + "step": 3930 + }, + { + "epoch": 2.0452182952182953, + "grad_norm": 1.847010850906372, + "learning_rate": 4.835605019715538e-05, + "loss": 0.2126, + "num_input_tokens_seen": 749488, + "step": 3935 + }, + { + "epoch": 2.047817047817048, + "grad_norm": 9.44339370727539, + "learning_rate": 4.834795257276601e-05, + "loss": 0.1398, + "num_input_tokens_seen": 750416, + "step": 3940 + }, + { + "epoch": 2.0504158004158004, + "grad_norm": 1.996357798576355, + "learning_rate": 4.8339835735456736e-05, + "loss": 0.0637, + "num_input_tokens_seen": 751376, + "step": 3945 + }, + { + "epoch": 2.053014553014553, + "grad_norm": 0.06689882278442383, + "learning_rate": 4.833169969190685e-05, + "loss": 0.079, + "num_input_tokens_seen": 752304, + "step": 3950 + }, + { + "epoch": 2.0556133056133055, + "grad_norm": 3.513441324234009, + "learning_rate": 4.832354444881147e-05, + "loss": 0.1068, + "num_input_tokens_seen": 753232, + "step": 3955 + }, + { + "epoch": 2.0582120582120584, + "grad_norm": 0.0434693768620491, + "learning_rate": 4.8315370012881514e-05, + "loss": 0.0394, + "num_input_tokens_seen": 754288, + "step": 3960 + }, + { + "epoch": 2.060810810810811, + "grad_norm": 0.06818453967571259, + "learning_rate": 4.830717639084367e-05, + "loss": 0.0864, + "num_input_tokens_seen": 755280, + "step": 3965 + }, + { + "epoch": 2.0634095634095635, + "grad_norm": 0.033780328929424286, + "learning_rate": 4.829896358944044e-05, + "loss": 0.0902, + "num_input_tokens_seen": 756208, + "step": 3970 + }, + { + "epoch": 2.066008316008316, + "grad_norm": 0.1328269988298416, + "learning_rate": 4.8290731615430104e-05, + "loss": 0.0694, + "num_input_tokens_seen": 757136, + "step": 3975 + }, + { + "epoch": 2.0686070686070686, + "grad_norm": 0.08687067031860352, + "learning_rate": 4.828248047558672e-05, + "loss": 0.1331, + "num_input_tokens_seen": 758064, + "step": 3980 + }, + { + "epoch": 2.071205821205821, + "grad_norm": 0.10882964730262756, + "learning_rate": 4.82742101767001e-05, + "loss": 0.0795, + "num_input_tokens_seen": 759056, + "step": 3985 + }, + { + "epoch": 2.0738045738045736, + "grad_norm": 0.2568250596523285, + "learning_rate": 4.8265920725575825e-05, + "loss": 0.0082, + "num_input_tokens_seen": 760016, + "step": 3990 + }, + { + "epoch": 2.0764033264033266, + "grad_norm": 0.048167284578084946, + "learning_rate": 4.825761212903527e-05, + "loss": 0.0434, + "num_input_tokens_seen": 760944, + "step": 3995 + }, + { + "epoch": 2.079002079002079, + "grad_norm": 2.344717502593994, + "learning_rate": 4.824928439391552e-05, + "loss": 0.3678, + "num_input_tokens_seen": 761840, + "step": 4000 + }, + { + "epoch": 2.0816008316008316, + "grad_norm": 6.569521903991699, + "learning_rate": 4.824093752706943e-05, + "loss": 0.0884, + "num_input_tokens_seen": 762864, + "step": 4005 + }, + { + "epoch": 2.084199584199584, + "grad_norm": 0.11934157460927963, + "learning_rate": 4.823257153536561e-05, + "loss": 0.0108, + "num_input_tokens_seen": 763728, + "step": 4010 + }, + { + "epoch": 2.0867983367983367, + "grad_norm": 0.07409209758043289, + "learning_rate": 4.822418642568839e-05, + "loss": 0.0842, + "num_input_tokens_seen": 764688, + "step": 4015 + }, + { + "epoch": 2.0893970893970892, + "grad_norm": 3.7725446224212646, + "learning_rate": 4.821578220493783e-05, + "loss": 0.0971, + "num_input_tokens_seen": 765744, + "step": 4020 + }, + { + "epoch": 2.091995841995842, + "grad_norm": 1.6956087350845337, + "learning_rate": 4.8207358880029726e-05, + "loss": 0.0773, + "num_input_tokens_seen": 766672, + "step": 4025 + }, + { + "epoch": 2.0945945945945947, + "grad_norm": 0.048738911747932434, + "learning_rate": 4.8198916457895604e-05, + "loss": 0.1831, + "num_input_tokens_seen": 767568, + "step": 4030 + }, + { + "epoch": 2.0971933471933473, + "grad_norm": 6.231434345245361, + "learning_rate": 4.819045494548268e-05, + "loss": 0.0632, + "num_input_tokens_seen": 768592, + "step": 4035 + }, + { + "epoch": 2.0997920997921, + "grad_norm": 0.17257490754127502, + "learning_rate": 4.81819743497539e-05, + "loss": 0.1254, + "num_input_tokens_seen": 769520, + "step": 4040 + }, + { + "epoch": 2.1023908523908523, + "grad_norm": 3.2767982482910156, + "learning_rate": 4.8173474677687904e-05, + "loss": 0.0201, + "num_input_tokens_seen": 770448, + "step": 4045 + }, + { + "epoch": 2.104989604989605, + "grad_norm": 6.315924167633057, + "learning_rate": 4.816495593627902e-05, + "loss": 0.0526, + "num_input_tokens_seen": 771376, + "step": 4050 + }, + { + "epoch": 2.1075883575883574, + "grad_norm": 13.357500076293945, + "learning_rate": 4.81564181325373e-05, + "loss": 0.1262, + "num_input_tokens_seen": 772368, + "step": 4055 + }, + { + "epoch": 2.1101871101871104, + "grad_norm": 4.534238815307617, + "learning_rate": 4.814786127348845e-05, + "loss": 0.0174, + "num_input_tokens_seen": 773264, + "step": 4060 + }, + { + "epoch": 2.112785862785863, + "grad_norm": 0.09434173256158829, + "learning_rate": 4.813928536617388e-05, + "loss": 0.0476, + "num_input_tokens_seen": 774256, + "step": 4065 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 0.41327473521232605, + "learning_rate": 4.813069041765065e-05, + "loss": 0.056, + "num_input_tokens_seen": 775248, + "step": 4070 + }, + { + "epoch": 2.117983367983368, + "grad_norm": 24.315277099609375, + "learning_rate": 4.8122076434991506e-05, + "loss": 0.1253, + "num_input_tokens_seen": 776144, + "step": 4075 + }, + { + "epoch": 2.1205821205821205, + "grad_norm": 0.042117323726415634, + "learning_rate": 4.8113443425284865e-05, + "loss": 0.0116, + "num_input_tokens_seen": 777104, + "step": 4080 + }, + { + "epoch": 2.123180873180873, + "grad_norm": 0.1029563769698143, + "learning_rate": 4.8104791395634774e-05, + "loss": 0.0722, + "num_input_tokens_seen": 778128, + "step": 4085 + }, + { + "epoch": 2.125779625779626, + "grad_norm": 7.754895210266113, + "learning_rate": 4.809612035316096e-05, + "loss": 0.1364, + "num_input_tokens_seen": 779056, + "step": 4090 + }, + { + "epoch": 2.1283783783783785, + "grad_norm": 0.09451329708099365, + "learning_rate": 4.808743030499877e-05, + "loss": 0.1116, + "num_input_tokens_seen": 779984, + "step": 4095 + }, + { + "epoch": 2.130977130977131, + "grad_norm": 23.41499900817871, + "learning_rate": 4.807872125829922e-05, + "loss": 0.1763, + "num_input_tokens_seen": 780880, + "step": 4100 + }, + { + "epoch": 2.1335758835758836, + "grad_norm": 0.10242285579442978, + "learning_rate": 4.8069993220228925e-05, + "loss": 0.1674, + "num_input_tokens_seen": 781904, + "step": 4105 + }, + { + "epoch": 2.136174636174636, + "grad_norm": 7.707656383514404, + "learning_rate": 4.806124619797016e-05, + "loss": 0.0613, + "num_input_tokens_seen": 782800, + "step": 4110 + }, + { + "epoch": 2.1387733887733886, + "grad_norm": 3.493603229522705, + "learning_rate": 4.805248019872081e-05, + "loss": 0.1454, + "num_input_tokens_seen": 783728, + "step": 4115 + }, + { + "epoch": 2.141372141372141, + "grad_norm": 0.31382814049720764, + "learning_rate": 4.8043695229694365e-05, + "loss": 0.1061, + "num_input_tokens_seen": 784624, + "step": 4120 + }, + { + "epoch": 2.143970893970894, + "grad_norm": 16.098520278930664, + "learning_rate": 4.803489129811993e-05, + "loss": 0.121, + "num_input_tokens_seen": 785584, + "step": 4125 + }, + { + "epoch": 2.1465696465696467, + "grad_norm": 0.14288347959518433, + "learning_rate": 4.802606841124223e-05, + "loss": 0.0604, + "num_input_tokens_seen": 786512, + "step": 4130 + }, + { + "epoch": 2.149168399168399, + "grad_norm": 1.0116736888885498, + "learning_rate": 4.8017226576321586e-05, + "loss": 0.2009, + "num_input_tokens_seen": 787504, + "step": 4135 + }, + { + "epoch": 2.1517671517671517, + "grad_norm": 0.23308683931827545, + "learning_rate": 4.8008365800633875e-05, + "loss": 0.1288, + "num_input_tokens_seen": 788400, + "step": 4140 + }, + { + "epoch": 2.1543659043659042, + "grad_norm": 0.20735399425029755, + "learning_rate": 4.799948609147061e-05, + "loss": 0.2102, + "num_input_tokens_seen": 789392, + "step": 4145 + }, + { + "epoch": 2.156964656964657, + "grad_norm": 2.2920098304748535, + "learning_rate": 4.799058745613885e-05, + "loss": 0.1159, + "num_input_tokens_seen": 790320, + "step": 4150 + }, + { + "epoch": 2.1595634095634098, + "grad_norm": 0.6211904287338257, + "learning_rate": 4.798166990196125e-05, + "loss": 0.0239, + "num_input_tokens_seen": 791248, + "step": 4155 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.12701943516731262, + "learning_rate": 4.797273343627601e-05, + "loss": 0.0142, + "num_input_tokens_seen": 792176, + "step": 4160 + }, + { + "epoch": 2.164760914760915, + "grad_norm": 0.054061051458120346, + "learning_rate": 4.796377806643692e-05, + "loss": 0.0823, + "num_input_tokens_seen": 793104, + "step": 4165 + }, + { + "epoch": 2.1673596673596673, + "grad_norm": 0.28099003434181213, + "learning_rate": 4.79548037998133e-05, + "loss": 0.0974, + "num_input_tokens_seen": 794064, + "step": 4170 + }, + { + "epoch": 2.16995841995842, + "grad_norm": 0.13210126757621765, + "learning_rate": 4.7945810643790026e-05, + "loss": 0.1716, + "num_input_tokens_seen": 794992, + "step": 4175 + }, + { + "epoch": 2.1725571725571724, + "grad_norm": 0.03838645666837692, + "learning_rate": 4.793679860576755e-05, + "loss": 0.0315, + "num_input_tokens_seen": 795952, + "step": 4180 + }, + { + "epoch": 2.1751559251559254, + "grad_norm": 0.9966594576835632, + "learning_rate": 4.7927767693161805e-05, + "loss": 0.1196, + "num_input_tokens_seen": 796976, + "step": 4185 + }, + { + "epoch": 2.177754677754678, + "grad_norm": 0.018388889729976654, + "learning_rate": 4.791871791340431e-05, + "loss": 0.0182, + "num_input_tokens_seen": 797904, + "step": 4190 + }, + { + "epoch": 2.1803534303534304, + "grad_norm": 3.1140308380126953, + "learning_rate": 4.7909649273942083e-05, + "loss": 0.0202, + "num_input_tokens_seen": 798832, + "step": 4195 + }, + { + "epoch": 2.182952182952183, + "grad_norm": 0.020873479545116425, + "learning_rate": 4.790056178223764e-05, + "loss": 0.1183, + "num_input_tokens_seen": 799760, + "step": 4200 + }, + { + "epoch": 2.1855509355509355, + "grad_norm": 0.07481619715690613, + "learning_rate": 4.789145544576906e-05, + "loss": 0.1532, + "num_input_tokens_seen": 800688, + "step": 4205 + }, + { + "epoch": 2.188149688149688, + "grad_norm": 0.04900038614869118, + "learning_rate": 4.7882330272029906e-05, + "loss": 0.0425, + "num_input_tokens_seen": 801584, + "step": 4210 + }, + { + "epoch": 2.1907484407484406, + "grad_norm": 0.4431658685207367, + "learning_rate": 4.787318626852923e-05, + "loss": 0.147, + "num_input_tokens_seen": 802544, + "step": 4215 + }, + { + "epoch": 2.1933471933471935, + "grad_norm": 11.950695037841797, + "learning_rate": 4.7864023442791587e-05, + "loss": 0.0211, + "num_input_tokens_seen": 803536, + "step": 4220 + }, + { + "epoch": 2.195945945945946, + "grad_norm": 0.03149132430553436, + "learning_rate": 4.785484180235702e-05, + "loss": 0.0043, + "num_input_tokens_seen": 804496, + "step": 4225 + }, + { + "epoch": 2.1985446985446986, + "grad_norm": 0.025775037705898285, + "learning_rate": 4.7845641354781065e-05, + "loss": 0.0877, + "num_input_tokens_seen": 805456, + "step": 4230 + }, + { + "epoch": 2.201143451143451, + "grad_norm": 0.04948381334543228, + "learning_rate": 4.7836422107634735e-05, + "loss": 0.0297, + "num_input_tokens_seen": 806416, + "step": 4235 + }, + { + "epoch": 2.2037422037422036, + "grad_norm": 0.042947426438331604, + "learning_rate": 4.782718406850449e-05, + "loss": 0.0254, + "num_input_tokens_seen": 807376, + "step": 4240 + }, + { + "epoch": 2.206340956340956, + "grad_norm": 21.653913497924805, + "learning_rate": 4.781792724499228e-05, + "loss": 0.1892, + "num_input_tokens_seen": 808240, + "step": 4245 + }, + { + "epoch": 2.208939708939709, + "grad_norm": 0.1238914430141449, + "learning_rate": 4.78086516447155e-05, + "loss": 0.1129, + "num_input_tokens_seen": 809264, + "step": 4250 + }, + { + "epoch": 2.2115384615384617, + "grad_norm": 0.030751729384064674, + "learning_rate": 4.779935727530699e-05, + "loss": 0.1202, + "num_input_tokens_seen": 810224, + "step": 4255 + }, + { + "epoch": 2.214137214137214, + "grad_norm": 0.03305168077349663, + "learning_rate": 4.779004414441504e-05, + "loss": 0.0897, + "num_input_tokens_seen": 811152, + "step": 4260 + }, + { + "epoch": 2.2167359667359667, + "grad_norm": 2.818458080291748, + "learning_rate": 4.77807122597034e-05, + "loss": 0.2993, + "num_input_tokens_seen": 812176, + "step": 4265 + }, + { + "epoch": 2.2193347193347193, + "grad_norm": 0.20682214200496674, + "learning_rate": 4.777136162885121e-05, + "loss": 0.0761, + "num_input_tokens_seen": 813072, + "step": 4270 + }, + { + "epoch": 2.221933471933472, + "grad_norm": 0.07688742130994797, + "learning_rate": 4.776199225955308e-05, + "loss": 0.0099, + "num_input_tokens_seen": 814064, + "step": 4275 + }, + { + "epoch": 2.2245322245322248, + "grad_norm": 5.351086139678955, + "learning_rate": 4.775260415951901e-05, + "loss": 0.1483, + "num_input_tokens_seen": 815088, + "step": 4280 + }, + { + "epoch": 2.2271309771309773, + "grad_norm": 5.253909111022949, + "learning_rate": 4.774319733647442e-05, + "loss": 0.1216, + "num_input_tokens_seen": 816048, + "step": 4285 + }, + { + "epoch": 2.22972972972973, + "grad_norm": 0.1857377141714096, + "learning_rate": 4.773377179816016e-05, + "loss": 0.0081, + "num_input_tokens_seen": 816976, + "step": 4290 + }, + { + "epoch": 2.2323284823284824, + "grad_norm": 10.619406700134277, + "learning_rate": 4.7724327552332425e-05, + "loss": 0.1963, + "num_input_tokens_seen": 818064, + "step": 4295 + }, + { + "epoch": 2.234927234927235, + "grad_norm": 0.37315526604652405, + "learning_rate": 4.771486460676288e-05, + "loss": 0.0972, + "num_input_tokens_seen": 818992, + "step": 4300 + }, + { + "epoch": 2.2375259875259874, + "grad_norm": 0.08261454105377197, + "learning_rate": 4.770538296923851e-05, + "loss": 0.1076, + "num_input_tokens_seen": 819920, + "step": 4305 + }, + { + "epoch": 2.24012474012474, + "grad_norm": 0.1412617415189743, + "learning_rate": 4.769588264756172e-05, + "loss": 0.0071, + "num_input_tokens_seen": 820816, + "step": 4310 + }, + { + "epoch": 2.242723492723493, + "grad_norm": 0.05510789528489113, + "learning_rate": 4.768636364955028e-05, + "loss": 0.0153, + "num_input_tokens_seen": 821744, + "step": 4315 + }, + { + "epoch": 2.2453222453222454, + "grad_norm": 0.3746234178543091, + "learning_rate": 4.7676825983037334e-05, + "loss": 0.1233, + "num_input_tokens_seen": 822736, + "step": 4320 + }, + { + "epoch": 2.247920997920998, + "grad_norm": 12.224924087524414, + "learning_rate": 4.766726965587137e-05, + "loss": 0.0354, + "num_input_tokens_seen": 823696, + "step": 4325 + }, + { + "epoch": 2.2505197505197505, + "grad_norm": 14.323692321777344, + "learning_rate": 4.765769467591625e-05, + "loss": 0.1132, + "num_input_tokens_seen": 824624, + "step": 4330 + }, + { + "epoch": 2.253118503118503, + "grad_norm": 0.0665869414806366, + "learning_rate": 4.764810105105119e-05, + "loss": 0.0061, + "num_input_tokens_seen": 825520, + "step": 4335 + }, + { + "epoch": 2.2557172557172556, + "grad_norm": 2.0015342235565186, + "learning_rate": 4.763848878917072e-05, + "loss": 0.1109, + "num_input_tokens_seen": 826480, + "step": 4340 + }, + { + "epoch": 2.258316008316008, + "grad_norm": 4.036377906799316, + "learning_rate": 4.762885789818473e-05, + "loss": 0.1434, + "num_input_tokens_seen": 827376, + "step": 4345 + }, + { + "epoch": 2.260914760914761, + "grad_norm": 0.14610493183135986, + "learning_rate": 4.7619208386018455e-05, + "loss": 0.0104, + "num_input_tokens_seen": 828400, + "step": 4350 + }, + { + "epoch": 2.2635135135135136, + "grad_norm": 14.813477516174316, + "learning_rate": 4.760954026061241e-05, + "loss": 0.1204, + "num_input_tokens_seen": 829392, + "step": 4355 + }, + { + "epoch": 2.266112266112266, + "grad_norm": 0.025232983753085136, + "learning_rate": 4.759985352992245e-05, + "loss": 0.0124, + "num_input_tokens_seen": 830352, + "step": 4360 + }, + { + "epoch": 2.2687110187110187, + "grad_norm": 0.5127564668655396, + "learning_rate": 4.759014820191975e-05, + "loss": 0.0806, + "num_input_tokens_seen": 831280, + "step": 4365 + }, + { + "epoch": 2.271309771309771, + "grad_norm": 0.28422045707702637, + "learning_rate": 4.758042428459078e-05, + "loss": 0.0046, + "num_input_tokens_seen": 832176, + "step": 4370 + }, + { + "epoch": 2.2739085239085237, + "grad_norm": 0.048926420509815216, + "learning_rate": 4.75706817859373e-05, + "loss": 0.0644, + "num_input_tokens_seen": 833104, + "step": 4375 + }, + { + "epoch": 2.2765072765072767, + "grad_norm": 0.043308157473802567, + "learning_rate": 4.7560920713976365e-05, + "loss": 0.3683, + "num_input_tokens_seen": 834128, + "step": 4380 + }, + { + "epoch": 2.279106029106029, + "grad_norm": 0.46644139289855957, + "learning_rate": 4.7551141076740316e-05, + "loss": 0.0249, + "num_input_tokens_seen": 835088, + "step": 4385 + }, + { + "epoch": 2.2817047817047817, + "grad_norm": 0.1468491554260254, + "learning_rate": 4.7541342882276775e-05, + "loss": 0.177, + "num_input_tokens_seen": 836080, + "step": 4390 + }, + { + "epoch": 2.2843035343035343, + "grad_norm": 1.547044277191162, + "learning_rate": 4.7531526138648616e-05, + "loss": 0.0087, + "num_input_tokens_seen": 837072, + "step": 4395 + }, + { + "epoch": 2.286902286902287, + "grad_norm": 0.19912029802799225, + "learning_rate": 4.752169085393401e-05, + "loss": 0.0053, + "num_input_tokens_seen": 838000, + "step": 4400 + }, + { + "epoch": 2.2895010395010393, + "grad_norm": 0.020301461219787598, + "learning_rate": 4.751183703622636e-05, + "loss": 0.0119, + "num_input_tokens_seen": 838928, + "step": 4405 + }, + { + "epoch": 2.2920997920997923, + "grad_norm": 6.57958984375, + "learning_rate": 4.750196469363432e-05, + "loss": 0.0963, + "num_input_tokens_seen": 839824, + "step": 4410 + }, + { + "epoch": 2.294698544698545, + "grad_norm": 0.038671668618917465, + "learning_rate": 4.74920738342818e-05, + "loss": 0.2834, + "num_input_tokens_seen": 840816, + "step": 4415 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.09411406517028809, + "learning_rate": 4.748216446630794e-05, + "loss": 0.1258, + "num_input_tokens_seen": 841712, + "step": 4420 + }, + { + "epoch": 2.29989604989605, + "grad_norm": 0.04709600284695625, + "learning_rate": 4.747223659786711e-05, + "loss": 0.0346, + "num_input_tokens_seen": 842704, + "step": 4425 + }, + { + "epoch": 2.3024948024948024, + "grad_norm": 9.060857772827148, + "learning_rate": 4.74622902371289e-05, + "loss": 0.1601, + "num_input_tokens_seen": 843664, + "step": 4430 + }, + { + "epoch": 2.305093555093555, + "grad_norm": 0.11800925433635712, + "learning_rate": 4.7452325392278144e-05, + "loss": 0.0058, + "num_input_tokens_seen": 844624, + "step": 4435 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.054471757262945175, + "learning_rate": 4.744234207151484e-05, + "loss": 0.206, + "num_input_tokens_seen": 845584, + "step": 4440 + }, + { + "epoch": 2.3102910602910605, + "grad_norm": 0.08552934974431992, + "learning_rate": 4.743234028305424e-05, + "loss": 0.0186, + "num_input_tokens_seen": 846544, + "step": 4445 + }, + { + "epoch": 2.312889812889813, + "grad_norm": 0.1577637493610382, + "learning_rate": 4.742232003512675e-05, + "loss": 0.0564, + "num_input_tokens_seen": 847472, + "step": 4450 + }, + { + "epoch": 2.3154885654885655, + "grad_norm": 1.9308953285217285, + "learning_rate": 4.741228133597799e-05, + "loss": 0.0252, + "num_input_tokens_seen": 848432, + "step": 4455 + }, + { + "epoch": 2.318087318087318, + "grad_norm": 2.9594836235046387, + "learning_rate": 4.740222419386876e-05, + "loss": 0.086, + "num_input_tokens_seen": 849392, + "step": 4460 + }, + { + "epoch": 2.3206860706860706, + "grad_norm": 3.077136754989624, + "learning_rate": 4.7392148617075036e-05, + "loss": 0.0639, + "num_input_tokens_seen": 850320, + "step": 4465 + }, + { + "epoch": 2.323284823284823, + "grad_norm": 15.069658279418945, + "learning_rate": 4.738205461388796e-05, + "loss": 0.1572, + "num_input_tokens_seen": 851280, + "step": 4470 + }, + { + "epoch": 2.3258835758835756, + "grad_norm": 0.02548670955002308, + "learning_rate": 4.7371942192613836e-05, + "loss": 0.0053, + "num_input_tokens_seen": 852272, + "step": 4475 + }, + { + "epoch": 2.3284823284823286, + "grad_norm": 0.04467194527387619, + "learning_rate": 4.7361811361574134e-05, + "loss": 0.0061, + "num_input_tokens_seen": 853264, + "step": 4480 + }, + { + "epoch": 2.331081081081081, + "grad_norm": 0.6000943779945374, + "learning_rate": 4.735166212910547e-05, + "loss": 0.0879, + "num_input_tokens_seen": 854224, + "step": 4485 + }, + { + "epoch": 2.3336798336798337, + "grad_norm": 10.819910049438477, + "learning_rate": 4.7341494503559594e-05, + "loss": 0.1978, + "num_input_tokens_seen": 855152, + "step": 4490 + }, + { + "epoch": 2.336278586278586, + "grad_norm": 4.765232563018799, + "learning_rate": 4.73313084933034e-05, + "loss": 0.2546, + "num_input_tokens_seen": 856208, + "step": 4495 + }, + { + "epoch": 2.3388773388773387, + "grad_norm": 0.09217492491006851, + "learning_rate": 4.7321104106718895e-05, + "loss": 0.0091, + "num_input_tokens_seen": 857200, + "step": 4500 + }, + { + "epoch": 2.3414760914760917, + "grad_norm": 0.02379038743674755, + "learning_rate": 4.731088135220324e-05, + "loss": 0.0054, + "num_input_tokens_seen": 858128, + "step": 4505 + }, + { + "epoch": 2.3440748440748442, + "grad_norm": 0.0264719408005476, + "learning_rate": 4.730064023816867e-05, + "loss": 0.0508, + "num_input_tokens_seen": 859024, + "step": 4510 + }, + { + "epoch": 2.3466735966735968, + "grad_norm": 2.2440667152404785, + "learning_rate": 4.7290380773042575e-05, + "loss": 0.0941, + "num_input_tokens_seen": 859952, + "step": 4515 + }, + { + "epoch": 2.3492723492723493, + "grad_norm": 0.04155071824789047, + "learning_rate": 4.72801029652674e-05, + "loss": 0.0061, + "num_input_tokens_seen": 860848, + "step": 4520 + }, + { + "epoch": 2.351871101871102, + "grad_norm": 0.07173734903335571, + "learning_rate": 4.726980682330071e-05, + "loss": 0.109, + "num_input_tokens_seen": 861840, + "step": 4525 + }, + { + "epoch": 2.3544698544698544, + "grad_norm": 0.17083364725112915, + "learning_rate": 4.725949235561516e-05, + "loss": 0.096, + "num_input_tokens_seen": 862800, + "step": 4530 + }, + { + "epoch": 2.357068607068607, + "grad_norm": 18.39315414428711, + "learning_rate": 4.7249159570698466e-05, + "loss": 0.0841, + "num_input_tokens_seen": 863760, + "step": 4535 + }, + { + "epoch": 2.35966735966736, + "grad_norm": 0.12055860459804535, + "learning_rate": 4.723880847705343e-05, + "loss": 0.0065, + "num_input_tokens_seen": 864720, + "step": 4540 + }, + { + "epoch": 2.3622661122661124, + "grad_norm": 0.2442387491464615, + "learning_rate": 4.722843908319792e-05, + "loss": 0.0095, + "num_input_tokens_seen": 865680, + "step": 4545 + }, + { + "epoch": 2.364864864864865, + "grad_norm": 0.04331092908978462, + "learning_rate": 4.721805139766486e-05, + "loss": 0.007, + "num_input_tokens_seen": 866576, + "step": 4550 + }, + { + "epoch": 2.3674636174636174, + "grad_norm": 0.03578382357954979, + "learning_rate": 4.7207645429002226e-05, + "loss": 0.0496, + "num_input_tokens_seen": 867504, + "step": 4555 + }, + { + "epoch": 2.37006237006237, + "grad_norm": 0.12361709773540497, + "learning_rate": 4.719722118577305e-05, + "loss": 0.0032, + "num_input_tokens_seen": 868368, + "step": 4560 + }, + { + "epoch": 2.3726611226611225, + "grad_norm": 3.8412275314331055, + "learning_rate": 4.718677867655538e-05, + "loss": 0.1862, + "num_input_tokens_seen": 869328, + "step": 4565 + }, + { + "epoch": 2.375259875259875, + "grad_norm": 0.09591829031705856, + "learning_rate": 4.717631790994231e-05, + "loss": 0.1025, + "num_input_tokens_seen": 870256, + "step": 4570 + }, + { + "epoch": 2.377858627858628, + "grad_norm": 0.10119776427745819, + "learning_rate": 4.716583889454197e-05, + "loss": 0.1203, + "num_input_tokens_seen": 871280, + "step": 4575 + }, + { + "epoch": 2.3804573804573805, + "grad_norm": 0.10597172379493713, + "learning_rate": 4.7155341638977475e-05, + "loss": 0.1043, + "num_input_tokens_seen": 872208, + "step": 4580 + }, + { + "epoch": 2.383056133056133, + "grad_norm": 0.1686752885580063, + "learning_rate": 4.714482615188697e-05, + "loss": 0.0079, + "num_input_tokens_seen": 873264, + "step": 4585 + }, + { + "epoch": 2.3856548856548856, + "grad_norm": 0.12055522948503494, + "learning_rate": 4.71342924419236e-05, + "loss": 0.2297, + "num_input_tokens_seen": 874288, + "step": 4590 + }, + { + "epoch": 2.388253638253638, + "grad_norm": 10.662604331970215, + "learning_rate": 4.712374051775551e-05, + "loss": 0.14, + "num_input_tokens_seen": 875216, + "step": 4595 + }, + { + "epoch": 2.390852390852391, + "grad_norm": 3.152146339416504, + "learning_rate": 4.7113170388065833e-05, + "loss": 0.2305, + "num_input_tokens_seen": 876176, + "step": 4600 + }, + { + "epoch": 2.3934511434511436, + "grad_norm": 1.1844756603240967, + "learning_rate": 4.710258206155266e-05, + "loss": 0.014, + "num_input_tokens_seen": 877104, + "step": 4605 + }, + { + "epoch": 2.396049896049896, + "grad_norm": 2.0207340717315674, + "learning_rate": 4.7091975546929093e-05, + "loss": 0.0844, + "num_input_tokens_seen": 878096, + "step": 4610 + }, + { + "epoch": 2.3986486486486487, + "grad_norm": 1.6012383699417114, + "learning_rate": 4.7081350852923177e-05, + "loss": 0.1408, + "num_input_tokens_seen": 879024, + "step": 4615 + }, + { + "epoch": 2.401247401247401, + "grad_norm": 0.37375375628471375, + "learning_rate": 4.707070798827792e-05, + "loss": 0.035, + "num_input_tokens_seen": 879952, + "step": 4620 + }, + { + "epoch": 2.4038461538461537, + "grad_norm": 29.9837589263916, + "learning_rate": 4.7060046961751294e-05, + "loss": 0.0555, + "num_input_tokens_seen": 880912, + "step": 4625 + }, + { + "epoch": 2.4064449064449063, + "grad_norm": 9.771427154541016, + "learning_rate": 4.704936778211619e-05, + "loss": 0.2041, + "num_input_tokens_seen": 881840, + "step": 4630 + }, + { + "epoch": 2.4090436590436592, + "grad_norm": 0.09606888890266418, + "learning_rate": 4.703867045816047e-05, + "loss": 0.0076, + "num_input_tokens_seen": 882768, + "step": 4635 + }, + { + "epoch": 2.4116424116424118, + "grad_norm": 0.10198366641998291, + "learning_rate": 4.702795499868691e-05, + "loss": 0.0968, + "num_input_tokens_seen": 883664, + "step": 4640 + }, + { + "epoch": 2.4142411642411643, + "grad_norm": 0.10274895280599594, + "learning_rate": 4.70172214125132e-05, + "loss": 0.0711, + "num_input_tokens_seen": 884656, + "step": 4645 + }, + { + "epoch": 2.416839916839917, + "grad_norm": 0.16772986948490143, + "learning_rate": 4.700646970847197e-05, + "loss": 0.0782, + "num_input_tokens_seen": 885648, + "step": 4650 + }, + { + "epoch": 2.4194386694386694, + "grad_norm": 25.765504837036133, + "learning_rate": 4.699569989541074e-05, + "loss": 0.0297, + "num_input_tokens_seen": 886576, + "step": 4655 + }, + { + "epoch": 2.422037422037422, + "grad_norm": 0.18743424117565155, + "learning_rate": 4.6984911982191936e-05, + "loss": 0.0375, + "num_input_tokens_seen": 887568, + "step": 4660 + }, + { + "epoch": 2.4246361746361744, + "grad_norm": 0.065270334482193, + "learning_rate": 4.6974105977692884e-05, + "loss": 0.0039, + "num_input_tokens_seen": 888496, + "step": 4665 + }, + { + "epoch": 2.4272349272349274, + "grad_norm": 0.06537817418575287, + "learning_rate": 4.69632818908058e-05, + "loss": 0.124, + "num_input_tokens_seen": 889424, + "step": 4670 + }, + { + "epoch": 2.42983367983368, + "grad_norm": 0.11525186896324158, + "learning_rate": 4.695243973043777e-05, + "loss": 0.0029, + "num_input_tokens_seen": 890384, + "step": 4675 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.04207899793982506, + "learning_rate": 4.694157950551075e-05, + "loss": 0.0031, + "num_input_tokens_seen": 891408, + "step": 4680 + }, + { + "epoch": 2.435031185031185, + "grad_norm": 0.10669717192649841, + "learning_rate": 4.6930701224961573e-05, + "loss": 0.019, + "num_input_tokens_seen": 892400, + "step": 4685 + }, + { + "epoch": 2.4376299376299375, + "grad_norm": 0.018446968868374825, + "learning_rate": 4.6919804897741925e-05, + "loss": 0.0027, + "num_input_tokens_seen": 893360, + "step": 4690 + }, + { + "epoch": 2.44022869022869, + "grad_norm": 0.024158647283911705, + "learning_rate": 4.690889053281834e-05, + "loss": 0.1898, + "num_input_tokens_seen": 894256, + "step": 4695 + }, + { + "epoch": 2.442827442827443, + "grad_norm": 0.14778365194797516, + "learning_rate": 4.68979581391722e-05, + "loss": 0.1375, + "num_input_tokens_seen": 895280, + "step": 4700 + }, + { + "epoch": 2.4454261954261955, + "grad_norm": 0.2697875201702118, + "learning_rate": 4.688700772579972e-05, + "loss": 0.0056, + "num_input_tokens_seen": 896272, + "step": 4705 + }, + { + "epoch": 2.448024948024948, + "grad_norm": 18.331607818603516, + "learning_rate": 4.687603930171194e-05, + "loss": 0.1772, + "num_input_tokens_seen": 897264, + "step": 4710 + }, + { + "epoch": 2.4506237006237006, + "grad_norm": 1.0125564336776733, + "learning_rate": 4.686505287593473e-05, + "loss": 0.1245, + "num_input_tokens_seen": 898160, + "step": 4715 + }, + { + "epoch": 2.453222453222453, + "grad_norm": 0.18594932556152344, + "learning_rate": 4.685404845750877e-05, + "loss": 0.0051, + "num_input_tokens_seen": 899184, + "step": 4720 + }, + { + "epoch": 2.4558212058212057, + "grad_norm": 0.02309310808777809, + "learning_rate": 4.6843026055489524e-05, + "loss": 0.1804, + "num_input_tokens_seen": 900112, + "step": 4725 + }, + { + "epoch": 2.4584199584199586, + "grad_norm": 0.2179414927959442, + "learning_rate": 4.6831985678947296e-05, + "loss": 0.1174, + "num_input_tokens_seen": 901040, + "step": 4730 + }, + { + "epoch": 2.461018711018711, + "grad_norm": 0.18342284858226776, + "learning_rate": 4.682092733696716e-05, + "loss": 0.0206, + "num_input_tokens_seen": 902000, + "step": 4735 + }, + { + "epoch": 2.4636174636174637, + "grad_norm": 0.06208100542426109, + "learning_rate": 4.680985103864896e-05, + "loss": 0.0914, + "num_input_tokens_seen": 902992, + "step": 4740 + }, + { + "epoch": 2.4662162162162162, + "grad_norm": 0.21459035575389862, + "learning_rate": 4.679875679310734e-05, + "loss": 0.2546, + "num_input_tokens_seen": 904016, + "step": 4745 + }, + { + "epoch": 2.4688149688149688, + "grad_norm": 0.0843406692147255, + "learning_rate": 4.67876446094717e-05, + "loss": 0.0202, + "num_input_tokens_seen": 905008, + "step": 4750 + }, + { + "epoch": 2.4714137214137213, + "grad_norm": 3.8145720958709717, + "learning_rate": 4.677651449688619e-05, + "loss": 0.1047, + "num_input_tokens_seen": 905936, + "step": 4755 + }, + { + "epoch": 2.474012474012474, + "grad_norm": 9.281464576721191, + "learning_rate": 4.676536646450975e-05, + "loss": 0.0823, + "num_input_tokens_seen": 906960, + "step": 4760 + }, + { + "epoch": 2.476611226611227, + "grad_norm": 0.04598342627286911, + "learning_rate": 4.675420052151603e-05, + "loss": 0.1042, + "num_input_tokens_seen": 907888, + "step": 4765 + }, + { + "epoch": 2.4792099792099793, + "grad_norm": 0.07173620909452438, + "learning_rate": 4.674301667709343e-05, + "loss": 0.0093, + "num_input_tokens_seen": 908880, + "step": 4770 + }, + { + "epoch": 2.481808731808732, + "grad_norm": 0.16271740198135376, + "learning_rate": 4.673181494044509e-05, + "loss": 0.0302, + "num_input_tokens_seen": 909872, + "step": 4775 + }, + { + "epoch": 2.4844074844074844, + "grad_norm": 7.457341194152832, + "learning_rate": 4.672059532078886e-05, + "loss": 0.3757, + "num_input_tokens_seen": 910864, + "step": 4780 + }, + { + "epoch": 2.487006237006237, + "grad_norm": 1.617796540260315, + "learning_rate": 4.670935782735732e-05, + "loss": 0.0073, + "num_input_tokens_seen": 911792, + "step": 4785 + }, + { + "epoch": 2.4896049896049894, + "grad_norm": 0.08107176423072815, + "learning_rate": 4.669810246939774e-05, + "loss": 0.0044, + "num_input_tokens_seen": 912752, + "step": 4790 + }, + { + "epoch": 2.492203742203742, + "grad_norm": 0.06011596694588661, + "learning_rate": 4.668682925617211e-05, + "loss": 0.0046, + "num_input_tokens_seen": 913616, + "step": 4795 + }, + { + "epoch": 2.494802494802495, + "grad_norm": 16.098913192749023, + "learning_rate": 4.6675538196957096e-05, + "loss": 0.1246, + "num_input_tokens_seen": 914512, + "step": 4800 + }, + { + "epoch": 2.4974012474012475, + "grad_norm": 7.853962421417236, + "learning_rate": 4.666422930104406e-05, + "loss": 0.0641, + "num_input_tokens_seen": 915472, + "step": 4805 + }, + { + "epoch": 2.5, + "grad_norm": 5.180329322814941, + "learning_rate": 4.665290257773904e-05, + "loss": 0.1692, + "num_input_tokens_seen": 916368, + "step": 4810 + }, + { + "epoch": 2.5, + "eval_loss": 0.19045723974704742, + "eval_runtime": 9.2862, + "eval_samples_per_second": 92.179, + "eval_steps_per_second": 23.045, + "num_input_tokens_seen": 916368, + "step": 4810 + }, + { + "epoch": 2.5025987525987525, + "grad_norm": 0.06201736629009247, + "learning_rate": 4.6641558036362754e-05, + "loss": 0.0061, + "num_input_tokens_seen": 917328, + "step": 4815 + }, + { + "epoch": 2.505197505197505, + "grad_norm": 0.054649367928504944, + "learning_rate": 4.663019568625055e-05, + "loss": 0.0198, + "num_input_tokens_seen": 918256, + "step": 4820 + }, + { + "epoch": 2.507796257796258, + "grad_norm": 0.09196235984563828, + "learning_rate": 4.661881553675247e-05, + "loss": 0.0065, + "num_input_tokens_seen": 919248, + "step": 4825 + }, + { + "epoch": 2.51039501039501, + "grad_norm": 0.03499903902411461, + "learning_rate": 4.660741759723317e-05, + "loss": 0.0388, + "num_input_tokens_seen": 920240, + "step": 4830 + }, + { + "epoch": 2.512993762993763, + "grad_norm": 2.011834144592285, + "learning_rate": 4.6596001877071976e-05, + "loss": 0.1182, + "num_input_tokens_seen": 921200, + "step": 4835 + }, + { + "epoch": 2.5155925155925156, + "grad_norm": 6.793698310852051, + "learning_rate": 4.658456838566282e-05, + "loss": 0.0848, + "num_input_tokens_seen": 922128, + "step": 4840 + }, + { + "epoch": 2.518191268191268, + "grad_norm": 0.05358162894845009, + "learning_rate": 4.657311713241427e-05, + "loss": 0.1057, + "num_input_tokens_seen": 923088, + "step": 4845 + }, + { + "epoch": 2.5207900207900207, + "grad_norm": 0.59412682056427, + "learning_rate": 4.656164812674951e-05, + "loss": 0.0051, + "num_input_tokens_seen": 924016, + "step": 4850 + }, + { + "epoch": 2.523388773388773, + "grad_norm": 10.61545467376709, + "learning_rate": 4.655016137810634e-05, + "loss": 0.0762, + "num_input_tokens_seen": 924976, + "step": 4855 + }, + { + "epoch": 2.525987525987526, + "grad_norm": 6.554389953613281, + "learning_rate": 4.6538656895937135e-05, + "loss": 0.1214, + "num_input_tokens_seen": 925968, + "step": 4860 + }, + { + "epoch": 2.5285862785862787, + "grad_norm": 0.16170905530452728, + "learning_rate": 4.652713468970888e-05, + "loss": 0.1943, + "num_input_tokens_seen": 926960, + "step": 4865 + }, + { + "epoch": 2.5311850311850312, + "grad_norm": 0.14027252793312073, + "learning_rate": 4.651559476890315e-05, + "loss": 0.0095, + "num_input_tokens_seen": 927920, + "step": 4870 + }, + { + "epoch": 2.5337837837837838, + "grad_norm": 0.04390488564968109, + "learning_rate": 4.65040371430161e-05, + "loss": 0.1065, + "num_input_tokens_seen": 928848, + "step": 4875 + }, + { + "epoch": 2.5363825363825363, + "grad_norm": 0.04181057587265968, + "learning_rate": 4.6492461821558434e-05, + "loss": 0.0591, + "num_input_tokens_seen": 929776, + "step": 4880 + }, + { + "epoch": 2.538981288981289, + "grad_norm": 0.06501344591379166, + "learning_rate": 4.6480868814055424e-05, + "loss": 0.1202, + "num_input_tokens_seen": 930736, + "step": 4885 + }, + { + "epoch": 2.5415800415800414, + "grad_norm": 0.074299156665802, + "learning_rate": 4.646925813004691e-05, + "loss": 0.1036, + "num_input_tokens_seen": 931728, + "step": 4890 + }, + { + "epoch": 2.5441787941787943, + "grad_norm": 2.7888224124908447, + "learning_rate": 4.645762977908728e-05, + "loss": 0.2097, + "num_input_tokens_seen": 932624, + "step": 4895 + }, + { + "epoch": 2.546777546777547, + "grad_norm": 0.8382644057273865, + "learning_rate": 4.644598377074543e-05, + "loss": 0.1215, + "num_input_tokens_seen": 933552, + "step": 4900 + }, + { + "epoch": 2.5493762993762994, + "grad_norm": 0.16533516347408295, + "learning_rate": 4.64343201146048e-05, + "loss": 0.0134, + "num_input_tokens_seen": 934480, + "step": 4905 + }, + { + "epoch": 2.551975051975052, + "grad_norm": 0.09869053214788437, + "learning_rate": 4.642263882026339e-05, + "loss": 0.033, + "num_input_tokens_seen": 935440, + "step": 4910 + }, + { + "epoch": 2.5545738045738045, + "grad_norm": 0.02949225716292858, + "learning_rate": 4.6410939897333646e-05, + "loss": 0.0502, + "num_input_tokens_seen": 936368, + "step": 4915 + }, + { + "epoch": 2.5571725571725574, + "grad_norm": 0.2780553996562958, + "learning_rate": 4.639922335544258e-05, + "loss": 0.1009, + "num_input_tokens_seen": 937296, + "step": 4920 + }, + { + "epoch": 2.5597713097713095, + "grad_norm": 0.07585831731557846, + "learning_rate": 4.638748920423167e-05, + "loss": 0.2505, + "num_input_tokens_seen": 938224, + "step": 4925 + }, + { + "epoch": 2.5623700623700625, + "grad_norm": 0.29850223660469055, + "learning_rate": 4.637573745335691e-05, + "loss": 0.0561, + "num_input_tokens_seen": 939184, + "step": 4930 + }, + { + "epoch": 2.564968814968815, + "grad_norm": 0.13538387417793274, + "learning_rate": 4.636396811248872e-05, + "loss": 0.0205, + "num_input_tokens_seen": 940080, + "step": 4935 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.814620316028595, + "learning_rate": 4.635218119131207e-05, + "loss": 0.0803, + "num_input_tokens_seen": 941168, + "step": 4940 + }, + { + "epoch": 2.57016632016632, + "grad_norm": 0.6555452942848206, + "learning_rate": 4.6340376699526356e-05, + "loss": 0.0402, + "num_input_tokens_seen": 942192, + "step": 4945 + }, + { + "epoch": 2.5727650727650726, + "grad_norm": 4.063392162322998, + "learning_rate": 4.6328554646845434e-05, + "loss": 0.307, + "num_input_tokens_seen": 943120, + "step": 4950 + }, + { + "epoch": 2.5753638253638256, + "grad_norm": 0.7387357950210571, + "learning_rate": 4.631671504299762e-05, + "loss": 0.0646, + "num_input_tokens_seen": 944112, + "step": 4955 + }, + { + "epoch": 2.577962577962578, + "grad_norm": 0.025433789938688278, + "learning_rate": 4.6304857897725653e-05, + "loss": 0.0058, + "num_input_tokens_seen": 945200, + "step": 4960 + }, + { + "epoch": 2.5805613305613306, + "grad_norm": 0.07412973046302795, + "learning_rate": 4.629298322078674e-05, + "loss": 0.0557, + "num_input_tokens_seen": 946160, + "step": 4965 + }, + { + "epoch": 2.583160083160083, + "grad_norm": 3.3500585556030273, + "learning_rate": 4.628109102195249e-05, + "loss": 0.0828, + "num_input_tokens_seen": 947056, + "step": 4970 + }, + { + "epoch": 2.5857588357588357, + "grad_norm": 0.0376429557800293, + "learning_rate": 4.626918131100894e-05, + "loss": 0.0836, + "num_input_tokens_seen": 947952, + "step": 4975 + }, + { + "epoch": 2.5883575883575882, + "grad_norm": 0.21033258736133575, + "learning_rate": 4.625725409775652e-05, + "loss": 0.0047, + "num_input_tokens_seen": 948848, + "step": 4980 + }, + { + "epoch": 2.5909563409563408, + "grad_norm": 0.06059103459119797, + "learning_rate": 4.6245309392010094e-05, + "loss": 0.0063, + "num_input_tokens_seen": 949872, + "step": 4985 + }, + { + "epoch": 2.5935550935550937, + "grad_norm": 0.029823165386915207, + "learning_rate": 4.6233347203598896e-05, + "loss": 0.1354, + "num_input_tokens_seen": 950800, + "step": 4990 + }, + { + "epoch": 2.5961538461538463, + "grad_norm": 6.000887870788574, + "learning_rate": 4.622136754236657e-05, + "loss": 0.0097, + "num_input_tokens_seen": 951856, + "step": 4995 + }, + { + "epoch": 2.598752598752599, + "grad_norm": 0.044555071741342545, + "learning_rate": 4.62093704181711e-05, + "loss": 0.1531, + "num_input_tokens_seen": 952752, + "step": 5000 + }, + { + "epoch": 2.6013513513513513, + "grad_norm": 0.13712938129901886, + "learning_rate": 4.619735584088487e-05, + "loss": 0.004, + "num_input_tokens_seen": 953744, + "step": 5005 + }, + { + "epoch": 2.603950103950104, + "grad_norm": 0.1159396767616272, + "learning_rate": 4.618532382039463e-05, + "loss": 0.1945, + "num_input_tokens_seen": 954672, + "step": 5010 + }, + { + "epoch": 2.606548856548857, + "grad_norm": 0.10287028551101685, + "learning_rate": 4.6173274366601466e-05, + "loss": 0.0039, + "num_input_tokens_seen": 955568, + "step": 5015 + }, + { + "epoch": 2.609147609147609, + "grad_norm": 0.05800308659672737, + "learning_rate": 4.616120748942081e-05, + "loss": 0.0075, + "num_input_tokens_seen": 956592, + "step": 5020 + }, + { + "epoch": 2.611746361746362, + "grad_norm": 0.5042757391929626, + "learning_rate": 4.614912319878244e-05, + "loss": 0.2301, + "num_input_tokens_seen": 957584, + "step": 5025 + }, + { + "epoch": 2.6143451143451144, + "grad_norm": 0.10460629314184189, + "learning_rate": 4.6137021504630476e-05, + "loss": 0.0497, + "num_input_tokens_seen": 958640, + "step": 5030 + }, + { + "epoch": 2.616943866943867, + "grad_norm": 3.4143621921539307, + "learning_rate": 4.612490241692332e-05, + "loss": 0.2774, + "num_input_tokens_seen": 959600, + "step": 5035 + }, + { + "epoch": 2.6195426195426195, + "grad_norm": 0.030767502263188362, + "learning_rate": 4.611276594563374e-05, + "loss": 0.1423, + "num_input_tokens_seen": 960496, + "step": 5040 + }, + { + "epoch": 2.622141372141372, + "grad_norm": 3.5894155502319336, + "learning_rate": 4.6100612100748765e-05, + "loss": 0.1021, + "num_input_tokens_seen": 961584, + "step": 5045 + }, + { + "epoch": 2.624740124740125, + "grad_norm": 0.13618576526641846, + "learning_rate": 4.608844089226974e-05, + "loss": 0.1407, + "num_input_tokens_seen": 962544, + "step": 5050 + }, + { + "epoch": 2.6273388773388775, + "grad_norm": 3.2297770977020264, + "learning_rate": 4.607625233021228e-05, + "loss": 0.1792, + "num_input_tokens_seen": 963472, + "step": 5055 + }, + { + "epoch": 2.62993762993763, + "grad_norm": 6.636751651763916, + "learning_rate": 4.6064046424606324e-05, + "loss": 0.047, + "num_input_tokens_seen": 964304, + "step": 5060 + }, + { + "epoch": 2.6325363825363826, + "grad_norm": 2.1614227294921875, + "learning_rate": 4.605182318549602e-05, + "loss": 0.084, + "num_input_tokens_seen": 965232, + "step": 5065 + }, + { + "epoch": 2.635135135135135, + "grad_norm": 0.13501305878162384, + "learning_rate": 4.6039582622939854e-05, + "loss": 0.027, + "num_input_tokens_seen": 966160, + "step": 5070 + }, + { + "epoch": 2.6377338877338876, + "grad_norm": 0.33562830090522766, + "learning_rate": 4.602732474701049e-05, + "loss": 0.0838, + "num_input_tokens_seen": 967088, + "step": 5075 + }, + { + "epoch": 2.64033264033264, + "grad_norm": 0.08091291785240173, + "learning_rate": 4.60150495677949e-05, + "loss": 0.1544, + "num_input_tokens_seen": 968080, + "step": 5080 + }, + { + "epoch": 2.642931392931393, + "grad_norm": 0.1849038004875183, + "learning_rate": 4.600275709539427e-05, + "loss": 0.0067, + "num_input_tokens_seen": 968944, + "step": 5085 + }, + { + "epoch": 2.6455301455301456, + "grad_norm": 4.259855270385742, + "learning_rate": 4.5990447339924e-05, + "loss": 0.175, + "num_input_tokens_seen": 969904, + "step": 5090 + }, + { + "epoch": 2.648128898128898, + "grad_norm": 1.1868577003479004, + "learning_rate": 4.597812031151374e-05, + "loss": 0.0087, + "num_input_tokens_seen": 970928, + "step": 5095 + }, + { + "epoch": 2.6507276507276507, + "grad_norm": 0.329679012298584, + "learning_rate": 4.5965776020307344e-05, + "loss": 0.1822, + "num_input_tokens_seen": 971856, + "step": 5100 + }, + { + "epoch": 2.6533264033264032, + "grad_norm": 3.65557861328125, + "learning_rate": 4.5953414476462875e-05, + "loss": 0.1283, + "num_input_tokens_seen": 972816, + "step": 5105 + }, + { + "epoch": 2.6559251559251558, + "grad_norm": 0.4404292404651642, + "learning_rate": 4.594103569015258e-05, + "loss": 0.0847, + "num_input_tokens_seen": 973776, + "step": 5110 + }, + { + "epoch": 2.6585239085239083, + "grad_norm": 13.708088874816895, + "learning_rate": 4.592863967156291e-05, + "loss": 0.1763, + "num_input_tokens_seen": 974768, + "step": 5115 + }, + { + "epoch": 2.6611226611226613, + "grad_norm": 3.557225227355957, + "learning_rate": 4.5916226430894494e-05, + "loss": 0.2447, + "num_input_tokens_seen": 975696, + "step": 5120 + }, + { + "epoch": 2.663721413721414, + "grad_norm": 0.10005206614732742, + "learning_rate": 4.590379597836212e-05, + "loss": 0.0361, + "num_input_tokens_seen": 976592, + "step": 5125 + }, + { + "epoch": 2.6663201663201663, + "grad_norm": 0.171956866979599, + "learning_rate": 4.589134832419475e-05, + "loss": 0.0477, + "num_input_tokens_seen": 977552, + "step": 5130 + }, + { + "epoch": 2.668918918918919, + "grad_norm": 14.426977157592773, + "learning_rate": 4.58788834786355e-05, + "loss": 0.3746, + "num_input_tokens_seen": 978576, + "step": 5135 + }, + { + "epoch": 2.6715176715176714, + "grad_norm": 0.32744070887565613, + "learning_rate": 4.586640145194164e-05, + "loss": 0.0108, + "num_input_tokens_seen": 979536, + "step": 5140 + }, + { + "epoch": 2.6741164241164244, + "grad_norm": 40.580230712890625, + "learning_rate": 4.5853902254384564e-05, + "loss": 0.2631, + "num_input_tokens_seen": 980560, + "step": 5145 + }, + { + "epoch": 2.6767151767151764, + "grad_norm": 0.07304979115724564, + "learning_rate": 4.584138589624981e-05, + "loss": 0.0617, + "num_input_tokens_seen": 981520, + "step": 5150 + }, + { + "epoch": 2.6793139293139294, + "grad_norm": 0.2582721710205078, + "learning_rate": 4.582885238783702e-05, + "loss": 0.0868, + "num_input_tokens_seen": 982576, + "step": 5155 + }, + { + "epoch": 2.681912681912682, + "grad_norm": 0.09892959147691727, + "learning_rate": 4.581630173945995e-05, + "loss": 0.2472, + "num_input_tokens_seen": 983536, + "step": 5160 + }, + { + "epoch": 2.6845114345114345, + "grad_norm": 0.10174065083265305, + "learning_rate": 4.58037339614465e-05, + "loss": 0.1314, + "num_input_tokens_seen": 984464, + "step": 5165 + }, + { + "epoch": 2.687110187110187, + "grad_norm": 2.6686747074127197, + "learning_rate": 4.579114906413861e-05, + "loss": 0.2006, + "num_input_tokens_seen": 985424, + "step": 5170 + }, + { + "epoch": 2.6897089397089395, + "grad_norm": 0.3300897777080536, + "learning_rate": 4.577854705789234e-05, + "loss": 0.1432, + "num_input_tokens_seen": 986384, + "step": 5175 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 8.437013626098633, + "learning_rate": 4.576592795307782e-05, + "loss": 0.0883, + "num_input_tokens_seen": 987408, + "step": 5180 + }, + { + "epoch": 2.694906444906445, + "grad_norm": 0.19113165140151978, + "learning_rate": 4.5753291760079265e-05, + "loss": 0.1228, + "num_input_tokens_seen": 988368, + "step": 5185 + }, + { + "epoch": 2.6975051975051976, + "grad_norm": 0.1130053922533989, + "learning_rate": 4.5740638489294915e-05, + "loss": 0.0532, + "num_input_tokens_seen": 989328, + "step": 5190 + }, + { + "epoch": 2.70010395010395, + "grad_norm": 0.06286972761154175, + "learning_rate": 4.5727968151137104e-05, + "loss": 0.1111, + "num_input_tokens_seen": 990224, + "step": 5195 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.060872483998537064, + "learning_rate": 4.5715280756032184e-05, + "loss": 0.0864, + "num_input_tokens_seen": 991152, + "step": 5200 + }, + { + "epoch": 2.705301455301455, + "grad_norm": 0.36976560950279236, + "learning_rate": 4.5702576314420554e-05, + "loss": 0.0867, + "num_input_tokens_seen": 992080, + "step": 5205 + }, + { + "epoch": 2.7079002079002077, + "grad_norm": 6.455740928649902, + "learning_rate": 4.5689854836756654e-05, + "loss": 0.1483, + "num_input_tokens_seen": 993072, + "step": 5210 + }, + { + "epoch": 2.7104989604989607, + "grad_norm": 2.015890121459961, + "learning_rate": 4.567711633350891e-05, + "loss": 0.2259, + "num_input_tokens_seen": 994096, + "step": 5215 + }, + { + "epoch": 2.713097713097713, + "grad_norm": 0.12520432472229004, + "learning_rate": 4.5664360815159775e-05, + "loss": 0.1184, + "num_input_tokens_seen": 995088, + "step": 5220 + }, + { + "epoch": 2.7156964656964657, + "grad_norm": 1.9031082391738892, + "learning_rate": 4.5651588292205715e-05, + "loss": 0.176, + "num_input_tokens_seen": 996016, + "step": 5225 + }, + { + "epoch": 2.7182952182952183, + "grad_norm": 1.419081211090088, + "learning_rate": 4.5638798775157176e-05, + "loss": 0.1034, + "num_input_tokens_seen": 996976, + "step": 5230 + }, + { + "epoch": 2.720893970893971, + "grad_norm": 0.43248623609542847, + "learning_rate": 4.562599227453859e-05, + "loss": 0.117, + "num_input_tokens_seen": 997904, + "step": 5235 + }, + { + "epoch": 2.7234927234927238, + "grad_norm": 4.123561382293701, + "learning_rate": 4.5613168800888366e-05, + "loss": 0.1065, + "num_input_tokens_seen": 998832, + "step": 5240 + }, + { + "epoch": 2.726091476091476, + "grad_norm": 44.17076110839844, + "learning_rate": 4.560032836475888e-05, + "loss": 0.1471, + "num_input_tokens_seen": 999824, + "step": 5245 + }, + { + "epoch": 2.728690228690229, + "grad_norm": 0.12490683048963547, + "learning_rate": 4.5587470976716464e-05, + "loss": 0.0859, + "num_input_tokens_seen": 1000784, + "step": 5250 + }, + { + "epoch": 2.7312889812889813, + "grad_norm": 0.7141446471214294, + "learning_rate": 4.557459664734141e-05, + "loss": 0.08, + "num_input_tokens_seen": 1001680, + "step": 5255 + }, + { + "epoch": 2.733887733887734, + "grad_norm": 0.06171558424830437, + "learning_rate": 4.556170538722794e-05, + "loss": 0.097, + "num_input_tokens_seen": 1002640, + "step": 5260 + }, + { + "epoch": 2.7364864864864864, + "grad_norm": 0.03354429826140404, + "learning_rate": 4.55487972069842e-05, + "loss": 0.0081, + "num_input_tokens_seen": 1003600, + "step": 5265 + }, + { + "epoch": 2.739085239085239, + "grad_norm": 1.756387710571289, + "learning_rate": 4.553587211723228e-05, + "loss": 0.1584, + "num_input_tokens_seen": 1004496, + "step": 5270 + }, + { + "epoch": 2.741683991683992, + "grad_norm": 0.05383099243044853, + "learning_rate": 4.5522930128608176e-05, + "loss": 0.1142, + "num_input_tokens_seen": 1005488, + "step": 5275 + }, + { + "epoch": 2.7442827442827444, + "grad_norm": 0.07099480926990509, + "learning_rate": 4.550997125176179e-05, + "loss": 0.0063, + "num_input_tokens_seen": 1006448, + "step": 5280 + }, + { + "epoch": 2.746881496881497, + "grad_norm": 14.637290000915527, + "learning_rate": 4.549699549735692e-05, + "loss": 0.1551, + "num_input_tokens_seen": 1007408, + "step": 5285 + }, + { + "epoch": 2.7494802494802495, + "grad_norm": 2.640327215194702, + "learning_rate": 4.548400287607124e-05, + "loss": 0.1642, + "num_input_tokens_seen": 1008368, + "step": 5290 + }, + { + "epoch": 2.752079002079002, + "grad_norm": 0.12255776673555374, + "learning_rate": 4.547099339859634e-05, + "loss": 0.169, + "num_input_tokens_seen": 1009264, + "step": 5295 + }, + { + "epoch": 2.7546777546777546, + "grad_norm": 0.10800418257713318, + "learning_rate": 4.5457967075637644e-05, + "loss": 0.145, + "num_input_tokens_seen": 1010256, + "step": 5300 + }, + { + "epoch": 2.757276507276507, + "grad_norm": 0.1539176106452942, + "learning_rate": 4.544492391791445e-05, + "loss": 0.1235, + "num_input_tokens_seen": 1011280, + "step": 5305 + }, + { + "epoch": 2.75987525987526, + "grad_norm": 0.18296802043914795, + "learning_rate": 4.543186393615991e-05, + "loss": 0.055, + "num_input_tokens_seen": 1012208, + "step": 5310 + }, + { + "epoch": 2.7624740124740126, + "grad_norm": 0.3285243511199951, + "learning_rate": 4.5418787141121026e-05, + "loss": 0.1171, + "num_input_tokens_seen": 1013168, + "step": 5315 + }, + { + "epoch": 2.765072765072765, + "grad_norm": 0.11829633265733719, + "learning_rate": 4.540569354355864e-05, + "loss": 0.0927, + "num_input_tokens_seen": 1014128, + "step": 5320 + }, + { + "epoch": 2.7676715176715176, + "grad_norm": 0.6519357562065125, + "learning_rate": 4.539258315424739e-05, + "loss": 0.1317, + "num_input_tokens_seen": 1015120, + "step": 5325 + }, + { + "epoch": 2.77027027027027, + "grad_norm": 0.15952761471271515, + "learning_rate": 4.537945598397577e-05, + "loss": 0.1356, + "num_input_tokens_seen": 1016048, + "step": 5330 + }, + { + "epoch": 2.7728690228690227, + "grad_norm": 0.04696407914161682, + "learning_rate": 4.536631204354606e-05, + "loss": 0.1872, + "num_input_tokens_seen": 1016912, + "step": 5335 + }, + { + "epoch": 2.7754677754677752, + "grad_norm": 0.04638927802443504, + "learning_rate": 4.535315134377435e-05, + "loss": 0.0445, + "num_input_tokens_seen": 1017840, + "step": 5340 + }, + { + "epoch": 2.778066528066528, + "grad_norm": 1.604427456855774, + "learning_rate": 4.533997389549052e-05, + "loss": 0.0331, + "num_input_tokens_seen": 1018768, + "step": 5345 + }, + { + "epoch": 2.7806652806652807, + "grad_norm": 2.93060302734375, + "learning_rate": 4.532677970953821e-05, + "loss": 0.0074, + "num_input_tokens_seen": 1019728, + "step": 5350 + }, + { + "epoch": 2.7832640332640333, + "grad_norm": 7.14898681640625, + "learning_rate": 4.531356879677488e-05, + "loss": 0.195, + "num_input_tokens_seen": 1020624, + "step": 5355 + }, + { + "epoch": 2.785862785862786, + "grad_norm": 0.14557404816150665, + "learning_rate": 4.530034116807172e-05, + "loss": 0.0197, + "num_input_tokens_seen": 1021584, + "step": 5360 + }, + { + "epoch": 2.7884615384615383, + "grad_norm": 1.099914789199829, + "learning_rate": 4.528709683431368e-05, + "loss": 0.0045, + "num_input_tokens_seen": 1022480, + "step": 5365 + }, + { + "epoch": 2.7910602910602913, + "grad_norm": 0.02181261219084263, + "learning_rate": 4.527383580639946e-05, + "loss": 0.0397, + "num_input_tokens_seen": 1023440, + "step": 5370 + }, + { + "epoch": 2.7936590436590434, + "grad_norm": 0.04683063551783562, + "learning_rate": 4.526055809524149e-05, + "loss": 0.003, + "num_input_tokens_seen": 1024336, + "step": 5375 + }, + { + "epoch": 2.7962577962577964, + "grad_norm": 0.023304246366024017, + "learning_rate": 4.524726371176594e-05, + "loss": 0.1033, + "num_input_tokens_seen": 1025328, + "step": 5380 + }, + { + "epoch": 2.798856548856549, + "grad_norm": 0.5370621085166931, + "learning_rate": 4.52339526669127e-05, + "loss": 0.3033, + "num_input_tokens_seen": 1026256, + "step": 5385 + }, + { + "epoch": 2.8014553014553014, + "grad_norm": 0.14150284230709076, + "learning_rate": 4.522062497163538e-05, + "loss": 0.0685, + "num_input_tokens_seen": 1027248, + "step": 5390 + }, + { + "epoch": 2.804054054054054, + "grad_norm": 0.02119358815252781, + "learning_rate": 4.5207280636901246e-05, + "loss": 0.1115, + "num_input_tokens_seen": 1028176, + "step": 5395 + }, + { + "epoch": 2.8066528066528065, + "grad_norm": 0.013076232746243477, + "learning_rate": 4.519391967369131e-05, + "loss": 0.079, + "num_input_tokens_seen": 1029040, + "step": 5400 + }, + { + "epoch": 2.8092515592515594, + "grad_norm": 0.0112623181194067, + "learning_rate": 4.5180542093000234e-05, + "loss": 0.003, + "num_input_tokens_seen": 1029968, + "step": 5405 + }, + { + "epoch": 2.811850311850312, + "grad_norm": 0.1615327149629593, + "learning_rate": 4.516714790583637e-05, + "loss": 0.0234, + "num_input_tokens_seen": 1030864, + "step": 5410 + }, + { + "epoch": 2.8144490644490645, + "grad_norm": 26.000368118286133, + "learning_rate": 4.515373712322174e-05, + "loss": 0.1567, + "num_input_tokens_seen": 1031792, + "step": 5415 + }, + { + "epoch": 2.817047817047817, + "grad_norm": 0.017177842557430267, + "learning_rate": 4.5140309756192e-05, + "loss": 0.006, + "num_input_tokens_seen": 1032720, + "step": 5420 + }, + { + "epoch": 2.8196465696465696, + "grad_norm": 0.2918132543563843, + "learning_rate": 4.5126865815796474e-05, + "loss": 0.0945, + "num_input_tokens_seen": 1033680, + "step": 5425 + }, + { + "epoch": 2.822245322245322, + "grad_norm": 15.16713809967041, + "learning_rate": 4.511340531309812e-05, + "loss": 0.19, + "num_input_tokens_seen": 1034640, + "step": 5430 + }, + { + "epoch": 2.8248440748440746, + "grad_norm": 3.5497889518737793, + "learning_rate": 4.5099928259173516e-05, + "loss": 0.2249, + "num_input_tokens_seen": 1035632, + "step": 5435 + }, + { + "epoch": 2.8274428274428276, + "grad_norm": 0.9146074056625366, + "learning_rate": 4.5086434665112864e-05, + "loss": 0.1233, + "num_input_tokens_seen": 1036656, + "step": 5440 + }, + { + "epoch": 2.83004158004158, + "grad_norm": 0.4479049742221832, + "learning_rate": 4.507292454201999e-05, + "loss": 0.0516, + "num_input_tokens_seen": 1037616, + "step": 5445 + }, + { + "epoch": 2.8326403326403327, + "grad_norm": 0.44480976462364197, + "learning_rate": 4.50593979010123e-05, + "loss": 0.0333, + "num_input_tokens_seen": 1038544, + "step": 5450 + }, + { + "epoch": 2.835239085239085, + "grad_norm": 3.8350539207458496, + "learning_rate": 4.5045854753220805e-05, + "loss": 0.2428, + "num_input_tokens_seen": 1039504, + "step": 5455 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.041940588504076004, + "learning_rate": 4.50322951097901e-05, + "loss": 0.0049, + "num_input_tokens_seen": 1040432, + "step": 5460 + }, + { + "epoch": 2.8404365904365907, + "grad_norm": 0.5789424777030945, + "learning_rate": 4.501871898187836e-05, + "loss": 0.2557, + "num_input_tokens_seen": 1041456, + "step": 5465 + }, + { + "epoch": 2.8430353430353428, + "grad_norm": 0.2268538922071457, + "learning_rate": 4.5005126380657296e-05, + "loss": 0.0148, + "num_input_tokens_seen": 1042352, + "step": 5470 + }, + { + "epoch": 2.8456340956340958, + "grad_norm": 11.825983047485352, + "learning_rate": 4.499151731731221e-05, + "loss": 0.1328, + "num_input_tokens_seen": 1043280, + "step": 5475 + }, + { + "epoch": 2.8482328482328483, + "grad_norm": 1.5202503204345703, + "learning_rate": 4.497789180304193e-05, + "loss": 0.1142, + "num_input_tokens_seen": 1044176, + "step": 5480 + }, + { + "epoch": 2.850831600831601, + "grad_norm": 1.4088383913040161, + "learning_rate": 4.496424984905883e-05, + "loss": 0.0081, + "num_input_tokens_seen": 1045104, + "step": 5485 + }, + { + "epoch": 2.8534303534303533, + "grad_norm": 1.4839156866073608, + "learning_rate": 4.495059146658881e-05, + "loss": 0.0561, + "num_input_tokens_seen": 1046096, + "step": 5490 + }, + { + "epoch": 2.856029106029106, + "grad_norm": 0.11757862567901611, + "learning_rate": 4.493691666687129e-05, + "loss": 0.1033, + "num_input_tokens_seen": 1046928, + "step": 5495 + }, + { + "epoch": 2.858627858627859, + "grad_norm": 0.10826723277568817, + "learning_rate": 4.49232254611592e-05, + "loss": 0.1038, + "num_input_tokens_seen": 1047888, + "step": 5500 + }, + { + "epoch": 2.8612266112266114, + "grad_norm": 0.12171634286642075, + "learning_rate": 4.4909517860718954e-05, + "loss": 0.0876, + "num_input_tokens_seen": 1048912, + "step": 5505 + }, + { + "epoch": 2.863825363825364, + "grad_norm": 0.05413998290896416, + "learning_rate": 4.489579387683048e-05, + "loss": 0.0063, + "num_input_tokens_seen": 1049840, + "step": 5510 + }, + { + "epoch": 2.8664241164241164, + "grad_norm": 0.056085921823978424, + "learning_rate": 4.4882053520787196e-05, + "loss": 0.1049, + "num_input_tokens_seen": 1050768, + "step": 5515 + }, + { + "epoch": 2.869022869022869, + "grad_norm": 0.032305553555488586, + "learning_rate": 4.4868296803895946e-05, + "loss": 0.1308, + "num_input_tokens_seen": 1051728, + "step": 5520 + }, + { + "epoch": 2.8716216216216215, + "grad_norm": 0.07528036832809448, + "learning_rate": 4.4854523737477094e-05, + "loss": 0.0045, + "num_input_tokens_seen": 1052656, + "step": 5525 + }, + { + "epoch": 2.874220374220374, + "grad_norm": 21.12236213684082, + "learning_rate": 4.484073433286442e-05, + "loss": 0.139, + "num_input_tokens_seen": 1053584, + "step": 5530 + }, + { + "epoch": 2.876819126819127, + "grad_norm": 0.02982599288225174, + "learning_rate": 4.4826928601405155e-05, + "loss": 0.1098, + "num_input_tokens_seen": 1054576, + "step": 5535 + }, + { + "epoch": 2.8794178794178795, + "grad_norm": 0.06356900185346603, + "learning_rate": 4.481310655445998e-05, + "loss": 0.0085, + "num_input_tokens_seen": 1055504, + "step": 5540 + }, + { + "epoch": 2.882016632016632, + "grad_norm": 0.38921329379081726, + "learning_rate": 4.479926820340298e-05, + "loss": 0.0136, + "num_input_tokens_seen": 1056368, + "step": 5545 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 0.03686419874429703, + "learning_rate": 4.478541355962168e-05, + "loss": 0.0968, + "num_input_tokens_seen": 1057360, + "step": 5550 + }, + { + "epoch": 2.887214137214137, + "grad_norm": 0.7774587869644165, + "learning_rate": 4.4771542634517e-05, + "loss": 0.0972, + "num_input_tokens_seen": 1058384, + "step": 5555 + }, + { + "epoch": 2.88981288981289, + "grad_norm": 0.12600499391555786, + "learning_rate": 4.4757655439503255e-05, + "loss": 0.1553, + "num_input_tokens_seen": 1059344, + "step": 5560 + }, + { + "epoch": 2.892411642411642, + "grad_norm": 0.3106594383716583, + "learning_rate": 4.474375198600815e-05, + "loss": 0.0748, + "num_input_tokens_seen": 1060272, + "step": 5565 + }, + { + "epoch": 2.895010395010395, + "grad_norm": 0.29123929142951965, + "learning_rate": 4.472983228547278e-05, + "loss": 0.0688, + "num_input_tokens_seen": 1061136, + "step": 5570 + }, + { + "epoch": 2.8976091476091477, + "grad_norm": 0.0929776281118393, + "learning_rate": 4.4715896349351596e-05, + "loss": 0.0481, + "num_input_tokens_seen": 1062096, + "step": 5575 + }, + { + "epoch": 2.9002079002079, + "grad_norm": 4.440683364868164, + "learning_rate": 4.4701944189112404e-05, + "loss": 0.0261, + "num_input_tokens_seen": 1063056, + "step": 5580 + }, + { + "epoch": 2.9028066528066527, + "grad_norm": 0.017539847642183304, + "learning_rate": 4.468797581623638e-05, + "loss": 0.0132, + "num_input_tokens_seen": 1063984, + "step": 5585 + }, + { + "epoch": 2.9054054054054053, + "grad_norm": 0.18622253835201263, + "learning_rate": 4.4673991242218045e-05, + "loss": 0.1104, + "num_input_tokens_seen": 1065040, + "step": 5590 + }, + { + "epoch": 2.9080041580041582, + "grad_norm": 0.03168921545147896, + "learning_rate": 4.4659990478565215e-05, + "loss": 0.0088, + "num_input_tokens_seen": 1065968, + "step": 5595 + }, + { + "epoch": 2.9106029106029108, + "grad_norm": 0.07991797477006912, + "learning_rate": 4.4645973536799065e-05, + "loss": 0.0586, + "num_input_tokens_seen": 1066992, + "step": 5600 + }, + { + "epoch": 2.9132016632016633, + "grad_norm": 0.026511503383517265, + "learning_rate": 4.463194042845408e-05, + "loss": 0.0773, + "num_input_tokens_seen": 1067888, + "step": 5605 + }, + { + "epoch": 2.915800415800416, + "grad_norm": 0.21364910900592804, + "learning_rate": 4.4617891165078014e-05, + "loss": 0.0079, + "num_input_tokens_seen": 1068752, + "step": 5610 + }, + { + "epoch": 2.9183991683991684, + "grad_norm": 13.794631004333496, + "learning_rate": 4.4603825758231954e-05, + "loss": 0.0517, + "num_input_tokens_seen": 1069808, + "step": 5615 + }, + { + "epoch": 2.920997920997921, + "grad_norm": 0.02199443243443966, + "learning_rate": 4.4589744219490256e-05, + "loss": 0.0042, + "num_input_tokens_seen": 1070768, + "step": 5620 + }, + { + "epoch": 2.9235966735966734, + "grad_norm": 0.06799838691949844, + "learning_rate": 4.457564656044056e-05, + "loss": 0.0321, + "num_input_tokens_seen": 1071792, + "step": 5625 + }, + { + "epoch": 2.9261954261954264, + "grad_norm": 38.91627502441406, + "learning_rate": 4.456153279268375e-05, + "loss": 0.2547, + "num_input_tokens_seen": 1072784, + "step": 5630 + }, + { + "epoch": 2.928794178794179, + "grad_norm": 58.31025314331055, + "learning_rate": 4.454740292783397e-05, + "loss": 0.0606, + "num_input_tokens_seen": 1073712, + "step": 5635 + }, + { + "epoch": 2.9313929313929314, + "grad_norm": 2.662489414215088, + "learning_rate": 4.4533256977518646e-05, + "loss": 0.4022, + "num_input_tokens_seen": 1074672, + "step": 5640 + }, + { + "epoch": 2.933991683991684, + "grad_norm": 0.13489629328250885, + "learning_rate": 4.45190949533784e-05, + "loss": 0.1289, + "num_input_tokens_seen": 1075600, + "step": 5645 + }, + { + "epoch": 2.9365904365904365, + "grad_norm": 0.03450971096754074, + "learning_rate": 4.450491686706709e-05, + "loss": 0.0629, + "num_input_tokens_seen": 1076528, + "step": 5650 + }, + { + "epoch": 2.939189189189189, + "grad_norm": 15.840789794921875, + "learning_rate": 4.44907227302518e-05, + "loss": 0.0854, + "num_input_tokens_seen": 1077488, + "step": 5655 + }, + { + "epoch": 2.9417879417879416, + "grad_norm": 0.12232880294322968, + "learning_rate": 4.4476512554612826e-05, + "loss": 0.0046, + "num_input_tokens_seen": 1078448, + "step": 5660 + }, + { + "epoch": 2.9443866943866945, + "grad_norm": 4.824110984802246, + "learning_rate": 4.446228635184364e-05, + "loss": 0.0744, + "num_input_tokens_seen": 1079344, + "step": 5665 + }, + { + "epoch": 2.946985446985447, + "grad_norm": 0.13736209273338318, + "learning_rate": 4.444804413365093e-05, + "loss": 0.0827, + "num_input_tokens_seen": 1080368, + "step": 5670 + }, + { + "epoch": 2.9495841995841996, + "grad_norm": 0.07516790926456451, + "learning_rate": 4.4433785911754545e-05, + "loss": 0.1445, + "num_input_tokens_seen": 1081296, + "step": 5675 + }, + { + "epoch": 2.952182952182952, + "grad_norm": 0.059654414653778076, + "learning_rate": 4.44195116978875e-05, + "loss": 0.1493, + "num_input_tokens_seen": 1082224, + "step": 5680 + }, + { + "epoch": 2.9547817047817047, + "grad_norm": 0.4571727216243744, + "learning_rate": 4.440522150379599e-05, + "loss": 0.0203, + "num_input_tokens_seen": 1083184, + "step": 5685 + }, + { + "epoch": 2.9573804573804576, + "grad_norm": 0.10235518217086792, + "learning_rate": 4.439091534123935e-05, + "loss": 0.0083, + "num_input_tokens_seen": 1084144, + "step": 5690 + }, + { + "epoch": 2.9599792099792097, + "grad_norm": 0.3244641125202179, + "learning_rate": 4.437659322199004e-05, + "loss": 0.108, + "num_input_tokens_seen": 1085136, + "step": 5695 + }, + { + "epoch": 2.9625779625779627, + "grad_norm": 2.8198485374450684, + "learning_rate": 4.436225515783368e-05, + "loss": 0.0759, + "num_input_tokens_seen": 1086128, + "step": 5700 + }, + { + "epoch": 2.965176715176715, + "grad_norm": 0.09323878586292267, + "learning_rate": 4.4347901160568985e-05, + "loss": 0.0068, + "num_input_tokens_seen": 1087088, + "step": 5705 + }, + { + "epoch": 2.9677754677754677, + "grad_norm": 2.0860092639923096, + "learning_rate": 4.43335312420078e-05, + "loss": 0.2225, + "num_input_tokens_seen": 1088016, + "step": 5710 + }, + { + "epoch": 2.9703742203742203, + "grad_norm": 5.474557399749756, + "learning_rate": 4.4319145413975044e-05, + "loss": 0.2234, + "num_input_tokens_seen": 1088976, + "step": 5715 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 1.1254582405090332, + "learning_rate": 4.430474368830877e-05, + "loss": 0.0154, + "num_input_tokens_seen": 1089968, + "step": 5720 + }, + { + "epoch": 2.975571725571726, + "grad_norm": 2.296518087387085, + "learning_rate": 4.429032607686008e-05, + "loss": 0.0785, + "num_input_tokens_seen": 1090928, + "step": 5725 + }, + { + "epoch": 2.9781704781704783, + "grad_norm": 0.12973199784755707, + "learning_rate": 4.427589259149315e-05, + "loss": 0.0354, + "num_input_tokens_seen": 1091920, + "step": 5730 + }, + { + "epoch": 2.980769230769231, + "grad_norm": 2.7333524227142334, + "learning_rate": 4.426144324408524e-05, + "loss": 0.1037, + "num_input_tokens_seen": 1092816, + "step": 5735 + }, + { + "epoch": 2.9833679833679834, + "grad_norm": 0.060335684567689896, + "learning_rate": 4.424697804652666e-05, + "loss": 0.0063, + "num_input_tokens_seen": 1093808, + "step": 5740 + }, + { + "epoch": 2.985966735966736, + "grad_norm": 15.691051483154297, + "learning_rate": 4.423249701072072e-05, + "loss": 0.0984, + "num_input_tokens_seen": 1094672, + "step": 5745 + }, + { + "epoch": 2.9885654885654884, + "grad_norm": 0.04818872734904289, + "learning_rate": 4.421800014858382e-05, + "loss": 0.1652, + "num_input_tokens_seen": 1095664, + "step": 5750 + }, + { + "epoch": 2.991164241164241, + "grad_norm": 2.885742425918579, + "learning_rate": 4.420348747204536e-05, + "loss": 0.0872, + "num_input_tokens_seen": 1096624, + "step": 5755 + }, + { + "epoch": 2.993762993762994, + "grad_norm": 0.11593808978796005, + "learning_rate": 4.418895899304774e-05, + "loss": 0.0953, + "num_input_tokens_seen": 1097584, + "step": 5760 + }, + { + "epoch": 2.9963617463617465, + "grad_norm": 0.10210102796554565, + "learning_rate": 4.417441472354638e-05, + "loss": 0.0391, + "num_input_tokens_seen": 1098544, + "step": 5765 + }, + { + "epoch": 2.998960498960499, + "grad_norm": 0.07402777671813965, + "learning_rate": 4.41598546755097e-05, + "loss": 0.0942, + "num_input_tokens_seen": 1099504, + "step": 5770 + }, + { + "epoch": 3.0, + "eval_loss": 0.18155623972415924, + "eval_runtime": 9.268, + "eval_samples_per_second": 92.361, + "eval_steps_per_second": 23.09, + "num_input_tokens_seen": 1099816, + "step": 5772 + }, + { + "epoch": 3.0015592515592515, + "grad_norm": 0.028248732909560204, + "learning_rate": 4.414527886091909e-05, + "loss": 0.0064, + "num_input_tokens_seen": 1100392, + "step": 5775 + }, + { + "epoch": 3.004158004158004, + "grad_norm": 38.20802307128906, + "learning_rate": 4.413068729176891e-05, + "loss": 0.2042, + "num_input_tokens_seen": 1101416, + "step": 5780 + }, + { + "epoch": 3.0067567567567566, + "grad_norm": 0.6232936978340149, + "learning_rate": 4.4116079980066504e-05, + "loss": 0.0052, + "num_input_tokens_seen": 1102440, + "step": 5785 + }, + { + "epoch": 3.0093555093555096, + "grad_norm": 0.025082536041736603, + "learning_rate": 4.4101456937832166e-05, + "loss": 0.0907, + "num_input_tokens_seen": 1103336, + "step": 5790 + }, + { + "epoch": 3.011954261954262, + "grad_norm": 0.05958852916955948, + "learning_rate": 4.408681817709911e-05, + "loss": 0.0359, + "num_input_tokens_seen": 1104232, + "step": 5795 + }, + { + "epoch": 3.0145530145530146, + "grad_norm": 7.805817604064941, + "learning_rate": 4.407216370991351e-05, + "loss": 0.0705, + "num_input_tokens_seen": 1105096, + "step": 5800 + }, + { + "epoch": 3.017151767151767, + "grad_norm": 0.10228478908538818, + "learning_rate": 4.405749354833447e-05, + "loss": 0.1456, + "num_input_tokens_seen": 1106088, + "step": 5805 + }, + { + "epoch": 3.0197505197505197, + "grad_norm": 3.3674161434173584, + "learning_rate": 4.404280770443398e-05, + "loss": 0.0104, + "num_input_tokens_seen": 1107048, + "step": 5810 + }, + { + "epoch": 3.022349272349272, + "grad_norm": 0.08401818573474884, + "learning_rate": 4.402810619029696e-05, + "loss": 0.0203, + "num_input_tokens_seen": 1108040, + "step": 5815 + }, + { + "epoch": 3.024948024948025, + "grad_norm": 0.016964886337518692, + "learning_rate": 4.401338901802122e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1108936, + "step": 5820 + }, + { + "epoch": 3.0275467775467777, + "grad_norm": 0.025650253519415855, + "learning_rate": 4.3998656199717435e-05, + "loss": 0.1319, + "num_input_tokens_seen": 1109864, + "step": 5825 + }, + { + "epoch": 3.0301455301455302, + "grad_norm": 9.301253318786621, + "learning_rate": 4.3983907747509195e-05, + "loss": 0.2127, + "num_input_tokens_seen": 1110888, + "step": 5830 + }, + { + "epoch": 3.0327442827442828, + "grad_norm": 0.04264579713344574, + "learning_rate": 4.396914367353292e-05, + "loss": 0.0455, + "num_input_tokens_seen": 1111848, + "step": 5835 + }, + { + "epoch": 3.0353430353430353, + "grad_norm": 0.0413065068423748, + "learning_rate": 4.3954363989937894e-05, + "loss": 0.0055, + "num_input_tokens_seen": 1112808, + "step": 5840 + }, + { + "epoch": 3.037941787941788, + "grad_norm": 0.02766398899257183, + "learning_rate": 4.3939568708886245e-05, + "loss": 0.0135, + "num_input_tokens_seen": 1113736, + "step": 5845 + }, + { + "epoch": 3.0405405405405403, + "grad_norm": 38.42988586425781, + "learning_rate": 4.3924757842552955e-05, + "loss": 0.0658, + "num_input_tokens_seen": 1114696, + "step": 5850 + }, + { + "epoch": 3.0431392931392933, + "grad_norm": 0.02357351966202259, + "learning_rate": 4.3909931403125805e-05, + "loss": 0.0026, + "num_input_tokens_seen": 1115624, + "step": 5855 + }, + { + "epoch": 3.045738045738046, + "grad_norm": 0.021786309778690338, + "learning_rate": 4.38950894028054e-05, + "loss": 0.0041, + "num_input_tokens_seen": 1116616, + "step": 5860 + }, + { + "epoch": 3.0483367983367984, + "grad_norm": 0.022778227925300598, + "learning_rate": 4.388023185380516e-05, + "loss": 0.0037, + "num_input_tokens_seen": 1117544, + "step": 5865 + }, + { + "epoch": 3.050935550935551, + "grad_norm": 0.009608174674212933, + "learning_rate": 4.386535876835129e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1118440, + "step": 5870 + }, + { + "epoch": 3.0535343035343034, + "grad_norm": 44.40339660644531, + "learning_rate": 4.3850470158682774e-05, + "loss": 0.037, + "num_input_tokens_seen": 1119496, + "step": 5875 + }, + { + "epoch": 3.056133056133056, + "grad_norm": 0.0074235121719539165, + "learning_rate": 4.383556603705139e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1120392, + "step": 5880 + }, + { + "epoch": 3.058731808731809, + "grad_norm": 0.10217924416065216, + "learning_rate": 4.382064641572167e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1121352, + "step": 5885 + }, + { + "epoch": 3.0613305613305615, + "grad_norm": 0.02086864970624447, + "learning_rate": 4.380571130697088e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1122248, + "step": 5890 + }, + { + "epoch": 3.063929313929314, + "grad_norm": 0.004059748258441687, + "learning_rate": 4.3790760723089074e-05, + "loss": 0.1982, + "num_input_tokens_seen": 1123208, + "step": 5895 + }, + { + "epoch": 3.0665280665280665, + "grad_norm": 0.08068650960922241, + "learning_rate": 4.3775794676379e-05, + "loss": 0.0177, + "num_input_tokens_seen": 1124136, + "step": 5900 + }, + { + "epoch": 3.069126819126819, + "grad_norm": 0.015976455062627792, + "learning_rate": 4.376081317915616e-05, + "loss": 0.006, + "num_input_tokens_seen": 1125160, + "step": 5905 + }, + { + "epoch": 3.0717255717255716, + "grad_norm": 0.006786749232560396, + "learning_rate": 4.3745816243748755e-05, + "loss": 0.0049, + "num_input_tokens_seen": 1126152, + "step": 5910 + }, + { + "epoch": 3.074324324324324, + "grad_norm": 0.02163018472492695, + "learning_rate": 4.373080388249768e-05, + "loss": 0.1373, + "num_input_tokens_seen": 1127080, + "step": 5915 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.024275727570056915, + "learning_rate": 4.371577610775656e-05, + "loss": 0.1038, + "num_input_tokens_seen": 1128040, + "step": 5920 + }, + { + "epoch": 3.0795218295218296, + "grad_norm": 0.041569050401449203, + "learning_rate": 4.370073293189165e-05, + "loss": 0.1222, + "num_input_tokens_seen": 1128968, + "step": 5925 + }, + { + "epoch": 3.082120582120582, + "grad_norm": 0.11421497166156769, + "learning_rate": 4.3685674367281925e-05, + "loss": 0.0468, + "num_input_tokens_seen": 1129864, + "step": 5930 + }, + { + "epoch": 3.0847193347193347, + "grad_norm": 0.183809295296669, + "learning_rate": 4.367060042631901e-05, + "loss": 0.0047, + "num_input_tokens_seen": 1130792, + "step": 5935 + }, + { + "epoch": 3.087318087318087, + "grad_norm": 0.010314762592315674, + "learning_rate": 4.3655511121407176e-05, + "loss": 0.0025, + "num_input_tokens_seen": 1131784, + "step": 5940 + }, + { + "epoch": 3.0899168399168397, + "grad_norm": 0.062128596007823944, + "learning_rate": 4.3640406464963333e-05, + "loss": 0.001, + "num_input_tokens_seen": 1132744, + "step": 5945 + }, + { + "epoch": 3.0925155925155927, + "grad_norm": 0.3348368704319, + "learning_rate": 4.3625286469417046e-05, + "loss": 0.2308, + "num_input_tokens_seen": 1133736, + "step": 5950 + }, + { + "epoch": 3.0951143451143452, + "grad_norm": 0.04741016775369644, + "learning_rate": 4.3610151147210475e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1134696, + "step": 5955 + }, + { + "epoch": 3.0977130977130978, + "grad_norm": 0.0158183965831995, + "learning_rate": 4.359500051079841e-05, + "loss": 0.1536, + "num_input_tokens_seen": 1135592, + "step": 5960 + }, + { + "epoch": 3.1003118503118503, + "grad_norm": 0.033713001757860184, + "learning_rate": 4.357983457264825e-05, + "loss": 0.0023, + "num_input_tokens_seen": 1136520, + "step": 5965 + }, + { + "epoch": 3.102910602910603, + "grad_norm": 0.04160444065928459, + "learning_rate": 4.356465334523995e-05, + "loss": 0.0788, + "num_input_tokens_seen": 1137544, + "step": 5970 + }, + { + "epoch": 3.1055093555093554, + "grad_norm": 0.03951268643140793, + "learning_rate": 4.354945684106608e-05, + "loss": 0.0027, + "num_input_tokens_seen": 1138536, + "step": 5975 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.06461615115404129, + "learning_rate": 4.3534245072631785e-05, + "loss": 0.0521, + "num_input_tokens_seen": 1139496, + "step": 5980 + }, + { + "epoch": 3.110706860706861, + "grad_norm": 0.03379769250750542, + "learning_rate": 4.351901805245474e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1140616, + "step": 5985 + }, + { + "epoch": 3.1133056133056134, + "grad_norm": 4.128078460693359, + "learning_rate": 4.350377579306519e-05, + "loss": 0.1725, + "num_input_tokens_seen": 1141608, + "step": 5990 + }, + { + "epoch": 3.115904365904366, + "grad_norm": 0.06277868896722794, + "learning_rate": 4.348851830700593e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1142600, + "step": 5995 + }, + { + "epoch": 3.1185031185031185, + "grad_norm": 0.017176948487758636, + "learning_rate": 4.347324560683227e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1143560, + "step": 6000 + }, + { + "epoch": 3.121101871101871, + "grad_norm": 0.05304423347115517, + "learning_rate": 4.3457957705112034e-05, + "loss": 0.0419, + "num_input_tokens_seen": 1144552, + "step": 6005 + }, + { + "epoch": 3.1237006237006235, + "grad_norm": 0.07615764439105988, + "learning_rate": 4.344265461442557e-05, + "loss": 0.0678, + "num_input_tokens_seen": 1145576, + "step": 6010 + }, + { + "epoch": 3.1262993762993765, + "grad_norm": 3.954894542694092, + "learning_rate": 4.342733634736571e-05, + "loss": 0.143, + "num_input_tokens_seen": 1146504, + "step": 6015 + }, + { + "epoch": 3.128898128898129, + "grad_norm": 0.02913070097565651, + "learning_rate": 4.341200291653781e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1147560, + "step": 6020 + }, + { + "epoch": 3.1314968814968815, + "grad_norm": 0.07128477841615677, + "learning_rate": 4.339665433455965e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1148488, + "step": 6025 + }, + { + "epoch": 3.134095634095634, + "grad_norm": 0.06375741213560104, + "learning_rate": 4.338129061406151e-05, + "loss": 0.0024, + "num_input_tokens_seen": 1149384, + "step": 6030 + }, + { + "epoch": 3.1366943866943866, + "grad_norm": 0.01509932428598404, + "learning_rate": 4.336591176768613e-05, + "loss": 0.0443, + "num_input_tokens_seen": 1150344, + "step": 6035 + }, + { + "epoch": 3.139293139293139, + "grad_norm": 0.01908009685575962, + "learning_rate": 4.3350517808088694e-05, + "loss": 0.2296, + "num_input_tokens_seen": 1151304, + "step": 6040 + }, + { + "epoch": 3.141891891891892, + "grad_norm": 0.019162194803357124, + "learning_rate": 4.333510874793681e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1152264, + "step": 6045 + }, + { + "epoch": 3.1444906444906446, + "grad_norm": 0.3522590696811676, + "learning_rate": 4.331968459991052e-05, + "loss": 0.0563, + "num_input_tokens_seen": 1153288, + "step": 6050 + }, + { + "epoch": 3.147089397089397, + "grad_norm": 0.047274719923734665, + "learning_rate": 4.330424537670229e-05, + "loss": 0.002, + "num_input_tokens_seen": 1154248, + "step": 6055 + }, + { + "epoch": 3.1496881496881497, + "grad_norm": 0.020549843087792397, + "learning_rate": 4.3288791091016983e-05, + "loss": 0.1716, + "num_input_tokens_seen": 1155240, + "step": 6060 + }, + { + "epoch": 3.1522869022869022, + "grad_norm": 0.08268964290618896, + "learning_rate": 4.327332175557185e-05, + "loss": 0.007, + "num_input_tokens_seen": 1156104, + "step": 6065 + }, + { + "epoch": 3.1548856548856548, + "grad_norm": 0.07232010364532471, + "learning_rate": 4.325783738309654e-05, + "loss": 0.0052, + "num_input_tokens_seen": 1157032, + "step": 6070 + }, + { + "epoch": 3.1574844074844073, + "grad_norm": 0.205613911151886, + "learning_rate": 4.324233798633308e-05, + "loss": 0.0048, + "num_input_tokens_seen": 1157960, + "step": 6075 + }, + { + "epoch": 3.1600831600831603, + "grad_norm": 0.04369082674384117, + "learning_rate": 4.322682357803582e-05, + "loss": 0.054, + "num_input_tokens_seen": 1158856, + "step": 6080 + }, + { + "epoch": 3.162681912681913, + "grad_norm": 0.01558584813028574, + "learning_rate": 4.321129417097153e-05, + "loss": 0.0032, + "num_input_tokens_seen": 1159752, + "step": 6085 + }, + { + "epoch": 3.1652806652806653, + "grad_norm": 0.02887248620390892, + "learning_rate": 4.319574977791926e-05, + "loss": 0.0033, + "num_input_tokens_seen": 1160680, + "step": 6090 + }, + { + "epoch": 3.167879417879418, + "grad_norm": 0.06808643043041229, + "learning_rate": 4.318019041167042e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1161672, + "step": 6095 + }, + { + "epoch": 3.1704781704781704, + "grad_norm": 1.657986044883728, + "learning_rate": 4.316461608502875e-05, + "loss": 0.0046, + "num_input_tokens_seen": 1162600, + "step": 6100 + }, + { + "epoch": 3.173076923076923, + "grad_norm": 0.011458008550107479, + "learning_rate": 4.314902681081029e-05, + "loss": 0.0965, + "num_input_tokens_seen": 1163592, + "step": 6105 + }, + { + "epoch": 3.175675675675676, + "grad_norm": 0.012209475040435791, + "learning_rate": 4.313342260184337e-05, + "loss": 0.001, + "num_input_tokens_seen": 1164552, + "step": 6110 + }, + { + "epoch": 3.1782744282744284, + "grad_norm": 0.05143093690276146, + "learning_rate": 4.311780347096863e-05, + "loss": 0.1783, + "num_input_tokens_seen": 1165512, + "step": 6115 + }, + { + "epoch": 3.180873180873181, + "grad_norm": 0.03600318729877472, + "learning_rate": 4.310216943103898e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1166408, + "step": 6120 + }, + { + "epoch": 3.1834719334719335, + "grad_norm": 1.2011713981628418, + "learning_rate": 4.308652049491957e-05, + "loss": 0.0636, + "num_input_tokens_seen": 1167368, + "step": 6125 + }, + { + "epoch": 3.186070686070686, + "grad_norm": 0.046458907425403595, + "learning_rate": 4.307085667548788e-05, + "loss": 0.1275, + "num_input_tokens_seen": 1168296, + "step": 6130 + }, + { + "epoch": 3.1886694386694385, + "grad_norm": 0.012416806071996689, + "learning_rate": 4.3055177985633556e-05, + "loss": 0.1404, + "num_input_tokens_seen": 1169256, + "step": 6135 + }, + { + "epoch": 3.1912681912681915, + "grad_norm": 0.44087865948677063, + "learning_rate": 4.3039484438258536e-05, + "loss": 0.069, + "num_input_tokens_seen": 1170184, + "step": 6140 + }, + { + "epoch": 3.193866943866944, + "grad_norm": 0.06966958940029144, + "learning_rate": 4.302377604627696e-05, + "loss": 0.0043, + "num_input_tokens_seen": 1171176, + "step": 6145 + }, + { + "epoch": 3.1964656964656966, + "grad_norm": 0.05287214741110802, + "learning_rate": 4.30080528226152e-05, + "loss": 0.1044, + "num_input_tokens_seen": 1172136, + "step": 6150 + }, + { + "epoch": 3.199064449064449, + "grad_norm": 0.1115192100405693, + "learning_rate": 4.299231478021181e-05, + "loss": 0.0057, + "num_input_tokens_seen": 1173032, + "step": 6155 + }, + { + "epoch": 3.2016632016632016, + "grad_norm": 4.264469623565674, + "learning_rate": 4.297656193201755e-05, + "loss": 0.1224, + "num_input_tokens_seen": 1173960, + "step": 6160 + }, + { + "epoch": 3.204261954261954, + "grad_norm": 0.05559679493308067, + "learning_rate": 4.296079429099538e-05, + "loss": 0.1148, + "num_input_tokens_seen": 1174952, + "step": 6165 + }, + { + "epoch": 3.2068607068607067, + "grad_norm": 0.09738025069236755, + "learning_rate": 4.2945011870120395e-05, + "loss": 0.0875, + "num_input_tokens_seen": 1175912, + "step": 6170 + }, + { + "epoch": 3.2094594594594597, + "grad_norm": 0.02656608261168003, + "learning_rate": 4.2929214682379894e-05, + "loss": 0.0047, + "num_input_tokens_seen": 1176840, + "step": 6175 + }, + { + "epoch": 3.212058212058212, + "grad_norm": 0.06092597916722298, + "learning_rate": 4.2913402740773294e-05, + "loss": 0.1123, + "num_input_tokens_seen": 1177736, + "step": 6180 + }, + { + "epoch": 3.2146569646569647, + "grad_norm": 0.05209524556994438, + "learning_rate": 4.2897576058312176e-05, + "loss": 0.2263, + "num_input_tokens_seen": 1178728, + "step": 6185 + }, + { + "epoch": 3.2172557172557172, + "grad_norm": 5.073832035064697, + "learning_rate": 4.2881734648020245e-05, + "loss": 0.1417, + "num_input_tokens_seen": 1179624, + "step": 6190 + }, + { + "epoch": 3.2198544698544698, + "grad_norm": 0.1790757179260254, + "learning_rate": 4.286587852293331e-05, + "loss": 0.0965, + "num_input_tokens_seen": 1180520, + "step": 6195 + }, + { + "epoch": 3.2224532224532223, + "grad_norm": 0.21295973658561707, + "learning_rate": 4.285000769609932e-05, + "loss": 0.0058, + "num_input_tokens_seen": 1181512, + "step": 6200 + }, + { + "epoch": 3.225051975051975, + "grad_norm": 23.377309799194336, + "learning_rate": 4.283412218057829e-05, + "loss": 0.0366, + "num_input_tokens_seen": 1182472, + "step": 6205 + }, + { + "epoch": 3.227650727650728, + "grad_norm": 0.034105293452739716, + "learning_rate": 4.281822198944233e-05, + "loss": 0.0153, + "num_input_tokens_seen": 1183496, + "step": 6210 + }, + { + "epoch": 3.2302494802494803, + "grad_norm": 0.10079897195100784, + "learning_rate": 4.280230713577564e-05, + "loss": 0.0033, + "num_input_tokens_seen": 1184392, + "step": 6215 + }, + { + "epoch": 3.232848232848233, + "grad_norm": 0.05329185724258423, + "learning_rate": 4.278637763267448e-05, + "loss": 0.0338, + "num_input_tokens_seen": 1185320, + "step": 6220 + }, + { + "epoch": 3.2354469854469854, + "grad_norm": 0.011346641927957535, + "learning_rate": 4.277043349324716e-05, + "loss": 0.0554, + "num_input_tokens_seen": 1186312, + "step": 6225 + }, + { + "epoch": 3.238045738045738, + "grad_norm": 0.030093854293227196, + "learning_rate": 4.275447473061405e-05, + "loss": 0.0314, + "num_input_tokens_seen": 1187208, + "step": 6230 + }, + { + "epoch": 3.2406444906444904, + "grad_norm": 0.01664133556187153, + "learning_rate": 4.273850135790752e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1188168, + "step": 6235 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.008931435644626617, + "learning_rate": 4.272251338827199e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1189096, + "step": 6240 + }, + { + "epoch": 3.245841995841996, + "grad_norm": 0.026892652735114098, + "learning_rate": 4.270651083486389e-05, + "loss": 0.0212, + "num_input_tokens_seen": 1190056, + "step": 6245 + }, + { + "epoch": 3.2484407484407485, + "grad_norm": 0.5516585111618042, + "learning_rate": 4.269049371085164e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1191016, + "step": 6250 + }, + { + "epoch": 3.251039501039501, + "grad_norm": 0.010188799351453781, + "learning_rate": 4.2674462029415654e-05, + "loss": 0.0966, + "num_input_tokens_seen": 1192040, + "step": 6255 + }, + { + "epoch": 3.2536382536382535, + "grad_norm": 0.02188034914433956, + "learning_rate": 4.265841580374834e-05, + "loss": 0.0036, + "num_input_tokens_seen": 1192968, + "step": 6260 + }, + { + "epoch": 3.256237006237006, + "grad_norm": 0.0652146190404892, + "learning_rate": 4.264235504705404e-05, + "loss": 0.001, + "num_input_tokens_seen": 1193960, + "step": 6265 + }, + { + "epoch": 3.258835758835759, + "grad_norm": 0.01923360675573349, + "learning_rate": 4.2626279772549096e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1194824, + "step": 6270 + }, + { + "epoch": 3.2614345114345116, + "grad_norm": 0.01794094406068325, + "learning_rate": 4.2610189993461766e-05, + "loss": 0.2681, + "num_input_tokens_seen": 1195784, + "step": 6275 + }, + { + "epoch": 3.264033264033264, + "grad_norm": 0.052087828516960144, + "learning_rate": 4.259408572303225e-05, + "loss": 0.0799, + "num_input_tokens_seen": 1196680, + "step": 6280 + }, + { + "epoch": 3.2666320166320166, + "grad_norm": 0.2897411584854126, + "learning_rate": 4.2577966974512685e-05, + "loss": 0.0026, + "num_input_tokens_seen": 1197640, + "step": 6285 + }, + { + "epoch": 3.269230769230769, + "grad_norm": 0.024469126015901566, + "learning_rate": 4.25618337611671e-05, + "loss": 0.002, + "num_input_tokens_seen": 1198664, + "step": 6290 + }, + { + "epoch": 3.2718295218295217, + "grad_norm": 0.12235717475414276, + "learning_rate": 4.254568609627145e-05, + "loss": 0.0027, + "num_input_tokens_seen": 1199656, + "step": 6295 + }, + { + "epoch": 3.274428274428274, + "grad_norm": 21.694913864135742, + "learning_rate": 4.2529523993113574e-05, + "loss": 0.0241, + "num_input_tokens_seen": 1200584, + "step": 6300 + }, + { + "epoch": 3.277027027027027, + "grad_norm": 0.08901014178991318, + "learning_rate": 4.2513347464993184e-05, + "loss": 0.0801, + "num_input_tokens_seen": 1201512, + "step": 6305 + }, + { + "epoch": 3.2796257796257797, + "grad_norm": 0.03125005215406418, + "learning_rate": 4.249715652522187e-05, + "loss": 0.0069, + "num_input_tokens_seen": 1202536, + "step": 6310 + }, + { + "epoch": 3.2822245322245323, + "grad_norm": 0.16246582567691803, + "learning_rate": 4.2480951187123084e-05, + "loss": 0.0043, + "num_input_tokens_seen": 1203432, + "step": 6315 + }, + { + "epoch": 3.284823284823285, + "grad_norm": 0.0998678058385849, + "learning_rate": 4.246473146403212e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1204360, + "step": 6320 + }, + { + "epoch": 3.2874220374220373, + "grad_norm": 0.02438625693321228, + "learning_rate": 4.2448497369296096e-05, + "loss": 0.1087, + "num_input_tokens_seen": 1205384, + "step": 6325 + }, + { + "epoch": 3.29002079002079, + "grad_norm": 0.018558884039521217, + "learning_rate": 4.2432248916273996e-05, + "loss": 0.0812, + "num_input_tokens_seen": 1206312, + "step": 6330 + }, + { + "epoch": 3.2926195426195424, + "grad_norm": 0.016841303557157516, + "learning_rate": 4.241598611833659e-05, + "loss": 0.001, + "num_input_tokens_seen": 1207304, + "step": 6335 + }, + { + "epoch": 3.2952182952182953, + "grad_norm": 0.01086138654500246, + "learning_rate": 4.239970898886645e-05, + "loss": 0.0101, + "num_input_tokens_seen": 1208232, + "step": 6340 + }, + { + "epoch": 3.297817047817048, + "grad_norm": 0.08035615086555481, + "learning_rate": 4.2383417541257954e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1209192, + "step": 6345 + }, + { + "epoch": 3.3004158004158004, + "grad_norm": 0.0101949917152524, + "learning_rate": 4.236711178891725e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1210056, + "step": 6350 + }, + { + "epoch": 3.303014553014553, + "grad_norm": 0.005180461797863245, + "learning_rate": 4.2350791745262274e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1211080, + "step": 6355 + }, + { + "epoch": 3.3056133056133055, + "grad_norm": 0.008334388956427574, + "learning_rate": 4.2334457423722704e-05, + "loss": 0.1138, + "num_input_tokens_seen": 1212072, + "step": 6360 + }, + { + "epoch": 3.3082120582120584, + "grad_norm": 0.048143792897462845, + "learning_rate": 4.231810883773999e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1212968, + "step": 6365 + }, + { + "epoch": 3.310810810810811, + "grad_norm": 0.032027050852775574, + "learning_rate": 4.230174600076729e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1213928, + "step": 6370 + }, + { + "epoch": 3.3134095634095635, + "grad_norm": 0.008816261775791645, + "learning_rate": 4.228536892626951e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1214824, + "step": 6375 + }, + { + "epoch": 3.316008316008316, + "grad_norm": 0.30214667320251465, + "learning_rate": 4.2268977627723285e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1215752, + "step": 6380 + }, + { + "epoch": 3.3186070686070686, + "grad_norm": 0.006106313783675432, + "learning_rate": 4.225257211861691e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1216712, + "step": 6385 + }, + { + "epoch": 3.321205821205821, + "grad_norm": 0.17212244868278503, + "learning_rate": 4.223615241245041e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1217704, + "step": 6390 + }, + { + "epoch": 3.3238045738045736, + "grad_norm": 0.014471504837274551, + "learning_rate": 4.221971852273549e-05, + "loss": 0.1794, + "num_input_tokens_seen": 1218600, + "step": 6395 + }, + { + "epoch": 3.3264033264033266, + "grad_norm": 0.006083118729293346, + "learning_rate": 4.2203270462995515e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1219528, + "step": 6400 + }, + { + "epoch": 3.329002079002079, + "grad_norm": 0.01487005315721035, + "learning_rate": 4.218680824676552e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1220456, + "step": 6405 + }, + { + "epoch": 3.3316008316008316, + "grad_norm": 31.01215362548828, + "learning_rate": 4.217033188759219e-05, + "loss": 0.0864, + "num_input_tokens_seen": 1221480, + "step": 6410 + }, + { + "epoch": 3.334199584199584, + "grad_norm": 0.03186456859111786, + "learning_rate": 4.215384139903382e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1222440, + "step": 6415 + }, + { + "epoch": 3.3367983367983367, + "grad_norm": 0.00628352165222168, + "learning_rate": 4.2137336794660384e-05, + "loss": 0.0028, + "num_input_tokens_seen": 1223368, + "step": 6420 + }, + { + "epoch": 3.3393970893970892, + "grad_norm": 0.00760975107550621, + "learning_rate": 4.212081808805342e-05, + "loss": 0.0741, + "num_input_tokens_seen": 1224328, + "step": 6425 + }, + { + "epoch": 3.3419958419958418, + "grad_norm": 0.013738819397985935, + "learning_rate": 4.210428529280611e-05, + "loss": 0.1472, + "num_input_tokens_seen": 1225288, + "step": 6430 + }, + { + "epoch": 3.3445945945945947, + "grad_norm": 0.05502315238118172, + "learning_rate": 4.2087738422523206e-05, + "loss": 0.0039, + "num_input_tokens_seen": 1226312, + "step": 6435 + }, + { + "epoch": 3.3471933471933473, + "grad_norm": 0.027822894975543022, + "learning_rate": 4.207117749082104e-05, + "loss": 0.059, + "num_input_tokens_seen": 1227304, + "step": 6440 + }, + { + "epoch": 3.3497920997921, + "grad_norm": 0.10234204679727554, + "learning_rate": 4.205460251132755e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1228264, + "step": 6445 + }, + { + "epoch": 3.3523908523908523, + "grad_norm": 0.08731049299240112, + "learning_rate": 4.2038013497682186e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1229256, + "step": 6450 + }, + { + "epoch": 3.354989604989605, + "grad_norm": 2.6954150199890137, + "learning_rate": 4.202141046353597e-05, + "loss": 0.2515, + "num_input_tokens_seen": 1230216, + "step": 6455 + }, + { + "epoch": 3.357588357588358, + "grad_norm": 0.05088431015610695, + "learning_rate": 4.2004793422551475e-05, + "loss": 0.0149, + "num_input_tokens_seen": 1231176, + "step": 6460 + }, + { + "epoch": 3.3601871101871104, + "grad_norm": 0.1101212352514267, + "learning_rate": 4.198816238840277e-05, + "loss": 0.0063, + "num_input_tokens_seen": 1232200, + "step": 6465 + }, + { + "epoch": 3.362785862785863, + "grad_norm": 0.03205224126577377, + "learning_rate": 4.197151737477547e-05, + "loss": 0.0026, + "num_input_tokens_seen": 1233128, + "step": 6470 + }, + { + "epoch": 3.3653846153846154, + "grad_norm": 0.0847291424870491, + "learning_rate": 4.195485839536666e-05, + "loss": 0.1081, + "num_input_tokens_seen": 1234056, + "step": 6475 + }, + { + "epoch": 3.367983367983368, + "grad_norm": 0.006647842936217785, + "learning_rate": 4.193818546388495e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1235016, + "step": 6480 + }, + { + "epoch": 3.3705821205821205, + "grad_norm": 0.03720966354012489, + "learning_rate": 4.192149859405041e-05, + "loss": 0.1447, + "num_input_tokens_seen": 1236008, + "step": 6485 + }, + { + "epoch": 3.373180873180873, + "grad_norm": 0.07665830105543137, + "learning_rate": 4.190479779959459e-05, + "loss": 0.0038, + "num_input_tokens_seen": 1236936, + "step": 6490 + }, + { + "epoch": 3.375779625779626, + "grad_norm": 0.044222839176654816, + "learning_rate": 4.1888083094260486e-05, + "loss": 0.004, + "num_input_tokens_seen": 1237896, + "step": 6495 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 1.7065943479537964, + "learning_rate": 4.187135449180256e-05, + "loss": 0.1077, + "num_input_tokens_seen": 1238856, + "step": 6500 + }, + { + "epoch": 3.380977130977131, + "grad_norm": 0.05177943781018257, + "learning_rate": 4.1854612005986704e-05, + "loss": 0.0033, + "num_input_tokens_seen": 1239816, + "step": 6505 + }, + { + "epoch": 3.3835758835758836, + "grad_norm": 0.11877058446407318, + "learning_rate": 4.1837855650590216e-05, + "loss": 0.1357, + "num_input_tokens_seen": 1240744, + "step": 6510 + }, + { + "epoch": 3.386174636174636, + "grad_norm": 0.07305299490690231, + "learning_rate": 4.182108543940183e-05, + "loss": 0.0044, + "num_input_tokens_seen": 1241736, + "step": 6515 + }, + { + "epoch": 3.3887733887733886, + "grad_norm": 0.5979126691818237, + "learning_rate": 4.1804301386221665e-05, + "loss": 0.0921, + "num_input_tokens_seen": 1242600, + "step": 6520 + }, + { + "epoch": 3.391372141372141, + "grad_norm": 6.232037544250488, + "learning_rate": 4.1787503504861256e-05, + "loss": 0.0829, + "num_input_tokens_seen": 1243560, + "step": 6525 + }, + { + "epoch": 3.393970893970894, + "grad_norm": 14.653985977172852, + "learning_rate": 4.1770691809143495e-05, + "loss": 0.2038, + "num_input_tokens_seen": 1244584, + "step": 6530 + }, + { + "epoch": 3.3965696465696467, + "grad_norm": 0.02587190829217434, + "learning_rate": 4.175386631290263e-05, + "loss": 0.0045, + "num_input_tokens_seen": 1245576, + "step": 6535 + }, + { + "epoch": 3.399168399168399, + "grad_norm": 0.17696446180343628, + "learning_rate": 4.1737027029984307e-05, + "loss": 0.0871, + "num_input_tokens_seen": 1246600, + "step": 6540 + }, + { + "epoch": 3.4017671517671517, + "grad_norm": 0.05486224964261055, + "learning_rate": 4.172017397424548e-05, + "loss": 0.0379, + "num_input_tokens_seen": 1247624, + "step": 6545 + }, + { + "epoch": 3.4043659043659042, + "grad_norm": 0.035082146525382996, + "learning_rate": 4.170330715955444e-05, + "loss": 0.0022, + "num_input_tokens_seen": 1248520, + "step": 6550 + }, + { + "epoch": 3.406964656964657, + "grad_norm": 0.029622724279761314, + "learning_rate": 4.1686426599790826e-05, + "loss": 0.0042, + "num_input_tokens_seen": 1249480, + "step": 6555 + }, + { + "epoch": 3.4095634095634098, + "grad_norm": 0.0863828957080841, + "learning_rate": 4.166953230884556e-05, + "loss": 0.0029, + "num_input_tokens_seen": 1250440, + "step": 6560 + }, + { + "epoch": 3.4121621621621623, + "grad_norm": 0.0469505712389946, + "learning_rate": 4.165262430062088e-05, + "loss": 0.0081, + "num_input_tokens_seen": 1251400, + "step": 6565 + }, + { + "epoch": 3.414760914760915, + "grad_norm": 0.008304592221975327, + "learning_rate": 4.16357025890303e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1252328, + "step": 6570 + }, + { + "epoch": 3.4173596673596673, + "grad_norm": 0.01661982387304306, + "learning_rate": 4.161876718799863e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1253224, + "step": 6575 + }, + { + "epoch": 3.41995841995842, + "grad_norm": 0.009716786444187164, + "learning_rate": 4.160181811146192e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1254088, + "step": 6580 + }, + { + "epoch": 3.4225571725571724, + "grad_norm": 0.007558475714176893, + "learning_rate": 4.158485537336748e-05, + "loss": 0.0024, + "num_input_tokens_seen": 1255048, + "step": 6585 + }, + { + "epoch": 3.4251559251559254, + "grad_norm": 0.00880619790405035, + "learning_rate": 4.156787898767388e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1255912, + "step": 6590 + }, + { + "epoch": 3.427754677754678, + "grad_norm": 0.006751602049916983, + "learning_rate": 4.15508889683509e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1256840, + "step": 6595 + }, + { + "epoch": 3.4303534303534304, + "grad_norm": 0.01381528377532959, + "learning_rate": 4.153388532937955e-05, + "loss": 0.1038, + "num_input_tokens_seen": 1257832, + "step": 6600 + }, + { + "epoch": 3.432952182952183, + "grad_norm": 0.0059337737038731575, + "learning_rate": 4.151686808475204e-05, + "loss": 0.1371, + "num_input_tokens_seen": 1258824, + "step": 6605 + }, + { + "epoch": 3.4355509355509355, + "grad_norm": 2.6079154014587402, + "learning_rate": 4.149983724847178e-05, + "loss": 0.1074, + "num_input_tokens_seen": 1259752, + "step": 6610 + }, + { + "epoch": 3.438149688149688, + "grad_norm": 0.4095555543899536, + "learning_rate": 4.1482792834553374e-05, + "loss": 0.1476, + "num_input_tokens_seen": 1260648, + "step": 6615 + }, + { + "epoch": 3.4407484407484406, + "grad_norm": 0.15267078578472137, + "learning_rate": 4.146573485702258e-05, + "loss": 0.0059, + "num_input_tokens_seen": 1261608, + "step": 6620 + }, + { + "epoch": 3.4433471933471935, + "grad_norm": 6.2442240715026855, + "learning_rate": 4.144866332991634e-05, + "loss": 0.1274, + "num_input_tokens_seen": 1262536, + "step": 6625 + }, + { + "epoch": 3.445945945945946, + "grad_norm": 0.16213850677013397, + "learning_rate": 4.143157826728271e-05, + "loss": 0.004, + "num_input_tokens_seen": 1263496, + "step": 6630 + }, + { + "epoch": 3.4485446985446986, + "grad_norm": 0.02592848800122738, + "learning_rate": 4.1414479683180926e-05, + "loss": 0.0033, + "num_input_tokens_seen": 1264424, + "step": 6635 + }, + { + "epoch": 3.451143451143451, + "grad_norm": 0.0233615729957819, + "learning_rate": 4.139736759168133e-05, + "loss": 0.1516, + "num_input_tokens_seen": 1265320, + "step": 6640 + }, + { + "epoch": 3.4537422037422036, + "grad_norm": 0.03134910389780998, + "learning_rate": 4.138024200686538e-05, + "loss": 0.1653, + "num_input_tokens_seen": 1266280, + "step": 6645 + }, + { + "epoch": 3.456340956340956, + "grad_norm": 0.069990374147892, + "learning_rate": 4.1363102942825634e-05, + "loss": 0.003, + "num_input_tokens_seen": 1267208, + "step": 6650 + }, + { + "epoch": 3.4589397089397087, + "grad_norm": 0.010742247104644775, + "learning_rate": 4.134595041366575e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1268168, + "step": 6655 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 0.0947619378566742, + "learning_rate": 4.1328784433500464e-05, + "loss": 0.1415, + "num_input_tokens_seen": 1269000, + "step": 6660 + }, + { + "epoch": 3.464137214137214, + "grad_norm": 0.024710072204470634, + "learning_rate": 4.131160501645558e-05, + "loss": 0.0595, + "num_input_tokens_seen": 1269992, + "step": 6665 + }, + { + "epoch": 3.4667359667359667, + "grad_norm": 0.03760754317045212, + "learning_rate": 4.1294412176667954e-05, + "loss": 0.0264, + "num_input_tokens_seen": 1270952, + "step": 6670 + }, + { + "epoch": 3.4693347193347193, + "grad_norm": 1.6739760637283325, + "learning_rate": 4.12772059282855e-05, + "loss": 0.1013, + "num_input_tokens_seen": 1271880, + "step": 6675 + }, + { + "epoch": 3.471933471933472, + "grad_norm": 0.1132260337471962, + "learning_rate": 4.1259986285467155e-05, + "loss": 0.1164, + "num_input_tokens_seen": 1272872, + "step": 6680 + }, + { + "epoch": 3.4745322245322248, + "grad_norm": 0.0728481337428093, + "learning_rate": 4.1242753262382884e-05, + "loss": 0.005, + "num_input_tokens_seen": 1273832, + "step": 6685 + }, + { + "epoch": 3.4771309771309773, + "grad_norm": 2.9771838188171387, + "learning_rate": 4.122550687321366e-05, + "loss": 0.2053, + "num_input_tokens_seen": 1274792, + "step": 6690 + }, + { + "epoch": 3.47972972972973, + "grad_norm": 0.049064408987760544, + "learning_rate": 4.1208247132151456e-05, + "loss": 0.0866, + "num_input_tokens_seen": 1275784, + "step": 6695 + }, + { + "epoch": 3.4823284823284824, + "grad_norm": 16.04593276977539, + "learning_rate": 4.119097405339922e-05, + "loss": 0.1107, + "num_input_tokens_seen": 1276776, + "step": 6700 + }, + { + "epoch": 3.484927234927235, + "grad_norm": 11.029412269592285, + "learning_rate": 4.11736876511709e-05, + "loss": 0.0106, + "num_input_tokens_seen": 1277768, + "step": 6705 + }, + { + "epoch": 3.4875259875259874, + "grad_norm": 3.4771759510040283, + "learning_rate": 4.11563879396914e-05, + "loss": 0.1266, + "num_input_tokens_seen": 1278696, + "step": 6710 + }, + { + "epoch": 3.49012474012474, + "grad_norm": 0.23206910490989685, + "learning_rate": 4.113907493319655e-05, + "loss": 0.0025, + "num_input_tokens_seen": 1279592, + "step": 6715 + }, + { + "epoch": 3.492723492723493, + "grad_norm": 0.02316444367170334, + "learning_rate": 4.1121748645933164e-05, + "loss": 0.0022, + "num_input_tokens_seen": 1280552, + "step": 6720 + }, + { + "epoch": 3.4953222453222454, + "grad_norm": 0.10922440886497498, + "learning_rate": 4.1104409092158943e-05, + "loss": 0.0059, + "num_input_tokens_seen": 1281576, + "step": 6725 + }, + { + "epoch": 3.497920997920998, + "grad_norm": 0.016019990667700768, + "learning_rate": 4.1087056286142544e-05, + "loss": 0.1207, + "num_input_tokens_seen": 1282440, + "step": 6730 + }, + { + "epoch": 3.5, + "eval_loss": 0.2044539600610733, + "eval_runtime": 9.2657, + "eval_samples_per_second": 92.383, + "eval_steps_per_second": 23.096, + "num_input_tokens_seen": 1283208, + "step": 6734 + }, + { + "epoch": 3.5005197505197505, + "grad_norm": 0.018464302644133568, + "learning_rate": 4.1069690242163484e-05, + "loss": 0.0047, + "num_input_tokens_seen": 1283400, + "step": 6735 + }, + { + "epoch": 3.503118503118503, + "grad_norm": 0.10051820427179337, + "learning_rate": 4.105231097451222e-05, + "loss": 0.0027, + "num_input_tokens_seen": 1284328, + "step": 6740 + }, + { + "epoch": 3.5057172557172556, + "grad_norm": 0.04977717995643616, + "learning_rate": 4.103491849749006e-05, + "loss": 0.094, + "num_input_tokens_seen": 1285288, + "step": 6745 + }, + { + "epoch": 3.508316008316008, + "grad_norm": 0.027939947322010994, + "learning_rate": 4.101751282540919e-05, + "loss": 0.0024, + "num_input_tokens_seen": 1286184, + "step": 6750 + }, + { + "epoch": 3.510914760914761, + "grad_norm": 9.946380615234375, + "learning_rate": 4.1000093972592654e-05, + "loss": 0.0859, + "num_input_tokens_seen": 1287144, + "step": 6755 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.010164506733417511, + "learning_rate": 4.098266195337436e-05, + "loss": 0.0059, + "num_input_tokens_seen": 1288168, + "step": 6760 + }, + { + "epoch": 3.516112266112266, + "grad_norm": 2.7412710189819336, + "learning_rate": 4.0965216782099004e-05, + "loss": 0.005, + "num_input_tokens_seen": 1289064, + "step": 6765 + }, + { + "epoch": 3.5187110187110187, + "grad_norm": 0.06201111525297165, + "learning_rate": 4.0947758473122165e-05, + "loss": 0.0989, + "num_input_tokens_seen": 1290088, + "step": 6770 + }, + { + "epoch": 3.521309771309771, + "grad_norm": 0.012443436309695244, + "learning_rate": 4.093028704081019e-05, + "loss": 0.0899, + "num_input_tokens_seen": 1291048, + "step": 6775 + }, + { + "epoch": 3.523908523908524, + "grad_norm": 0.014427277259528637, + "learning_rate": 4.091280249954024e-05, + "loss": 0.0516, + "num_input_tokens_seen": 1292008, + "step": 6780 + }, + { + "epoch": 3.5265072765072762, + "grad_norm": 0.01679864339530468, + "learning_rate": 4.089530486370025e-05, + "loss": 0.0854, + "num_input_tokens_seen": 1292936, + "step": 6785 + }, + { + "epoch": 3.529106029106029, + "grad_norm": 0.057523611932992935, + "learning_rate": 4.087779414768896e-05, + "loss": 0.1196, + "num_input_tokens_seen": 1293864, + "step": 6790 + }, + { + "epoch": 3.5317047817047817, + "grad_norm": 8.475729942321777, + "learning_rate": 4.086027036591585e-05, + "loss": 0.0088, + "num_input_tokens_seen": 1294792, + "step": 6795 + }, + { + "epoch": 3.5343035343035343, + "grad_norm": 0.2492896020412445, + "learning_rate": 4.084273353280115e-05, + "loss": 0.1002, + "num_input_tokens_seen": 1295784, + "step": 6800 + }, + { + "epoch": 3.536902286902287, + "grad_norm": 0.1341525763273239, + "learning_rate": 4.082518366277585e-05, + "loss": 0.1127, + "num_input_tokens_seen": 1296776, + "step": 6805 + }, + { + "epoch": 3.5395010395010393, + "grad_norm": 0.36124709248542786, + "learning_rate": 4.080762077028164e-05, + "loss": 0.0049, + "num_input_tokens_seen": 1297704, + "step": 6810 + }, + { + "epoch": 3.5420997920997923, + "grad_norm": 0.011725720018148422, + "learning_rate": 4.079004486977095e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1298600, + "step": 6815 + }, + { + "epoch": 3.544698544698545, + "grad_norm": 0.009387186728417873, + "learning_rate": 4.077245597570691e-05, + "loss": 0.015, + "num_input_tokens_seen": 1299560, + "step": 6820 + }, + { + "epoch": 3.5472972972972974, + "grad_norm": 0.032161083072423935, + "learning_rate": 4.075485410256332e-05, + "loss": 0.0022, + "num_input_tokens_seen": 1300488, + "step": 6825 + }, + { + "epoch": 3.54989604989605, + "grad_norm": 0.009732937440276146, + "learning_rate": 4.07372392648247e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1301480, + "step": 6830 + }, + { + "epoch": 3.5524948024948024, + "grad_norm": 4.497344493865967, + "learning_rate": 4.071961147698621e-05, + "loss": 0.0101, + "num_input_tokens_seen": 1302376, + "step": 6835 + }, + { + "epoch": 3.555093555093555, + "grad_norm": 0.005366755649447441, + "learning_rate": 4.070197075355366e-05, + "loss": 0.001, + "num_input_tokens_seen": 1303240, + "step": 6840 + }, + { + "epoch": 3.5576923076923075, + "grad_norm": 0.01772901974618435, + "learning_rate": 4.068431710904354e-05, + "loss": 0.0924, + "num_input_tokens_seen": 1304104, + "step": 6845 + }, + { + "epoch": 3.5602910602910605, + "grad_norm": 0.013732087798416615, + "learning_rate": 4.066665055798293e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1305032, + "step": 6850 + }, + { + "epoch": 3.562889812889813, + "grad_norm": 17.570526123046875, + "learning_rate": 4.0648971114909564e-05, + "loss": 0.1606, + "num_input_tokens_seen": 1305896, + "step": 6855 + }, + { + "epoch": 3.5654885654885655, + "grad_norm": 4.599792003631592, + "learning_rate": 4.0631278794371776e-05, + "loss": 0.1481, + "num_input_tokens_seen": 1306792, + "step": 6860 + }, + { + "epoch": 3.568087318087318, + "grad_norm": 0.023435495793819427, + "learning_rate": 4.0613573610928476e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1307752, + "step": 6865 + }, + { + "epoch": 3.5706860706860706, + "grad_norm": 25.438385009765625, + "learning_rate": 4.059585557914919e-05, + "loss": 0.0616, + "num_input_tokens_seen": 1308712, + "step": 6870 + }, + { + "epoch": 3.5732848232848236, + "grad_norm": 3.6861536502838135, + "learning_rate": 4.0578124713614e-05, + "loss": 0.1164, + "num_input_tokens_seen": 1309736, + "step": 6875 + }, + { + "epoch": 3.5758835758835756, + "grad_norm": 0.05815550684928894, + "learning_rate": 4.0560381028913544e-05, + "loss": 0.0025, + "num_input_tokens_seen": 1310696, + "step": 6880 + }, + { + "epoch": 3.5784823284823286, + "grad_norm": 6.353489875793457, + "learning_rate": 4.054262453964902e-05, + "loss": 0.0715, + "num_input_tokens_seen": 1311656, + "step": 6885 + }, + { + "epoch": 3.581081081081081, + "grad_norm": 0.03267017751932144, + "learning_rate": 4.052485526043217e-05, + "loss": 0.0113, + "num_input_tokens_seen": 1312552, + "step": 6890 + }, + { + "epoch": 3.5836798336798337, + "grad_norm": 0.02079511620104313, + "learning_rate": 4.050707320588524e-05, + "loss": 0.1675, + "num_input_tokens_seen": 1313576, + "step": 6895 + }, + { + "epoch": 3.586278586278586, + "grad_norm": 0.041692789644002914, + "learning_rate": 4.0489278390640996e-05, + "loss": 0.0049, + "num_input_tokens_seen": 1314504, + "step": 6900 + }, + { + "epoch": 3.5888773388773387, + "grad_norm": 0.18309782445430756, + "learning_rate": 4.047147082934272e-05, + "loss": 0.0052, + "num_input_tokens_seen": 1315464, + "step": 6905 + }, + { + "epoch": 3.5914760914760917, + "grad_norm": 5.4912848472595215, + "learning_rate": 4.045365053664415e-05, + "loss": 0.186, + "num_input_tokens_seen": 1316392, + "step": 6910 + }, + { + "epoch": 3.5940748440748442, + "grad_norm": 0.7521219253540039, + "learning_rate": 4.043581752720954e-05, + "loss": 0.005, + "num_input_tokens_seen": 1317352, + "step": 6915 + }, + { + "epoch": 3.5966735966735968, + "grad_norm": 2.94933819770813, + "learning_rate": 4.0417971815713584e-05, + "loss": 0.122, + "num_input_tokens_seen": 1318376, + "step": 6920 + }, + { + "epoch": 3.5992723492723493, + "grad_norm": 0.02234150841832161, + "learning_rate": 4.040011341684142e-05, + "loss": 0.0044, + "num_input_tokens_seen": 1319304, + "step": 6925 + }, + { + "epoch": 3.601871101871102, + "grad_norm": 0.06632765382528305, + "learning_rate": 4.038224234528866e-05, + "loss": 0.1145, + "num_input_tokens_seen": 1320264, + "step": 6930 + }, + { + "epoch": 3.6044698544698544, + "grad_norm": 22.804109573364258, + "learning_rate": 4.036435861576131e-05, + "loss": 0.1221, + "num_input_tokens_seen": 1321192, + "step": 6935 + }, + { + "epoch": 3.607068607068607, + "grad_norm": 0.0733417421579361, + "learning_rate": 4.0346462242975826e-05, + "loss": 0.0635, + "num_input_tokens_seen": 1322152, + "step": 6940 + }, + { + "epoch": 3.60966735966736, + "grad_norm": 0.04469003528356552, + "learning_rate": 4.032855324165902e-05, + "loss": 0.0751, + "num_input_tokens_seen": 1323048, + "step": 6945 + }, + { + "epoch": 3.6122661122661124, + "grad_norm": 0.05676696076989174, + "learning_rate": 4.031063162654815e-05, + "loss": 0.2032, + "num_input_tokens_seen": 1323976, + "step": 6950 + }, + { + "epoch": 3.614864864864865, + "grad_norm": 0.03940657526254654, + "learning_rate": 4.029269741239081e-05, + "loss": 0.0459, + "num_input_tokens_seen": 1324936, + "step": 6955 + }, + { + "epoch": 3.6174636174636174, + "grad_norm": 8.496716499328613, + "learning_rate": 4.027475061394499e-05, + "loss": 0.0252, + "num_input_tokens_seen": 1325832, + "step": 6960 + }, + { + "epoch": 3.62006237006237, + "grad_norm": 0.04337969422340393, + "learning_rate": 4.0256791245979024e-05, + "loss": 0.0043, + "num_input_tokens_seen": 1326728, + "step": 6965 + }, + { + "epoch": 3.6226611226611225, + "grad_norm": 0.09774535894393921, + "learning_rate": 4.023881932327159e-05, + "loss": 0.0044, + "num_input_tokens_seen": 1327624, + "step": 6970 + }, + { + "epoch": 3.625259875259875, + "grad_norm": 0.39155253767967224, + "learning_rate": 4.0220834860611705e-05, + "loss": 0.0093, + "num_input_tokens_seen": 1328616, + "step": 6975 + }, + { + "epoch": 3.627858627858628, + "grad_norm": 8.66998291015625, + "learning_rate": 4.0202837872798695e-05, + "loss": 0.0752, + "num_input_tokens_seen": 1329544, + "step": 6980 + }, + { + "epoch": 3.6304573804573805, + "grad_norm": 0.025905951857566833, + "learning_rate": 4.018482837464219e-05, + "loss": 0.2819, + "num_input_tokens_seen": 1330504, + "step": 6985 + }, + { + "epoch": 3.633056133056133, + "grad_norm": 0.5731920003890991, + "learning_rate": 4.016680638096212e-05, + "loss": 0.0051, + "num_input_tokens_seen": 1331400, + "step": 6990 + }, + { + "epoch": 3.6356548856548856, + "grad_norm": 0.05167840048670769, + "learning_rate": 4.0148771906588706e-05, + "loss": 0.1249, + "num_input_tokens_seen": 1332328, + "step": 6995 + }, + { + "epoch": 3.638253638253638, + "grad_norm": 0.03157031908631325, + "learning_rate": 4.013072496636243e-05, + "loss": 0.0069, + "num_input_tokens_seen": 1333256, + "step": 7000 + }, + { + "epoch": 3.640852390852391, + "grad_norm": 0.12079551815986633, + "learning_rate": 4.0112665575134033e-05, + "loss": 0.0066, + "num_input_tokens_seen": 1334216, + "step": 7005 + }, + { + "epoch": 3.643451143451143, + "grad_norm": 0.02100764773786068, + "learning_rate": 4.009459374776451e-05, + "loss": 0.0595, + "num_input_tokens_seen": 1335080, + "step": 7010 + }, + { + "epoch": 3.646049896049896, + "grad_norm": 0.044171106070280075, + "learning_rate": 4.007650949912506e-05, + "loss": 0.1085, + "num_input_tokens_seen": 1336040, + "step": 7015 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 3.41019868850708, + "learning_rate": 4.0058412844097153e-05, + "loss": 0.0054, + "num_input_tokens_seen": 1336968, + "step": 7020 + }, + { + "epoch": 3.651247401247401, + "grad_norm": 0.031100798398256302, + "learning_rate": 4.004030379757243e-05, + "loss": 0.1118, + "num_input_tokens_seen": 1337896, + "step": 7025 + }, + { + "epoch": 3.6538461538461537, + "grad_norm": 0.041374530643224716, + "learning_rate": 4.0022182374452736e-05, + "loss": 0.0045, + "num_input_tokens_seen": 1338856, + "step": 7030 + }, + { + "epoch": 3.6564449064449063, + "grad_norm": 21.176353454589844, + "learning_rate": 4.0004048589650104e-05, + "loss": 0.0727, + "num_input_tokens_seen": 1339816, + "step": 7035 + }, + { + "epoch": 3.6590436590436592, + "grad_norm": 0.016996068879961967, + "learning_rate": 3.9985902458086746e-05, + "loss": 0.0025, + "num_input_tokens_seen": 1340776, + "step": 7040 + }, + { + "epoch": 3.6616424116424118, + "grad_norm": 11.048589706420898, + "learning_rate": 3.996774399469502e-05, + "loss": 0.2655, + "num_input_tokens_seen": 1341704, + "step": 7045 + }, + { + "epoch": 3.6642411642411643, + "grad_norm": 0.02618660219013691, + "learning_rate": 3.9949573214417447e-05, + "loss": 0.0027, + "num_input_tokens_seen": 1342664, + "step": 7050 + }, + { + "epoch": 3.666839916839917, + "grad_norm": 0.06211112067103386, + "learning_rate": 3.993139013220668e-05, + "loss": 0.0164, + "num_input_tokens_seen": 1343560, + "step": 7055 + }, + { + "epoch": 3.6694386694386694, + "grad_norm": 0.11886706948280334, + "learning_rate": 3.9913194763025486e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1344520, + "step": 7060 + }, + { + "epoch": 3.672037422037422, + "grad_norm": 0.045469313859939575, + "learning_rate": 3.989498712184674e-05, + "loss": 0.126, + "num_input_tokens_seen": 1345416, + "step": 7065 + }, + { + "epoch": 3.6746361746361744, + "grad_norm": 0.01036488451063633, + "learning_rate": 3.9876767223653446e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1346376, + "step": 7070 + }, + { + "epoch": 3.6772349272349274, + "grad_norm": 0.029341086745262146, + "learning_rate": 3.985853508343865e-05, + "loss": 0.0038, + "num_input_tokens_seen": 1347368, + "step": 7075 + }, + { + "epoch": 3.67983367983368, + "grad_norm": 0.14254751801490784, + "learning_rate": 3.9840290716205495e-05, + "loss": 0.0024, + "num_input_tokens_seen": 1348328, + "step": 7080 + }, + { + "epoch": 3.6824324324324325, + "grad_norm": 0.010965270921587944, + "learning_rate": 3.98220341369672e-05, + "loss": 0.0042, + "num_input_tokens_seen": 1349288, + "step": 7085 + }, + { + "epoch": 3.685031185031185, + "grad_norm": 0.012149397283792496, + "learning_rate": 3.980376536074701e-05, + "loss": 0.0836, + "num_input_tokens_seen": 1350312, + "step": 7090 + }, + { + "epoch": 3.6876299376299375, + "grad_norm": 23.59425926208496, + "learning_rate": 3.9785484402578216e-05, + "loss": 0.2012, + "num_input_tokens_seen": 1351304, + "step": 7095 + }, + { + "epoch": 3.6902286902286905, + "grad_norm": 0.021200668066740036, + "learning_rate": 3.976719127750413e-05, + "loss": 0.0034, + "num_input_tokens_seen": 1352232, + "step": 7100 + }, + { + "epoch": 3.6928274428274426, + "grad_norm": 0.051120709627866745, + "learning_rate": 3.974888600057808e-05, + "loss": 0.0363, + "num_input_tokens_seen": 1353192, + "step": 7105 + }, + { + "epoch": 3.6954261954261955, + "grad_norm": 59.40955352783203, + "learning_rate": 3.9730568586863384e-05, + "loss": 0.1318, + "num_input_tokens_seen": 1354088, + "step": 7110 + }, + { + "epoch": 3.698024948024948, + "grad_norm": 1.4468554258346558, + "learning_rate": 3.971223905143336e-05, + "loss": 0.2197, + "num_input_tokens_seen": 1355048, + "step": 7115 + }, + { + "epoch": 3.7006237006237006, + "grad_norm": 14.019946098327637, + "learning_rate": 3.9693897409371316e-05, + "loss": 0.1343, + "num_input_tokens_seen": 1355976, + "step": 7120 + }, + { + "epoch": 3.703222453222453, + "grad_norm": 0.16850851476192474, + "learning_rate": 3.967554367577047e-05, + "loss": 0.0086, + "num_input_tokens_seen": 1357000, + "step": 7125 + }, + { + "epoch": 3.7058212058212057, + "grad_norm": 2.7595114707946777, + "learning_rate": 3.965717786573404e-05, + "loss": 0.0905, + "num_input_tokens_seen": 1357992, + "step": 7130 + }, + { + "epoch": 3.7084199584199586, + "grad_norm": 0.0698261484503746, + "learning_rate": 3.963879999437516e-05, + "loss": 0.1116, + "num_input_tokens_seen": 1358888, + "step": 7135 + }, + { + "epoch": 3.711018711018711, + "grad_norm": 0.07035528868436813, + "learning_rate": 3.962041007681691e-05, + "loss": 0.1381, + "num_input_tokens_seen": 1359816, + "step": 7140 + }, + { + "epoch": 3.7136174636174637, + "grad_norm": 0.0873626098036766, + "learning_rate": 3.960200812819223e-05, + "loss": 0.0113, + "num_input_tokens_seen": 1360744, + "step": 7145 + }, + { + "epoch": 3.7162162162162162, + "grad_norm": 0.353271484375, + "learning_rate": 3.9583594163644036e-05, + "loss": 0.0078, + "num_input_tokens_seen": 1361640, + "step": 7150 + }, + { + "epoch": 3.7188149688149688, + "grad_norm": 0.04137663170695305, + "learning_rate": 3.9565168198325064e-05, + "loss": 0.1159, + "num_input_tokens_seen": 1362632, + "step": 7155 + }, + { + "epoch": 3.7214137214137213, + "grad_norm": 0.2606944441795349, + "learning_rate": 3.954673024739797e-05, + "loss": 0.0301, + "num_input_tokens_seen": 1363656, + "step": 7160 + }, + { + "epoch": 3.724012474012474, + "grad_norm": 0.14518487453460693, + "learning_rate": 3.952828032603525e-05, + "loss": 0.0875, + "num_input_tokens_seen": 1364488, + "step": 7165 + }, + { + "epoch": 3.726611226611227, + "grad_norm": 1.6946595907211304, + "learning_rate": 3.950981844941926e-05, + "loss": 0.0142, + "num_input_tokens_seen": 1365448, + "step": 7170 + }, + { + "epoch": 3.7292099792099793, + "grad_norm": 1.7869012355804443, + "learning_rate": 3.949134463274218e-05, + "loss": 0.0071, + "num_input_tokens_seen": 1366472, + "step": 7175 + }, + { + "epoch": 3.731808731808732, + "grad_norm": 0.029556773602962494, + "learning_rate": 3.947285889120605e-05, + "loss": 0.0029, + "num_input_tokens_seen": 1367368, + "step": 7180 + }, + { + "epoch": 3.7344074844074844, + "grad_norm": 0.015370551496744156, + "learning_rate": 3.945436124002268e-05, + "loss": 0.1301, + "num_input_tokens_seen": 1368392, + "step": 7185 + }, + { + "epoch": 3.737006237006237, + "grad_norm": 0.023369763046503067, + "learning_rate": 3.94358516944137e-05, + "loss": 0.0555, + "num_input_tokens_seen": 1369352, + "step": 7190 + }, + { + "epoch": 3.73960498960499, + "grad_norm": 0.04248615354299545, + "learning_rate": 3.941733026961054e-05, + "loss": 0.0847, + "num_input_tokens_seen": 1370312, + "step": 7195 + }, + { + "epoch": 3.742203742203742, + "grad_norm": 0.03860875219106674, + "learning_rate": 3.939879698085439e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1371208, + "step": 7200 + }, + { + "epoch": 3.744802494802495, + "grad_norm": 0.019827771931886673, + "learning_rate": 3.93802518433962e-05, + "loss": 0.0639, + "num_input_tokens_seen": 1372200, + "step": 7205 + }, + { + "epoch": 3.7474012474012475, + "grad_norm": 0.16996821761131287, + "learning_rate": 3.936169487249667e-05, + "loss": 0.0102, + "num_input_tokens_seen": 1373160, + "step": 7210 + }, + { + "epoch": 3.75, + "grad_norm": 0.022853730246424675, + "learning_rate": 3.9343126083426264e-05, + "loss": 0.0721, + "num_input_tokens_seen": 1374056, + "step": 7215 + }, + { + "epoch": 3.7525987525987525, + "grad_norm": 0.35832279920578003, + "learning_rate": 3.932454549146513e-05, + "loss": 0.003, + "num_input_tokens_seen": 1374920, + "step": 7220 + }, + { + "epoch": 3.755197505197505, + "grad_norm": 0.1428830921649933, + "learning_rate": 3.930595311190316e-05, + "loss": 0.1399, + "num_input_tokens_seen": 1375880, + "step": 7225 + }, + { + "epoch": 3.757796257796258, + "grad_norm": 3.1641807556152344, + "learning_rate": 3.9287348960039926e-05, + "loss": 0.0754, + "num_input_tokens_seen": 1376872, + "step": 7230 + }, + { + "epoch": 3.76039501039501, + "grad_norm": 0.1700984388589859, + "learning_rate": 3.926873305118471e-05, + "loss": 0.0092, + "num_input_tokens_seen": 1377800, + "step": 7235 + }, + { + "epoch": 3.762993762993763, + "grad_norm": 0.04167687147855759, + "learning_rate": 3.9250105400656456e-05, + "loss": 0.0037, + "num_input_tokens_seen": 1378728, + "step": 7240 + }, + { + "epoch": 3.7655925155925156, + "grad_norm": 0.0420461930334568, + "learning_rate": 3.9231466023783756e-05, + "loss": 0.0064, + "num_input_tokens_seen": 1379688, + "step": 7245 + }, + { + "epoch": 3.768191268191268, + "grad_norm": 0.018603969365358353, + "learning_rate": 3.9212814935904874e-05, + "loss": 0.0657, + "num_input_tokens_seen": 1380616, + "step": 7250 + }, + { + "epoch": 3.7707900207900207, + "grad_norm": 2.8021368980407715, + "learning_rate": 3.9194152152367695e-05, + "loss": 0.1463, + "num_input_tokens_seen": 1381480, + "step": 7255 + }, + { + "epoch": 3.773388773388773, + "grad_norm": 0.015145990066230297, + "learning_rate": 3.917547768852975e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1382408, + "step": 7260 + }, + { + "epoch": 3.775987525987526, + "grad_norm": 0.05967561528086662, + "learning_rate": 3.915679155975815e-05, + "loss": 0.0281, + "num_input_tokens_seen": 1383400, + "step": 7265 + }, + { + "epoch": 3.7785862785862787, + "grad_norm": 43.0918083190918, + "learning_rate": 3.913809378142964e-05, + "loss": 0.0304, + "num_input_tokens_seen": 1384328, + "step": 7270 + }, + { + "epoch": 3.7811850311850312, + "grad_norm": 0.47144684195518494, + "learning_rate": 3.911938436893051e-05, + "loss": 0.0102, + "num_input_tokens_seen": 1385288, + "step": 7275 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.027249712496995926, + "learning_rate": 3.9100663337656676e-05, + "loss": 0.123, + "num_input_tokens_seen": 1386248, + "step": 7280 + }, + { + "epoch": 3.7863825363825363, + "grad_norm": 3.796875, + "learning_rate": 3.908193070301356e-05, + "loss": 0.0054, + "num_input_tokens_seen": 1387176, + "step": 7285 + }, + { + "epoch": 3.788981288981289, + "grad_norm": 24.032577514648438, + "learning_rate": 3.906318648041617e-05, + "loss": 0.1314, + "num_input_tokens_seen": 1388168, + "step": 7290 + }, + { + "epoch": 3.7915800415800414, + "grad_norm": 1.1286417245864868, + "learning_rate": 3.904443068528905e-05, + "loss": 0.0063, + "num_input_tokens_seen": 1389128, + "step": 7295 + }, + { + "epoch": 3.7941787941787943, + "grad_norm": 0.11423470079898834, + "learning_rate": 3.902566333306623e-05, + "loss": 0.1067, + "num_input_tokens_seen": 1390024, + "step": 7300 + }, + { + "epoch": 3.796777546777547, + "grad_norm": 0.2626819908618927, + "learning_rate": 3.900688443919129e-05, + "loss": 0.134, + "num_input_tokens_seen": 1391048, + "step": 7305 + }, + { + "epoch": 3.7993762993762994, + "grad_norm": 7.626429557800293, + "learning_rate": 3.8988094019117294e-05, + "loss": 0.1197, + "num_input_tokens_seen": 1392040, + "step": 7310 + }, + { + "epoch": 3.801975051975052, + "grad_norm": 0.026244372129440308, + "learning_rate": 3.896929208830679e-05, + "loss": 0.091, + "num_input_tokens_seen": 1392968, + "step": 7315 + }, + { + "epoch": 3.8045738045738045, + "grad_norm": 0.03468679264187813, + "learning_rate": 3.895047866223179e-05, + "loss": 0.1162, + "num_input_tokens_seen": 1393992, + "step": 7320 + }, + { + "epoch": 3.8071725571725574, + "grad_norm": 0.08042307943105698, + "learning_rate": 3.893165375637378e-05, + "loss": 0.0676, + "num_input_tokens_seen": 1394952, + "step": 7325 + }, + { + "epoch": 3.8097713097713095, + "grad_norm": 7.490211486816406, + "learning_rate": 3.891281738622369e-05, + "loss": 0.1195, + "num_input_tokens_seen": 1395816, + "step": 7330 + }, + { + "epoch": 3.8123700623700625, + "grad_norm": 0.16337049007415771, + "learning_rate": 3.889396956728187e-05, + "loss": 0.098, + "num_input_tokens_seen": 1396744, + "step": 7335 + }, + { + "epoch": 3.814968814968815, + "grad_norm": 4.095853805541992, + "learning_rate": 3.887511031505811e-05, + "loss": 0.1869, + "num_input_tokens_seen": 1397640, + "step": 7340 + }, + { + "epoch": 3.8175675675675675, + "grad_norm": 0.15990577638149261, + "learning_rate": 3.8856239645071604e-05, + "loss": 0.078, + "num_input_tokens_seen": 1398600, + "step": 7345 + }, + { + "epoch": 3.82016632016632, + "grad_norm": 0.124790258705616, + "learning_rate": 3.883735757285092e-05, + "loss": 0.0108, + "num_input_tokens_seen": 1399560, + "step": 7350 + }, + { + "epoch": 3.8227650727650726, + "grad_norm": 1.0447025299072266, + "learning_rate": 3.881846411393403e-05, + "loss": 0.008, + "num_input_tokens_seen": 1400520, + "step": 7355 + }, + { + "epoch": 3.8253638253638256, + "grad_norm": 0.11148123443126678, + "learning_rate": 3.879955928386829e-05, + "loss": 0.0043, + "num_input_tokens_seen": 1401480, + "step": 7360 + }, + { + "epoch": 3.827962577962578, + "grad_norm": 0.18985958397388458, + "learning_rate": 3.878064309821038e-05, + "loss": 0.0033, + "num_input_tokens_seen": 1402312, + "step": 7365 + }, + { + "epoch": 3.8305613305613306, + "grad_norm": 0.05710588023066521, + "learning_rate": 3.876171557252633e-05, + "loss": 0.1016, + "num_input_tokens_seen": 1403240, + "step": 7370 + }, + { + "epoch": 3.833160083160083, + "grad_norm": 0.01797644980251789, + "learning_rate": 3.874277672239154e-05, + "loss": 0.0779, + "num_input_tokens_seen": 1404200, + "step": 7375 + }, + { + "epoch": 3.8357588357588357, + "grad_norm": 0.06006081774830818, + "learning_rate": 3.872382656339068e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1405160, + "step": 7380 + }, + { + "epoch": 3.8383575883575882, + "grad_norm": 0.037442948669195175, + "learning_rate": 3.8704865111117746e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1406120, + "step": 7385 + }, + { + "epoch": 3.8409563409563408, + "grad_norm": 0.03479263558983803, + "learning_rate": 3.8685892381176034e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1407080, + "step": 7390 + }, + { + "epoch": 3.8435550935550937, + "grad_norm": 0.024211740121245384, + "learning_rate": 3.8666908389178127e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1407976, + "step": 7395 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 4.048192501068115, + "learning_rate": 3.864791315074583e-05, + "loss": 0.1386, + "num_input_tokens_seen": 1408968, + "step": 7400 + }, + { + "epoch": 3.848752598752599, + "grad_norm": 0.03334622085094452, + "learning_rate": 3.862890668151025e-05, + "loss": 0.001, + "num_input_tokens_seen": 1409832, + "step": 7405 + }, + { + "epoch": 3.8513513513513513, + "grad_norm": 12.289582252502441, + "learning_rate": 3.8609888997111734e-05, + "loss": 0.0093, + "num_input_tokens_seen": 1410760, + "step": 7410 + }, + { + "epoch": 3.853950103950104, + "grad_norm": 3.170605421066284, + "learning_rate": 3.8590860113199835e-05, + "loss": 0.2422, + "num_input_tokens_seen": 1411688, + "step": 7415 + }, + { + "epoch": 3.856548856548857, + "grad_norm": 0.06799587607383728, + "learning_rate": 3.8571820045433326e-05, + "loss": 0.0044, + "num_input_tokens_seen": 1412616, + "step": 7420 + }, + { + "epoch": 3.859147609147609, + "grad_norm": 0.06108073145151138, + "learning_rate": 3.85527688094802e-05, + "loss": 0.0047, + "num_input_tokens_seen": 1413544, + "step": 7425 + }, + { + "epoch": 3.861746361746362, + "grad_norm": 11.08521842956543, + "learning_rate": 3.8533706421017614e-05, + "loss": 0.1527, + "num_input_tokens_seen": 1414504, + "step": 7430 + }, + { + "epoch": 3.8643451143451144, + "grad_norm": 0.1826324611902237, + "learning_rate": 3.851463289573193e-05, + "loss": 0.0027, + "num_input_tokens_seen": 1415464, + "step": 7435 + }, + { + "epoch": 3.866943866943867, + "grad_norm": 1.3231409788131714, + "learning_rate": 3.8495548249318655e-05, + "loss": 0.1182, + "num_input_tokens_seen": 1416360, + "step": 7440 + }, + { + "epoch": 3.8695426195426195, + "grad_norm": 0.10146880894899368, + "learning_rate": 3.8476452497482466e-05, + "loss": 0.0031, + "num_input_tokens_seen": 1417256, + "step": 7445 + }, + { + "epoch": 3.872141372141372, + "grad_norm": 0.07821639627218246, + "learning_rate": 3.845734565593716e-05, + "loss": 0.0405, + "num_input_tokens_seen": 1418184, + "step": 7450 + }, + { + "epoch": 3.874740124740125, + "grad_norm": 0.022583289071917534, + "learning_rate": 3.843822774040565e-05, + "loss": 0.0229, + "num_input_tokens_seen": 1419144, + "step": 7455 + }, + { + "epoch": 3.8773388773388775, + "grad_norm": 0.014896747656166553, + "learning_rate": 3.841909876662001e-05, + "loss": 0.0933, + "num_input_tokens_seen": 1420072, + "step": 7460 + }, + { + "epoch": 3.87993762993763, + "grad_norm": 0.06389892101287842, + "learning_rate": 3.839995875032135e-05, + "loss": 0.1195, + "num_input_tokens_seen": 1421000, + "step": 7465 + }, + { + "epoch": 3.8825363825363826, + "grad_norm": 0.05238921940326691, + "learning_rate": 3.8380807707259923e-05, + "loss": 0.0785, + "num_input_tokens_seen": 1421928, + "step": 7470 + }, + { + "epoch": 3.885135135135135, + "grad_norm": 0.06273006647825241, + "learning_rate": 3.8361645653195026e-05, + "loss": 0.0054, + "num_input_tokens_seen": 1422984, + "step": 7475 + }, + { + "epoch": 3.8877338877338876, + "grad_norm": 0.07756107300519943, + "learning_rate": 3.8342472603895024e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1423880, + "step": 7480 + }, + { + "epoch": 3.89033264033264, + "grad_norm": 0.05095815658569336, + "learning_rate": 3.8323288575137316e-05, + "loss": 0.0029, + "num_input_tokens_seen": 1424808, + "step": 7485 + }, + { + "epoch": 3.892931392931393, + "grad_norm": 0.02203422412276268, + "learning_rate": 3.8304093582708366e-05, + "loss": 0.1311, + "num_input_tokens_seen": 1425704, + "step": 7490 + }, + { + "epoch": 3.8955301455301456, + "grad_norm": 0.02600092999637127, + "learning_rate": 3.828488764240363e-05, + "loss": 0.0064, + "num_input_tokens_seen": 1426600, + "step": 7495 + }, + { + "epoch": 3.898128898128898, + "grad_norm": 0.13300517201423645, + "learning_rate": 3.826567077002759e-05, + "loss": 0.0635, + "num_input_tokens_seen": 1427592, + "step": 7500 + }, + { + "epoch": 3.9007276507276507, + "grad_norm": 0.2594112455844879, + "learning_rate": 3.824644298139371e-05, + "loss": 0.0696, + "num_input_tokens_seen": 1428552, + "step": 7505 + }, + { + "epoch": 3.9033264033264032, + "grad_norm": 0.021001115441322327, + "learning_rate": 3.8227204292324484e-05, + "loss": 0.1047, + "num_input_tokens_seen": 1429512, + "step": 7510 + }, + { + "epoch": 3.9059251559251558, + "grad_norm": 18.334291458129883, + "learning_rate": 3.820795471865129e-05, + "loss": 0.1145, + "num_input_tokens_seen": 1430504, + "step": 7515 + }, + { + "epoch": 3.9085239085239083, + "grad_norm": 4.696917533874512, + "learning_rate": 3.818869427621453e-05, + "loss": 0.282, + "num_input_tokens_seen": 1431464, + "step": 7520 + }, + { + "epoch": 3.9111226611226613, + "grad_norm": 0.044826850295066833, + "learning_rate": 3.8169422980863544e-05, + "loss": 0.008, + "num_input_tokens_seen": 1432424, + "step": 7525 + }, + { + "epoch": 3.913721413721414, + "grad_norm": 1.309493899345398, + "learning_rate": 3.8150140848456574e-05, + "loss": 0.0084, + "num_input_tokens_seen": 1433384, + "step": 7530 + }, + { + "epoch": 3.9163201663201663, + "grad_norm": 0.0359896644949913, + "learning_rate": 3.81308478948608e-05, + "loss": 0.105, + "num_input_tokens_seen": 1434408, + "step": 7535 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.02619960904121399, + "learning_rate": 3.81115441359523e-05, + "loss": 0.0035, + "num_input_tokens_seen": 1435336, + "step": 7540 + }, + { + "epoch": 3.9215176715176714, + "grad_norm": 0.02368258871138096, + "learning_rate": 3.809222958761605e-05, + "loss": 0.0023, + "num_input_tokens_seen": 1436232, + "step": 7545 + }, + { + "epoch": 3.9241164241164244, + "grad_norm": 0.022220250219106674, + "learning_rate": 3.80729042657459e-05, + "loss": 0.0282, + "num_input_tokens_seen": 1437256, + "step": 7550 + }, + { + "epoch": 3.9267151767151764, + "grad_norm": 0.15739691257476807, + "learning_rate": 3.805356818624457e-05, + "loss": 0.1007, + "num_input_tokens_seen": 1438184, + "step": 7555 + }, + { + "epoch": 3.9293139293139294, + "grad_norm": 14.422882080078125, + "learning_rate": 3.8034221365023624e-05, + "loss": 0.0784, + "num_input_tokens_seen": 1439272, + "step": 7560 + }, + { + "epoch": 3.931912681912682, + "grad_norm": 0.1639624387025833, + "learning_rate": 3.801486381800347e-05, + "loss": 0.184, + "num_input_tokens_seen": 1440264, + "step": 7565 + }, + { + "epoch": 3.9345114345114345, + "grad_norm": 0.1251268982887268, + "learning_rate": 3.7995495561113336e-05, + "loss": 0.0325, + "num_input_tokens_seen": 1441192, + "step": 7570 + }, + { + "epoch": 3.937110187110187, + "grad_norm": 0.10504685342311859, + "learning_rate": 3.797611661029128e-05, + "loss": 0.0182, + "num_input_tokens_seen": 1442312, + "step": 7575 + }, + { + "epoch": 3.9397089397089395, + "grad_norm": 0.03469037637114525, + "learning_rate": 3.795672698148415e-05, + "loss": 0.0053, + "num_input_tokens_seen": 1443272, + "step": 7580 + }, + { + "epoch": 3.9423076923076925, + "grad_norm": 1.3904274702072144, + "learning_rate": 3.7937326690647556e-05, + "loss": 0.0029, + "num_input_tokens_seen": 1444200, + "step": 7585 + }, + { + "epoch": 3.944906444906445, + "grad_norm": 0.014277284033596516, + "learning_rate": 3.7917915753745935e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1445128, + "step": 7590 + }, + { + "epoch": 3.9475051975051976, + "grad_norm": 0.1116451546549797, + "learning_rate": 3.789849418675245e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1446024, + "step": 7595 + }, + { + "epoch": 3.95010395010395, + "grad_norm": 0.7496129870414734, + "learning_rate": 3.7879062005649e-05, + "loss": 0.0035, + "num_input_tokens_seen": 1446952, + "step": 7600 + }, + { + "epoch": 3.9527027027027026, + "grad_norm": 0.013573098927736282, + "learning_rate": 3.785961922642626e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1447912, + "step": 7605 + }, + { + "epoch": 3.955301455301455, + "grad_norm": 0.34069350361824036, + "learning_rate": 3.784016586508357e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1448936, + "step": 7610 + }, + { + "epoch": 3.9579002079002077, + "grad_norm": 0.012841531075537205, + "learning_rate": 3.782070193762904e-05, + "loss": 0.0251, + "num_input_tokens_seen": 1449896, + "step": 7615 + }, + { + "epoch": 3.9604989604989607, + "grad_norm": 0.1098603680729866, + "learning_rate": 3.7801227460079424e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1450824, + "step": 7620 + }, + { + "epoch": 3.963097713097713, + "grad_norm": 0.02247305028140545, + "learning_rate": 3.778174244846019e-05, + "loss": 0.0286, + "num_input_tokens_seen": 1451848, + "step": 7625 + }, + { + "epoch": 3.9656964656964657, + "grad_norm": 0.02341170236468315, + "learning_rate": 3.776224691880545e-05, + "loss": 0.1169, + "num_input_tokens_seen": 1452776, + "step": 7630 + }, + { + "epoch": 3.9682952182952183, + "grad_norm": 0.007620465476065874, + "learning_rate": 3.7742740887158e-05, + "loss": 0.1009, + "num_input_tokens_seen": 1453736, + "step": 7635 + }, + { + "epoch": 3.970893970893971, + "grad_norm": 4.877505302429199, + "learning_rate": 3.772322436956924e-05, + "loss": 0.0846, + "num_input_tokens_seen": 1454728, + "step": 7640 + }, + { + "epoch": 3.9734927234927238, + "grad_norm": 14.374533653259277, + "learning_rate": 3.7703697382099234e-05, + "loss": 0.1866, + "num_input_tokens_seen": 1455624, + "step": 7645 + }, + { + "epoch": 3.976091476091476, + "grad_norm": 0.012155424803495407, + "learning_rate": 3.768415994081664e-05, + "loss": 0.0292, + "num_input_tokens_seen": 1456520, + "step": 7650 + }, + { + "epoch": 3.978690228690229, + "grad_norm": 7.019650459289551, + "learning_rate": 3.766461206179874e-05, + "loss": 0.292, + "num_input_tokens_seen": 1457672, + "step": 7655 + }, + { + "epoch": 3.9812889812889813, + "grad_norm": 0.01428857259452343, + "learning_rate": 3.764505376113138e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1458600, + "step": 7660 + }, + { + "epoch": 3.983887733887734, + "grad_norm": 4.350991249084473, + "learning_rate": 3.762548505490899e-05, + "loss": 0.0994, + "num_input_tokens_seen": 1459496, + "step": 7665 + }, + { + "epoch": 3.9864864864864864, + "grad_norm": 0.07573389261960983, + "learning_rate": 3.7605905959234576e-05, + "loss": 0.1248, + "num_input_tokens_seen": 1460456, + "step": 7670 + }, + { + "epoch": 3.989085239085239, + "grad_norm": 0.030833104625344276, + "learning_rate": 3.758631649021968e-05, + "loss": 0.0027, + "num_input_tokens_seen": 1461352, + "step": 7675 + }, + { + "epoch": 3.991683991683992, + "grad_norm": 0.11129091680049896, + "learning_rate": 3.756671666398438e-05, + "loss": 0.1122, + "num_input_tokens_seen": 1462440, + "step": 7680 + }, + { + "epoch": 3.9942827442827444, + "grad_norm": 0.038610998541116714, + "learning_rate": 3.754710649665728e-05, + "loss": 0.0034, + "num_input_tokens_seen": 1463432, + "step": 7685 + }, + { + "epoch": 3.996881496881497, + "grad_norm": 0.048576317727565765, + "learning_rate": 3.7527486004375506e-05, + "loss": 0.0996, + "num_input_tokens_seen": 1464328, + "step": 7690 + }, + { + "epoch": 3.9994802494802495, + "grad_norm": 0.5468044281005859, + "learning_rate": 3.750785520328465e-05, + "loss": 0.0045, + "num_input_tokens_seen": 1465320, + "step": 7695 + }, + { + "epoch": 4.0, + "eval_loss": 0.18106698989868164, + "eval_runtime": 9.2913, + "eval_samples_per_second": 92.13, + "eval_steps_per_second": 23.032, + "num_input_tokens_seen": 1465464, + "step": 7696 + }, + { + "epoch": 4.002079002079002, + "grad_norm": 0.15612274408340454, + "learning_rate": 3.748821410953882e-05, + "loss": 0.003, + "num_input_tokens_seen": 1466136, + "step": 7700 + }, + { + "epoch": 4.004677754677755, + "grad_norm": 0.02714856155216694, + "learning_rate": 3.746856273930058e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1467096, + "step": 7705 + }, + { + "epoch": 4.007276507276507, + "grad_norm": 0.023257816210389137, + "learning_rate": 3.744890110874093e-05, + "loss": 0.002, + "num_input_tokens_seen": 1468056, + "step": 7710 + }, + { + "epoch": 4.00987525987526, + "grad_norm": 0.014957001432776451, + "learning_rate": 3.742922923403935e-05, + "loss": 0.0059, + "num_input_tokens_seen": 1468984, + "step": 7715 + }, + { + "epoch": 4.012474012474012, + "grad_norm": 0.33369922637939453, + "learning_rate": 3.740954713138373e-05, + "loss": 0.0029, + "num_input_tokens_seen": 1469976, + "step": 7720 + }, + { + "epoch": 4.015072765072765, + "grad_norm": 0.01597665622830391, + "learning_rate": 3.7389854816970386e-05, + "loss": 0.001, + "num_input_tokens_seen": 1470904, + "step": 7725 + }, + { + "epoch": 4.017671517671518, + "grad_norm": 0.05265701189637184, + "learning_rate": 3.737015230700402e-05, + "loss": 0.001, + "num_input_tokens_seen": 1471800, + "step": 7730 + }, + { + "epoch": 4.02027027027027, + "grad_norm": 0.028743376955389977, + "learning_rate": 3.7350439617697734e-05, + "loss": 0.0527, + "num_input_tokens_seen": 1472760, + "step": 7735 + }, + { + "epoch": 4.022869022869023, + "grad_norm": 0.009255556389689445, + "learning_rate": 3.733071676527302e-05, + "loss": 0.1175, + "num_input_tokens_seen": 1473752, + "step": 7740 + }, + { + "epoch": 4.025467775467775, + "grad_norm": 0.36242586374282837, + "learning_rate": 3.73109837659597e-05, + "loss": 0.0034, + "num_input_tokens_seen": 1474712, + "step": 7745 + }, + { + "epoch": 4.028066528066528, + "grad_norm": 0.026815000921487808, + "learning_rate": 3.7291240635995985e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1475704, + "step": 7750 + }, + { + "epoch": 4.03066528066528, + "grad_norm": 0.008508036844432354, + "learning_rate": 3.727148739162839e-05, + "loss": 0.0031, + "num_input_tokens_seen": 1476632, + "step": 7755 + }, + { + "epoch": 4.033264033264033, + "grad_norm": 0.031505536288022995, + "learning_rate": 3.725172404911177e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1477592, + "step": 7760 + }, + { + "epoch": 4.035862785862786, + "grad_norm": 2.5582478046417236, + "learning_rate": 3.723195062470929e-05, + "loss": 0.1435, + "num_input_tokens_seen": 1478616, + "step": 7765 + }, + { + "epoch": 4.038461538461538, + "grad_norm": 0.01479694526642561, + "learning_rate": 3.7212167134692414e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1479576, + "step": 7770 + }, + { + "epoch": 4.041060291060291, + "grad_norm": 0.0194613728672266, + "learning_rate": 3.719237359534087e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1480504, + "step": 7775 + }, + { + "epoch": 4.043659043659043, + "grad_norm": 0.15941563248634338, + "learning_rate": 3.717257002294267e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1481432, + "step": 7780 + }, + { + "epoch": 4.046257796257796, + "grad_norm": 0.01558847539126873, + "learning_rate": 3.715275643379408e-05, + "loss": 0.0024, + "num_input_tokens_seen": 1482360, + "step": 7785 + }, + { + "epoch": 4.048856548856548, + "grad_norm": 0.9382162690162659, + "learning_rate": 3.7132932844199614e-05, + "loss": 0.1251, + "num_input_tokens_seen": 1483352, + "step": 7790 + }, + { + "epoch": 4.051455301455301, + "grad_norm": 0.029014891013503075, + "learning_rate": 3.7113099270472005e-05, + "loss": 0.028, + "num_input_tokens_seen": 1484280, + "step": 7795 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 0.2919447720050812, + "learning_rate": 3.709325572893221e-05, + "loss": 0.0048, + "num_input_tokens_seen": 1485208, + "step": 7800 + }, + { + "epoch": 4.0566528066528065, + "grad_norm": 0.08556835353374481, + "learning_rate": 3.707340223590939e-05, + "loss": 0.0033, + "num_input_tokens_seen": 1486136, + "step": 7805 + }, + { + "epoch": 4.0592515592515594, + "grad_norm": 0.03508737310767174, + "learning_rate": 3.705353880774088e-05, + "loss": 0.0023, + "num_input_tokens_seen": 1487032, + "step": 7810 + }, + { + "epoch": 4.0618503118503115, + "grad_norm": 0.015032107010483742, + "learning_rate": 3.703366546077221e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1487960, + "step": 7815 + }, + { + "epoch": 4.0644490644490645, + "grad_norm": 0.03318740800023079, + "learning_rate": 3.701378221135707e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1488952, + "step": 7820 + }, + { + "epoch": 4.0670478170478175, + "grad_norm": 0.011091838590800762, + "learning_rate": 3.699388907585727e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1489912, + "step": 7825 + }, + { + "epoch": 4.06964656964657, + "grad_norm": 0.012640907429158688, + "learning_rate": 3.697398607064279e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1490872, + "step": 7830 + }, + { + "epoch": 4.0722453222453225, + "grad_norm": 0.02349909394979477, + "learning_rate": 3.695407321209172e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1491800, + "step": 7835 + }, + { + "epoch": 4.074844074844075, + "grad_norm": 5.7771100997924805, + "learning_rate": 3.693415051659026e-05, + "loss": 0.0941, + "num_input_tokens_seen": 1492760, + "step": 7840 + }, + { + "epoch": 4.077442827442828, + "grad_norm": 0.036203011870384216, + "learning_rate": 3.69142180005327e-05, + "loss": 0.0414, + "num_input_tokens_seen": 1493688, + "step": 7845 + }, + { + "epoch": 4.08004158004158, + "grad_norm": 0.01074804738163948, + "learning_rate": 3.689427568032141e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1494616, + "step": 7850 + }, + { + "epoch": 4.082640332640333, + "grad_norm": 0.009409759193658829, + "learning_rate": 3.687432357236683e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1495544, + "step": 7855 + }, + { + "epoch": 4.085239085239086, + "grad_norm": 0.020491180941462517, + "learning_rate": 3.685436169308746e-05, + "loss": 0.1576, + "num_input_tokens_seen": 1496504, + "step": 7860 + }, + { + "epoch": 4.087837837837838, + "grad_norm": 0.022371066734194756, + "learning_rate": 3.683439005890983e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1497528, + "step": 7865 + }, + { + "epoch": 4.090436590436591, + "grad_norm": 0.014631272293627262, + "learning_rate": 3.681440868626851e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1498456, + "step": 7870 + }, + { + "epoch": 4.093035343035343, + "grad_norm": 0.030003223568201065, + "learning_rate": 3.679441759160608e-05, + "loss": 0.1344, + "num_input_tokens_seen": 1499416, + "step": 7875 + }, + { + "epoch": 4.095634095634096, + "grad_norm": 0.006752966903150082, + "learning_rate": 3.677441679137311e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1500376, + "step": 7880 + }, + { + "epoch": 4.098232848232848, + "grad_norm": 0.006461648270487785, + "learning_rate": 3.675440630202817e-05, + "loss": 0.001, + "num_input_tokens_seen": 1501368, + "step": 7885 + }, + { + "epoch": 4.100831600831601, + "grad_norm": 0.012125938199460506, + "learning_rate": 3.673438614003778e-05, + "loss": 0.1425, + "num_input_tokens_seen": 1502296, + "step": 7890 + }, + { + "epoch": 4.103430353430354, + "grad_norm": 0.011790072545409203, + "learning_rate": 3.671435632187646e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1503224, + "step": 7895 + }, + { + "epoch": 4.106029106029106, + "grad_norm": 0.015942715108394623, + "learning_rate": 3.669431686402664e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1504184, + "step": 7900 + }, + { + "epoch": 4.108627858627859, + "grad_norm": 0.010757236741483212, + "learning_rate": 3.667426778297871e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1505112, + "step": 7905 + }, + { + "epoch": 4.111226611226611, + "grad_norm": 0.029068531468510628, + "learning_rate": 3.6654209095230935e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1506040, + "step": 7910 + }, + { + "epoch": 4.113825363825364, + "grad_norm": 0.043745674192905426, + "learning_rate": 3.663414081728954e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1506968, + "step": 7915 + }, + { + "epoch": 4.116424116424117, + "grad_norm": 0.018969399854540825, + "learning_rate": 3.6614062965668614e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1507928, + "step": 7920 + }, + { + "epoch": 4.119022869022869, + "grad_norm": 0.010622306726872921, + "learning_rate": 3.6593975556890106e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1508856, + "step": 7925 + }, + { + "epoch": 4.121621621621622, + "grad_norm": 0.02119065448641777, + "learning_rate": 3.657387860748387e-05, + "loss": 0.0089, + "num_input_tokens_seen": 1509848, + "step": 7930 + }, + { + "epoch": 4.124220374220374, + "grad_norm": 0.008275649510324001, + "learning_rate": 3.655377213398759e-05, + "loss": 0.001, + "num_input_tokens_seen": 1510744, + "step": 7935 + }, + { + "epoch": 4.126819126819127, + "grad_norm": 0.04377612844109535, + "learning_rate": 3.653365615294678e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1511704, + "step": 7940 + }, + { + "epoch": 4.129417879417879, + "grad_norm": 0.008830524981021881, + "learning_rate": 3.651353068091479e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1512536, + "step": 7945 + }, + { + "epoch": 4.132016632016632, + "grad_norm": 0.0076597039587795734, + "learning_rate": 3.649339573445277e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1513432, + "step": 7950 + }, + { + "epoch": 4.134615384615385, + "grad_norm": 0.008984272368252277, + "learning_rate": 3.647325133012969e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1514424, + "step": 7955 + }, + { + "epoch": 4.137214137214137, + "grad_norm": 2.2006609439849854, + "learning_rate": 3.6453097484522257e-05, + "loss": 0.1364, + "num_input_tokens_seen": 1515416, + "step": 7960 + }, + { + "epoch": 4.13981288981289, + "grad_norm": 0.019640833139419556, + "learning_rate": 3.6432934214215e-05, + "loss": 0.1135, + "num_input_tokens_seen": 1516440, + "step": 7965 + }, + { + "epoch": 4.142411642411642, + "grad_norm": 0.02385815791785717, + "learning_rate": 3.641276153580016e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1517432, + "step": 7970 + }, + { + "epoch": 4.145010395010395, + "grad_norm": 0.09015733748674393, + "learning_rate": 3.6392579465877754e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1518456, + "step": 7975 + }, + { + "epoch": 4.147609147609147, + "grad_norm": 11.280288696289062, + "learning_rate": 3.63723880210555e-05, + "loss": 0.1324, + "num_input_tokens_seen": 1519384, + "step": 7980 + }, + { + "epoch": 4.1502079002079, + "grad_norm": 0.006035329308360815, + "learning_rate": 3.635218721794886e-05, + "loss": 0.0988, + "num_input_tokens_seen": 1520312, + "step": 7985 + }, + { + "epoch": 4.152806652806653, + "grad_norm": 0.019403938204050064, + "learning_rate": 3.6331977073180964e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1521336, + "step": 7990 + }, + { + "epoch": 4.155405405405405, + "grad_norm": 0.04011289402842522, + "learning_rate": 3.631175760338265e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1522296, + "step": 7995 + }, + { + "epoch": 4.158004158004158, + "grad_norm": 0.012321539223194122, + "learning_rate": 3.629152882519242e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1523256, + "step": 8000 + }, + { + "epoch": 4.16060291060291, + "grad_norm": 0.6717885732650757, + "learning_rate": 3.627129075525645e-05, + "loss": 0.0043, + "num_input_tokens_seen": 1524216, + "step": 8005 + }, + { + "epoch": 4.163201663201663, + "grad_norm": 0.006419390439987183, + "learning_rate": 3.625104341022854e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1525208, + "step": 8010 + }, + { + "epoch": 4.165800415800415, + "grad_norm": 0.015656279399991035, + "learning_rate": 3.623078680677014e-05, + "loss": 0.001, + "num_input_tokens_seen": 1526168, + "step": 8015 + }, + { + "epoch": 4.168399168399168, + "grad_norm": 0.13219135999679565, + "learning_rate": 3.6210520961550314e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1527064, + "step": 8020 + }, + { + "epoch": 4.170997920997921, + "grad_norm": 0.020427633076906204, + "learning_rate": 3.619024589124573e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1527960, + "step": 8025 + }, + { + "epoch": 4.173596673596673, + "grad_norm": 0.012042947113513947, + "learning_rate": 3.6169961612540645e-05, + "loss": 0.1452, + "num_input_tokens_seen": 1528920, + "step": 8030 + }, + { + "epoch": 4.176195426195426, + "grad_norm": 0.021907569840550423, + "learning_rate": 3.61496681421269e-05, + "loss": 0.188, + "num_input_tokens_seen": 1529912, + "step": 8035 + }, + { + "epoch": 4.1787941787941785, + "grad_norm": 0.02190912514925003, + "learning_rate": 3.61293654967039e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1530776, + "step": 8040 + }, + { + "epoch": 4.1813929313929314, + "grad_norm": 0.014146735891699791, + "learning_rate": 3.610905369297859e-05, + "loss": 0.002, + "num_input_tokens_seen": 1531736, + "step": 8045 + }, + { + "epoch": 4.183991683991684, + "grad_norm": 3.3714306354522705, + "learning_rate": 3.608873274766545e-05, + "loss": 0.1081, + "num_input_tokens_seen": 1532632, + "step": 8050 + }, + { + "epoch": 4.1865904365904365, + "grad_norm": 0.10785745084285736, + "learning_rate": 3.60684026774865e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1533624, + "step": 8055 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.0579376146197319, + "learning_rate": 3.604806349917126e-05, + "loss": 0.0028, + "num_input_tokens_seen": 1534616, + "step": 8060 + }, + { + "epoch": 4.191787941787942, + "grad_norm": 0.015051578171551228, + "learning_rate": 3.6027715229456734e-05, + "loss": 0.0829, + "num_input_tokens_seen": 1535576, + "step": 8065 + }, + { + "epoch": 4.1943866943866945, + "grad_norm": 0.01386975310742855, + "learning_rate": 3.600735788508743e-05, + "loss": 0.003, + "num_input_tokens_seen": 1536536, + "step": 8070 + }, + { + "epoch": 4.196985446985447, + "grad_norm": 0.017352603375911713, + "learning_rate": 3.59869914828153e-05, + "loss": 0.0023, + "num_input_tokens_seen": 1537400, + "step": 8075 + }, + { + "epoch": 4.1995841995842, + "grad_norm": 0.011258319020271301, + "learning_rate": 3.596661603939977e-05, + "loss": 0.0598, + "num_input_tokens_seen": 1538264, + "step": 8080 + }, + { + "epoch": 4.202182952182953, + "grad_norm": 0.02816794440150261, + "learning_rate": 3.594623157160769e-05, + "loss": 0.0182, + "num_input_tokens_seen": 1539192, + "step": 8085 + }, + { + "epoch": 4.204781704781705, + "grad_norm": 0.0077084205113351345, + "learning_rate": 3.592583809621334e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1540120, + "step": 8090 + }, + { + "epoch": 4.207380457380458, + "grad_norm": 0.01048305630683899, + "learning_rate": 3.590543562999841e-05, + "loss": 0.0055, + "num_input_tokens_seen": 1541048, + "step": 8095 + }, + { + "epoch": 4.20997920997921, + "grad_norm": 0.006227817386388779, + "learning_rate": 3.588502418975201e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1542104, + "step": 8100 + }, + { + "epoch": 4.212577962577963, + "grad_norm": 0.006259117275476456, + "learning_rate": 3.5864603792270604e-05, + "loss": 0.001, + "num_input_tokens_seen": 1542968, + "step": 8105 + }, + { + "epoch": 4.215176715176715, + "grad_norm": 0.06942945718765259, + "learning_rate": 3.584417445435805e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1543896, + "step": 8110 + }, + { + "epoch": 4.217775467775468, + "grad_norm": 0.019199704751372337, + "learning_rate": 3.5823736192825545e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1544792, + "step": 8115 + }, + { + "epoch": 4.220374220374221, + "grad_norm": 0.02652125433087349, + "learning_rate": 3.580328902449164e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1545656, + "step": 8120 + }, + { + "epoch": 4.222972972972973, + "grad_norm": 0.016583167016506195, + "learning_rate": 3.578283296618221e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1546648, + "step": 8125 + }, + { + "epoch": 4.225571725571726, + "grad_norm": 0.02277185767889023, + "learning_rate": 3.5762368034730466e-05, + "loss": 0.2511, + "num_input_tokens_seen": 1547640, + "step": 8130 + }, + { + "epoch": 4.228170478170478, + "grad_norm": 0.03258303925395012, + "learning_rate": 3.574189424697688e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1548600, + "step": 8135 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 0.0052109165117144585, + "learning_rate": 3.5721411619769254e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1549560, + "step": 8140 + }, + { + "epoch": 4.233367983367984, + "grad_norm": 27.24933433532715, + "learning_rate": 3.5700920169962626e-05, + "loss": 0.1295, + "num_input_tokens_seen": 1550488, + "step": 8145 + }, + { + "epoch": 4.235966735966736, + "grad_norm": 0.02773812972009182, + "learning_rate": 3.568041991441934e-05, + "loss": 0.0035, + "num_input_tokens_seen": 1551448, + "step": 8150 + }, + { + "epoch": 4.238565488565489, + "grad_norm": 0.053699344396591187, + "learning_rate": 3.5659910870008934e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1552376, + "step": 8155 + }, + { + "epoch": 4.241164241164241, + "grad_norm": 0.002660207450389862, + "learning_rate": 3.563939305360822e-05, + "loss": 0.0034, + "num_input_tokens_seen": 1553272, + "step": 8160 + }, + { + "epoch": 4.243762993762994, + "grad_norm": 0.015079510398209095, + "learning_rate": 3.56188664821012e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1554232, + "step": 8165 + }, + { + "epoch": 4.246361746361746, + "grad_norm": 0.11943867802619934, + "learning_rate": 3.55983311723791e-05, + "loss": 0.001, + "num_input_tokens_seen": 1555192, + "step": 8170 + }, + { + "epoch": 4.248960498960499, + "grad_norm": 0.01427221205085516, + "learning_rate": 3.557778714134033e-05, + "loss": 0.1306, + "num_input_tokens_seen": 1556152, + "step": 8175 + }, + { + "epoch": 4.251559251559252, + "grad_norm": 0.008594769984483719, + "learning_rate": 3.555723440589047e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1557048, + "step": 8180 + }, + { + "epoch": 4.254158004158004, + "grad_norm": 0.010364443995058537, + "learning_rate": 3.5536672982942275e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1558008, + "step": 8185 + }, + { + "epoch": 4.256756756756757, + "grad_norm": 0.00975821539759636, + "learning_rate": 3.551610288941564e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1558968, + "step": 8190 + }, + { + "epoch": 4.259355509355509, + "grad_norm": 0.0138521334156394, + "learning_rate": 3.549552414223761e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1559928, + "step": 8195 + }, + { + "epoch": 4.261954261954262, + "grad_norm": 0.011820415034890175, + "learning_rate": 3.547493675834232e-05, + "loss": 0.2401, + "num_input_tokens_seen": 1560856, + "step": 8200 + }, + { + "epoch": 4.264553014553014, + "grad_norm": 0.007262366358190775, + "learning_rate": 3.545434075467103e-05, + "loss": 0.0211, + "num_input_tokens_seen": 1561848, + "step": 8205 + }, + { + "epoch": 4.267151767151767, + "grad_norm": 0.02084948867559433, + "learning_rate": 3.543373614817212e-05, + "loss": 0.002, + "num_input_tokens_seen": 1562840, + "step": 8210 + }, + { + "epoch": 4.26975051975052, + "grad_norm": 0.009805666282773018, + "learning_rate": 3.5413122955801005e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1563864, + "step": 8215 + }, + { + "epoch": 4.272349272349272, + "grad_norm": 0.010473538190126419, + "learning_rate": 3.5392501194520174e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1564888, + "step": 8220 + }, + { + "epoch": 4.274948024948025, + "grad_norm": 0.018398303538560867, + "learning_rate": 3.537187088129919e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1565784, + "step": 8225 + }, + { + "epoch": 4.277546777546777, + "grad_norm": 0.06350994110107422, + "learning_rate": 3.535123203311464e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1566712, + "step": 8230 + }, + { + "epoch": 4.28014553014553, + "grad_norm": 0.012673432007431984, + "learning_rate": 3.533058466695013e-05, + "loss": 0.0874, + "num_input_tokens_seen": 1567704, + "step": 8235 + }, + { + "epoch": 4.282744282744282, + "grad_norm": 0.025033799931406975, + "learning_rate": 3.530992879979629e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1568696, + "step": 8240 + }, + { + "epoch": 4.285343035343035, + "grad_norm": 5.638076305389404, + "learning_rate": 3.528926444865073e-05, + "loss": 0.1034, + "num_input_tokens_seen": 1569592, + "step": 8245 + }, + { + "epoch": 4.287941787941788, + "grad_norm": 0.013845724053680897, + "learning_rate": 3.5268591630518036e-05, + "loss": 0.0262, + "num_input_tokens_seen": 1570616, + "step": 8250 + }, + { + "epoch": 4.29054054054054, + "grad_norm": 0.00614283699542284, + "learning_rate": 3.524791036240979e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1571608, + "step": 8255 + }, + { + "epoch": 4.293139293139293, + "grad_norm": 0.007696662098169327, + "learning_rate": 3.52272206613445e-05, + "loss": 0.037, + "num_input_tokens_seen": 1572568, + "step": 8260 + }, + { + "epoch": 4.295738045738045, + "grad_norm": 0.006696358323097229, + "learning_rate": 3.520652254434762e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1573496, + "step": 8265 + }, + { + "epoch": 4.298336798336798, + "grad_norm": 0.011715010739862919, + "learning_rate": 3.518581602845154e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1574424, + "step": 8270 + }, + { + "epoch": 4.3009355509355505, + "grad_norm": 0.02417866885662079, + "learning_rate": 3.516510113069555e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1575320, + "step": 8275 + }, + { + "epoch": 4.303534303534303, + "grad_norm": 0.01113355252891779, + "learning_rate": 3.5144377868125855e-05, + "loss": 0.0053, + "num_input_tokens_seen": 1576248, + "step": 8280 + }, + { + "epoch": 4.306133056133056, + "grad_norm": 0.02295931614935398, + "learning_rate": 3.512364625779551e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1577208, + "step": 8285 + }, + { + "epoch": 4.3087318087318085, + "grad_norm": 0.026380084455013275, + "learning_rate": 3.510290631676447e-05, + "loss": 0.1091, + "num_input_tokens_seen": 1578104, + "step": 8290 + }, + { + "epoch": 4.3113305613305615, + "grad_norm": 0.005214437376707792, + "learning_rate": 3.5082158062099536e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1579064, + "step": 8295 + }, + { + "epoch": 4.313929313929314, + "grad_norm": 0.0046245260164141655, + "learning_rate": 3.506140151087434e-05, + "loss": 0.1202, + "num_input_tokens_seen": 1580024, + "step": 8300 + }, + { + "epoch": 4.3165280665280665, + "grad_norm": 0.09497901797294617, + "learning_rate": 3.504063668016936e-05, + "loss": 0.0077, + "num_input_tokens_seen": 1580920, + "step": 8305 + }, + { + "epoch": 4.3191268191268195, + "grad_norm": 0.0045767417177557945, + "learning_rate": 3.5019863587071867e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1581848, + "step": 8310 + }, + { + "epoch": 4.321725571725572, + "grad_norm": 0.005732773803174496, + "learning_rate": 3.499908224867594e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1582904, + "step": 8315 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.0120061244815588, + "learning_rate": 3.497829268208246e-05, + "loss": 0.1321, + "num_input_tokens_seen": 1583800, + "step": 8320 + }, + { + "epoch": 4.326923076923077, + "grad_norm": 0.02107643149793148, + "learning_rate": 3.495749490439905e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1584728, + "step": 8325 + }, + { + "epoch": 4.32952182952183, + "grad_norm": 0.003120183711871505, + "learning_rate": 3.493668893274011e-05, + "loss": 0.001, + "num_input_tokens_seen": 1585720, + "step": 8330 + }, + { + "epoch": 4.332120582120582, + "grad_norm": 0.003339325776323676, + "learning_rate": 3.491587478422677e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1586712, + "step": 8335 + }, + { + "epoch": 4.334719334719335, + "grad_norm": 0.00494698341935873, + "learning_rate": 3.48950524759869e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1587736, + "step": 8340 + }, + { + "epoch": 4.337318087318088, + "grad_norm": 0.008128204382956028, + "learning_rate": 3.487422202515508e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1588696, + "step": 8345 + }, + { + "epoch": 4.33991683991684, + "grad_norm": 0.016355670988559723, + "learning_rate": 3.485338344887258e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1589656, + "step": 8350 + }, + { + "epoch": 4.342515592515593, + "grad_norm": 0.0019056719029322267, + "learning_rate": 3.483253676428737e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1590616, + "step": 8355 + }, + { + "epoch": 4.345114345114345, + "grad_norm": 1.4171836376190186, + "learning_rate": 3.481168198855409e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1591608, + "step": 8360 + }, + { + "epoch": 4.347713097713098, + "grad_norm": 0.03649400547146797, + "learning_rate": 3.4790819138834044e-05, + "loss": 0.1765, + "num_input_tokens_seen": 1592536, + "step": 8365 + }, + { + "epoch": 4.350311850311851, + "grad_norm": 0.009784182533621788, + "learning_rate": 3.4769948232295166e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1593528, + "step": 8370 + }, + { + "epoch": 4.352910602910603, + "grad_norm": 0.0036831730976700783, + "learning_rate": 3.4749069286112027e-05, + "loss": 0.2538, + "num_input_tokens_seen": 1594552, + "step": 8375 + }, + { + "epoch": 4.355509355509356, + "grad_norm": 0.01389295794069767, + "learning_rate": 3.4728182317465795e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1595448, + "step": 8380 + }, + { + "epoch": 4.358108108108108, + "grad_norm": 0.02505119889974594, + "learning_rate": 3.470728734354429e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1596376, + "step": 8385 + }, + { + "epoch": 4.360706860706861, + "grad_norm": 0.019795959815382957, + "learning_rate": 3.468638438154186e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1597368, + "step": 8390 + }, + { + "epoch": 4.363305613305613, + "grad_norm": 0.005132460035383701, + "learning_rate": 3.466547344865948e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1598360, + "step": 8395 + }, + { + "epoch": 4.365904365904366, + "grad_norm": 137.7931365966797, + "learning_rate": 3.4644554562104634e-05, + "loss": 0.1259, + "num_input_tokens_seen": 1599288, + "step": 8400 + }, + { + "epoch": 4.368503118503119, + "grad_norm": 0.004936987534165382, + "learning_rate": 3.4623627739091384e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1600248, + "step": 8405 + }, + { + "epoch": 4.371101871101871, + "grad_norm": 0.010463619604706764, + "learning_rate": 3.4602692996840324e-05, + "loss": 0.001, + "num_input_tokens_seen": 1601176, + "step": 8410 + }, + { + "epoch": 4.373700623700624, + "grad_norm": 0.020648956298828125, + "learning_rate": 3.458175035257854e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1602136, + "step": 8415 + }, + { + "epoch": 4.376299376299376, + "grad_norm": 0.06349457055330276, + "learning_rate": 3.4560799823539635e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1603096, + "step": 8420 + }, + { + "epoch": 4.378898128898129, + "grad_norm": 0.016008954495191574, + "learning_rate": 3.453984142696372e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1604120, + "step": 8425 + }, + { + "epoch": 4.381496881496881, + "grad_norm": 0.007222018204629421, + "learning_rate": 3.4518875180097335e-05, + "loss": 0.0366, + "num_input_tokens_seen": 1605016, + "step": 8430 + }, + { + "epoch": 4.384095634095634, + "grad_norm": 0.004856929648667574, + "learning_rate": 3.449790110019351e-05, + "loss": 0.0268, + "num_input_tokens_seen": 1606008, + "step": 8435 + }, + { + "epoch": 4.386694386694387, + "grad_norm": 0.004717925097793341, + "learning_rate": 3.4476919204511735e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1606968, + "step": 8440 + }, + { + "epoch": 4.389293139293139, + "grad_norm": 0.00603255070745945, + "learning_rate": 3.44559295103179e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1607928, + "step": 8445 + }, + { + "epoch": 4.391891891891892, + "grad_norm": 0.015790719538927078, + "learning_rate": 3.443493203488431e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1608920, + "step": 8450 + }, + { + "epoch": 4.394490644490644, + "grad_norm": 0.004079906735569239, + "learning_rate": 3.441392679548973e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1609848, + "step": 8455 + }, + { + "epoch": 4.397089397089397, + "grad_norm": 0.03155229240655899, + "learning_rate": 3.439291380941923e-05, + "loss": 0.278, + "num_input_tokens_seen": 1610776, + "step": 8460 + }, + { + "epoch": 4.399688149688149, + "grad_norm": 0.005308633204549551, + "learning_rate": 3.437189309396432e-05, + "loss": 0.0232, + "num_input_tokens_seen": 1611768, + "step": 8465 + }, + { + "epoch": 4.402286902286902, + "grad_norm": 0.028623664751648903, + "learning_rate": 3.435086466642284e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1612632, + "step": 8470 + }, + { + "epoch": 4.404885654885655, + "grad_norm": 0.013124784454703331, + "learning_rate": 3.432982854409899e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1613560, + "step": 8475 + }, + { + "epoch": 4.407484407484407, + "grad_norm": 6.2996320724487305, + "learning_rate": 3.4308784744303276e-05, + "loss": 0.0766, + "num_input_tokens_seen": 1614488, + "step": 8480 + }, + { + "epoch": 4.41008316008316, + "grad_norm": 0.15556544065475464, + "learning_rate": 3.4287733284352556e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1615384, + "step": 8485 + }, + { + "epoch": 4.412681912681912, + "grad_norm": 2.012831449508667, + "learning_rate": 3.426667418156999e-05, + "loss": 0.2627, + "num_input_tokens_seen": 1616440, + "step": 8490 + }, + { + "epoch": 4.415280665280665, + "grad_norm": 0.0314534567296505, + "learning_rate": 3.4245607453285e-05, + "loss": 0.0042, + "num_input_tokens_seen": 1617496, + "step": 8495 + }, + { + "epoch": 4.417879417879418, + "grad_norm": 0.05015339329838753, + "learning_rate": 3.422453311683329e-05, + "loss": 0.0043, + "num_input_tokens_seen": 1618424, + "step": 8500 + }, + { + "epoch": 4.42047817047817, + "grad_norm": 0.012574128806591034, + "learning_rate": 3.4203451189556844e-05, + "loss": 0.0171, + "num_input_tokens_seen": 1619416, + "step": 8505 + }, + { + "epoch": 4.423076923076923, + "grad_norm": 0.011611697264015675, + "learning_rate": 3.4182361688803886e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1620408, + "step": 8510 + }, + { + "epoch": 4.425675675675675, + "grad_norm": 0.022636834532022476, + "learning_rate": 3.416126463192885e-05, + "loss": 0.0225, + "num_input_tokens_seen": 1621400, + "step": 8515 + }, + { + "epoch": 4.428274428274428, + "grad_norm": 0.013849306851625443, + "learning_rate": 3.4140160036292414e-05, + "loss": 0.1398, + "num_input_tokens_seen": 1622296, + "step": 8520 + }, + { + "epoch": 4.4308731808731805, + "grad_norm": 0.011534187011420727, + "learning_rate": 3.4119047919261444e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1623256, + "step": 8525 + }, + { + "epoch": 4.4334719334719335, + "grad_norm": 0.17717668414115906, + "learning_rate": 3.4097928298209e-05, + "loss": 0.0026, + "num_input_tokens_seen": 1624248, + "step": 8530 + }, + { + "epoch": 4.436070686070686, + "grad_norm": 0.011308480054140091, + "learning_rate": 3.4076801190514334e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1625112, + "step": 8535 + }, + { + "epoch": 4.4386694386694385, + "grad_norm": 17.81656265258789, + "learning_rate": 3.405566661356284e-05, + "loss": 0.0902, + "num_input_tokens_seen": 1626104, + "step": 8540 + }, + { + "epoch": 4.4412681912681915, + "grad_norm": 0.009696068242192268, + "learning_rate": 3.4034524584746044e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1627064, + "step": 8545 + }, + { + "epoch": 4.443866943866944, + "grad_norm": 0.004780042450875044, + "learning_rate": 3.4013375121461625e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1627960, + "step": 8550 + }, + { + "epoch": 4.446465696465697, + "grad_norm": 0.028302565217018127, + "learning_rate": 3.39922182411134e-05, + "loss": 0.0312, + "num_input_tokens_seen": 1628888, + "step": 8555 + }, + { + "epoch": 4.4490644490644495, + "grad_norm": 0.012896990403532982, + "learning_rate": 3.3971053961111245e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1629880, + "step": 8560 + }, + { + "epoch": 4.451663201663202, + "grad_norm": 0.010095639154314995, + "learning_rate": 3.394988229887114e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1630840, + "step": 8565 + }, + { + "epoch": 4.454261954261955, + "grad_norm": 0.02547982521355152, + "learning_rate": 3.392870327181516e-05, + "loss": 0.1455, + "num_input_tokens_seen": 1631800, + "step": 8570 + }, + { + "epoch": 4.456860706860707, + "grad_norm": 0.24350623786449432, + "learning_rate": 3.390751689737143e-05, + "loss": 0.1184, + "num_input_tokens_seen": 1632824, + "step": 8575 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 0.12797300517559052, + "learning_rate": 3.3886323192974106e-05, + "loss": 0.0083, + "num_input_tokens_seen": 1633720, + "step": 8580 + }, + { + "epoch": 4.462058212058212, + "grad_norm": 0.07311179488897324, + "learning_rate": 3.386512217606339e-05, + "loss": 0.0048, + "num_input_tokens_seen": 1634680, + "step": 8585 + }, + { + "epoch": 4.464656964656965, + "grad_norm": 0.020869245752692223, + "learning_rate": 3.384391386408551e-05, + "loss": 0.002, + "num_input_tokens_seen": 1635640, + "step": 8590 + }, + { + "epoch": 4.467255717255718, + "grad_norm": 0.006830984726548195, + "learning_rate": 3.382269827449267e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1636600, + "step": 8595 + }, + { + "epoch": 4.46985446985447, + "grad_norm": 0.008264413103461266, + "learning_rate": 3.3801475424743075e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1637592, + "step": 8600 + }, + { + "epoch": 4.472453222453223, + "grad_norm": 0.00861015822738409, + "learning_rate": 3.378024533230093e-05, + "loss": 0.1101, + "num_input_tokens_seen": 1638552, + "step": 8605 + }, + { + "epoch": 4.475051975051975, + "grad_norm": 0.018160097301006317, + "learning_rate": 3.3759008014636365e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1639512, + "step": 8610 + }, + { + "epoch": 4.477650727650728, + "grad_norm": 0.017593560740351677, + "learning_rate": 3.373776348922546e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1640408, + "step": 8615 + }, + { + "epoch": 4.48024948024948, + "grad_norm": 0.021576354280114174, + "learning_rate": 3.3716511773550256e-05, + "loss": 0.0373, + "num_input_tokens_seen": 1641368, + "step": 8620 + }, + { + "epoch": 4.482848232848233, + "grad_norm": 0.0263748187571764, + "learning_rate": 3.369525288509867e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1642424, + "step": 8625 + }, + { + "epoch": 4.485446985446986, + "grad_norm": 0.032226577401161194, + "learning_rate": 3.367398684136454e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1643384, + "step": 8630 + }, + { + "epoch": 4.488045738045738, + "grad_norm": 0.02299836091697216, + "learning_rate": 3.365271365984761e-05, + "loss": 0.1185, + "num_input_tokens_seen": 1644376, + "step": 8635 + }, + { + "epoch": 4.490644490644491, + "grad_norm": 1.8379615545272827, + "learning_rate": 3.363143335805347e-05, + "loss": 0.1392, + "num_input_tokens_seen": 1645272, + "step": 8640 + }, + { + "epoch": 4.493243243243243, + "grad_norm": 0.016026703640818596, + "learning_rate": 3.361014595349358e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1646136, + "step": 8645 + }, + { + "epoch": 4.495841995841996, + "grad_norm": 0.029855996370315552, + "learning_rate": 3.358885146368524e-05, + "loss": 0.1088, + "num_input_tokens_seen": 1647096, + "step": 8650 + }, + { + "epoch": 4.498440748440748, + "grad_norm": 0.031523484736680984, + "learning_rate": 3.35675499061516e-05, + "loss": 0.0025, + "num_input_tokens_seen": 1648056, + "step": 8655 + }, + { + "epoch": 4.5, + "eval_loss": 0.19190248847007751, + "eval_runtime": 9.2674, + "eval_samples_per_second": 92.367, + "eval_steps_per_second": 23.092, + "num_input_tokens_seen": 1648696, + "step": 8658 + }, + { + "epoch": 4.501039501039501, + "grad_norm": 0.04584374651312828, + "learning_rate": 3.35462412984216e-05, + "loss": 0.0032, + "num_input_tokens_seen": 1649048, + "step": 8660 + }, + { + "epoch": 4.503638253638254, + "grad_norm": 0.0350438617169857, + "learning_rate": 3.352492565802999e-05, + "loss": 0.0875, + "num_input_tokens_seen": 1650008, + "step": 8665 + }, + { + "epoch": 4.506237006237006, + "grad_norm": 0.10573546588420868, + "learning_rate": 3.350360300251732e-05, + "loss": 0.0064, + "num_input_tokens_seen": 1650968, + "step": 8670 + }, + { + "epoch": 4.508835758835759, + "grad_norm": 0.022648856043815613, + "learning_rate": 3.348227334942989e-05, + "loss": 0.0066, + "num_input_tokens_seen": 1651960, + "step": 8675 + }, + { + "epoch": 4.511434511434511, + "grad_norm": 0.036423131823539734, + "learning_rate": 3.346093671631979e-05, + "loss": 0.003, + "num_input_tokens_seen": 1652824, + "step": 8680 + }, + { + "epoch": 4.514033264033264, + "grad_norm": 0.013109245337545872, + "learning_rate": 3.3439593120744816e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1653720, + "step": 8685 + }, + { + "epoch": 4.516632016632016, + "grad_norm": 0.02927917055785656, + "learning_rate": 3.341824258026851e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1654648, + "step": 8690 + }, + { + "epoch": 4.519230769230769, + "grad_norm": 0.017025254666805267, + "learning_rate": 3.339688511246014e-05, + "loss": 0.001, + "num_input_tokens_seen": 1655640, + "step": 8695 + }, + { + "epoch": 4.521829521829522, + "grad_norm": 0.017639728263020515, + "learning_rate": 3.337552073489467e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1656600, + "step": 8700 + }, + { + "epoch": 4.524428274428274, + "grad_norm": 0.138295978307724, + "learning_rate": 3.335414946515275e-05, + "loss": 0.0722, + "num_input_tokens_seen": 1657560, + "step": 8705 + }, + { + "epoch": 4.527027027027027, + "grad_norm": 0.3688705265522003, + "learning_rate": 3.3332771320820676e-05, + "loss": 0.1173, + "num_input_tokens_seen": 1658552, + "step": 8710 + }, + { + "epoch": 4.529625779625779, + "grad_norm": 0.08625061064958572, + "learning_rate": 3.3311386319490436e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1659512, + "step": 8715 + }, + { + "epoch": 4.532224532224532, + "grad_norm": 1.3117239475250244, + "learning_rate": 3.328999447875965e-05, + "loss": 0.0523, + "num_input_tokens_seen": 1660472, + "step": 8720 + }, + { + "epoch": 4.534823284823284, + "grad_norm": 0.022019652649760246, + "learning_rate": 3.326859581623155e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1661432, + "step": 8725 + }, + { + "epoch": 4.537422037422037, + "grad_norm": 0.008861594833433628, + "learning_rate": 3.3247190349515e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1662392, + "step": 8730 + }, + { + "epoch": 4.54002079002079, + "grad_norm": 2.9159581661224365, + "learning_rate": 3.322577809622446e-05, + "loss": 0.0991, + "num_input_tokens_seen": 1663320, + "step": 8735 + }, + { + "epoch": 4.542619542619542, + "grad_norm": 0.01799764111638069, + "learning_rate": 3.3204359073979964e-05, + "loss": 0.1147, + "num_input_tokens_seen": 1664248, + "step": 8740 + }, + { + "epoch": 4.545218295218295, + "grad_norm": 0.05466839298605919, + "learning_rate": 3.318293330040714e-05, + "loss": 0.0033, + "num_input_tokens_seen": 1665176, + "step": 8745 + }, + { + "epoch": 4.547817047817047, + "grad_norm": 0.007083173841238022, + "learning_rate": 3.316150079313713e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1666040, + "step": 8750 + }, + { + "epoch": 4.5504158004158, + "grad_norm": 0.03065682388842106, + "learning_rate": 3.3140061569806685e-05, + "loss": 0.002, + "num_input_tokens_seen": 1666936, + "step": 8755 + }, + { + "epoch": 4.553014553014553, + "grad_norm": 0.031097112223505974, + "learning_rate": 3.3118615648058e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1667928, + "step": 8760 + }, + { + "epoch": 4.5556133056133055, + "grad_norm": 0.025010498240590096, + "learning_rate": 3.309716304553884e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1668952, + "step": 8765 + }, + { + "epoch": 4.558212058212058, + "grad_norm": 0.058226704597473145, + "learning_rate": 3.307570377990245e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1669912, + "step": 8770 + }, + { + "epoch": 4.5608108108108105, + "grad_norm": 0.048593390733003616, + "learning_rate": 3.3054237868807556e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1670808, + "step": 8775 + }, + { + "epoch": 4.5634095634095635, + "grad_norm": 0.007506629917770624, + "learning_rate": 3.303276532991835e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1671736, + "step": 8780 + }, + { + "epoch": 4.5660083160083165, + "grad_norm": 0.006302996072918177, + "learning_rate": 3.3011286180904494e-05, + "loss": 0.1694, + "num_input_tokens_seen": 1672696, + "step": 8785 + }, + { + "epoch": 4.5686070686070686, + "grad_norm": 0.01782156340777874, + "learning_rate": 3.298980043944107e-05, + "loss": 0.0812, + "num_input_tokens_seen": 1673688, + "step": 8790 + }, + { + "epoch": 4.5712058212058215, + "grad_norm": 0.009467339143157005, + "learning_rate": 3.2968308123208595e-05, + "loss": 0.001, + "num_input_tokens_seen": 1674616, + "step": 8795 + }, + { + "epoch": 4.573804573804574, + "grad_norm": 0.018043169751763344, + "learning_rate": 3.2946809249893e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1675480, + "step": 8800 + }, + { + "epoch": 4.576403326403327, + "grad_norm": 0.011544843204319477, + "learning_rate": 3.29253038371856e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1676472, + "step": 8805 + }, + { + "epoch": 4.579002079002079, + "grad_norm": 0.007500397972762585, + "learning_rate": 3.29037919027831e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1677336, + "step": 8810 + }, + { + "epoch": 4.581600831600832, + "grad_norm": 0.029700959101319313, + "learning_rate": 3.288227346438756e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1678264, + "step": 8815 + }, + { + "epoch": 4.584199584199585, + "grad_norm": 0.005971736740320921, + "learning_rate": 3.286074853970642e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1679256, + "step": 8820 + }, + { + "epoch": 4.586798336798337, + "grad_norm": 0.004166759550571442, + "learning_rate": 3.2839217146452426e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1680184, + "step": 8825 + }, + { + "epoch": 4.58939708939709, + "grad_norm": 0.013309861533343792, + "learning_rate": 3.281767930234366e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1681144, + "step": 8830 + }, + { + "epoch": 4.591995841995842, + "grad_norm": 0.012859582901000977, + "learning_rate": 3.279613502510352e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1682136, + "step": 8835 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.0033025280572474003, + "learning_rate": 3.277458433246068e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1683096, + "step": 8840 + }, + { + "epoch": 4.597193347193347, + "grad_norm": 0.010472239926457405, + "learning_rate": 3.2753027242149105e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1684088, + "step": 8845 + }, + { + "epoch": 4.5997920997921, + "grad_norm": 0.07887256145477295, + "learning_rate": 3.273146377190803e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1685048, + "step": 8850 + }, + { + "epoch": 4.602390852390853, + "grad_norm": 0.002531029749661684, + "learning_rate": 3.270989393948193e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1685944, + "step": 8855 + }, + { + "epoch": 4.604989604989605, + "grad_norm": 0.007167569827288389, + "learning_rate": 3.2688317762620513e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1686904, + "step": 8860 + }, + { + "epoch": 4.607588357588358, + "grad_norm": 0.031475577503442764, + "learning_rate": 3.266673525907872e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1687800, + "step": 8865 + }, + { + "epoch": 4.61018711018711, + "grad_norm": 0.0016422549961134791, + "learning_rate": 3.2645146446616684e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1688824, + "step": 8870 + }, + { + "epoch": 4.612785862785863, + "grad_norm": 0.005095670465379953, + "learning_rate": 3.2623551342999734e-05, + "loss": 0.1178, + "num_input_tokens_seen": 1689784, + "step": 8875 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.008467121981084347, + "learning_rate": 3.2601949965998404e-05, + "loss": 0.0041, + "num_input_tokens_seen": 1690712, + "step": 8880 + }, + { + "epoch": 4.617983367983368, + "grad_norm": 0.007164251059293747, + "learning_rate": 3.258034233338834e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1691640, + "step": 8885 + }, + { + "epoch": 4.620582120582121, + "grad_norm": 0.006643849890679121, + "learning_rate": 3.2558728462950364e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1692600, + "step": 8890 + }, + { + "epoch": 4.623180873180873, + "grad_norm": 0.00514846621081233, + "learning_rate": 3.2537108372470425e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1693592, + "step": 8895 + }, + { + "epoch": 4.625779625779626, + "grad_norm": 0.015430060215294361, + "learning_rate": 3.2515482079739615e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1694488, + "step": 8900 + }, + { + "epoch": 4.628378378378378, + "grad_norm": 0.007750445045530796, + "learning_rate": 3.2493849602554076e-05, + "loss": 0.1619, + "num_input_tokens_seen": 1695448, + "step": 8905 + }, + { + "epoch": 4.630977130977131, + "grad_norm": 0.01906711980700493, + "learning_rate": 3.24722109587151e-05, + "loss": 0.1404, + "num_input_tokens_seen": 1696472, + "step": 8910 + }, + { + "epoch": 4.633575883575883, + "grad_norm": 0.043692346662282944, + "learning_rate": 3.245056616602901e-05, + "loss": 0.0028, + "num_input_tokens_seen": 1697464, + "step": 8915 + }, + { + "epoch": 4.636174636174636, + "grad_norm": 0.005373490508645773, + "learning_rate": 3.242891524230721e-05, + "loss": 0.0409, + "num_input_tokens_seen": 1698392, + "step": 8920 + }, + { + "epoch": 4.638773388773389, + "grad_norm": 0.012616473250091076, + "learning_rate": 3.2407258205366136e-05, + "loss": 0.001, + "num_input_tokens_seen": 1699320, + "step": 8925 + }, + { + "epoch": 4.641372141372141, + "grad_norm": 0.07653912901878357, + "learning_rate": 3.238559507302726e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1700344, + "step": 8930 + }, + { + "epoch": 4.643970893970894, + "grad_norm": 0.03418548405170441, + "learning_rate": 3.236392586311709e-05, + "loss": 0.001, + "num_input_tokens_seen": 1701336, + "step": 8935 + }, + { + "epoch": 4.646569646569646, + "grad_norm": 0.08091478794813156, + "learning_rate": 3.23422505934671e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1702296, + "step": 8940 + }, + { + "epoch": 4.649168399168399, + "grad_norm": 0.07253038883209229, + "learning_rate": 3.232056928191376e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1703256, + "step": 8945 + }, + { + "epoch": 4.651767151767151, + "grad_norm": 0.05130884051322937, + "learning_rate": 3.229888194629854e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1704184, + "step": 8950 + }, + { + "epoch": 4.654365904365904, + "grad_norm": 0.008201653137803078, + "learning_rate": 3.227718860446782e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1705144, + "step": 8955 + }, + { + "epoch": 4.656964656964657, + "grad_norm": 0.00782904401421547, + "learning_rate": 3.2255489274272975e-05, + "loss": 0.253, + "num_input_tokens_seen": 1706104, + "step": 8960 + }, + { + "epoch": 4.659563409563409, + "grad_norm": 0.0161115862429142, + "learning_rate": 3.2233783973570274e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1707064, + "step": 8965 + }, + { + "epoch": 4.662162162162162, + "grad_norm": 0.006283559370785952, + "learning_rate": 3.22120727202209e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1708024, + "step": 8970 + }, + { + "epoch": 4.664760914760915, + "grad_norm": 0.018335917964577675, + "learning_rate": 3.219035553209093e-05, + "loss": 0.0805, + "num_input_tokens_seen": 1709016, + "step": 8975 + }, + { + "epoch": 4.667359667359667, + "grad_norm": 23.791175842285156, + "learning_rate": 3.216863242705136e-05, + "loss": 0.0086, + "num_input_tokens_seen": 1710008, + "step": 8980 + }, + { + "epoch": 4.66995841995842, + "grad_norm": 0.2107325941324234, + "learning_rate": 3.214690342297802e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1710936, + "step": 8985 + }, + { + "epoch": 4.672557172557172, + "grad_norm": 0.00821636151522398, + "learning_rate": 3.212516853775161e-05, + "loss": 0.1271, + "num_input_tokens_seen": 1711832, + "step": 8990 + }, + { + "epoch": 4.675155925155925, + "grad_norm": 0.014678903855383396, + "learning_rate": 3.210342778925763e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1712728, + "step": 8995 + }, + { + "epoch": 4.6777546777546775, + "grad_norm": 0.0053763906471431255, + "learning_rate": 3.2081681195386496e-05, + "loss": 0.1448, + "num_input_tokens_seen": 1713688, + "step": 9000 + }, + { + "epoch": 4.68035343035343, + "grad_norm": 0.010509279556572437, + "learning_rate": 3.205992877403334e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1714616, + "step": 9005 + }, + { + "epoch": 4.682952182952183, + "grad_norm": 0.014829753898084164, + "learning_rate": 3.203817054309813e-05, + "loss": 0.1455, + "num_input_tokens_seen": 1715512, + "step": 9010 + }, + { + "epoch": 4.6855509355509355, + "grad_norm": 0.15008644759655, + "learning_rate": 3.2016406520485636e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1716536, + "step": 9015 + }, + { + "epoch": 4.6881496881496885, + "grad_norm": 0.043134067207574844, + "learning_rate": 3.199463672410534e-05, + "loss": 0.0241, + "num_input_tokens_seen": 1717464, + "step": 9020 + }, + { + "epoch": 4.6907484407484406, + "grad_norm": 0.04136810079216957, + "learning_rate": 3.197286117187151e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1718456, + "step": 9025 + }, + { + "epoch": 4.6933471933471935, + "grad_norm": 0.20544810593128204, + "learning_rate": 3.195107988170315e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1719384, + "step": 9030 + }, + { + "epoch": 4.695945945945946, + "grad_norm": 0.015312128700315952, + "learning_rate": 3.1929292871523994e-05, + "loss": 0.001, + "num_input_tokens_seen": 1720408, + "step": 9035 + }, + { + "epoch": 4.698544698544699, + "grad_norm": 0.011995804496109486, + "learning_rate": 3.190750015926244e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1721368, + "step": 9040 + }, + { + "epoch": 4.701143451143452, + "grad_norm": 0.00883855577558279, + "learning_rate": 3.188570176285164e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1722328, + "step": 9045 + }, + { + "epoch": 4.703742203742204, + "grad_norm": 31.341503143310547, + "learning_rate": 3.1863897700229375e-05, + "loss": 0.1394, + "num_input_tokens_seen": 1723256, + "step": 9050 + }, + { + "epoch": 4.706340956340957, + "grad_norm": 0.014478705823421478, + "learning_rate": 3.18420879893381e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1724280, + "step": 9055 + }, + { + "epoch": 4.708939708939709, + "grad_norm": 22.649660110473633, + "learning_rate": 3.182027264812494e-05, + "loss": 0.0112, + "num_input_tokens_seen": 1725240, + "step": 9060 + }, + { + "epoch": 4.711538461538462, + "grad_norm": 0.007754080928862095, + "learning_rate": 3.179845169454162e-05, + "loss": 0.123, + "num_input_tokens_seen": 1726200, + "step": 9065 + }, + { + "epoch": 4.714137214137214, + "grad_norm": 0.02238883078098297, + "learning_rate": 3.1776625146544504e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1727160, + "step": 9070 + }, + { + "epoch": 4.716735966735967, + "grad_norm": 0.016385992988944054, + "learning_rate": 3.175479302209455e-05, + "loss": 0.001, + "num_input_tokens_seen": 1728024, + "step": 9075 + }, + { + "epoch": 4.71933471933472, + "grad_norm": 0.01133829914033413, + "learning_rate": 3.173295533915733e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1728952, + "step": 9080 + }, + { + "epoch": 4.721933471933472, + "grad_norm": 0.011880737729370594, + "learning_rate": 3.1711112115702954e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1729912, + "step": 9085 + }, + { + "epoch": 4.724532224532225, + "grad_norm": 0.0091466149315238, + "learning_rate": 3.1689263369706104e-05, + "loss": 0.1786, + "num_input_tokens_seen": 1730808, + "step": 9090 + }, + { + "epoch": 4.727130977130977, + "grad_norm": 0.006776084192097187, + "learning_rate": 3.166740911914603e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1731768, + "step": 9095 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.014731966890394688, + "learning_rate": 3.164554938200647e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1732632, + "step": 9100 + }, + { + "epoch": 4.732328482328482, + "grad_norm": 0.06826353073120117, + "learning_rate": 3.162368417627571e-05, + "loss": 0.1064, + "num_input_tokens_seen": 1733624, + "step": 9105 + }, + { + "epoch": 4.734927234927235, + "grad_norm": 0.025537975132465363, + "learning_rate": 3.1601813519946514e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1734616, + "step": 9110 + }, + { + "epoch": 4.737525987525988, + "grad_norm": 0.014847101643681526, + "learning_rate": 3.157993743101616e-05, + "loss": 0.002, + "num_input_tokens_seen": 1735544, + "step": 9115 + }, + { + "epoch": 4.74012474012474, + "grad_norm": 0.014607532881200314, + "learning_rate": 3.1558055927486355e-05, + "loss": 0.1153, + "num_input_tokens_seen": 1736504, + "step": 9120 + }, + { + "epoch": 4.742723492723493, + "grad_norm": 0.0196085162460804, + "learning_rate": 3.1536169027363304e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1737400, + "step": 9125 + }, + { + "epoch": 4.745322245322245, + "grad_norm": 12.09311294555664, + "learning_rate": 3.151427674865763e-05, + "loss": 0.0104, + "num_input_tokens_seen": 1738360, + "step": 9130 + }, + { + "epoch": 4.747920997920998, + "grad_norm": 0.04134281352162361, + "learning_rate": 3.149237910938438e-05, + "loss": 0.1001, + "num_input_tokens_seen": 1739256, + "step": 9135 + }, + { + "epoch": 4.75051975051975, + "grad_norm": 0.036900293081998825, + "learning_rate": 3.147047612756302e-05, + "loss": 0.0031, + "num_input_tokens_seen": 1740184, + "step": 9140 + }, + { + "epoch": 4.753118503118503, + "grad_norm": 0.030477765947580338, + "learning_rate": 3.1448567821217415e-05, + "loss": 0.0022, + "num_input_tokens_seen": 1741240, + "step": 9145 + }, + { + "epoch": 4.755717255717256, + "grad_norm": 0.021724367514252663, + "learning_rate": 3.14266542083758e-05, + "loss": 0.0024, + "num_input_tokens_seen": 1742264, + "step": 9150 + }, + { + "epoch": 4.758316008316008, + "grad_norm": 0.011189729906618595, + "learning_rate": 3.1404735307070785e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1743224, + "step": 9155 + }, + { + "epoch": 4.760914760914761, + "grad_norm": 0.0070863403379917145, + "learning_rate": 3.138281113533933e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1744152, + "step": 9160 + }, + { + "epoch": 4.763513513513513, + "grad_norm": 0.005780091974884272, + "learning_rate": 3.136088171122274e-05, + "loss": 0.001, + "num_input_tokens_seen": 1745048, + "step": 9165 + }, + { + "epoch": 4.766112266112266, + "grad_norm": 3.709433078765869, + "learning_rate": 3.133894705276662e-05, + "loss": 0.194, + "num_input_tokens_seen": 1746040, + "step": 9170 + }, + { + "epoch": 4.768711018711018, + "grad_norm": 0.022140081971883774, + "learning_rate": 3.131700717802091e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1746968, + "step": 9175 + }, + { + "epoch": 4.771309771309771, + "grad_norm": 4.547706127166748, + "learning_rate": 3.129506210503983e-05, + "loss": 0.0065, + "num_input_tokens_seen": 1748024, + "step": 9180 + }, + { + "epoch": 4.773908523908524, + "grad_norm": 3.4875504970550537, + "learning_rate": 3.127311185188187e-05, + "loss": 0.006, + "num_input_tokens_seen": 1748984, + "step": 9185 + }, + { + "epoch": 4.776507276507276, + "grad_norm": 0.026736833155155182, + "learning_rate": 3.125115643660978e-05, + "loss": 0.0773, + "num_input_tokens_seen": 1749944, + "step": 9190 + }, + { + "epoch": 4.779106029106029, + "grad_norm": 0.0767909437417984, + "learning_rate": 3.12291958772906e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1750904, + "step": 9195 + }, + { + "epoch": 4.781704781704782, + "grad_norm": 9.718994140625, + "learning_rate": 3.120723019199554e-05, + "loss": 0.0727, + "num_input_tokens_seen": 1751896, + "step": 9200 + }, + { + "epoch": 4.784303534303534, + "grad_norm": 0.01013969350606203, + "learning_rate": 3.118525939880007e-05, + "loss": 0.0763, + "num_input_tokens_seen": 1752856, + "step": 9205 + }, + { + "epoch": 4.786902286902287, + "grad_norm": 0.26056838035583496, + "learning_rate": 3.116328351578384e-05, + "loss": 0.0024, + "num_input_tokens_seen": 1753848, + "step": 9210 + }, + { + "epoch": 4.789501039501039, + "grad_norm": 0.012393277138471603, + "learning_rate": 3.114130256103072e-05, + "loss": 0.0042, + "num_input_tokens_seen": 1754776, + "step": 9215 + }, + { + "epoch": 4.792099792099792, + "grad_norm": 0.011613582260906696, + "learning_rate": 3.111931655262872e-05, + "loss": 0.0689, + "num_input_tokens_seen": 1755768, + "step": 9220 + }, + { + "epoch": 4.794698544698544, + "grad_norm": 0.003934043925255537, + "learning_rate": 3.109732550867003e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1756696, + "step": 9225 + }, + { + "epoch": 4.797297297297297, + "grad_norm": 0.013581991195678711, + "learning_rate": 3.107532944725097e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1757592, + "step": 9230 + }, + { + "epoch": 4.79989604989605, + "grad_norm": 3.8106157779693604, + "learning_rate": 3.1053328386472e-05, + "loss": 0.126, + "num_input_tokens_seen": 1758584, + "step": 9235 + }, + { + "epoch": 4.802494802494802, + "grad_norm": 0.023651141673326492, + "learning_rate": 3.103132234443768e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1759576, + "step": 9240 + }, + { + "epoch": 4.805093555093555, + "grad_norm": 0.02841584011912346, + "learning_rate": 3.10093113392567e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1760536, + "step": 9245 + }, + { + "epoch": 4.8076923076923075, + "grad_norm": 0.4716147184371948, + "learning_rate": 3.0987295389041786e-05, + "loss": 0.0083, + "num_input_tokens_seen": 1761560, + "step": 9250 + }, + { + "epoch": 4.8102910602910605, + "grad_norm": 0.01068122312426567, + "learning_rate": 3.096527451190978e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1762520, + "step": 9255 + }, + { + "epoch": 4.8128898128898125, + "grad_norm": 0.022581253200769424, + "learning_rate": 3.094324872598154e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1763416, + "step": 9260 + }, + { + "epoch": 4.8154885654885655, + "grad_norm": 0.06228764355182648, + "learning_rate": 3.0921218049382e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1764280, + "step": 9265 + }, + { + "epoch": 4.8180873180873185, + "grad_norm": 19.280515670776367, + "learning_rate": 3.089918250024008e-05, + "loss": 0.1936, + "num_input_tokens_seen": 1765272, + "step": 9270 + }, + { + "epoch": 4.820686070686071, + "grad_norm": 0.014133023098111153, + "learning_rate": 3.087714209668875e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1766232, + "step": 9275 + }, + { + "epoch": 4.8232848232848236, + "grad_norm": 0.9543604850769043, + "learning_rate": 3.085509685686494e-05, + "loss": 0.0044, + "num_input_tokens_seen": 1767288, + "step": 9280 + }, + { + "epoch": 4.825883575883576, + "grad_norm": 0.15724366903305054, + "learning_rate": 3.0833046798909563e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1768152, + "step": 9285 + }, + { + "epoch": 4.828482328482329, + "grad_norm": 0.020191770046949387, + "learning_rate": 3.0810991940967535e-05, + "loss": 0.1071, + "num_input_tokens_seen": 1769112, + "step": 9290 + }, + { + "epoch": 4.831081081081081, + "grad_norm": 0.010415970347821712, + "learning_rate": 3.078893230118767e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1770072, + "step": 9295 + }, + { + "epoch": 4.833679833679834, + "grad_norm": 0.0056745680049061775, + "learning_rate": 3.076686789772276e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1771064, + "step": 9300 + }, + { + "epoch": 4.836278586278587, + "grad_norm": 0.03152604028582573, + "learning_rate": 3.0744798748729494e-05, + "loss": 0.1304, + "num_input_tokens_seen": 1771960, + "step": 9305 + }, + { + "epoch": 4.838877338877339, + "grad_norm": 0.01846563071012497, + "learning_rate": 3.072272487236847e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1772888, + "step": 9310 + }, + { + "epoch": 4.841476091476092, + "grad_norm": 0.00529064703732729, + "learning_rate": 3.0700646286804165e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1773816, + "step": 9315 + }, + { + "epoch": 4.844074844074844, + "grad_norm": 0.009354786016047001, + "learning_rate": 3.067856301020495e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1774776, + "step": 9320 + }, + { + "epoch": 4.846673596673597, + "grad_norm": 53.95225524902344, + "learning_rate": 3.065647506074306e-05, + "loss": 0.1243, + "num_input_tokens_seen": 1775704, + "step": 9325 + }, + { + "epoch": 4.849272349272349, + "grad_norm": 0.0052909841760993, + "learning_rate": 3.0634382456594543e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1776696, + "step": 9330 + }, + { + "epoch": 4.851871101871102, + "grad_norm": 0.02156156301498413, + "learning_rate": 3.061228521593931e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1777592, + "step": 9335 + }, + { + "epoch": 4.854469854469855, + "grad_norm": 0.005876675248146057, + "learning_rate": 3.059018335696109e-05, + "loss": 0.0951, + "num_input_tokens_seen": 1778456, + "step": 9340 + }, + { + "epoch": 4.857068607068607, + "grad_norm": 0.00511527992784977, + "learning_rate": 3.056807689784738e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1779352, + "step": 9345 + }, + { + "epoch": 4.85966735966736, + "grad_norm": 0.011632218956947327, + "learning_rate": 3.0545965856789486e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1780184, + "step": 9350 + }, + { + "epoch": 4.862266112266112, + "grad_norm": 0.6797464489936829, + "learning_rate": 3.0523850251982474e-05, + "loss": 0.0903, + "num_input_tokens_seen": 1781080, + "step": 9355 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.0077201612293720245, + "learning_rate": 3.0501730101625182e-05, + "loss": 0.1049, + "num_input_tokens_seen": 1781976, + "step": 9360 + }, + { + "epoch": 4.867463617463617, + "grad_norm": 0.26470237970352173, + "learning_rate": 3.0479605423920165e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1782936, + "step": 9365 + }, + { + "epoch": 4.87006237006237, + "grad_norm": 0.3052136301994324, + "learning_rate": 3.0457476237073723e-05, + "loss": 0.0052, + "num_input_tokens_seen": 1783928, + "step": 9370 + }, + { + "epoch": 4.872661122661123, + "grad_norm": 0.008350609801709652, + "learning_rate": 3.043534255929586e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1784984, + "step": 9375 + }, + { + "epoch": 4.875259875259875, + "grad_norm": 0.01409556157886982, + "learning_rate": 3.0413204408800265e-05, + "loss": 0.1007, + "num_input_tokens_seen": 1785912, + "step": 9380 + }, + { + "epoch": 4.877858627858628, + "grad_norm": 1.618609070777893, + "learning_rate": 3.0391061803804334e-05, + "loss": 0.1336, + "num_input_tokens_seen": 1786840, + "step": 9385 + }, + { + "epoch": 4.88045738045738, + "grad_norm": 0.11126026511192322, + "learning_rate": 3.036891476252911e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1787832, + "step": 9390 + }, + { + "epoch": 4.883056133056133, + "grad_norm": 0.005153709556907415, + "learning_rate": 3.0346763303199273e-05, + "loss": 0.0782, + "num_input_tokens_seen": 1788824, + "step": 9395 + }, + { + "epoch": 4.885654885654886, + "grad_norm": 0.019039709120988846, + "learning_rate": 3.0324607444043162e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1789784, + "step": 9400 + }, + { + "epoch": 4.888253638253638, + "grad_norm": 0.02018953487277031, + "learning_rate": 3.0302447203292737e-05, + "loss": 0.0519, + "num_input_tokens_seen": 1790712, + "step": 9405 + }, + { + "epoch": 4.890852390852391, + "grad_norm": 0.02157357521355152, + "learning_rate": 3.0280282599183547e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1791672, + "step": 9410 + }, + { + "epoch": 4.893451143451143, + "grad_norm": 0.02840130589902401, + "learning_rate": 3.025811364995474e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1792696, + "step": 9415 + }, + { + "epoch": 4.896049896049896, + "grad_norm": 6.29915714263916, + "learning_rate": 3.0235940373849042e-05, + "loss": 0.2041, + "num_input_tokens_seen": 1793656, + "step": 9420 + }, + { + "epoch": 4.898648648648649, + "grad_norm": 0.004957227502018213, + "learning_rate": 3.0213762789112737e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1794552, + "step": 9425 + }, + { + "epoch": 4.901247401247401, + "grad_norm": 0.004120007622987032, + "learning_rate": 3.0191580913995655e-05, + "loss": 0.0229, + "num_input_tokens_seen": 1795480, + "step": 9430 + }, + { + "epoch": 4.903846153846154, + "grad_norm": 0.004427856300026178, + "learning_rate": 3.016939476675115e-05, + "loss": 0.1562, + "num_input_tokens_seen": 1796440, + "step": 9435 + }, + { + "epoch": 4.906444906444906, + "grad_norm": 0.012033361941576004, + "learning_rate": 3.0147204365636116e-05, + "loss": 0.0032, + "num_input_tokens_seen": 1797368, + "step": 9440 + }, + { + "epoch": 4.909043659043659, + "grad_norm": 0.09074781090021133, + "learning_rate": 3.0125009728910908e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1798360, + "step": 9445 + }, + { + "epoch": 4.911642411642411, + "grad_norm": 0.010677755810320377, + "learning_rate": 3.01028108748394e-05, + "loss": 0.1103, + "num_input_tokens_seen": 1799320, + "step": 9450 + }, + { + "epoch": 4.914241164241164, + "grad_norm": 0.004420019220560789, + "learning_rate": 3.0080607821688922e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1800280, + "step": 9455 + }, + { + "epoch": 4.916839916839917, + "grad_norm": 0.09518930315971375, + "learning_rate": 3.005840058773025e-05, + "loss": 0.0964, + "num_input_tokens_seen": 1801240, + "step": 9460 + }, + { + "epoch": 4.919438669438669, + "grad_norm": 0.02448096126317978, + "learning_rate": 3.003618919123763e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1802168, + "step": 9465 + }, + { + "epoch": 4.922037422037422, + "grad_norm": 0.02119196206331253, + "learning_rate": 3.0013973650488713e-05, + "loss": 0.0014, + "num_input_tokens_seen": 1803192, + "step": 9470 + }, + { + "epoch": 4.924636174636174, + "grad_norm": 0.012030094861984253, + "learning_rate": 2.9991753983764547e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1804088, + "step": 9475 + }, + { + "epoch": 4.927234927234927, + "grad_norm": 0.12219502031803131, + "learning_rate": 2.9969530209349604e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1805144, + "step": 9480 + }, + { + "epoch": 4.9298336798336795, + "grad_norm": 4.281802177429199, + "learning_rate": 2.9947302345531725e-05, + "loss": 0.087, + "num_input_tokens_seen": 1806168, + "step": 9485 + }, + { + "epoch": 4.9324324324324325, + "grad_norm": 0.22573429346084595, + "learning_rate": 2.9925070410602112e-05, + "loss": 0.003, + "num_input_tokens_seen": 1807128, + "step": 9490 + }, + { + "epoch": 4.935031185031185, + "grad_norm": 0.0370144248008728, + "learning_rate": 2.9902834422855308e-05, + "loss": 0.096, + "num_input_tokens_seen": 1808024, + "step": 9495 + }, + { + "epoch": 4.9376299376299375, + "grad_norm": 6.674158096313477, + "learning_rate": 2.9880594400589213e-05, + "loss": 0.1012, + "num_input_tokens_seen": 1808952, + "step": 9500 + }, + { + "epoch": 4.9402286902286905, + "grad_norm": 0.03363824635744095, + "learning_rate": 2.9858350362105035e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1809848, + "step": 9505 + }, + { + "epoch": 4.942827442827443, + "grad_norm": 0.014512662775814533, + "learning_rate": 2.983610232570728e-05, + "loss": 0.1166, + "num_input_tokens_seen": 1810744, + "step": 9510 + }, + { + "epoch": 4.9454261954261955, + "grad_norm": 0.008714977651834488, + "learning_rate": 2.9813850309703773e-05, + "loss": 0.0523, + "num_input_tokens_seen": 1811672, + "step": 9515 + }, + { + "epoch": 4.948024948024948, + "grad_norm": 0.06463262438774109, + "learning_rate": 2.9791594332405576e-05, + "loss": 0.034, + "num_input_tokens_seen": 1812568, + "step": 9520 + }, + { + "epoch": 4.950623700623701, + "grad_norm": 0.8256622552871704, + "learning_rate": 2.9769334412127024e-05, + "loss": 0.0734, + "num_input_tokens_seen": 1813560, + "step": 9525 + }, + { + "epoch": 4.953222453222454, + "grad_norm": 0.029886795207858086, + "learning_rate": 2.974707056718571e-05, + "loss": 0.0022, + "num_input_tokens_seen": 1814488, + "step": 9530 + }, + { + "epoch": 4.955821205821206, + "grad_norm": 0.01469662506133318, + "learning_rate": 2.9724802815902443e-05, + "loss": 0.097, + "num_input_tokens_seen": 1815544, + "step": 9535 + }, + { + "epoch": 4.958419958419959, + "grad_norm": 0.05792401358485222, + "learning_rate": 2.9702531176601257e-05, + "loss": 0.0023, + "num_input_tokens_seen": 1816504, + "step": 9540 + }, + { + "epoch": 4.961018711018711, + "grad_norm": 0.027484603226184845, + "learning_rate": 2.9680255667609368e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1817464, + "step": 9545 + }, + { + "epoch": 4.963617463617464, + "grad_norm": 0.10935764759778976, + "learning_rate": 2.965797630725719e-05, + "loss": 0.0037, + "num_input_tokens_seen": 1818392, + "step": 9550 + }, + { + "epoch": 4.966216216216216, + "grad_norm": 0.02907703071832657, + "learning_rate": 2.9635693113878317e-05, + "loss": 0.0059, + "num_input_tokens_seen": 1819384, + "step": 9555 + }, + { + "epoch": 4.968814968814969, + "grad_norm": 0.030444154515862465, + "learning_rate": 2.961340610580946e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1820344, + "step": 9560 + }, + { + "epoch": 4.971413721413722, + "grad_norm": 0.0047965808771550655, + "learning_rate": 2.959111530139051e-05, + "loss": 0.001, + "num_input_tokens_seen": 1821304, + "step": 9565 + }, + { + "epoch": 4.974012474012474, + "grad_norm": 0.01253589242696762, + "learning_rate": 2.9568820718964464e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1822328, + "step": 9570 + }, + { + "epoch": 4.976611226611227, + "grad_norm": 0.013057976961135864, + "learning_rate": 2.9546522376877416e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1823256, + "step": 9575 + }, + { + "epoch": 4.979209979209979, + "grad_norm": 0.09559977799654007, + "learning_rate": 2.952422029347858e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1824248, + "step": 9580 + }, + { + "epoch": 4.981808731808732, + "grad_norm": 0.010767117142677307, + "learning_rate": 2.9501914487120226e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1825208, + "step": 9585 + }, + { + "epoch": 4.984407484407484, + "grad_norm": 0.02032138593494892, + "learning_rate": 2.9479604976157705e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1826168, + "step": 9590 + }, + { + "epoch": 4.987006237006237, + "grad_norm": 0.0067282808013260365, + "learning_rate": 2.9457291778949396e-05, + "loss": 0.0021, + "num_input_tokens_seen": 1827128, + "step": 9595 + }, + { + "epoch": 4.98960498960499, + "grad_norm": 0.015477443113923073, + "learning_rate": 2.943497491385674e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1828056, + "step": 9600 + }, + { + "epoch": 4.992203742203742, + "grad_norm": 0.014000419527292252, + "learning_rate": 2.9412654399244173e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1828984, + "step": 9605 + }, + { + "epoch": 4.994802494802495, + "grad_norm": 0.011425365693867207, + "learning_rate": 2.939033025347913e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1829880, + "step": 9610 + }, + { + "epoch": 4.997401247401247, + "grad_norm": 0.003410005709156394, + "learning_rate": 2.936800249493206e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1830840, + "step": 9615 + }, + { + "epoch": 5.0, + "grad_norm": 0.005730455741286278, + "learning_rate": 2.9345671141976373e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1831728, + "step": 9620 + }, + { + "epoch": 5.0, + "eval_loss": 0.2771633267402649, + "eval_runtime": 9.2635, + "eval_samples_per_second": 92.406, + "eval_steps_per_second": 23.102, + "num_input_tokens_seen": 1831728, + "step": 9620 + }, + { + "epoch": 5.002598752598753, + "grad_norm": 0.0073987385258078575, + "learning_rate": 2.9323336212988413e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1832688, + "step": 9625 + }, + { + "epoch": 5.005197505197505, + "grad_norm": 0.0035548906307667494, + "learning_rate": 2.93009977263475e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1833584, + "step": 9630 + }, + { + "epoch": 5.007796257796258, + "grad_norm": 0.0076035321690142155, + "learning_rate": 2.9278655700435876e-05, + "loss": 0.0867, + "num_input_tokens_seen": 1834576, + "step": 9635 + }, + { + "epoch": 5.01039501039501, + "grad_norm": 0.7961125373840332, + "learning_rate": 2.925631015363868e-05, + "loss": 0.001, + "num_input_tokens_seen": 1835504, + "step": 9640 + }, + { + "epoch": 5.012993762993763, + "grad_norm": 0.0035825164522975683, + "learning_rate": 2.9233961104343954e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1836368, + "step": 9645 + }, + { + "epoch": 5.015592515592515, + "grad_norm": 0.040853917598724365, + "learning_rate": 2.9211608570942638e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1837360, + "step": 9650 + }, + { + "epoch": 5.018191268191268, + "grad_norm": 0.003424358321353793, + "learning_rate": 2.918925257182851e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1838320, + "step": 9655 + }, + { + "epoch": 5.020790020790021, + "grad_norm": 0.0029174855444580317, + "learning_rate": 2.9166893125398225e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1839184, + "step": 9660 + }, + { + "epoch": 5.023388773388773, + "grad_norm": 0.012649950571358204, + "learning_rate": 2.9144530250051265e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1840176, + "step": 9665 + }, + { + "epoch": 5.025987525987526, + "grad_norm": 0.0016568185528740287, + "learning_rate": 2.9122163964189946e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1841136, + "step": 9670 + }, + { + "epoch": 5.028586278586278, + "grad_norm": 0.002772019011899829, + "learning_rate": 2.909979428621935e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1842192, + "step": 9675 + }, + { + "epoch": 5.031185031185031, + "grad_norm": 0.0017844230169430375, + "learning_rate": 2.9077421234547402e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1843152, + "step": 9680 + }, + { + "epoch": 5.033783783783784, + "grad_norm": 0.00193346431478858, + "learning_rate": 2.905504482758479e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1844176, + "step": 9685 + }, + { + "epoch": 5.036382536382536, + "grad_norm": 0.043400246649980545, + "learning_rate": 2.9032665083744926e-05, + "loss": 0.1392, + "num_input_tokens_seen": 1845072, + "step": 9690 + }, + { + "epoch": 5.038981288981289, + "grad_norm": 0.011893251910805702, + "learning_rate": 2.9010282021444008e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1846000, + "step": 9695 + }, + { + "epoch": 5.041580041580041, + "grad_norm": 0.029262876138091087, + "learning_rate": 2.898789565910096e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1846960, + "step": 9700 + }, + { + "epoch": 5.044178794178794, + "grad_norm": 0.049632906913757324, + "learning_rate": 2.89655060151374e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1847920, + "step": 9705 + }, + { + "epoch": 5.046777546777546, + "grad_norm": 0.011903449892997742, + "learning_rate": 2.894311310797767e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1848880, + "step": 9710 + }, + { + "epoch": 5.049376299376299, + "grad_norm": 0.09803415834903717, + "learning_rate": 2.892071695604878e-05, + "loss": 0.001, + "num_input_tokens_seen": 1849808, + "step": 9715 + }, + { + "epoch": 5.051975051975052, + "grad_norm": 0.0030475787352770567, + "learning_rate": 2.8898317577780425e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1850800, + "step": 9720 + }, + { + "epoch": 5.0545738045738045, + "grad_norm": 0.007388690486550331, + "learning_rate": 2.8875914991604948e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1851760, + "step": 9725 + }, + { + "epoch": 5.057172557172557, + "grad_norm": 0.004478015936911106, + "learning_rate": 2.8853509215957323e-05, + "loss": 0.1223, + "num_input_tokens_seen": 1852688, + "step": 9730 + }, + { + "epoch": 5.0597713097713095, + "grad_norm": 0.012301350943744183, + "learning_rate": 2.8831100269275168e-05, + "loss": 0.0758, + "num_input_tokens_seen": 1853552, + "step": 9735 + }, + { + "epoch": 5.0623700623700625, + "grad_norm": 0.0027177033480256796, + "learning_rate": 2.8808688169998694e-05, + "loss": 0.1008, + "num_input_tokens_seen": 1854480, + "step": 9740 + }, + { + "epoch": 5.064968814968815, + "grad_norm": 0.029073378071188927, + "learning_rate": 2.878627293657071e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1855440, + "step": 9745 + }, + { + "epoch": 5.0675675675675675, + "grad_norm": 0.13265138864517212, + "learning_rate": 2.8763854587436605e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1856368, + "step": 9750 + }, + { + "epoch": 5.0701663201663205, + "grad_norm": 0.015224930830299854, + "learning_rate": 2.8741433141044334e-05, + "loss": 0.001, + "num_input_tokens_seen": 1857296, + "step": 9755 + }, + { + "epoch": 5.072765072765073, + "grad_norm": 0.008670857176184654, + "learning_rate": 2.87190086158444e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1858224, + "step": 9760 + }, + { + "epoch": 5.075363825363826, + "grad_norm": 1.0035297870635986, + "learning_rate": 2.8696581030289838e-05, + "loss": 0.1144, + "num_input_tokens_seen": 1859184, + "step": 9765 + }, + { + "epoch": 5.077962577962578, + "grad_norm": 0.002643064595758915, + "learning_rate": 2.8674150402836202e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1860112, + "step": 9770 + }, + { + "epoch": 5.080561330561331, + "grad_norm": 0.01675534062087536, + "learning_rate": 2.8651716751941555e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1861040, + "step": 9775 + }, + { + "epoch": 5.083160083160083, + "grad_norm": 0.008381146006286144, + "learning_rate": 2.862928009606643e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1861968, + "step": 9780 + }, + { + "epoch": 5.085758835758836, + "grad_norm": 0.010794018395245075, + "learning_rate": 2.8606840453673867e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1862928, + "step": 9785 + }, + { + "epoch": 5.088357588357589, + "grad_norm": 0.004928914364427328, + "learning_rate": 2.8584397843229317e-05, + "loss": 0.1628, + "num_input_tokens_seen": 1863920, + "step": 9790 + }, + { + "epoch": 5.090956340956341, + "grad_norm": 0.06969255208969116, + "learning_rate": 2.856195228320071e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1864848, + "step": 9795 + }, + { + "epoch": 5.093555093555094, + "grad_norm": 0.0037022975739091635, + "learning_rate": 2.8539503792058393e-05, + "loss": 0.001, + "num_input_tokens_seen": 1865808, + "step": 9800 + }, + { + "epoch": 5.096153846153846, + "grad_norm": 0.0229784082621336, + "learning_rate": 2.8517052388275116e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1866768, + "step": 9805 + }, + { + "epoch": 5.098752598752599, + "grad_norm": 0.014956368133425713, + "learning_rate": 2.8494598090326043e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1867696, + "step": 9810 + }, + { + "epoch": 5.101351351351352, + "grad_norm": 0.03505134955048561, + "learning_rate": 2.8472140916688706e-05, + "loss": 0.0017, + "num_input_tokens_seen": 1868624, + "step": 9815 + }, + { + "epoch": 5.103950103950104, + "grad_norm": 0.0494837611913681, + "learning_rate": 2.8449680885843e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1869616, + "step": 9820 + }, + { + "epoch": 5.106548856548857, + "grad_norm": 0.006388429086655378, + "learning_rate": 2.8427218016271185e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1870480, + "step": 9825 + }, + { + "epoch": 5.109147609147609, + "grad_norm": 0.0047788131050765514, + "learning_rate": 2.8404752326457856e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1871472, + "step": 9830 + }, + { + "epoch": 5.111746361746362, + "grad_norm": 0.0061924876645207405, + "learning_rate": 2.8382283834889904e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1872464, + "step": 9835 + }, + { + "epoch": 5.114345114345114, + "grad_norm": 0.0370691753923893, + "learning_rate": 2.8359812560056564e-05, + "loss": 0.0269, + "num_input_tokens_seen": 1873392, + "step": 9840 + }, + { + "epoch": 5.116943866943867, + "grad_norm": 0.003972810693085194, + "learning_rate": 2.8337338520449336e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1874384, + "step": 9845 + }, + { + "epoch": 5.11954261954262, + "grad_norm": 0.008540164679288864, + "learning_rate": 2.8314861734561997e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1875280, + "step": 9850 + }, + { + "epoch": 5.122141372141372, + "grad_norm": 0.006937776226550341, + "learning_rate": 2.829238222089059e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1876144, + "step": 9855 + }, + { + "epoch": 5.124740124740125, + "grad_norm": 0.022330282256007195, + "learning_rate": 2.82698999979334e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1877040, + "step": 9860 + }, + { + "epoch": 5.127338877338877, + "grad_norm": 0.0015556247672066092, + "learning_rate": 2.8247415084190953e-05, + "loss": 0.0588, + "num_input_tokens_seen": 1878000, + "step": 9865 + }, + { + "epoch": 5.12993762993763, + "grad_norm": 0.0036757371854037046, + "learning_rate": 2.8224927498165964e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1879024, + "step": 9870 + }, + { + "epoch": 5.132536382536382, + "grad_norm": 0.001998922089114785, + "learning_rate": 2.820243725836337e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1879920, + "step": 9875 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 0.0038021127693355083, + "learning_rate": 2.8179944383290274e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1880848, + "step": 9880 + }, + { + "epoch": 5.137733887733888, + "grad_norm": 0.004488928243517876, + "learning_rate": 2.8157448891455963e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1881776, + "step": 9885 + }, + { + "epoch": 5.14033264033264, + "grad_norm": 0.0031986022368073463, + "learning_rate": 2.813495080137186e-05, + "loss": 0.1078, + "num_input_tokens_seen": 1882704, + "step": 9890 + }, + { + "epoch": 5.142931392931393, + "grad_norm": 0.012105569243431091, + "learning_rate": 2.8112450131551564e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1883728, + "step": 9895 + }, + { + "epoch": 5.145530145530145, + "grad_norm": 0.013135685585439205, + "learning_rate": 2.808994690051075e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1884848, + "step": 9900 + }, + { + "epoch": 5.148128898128898, + "grad_norm": 0.03042474202811718, + "learning_rate": 2.806744112676722e-05, + "loss": 0.0037, + "num_input_tokens_seen": 1885840, + "step": 9905 + }, + { + "epoch": 5.150727650727651, + "grad_norm": 0.01673874817788601, + "learning_rate": 2.804493282884087e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1886768, + "step": 9910 + }, + { + "epoch": 5.153326403326403, + "grad_norm": 0.0015536720165982842, + "learning_rate": 2.8022422025253682e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1887696, + "step": 9915 + }, + { + "epoch": 5.155925155925156, + "grad_norm": 0.016831710934638977, + "learning_rate": 2.7999908734529673e-05, + "loss": 0.0815, + "num_input_tokens_seen": 1888624, + "step": 9920 + }, + { + "epoch": 5.158523908523908, + "grad_norm": 0.00817448552697897, + "learning_rate": 2.7977392975194937e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1889648, + "step": 9925 + }, + { + "epoch": 5.161122661122661, + "grad_norm": 0.009548602625727654, + "learning_rate": 2.7954874765777583e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1890640, + "step": 9930 + }, + { + "epoch": 5.163721413721413, + "grad_norm": 0.001991298981010914, + "learning_rate": 2.793235412480774e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1891632, + "step": 9935 + }, + { + "epoch": 5.166320166320166, + "grad_norm": 0.00663122721016407, + "learning_rate": 2.790983107081753e-05, + "loss": 0.1346, + "num_input_tokens_seen": 1892592, + "step": 9940 + }, + { + "epoch": 5.168918918918919, + "grad_norm": 0.011370430700480938, + "learning_rate": 2.7887305622341087e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1893520, + "step": 9945 + }, + { + "epoch": 5.171517671517671, + "grad_norm": 0.0018200621707364917, + "learning_rate": 2.786477779791447e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1894448, + "step": 9950 + }, + { + "epoch": 5.174116424116424, + "grad_norm": 0.04038584232330322, + "learning_rate": 2.7842247616075734e-05, + "loss": 0.1204, + "num_input_tokens_seen": 1895376, + "step": 9955 + }, + { + "epoch": 5.1767151767151764, + "grad_norm": 3.8666465282440186, + "learning_rate": 2.7819715095364863e-05, + "loss": 0.1207, + "num_input_tokens_seen": 1896304, + "step": 9960 + }, + { + "epoch": 5.179313929313929, + "grad_norm": 0.016805499792099, + "learning_rate": 2.779718025432375e-05, + "loss": 0.0079, + "num_input_tokens_seen": 1897328, + "step": 9965 + }, + { + "epoch": 5.1819126819126815, + "grad_norm": 0.2628886103630066, + "learning_rate": 2.777464311149622e-05, + "loss": 0.0045, + "num_input_tokens_seen": 1898288, + "step": 9970 + }, + { + "epoch": 5.1845114345114345, + "grad_norm": 0.03407302126288414, + "learning_rate": 2.775210368542797e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1899280, + "step": 9975 + }, + { + "epoch": 5.1871101871101875, + "grad_norm": 0.033863477408885956, + "learning_rate": 2.77295619946666e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1900144, + "step": 9980 + }, + { + "epoch": 5.1897089397089395, + "grad_norm": 0.022346243262290955, + "learning_rate": 2.770701805776155e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1901104, + "step": 9985 + }, + { + "epoch": 5.1923076923076925, + "grad_norm": 2.816162347793579, + "learning_rate": 2.7684471893264124e-05, + "loss": 0.0018, + "num_input_tokens_seen": 1902128, + "step": 9990 + }, + { + "epoch": 5.194906444906445, + "grad_norm": 0.008431602269411087, + "learning_rate": 2.7661923519727463e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1903088, + "step": 9995 + }, + { + "epoch": 5.197505197505198, + "grad_norm": 0.0036833544727414846, + "learning_rate": 2.76393729557065e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1904080, + "step": 10000 + }, + { + "epoch": 5.20010395010395, + "grad_norm": 0.00664568692445755, + "learning_rate": 2.7616820219757993e-05, + "loss": 0.1228, + "num_input_tokens_seen": 1904976, + "step": 10005 + }, + { + "epoch": 5.202702702702703, + "grad_norm": 0.02100154384970665, + "learning_rate": 2.7594265330440494e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1905968, + "step": 10010 + }, + { + "epoch": 5.205301455301456, + "grad_norm": 0.00351136177778244, + "learning_rate": 2.7571708306314298e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1906928, + "step": 10015 + }, + { + "epoch": 5.207900207900208, + "grad_norm": 0.012274142354726791, + "learning_rate": 2.754914916594148e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1907824, + "step": 10020 + }, + { + "epoch": 5.210498960498961, + "grad_norm": 0.006357515696436167, + "learning_rate": 2.7526587927885857e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1908816, + "step": 10025 + }, + { + "epoch": 5.213097713097713, + "grad_norm": 0.004730620887130499, + "learning_rate": 2.7504024610712963e-05, + "loss": 0.0027, + "num_input_tokens_seen": 1909744, + "step": 10030 + }, + { + "epoch": 5.215696465696466, + "grad_norm": 0.006055327132344246, + "learning_rate": 2.7481459232990038e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1910736, + "step": 10035 + }, + { + "epoch": 5.218295218295219, + "grad_norm": 0.0024731552693992853, + "learning_rate": 2.7458891813286024e-05, + "loss": 0.0476, + "num_input_tokens_seen": 1911664, + "step": 10040 + }, + { + "epoch": 5.220893970893971, + "grad_norm": 0.010041159577667713, + "learning_rate": 2.7436322370171562e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1912624, + "step": 10045 + }, + { + "epoch": 5.223492723492724, + "grad_norm": 0.023737983778119087, + "learning_rate": 2.7413750922218917e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1913584, + "step": 10050 + }, + { + "epoch": 5.226091476091476, + "grad_norm": 0.003538857214152813, + "learning_rate": 2.739117748800204e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1914512, + "step": 10055 + }, + { + "epoch": 5.228690228690229, + "grad_norm": 0.001471049152314663, + "learning_rate": 2.7368602086096494e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1915440, + "step": 10060 + }, + { + "epoch": 5.231288981288981, + "grad_norm": 0.0023415766190737486, + "learning_rate": 2.7346024735079486e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1916400, + "step": 10065 + }, + { + "epoch": 5.233887733887734, + "grad_norm": 0.0017525682924315333, + "learning_rate": 2.7323445453529795e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1917328, + "step": 10070 + }, + { + "epoch": 5.236486486486487, + "grad_norm": 0.005199346225708723, + "learning_rate": 2.730086426002782e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1918224, + "step": 10075 + }, + { + "epoch": 5.239085239085239, + "grad_norm": 0.003893016604706645, + "learning_rate": 2.7278281173155507e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1919152, + "step": 10080 + }, + { + "epoch": 5.241683991683992, + "grad_norm": 0.004593814257532358, + "learning_rate": 2.7255696211496375e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1920080, + "step": 10085 + }, + { + "epoch": 5.244282744282744, + "grad_norm": 0.003355672350153327, + "learning_rate": 2.7233109393635482e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1920976, + "step": 10090 + }, + { + "epoch": 5.246881496881497, + "grad_norm": 0.02631322294473648, + "learning_rate": 2.7210520738159423e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1921904, + "step": 10095 + }, + { + "epoch": 5.24948024948025, + "grad_norm": 2.7663872241973877, + "learning_rate": 2.718793026365628e-05, + "loss": 0.121, + "num_input_tokens_seen": 1922832, + "step": 10100 + }, + { + "epoch": 5.252079002079002, + "grad_norm": 0.004599854815751314, + "learning_rate": 2.716533798871565e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1923856, + "step": 10105 + }, + { + "epoch": 5.254677754677755, + "grad_norm": 0.03421506658196449, + "learning_rate": 2.7142743931928628e-05, + "loss": 0.001, + "num_input_tokens_seen": 1924784, + "step": 10110 + }, + { + "epoch": 5.257276507276507, + "grad_norm": 0.03519110754132271, + "learning_rate": 2.7120148111887732e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1925840, + "step": 10115 + }, + { + "epoch": 5.25987525987526, + "grad_norm": 0.018554646521806717, + "learning_rate": 2.7097550547186973e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1926736, + "step": 10120 + }, + { + "epoch": 5.262474012474012, + "grad_norm": 0.008131231181323528, + "learning_rate": 2.7074951256421776e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1927632, + "step": 10125 + }, + { + "epoch": 5.265072765072765, + "grad_norm": 0.011917541734874249, + "learning_rate": 2.7052350258188987e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1928560, + "step": 10130 + }, + { + "epoch": 5.267671517671518, + "grad_norm": 0.009634350426495075, + "learning_rate": 2.7029747571086857e-05, + "loss": 0.1283, + "num_input_tokens_seen": 1929424, + "step": 10135 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 3.5154337882995605, + "learning_rate": 2.700714321371504e-05, + "loss": 0.0689, + "num_input_tokens_seen": 1930320, + "step": 10140 + }, + { + "epoch": 5.272869022869023, + "grad_norm": 0.006790866609662771, + "learning_rate": 2.6984537204674548e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1931248, + "step": 10145 + }, + { + "epoch": 5.275467775467775, + "grad_norm": 0.19708314538002014, + "learning_rate": 2.6961929562567767e-05, + "loss": 0.0025, + "num_input_tokens_seen": 1932176, + "step": 10150 + }, + { + "epoch": 5.278066528066528, + "grad_norm": 0.12399598956108093, + "learning_rate": 2.693932030599841e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1933168, + "step": 10155 + }, + { + "epoch": 5.28066528066528, + "grad_norm": 0.003532303497195244, + "learning_rate": 2.691670945357154e-05, + "loss": 0.003, + "num_input_tokens_seen": 1934128, + "step": 10160 + }, + { + "epoch": 5.283264033264033, + "grad_norm": 0.006831405218690634, + "learning_rate": 2.6894097023893504e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1935024, + "step": 10165 + }, + { + "epoch": 5.285862785862786, + "grad_norm": 0.013531297445297241, + "learning_rate": 2.6871483035571977e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1935984, + "step": 10170 + }, + { + "epoch": 5.288461538461538, + "grad_norm": 0.012537305243313313, + "learning_rate": 2.68488675072159e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1936944, + "step": 10175 + }, + { + "epoch": 5.291060291060291, + "grad_norm": 0.020485134795308113, + "learning_rate": 2.6826250457435475e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1937904, + "step": 10180 + }, + { + "epoch": 5.293659043659043, + "grad_norm": 0.0019531541038304567, + "learning_rate": 2.6803631904842174e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1938832, + "step": 10185 + }, + { + "epoch": 5.296257796257796, + "grad_norm": 0.022318178787827492, + "learning_rate": 2.67810118680487e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1939856, + "step": 10190 + }, + { + "epoch": 5.298856548856548, + "grad_norm": 0.010108296759426594, + "learning_rate": 2.675839036566897e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1940848, + "step": 10195 + }, + { + "epoch": 5.301455301455301, + "grad_norm": 0.0009701810195110738, + "learning_rate": 2.673576741631811e-05, + "loss": 0.1816, + "num_input_tokens_seen": 1941904, + "step": 10200 + }, + { + "epoch": 5.304054054054054, + "grad_norm": 0.00987024512141943, + "learning_rate": 2.671314303861244e-05, + "loss": 0.0955, + "num_input_tokens_seen": 1942864, + "step": 10205 + }, + { + "epoch": 5.3066528066528065, + "grad_norm": 0.009132437407970428, + "learning_rate": 2.6690517251169455e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1943760, + "step": 10210 + }, + { + "epoch": 5.3092515592515594, + "grad_norm": 0.017236266285181046, + "learning_rate": 2.6667890072607805e-05, + "loss": 0.0019, + "num_input_tokens_seen": 1944752, + "step": 10215 + }, + { + "epoch": 5.3118503118503115, + "grad_norm": 0.034610167145729065, + "learning_rate": 2.6645261521547294e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1945744, + "step": 10220 + }, + { + "epoch": 5.3144490644490645, + "grad_norm": 0.010934170335531235, + "learning_rate": 2.6622631616608845e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1946640, + "step": 10225 + }, + { + "epoch": 5.317047817047817, + "grad_norm": 0.014199856668710709, + "learning_rate": 2.6600000376414496e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1947568, + "step": 10230 + }, + { + "epoch": 5.31964656964657, + "grad_norm": 0.014359809458255768, + "learning_rate": 2.65773678195874e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1948592, + "step": 10235 + }, + { + "epoch": 5.3222453222453225, + "grad_norm": 0.010484217666089535, + "learning_rate": 2.6554733964751776e-05, + "loss": 0.0101, + "num_input_tokens_seen": 1949584, + "step": 10240 + }, + { + "epoch": 5.324844074844075, + "grad_norm": 0.0017913737101480365, + "learning_rate": 2.653209883053291e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1950480, + "step": 10245 + }, + { + "epoch": 5.327442827442828, + "grad_norm": 6.19639778137207, + "learning_rate": 2.6509462435557152e-05, + "loss": 0.1028, + "num_input_tokens_seen": 1951408, + "step": 10250 + }, + { + "epoch": 5.33004158004158, + "grad_norm": 0.017503825947642326, + "learning_rate": 2.6486824798451892e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1952432, + "step": 10255 + }, + { + "epoch": 5.332640332640333, + "grad_norm": 0.004010719712823629, + "learning_rate": 2.646418593784552e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1953264, + "step": 10260 + }, + { + "epoch": 5.335239085239086, + "grad_norm": 0.00308617134578526, + "learning_rate": 2.6441545872367453e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1954224, + "step": 10265 + }, + { + "epoch": 5.337837837837838, + "grad_norm": 0.006299132946878672, + "learning_rate": 2.6418904620648094e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1955184, + "step": 10270 + }, + { + "epoch": 5.340436590436591, + "grad_norm": 0.028571240603923798, + "learning_rate": 2.6396262201318823e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1956080, + "step": 10275 + }, + { + "epoch": 5.343035343035343, + "grad_norm": 0.013825909234583378, + "learning_rate": 2.637361863301198e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1957040, + "step": 10280 + }, + { + "epoch": 5.345634095634096, + "grad_norm": 0.035341404378414154, + "learning_rate": 2.6350973934360857e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1958032, + "step": 10285 + }, + { + "epoch": 5.348232848232848, + "grad_norm": 0.0026790122501552105, + "learning_rate": 2.6328328123999664e-05, + "loss": 0.0011, + "num_input_tokens_seen": 1959024, + "step": 10290 + }, + { + "epoch": 5.350831600831601, + "grad_norm": 0.8374123573303223, + "learning_rate": 2.6305681220563526e-05, + "loss": 0.001, + "num_input_tokens_seen": 1959984, + "step": 10295 + }, + { + "epoch": 5.353430353430354, + "grad_norm": 0.002192010637372732, + "learning_rate": 2.6283033242688478e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1960912, + "step": 10300 + }, + { + "epoch": 5.356029106029106, + "grad_norm": 0.0014181931037455797, + "learning_rate": 2.626038420901144e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1961840, + "step": 10305 + }, + { + "epoch": 5.358627858627859, + "grad_norm": 0.0030922475270926952, + "learning_rate": 2.6237734138170177e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1962736, + "step": 10310 + }, + { + "epoch": 5.361226611226611, + "grad_norm": 0.002371805487200618, + "learning_rate": 2.6215083048803348e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1963600, + "step": 10315 + }, + { + "epoch": 5.363825363825364, + "grad_norm": 0.00205264356918633, + "learning_rate": 2.6192430959550407e-05, + "loss": 0.1177, + "num_input_tokens_seen": 1964560, + "step": 10320 + }, + { + "epoch": 5.366424116424117, + "grad_norm": 0.0008195419213734567, + "learning_rate": 2.616977788905166e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1965488, + "step": 10325 + }, + { + "epoch": 5.369022869022869, + "grad_norm": 0.008372941054403782, + "learning_rate": 2.614712385594822e-05, + "loss": 0.0248, + "num_input_tokens_seen": 1966448, + "step": 10330 + }, + { + "epoch": 5.371621621621622, + "grad_norm": 0.0013117867056280375, + "learning_rate": 2.6124468878881968e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1967440, + "step": 10335 + }, + { + "epoch": 5.374220374220374, + "grad_norm": 0.034095168113708496, + "learning_rate": 2.61018129764956e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1968432, + "step": 10340 + }, + { + "epoch": 5.376819126819127, + "grad_norm": 0.17853663861751556, + "learning_rate": 2.6079156167432524e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1969424, + "step": 10345 + }, + { + "epoch": 5.379417879417879, + "grad_norm": 5.520345687866211, + "learning_rate": 2.6056498470336936e-05, + "loss": 0.0121, + "num_input_tokens_seen": 1970384, + "step": 10350 + }, + { + "epoch": 5.382016632016632, + "grad_norm": 0.0020828046835958958, + "learning_rate": 2.6033839903853745e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1971280, + "step": 10355 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 0.0014068797463551164, + "learning_rate": 2.6011180486628585e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1972240, + "step": 10360 + }, + { + "epoch": 5.387214137214137, + "grad_norm": 0.10333020985126495, + "learning_rate": 2.5988520237307774e-05, + "loss": 0.043, + "num_input_tokens_seen": 1973232, + "step": 10365 + }, + { + "epoch": 5.38981288981289, + "grad_norm": 0.19011472165584564, + "learning_rate": 2.596585917453833e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1974160, + "step": 10370 + }, + { + "epoch": 5.392411642411642, + "grad_norm": 0.0008690414833836257, + "learning_rate": 2.5943197316967933e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1975056, + "step": 10375 + }, + { + "epoch": 5.395010395010395, + "grad_norm": 0.003961566369980574, + "learning_rate": 2.5920534683244914e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1976048, + "step": 10380 + }, + { + "epoch": 5.397609147609147, + "grad_norm": 2.2926416397094727, + "learning_rate": 2.5897871292018256e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1976944, + "step": 10385 + }, + { + "epoch": 5.4002079002079, + "grad_norm": 0.0008031894685700536, + "learning_rate": 2.5875207161937553e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1977840, + "step": 10390 + }, + { + "epoch": 5.402806652806653, + "grad_norm": 0.004648193717002869, + "learning_rate": 2.5852542311653005e-05, + "loss": 0.0, + "num_input_tokens_seen": 1978736, + "step": 10395 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 0.00120589102152735, + "learning_rate": 2.5829876759815414e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1979664, + "step": 10400 + }, + { + "epoch": 5.408004158004158, + "grad_norm": 0.0007858658209443092, + "learning_rate": 2.5807210525076158e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1980624, + "step": 10405 + }, + { + "epoch": 5.41060291060291, + "grad_norm": 3.0664637088775635, + "learning_rate": 2.5784543626087172e-05, + "loss": 0.2001, + "num_input_tokens_seen": 1981488, + "step": 10410 + }, + { + "epoch": 5.413201663201663, + "grad_norm": 0.0005500117549672723, + "learning_rate": 2.576187608150094e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1982416, + "step": 10415 + }, + { + "epoch": 5.415800415800415, + "grad_norm": 0.019583577290177345, + "learning_rate": 2.5739207909970485e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1983312, + "step": 10420 + }, + { + "epoch": 5.418399168399168, + "grad_norm": 0.030713984742760658, + "learning_rate": 2.5716539130149326e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1984304, + "step": 10425 + }, + { + "epoch": 5.420997920997921, + "grad_norm": 0.01792234554886818, + "learning_rate": 2.56938697606915e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1985296, + "step": 10430 + }, + { + "epoch": 5.423596673596673, + "grad_norm": 3.5433850288391113, + "learning_rate": 2.5671199820251534e-05, + "loss": 0.1103, + "num_input_tokens_seen": 1986256, + "step": 10435 + }, + { + "epoch": 5.426195426195426, + "grad_norm": 8.435336112976074, + "learning_rate": 2.56485293274844e-05, + "loss": 0.1899, + "num_input_tokens_seen": 1987280, + "step": 10440 + }, + { + "epoch": 5.4287941787941785, + "grad_norm": 0.033955167979002, + "learning_rate": 2.5625858301045535e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1988240, + "step": 10445 + }, + { + "epoch": 5.4313929313929314, + "grad_norm": 0.029579253867268562, + "learning_rate": 2.5603186759590837e-05, + "loss": 0.146, + "num_input_tokens_seen": 1989200, + "step": 10450 + }, + { + "epoch": 5.4339916839916835, + "grad_norm": 0.029145991429686546, + "learning_rate": 2.558051472177661e-05, + "loss": 0.0658, + "num_input_tokens_seen": 1990160, + "step": 10455 + }, + { + "epoch": 5.4365904365904365, + "grad_norm": 0.02683936059474945, + "learning_rate": 2.5557842206259552e-05, + "loss": 0.0075, + "num_input_tokens_seen": 1991088, + "step": 10460 + }, + { + "epoch": 5.4391891891891895, + "grad_norm": 0.021585559472441673, + "learning_rate": 2.5535169231696777e-05, + "loss": 0.003, + "num_input_tokens_seen": 1992048, + "step": 10465 + }, + { + "epoch": 5.441787941787942, + "grad_norm": 0.03644300997257233, + "learning_rate": 2.5512495816745773e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1993040, + "step": 10470 + }, + { + "epoch": 5.4443866943866945, + "grad_norm": 0.024162359535694122, + "learning_rate": 2.5489821980064383e-05, + "loss": 0.0015, + "num_input_tokens_seen": 1993968, + "step": 10475 + }, + { + "epoch": 5.446985446985447, + "grad_norm": 0.018508341163396835, + "learning_rate": 2.546714774031079e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1994864, + "step": 10480 + }, + { + "epoch": 5.4495841995842, + "grad_norm": 0.06826429069042206, + "learning_rate": 2.5444473116143534e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1995792, + "step": 10485 + }, + { + "epoch": 5.452182952182953, + "grad_norm": 4.974493980407715, + "learning_rate": 2.5421798126221447e-05, + "loss": 0.1413, + "num_input_tokens_seen": 1996720, + "step": 10490 + }, + { + "epoch": 5.454781704781705, + "grad_norm": 0.012242819182574749, + "learning_rate": 2.5399122789203672e-05, + "loss": 0.1226, + "num_input_tokens_seen": 1997616, + "step": 10495 + }, + { + "epoch": 5.457380457380458, + "grad_norm": 0.036347731947898865, + "learning_rate": 2.537644712374965e-05, + "loss": 0.0009, + "num_input_tokens_seen": 1998576, + "step": 10500 + }, + { + "epoch": 5.45997920997921, + "grad_norm": 0.0069772712886333466, + "learning_rate": 2.5353771148519057e-05, + "loss": 0.002, + "num_input_tokens_seen": 1999632, + "step": 10505 + }, + { + "epoch": 5.462577962577963, + "grad_norm": 0.021137570962309837, + "learning_rate": 2.5331094882171857e-05, + "loss": 0.0014, + "num_input_tokens_seen": 2000560, + "step": 10510 + }, + { + "epoch": 5.465176715176715, + "grad_norm": 0.6438010931015015, + "learning_rate": 2.5308418343368247e-05, + "loss": 0.0043, + "num_input_tokens_seen": 2001552, + "step": 10515 + }, + { + "epoch": 5.467775467775468, + "grad_norm": 0.0005115228123031557, + "learning_rate": 2.528574155076864e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2002544, + "step": 10520 + }, + { + "epoch": 5.470374220374221, + "grad_norm": 0.03160947561264038, + "learning_rate": 2.5263064523033653e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2003568, + "step": 10525 + }, + { + "epoch": 5.472972972972973, + "grad_norm": 0.014572198502719402, + "learning_rate": 2.524038727882411e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2004464, + "step": 10530 + }, + { + "epoch": 5.475571725571726, + "grad_norm": 5.607770919799805, + "learning_rate": 2.521770983680102e-05, + "loss": 0.0105, + "num_input_tokens_seen": 2005488, + "step": 10535 + }, + { + "epoch": 5.478170478170478, + "grad_norm": 0.06191791221499443, + "learning_rate": 2.5195032215625524e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2006448, + "step": 10540 + }, + { + "epoch": 5.480769230769231, + "grad_norm": 0.00349079305306077, + "learning_rate": 2.5172354433958944e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2007440, + "step": 10545 + }, + { + "epoch": 5.483367983367984, + "grad_norm": 0.0006559825269505382, + "learning_rate": 2.5149676510462717e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2008336, + "step": 10550 + }, + { + "epoch": 5.485966735966736, + "grad_norm": 0.6264951825141907, + "learning_rate": 2.5126998463798396e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2009232, + "step": 10555 + }, + { + "epoch": 5.488565488565489, + "grad_norm": 0.02239363081753254, + "learning_rate": 2.5104320312627634e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2010224, + "step": 10560 + }, + { + "epoch": 5.491164241164241, + "grad_norm": 0.009617208503186703, + "learning_rate": 2.5081642075612177e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2011056, + "step": 10565 + }, + { + "epoch": 5.493762993762994, + "grad_norm": 152.67724609375, + "learning_rate": 2.5058963771413844e-05, + "loss": 0.0664, + "num_input_tokens_seen": 2011952, + "step": 10570 + }, + { + "epoch": 5.496361746361746, + "grad_norm": 0.009262412786483765, + "learning_rate": 2.5036285418694507e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2012912, + "step": 10575 + }, + { + "epoch": 5.498960498960499, + "grad_norm": 0.0026789619587361813, + "learning_rate": 2.5013607036116065e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2013872, + "step": 10580 + }, + { + "epoch": 5.5, + "eval_loss": 0.29926496744155884, + "eval_runtime": 9.2826, + "eval_samples_per_second": 92.216, + "eval_steps_per_second": 23.054, + "num_input_tokens_seen": 2014288, + "step": 10582 + }, + { + "epoch": 5.501559251559252, + "grad_norm": 0.003648072248324752, + "learning_rate": 2.4990928642340468e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2014864, + "step": 10585 + }, + { + "epoch": 5.504158004158004, + "grad_norm": 16.418968200683594, + "learning_rate": 2.4968250256029636e-05, + "loss": 0.0495, + "num_input_tokens_seen": 2015792, + "step": 10590 + }, + { + "epoch": 5.506756756756757, + "grad_norm": 0.0015431083738803864, + "learning_rate": 2.4945571895845523e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2016720, + "step": 10595 + }, + { + "epoch": 5.509355509355509, + "grad_norm": 0.01611882634460926, + "learning_rate": 2.4922893580450038e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2017680, + "step": 10600 + }, + { + "epoch": 5.511954261954262, + "grad_norm": 0.0022706114687025547, + "learning_rate": 2.4900215328505063e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2018608, + "step": 10605 + }, + { + "epoch": 5.514553014553014, + "grad_norm": 0.004525800235569477, + "learning_rate": 2.4877537158672427e-05, + "loss": 0.1252, + "num_input_tokens_seen": 2019568, + "step": 10610 + }, + { + "epoch": 5.517151767151767, + "grad_norm": 0.053127843886613846, + "learning_rate": 2.485485908961388e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2020432, + "step": 10615 + }, + { + "epoch": 5.51975051975052, + "grad_norm": 0.005142239388078451, + "learning_rate": 2.48321811399911e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2021296, + "step": 10620 + }, + { + "epoch": 5.522349272349272, + "grad_norm": 0.0004047907132189721, + "learning_rate": 2.480950332846567e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2022256, + "step": 10625 + }, + { + "epoch": 5.524948024948025, + "grad_norm": 0.007690840400755405, + "learning_rate": 2.4786825673699052e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2023280, + "step": 10630 + }, + { + "epoch": 5.527546777546777, + "grad_norm": 7.516406059265137, + "learning_rate": 2.476414819435258e-05, + "loss": 0.1728, + "num_input_tokens_seen": 2024304, + "step": 10635 + }, + { + "epoch": 5.53014553014553, + "grad_norm": 0.08455125242471695, + "learning_rate": 2.4741470909087457e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2025200, + "step": 10640 + }, + { + "epoch": 5.532744282744282, + "grad_norm": 0.010091974399983883, + "learning_rate": 2.471879383656469e-05, + "loss": 0.0515, + "num_input_tokens_seen": 2026128, + "step": 10645 + }, + { + "epoch": 5.535343035343035, + "grad_norm": 0.00788335595279932, + "learning_rate": 2.4696116995445147e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2027056, + "step": 10650 + }, + { + "epoch": 5.537941787941788, + "grad_norm": 4.5379414558410645, + "learning_rate": 2.4673440404389493e-05, + "loss": 0.0457, + "num_input_tokens_seen": 2028080, + "step": 10655 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 0.11235693097114563, + "learning_rate": 2.465076408205818e-05, + "loss": 0.0987, + "num_input_tokens_seen": 2029040, + "step": 10660 + }, + { + "epoch": 5.543139293139293, + "grad_norm": 0.055466845631599426, + "learning_rate": 2.4628088047111464e-05, + "loss": 0.0014, + "num_input_tokens_seen": 2029936, + "step": 10665 + }, + { + "epoch": 5.545738045738045, + "grad_norm": 0.06472362577915192, + "learning_rate": 2.4605412318209332e-05, + "loss": 0.0022, + "num_input_tokens_seen": 2030864, + "step": 10670 + }, + { + "epoch": 5.548336798336798, + "grad_norm": 3.3896515369415283, + "learning_rate": 2.458273691401156e-05, + "loss": 0.0608, + "num_input_tokens_seen": 2031824, + "step": 10675 + }, + { + "epoch": 5.5509355509355505, + "grad_norm": 0.21340787410736084, + "learning_rate": 2.4560061853177594e-05, + "loss": 0.0048, + "num_input_tokens_seen": 2032816, + "step": 10680 + }, + { + "epoch": 5.553534303534303, + "grad_norm": 0.0550798736512661, + "learning_rate": 2.4537387154366653e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2033744, + "step": 10685 + }, + { + "epoch": 5.556133056133056, + "grad_norm": 0.2143814116716385, + "learning_rate": 2.4514712836237638e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2034608, + "step": 10690 + }, + { + "epoch": 5.5587318087318085, + "grad_norm": 0.1609329730272293, + "learning_rate": 2.4492038917449137e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2035504, + "step": 10695 + }, + { + "epoch": 5.5613305613305615, + "grad_norm": 0.002900491002947092, + "learning_rate": 2.446936541665941e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2036432, + "step": 10700 + }, + { + "epoch": 5.563929313929314, + "grad_norm": 0.005889768712222576, + "learning_rate": 2.4446692352526387e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2037360, + "step": 10705 + }, + { + "epoch": 5.5665280665280665, + "grad_norm": 0.02295658364892006, + "learning_rate": 2.4424019743707607e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2038384, + "step": 10710 + }, + { + "epoch": 5.5691268191268195, + "grad_norm": 0.001752823474816978, + "learning_rate": 2.4401347608860257e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2039248, + "step": 10715 + }, + { + "epoch": 5.571725571725572, + "grad_norm": 0.0023484439589083195, + "learning_rate": 2.4378675966641134e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2040272, + "step": 10720 + }, + { + "epoch": 5.574324324324325, + "grad_norm": 0.0015538270818069577, + "learning_rate": 2.4356004835706625e-05, + "loss": 0.1196, + "num_input_tokens_seen": 2041232, + "step": 10725 + }, + { + "epoch": 5.576923076923077, + "grad_norm": 0.001902109826914966, + "learning_rate": 2.4333334234712697e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2042192, + "step": 10730 + }, + { + "epoch": 5.57952182952183, + "grad_norm": 0.003968250472098589, + "learning_rate": 2.4310664182314873e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2043120, + "step": 10735 + }, + { + "epoch": 5.582120582120583, + "grad_norm": 0.0033294521272182465, + "learning_rate": 2.4287994697168247e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2044112, + "step": 10740 + }, + { + "epoch": 5.584719334719335, + "grad_norm": 0.013270138762891293, + "learning_rate": 2.426532579792742e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2045008, + "step": 10745 + }, + { + "epoch": 5.587318087318088, + "grad_norm": 0.002508724108338356, + "learning_rate": 2.4242657503246523e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2046096, + "step": 10750 + }, + { + "epoch": 5.58991683991684, + "grad_norm": 0.003725097281858325, + "learning_rate": 2.4219989831779187e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2047056, + "step": 10755 + }, + { + "epoch": 5.592515592515593, + "grad_norm": 0.0041701230220496655, + "learning_rate": 2.4197322802178534e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2047952, + "step": 10760 + }, + { + "epoch": 5.595114345114345, + "grad_norm": 0.00393138499930501, + "learning_rate": 2.417465643309716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2048944, + "step": 10765 + }, + { + "epoch": 5.597713097713098, + "grad_norm": 0.002464865567162633, + "learning_rate": 2.415199074318712e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2049904, + "step": 10770 + }, + { + "epoch": 5.600311850311851, + "grad_norm": 0.000815605977550149, + "learning_rate": 2.412932575109988e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2050928, + "step": 10775 + }, + { + "epoch": 5.602910602910603, + "grad_norm": 0.13638317584991455, + "learning_rate": 2.410666147548637e-05, + "loss": 0.1441, + "num_input_tokens_seen": 2051856, + "step": 10780 + }, + { + "epoch": 5.605509355509356, + "grad_norm": 0.005669965408742428, + "learning_rate": 2.408399793499691e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2052816, + "step": 10785 + }, + { + "epoch": 5.608108108108108, + "grad_norm": 0.0031108863186091185, + "learning_rate": 2.4061335148281224e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2053808, + "step": 10790 + }, + { + "epoch": 5.610706860706861, + "grad_norm": 0.008867157623171806, + "learning_rate": 2.403867313398841e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2054704, + "step": 10795 + }, + { + "epoch": 5.613305613305613, + "grad_norm": 0.006907755509018898, + "learning_rate": 2.401601191076694e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2055600, + "step": 10800 + }, + { + "epoch": 5.615904365904366, + "grad_norm": 0.014711598865687847, + "learning_rate": 2.399335149726463e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2056560, + "step": 10805 + }, + { + "epoch": 5.618503118503119, + "grad_norm": 0.001467084395699203, + "learning_rate": 2.3970691912128608e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2057520, + "step": 10810 + }, + { + "epoch": 5.621101871101871, + "grad_norm": 0.0003620015049818903, + "learning_rate": 2.394803317400535e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2058480, + "step": 10815 + }, + { + "epoch": 5.623700623700624, + "grad_norm": 3.413004159927368, + "learning_rate": 2.3925375301540627e-05, + "loss": 0.0019, + "num_input_tokens_seen": 2059376, + "step": 10820 + }, + { + "epoch": 5.626299376299376, + "grad_norm": 0.012872993014752865, + "learning_rate": 2.390271831337949e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2060336, + "step": 10825 + }, + { + "epoch": 5.628898128898129, + "grad_norm": 0.00011620506120380014, + "learning_rate": 2.3880062228166276e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2061296, + "step": 10830 + }, + { + "epoch": 5.631496881496881, + "grad_norm": 0.003786996239796281, + "learning_rate": 2.3857407064544567e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2062256, + "step": 10835 + }, + { + "epoch": 5.634095634095634, + "grad_norm": 0.008235383778810501, + "learning_rate": 2.3834752841157188e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2063216, + "step": 10840 + }, + { + "epoch": 5.636694386694387, + "grad_norm": 0.020855052396655083, + "learning_rate": 2.381209957664619e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2064240, + "step": 10845 + }, + { + "epoch": 5.639293139293139, + "grad_norm": 0.003396180924028158, + "learning_rate": 2.3789447289652838e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2065136, + "step": 10850 + }, + { + "epoch": 5.641891891891892, + "grad_norm": 0.0001297876879107207, + "learning_rate": 2.37667959988176e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2066128, + "step": 10855 + }, + { + "epoch": 5.644490644490644, + "grad_norm": 0.002772279316559434, + "learning_rate": 2.374414572278011e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2067024, + "step": 10860 + }, + { + "epoch": 5.647089397089397, + "grad_norm": 11.855825424194336, + "learning_rate": 2.372149648017917e-05, + "loss": 0.1144, + "num_input_tokens_seen": 2068112, + "step": 10865 + }, + { + "epoch": 5.649688149688149, + "grad_norm": 0.0028839700389653444, + "learning_rate": 2.3698848289652747e-05, + "loss": 0.1382, + "num_input_tokens_seen": 2069104, + "step": 10870 + }, + { + "epoch": 5.652286902286902, + "grad_norm": 29.212858200073242, + "learning_rate": 2.3676201169837917e-05, + "loss": 0.1477, + "num_input_tokens_seen": 2070096, + "step": 10875 + }, + { + "epoch": 5.654885654885655, + "grad_norm": 0.03209097310900688, + "learning_rate": 2.365355513937089e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2071120, + "step": 10880 + }, + { + "epoch": 5.657484407484407, + "grad_norm": 0.007313217967748642, + "learning_rate": 2.3630910216886982e-05, + "loss": 0.2419, + "num_input_tokens_seen": 2072080, + "step": 10885 + }, + { + "epoch": 5.66008316008316, + "grad_norm": 0.003747571026906371, + "learning_rate": 2.3608266421020592e-05, + "loss": 0.0873, + "num_input_tokens_seen": 2073008, + "step": 10890 + }, + { + "epoch": 5.662681912681912, + "grad_norm": 0.01943318545818329, + "learning_rate": 2.358562377040519e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2074032, + "step": 10895 + }, + { + "epoch": 5.665280665280665, + "grad_norm": 0.010984661988914013, + "learning_rate": 2.356298228367331e-05, + "loss": 0.0022, + "num_input_tokens_seen": 2074992, + "step": 10900 + }, + { + "epoch": 5.667879417879417, + "grad_norm": 0.22504588961601257, + "learning_rate": 2.354034197945653e-05, + "loss": 0.0015, + "num_input_tokens_seen": 2075984, + "step": 10905 + }, + { + "epoch": 5.67047817047817, + "grad_norm": 46.76985168457031, + "learning_rate": 2.351770287638543e-05, + "loss": 0.0176, + "num_input_tokens_seen": 2076880, + "step": 10910 + }, + { + "epoch": 5.673076923076923, + "grad_norm": 0.006845077034085989, + "learning_rate": 2.3495064993089637e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2077744, + "step": 10915 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 0.0029704112093895674, + "learning_rate": 2.3472428348197754e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2078736, + "step": 10920 + }, + { + "epoch": 5.678274428274428, + "grad_norm": 0.00471919309347868, + "learning_rate": 2.344979296033737e-05, + "loss": 0.0014, + "num_input_tokens_seen": 2079632, + "step": 10925 + }, + { + "epoch": 5.6808731808731805, + "grad_norm": 0.07092984765768051, + "learning_rate": 2.3427158848135035e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2080656, + "step": 10930 + }, + { + "epoch": 5.6834719334719335, + "grad_norm": 0.007078138180077076, + "learning_rate": 2.340452603021627e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2081616, + "step": 10935 + }, + { + "epoch": 5.686070686070686, + "grad_norm": 0.005939982365816832, + "learning_rate": 2.338189452520549e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2082608, + "step": 10940 + }, + { + "epoch": 5.6886694386694385, + "grad_norm": 0.01859750971198082, + "learning_rate": 2.335926435172606e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2083472, + "step": 10945 + }, + { + "epoch": 5.6912681912681915, + "grad_norm": 0.03301658481359482, + "learning_rate": 2.333663552840025e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2084432, + "step": 10950 + }, + { + "epoch": 5.693866943866944, + "grad_norm": 0.011214371770620346, + "learning_rate": 2.3314008073849207e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2085424, + "step": 10955 + }, + { + "epoch": 5.696465696465697, + "grad_norm": 0.023073457181453705, + "learning_rate": 2.329138200669296e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2086480, + "step": 10960 + }, + { + "epoch": 5.6990644490644495, + "grad_norm": 0.004935878794640303, + "learning_rate": 2.3268757345550383e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2087408, + "step": 10965 + }, + { + "epoch": 5.701663201663202, + "grad_norm": 0.002467743121087551, + "learning_rate": 2.3246134109039226e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2088304, + "step": 10970 + }, + { + "epoch": 5.704261954261955, + "grad_norm": 0.0014024702832102776, + "learning_rate": 2.3223512315776022e-05, + "loss": 0.0813, + "num_input_tokens_seen": 2089232, + "step": 10975 + }, + { + "epoch": 5.706860706860707, + "grad_norm": 0.07828935980796814, + "learning_rate": 2.320089198437614e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2090192, + "step": 10980 + }, + { + "epoch": 5.70945945945946, + "grad_norm": 0.0015075907576829195, + "learning_rate": 2.3178273133453748e-05, + "loss": 0.0014, + "num_input_tokens_seen": 2091120, + "step": 10985 + }, + { + "epoch": 5.712058212058212, + "grad_norm": 0.0010027489624917507, + "learning_rate": 2.3155655781621793e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2092080, + "step": 10990 + }, + { + "epoch": 5.714656964656965, + "grad_norm": 0.11277436465024948, + "learning_rate": 2.3133039947491987e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2093040, + "step": 10995 + }, + { + "epoch": 5.717255717255718, + "grad_norm": 0.0023304687347263098, + "learning_rate": 2.3110425649674796e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2094064, + "step": 11000 + }, + { + "epoch": 5.71985446985447, + "grad_norm": 0.0017756052548065782, + "learning_rate": 2.3087812906779408e-05, + "loss": 0.1127, + "num_input_tokens_seen": 2095024, + "step": 11005 + }, + { + "epoch": 5.722453222453223, + "grad_norm": 0.007530794478952885, + "learning_rate": 2.3065201737413748e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2096016, + "step": 11010 + }, + { + "epoch": 5.725051975051975, + "grad_norm": 0.0013643905986100435, + "learning_rate": 2.3042592160184444e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2097072, + "step": 11015 + }, + { + "epoch": 5.727650727650728, + "grad_norm": 0.049168504774570465, + "learning_rate": 2.3019984193696804e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2098000, + "step": 11020 + }, + { + "epoch": 5.73024948024948, + "grad_norm": 0.003926376812160015, + "learning_rate": 2.2997377856554822e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2098960, + "step": 11025 + }, + { + "epoch": 5.732848232848233, + "grad_norm": 0.0044921026565134525, + "learning_rate": 2.2974773167361146e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2099952, + "step": 11030 + }, + { + "epoch": 5.735446985446986, + "grad_norm": 0.001171500189229846, + "learning_rate": 2.295217014471707e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2100880, + "step": 11035 + }, + { + "epoch": 5.738045738045738, + "grad_norm": 9.132128715515137, + "learning_rate": 2.2929568807222508e-05, + "loss": 0.1192, + "num_input_tokens_seen": 2101904, + "step": 11040 + }, + { + "epoch": 5.740644490644491, + "grad_norm": 0.09206919372081757, + "learning_rate": 2.2906969173475995e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2102896, + "step": 11045 + }, + { + "epoch": 5.743243243243243, + "grad_norm": 0.009075035341084003, + "learning_rate": 2.2884371262074665e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2103920, + "step": 11050 + }, + { + "epoch": 5.745841995841996, + "grad_norm": 0.0006276593776419759, + "learning_rate": 2.2861775091614233e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2104848, + "step": 11055 + }, + { + "epoch": 5.748440748440748, + "grad_norm": 0.005800605285912752, + "learning_rate": 2.2839180680688983e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2105904, + "step": 11060 + }, + { + "epoch": 5.751039501039501, + "grad_norm": 0.0008039016975089908, + "learning_rate": 2.2816588047891753e-05, + "loss": 0.1077, + "num_input_tokens_seen": 2106832, + "step": 11065 + }, + { + "epoch": 5.753638253638254, + "grad_norm": 0.003074242966249585, + "learning_rate": 2.27939972118139e-05, + "loss": 0.1502, + "num_input_tokens_seen": 2107696, + "step": 11070 + }, + { + "epoch": 5.756237006237006, + "grad_norm": 0.02146757021546364, + "learning_rate": 2.2771408191045322e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2108624, + "step": 11075 + }, + { + "epoch": 5.758835758835759, + "grad_norm": 0.009509136900305748, + "learning_rate": 2.274882100417442e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2109552, + "step": 11080 + }, + { + "epoch": 5.761434511434511, + "grad_norm": 0.012518534436821938, + "learning_rate": 2.2726235669788083e-05, + "loss": 0.0012, + "num_input_tokens_seen": 2110544, + "step": 11085 + }, + { + "epoch": 5.764033264033264, + "grad_norm": 0.03696843236684799, + "learning_rate": 2.2703652206471667e-05, + "loss": 0.0016, + "num_input_tokens_seen": 2111536, + "step": 11090 + }, + { + "epoch": 5.766632016632016, + "grad_norm": 0.008932497352361679, + "learning_rate": 2.2681070632809014e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2112560, + "step": 11095 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 0.05037342384457588, + "learning_rate": 2.26584909673824e-05, + "loss": 0.0012, + "num_input_tokens_seen": 2113520, + "step": 11100 + }, + { + "epoch": 5.771829521829522, + "grad_norm": 0.016760215163230896, + "learning_rate": 2.2635913228772496e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2114480, + "step": 11105 + }, + { + "epoch": 5.774428274428274, + "grad_norm": 3.2298526763916016, + "learning_rate": 2.2613337435558433e-05, + "loss": 0.1596, + "num_input_tokens_seen": 2115408, + "step": 11110 + }, + { + "epoch": 5.777027027027027, + "grad_norm": 0.07508603483438492, + "learning_rate": 2.2590763606317723e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2116336, + "step": 11115 + }, + { + "epoch": 5.779625779625779, + "grad_norm": 0.005451715085655451, + "learning_rate": 2.2568191759626263e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2117328, + "step": 11120 + }, + { + "epoch": 5.782224532224532, + "grad_norm": 0.08569209277629852, + "learning_rate": 2.254562191405832e-05, + "loss": 0.0017, + "num_input_tokens_seen": 2118288, + "step": 11125 + }, + { + "epoch": 5.784823284823284, + "grad_norm": 0.009893898852169514, + "learning_rate": 2.252305408818652e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2119216, + "step": 11130 + }, + { + "epoch": 5.787422037422037, + "grad_norm": 0.012346192263066769, + "learning_rate": 2.250048830058181e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2120208, + "step": 11135 + }, + { + "epoch": 5.79002079002079, + "grad_norm": 3.525078535079956, + "learning_rate": 2.2477924569813473e-05, + "loss": 0.0847, + "num_input_tokens_seen": 2121200, + "step": 11140 + }, + { + "epoch": 5.792619542619542, + "grad_norm": 0.0028407895006239414, + "learning_rate": 2.2455362914449094e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2122128, + "step": 11145 + }, + { + "epoch": 5.795218295218295, + "grad_norm": 0.02499687299132347, + "learning_rate": 2.243280335305456e-05, + "loss": 0.0138, + "num_input_tokens_seen": 2123056, + "step": 11150 + }, + { + "epoch": 5.797817047817047, + "grad_norm": 0.01930459402501583, + "learning_rate": 2.2410245904194018e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2124016, + "step": 11155 + }, + { + "epoch": 5.8004158004158, + "grad_norm": 0.0169349554926157, + "learning_rate": 2.2387690586429893e-05, + "loss": 0.0019, + "num_input_tokens_seen": 2124976, + "step": 11160 + }, + { + "epoch": 5.803014553014553, + "grad_norm": 0.007659503258764744, + "learning_rate": 2.2365137418322855e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2125872, + "step": 11165 + }, + { + "epoch": 5.8056133056133055, + "grad_norm": 0.2442292869091034, + "learning_rate": 2.234258641843179e-05, + "loss": 0.0013, + "num_input_tokens_seen": 2126800, + "step": 11170 + }, + { + "epoch": 5.808212058212058, + "grad_norm": 0.015883566811680794, + "learning_rate": 2.2320037605313808e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2127760, + "step": 11175 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.0007432572892867029, + "learning_rate": 2.2297490997524224e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2128720, + "step": 11180 + }, + { + "epoch": 5.8134095634095635, + "grad_norm": 0.02664581686258316, + "learning_rate": 2.2274946613616537e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2129680, + "step": 11185 + }, + { + "epoch": 5.8160083160083165, + "grad_norm": 0.007977106608450413, + "learning_rate": 2.2252404472142414e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2130608, + "step": 11190 + }, + { + "epoch": 5.8186070686070686, + "grad_norm": 0.8787294626235962, + "learning_rate": 2.2229864591651684e-05, + "loss": 0.0011, + "num_input_tokens_seen": 2131632, + "step": 11195 + }, + { + "epoch": 5.8212058212058215, + "grad_norm": 11.935964584350586, + "learning_rate": 2.220732699069229e-05, + "loss": 0.0986, + "num_input_tokens_seen": 2132528, + "step": 11200 + }, + { + "epoch": 5.823804573804574, + "grad_norm": 0.012400832958519459, + "learning_rate": 2.2184791687810327e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2133488, + "step": 11205 + }, + { + "epoch": 5.826403326403327, + "grad_norm": 0.0005005689454264939, + "learning_rate": 2.216225870154999e-05, + "loss": 0.1061, + "num_input_tokens_seen": 2134416, + "step": 11210 + }, + { + "epoch": 5.829002079002079, + "grad_norm": 53.908592224121094, + "learning_rate": 2.213972805045356e-05, + "loss": 0.0258, + "num_input_tokens_seen": 2135376, + "step": 11215 + }, + { + "epoch": 5.831600831600832, + "grad_norm": 0.0038925078697502613, + "learning_rate": 2.2117199753061414e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2136304, + "step": 11220 + }, + { + "epoch": 5.834199584199585, + "grad_norm": 0.056173935532569885, + "learning_rate": 2.209467382791198e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2137328, + "step": 11225 + }, + { + "epoch": 5.836798336798337, + "grad_norm": 0.004718660842627287, + "learning_rate": 2.2072150293541743e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2138288, + "step": 11230 + }, + { + "epoch": 5.83939708939709, + "grad_norm": 0.004829366225749254, + "learning_rate": 2.2049629168485193e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2139216, + "step": 11235 + }, + { + "epoch": 5.841995841995842, + "grad_norm": 0.0018715744372457266, + "learning_rate": 2.2027110471274863e-05, + "loss": 0.0338, + "num_input_tokens_seen": 2140144, + "step": 11240 + }, + { + "epoch": 5.844594594594595, + "grad_norm": 0.0003762354317586869, + "learning_rate": 2.200459422044129e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2141104, + "step": 11245 + }, + { + "epoch": 5.847193347193347, + "grad_norm": 0.034508779644966125, + "learning_rate": 2.198208043451299e-05, + "loss": 0.0135, + "num_input_tokens_seen": 2142096, + "step": 11250 + }, + { + "epoch": 5.8497920997921, + "grad_norm": 3.173861026763916, + "learning_rate": 2.1959569132016445e-05, + "loss": 0.003, + "num_input_tokens_seen": 2143056, + "step": 11255 + }, + { + "epoch": 5.852390852390853, + "grad_norm": 0.03502676263451576, + "learning_rate": 2.193706033147611e-05, + "loss": 0.0686, + "num_input_tokens_seen": 2144048, + "step": 11260 + }, + { + "epoch": 5.854989604989605, + "grad_norm": 0.05227731168270111, + "learning_rate": 2.1914554051414354e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2144976, + "step": 11265 + }, + { + "epoch": 5.857588357588358, + "grad_norm": 0.002713020658120513, + "learning_rate": 2.1892050310351503e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2145936, + "step": 11270 + }, + { + "epoch": 5.86018711018711, + "grad_norm": 0.009395481087267399, + "learning_rate": 2.1869549126805774e-05, + "loss": 0.1503, + "num_input_tokens_seen": 2146832, + "step": 11275 + }, + { + "epoch": 5.862785862785863, + "grad_norm": 0.019571801647543907, + "learning_rate": 2.1847050519293284e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2147728, + "step": 11280 + }, + { + "epoch": 5.865384615384615, + "grad_norm": 0.0015493794344365597, + "learning_rate": 2.182455450632803e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2148688, + "step": 11285 + }, + { + "epoch": 5.867983367983368, + "grad_norm": 11.097652435302734, + "learning_rate": 2.1802061106421883e-05, + "loss": 0.3558, + "num_input_tokens_seen": 2149648, + "step": 11290 + }, + { + "epoch": 5.870582120582121, + "grad_norm": 0.23629263043403625, + "learning_rate": 2.177957033808455e-05, + "loss": 0.0032, + "num_input_tokens_seen": 2150672, + "step": 11295 + }, + { + "epoch": 5.873180873180873, + "grad_norm": 0.009623351506888866, + "learning_rate": 2.1757082219823572e-05, + "loss": 0.0018, + "num_input_tokens_seen": 2151600, + "step": 11300 + }, + { + "epoch": 5.875779625779626, + "grad_norm": 0.023345643654465675, + "learning_rate": 2.1734596770144324e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2152560, + "step": 11305 + }, + { + "epoch": 5.878378378378378, + "grad_norm": 0.13546104729175568, + "learning_rate": 2.171211400754997e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2153488, + "step": 11310 + }, + { + "epoch": 5.880977130977131, + "grad_norm": 0.007298161741346121, + "learning_rate": 2.1689633950541475e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2154448, + "step": 11315 + }, + { + "epoch": 5.883575883575883, + "grad_norm": 29.143978118896484, + "learning_rate": 2.1667156617617568e-05, + "loss": 0.0924, + "num_input_tokens_seen": 2155376, + "step": 11320 + }, + { + "epoch": 5.886174636174636, + "grad_norm": 0.6480805277824402, + "learning_rate": 2.164468202727474e-05, + "loss": 0.099, + "num_input_tokens_seen": 2156368, + "step": 11325 + }, + { + "epoch": 5.888773388773389, + "grad_norm": 0.008334723301231861, + "learning_rate": 2.1622210198007238e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2157296, + "step": 11330 + }, + { + "epoch": 5.891372141372141, + "grad_norm": 0.016444776207208633, + "learning_rate": 2.1599741148306997e-05, + "loss": 0.001, + "num_input_tokens_seen": 2158192, + "step": 11335 + }, + { + "epoch": 5.893970893970894, + "grad_norm": 0.01035422645509243, + "learning_rate": 2.1577274896663714e-05, + "loss": 0.0158, + "num_input_tokens_seen": 2159152, + "step": 11340 + }, + { + "epoch": 5.896569646569646, + "grad_norm": 0.015327009372413158, + "learning_rate": 2.155481146156475e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2160176, + "step": 11345 + }, + { + "epoch": 5.899168399168399, + "grad_norm": 0.06849266588687897, + "learning_rate": 2.1532350861495168e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2161072, + "step": 11350 + }, + { + "epoch": 5.901767151767151, + "grad_norm": 0.09582570195198059, + "learning_rate": 2.1509893114937688e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2162032, + "step": 11355 + }, + { + "epoch": 5.904365904365904, + "grad_norm": 0.2519174814224243, + "learning_rate": 2.148743824037269e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2162960, + "step": 11360 + }, + { + "epoch": 5.906964656964657, + "grad_norm": 0.03493322804570198, + "learning_rate": 2.1464986256278167e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2163952, + "step": 11365 + }, + { + "epoch": 5.909563409563409, + "grad_norm": 0.00391751853749156, + "learning_rate": 2.1442537181129757e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2164848, + "step": 11370 + }, + { + "epoch": 5.912162162162162, + "grad_norm": 0.0035006788093596697, + "learning_rate": 2.1420091033400705e-05, + "loss": 0.1456, + "num_input_tokens_seen": 2165904, + "step": 11375 + }, + { + "epoch": 5.914760914760915, + "grad_norm": 0.059996895492076874, + "learning_rate": 2.139764783156183e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2166896, + "step": 11380 + }, + { + "epoch": 5.917359667359667, + "grad_norm": 0.14110766351222992, + "learning_rate": 2.1375207594081547e-05, + "loss": 0.0014, + "num_input_tokens_seen": 2167856, + "step": 11385 + }, + { + "epoch": 5.91995841995842, + "grad_norm": 0.0067573837004601955, + "learning_rate": 2.135277033942582e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2168816, + "step": 11390 + }, + { + "epoch": 5.922557172557172, + "grad_norm": 0.02307548187673092, + "learning_rate": 2.1330336086058154e-05, + "loss": 0.001, + "num_input_tokens_seen": 2169776, + "step": 11395 + }, + { + "epoch": 5.925155925155925, + "grad_norm": 0.028159886598587036, + "learning_rate": 2.1307904852439593e-05, + "loss": 0.0014, + "num_input_tokens_seen": 2170736, + "step": 11400 + }, + { + "epoch": 5.9277546777546775, + "grad_norm": 0.004450556356459856, + "learning_rate": 2.128547665702869e-05, + "loss": 0.0017, + "num_input_tokens_seen": 2171664, + "step": 11405 + }, + { + "epoch": 5.93035343035343, + "grad_norm": 0.007205131929367781, + "learning_rate": 2.126305151828151e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2172656, + "step": 11410 + }, + { + "epoch": 5.932952182952183, + "grad_norm": 5.158809185028076, + "learning_rate": 2.1240629454651583e-05, + "loss": 0.0024, + "num_input_tokens_seen": 2173648, + "step": 11415 + }, + { + "epoch": 5.9355509355509355, + "grad_norm": 0.004463885445147753, + "learning_rate": 2.1218210484589924e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2174544, + "step": 11420 + }, + { + "epoch": 5.9381496881496885, + "grad_norm": 0.009056691080331802, + "learning_rate": 2.1195794626545007e-05, + "loss": 0.103, + "num_input_tokens_seen": 2175568, + "step": 11425 + }, + { + "epoch": 5.9407484407484406, + "grad_norm": 0.00843371544033289, + "learning_rate": 2.117338189896272e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2176496, + "step": 11430 + }, + { + "epoch": 5.9433471933471935, + "grad_norm": 0.005626426078379154, + "learning_rate": 2.1150972320286398e-05, + "loss": 0.1007, + "num_input_tokens_seen": 2177424, + "step": 11435 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 4.912357807159424, + "learning_rate": 2.1128565908956775e-05, + "loss": 0.1428, + "num_input_tokens_seen": 2178352, + "step": 11440 + }, + { + "epoch": 5.948544698544699, + "grad_norm": 0.008768558502197266, + "learning_rate": 2.1106162683411983e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2179376, + "step": 11445 + }, + { + "epoch": 5.951143451143452, + "grad_norm": 0.11295311152935028, + "learning_rate": 2.108376266208753e-05, + "loss": 0.001, + "num_input_tokens_seen": 2180272, + "step": 11450 + }, + { + "epoch": 5.953742203742204, + "grad_norm": 0.012112453579902649, + "learning_rate": 2.106136586341629e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2181232, + "step": 11455 + }, + { + "epoch": 5.956340956340957, + "grad_norm": 0.01013841200619936, + "learning_rate": 2.1038972305828486e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2182256, + "step": 11460 + }, + { + "epoch": 5.958939708939709, + "grad_norm": 0.010341137647628784, + "learning_rate": 2.1016582007751658e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2183280, + "step": 11465 + }, + { + "epoch": 5.961538461538462, + "grad_norm": 0.059824686497449875, + "learning_rate": 2.099419498761069e-05, + "loss": 0.0416, + "num_input_tokens_seen": 2184208, + "step": 11470 + }, + { + "epoch": 5.964137214137214, + "grad_norm": 0.002063364489004016, + "learning_rate": 2.0971811263827746e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2185136, + "step": 11475 + }, + { + "epoch": 5.966735966735967, + "grad_norm": 0.015463853254914284, + "learning_rate": 2.0949430854822288e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2186096, + "step": 11480 + }, + { + "epoch": 5.96933471933472, + "grad_norm": 0.032715316861867905, + "learning_rate": 2.092705377901105e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2187088, + "step": 11485 + }, + { + "epoch": 5.971933471933472, + "grad_norm": 0.003963975701481104, + "learning_rate": 2.090468005480804e-05, + "loss": 0.0013, + "num_input_tokens_seen": 2188080, + "step": 11490 + }, + { + "epoch": 5.974532224532225, + "grad_norm": 15.268412590026855, + "learning_rate": 2.0882309700624457e-05, + "loss": 0.0835, + "num_input_tokens_seen": 2189008, + "step": 11495 + }, + { + "epoch": 5.977130977130977, + "grad_norm": 0.013736617751419544, + "learning_rate": 2.0859942734868778e-05, + "loss": 0.1318, + "num_input_tokens_seen": 2189968, + "step": 11500 + }, + { + "epoch": 5.97972972972973, + "grad_norm": 0.024113483726978302, + "learning_rate": 2.0837579175946674e-05, + "loss": 0.0013, + "num_input_tokens_seen": 2190864, + "step": 11505 + }, + { + "epoch": 5.982328482328482, + "grad_norm": 0.020983804017305374, + "learning_rate": 2.0815219042261003e-05, + "loss": 0.0058, + "num_input_tokens_seen": 2191856, + "step": 11510 + }, + { + "epoch": 5.984927234927235, + "grad_norm": 0.01422122959047556, + "learning_rate": 2.0792862352211822e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2192752, + "step": 11515 + }, + { + "epoch": 5.987525987525988, + "grad_norm": 0.012487941421568394, + "learning_rate": 2.077050912419634e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2193680, + "step": 11520 + }, + { + "epoch": 5.99012474012474, + "grad_norm": 4.549112796783447, + "learning_rate": 2.074815937660894e-05, + "loss": 0.1236, + "num_input_tokens_seen": 2194576, + "step": 11525 + }, + { + "epoch": 5.992723492723493, + "grad_norm": 0.1305440217256546, + "learning_rate": 2.0725813127841103e-05, + "loss": 0.0011, + "num_input_tokens_seen": 2195568, + "step": 11530 + }, + { + "epoch": 5.995322245322245, + "grad_norm": 21.515335083007812, + "learning_rate": 2.0703470396281454e-05, + "loss": 0.0125, + "num_input_tokens_seen": 2196464, + "step": 11535 + }, + { + "epoch": 5.997920997920998, + "grad_norm": 0.03448396176099777, + "learning_rate": 2.068113120031573e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2197488, + "step": 11540 + }, + { + "epoch": 6.0, + "eval_loss": 0.2348688393831253, + "eval_runtime": 9.2736, + "eval_samples_per_second": 92.305, + "eval_steps_per_second": 23.076, + "num_input_tokens_seen": 2198176, + "step": 11544 + }, + { + "epoch": 6.000519750519751, + "grad_norm": 0.9724030494689941, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.0022, + "num_input_tokens_seen": 2198336, + "step": 11545 + }, + { + "epoch": 6.003118503118503, + "grad_norm": 0.0078431973233819, + "learning_rate": 2.0636463488694392e-05, + "loss": 0.001, + "num_input_tokens_seen": 2199232, + "step": 11550 + }, + { + "epoch": 6.005717255717256, + "grad_norm": 0.025075502693653107, + "learning_rate": 2.0614135009795633e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2200192, + "step": 11555 + }, + { + "epoch": 6.008316008316008, + "grad_norm": 0.014081068336963654, + "learning_rate": 2.059181014000446e-05, + "loss": 0.0026, + "num_input_tokens_seen": 2201088, + "step": 11560 + }, + { + "epoch": 6.010914760914761, + "grad_norm": 0.009635720402002335, + "learning_rate": 2.0569488897691898e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2202112, + "step": 11565 + }, + { + "epoch": 6.013513513513513, + "grad_norm": 0.017032835632562637, + "learning_rate": 2.0547171301226007e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2203072, + "step": 11570 + }, + { + "epoch": 6.016112266112266, + "grad_norm": 0.004817981738597155, + "learning_rate": 2.052485736897182e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2204064, + "step": 11575 + }, + { + "epoch": 6.018711018711019, + "grad_norm": 0.017283085733652115, + "learning_rate": 2.050254711929137e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2205088, + "step": 11580 + }, + { + "epoch": 6.021309771309771, + "grad_norm": 0.03927413374185562, + "learning_rate": 2.048024057054366e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2206016, + "step": 11585 + }, + { + "epoch": 6.023908523908524, + "grad_norm": 0.007246074732393026, + "learning_rate": 2.0457937741084644e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2207040, + "step": 11590 + }, + { + "epoch": 6.026507276507276, + "grad_norm": 0.008294586092233658, + "learning_rate": 2.0435638649267205e-05, + "loss": 0.0011, + "num_input_tokens_seen": 2208000, + "step": 11595 + }, + { + "epoch": 6.029106029106029, + "grad_norm": 0.0038273553363978863, + "learning_rate": 2.0413343313441165e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2208928, + "step": 11600 + }, + { + "epoch": 6.031704781704781, + "grad_norm": 0.00407455675303936, + "learning_rate": 2.0391051751953256e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2209856, + "step": 11605 + }, + { + "epoch": 6.034303534303534, + "grad_norm": 0.007337281946092844, + "learning_rate": 2.0368763983147092e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2210816, + "step": 11610 + }, + { + "epoch": 6.036902286902287, + "grad_norm": 0.0007672258070670068, + "learning_rate": 2.034648002536318e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2211744, + "step": 11615 + }, + { + "epoch": 6.039501039501039, + "grad_norm": 0.0038432630244642496, + "learning_rate": 2.0324199896938883e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2212704, + "step": 11620 + }, + { + "epoch": 6.042099792099792, + "grad_norm": 0.005372217390686274, + "learning_rate": 2.0301923616208404e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2213632, + "step": 11625 + }, + { + "epoch": 6.044698544698544, + "grad_norm": 0.0017937947995960712, + "learning_rate": 2.0279651201502793e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2214656, + "step": 11630 + }, + { + "epoch": 6.047297297297297, + "grad_norm": 0.002415713621303439, + "learning_rate": 2.0257382671149914e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2215584, + "step": 11635 + }, + { + "epoch": 6.04989604989605, + "grad_norm": 0.009043765254318714, + "learning_rate": 2.023511804347444e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2216608, + "step": 11640 + }, + { + "epoch": 6.052494802494802, + "grad_norm": 0.00481073185801506, + "learning_rate": 2.0212857336797823e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2217600, + "step": 11645 + }, + { + "epoch": 6.055093555093555, + "grad_norm": 0.01395538728684187, + "learning_rate": 2.01906005694383e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2218496, + "step": 11650 + }, + { + "epoch": 6.0576923076923075, + "grad_norm": 0.0016385646304115653, + "learning_rate": 2.016834775971087e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2219424, + "step": 11655 + }, + { + "epoch": 6.0602910602910605, + "grad_norm": 0.0009341377881355584, + "learning_rate": 2.014609892592724e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2220352, + "step": 11660 + }, + { + "epoch": 6.0628898128898125, + "grad_norm": 0.005590348970144987, + "learning_rate": 2.012385408639588e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2221248, + "step": 11665 + }, + { + "epoch": 6.0654885654885655, + "grad_norm": 0.001750935218296945, + "learning_rate": 2.0101613259421963e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2222240, + "step": 11670 + }, + { + "epoch": 6.0680873180873185, + "grad_norm": 0.00403734901919961, + "learning_rate": 2.0079376463307368e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2223168, + "step": 11675 + }, + { + "epoch": 6.070686070686071, + "grad_norm": 0.0010118124773725867, + "learning_rate": 2.005714371635064e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2224160, + "step": 11680 + }, + { + "epoch": 6.0732848232848236, + "grad_norm": 0.006817726418375969, + "learning_rate": 2.003491503684701e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2225120, + "step": 11685 + }, + { + "epoch": 6.075883575883576, + "grad_norm": 0.27729347348213196, + "learning_rate": 2.0012690443088344e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2226112, + "step": 11690 + }, + { + "epoch": 6.078482328482329, + "grad_norm": 0.004928118549287319, + "learning_rate": 1.999046995336316e-05, + "loss": 0.0065, + "num_input_tokens_seen": 2227104, + "step": 11695 + }, + { + "epoch": 6.081081081081081, + "grad_norm": 0.00022194326447788626, + "learning_rate": 1.9968253585956598e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2228064, + "step": 11700 + }, + { + "epoch": 6.083679833679834, + "grad_norm": 0.0008095114608295262, + "learning_rate": 1.9946041359150393e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2229024, + "step": 11705 + }, + { + "epoch": 6.086278586278587, + "grad_norm": 0.0014513563364744186, + "learning_rate": 1.992383329122289e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2230016, + "step": 11710 + }, + { + "epoch": 6.088877338877339, + "grad_norm": 0.00031447503715753555, + "learning_rate": 1.9901629400448997e-05, + "loss": 0.0, + "num_input_tokens_seen": 2230912, + "step": 11715 + }, + { + "epoch": 6.091476091476092, + "grad_norm": 0.00211348757147789, + "learning_rate": 1.9879429705100204e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2231840, + "step": 11720 + }, + { + "epoch": 6.094074844074844, + "grad_norm": 0.00021371705224737525, + "learning_rate": 1.9857234223444516e-05, + "loss": 0.001, + "num_input_tokens_seen": 2232800, + "step": 11725 + }, + { + "epoch": 6.096673596673597, + "grad_norm": 0.000264552712906152, + "learning_rate": 1.98350429737465e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2233760, + "step": 11730 + }, + { + "epoch": 6.099272349272349, + "grad_norm": 0.001647809287533164, + "learning_rate": 1.9812855974267225e-05, + "loss": 0.054, + "num_input_tokens_seen": 2234752, + "step": 11735 + }, + { + "epoch": 6.101871101871102, + "grad_norm": 0.0046903882175683975, + "learning_rate": 1.979067324326428e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2235680, + "step": 11740 + }, + { + "epoch": 6.104469854469855, + "grad_norm": 0.0019611960742622614, + "learning_rate": 1.9768494798991714e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2236576, + "step": 11745 + }, + { + "epoch": 6.107068607068607, + "grad_norm": 0.0062710740603506565, + "learning_rate": 1.974632065970008e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2237600, + "step": 11750 + }, + { + "epoch": 6.10966735966736, + "grad_norm": 0.004007246810942888, + "learning_rate": 1.9724150843636375e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2238528, + "step": 11755 + }, + { + "epoch": 6.112266112266112, + "grad_norm": 0.00652926554903388, + "learning_rate": 1.9701985369044013e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2239456, + "step": 11760 + }, + { + "epoch": 6.114864864864865, + "grad_norm": 0.0014828009298071265, + "learning_rate": 1.9679824254162864e-05, + "loss": 0.0, + "num_input_tokens_seen": 2240416, + "step": 11765 + }, + { + "epoch": 6.117463617463618, + "grad_norm": 0.001596841961145401, + "learning_rate": 1.965766751722922e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2241344, + "step": 11770 + }, + { + "epoch": 6.12006237006237, + "grad_norm": 0.0014535058289766312, + "learning_rate": 1.9635515176475747e-05, + "loss": 0.0, + "num_input_tokens_seen": 2242272, + "step": 11775 + }, + { + "epoch": 6.122661122661123, + "grad_norm": 0.000502811570186168, + "learning_rate": 1.96133672501315e-05, + "loss": 0.0, + "num_input_tokens_seen": 2243232, + "step": 11780 + }, + { + "epoch": 6.125259875259875, + "grad_norm": 0.0010410983813926578, + "learning_rate": 1.9591223756421916e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2244288, + "step": 11785 + }, + { + "epoch": 6.127858627858628, + "grad_norm": 0.0003545841318555176, + "learning_rate": 1.9569084713568752e-05, + "loss": 0.0, + "num_input_tokens_seen": 2245216, + "step": 11790 + }, + { + "epoch": 6.13045738045738, + "grad_norm": 0.04132326319813728, + "learning_rate": 1.954695013979013e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2246208, + "step": 11795 + }, + { + "epoch": 6.133056133056133, + "grad_norm": 0.012073689140379429, + "learning_rate": 1.9524820053300485e-05, + "loss": 0.1002, + "num_input_tokens_seen": 2247136, + "step": 11800 + }, + { + "epoch": 6.135654885654886, + "grad_norm": 0.0002765111858025193, + "learning_rate": 1.950269447231056e-05, + "loss": 0.1077, + "num_input_tokens_seen": 2248064, + "step": 11805 + }, + { + "epoch": 6.138253638253638, + "grad_norm": 0.428448349237442, + "learning_rate": 1.9480573415027395e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2249024, + "step": 11810 + }, + { + "epoch": 6.140852390852391, + "grad_norm": 0.0012219647178426385, + "learning_rate": 1.9458456899654303e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2250048, + "step": 11815 + }, + { + "epoch": 6.143451143451143, + "grad_norm": 0.0028639137744903564, + "learning_rate": 1.943634494439086e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2250976, + "step": 11820 + }, + { + "epoch": 6.146049896049896, + "grad_norm": 0.006605847738683224, + "learning_rate": 1.9414237567432886e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2251936, + "step": 11825 + }, + { + "epoch": 6.148648648648648, + "grad_norm": 0.00302464934065938, + "learning_rate": 1.939213478697244e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2252832, + "step": 11830 + }, + { + "epoch": 6.151247401247401, + "grad_norm": 0.005706378724426031, + "learning_rate": 1.9370036621197793e-05, + "loss": 0.0018, + "num_input_tokens_seen": 2253792, + "step": 11835 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 0.02847285568714142, + "learning_rate": 1.9347943088293423e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2254848, + "step": 11840 + }, + { + "epoch": 6.156444906444906, + "grad_norm": 0.007600831333547831, + "learning_rate": 1.9325854206439996e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2255776, + "step": 11845 + }, + { + "epoch": 6.159043659043659, + "grad_norm": 0.0008974708034656942, + "learning_rate": 1.9303769993814353e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2256736, + "step": 11850 + }, + { + "epoch": 6.161642411642411, + "grad_norm": 0.0008255933062173426, + "learning_rate": 1.9281690468589473e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2257664, + "step": 11855 + }, + { + "epoch": 6.164241164241164, + "grad_norm": 0.0030338773503899574, + "learning_rate": 1.9259615648934505e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2258592, + "step": 11860 + }, + { + "epoch": 6.166839916839917, + "grad_norm": 0.002355564385652542, + "learning_rate": 1.923754555301471e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2259552, + "step": 11865 + }, + { + "epoch": 6.169438669438669, + "grad_norm": 0.015959909185767174, + "learning_rate": 1.9215480198991466e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2260544, + "step": 11870 + }, + { + "epoch": 6.172037422037422, + "grad_norm": 0.009562622755765915, + "learning_rate": 1.9193419605022248e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2261504, + "step": 11875 + }, + { + "epoch": 6.174636174636174, + "grad_norm": 0.0011687222868204117, + "learning_rate": 1.9171363789260614e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2262432, + "step": 11880 + }, + { + "epoch": 6.177234927234927, + "grad_norm": 0.0012729365844279528, + "learning_rate": 1.914931276985621e-05, + "loss": 0.0, + "num_input_tokens_seen": 2263488, + "step": 11885 + }, + { + "epoch": 6.1798336798336795, + "grad_norm": 0.007061419077217579, + "learning_rate": 1.9127266564954678e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2264448, + "step": 11890 + }, + { + "epoch": 6.1824324324324325, + "grad_norm": 0.012949742376804352, + "learning_rate": 1.910522519269776e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2265472, + "step": 11895 + }, + { + "epoch": 6.185031185031185, + "grad_norm": 0.0002210001985076815, + "learning_rate": 1.9083188671223196e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2266368, + "step": 11900 + }, + { + "epoch": 6.1876299376299375, + "grad_norm": 0.0050325035117566586, + "learning_rate": 1.906115701866473e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2267328, + "step": 11905 + }, + { + "epoch": 6.1902286902286905, + "grad_norm": 0.0026291338726878166, + "learning_rate": 1.903913025315211e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2268288, + "step": 11910 + }, + { + "epoch": 6.192827442827443, + "grad_norm": 0.00010202395060332492, + "learning_rate": 1.9017108392811065e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2269280, + "step": 11915 + }, + { + "epoch": 6.1954261954261955, + "grad_norm": 0.01741311140358448, + "learning_rate": 1.8995091455763254e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2270272, + "step": 11920 + }, + { + "epoch": 6.198024948024948, + "grad_norm": 0.0027943854220211506, + "learning_rate": 1.8973079460126334e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2271264, + "step": 11925 + }, + { + "epoch": 6.200623700623701, + "grad_norm": 188.45042419433594, + "learning_rate": 1.895107242401386e-05, + "loss": 0.0356, + "num_input_tokens_seen": 2272192, + "step": 11930 + }, + { + "epoch": 6.203222453222454, + "grad_norm": 0.000200200971448794, + "learning_rate": 1.8929070365535323e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2273152, + "step": 11935 + }, + { + "epoch": 6.205821205821206, + "grad_norm": 0.0009294454357586801, + "learning_rate": 1.8907073302796115e-05, + "loss": 0.0, + "num_input_tokens_seen": 2274176, + "step": 11940 + }, + { + "epoch": 6.208419958419959, + "grad_norm": 0.0014596377732232213, + "learning_rate": 1.8885081253897504e-05, + "loss": 0.0, + "num_input_tokens_seen": 2275104, + "step": 11945 + }, + { + "epoch": 6.211018711018711, + "grad_norm": 0.0005680373287759721, + "learning_rate": 1.886309423693667e-05, + "loss": 0.0, + "num_input_tokens_seen": 2276256, + "step": 11950 + }, + { + "epoch": 6.213617463617464, + "grad_norm": 0.0006049643852747977, + "learning_rate": 1.8841112270006596e-05, + "loss": 0.0, + "num_input_tokens_seen": 2277184, + "step": 11955 + }, + { + "epoch": 6.216216216216216, + "grad_norm": 0.0025756985414773226, + "learning_rate": 1.881913537119615e-05, + "loss": 0.0, + "num_input_tokens_seen": 2278112, + "step": 11960 + }, + { + "epoch": 6.218814968814969, + "grad_norm": 0.0002756379544734955, + "learning_rate": 1.8797163558590018e-05, + "loss": 0.0, + "num_input_tokens_seen": 2279104, + "step": 11965 + }, + { + "epoch": 6.221413721413722, + "grad_norm": 0.0007691693608649075, + "learning_rate": 1.8775196850268703e-05, + "loss": 0.0, + "num_input_tokens_seen": 2280096, + "step": 11970 + }, + { + "epoch": 6.224012474012474, + "grad_norm": 0.0010331724770367146, + "learning_rate": 1.8753235264308504e-05, + "loss": 0.0014, + "num_input_tokens_seen": 2281088, + "step": 11975 + }, + { + "epoch": 6.226611226611227, + "grad_norm": 0.00579744391143322, + "learning_rate": 1.8731278818781506e-05, + "loss": 0.0, + "num_input_tokens_seen": 2282080, + "step": 11980 + }, + { + "epoch": 6.229209979209979, + "grad_norm": 0.00016174330085050315, + "learning_rate": 1.8709327531755562e-05, + "loss": 0.0, + "num_input_tokens_seen": 2282976, + "step": 11985 + }, + { + "epoch": 6.231808731808732, + "grad_norm": 0.0005870184977538884, + "learning_rate": 1.8687381421294287e-05, + "loss": 0.0, + "num_input_tokens_seen": 2283904, + "step": 11990 + }, + { + "epoch": 6.234407484407485, + "grad_norm": 0.001747244386933744, + "learning_rate": 1.8665440505457027e-05, + "loss": 0.0, + "num_input_tokens_seen": 2284928, + "step": 11995 + }, + { + "epoch": 6.237006237006237, + "grad_norm": 0.0021654171869158745, + "learning_rate": 1.864350480229886e-05, + "loss": 0.0, + "num_input_tokens_seen": 2285952, + "step": 12000 + }, + { + "epoch": 6.23960498960499, + "grad_norm": 0.0016758251003921032, + "learning_rate": 1.8621574329870575e-05, + "loss": 0.0, + "num_input_tokens_seen": 2286848, + "step": 12005 + }, + { + "epoch": 6.242203742203742, + "grad_norm": 0.00028893721173517406, + "learning_rate": 1.859964910621865e-05, + "loss": 0.0, + "num_input_tokens_seen": 2287776, + "step": 12010 + }, + { + "epoch": 6.244802494802495, + "grad_norm": 0.00014453931362368166, + "learning_rate": 1.8577729149385257e-05, + "loss": 0.0, + "num_input_tokens_seen": 2288768, + "step": 12015 + }, + { + "epoch": 6.247401247401247, + "grad_norm": 0.037190571427345276, + "learning_rate": 1.8555814477408214e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2289792, + "step": 12020 + }, + { + "epoch": 6.25, + "grad_norm": 0.018799716606736183, + "learning_rate": 1.8533905108321005e-05, + "loss": 0.0, + "num_input_tokens_seen": 2290688, + "step": 12025 + }, + { + "epoch": 6.252598752598753, + "grad_norm": 0.0022928500548005104, + "learning_rate": 1.8512001060152744e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2291648, + "step": 12030 + }, + { + "epoch": 6.255197505197505, + "grad_norm": 0.0004372104594949633, + "learning_rate": 1.8490102350928172e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2292576, + "step": 12035 + }, + { + "epoch": 6.257796257796258, + "grad_norm": 0.00030515893013216555, + "learning_rate": 1.8468208998667636e-05, + "loss": 0.0, + "num_input_tokens_seen": 2293536, + "step": 12040 + }, + { + "epoch": 6.26039501039501, + "grad_norm": 0.0001689942873781547, + "learning_rate": 1.8446321021387078e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2294464, + "step": 12045 + }, + { + "epoch": 6.262993762993763, + "grad_norm": 0.03190956637263298, + "learning_rate": 1.842443843709799e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2295424, + "step": 12050 + }, + { + "epoch": 6.265592515592515, + "grad_norm": 0.000597915961407125, + "learning_rate": 1.840256126380746e-05, + "loss": 0.0, + "num_input_tokens_seen": 2296384, + "step": 12055 + }, + { + "epoch": 6.268191268191268, + "grad_norm": 0.0006522374460473657, + "learning_rate": 1.8380689519518112e-05, + "loss": 0.0, + "num_input_tokens_seen": 2297440, + "step": 12060 + }, + { + "epoch": 6.270790020790021, + "grad_norm": 0.0001845982187660411, + "learning_rate": 1.8358823222228097e-05, + "loss": 0.0, + "num_input_tokens_seen": 2298368, + "step": 12065 + }, + { + "epoch": 6.273388773388773, + "grad_norm": 0.00014640638255514205, + "learning_rate": 1.8336962389931085e-05, + "loss": 0.0, + "num_input_tokens_seen": 2299232, + "step": 12070 + }, + { + "epoch": 6.275987525987526, + "grad_norm": 0.0012200711062178016, + "learning_rate": 1.8315107040616263e-05, + "loss": 0.0, + "num_input_tokens_seen": 2300192, + "step": 12075 + }, + { + "epoch": 6.278586278586278, + "grad_norm": 0.0001295094407396391, + "learning_rate": 1.8293257192268296e-05, + "loss": 0.0, + "num_input_tokens_seen": 2301184, + "step": 12080 + }, + { + "epoch": 6.281185031185031, + "grad_norm": 0.0029814811423420906, + "learning_rate": 1.8271412862867305e-05, + "loss": 0.0, + "num_input_tokens_seen": 2302080, + "step": 12085 + }, + { + "epoch": 6.283783783783784, + "grad_norm": 0.032201994210481644, + "learning_rate": 1.8249574070388893e-05, + "loss": 0.222, + "num_input_tokens_seen": 2302944, + "step": 12090 + }, + { + "epoch": 6.286382536382536, + "grad_norm": 0.0015193272847682238, + "learning_rate": 1.82277408328041e-05, + "loss": 0.0, + "num_input_tokens_seen": 2303872, + "step": 12095 + }, + { + "epoch": 6.288981288981289, + "grad_norm": 0.0001143625340773724, + "learning_rate": 1.820591316807939e-05, + "loss": 0.0, + "num_input_tokens_seen": 2304736, + "step": 12100 + }, + { + "epoch": 6.291580041580041, + "grad_norm": 0.007152357138693333, + "learning_rate": 1.818409109417666e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2305664, + "step": 12105 + }, + { + "epoch": 6.294178794178794, + "grad_norm": 0.0024532373063266277, + "learning_rate": 1.816227462905318e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2306592, + "step": 12110 + }, + { + "epoch": 6.296777546777546, + "grad_norm": 0.00024611823027953506, + "learning_rate": 1.8140463790661606e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2307552, + "step": 12115 + }, + { + "epoch": 6.299376299376299, + "grad_norm": 0.002429606392979622, + "learning_rate": 1.811865859694999e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2308416, + "step": 12120 + }, + { + "epoch": 6.301975051975052, + "grad_norm": 0.0035023505333811045, + "learning_rate": 1.8096859065861722e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2309376, + "step": 12125 + }, + { + "epoch": 6.3045738045738045, + "grad_norm": 0.017066167667508125, + "learning_rate": 1.8075065215335525e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2310336, + "step": 12130 + }, + { + "epoch": 6.307172557172557, + "grad_norm": 0.09456293284893036, + "learning_rate": 1.8053277063305456e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2311264, + "step": 12135 + }, + { + "epoch": 6.3097713097713095, + "grad_norm": 0.0007974709733389318, + "learning_rate": 1.803149462770089e-05, + "loss": 0.0, + "num_input_tokens_seen": 2312160, + "step": 12140 + }, + { + "epoch": 6.3123700623700625, + "grad_norm": 0.025370746850967407, + "learning_rate": 1.8009717926446492e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2313120, + "step": 12145 + }, + { + "epoch": 6.314968814968815, + "grad_norm": 0.0002014455385506153, + "learning_rate": 1.7987946977462194e-05, + "loss": 0.132, + "num_input_tokens_seen": 2314112, + "step": 12150 + }, + { + "epoch": 6.3175675675675675, + "grad_norm": 0.03955419361591339, + "learning_rate": 1.7966181798663218e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2315104, + "step": 12155 + }, + { + "epoch": 6.3201663201663205, + "grad_norm": 0.06406331062316895, + "learning_rate": 1.794442240796002e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2315968, + "step": 12160 + }, + { + "epoch": 6.322765072765073, + "grad_norm": 9.771114855539054e-05, + "learning_rate": 1.7922668823258304e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2316992, + "step": 12165 + }, + { + "epoch": 6.325363825363826, + "grad_norm": 0.0004625493020284921, + "learning_rate": 1.790092106245899e-05, + "loss": 0.1279, + "num_input_tokens_seen": 2317856, + "step": 12170 + }, + { + "epoch": 6.327962577962578, + "grad_norm": 0.01274019479751587, + "learning_rate": 1.7879179143458212e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2318784, + "step": 12175 + }, + { + "epoch": 6.330561330561331, + "grad_norm": 0.0016582147218286991, + "learning_rate": 1.7857443084147296e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2319712, + "step": 12180 + }, + { + "epoch": 6.333160083160083, + "grad_norm": 0.019359640777111053, + "learning_rate": 1.7835712902412726e-05, + "loss": 0.0982, + "num_input_tokens_seen": 2320608, + "step": 12185 + }, + { + "epoch": 6.335758835758836, + "grad_norm": 0.020833995193243027, + "learning_rate": 1.7813988616136177e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2321600, + "step": 12190 + }, + { + "epoch": 6.338357588357589, + "grad_norm": 0.006148037500679493, + "learning_rate": 1.7792270243194452e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2322560, + "step": 12195 + }, + { + "epoch": 6.340956340956341, + "grad_norm": 0.025037923827767372, + "learning_rate": 1.7770557801459513e-05, + "loss": 0.002, + "num_input_tokens_seen": 2323520, + "step": 12200 + }, + { + "epoch": 6.343555093555094, + "grad_norm": 0.006901323329657316, + "learning_rate": 1.774885130879842e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2324480, + "step": 12205 + }, + { + "epoch": 6.346153846153846, + "grad_norm": 0.014518301002681255, + "learning_rate": 1.7727150783073352e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2325408, + "step": 12210 + }, + { + "epoch": 6.348752598752599, + "grad_norm": 0.01326845120638609, + "learning_rate": 1.7705456242141547e-05, + "loss": 0.0013, + "num_input_tokens_seen": 2326432, + "step": 12215 + }, + { + "epoch": 6.351351351351352, + "grad_norm": 0.00885369349271059, + "learning_rate": 1.7683767703855354e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2327392, + "step": 12220 + }, + { + "epoch": 6.353950103950104, + "grad_norm": 0.0027832582127302885, + "learning_rate": 1.7662085186062165e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2328288, + "step": 12225 + }, + { + "epoch": 6.356548856548857, + "grad_norm": 0.04846420884132385, + "learning_rate": 1.7640408706604422e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2329312, + "step": 12230 + }, + { + "epoch": 6.359147609147609, + "grad_norm": 0.004158699419349432, + "learning_rate": 1.7618738283319604e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2330272, + "step": 12235 + }, + { + "epoch": 6.361746361746362, + "grad_norm": 0.0649101510643959, + "learning_rate": 1.7597073934040193e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2331232, + "step": 12240 + }, + { + "epoch": 6.364345114345114, + "grad_norm": 0.07864724099636078, + "learning_rate": 1.7575415676593688e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2332160, + "step": 12245 + }, + { + "epoch": 6.366943866943867, + "grad_norm": 0.0063788085244596004, + "learning_rate": 1.7553763528802554e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2333088, + "step": 12250 + }, + { + "epoch": 6.36954261954262, + "grad_norm": 0.011852026917040348, + "learning_rate": 1.7532117508484243e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2334016, + "step": 12255 + }, + { + "epoch": 6.372141372141372, + "grad_norm": 0.0052099404856562614, + "learning_rate": 1.7510477633451172e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2334944, + "step": 12260 + }, + { + "epoch": 6.374740124740125, + "grad_norm": 0.007831376977264881, + "learning_rate": 1.748884392151069e-05, + "loss": 0.0034, + "num_input_tokens_seen": 2335904, + "step": 12265 + }, + { + "epoch": 6.377338877338877, + "grad_norm": 0.00671012420207262, + "learning_rate": 1.746721639046507e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2336832, + "step": 12270 + }, + { + "epoch": 6.37993762993763, + "grad_norm": 0.19956301152706146, + "learning_rate": 1.744559505811152e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2337856, + "step": 12275 + }, + { + "epoch": 6.382536382536383, + "grad_norm": 0.0025789805222302675, + "learning_rate": 1.742397994224211e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2338816, + "step": 12280 + }, + { + "epoch": 6.385135135135135, + "grad_norm": 0.00026789132971316576, + "learning_rate": 1.740237106064383e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2339808, + "step": 12285 + }, + { + "epoch": 6.387733887733888, + "grad_norm": 0.0052073062397539616, + "learning_rate": 1.7380768431098527e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2340864, + "step": 12290 + }, + { + "epoch": 6.39033264033264, + "grad_norm": 0.0003412259102333337, + "learning_rate": 1.7359172071382897e-05, + "loss": 0.0064, + "num_input_tokens_seen": 2341824, + "step": 12295 + }, + { + "epoch": 6.392931392931393, + "grad_norm": 0.002365178195759654, + "learning_rate": 1.733758199926849e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2342784, + "step": 12300 + }, + { + "epoch": 6.395530145530145, + "grad_norm": 0.0009039189317263663, + "learning_rate": 1.731599823252167e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2343744, + "step": 12305 + }, + { + "epoch": 6.398128898128898, + "grad_norm": 0.008639713749289513, + "learning_rate": 1.7294420788903627e-05, + "loss": 0.0025, + "num_input_tokens_seen": 2344768, + "step": 12310 + }, + { + "epoch": 6.400727650727651, + "grad_norm": 0.003655686043202877, + "learning_rate": 1.7272849686170314e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2345696, + "step": 12315 + }, + { + "epoch": 6.403326403326403, + "grad_norm": 0.003607349004596472, + "learning_rate": 1.72512849420725e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2346656, + "step": 12320 + }, + { + "epoch": 6.405925155925156, + "grad_norm": 0.0022320770658552647, + "learning_rate": 1.722972657435572e-05, + "loss": 0.1379, + "num_input_tokens_seen": 2347616, + "step": 12325 + }, + { + "epoch": 6.408523908523908, + "grad_norm": 9.913567191688344e-05, + "learning_rate": 1.7208174600760247e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2348608, + "step": 12330 + }, + { + "epoch": 6.411122661122661, + "grad_norm": 0.016457611694931984, + "learning_rate": 1.7186629039021102e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2349568, + "step": 12335 + }, + { + "epoch": 6.413721413721413, + "grad_norm": 0.023375093936920166, + "learning_rate": 1.7165089906868028e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2350496, + "step": 12340 + }, + { + "epoch": 6.416320166320166, + "grad_norm": 0.005520991515368223, + "learning_rate": 1.714355722202546e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2351488, + "step": 12345 + }, + { + "epoch": 6.418918918918919, + "grad_norm": 0.004843624774366617, + "learning_rate": 1.7122031002212556e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2352480, + "step": 12350 + }, + { + "epoch": 6.421517671517671, + "grad_norm": 0.01042421255260706, + "learning_rate": 1.7100511265143132e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2353408, + "step": 12355 + }, + { + "epoch": 6.424116424116424, + "grad_norm": 0.000670810928568244, + "learning_rate": 1.707899802852569e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2354304, + "step": 12360 + }, + { + "epoch": 6.4267151767151764, + "grad_norm": 0.009965123608708382, + "learning_rate": 1.7057491310063355e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2355264, + "step": 12365 + }, + { + "epoch": 6.429313929313929, + "grad_norm": 0.0015200711786746979, + "learning_rate": 1.703599112745392e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2356128, + "step": 12370 + }, + { + "epoch": 6.4319126819126815, + "grad_norm": 0.00199409993365407, + "learning_rate": 1.701449749838978e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2357120, + "step": 12375 + }, + { + "epoch": 6.4345114345114345, + "grad_norm": 0.00797036662697792, + "learning_rate": 1.699301044055793e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2358112, + "step": 12380 + }, + { + "epoch": 6.4371101871101875, + "grad_norm": 0.018261296674609184, + "learning_rate": 1.6971529971639975e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2359040, + "step": 12385 + }, + { + "epoch": 6.4397089397089395, + "grad_norm": 0.0009258723584935069, + "learning_rate": 1.6950056109312097e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2360000, + "step": 12390 + }, + { + "epoch": 6.4423076923076925, + "grad_norm": 0.005378162022680044, + "learning_rate": 1.692858887124503e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2360864, + "step": 12395 + }, + { + "epoch": 6.444906444906445, + "grad_norm": 0.0006830400670878589, + "learning_rate": 1.6907128275104063e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2361792, + "step": 12400 + }, + { + "epoch": 6.447505197505198, + "grad_norm": 0.031222907826304436, + "learning_rate": 1.6885674338549025e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2362816, + "step": 12405 + }, + { + "epoch": 6.45010395010395, + "grad_norm": 0.0027423426508903503, + "learning_rate": 1.686422707923425e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2363776, + "step": 12410 + }, + { + "epoch": 6.452702702702703, + "grad_norm": 0.03047732636332512, + "learning_rate": 1.6842786514808593e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2364672, + "step": 12415 + }, + { + "epoch": 6.455301455301456, + "grad_norm": 0.004915900062769651, + "learning_rate": 1.6821352662915388e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2365664, + "step": 12420 + }, + { + "epoch": 6.457900207900208, + "grad_norm": 0.0020419603679329157, + "learning_rate": 1.6799925541192454e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2366592, + "step": 12425 + }, + { + "epoch": 6.460498960498961, + "grad_norm": 0.005769755691289902, + "learning_rate": 1.677850516727207e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2367552, + "step": 12430 + }, + { + "epoch": 6.463097713097713, + "grad_norm": 0.00016632348706480116, + "learning_rate": 1.6757091558780955e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2368480, + "step": 12435 + }, + { + "epoch": 6.465696465696466, + "grad_norm": 0.0026477964129298925, + "learning_rate": 1.6735684733340278e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2369472, + "step": 12440 + }, + { + "epoch": 6.468295218295219, + "grad_norm": 0.006983248051255941, + "learning_rate": 1.6714284708565598e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2370432, + "step": 12445 + }, + { + "epoch": 6.470893970893971, + "grad_norm": 0.01558666117489338, + "learning_rate": 1.6692891502066903e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2371424, + "step": 12450 + }, + { + "epoch": 6.473492723492724, + "grad_norm": 0.006130000576376915, + "learning_rate": 1.667150513144856e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2372352, + "step": 12455 + }, + { + "epoch": 6.476091476091476, + "grad_norm": 0.005277466960251331, + "learning_rate": 1.6650125614309314e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2373344, + "step": 12460 + }, + { + "epoch": 6.478690228690229, + "grad_norm": 0.0015007993206381798, + "learning_rate": 1.6628752968242272e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2374304, + "step": 12465 + }, + { + "epoch": 6.481288981288981, + "grad_norm": 0.002287114504724741, + "learning_rate": 1.6607387210834887e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2375264, + "step": 12470 + }, + { + "epoch": 6.483887733887734, + "grad_norm": 0.00013340837904252112, + "learning_rate": 1.6586028359668922e-05, + "loss": 0.0, + "num_input_tokens_seen": 2376160, + "step": 12475 + }, + { + "epoch": 6.486486486486487, + "grad_norm": 3.9070329666137695, + "learning_rate": 1.6564676432320485e-05, + "loss": 0.0974, + "num_input_tokens_seen": 2377088, + "step": 12480 + }, + { + "epoch": 6.489085239085239, + "grad_norm": 0.007694104686379433, + "learning_rate": 1.6543331446359976e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2378048, + "step": 12485 + }, + { + "epoch": 6.491683991683992, + "grad_norm": 0.004184362478554249, + "learning_rate": 1.652199341935209e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2378944, + "step": 12490 + }, + { + "epoch": 6.494282744282744, + "grad_norm": 0.010392150841653347, + "learning_rate": 1.6500662368855776e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2379936, + "step": 12495 + }, + { + "epoch": 6.496881496881497, + "grad_norm": 0.4244946241378784, + "learning_rate": 1.6479338312424258e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2380864, + "step": 12500 + }, + { + "epoch": 6.49948024948025, + "grad_norm": 0.005007464904338121, + "learning_rate": 1.6458021267605018e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2381824, + "step": 12505 + }, + { + "epoch": 6.5, + "eval_loss": 0.31930992007255554, + "eval_runtime": 9.2407, + "eval_samples_per_second": 92.634, + "eval_steps_per_second": 23.159, + "num_input_tokens_seen": 2382016, + "step": 12506 + }, + { + "epoch": 6.502079002079002, + "grad_norm": 0.002529170596972108, + "learning_rate": 1.643671125193973e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2382784, + "step": 12510 + }, + { + "epoch": 6.504677754677755, + "grad_norm": 0.002679921919479966, + "learning_rate": 1.6415408282964313e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2383680, + "step": 12515 + }, + { + "epoch": 6.507276507276507, + "grad_norm": 0.00015318425721488893, + "learning_rate": 1.6394112378208877e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2384704, + "step": 12520 + }, + { + "epoch": 6.50987525987526, + "grad_norm": 0.009570203721523285, + "learning_rate": 1.6372823555197726e-05, + "loss": 0.0654, + "num_input_tokens_seen": 2385664, + "step": 12525 + }, + { + "epoch": 6.512474012474012, + "grad_norm": 0.0016874050488695502, + "learning_rate": 1.635154183144933e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2386624, + "step": 12530 + }, + { + "epoch": 6.515072765072765, + "grad_norm": 0.00012319424422457814, + "learning_rate": 1.6330267224476326e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2387616, + "step": 12535 + }, + { + "epoch": 6.517671517671518, + "grad_norm": 0.009627056308090687, + "learning_rate": 1.630899975178547e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2388576, + "step": 12540 + }, + { + "epoch": 6.52027027027027, + "grad_norm": 0.0014320805203169584, + "learning_rate": 1.628773943087768e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2389504, + "step": 12545 + }, + { + "epoch": 6.522869022869023, + "grad_norm": 0.2912241816520691, + "learning_rate": 1.6266486279247968e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2390464, + "step": 12550 + }, + { + "epoch": 6.525467775467775, + "grad_norm": 0.000900813436601311, + "learning_rate": 1.6245240314385458e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2391456, + "step": 12555 + }, + { + "epoch": 6.528066528066528, + "grad_norm": 0.0009099690360017121, + "learning_rate": 1.6224001553773345e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2392352, + "step": 12560 + }, + { + "epoch": 6.53066528066528, + "grad_norm": 0.0008477139635942876, + "learning_rate": 1.6202770014888906e-05, + "loss": 0.0, + "num_input_tokens_seen": 2393312, + "step": 12565 + }, + { + "epoch": 6.533264033264033, + "grad_norm": 90.66512298583984, + "learning_rate": 1.6181545715203488e-05, + "loss": 0.0683, + "num_input_tokens_seen": 2394240, + "step": 12570 + }, + { + "epoch": 6.535862785862786, + "grad_norm": 0.15961585938930511, + "learning_rate": 1.6160328672182445e-05, + "loss": 0.0378, + "num_input_tokens_seen": 2395200, + "step": 12575 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 0.0053822947666049, + "learning_rate": 1.61391189032852e-05, + "loss": 0.0, + "num_input_tokens_seen": 2396160, + "step": 12580 + }, + { + "epoch": 6.541060291060291, + "grad_norm": 0.00167519086971879, + "learning_rate": 1.611791642596516e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2397056, + "step": 12585 + }, + { + "epoch": 6.543659043659043, + "grad_norm": 0.004004520829766989, + "learning_rate": 1.609672125766975e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2398048, + "step": 12590 + }, + { + "epoch": 6.546257796257796, + "grad_norm": 0.00023219850845634937, + "learning_rate": 1.6075533415840372e-05, + "loss": 0.0841, + "num_input_tokens_seen": 2398976, + "step": 12595 + }, + { + "epoch": 6.548856548856548, + "grad_norm": 0.014428077265620232, + "learning_rate": 1.60543529179124e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2399936, + "step": 12600 + }, + { + "epoch": 6.551455301455301, + "grad_norm": 16.43584442138672, + "learning_rate": 1.6033179781315178e-05, + "loss": 0.1507, + "num_input_tokens_seen": 2400928, + "step": 12605 + }, + { + "epoch": 6.554054054054054, + "grad_norm": 0.0025581230875104666, + "learning_rate": 1.6012014023471954e-05, + "loss": 0.1197, + "num_input_tokens_seen": 2401856, + "step": 12610 + }, + { + "epoch": 6.5566528066528065, + "grad_norm": 0.0059363883920013905, + "learning_rate": 1.5990855661799947e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2402848, + "step": 12615 + }, + { + "epoch": 6.5592515592515594, + "grad_norm": 0.005142976529896259, + "learning_rate": 1.5969704713710275e-05, + "loss": 0.0032, + "num_input_tokens_seen": 2403776, + "step": 12620 + }, + { + "epoch": 6.5618503118503115, + "grad_norm": 0.021961726248264313, + "learning_rate": 1.594856119660794e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2404672, + "step": 12625 + }, + { + "epoch": 6.5644490644490645, + "grad_norm": 0.006038930732756853, + "learning_rate": 1.5927425127891856e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2405600, + "step": 12630 + }, + { + "epoch": 6.567047817047817, + "grad_norm": 0.009327318519353867, + "learning_rate": 1.59062965249548e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2406528, + "step": 12635 + }, + { + "epoch": 6.56964656964657, + "grad_norm": 0.003749291179701686, + "learning_rate": 1.588517540518338e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2407488, + "step": 12640 + }, + { + "epoch": 6.5722453222453225, + "grad_norm": 0.0299542173743248, + "learning_rate": 1.5864061785958076e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2408480, + "step": 12645 + }, + { + "epoch": 6.574844074844075, + "grad_norm": 0.00836177822202444, + "learning_rate": 1.584295568465318e-05, + "loss": 0.0949, + "num_input_tokens_seen": 2409408, + "step": 12650 + }, + { + "epoch": 6.577442827442828, + "grad_norm": 0.036507751792669296, + "learning_rate": 1.582185711863681e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2410368, + "step": 12655 + }, + { + "epoch": 6.58004158004158, + "grad_norm": 0.14814971387386322, + "learning_rate": 1.5800766105270877e-05, + "loss": 0.0522, + "num_input_tokens_seen": 2411328, + "step": 12660 + }, + { + "epoch": 6.582640332640333, + "grad_norm": 0.022365348413586617, + "learning_rate": 1.5779682661911072e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2412256, + "step": 12665 + }, + { + "epoch": 6.585239085239085, + "grad_norm": 0.00509367510676384, + "learning_rate": 1.5758606805906867e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2413248, + "step": 12670 + }, + { + "epoch": 6.587837837837838, + "grad_norm": 0.00047958147479221225, + "learning_rate": 1.5737538554601473e-05, + "loss": 0.001, + "num_input_tokens_seen": 2414208, + "step": 12675 + }, + { + "epoch": 6.590436590436591, + "grad_norm": 0.0019494243897497654, + "learning_rate": 1.571647792533186e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2415104, + "step": 12680 + }, + { + "epoch": 6.593035343035343, + "grad_norm": 0.0015920170117169619, + "learning_rate": 1.569542493542872e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2416096, + "step": 12685 + }, + { + "epoch": 6.595634095634096, + "grad_norm": 0.00019599735969677567, + "learning_rate": 1.5674379602216464e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2417120, + "step": 12690 + }, + { + "epoch": 6.598232848232849, + "grad_norm": 0.0005842869868502021, + "learning_rate": 1.5653341943013195e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2418048, + "step": 12695 + }, + { + "epoch": 6.600831600831601, + "grad_norm": 0.04121076688170433, + "learning_rate": 1.5632311975130705e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2418976, + "step": 12700 + }, + { + "epoch": 6.603430353430354, + "grad_norm": 0.008222920820116997, + "learning_rate": 1.5611289715874443e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2419904, + "step": 12705 + }, + { + "epoch": 6.606029106029106, + "grad_norm": 0.0029375895392149687, + "learning_rate": 1.559027518254354e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2420864, + "step": 12710 + }, + { + "epoch": 6.608627858627859, + "grad_norm": 0.003621939104050398, + "learning_rate": 1.5569268392430753e-05, + "loss": 0.0, + "num_input_tokens_seen": 2421824, + "step": 12715 + }, + { + "epoch": 6.611226611226611, + "grad_norm": 0.0028584268875420094, + "learning_rate": 1.554826936282247e-05, + "loss": 0.1501, + "num_input_tokens_seen": 2422720, + "step": 12720 + }, + { + "epoch": 6.613825363825364, + "grad_norm": 0.002439255127683282, + "learning_rate": 1.552727811099869e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2423712, + "step": 12725 + }, + { + "epoch": 6.616424116424117, + "grad_norm": 0.003697262844070792, + "learning_rate": 1.5506294654233023e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2424608, + "step": 12730 + }, + { + "epoch": 6.619022869022869, + "grad_norm": 22.047346115112305, + "learning_rate": 1.548531900979266e-05, + "loss": 0.0079, + "num_input_tokens_seen": 2425568, + "step": 12735 + }, + { + "epoch": 6.621621621621622, + "grad_norm": 0.005373975727707148, + "learning_rate": 1.5464351194938337e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2426592, + "step": 12740 + }, + { + "epoch": 6.624220374220374, + "grad_norm": 0.002409894485026598, + "learning_rate": 1.5443391226924386e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2427520, + "step": 12745 + }, + { + "epoch": 6.626819126819127, + "grad_norm": 0.0012632885482162237, + "learning_rate": 1.542243912299866e-05, + "loss": 0.0139, + "num_input_tokens_seen": 2428512, + "step": 12750 + }, + { + "epoch": 6.629417879417879, + "grad_norm": 0.0014791241846978664, + "learning_rate": 1.5401494900402557e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2429440, + "step": 12755 + }, + { + "epoch": 6.632016632016632, + "grad_norm": 0.003470023162662983, + "learning_rate": 1.538055857637097e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2430368, + "step": 12760 + }, + { + "epoch": 6.634615384615385, + "grad_norm": 0.00545232929289341, + "learning_rate": 1.5359630168132316e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2431296, + "step": 12765 + }, + { + "epoch": 6.637214137214137, + "grad_norm": 0.5168865919113159, + "learning_rate": 1.5338709692908456e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2432192, + "step": 12770 + }, + { + "epoch": 6.63981288981289, + "grad_norm": 0.23477980494499207, + "learning_rate": 1.5317797167914767e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2433216, + "step": 12775 + }, + { + "epoch": 6.642411642411642, + "grad_norm": 0.0037856344133615494, + "learning_rate": 1.5296892610360064e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2434144, + "step": 12780 + }, + { + "epoch": 6.645010395010395, + "grad_norm": 0.0013814216945320368, + "learning_rate": 1.527599603744661e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2435040, + "step": 12785 + }, + { + "epoch": 6.647609147609147, + "grad_norm": 16.09920883178711, + "learning_rate": 1.52551074663701e-05, + "loss": 0.0639, + "num_input_tokens_seen": 2436032, + "step": 12790 + }, + { + "epoch": 6.6502079002079, + "grad_norm": 0.0011928207240998745, + "learning_rate": 1.5234226914319632e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2436960, + "step": 12795 + }, + { + "epoch": 6.652806652806653, + "grad_norm": 0.011359861120581627, + "learning_rate": 1.5213354398477722e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2437856, + "step": 12800 + }, + { + "epoch": 6.655405405405405, + "grad_norm": 0.007721046917140484, + "learning_rate": 1.5192489936020257e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2438880, + "step": 12805 + }, + { + "epoch": 6.658004158004158, + "grad_norm": 0.00014466102584265172, + "learning_rate": 1.5171633544116509e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2439776, + "step": 12810 + }, + { + "epoch": 6.66060291060291, + "grad_norm": 0.005483611952513456, + "learning_rate": 1.5150785239929102e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2440768, + "step": 12815 + }, + { + "epoch": 6.663201663201663, + "grad_norm": 0.0020803455263376236, + "learning_rate": 1.512994504061401e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2441632, + "step": 12820 + }, + { + "epoch": 6.665800415800415, + "grad_norm": 0.010908998548984528, + "learning_rate": 1.5109112963320532e-05, + "loss": 0.0756, + "num_input_tokens_seen": 2442688, + "step": 12825 + }, + { + "epoch": 6.668399168399168, + "grad_norm": 0.004457351751625538, + "learning_rate": 1.5088289025191293e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2443616, + "step": 12830 + }, + { + "epoch": 6.670997920997921, + "grad_norm": 0.0023642873857170343, + "learning_rate": 1.5067473243362204e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2444512, + "step": 12835 + }, + { + "epoch": 6.673596673596673, + "grad_norm": 0.0004894359735772014, + "learning_rate": 1.5046665634962476e-05, + "loss": 0.0, + "num_input_tokens_seen": 2445472, + "step": 12840 + }, + { + "epoch": 6.676195426195426, + "grad_norm": 0.0006149965920485556, + "learning_rate": 1.5025866217114592e-05, + "loss": 0.0, + "num_input_tokens_seen": 2446432, + "step": 12845 + }, + { + "epoch": 6.6787941787941785, + "grad_norm": 0.00026271960814483464, + "learning_rate": 1.50050750069343e-05, + "loss": 0.0, + "num_input_tokens_seen": 2447392, + "step": 12850 + }, + { + "epoch": 6.6813929313929314, + "grad_norm": 0.001032044761814177, + "learning_rate": 1.4984292021530578e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2448352, + "step": 12855 + }, + { + "epoch": 6.6839916839916835, + "grad_norm": 0.004942056257277727, + "learning_rate": 1.4963517278005656e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2449280, + "step": 12860 + }, + { + "epoch": 6.6865904365904365, + "grad_norm": 0.0001683609007159248, + "learning_rate": 1.494275079345498e-05, + "loss": 0.0, + "num_input_tokens_seen": 2450176, + "step": 12865 + }, + { + "epoch": 6.6891891891891895, + "grad_norm": 0.0029358493629842997, + "learning_rate": 1.492199258496717e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2451104, + "step": 12870 + }, + { + "epoch": 6.691787941787942, + "grad_norm": 0.005973389837890863, + "learning_rate": 1.4901242669624065e-05, + "loss": 0.0, + "num_input_tokens_seen": 2452160, + "step": 12875 + }, + { + "epoch": 6.6943866943866945, + "grad_norm": 0.0016557464841753244, + "learning_rate": 1.488050106450068e-05, + "loss": 0.0, + "num_input_tokens_seen": 2453056, + "step": 12880 + }, + { + "epoch": 6.696985446985447, + "grad_norm": 0.0007877358584664762, + "learning_rate": 1.4859767786665183e-05, + "loss": 0.0, + "num_input_tokens_seen": 2454016, + "step": 12885 + }, + { + "epoch": 6.6995841995842, + "grad_norm": 0.002971251495182514, + "learning_rate": 1.4839042853178886e-05, + "loss": 0.0, + "num_input_tokens_seen": 2454976, + "step": 12890 + }, + { + "epoch": 6.702182952182953, + "grad_norm": 0.007896228693425655, + "learning_rate": 1.481832628109625e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2455936, + "step": 12895 + }, + { + "epoch": 6.704781704781705, + "grad_norm": 0.0002266426890855655, + "learning_rate": 1.4797618087464827e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2456864, + "step": 12900 + }, + { + "epoch": 6.707380457380458, + "grad_norm": 7.22310142009519e-05, + "learning_rate": 1.4776918289325298e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2457824, + "step": 12905 + }, + { + "epoch": 6.70997920997921, + "grad_norm": 0.003973369486629963, + "learning_rate": 1.4756226903711429e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2458720, + "step": 12910 + }, + { + "epoch": 6.712577962577963, + "grad_norm": 0.0007414593710564077, + "learning_rate": 1.4735543947650066e-05, + "loss": 0.1501, + "num_input_tokens_seen": 2459648, + "step": 12915 + }, + { + "epoch": 6.715176715176716, + "grad_norm": 0.0006190445856191218, + "learning_rate": 1.4714869438161116e-05, + "loss": 0.1141, + "num_input_tokens_seen": 2460576, + "step": 12920 + }, + { + "epoch": 6.717775467775468, + "grad_norm": 0.005787239409983158, + "learning_rate": 1.4694203392257536e-05, + "loss": 0.0, + "num_input_tokens_seen": 2461536, + "step": 12925 + }, + { + "epoch": 6.720374220374221, + "grad_norm": 0.0007566649001091719, + "learning_rate": 1.467354582694532e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2462464, + "step": 12930 + }, + { + "epoch": 6.722972972972973, + "grad_norm": 0.0035467008128762245, + "learning_rate": 1.4652896759223472e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2463360, + "step": 12935 + }, + { + "epoch": 6.725571725571726, + "grad_norm": 0.00022078322945162654, + "learning_rate": 1.4632256206084016e-05, + "loss": 0.0, + "num_input_tokens_seen": 2464320, + "step": 12940 + }, + { + "epoch": 6.728170478170478, + "grad_norm": 0.0013832544209435582, + "learning_rate": 1.4611624184511968e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2465344, + "step": 12945 + }, + { + "epoch": 6.730769230769231, + "grad_norm": 0.017519468441605568, + "learning_rate": 1.4591000711485314e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2466272, + "step": 12950 + }, + { + "epoch": 6.733367983367984, + "grad_norm": 0.008182402700185776, + "learning_rate": 1.4570385803975031e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2467264, + "step": 12955 + }, + { + "epoch": 6.735966735966736, + "grad_norm": 0.0012461987789720297, + "learning_rate": 1.4549779478945005e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2468224, + "step": 12960 + }, + { + "epoch": 6.738565488565489, + "grad_norm": 0.04581489786505699, + "learning_rate": 1.4529181753352117e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2469152, + "step": 12965 + }, + { + "epoch": 6.741164241164241, + "grad_norm": 0.0020574089139699936, + "learning_rate": 1.4508592644146093e-05, + "loss": 0.0, + "num_input_tokens_seen": 2470144, + "step": 12970 + }, + { + "epoch": 6.743762993762994, + "grad_norm": 0.002532975748181343, + "learning_rate": 1.448801216826965e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2471104, + "step": 12975 + }, + { + "epoch": 6.746361746361746, + "grad_norm": 0.0011549469782039523, + "learning_rate": 1.446744034265834e-05, + "loss": 0.1113, + "num_input_tokens_seen": 2472064, + "step": 12980 + }, + { + "epoch": 6.748960498960499, + "grad_norm": 0.0006858112756162882, + "learning_rate": 1.4446877184240643e-05, + "loss": 0.0, + "num_input_tokens_seen": 2472992, + "step": 12985 + }, + { + "epoch": 6.751559251559252, + "grad_norm": 0.013494965620338917, + "learning_rate": 1.4426322709937862e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2473856, + "step": 12990 + }, + { + "epoch": 6.754158004158004, + "grad_norm": 0.0032028469722718, + "learning_rate": 1.4405776936664203e-05, + "loss": 0.0, + "num_input_tokens_seen": 2474848, + "step": 12995 + }, + { + "epoch": 6.756756756756757, + "grad_norm": 0.09234325587749481, + "learning_rate": 1.4385239881326673e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2475776, + "step": 13000 + }, + { + "epoch": 6.759355509355509, + "grad_norm": 0.0004223252472002059, + "learning_rate": 1.4364711560825104e-05, + "loss": 0.0, + "num_input_tokens_seen": 2476672, + "step": 13005 + }, + { + "epoch": 6.761954261954262, + "grad_norm": 0.005754341371357441, + "learning_rate": 1.4344191992052178e-05, + "loss": 0.0, + "num_input_tokens_seen": 2477632, + "step": 13010 + }, + { + "epoch": 6.764553014553014, + "grad_norm": 0.0007722730515524745, + "learning_rate": 1.4323681191893328e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2478592, + "step": 13015 + }, + { + "epoch": 6.767151767151767, + "grad_norm": 0.002199694747105241, + "learning_rate": 1.4303179177226823e-05, + "loss": 0.0, + "num_input_tokens_seen": 2479584, + "step": 13020 + }, + { + "epoch": 6.76975051975052, + "grad_norm": 0.0009386177989654243, + "learning_rate": 1.4282685964923642e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2480544, + "step": 13025 + }, + { + "epoch": 6.772349272349272, + "grad_norm": 0.0005521223647519946, + "learning_rate": 1.4262201571847584e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2481536, + "step": 13030 + }, + { + "epoch": 6.774948024948025, + "grad_norm": 0.002831099322065711, + "learning_rate": 1.4241726014855139e-05, + "loss": 0.0, + "num_input_tokens_seen": 2482464, + "step": 13035 + }, + { + "epoch": 6.777546777546777, + "grad_norm": 0.00032913361792452633, + "learning_rate": 1.4221259310795543e-05, + "loss": 0.0548, + "num_input_tokens_seen": 2483360, + "step": 13040 + }, + { + "epoch": 6.78014553014553, + "grad_norm": 0.00030409093596972525, + "learning_rate": 1.4200801476510767e-05, + "loss": 0.0, + "num_input_tokens_seen": 2484352, + "step": 13045 + }, + { + "epoch": 6.782744282744282, + "grad_norm": 0.0001920805952977389, + "learning_rate": 1.418035252883545e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2485248, + "step": 13050 + }, + { + "epoch": 6.785343035343035, + "grad_norm": 0.0003662325907498598, + "learning_rate": 1.4159912484596949e-05, + "loss": 0.0, + "num_input_tokens_seen": 2486208, + "step": 13055 + }, + { + "epoch": 6.787941787941788, + "grad_norm": 0.012868017889559269, + "learning_rate": 1.4139481360615275e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2487104, + "step": 13060 + }, + { + "epoch": 6.79054054054054, + "grad_norm": 0.00026481665554456413, + "learning_rate": 1.4119059173703089e-05, + "loss": 0.0, + "num_input_tokens_seen": 2488032, + "step": 13065 + }, + { + "epoch": 6.793139293139293, + "grad_norm": 0.0004969595465809107, + "learning_rate": 1.4098645940665737e-05, + "loss": 0.0, + "num_input_tokens_seen": 2488992, + "step": 13070 + }, + { + "epoch": 6.795738045738045, + "grad_norm": 0.01155069563537836, + "learning_rate": 1.4078241678301146e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2489984, + "step": 13075 + }, + { + "epoch": 6.798336798336798, + "grad_norm": 0.0011090869084000587, + "learning_rate": 1.4057846403399912e-05, + "loss": 0.0, + "num_input_tokens_seen": 2490944, + "step": 13080 + }, + { + "epoch": 6.8009355509355505, + "grad_norm": 0.0019129097927361727, + "learning_rate": 1.4037460132745189e-05, + "loss": 0.0, + "num_input_tokens_seen": 2491872, + "step": 13085 + }, + { + "epoch": 6.803534303534303, + "grad_norm": 0.0019737647380679846, + "learning_rate": 1.4017082883112764e-05, + "loss": 0.0, + "num_input_tokens_seen": 2492864, + "step": 13090 + }, + { + "epoch": 6.806133056133056, + "grad_norm": 0.004790612496435642, + "learning_rate": 1.3996714671270969e-05, + "loss": 0.0, + "num_input_tokens_seen": 2493792, + "step": 13095 + }, + { + "epoch": 6.8087318087318085, + "grad_norm": 0.0007098628557287157, + "learning_rate": 1.3976355513980708e-05, + "loss": 0.0, + "num_input_tokens_seen": 2494688, + "step": 13100 + }, + { + "epoch": 6.8113305613305615, + "grad_norm": 0.00028467652737163007, + "learning_rate": 1.3956005427995421e-05, + "loss": 0.0573, + "num_input_tokens_seen": 2495584, + "step": 13105 + }, + { + "epoch": 6.813929313929314, + "grad_norm": 0.0013160357484593987, + "learning_rate": 1.3935664430061129e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2496544, + "step": 13110 + }, + { + "epoch": 6.8165280665280665, + "grad_norm": 0.01029721274971962, + "learning_rate": 1.3915332536916314e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2497536, + "step": 13115 + }, + { + "epoch": 6.8191268191268195, + "grad_norm": 0.0032611044589430094, + "learning_rate": 1.3895009765292011e-05, + "loss": 0.0, + "num_input_tokens_seen": 2498496, + "step": 13120 + }, + { + "epoch": 6.821725571725572, + "grad_norm": 0.0010281519498676062, + "learning_rate": 1.3874696131911746e-05, + "loss": 0.0, + "num_input_tokens_seen": 2499424, + "step": 13125 + }, + { + "epoch": 6.824324324324325, + "grad_norm": 0.005445382092148066, + "learning_rate": 1.3854391653491478e-05, + "loss": 0.1652, + "num_input_tokens_seen": 2500384, + "step": 13130 + }, + { + "epoch": 6.826923076923077, + "grad_norm": 8.454909402644262e-05, + "learning_rate": 1.3834096346739689e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2501344, + "step": 13135 + }, + { + "epoch": 6.82952182952183, + "grad_norm": 8.141190483001992e-05, + "learning_rate": 1.3813810228357283e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2502272, + "step": 13140 + }, + { + "epoch": 6.832120582120583, + "grad_norm": 0.00626450264826417, + "learning_rate": 1.3793533315037616e-05, + "loss": 0.0012, + "num_input_tokens_seen": 2503232, + "step": 13145 + }, + { + "epoch": 6.834719334719335, + "grad_norm": 0.0006871515652164817, + "learning_rate": 1.3773265623466458e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2504192, + "step": 13150 + }, + { + "epoch": 6.837318087318088, + "grad_norm": 0.0021289412397891283, + "learning_rate": 1.3753007170322008e-05, + "loss": 0.0011, + "num_input_tokens_seen": 2505216, + "step": 13155 + }, + { + "epoch": 6.83991683991684, + "grad_norm": 0.006503810174763203, + "learning_rate": 1.3732757972274845e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2506144, + "step": 13160 + }, + { + "epoch": 6.842515592515593, + "grad_norm": 0.0014491339679807425, + "learning_rate": 1.371251804598793e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2507072, + "step": 13165 + }, + { + "epoch": 6.845114345114345, + "grad_norm": 0.0028269411996006966, + "learning_rate": 1.3692287408116617e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2508000, + "step": 13170 + }, + { + "epoch": 6.847713097713098, + "grad_norm": 0.12493753433227539, + "learning_rate": 1.3672066075308587e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2508960, + "step": 13175 + }, + { + "epoch": 6.850311850311851, + "grad_norm": 0.006738223601132631, + "learning_rate": 1.3651854064203901e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2509888, + "step": 13180 + }, + { + "epoch": 6.852910602910603, + "grad_norm": 0.0025139222852885723, + "learning_rate": 1.3631651391434902e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2510848, + "step": 13185 + }, + { + "epoch": 6.855509355509356, + "grad_norm": 0.003518481506034732, + "learning_rate": 1.3611458073626293e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2511840, + "step": 13190 + }, + { + "epoch": 6.858108108108108, + "grad_norm": 0.04295013099908829, + "learning_rate": 1.359127412739506e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2512736, + "step": 13195 + }, + { + "epoch": 6.860706860706861, + "grad_norm": 0.002034808974713087, + "learning_rate": 1.3571099569350456e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2513696, + "step": 13200 + }, + { + "epoch": 6.863305613305613, + "grad_norm": 0.00017582399595994502, + "learning_rate": 1.3550934416094058e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2514688, + "step": 13205 + }, + { + "epoch": 6.865904365904366, + "grad_norm": 0.003141609486192465, + "learning_rate": 1.3530778684219648e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2515584, + "step": 13210 + }, + { + "epoch": 6.868503118503119, + "grad_norm": 0.0035159343387931585, + "learning_rate": 1.3510632390313307e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2516544, + "step": 13215 + }, + { + "epoch": 6.871101871101871, + "grad_norm": 0.005057253874838352, + "learning_rate": 1.3490495550953303e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2517536, + "step": 13220 + }, + { + "epoch": 6.873700623700624, + "grad_norm": 0.006594269536435604, + "learning_rate": 1.347036818271018e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2518496, + "step": 13225 + }, + { + "epoch": 6.876299376299376, + "grad_norm": 0.0025228543672710657, + "learning_rate": 1.345025030214661e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2519456, + "step": 13230 + }, + { + "epoch": 6.878898128898129, + "grad_norm": 0.005274635273963213, + "learning_rate": 1.3430141925817532e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2520448, + "step": 13235 + }, + { + "epoch": 6.881496881496881, + "grad_norm": 0.0005771980504505336, + "learning_rate": 1.3410043070270017e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2521344, + "step": 13240 + }, + { + "epoch": 6.884095634095634, + "grad_norm": 0.002194092608988285, + "learning_rate": 1.3389953752043327e-05, + "loss": 0.0, + "num_input_tokens_seen": 2522240, + "step": 13245 + }, + { + "epoch": 6.886694386694387, + "grad_norm": 0.002322620013728738, + "learning_rate": 1.3369873987668873e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2523232, + "step": 13250 + }, + { + "epoch": 6.889293139293139, + "grad_norm": 0.0026318246964365244, + "learning_rate": 1.3349803793670196e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2524224, + "step": 13255 + }, + { + "epoch": 6.891891891891892, + "grad_norm": 0.00022664808784611523, + "learning_rate": 1.332974318656296e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2525184, + "step": 13260 + }, + { + "epoch": 6.894490644490644, + "grad_norm": 0.0006457656272687018, + "learning_rate": 1.3309692182854932e-05, + "loss": 0.0042, + "num_input_tokens_seen": 2526144, + "step": 13265 + }, + { + "epoch": 6.897089397089397, + "grad_norm": 0.0015609192196279764, + "learning_rate": 1.328965079904601e-05, + "loss": 0.0, + "num_input_tokens_seen": 2527104, + "step": 13270 + }, + { + "epoch": 6.899688149688149, + "grad_norm": 0.005919781979173422, + "learning_rate": 1.3269619051628135e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2528096, + "step": 13275 + }, + { + "epoch": 6.902286902286902, + "grad_norm": 0.0011564719025045633, + "learning_rate": 1.3249596957085353e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2529088, + "step": 13280 + }, + { + "epoch": 6.904885654885655, + "grad_norm": 0.00890269037336111, + "learning_rate": 1.322958453189374e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2529952, + "step": 13285 + }, + { + "epoch": 6.907484407484407, + "grad_norm": 0.0013616286450996995, + "learning_rate": 1.3209581792521437e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2530912, + "step": 13290 + }, + { + "epoch": 6.91008316008316, + "grad_norm": 0.0005496891099028289, + "learning_rate": 1.3189588755428598e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2531808, + "step": 13295 + }, + { + "epoch": 6.912681912681912, + "grad_norm": 8.734856965020299e-05, + "learning_rate": 1.3169605437067387e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2532768, + "step": 13300 + }, + { + "epoch": 6.915280665280665, + "grad_norm": 0.004257217049598694, + "learning_rate": 1.3149631853882005e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2533760, + "step": 13305 + }, + { + "epoch": 6.917879417879417, + "grad_norm": 0.001160406623966992, + "learning_rate": 1.3129668022308598e-05, + "loss": 0.0927, + "num_input_tokens_seen": 2534688, + "step": 13310 + }, + { + "epoch": 6.92047817047817, + "grad_norm": 0.00349343940615654, + "learning_rate": 1.3109713958775327e-05, + "loss": 0.0, + "num_input_tokens_seen": 2535584, + "step": 13315 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 0.0038448397535830736, + "learning_rate": 1.3089769679702288e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2536480, + "step": 13320 + }, + { + "epoch": 6.925675675675675, + "grad_norm": 0.017534498125314713, + "learning_rate": 1.3069835201501526e-05, + "loss": 0.0515, + "num_input_tokens_seen": 2537440, + "step": 13325 + }, + { + "epoch": 6.928274428274428, + "grad_norm": 0.00200657919049263, + "learning_rate": 1.3049910540577046e-05, + "loss": 0.0, + "num_input_tokens_seen": 2538304, + "step": 13330 + }, + { + "epoch": 6.9308731808731805, + "grad_norm": 0.027653541415929794, + "learning_rate": 1.3029995713324738e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2539232, + "step": 13335 + }, + { + "epoch": 6.9334719334719335, + "grad_norm": 0.01078090164810419, + "learning_rate": 1.3010090736132442e-05, + "loss": 0.1173, + "num_input_tokens_seen": 2540128, + "step": 13340 + }, + { + "epoch": 6.936070686070686, + "grad_norm": 0.010304652154445648, + "learning_rate": 1.2990195625379847e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2541056, + "step": 13345 + }, + { + "epoch": 6.9386694386694385, + "grad_norm": 0.026167411357164383, + "learning_rate": 1.2970310397438564e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2541984, + "step": 13350 + }, + { + "epoch": 6.9412681912681915, + "grad_norm": 0.043904658406972885, + "learning_rate": 1.2950435068672046e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2542880, + "step": 13355 + }, + { + "epoch": 6.943866943866944, + "grad_norm": 0.7420259118080139, + "learning_rate": 1.2930569655435609e-05, + "loss": 0.0016, + "num_input_tokens_seen": 2543776, + "step": 13360 + }, + { + "epoch": 6.946465696465697, + "grad_norm": 0.0023814886808395386, + "learning_rate": 1.2910714174076394e-05, + "loss": 0.2192, + "num_input_tokens_seen": 2544704, + "step": 13365 + }, + { + "epoch": 6.9490644490644495, + "grad_norm": 8.583130836486816, + "learning_rate": 1.2890868640933395e-05, + "loss": 0.0499, + "num_input_tokens_seen": 2545600, + "step": 13370 + }, + { + "epoch": 6.951663201663202, + "grad_norm": 0.021105319261550903, + "learning_rate": 1.2871033072337413e-05, + "loss": 0.1307, + "num_input_tokens_seen": 2546528, + "step": 13375 + }, + { + "epoch": 6.954261954261955, + "grad_norm": 0.05765030160546303, + "learning_rate": 1.2851207484611033e-05, + "loss": 0.1035, + "num_input_tokens_seen": 2547424, + "step": 13380 + }, + { + "epoch": 6.956860706860707, + "grad_norm": 0.001016755006276071, + "learning_rate": 1.2831391894068647e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2548352, + "step": 13385 + }, + { + "epoch": 6.95945945945946, + "grad_norm": 0.05607987940311432, + "learning_rate": 1.281158631701641e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2549280, + "step": 13390 + }, + { + "epoch": 6.962058212058212, + "grad_norm": 0.02322382479906082, + "learning_rate": 1.2791790769752232e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2550176, + "step": 13395 + }, + { + "epoch": 6.964656964656965, + "grad_norm": 0.021387826651334763, + "learning_rate": 1.2772005268565768e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2551136, + "step": 13400 + }, + { + "epoch": 6.967255717255718, + "grad_norm": 5.885019302368164, + "learning_rate": 1.2752229829738429e-05, + "loss": 0.0806, + "num_input_tokens_seen": 2552096, + "step": 13405 + }, + { + "epoch": 6.96985446985447, + "grad_norm": 3.4990460872650146, + "learning_rate": 1.2732464469543314e-05, + "loss": 0.1508, + "num_input_tokens_seen": 2553024, + "step": 13410 + }, + { + "epoch": 6.972453222453223, + "grad_norm": 0.032695166766643524, + "learning_rate": 1.2712709204245269e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2553952, + "step": 13415 + }, + { + "epoch": 6.975051975051975, + "grad_norm": 0.10924467444419861, + "learning_rate": 1.2692964050100791e-05, + "loss": 0.0025, + "num_input_tokens_seen": 2554880, + "step": 13420 + }, + { + "epoch": 6.977650727650728, + "grad_norm": 0.012303436174988747, + "learning_rate": 1.2673229023358065e-05, + "loss": 0.0474, + "num_input_tokens_seen": 2555808, + "step": 13425 + }, + { + "epoch": 6.98024948024948, + "grad_norm": 0.03347673639655113, + "learning_rate": 1.2653504140256978e-05, + "loss": 0.0675, + "num_input_tokens_seen": 2556800, + "step": 13430 + }, + { + "epoch": 6.982848232848233, + "grad_norm": 0.059240084141492844, + "learning_rate": 1.2633789417029014e-05, + "loss": 0.0019, + "num_input_tokens_seen": 2557792, + "step": 13435 + }, + { + "epoch": 6.985446985446986, + "grad_norm": 0.07698459178209305, + "learning_rate": 1.2614084869897353e-05, + "loss": 0.0008, + "num_input_tokens_seen": 2558752, + "step": 13440 + }, + { + "epoch": 6.988045738045738, + "grad_norm": 0.9229817986488342, + "learning_rate": 1.2594390515076748e-05, + "loss": 0.0022, + "num_input_tokens_seen": 2559680, + "step": 13445 + }, + { + "epoch": 6.990644490644491, + "grad_norm": 0.004038333427160978, + "learning_rate": 1.2574706368773615e-05, + "loss": 0.0009, + "num_input_tokens_seen": 2560800, + "step": 13450 + }, + { + "epoch": 6.993243243243243, + "grad_norm": 0.0018561143660917878, + "learning_rate": 1.2555032447185932e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2561728, + "step": 13455 + }, + { + "epoch": 6.995841995841996, + "grad_norm": 0.00020764439250342548, + "learning_rate": 1.253536876650327e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2562656, + "step": 13460 + }, + { + "epoch": 6.998440748440748, + "grad_norm": 0.0119175361469388, + "learning_rate": 1.2515715342906795e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2563584, + "step": 13465 + }, + { + "epoch": 7.0, + "eval_loss": 0.31064191460609436, + "eval_runtime": 9.2463, + "eval_samples_per_second": 92.577, + "eval_steps_per_second": 23.144, + "num_input_tokens_seen": 2564208, + "step": 13468 + }, + { + "epoch": 7.001039501039501, + "grad_norm": 0.009515440091490746, + "learning_rate": 1.2496072192569197e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2564624, + "step": 13470 + }, + { + "epoch": 7.003638253638254, + "grad_norm": 0.004175928421318531, + "learning_rate": 1.2476439331654754e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2565648, + "step": 13475 + }, + { + "epoch": 7.006237006237006, + "grad_norm": 0.004372606985270977, + "learning_rate": 1.2456816776319233e-05, + "loss": 0.1114, + "num_input_tokens_seen": 2566576, + "step": 13480 + }, + { + "epoch": 7.008835758835759, + "grad_norm": 0.025360073894262314, + "learning_rate": 1.2437204542709974e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2567504, + "step": 13485 + }, + { + "epoch": 7.011434511434511, + "grad_norm": 0.10975649207830429, + "learning_rate": 1.2417602646965749e-05, + "loss": 0.0005, + "num_input_tokens_seen": 2568464, + "step": 13490 + }, + { + "epoch": 7.014033264033264, + "grad_norm": 0.00871952436864376, + "learning_rate": 1.2398011105216883e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2569392, + "step": 13495 + }, + { + "epoch": 7.016632016632016, + "grad_norm": 0.03376268967986107, + "learning_rate": 1.2378429933585179e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2570320, + "step": 13500 + }, + { + "epoch": 7.019230769230769, + "grad_norm": 0.014868305064737797, + "learning_rate": 1.2358859148183868e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2571280, + "step": 13505 + }, + { + "epoch": 7.021829521829522, + "grad_norm": 0.00014522149285767227, + "learning_rate": 1.2339298765117677e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2572240, + "step": 13510 + }, + { + "epoch": 7.024428274428274, + "grad_norm": 0.00809952151030302, + "learning_rate": 1.231974880048273e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2573200, + "step": 13515 + }, + { + "epoch": 7.027027027027027, + "grad_norm": 0.030973726883530617, + "learning_rate": 1.2300209270366636e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2574128, + "step": 13520 + }, + { + "epoch": 7.029625779625779, + "grad_norm": 0.04313269630074501, + "learning_rate": 1.228068019084834e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2575056, + "step": 13525 + }, + { + "epoch": 7.032224532224532, + "grad_norm": 0.0732179507613182, + "learning_rate": 1.2261161577998257e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2575952, + "step": 13530 + }, + { + "epoch": 7.034823284823285, + "grad_norm": 7.09759842720814e-05, + "learning_rate": 1.2241653447878146e-05, + "loss": 0.001, + "num_input_tokens_seen": 2576944, + "step": 13535 + }, + { + "epoch": 7.037422037422037, + "grad_norm": 0.0009378317045047879, + "learning_rate": 1.2222155816541167e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2577936, + "step": 13540 + }, + { + "epoch": 7.04002079002079, + "grad_norm": 0.0023145866580307484, + "learning_rate": 1.220266870003182e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2578832, + "step": 13545 + }, + { + "epoch": 7.042619542619542, + "grad_norm": 0.004093233495950699, + "learning_rate": 1.2183192114385969e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2579792, + "step": 13550 + }, + { + "epoch": 7.045218295218295, + "grad_norm": 0.004539877641946077, + "learning_rate": 1.2163726075630804e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2580688, + "step": 13555 + }, + { + "epoch": 7.047817047817047, + "grad_norm": 0.012313558720052242, + "learning_rate": 1.2144270599784824e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2581712, + "step": 13560 + }, + { + "epoch": 7.0504158004158, + "grad_norm": 0.002392909722402692, + "learning_rate": 1.2124825702857865e-05, + "loss": 0.0189, + "num_input_tokens_seen": 2582672, + "step": 13565 + }, + { + "epoch": 7.053014553014553, + "grad_norm": 0.009846772067248821, + "learning_rate": 1.210539140085102e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2583568, + "step": 13570 + }, + { + "epoch": 7.0556133056133055, + "grad_norm": 0.01770853064954281, + "learning_rate": 1.2085967709756712e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2584560, + "step": 13575 + }, + { + "epoch": 7.058212058212058, + "grad_norm": 0.006635232828557491, + "learning_rate": 1.2066554645558578e-05, + "loss": 0.0003, + "num_input_tokens_seen": 2585584, + "step": 13580 + }, + { + "epoch": 7.0608108108108105, + "grad_norm": 14.358999252319336, + "learning_rate": 1.2047152224231558e-05, + "loss": 0.0156, + "num_input_tokens_seen": 2586544, + "step": 13585 + }, + { + "epoch": 7.0634095634095635, + "grad_norm": 0.0017755995504558086, + "learning_rate": 1.2027760461741804e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2587504, + "step": 13590 + }, + { + "epoch": 7.066008316008316, + "grad_norm": 0.010631152428686619, + "learning_rate": 1.2008379374046696e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2588432, + "step": 13595 + }, + { + "epoch": 7.0686070686070686, + "grad_norm": 0.011791903525590897, + "learning_rate": 1.198900897709486e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2589328, + "step": 13600 + }, + { + "epoch": 7.0712058212058215, + "grad_norm": 7.61967821745202e-05, + "learning_rate": 1.1969649286826082e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2590448, + "step": 13605 + }, + { + "epoch": 7.073804573804574, + "grad_norm": 9.474679245613515e-05, + "learning_rate": 1.195030031917138e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2591408, + "step": 13610 + }, + { + "epoch": 7.076403326403327, + "grad_norm": 33.464210510253906, + "learning_rate": 1.1930962090052918e-05, + "loss": 0.0384, + "num_input_tokens_seen": 2592368, + "step": 13615 + }, + { + "epoch": 7.079002079002079, + "grad_norm": 0.0012457810807973146, + "learning_rate": 1.191163461538403e-05, + "loss": 0.0034, + "num_input_tokens_seen": 2593264, + "step": 13620 + }, + { + "epoch": 7.081600831600832, + "grad_norm": 0.001669880235567689, + "learning_rate": 1.1892317911069212e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2594224, + "step": 13625 + }, + { + "epoch": 7.084199584199585, + "grad_norm": 0.0007494023884646595, + "learning_rate": 1.1873011993004076e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2595184, + "step": 13630 + }, + { + "epoch": 7.086798336798337, + "grad_norm": 0.01539099495857954, + "learning_rate": 1.1853716877075392e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2596176, + "step": 13635 + }, + { + "epoch": 7.08939708939709, + "grad_norm": 0.005730877630412579, + "learning_rate": 1.1834432579160996e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2597040, + "step": 13640 + }, + { + "epoch": 7.091995841995842, + "grad_norm": 7.082582305883989e-05, + "learning_rate": 1.1815159115129865e-05, + "loss": 0.0, + "num_input_tokens_seen": 2598000, + "step": 13645 + }, + { + "epoch": 7.094594594594595, + "grad_norm": 0.29564040899276733, + "learning_rate": 1.1795896500842036e-05, + "loss": 0.0006, + "num_input_tokens_seen": 2599056, + "step": 13650 + }, + { + "epoch": 7.097193347193347, + "grad_norm": 0.013884150423109531, + "learning_rate": 1.1776644752148617e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2599984, + "step": 13655 + }, + { + "epoch": 7.0997920997921, + "grad_norm": 0.004320695996284485, + "learning_rate": 1.175740388489178e-05, + "loss": 0.0, + "num_input_tokens_seen": 2600976, + "step": 13660 + }, + { + "epoch": 7.102390852390853, + "grad_norm": 0.00035625911550596356, + "learning_rate": 1.1738173914904754e-05, + "loss": 0.0, + "num_input_tokens_seen": 2601904, + "step": 13665 + }, + { + "epoch": 7.104989604989605, + "grad_norm": 0.0017315676668658853, + "learning_rate": 1.1718954858011777e-05, + "loss": 0.0, + "num_input_tokens_seen": 2602864, + "step": 13670 + }, + { + "epoch": 7.107588357588358, + "grad_norm": 5.352755069732666, + "learning_rate": 1.169974673002813e-05, + "loss": 0.0632, + "num_input_tokens_seen": 2603888, + "step": 13675 + }, + { + "epoch": 7.11018711018711, + "grad_norm": 0.0033573834225535393, + "learning_rate": 1.1680549546760108e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2604816, + "step": 13680 + }, + { + "epoch": 7.112785862785863, + "grad_norm": 0.0010774297406896949, + "learning_rate": 1.1661363324004943e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2605776, + "step": 13685 + }, + { + "epoch": 7.115384615384615, + "grad_norm": 0.0009340905235148966, + "learning_rate": 1.164218807755092e-05, + "loss": 0.114, + "num_input_tokens_seen": 2606672, + "step": 13690 + }, + { + "epoch": 7.117983367983368, + "grad_norm": 0.0032638818956911564, + "learning_rate": 1.1623023823177235e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2607536, + "step": 13695 + }, + { + "epoch": 7.120582120582121, + "grad_norm": 0.02664531208574772, + "learning_rate": 1.1603870576654083e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2608496, + "step": 13700 + }, + { + "epoch": 7.123180873180873, + "grad_norm": 0.001443565939553082, + "learning_rate": 1.1584728353742563e-05, + "loss": 0.0, + "num_input_tokens_seen": 2609456, + "step": 13705 + }, + { + "epoch": 7.125779625779626, + "grad_norm": 0.00338048511184752, + "learning_rate": 1.1565597170194737e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2610448, + "step": 13710 + }, + { + "epoch": 7.128378378378378, + "grad_norm": 5.909397441428155e-05, + "learning_rate": 1.1546477041753553e-05, + "loss": 0.0523, + "num_input_tokens_seen": 2611408, + "step": 13715 + }, + { + "epoch": 7.130977130977131, + "grad_norm": 0.0021229430567473173, + "learning_rate": 1.1527367984152872e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2612336, + "step": 13720 + }, + { + "epoch": 7.133575883575883, + "grad_norm": 0.00010966735862893984, + "learning_rate": 1.1508270013117465e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2613328, + "step": 13725 + }, + { + "epoch": 7.136174636174636, + "grad_norm": 0.000419322750531137, + "learning_rate": 1.1489183144362936e-05, + "loss": 0.0145, + "num_input_tokens_seen": 2614288, + "step": 13730 + }, + { + "epoch": 7.138773388773389, + "grad_norm": 0.05218367651104927, + "learning_rate": 1.1470107393595805e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2615248, + "step": 13735 + }, + { + "epoch": 7.141372141372141, + "grad_norm": 0.001380327739752829, + "learning_rate": 1.1451042776513396e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2616272, + "step": 13740 + }, + { + "epoch": 7.143970893970894, + "grad_norm": 0.001054604770615697, + "learning_rate": 1.1431989308803911e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2617264, + "step": 13745 + }, + { + "epoch": 7.146569646569646, + "grad_norm": 0.0013844039058312774, + "learning_rate": 1.141294700614635e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2618256, + "step": 13750 + }, + { + "epoch": 7.149168399168399, + "grad_norm": 3.3653581142425537, + "learning_rate": 1.1393915884210523e-05, + "loss": 0.0021, + "num_input_tokens_seen": 2619216, + "step": 13755 + }, + { + "epoch": 7.151767151767152, + "grad_norm": 0.0009130413527600467, + "learning_rate": 1.1374895958657073e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2620112, + "step": 13760 + }, + { + "epoch": 7.154365904365904, + "grad_norm": 5.590239379671402e-05, + "learning_rate": 1.1355887245137383e-05, + "loss": 0.0, + "num_input_tokens_seen": 2621008, + "step": 13765 + }, + { + "epoch": 7.156964656964657, + "grad_norm": 0.0006066067144274712, + "learning_rate": 1.1336889759293656e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2621936, + "step": 13770 + }, + { + "epoch": 7.159563409563409, + "grad_norm": 0.0004962679231539369, + "learning_rate": 1.1317903516758813e-05, + "loss": 0.0322, + "num_input_tokens_seen": 2622864, + "step": 13775 + }, + { + "epoch": 7.162162162162162, + "grad_norm": 0.0032994442153722048, + "learning_rate": 1.1298928533156572e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2623888, + "step": 13780 + }, + { + "epoch": 7.164760914760914, + "grad_norm": 0.0008063738932833076, + "learning_rate": 1.1279964824101321e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2624816, + "step": 13785 + }, + { + "epoch": 7.167359667359667, + "grad_norm": 0.02075912244617939, + "learning_rate": 1.1261012405198231e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2625776, + "step": 13790 + }, + { + "epoch": 7.16995841995842, + "grad_norm": 0.4941444396972656, + "learning_rate": 1.1242071292043144e-05, + "loss": 0.0335, + "num_input_tokens_seen": 2626704, + "step": 13795 + }, + { + "epoch": 7.172557172557172, + "grad_norm": 6.209217826835811e-05, + "learning_rate": 1.122314150022262e-05, + "loss": 0.0, + "num_input_tokens_seen": 2627600, + "step": 13800 + }, + { + "epoch": 7.175155925155925, + "grad_norm": 0.0004847989184781909, + "learning_rate": 1.1204223045313903e-05, + "loss": 0.0, + "num_input_tokens_seen": 2628560, + "step": 13805 + }, + { + "epoch": 7.1777546777546775, + "grad_norm": 0.0024104900658130646, + "learning_rate": 1.1185315942884883e-05, + "loss": 0.0, + "num_input_tokens_seen": 2629424, + "step": 13810 + }, + { + "epoch": 7.18035343035343, + "grad_norm": 0.0020551043562591076, + "learning_rate": 1.1166420208494155e-05, + "loss": 0.0, + "num_input_tokens_seen": 2630384, + "step": 13815 + }, + { + "epoch": 7.182952182952183, + "grad_norm": 0.0013390554813668132, + "learning_rate": 1.1147535857690889e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2631312, + "step": 13820 + }, + { + "epoch": 7.1855509355509355, + "grad_norm": 0.002427122788503766, + "learning_rate": 1.1128662906014958e-05, + "loss": 0.0, + "num_input_tokens_seen": 2632240, + "step": 13825 + }, + { + "epoch": 7.1881496881496885, + "grad_norm": 5.966264143353328e-05, + "learning_rate": 1.1109801368996806e-05, + "loss": 0.0, + "num_input_tokens_seen": 2633232, + "step": 13830 + }, + { + "epoch": 7.1907484407484406, + "grad_norm": 0.010446745902299881, + "learning_rate": 1.109095126215752e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2634096, + "step": 13835 + }, + { + "epoch": 7.1933471933471935, + "grad_norm": 0.0007674357620999217, + "learning_rate": 1.1072112601008746e-05, + "loss": 0.0, + "num_input_tokens_seen": 2635056, + "step": 13840 + }, + { + "epoch": 7.195945945945946, + "grad_norm": 0.000671571004204452, + "learning_rate": 1.1053285401052749e-05, + "loss": 0.0004, + "num_input_tokens_seen": 2636080, + "step": 13845 + }, + { + "epoch": 7.198544698544699, + "grad_norm": 0.0008646822534501553, + "learning_rate": 1.1034469677782333e-05, + "loss": 0.0, + "num_input_tokens_seen": 2637008, + "step": 13850 + }, + { + "epoch": 7.201143451143452, + "grad_norm": 0.0005528838955797255, + "learning_rate": 1.1015665446680859e-05, + "loss": 0.0007, + "num_input_tokens_seen": 2638000, + "step": 13855 + }, + { + "epoch": 7.203742203742204, + "grad_norm": 6.134291470516473e-05, + "learning_rate": 1.0996872723222256e-05, + "loss": 0.0, + "num_input_tokens_seen": 2638928, + "step": 13860 + }, + { + "epoch": 7.206340956340957, + "grad_norm": 7.066523539833724e-05, + "learning_rate": 1.0978091522870954e-05, + "loss": 0.0021, + "num_input_tokens_seen": 2639856, + "step": 13865 + }, + { + "epoch": 7.208939708939709, + "grad_norm": 0.0014683236368000507, + "learning_rate": 1.0959321861081928e-05, + "loss": 0.0, + "num_input_tokens_seen": 2640752, + "step": 13870 + }, + { + "epoch": 7.211538461538462, + "grad_norm": 0.0003518622543197125, + "learning_rate": 1.0940563753300626e-05, + "loss": 0.0264, + "num_input_tokens_seen": 2641648, + "step": 13875 + }, + { + "epoch": 7.214137214137214, + "grad_norm": 0.001909028273075819, + "learning_rate": 1.0921817214963026e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2642608, + "step": 13880 + }, + { + "epoch": 7.216735966735967, + "grad_norm": 0.00015666296530980617, + "learning_rate": 1.0903082261495559e-05, + "loss": 0.0, + "num_input_tokens_seen": 2643568, + "step": 13885 + }, + { + "epoch": 7.21933471933472, + "grad_norm": 0.0007361882599070668, + "learning_rate": 1.0884358908315116e-05, + "loss": 0.0, + "num_input_tokens_seen": 2644464, + "step": 13890 + }, + { + "epoch": 7.221933471933472, + "grad_norm": 7.186397851910442e-05, + "learning_rate": 1.0865647170829075e-05, + "loss": 0.0, + "num_input_tokens_seen": 2645392, + "step": 13895 + }, + { + "epoch": 7.224532224532225, + "grad_norm": 0.0016460276674479246, + "learning_rate": 1.084694706443522e-05, + "loss": 0.0864, + "num_input_tokens_seen": 2646288, + "step": 13900 + }, + { + "epoch": 7.227130977130977, + "grad_norm": 0.005800663027912378, + "learning_rate": 1.0828258604521798e-05, + "loss": 0.0, + "num_input_tokens_seen": 2647248, + "step": 13905 + }, + { + "epoch": 7.22972972972973, + "grad_norm": 0.0003036980051547289, + "learning_rate": 1.0809581806467447e-05, + "loss": 0.0, + "num_input_tokens_seen": 2648144, + "step": 13910 + }, + { + "epoch": 7.232328482328482, + "grad_norm": 0.0008557485416531563, + "learning_rate": 1.0790916685641211e-05, + "loss": 0.0853, + "num_input_tokens_seen": 2649136, + "step": 13915 + }, + { + "epoch": 7.234927234927235, + "grad_norm": 0.053410500288009644, + "learning_rate": 1.0772263257402526e-05, + "loss": 0.0002, + "num_input_tokens_seen": 2650096, + "step": 13920 + }, + { + "epoch": 7.237525987525988, + "grad_norm": 0.0010929855052381754, + "learning_rate": 1.0753621537101216e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2651088, + "step": 13925 + }, + { + "epoch": 7.24012474012474, + "grad_norm": 0.02676020935177803, + "learning_rate": 1.0734991540077474e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2652080, + "step": 13930 + }, + { + "epoch": 7.242723492723493, + "grad_norm": 0.0017249020747840405, + "learning_rate": 1.071637328166182e-05, + "loss": 0.0, + "num_input_tokens_seen": 2653008, + "step": 13935 + }, + { + "epoch": 7.245322245322245, + "grad_norm": 0.0011371360160410404, + "learning_rate": 1.0697766777175146e-05, + "loss": 0.0, + "num_input_tokens_seen": 2653904, + "step": 13940 + }, + { + "epoch": 7.247920997920998, + "grad_norm": 0.00036342363455332816, + "learning_rate": 1.0679172041928654e-05, + "loss": 0.0, + "num_input_tokens_seen": 2654800, + "step": 13945 + }, + { + "epoch": 7.25051975051975, + "grad_norm": 0.00013266871974337846, + "learning_rate": 1.0660589091223855e-05, + "loss": 0.0, + "num_input_tokens_seen": 2655696, + "step": 13950 + }, + { + "epoch": 7.253118503118503, + "grad_norm": 0.0002121936558978632, + "learning_rate": 1.064201794035257e-05, + "loss": 0.0, + "num_input_tokens_seen": 2656624, + "step": 13955 + }, + { + "epoch": 7.255717255717256, + "grad_norm": 0.0002886734437197447, + "learning_rate": 1.0623458604596923e-05, + "loss": 0.0866, + "num_input_tokens_seen": 2657488, + "step": 13960 + }, + { + "epoch": 7.258316008316008, + "grad_norm": 0.004750338848680258, + "learning_rate": 1.0604911099229289e-05, + "loss": 0.0, + "num_input_tokens_seen": 2658416, + "step": 13965 + }, + { + "epoch": 7.260914760914761, + "grad_norm": 5.092715218779631e-05, + "learning_rate": 1.0586375439512341e-05, + "loss": 0.0, + "num_input_tokens_seen": 2659312, + "step": 13970 + }, + { + "epoch": 7.263513513513513, + "grad_norm": 0.0009567339438945055, + "learning_rate": 1.0567851640698978e-05, + "loss": 0.0, + "num_input_tokens_seen": 2660304, + "step": 13975 + }, + { + "epoch": 7.266112266112266, + "grad_norm": 0.004425784572958946, + "learning_rate": 1.0549339718032336e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2661328, + "step": 13980 + }, + { + "epoch": 7.268711018711019, + "grad_norm": 0.0005588639178313315, + "learning_rate": 1.0530839686745805e-05, + "loss": 0.0, + "num_input_tokens_seen": 2662352, + "step": 13985 + }, + { + "epoch": 7.271309771309771, + "grad_norm": 0.0004967461572960019, + "learning_rate": 1.0512351562062958e-05, + "loss": 0.0, + "num_input_tokens_seen": 2663280, + "step": 13990 + }, + { + "epoch": 7.273908523908524, + "grad_norm": 0.0005203180480748415, + "learning_rate": 1.0493875359197599e-05, + "loss": 0.0, + "num_input_tokens_seen": 2664208, + "step": 13995 + }, + { + "epoch": 7.276507276507276, + "grad_norm": 0.0006827180623076856, + "learning_rate": 1.0475411093353698e-05, + "loss": 0.0, + "num_input_tokens_seen": 2665104, + "step": 14000 + }, + { + "epoch": 7.279106029106029, + "grad_norm": 6.479621515609324e-05, + "learning_rate": 1.0456958779725426e-05, + "loss": 0.0, + "num_input_tokens_seen": 2666032, + "step": 14005 + }, + { + "epoch": 7.281704781704781, + "grad_norm": 7.561728125438094e-05, + "learning_rate": 1.0438518433497094e-05, + "loss": 0.0, + "num_input_tokens_seen": 2666960, + "step": 14010 + }, + { + "epoch": 7.284303534303534, + "grad_norm": 0.000803038477897644, + "learning_rate": 1.0420090069843167e-05, + "loss": 0.0, + "num_input_tokens_seen": 2667856, + "step": 14015 + }, + { + "epoch": 7.286902286902287, + "grad_norm": 0.00013969380233902484, + "learning_rate": 1.0401673703928278e-05, + "loss": 0.0, + "num_input_tokens_seen": 2668848, + "step": 14020 + }, + { + "epoch": 7.289501039501039, + "grad_norm": 0.0036909037735313177, + "learning_rate": 1.0383269350907152e-05, + "loss": 0.0, + "num_input_tokens_seen": 2669840, + "step": 14025 + }, + { + "epoch": 7.292099792099792, + "grad_norm": 6.461535667767748e-05, + "learning_rate": 1.0364877025924658e-05, + "loss": 0.0, + "num_input_tokens_seen": 2670800, + "step": 14030 + }, + { + "epoch": 7.294698544698544, + "grad_norm": 0.0010195496724918485, + "learning_rate": 1.0346496744115736e-05, + "loss": 0.0, + "num_input_tokens_seen": 2671728, + "step": 14035 + }, + { + "epoch": 7.297297297297297, + "grad_norm": 0.0027878088876605034, + "learning_rate": 1.032812852060546e-05, + "loss": 0.0, + "num_input_tokens_seen": 2672624, + "step": 14040 + }, + { + "epoch": 7.29989604989605, + "grad_norm": 0.002574851969256997, + "learning_rate": 1.0309772370508922e-05, + "loss": 0.0, + "num_input_tokens_seen": 2673552, + "step": 14045 + }, + { + "epoch": 7.302494802494802, + "grad_norm": 0.003219177247956395, + "learning_rate": 1.0291428308931325e-05, + "loss": 0.0, + "num_input_tokens_seen": 2674512, + "step": 14050 + }, + { + "epoch": 7.305093555093555, + "grad_norm": 0.0015973015688359737, + "learning_rate": 1.027309635096792e-05, + "loss": 0.0, + "num_input_tokens_seen": 2675472, + "step": 14055 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 0.00021006091265007854, + "learning_rate": 1.0254776511703976e-05, + "loss": 0.0812, + "num_input_tokens_seen": 2676432, + "step": 14060 + }, + { + "epoch": 7.3102910602910605, + "grad_norm": 0.0007979939109645784, + "learning_rate": 1.023646880621481e-05, + "loss": 0.0, + "num_input_tokens_seen": 2677360, + "step": 14065 + }, + { + "epoch": 7.3128898128898125, + "grad_norm": 7.08021325408481e-05, + "learning_rate": 1.0218173249565741e-05, + "loss": 0.0, + "num_input_tokens_seen": 2678320, + "step": 14070 + }, + { + "epoch": 7.3154885654885655, + "grad_norm": 0.008109759539365768, + "learning_rate": 1.0199889856812112e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2679376, + "step": 14075 + }, + { + "epoch": 7.3180873180873185, + "grad_norm": 0.0021501912269741297, + "learning_rate": 1.018161864299921e-05, + "loss": 0.0, + "num_input_tokens_seen": 2680240, + "step": 14080 + }, + { + "epoch": 7.320686070686071, + "grad_norm": 5.471887197927572e-05, + "learning_rate": 1.0163359623162357e-05, + "loss": 0.0, + "num_input_tokens_seen": 2681168, + "step": 14085 + }, + { + "epoch": 7.3232848232848236, + "grad_norm": 0.00042427281732670963, + "learning_rate": 1.0145112812326799e-05, + "loss": 0.0157, + "num_input_tokens_seen": 2682192, + "step": 14090 + }, + { + "epoch": 7.325883575883576, + "grad_norm": 0.00127421657089144, + "learning_rate": 1.0126878225507761e-05, + "loss": 0.0, + "num_input_tokens_seen": 2683184, + "step": 14095 + }, + { + "epoch": 7.328482328482329, + "grad_norm": 6.740605022059754e-05, + "learning_rate": 1.0108655877710386e-05, + "loss": 0.0, + "num_input_tokens_seen": 2684112, + "step": 14100 + }, + { + "epoch": 7.331081081081081, + "grad_norm": 0.00551873492076993, + "learning_rate": 1.0090445783929774e-05, + "loss": 0.0, + "num_input_tokens_seen": 2684976, + "step": 14105 + }, + { + "epoch": 7.333679833679834, + "grad_norm": 0.008688485249876976, + "learning_rate": 1.0072247959150919e-05, + "loss": 0.0, + "num_input_tokens_seen": 2685904, + "step": 14110 + }, + { + "epoch": 7.336278586278587, + "grad_norm": 6.275960186030716e-05, + "learning_rate": 1.0054062418348714e-05, + "loss": 0.0, + "num_input_tokens_seen": 2686896, + "step": 14115 + }, + { + "epoch": 7.338877338877339, + "grad_norm": 0.000982780708000064, + "learning_rate": 1.0035889176487973e-05, + "loss": 0.0057, + "num_input_tokens_seen": 2687792, + "step": 14120 + }, + { + "epoch": 7.341476091476092, + "grad_norm": 0.00026258942671120167, + "learning_rate": 1.001772824852335e-05, + "loss": 0.0001, + "num_input_tokens_seen": 2688816, + "step": 14125 + }, + { + "epoch": 7.344074844074844, + "grad_norm": 0.0011931672925129533, + "learning_rate": 9.999579649399408e-06, + "loss": 0.0, + "num_input_tokens_seen": 2689808, + "step": 14130 + }, + { + "epoch": 7.346673596673597, + "grad_norm": 0.002051079645752907, + "learning_rate": 9.981443394050525e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2690832, + "step": 14135 + }, + { + "epoch": 7.349272349272349, + "grad_norm": 0.00026983956922776997, + "learning_rate": 9.963319497400957e-06, + "loss": 0.0, + "num_input_tokens_seen": 2691792, + "step": 14140 + }, + { + "epoch": 7.351871101871102, + "grad_norm": 6.610821723937988, + "learning_rate": 9.945207974364768e-06, + "loss": 0.0549, + "num_input_tokens_seen": 2692784, + "step": 14145 + }, + { + "epoch": 7.354469854469855, + "grad_norm": 0.0001993367914110422, + "learning_rate": 9.92710883984583e-06, + "loss": 0.0, + "num_input_tokens_seen": 2693872, + "step": 14150 + }, + { + "epoch": 7.357068607068607, + "grad_norm": 0.0002911975607275963, + "learning_rate": 9.909022108737856e-06, + "loss": 0.0, + "num_input_tokens_seen": 2694832, + "step": 14155 + }, + { + "epoch": 7.35966735966736, + "grad_norm": 0.00048027545562945306, + "learning_rate": 9.890947795924313e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2695856, + "step": 14160 + }, + { + "epoch": 7.362266112266112, + "grad_norm": 0.0005180610460229218, + "learning_rate": 9.872885916278488e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2696816, + "step": 14165 + }, + { + "epoch": 7.364864864864865, + "grad_norm": 0.011633763089776039, + "learning_rate": 9.854836484663404e-06, + "loss": 0.0, + "num_input_tokens_seen": 2697776, + "step": 14170 + }, + { + "epoch": 7.367463617463617, + "grad_norm": 0.0007535413606092334, + "learning_rate": 9.836799515931847e-06, + "loss": 0.0, + "num_input_tokens_seen": 2698640, + "step": 14175 + }, + { + "epoch": 7.37006237006237, + "grad_norm": 0.0010740625439211726, + "learning_rate": 9.818775024926369e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2699664, + "step": 14180 + }, + { + "epoch": 7.372661122661123, + "grad_norm": 0.002268711570650339, + "learning_rate": 9.80076302647922e-06, + "loss": 0.1054, + "num_input_tokens_seen": 2700624, + "step": 14185 + }, + { + "epoch": 7.375259875259875, + "grad_norm": 0.00041899128700606525, + "learning_rate": 9.782763535412409e-06, + "loss": 0.0, + "num_input_tokens_seen": 2701520, + "step": 14190 + }, + { + "epoch": 7.377858627858628, + "grad_norm": 0.0009180985507555306, + "learning_rate": 9.764776566537615e-06, + "loss": 0.0009, + "num_input_tokens_seen": 2702512, + "step": 14195 + }, + { + "epoch": 7.38045738045738, + "grad_norm": 5.7853983889799565e-05, + "learning_rate": 9.746802134656245e-06, + "loss": 0.0, + "num_input_tokens_seen": 2703440, + "step": 14200 + }, + { + "epoch": 7.383056133056133, + "grad_norm": 5.876473733223975e-05, + "learning_rate": 9.728840254559366e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2704368, + "step": 14205 + }, + { + "epoch": 7.385654885654886, + "grad_norm": 0.0011238160077482462, + "learning_rate": 9.710890941027722e-06, + "loss": 0.0, + "num_input_tokens_seen": 2705328, + "step": 14210 + }, + { + "epoch": 7.388253638253638, + "grad_norm": 0.0015618127072229981, + "learning_rate": 9.692954208831714e-06, + "loss": 0.0, + "num_input_tokens_seen": 2706288, + "step": 14215 + }, + { + "epoch": 7.390852390852391, + "grad_norm": 0.00013970046711619943, + "learning_rate": 9.67503007273141e-06, + "loss": 0.0, + "num_input_tokens_seen": 2707216, + "step": 14220 + }, + { + "epoch": 7.393451143451143, + "grad_norm": 0.0008748536929488182, + "learning_rate": 9.65711854747648e-06, + "loss": 0.0, + "num_input_tokens_seen": 2708176, + "step": 14225 + }, + { + "epoch": 7.396049896049896, + "grad_norm": 0.33165669441223145, + "learning_rate": 9.639219647806239e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2709072, + "step": 14230 + }, + { + "epoch": 7.398648648648648, + "grad_norm": 0.011922305449843407, + "learning_rate": 9.621333388449619e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2710000, + "step": 14235 + }, + { + "epoch": 7.401247401247401, + "grad_norm": 0.030197417363524437, + "learning_rate": 9.60345978412513e-06, + "loss": 0.0, + "num_input_tokens_seen": 2710960, + "step": 14240 + }, + { + "epoch": 7.403846153846154, + "grad_norm": 0.0002634810225572437, + "learning_rate": 9.585598849540874e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2711984, + "step": 14245 + }, + { + "epoch": 7.406444906444906, + "grad_norm": 0.022111041471362114, + "learning_rate": 9.567750599394524e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2712944, + "step": 14250 + }, + { + "epoch": 7.409043659043659, + "grad_norm": 6.230134022189304e-05, + "learning_rate": 9.549915048373334e-06, + "loss": 0.0, + "num_input_tokens_seen": 2713840, + "step": 14255 + }, + { + "epoch": 7.411642411642411, + "grad_norm": 0.0009834584780037403, + "learning_rate": 9.532092211154082e-06, + "loss": 0.0, + "num_input_tokens_seen": 2714832, + "step": 14260 + }, + { + "epoch": 7.414241164241164, + "grad_norm": 0.00033944397000595927, + "learning_rate": 9.51428210240311e-06, + "loss": 0.0013, + "num_input_tokens_seen": 2715760, + "step": 14265 + }, + { + "epoch": 7.416839916839917, + "grad_norm": 5.425657582236454e-05, + "learning_rate": 9.496484736776267e-06, + "loss": 0.0, + "num_input_tokens_seen": 2716688, + "step": 14270 + }, + { + "epoch": 7.419438669438669, + "grad_norm": 6.677041528746486e-05, + "learning_rate": 9.47870012891891e-06, + "loss": 0.0, + "num_input_tokens_seen": 2717712, + "step": 14275 + }, + { + "epoch": 7.422037422037422, + "grad_norm": 5.851655441801995e-05, + "learning_rate": 9.46092829346593e-06, + "loss": 0.0, + "num_input_tokens_seen": 2718736, + "step": 14280 + }, + { + "epoch": 7.424636174636174, + "grad_norm": 0.00021444898447953165, + "learning_rate": 9.443169245041664e-06, + "loss": 0.0, + "num_input_tokens_seen": 2719728, + "step": 14285 + }, + { + "epoch": 7.427234927234927, + "grad_norm": 5.5530712415929884e-05, + "learning_rate": 9.425422998259966e-06, + "loss": 0.0, + "num_input_tokens_seen": 2720688, + "step": 14290 + }, + { + "epoch": 7.4298336798336795, + "grad_norm": 0.0001907669211504981, + "learning_rate": 9.407689567724129e-06, + "loss": 0.0, + "num_input_tokens_seen": 2721680, + "step": 14295 + }, + { + "epoch": 7.4324324324324325, + "grad_norm": 5.5010397773003206e-05, + "learning_rate": 9.389968968026919e-06, + "loss": 0.0, + "num_input_tokens_seen": 2722576, + "step": 14300 + }, + { + "epoch": 7.435031185031185, + "grad_norm": 0.00045345042599365115, + "learning_rate": 9.372261213750528e-06, + "loss": 0.0, + "num_input_tokens_seen": 2723568, + "step": 14305 + }, + { + "epoch": 7.4376299376299375, + "grad_norm": 0.009598770178854465, + "learning_rate": 9.354566319466573e-06, + "loss": 0.0, + "num_input_tokens_seen": 2724432, + "step": 14310 + }, + { + "epoch": 7.4402286902286905, + "grad_norm": 0.0017199552385136485, + "learning_rate": 9.33688429973612e-06, + "loss": 0.0, + "num_input_tokens_seen": 2725360, + "step": 14315 + }, + { + "epoch": 7.442827442827443, + "grad_norm": 0.00015388289466500282, + "learning_rate": 9.3192151691096e-06, + "loss": 0.0, + "num_input_tokens_seen": 2726352, + "step": 14320 + }, + { + "epoch": 7.4454261954261955, + "grad_norm": 9.601829515304416e-05, + "learning_rate": 9.301558942126872e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2727504, + "step": 14325 + }, + { + "epoch": 7.448024948024948, + "grad_norm": 6.280683994293213, + "learning_rate": 9.28391563331715e-06, + "loss": 0.0096, + "num_input_tokens_seen": 2728432, + "step": 14330 + }, + { + "epoch": 7.450623700623701, + "grad_norm": 0.0017462121322751045, + "learning_rate": 9.266285257199051e-06, + "loss": 0.0, + "num_input_tokens_seen": 2729328, + "step": 14335 + }, + { + "epoch": 7.453222453222454, + "grad_norm": 5.3256560931913555e-05, + "learning_rate": 9.248667828280493e-06, + "loss": 0.0, + "num_input_tokens_seen": 2730256, + "step": 14340 + }, + { + "epoch": 7.455821205821206, + "grad_norm": 0.00022244190040510148, + "learning_rate": 9.231063361058806e-06, + "loss": 0.0, + "num_input_tokens_seen": 2731152, + "step": 14345 + }, + { + "epoch": 7.458419958419959, + "grad_norm": 0.00019553466700017452, + "learning_rate": 9.213471870020601e-06, + "loss": 0.0, + "num_input_tokens_seen": 2732048, + "step": 14350 + }, + { + "epoch": 7.461018711018711, + "grad_norm": 5.426369534689002e-05, + "learning_rate": 9.195893369641841e-06, + "loss": 0.0, + "num_input_tokens_seen": 2732976, + "step": 14355 + }, + { + "epoch": 7.463617463617464, + "grad_norm": 0.00010450564877828583, + "learning_rate": 9.178327874387807e-06, + "loss": 0.0, + "num_input_tokens_seen": 2733872, + "step": 14360 + }, + { + "epoch": 7.466216216216216, + "grad_norm": 4.6630313590867445e-05, + "learning_rate": 9.160775398713037e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2734800, + "step": 14365 + }, + { + "epoch": 7.468814968814969, + "grad_norm": 5.524681546376087e-05, + "learning_rate": 9.143235957061407e-06, + "loss": 0.0, + "num_input_tokens_seen": 2735664, + "step": 14370 + }, + { + "epoch": 7.471413721413722, + "grad_norm": 0.0007797431317158043, + "learning_rate": 9.12570956386601e-06, + "loss": 0.0, + "num_input_tokens_seen": 2736560, + "step": 14375 + }, + { + "epoch": 7.474012474012474, + "grad_norm": 0.00017703919729683548, + "learning_rate": 9.108196233549256e-06, + "loss": 0.1196, + "num_input_tokens_seen": 2737456, + "step": 14380 + }, + { + "epoch": 7.476611226611227, + "grad_norm": 0.17765741050243378, + "learning_rate": 9.090695980522765e-06, + "loss": 0.0008, + "num_input_tokens_seen": 2738384, + "step": 14385 + }, + { + "epoch": 7.479209979209979, + "grad_norm": 0.0003888150386046618, + "learning_rate": 9.073208819187429e-06, + "loss": 0.0, + "num_input_tokens_seen": 2739376, + "step": 14390 + }, + { + "epoch": 7.481808731808732, + "grad_norm": 0.00015818802057765424, + "learning_rate": 9.055734763933335e-06, + "loss": 0.0, + "num_input_tokens_seen": 2740304, + "step": 14395 + }, + { + "epoch": 7.484407484407485, + "grad_norm": 0.0020721324253827333, + "learning_rate": 9.038273829139816e-06, + "loss": 0.0, + "num_input_tokens_seen": 2741264, + "step": 14400 + }, + { + "epoch": 7.487006237006237, + "grad_norm": 0.00041336967842653394, + "learning_rate": 9.020826029175384e-06, + "loss": 0.0, + "num_input_tokens_seen": 2742256, + "step": 14405 + }, + { + "epoch": 7.48960498960499, + "grad_norm": 0.0011128814658150077, + "learning_rate": 9.00339137839774e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2743216, + "step": 14410 + }, + { + "epoch": 7.492203742203742, + "grad_norm": 0.0014779980992898345, + "learning_rate": 8.985969891153801e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2744144, + "step": 14415 + }, + { + "epoch": 7.494802494802495, + "grad_norm": 5.700804103980772e-05, + "learning_rate": 8.968561581779602e-06, + "loss": 0.0, + "num_input_tokens_seen": 2745040, + "step": 14420 + }, + { + "epoch": 7.497401247401247, + "grad_norm": 2.8426740169525146, + "learning_rate": 8.95116646460038e-06, + "loss": 0.0047, + "num_input_tokens_seen": 2746032, + "step": 14425 + }, + { + "epoch": 7.5, + "grad_norm": 0.00892539694905281, + "learning_rate": 8.933784553930478e-06, + "loss": 0.0, + "num_input_tokens_seen": 2746960, + "step": 14430 + }, + { + "epoch": 7.5, + "eval_loss": 0.4077318012714386, + "eval_runtime": 9.2533, + "eval_samples_per_second": 92.508, + "eval_steps_per_second": 23.127, + "num_input_tokens_seen": 2746960, + "step": 14430 + }, + { + "epoch": 7.502598752598753, + "grad_norm": 0.0005203865584917367, + "learning_rate": 8.9164158640734e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2747920, + "step": 14435 + }, + { + "epoch": 7.505197505197505, + "grad_norm": 0.00025123319937847555, + "learning_rate": 8.899060409321755e-06, + "loss": 0.0, + "num_input_tokens_seen": 2748944, + "step": 14440 + }, + { + "epoch": 7.507796257796258, + "grad_norm": 3.793386459350586, + "learning_rate": 8.881718203957254e-06, + "loss": 0.0548, + "num_input_tokens_seen": 2749904, + "step": 14445 + }, + { + "epoch": 7.51039501039501, + "grad_norm": 0.00048171941307373345, + "learning_rate": 8.864389262250732e-06, + "loss": 0.0, + "num_input_tokens_seen": 2750928, + "step": 14450 + }, + { + "epoch": 7.512993762993763, + "grad_norm": 4.182275733910501e-05, + "learning_rate": 8.847073598462082e-06, + "loss": 0.0, + "num_input_tokens_seen": 2751856, + "step": 14455 + }, + { + "epoch": 7.515592515592516, + "grad_norm": 0.0005943463183939457, + "learning_rate": 8.829771226840294e-06, + "loss": 0.0, + "num_input_tokens_seen": 2752880, + "step": 14460 + }, + { + "epoch": 7.518191268191268, + "grad_norm": 0.008986230939626694, + "learning_rate": 8.8124821616234e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2753840, + "step": 14465 + }, + { + "epoch": 7.520790020790021, + "grad_norm": 0.003381817601621151, + "learning_rate": 8.79520641703849e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2754800, + "step": 14470 + }, + { + "epoch": 7.523388773388773, + "grad_norm": 0.0008533629588782787, + "learning_rate": 8.777944007301686e-06, + "loss": 0.0, + "num_input_tokens_seen": 2755728, + "step": 14475 + }, + { + "epoch": 7.525987525987526, + "grad_norm": 0.0039718723855912685, + "learning_rate": 8.760694946618151e-06, + "loss": 0.0, + "num_input_tokens_seen": 2756720, + "step": 14480 + }, + { + "epoch": 7.528586278586278, + "grad_norm": 4.2183637560810894e-05, + "learning_rate": 8.74345924918206e-06, + "loss": 0.0, + "num_input_tokens_seen": 2757680, + "step": 14485 + }, + { + "epoch": 7.531185031185031, + "grad_norm": 0.010042605921626091, + "learning_rate": 8.726236929176576e-06, + "loss": 0.0, + "num_input_tokens_seen": 2758640, + "step": 14490 + }, + { + "epoch": 7.533783783783784, + "grad_norm": 5.439767483039759e-05, + "learning_rate": 8.70902800077388e-06, + "loss": 0.0, + "num_input_tokens_seen": 2759600, + "step": 14495 + }, + { + "epoch": 7.536382536382536, + "grad_norm": 0.003253332572057843, + "learning_rate": 8.69183247813511e-06, + "loss": 0.0, + "num_input_tokens_seen": 2760528, + "step": 14500 + }, + { + "epoch": 7.538981288981289, + "grad_norm": 0.0038096329662948847, + "learning_rate": 8.67465037541038e-06, + "loss": 0.0, + "num_input_tokens_seen": 2761520, + "step": 14505 + }, + { + "epoch": 7.541580041580041, + "grad_norm": 0.061263248324394226, + "learning_rate": 8.657481706738749e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2762448, + "step": 14510 + }, + { + "epoch": 7.544178794178794, + "grad_norm": 5.128950215294026e-05, + "learning_rate": 8.640326486248254e-06, + "loss": 0.0016, + "num_input_tokens_seen": 2763408, + "step": 14515 + }, + { + "epoch": 7.546777546777546, + "grad_norm": 7.892289431765676e-05, + "learning_rate": 8.623184728055828e-06, + "loss": 0.0, + "num_input_tokens_seen": 2764336, + "step": 14520 + }, + { + "epoch": 7.549376299376299, + "grad_norm": 0.000535814615432173, + "learning_rate": 8.60605644626736e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2765296, + "step": 14525 + }, + { + "epoch": 7.551975051975052, + "grad_norm": 0.0018270984292030334, + "learning_rate": 8.588941654977622e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2766256, + "step": 14530 + }, + { + "epoch": 7.5545738045738045, + "grad_norm": 0.00014532939530909061, + "learning_rate": 8.571840368270287e-06, + "loss": 0.0, + "num_input_tokens_seen": 2767152, + "step": 14535 + }, + { + "epoch": 7.557172557172557, + "grad_norm": 0.0003343170683365315, + "learning_rate": 8.554752600217941e-06, + "loss": 0.0, + "num_input_tokens_seen": 2768112, + "step": 14540 + }, + { + "epoch": 7.5597713097713095, + "grad_norm": 0.005361625459045172, + "learning_rate": 8.537678364882013e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2769040, + "step": 14545 + }, + { + "epoch": 7.5623700623700625, + "grad_norm": 6.823295552749187e-05, + "learning_rate": 8.52061767631282e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2770032, + "step": 14550 + }, + { + "epoch": 7.564968814968815, + "grad_norm": 0.016374802216887474, + "learning_rate": 8.503570548549511e-06, + "loss": 0.0, + "num_input_tokens_seen": 2770992, + "step": 14555 + }, + { + "epoch": 7.5675675675675675, + "grad_norm": 0.0014524799771606922, + "learning_rate": 8.486536995620103e-06, + "loss": 0.0, + "num_input_tokens_seen": 2771952, + "step": 14560 + }, + { + "epoch": 7.5701663201663205, + "grad_norm": 5.074249202152714e-05, + "learning_rate": 8.469517031541419e-06, + "loss": 0.0, + "num_input_tokens_seen": 2772912, + "step": 14565 + }, + { + "epoch": 7.572765072765073, + "grad_norm": 0.00028079436742700636, + "learning_rate": 8.452510670319094e-06, + "loss": 0.0, + "num_input_tokens_seen": 2773808, + "step": 14570 + }, + { + "epoch": 7.575363825363826, + "grad_norm": 0.0006416785181500018, + "learning_rate": 8.435517925947606e-06, + "loss": 0.0, + "num_input_tokens_seen": 2774672, + "step": 14575 + }, + { + "epoch": 7.577962577962578, + "grad_norm": 0.00015183063806034625, + "learning_rate": 8.418538812410182e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2775632, + "step": 14580 + }, + { + "epoch": 7.580561330561331, + "grad_norm": 0.0006893770769238472, + "learning_rate": 8.40157334367887e-06, + "loss": 0.0, + "num_input_tokens_seen": 2776656, + "step": 14585 + }, + { + "epoch": 7.583160083160083, + "grad_norm": 0.0012997430749237537, + "learning_rate": 8.384621533714462e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2777552, + "step": 14590 + }, + { + "epoch": 7.585758835758836, + "grad_norm": 0.002527955686673522, + "learning_rate": 8.367683396466547e-06, + "loss": 0.0, + "num_input_tokens_seen": 2778544, + "step": 14595 + }, + { + "epoch": 7.588357588357589, + "grad_norm": 0.0004104315012227744, + "learning_rate": 8.350758945873401e-06, + "loss": 0.0, + "num_input_tokens_seen": 2779344, + "step": 14600 + }, + { + "epoch": 7.590956340956341, + "grad_norm": 3.9943708543432876e-05, + "learning_rate": 8.333848195862093e-06, + "loss": 0.0, + "num_input_tokens_seen": 2780272, + "step": 14605 + }, + { + "epoch": 7.593555093555094, + "grad_norm": 0.0005335921887308359, + "learning_rate": 8.31695116034841e-06, + "loss": 0.0, + "num_input_tokens_seen": 2781232, + "step": 14610 + }, + { + "epoch": 7.596153846153846, + "grad_norm": 0.00018149547395296395, + "learning_rate": 8.300067853236823e-06, + "loss": 0.0, + "num_input_tokens_seen": 2782160, + "step": 14615 + }, + { + "epoch": 7.598752598752599, + "grad_norm": 0.00030937473638914526, + "learning_rate": 8.283198288420543e-06, + "loss": 0.0, + "num_input_tokens_seen": 2783088, + "step": 14620 + }, + { + "epoch": 7.601351351351351, + "grad_norm": 0.0018439715495333076, + "learning_rate": 8.26634247978144e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2784080, + "step": 14625 + }, + { + "epoch": 7.603950103950104, + "grad_norm": 0.000491154904011637, + "learning_rate": 8.249500441190104e-06, + "loss": 0.0, + "num_input_tokens_seen": 2785008, + "step": 14630 + }, + { + "epoch": 7.606548856548857, + "grad_norm": 0.0012558628804981709, + "learning_rate": 8.232672186505733e-06, + "loss": 0.0, + "num_input_tokens_seen": 2786032, + "step": 14635 + }, + { + "epoch": 7.609147609147609, + "grad_norm": 0.0005041549447923899, + "learning_rate": 8.215857729576245e-06, + "loss": 0.0, + "num_input_tokens_seen": 2786992, + "step": 14640 + }, + { + "epoch": 7.611746361746362, + "grad_norm": 0.00022916504531167448, + "learning_rate": 8.199057084238165e-06, + "loss": 0.001, + "num_input_tokens_seen": 2788048, + "step": 14645 + }, + { + "epoch": 7.614345114345114, + "grad_norm": 0.0003485188935883343, + "learning_rate": 8.182270264316674e-06, + "loss": 0.0, + "num_input_tokens_seen": 2788944, + "step": 14650 + }, + { + "epoch": 7.616943866943867, + "grad_norm": 0.001242043450474739, + "learning_rate": 8.165497283625554e-06, + "loss": 0.0, + "num_input_tokens_seen": 2789872, + "step": 14655 + }, + { + "epoch": 7.61954261954262, + "grad_norm": 0.0015398754039779305, + "learning_rate": 8.14873815596722e-06, + "loss": 0.0, + "num_input_tokens_seen": 2790768, + "step": 14660 + }, + { + "epoch": 7.622141372141372, + "grad_norm": 0.00014732267300132662, + "learning_rate": 8.131992895132693e-06, + "loss": 0.0, + "num_input_tokens_seen": 2791728, + "step": 14665 + }, + { + "epoch": 7.624740124740125, + "grad_norm": 0.00018325647397432476, + "learning_rate": 8.11526151490154e-06, + "loss": 0.0, + "num_input_tokens_seen": 2792720, + "step": 14670 + }, + { + "epoch": 7.627338877338877, + "grad_norm": 0.0025489863473922014, + "learning_rate": 8.098544029041955e-06, + "loss": 0.0, + "num_input_tokens_seen": 2793648, + "step": 14675 + }, + { + "epoch": 7.62993762993763, + "grad_norm": 5.757435428677127e-05, + "learning_rate": 8.081840451310666e-06, + "loss": 0.0, + "num_input_tokens_seen": 2794576, + "step": 14680 + }, + { + "epoch": 7.632536382536383, + "grad_norm": 0.0001884944358607754, + "learning_rate": 8.065150795452983e-06, + "loss": 0.0, + "num_input_tokens_seen": 2795504, + "step": 14685 + }, + { + "epoch": 7.635135135135135, + "grad_norm": 0.007191723678261042, + "learning_rate": 8.048475075202727e-06, + "loss": 0.0, + "num_input_tokens_seen": 2796496, + "step": 14690 + }, + { + "epoch": 7.637733887733888, + "grad_norm": 0.00011637169518508017, + "learning_rate": 8.031813304282287e-06, + "loss": 0.0, + "num_input_tokens_seen": 2797488, + "step": 14695 + }, + { + "epoch": 7.64033264033264, + "grad_norm": 0.0007056007161736488, + "learning_rate": 8.015165496402549e-06, + "loss": 0.0096, + "num_input_tokens_seen": 2798352, + "step": 14700 + }, + { + "epoch": 7.642931392931393, + "grad_norm": 0.0004230842459946871, + "learning_rate": 7.998531665262907e-06, + "loss": 0.0, + "num_input_tokens_seen": 2799408, + "step": 14705 + }, + { + "epoch": 7.645530145530145, + "grad_norm": 4.858812098973431e-05, + "learning_rate": 7.981911824551274e-06, + "loss": 0.0, + "num_input_tokens_seen": 2800304, + "step": 14710 + }, + { + "epoch": 7.648128898128898, + "grad_norm": 4.909350900561549e-05, + "learning_rate": 7.965305987944027e-06, + "loss": 0.0, + "num_input_tokens_seen": 2801200, + "step": 14715 + }, + { + "epoch": 7.650727650727651, + "grad_norm": 0.000247537245741114, + "learning_rate": 7.948714169106048e-06, + "loss": 0.0, + "num_input_tokens_seen": 2802096, + "step": 14720 + }, + { + "epoch": 7.653326403326403, + "grad_norm": 4.927552799927071e-05, + "learning_rate": 7.932136381690644e-06, + "loss": 0.0, + "num_input_tokens_seen": 2803024, + "step": 14725 + }, + { + "epoch": 7.655925155925156, + "grad_norm": 9.457590931560844e-05, + "learning_rate": 7.91557263933962e-06, + "loss": 0.0, + "num_input_tokens_seen": 2803952, + "step": 14730 + }, + { + "epoch": 7.658523908523908, + "grad_norm": 6.192547152750194e-05, + "learning_rate": 7.899022955683188e-06, + "loss": 0.0, + "num_input_tokens_seen": 2804880, + "step": 14735 + }, + { + "epoch": 7.661122661122661, + "grad_norm": 0.00030485555180348456, + "learning_rate": 7.88248734434e-06, + "loss": 0.0, + "num_input_tokens_seen": 2805808, + "step": 14740 + }, + { + "epoch": 7.663721413721413, + "grad_norm": 0.00019244557188358158, + "learning_rate": 7.865965818917149e-06, + "loss": 0.0, + "num_input_tokens_seen": 2806736, + "step": 14745 + }, + { + "epoch": 7.666320166320166, + "grad_norm": 7.2401442527771, + "learning_rate": 7.849458393010103e-06, + "loss": 0.1407, + "num_input_tokens_seen": 2807728, + "step": 14750 + }, + { + "epoch": 7.668918918918919, + "grad_norm": 4.4514541514217854e-05, + "learning_rate": 7.832965080202762e-06, + "loss": 0.0, + "num_input_tokens_seen": 2808624, + "step": 14755 + }, + { + "epoch": 7.671517671517671, + "grad_norm": 0.0005228405352681875, + "learning_rate": 7.816485894067382e-06, + "loss": 0.0, + "num_input_tokens_seen": 2809616, + "step": 14760 + }, + { + "epoch": 7.674116424116424, + "grad_norm": 0.0016098467167466879, + "learning_rate": 7.800020848164615e-06, + "loss": 0.0, + "num_input_tokens_seen": 2810576, + "step": 14765 + }, + { + "epoch": 7.6767151767151764, + "grad_norm": 0.0012963636545464396, + "learning_rate": 7.78356995604346e-06, + "loss": 0.0, + "num_input_tokens_seen": 2811536, + "step": 14770 + }, + { + "epoch": 7.679313929313929, + "grad_norm": 0.0001933698367793113, + "learning_rate": 7.767133231241288e-06, + "loss": 0.0, + "num_input_tokens_seen": 2812432, + "step": 14775 + }, + { + "epoch": 7.6819126819126815, + "grad_norm": 0.000939248304348439, + "learning_rate": 7.750710687283793e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2813360, + "step": 14780 + }, + { + "epoch": 7.6845114345114345, + "grad_norm": 0.0002500144764780998, + "learning_rate": 7.734302337685018e-06, + "loss": 0.0, + "num_input_tokens_seen": 2814288, + "step": 14785 + }, + { + "epoch": 7.6871101871101875, + "grad_norm": 4.24457284680102e-05, + "learning_rate": 7.717908195947316e-06, + "loss": 0.0141, + "num_input_tokens_seen": 2815248, + "step": 14790 + }, + { + "epoch": 7.6897089397089395, + "grad_norm": 4.304630419937894e-05, + "learning_rate": 7.701528275561348e-06, + "loss": 0.0, + "num_input_tokens_seen": 2816272, + "step": 14795 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.0019319865386933088, + "learning_rate": 7.68516259000607e-06, + "loss": 0.0, + "num_input_tokens_seen": 2817232, + "step": 14800 + }, + { + "epoch": 7.694906444906445, + "grad_norm": 10.206271171569824, + "learning_rate": 7.668811152748723e-06, + "loss": 0.1224, + "num_input_tokens_seen": 2818224, + "step": 14805 + }, + { + "epoch": 7.697505197505198, + "grad_norm": 0.003400087356567383, + "learning_rate": 7.652473977244837e-06, + "loss": 0.0, + "num_input_tokens_seen": 2819184, + "step": 14810 + }, + { + "epoch": 7.70010395010395, + "grad_norm": 0.0003279370430391282, + "learning_rate": 7.636151076938185e-06, + "loss": 0.0, + "num_input_tokens_seen": 2820208, + "step": 14815 + }, + { + "epoch": 7.702702702702703, + "grad_norm": 0.003811789210885763, + "learning_rate": 7.619842465260824e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2821232, + "step": 14820 + }, + { + "epoch": 7.705301455301456, + "grad_norm": 0.0062790666706860065, + "learning_rate": 7.6035481556330195e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2822128, + "step": 14825 + }, + { + "epoch": 7.707900207900208, + "grad_norm": 4.550227095023729e-05, + "learning_rate": 7.587268161463274e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2823120, + "step": 14830 + }, + { + "epoch": 7.710498960498961, + "grad_norm": 0.00018216847092844546, + "learning_rate": 7.571002496148338e-06, + "loss": 0.0, + "num_input_tokens_seen": 2824080, + "step": 14835 + }, + { + "epoch": 7.713097713097713, + "grad_norm": 0.008256045170128345, + "learning_rate": 7.554751173073133e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2825040, + "step": 14840 + }, + { + "epoch": 7.715696465696466, + "grad_norm": 4.216572779114358e-05, + "learning_rate": 7.538514205610808e-06, + "loss": 0.0004, + "num_input_tokens_seen": 2826064, + "step": 14845 + }, + { + "epoch": 7.718295218295218, + "grad_norm": 0.0005861244862899184, + "learning_rate": 7.522291607122678e-06, + "loss": 0.0, + "num_input_tokens_seen": 2827152, + "step": 14850 + }, + { + "epoch": 7.720893970893971, + "grad_norm": 0.0016688051400706172, + "learning_rate": 7.506083390958252e-06, + "loss": 0.0, + "num_input_tokens_seen": 2828144, + "step": 14855 + }, + { + "epoch": 7.723492723492724, + "grad_norm": 0.004752944689244032, + "learning_rate": 7.489889570455191e-06, + "loss": 0.0007, + "num_input_tokens_seen": 2829104, + "step": 14860 + }, + { + "epoch": 7.726091476091476, + "grad_norm": 3.859998469124548e-05, + "learning_rate": 7.473710158939307e-06, + "loss": 0.0, + "num_input_tokens_seen": 2830064, + "step": 14865 + }, + { + "epoch": 7.728690228690229, + "grad_norm": 3.97497060475871e-05, + "learning_rate": 7.45754516972457e-06, + "loss": 0.0, + "num_input_tokens_seen": 2831024, + "step": 14870 + }, + { + "epoch": 7.731288981288982, + "grad_norm": 3.412437581573613e-05, + "learning_rate": 7.441394616113062e-06, + "loss": 0.0, + "num_input_tokens_seen": 2831952, + "step": 14875 + }, + { + "epoch": 7.733887733887734, + "grad_norm": 0.00045998027781024575, + "learning_rate": 7.425258511395014e-06, + "loss": 0.0, + "num_input_tokens_seen": 2832880, + "step": 14880 + }, + { + "epoch": 7.736486486486487, + "grad_norm": 0.0009271397721022367, + "learning_rate": 7.409136868848735e-06, + "loss": 0.0, + "num_input_tokens_seen": 2833840, + "step": 14885 + }, + { + "epoch": 7.739085239085239, + "grad_norm": 0.0007212624186649919, + "learning_rate": 7.393029701740667e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2834800, + "step": 14890 + }, + { + "epoch": 7.741683991683992, + "grad_norm": 0.0022401143796741962, + "learning_rate": 7.376937023325298e-06, + "loss": 0.0, + "num_input_tokens_seen": 2835696, + "step": 14895 + }, + { + "epoch": 7.744282744282744, + "grad_norm": 0.0002478630340192467, + "learning_rate": 7.360858846845234e-06, + "loss": 0.0, + "num_input_tokens_seen": 2836720, + "step": 14900 + }, + { + "epoch": 7.746881496881497, + "grad_norm": 0.0004898247425444424, + "learning_rate": 7.344795185531117e-06, + "loss": 0.0, + "num_input_tokens_seen": 2837712, + "step": 14905 + }, + { + "epoch": 7.74948024948025, + "grad_norm": 4.3030795495724306e-05, + "learning_rate": 7.328746052601665e-06, + "loss": 0.0, + "num_input_tokens_seen": 2838640, + "step": 14910 + }, + { + "epoch": 7.752079002079002, + "grad_norm": 4.2155741539318115e-05, + "learning_rate": 7.312711461263647e-06, + "loss": 0.0, + "num_input_tokens_seen": 2839600, + "step": 14915 + }, + { + "epoch": 7.754677754677755, + "grad_norm": 0.001982914749532938, + "learning_rate": 7.296691424711826e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2840592, + "step": 14920 + }, + { + "epoch": 7.757276507276507, + "grad_norm": 0.017623968422412872, + "learning_rate": 7.280685956129049e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2841552, + "step": 14925 + }, + { + "epoch": 7.75987525987526, + "grad_norm": 0.001988411648198962, + "learning_rate": 7.2646950686861056e-06, + "loss": 0.0011, + "num_input_tokens_seen": 2842576, + "step": 14930 + }, + { + "epoch": 7.762474012474012, + "grad_norm": 0.0009008675697259605, + "learning_rate": 7.248718775541841e-06, + "loss": 0.0, + "num_input_tokens_seen": 2843504, + "step": 14935 + }, + { + "epoch": 7.765072765072765, + "grad_norm": 3.931903120246716e-05, + "learning_rate": 7.232757089843062e-06, + "loss": 0.0, + "num_input_tokens_seen": 2844496, + "step": 14940 + }, + { + "epoch": 7.767671517671518, + "grad_norm": 0.0032705459743738174, + "learning_rate": 7.216810024724574e-06, + "loss": 0.0, + "num_input_tokens_seen": 2845392, + "step": 14945 + }, + { + "epoch": 7.77027027027027, + "grad_norm": 0.003867857391014695, + "learning_rate": 7.20087759330913e-06, + "loss": 0.1026, + "num_input_tokens_seen": 2846384, + "step": 14950 + }, + { + "epoch": 7.772869022869023, + "grad_norm": 0.0029640512075275183, + "learning_rate": 7.1849598087074645e-06, + "loss": 0.0, + "num_input_tokens_seen": 2847312, + "step": 14955 + }, + { + "epoch": 7.775467775467775, + "grad_norm": 0.0003906809724867344, + "learning_rate": 7.169056684018244e-06, + "loss": 0.0, + "num_input_tokens_seen": 2848208, + "step": 14960 + }, + { + "epoch": 7.778066528066528, + "grad_norm": 4.860025882720947, + "learning_rate": 7.153168232328067e-06, + "loss": 0.0023, + "num_input_tokens_seen": 2849136, + "step": 14965 + }, + { + "epoch": 7.78066528066528, + "grad_norm": 0.007645154371857643, + "learning_rate": 7.137294466711475e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2850032, + "step": 14970 + }, + { + "epoch": 7.783264033264033, + "grad_norm": 5.542107828659937e-05, + "learning_rate": 7.121435400230905e-06, + "loss": 0.0335, + "num_input_tokens_seen": 2851088, + "step": 14975 + }, + { + "epoch": 7.785862785862786, + "grad_norm": 0.015223706141114235, + "learning_rate": 7.105591045936722e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2851984, + "step": 14980 + }, + { + "epoch": 7.788461538461538, + "grad_norm": 3.402736911084503e-05, + "learning_rate": 7.089761416867153e-06, + "loss": 0.0, + "num_input_tokens_seen": 2852976, + "step": 14985 + }, + { + "epoch": 7.791060291060291, + "grad_norm": 0.003968740347772837, + "learning_rate": 7.073946526048342e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2854000, + "step": 14990 + }, + { + "epoch": 7.793659043659043, + "grad_norm": 0.0018506021006032825, + "learning_rate": 7.05814638649428e-06, + "loss": 0.0612, + "num_input_tokens_seen": 2854992, + "step": 14995 + }, + { + "epoch": 7.796257796257796, + "grad_norm": 0.04959118366241455, + "learning_rate": 7.042361011206819e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2855888, + "step": 15000 + }, + { + "epoch": 7.798856548856548, + "grad_norm": 0.004259691573679447, + "learning_rate": 7.026590413175685e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2856848, + "step": 15005 + }, + { + "epoch": 7.801455301455301, + "grad_norm": 0.001659959671087563, + "learning_rate": 7.010834605378414e-06, + "loss": 0.0, + "num_input_tokens_seen": 2857808, + "step": 15010 + }, + { + "epoch": 7.804054054054054, + "grad_norm": 9.817745740292594e-05, + "learning_rate": 6.995093600780403e-06, + "loss": 0.0, + "num_input_tokens_seen": 2858736, + "step": 15015 + }, + { + "epoch": 7.8066528066528065, + "grad_norm": 4.148423613514751e-05, + "learning_rate": 6.979367412334839e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2859728, + "step": 15020 + }, + { + "epoch": 7.8092515592515594, + "grad_norm": 0.0008250129758380353, + "learning_rate": 6.963656052982731e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2860688, + "step": 15025 + }, + { + "epoch": 7.8118503118503115, + "grad_norm": 0.0006480899755842984, + "learning_rate": 6.947959535652873e-06, + "loss": 0.0026, + "num_input_tokens_seen": 2861616, + "step": 15030 + }, + { + "epoch": 7.8144490644490645, + "grad_norm": 0.0028243747074157, + "learning_rate": 6.932277873261864e-06, + "loss": 0.0, + "num_input_tokens_seen": 2862544, + "step": 15035 + }, + { + "epoch": 7.817047817047817, + "grad_norm": 13.873856544494629, + "learning_rate": 6.916611078714077e-06, + "loss": 0.0096, + "num_input_tokens_seen": 2863472, + "step": 15040 + }, + { + "epoch": 7.81964656964657, + "grad_norm": 0.00025363379972986877, + "learning_rate": 6.9009591649016295e-06, + "loss": 0.0, + "num_input_tokens_seen": 2864400, + "step": 15045 + }, + { + "epoch": 7.8222453222453225, + "grad_norm": 4.4475724280346185e-05, + "learning_rate": 6.88532214470442e-06, + "loss": 0.0174, + "num_input_tokens_seen": 2865360, + "step": 15050 + }, + { + "epoch": 7.824844074844075, + "grad_norm": 0.0008718190365470946, + "learning_rate": 6.86970003099007e-06, + "loss": 0.0, + "num_input_tokens_seen": 2866256, + "step": 15055 + }, + { + "epoch": 7.827442827442828, + "grad_norm": 0.003184274071827531, + "learning_rate": 6.854092836613948e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2867184, + "step": 15060 + }, + { + "epoch": 7.83004158004158, + "grad_norm": 0.0010416395962238312, + "learning_rate": 6.838500574419129e-06, + "loss": 0.0, + "num_input_tokens_seen": 2868176, + "step": 15065 + }, + { + "epoch": 7.832640332640333, + "grad_norm": 0.26422739028930664, + "learning_rate": 6.822923257236427e-06, + "loss": 0.0013, + "num_input_tokens_seen": 2869168, + "step": 15070 + }, + { + "epoch": 7.835239085239085, + "grad_norm": 0.00015734757471363991, + "learning_rate": 6.80736089788433e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2870160, + "step": 15075 + }, + { + "epoch": 7.837837837837838, + "grad_norm": 0.002753407694399357, + "learning_rate": 6.7918135091690454e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2871248, + "step": 15080 + }, + { + "epoch": 7.840436590436591, + "grad_norm": 0.0011027028085663915, + "learning_rate": 6.776281103884427e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2872176, + "step": 15085 + }, + { + "epoch": 7.843035343035343, + "grad_norm": 5.805546970805153e-05, + "learning_rate": 6.7607636948120364e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2873104, + "step": 15090 + }, + { + "epoch": 7.845634095634096, + "grad_norm": 0.00039908484905026853, + "learning_rate": 6.745261294721067e-06, + "loss": 0.0, + "num_input_tokens_seen": 2874128, + "step": 15095 + }, + { + "epoch": 7.848232848232849, + "grad_norm": 0.0006577758467756212, + "learning_rate": 6.729773916368365e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2875056, + "step": 15100 + }, + { + "epoch": 7.850831600831601, + "grad_norm": 0.0001403926726197824, + "learning_rate": 6.714301572498435e-06, + "loss": 0.0, + "num_input_tokens_seen": 2875952, + "step": 15105 + }, + { + "epoch": 7.853430353430354, + "grad_norm": 0.00028973992448300123, + "learning_rate": 6.6988442758433805e-06, + "loss": 0.0, + "num_input_tokens_seen": 2876880, + "step": 15110 + }, + { + "epoch": 7.856029106029106, + "grad_norm": 0.0002996415423694998, + "learning_rate": 6.683402039122949e-06, + "loss": 0.0, + "num_input_tokens_seen": 2877872, + "step": 15115 + }, + { + "epoch": 7.858627858627859, + "grad_norm": 3.995071165263653e-05, + "learning_rate": 6.667974875044483e-06, + "loss": 0.0, + "num_input_tokens_seen": 2878832, + "step": 15120 + }, + { + "epoch": 7.861226611226611, + "grad_norm": 0.0010888917604461312, + "learning_rate": 6.652562796302913e-06, + "loss": 0.0, + "num_input_tokens_seen": 2879760, + "step": 15125 + }, + { + "epoch": 7.863825363825364, + "grad_norm": 3.5660261346492916e-05, + "learning_rate": 6.637165815580782e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2880752, + "step": 15130 + }, + { + "epoch": 7.866424116424117, + "grad_norm": 0.00012275055632926524, + "learning_rate": 6.621783945548174e-06, + "loss": 0.0, + "num_input_tokens_seen": 2881680, + "step": 15135 + }, + { + "epoch": 7.869022869022869, + "grad_norm": 0.020609507337212563, + "learning_rate": 6.6064171988627775e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2882608, + "step": 15140 + }, + { + "epoch": 7.871621621621622, + "grad_norm": 3.981278496212326e-05, + "learning_rate": 6.591065588169795e-06, + "loss": 0.0, + "num_input_tokens_seen": 2883536, + "step": 15145 + }, + { + "epoch": 7.874220374220374, + "grad_norm": 0.000540028850082308, + "learning_rate": 6.5757291261020145e-06, + "loss": 0.0, + "num_input_tokens_seen": 2884496, + "step": 15150 + }, + { + "epoch": 7.876819126819127, + "grad_norm": 0.00039036059752106667, + "learning_rate": 6.5604078252797265e-06, + "loss": 0.0, + "num_input_tokens_seen": 2885488, + "step": 15155 + }, + { + "epoch": 7.879417879417879, + "grad_norm": 0.006823372561484575, + "learning_rate": 6.545101698310755e-06, + "loss": 0.0, + "num_input_tokens_seen": 2886448, + "step": 15160 + }, + { + "epoch": 7.882016632016632, + "grad_norm": 0.0003212781739421189, + "learning_rate": 6.5298107577904474e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2887408, + "step": 15165 + }, + { + "epoch": 7.884615384615385, + "grad_norm": 0.004585127346217632, + "learning_rate": 6.514535016301637e-06, + "loss": 0.0, + "num_input_tokens_seen": 2888304, + "step": 15170 + }, + { + "epoch": 7.887214137214137, + "grad_norm": 0.000386605883250013, + "learning_rate": 6.499274486414672e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2889200, + "step": 15175 + }, + { + "epoch": 7.88981288981289, + "grad_norm": 0.0002000716485781595, + "learning_rate": 6.484029180687357e-06, + "loss": 0.0, + "num_input_tokens_seen": 2890128, + "step": 15180 + }, + { + "epoch": 7.892411642411642, + "grad_norm": 0.025518540292978287, + "learning_rate": 6.468799111665003e-06, + "loss": 0.0051, + "num_input_tokens_seen": 2891056, + "step": 15185 + }, + { + "epoch": 7.895010395010395, + "grad_norm": 0.0009962673066183925, + "learning_rate": 6.4535842918803326e-06, + "loss": 0.0, + "num_input_tokens_seen": 2892048, + "step": 15190 + }, + { + "epoch": 7.897609147609147, + "grad_norm": 0.009834659285843372, + "learning_rate": 6.4383847338535725e-06, + "loss": 0.0, + "num_input_tokens_seen": 2893072, + "step": 15195 + }, + { + "epoch": 7.9002079002079, + "grad_norm": 3.8559490349143744e-05, + "learning_rate": 6.423200450092351e-06, + "loss": 0.0, + "num_input_tokens_seen": 2894000, + "step": 15200 + }, + { + "epoch": 7.902806652806653, + "grad_norm": 0.00351192825473845, + "learning_rate": 6.4080314530917565e-06, + "loss": 0.0, + "num_input_tokens_seen": 2894992, + "step": 15205 + }, + { + "epoch": 7.905405405405405, + "grad_norm": 0.009929636493325233, + "learning_rate": 6.392877755334276e-06, + "loss": 0.0, + "num_input_tokens_seen": 2895920, + "step": 15210 + }, + { + "epoch": 7.908004158004158, + "grad_norm": 0.1230366975069046, + "learning_rate": 6.377739369289815e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2896880, + "step": 15215 + }, + { + "epoch": 7.91060291060291, + "grad_norm": 4.600924512487836e-05, + "learning_rate": 6.362616307415703e-06, + "loss": 0.0, + "num_input_tokens_seen": 2897872, + "step": 15220 + }, + { + "epoch": 7.913201663201663, + "grad_norm": 0.007152107544243336, + "learning_rate": 6.3475085821566e-06, + "loss": 0.0, + "num_input_tokens_seen": 2898832, + "step": 15225 + }, + { + "epoch": 7.915800415800415, + "grad_norm": 0.0010033701546490192, + "learning_rate": 6.332416205944611e-06, + "loss": 0.0, + "num_input_tokens_seen": 2899792, + "step": 15230 + }, + { + "epoch": 7.918399168399168, + "grad_norm": 0.0013612378388643265, + "learning_rate": 6.3173391911991595e-06, + "loss": 0.0185, + "num_input_tokens_seen": 2900752, + "step": 15235 + }, + { + "epoch": 7.920997920997921, + "grad_norm": 0.0002933363721240312, + "learning_rate": 6.3022775503270656e-06, + "loss": 0.0, + "num_input_tokens_seen": 2901712, + "step": 15240 + }, + { + "epoch": 7.923596673596673, + "grad_norm": 0.00011634729889919981, + "learning_rate": 6.28723129572247e-06, + "loss": 0.0, + "num_input_tokens_seen": 2902672, + "step": 15245 + }, + { + "epoch": 7.926195426195426, + "grad_norm": 0.0005557405529543757, + "learning_rate": 6.272200439766882e-06, + "loss": 0.0, + "num_input_tokens_seen": 2903664, + "step": 15250 + }, + { + "epoch": 7.9287941787941785, + "grad_norm": 0.005288934335112572, + "learning_rate": 6.257184994829108e-06, + "loss": 0.0404, + "num_input_tokens_seen": 2904624, + "step": 15255 + }, + { + "epoch": 7.9313929313929314, + "grad_norm": 0.0010106870904564857, + "learning_rate": 6.242184973265283e-06, + "loss": 0.0, + "num_input_tokens_seen": 2905520, + "step": 15260 + }, + { + "epoch": 7.9339916839916835, + "grad_norm": 0.00020805852545890957, + "learning_rate": 6.227200387418869e-06, + "loss": 0.0, + "num_input_tokens_seen": 2906448, + "step": 15265 + }, + { + "epoch": 7.9365904365904365, + "grad_norm": 15.660173416137695, + "learning_rate": 6.212231249620595e-06, + "loss": 0.0307, + "num_input_tokens_seen": 2907312, + "step": 15270 + }, + { + "epoch": 7.9391891891891895, + "grad_norm": 0.0015859012492001057, + "learning_rate": 6.197277572188509e-06, + "loss": 0.0, + "num_input_tokens_seen": 2908240, + "step": 15275 + }, + { + "epoch": 7.941787941787942, + "grad_norm": 6.875916733406484e-05, + "learning_rate": 6.182339367427906e-06, + "loss": 0.0, + "num_input_tokens_seen": 2909136, + "step": 15280 + }, + { + "epoch": 7.9443866943866945, + "grad_norm": 0.0016106362454593182, + "learning_rate": 6.16741664763138e-06, + "loss": 0.0, + "num_input_tokens_seen": 2910032, + "step": 15285 + }, + { + "epoch": 7.946985446985447, + "grad_norm": 4.6209333959268406e-05, + "learning_rate": 6.152509425078759e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2910992, + "step": 15290 + }, + { + "epoch": 7.9495841995842, + "grad_norm": 0.0013789376243948936, + "learning_rate": 6.137617712037116e-06, + "loss": 0.0, + "num_input_tokens_seen": 2911920, + "step": 15295 + }, + { + "epoch": 7.952182952182953, + "grad_norm": 4.613691271515563e-05, + "learning_rate": 6.122741520760791e-06, + "loss": 0.0245, + "num_input_tokens_seen": 2912944, + "step": 15300 + }, + { + "epoch": 7.954781704781705, + "grad_norm": 4.331399395596236e-05, + "learning_rate": 6.1078808634913165e-06, + "loss": 0.0, + "num_input_tokens_seen": 2913872, + "step": 15305 + }, + { + "epoch": 7.957380457380458, + "grad_norm": 4.037090184283443e-05, + "learning_rate": 6.093035752457468e-06, + "loss": 0.0, + "num_input_tokens_seen": 2914800, + "step": 15310 + }, + { + "epoch": 7.95997920997921, + "grad_norm": 0.00012950737436767668, + "learning_rate": 6.078206199875211e-06, + "loss": 0.0, + "num_input_tokens_seen": 2915728, + "step": 15315 + }, + { + "epoch": 7.962577962577963, + "grad_norm": 0.0003076647990383208, + "learning_rate": 6.063392217947714e-06, + "loss": 0.0, + "num_input_tokens_seen": 2916752, + "step": 15320 + }, + { + "epoch": 7.965176715176716, + "grad_norm": 0.00013686141755897552, + "learning_rate": 6.048593818865328e-06, + "loss": 0.0, + "num_input_tokens_seen": 2917648, + "step": 15325 + }, + { + "epoch": 7.967775467775468, + "grad_norm": 0.0010110853472724557, + "learning_rate": 6.033811014805599e-06, + "loss": 0.0, + "num_input_tokens_seen": 2918640, + "step": 15330 + }, + { + "epoch": 7.970374220374221, + "grad_norm": 0.00047971567255444825, + "learning_rate": 6.019043817933212e-06, + "loss": 0.0, + "num_input_tokens_seen": 2919536, + "step": 15335 + }, + { + "epoch": 7.972972972972973, + "grad_norm": 0.0005016656359657645, + "learning_rate": 6.004292240400031e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2920464, + "step": 15340 + }, + { + "epoch": 7.975571725571726, + "grad_norm": 0.0007475215825252235, + "learning_rate": 5.989556294345067e-06, + "loss": 0.145, + "num_input_tokens_seen": 2921392, + "step": 15345 + }, + { + "epoch": 7.978170478170478, + "grad_norm": 3.5157012462150306e-05, + "learning_rate": 5.9748359918944504e-06, + "loss": 0.0, + "num_input_tokens_seen": 2922384, + "step": 15350 + }, + { + "epoch": 7.980769230769231, + "grad_norm": 4.4957359932595864e-05, + "learning_rate": 5.960131345161454e-06, + "loss": 0.0, + "num_input_tokens_seen": 2923280, + "step": 15355 + }, + { + "epoch": 7.983367983367984, + "grad_norm": 0.00010367407958256081, + "learning_rate": 5.945442366246448e-06, + "loss": 0.0, + "num_input_tokens_seen": 2924176, + "step": 15360 + }, + { + "epoch": 7.985966735966736, + "grad_norm": 0.00019496274762786925, + "learning_rate": 5.930769067236944e-06, + "loss": 0.0, + "num_input_tokens_seen": 2925104, + "step": 15365 + }, + { + "epoch": 7.988565488565489, + "grad_norm": 0.00010897958418354392, + "learning_rate": 5.916111460207516e-06, + "loss": 0.0, + "num_input_tokens_seen": 2926096, + "step": 15370 + }, + { + "epoch": 7.991164241164241, + "grad_norm": 4.7705263568786904e-05, + "learning_rate": 5.901469557219849e-06, + "loss": 0.0, + "num_input_tokens_seen": 2927056, + "step": 15375 + }, + { + "epoch": 7.993762993762994, + "grad_norm": 0.00017580168787389994, + "learning_rate": 5.886843370322692e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2928016, + "step": 15380 + }, + { + "epoch": 7.996361746361746, + "grad_norm": 0.00040031757089309394, + "learning_rate": 5.872232911551859e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2928944, + "step": 15385 + }, + { + "epoch": 7.998960498960499, + "grad_norm": 5.857404539710842e-05, + "learning_rate": 5.85763819293024e-06, + "loss": 0.0, + "num_input_tokens_seen": 2929904, + "step": 15390 + }, + { + "epoch": 8.0, + "eval_loss": 0.4232122004032135, + "eval_runtime": 9.2482, + "eval_samples_per_second": 92.559, + "eval_steps_per_second": 23.14, + "num_input_tokens_seen": 2930240, + "step": 15392 + }, + { + "epoch": 8.001559251559252, + "grad_norm": 0.012791800312697887, + "learning_rate": 5.843059226467745e-06, + "loss": 0.0, + "num_input_tokens_seen": 2930784, + "step": 15395 + }, + { + "epoch": 8.004158004158004, + "grad_norm": 0.00018642937357071787, + "learning_rate": 5.828496024161353e-06, + "loss": 0.0, + "num_input_tokens_seen": 2931712, + "step": 15400 + }, + { + "epoch": 8.006756756756756, + "grad_norm": 0.0005576782859861851, + "learning_rate": 5.81394859799504e-06, + "loss": 0.0758, + "num_input_tokens_seen": 2932672, + "step": 15405 + }, + { + "epoch": 8.00935550935551, + "grad_norm": 0.0007308530039153993, + "learning_rate": 5.799416959939827e-06, + "loss": 0.0005, + "num_input_tokens_seen": 2933568, + "step": 15410 + }, + { + "epoch": 8.011954261954262, + "grad_norm": 0.000277509301668033, + "learning_rate": 5.784901121953723e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2934496, + "step": 15415 + }, + { + "epoch": 8.014553014553014, + "grad_norm": 0.00011251964315306395, + "learning_rate": 5.770401095981739e-06, + "loss": 0.0, + "num_input_tokens_seen": 2935424, + "step": 15420 + }, + { + "epoch": 8.017151767151768, + "grad_norm": 0.0022070494014769793, + "learning_rate": 5.755916893955887e-06, + "loss": 0.0, + "num_input_tokens_seen": 2936416, + "step": 15425 + }, + { + "epoch": 8.01975051975052, + "grad_norm": 3.8252968806773424e-05, + "learning_rate": 5.741448527795137e-06, + "loss": 0.0, + "num_input_tokens_seen": 2937312, + "step": 15430 + }, + { + "epoch": 8.022349272349272, + "grad_norm": 0.0010891449637711048, + "learning_rate": 5.726996009405455e-06, + "loss": 0.0, + "num_input_tokens_seen": 2938272, + "step": 15435 + }, + { + "epoch": 8.024948024948024, + "grad_norm": 0.0004446979146450758, + "learning_rate": 5.712559350679733e-06, + "loss": 0.0, + "num_input_tokens_seen": 2939264, + "step": 15440 + }, + { + "epoch": 8.027546777546778, + "grad_norm": 0.0003358771500643343, + "learning_rate": 5.698138563497854e-06, + "loss": 0.0, + "num_input_tokens_seen": 2940192, + "step": 15445 + }, + { + "epoch": 8.03014553014553, + "grad_norm": 4.207672827760689e-05, + "learning_rate": 5.683733659726581e-06, + "loss": 0.0, + "num_input_tokens_seen": 2941184, + "step": 15450 + }, + { + "epoch": 8.032744282744282, + "grad_norm": 0.03529589623212814, + "learning_rate": 5.669344651219663e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2942112, + "step": 15455 + }, + { + "epoch": 8.035343035343036, + "grad_norm": 0.0009074818808585405, + "learning_rate": 5.654971549817748e-06, + "loss": 0.0549, + "num_input_tokens_seen": 2943072, + "step": 15460 + }, + { + "epoch": 8.037941787941788, + "grad_norm": 0.001864942372776568, + "learning_rate": 5.640614367348385e-06, + "loss": 0.0, + "num_input_tokens_seen": 2943936, + "step": 15465 + }, + { + "epoch": 8.04054054054054, + "grad_norm": 0.0009933606488630176, + "learning_rate": 5.626273115626038e-06, + "loss": 0.0, + "num_input_tokens_seen": 2944960, + "step": 15470 + }, + { + "epoch": 8.043139293139292, + "grad_norm": 4.536345659289509e-05, + "learning_rate": 5.61194780645205e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2945856, + "step": 15475 + }, + { + "epoch": 8.045738045738046, + "grad_norm": 3.991768244304694e-05, + "learning_rate": 5.597638451614665e-06, + "loss": 0.0, + "num_input_tokens_seen": 2946784, + "step": 15480 + }, + { + "epoch": 8.048336798336798, + "grad_norm": 4.623997301678173e-05, + "learning_rate": 5.583345062888956e-06, + "loss": 0.0, + "num_input_tokens_seen": 2947744, + "step": 15485 + }, + { + "epoch": 8.05093555093555, + "grad_norm": 5.037779192207381e-05, + "learning_rate": 5.569067652036911e-06, + "loss": 0.0, + "num_input_tokens_seen": 2948736, + "step": 15490 + }, + { + "epoch": 8.053534303534304, + "grad_norm": 0.0005626599304378033, + "learning_rate": 5.5548062308073246e-06, + "loss": 0.0, + "num_input_tokens_seen": 2949664, + "step": 15495 + }, + { + "epoch": 8.056133056133056, + "grad_norm": 0.06433740258216858, + "learning_rate": 5.540560810935871e-06, + "loss": 0.0017, + "num_input_tokens_seen": 2950592, + "step": 15500 + }, + { + "epoch": 8.058731808731808, + "grad_norm": 3.4401095035718754e-05, + "learning_rate": 5.526331404145021e-06, + "loss": 0.0002, + "num_input_tokens_seen": 2951616, + "step": 15505 + }, + { + "epoch": 8.06133056133056, + "grad_norm": 0.0008488184539601207, + "learning_rate": 5.512118022144105e-06, + "loss": 0.0, + "num_input_tokens_seen": 2952512, + "step": 15510 + }, + { + "epoch": 8.063929313929314, + "grad_norm": 4.092829112778418e-05, + "learning_rate": 5.497920676629234e-06, + "loss": 0.0, + "num_input_tokens_seen": 2953440, + "step": 15515 + }, + { + "epoch": 8.066528066528067, + "grad_norm": 0.0004149024607613683, + "learning_rate": 5.483739379283337e-06, + "loss": 0.0, + "num_input_tokens_seen": 2954400, + "step": 15520 + }, + { + "epoch": 8.069126819126819, + "grad_norm": 0.0020355137530714273, + "learning_rate": 5.469574141776146e-06, + "loss": 0.0, + "num_input_tokens_seen": 2955328, + "step": 15525 + }, + { + "epoch": 8.071725571725572, + "grad_norm": 7.657004607608542e-05, + "learning_rate": 5.455424975764156e-06, + "loss": 0.0, + "num_input_tokens_seen": 2956320, + "step": 15530 + }, + { + "epoch": 8.074324324324325, + "grad_norm": 0.00011696209548972547, + "learning_rate": 5.4412918928906625e-06, + "loss": 0.0, + "num_input_tokens_seen": 2957248, + "step": 15535 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 8.324118243763223e-05, + "learning_rate": 5.4271749047856975e-06, + "loss": 0.0, + "num_input_tokens_seen": 2958208, + "step": 15540 + }, + { + "epoch": 8.079521829521829, + "grad_norm": 0.016283202916383743, + "learning_rate": 5.413074023066081e-06, + "loss": 0.0, + "num_input_tokens_seen": 2959200, + "step": 15545 + }, + { + "epoch": 8.082120582120583, + "grad_norm": 0.007412572391331196, + "learning_rate": 5.398989259335352e-06, + "loss": 0.0, + "num_input_tokens_seen": 2960224, + "step": 15550 + }, + { + "epoch": 8.084719334719335, + "grad_norm": 0.0003030497464351356, + "learning_rate": 5.38492062518379e-06, + "loss": 0.0, + "num_input_tokens_seen": 2961184, + "step": 15555 + }, + { + "epoch": 8.087318087318087, + "grad_norm": 0.01047906931489706, + "learning_rate": 5.37086813218842e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2962112, + "step": 15560 + }, + { + "epoch": 8.08991683991684, + "grad_norm": 0.0005186541238799691, + "learning_rate": 5.356831791912958e-06, + "loss": 0.0, + "num_input_tokens_seen": 2963136, + "step": 15565 + }, + { + "epoch": 8.092515592515593, + "grad_norm": 0.0006474682595580816, + "learning_rate": 5.342811615907861e-06, + "loss": 0.0, + "num_input_tokens_seen": 2964096, + "step": 15570 + }, + { + "epoch": 8.095114345114345, + "grad_norm": 4.428795728017576e-05, + "learning_rate": 5.328807615710246e-06, + "loss": 0.0, + "num_input_tokens_seen": 2964992, + "step": 15575 + }, + { + "epoch": 8.097713097713097, + "grad_norm": 0.16496160626411438, + "learning_rate": 5.3148198028439565e-06, + "loss": 0.0008, + "num_input_tokens_seen": 2965856, + "step": 15580 + }, + { + "epoch": 8.10031185031185, + "grad_norm": 0.00036891590571030974, + "learning_rate": 5.300848188819491e-06, + "loss": 0.0687, + "num_input_tokens_seen": 2966784, + "step": 15585 + }, + { + "epoch": 8.102910602910603, + "grad_norm": 6.131286500021815e-05, + "learning_rate": 5.286892785134012e-06, + "loss": 0.0, + "num_input_tokens_seen": 2967680, + "step": 15590 + }, + { + "epoch": 8.105509355509355, + "grad_norm": 0.00012212475121486932, + "learning_rate": 5.272953603271375e-06, + "loss": 0.0, + "num_input_tokens_seen": 2968704, + "step": 15595 + }, + { + "epoch": 8.108108108108109, + "grad_norm": 0.025939904153347015, + "learning_rate": 5.259030654702052e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2969696, + "step": 15600 + }, + { + "epoch": 8.11070686070686, + "grad_norm": 0.0005396409542299807, + "learning_rate": 5.2451239508831824e-06, + "loss": 0.0, + "num_input_tokens_seen": 2970688, + "step": 15605 + }, + { + "epoch": 8.113305613305613, + "grad_norm": 5.7376189943170175e-05, + "learning_rate": 5.231233503258523e-06, + "loss": 0.0, + "num_input_tokens_seen": 2971648, + "step": 15610 + }, + { + "epoch": 8.115904365904365, + "grad_norm": 0.002456552814692259, + "learning_rate": 5.217359323258459e-06, + "loss": 0.0, + "num_input_tokens_seen": 2972640, + "step": 15615 + }, + { + "epoch": 8.118503118503119, + "grad_norm": 0.00022098574845585972, + "learning_rate": 5.203501422299975e-06, + "loss": 0.0, + "num_input_tokens_seen": 2973568, + "step": 15620 + }, + { + "epoch": 8.121101871101871, + "grad_norm": 0.0005666579236276448, + "learning_rate": 5.1896598117866925e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2974432, + "step": 15625 + }, + { + "epoch": 8.123700623700623, + "grad_norm": 0.0024879444390535355, + "learning_rate": 5.1758345031087895e-06, + "loss": 0.0, + "num_input_tokens_seen": 2975328, + "step": 15630 + }, + { + "epoch": 8.126299376299377, + "grad_norm": 0.000908825546503067, + "learning_rate": 5.162025507643057e-06, + "loss": 0.0711, + "num_input_tokens_seen": 2976320, + "step": 15635 + }, + { + "epoch": 8.128898128898129, + "grad_norm": 0.0004931109142489731, + "learning_rate": 5.148232836752856e-06, + "loss": 0.0, + "num_input_tokens_seen": 2977280, + "step": 15640 + }, + { + "epoch": 8.131496881496881, + "grad_norm": 7.961010851431638e-05, + "learning_rate": 5.134456501788104e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2978272, + "step": 15645 + }, + { + "epoch": 8.134095634095635, + "grad_norm": 4.7027653636178e-05, + "learning_rate": 5.1206965140852825e-06, + "loss": 0.0, + "num_input_tokens_seen": 2979232, + "step": 15650 + }, + { + "epoch": 8.136694386694387, + "grad_norm": 0.03642561286687851, + "learning_rate": 5.106952884967417e-06, + "loss": 0.0003, + "num_input_tokens_seen": 2980224, + "step": 15655 + }, + { + "epoch": 8.13929313929314, + "grad_norm": 0.001769631402567029, + "learning_rate": 5.093225625744083e-06, + "loss": 0.0, + "num_input_tokens_seen": 2981120, + "step": 15660 + }, + { + "epoch": 8.141891891891891, + "grad_norm": 0.006143126171082258, + "learning_rate": 5.079514747711367e-06, + "loss": 0.0, + "num_input_tokens_seen": 2982144, + "step": 15665 + }, + { + "epoch": 8.144490644490645, + "grad_norm": 0.0032091261819005013, + "learning_rate": 5.065820262151899e-06, + "loss": 0.0, + "num_input_tokens_seen": 2983104, + "step": 15670 + }, + { + "epoch": 8.147089397089397, + "grad_norm": 5.9714086091844365e-05, + "learning_rate": 5.052142180334799e-06, + "loss": 0.0, + "num_input_tokens_seen": 2984000, + "step": 15675 + }, + { + "epoch": 8.14968814968815, + "grad_norm": 0.00030512703233398497, + "learning_rate": 5.038480513515689e-06, + "loss": 0.0, + "num_input_tokens_seen": 2984992, + "step": 15680 + }, + { + "epoch": 8.152286902286903, + "grad_norm": 0.006438937969505787, + "learning_rate": 5.024835272936704e-06, + "loss": 0.0, + "num_input_tokens_seen": 2985920, + "step": 15685 + }, + { + "epoch": 8.154885654885655, + "grad_norm": 0.0009361268603242934, + "learning_rate": 5.011206469826435e-06, + "loss": 0.0, + "num_input_tokens_seen": 2986880, + "step": 15690 + }, + { + "epoch": 8.157484407484407, + "grad_norm": 0.0007907726685516536, + "learning_rate": 4.9975941153999725e-06, + "loss": 0.0, + "num_input_tokens_seen": 2987840, + "step": 15695 + }, + { + "epoch": 8.16008316008316, + "grad_norm": 0.0024236789904534817, + "learning_rate": 4.983998220858846e-06, + "loss": 0.0, + "num_input_tokens_seen": 2988768, + "step": 15700 + }, + { + "epoch": 8.162681912681913, + "grad_norm": 0.0007357804570347071, + "learning_rate": 4.9704187973910635e-06, + "loss": 0.0, + "num_input_tokens_seen": 2989696, + "step": 15705 + }, + { + "epoch": 8.165280665280665, + "grad_norm": 0.0018026040634140372, + "learning_rate": 4.956855856171067e-06, + "loss": 0.0, + "num_input_tokens_seen": 2990752, + "step": 15710 + }, + { + "epoch": 8.167879417879417, + "grad_norm": 0.0009418597328476608, + "learning_rate": 4.9433094083597256e-06, + "loss": 0.0, + "num_input_tokens_seen": 2991616, + "step": 15715 + }, + { + "epoch": 8.170478170478171, + "grad_norm": 0.012179587036371231, + "learning_rate": 4.929779465104365e-06, + "loss": 0.0, + "num_input_tokens_seen": 2992512, + "step": 15720 + }, + { + "epoch": 8.173076923076923, + "grad_norm": 0.013293541967868805, + "learning_rate": 4.916266037538691e-06, + "loss": 0.0071, + "num_input_tokens_seen": 2993536, + "step": 15725 + }, + { + "epoch": 8.175675675675675, + "grad_norm": 0.009222462773323059, + "learning_rate": 4.902769136782859e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2994496, + "step": 15730 + }, + { + "epoch": 8.178274428274428, + "grad_norm": 0.00035055362968705595, + "learning_rate": 4.88928877394339e-06, + "loss": 0.0, + "num_input_tokens_seen": 2995392, + "step": 15735 + }, + { + "epoch": 8.180873180873181, + "grad_norm": 0.00022688678291160613, + "learning_rate": 4.875824960113231e-06, + "loss": 0.0, + "num_input_tokens_seen": 2996288, + "step": 15740 + }, + { + "epoch": 8.183471933471933, + "grad_norm": 0.0008334351587109268, + "learning_rate": 4.862377706371665e-06, + "loss": 0.0001, + "num_input_tokens_seen": 2997184, + "step": 15745 + }, + { + "epoch": 8.186070686070686, + "grad_norm": 8.885620627552271e-05, + "learning_rate": 4.848947023784389e-06, + "loss": 0.0, + "num_input_tokens_seen": 2998144, + "step": 15750 + }, + { + "epoch": 8.18866943866944, + "grad_norm": 6.722882972098887e-05, + "learning_rate": 4.835532923403441e-06, + "loss": 0.0006, + "num_input_tokens_seen": 2999136, + "step": 15755 + }, + { + "epoch": 8.191268191268192, + "grad_norm": 3.975689469370991e-05, + "learning_rate": 4.822135416267223e-06, + "loss": 0.0, + "num_input_tokens_seen": 3000064, + "step": 15760 + }, + { + "epoch": 8.193866943866944, + "grad_norm": 7.056103640934452e-05, + "learning_rate": 4.80875451340049e-06, + "loss": 0.0, + "num_input_tokens_seen": 3001056, + "step": 15765 + }, + { + "epoch": 8.196465696465696, + "grad_norm": 0.0005975202657282352, + "learning_rate": 4.795390225814308e-06, + "loss": 0.0, + "num_input_tokens_seen": 3001952, + "step": 15770 + }, + { + "epoch": 8.19906444906445, + "grad_norm": 0.00041255756514146924, + "learning_rate": 4.782042564506109e-06, + "loss": 0.0, + "num_input_tokens_seen": 3002880, + "step": 15775 + }, + { + "epoch": 8.201663201663202, + "grad_norm": 0.0009372268687002361, + "learning_rate": 4.768711540459591e-06, + "loss": 0.0, + "num_input_tokens_seen": 3003776, + "step": 15780 + }, + { + "epoch": 8.204261954261954, + "grad_norm": 0.00031196876079775393, + "learning_rate": 4.755397164644812e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3004736, + "step": 15785 + }, + { + "epoch": 8.206860706860708, + "grad_norm": 0.00017316744197160006, + "learning_rate": 4.742099448018097e-06, + "loss": 0.0, + "num_input_tokens_seen": 3005696, + "step": 15790 + }, + { + "epoch": 8.20945945945946, + "grad_norm": 3.961609399993904e-05, + "learning_rate": 4.728818401522084e-06, + "loss": 0.0007, + "num_input_tokens_seen": 3006688, + "step": 15795 + }, + { + "epoch": 8.212058212058212, + "grad_norm": 0.00026058542425744236, + "learning_rate": 4.715554036085673e-06, + "loss": 0.0, + "num_input_tokens_seen": 3007744, + "step": 15800 + }, + { + "epoch": 8.214656964656964, + "grad_norm": 0.08207233250141144, + "learning_rate": 4.702306362624062e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3008704, + "step": 15805 + }, + { + "epoch": 8.217255717255718, + "grad_norm": 0.0010104168904945254, + "learning_rate": 4.6890753920386885e-06, + "loss": 0.0, + "num_input_tokens_seen": 3009664, + "step": 15810 + }, + { + "epoch": 8.21985446985447, + "grad_norm": 3.4285611036466435e-05, + "learning_rate": 4.675861135217252e-06, + "loss": 0.0, + "num_input_tokens_seen": 3010624, + "step": 15815 + }, + { + "epoch": 8.222453222453222, + "grad_norm": 5.231048635323532e-05, + "learning_rate": 4.662663603033715e-06, + "loss": 0.0, + "num_input_tokens_seen": 3011552, + "step": 15820 + }, + { + "epoch": 8.225051975051976, + "grad_norm": 0.00012192578287795186, + "learning_rate": 4.649482806348249e-06, + "loss": 0.0, + "num_input_tokens_seen": 3012480, + "step": 15825 + }, + { + "epoch": 8.227650727650728, + "grad_norm": 0.0005388545687310398, + "learning_rate": 4.636318756007285e-06, + "loss": 0.0, + "num_input_tokens_seen": 3013440, + "step": 15830 + }, + { + "epoch": 8.23024948024948, + "grad_norm": 0.0001576723443577066, + "learning_rate": 4.6231714628434425e-06, + "loss": 0.0, + "num_input_tokens_seen": 3014400, + "step": 15835 + }, + { + "epoch": 8.232848232848234, + "grad_norm": 0.00011758463369915262, + "learning_rate": 4.610040937675583e-06, + "loss": 0.0, + "num_input_tokens_seen": 3015296, + "step": 15840 + }, + { + "epoch": 8.235446985446986, + "grad_norm": 7.074536551954225e-05, + "learning_rate": 4.596927191308744e-06, + "loss": 0.0, + "num_input_tokens_seen": 3016256, + "step": 15845 + }, + { + "epoch": 8.238045738045738, + "grad_norm": 4.471445572562516e-05, + "learning_rate": 4.583830234534161e-06, + "loss": 0.0, + "num_input_tokens_seen": 3017152, + "step": 15850 + }, + { + "epoch": 8.24064449064449, + "grad_norm": 3.981781264883466e-05, + "learning_rate": 4.5707500781292715e-06, + "loss": 0.0, + "num_input_tokens_seen": 3018080, + "step": 15855 + }, + { + "epoch": 8.243243243243244, + "grad_norm": 0.0016171082388609648, + "learning_rate": 4.557686732857661e-06, + "loss": 0.0, + "num_input_tokens_seen": 3019008, + "step": 15860 + }, + { + "epoch": 8.245841995841996, + "grad_norm": 0.0005471879849210382, + "learning_rate": 4.544640209469103e-06, + "loss": 0.0, + "num_input_tokens_seen": 3020064, + "step": 15865 + }, + { + "epoch": 8.248440748440748, + "grad_norm": 6.401548307621852e-05, + "learning_rate": 4.531610518699514e-06, + "loss": 0.0, + "num_input_tokens_seen": 3020992, + "step": 15870 + }, + { + "epoch": 8.2510395010395, + "grad_norm": 0.0008428345317952335, + "learning_rate": 4.51859767127098e-06, + "loss": 0.0, + "num_input_tokens_seen": 3021888, + "step": 15875 + }, + { + "epoch": 8.253638253638254, + "grad_norm": 0.00023259202134795487, + "learning_rate": 4.505601677891688e-06, + "loss": 0.0, + "num_input_tokens_seen": 3022848, + "step": 15880 + }, + { + "epoch": 8.256237006237006, + "grad_norm": 0.0002204023185186088, + "learning_rate": 4.492622549255992e-06, + "loss": 0.0, + "num_input_tokens_seen": 3023840, + "step": 15885 + }, + { + "epoch": 8.258835758835758, + "grad_norm": 4.7910059947753325e-05, + "learning_rate": 4.4796602960443604e-06, + "loss": 0.0, + "num_input_tokens_seen": 3024768, + "step": 15890 + }, + { + "epoch": 8.261434511434512, + "grad_norm": 0.00018887565238401294, + "learning_rate": 4.46671492892336e-06, + "loss": 0.0, + "num_input_tokens_seen": 3025792, + "step": 15895 + }, + { + "epoch": 8.264033264033264, + "grad_norm": 4.202386116958223e-05, + "learning_rate": 4.4537864585456834e-06, + "loss": 0.0, + "num_input_tokens_seen": 3026688, + "step": 15900 + }, + { + "epoch": 8.266632016632016, + "grad_norm": 8.303323556901887e-05, + "learning_rate": 4.4408748955501015e-06, + "loss": 0.0, + "num_input_tokens_seen": 3027680, + "step": 15905 + }, + { + "epoch": 8.26923076923077, + "grad_norm": 0.00022811921371612698, + "learning_rate": 4.427980250561478e-06, + "loss": 0.0, + "num_input_tokens_seen": 3028608, + "step": 15910 + }, + { + "epoch": 8.271829521829522, + "grad_norm": 0.00018638237088453025, + "learning_rate": 4.415102534190749e-06, + "loss": 0.0, + "num_input_tokens_seen": 3029632, + "step": 15915 + }, + { + "epoch": 8.274428274428274, + "grad_norm": 4.795039785676636e-05, + "learning_rate": 4.4022417570349415e-06, + "loss": 0.0, + "num_input_tokens_seen": 3030528, + "step": 15920 + }, + { + "epoch": 8.277027027027026, + "grad_norm": 4.310990698286332e-05, + "learning_rate": 4.389397929677113e-06, + "loss": 0.0, + "num_input_tokens_seen": 3031488, + "step": 15925 + }, + { + "epoch": 8.27962577962578, + "grad_norm": 5.5399726988980547e-05, + "learning_rate": 4.376571062686405e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3032480, + "step": 15930 + }, + { + "epoch": 8.282224532224532, + "grad_norm": 0.0007272917428053916, + "learning_rate": 4.3637611666179686e-06, + "loss": 0.0032, + "num_input_tokens_seen": 3033376, + "step": 15935 + }, + { + "epoch": 8.284823284823284, + "grad_norm": 0.001628470839932561, + "learning_rate": 4.350968252013021e-06, + "loss": 0.0, + "num_input_tokens_seen": 3034304, + "step": 15940 + }, + { + "epoch": 8.287422037422038, + "grad_norm": 0.00033262078068219125, + "learning_rate": 4.3381923293987855e-06, + "loss": 0.0, + "num_input_tokens_seen": 3035296, + "step": 15945 + }, + { + "epoch": 8.29002079002079, + "grad_norm": 5.575501441955566, + "learning_rate": 4.325433409288498e-06, + "loss": 0.0731, + "num_input_tokens_seen": 3036224, + "step": 15950 + }, + { + "epoch": 8.292619542619542, + "grad_norm": 8.71092634042725e-05, + "learning_rate": 4.3126915021814346e-06, + "loss": 0.0, + "num_input_tokens_seen": 3037152, + "step": 15955 + }, + { + "epoch": 8.295218295218294, + "grad_norm": 3.2440777431475e-05, + "learning_rate": 4.2999666185628315e-06, + "loss": 0.0, + "num_input_tokens_seen": 3038112, + "step": 15960 + }, + { + "epoch": 8.297817047817048, + "grad_norm": 0.00010738080163719133, + "learning_rate": 4.2872587689039484e-06, + "loss": 0.0, + "num_input_tokens_seen": 3039040, + "step": 15965 + }, + { + "epoch": 8.3004158004158, + "grad_norm": 0.020596198737621307, + "learning_rate": 4.27456796366201e-06, + "loss": 0.0, + "num_input_tokens_seen": 3040000, + "step": 15970 + }, + { + "epoch": 8.303014553014552, + "grad_norm": 3.862527591991238e-05, + "learning_rate": 4.261894213280215e-06, + "loss": 0.0, + "num_input_tokens_seen": 3040992, + "step": 15975 + }, + { + "epoch": 8.305613305613306, + "grad_norm": 5.9387122746557e-05, + "learning_rate": 4.249237528187741e-06, + "loss": 0.0013, + "num_input_tokens_seen": 3041952, + "step": 15980 + }, + { + "epoch": 8.308212058212058, + "grad_norm": 0.00022498662292491645, + "learning_rate": 4.236597918799709e-06, + "loss": 0.0, + "num_input_tokens_seen": 3042944, + "step": 15985 + }, + { + "epoch": 8.31081081081081, + "grad_norm": 0.08131267875432968, + "learning_rate": 4.223975395517199e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3043936, + "step": 15990 + }, + { + "epoch": 8.313409563409563, + "grad_norm": 4.289581920602359e-05, + "learning_rate": 4.211369968727216e-06, + "loss": 0.0, + "num_input_tokens_seen": 3044928, + "step": 15995 + }, + { + "epoch": 8.316008316008316, + "grad_norm": 0.00024016483803279698, + "learning_rate": 4.1987816488027186e-06, + "loss": 0.001, + "num_input_tokens_seen": 3046016, + "step": 16000 + }, + { + "epoch": 8.318607068607069, + "grad_norm": 0.00020766269881278276, + "learning_rate": 4.1862104461025704e-06, + "loss": 0.0, + "num_input_tokens_seen": 3047008, + "step": 16005 + }, + { + "epoch": 8.32120582120582, + "grad_norm": 7.351284875767305e-05, + "learning_rate": 4.173656370971549e-06, + "loss": 0.0, + "num_input_tokens_seen": 3047936, + "step": 16010 + }, + { + "epoch": 8.323804573804575, + "grad_norm": 0.00013737495464738458, + "learning_rate": 4.161119433740351e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3048960, + "step": 16015 + }, + { + "epoch": 8.326403326403327, + "grad_norm": 0.0006470976513810456, + "learning_rate": 4.1485996447255595e-06, + "loss": 0.0, + "num_input_tokens_seen": 3049856, + "step": 16020 + }, + { + "epoch": 8.329002079002079, + "grad_norm": 3.975414074375294e-05, + "learning_rate": 4.136097014229653e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3050848, + "step": 16025 + }, + { + "epoch": 8.33160083160083, + "grad_norm": 0.00012332494952715933, + "learning_rate": 4.1236115525409815e-06, + "loss": 0.0, + "num_input_tokens_seen": 3051808, + "step": 16030 + }, + { + "epoch": 8.334199584199585, + "grad_norm": 0.00023907054855953902, + "learning_rate": 4.111143269933787e-06, + "loss": 0.0, + "num_input_tokens_seen": 3052800, + "step": 16035 + }, + { + "epoch": 8.336798336798337, + "grad_norm": 0.0006120908074080944, + "learning_rate": 4.098692176668137e-06, + "loss": 0.0, + "num_input_tokens_seen": 3053760, + "step": 16040 + }, + { + "epoch": 8.339397089397089, + "grad_norm": 0.0003075788263231516, + "learning_rate": 4.086258282989996e-06, + "loss": 0.0, + "num_input_tokens_seen": 3054752, + "step": 16045 + }, + { + "epoch": 8.341995841995843, + "grad_norm": 9.096861322177574e-05, + "learning_rate": 4.073841599131145e-06, + "loss": 0.0, + "num_input_tokens_seen": 3055712, + "step": 16050 + }, + { + "epoch": 8.344594594594595, + "grad_norm": 0.00032625667518004775, + "learning_rate": 4.061442135309224e-06, + "loss": 0.0, + "num_input_tokens_seen": 3056704, + "step": 16055 + }, + { + "epoch": 8.347193347193347, + "grad_norm": 0.00048531321226619184, + "learning_rate": 4.049059901727681e-06, + "loss": 0.0, + "num_input_tokens_seen": 3057696, + "step": 16060 + }, + { + "epoch": 8.3497920997921, + "grad_norm": 7.353661203524098e-05, + "learning_rate": 4.036694908575808e-06, + "loss": 0.0, + "num_input_tokens_seen": 3058592, + "step": 16065 + }, + { + "epoch": 8.352390852390853, + "grad_norm": 3.500135426293127e-05, + "learning_rate": 4.024347166028708e-06, + "loss": 0.0, + "num_input_tokens_seen": 3059584, + "step": 16070 + }, + { + "epoch": 8.354989604989605, + "grad_norm": 0.0010161487152799964, + "learning_rate": 4.012016684247258e-06, + "loss": 0.0, + "num_input_tokens_seen": 3060480, + "step": 16075 + }, + { + "epoch": 8.357588357588357, + "grad_norm": 8.040724060265347e-05, + "learning_rate": 3.999703473378169e-06, + "loss": 0.0, + "num_input_tokens_seen": 3061376, + "step": 16080 + }, + { + "epoch": 8.36018711018711, + "grad_norm": 0.0008822848321869969, + "learning_rate": 3.987407543553911e-06, + "loss": 0.0, + "num_input_tokens_seen": 3062336, + "step": 16085 + }, + { + "epoch": 8.362785862785863, + "grad_norm": 0.0002380140358582139, + "learning_rate": 3.9751289048927635e-06, + "loss": 0.0, + "num_input_tokens_seen": 3063296, + "step": 16090 + }, + { + "epoch": 8.365384615384615, + "grad_norm": 0.00019538136257324368, + "learning_rate": 3.962867567498746e-06, + "loss": 0.0, + "num_input_tokens_seen": 3064224, + "step": 16095 + }, + { + "epoch": 8.367983367983369, + "grad_norm": 0.00015735914348624647, + "learning_rate": 3.950623541461665e-06, + "loss": 0.0, + "num_input_tokens_seen": 3065216, + "step": 16100 + }, + { + "epoch": 8.370582120582121, + "grad_norm": 0.00011280074249953032, + "learning_rate": 3.938396836857067e-06, + "loss": 0.0, + "num_input_tokens_seen": 3066176, + "step": 16105 + }, + { + "epoch": 8.373180873180873, + "grad_norm": 0.00032881717197597027, + "learning_rate": 3.926187463746242e-06, + "loss": 0.0, + "num_input_tokens_seen": 3067104, + "step": 16110 + }, + { + "epoch": 8.375779625779625, + "grad_norm": 0.0003850675711873919, + "learning_rate": 3.913995432176243e-06, + "loss": 0.0, + "num_input_tokens_seen": 3068096, + "step": 16115 + }, + { + "epoch": 8.378378378378379, + "grad_norm": 0.005714063998311758, + "learning_rate": 3.901820752179816e-06, + "loss": 0.0, + "num_input_tokens_seen": 3069088, + "step": 16120 + }, + { + "epoch": 8.380977130977131, + "grad_norm": 0.0003433031961321831, + "learning_rate": 3.889663433775465e-06, + "loss": 0.0, + "num_input_tokens_seen": 3070016, + "step": 16125 + }, + { + "epoch": 8.383575883575883, + "grad_norm": 4.7410423576366156e-05, + "learning_rate": 3.877523486967377e-06, + "loss": 0.0, + "num_input_tokens_seen": 3070976, + "step": 16130 + }, + { + "epoch": 8.386174636174637, + "grad_norm": 0.0005934255896136165, + "learning_rate": 3.865400921745466e-06, + "loss": 0.0, + "num_input_tokens_seen": 3071840, + "step": 16135 + }, + { + "epoch": 8.388773388773389, + "grad_norm": 0.0001248644693987444, + "learning_rate": 3.853295748085331e-06, + "loss": 0.0, + "num_input_tokens_seen": 3072800, + "step": 16140 + }, + { + "epoch": 8.391372141372141, + "grad_norm": 5.88031361985486e-05, + "learning_rate": 3.841207975948255e-06, + "loss": 0.0, + "num_input_tokens_seen": 3073696, + "step": 16145 + }, + { + "epoch": 8.393970893970893, + "grad_norm": 7.016938616288826e-05, + "learning_rate": 3.829137615281217e-06, + "loss": 0.0, + "num_input_tokens_seen": 3074688, + "step": 16150 + }, + { + "epoch": 8.396569646569647, + "grad_norm": 8.849058212945238e-05, + "learning_rate": 3.817084676016855e-06, + "loss": 0.0, + "num_input_tokens_seen": 3075648, + "step": 16155 + }, + { + "epoch": 8.3991683991684, + "grad_norm": 3.7626792618539184e-05, + "learning_rate": 3.8050491680734823e-06, + "loss": 0.0, + "num_input_tokens_seen": 3076608, + "step": 16160 + }, + { + "epoch": 8.401767151767151, + "grad_norm": 0.0001598302333150059, + "learning_rate": 3.793031101355057e-06, + "loss": 0.0, + "num_input_tokens_seen": 3077568, + "step": 16165 + }, + { + "epoch": 8.404365904365905, + "grad_norm": 3.08750313706696e-05, + "learning_rate": 3.7810304857511914e-06, + "loss": 0.0, + "num_input_tokens_seen": 3078528, + "step": 16170 + }, + { + "epoch": 8.406964656964657, + "grad_norm": 0.0010679519036784768, + "learning_rate": 3.7690473311371267e-06, + "loss": 0.0, + "num_input_tokens_seen": 3079456, + "step": 16175 + }, + { + "epoch": 8.40956340956341, + "grad_norm": 3.885081969201565e-05, + "learning_rate": 3.7570816473737584e-06, + "loss": 0.0, + "num_input_tokens_seen": 3080384, + "step": 16180 + }, + { + "epoch": 8.412162162162161, + "grad_norm": 4.684193118009716e-05, + "learning_rate": 3.7451334443075747e-06, + "loss": 0.0, + "num_input_tokens_seen": 3081344, + "step": 16185 + }, + { + "epoch": 8.414760914760915, + "grad_norm": 0.0023671360686421394, + "learning_rate": 3.7332027317707076e-06, + "loss": 0.0, + "num_input_tokens_seen": 3082272, + "step": 16190 + }, + { + "epoch": 8.417359667359667, + "grad_norm": 4.5838707592338324e-05, + "learning_rate": 3.7212895195808868e-06, + "loss": 0.0, + "num_input_tokens_seen": 3083136, + "step": 16195 + }, + { + "epoch": 8.41995841995842, + "grad_norm": 0.0005545251769945025, + "learning_rate": 3.7093938175414344e-06, + "loss": 0.0, + "num_input_tokens_seen": 3083968, + "step": 16200 + }, + { + "epoch": 8.422557172557173, + "grad_norm": 7.373019616352394e-05, + "learning_rate": 3.697515635441262e-06, + "loss": 0.0169, + "num_input_tokens_seen": 3084928, + "step": 16205 + }, + { + "epoch": 8.425155925155925, + "grad_norm": 8.151185465976596e-05, + "learning_rate": 3.6856549830548704e-06, + "loss": 0.0, + "num_input_tokens_seen": 3085920, + "step": 16210 + }, + { + "epoch": 8.427754677754677, + "grad_norm": 0.00016215824871324003, + "learning_rate": 3.6738118701423434e-06, + "loss": 0.0, + "num_input_tokens_seen": 3086912, + "step": 16215 + }, + { + "epoch": 8.43035343035343, + "grad_norm": 0.00012360629625618458, + "learning_rate": 3.661986306449311e-06, + "loss": 0.0, + "num_input_tokens_seen": 3087904, + "step": 16220 + }, + { + "epoch": 8.432952182952183, + "grad_norm": 3.2007344998419285e-05, + "learning_rate": 3.650178301706983e-06, + "loss": 0.0, + "num_input_tokens_seen": 3088896, + "step": 16225 + }, + { + "epoch": 8.435550935550935, + "grad_norm": 3.0717790650669485e-05, + "learning_rate": 3.638387865632109e-06, + "loss": 0.0, + "num_input_tokens_seen": 3089792, + "step": 16230 + }, + { + "epoch": 8.438149688149688, + "grad_norm": 3.804518928518519e-05, + "learning_rate": 3.6266150079269755e-06, + "loss": 0.0448, + "num_input_tokens_seen": 3090752, + "step": 16235 + }, + { + "epoch": 8.440748440748441, + "grad_norm": 0.0002470891340635717, + "learning_rate": 3.614859738279422e-06, + "loss": 0.0, + "num_input_tokens_seen": 3091648, + "step": 16240 + }, + { + "epoch": 8.443347193347194, + "grad_norm": 0.0001489411515649408, + "learning_rate": 3.603122066362796e-06, + "loss": 0.0, + "num_input_tokens_seen": 3092576, + "step": 16245 + }, + { + "epoch": 8.445945945945946, + "grad_norm": 0.0001655630039749667, + "learning_rate": 3.5914020018359804e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3093504, + "step": 16250 + }, + { + "epoch": 8.448544698544698, + "grad_norm": 8.617185085313395e-05, + "learning_rate": 3.579699554343352e-06, + "loss": 0.0, + "num_input_tokens_seen": 3094496, + "step": 16255 + }, + { + "epoch": 8.451143451143452, + "grad_norm": 3.281749741290696e-05, + "learning_rate": 3.56801473351481e-06, + "loss": 0.0, + "num_input_tokens_seen": 3095520, + "step": 16260 + }, + { + "epoch": 8.453742203742204, + "grad_norm": 0.00010631715122144669, + "learning_rate": 3.5563475489657326e-06, + "loss": 0.0, + "num_input_tokens_seen": 3096480, + "step": 16265 + }, + { + "epoch": 8.456340956340956, + "grad_norm": 5.7944147556554526e-05, + "learning_rate": 3.544698010296982e-06, + "loss": 0.0, + "num_input_tokens_seen": 3097440, + "step": 16270 + }, + { + "epoch": 8.45893970893971, + "grad_norm": 0.0001959526853170246, + "learning_rate": 3.533066127094925e-06, + "loss": 0.0, + "num_input_tokens_seen": 3098336, + "step": 16275 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 0.0008968243491835892, + "learning_rate": 3.5214519089313726e-06, + "loss": 0.0, + "num_input_tokens_seen": 3099360, + "step": 16280 + }, + { + "epoch": 8.464137214137214, + "grad_norm": 1.0246353149414062, + "learning_rate": 3.509855365363615e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3100320, + "step": 16285 + }, + { + "epoch": 8.466735966735968, + "grad_norm": 0.02526622824370861, + "learning_rate": 3.4982765059343864e-06, + "loss": 0.0, + "num_input_tokens_seen": 3101312, + "step": 16290 + }, + { + "epoch": 8.46933471933472, + "grad_norm": 3.732120603672229e-05, + "learning_rate": 3.4867153401718865e-06, + "loss": 0.0, + "num_input_tokens_seen": 3102336, + "step": 16295 + }, + { + "epoch": 8.471933471933472, + "grad_norm": 0.0008424412226304412, + "learning_rate": 3.4751718775897392e-06, + "loss": 0.0078, + "num_input_tokens_seen": 3103360, + "step": 16300 + }, + { + "epoch": 8.474532224532224, + "grad_norm": 3.617312540882267e-05, + "learning_rate": 3.4636461276870038e-06, + "loss": 0.0, + "num_input_tokens_seen": 3104288, + "step": 16305 + }, + { + "epoch": 8.477130977130978, + "grad_norm": 0.00022698492102790624, + "learning_rate": 3.45213809994816e-06, + "loss": 0.0, + "num_input_tokens_seen": 3105216, + "step": 16310 + }, + { + "epoch": 8.47972972972973, + "grad_norm": 0.0008280329639092088, + "learning_rate": 3.4406478038431137e-06, + "loss": 0.0, + "num_input_tokens_seen": 3106176, + "step": 16315 + }, + { + "epoch": 8.482328482328482, + "grad_norm": 4.246387106832117e-05, + "learning_rate": 3.429175248827182e-06, + "loss": 0.0, + "num_input_tokens_seen": 3107104, + "step": 16320 + }, + { + "epoch": 8.484927234927236, + "grad_norm": 0.005131552927196026, + "learning_rate": 3.4177204443410688e-06, + "loss": 0.0, + "num_input_tokens_seen": 3108000, + "step": 16325 + }, + { + "epoch": 8.487525987525988, + "grad_norm": 0.00014456463395617902, + "learning_rate": 3.406283399810889e-06, + "loss": 0.2125, + "num_input_tokens_seen": 3109056, + "step": 16330 + }, + { + "epoch": 8.49012474012474, + "grad_norm": 7.276824908331037e-05, + "learning_rate": 3.3948641246481142e-06, + "loss": 0.0016, + "num_input_tokens_seen": 3110048, + "step": 16335 + }, + { + "epoch": 8.492723492723492, + "grad_norm": 0.00048568379133939743, + "learning_rate": 3.3834626282496285e-06, + "loss": 0.0, + "num_input_tokens_seen": 3111008, + "step": 16340 + }, + { + "epoch": 8.495322245322246, + "grad_norm": 3.560605910024606e-05, + "learning_rate": 3.3720789199976567e-06, + "loss": 0.0, + "num_input_tokens_seen": 3111968, + "step": 16345 + }, + { + "epoch": 8.497920997920998, + "grad_norm": 0.0003406826581340283, + "learning_rate": 3.360713009259811e-06, + "loss": 0.0, + "num_input_tokens_seen": 3112928, + "step": 16350 + }, + { + "epoch": 8.5, + "eval_loss": 0.4085700511932373, + "eval_runtime": 9.2224, + "eval_samples_per_second": 92.818, + "eval_steps_per_second": 23.204, + "num_input_tokens_seen": 3113696, + "step": 16354 + }, + { + "epoch": 8.50051975051975, + "grad_norm": 0.0073233977891504765, + "learning_rate": 3.3493649053890326e-06, + "loss": 0.0, + "num_input_tokens_seen": 3113888, + "step": 16355 + }, + { + "epoch": 8.503118503118504, + "grad_norm": 0.0009472208912484348, + "learning_rate": 3.338034617723637e-06, + "loss": 0.0, + "num_input_tokens_seen": 3114848, + "step": 16360 + }, + { + "epoch": 8.505717255717256, + "grad_norm": 0.0008298636530525982, + "learning_rate": 3.3267221555872584e-06, + "loss": 0.0, + "num_input_tokens_seen": 3115872, + "step": 16365 + }, + { + "epoch": 8.508316008316008, + "grad_norm": 0.000916033168323338, + "learning_rate": 3.3154275282888585e-06, + "loss": 0.0, + "num_input_tokens_seen": 3116800, + "step": 16370 + }, + { + "epoch": 8.51091476091476, + "grad_norm": 8.564610470784828e-05, + "learning_rate": 3.30415074512275e-06, + "loss": 0.0011, + "num_input_tokens_seen": 3117792, + "step": 16375 + }, + { + "epoch": 8.513513513513514, + "grad_norm": 0.003057954367250204, + "learning_rate": 3.292891815368526e-06, + "loss": 0.0, + "num_input_tokens_seen": 3118720, + "step": 16380 + }, + { + "epoch": 8.516112266112266, + "grad_norm": 0.00020815804600715637, + "learning_rate": 3.2816507482911264e-06, + "loss": 0.0, + "num_input_tokens_seen": 3119712, + "step": 16385 + }, + { + "epoch": 8.518711018711018, + "grad_norm": 0.00024016336828935891, + "learning_rate": 3.2704275531407565e-06, + "loss": 0.0, + "num_input_tokens_seen": 3120640, + "step": 16390 + }, + { + "epoch": 8.521309771309772, + "grad_norm": 6.90938177285716e-05, + "learning_rate": 3.25922223915294e-06, + "loss": 0.0078, + "num_input_tokens_seen": 3121536, + "step": 16395 + }, + { + "epoch": 8.523908523908524, + "grad_norm": 0.00035677512641996145, + "learning_rate": 3.248034815548473e-06, + "loss": 0.0, + "num_input_tokens_seen": 3122528, + "step": 16400 + }, + { + "epoch": 8.526507276507276, + "grad_norm": 0.004455132409930229, + "learning_rate": 3.2368652915334307e-06, + "loss": 0.0, + "num_input_tokens_seen": 3123392, + "step": 16405 + }, + { + "epoch": 8.529106029106028, + "grad_norm": 0.0003523945633787662, + "learning_rate": 3.225713676299169e-06, + "loss": 0.0, + "num_input_tokens_seen": 3124320, + "step": 16410 + }, + { + "epoch": 8.531704781704782, + "grad_norm": 0.94231116771698, + "learning_rate": 3.2145799790222893e-06, + "loss": 0.0302, + "num_input_tokens_seen": 3125184, + "step": 16415 + }, + { + "epoch": 8.534303534303534, + "grad_norm": 0.001191868563182652, + "learning_rate": 3.2034642088646704e-06, + "loss": 0.0, + "num_input_tokens_seen": 3126112, + "step": 16420 + }, + { + "epoch": 8.536902286902286, + "grad_norm": 0.00011216376879019663, + "learning_rate": 3.1923663749734182e-06, + "loss": 0.0, + "num_input_tokens_seen": 3127072, + "step": 16425 + }, + { + "epoch": 8.53950103950104, + "grad_norm": 2.8884231142001227e-05, + "learning_rate": 3.1812864864808973e-06, + "loss": 0.0, + "num_input_tokens_seen": 3128000, + "step": 16430 + }, + { + "epoch": 8.542099792099792, + "grad_norm": 0.007886259816586971, + "learning_rate": 3.1702245525046803e-06, + "loss": 0.0, + "num_input_tokens_seen": 3128960, + "step": 16435 + }, + { + "epoch": 8.544698544698544, + "grad_norm": 3.0341800083988346e-05, + "learning_rate": 3.159180582147589e-06, + "loss": 0.0, + "num_input_tokens_seen": 3129856, + "step": 16440 + }, + { + "epoch": 8.547297297297296, + "grad_norm": 0.00022900283511262387, + "learning_rate": 3.1481545844976617e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3130784, + "step": 16445 + }, + { + "epoch": 8.54989604989605, + "grad_norm": 0.0002388462598901242, + "learning_rate": 3.137146568628127e-06, + "loss": 0.0, + "num_input_tokens_seen": 3131776, + "step": 16450 + }, + { + "epoch": 8.552494802494802, + "grad_norm": 0.0006347513990476727, + "learning_rate": 3.126156543597439e-06, + "loss": 0.0, + "num_input_tokens_seen": 3132768, + "step": 16455 + }, + { + "epoch": 8.555093555093555, + "grad_norm": 0.0005461592227220535, + "learning_rate": 3.115184518449232e-06, + "loss": 0.0, + "num_input_tokens_seen": 3133728, + "step": 16460 + }, + { + "epoch": 8.557692307692308, + "grad_norm": 0.00013779083383269608, + "learning_rate": 3.104230502212338e-06, + "loss": 0.0, + "num_input_tokens_seen": 3134656, + "step": 16465 + }, + { + "epoch": 8.56029106029106, + "grad_norm": 0.00040023535257205367, + "learning_rate": 3.0932945039007536e-06, + "loss": 0.0, + "num_input_tokens_seen": 3135584, + "step": 16470 + }, + { + "epoch": 8.562889812889813, + "grad_norm": 0.00037170431460253894, + "learning_rate": 3.0823765325136754e-06, + "loss": 0.0, + "num_input_tokens_seen": 3136544, + "step": 16475 + }, + { + "epoch": 8.565488565488565, + "grad_norm": 0.0012272924650460482, + "learning_rate": 3.0714765970354414e-06, + "loss": 0.0, + "num_input_tokens_seen": 3137504, + "step": 16480 + }, + { + "epoch": 8.568087318087318, + "grad_norm": 0.0001975301856873557, + "learning_rate": 3.06059470643556e-06, + "loss": 0.0, + "num_input_tokens_seen": 3138400, + "step": 16485 + }, + { + "epoch": 8.57068607068607, + "grad_norm": 0.005001799203455448, + "learning_rate": 3.049730869668685e-06, + "loss": 0.0, + "num_input_tokens_seen": 3139360, + "step": 16490 + }, + { + "epoch": 8.573284823284823, + "grad_norm": 0.00013524909445550293, + "learning_rate": 3.0388850956746233e-06, + "loss": 0.0, + "num_input_tokens_seen": 3140288, + "step": 16495 + }, + { + "epoch": 8.575883575883577, + "grad_norm": 0.00024992713588289917, + "learning_rate": 3.028057393378306e-06, + "loss": 0.0, + "num_input_tokens_seen": 3141216, + "step": 16500 + }, + { + "epoch": 8.578482328482329, + "grad_norm": 0.0004071888397447765, + "learning_rate": 3.0172477716897934e-06, + "loss": 0.0, + "num_input_tokens_seen": 3142208, + "step": 16505 + }, + { + "epoch": 8.58108108108108, + "grad_norm": 0.001040896400809288, + "learning_rate": 3.0064562395042824e-06, + "loss": 0.0, + "num_input_tokens_seen": 3143232, + "step": 16510 + }, + { + "epoch": 8.583679833679835, + "grad_norm": 2.9550255931098945e-05, + "learning_rate": 2.995682805702063e-06, + "loss": 0.0, + "num_input_tokens_seen": 3144256, + "step": 16515 + }, + { + "epoch": 8.586278586278587, + "grad_norm": 0.00022957193141337484, + "learning_rate": 2.9849274791485554e-06, + "loss": 0.0, + "num_input_tokens_seen": 3145184, + "step": 16520 + }, + { + "epoch": 8.588877338877339, + "grad_norm": 0.0021975203417241573, + "learning_rate": 2.9741902686942575e-06, + "loss": 0.0, + "num_input_tokens_seen": 3146080, + "step": 16525 + }, + { + "epoch": 8.59147609147609, + "grad_norm": 0.00011215035192435607, + "learning_rate": 2.963471183174768e-06, + "loss": 0.0, + "num_input_tokens_seen": 3147008, + "step": 16530 + }, + { + "epoch": 8.594074844074845, + "grad_norm": 0.0004126711282879114, + "learning_rate": 2.9527702314107814e-06, + "loss": 0.0, + "num_input_tokens_seen": 3147872, + "step": 16535 + }, + { + "epoch": 8.596673596673597, + "grad_norm": 0.0032425031531602144, + "learning_rate": 2.942087422208051e-06, + "loss": 0.0, + "num_input_tokens_seen": 3148832, + "step": 16540 + }, + { + "epoch": 8.599272349272349, + "grad_norm": 8.604989852756262e-05, + "learning_rate": 2.9314227643574166e-06, + "loss": 0.0, + "num_input_tokens_seen": 3149792, + "step": 16545 + }, + { + "epoch": 8.601871101871101, + "grad_norm": 0.0071047586388885975, + "learning_rate": 2.920776266634767e-06, + "loss": 0.0, + "num_input_tokens_seen": 3150752, + "step": 16550 + }, + { + "epoch": 8.604469854469855, + "grad_norm": 6.798015965614468e-05, + "learning_rate": 2.9101479378010637e-06, + "loss": 0.0, + "num_input_tokens_seen": 3151712, + "step": 16555 + }, + { + "epoch": 8.607068607068607, + "grad_norm": 0.003138430416584015, + "learning_rate": 2.899537786602305e-06, + "loss": 0.0, + "num_input_tokens_seen": 3152608, + "step": 16560 + }, + { + "epoch": 8.609667359667359, + "grad_norm": 0.0056077237240970135, + "learning_rate": 2.8889458217695287e-06, + "loss": 0.0, + "num_input_tokens_seen": 3153600, + "step": 16565 + }, + { + "epoch": 8.612266112266113, + "grad_norm": 9.328039595857263e-05, + "learning_rate": 2.878372052018824e-06, + "loss": 0.0, + "num_input_tokens_seen": 3154496, + "step": 16570 + }, + { + "epoch": 8.614864864864865, + "grad_norm": 5.265304207568988e-05, + "learning_rate": 2.8678164860512834e-06, + "loss": 0.0, + "num_input_tokens_seen": 3155360, + "step": 16575 + }, + { + "epoch": 8.617463617463617, + "grad_norm": 0.0002838107175193727, + "learning_rate": 2.8572791325530425e-06, + "loss": 0.0, + "num_input_tokens_seen": 3156224, + "step": 16580 + }, + { + "epoch": 8.62006237006237, + "grad_norm": 0.0021419331897050142, + "learning_rate": 2.8467600001952336e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3157216, + "step": 16585 + }, + { + "epoch": 8.622661122661123, + "grad_norm": 0.000107224790554028, + "learning_rate": 2.8362590976340114e-06, + "loss": 0.0226, + "num_input_tokens_seen": 3158112, + "step": 16590 + }, + { + "epoch": 8.625259875259875, + "grad_norm": 0.004558427259325981, + "learning_rate": 2.825776433510499e-06, + "loss": 0.0019, + "num_input_tokens_seen": 3159104, + "step": 16595 + }, + { + "epoch": 8.627858627858627, + "grad_norm": 0.00043075383291579783, + "learning_rate": 2.8153120164508493e-06, + "loss": 0.0, + "num_input_tokens_seen": 3160064, + "step": 16600 + }, + { + "epoch": 8.630457380457381, + "grad_norm": 0.0015695358160883188, + "learning_rate": 2.8048658550661714e-06, + "loss": 0.0, + "num_input_tokens_seen": 3161056, + "step": 16605 + }, + { + "epoch": 8.633056133056133, + "grad_norm": 0.00013780452718492597, + "learning_rate": 2.7944379579525697e-06, + "loss": 0.0006, + "num_input_tokens_seen": 3162048, + "step": 16610 + }, + { + "epoch": 8.635654885654885, + "grad_norm": 0.0030891194473952055, + "learning_rate": 2.784028333691105e-06, + "loss": 0.0, + "num_input_tokens_seen": 3163008, + "step": 16615 + }, + { + "epoch": 8.638253638253639, + "grad_norm": 5.7035471400013193e-05, + "learning_rate": 2.773636990847808e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3164032, + "step": 16620 + }, + { + "epoch": 8.640852390852391, + "grad_norm": 3.1072344427229837e-05, + "learning_rate": 2.763263937973681e-06, + "loss": 0.0, + "num_input_tokens_seen": 3164992, + "step": 16625 + }, + { + "epoch": 8.643451143451143, + "grad_norm": 0.00032469170400872827, + "learning_rate": 2.752909183604635e-06, + "loss": 0.0, + "num_input_tokens_seen": 3165952, + "step": 16630 + }, + { + "epoch": 8.646049896049895, + "grad_norm": 0.001406166353262961, + "learning_rate": 2.74257273626157e-06, + "loss": 0.0313, + "num_input_tokens_seen": 3166880, + "step": 16635 + }, + { + "epoch": 8.64864864864865, + "grad_norm": 3.0538925784640014e-05, + "learning_rate": 2.7322546044502824e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3167904, + "step": 16640 + }, + { + "epoch": 8.651247401247401, + "grad_norm": 0.0002979603596031666, + "learning_rate": 2.72195479666153e-06, + "loss": 0.0, + "num_input_tokens_seen": 3168800, + "step": 16645 + }, + { + "epoch": 8.653846153846153, + "grad_norm": 0.0003025763144250959, + "learning_rate": 2.711673321370961e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3169696, + "step": 16650 + }, + { + "epoch": 8.656444906444907, + "grad_norm": 0.012218689545989037, + "learning_rate": 2.701410187039169e-06, + "loss": 0.0, + "num_input_tokens_seen": 3170688, + "step": 16655 + }, + { + "epoch": 8.65904365904366, + "grad_norm": 3.3884985896293074e-05, + "learning_rate": 2.691165402111628e-06, + "loss": 0.0, + "num_input_tokens_seen": 3171744, + "step": 16660 + }, + { + "epoch": 8.661642411642411, + "grad_norm": 0.0009252094896510243, + "learning_rate": 2.6809389750187208e-06, + "loss": 0.0, + "num_input_tokens_seen": 3172704, + "step": 16665 + }, + { + "epoch": 8.664241164241163, + "grad_norm": 0.00021283452224452049, + "learning_rate": 2.670730914175737e-06, + "loss": 0.0, + "num_input_tokens_seen": 3173664, + "step": 16670 + }, + { + "epoch": 8.666839916839917, + "grad_norm": 0.08577878028154373, + "learning_rate": 2.6605412279828267e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3174656, + "step": 16675 + }, + { + "epoch": 8.66943866943867, + "grad_norm": 0.00013430694525595754, + "learning_rate": 2.6503699248250523e-06, + "loss": 0.0, + "num_input_tokens_seen": 3175616, + "step": 16680 + }, + { + "epoch": 8.672037422037421, + "grad_norm": 3.2471492886543274e-05, + "learning_rate": 2.6402170130723132e-06, + "loss": 0.0, + "num_input_tokens_seen": 3176608, + "step": 16685 + }, + { + "epoch": 8.674636174636175, + "grad_norm": 0.0008410296868532896, + "learning_rate": 2.630082501079409e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3177632, + "step": 16690 + }, + { + "epoch": 8.677234927234927, + "grad_norm": 3.113642742391676e-05, + "learning_rate": 2.619966397185972e-06, + "loss": 0.0024, + "num_input_tokens_seen": 3178592, + "step": 16695 + }, + { + "epoch": 8.67983367983368, + "grad_norm": 2.81989141512895e-05, + "learning_rate": 2.6098687097164955e-06, + "loss": 0.0, + "num_input_tokens_seen": 3179488, + "step": 16700 + }, + { + "epoch": 8.682432432432432, + "grad_norm": 0.00026732883998192847, + "learning_rate": 2.5997894469803247e-06, + "loss": 0.0, + "num_input_tokens_seen": 3180448, + "step": 16705 + }, + { + "epoch": 8.685031185031185, + "grad_norm": 0.0012548834783956409, + "learning_rate": 2.5897286172716307e-06, + "loss": 0.0, + "num_input_tokens_seen": 3181504, + "step": 16710 + }, + { + "epoch": 8.687629937629938, + "grad_norm": 0.00017257113358937204, + "learning_rate": 2.5796862288694324e-06, + "loss": 0.0, + "num_input_tokens_seen": 3182528, + "step": 16715 + }, + { + "epoch": 8.69022869022869, + "grad_norm": 2.729494917730335e-05, + "learning_rate": 2.56966229003755e-06, + "loss": 0.0, + "num_input_tokens_seen": 3183456, + "step": 16720 + }, + { + "epoch": 8.692827442827443, + "grad_norm": 3.1826177291804925e-05, + "learning_rate": 2.5596568090246548e-06, + "loss": 0.0, + "num_input_tokens_seen": 3184448, + "step": 16725 + }, + { + "epoch": 8.695426195426196, + "grad_norm": 2.9466536943800747e-05, + "learning_rate": 2.5496697940641854e-06, + "loss": 0.0, + "num_input_tokens_seen": 3185408, + "step": 16730 + }, + { + "epoch": 8.698024948024948, + "grad_norm": 0.010018882341682911, + "learning_rate": 2.53970125337443e-06, + "loss": 0.004, + "num_input_tokens_seen": 3186336, + "step": 16735 + }, + { + "epoch": 8.700623700623701, + "grad_norm": 0.0007174232741817832, + "learning_rate": 2.5297511951584417e-06, + "loss": 0.0, + "num_input_tokens_seen": 3187296, + "step": 16740 + }, + { + "epoch": 8.703222453222454, + "grad_norm": 0.0012839989503845572, + "learning_rate": 2.5198196276040782e-06, + "loss": 0.0, + "num_input_tokens_seen": 3188224, + "step": 16745 + }, + { + "epoch": 8.705821205821206, + "grad_norm": 4.252362850820646e-05, + "learning_rate": 2.509906558883987e-06, + "loss": 0.0356, + "num_input_tokens_seen": 3189120, + "step": 16750 + }, + { + "epoch": 8.708419958419958, + "grad_norm": 0.0017453586915507913, + "learning_rate": 2.5000119971555826e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3190080, + "step": 16755 + }, + { + "epoch": 8.711018711018712, + "grad_norm": 0.00021903130982536823, + "learning_rate": 2.4901359505610482e-06, + "loss": 0.0, + "num_input_tokens_seen": 3191136, + "step": 16760 + }, + { + "epoch": 8.713617463617464, + "grad_norm": 2.970815420150757, + "learning_rate": 2.480278427227334e-06, + "loss": 0.0655, + "num_input_tokens_seen": 3192064, + "step": 16765 + }, + { + "epoch": 8.716216216216216, + "grad_norm": 0.0003384366864338517, + "learning_rate": 2.4704394352661586e-06, + "loss": 0.0645, + "num_input_tokens_seen": 3193056, + "step": 16770 + }, + { + "epoch": 8.71881496881497, + "grad_norm": 0.002419818192720413, + "learning_rate": 2.460618982773974e-06, + "loss": 0.0, + "num_input_tokens_seen": 3194016, + "step": 16775 + }, + { + "epoch": 8.721413721413722, + "grad_norm": 3.6148841900285333e-05, + "learning_rate": 2.4508170778319904e-06, + "loss": 0.0, + "num_input_tokens_seen": 3194944, + "step": 16780 + }, + { + "epoch": 8.724012474012474, + "grad_norm": 0.0008095129742287099, + "learning_rate": 2.4410337285061424e-06, + "loss": 0.0, + "num_input_tokens_seen": 3195936, + "step": 16785 + }, + { + "epoch": 8.726611226611226, + "grad_norm": 9.94372385321185e-05, + "learning_rate": 2.431268942847112e-06, + "loss": 0.0, + "num_input_tokens_seen": 3196960, + "step": 16790 + }, + { + "epoch": 8.72920997920998, + "grad_norm": 0.00043161623761989176, + "learning_rate": 2.4215227288902883e-06, + "loss": 0.0, + "num_input_tokens_seen": 3197888, + "step": 16795 + }, + { + "epoch": 8.731808731808732, + "grad_norm": 5.0494385504862294e-05, + "learning_rate": 2.4117950946557807e-06, + "loss": 0.0, + "num_input_tokens_seen": 3198784, + "step": 16800 + }, + { + "epoch": 8.734407484407484, + "grad_norm": 0.0004440444754436612, + "learning_rate": 2.402086048148422e-06, + "loss": 0.0, + "num_input_tokens_seen": 3199712, + "step": 16805 + }, + { + "epoch": 8.737006237006238, + "grad_norm": 0.001651895814575255, + "learning_rate": 2.3923955973577327e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3200640, + "step": 16810 + }, + { + "epoch": 8.73960498960499, + "grad_norm": 0.00011207886564079672, + "learning_rate": 2.382723750257948e-06, + "loss": 0.0, + "num_input_tokens_seen": 3201568, + "step": 16815 + }, + { + "epoch": 8.742203742203742, + "grad_norm": 0.0001282671873923391, + "learning_rate": 2.373070514807979e-06, + "loss": 0.0, + "num_input_tokens_seen": 3202560, + "step": 16820 + }, + { + "epoch": 8.744802494802494, + "grad_norm": 0.00016034157306421548, + "learning_rate": 2.3634358989514273e-06, + "loss": 0.0, + "num_input_tokens_seen": 3203488, + "step": 16825 + }, + { + "epoch": 8.747401247401248, + "grad_norm": 0.0037442389875650406, + "learning_rate": 2.3538199106165754e-06, + "loss": 0.0, + "num_input_tokens_seen": 3204480, + "step": 16830 + }, + { + "epoch": 8.75, + "grad_norm": 2.9422842999338172e-05, + "learning_rate": 2.3442225577163717e-06, + "loss": 0.0, + "num_input_tokens_seen": 3205408, + "step": 16835 + }, + { + "epoch": 8.752598752598752, + "grad_norm": 0.005720231682062149, + "learning_rate": 2.3346438481484407e-06, + "loss": 0.1515, + "num_input_tokens_seen": 3206336, + "step": 16840 + }, + { + "epoch": 8.755197505197506, + "grad_norm": 2.7439815312391147e-05, + "learning_rate": 2.3250837897950433e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3207296, + "step": 16845 + }, + { + "epoch": 8.757796257796258, + "grad_norm": 0.0005571010406129062, + "learning_rate": 2.3155423905231207e-06, + "loss": 0.0, + "num_input_tokens_seen": 3208224, + "step": 16850 + }, + { + "epoch": 8.76039501039501, + "grad_norm": 2.906422014348209e-05, + "learning_rate": 2.3060196581842385e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3209184, + "step": 16855 + }, + { + "epoch": 8.762993762993762, + "grad_norm": 0.007639177143573761, + "learning_rate": 2.29651560061461e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3210144, + "step": 16860 + }, + { + "epoch": 8.765592515592516, + "grad_norm": 0.00013955878966953605, + "learning_rate": 2.28703022563507e-06, + "loss": 0.0, + "num_input_tokens_seen": 3211168, + "step": 16865 + }, + { + "epoch": 8.768191268191268, + "grad_norm": 0.0005836781347170472, + "learning_rate": 2.2775635410510975e-06, + "loss": 0.0, + "num_input_tokens_seen": 3212128, + "step": 16870 + }, + { + "epoch": 8.77079002079002, + "grad_norm": 0.00027657111058942974, + "learning_rate": 2.2681155546527886e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3213088, + "step": 16875 + }, + { + "epoch": 8.773388773388774, + "grad_norm": 0.00018671077850740403, + "learning_rate": 2.258686274214833e-06, + "loss": 0.0, + "num_input_tokens_seen": 3214016, + "step": 16880 + }, + { + "epoch": 8.775987525987526, + "grad_norm": 0.0008992504444904625, + "learning_rate": 2.2492757074965594e-06, + "loss": 0.0, + "num_input_tokens_seen": 3215040, + "step": 16885 + }, + { + "epoch": 8.778586278586278, + "grad_norm": 0.0005492364871315658, + "learning_rate": 2.2398838622418568e-06, + "loss": 0.0, + "num_input_tokens_seen": 3216032, + "step": 16890 + }, + { + "epoch": 8.78118503118503, + "grad_norm": 0.00021416883100755513, + "learning_rate": 2.230510746179243e-06, + "loss": 0.0, + "num_input_tokens_seen": 3216992, + "step": 16895 + }, + { + "epoch": 8.783783783783784, + "grad_norm": 0.0007256661774590611, + "learning_rate": 2.2211563670218067e-06, + "loss": 0.0216, + "num_input_tokens_seen": 3217952, + "step": 16900 + }, + { + "epoch": 8.786382536382536, + "grad_norm": 0.002248939825221896, + "learning_rate": 2.2118207324672293e-06, + "loss": 0.0, + "num_input_tokens_seen": 3218912, + "step": 16905 + }, + { + "epoch": 8.788981288981288, + "grad_norm": 0.000755171524360776, + "learning_rate": 2.2025038501977486e-06, + "loss": 0.0, + "num_input_tokens_seen": 3219840, + "step": 16910 + }, + { + "epoch": 8.791580041580042, + "grad_norm": 0.005403957329690456, + "learning_rate": 2.193205727880193e-06, + "loss": 0.0, + "num_input_tokens_seen": 3220800, + "step": 16915 + }, + { + "epoch": 8.794178794178794, + "grad_norm": 0.001124614616855979, + "learning_rate": 2.1839263731659425e-06, + "loss": 0.0, + "num_input_tokens_seen": 3221696, + "step": 16920 + }, + { + "epoch": 8.796777546777546, + "grad_norm": 0.000219119741814211, + "learning_rate": 2.1746657936909278e-06, + "loss": 0.0, + "num_input_tokens_seen": 3222656, + "step": 16925 + }, + { + "epoch": 8.799376299376299, + "grad_norm": 0.007687716744840145, + "learning_rate": 2.165423997075644e-06, + "loss": 0.0, + "num_input_tokens_seen": 3223680, + "step": 16930 + }, + { + "epoch": 8.801975051975052, + "grad_norm": 0.042455509305000305, + "learning_rate": 2.156200990925114e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3224576, + "step": 16935 + }, + { + "epoch": 8.804573804573804, + "grad_norm": 2.6726313080871478e-05, + "learning_rate": 2.146996782828914e-06, + "loss": 0.0, + "num_input_tokens_seen": 3225536, + "step": 16940 + }, + { + "epoch": 8.807172557172557, + "grad_norm": 0.002897098660469055, + "learning_rate": 2.137811380361135e-06, + "loss": 0.0, + "num_input_tokens_seen": 3226496, + "step": 16945 + }, + { + "epoch": 8.80977130977131, + "grad_norm": 0.00027184191276319325, + "learning_rate": 2.1286447910804086e-06, + "loss": 0.0, + "num_input_tokens_seen": 3227424, + "step": 16950 + }, + { + "epoch": 8.812370062370062, + "grad_norm": 0.0005986667820252478, + "learning_rate": 2.1194970225298786e-06, + "loss": 0.0, + "num_input_tokens_seen": 3228384, + "step": 16955 + }, + { + "epoch": 8.814968814968815, + "grad_norm": 0.040380511432886124, + "learning_rate": 2.110368082237188e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3229408, + "step": 16960 + }, + { + "epoch": 8.817567567567568, + "grad_norm": 0.00014604153693653643, + "learning_rate": 2.101257977714516e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3230432, + "step": 16965 + }, + { + "epoch": 8.82016632016632, + "grad_norm": 0.00046548660611733794, + "learning_rate": 2.09216671645851e-06, + "loss": 0.0, + "num_input_tokens_seen": 3231424, + "step": 16970 + }, + { + "epoch": 8.822765072765073, + "grad_norm": 0.0008740842458792031, + "learning_rate": 2.0830943059503367e-06, + "loss": 0.0, + "num_input_tokens_seen": 3232320, + "step": 16975 + }, + { + "epoch": 8.825363825363825, + "grad_norm": 0.0003985570219811052, + "learning_rate": 2.0740407536556318e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3233216, + "step": 16980 + }, + { + "epoch": 8.827962577962579, + "grad_norm": 0.04896374046802521, + "learning_rate": 2.0650060670245303e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3234112, + "step": 16985 + }, + { + "epoch": 8.83056133056133, + "grad_norm": 3.7039819289930165e-05, + "learning_rate": 2.0559902534916213e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3235104, + "step": 16990 + }, + { + "epoch": 8.833160083160083, + "grad_norm": 0.002246434800326824, + "learning_rate": 2.046993320475979e-06, + "loss": 0.0, + "num_input_tokens_seen": 3236064, + "step": 16995 + }, + { + "epoch": 8.835758835758837, + "grad_norm": 0.001827094703912735, + "learning_rate": 2.0380152753811443e-06, + "loss": 0.0, + "num_input_tokens_seen": 3237024, + "step": 17000 + }, + { + "epoch": 8.838357588357589, + "grad_norm": 0.0007033930160105228, + "learning_rate": 2.0290561255950967e-06, + "loss": 0.0, + "num_input_tokens_seen": 3237984, + "step": 17005 + }, + { + "epoch": 8.84095634095634, + "grad_norm": 2.788805795717053e-05, + "learning_rate": 2.0201158784902916e-06, + "loss": 0.0, + "num_input_tokens_seen": 3238912, + "step": 17010 + }, + { + "epoch": 8.843555093555093, + "grad_norm": 0.0024700735229998827, + "learning_rate": 2.0111945414236083e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3239904, + "step": 17015 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 7.54561842768453e-05, + "learning_rate": 2.002292121736371e-06, + "loss": 0.0, + "num_input_tokens_seen": 3240832, + "step": 17020 + }, + { + "epoch": 8.848752598752599, + "grad_norm": 3.2406358513981104e-05, + "learning_rate": 1.9934086267543396e-06, + "loss": 0.0, + "num_input_tokens_seen": 3241824, + "step": 17025 + }, + { + "epoch": 8.85135135135135, + "grad_norm": 0.0012475380208343267, + "learning_rate": 1.984544063787705e-06, + "loss": 0.0, + "num_input_tokens_seen": 3242752, + "step": 17030 + }, + { + "epoch": 8.853950103950105, + "grad_norm": 3.057676076423377e-05, + "learning_rate": 1.9756984401310684e-06, + "loss": 0.0, + "num_input_tokens_seen": 3243712, + "step": 17035 + }, + { + "epoch": 8.856548856548857, + "grad_norm": 0.0001885106903500855, + "learning_rate": 1.9668717630634575e-06, + "loss": 0.0, + "num_input_tokens_seen": 3244704, + "step": 17040 + }, + { + "epoch": 8.859147609147609, + "grad_norm": 2.7145686544827186e-05, + "learning_rate": 1.958064039848295e-06, + "loss": 0.0, + "num_input_tokens_seen": 3245600, + "step": 17045 + }, + { + "epoch": 8.861746361746361, + "grad_norm": 0.0008532237843610346, + "learning_rate": 1.9492752777334256e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3246560, + "step": 17050 + }, + { + "epoch": 8.864345114345115, + "grad_norm": 0.0013038825709372759, + "learning_rate": 1.9405054839510744e-06, + "loss": 0.0, + "num_input_tokens_seen": 3247520, + "step": 17055 + }, + { + "epoch": 8.866943866943867, + "grad_norm": 0.005312962923198938, + "learning_rate": 1.931754665717858e-06, + "loss": 0.0128, + "num_input_tokens_seen": 3248448, + "step": 17060 + }, + { + "epoch": 8.869542619542619, + "grad_norm": 3.530437970766798e-05, + "learning_rate": 1.9230228302347942e-06, + "loss": 0.0, + "num_input_tokens_seen": 3249408, + "step": 17065 + }, + { + "epoch": 8.872141372141373, + "grad_norm": 0.00014538939285557717, + "learning_rate": 1.91430998468726e-06, + "loss": 0.0, + "num_input_tokens_seen": 3250336, + "step": 17070 + }, + { + "epoch": 8.874740124740125, + "grad_norm": 0.0009474294492974877, + "learning_rate": 1.9056161362450226e-06, + "loss": 0.0, + "num_input_tokens_seen": 3251200, + "step": 17075 + }, + { + "epoch": 8.877338877338877, + "grad_norm": 0.000235006635193713, + "learning_rate": 1.8969412920622015e-06, + "loss": 0.0, + "num_input_tokens_seen": 3252128, + "step": 17080 + }, + { + "epoch": 8.87993762993763, + "grad_norm": 3.2521176763111725e-05, + "learning_rate": 1.8882854592772892e-06, + "loss": 0.0, + "num_input_tokens_seen": 3253056, + "step": 17085 + }, + { + "epoch": 8.882536382536383, + "grad_norm": 0.0002422707184450701, + "learning_rate": 1.8796486450131296e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3254016, + "step": 17090 + }, + { + "epoch": 8.885135135135135, + "grad_norm": 0.0032728423830121756, + "learning_rate": 1.8710308563769124e-06, + "loss": 0.0, + "num_input_tokens_seen": 3254912, + "step": 17095 + }, + { + "epoch": 8.887733887733887, + "grad_norm": 0.0003776298835873604, + "learning_rate": 1.862432100460182e-06, + "loss": 0.0, + "num_input_tokens_seen": 3255808, + "step": 17100 + }, + { + "epoch": 8.890332640332641, + "grad_norm": 0.0004911364521831274, + "learning_rate": 1.8538523843388056e-06, + "loss": 0.0, + "num_input_tokens_seen": 3256704, + "step": 17105 + }, + { + "epoch": 8.892931392931393, + "grad_norm": 0.0005769854760728776, + "learning_rate": 1.845291715073e-06, + "loss": 0.0, + "num_input_tokens_seen": 3257760, + "step": 17110 + }, + { + "epoch": 8.895530145530145, + "grad_norm": 0.00037962236092425883, + "learning_rate": 1.836750099707296e-06, + "loss": 0.0, + "num_input_tokens_seen": 3258784, + "step": 17115 + }, + { + "epoch": 8.898128898128899, + "grad_norm": 0.0006523430929519236, + "learning_rate": 1.8282275452705444e-06, + "loss": 0.0, + "num_input_tokens_seen": 3259744, + "step": 17120 + }, + { + "epoch": 8.900727650727651, + "grad_norm": 0.0002066921442747116, + "learning_rate": 1.8197240587759207e-06, + "loss": 0.0, + "num_input_tokens_seen": 3260640, + "step": 17125 + }, + { + "epoch": 8.903326403326403, + "grad_norm": 0.00022106178221292794, + "learning_rate": 1.8112396472208997e-06, + "loss": 0.0, + "num_input_tokens_seen": 3261632, + "step": 17130 + }, + { + "epoch": 8.905925155925155, + "grad_norm": 7.574316987302154e-05, + "learning_rate": 1.8027743175872664e-06, + "loss": 0.0, + "num_input_tokens_seen": 3262592, + "step": 17135 + }, + { + "epoch": 8.90852390852391, + "grad_norm": 0.0005028468440286815, + "learning_rate": 1.7943280768410981e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3263584, + "step": 17140 + }, + { + "epoch": 8.911122661122661, + "grad_norm": 4.2259402107447386e-05, + "learning_rate": 1.7859009319327713e-06, + "loss": 0.0, + "num_input_tokens_seen": 3264480, + "step": 17145 + }, + { + "epoch": 8.913721413721413, + "grad_norm": 2.638110709085595e-05, + "learning_rate": 1.7774928897969418e-06, + "loss": 0.0, + "num_input_tokens_seen": 3265408, + "step": 17150 + }, + { + "epoch": 8.916320166320165, + "grad_norm": 0.002542982343584299, + "learning_rate": 1.7691039573525442e-06, + "loss": 0.0, + "num_input_tokens_seen": 3266368, + "step": 17155 + }, + { + "epoch": 8.91891891891892, + "grad_norm": 0.0008157509146258235, + "learning_rate": 1.76073414150279e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3267264, + "step": 17160 + }, + { + "epoch": 8.921517671517671, + "grad_norm": 0.00030731697916053236, + "learning_rate": 1.7523834491351699e-06, + "loss": 0.0, + "num_input_tokens_seen": 3268192, + "step": 17165 + }, + { + "epoch": 8.924116424116423, + "grad_norm": 0.0006300635868683457, + "learning_rate": 1.7440518871214173e-06, + "loss": 0.0, + "num_input_tokens_seen": 3269120, + "step": 17170 + }, + { + "epoch": 8.926715176715177, + "grad_norm": 0.0023767880629748106, + "learning_rate": 1.7357394623175454e-06, + "loss": 0.0, + "num_input_tokens_seen": 3270048, + "step": 17175 + }, + { + "epoch": 8.92931392931393, + "grad_norm": 2.5732304493431002e-05, + "learning_rate": 1.7274461815638104e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3271040, + "step": 17180 + }, + { + "epoch": 8.931912681912682, + "grad_norm": 0.0007680122507736087, + "learning_rate": 1.7191720516847032e-06, + "loss": 0.0, + "num_input_tokens_seen": 3272000, + "step": 17185 + }, + { + "epoch": 8.934511434511435, + "grad_norm": 0.00129531545098871, + "learning_rate": 1.7109170794889773e-06, + "loss": 0.0, + "num_input_tokens_seen": 3272896, + "step": 17190 + }, + { + "epoch": 8.937110187110187, + "grad_norm": 0.0018107002833858132, + "learning_rate": 1.7026812717695988e-06, + "loss": 0.0, + "num_input_tokens_seen": 3273792, + "step": 17195 + }, + { + "epoch": 8.93970893970894, + "grad_norm": 0.0022183102555572987, + "learning_rate": 1.6944646353037858e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3274720, + "step": 17200 + }, + { + "epoch": 8.942307692307692, + "grad_norm": 0.0011575708631426096, + "learning_rate": 1.6862671768529626e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3275680, + "step": 17205 + }, + { + "epoch": 8.944906444906445, + "grad_norm": 0.00037227620487101376, + "learning_rate": 1.6780889031627861e-06, + "loss": 0.0, + "num_input_tokens_seen": 3276672, + "step": 17210 + }, + { + "epoch": 8.947505197505198, + "grad_norm": 0.008954785764217377, + "learning_rate": 1.6699298209631148e-06, + "loss": 0.0, + "num_input_tokens_seen": 3277664, + "step": 17215 + }, + { + "epoch": 8.95010395010395, + "grad_norm": 3.3253254514420405e-05, + "learning_rate": 1.661789936968014e-06, + "loss": 0.0, + "num_input_tokens_seen": 3278560, + "step": 17220 + }, + { + "epoch": 8.952702702702704, + "grad_norm": 0.0007280550198629498, + "learning_rate": 1.6536692578757646e-06, + "loss": 0.0, + "num_input_tokens_seen": 3279520, + "step": 17225 + }, + { + "epoch": 8.955301455301456, + "grad_norm": 0.0007454305305145681, + "learning_rate": 1.6455677903688293e-06, + "loss": 0.0, + "num_input_tokens_seen": 3280512, + "step": 17230 + }, + { + "epoch": 8.957900207900208, + "grad_norm": 0.1375848948955536, + "learning_rate": 1.6374855411138702e-06, + "loss": 0.0004, + "num_input_tokens_seen": 3281504, + "step": 17235 + }, + { + "epoch": 8.96049896049896, + "grad_norm": 3.091476173722185e-05, + "learning_rate": 1.6294225167617305e-06, + "loss": 0.0, + "num_input_tokens_seen": 3282432, + "step": 17240 + }, + { + "epoch": 8.963097713097714, + "grad_norm": 0.0015281705418601632, + "learning_rate": 1.6213787239474365e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3283488, + "step": 17245 + }, + { + "epoch": 8.965696465696466, + "grad_norm": 0.0001886098471004516, + "learning_rate": 1.6133541692901877e-06, + "loss": 0.0, + "num_input_tokens_seen": 3284512, + "step": 17250 + }, + { + "epoch": 8.968295218295218, + "grad_norm": 0.0009083859040401876, + "learning_rate": 1.6053488593933464e-06, + "loss": 0.0, + "num_input_tokens_seen": 3285504, + "step": 17255 + }, + { + "epoch": 8.970893970893972, + "grad_norm": 0.0012201153440400958, + "learning_rate": 1.597362800844454e-06, + "loss": 0.0, + "num_input_tokens_seen": 3286432, + "step": 17260 + }, + { + "epoch": 8.973492723492724, + "grad_norm": 0.001126366201788187, + "learning_rate": 1.5893960002151903e-06, + "loss": 0.0, + "num_input_tokens_seen": 3287392, + "step": 17265 + }, + { + "epoch": 8.976091476091476, + "grad_norm": 0.001664741081185639, + "learning_rate": 1.581448464061408e-06, + "loss": 0.0, + "num_input_tokens_seen": 3288320, + "step": 17270 + }, + { + "epoch": 8.978690228690228, + "grad_norm": 0.00015577521116938442, + "learning_rate": 1.5735201989230868e-06, + "loss": 0.0, + "num_input_tokens_seen": 3289312, + "step": 17275 + }, + { + "epoch": 8.981288981288982, + "grad_norm": 0.001096616848371923, + "learning_rate": 1.5656112113243721e-06, + "loss": 0.0, + "num_input_tokens_seen": 3290304, + "step": 17280 + }, + { + "epoch": 8.983887733887734, + "grad_norm": 0.0003773977514356375, + "learning_rate": 1.5577215077735157e-06, + "loss": 0.0, + "num_input_tokens_seen": 3291296, + "step": 17285 + }, + { + "epoch": 8.986486486486486, + "grad_norm": 0.00019342097220942378, + "learning_rate": 1.5498510947629302e-06, + "loss": 0.0, + "num_input_tokens_seen": 3292192, + "step": 17290 + }, + { + "epoch": 8.98908523908524, + "grad_norm": 0.0014555227244272828, + "learning_rate": 1.541999978769132e-06, + "loss": 0.0, + "num_input_tokens_seen": 3293152, + "step": 17295 + }, + { + "epoch": 8.991683991683992, + "grad_norm": 0.00495307007804513, + "learning_rate": 1.5341681662527724e-06, + "loss": 0.0, + "num_input_tokens_seen": 3294080, + "step": 17300 + }, + { + "epoch": 8.994282744282744, + "grad_norm": 4.012298086308874e-05, + "learning_rate": 1.5263556636586157e-06, + "loss": 0.0, + "num_input_tokens_seen": 3295072, + "step": 17305 + }, + { + "epoch": 8.996881496881496, + "grad_norm": 2.961607970064506e-05, + "learning_rate": 1.5185624774155333e-06, + "loss": 0.0, + "num_input_tokens_seen": 3296000, + "step": 17310 + }, + { + "epoch": 8.99948024948025, + "grad_norm": 0.0006544729112647474, + "learning_rate": 1.5107886139364952e-06, + "loss": 0.0, + "num_input_tokens_seen": 3296992, + "step": 17315 + }, + { + "epoch": 9.0, + "eval_loss": 0.4257082939147949, + "eval_runtime": 9.2172, + "eval_samples_per_second": 92.87, + "eval_steps_per_second": 23.217, + "num_input_tokens_seen": 3297136, + "step": 17316 + }, + { + "epoch": 9.002079002079002, + "grad_norm": 0.00024519237922504544, + "learning_rate": 1.5030340796185787e-06, + "loss": 0.0, + "num_input_tokens_seen": 3298000, + "step": 17320 + }, + { + "epoch": 9.004677754677754, + "grad_norm": 0.0010815632995218039, + "learning_rate": 1.4952988808429575e-06, + "loss": 0.0001, + "num_input_tokens_seen": 3298928, + "step": 17325 + }, + { + "epoch": 9.007276507276508, + "grad_norm": 0.00012508922372944653, + "learning_rate": 1.4875830239748867e-06, + "loss": 0.0, + "num_input_tokens_seen": 3299952, + "step": 17330 + }, + { + "epoch": 9.00987525987526, + "grad_norm": 0.00032533190096728504, + "learning_rate": 1.4798865153637097e-06, + "loss": 0.0, + "num_input_tokens_seen": 3300880, + "step": 17335 + }, + { + "epoch": 9.012474012474012, + "grad_norm": 2.757175207079854e-05, + "learning_rate": 1.472209361342844e-06, + "loss": 0.0, + "num_input_tokens_seen": 3301776, + "step": 17340 + }, + { + "epoch": 9.015072765072764, + "grad_norm": 0.0015981205506250262, + "learning_rate": 1.4645515682297911e-06, + "loss": 0.0, + "num_input_tokens_seen": 3302800, + "step": 17345 + }, + { + "epoch": 9.017671517671518, + "grad_norm": 8.392945164814591e-05, + "learning_rate": 1.456913142326108e-06, + "loss": 0.0, + "num_input_tokens_seen": 3303824, + "step": 17350 + }, + { + "epoch": 9.02027027027027, + "grad_norm": 8.348479605047032e-05, + "learning_rate": 1.4492940899174134e-06, + "loss": 0.0, + "num_input_tokens_seen": 3304720, + "step": 17355 + }, + { + "epoch": 9.022869022869022, + "grad_norm": 0.00013909820700064301, + "learning_rate": 1.441694417273401e-06, + "loss": 0.0, + "num_input_tokens_seen": 3305712, + "step": 17360 + }, + { + "epoch": 9.025467775467776, + "grad_norm": 0.008784877136349678, + "learning_rate": 1.4341141306477957e-06, + "loss": 0.0, + "num_input_tokens_seen": 3306672, + "step": 17365 + }, + { + "epoch": 9.028066528066528, + "grad_norm": 0.0005093244835734367, + "learning_rate": 1.4265532362783884e-06, + "loss": 0.0, + "num_input_tokens_seen": 3307600, + "step": 17370 + }, + { + "epoch": 9.03066528066528, + "grad_norm": 3.0178622182575054e-05, + "learning_rate": 1.4190117403869968e-06, + "loss": 0.0, + "num_input_tokens_seen": 3308592, + "step": 17375 + }, + { + "epoch": 9.033264033264032, + "grad_norm": 0.0003650652652140707, + "learning_rate": 1.4114896491794816e-06, + "loss": 0.0, + "num_input_tokens_seen": 3309552, + "step": 17380 + }, + { + "epoch": 9.035862785862786, + "grad_norm": 4.126054409425706e-05, + "learning_rate": 1.4039869688457414e-06, + "loss": 0.0, + "num_input_tokens_seen": 3310576, + "step": 17385 + }, + { + "epoch": 9.038461538461538, + "grad_norm": 4.263934897608124e-05, + "learning_rate": 1.396503705559693e-06, + "loss": 0.0, + "num_input_tokens_seen": 3311600, + "step": 17390 + }, + { + "epoch": 9.04106029106029, + "grad_norm": 0.0004984585684724152, + "learning_rate": 1.3890398654792803e-06, + "loss": 0.0, + "num_input_tokens_seen": 3312496, + "step": 17395 + }, + { + "epoch": 9.043659043659044, + "grad_norm": 0.00042082814616151154, + "learning_rate": 1.3815954547464565e-06, + "loss": 0.0, + "num_input_tokens_seen": 3313456, + "step": 17400 + }, + { + "epoch": 9.046257796257796, + "grad_norm": 8.844215335557237e-05, + "learning_rate": 1.3741704794872024e-06, + "loss": 0.0, + "num_input_tokens_seen": 3314416, + "step": 17405 + }, + { + "epoch": 9.048856548856548, + "grad_norm": 2.956789467134513e-05, + "learning_rate": 1.3667649458114857e-06, + "loss": 0.0016, + "num_input_tokens_seen": 3315376, + "step": 17410 + }, + { + "epoch": 9.051455301455302, + "grad_norm": 0.009012916125357151, + "learning_rate": 1.3593788598132928e-06, + "loss": 0.0, + "num_input_tokens_seen": 3316272, + "step": 17415 + }, + { + "epoch": 9.054054054054054, + "grad_norm": 0.0021471604704856873, + "learning_rate": 1.3520122275705871e-06, + "loss": 0.0, + "num_input_tokens_seen": 3317200, + "step": 17420 + }, + { + "epoch": 9.056652806652806, + "grad_norm": 0.0005874193739145994, + "learning_rate": 1.344665055145347e-06, + "loss": 0.0, + "num_input_tokens_seen": 3318224, + "step": 17425 + }, + { + "epoch": 9.059251559251559, + "grad_norm": 0.0005674638086929917, + "learning_rate": 1.3373373485835227e-06, + "loss": 0.0, + "num_input_tokens_seen": 3319152, + "step": 17430 + }, + { + "epoch": 9.061850311850312, + "grad_norm": 0.0008338006446138024, + "learning_rate": 1.3300291139150461e-06, + "loss": 0.0, + "num_input_tokens_seen": 3320112, + "step": 17435 + }, + { + "epoch": 9.064449064449065, + "grad_norm": 0.0002749962732195854, + "learning_rate": 1.3227403571538398e-06, + "loss": 0.0, + "num_input_tokens_seen": 3321104, + "step": 17440 + }, + { + "epoch": 9.067047817047817, + "grad_norm": 0.00021779618691653013, + "learning_rate": 1.3154710842977703e-06, + "loss": 0.0, + "num_input_tokens_seen": 3322000, + "step": 17445 + }, + { + "epoch": 9.06964656964657, + "grad_norm": 8.977041579782963e-05, + "learning_rate": 1.3082213013286993e-06, + "loss": 0.0, + "num_input_tokens_seen": 3322928, + "step": 17450 + }, + { + "epoch": 9.072245322245323, + "grad_norm": 0.0004125084378756583, + "learning_rate": 1.3009910142124354e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3323824, + "step": 17455 + }, + { + "epoch": 9.074844074844075, + "grad_norm": 0.00012730721209663898, + "learning_rate": 1.2937802288987499e-06, + "loss": 0.0, + "num_input_tokens_seen": 3324752, + "step": 17460 + }, + { + "epoch": 9.077442827442827, + "grad_norm": 5.1393191824899986e-05, + "learning_rate": 1.286588951321363e-06, + "loss": 0.0, + "num_input_tokens_seen": 3325616, + "step": 17465 + }, + { + "epoch": 9.08004158004158, + "grad_norm": 0.0005161667359061539, + "learning_rate": 1.2794171873979439e-06, + "loss": 0.0, + "num_input_tokens_seen": 3326608, + "step": 17470 + }, + { + "epoch": 9.082640332640333, + "grad_norm": 0.00043588352855294943, + "learning_rate": 1.272264943030102e-06, + "loss": 0.0, + "num_input_tokens_seen": 3327504, + "step": 17475 + }, + { + "epoch": 9.085239085239085, + "grad_norm": 0.00010506583203095943, + "learning_rate": 1.2651322241033825e-06, + "loss": 0.0, + "num_input_tokens_seen": 3328464, + "step": 17480 + }, + { + "epoch": 9.087837837837839, + "grad_norm": 2.752017098828219e-05, + "learning_rate": 1.2580190364872706e-06, + "loss": 0.0, + "num_input_tokens_seen": 3329328, + "step": 17485 + }, + { + "epoch": 9.09043659043659, + "grad_norm": 0.0002369489666307345, + "learning_rate": 1.2509253860351732e-06, + "loss": 0.0, + "num_input_tokens_seen": 3330256, + "step": 17490 + }, + { + "epoch": 9.093035343035343, + "grad_norm": 0.0005296734743751585, + "learning_rate": 1.2438512785844237e-06, + "loss": 0.0, + "num_input_tokens_seen": 3331152, + "step": 17495 + }, + { + "epoch": 9.095634095634095, + "grad_norm": 0.0027435196097940207, + "learning_rate": 1.236796719956268e-06, + "loss": 0.0, + "num_input_tokens_seen": 3332080, + "step": 17500 + }, + { + "epoch": 9.098232848232849, + "grad_norm": 5.6935336033347994e-05, + "learning_rate": 1.229761715955874e-06, + "loss": 0.0, + "num_input_tokens_seen": 3333040, + "step": 17505 + }, + { + "epoch": 9.1008316008316, + "grad_norm": 0.0005662778276018798, + "learning_rate": 1.2227462723723077e-06, + "loss": 0.0005, + "num_input_tokens_seen": 3334064, + "step": 17510 + }, + { + "epoch": 9.103430353430353, + "grad_norm": 4.598467785399407e-05, + "learning_rate": 1.2157503949785487e-06, + "loss": 0.0, + "num_input_tokens_seen": 3335024, + "step": 17515 + }, + { + "epoch": 9.106029106029107, + "grad_norm": 0.00016446318477392197, + "learning_rate": 1.2087740895314697e-06, + "loss": 0.0, + "num_input_tokens_seen": 3335920, + "step": 17520 + }, + { + "epoch": 9.108627858627859, + "grad_norm": 3.1353549275081605e-05, + "learning_rate": 1.201817361771837e-06, + "loss": 0.0, + "num_input_tokens_seen": 3336912, + "step": 17525 + }, + { + "epoch": 9.111226611226611, + "grad_norm": 0.01258544810116291, + "learning_rate": 1.1948802174243158e-06, + "loss": 0.0, + "num_input_tokens_seen": 3337904, + "step": 17530 + }, + { + "epoch": 9.113825363825363, + "grad_norm": 0.00022427622752729803, + "learning_rate": 1.187962662197442e-06, + "loss": 0.0, + "num_input_tokens_seen": 3338864, + "step": 17535 + }, + { + "epoch": 9.116424116424117, + "grad_norm": 5.397464337875135e-05, + "learning_rate": 1.181064701783649e-06, + "loss": 0.0002, + "num_input_tokens_seen": 3339920, + "step": 17540 + }, + { + "epoch": 9.119022869022869, + "grad_norm": 9.923223115038127e-05, + "learning_rate": 1.174186341859221e-06, + "loss": 0.0404, + "num_input_tokens_seen": 3340912, + "step": 17545 + }, + { + "epoch": 9.121621621621621, + "grad_norm": 0.0009247290436178446, + "learning_rate": 1.1673275880843382e-06, + "loss": 0.0, + "num_input_tokens_seen": 3341936, + "step": 17550 + }, + { + "epoch": 9.124220374220375, + "grad_norm": 0.006148544140160084, + "learning_rate": 1.1604884461030392e-06, + "loss": 0.0, + "num_input_tokens_seen": 3342928, + "step": 17555 + }, + { + "epoch": 9.126819126819127, + "grad_norm": 0.0005047255544923246, + "learning_rate": 1.1536689215432106e-06, + "loss": 0.0, + "num_input_tokens_seen": 3343760, + "step": 17560 + }, + { + "epoch": 9.129417879417879, + "grad_norm": 0.002037704223766923, + "learning_rate": 1.1468690200166193e-06, + "loss": 0.0, + "num_input_tokens_seen": 3344784, + "step": 17565 + }, + { + "epoch": 9.132016632016631, + "grad_norm": 0.11435241997241974, + "learning_rate": 1.1400887471188614e-06, + "loss": 0.0008, + "num_input_tokens_seen": 3345712, + "step": 17570 + }, + { + "epoch": 9.134615384615385, + "grad_norm": 2.705134465941228e-05, + "learning_rate": 1.1333281084294045e-06, + "loss": 0.0, + "num_input_tokens_seen": 3346672, + "step": 17575 + }, + { + "epoch": 9.137214137214137, + "grad_norm": 0.00033472556970082223, + "learning_rate": 1.1265871095115315e-06, + "loss": 0.0, + "num_input_tokens_seen": 3347664, + "step": 17580 + }, + { + "epoch": 9.13981288981289, + "grad_norm": 0.0012592956190928817, + "learning_rate": 1.1198657559123888e-06, + "loss": 0.0, + "num_input_tokens_seen": 3348624, + "step": 17585 + }, + { + "epoch": 9.142411642411643, + "grad_norm": 0.00013970723375678062, + "learning_rate": 1.1131640531629377e-06, + "loss": 0.0, + "num_input_tokens_seen": 3349520, + "step": 17590 + }, + { + "epoch": 9.145010395010395, + "grad_norm": 0.0012542122276499867, + "learning_rate": 1.1064820067779897e-06, + "loss": 0.0, + "num_input_tokens_seen": 3350480, + "step": 17595 + }, + { + "epoch": 9.147609147609147, + "grad_norm": 0.0015375057701021433, + "learning_rate": 1.0998196222561568e-06, + "loss": 0.0, + "num_input_tokens_seen": 3351376, + "step": 17600 + }, + { + "epoch": 9.1502079002079, + "grad_norm": 3.1577546906191856e-05, + "learning_rate": 1.093176905079893e-06, + "loss": 0.0, + "num_input_tokens_seen": 3352208, + "step": 17605 + }, + { + "epoch": 9.152806652806653, + "grad_norm": 0.01747054234147072, + "learning_rate": 1.0865538607154557e-06, + "loss": 0.0, + "num_input_tokens_seen": 3353200, + "step": 17610 + }, + { + "epoch": 9.155405405405405, + "grad_norm": 0.006554083898663521, + "learning_rate": 1.0799504946129135e-06, + "loss": 0.0, + "num_input_tokens_seen": 3354192, + "step": 17615 + }, + { + "epoch": 9.158004158004157, + "grad_norm": 0.00041201425483450294, + "learning_rate": 1.0733668122061503e-06, + "loss": 0.0, + "num_input_tokens_seen": 3355120, + "step": 17620 + }, + { + "epoch": 9.160602910602911, + "grad_norm": 0.0006672346498817205, + "learning_rate": 1.0668028189128431e-06, + "loss": 0.0, + "num_input_tokens_seen": 3356080, + "step": 17625 + }, + { + "epoch": 9.163201663201663, + "grad_norm": 4.3054464185843244e-05, + "learning_rate": 1.0602585201344772e-06, + "loss": 0.0, + "num_input_tokens_seen": 3357008, + "step": 17630 + }, + { + "epoch": 9.165800415800415, + "grad_norm": 3.236072006984614e-05, + "learning_rate": 1.053733921256317e-06, + "loss": 0.0, + "num_input_tokens_seen": 3358032, + "step": 17635 + }, + { + "epoch": 9.16839916839917, + "grad_norm": 3.365799057064578e-05, + "learning_rate": 1.0472290276474312e-06, + "loss": 0.0, + "num_input_tokens_seen": 3359024, + "step": 17640 + }, + { + "epoch": 9.170997920997921, + "grad_norm": 0.0014437389327213168, + "learning_rate": 1.0407438446606633e-06, + "loss": 0.0, + "num_input_tokens_seen": 3359984, + "step": 17645 + }, + { + "epoch": 9.173596673596673, + "grad_norm": 0.0004846591327805072, + "learning_rate": 1.034278377632636e-06, + "loss": 0.0033, + "num_input_tokens_seen": 3361008, + "step": 17650 + }, + { + "epoch": 9.176195426195425, + "grad_norm": 3.2107003789860755e-05, + "learning_rate": 1.0278326318837571e-06, + "loss": 0.0, + "num_input_tokens_seen": 3361936, + "step": 17655 + }, + { + "epoch": 9.17879417879418, + "grad_norm": 0.00017804550589062274, + "learning_rate": 1.0214066127181953e-06, + "loss": 0.0, + "num_input_tokens_seen": 3362896, + "step": 17660 + }, + { + "epoch": 9.181392931392931, + "grad_norm": 9.363074786961079e-05, + "learning_rate": 1.0150003254238983e-06, + "loss": 0.0, + "num_input_tokens_seen": 3363792, + "step": 17665 + }, + { + "epoch": 9.183991683991684, + "grad_norm": 0.0018665487878024578, + "learning_rate": 1.0086137752725655e-06, + "loss": 0.0, + "num_input_tokens_seen": 3364752, + "step": 17670 + }, + { + "epoch": 9.186590436590437, + "grad_norm": 0.00020163536828476936, + "learning_rate": 1.0022469675196572e-06, + "loss": 0.0, + "num_input_tokens_seen": 3365680, + "step": 17675 + }, + { + "epoch": 9.18918918918919, + "grad_norm": 3.2678628485882655e-05, + "learning_rate": 9.958999074043935e-07, + "loss": 0.0, + "num_input_tokens_seen": 3366640, + "step": 17680 + }, + { + "epoch": 9.191787941787942, + "grad_norm": 3.0173729101079516e-05, + "learning_rate": 9.895726001497352e-07, + "loss": 0.0, + "num_input_tokens_seen": 3367664, + "step": 17685 + }, + { + "epoch": 9.194386694386694, + "grad_norm": 3.113892671535723e-05, + "learning_rate": 9.83265050962398e-07, + "loss": 0.0, + "num_input_tokens_seen": 3368560, + "step": 17690 + }, + { + "epoch": 9.196985446985448, + "grad_norm": 0.00282269180752337, + "learning_rate": 9.769772650328328e-07, + "loss": 0.0, + "num_input_tokens_seen": 3369488, + "step": 17695 + }, + { + "epoch": 9.1995841995842, + "grad_norm": 0.001524688326753676, + "learning_rate": 9.707092475352285e-07, + "loss": 0.0, + "num_input_tokens_seen": 3370416, + "step": 17700 + }, + { + "epoch": 9.202182952182952, + "grad_norm": 9.261359082302079e-05, + "learning_rate": 9.644610036275093e-07, + "loss": 0.0, + "num_input_tokens_seen": 3371344, + "step": 17705 + }, + { + "epoch": 9.204781704781706, + "grad_norm": 0.00017191126244142652, + "learning_rate": 9.58232538451323e-07, + "loss": 0.0, + "num_input_tokens_seen": 3372304, + "step": 17710 + }, + { + "epoch": 9.207380457380458, + "grad_norm": 0.002353758318349719, + "learning_rate": 9.520238571320423e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3373264, + "step": 17715 + }, + { + "epoch": 9.20997920997921, + "grad_norm": 2.7574618798098527e-05, + "learning_rate": 9.458349647787662e-07, + "loss": 0.0, + "num_input_tokens_seen": 3374160, + "step": 17720 + }, + { + "epoch": 9.212577962577962, + "grad_norm": 3.0228284231270663e-05, + "learning_rate": 9.396658664843017e-07, + "loss": 0.0, + "num_input_tokens_seen": 3375056, + "step": 17725 + }, + { + "epoch": 9.215176715176716, + "grad_norm": 0.0009196996688842773, + "learning_rate": 9.335165673251739e-07, + "loss": 0.0, + "num_input_tokens_seen": 3376016, + "step": 17730 + }, + { + "epoch": 9.217775467775468, + "grad_norm": 0.00031765300082042813, + "learning_rate": 9.273870723616129e-07, + "loss": 0.0, + "num_input_tokens_seen": 3376976, + "step": 17735 + }, + { + "epoch": 9.22037422037422, + "grad_norm": 6.587384996237233e-05, + "learning_rate": 9.212773866375424e-07, + "loss": 0.0, + "num_input_tokens_seen": 3377936, + "step": 17740 + }, + { + "epoch": 9.222972972972974, + "grad_norm": 7.280182035174221e-05, + "learning_rate": 9.151875151806044e-07, + "loss": 0.0, + "num_input_tokens_seen": 3378960, + "step": 17745 + }, + { + "epoch": 9.225571725571726, + "grad_norm": 0.002101112389937043, + "learning_rate": 9.091174630021182e-07, + "loss": 0.0, + "num_input_tokens_seen": 3379952, + "step": 17750 + }, + { + "epoch": 9.228170478170478, + "grad_norm": 4.9864342145156115e-05, + "learning_rate": 9.030672350971076e-07, + "loss": 0.0, + "num_input_tokens_seen": 3380912, + "step": 17755 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 0.0006152680143713951, + "learning_rate": 8.970368364442705e-07, + "loss": 0.0, + "num_input_tokens_seen": 3381808, + "step": 17760 + }, + { + "epoch": 9.233367983367984, + "grad_norm": 0.00011912375339306891, + "learning_rate": 8.910262720059959e-07, + "loss": 0.0012, + "num_input_tokens_seen": 3382800, + "step": 17765 + }, + { + "epoch": 9.235966735966736, + "grad_norm": 3.24623761116527e-05, + "learning_rate": 8.850355467283494e-07, + "loss": 0.0, + "num_input_tokens_seen": 3383760, + "step": 17770 + }, + { + "epoch": 9.238565488565488, + "grad_norm": 0.00025023892521858215, + "learning_rate": 8.790646655410684e-07, + "loss": 0.0002, + "num_input_tokens_seen": 3384784, + "step": 17775 + }, + { + "epoch": 9.241164241164242, + "grad_norm": 0.0001308184291701764, + "learning_rate": 8.731136333575668e-07, + "loss": 0.0, + "num_input_tokens_seen": 3385808, + "step": 17780 + }, + { + "epoch": 9.243762993762994, + "grad_norm": 0.0012960312888026237, + "learning_rate": 8.671824550749164e-07, + "loss": 0.0, + "num_input_tokens_seen": 3386736, + "step": 17785 + }, + { + "epoch": 9.246361746361746, + "grad_norm": 6.022364323143847e-05, + "learning_rate": 8.612711355738601e-07, + "loss": 0.0226, + "num_input_tokens_seen": 3387728, + "step": 17790 + }, + { + "epoch": 9.248960498960498, + "grad_norm": 0.00011812626325991005, + "learning_rate": 8.553796797187902e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3388688, + "step": 17795 + }, + { + "epoch": 9.251559251559252, + "grad_norm": 0.0003728899755515158, + "learning_rate": 8.495080923577619e-07, + "loss": 0.0, + "num_input_tokens_seen": 3389680, + "step": 17800 + }, + { + "epoch": 9.254158004158004, + "grad_norm": 6.470429798355326e-05, + "learning_rate": 8.436563783224744e-07, + "loss": 0.0, + "num_input_tokens_seen": 3390608, + "step": 17805 + }, + { + "epoch": 9.256756756756756, + "grad_norm": 9.379191760672256e-05, + "learning_rate": 8.378245424282755e-07, + "loss": 0.0, + "num_input_tokens_seen": 3391568, + "step": 17810 + }, + { + "epoch": 9.25935550935551, + "grad_norm": 2.8383563403622247e-05, + "learning_rate": 8.320125894741598e-07, + "loss": 0.0, + "num_input_tokens_seen": 3392432, + "step": 17815 + }, + { + "epoch": 9.261954261954262, + "grad_norm": 0.0015485204057767987, + "learning_rate": 8.262205242427462e-07, + "loss": 0.0, + "num_input_tokens_seen": 3393424, + "step": 17820 + }, + { + "epoch": 9.264553014553014, + "grad_norm": 0.00047987833386287093, + "learning_rate": 8.204483515003081e-07, + "loss": 0.0, + "num_input_tokens_seen": 3394416, + "step": 17825 + }, + { + "epoch": 9.267151767151766, + "grad_norm": 0.0010136455530300736, + "learning_rate": 8.146960759967348e-07, + "loss": 0.0, + "num_input_tokens_seen": 3395344, + "step": 17830 + }, + { + "epoch": 9.26975051975052, + "grad_norm": 0.0005100581329315901, + "learning_rate": 8.089637024655483e-07, + "loss": 0.0, + "num_input_tokens_seen": 3396272, + "step": 17835 + }, + { + "epoch": 9.272349272349272, + "grad_norm": 0.0002649901725817472, + "learning_rate": 8.032512356238864e-07, + "loss": 0.0, + "num_input_tokens_seen": 3397264, + "step": 17840 + }, + { + "epoch": 9.274948024948024, + "grad_norm": 0.00015592513955198228, + "learning_rate": 7.975586801725194e-07, + "loss": 0.0, + "num_input_tokens_seen": 3398192, + "step": 17845 + }, + { + "epoch": 9.277546777546778, + "grad_norm": 0.0007201214320957661, + "learning_rate": 7.91886040795814e-07, + "loss": 0.0, + "num_input_tokens_seen": 3399120, + "step": 17850 + }, + { + "epoch": 9.28014553014553, + "grad_norm": 0.0010508253471925855, + "learning_rate": 7.862333221617668e-07, + "loss": 0.0057, + "num_input_tokens_seen": 3400080, + "step": 17855 + }, + { + "epoch": 9.282744282744282, + "grad_norm": 3.4499265893828124e-05, + "learning_rate": 7.806005289219737e-07, + "loss": 0.0, + "num_input_tokens_seen": 3401072, + "step": 17860 + }, + { + "epoch": 9.285343035343036, + "grad_norm": 0.004074655007570982, + "learning_rate": 7.749876657116295e-07, + "loss": 0.0, + "num_input_tokens_seen": 3402000, + "step": 17865 + }, + { + "epoch": 9.287941787941788, + "grad_norm": 5.49044942855835, + "learning_rate": 7.693947371495313e-07, + "loss": 0.0586, + "num_input_tokens_seen": 3403024, + "step": 17870 + }, + { + "epoch": 9.29054054054054, + "grad_norm": 0.002982884179800749, + "learning_rate": 7.638217478380782e-07, + "loss": 0.0032, + "num_input_tokens_seen": 3403952, + "step": 17875 + }, + { + "epoch": 9.293139293139292, + "grad_norm": 3.0282984880614094e-05, + "learning_rate": 7.582687023632545e-07, + "loss": 0.0, + "num_input_tokens_seen": 3404944, + "step": 17880 + }, + { + "epoch": 9.295738045738046, + "grad_norm": 0.0002952135691884905, + "learning_rate": 7.527356052946327e-07, + "loss": 0.0, + "num_input_tokens_seen": 3405936, + "step": 17885 + }, + { + "epoch": 9.298336798336798, + "grad_norm": 0.006216381676495075, + "learning_rate": 7.47222461185379e-07, + "loss": 0.0, + "num_input_tokens_seen": 3406864, + "step": 17890 + }, + { + "epoch": 9.30093555093555, + "grad_norm": 0.00018261984223499894, + "learning_rate": 7.417292745722282e-07, + "loss": 0.0, + "num_input_tokens_seen": 3407856, + "step": 17895 + }, + { + "epoch": 9.303534303534304, + "grad_norm": 3.802837818511762e-05, + "learning_rate": 7.362560499755006e-07, + "loss": 0.0, + "num_input_tokens_seen": 3408848, + "step": 17900 + }, + { + "epoch": 9.306133056133056, + "grad_norm": 0.0003983911301475018, + "learning_rate": 7.30802791899085e-07, + "loss": 0.0, + "num_input_tokens_seen": 3409808, + "step": 17905 + }, + { + "epoch": 9.308731808731808, + "grad_norm": 0.00015475135296583176, + "learning_rate": 7.253695048304394e-07, + "loss": 0.0, + "num_input_tokens_seen": 3410800, + "step": 17910 + }, + { + "epoch": 9.31133056133056, + "grad_norm": 3.201994695700705e-05, + "learning_rate": 7.199561932405952e-07, + "loss": 0.0005, + "num_input_tokens_seen": 3411760, + "step": 17915 + }, + { + "epoch": 9.313929313929314, + "grad_norm": 0.0005763785447925329, + "learning_rate": 7.145628615841365e-07, + "loss": 0.0, + "num_input_tokens_seen": 3412656, + "step": 17920 + }, + { + "epoch": 9.316528066528067, + "grad_norm": 0.00017520671826787293, + "learning_rate": 7.091895142992133e-07, + "loss": 0.0498, + "num_input_tokens_seen": 3413680, + "step": 17925 + }, + { + "epoch": 9.319126819126819, + "grad_norm": 7.747256313450634e-05, + "learning_rate": 7.038361558075273e-07, + "loss": 0.0, + "num_input_tokens_seen": 3414736, + "step": 17930 + }, + { + "epoch": 9.321725571725572, + "grad_norm": 0.0016202969709411263, + "learning_rate": 6.985027905143299e-07, + "loss": 0.0, + "num_input_tokens_seen": 3415696, + "step": 17935 + }, + { + "epoch": 9.324324324324325, + "grad_norm": 0.007398161105811596, + "learning_rate": 6.931894228084268e-07, + "loss": 0.0046, + "num_input_tokens_seen": 3416720, + "step": 17940 + }, + { + "epoch": 9.326923076923077, + "grad_norm": 0.0012656546896323562, + "learning_rate": 6.878960570621568e-07, + "loss": 0.0, + "num_input_tokens_seen": 3417680, + "step": 17945 + }, + { + "epoch": 9.329521829521829, + "grad_norm": 0.00098116893786937, + "learning_rate": 6.826226976314104e-07, + "loss": 0.0, + "num_input_tokens_seen": 3418640, + "step": 17950 + }, + { + "epoch": 9.332120582120583, + "grad_norm": 0.0005293239955790341, + "learning_rate": 6.773693488556083e-07, + "loss": 0.0, + "num_input_tokens_seen": 3419504, + "step": 17955 + }, + { + "epoch": 9.334719334719335, + "grad_norm": 0.0002181453164666891, + "learning_rate": 6.721360150577089e-07, + "loss": 0.0, + "num_input_tokens_seen": 3420464, + "step": 17960 + }, + { + "epoch": 9.337318087318087, + "grad_norm": 0.0021079687867313623, + "learning_rate": 6.669227005441953e-07, + "loss": 0.0, + "num_input_tokens_seen": 3421424, + "step": 17965 + }, + { + "epoch": 9.33991683991684, + "grad_norm": 2.9777716918033548e-05, + "learning_rate": 6.617294096050802e-07, + "loss": 0.0, + "num_input_tokens_seen": 3422384, + "step": 17970 + }, + { + "epoch": 9.342515592515593, + "grad_norm": 0.0002702171041164547, + "learning_rate": 6.565561465138953e-07, + "loss": 0.0, + "num_input_tokens_seen": 3423376, + "step": 17975 + }, + { + "epoch": 9.345114345114345, + "grad_norm": 0.000739042239729315, + "learning_rate": 6.514029155276962e-07, + "loss": 0.0449, + "num_input_tokens_seen": 3424304, + "step": 17980 + }, + { + "epoch": 9.347713097713097, + "grad_norm": 0.0010691623901948333, + "learning_rate": 6.46269720887055e-07, + "loss": 0.0, + "num_input_tokens_seen": 3425232, + "step": 17985 + }, + { + "epoch": 9.35031185031185, + "grad_norm": 0.06832297146320343, + "learning_rate": 6.411565668160507e-07, + "loss": 0.0004, + "num_input_tokens_seen": 3426160, + "step": 17990 + }, + { + "epoch": 9.352910602910603, + "grad_norm": 2.6580890335026197e-05, + "learning_rate": 6.360634575222762e-07, + "loss": 0.0, + "num_input_tokens_seen": 3427280, + "step": 17995 + }, + { + "epoch": 9.355509355509355, + "grad_norm": 0.0018544817576184869, + "learning_rate": 6.309903971968262e-07, + "loss": 0.0, + "num_input_tokens_seen": 3428176, + "step": 18000 + }, + { + "epoch": 9.358108108108109, + "grad_norm": 0.0009188380790874362, + "learning_rate": 6.259373900142945e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3429168, + "step": 18005 + }, + { + "epoch": 9.36070686070686, + "grad_norm": 2.8245769499335438e-05, + "learning_rate": 6.209044401327801e-07, + "loss": 0.0, + "num_input_tokens_seen": 3430160, + "step": 18010 + }, + { + "epoch": 9.363305613305613, + "grad_norm": 5.490072362590581e-05, + "learning_rate": 6.158915516938729e-07, + "loss": 0.0, + "num_input_tokens_seen": 3431120, + "step": 18015 + }, + { + "epoch": 9.365904365904367, + "grad_norm": 0.001259397598914802, + "learning_rate": 6.108987288226536e-07, + "loss": 0.0, + "num_input_tokens_seen": 3432016, + "step": 18020 + }, + { + "epoch": 9.368503118503119, + "grad_norm": 3.260379162384197e-05, + "learning_rate": 6.059259756276969e-07, + "loss": 0.1668, + "num_input_tokens_seen": 3432944, + "step": 18025 + }, + { + "epoch": 9.371101871101871, + "grad_norm": 0.003515507560223341, + "learning_rate": 6.009732962010544e-07, + "loss": 0.0008, + "num_input_tokens_seen": 3433904, + "step": 18030 + }, + { + "epoch": 9.373700623700623, + "grad_norm": 0.0002286206727148965, + "learning_rate": 5.960406946182634e-07, + "loss": 0.0, + "num_input_tokens_seen": 3434864, + "step": 18035 + }, + { + "epoch": 9.376299376299377, + "grad_norm": 0.002097859513014555, + "learning_rate": 5.91128174938338e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3435760, + "step": 18040 + }, + { + "epoch": 9.378898128898129, + "grad_norm": 5.4691568948328495e-05, + "learning_rate": 5.862357412037666e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3436656, + "step": 18045 + }, + { + "epoch": 9.381496881496881, + "grad_norm": 3.220764483558014e-05, + "learning_rate": 5.813633974405153e-07, + "loss": 0.0, + "num_input_tokens_seen": 3437616, + "step": 18050 + }, + { + "epoch": 9.384095634095633, + "grad_norm": 0.00022337533300742507, + "learning_rate": 5.765111476580043e-07, + "loss": 0.0, + "num_input_tokens_seen": 3438512, + "step": 18055 + }, + { + "epoch": 9.386694386694387, + "grad_norm": 2.840024171746336e-05, + "learning_rate": 5.716789958491342e-07, + "loss": 0.0, + "num_input_tokens_seen": 3439472, + "step": 18060 + }, + { + "epoch": 9.38929313929314, + "grad_norm": 0.004452369641512632, + "learning_rate": 5.668669459902576e-07, + "loss": 0.0, + "num_input_tokens_seen": 3440400, + "step": 18065 + }, + { + "epoch": 9.391891891891891, + "grad_norm": 0.0009123242343775928, + "learning_rate": 5.620750020411847e-07, + "loss": 0.0, + "num_input_tokens_seen": 3441328, + "step": 18070 + }, + { + "epoch": 9.394490644490645, + "grad_norm": 7.832537085050717e-05, + "learning_rate": 5.573031679451863e-07, + "loss": 0.0, + "num_input_tokens_seen": 3442256, + "step": 18075 + }, + { + "epoch": 9.397089397089397, + "grad_norm": 0.0001446118694730103, + "learning_rate": 5.525514476289823e-07, + "loss": 0.0, + "num_input_tokens_seen": 3443216, + "step": 18080 + }, + { + "epoch": 9.39968814968815, + "grad_norm": 0.001133911544457078, + "learning_rate": 5.478198450027422e-07, + "loss": 0.0, + "num_input_tokens_seen": 3444176, + "step": 18085 + }, + { + "epoch": 9.402286902286903, + "grad_norm": 0.012203383259475231, + "learning_rate": 5.431083639600737e-07, + "loss": 0.0, + "num_input_tokens_seen": 3445040, + "step": 18090 + }, + { + "epoch": 9.404885654885655, + "grad_norm": 0.0003145070804748684, + "learning_rate": 5.384170083780421e-07, + "loss": 0.0, + "num_input_tokens_seen": 3446000, + "step": 18095 + }, + { + "epoch": 9.407484407484407, + "grad_norm": 0.0006551517872139812, + "learning_rate": 5.337457821171316e-07, + "loss": 0.0005, + "num_input_tokens_seen": 3446928, + "step": 18100 + }, + { + "epoch": 9.41008316008316, + "grad_norm": 5.4340416681952775e-05, + "learning_rate": 5.290946890212756e-07, + "loss": 0.0, + "num_input_tokens_seen": 3447824, + "step": 18105 + }, + { + "epoch": 9.412681912681913, + "grad_norm": 0.000851136283017695, + "learning_rate": 5.244637329178403e-07, + "loss": 0.0, + "num_input_tokens_seen": 3448816, + "step": 18110 + }, + { + "epoch": 9.415280665280665, + "grad_norm": 0.0010104612447321415, + "learning_rate": 5.198529176176109e-07, + "loss": 0.0, + "num_input_tokens_seen": 3449808, + "step": 18115 + }, + { + "epoch": 9.417879417879417, + "grad_norm": 0.0019271825440227985, + "learning_rate": 5.152622469148133e-07, + "loss": 0.0, + "num_input_tokens_seen": 3450736, + "step": 18120 + }, + { + "epoch": 9.420478170478171, + "grad_norm": 0.006848897319287062, + "learning_rate": 5.10691724587084e-07, + "loss": 0.0005, + "num_input_tokens_seen": 3451664, + "step": 18125 + }, + { + "epoch": 9.423076923076923, + "grad_norm": 0.09393268078565598, + "learning_rate": 5.061413543954868e-07, + "loss": 0.0002, + "num_input_tokens_seen": 3452624, + "step": 18130 + }, + { + "epoch": 9.425675675675675, + "grad_norm": 0.00036140726297162473, + "learning_rate": 5.016111400844958e-07, + "loss": 0.0, + "num_input_tokens_seen": 3453552, + "step": 18135 + }, + { + "epoch": 9.428274428274428, + "grad_norm": 0.0004896890604868531, + "learning_rate": 4.971010853820069e-07, + "loss": 0.0, + "num_input_tokens_seen": 3454576, + "step": 18140 + }, + { + "epoch": 9.430873180873181, + "grad_norm": 0.00015185131633188576, + "learning_rate": 4.926111939993206e-07, + "loss": 0.0, + "num_input_tokens_seen": 3455504, + "step": 18145 + }, + { + "epoch": 9.433471933471933, + "grad_norm": 0.00020401868096087128, + "learning_rate": 4.881414696311482e-07, + "loss": 0.0, + "num_input_tokens_seen": 3456432, + "step": 18150 + }, + { + "epoch": 9.436070686070686, + "grad_norm": 3.0165014322847128e-05, + "learning_rate": 4.83691915955603e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3457424, + "step": 18155 + }, + { + "epoch": 9.43866943866944, + "grad_norm": 0.0005299855256453156, + "learning_rate": 4.792625366342062e-07, + "loss": 0.0013, + "num_input_tokens_seen": 3458352, + "step": 18160 + }, + { + "epoch": 9.441268191268192, + "grad_norm": 0.0004165049467701465, + "learning_rate": 4.7485333531187003e-07, + "loss": 0.0, + "num_input_tokens_seen": 3459312, + "step": 18165 + }, + { + "epoch": 9.443866943866944, + "grad_norm": 0.022996824234724045, + "learning_rate": 4.7046431561690307e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3460272, + "step": 18170 + }, + { + "epoch": 9.446465696465696, + "grad_norm": 4.40697003796231e-05, + "learning_rate": 4.6609548116101354e-07, + "loss": 0.0505, + "num_input_tokens_seen": 3461264, + "step": 18175 + }, + { + "epoch": 9.44906444906445, + "grad_norm": 0.0007250399794429541, + "learning_rate": 4.6174683553928954e-07, + "loss": 0.0, + "num_input_tokens_seen": 3462160, + "step": 18180 + }, + { + "epoch": 9.451663201663202, + "grad_norm": 0.0005581245059147477, + "learning_rate": 4.574183823302186e-07, + "loss": 0.0006, + "num_input_tokens_seen": 3463152, + "step": 18185 + }, + { + "epoch": 9.454261954261954, + "grad_norm": 0.0007513653836213052, + "learning_rate": 4.531101250956571e-07, + "loss": 0.0, + "num_input_tokens_seen": 3464112, + "step": 18190 + }, + { + "epoch": 9.456860706860708, + "grad_norm": 0.0003657809866126627, + "learning_rate": 4.4882206738085243e-07, + "loss": 0.0013, + "num_input_tokens_seen": 3465136, + "step": 18195 + }, + { + "epoch": 9.45945945945946, + "grad_norm": 0.0002839324006345123, + "learning_rate": 4.445542127144292e-07, + "loss": 0.0, + "num_input_tokens_seen": 3466128, + "step": 18200 + }, + { + "epoch": 9.462058212058212, + "grad_norm": 0.010498784482479095, + "learning_rate": 4.403065646083809e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3467152, + "step": 18205 + }, + { + "epoch": 9.464656964656964, + "grad_norm": 0.002678766380995512, + "learning_rate": 4.360791265580783e-07, + "loss": 0.0, + "num_input_tokens_seen": 3468080, + "step": 18210 + }, + { + "epoch": 9.467255717255718, + "grad_norm": 0.00032128821476362646, + "learning_rate": 4.318719020422607e-07, + "loss": 0.0, + "num_input_tokens_seen": 3468912, + "step": 18215 + }, + { + "epoch": 9.46985446985447, + "grad_norm": 0.00018762856780085713, + "learning_rate": 4.276848945230366e-07, + "loss": 0.0, + "num_input_tokens_seen": 3469776, + "step": 18220 + }, + { + "epoch": 9.472453222453222, + "grad_norm": 0.00031772066722624004, + "learning_rate": 4.235181074458694e-07, + "loss": 0.0, + "num_input_tokens_seen": 3470672, + "step": 18225 + }, + { + "epoch": 9.475051975051976, + "grad_norm": 0.0011666404316201806, + "learning_rate": 4.193715442395885e-07, + "loss": 0.0, + "num_input_tokens_seen": 3471632, + "step": 18230 + }, + { + "epoch": 9.477650727650728, + "grad_norm": 0.0004688883200287819, + "learning_rate": 4.152452083163866e-07, + "loss": 0.0, + "num_input_tokens_seen": 3472560, + "step": 18235 + }, + { + "epoch": 9.48024948024948, + "grad_norm": 0.000339907273883, + "learning_rate": 4.111391030718004e-07, + "loss": 0.0, + "num_input_tokens_seen": 3473456, + "step": 18240 + }, + { + "epoch": 9.482848232848234, + "grad_norm": 0.030727047473192215, + "learning_rate": 4.07053231884727e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3474352, + "step": 18245 + }, + { + "epoch": 9.485446985446986, + "grad_norm": 0.010007999837398529, + "learning_rate": 4.0298759811741026e-07, + "loss": 0.0, + "num_input_tokens_seen": 3475312, + "step": 18250 + }, + { + "epoch": 9.488045738045738, + "grad_norm": 0.00012161341146565974, + "learning_rate": 3.989422051154407e-07, + "loss": 0.0, + "num_input_tokens_seen": 3476272, + "step": 18255 + }, + { + "epoch": 9.49064449064449, + "grad_norm": 0.00039852026384323835, + "learning_rate": 3.949170562077553e-07, + "loss": 0.0, + "num_input_tokens_seen": 3477200, + "step": 18260 + }, + { + "epoch": 9.493243243243244, + "grad_norm": 0.013236475177109241, + "learning_rate": 3.909121547066297e-07, + "loss": 0.0, + "num_input_tokens_seen": 3478160, + "step": 18265 + }, + { + "epoch": 9.495841995841996, + "grad_norm": 2.8041034966008738e-05, + "learning_rate": 3.8692750390767196e-07, + "loss": 0.0, + "num_input_tokens_seen": 3479088, + "step": 18270 + }, + { + "epoch": 9.498440748440748, + "grad_norm": 4.8214002163149416e-05, + "learning_rate": 3.8296310708984264e-07, + "loss": 0.0, + "num_input_tokens_seen": 3480048, + "step": 18275 + }, + { + "epoch": 9.5, + "eval_loss": 0.43667086958885193, + "eval_runtime": 9.2918, + "eval_samples_per_second": 92.124, + "eval_steps_per_second": 23.031, + "num_input_tokens_seen": 3480592, + "step": 18278 + }, + { + "epoch": 9.5010395010395, + "grad_norm": 2.9020153306191787e-05, + "learning_rate": 3.7901896751541545e-07, + "loss": 0.0, + "num_input_tokens_seen": 3481008, + "step": 18280 + }, + { + "epoch": 9.503638253638254, + "grad_norm": 2.4284541723318398e-05, + "learning_rate": 3.750950884300108e-07, + "loss": 0.0, + "num_input_tokens_seen": 3481968, + "step": 18285 + }, + { + "epoch": 9.506237006237006, + "grad_norm": 0.04357615485787392, + "learning_rate": 3.71191473062571e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3482960, + "step": 18290 + }, + { + "epoch": 9.508835758835758, + "grad_norm": 3.564465441741049e-05, + "learning_rate": 3.6730812462535404e-07, + "loss": 0.0, + "num_input_tokens_seen": 3483888, + "step": 18295 + }, + { + "epoch": 9.511434511434512, + "grad_norm": 0.007082756143063307, + "learning_rate": 3.6344504631395934e-07, + "loss": 0.0, + "num_input_tokens_seen": 3484880, + "step": 18300 + }, + { + "epoch": 9.514033264033264, + "grad_norm": 2.641979153850116e-05, + "learning_rate": 3.5960224130728857e-07, + "loss": 0.0, + "num_input_tokens_seen": 3485808, + "step": 18305 + }, + { + "epoch": 9.516632016632016, + "grad_norm": 2.7854963263962418e-05, + "learning_rate": 3.5577971276757325e-07, + "loss": 0.0, + "num_input_tokens_seen": 3486736, + "step": 18310 + }, + { + "epoch": 9.51923076923077, + "grad_norm": 3.0860843253321946e-05, + "learning_rate": 3.519774638403472e-07, + "loss": 0.0, + "num_input_tokens_seen": 3487600, + "step": 18315 + }, + { + "epoch": 9.521829521829522, + "grad_norm": 0.0005101534188725054, + "learning_rate": 3.481954976544716e-07, + "loss": 0.0, + "num_input_tokens_seen": 3488528, + "step": 18320 + }, + { + "epoch": 9.524428274428274, + "grad_norm": 0.0008934738580137491, + "learning_rate": 3.44433817322104e-07, + "loss": 0.0, + "num_input_tokens_seen": 3489488, + "step": 18325 + }, + { + "epoch": 9.527027027027026, + "grad_norm": 3.373104118509218e-05, + "learning_rate": 3.406924259387101e-07, + "loss": 0.0, + "num_input_tokens_seen": 3490384, + "step": 18330 + }, + { + "epoch": 9.52962577962578, + "grad_norm": 0.0014216230483725667, + "learning_rate": 3.369713265830715e-07, + "loss": 0.0, + "num_input_tokens_seen": 3491344, + "step": 18335 + }, + { + "epoch": 9.532224532224532, + "grad_norm": 0.0016429834067821503, + "learning_rate": 3.3327052231725276e-07, + "loss": 0.0, + "num_input_tokens_seen": 3492336, + "step": 18340 + }, + { + "epoch": 9.534823284823284, + "grad_norm": 9.464219328947365e-05, + "learning_rate": 3.2959001618664e-07, + "loss": 0.0, + "num_input_tokens_seen": 3493296, + "step": 18345 + }, + { + "epoch": 9.537422037422038, + "grad_norm": 2.7225880330661312e-05, + "learning_rate": 3.2592981121989384e-07, + "loss": 0.0, + "num_input_tokens_seen": 3494256, + "step": 18350 + }, + { + "epoch": 9.54002079002079, + "grad_norm": 3.319626557640731e-05, + "learning_rate": 3.222899104289856e-07, + "loss": 0.0, + "num_input_tokens_seen": 3495248, + "step": 18355 + }, + { + "epoch": 9.542619542619542, + "grad_norm": 0.003044809913262725, + "learning_rate": 3.18670316809172e-07, + "loss": 0.0, + "num_input_tokens_seen": 3496176, + "step": 18360 + }, + { + "epoch": 9.545218295218294, + "grad_norm": 0.0005604037432931364, + "learning_rate": 3.150710333389983e-07, + "loss": 0.0, + "num_input_tokens_seen": 3497168, + "step": 18365 + }, + { + "epoch": 9.547817047817048, + "grad_norm": 0.00010497955372557044, + "learning_rate": 3.114920629802981e-07, + "loss": 0.0, + "num_input_tokens_seen": 3498128, + "step": 18370 + }, + { + "epoch": 9.5504158004158, + "grad_norm": 3.584557998692617e-05, + "learning_rate": 3.0793340867818763e-07, + "loss": 0.0, + "num_input_tokens_seen": 3499056, + "step": 18375 + }, + { + "epoch": 9.553014553014552, + "grad_norm": 0.0012122030602768064, + "learning_rate": 3.04395073361069e-07, + "loss": 0.0007, + "num_input_tokens_seen": 3500112, + "step": 18380 + }, + { + "epoch": 9.555613305613306, + "grad_norm": 0.0006245138356462121, + "learning_rate": 3.008770599406213e-07, + "loss": 0.0, + "num_input_tokens_seen": 3501072, + "step": 18385 + }, + { + "epoch": 9.558212058212058, + "grad_norm": 4.4205065933056176e-05, + "learning_rate": 2.973793713118039e-07, + "loss": 0.0, + "num_input_tokens_seen": 3502032, + "step": 18390 + }, + { + "epoch": 9.56081081081081, + "grad_norm": 0.00013946845137979835, + "learning_rate": 2.9390201035284226e-07, + "loss": 0.0, + "num_input_tokens_seen": 3503056, + "step": 18395 + }, + { + "epoch": 9.563409563409563, + "grad_norm": 0.0040742699056863785, + "learning_rate": 2.904449799252418e-07, + "loss": 0.0, + "num_input_tokens_seen": 3504048, + "step": 18400 + }, + { + "epoch": 9.566008316008316, + "grad_norm": 3.65720406989567e-05, + "learning_rate": 2.870082828737797e-07, + "loss": 0.0006, + "num_input_tokens_seen": 3505008, + "step": 18405 + }, + { + "epoch": 9.568607068607069, + "grad_norm": 0.00014116396778263152, + "learning_rate": 2.8359192202649376e-07, + "loss": 0.0, + "num_input_tokens_seen": 3505936, + "step": 18410 + }, + { + "epoch": 9.57120582120582, + "grad_norm": 0.00022093355073593557, + "learning_rate": 2.8019590019469633e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3506928, + "step": 18415 + }, + { + "epoch": 9.573804573804575, + "grad_norm": 0.0007938218768686056, + "learning_rate": 2.7682022017295197e-07, + "loss": 0.0, + "num_input_tokens_seen": 3507920, + "step": 18420 + }, + { + "epoch": 9.576403326403327, + "grad_norm": 0.001327398233115673, + "learning_rate": 2.734648847390997e-07, + "loss": 0.0046, + "num_input_tokens_seen": 3508848, + "step": 18425 + }, + { + "epoch": 9.579002079002079, + "grad_norm": 2.5258026123046875, + "learning_rate": 2.7012989665421706e-07, + "loss": 0.0037, + "num_input_tokens_seen": 3509776, + "step": 18430 + }, + { + "epoch": 9.58160083160083, + "grad_norm": 0.06869048625230789, + "learning_rate": 2.6681525866266157e-07, + "loss": 0.0, + "num_input_tokens_seen": 3510704, + "step": 18435 + }, + { + "epoch": 9.584199584199585, + "grad_norm": 0.003816426731646061, + "learning_rate": 2.635209734920291e-07, + "loss": 0.0, + "num_input_tokens_seen": 3511632, + "step": 18440 + }, + { + "epoch": 9.586798336798337, + "grad_norm": 8.736757445149124e-05, + "learning_rate": 2.602470438531679e-07, + "loss": 0.0, + "num_input_tokens_seen": 3512496, + "step": 18445 + }, + { + "epoch": 9.589397089397089, + "grad_norm": 0.0003165447851642966, + "learning_rate": 2.5699347244018404e-07, + "loss": 0.0, + "num_input_tokens_seen": 3513424, + "step": 18450 + }, + { + "epoch": 9.591995841995843, + "grad_norm": 0.0009925984777510166, + "learning_rate": 2.537602619304247e-07, + "loss": 0.0, + "num_input_tokens_seen": 3514352, + "step": 18455 + }, + { + "epoch": 9.594594594594595, + "grad_norm": 0.00015998008893802762, + "learning_rate": 2.5054741498448386e-07, + "loss": 0.0, + "num_input_tokens_seen": 3515344, + "step": 18460 + }, + { + "epoch": 9.597193347193347, + "grad_norm": 2.7087773560197093e-05, + "learning_rate": 2.4735493424619394e-07, + "loss": 0.0, + "num_input_tokens_seen": 3516240, + "step": 18465 + }, + { + "epoch": 9.5997920997921, + "grad_norm": 8.897163934307173e-05, + "learning_rate": 2.4418282234263957e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3517232, + "step": 18470 + }, + { + "epoch": 9.602390852390853, + "grad_norm": 3.012627894349862e-05, + "learning_rate": 2.410310818841299e-07, + "loss": 0.0, + "num_input_tokens_seen": 3518032, + "step": 18475 + }, + { + "epoch": 9.604989604989605, + "grad_norm": 2.888972520828247, + "learning_rate": 2.3789971546422374e-07, + "loss": 0.0293, + "num_input_tokens_seen": 3518992, + "step": 18480 + }, + { + "epoch": 9.607588357588357, + "grad_norm": 3.0387800507014617e-05, + "learning_rate": 2.3478872565969867e-07, + "loss": 0.0, + "num_input_tokens_seen": 3519984, + "step": 18485 + }, + { + "epoch": 9.61018711018711, + "grad_norm": 2.992710142279975e-05, + "learning_rate": 2.316981150305847e-07, + "loss": 0.0, + "num_input_tokens_seen": 3520880, + "step": 18490 + }, + { + "epoch": 9.612785862785863, + "grad_norm": 0.0032379853073507547, + "learning_rate": 2.2862788612012244e-07, + "loss": 0.0, + "num_input_tokens_seen": 3521744, + "step": 18495 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 0.0033854912035167217, + "learning_rate": 2.255780414547909e-07, + "loss": 0.0, + "num_input_tokens_seen": 3522768, + "step": 18500 + }, + { + "epoch": 9.617983367983367, + "grad_norm": 4.019308107672259e-05, + "learning_rate": 2.2254858354429364e-07, + "loss": 0.0023, + "num_input_tokens_seen": 3523760, + "step": 18505 + }, + { + "epoch": 9.620582120582121, + "grad_norm": 2.6105453798663802e-05, + "learning_rate": 2.19539514881556e-07, + "loss": 0.0, + "num_input_tokens_seen": 3524720, + "step": 18510 + }, + { + "epoch": 9.623180873180873, + "grad_norm": 0.004951151553541422, + "learning_rate": 2.165508379427278e-07, + "loss": 0.0, + "num_input_tokens_seen": 3525712, + "step": 18515 + }, + { + "epoch": 9.625779625779625, + "grad_norm": 0.0003067716024816036, + "learning_rate": 2.1358255518717786e-07, + "loss": 0.0, + "num_input_tokens_seen": 3526576, + "step": 18520 + }, + { + "epoch": 9.628378378378379, + "grad_norm": 0.00014618309796787798, + "learning_rate": 2.106346690574912e-07, + "loss": 0.0, + "num_input_tokens_seen": 3527504, + "step": 18525 + }, + { + "epoch": 9.630977130977131, + "grad_norm": 0.00011081875709351152, + "learning_rate": 2.0770718197946625e-07, + "loss": 0.0, + "num_input_tokens_seen": 3528528, + "step": 18530 + }, + { + "epoch": 9.633575883575883, + "grad_norm": 9.032480738824233e-05, + "learning_rate": 2.0480009636212327e-07, + "loss": 0.0, + "num_input_tokens_seen": 3529424, + "step": 18535 + }, + { + "epoch": 9.636174636174637, + "grad_norm": 0.12627112865447998, + "learning_rate": 2.0191341459768475e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3530416, + "step": 18540 + }, + { + "epoch": 9.638773388773389, + "grad_norm": 2.7514783141668886e-05, + "learning_rate": 1.9904713906159224e-07, + "loss": 0.0, + "num_input_tokens_seen": 3531376, + "step": 18545 + }, + { + "epoch": 9.641372141372141, + "grad_norm": 7.255005039041862e-05, + "learning_rate": 1.9620127211248672e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3532272, + "step": 18550 + }, + { + "epoch": 9.643970893970893, + "grad_norm": 0.00013435316213872284, + "learning_rate": 1.9337581609222277e-07, + "loss": 0.0, + "num_input_tokens_seen": 3533264, + "step": 18555 + }, + { + "epoch": 9.646569646569647, + "grad_norm": 0.0019600673113018274, + "learning_rate": 1.9057077332584883e-07, + "loss": 0.0, + "num_input_tokens_seen": 3534160, + "step": 18560 + }, + { + "epoch": 9.6491683991684, + "grad_norm": 0.0006094026030041277, + "learning_rate": 1.8778614612162404e-07, + "loss": 0.0, + "num_input_tokens_seen": 3535184, + "step": 18565 + }, + { + "epoch": 9.651767151767151, + "grad_norm": 0.00010799027950270101, + "learning_rate": 1.850219367710071e-07, + "loss": 0.0, + "num_input_tokens_seen": 3536144, + "step": 18570 + }, + { + "epoch": 9.654365904365905, + "grad_norm": 2.722335557336919e-05, + "learning_rate": 1.8227814754865068e-07, + "loss": 0.0, + "num_input_tokens_seen": 3537104, + "step": 18575 + }, + { + "epoch": 9.656964656964657, + "grad_norm": 0.0005342878866940737, + "learning_rate": 1.7955478071240706e-07, + "loss": 0.0, + "num_input_tokens_seen": 3538128, + "step": 18580 + }, + { + "epoch": 9.65956340956341, + "grad_norm": 0.0025687762536108494, + "learning_rate": 1.7685183850331965e-07, + "loss": 0.0, + "num_input_tokens_seen": 3539056, + "step": 18585 + }, + { + "epoch": 9.662162162162161, + "grad_norm": 0.0011363346129655838, + "learning_rate": 1.7416932314562872e-07, + "loss": 0.0, + "num_input_tokens_seen": 3539984, + "step": 18590 + }, + { + "epoch": 9.664760914760915, + "grad_norm": 3.651330916909501e-05, + "learning_rate": 1.7150723684676572e-07, + "loss": 0.0002, + "num_input_tokens_seen": 3540880, + "step": 18595 + }, + { + "epoch": 9.667359667359667, + "grad_norm": 2.8878459488623776e-05, + "learning_rate": 1.6886558179734225e-07, + "loss": 0.0, + "num_input_tokens_seen": 3541840, + "step": 18600 + }, + { + "epoch": 9.66995841995842, + "grad_norm": 0.00024087722704280168, + "learning_rate": 1.662443601711694e-07, + "loss": 0.0, + "num_input_tokens_seen": 3542768, + "step": 18605 + }, + { + "epoch": 9.672557172557173, + "grad_norm": 0.002416686387732625, + "learning_rate": 1.6364357412523845e-07, + "loss": 0.0, + "num_input_tokens_seen": 3543696, + "step": 18610 + }, + { + "epoch": 9.675155925155925, + "grad_norm": 0.0006469325162470341, + "learning_rate": 1.6106322579972077e-07, + "loss": 0.0003, + "num_input_tokens_seen": 3544688, + "step": 18615 + }, + { + "epoch": 9.677754677754677, + "grad_norm": 0.001557886484079063, + "learning_rate": 1.585033173179734e-07, + "loss": 0.0, + "num_input_tokens_seen": 3545616, + "step": 18620 + }, + { + "epoch": 9.68035343035343, + "grad_norm": 0.00040730973705649376, + "learning_rate": 1.5596385078653353e-07, + "loss": 0.0042, + "num_input_tokens_seen": 3546576, + "step": 18625 + }, + { + "epoch": 9.682952182952183, + "grad_norm": 0.0006206760299392045, + "learning_rate": 1.5344482829511842e-07, + "loss": 0.0, + "num_input_tokens_seen": 3547440, + "step": 18630 + }, + { + "epoch": 9.685550935550935, + "grad_norm": 0.00018380992696620524, + "learning_rate": 1.5094625191661715e-07, + "loss": 0.0, + "num_input_tokens_seen": 3548400, + "step": 18635 + }, + { + "epoch": 9.688149688149688, + "grad_norm": 3.0558425351046026e-05, + "learning_rate": 1.4846812370709617e-07, + "loss": 0.0, + "num_input_tokens_seen": 3549392, + "step": 18640 + }, + { + "epoch": 9.690748440748441, + "grad_norm": 0.0024298178032040596, + "learning_rate": 1.4601044570579647e-07, + "loss": 0.0001, + "num_input_tokens_seen": 3550416, + "step": 18645 + }, + { + "epoch": 9.693347193347194, + "grad_norm": 2.6055617126985453e-05, + "learning_rate": 1.4357321993513084e-07, + "loss": 0.0, + "num_input_tokens_seen": 3551408, + "step": 18650 + }, + { + "epoch": 9.695945945945946, + "grad_norm": 0.0003365107986610383, + "learning_rate": 1.4115644840067833e-07, + "loss": 0.0, + "num_input_tokens_seen": 3552304, + "step": 18655 + }, + { + "epoch": 9.698544698544698, + "grad_norm": 0.0025624874979257584, + "learning_rate": 1.3876013309118697e-07, + "loss": 0.0, + "num_input_tokens_seen": 3553296, + "step": 18660 + }, + { + "epoch": 9.701143451143452, + "grad_norm": 0.00048816337948665023, + "learning_rate": 1.363842759785794e-07, + "loss": 0.0, + "num_input_tokens_seen": 3554320, + "step": 18665 + }, + { + "epoch": 9.703742203742204, + "grad_norm": 2.828481956385076e-05, + "learning_rate": 1.3402887901793338e-07, + "loss": 0.0, + "num_input_tokens_seen": 3555344, + "step": 18670 + }, + { + "epoch": 9.706340956340956, + "grad_norm": 0.00028078825562261045, + "learning_rate": 1.316939441474957e-07, + "loss": 0.0003, + "num_input_tokens_seen": 3556368, + "step": 18675 + }, + { + "epoch": 9.70893970893971, + "grad_norm": 0.00021840145927853882, + "learning_rate": 1.2937947328867106e-07, + "loss": 0.0, + "num_input_tokens_seen": 3557360, + "step": 18680 + }, + { + "epoch": 9.711538461538462, + "grad_norm": 0.00020729194511659443, + "learning_rate": 1.270854683460304e-07, + "loss": 0.0, + "num_input_tokens_seen": 3558352, + "step": 18685 + }, + { + "epoch": 9.714137214137214, + "grad_norm": 0.000319237558869645, + "learning_rate": 1.2481193120729427e-07, + "loss": 0.0, + "num_input_tokens_seen": 3559376, + "step": 18690 + }, + { + "epoch": 9.716735966735968, + "grad_norm": 0.0001711398654151708, + "learning_rate": 1.2255886374334946e-07, + "loss": 0.0, + "num_input_tokens_seen": 3560336, + "step": 18695 + }, + { + "epoch": 9.71933471933472, + "grad_norm": 0.0026176145765930414, + "learning_rate": 1.203262678082323e-07, + "loss": 0.0, + "num_input_tokens_seen": 3561232, + "step": 18700 + }, + { + "epoch": 9.721933471933472, + "grad_norm": 0.005601261276751757, + "learning_rate": 1.1811414523913711e-07, + "loss": 0.0, + "num_input_tokens_seen": 3562192, + "step": 18705 + }, + { + "epoch": 9.724532224532224, + "grad_norm": 3.0855826480546966e-05, + "learning_rate": 1.1592249785641052e-07, + "loss": 0.0, + "num_input_tokens_seen": 3563120, + "step": 18710 + }, + { + "epoch": 9.727130977130978, + "grad_norm": 0.00219647865742445, + "learning_rate": 1.1375132746354322e-07, + "loss": 0.0, + "num_input_tokens_seen": 3564080, + "step": 18715 + }, + { + "epoch": 9.72972972972973, + "grad_norm": 0.0017456233035773039, + "learning_rate": 1.1160063584718661e-07, + "loss": 0.0, + "num_input_tokens_seen": 3565040, + "step": 18720 + }, + { + "epoch": 9.732328482328482, + "grad_norm": 0.017462264746427536, + "learning_rate": 1.0947042477713332e-07, + "loss": 0.0, + "num_input_tokens_seen": 3565968, + "step": 18725 + }, + { + "epoch": 9.734927234927234, + "grad_norm": 0.000318835984217003, + "learning_rate": 1.0736069600632281e-07, + "loss": 0.0, + "num_input_tokens_seen": 3566864, + "step": 18730 + }, + { + "epoch": 9.737525987525988, + "grad_norm": 2.7955420591752045e-05, + "learning_rate": 1.0527145127084136e-07, + "loss": 0.0, + "num_input_tokens_seen": 3567664, + "step": 18735 + }, + { + "epoch": 9.74012474012474, + "grad_norm": 3.636151450336911e-05, + "learning_rate": 1.032026922899193e-07, + "loss": 0.0, + "num_input_tokens_seen": 3568592, + "step": 18740 + }, + { + "epoch": 9.742723492723492, + "grad_norm": 0.00239665643312037, + "learning_rate": 1.0115442076592541e-07, + "loss": 0.0, + "num_input_tokens_seen": 3569520, + "step": 18745 + }, + { + "epoch": 9.745322245322246, + "grad_norm": 0.00038496681372635067, + "learning_rate": 9.912663838437808e-08, + "loss": 0.0, + "num_input_tokens_seen": 3570480, + "step": 18750 + }, + { + "epoch": 9.747920997920998, + "grad_norm": 0.0003270990855526179, + "learning_rate": 9.711934681392587e-08, + "loss": 0.0, + "num_input_tokens_seen": 3571408, + "step": 18755 + }, + { + "epoch": 9.75051975051975, + "grad_norm": 0.0013020826736465096, + "learning_rate": 9.513254770636137e-08, + "loss": 0.0, + "num_input_tokens_seen": 3572336, + "step": 18760 + }, + { + "epoch": 9.753118503118504, + "grad_norm": 0.0005459475796669722, + "learning_rate": 9.31662426966129e-08, + "loss": 0.0, + "num_input_tokens_seen": 3573200, + "step": 18765 + }, + { + "epoch": 9.755717255717256, + "grad_norm": 0.00031554209999740124, + "learning_rate": 9.122043340273889e-08, + "loss": 0.0, + "num_input_tokens_seen": 3574160, + "step": 18770 + }, + { + "epoch": 9.758316008316008, + "grad_norm": 0.00042455774382688105, + "learning_rate": 8.929512142594187e-08, + "loss": 0.0, + "num_input_tokens_seen": 3575120, + "step": 18775 + }, + { + "epoch": 9.76091476091476, + "grad_norm": 0.0002985481114592403, + "learning_rate": 8.739030835055173e-08, + "loss": 0.0, + "num_input_tokens_seen": 3576016, + "step": 18780 + }, + { + "epoch": 9.763513513513514, + "grad_norm": 2.8314207156654447e-05, + "learning_rate": 8.550599574402574e-08, + "loss": 0.0, + "num_input_tokens_seen": 3577008, + "step": 18785 + }, + { + "epoch": 9.766112266112266, + "grad_norm": 2.8591857699211687e-05, + "learning_rate": 8.364218515695965e-08, + "loss": 0.0, + "num_input_tokens_seen": 3577968, + "step": 18790 + }, + { + "epoch": 9.768711018711018, + "grad_norm": 0.0027335600461810827, + "learning_rate": 8.179887812307386e-08, + "loss": 0.0, + "num_input_tokens_seen": 3578864, + "step": 18795 + }, + { + "epoch": 9.771309771309772, + "grad_norm": 0.8295284509658813, + "learning_rate": 7.99760761592161e-08, + "loss": 0.0032, + "num_input_tokens_seen": 3579824, + "step": 18800 + }, + { + "epoch": 9.773908523908524, + "grad_norm": 0.00042477462557144463, + "learning_rate": 7.817378076536153e-08, + "loss": 0.0001, + "num_input_tokens_seen": 3580816, + "step": 18805 + }, + { + "epoch": 9.776507276507276, + "grad_norm": 0.0007316862465813756, + "learning_rate": 7.63919934246099e-08, + "loss": 0.0, + "num_input_tokens_seen": 3581776, + "step": 18810 + }, + { + "epoch": 9.779106029106028, + "grad_norm": 9.360715193906799e-05, + "learning_rate": 7.463071560318835e-08, + "loss": 0.0313, + "num_input_tokens_seen": 3582736, + "step": 18815 + }, + { + "epoch": 9.781704781704782, + "grad_norm": 0.00025467126397415996, + "learning_rate": 7.288994875044308e-08, + "loss": 0.0, + "num_input_tokens_seen": 3583632, + "step": 18820 + }, + { + "epoch": 9.784303534303534, + "grad_norm": 0.0010216933442279696, + "learning_rate": 7.116969429883935e-08, + "loss": 0.0, + "num_input_tokens_seen": 3584656, + "step": 18825 + }, + { + "epoch": 9.786902286902286, + "grad_norm": 0.0004308489151299, + "learning_rate": 6.946995366397257e-08, + "loss": 0.0, + "num_input_tokens_seen": 3585584, + "step": 18830 + }, + { + "epoch": 9.78950103950104, + "grad_norm": 3.032435051864013e-05, + "learning_rate": 6.779072824454614e-08, + "loss": 0.0, + "num_input_tokens_seen": 3586512, + "step": 18835 + }, + { + "epoch": 9.792099792099792, + "grad_norm": 0.006523940712213516, + "learning_rate": 6.6132019422388e-08, + "loss": 0.0, + "num_input_tokens_seen": 3587440, + "step": 18840 + }, + { + "epoch": 9.794698544698544, + "grad_norm": 0.0005824169493280351, + "learning_rate": 6.449382856244246e-08, + "loss": 0.0, + "num_input_tokens_seen": 3588368, + "step": 18845 + }, + { + "epoch": 9.797297297297296, + "grad_norm": 0.00028836846468038857, + "learning_rate": 6.287615701277005e-08, + "loss": 0.0, + "num_input_tokens_seen": 3589328, + "step": 18850 + }, + { + "epoch": 9.79989604989605, + "grad_norm": 0.000334021111484617, + "learning_rate": 6.127900610454207e-08, + "loss": 0.0, + "num_input_tokens_seen": 3590320, + "step": 18855 + }, + { + "epoch": 9.802494802494802, + "grad_norm": 0.0002808893914334476, + "learning_rate": 5.970237715204885e-08, + "loss": 0.0, + "num_input_tokens_seen": 3591280, + "step": 18860 + }, + { + "epoch": 9.805093555093555, + "grad_norm": 2.8591841328307055e-05, + "learning_rate": 5.814627145269147e-08, + "loss": 0.0, + "num_input_tokens_seen": 3592272, + "step": 18865 + }, + { + "epoch": 9.807692307692308, + "grad_norm": 0.0005848908913321793, + "learning_rate": 5.661069028697896e-08, + "loss": 0.0, + "num_input_tokens_seen": 3593136, + "step": 18870 + }, + { + "epoch": 9.81029106029106, + "grad_norm": 0.004378908313810825, + "learning_rate": 5.509563491853942e-08, + "loss": 0.0, + "num_input_tokens_seen": 3594096, + "step": 18875 + }, + { + "epoch": 9.812889812889813, + "grad_norm": 2.6667117708711885e-05, + "learning_rate": 5.3601106594097784e-08, + "loss": 0.0001, + "num_input_tokens_seen": 3595056, + "step": 18880 + }, + { + "epoch": 9.815488565488565, + "grad_norm": 0.001440273248590529, + "learning_rate": 5.2127106543498063e-08, + "loss": 0.0001, + "num_input_tokens_seen": 3595984, + "step": 18885 + }, + { + "epoch": 9.818087318087318, + "grad_norm": 0.00018748006550595164, + "learning_rate": 5.0673635979686665e-08, + "loss": 0.0, + "num_input_tokens_seen": 3596880, + "step": 18890 + }, + { + "epoch": 9.82068607068607, + "grad_norm": 0.0021661315113306046, + "learning_rate": 4.924069609872073e-08, + "loss": 0.0, + "num_input_tokens_seen": 3597808, + "step": 18895 + }, + { + "epoch": 9.823284823284823, + "grad_norm": 0.0015038087731227279, + "learning_rate": 4.7828288079757035e-08, + "loss": 0.0, + "num_input_tokens_seen": 3598768, + "step": 18900 + }, + { + "epoch": 9.825883575883577, + "grad_norm": 0.006135039497166872, + "learning_rate": 4.643641308505753e-08, + "loss": 0.0, + "num_input_tokens_seen": 3599664, + "step": 18905 + }, + { + "epoch": 9.828482328482329, + "grad_norm": 0.00015737657668069005, + "learning_rate": 4.50650722599949e-08, + "loss": 0.0, + "num_input_tokens_seen": 3600592, + "step": 18910 + }, + { + "epoch": 9.83108108108108, + "grad_norm": 0.0003454650577623397, + "learning_rate": 4.3714266733035914e-08, + "loss": 0.0, + "num_input_tokens_seen": 3601552, + "step": 18915 + }, + { + "epoch": 9.833679833679835, + "grad_norm": 5.608478750218637e-05, + "learning_rate": 4.238399761574974e-08, + "loss": 0.0, + "num_input_tokens_seen": 3602512, + "step": 18920 + }, + { + "epoch": 9.836278586278587, + "grad_norm": 0.0006060526357032359, + "learning_rate": 4.10742660028135e-08, + "loss": 0.0, + "num_input_tokens_seen": 3603472, + "step": 18925 + }, + { + "epoch": 9.838877338877339, + "grad_norm": 0.00012992887059226632, + "learning_rate": 3.978507297199285e-08, + "loss": 0.0, + "num_input_tokens_seen": 3604400, + "step": 18930 + }, + { + "epoch": 9.84147609147609, + "grad_norm": 0.007745292503386736, + "learning_rate": 3.851641958416696e-08, + "loss": 0.0, + "num_input_tokens_seen": 3605360, + "step": 18935 + }, + { + "epoch": 9.844074844074845, + "grad_norm": 0.00042907620081678033, + "learning_rate": 3.7268306883297966e-08, + "loss": 0.014, + "num_input_tokens_seen": 3606384, + "step": 18940 + }, + { + "epoch": 9.846673596673597, + "grad_norm": 0.000165243458468467, + "learning_rate": 3.604073589645596e-08, + "loss": 0.0, + "num_input_tokens_seen": 3607312, + "step": 18945 + }, + { + "epoch": 9.849272349272349, + "grad_norm": 0.001676070154644549, + "learning_rate": 3.4833707633799565e-08, + "loss": 0.0001, + "num_input_tokens_seen": 3608144, + "step": 18950 + }, + { + "epoch": 9.851871101871101, + "grad_norm": 0.0007295624818652868, + "learning_rate": 3.3647223088589805e-08, + "loss": 0.0, + "num_input_tokens_seen": 3609104, + "step": 18955 + }, + { + "epoch": 9.854469854469855, + "grad_norm": 0.0014708656817674637, + "learning_rate": 3.248128323717625e-08, + "loss": 0.0, + "num_input_tokens_seen": 3610096, + "step": 18960 + }, + { + "epoch": 9.857068607068607, + "grad_norm": 2.634536758705508e-05, + "learning_rate": 3.133588903900808e-08, + "loss": 0.0, + "num_input_tokens_seen": 3611056, + "step": 18965 + }, + { + "epoch": 9.859667359667359, + "grad_norm": 0.00023158820113167167, + "learning_rate": 3.021104143662301e-08, + "loss": 0.0, + "num_input_tokens_seen": 3612016, + "step": 18970 + }, + { + "epoch": 9.862266112266113, + "grad_norm": 0.00023250553931575269, + "learning_rate": 2.910674135565561e-08, + "loss": 0.0, + "num_input_tokens_seen": 3612944, + "step": 18975 + }, + { + "epoch": 9.864864864864865, + "grad_norm": 0.00242093694396317, + "learning_rate": 2.8022989704826196e-08, + "loss": 0.0, + "num_input_tokens_seen": 3613936, + "step": 18980 + }, + { + "epoch": 9.867463617463617, + "grad_norm": 0.0006561155314557254, + "learning_rate": 2.6959787375949174e-08, + "loss": 0.0, + "num_input_tokens_seen": 3614832, + "step": 18985 + }, + { + "epoch": 9.87006237006237, + "grad_norm": 0.0002898171078413725, + "learning_rate": 2.5917135243930245e-08, + "loss": 0.0, + "num_input_tokens_seen": 3615824, + "step": 18990 + }, + { + "epoch": 9.872661122661123, + "grad_norm": 0.004703233949840069, + "learning_rate": 2.4895034166760865e-08, + "loss": 0.0, + "num_input_tokens_seen": 3616720, + "step": 18995 + }, + { + "epoch": 9.875259875259875, + "grad_norm": 0.0003510844544507563, + "learning_rate": 2.389348498552657e-08, + "loss": 0.0, + "num_input_tokens_seen": 3617744, + "step": 19000 + }, + { + "epoch": 9.877858627858627, + "grad_norm": 0.00027505363686941564, + "learning_rate": 2.2912488524393095e-08, + "loss": 0.0, + "num_input_tokens_seen": 3618736, + "step": 19005 + }, + { + "epoch": 9.880457380457381, + "grad_norm": 0.0005130342324264348, + "learning_rate": 2.1952045590620253e-08, + "loss": 0.0, + "num_input_tokens_seen": 3619664, + "step": 19010 + }, + { + "epoch": 9.883056133056133, + "grad_norm": 0.0006426223553717136, + "learning_rate": 2.101215697455361e-08, + "loss": 0.0, + "num_input_tokens_seen": 3620656, + "step": 19015 + }, + { + "epoch": 9.885654885654885, + "grad_norm": 0.000506172829773277, + "learning_rate": 2.0092823449618935e-08, + "loss": 0.0, + "num_input_tokens_seen": 3621648, + "step": 19020 + }, + { + "epoch": 9.888253638253639, + "grad_norm": 0.0004729668435174972, + "learning_rate": 1.9194045772336077e-08, + "loss": 0.061, + "num_input_tokens_seen": 3622576, + "step": 19025 + }, + { + "epoch": 9.890852390852391, + "grad_norm": 0.12708772718906403, + "learning_rate": 1.831582468229953e-08, + "loss": 0.0002, + "num_input_tokens_seen": 3623536, + "step": 19030 + }, + { + "epoch": 9.893451143451143, + "grad_norm": 8.848586003296077e-05, + "learning_rate": 1.7458160902197872e-08, + "loss": 0.0, + "num_input_tokens_seen": 3624592, + "step": 19035 + }, + { + "epoch": 9.896049896049895, + "grad_norm": 2.3501106625190005e-05, + "learning_rate": 1.6621055137797105e-08, + "loss": 0.0, + "num_input_tokens_seen": 3625520, + "step": 19040 + }, + { + "epoch": 9.89864864864865, + "grad_norm": 2.9143055144231766e-05, + "learning_rate": 1.5804508077946202e-08, + "loss": 0.0, + "num_input_tokens_seen": 3626480, + "step": 19045 + }, + { + "epoch": 9.901247401247401, + "grad_norm": 0.0002192132087657228, + "learning_rate": 1.500852039458267e-08, + "loss": 0.0, + "num_input_tokens_seen": 3627408, + "step": 19050 + }, + { + "epoch": 9.903846153846153, + "grad_norm": 2.7837249945150688e-05, + "learning_rate": 1.4233092742713116e-08, + "loss": 0.0478, + "num_input_tokens_seen": 3628336, + "step": 19055 + }, + { + "epoch": 9.906444906444907, + "grad_norm": 0.004456010181456804, + "learning_rate": 1.3478225760441e-08, + "loss": 0.0, + "num_input_tokens_seen": 3629200, + "step": 19060 + }, + { + "epoch": 9.90904365904366, + "grad_norm": 0.0010271648643538356, + "learning_rate": 1.2743920068938874e-08, + "loss": 0.0, + "num_input_tokens_seen": 3630192, + "step": 19065 + }, + { + "epoch": 9.911642411642411, + "grad_norm": 0.0004980653175152838, + "learning_rate": 1.203017627246228e-08, + "loss": 0.0, + "num_input_tokens_seen": 3631248, + "step": 19070 + }, + { + "epoch": 9.914241164241163, + "grad_norm": 2.9990156690473668e-05, + "learning_rate": 1.1336994958349723e-08, + "loss": 0.0, + "num_input_tokens_seen": 3632176, + "step": 19075 + }, + { + "epoch": 9.916839916839917, + "grad_norm": 7.283723243745044e-05, + "learning_rate": 1.0664376697017142e-08, + "loss": 0.0, + "num_input_tokens_seen": 3633136, + "step": 19080 + }, + { + "epoch": 9.91943866943867, + "grad_norm": 9.9582874099724e-05, + "learning_rate": 1.0012322041960676e-08, + "loss": 0.0, + "num_input_tokens_seen": 3634032, + "step": 19085 + }, + { + "epoch": 9.922037422037421, + "grad_norm": 0.0001848822139436379, + "learning_rate": 9.38083152974556e-09, + "loss": 0.0, + "num_input_tokens_seen": 3634992, + "step": 19090 + }, + { + "epoch": 9.924636174636175, + "grad_norm": 7.257855759235099e-05, + "learning_rate": 8.76990568003111e-09, + "loss": 0.0, + "num_input_tokens_seen": 3635920, + "step": 19095 + }, + { + "epoch": 9.927234927234927, + "grad_norm": 1.307535171508789, + "learning_rate": 8.17954499554019e-09, + "loss": 0.0087, + "num_input_tokens_seen": 3636816, + "step": 19100 + }, + { + "epoch": 9.92983367983368, + "grad_norm": 0.0005698160384781659, + "learning_rate": 7.609749962081413e-09, + "loss": 0.0, + "num_input_tokens_seen": 3637744, + "step": 19105 + }, + { + "epoch": 9.932432432432432, + "grad_norm": 0.0002383155660936609, + "learning_rate": 7.060521048532498e-09, + "loss": 0.0, + "num_input_tokens_seen": 3638640, + "step": 19110 + }, + { + "epoch": 9.935031185031185, + "grad_norm": 0.0005676773143932223, + "learning_rate": 6.5318587068541325e-09, + "loss": 0.0, + "num_input_tokens_seen": 3639632, + "step": 19115 + }, + { + "epoch": 9.937629937629938, + "grad_norm": 3.30390794260893e-05, + "learning_rate": 6.023763372076108e-09, + "loss": 0.0002, + "num_input_tokens_seen": 3640592, + "step": 19120 + }, + { + "epoch": 9.94022869022869, + "grad_norm": 0.00016956344188656658, + "learning_rate": 5.536235462313965e-09, + "loss": 0.0, + "num_input_tokens_seen": 3641520, + "step": 19125 + }, + { + "epoch": 9.942827442827443, + "grad_norm": 3.032637687283568e-05, + "learning_rate": 5.069275378746796e-09, + "loss": 0.0, + "num_input_tokens_seen": 3642448, + "step": 19130 + }, + { + "epoch": 9.945426195426196, + "grad_norm": 0.3908615708351135, + "learning_rate": 4.622883505636666e-09, + "loss": 0.0006, + "num_input_tokens_seen": 3643312, + "step": 19135 + }, + { + "epoch": 9.948024948024948, + "grad_norm": 7.947752601467073e-05, + "learning_rate": 4.197060210317516e-09, + "loss": 0.0, + "num_input_tokens_seen": 3644208, + "step": 19140 + }, + { + "epoch": 9.950623700623701, + "grad_norm": 2.7132527975481935e-05, + "learning_rate": 3.791805843195162e-09, + "loss": 0.0, + "num_input_tokens_seen": 3645136, + "step": 19145 + }, + { + "epoch": 9.953222453222454, + "grad_norm": 0.0008503007120452821, + "learning_rate": 3.4071207377500693e-09, + "loss": 0.0, + "num_input_tokens_seen": 3646128, + "step": 19150 + }, + { + "epoch": 9.955821205821206, + "grad_norm": 3.440360160311684e-05, + "learning_rate": 3.043005210542904e-09, + "loss": 0.0, + "num_input_tokens_seen": 3647152, + "step": 19155 + }, + { + "epoch": 9.958419958419958, + "grad_norm": 0.00034672272158786654, + "learning_rate": 2.6994595612006566e-09, + "loss": 0.0, + "num_input_tokens_seen": 3648016, + "step": 19160 + }, + { + "epoch": 9.961018711018712, + "grad_norm": 0.0002476323861628771, + "learning_rate": 2.376484072424967e-09, + "loss": 0.0, + "num_input_tokens_seen": 3649040, + "step": 19165 + }, + { + "epoch": 9.963617463617464, + "grad_norm": 0.0038047987036406994, + "learning_rate": 2.074079009989349e-09, + "loss": 0.0, + "num_input_tokens_seen": 3649968, + "step": 19170 + }, + { + "epoch": 9.966216216216216, + "grad_norm": 3.2471365557285026e-05, + "learning_rate": 1.7922446227447432e-09, + "loss": 0.0, + "num_input_tokens_seen": 3650896, + "step": 19175 + }, + { + "epoch": 9.96881496881497, + "grad_norm": 0.013056488707661629, + "learning_rate": 1.5309811426056364e-09, + "loss": 0.0, + "num_input_tokens_seen": 3651888, + "step": 19180 + }, + { + "epoch": 9.971413721413722, + "grad_norm": 0.0003934149572160095, + "learning_rate": 1.2902887845722688e-09, + "loss": 0.0, + "num_input_tokens_seen": 3652816, + "step": 19185 + }, + { + "epoch": 9.974012474012474, + "grad_norm": 0.07886549085378647, + "learning_rate": 1.070167746702877e-09, + "loss": 0.0001, + "num_input_tokens_seen": 3653776, + "step": 19190 + }, + { + "epoch": 9.976611226611226, + "grad_norm": 0.0002914363576564938, + "learning_rate": 8.70618210138674e-10, + "loss": 0.0, + "num_input_tokens_seen": 3654832, + "step": 19195 + }, + { + "epoch": 9.97920997920998, + "grad_norm": 0.0025077967438846827, + "learning_rate": 6.916403390844206e-10, + "loss": 0.0, + "num_input_tokens_seen": 3655856, + "step": 19200 + }, + { + "epoch": 9.981808731808732, + "grad_norm": 0.00022165605332702398, + "learning_rate": 5.332342808223034e-10, + "loss": 0.0, + "num_input_tokens_seen": 3656784, + "step": 19205 + }, + { + "epoch": 9.984407484407484, + "grad_norm": 9.415239765075967e-05, + "learning_rate": 3.9540016570083215e-10, + "loss": 0.0001, + "num_input_tokens_seen": 3657808, + "step": 19210 + }, + { + "epoch": 9.987006237006238, + "grad_norm": 0.0024842352140694857, + "learning_rate": 2.7813810714871767e-10, + "loss": 0.0, + "num_input_tokens_seen": 3658736, + "step": 19215 + }, + { + "epoch": 9.98960498960499, + "grad_norm": 0.004077819641679525, + "learning_rate": 1.8144820165544307e-10, + "loss": 0.0, + "num_input_tokens_seen": 3659632, + "step": 19220 + }, + { + "epoch": 9.992203742203742, + "grad_norm": 9.288378350902349e-05, + "learning_rate": 1.0533052878791694e-10, + "loss": 0.0, + "num_input_tokens_seen": 3660528, + "step": 19225 + }, + { + "epoch": 9.994802494802494, + "grad_norm": 0.00013627219595946372, + "learning_rate": 4.978515118214677e-11, + "loss": 0.0, + "num_input_tokens_seen": 3661456, + "step": 19230 + }, + { + "epoch": 9.997401247401248, + "grad_norm": 0.00014364210073836148, + "learning_rate": 1.4812114548790057e-11, + "loss": 0.0, + "num_input_tokens_seen": 3662416, + "step": 19235 + }, + { + "epoch": 10.0, + "grad_norm": 0.000306283007375896, + "learning_rate": 4.114476648275911e-13, + "loss": 0.0, + "num_input_tokens_seen": 3663392, + "step": 19240 + }, + { + "epoch": 10.0, + "eval_loss": 0.44086962938308716, + "eval_runtime": 9.246, + "eval_samples_per_second": 92.581, + "eval_steps_per_second": 23.145, + "num_input_tokens_seen": 3663392, + "step": 19240 + }, + { + "epoch": 10.0, + "num_input_tokens_seen": 3663392, + "step": 19240, + "total_flos": 1.6528355899814707e+17, + "train_loss": 0.06314140750740707, + "train_runtime": 3846.2926, + "train_samples_per_second": 20.006, + "train_steps_per_second": 5.002 + } + ], + "logging_steps": 5, + "max_steps": 19240, + "num_input_tokens_seen": 3663392, + "num_train_epochs": 10, + "save_steps": 962, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6528355899814707e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}