diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,65844 @@ +{ + "best_global_step": 18400, + "best_metric": 0.1415577083826065, + "best_model_checkpoint": "saves/lntuning/mistral-7b-instruct/train_cola_1744902678/checkpoint-18400", + "epoch": 83.16008316008316, + "eval_steps": 200, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.010395010395010396, + "grad_norm": 5.530032157897949, + "learning_rate": 4.999999876629946e-05, + "loss": 2.284, + "num_input_tokens_seen": 3392, + "step": 5 + }, + { + "epoch": 0.02079002079002079, + "grad_norm": 5.2393012046813965, + "learning_rate": 4.999999375439123e-05, + "loss": 1.778, + "num_input_tokens_seen": 6944, + "step": 10 + }, + { + "epoch": 0.031185031185031187, + "grad_norm": 4.679736614227295, + "learning_rate": 4.9999984887169785e-05, + "loss": 1.8239, + "num_input_tokens_seen": 10432, + "step": 15 + }, + { + "epoch": 0.04158004158004158, + "grad_norm": 3.344843626022339, + "learning_rate": 4.9999972164636506e-05, + "loss": 1.7107, + "num_input_tokens_seen": 14144, + "step": 20 + }, + { + "epoch": 0.05197505197505198, + "grad_norm": 4.1097307205200195, + "learning_rate": 4.999995558679334e-05, + "loss": 1.346, + "num_input_tokens_seen": 17760, + "step": 25 + }, + { + "epoch": 0.062370062370062374, + "grad_norm": 4.060741901397705, + "learning_rate": 4.999993515364287e-05, + "loss": 1.1836, + "num_input_tokens_seen": 21280, + "step": 30 + }, + { + "epoch": 0.07276507276507277, + "grad_norm": 3.3695547580718994, + "learning_rate": 4.999991086518822e-05, + "loss": 1.3789, + "num_input_tokens_seen": 24864, + "step": 35 + }, + { + "epoch": 0.08316008316008316, + "grad_norm": 3.7849280834198, + "learning_rate": 4.999988272143315e-05, + "loss": 0.9477, + "num_input_tokens_seen": 28416, + "step": 40 + }, + { + "epoch": 0.09355509355509356, + "grad_norm": 3.3889501094818115, + "learning_rate": 4.999985072238199e-05, + "loss": 1.2623, + "num_input_tokens_seen": 32000, + "step": 45 + }, + { + "epoch": 0.10395010395010396, + "grad_norm": 3.5654633045196533, + "learning_rate": 4.999981486803969e-05, + "loss": 1.1699, + "num_input_tokens_seen": 35424, + "step": 50 + }, + { + "epoch": 0.11434511434511435, + "grad_norm": 2.5480198860168457, + "learning_rate": 4.999977515841176e-05, + "loss": 0.6959, + "num_input_tokens_seen": 39040, + "step": 55 + }, + { + "epoch": 0.12474012474012475, + "grad_norm": 2.7439775466918945, + "learning_rate": 4.9999731593504344e-05, + "loss": 0.8749, + "num_input_tokens_seen": 42752, + "step": 60 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 1.4110090732574463, + "learning_rate": 4.999968417332415e-05, + "loss": 0.8274, + "num_input_tokens_seen": 46304, + "step": 65 + }, + { + "epoch": 0.14553014553014554, + "grad_norm": 2.7792465686798096, + "learning_rate": 4.999963289787848e-05, + "loss": 0.9395, + "num_input_tokens_seen": 49824, + "step": 70 + }, + { + "epoch": 0.15592515592515593, + "grad_norm": 2.710958242416382, + "learning_rate": 4.999957776717526e-05, + "loss": 0.8967, + "num_input_tokens_seen": 53568, + "step": 75 + }, + { + "epoch": 0.16632016632016633, + "grad_norm": 2.441915988922119, + "learning_rate": 4.9999518781222984e-05, + "loss": 0.7017, + "num_input_tokens_seen": 57248, + "step": 80 + }, + { + "epoch": 0.17671517671517672, + "grad_norm": 1.8912341594696045, + "learning_rate": 4.9999455940030746e-05, + "loss": 0.6539, + "num_input_tokens_seen": 60864, + "step": 85 + }, + { + "epoch": 0.18711018711018712, + "grad_norm": 2.523303747177124, + "learning_rate": 4.999938924360824e-05, + "loss": 0.7086, + "num_input_tokens_seen": 64448, + "step": 90 + }, + { + "epoch": 0.19750519750519752, + "grad_norm": 0.9268086552619934, + "learning_rate": 4.999931869196575e-05, + "loss": 0.7095, + "num_input_tokens_seen": 68096, + "step": 95 + }, + { + "epoch": 0.2079002079002079, + "grad_norm": 0.8141545653343201, + "learning_rate": 4.999924428511416e-05, + "loss": 0.5347, + "num_input_tokens_seen": 71680, + "step": 100 + }, + { + "epoch": 0.2182952182952183, + "grad_norm": 1.7154875993728638, + "learning_rate": 4.999916602306494e-05, + "loss": 0.4089, + "num_input_tokens_seen": 75264, + "step": 105 + }, + { + "epoch": 0.2286902286902287, + "grad_norm": 2.123647928237915, + "learning_rate": 4.999908390583016e-05, + "loss": 0.5077, + "num_input_tokens_seen": 79136, + "step": 110 + }, + { + "epoch": 0.2390852390852391, + "grad_norm": 1.4566247463226318, + "learning_rate": 4.999899793342247e-05, + "loss": 0.5062, + "num_input_tokens_seen": 82624, + "step": 115 + }, + { + "epoch": 0.2494802494802495, + "grad_norm": 1.876514196395874, + "learning_rate": 4.999890810585516e-05, + "loss": 0.4909, + "num_input_tokens_seen": 86208, + "step": 120 + }, + { + "epoch": 0.2598752598752599, + "grad_norm": 1.0795706510543823, + "learning_rate": 4.999881442314206e-05, + "loss": 0.6115, + "num_input_tokens_seen": 89984, + "step": 125 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.7839707136154175, + "learning_rate": 4.9998716885297617e-05, + "loss": 0.6674, + "num_input_tokens_seen": 93536, + "step": 130 + }, + { + "epoch": 0.2806652806652807, + "grad_norm": 1.4050040245056152, + "learning_rate": 4.999861549233688e-05, + "loss": 0.3638, + "num_input_tokens_seen": 97472, + "step": 135 + }, + { + "epoch": 0.2910602910602911, + "grad_norm": 0.7169235944747925, + "learning_rate": 4.999851024427548e-05, + "loss": 0.4523, + "num_input_tokens_seen": 101088, + "step": 140 + }, + { + "epoch": 0.30145530145530147, + "grad_norm": 0.8387651443481445, + "learning_rate": 4.999840114112965e-05, + "loss": 0.3146, + "num_input_tokens_seen": 104672, + "step": 145 + }, + { + "epoch": 0.31185031185031187, + "grad_norm": 1.0210840702056885, + "learning_rate": 4.999828818291621e-05, + "loss": 0.2682, + "num_input_tokens_seen": 108128, + "step": 150 + }, + { + "epoch": 0.32224532224532226, + "grad_norm": 1.0619266033172607, + "learning_rate": 4.999817136965259e-05, + "loss": 0.3256, + "num_input_tokens_seen": 111840, + "step": 155 + }, + { + "epoch": 0.33264033264033266, + "grad_norm": 0.7333658933639526, + "learning_rate": 4.9998050701356794e-05, + "loss": 0.4954, + "num_input_tokens_seen": 115456, + "step": 160 + }, + { + "epoch": 0.34303534303534305, + "grad_norm": 0.8287765383720398, + "learning_rate": 4.999792617804744e-05, + "loss": 0.2066, + "num_input_tokens_seen": 119104, + "step": 165 + }, + { + "epoch": 0.35343035343035345, + "grad_norm": 1.6991249322891235, + "learning_rate": 4.9997797799743724e-05, + "loss": 0.6019, + "num_input_tokens_seen": 122528, + "step": 170 + }, + { + "epoch": 0.36382536382536385, + "grad_norm": 1.45650315284729, + "learning_rate": 4.999766556646545e-05, + "loss": 0.3992, + "num_input_tokens_seen": 126144, + "step": 175 + }, + { + "epoch": 0.37422037422037424, + "grad_norm": 0.6022237539291382, + "learning_rate": 4.9997529478232996e-05, + "loss": 0.3136, + "num_input_tokens_seen": 129696, + "step": 180 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.8513803482055664, + "learning_rate": 4.9997389535067365e-05, + "loss": 0.3172, + "num_input_tokens_seen": 133280, + "step": 185 + }, + { + "epoch": 0.39501039501039503, + "grad_norm": 0.5664764046669006, + "learning_rate": 4.999724573699012e-05, + "loss": 0.3267, + "num_input_tokens_seen": 136800, + "step": 190 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 1.302217721939087, + "learning_rate": 4.9997098084023457e-05, + "loss": 0.4925, + "num_input_tokens_seen": 140256, + "step": 195 + }, + { + "epoch": 0.4158004158004158, + "grad_norm": 0.9057350754737854, + "learning_rate": 4.999694657619013e-05, + "loss": 0.4054, + "num_input_tokens_seen": 143936, + "step": 200 + }, + { + "epoch": 0.4158004158004158, + "eval_loss": 0.31748875975608826, + "eval_runtime": 7.7492, + "eval_samples_per_second": 110.463, + "eval_steps_per_second": 27.616, + "num_input_tokens_seen": 143936, + "step": 200 + }, + { + "epoch": 0.4261954261954262, + "grad_norm": 0.3728487193584442, + "learning_rate": 4.999679121351352e-05, + "loss": 0.1773, + "num_input_tokens_seen": 147392, + "step": 205 + }, + { + "epoch": 0.4365904365904366, + "grad_norm": 0.8523396849632263, + "learning_rate": 4.9996631996017565e-05, + "loss": 0.266, + "num_input_tokens_seen": 150816, + "step": 210 + }, + { + "epoch": 0.446985446985447, + "grad_norm": 1.1594433784484863, + "learning_rate": 4.9996468923726835e-05, + "loss": 0.2727, + "num_input_tokens_seen": 154432, + "step": 215 + }, + { + "epoch": 0.4573804573804574, + "grad_norm": 0.7929168939590454, + "learning_rate": 4.999630199666647e-05, + "loss": 0.3496, + "num_input_tokens_seen": 158336, + "step": 220 + }, + { + "epoch": 0.4677754677754678, + "grad_norm": 0.4711568057537079, + "learning_rate": 4.999613121486222e-05, + "loss": 0.3094, + "num_input_tokens_seen": 162048, + "step": 225 + }, + { + "epoch": 0.4781704781704782, + "grad_norm": 0.4875316023826599, + "learning_rate": 4.999595657834041e-05, + "loss": 0.3951, + "num_input_tokens_seen": 165600, + "step": 230 + }, + { + "epoch": 0.4885654885654886, + "grad_norm": 0.5265671610832214, + "learning_rate": 4.999577808712798e-05, + "loss": 0.2727, + "num_input_tokens_seen": 169056, + "step": 235 + }, + { + "epoch": 0.498960498960499, + "grad_norm": 0.6469897627830505, + "learning_rate": 4.999559574125244e-05, + "loss": 0.2918, + "num_input_tokens_seen": 172608, + "step": 240 + }, + { + "epoch": 0.5093555093555093, + "grad_norm": 0.3754512071609497, + "learning_rate": 4.9995409540741934e-05, + "loss": 0.241, + "num_input_tokens_seen": 176256, + "step": 245 + }, + { + "epoch": 0.5197505197505198, + "grad_norm": 0.48846426606178284, + "learning_rate": 4.999521948562516e-05, + "loss": 0.1899, + "num_input_tokens_seen": 179936, + "step": 250 + }, + { + "epoch": 0.5301455301455301, + "grad_norm": 0.7027150392532349, + "learning_rate": 4.999502557593143e-05, + "loss": 0.3379, + "num_input_tokens_seen": 183424, + "step": 255 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 1.6098992824554443, + "learning_rate": 4.999482781169066e-05, + "loss": 0.3065, + "num_input_tokens_seen": 186976, + "step": 260 + }, + { + "epoch": 0.5509355509355509, + "grad_norm": 0.29604867100715637, + "learning_rate": 4.9994626192933324e-05, + "loss": 0.3093, + "num_input_tokens_seen": 190592, + "step": 265 + }, + { + "epoch": 0.5613305613305614, + "grad_norm": 1.4281378984451294, + "learning_rate": 4.999442071969054e-05, + "loss": 0.2996, + "num_input_tokens_seen": 194368, + "step": 270 + }, + { + "epoch": 0.5717255717255717, + "grad_norm": 0.5820214152336121, + "learning_rate": 4.999421139199397e-05, + "loss": 0.1686, + "num_input_tokens_seen": 197792, + "step": 275 + }, + { + "epoch": 0.5821205821205822, + "grad_norm": 0.6768797039985657, + "learning_rate": 4.999399820987592e-05, + "loss": 0.181, + "num_input_tokens_seen": 201600, + "step": 280 + }, + { + "epoch": 0.5925155925155925, + "grad_norm": 0.5552245378494263, + "learning_rate": 4.999378117336924e-05, + "loss": 0.3677, + "num_input_tokens_seen": 205120, + "step": 285 + }, + { + "epoch": 0.6029106029106029, + "grad_norm": 0.8072368502616882, + "learning_rate": 4.9993560282507415e-05, + "loss": 0.2698, + "num_input_tokens_seen": 208736, + "step": 290 + }, + { + "epoch": 0.6133056133056133, + "grad_norm": 0.31647002696990967, + "learning_rate": 4.9993335537324495e-05, + "loss": 0.1433, + "num_input_tokens_seen": 212320, + "step": 295 + }, + { + "epoch": 0.6237006237006237, + "grad_norm": 0.5259145498275757, + "learning_rate": 4.999310693785516e-05, + "loss": 0.2275, + "num_input_tokens_seen": 215744, + "step": 300 + }, + { + "epoch": 0.6340956340956341, + "grad_norm": 0.5711801052093506, + "learning_rate": 4.9992874484134653e-05, + "loss": 0.1732, + "num_input_tokens_seen": 219488, + "step": 305 + }, + { + "epoch": 0.6444906444906445, + "grad_norm": 0.17299717664718628, + "learning_rate": 4.999263817619882e-05, + "loss": 0.1891, + "num_input_tokens_seen": 223072, + "step": 310 + }, + { + "epoch": 0.6548856548856549, + "grad_norm": 0.24076908826828003, + "learning_rate": 4.9992398014084105e-05, + "loss": 0.1762, + "num_input_tokens_seen": 226816, + "step": 315 + }, + { + "epoch": 0.6652806652806653, + "grad_norm": 0.43914031982421875, + "learning_rate": 4.999215399782754e-05, + "loss": 0.2656, + "num_input_tokens_seen": 230336, + "step": 320 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 0.981399655342102, + "learning_rate": 4.999190612746675e-05, + "loss": 0.2269, + "num_input_tokens_seen": 234240, + "step": 325 + }, + { + "epoch": 0.6860706860706861, + "grad_norm": 0.5428009033203125, + "learning_rate": 4.999165440303998e-05, + "loss": 0.2553, + "num_input_tokens_seen": 237888, + "step": 330 + }, + { + "epoch": 0.6964656964656964, + "grad_norm": 0.7826330065727234, + "learning_rate": 4.999139882458603e-05, + "loss": 0.2463, + "num_input_tokens_seen": 241440, + "step": 335 + }, + { + "epoch": 0.7068607068607069, + "grad_norm": 0.6571482419967651, + "learning_rate": 4.9991139392144314e-05, + "loss": 0.2221, + "num_input_tokens_seen": 244992, + "step": 340 + }, + { + "epoch": 0.7172557172557172, + "grad_norm": 0.20186033844947815, + "learning_rate": 4.999087610575485e-05, + "loss": 0.1683, + "num_input_tokens_seen": 248544, + "step": 345 + }, + { + "epoch": 0.7276507276507277, + "grad_norm": 0.3007427752017975, + "learning_rate": 4.999060896545824e-05, + "loss": 0.1854, + "num_input_tokens_seen": 252160, + "step": 350 + }, + { + "epoch": 0.738045738045738, + "grad_norm": 0.3099931478500366, + "learning_rate": 4.999033797129568e-05, + "loss": 0.2462, + "num_input_tokens_seen": 255616, + "step": 355 + }, + { + "epoch": 0.7484407484407485, + "grad_norm": 0.7885267734527588, + "learning_rate": 4.999006312330894e-05, + "loss": 0.2127, + "num_input_tokens_seen": 259104, + "step": 360 + }, + { + "epoch": 0.7588357588357588, + "grad_norm": 0.6801347732543945, + "learning_rate": 4.998978442154043e-05, + "loss": 0.186, + "num_input_tokens_seen": 262656, + "step": 365 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.4548673927783966, + "learning_rate": 4.9989501866033125e-05, + "loss": 0.1884, + "num_input_tokens_seen": 266208, + "step": 370 + }, + { + "epoch": 0.7796257796257796, + "grad_norm": 0.23865553736686707, + "learning_rate": 4.998921545683059e-05, + "loss": 0.182, + "num_input_tokens_seen": 269632, + "step": 375 + }, + { + "epoch": 0.7900207900207901, + "grad_norm": 0.447854220867157, + "learning_rate": 4.9988925193976996e-05, + "loss": 0.1438, + "num_input_tokens_seen": 273248, + "step": 380 + }, + { + "epoch": 0.8004158004158004, + "grad_norm": 0.20702749490737915, + "learning_rate": 4.998863107751711e-05, + "loss": 0.1658, + "num_input_tokens_seen": 276672, + "step": 385 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.6511743068695068, + "learning_rate": 4.998833310749629e-05, + "loss": 0.2251, + "num_input_tokens_seen": 280352, + "step": 390 + }, + { + "epoch": 0.8212058212058212, + "grad_norm": 0.17242340743541718, + "learning_rate": 4.998803128396047e-05, + "loss": 0.2139, + "num_input_tokens_seen": 283808, + "step": 395 + }, + { + "epoch": 0.8316008316008316, + "grad_norm": 0.3257482349872589, + "learning_rate": 4.9987725606956215e-05, + "loss": 0.2079, + "num_input_tokens_seen": 287392, + "step": 400 + }, + { + "epoch": 0.8316008316008316, + "eval_loss": 0.2185945063829422, + "eval_runtime": 7.7555, + "eval_samples_per_second": 110.373, + "eval_steps_per_second": 27.593, + "num_input_tokens_seen": 287392, + "step": 400 + }, + { + "epoch": 0.841995841995842, + "grad_norm": 0.27675750851631165, + "learning_rate": 4.998741607653066e-05, + "loss": 0.1995, + "num_input_tokens_seen": 290944, + "step": 405 + }, + { + "epoch": 0.8523908523908524, + "grad_norm": 0.8755809664726257, + "learning_rate": 4.9987102692731523e-05, + "loss": 0.227, + "num_input_tokens_seen": 294624, + "step": 410 + }, + { + "epoch": 0.8627858627858628, + "grad_norm": 0.20790110528469086, + "learning_rate": 4.9986785455607157e-05, + "loss": 0.1771, + "num_input_tokens_seen": 298176, + "step": 415 + }, + { + "epoch": 0.8731808731808732, + "grad_norm": 0.5597220063209534, + "learning_rate": 4.9986464365206456e-05, + "loss": 0.1854, + "num_input_tokens_seen": 301632, + "step": 420 + }, + { + "epoch": 0.8835758835758836, + "grad_norm": 0.8316853046417236, + "learning_rate": 4.9986139421578956e-05, + "loss": 0.2434, + "num_input_tokens_seen": 305216, + "step": 425 + }, + { + "epoch": 0.893970893970894, + "grad_norm": 1.8186384439468384, + "learning_rate": 4.998581062477477e-05, + "loss": 0.2837, + "num_input_tokens_seen": 308864, + "step": 430 + }, + { + "epoch": 0.9043659043659044, + "grad_norm": 0.5669469237327576, + "learning_rate": 4.998547797484458e-05, + "loss": 0.2107, + "num_input_tokens_seen": 312416, + "step": 435 + }, + { + "epoch": 0.9147609147609148, + "grad_norm": 0.7106760740280151, + "learning_rate": 4.9985141471839706e-05, + "loss": 0.188, + "num_input_tokens_seen": 316032, + "step": 440 + }, + { + "epoch": 0.9251559251559252, + "grad_norm": 0.8463893532752991, + "learning_rate": 4.998480111581203e-05, + "loss": 0.2796, + "num_input_tokens_seen": 319616, + "step": 445 + }, + { + "epoch": 0.9355509355509356, + "grad_norm": 0.5726463198661804, + "learning_rate": 4.998445690681405e-05, + "loss": 0.2449, + "num_input_tokens_seen": 323136, + "step": 450 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.6106606721878052, + "learning_rate": 4.9984108844898834e-05, + "loss": 0.243, + "num_input_tokens_seen": 326944, + "step": 455 + }, + { + "epoch": 0.9563409563409564, + "grad_norm": 0.2728162407875061, + "learning_rate": 4.9983756930120076e-05, + "loss": 0.183, + "num_input_tokens_seen": 330560, + "step": 460 + }, + { + "epoch": 0.9667359667359667, + "grad_norm": 0.23580828309059143, + "learning_rate": 4.9983401162532025e-05, + "loss": 0.1814, + "num_input_tokens_seen": 334080, + "step": 465 + }, + { + "epoch": 0.9771309771309772, + "grad_norm": 0.9719234108924866, + "learning_rate": 4.998304154218955e-05, + "loss": 0.1977, + "num_input_tokens_seen": 337696, + "step": 470 + }, + { + "epoch": 0.9875259875259875, + "grad_norm": 0.19020849466323853, + "learning_rate": 4.998267806914812e-05, + "loss": 0.2053, + "num_input_tokens_seen": 341248, + "step": 475 + }, + { + "epoch": 0.997920997920998, + "grad_norm": 0.76894211769104, + "learning_rate": 4.998231074346378e-05, + "loss": 0.1927, + "num_input_tokens_seen": 344896, + "step": 480 + }, + { + "epoch": 1.0083160083160083, + "grad_norm": 0.4201458692550659, + "learning_rate": 4.998193956519317e-05, + "loss": 0.2106, + "num_input_tokens_seen": 348504, + "step": 485 + }, + { + "epoch": 1.0187110187110187, + "grad_norm": 0.455824613571167, + "learning_rate": 4.9981564534393545e-05, + "loss": 0.2132, + "num_input_tokens_seen": 351960, + "step": 490 + }, + { + "epoch": 1.0291060291060292, + "grad_norm": 0.1663370579481125, + "learning_rate": 4.998118565112272e-05, + "loss": 0.1642, + "num_input_tokens_seen": 355480, + "step": 495 + }, + { + "epoch": 1.0395010395010396, + "grad_norm": 0.19635306298732758, + "learning_rate": 4.998080291543914e-05, + "loss": 0.2063, + "num_input_tokens_seen": 358968, + "step": 500 + }, + { + "epoch": 1.04989604989605, + "grad_norm": 0.18683373928070068, + "learning_rate": 4.9980416327401826e-05, + "loss": 0.2136, + "num_input_tokens_seen": 362552, + "step": 505 + }, + { + "epoch": 1.0602910602910602, + "grad_norm": 0.21791131794452667, + "learning_rate": 4.998002588707038e-05, + "loss": 0.1712, + "num_input_tokens_seen": 366296, + "step": 510 + }, + { + "epoch": 1.0706860706860706, + "grad_norm": 0.22313368320465088, + "learning_rate": 4.997963159450503e-05, + "loss": 0.1759, + "num_input_tokens_seen": 369880, + "step": 515 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.2486889660358429, + "learning_rate": 4.9979233449766575e-05, + "loss": 0.2025, + "num_input_tokens_seen": 373592, + "step": 520 + }, + { + "epoch": 1.0914760914760915, + "grad_norm": 0.21012866497039795, + "learning_rate": 4.997883145291641e-05, + "loss": 0.1489, + "num_input_tokens_seen": 376920, + "step": 525 + }, + { + "epoch": 1.1018711018711018, + "grad_norm": 0.2540522515773773, + "learning_rate": 4.9978425604016536e-05, + "loss": 0.1813, + "num_input_tokens_seen": 380536, + "step": 530 + }, + { + "epoch": 1.1122661122661124, + "grad_norm": 0.19152314960956573, + "learning_rate": 4.9978015903129536e-05, + "loss": 0.1688, + "num_input_tokens_seen": 384120, + "step": 535 + }, + { + "epoch": 1.1226611226611227, + "grad_norm": 0.6085925102233887, + "learning_rate": 4.997760235031859e-05, + "loss": 0.2297, + "num_input_tokens_seen": 387672, + "step": 540 + }, + { + "epoch": 1.133056133056133, + "grad_norm": 0.36351603269577026, + "learning_rate": 4.9977184945647473e-05, + "loss": 0.2009, + "num_input_tokens_seen": 391160, + "step": 545 + }, + { + "epoch": 1.1434511434511434, + "grad_norm": 0.47148826718330383, + "learning_rate": 4.997676368918055e-05, + "loss": 0.2652, + "num_input_tokens_seen": 394808, + "step": 550 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.22651126980781555, + "learning_rate": 4.9976338580982794e-05, + "loss": 0.1746, + "num_input_tokens_seen": 398456, + "step": 555 + }, + { + "epoch": 1.1642411642411643, + "grad_norm": 0.3994894027709961, + "learning_rate": 4.9975909621119755e-05, + "loss": 0.2162, + "num_input_tokens_seen": 402168, + "step": 560 + }, + { + "epoch": 1.1746361746361746, + "grad_norm": 0.43981078267097473, + "learning_rate": 4.997547680965758e-05, + "loss": 0.2057, + "num_input_tokens_seen": 405720, + "step": 565 + }, + { + "epoch": 1.185031185031185, + "grad_norm": 0.47910168766975403, + "learning_rate": 4.997504014666302e-05, + "loss": 0.1898, + "num_input_tokens_seen": 409304, + "step": 570 + }, + { + "epoch": 1.1954261954261955, + "grad_norm": 0.4334993362426758, + "learning_rate": 4.997459963220342e-05, + "loss": 0.2169, + "num_input_tokens_seen": 412984, + "step": 575 + }, + { + "epoch": 1.2058212058212059, + "grad_norm": 0.4314536452293396, + "learning_rate": 4.997415526634671e-05, + "loss": 0.1804, + "num_input_tokens_seen": 416664, + "step": 580 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.49193400144577026, + "learning_rate": 4.99737070491614e-05, + "loss": 0.1702, + "num_input_tokens_seen": 420216, + "step": 585 + }, + { + "epoch": 1.2266112266112266, + "grad_norm": 0.15584494173526764, + "learning_rate": 4.997325498071663e-05, + "loss": 0.1703, + "num_input_tokens_seen": 423832, + "step": 590 + }, + { + "epoch": 1.237006237006237, + "grad_norm": 0.5063003301620483, + "learning_rate": 4.997279906108211e-05, + "loss": 0.2105, + "num_input_tokens_seen": 427416, + "step": 595 + }, + { + "epoch": 1.2474012474012475, + "grad_norm": 0.24349485337734222, + "learning_rate": 4.9972339290328155e-05, + "loss": 0.1714, + "num_input_tokens_seen": 430968, + "step": 600 + }, + { + "epoch": 1.2474012474012475, + "eval_loss": 0.2034861147403717, + "eval_runtime": 7.7624, + "eval_samples_per_second": 110.275, + "eval_steps_per_second": 27.569, + "num_input_tokens_seen": 430968, + "step": 600 + }, + { + "epoch": 1.2577962577962578, + "grad_norm": 0.18751870095729828, + "learning_rate": 4.9971875668525646e-05, + "loss": 0.168, + "num_input_tokens_seen": 434520, + "step": 605 + }, + { + "epoch": 1.2681912681912682, + "grad_norm": 0.4788508415222168, + "learning_rate": 4.997140819574609e-05, + "loss": 0.1888, + "num_input_tokens_seen": 438072, + "step": 610 + }, + { + "epoch": 1.2785862785862787, + "grad_norm": 0.263256311416626, + "learning_rate": 4.997093687206159e-05, + "loss": 0.187, + "num_input_tokens_seen": 441784, + "step": 615 + }, + { + "epoch": 1.288981288981289, + "grad_norm": 0.24346596002578735, + "learning_rate": 4.997046169754482e-05, + "loss": 0.1715, + "num_input_tokens_seen": 445304, + "step": 620 + }, + { + "epoch": 1.2993762993762994, + "grad_norm": 0.33802568912506104, + "learning_rate": 4.996998267226905e-05, + "loss": 0.2386, + "num_input_tokens_seen": 448984, + "step": 625 + }, + { + "epoch": 1.3097713097713097, + "grad_norm": 0.2563517093658447, + "learning_rate": 4.996949979630817e-05, + "loss": 0.1814, + "num_input_tokens_seen": 452632, + "step": 630 + }, + { + "epoch": 1.32016632016632, + "grad_norm": 0.3655163645744324, + "learning_rate": 4.996901306973663e-05, + "loss": 0.2086, + "num_input_tokens_seen": 456376, + "step": 635 + }, + { + "epoch": 1.3305613305613306, + "grad_norm": 0.2973382771015167, + "learning_rate": 4.996852249262949e-05, + "loss": 0.2101, + "num_input_tokens_seen": 460024, + "step": 640 + }, + { + "epoch": 1.340956340956341, + "grad_norm": 0.16344760358333588, + "learning_rate": 4.996802806506241e-05, + "loss": 0.1748, + "num_input_tokens_seen": 463416, + "step": 645 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.27866891026496887, + "learning_rate": 4.996752978711164e-05, + "loss": 0.1868, + "num_input_tokens_seen": 467064, + "step": 650 + }, + { + "epoch": 1.3617463617463619, + "grad_norm": 0.2804291546344757, + "learning_rate": 4.996702765885401e-05, + "loss": 0.2007, + "num_input_tokens_seen": 470584, + "step": 655 + }, + { + "epoch": 1.3721413721413722, + "grad_norm": 1.6664478778839111, + "learning_rate": 4.9966521680366964e-05, + "loss": 0.2623, + "num_input_tokens_seen": 474008, + "step": 660 + }, + { + "epoch": 1.3825363825363826, + "grad_norm": 0.1567099541425705, + "learning_rate": 4.9966011851728524e-05, + "loss": 0.1824, + "num_input_tokens_seen": 477560, + "step": 665 + }, + { + "epoch": 1.392931392931393, + "grad_norm": 0.38432934880256653, + "learning_rate": 4.996549817301731e-05, + "loss": 0.1827, + "num_input_tokens_seen": 480920, + "step": 670 + }, + { + "epoch": 1.4033264033264032, + "grad_norm": 0.21595098078250885, + "learning_rate": 4.9964980644312544e-05, + "loss": 0.2063, + "num_input_tokens_seen": 484504, + "step": 675 + }, + { + "epoch": 1.4137214137214138, + "grad_norm": 0.5146589279174805, + "learning_rate": 4.996445926569403e-05, + "loss": 0.1997, + "num_input_tokens_seen": 488056, + "step": 680 + }, + { + "epoch": 1.4241164241164241, + "grad_norm": 0.3131435811519623, + "learning_rate": 4.996393403724218e-05, + "loss": 0.2362, + "num_input_tokens_seen": 491640, + "step": 685 + }, + { + "epoch": 1.4345114345114345, + "grad_norm": 0.15158607065677643, + "learning_rate": 4.9963404959037985e-05, + "loss": 0.1683, + "num_input_tokens_seen": 495192, + "step": 690 + }, + { + "epoch": 1.444906444906445, + "grad_norm": 0.4848015010356903, + "learning_rate": 4.996287203116303e-05, + "loss": 0.1985, + "num_input_tokens_seen": 498840, + "step": 695 + }, + { + "epoch": 1.4553014553014554, + "grad_norm": 0.23467743396759033, + "learning_rate": 4.996233525369951e-05, + "loss": 0.2027, + "num_input_tokens_seen": 502424, + "step": 700 + }, + { + "epoch": 1.4656964656964657, + "grad_norm": 0.22414933145046234, + "learning_rate": 4.99617946267302e-05, + "loss": 0.1858, + "num_input_tokens_seen": 506040, + "step": 705 + }, + { + "epoch": 1.476091476091476, + "grad_norm": 0.3129233717918396, + "learning_rate": 4.996125015033846e-05, + "loss": 0.1436, + "num_input_tokens_seen": 509592, + "step": 710 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.2326781302690506, + "learning_rate": 4.996070182460827e-05, + "loss": 0.1858, + "num_input_tokens_seen": 513240, + "step": 715 + }, + { + "epoch": 1.496881496881497, + "grad_norm": 0.24601393938064575, + "learning_rate": 4.996014964962418e-05, + "loss": 0.169, + "num_input_tokens_seen": 516920, + "step": 720 + }, + { + "epoch": 1.5072765072765073, + "grad_norm": 0.17422518134117126, + "learning_rate": 4.9959593625471344e-05, + "loss": 0.194, + "num_input_tokens_seen": 520568, + "step": 725 + }, + { + "epoch": 1.5176715176715176, + "grad_norm": 0.7939184308052063, + "learning_rate": 4.995903375223552e-05, + "loss": 0.2022, + "num_input_tokens_seen": 524280, + "step": 730 + }, + { + "epoch": 1.5280665280665282, + "grad_norm": 0.3737525939941406, + "learning_rate": 4.995847003000302e-05, + "loss": 0.1737, + "num_input_tokens_seen": 527800, + "step": 735 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.29248231649398804, + "learning_rate": 4.9957902458860804e-05, + "loss": 0.2017, + "num_input_tokens_seen": 531384, + "step": 740 + }, + { + "epoch": 1.5488565488565489, + "grad_norm": 0.2775634527206421, + "learning_rate": 4.995733103889639e-05, + "loss": 0.1557, + "num_input_tokens_seen": 534968, + "step": 745 + }, + { + "epoch": 1.5592515592515592, + "grad_norm": 0.18828873336315155, + "learning_rate": 4.99567557701979e-05, + "loss": 0.2142, + "num_input_tokens_seen": 538488, + "step": 750 + }, + { + "epoch": 1.5696465696465696, + "grad_norm": 0.7118074297904968, + "learning_rate": 4.995617665285403e-05, + "loss": 0.2209, + "num_input_tokens_seen": 542328, + "step": 755 + }, + { + "epoch": 1.5800415800415801, + "grad_norm": 0.2723216116428375, + "learning_rate": 4.99555936869541e-05, + "loss": 0.2004, + "num_input_tokens_seen": 545912, + "step": 760 + }, + { + "epoch": 1.5904365904365905, + "grad_norm": 0.2996287941932678, + "learning_rate": 4.995500687258803e-05, + "loss": 0.1671, + "num_input_tokens_seen": 549464, + "step": 765 + }, + { + "epoch": 1.6008316008316008, + "grad_norm": 0.2377077341079712, + "learning_rate": 4.995441620984628e-05, + "loss": 0.2027, + "num_input_tokens_seen": 552920, + "step": 770 + }, + { + "epoch": 1.6112266112266114, + "grad_norm": 0.608710765838623, + "learning_rate": 4.995382169881996e-05, + "loss": 0.2072, + "num_input_tokens_seen": 556536, + "step": 775 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.4926723539829254, + "learning_rate": 4.9953223339600755e-05, + "loss": 0.1914, + "num_input_tokens_seen": 560184, + "step": 780 + }, + { + "epoch": 1.632016632016632, + "grad_norm": 0.3226834237575531, + "learning_rate": 4.995262113228091e-05, + "loss": 0.1973, + "num_input_tokens_seen": 563832, + "step": 785 + }, + { + "epoch": 1.6424116424116424, + "grad_norm": 0.40056145191192627, + "learning_rate": 4.995201507695332e-05, + "loss": 0.1631, + "num_input_tokens_seen": 567256, + "step": 790 + }, + { + "epoch": 1.6528066528066527, + "grad_norm": 0.29292407631874084, + "learning_rate": 4.995140517371144e-05, + "loss": 0.1917, + "num_input_tokens_seen": 570872, + "step": 795 + }, + { + "epoch": 1.6632016632016633, + "grad_norm": 0.48931601643562317, + "learning_rate": 4.995079142264932e-05, + "loss": 0.2061, + "num_input_tokens_seen": 574456, + "step": 800 + }, + { + "epoch": 1.6632016632016633, + "eval_loss": 0.19441358745098114, + "eval_runtime": 7.7729, + "eval_samples_per_second": 110.126, + "eval_steps_per_second": 27.532, + "num_input_tokens_seen": 574456, + "step": 800 + }, + { + "epoch": 1.6735966735966736, + "grad_norm": 0.20199142396450043, + "learning_rate": 4.995017382386162e-05, + "loss": 0.1895, + "num_input_tokens_seen": 578136, + "step": 805 + }, + { + "epoch": 1.683991683991684, + "grad_norm": 0.11028966307640076, + "learning_rate": 4.994955237744356e-05, + "loss": 0.1597, + "num_input_tokens_seen": 581656, + "step": 810 + }, + { + "epoch": 1.6943866943866945, + "grad_norm": 0.2557755708694458, + "learning_rate": 4.994892708349101e-05, + "loss": 0.1706, + "num_input_tokens_seen": 585368, + "step": 815 + }, + { + "epoch": 1.7047817047817047, + "grad_norm": 0.29813307523727417, + "learning_rate": 4.994829794210035e-05, + "loss": 0.1609, + "num_input_tokens_seen": 588984, + "step": 820 + }, + { + "epoch": 1.7151767151767152, + "grad_norm": 0.23700349032878876, + "learning_rate": 4.994766495336864e-05, + "loss": 0.1781, + "num_input_tokens_seen": 592632, + "step": 825 + }, + { + "epoch": 1.7255717255717256, + "grad_norm": 0.2501082122325897, + "learning_rate": 4.994702811739348e-05, + "loss": 0.193, + "num_input_tokens_seen": 596184, + "step": 830 + }, + { + "epoch": 1.735966735966736, + "grad_norm": 0.42217496037483215, + "learning_rate": 4.994638743427308e-05, + "loss": 0.1876, + "num_input_tokens_seen": 599704, + "step": 835 + }, + { + "epoch": 1.7463617463617465, + "grad_norm": 0.31838852167129517, + "learning_rate": 4.994574290410624e-05, + "loss": 0.1608, + "num_input_tokens_seen": 603256, + "step": 840 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.1763755977153778, + "learning_rate": 4.9945094526992364e-05, + "loss": 0.1688, + "num_input_tokens_seen": 606808, + "step": 845 + }, + { + "epoch": 1.7671517671517671, + "grad_norm": 0.22812531888484955, + "learning_rate": 4.994444230303142e-05, + "loss": 0.1833, + "num_input_tokens_seen": 610424, + "step": 850 + }, + { + "epoch": 1.7775467775467777, + "grad_norm": 0.3080637753009796, + "learning_rate": 4.994378623232402e-05, + "loss": 0.1684, + "num_input_tokens_seen": 613816, + "step": 855 + }, + { + "epoch": 1.7879417879417878, + "grad_norm": 0.4063854217529297, + "learning_rate": 4.99431263149713e-05, + "loss": 0.1869, + "num_input_tokens_seen": 617432, + "step": 860 + }, + { + "epoch": 1.7983367983367984, + "grad_norm": 0.34311655163764954, + "learning_rate": 4.9942462551075056e-05, + "loss": 0.1784, + "num_input_tokens_seen": 621016, + "step": 865 + }, + { + "epoch": 1.8087318087318087, + "grad_norm": 0.21738094091415405, + "learning_rate": 4.994179494073764e-05, + "loss": 0.1739, + "num_input_tokens_seen": 624632, + "step": 870 + }, + { + "epoch": 1.819126819126819, + "grad_norm": 0.27619072794914246, + "learning_rate": 4.9941123484062e-05, + "loss": 0.136, + "num_input_tokens_seen": 628280, + "step": 875 + }, + { + "epoch": 1.8295218295218296, + "grad_norm": 1.017405390739441, + "learning_rate": 4.99404481811517e-05, + "loss": 0.3088, + "num_input_tokens_seen": 631992, + "step": 880 + }, + { + "epoch": 1.83991683991684, + "grad_norm": 0.261287659406662, + "learning_rate": 4.9939769032110864e-05, + "loss": 0.1935, + "num_input_tokens_seen": 635640, + "step": 885 + }, + { + "epoch": 1.8503118503118503, + "grad_norm": 0.43490102887153625, + "learning_rate": 4.993908603704423e-05, + "loss": 0.1486, + "num_input_tokens_seen": 639160, + "step": 890 + }, + { + "epoch": 1.8607068607068609, + "grad_norm": 0.29255086183547974, + "learning_rate": 4.9938399196057126e-05, + "loss": 0.1957, + "num_input_tokens_seen": 642872, + "step": 895 + }, + { + "epoch": 1.871101871101871, + "grad_norm": 0.22625836730003357, + "learning_rate": 4.993770850925547e-05, + "loss": 0.1832, + "num_input_tokens_seen": 646520, + "step": 900 + }, + { + "epoch": 1.8814968814968815, + "grad_norm": 0.385452002286911, + "learning_rate": 4.993701397674577e-05, + "loss": 0.1646, + "num_input_tokens_seen": 649976, + "step": 905 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.306797593832016, + "learning_rate": 4.993631559863515e-05, + "loss": 0.153, + "num_input_tokens_seen": 653656, + "step": 910 + }, + { + "epoch": 1.9022869022869022, + "grad_norm": 0.2628289759159088, + "learning_rate": 4.9935613375031283e-05, + "loss": 0.1907, + "num_input_tokens_seen": 657208, + "step": 915 + }, + { + "epoch": 1.9126819126819128, + "grad_norm": 0.1658107042312622, + "learning_rate": 4.993490730604248e-05, + "loss": 0.2261, + "num_input_tokens_seen": 660984, + "step": 920 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.54591965675354, + "learning_rate": 4.993419739177761e-05, + "loss": 0.1947, + "num_input_tokens_seen": 664568, + "step": 925 + }, + { + "epoch": 1.9334719334719335, + "grad_norm": 0.19419875741004944, + "learning_rate": 4.9933483632346164e-05, + "loss": 0.2095, + "num_input_tokens_seen": 668120, + "step": 930 + }, + { + "epoch": 1.943866943866944, + "grad_norm": 0.19868797063827515, + "learning_rate": 4.993276602785821e-05, + "loss": 0.1826, + "num_input_tokens_seen": 671672, + "step": 935 + }, + { + "epoch": 1.9542619542619541, + "grad_norm": 0.43898117542266846, + "learning_rate": 4.993204457842441e-05, + "loss": 0.167, + "num_input_tokens_seen": 675320, + "step": 940 + }, + { + "epoch": 1.9646569646569647, + "grad_norm": 0.6336140632629395, + "learning_rate": 4.993131928415602e-05, + "loss": 0.1777, + "num_input_tokens_seen": 678808, + "step": 945 + }, + { + "epoch": 1.975051975051975, + "grad_norm": 0.4234926700592041, + "learning_rate": 4.993059014516489e-05, + "loss": 0.1942, + "num_input_tokens_seen": 682328, + "step": 950 + }, + { + "epoch": 1.9854469854469854, + "grad_norm": 0.33715131878852844, + "learning_rate": 4.9929857161563464e-05, + "loss": 0.1838, + "num_input_tokens_seen": 686008, + "step": 955 + }, + { + "epoch": 1.995841995841996, + "grad_norm": 0.23445479571819305, + "learning_rate": 4.992912033346477e-05, + "loss": 0.2154, + "num_input_tokens_seen": 689560, + "step": 960 + }, + { + "epoch": 2.006237006237006, + "grad_norm": 0.22460278868675232, + "learning_rate": 4.992837966098245e-05, + "loss": 0.1978, + "num_input_tokens_seen": 693040, + "step": 965 + }, + { + "epoch": 2.0166320166320166, + "grad_norm": 0.222771555185318, + "learning_rate": 4.992763514423071e-05, + "loss": 0.1573, + "num_input_tokens_seen": 696624, + "step": 970 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.7554253339767456, + "learning_rate": 4.992688678332437e-05, + "loss": 0.2673, + "num_input_tokens_seen": 700176, + "step": 975 + }, + { + "epoch": 2.0374220374220373, + "grad_norm": 0.7037953734397888, + "learning_rate": 4.992613457837884e-05, + "loss": 0.1558, + "num_input_tokens_seen": 703824, + "step": 980 + }, + { + "epoch": 2.047817047817048, + "grad_norm": 0.3107267916202545, + "learning_rate": 4.992537852951011e-05, + "loss": 0.205, + "num_input_tokens_seen": 707536, + "step": 985 + }, + { + "epoch": 2.0582120582120584, + "grad_norm": 0.4517793357372284, + "learning_rate": 4.9924618636834785e-05, + "loss": 0.187, + "num_input_tokens_seen": 711344, + "step": 990 + }, + { + "epoch": 2.0686070686070686, + "grad_norm": 0.2730054557323456, + "learning_rate": 4.9923854900470046e-05, + "loss": 0.1255, + "num_input_tokens_seen": 714864, + "step": 995 + }, + { + "epoch": 2.079002079002079, + "grad_norm": 0.34871554374694824, + "learning_rate": 4.992308732053367e-05, + "loss": 0.2034, + "num_input_tokens_seen": 718448, + "step": 1000 + }, + { + "epoch": 2.079002079002079, + "eval_loss": 0.19246484339237213, + "eval_runtime": 7.7604, + "eval_samples_per_second": 110.303, + "eval_steps_per_second": 27.576, + "num_input_tokens_seen": 718448, + "step": 1000 + }, + { + "epoch": 2.0893970893970892, + "grad_norm": 0.2288864403963089, + "learning_rate": 4.992231589714402e-05, + "loss": 0.1577, + "num_input_tokens_seen": 722032, + "step": 1005 + }, + { + "epoch": 2.0997920997921, + "grad_norm": 0.32042133808135986, + "learning_rate": 4.992154063042007e-05, + "loss": 0.2049, + "num_input_tokens_seen": 725552, + "step": 1010 + }, + { + "epoch": 2.1101871101871104, + "grad_norm": 0.5933064222335815, + "learning_rate": 4.992076152048136e-05, + "loss": 0.2076, + "num_input_tokens_seen": 729232, + "step": 1015 + }, + { + "epoch": 2.1205821205821205, + "grad_norm": 0.22172623872756958, + "learning_rate": 4.991997856744807e-05, + "loss": 0.1642, + "num_input_tokens_seen": 732944, + "step": 1020 + }, + { + "epoch": 2.130977130977131, + "grad_norm": 0.30446234345436096, + "learning_rate": 4.9919191771440905e-05, + "loss": 0.1517, + "num_input_tokens_seen": 736560, + "step": 1025 + }, + { + "epoch": 2.141372141372141, + "grad_norm": 0.20692558586597443, + "learning_rate": 4.991840113258122e-05, + "loss": 0.1321, + "num_input_tokens_seen": 740112, + "step": 1030 + }, + { + "epoch": 2.1517671517671517, + "grad_norm": 0.1738484501838684, + "learning_rate": 4.9917606650990933e-05, + "loss": 0.1539, + "num_input_tokens_seen": 743696, + "step": 1035 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.23179610073566437, + "learning_rate": 4.9916808326792566e-05, + "loss": 0.2335, + "num_input_tokens_seen": 747280, + "step": 1040 + }, + { + "epoch": 2.1725571725571724, + "grad_norm": 0.5021642446517944, + "learning_rate": 4.9916006160109235e-05, + "loss": 0.2958, + "num_input_tokens_seen": 750960, + "step": 1045 + }, + { + "epoch": 2.182952182952183, + "grad_norm": 0.25670912861824036, + "learning_rate": 4.991520015106464e-05, + "loss": 0.2047, + "num_input_tokens_seen": 754640, + "step": 1050 + }, + { + "epoch": 2.1933471933471935, + "grad_norm": 0.27878978848457336, + "learning_rate": 4.991439029978308e-05, + "loss": 0.1845, + "num_input_tokens_seen": 758384, + "step": 1055 + }, + { + "epoch": 2.2037422037422036, + "grad_norm": 0.48113325238227844, + "learning_rate": 4.9913576606389434e-05, + "loss": 0.1957, + "num_input_tokens_seen": 762128, + "step": 1060 + }, + { + "epoch": 2.214137214137214, + "grad_norm": 0.2020827680826187, + "learning_rate": 4.991275907100919e-05, + "loss": 0.2058, + "num_input_tokens_seen": 765744, + "step": 1065 + }, + { + "epoch": 2.2245322245322248, + "grad_norm": 0.2702235281467438, + "learning_rate": 4.9911937693768434e-05, + "loss": 0.1903, + "num_input_tokens_seen": 769296, + "step": 1070 + }, + { + "epoch": 2.234927234927235, + "grad_norm": 0.2020954042673111, + "learning_rate": 4.991111247479382e-05, + "loss": 0.1734, + "num_input_tokens_seen": 772752, + "step": 1075 + }, + { + "epoch": 2.2453222453222454, + "grad_norm": 0.1616896688938141, + "learning_rate": 4.9910283414212605e-05, + "loss": 0.1679, + "num_input_tokens_seen": 776272, + "step": 1080 + }, + { + "epoch": 2.2557172557172556, + "grad_norm": 0.17610102891921997, + "learning_rate": 4.990945051215265e-05, + "loss": 0.1525, + "num_input_tokens_seen": 779760, + "step": 1085 + }, + { + "epoch": 2.266112266112266, + "grad_norm": 0.2010200321674347, + "learning_rate": 4.99086137687424e-05, + "loss": 0.1583, + "num_input_tokens_seen": 783344, + "step": 1090 + }, + { + "epoch": 2.2765072765072767, + "grad_norm": 0.6282720565795898, + "learning_rate": 4.9907773184110874e-05, + "loss": 0.2212, + "num_input_tokens_seen": 786992, + "step": 1095 + }, + { + "epoch": 2.286902286902287, + "grad_norm": 0.184901624917984, + "learning_rate": 4.9906928758387715e-05, + "loss": 0.1483, + "num_input_tokens_seen": 790672, + "step": 1100 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.2513189911842346, + "learning_rate": 4.9906080491703146e-05, + "loss": 0.1751, + "num_input_tokens_seen": 794288, + "step": 1105 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.2163839340209961, + "learning_rate": 4.990522838418797e-05, + "loss": 0.1873, + "num_input_tokens_seen": 797808, + "step": 1110 + }, + { + "epoch": 2.318087318087318, + "grad_norm": 0.2669987380504608, + "learning_rate": 4.9904372435973604e-05, + "loss": 0.2235, + "num_input_tokens_seen": 801392, + "step": 1115 + }, + { + "epoch": 2.3284823284823286, + "grad_norm": 0.2139214724302292, + "learning_rate": 4.990351264719203e-05, + "loss": 0.1835, + "num_input_tokens_seen": 804976, + "step": 1120 + }, + { + "epoch": 2.3388773388773387, + "grad_norm": 0.4331375062465668, + "learning_rate": 4.990264901797586e-05, + "loss": 0.2007, + "num_input_tokens_seen": 808656, + "step": 1125 + }, + { + "epoch": 2.3492723492723493, + "grad_norm": 0.2259889543056488, + "learning_rate": 4.990178154845826e-05, + "loss": 0.1665, + "num_input_tokens_seen": 812080, + "step": 1130 + }, + { + "epoch": 2.35966735966736, + "grad_norm": 0.1521359533071518, + "learning_rate": 4.9900910238773014e-05, + "loss": 0.1817, + "num_input_tokens_seen": 815760, + "step": 1135 + }, + { + "epoch": 2.37006237006237, + "grad_norm": 0.25992342829704285, + "learning_rate": 4.990003508905448e-05, + "loss": 0.2042, + "num_input_tokens_seen": 819440, + "step": 1140 + }, + { + "epoch": 2.3804573804573805, + "grad_norm": 0.2644314467906952, + "learning_rate": 4.989915609943763e-05, + "loss": 0.1538, + "num_input_tokens_seen": 822896, + "step": 1145 + }, + { + "epoch": 2.390852390852391, + "grad_norm": 0.3539649248123169, + "learning_rate": 4.9898273270058e-05, + "loss": 0.1734, + "num_input_tokens_seen": 826384, + "step": 1150 + }, + { + "epoch": 2.401247401247401, + "grad_norm": 0.23388250172138214, + "learning_rate": 4.989738660105174e-05, + "loss": 0.1872, + "num_input_tokens_seen": 829872, + "step": 1155 + }, + { + "epoch": 2.4116424116424118, + "grad_norm": 0.4229956567287445, + "learning_rate": 4.989649609255559e-05, + "loss": 0.1555, + "num_input_tokens_seen": 833616, + "step": 1160 + }, + { + "epoch": 2.422037422037422, + "grad_norm": 0.15735560655593872, + "learning_rate": 4.989560174470687e-05, + "loss": 0.2008, + "num_input_tokens_seen": 837168, + "step": 1165 + }, + { + "epoch": 2.4324324324324325, + "grad_norm": 0.25073525309562683, + "learning_rate": 4.989470355764351e-05, + "loss": 0.1364, + "num_input_tokens_seen": 840720, + "step": 1170 + }, + { + "epoch": 2.442827442827443, + "grad_norm": 0.5913508534431458, + "learning_rate": 4.9893801531504e-05, + "loss": 0.1564, + "num_input_tokens_seen": 844336, + "step": 1175 + }, + { + "epoch": 2.453222453222453, + "grad_norm": 0.22609549760818481, + "learning_rate": 4.9892895666427475e-05, + "loss": 0.166, + "num_input_tokens_seen": 847888, + "step": 1180 + }, + { + "epoch": 2.4636174636174637, + "grad_norm": 0.3281961977481842, + "learning_rate": 4.9891985962553606e-05, + "loss": 0.168, + "num_input_tokens_seen": 851472, + "step": 1185 + }, + { + "epoch": 2.474012474012474, + "grad_norm": 0.40156346559524536, + "learning_rate": 4.989107242002269e-05, + "loss": 0.1742, + "num_input_tokens_seen": 855088, + "step": 1190 + }, + { + "epoch": 2.4844074844074844, + "grad_norm": 0.38301828503608704, + "learning_rate": 4.989015503897561e-05, + "loss": 0.1614, + "num_input_tokens_seen": 858800, + "step": 1195 + }, + { + "epoch": 2.494802494802495, + "grad_norm": 0.5007478594779968, + "learning_rate": 4.988923381955383e-05, + "loss": 0.1608, + "num_input_tokens_seen": 862224, + "step": 1200 + }, + { + "epoch": 2.494802494802495, + "eval_loss": 0.18966278433799744, + "eval_runtime": 7.7625, + "eval_samples_per_second": 110.274, + "eval_steps_per_second": 27.568, + "num_input_tokens_seen": 862224, + "step": 1200 + }, + { + "epoch": 2.505197505197505, + "grad_norm": 0.407569944858551, + "learning_rate": 4.988830876189942e-05, + "loss": 0.208, + "num_input_tokens_seen": 865840, + "step": 1205 + }, + { + "epoch": 2.5155925155925156, + "grad_norm": 0.18509455025196075, + "learning_rate": 4.988737986615503e-05, + "loss": 0.166, + "num_input_tokens_seen": 869392, + "step": 1210 + }, + { + "epoch": 2.525987525987526, + "grad_norm": 0.5273256301879883, + "learning_rate": 4.988644713246391e-05, + "loss": 0.2063, + "num_input_tokens_seen": 872848, + "step": 1215 + }, + { + "epoch": 2.5363825363825363, + "grad_norm": 0.3689562678337097, + "learning_rate": 4.988551056096991e-05, + "loss": 0.1751, + "num_input_tokens_seen": 876496, + "step": 1220 + }, + { + "epoch": 2.546777546777547, + "grad_norm": 0.5375843048095703, + "learning_rate": 4.988457015181743e-05, + "loss": 0.1594, + "num_input_tokens_seen": 880048, + "step": 1225 + }, + { + "epoch": 2.5571725571725574, + "grad_norm": 0.17789341509342194, + "learning_rate": 4.988362590515153e-05, + "loss": 0.1271, + "num_input_tokens_seen": 883600, + "step": 1230 + }, + { + "epoch": 2.5675675675675675, + "grad_norm": 0.23167453706264496, + "learning_rate": 4.9882677821117805e-05, + "loss": 0.1769, + "num_input_tokens_seen": 887152, + "step": 1235 + }, + { + "epoch": 2.577962577962578, + "grad_norm": 0.20745849609375, + "learning_rate": 4.988172589986246e-05, + "loss": 0.145, + "num_input_tokens_seen": 890832, + "step": 1240 + }, + { + "epoch": 2.5883575883575882, + "grad_norm": 0.19841565191745758, + "learning_rate": 4.9880770141532304e-05, + "loss": 0.1376, + "num_input_tokens_seen": 894256, + "step": 1245 + }, + { + "epoch": 2.598752598752599, + "grad_norm": 0.25246483087539673, + "learning_rate": 4.987981054627472e-05, + "loss": 0.1697, + "num_input_tokens_seen": 897904, + "step": 1250 + }, + { + "epoch": 2.609147609147609, + "grad_norm": 0.20315439999103546, + "learning_rate": 4.987884711423769e-05, + "loss": 0.2002, + "num_input_tokens_seen": 901424, + "step": 1255 + }, + { + "epoch": 2.6195426195426195, + "grad_norm": 0.44576239585876465, + "learning_rate": 4.9877879845569784e-05, + "loss": 0.1847, + "num_input_tokens_seen": 904944, + "step": 1260 + }, + { + "epoch": 2.62993762993763, + "grad_norm": 0.1539914309978485, + "learning_rate": 4.9876908740420175e-05, + "loss": 0.1674, + "num_input_tokens_seen": 908528, + "step": 1265 + }, + { + "epoch": 2.64033264033264, + "grad_norm": 0.3359895646572113, + "learning_rate": 4.987593379893861e-05, + "loss": 0.1597, + "num_input_tokens_seen": 912112, + "step": 1270 + }, + { + "epoch": 2.6507276507276507, + "grad_norm": 0.2106781303882599, + "learning_rate": 4.987495502127545e-05, + "loss": 0.1326, + "num_input_tokens_seen": 915792, + "step": 1275 + }, + { + "epoch": 2.6611226611226613, + "grad_norm": 0.24684961140155792, + "learning_rate": 4.987397240758162e-05, + "loss": 0.1958, + "num_input_tokens_seen": 919280, + "step": 1280 + }, + { + "epoch": 2.6715176715176714, + "grad_norm": 0.2047549933195114, + "learning_rate": 4.9872985958008664e-05, + "loss": 0.1634, + "num_input_tokens_seen": 922832, + "step": 1285 + }, + { + "epoch": 2.681912681912682, + "grad_norm": 0.365891695022583, + "learning_rate": 4.987199567270871e-05, + "loss": 0.1565, + "num_input_tokens_seen": 926352, + "step": 1290 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.602281928062439, + "learning_rate": 4.9871001551834444e-05, + "loss": 0.1596, + "num_input_tokens_seen": 929840, + "step": 1295 + }, + { + "epoch": 2.7027027027027026, + "grad_norm": 0.5674124956130981, + "learning_rate": 4.98700035955392e-05, + "loss": 0.1367, + "num_input_tokens_seen": 933296, + "step": 1300 + }, + { + "epoch": 2.713097713097713, + "grad_norm": 0.3327973783016205, + "learning_rate": 4.986900180397686e-05, + "loss": 0.2096, + "num_input_tokens_seen": 936816, + "step": 1305 + }, + { + "epoch": 2.7234927234927238, + "grad_norm": 0.6175124645233154, + "learning_rate": 4.9867996177301926e-05, + "loss": 0.1779, + "num_input_tokens_seen": 940336, + "step": 1310 + }, + { + "epoch": 2.733887733887734, + "grad_norm": 0.18234093487262726, + "learning_rate": 4.9866986715669464e-05, + "loss": 0.1639, + "num_input_tokens_seen": 944080, + "step": 1315 + }, + { + "epoch": 2.7442827442827444, + "grad_norm": 0.25627291202545166, + "learning_rate": 4.9865973419235155e-05, + "loss": 0.1352, + "num_input_tokens_seen": 947600, + "step": 1320 + }, + { + "epoch": 2.7546777546777546, + "grad_norm": 0.2769981622695923, + "learning_rate": 4.986495628815526e-05, + "loss": 0.1511, + "num_input_tokens_seen": 951280, + "step": 1325 + }, + { + "epoch": 2.765072765072765, + "grad_norm": 0.27908656001091003, + "learning_rate": 4.986393532258663e-05, + "loss": 0.1976, + "num_input_tokens_seen": 954832, + "step": 1330 + }, + { + "epoch": 2.7754677754677752, + "grad_norm": 0.17011278867721558, + "learning_rate": 4.986291052268671e-05, + "loss": 0.2122, + "num_input_tokens_seen": 958256, + "step": 1335 + }, + { + "epoch": 2.785862785862786, + "grad_norm": 0.3269628584384918, + "learning_rate": 4.986188188861355e-05, + "loss": 0.2067, + "num_input_tokens_seen": 961872, + "step": 1340 + }, + { + "epoch": 2.7962577962577964, + "grad_norm": 0.1424427032470703, + "learning_rate": 4.9860849420525766e-05, + "loss": 0.1346, + "num_input_tokens_seen": 965456, + "step": 1345 + }, + { + "epoch": 2.8066528066528065, + "grad_norm": 0.3241112530231476, + "learning_rate": 4.9859813118582575e-05, + "loss": 0.2382, + "num_input_tokens_seen": 968944, + "step": 1350 + }, + { + "epoch": 2.817047817047817, + "grad_norm": 0.1882600486278534, + "learning_rate": 4.98587729829438e-05, + "loss": 0.1885, + "num_input_tokens_seen": 972560, + "step": 1355 + }, + { + "epoch": 2.8274428274428276, + "grad_norm": 0.24786075949668884, + "learning_rate": 4.985772901376983e-05, + "loss": 0.1551, + "num_input_tokens_seen": 976144, + "step": 1360 + }, + { + "epoch": 2.8378378378378377, + "grad_norm": 0.334253191947937, + "learning_rate": 4.9856681211221666e-05, + "loss": 0.1701, + "num_input_tokens_seen": 979696, + "step": 1365 + }, + { + "epoch": 2.8482328482328483, + "grad_norm": 0.5868493318557739, + "learning_rate": 4.985562957546089e-05, + "loss": 0.1698, + "num_input_tokens_seen": 983152, + "step": 1370 + }, + { + "epoch": 2.858627858627859, + "grad_norm": 0.28893908858299255, + "learning_rate": 4.9854574106649686e-05, + "loss": 0.2104, + "num_input_tokens_seen": 986768, + "step": 1375 + }, + { + "epoch": 2.869022869022869, + "grad_norm": 0.18523149192333221, + "learning_rate": 4.985351480495081e-05, + "loss": 0.1989, + "num_input_tokens_seen": 990480, + "step": 1380 + }, + { + "epoch": 2.8794178794178795, + "grad_norm": 0.2556129992008209, + "learning_rate": 4.985245167052762e-05, + "loss": 0.2331, + "num_input_tokens_seen": 994000, + "step": 1385 + }, + { + "epoch": 2.88981288981289, + "grad_norm": 0.2541482448577881, + "learning_rate": 4.9851384703544066e-05, + "loss": 0.1696, + "num_input_tokens_seen": 997584, + "step": 1390 + }, + { + "epoch": 2.9002079002079, + "grad_norm": 0.2526537775993347, + "learning_rate": 4.985031390416469e-05, + "loss": 0.1544, + "num_input_tokens_seen": 1001232, + "step": 1395 + }, + { + "epoch": 2.9106029106029108, + "grad_norm": 0.18850897252559662, + "learning_rate": 4.984923927255461e-05, + "loss": 0.2277, + "num_input_tokens_seen": 1004880, + "step": 1400 + }, + { + "epoch": 2.9106029106029108, + "eval_loss": 0.18409308791160583, + "eval_runtime": 7.7562, + "eval_samples_per_second": 110.363, + "eval_steps_per_second": 27.591, + "num_input_tokens_seen": 1004880, + "step": 1400 + }, + { + "epoch": 2.920997920997921, + "grad_norm": 0.17565573751926422, + "learning_rate": 4.984816080887958e-05, + "loss": 0.1464, + "num_input_tokens_seen": 1008304, + "step": 1405 + }, + { + "epoch": 2.9313929313929314, + "grad_norm": 0.2493305802345276, + "learning_rate": 4.9847078513305875e-05, + "loss": 0.1534, + "num_input_tokens_seen": 1011888, + "step": 1410 + }, + { + "epoch": 2.9417879417879416, + "grad_norm": 0.8638669848442078, + "learning_rate": 4.984599238600043e-05, + "loss": 0.1726, + "num_input_tokens_seen": 1015504, + "step": 1415 + }, + { + "epoch": 2.952182952182952, + "grad_norm": 0.22162407636642456, + "learning_rate": 4.9844902427130716e-05, + "loss": 0.1507, + "num_input_tokens_seen": 1019184, + "step": 1420 + }, + { + "epoch": 2.9625779625779627, + "grad_norm": 0.43739157915115356, + "learning_rate": 4.984380863686482e-05, + "loss": 0.1528, + "num_input_tokens_seen": 1022960, + "step": 1425 + }, + { + "epoch": 2.972972972972973, + "grad_norm": 0.9655539393424988, + "learning_rate": 4.984271101537143e-05, + "loss": 0.2431, + "num_input_tokens_seen": 1026672, + "step": 1430 + }, + { + "epoch": 2.9833679833679834, + "grad_norm": 0.16790461540222168, + "learning_rate": 4.9841609562819816e-05, + "loss": 0.2096, + "num_input_tokens_seen": 1030288, + "step": 1435 + }, + { + "epoch": 2.993762993762994, + "grad_norm": 0.17429198324680328, + "learning_rate": 4.984050427937983e-05, + "loss": 0.1475, + "num_input_tokens_seen": 1033840, + "step": 1440 + }, + { + "epoch": 3.004158004158004, + "grad_norm": 0.5616788268089294, + "learning_rate": 4.983939516522191e-05, + "loss": 0.1773, + "num_input_tokens_seen": 1037352, + "step": 1445 + }, + { + "epoch": 3.0145530145530146, + "grad_norm": 0.2681989371776581, + "learning_rate": 4.983828222051711e-05, + "loss": 0.1847, + "num_input_tokens_seen": 1041064, + "step": 1450 + }, + { + "epoch": 3.024948024948025, + "grad_norm": 0.2199888527393341, + "learning_rate": 4.983716544543705e-05, + "loss": 0.1254, + "num_input_tokens_seen": 1044552, + "step": 1455 + }, + { + "epoch": 3.0353430353430353, + "grad_norm": 0.18660783767700195, + "learning_rate": 4.983604484015395e-05, + "loss": 0.1856, + "num_input_tokens_seen": 1048264, + "step": 1460 + }, + { + "epoch": 3.045738045738046, + "grad_norm": 0.2361350804567337, + "learning_rate": 4.983492040484064e-05, + "loss": 0.157, + "num_input_tokens_seen": 1051880, + "step": 1465 + }, + { + "epoch": 3.056133056133056, + "grad_norm": 0.23746472597122192, + "learning_rate": 4.98337921396705e-05, + "loss": 0.14, + "num_input_tokens_seen": 1055464, + "step": 1470 + }, + { + "epoch": 3.0665280665280665, + "grad_norm": 0.38519346714019775, + "learning_rate": 4.983266004481753e-05, + "loss": 0.184, + "num_input_tokens_seen": 1058920, + "step": 1475 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 0.23965288698673248, + "learning_rate": 4.9831524120456316e-05, + "loss": 0.1713, + "num_input_tokens_seen": 1062632, + "step": 1480 + }, + { + "epoch": 3.087318087318087, + "grad_norm": 0.3848338723182678, + "learning_rate": 4.9830384366762026e-05, + "loss": 0.1546, + "num_input_tokens_seen": 1066248, + "step": 1485 + }, + { + "epoch": 3.0977130977130978, + "grad_norm": 0.4958535432815552, + "learning_rate": 4.9829240783910436e-05, + "loss": 0.2295, + "num_input_tokens_seen": 1069864, + "step": 1490 + }, + { + "epoch": 3.108108108108108, + "grad_norm": 0.47314882278442383, + "learning_rate": 4.982809337207789e-05, + "loss": 0.1376, + "num_input_tokens_seen": 1073576, + "step": 1495 + }, + { + "epoch": 3.1185031185031185, + "grad_norm": 0.29914432764053345, + "learning_rate": 4.9826942131441337e-05, + "loss": 0.1755, + "num_input_tokens_seen": 1077352, + "step": 1500 + }, + { + "epoch": 3.128898128898129, + "grad_norm": 0.1898111253976822, + "learning_rate": 4.9825787062178315e-05, + "loss": 0.1692, + "num_input_tokens_seen": 1080808, + "step": 1505 + }, + { + "epoch": 3.139293139293139, + "grad_norm": 0.3157567083835602, + "learning_rate": 4.9824628164466945e-05, + "loss": 0.1738, + "num_input_tokens_seen": 1084296, + "step": 1510 + }, + { + "epoch": 3.1496881496881497, + "grad_norm": 0.2628602981567383, + "learning_rate": 4.982346543848595e-05, + "loss": 0.157, + "num_input_tokens_seen": 1088008, + "step": 1515 + }, + { + "epoch": 3.1600831600831603, + "grad_norm": 0.5477745532989502, + "learning_rate": 4.9822298884414626e-05, + "loss": 0.2064, + "num_input_tokens_seen": 1091528, + "step": 1520 + }, + { + "epoch": 3.1704781704781704, + "grad_norm": 0.26966509222984314, + "learning_rate": 4.982112850243288e-05, + "loss": 0.1819, + "num_input_tokens_seen": 1095144, + "step": 1525 + }, + { + "epoch": 3.180873180873181, + "grad_norm": 0.40925759077072144, + "learning_rate": 4.98199542927212e-05, + "loss": 0.1447, + "num_input_tokens_seen": 1098600, + "step": 1530 + }, + { + "epoch": 3.1912681912681915, + "grad_norm": 0.5873309969902039, + "learning_rate": 4.981877625546066e-05, + "loss": 0.1729, + "num_input_tokens_seen": 1102184, + "step": 1535 + }, + { + "epoch": 3.2016632016632016, + "grad_norm": 0.38649782538414, + "learning_rate": 4.981759439083293e-05, + "loss": 0.1986, + "num_input_tokens_seen": 1105896, + "step": 1540 + }, + { + "epoch": 3.212058212058212, + "grad_norm": 0.42547130584716797, + "learning_rate": 4.981640869902027e-05, + "loss": 0.1626, + "num_input_tokens_seen": 1109448, + "step": 1545 + }, + { + "epoch": 3.2224532224532223, + "grad_norm": 0.20766551792621613, + "learning_rate": 4.9815219180205517e-05, + "loss": 0.1534, + "num_input_tokens_seen": 1113064, + "step": 1550 + }, + { + "epoch": 3.232848232848233, + "grad_norm": 0.2291135936975479, + "learning_rate": 4.9814025834572126e-05, + "loss": 0.1562, + "num_input_tokens_seen": 1116680, + "step": 1555 + }, + { + "epoch": 3.2432432432432434, + "grad_norm": 0.3681188225746155, + "learning_rate": 4.981282866230411e-05, + "loss": 0.1862, + "num_input_tokens_seen": 1120104, + "step": 1560 + }, + { + "epoch": 3.2536382536382535, + "grad_norm": 0.41176533699035645, + "learning_rate": 4.981162766358611e-05, + "loss": 0.1513, + "num_input_tokens_seen": 1123560, + "step": 1565 + }, + { + "epoch": 3.264033264033264, + "grad_norm": 1.6824368238449097, + "learning_rate": 4.9810422838603316e-05, + "loss": 0.1901, + "num_input_tokens_seen": 1126920, + "step": 1570 + }, + { + "epoch": 3.274428274428274, + "grad_norm": 0.14548936486244202, + "learning_rate": 4.9809214187541533e-05, + "loss": 0.1437, + "num_input_tokens_seen": 1130440, + "step": 1575 + }, + { + "epoch": 3.284823284823285, + "grad_norm": 0.44655659794807434, + "learning_rate": 4.980800171058715e-05, + "loss": 0.197, + "num_input_tokens_seen": 1134120, + "step": 1580 + }, + { + "epoch": 3.2952182952182953, + "grad_norm": 0.5662634968757629, + "learning_rate": 4.980678540792715e-05, + "loss": 0.1792, + "num_input_tokens_seen": 1137736, + "step": 1585 + }, + { + "epoch": 3.3056133056133055, + "grad_norm": 0.37743183970451355, + "learning_rate": 4.980556527974909e-05, + "loss": 0.1573, + "num_input_tokens_seen": 1141256, + "step": 1590 + }, + { + "epoch": 3.316008316008316, + "grad_norm": 0.40834110975265503, + "learning_rate": 4.980434132624114e-05, + "loss": 0.1905, + "num_input_tokens_seen": 1144744, + "step": 1595 + }, + { + "epoch": 3.3264033264033266, + "grad_norm": 0.20493151247501373, + "learning_rate": 4.980311354759205e-05, + "loss": 0.1451, + "num_input_tokens_seen": 1148296, + "step": 1600 + }, + { + "epoch": 3.3264033264033266, + "eval_loss": 0.17657047510147095, + "eval_runtime": 7.7521, + "eval_samples_per_second": 110.421, + "eval_steps_per_second": 27.605, + "num_input_tokens_seen": 1148296, + "step": 1600 + }, + { + "epoch": 3.3367983367983367, + "grad_norm": 0.1594935953617096, + "learning_rate": 4.980188194399116e-05, + "loss": 0.2621, + "num_input_tokens_seen": 1151944, + "step": 1605 + }, + { + "epoch": 3.3471933471933473, + "grad_norm": 0.2397795170545578, + "learning_rate": 4.9800646515628384e-05, + "loss": 0.1608, + "num_input_tokens_seen": 1155432, + "step": 1610 + }, + { + "epoch": 3.357588357588358, + "grad_norm": 0.3772222101688385, + "learning_rate": 4.979940726269426e-05, + "loss": 0.1938, + "num_input_tokens_seen": 1159080, + "step": 1615 + }, + { + "epoch": 3.367983367983368, + "grad_norm": 0.22013406455516815, + "learning_rate": 4.979816418537988e-05, + "loss": 0.1301, + "num_input_tokens_seen": 1162600, + "step": 1620 + }, + { + "epoch": 3.3783783783783785, + "grad_norm": 0.44508904218673706, + "learning_rate": 4.979691728387696e-05, + "loss": 0.1625, + "num_input_tokens_seen": 1166120, + "step": 1625 + }, + { + "epoch": 3.3887733887733886, + "grad_norm": 0.41026169061660767, + "learning_rate": 4.979566655837776e-05, + "loss": 0.1557, + "num_input_tokens_seen": 1169736, + "step": 1630 + }, + { + "epoch": 3.399168399168399, + "grad_norm": 0.2955121099948883, + "learning_rate": 4.9794412009075184e-05, + "loss": 0.1963, + "num_input_tokens_seen": 1173352, + "step": 1635 + }, + { + "epoch": 3.4095634095634098, + "grad_norm": 1.1032601594924927, + "learning_rate": 4.979315363616269e-05, + "loss": 0.1984, + "num_input_tokens_seen": 1177160, + "step": 1640 + }, + { + "epoch": 3.41995841995842, + "grad_norm": 0.24336816370487213, + "learning_rate": 4.979189143983434e-05, + "loss": 0.1419, + "num_input_tokens_seen": 1180680, + "step": 1645 + }, + { + "epoch": 3.4303534303534304, + "grad_norm": 0.8224805593490601, + "learning_rate": 4.979062542028478e-05, + "loss": 0.1636, + "num_input_tokens_seen": 1184136, + "step": 1650 + }, + { + "epoch": 3.4407484407484406, + "grad_norm": 0.42630335688591003, + "learning_rate": 4.978935557770923e-05, + "loss": 0.1602, + "num_input_tokens_seen": 1187688, + "step": 1655 + }, + { + "epoch": 3.451143451143451, + "grad_norm": 0.7884613275527954, + "learning_rate": 4.978808191230353e-05, + "loss": 0.2118, + "num_input_tokens_seen": 1191272, + "step": 1660 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 0.6497827768325806, + "learning_rate": 4.9786804424264085e-05, + "loss": 0.167, + "num_input_tokens_seen": 1194856, + "step": 1665 + }, + { + "epoch": 3.471933471933472, + "grad_norm": 0.23233354091644287, + "learning_rate": 4.978552311378792e-05, + "loss": 0.1458, + "num_input_tokens_seen": 1198440, + "step": 1670 + }, + { + "epoch": 3.4823284823284824, + "grad_norm": 0.22447575628757477, + "learning_rate": 4.978423798107261e-05, + "loss": 0.183, + "num_input_tokens_seen": 1201960, + "step": 1675 + }, + { + "epoch": 3.492723492723493, + "grad_norm": 0.18096020817756653, + "learning_rate": 4.978294902631635e-05, + "loss": 0.1767, + "num_input_tokens_seen": 1205576, + "step": 1680 + }, + { + "epoch": 3.503118503118503, + "grad_norm": 0.4499590992927551, + "learning_rate": 4.9781656249717914e-05, + "loss": 0.1791, + "num_input_tokens_seen": 1209192, + "step": 1685 + }, + { + "epoch": 3.5135135135135136, + "grad_norm": 0.43786168098449707, + "learning_rate": 4.9780359651476645e-05, + "loss": 0.1523, + "num_input_tokens_seen": 1212808, + "step": 1690 + }, + { + "epoch": 3.523908523908524, + "grad_norm": 0.32336166501045227, + "learning_rate": 4.977905923179251e-05, + "loss": 0.1795, + "num_input_tokens_seen": 1216296, + "step": 1695 + }, + { + "epoch": 3.5343035343035343, + "grad_norm": 0.2302243709564209, + "learning_rate": 4.977775499086606e-05, + "loss": 0.1303, + "num_input_tokens_seen": 1219976, + "step": 1700 + }, + { + "epoch": 3.544698544698545, + "grad_norm": 0.3885316550731659, + "learning_rate": 4.97764469288984e-05, + "loss": 0.1744, + "num_input_tokens_seen": 1223624, + "step": 1705 + }, + { + "epoch": 3.555093555093555, + "grad_norm": 0.21515633165836334, + "learning_rate": 4.977513504609127e-05, + "loss": 0.1886, + "num_input_tokens_seen": 1227304, + "step": 1710 + }, + { + "epoch": 3.5654885654885655, + "grad_norm": 0.30951163172721863, + "learning_rate": 4.9773819342646965e-05, + "loss": 0.1362, + "num_input_tokens_seen": 1230888, + "step": 1715 + }, + { + "epoch": 3.5758835758835756, + "grad_norm": 0.17893879115581512, + "learning_rate": 4.97724998187684e-05, + "loss": 0.1709, + "num_input_tokens_seen": 1234408, + "step": 1720 + }, + { + "epoch": 3.586278586278586, + "grad_norm": 0.1742841601371765, + "learning_rate": 4.9771176474659045e-05, + "loss": 0.1481, + "num_input_tokens_seen": 1238184, + "step": 1725 + }, + { + "epoch": 3.5966735966735968, + "grad_norm": 0.2324562668800354, + "learning_rate": 4.976984931052299e-05, + "loss": 0.1283, + "num_input_tokens_seen": 1241800, + "step": 1730 + }, + { + "epoch": 3.607068607068607, + "grad_norm": 0.2718299329280853, + "learning_rate": 4.976851832656489e-05, + "loss": 0.157, + "num_input_tokens_seen": 1245288, + "step": 1735 + }, + { + "epoch": 3.6174636174636174, + "grad_norm": 0.34208738803863525, + "learning_rate": 4.9767183522990004e-05, + "loss": 0.1496, + "num_input_tokens_seen": 1249000, + "step": 1740 + }, + { + "epoch": 3.627858627858628, + "grad_norm": 0.5552517175674438, + "learning_rate": 4.9765844900004176e-05, + "loss": 0.1738, + "num_input_tokens_seen": 1252488, + "step": 1745 + }, + { + "epoch": 3.638253638253638, + "grad_norm": 0.3622688055038452, + "learning_rate": 4.9764502457813834e-05, + "loss": 0.1897, + "num_input_tokens_seen": 1256264, + "step": 1750 + }, + { + "epoch": 3.6486486486486487, + "grad_norm": 0.31561988592147827, + "learning_rate": 4.9763156196626005e-05, + "loss": 0.1399, + "num_input_tokens_seen": 1259816, + "step": 1755 + }, + { + "epoch": 3.6590436590436592, + "grad_norm": 0.49789318442344666, + "learning_rate": 4.97618061166483e-05, + "loss": 0.1552, + "num_input_tokens_seen": 1263368, + "step": 1760 + }, + { + "epoch": 3.6694386694386694, + "grad_norm": 0.2679223418235779, + "learning_rate": 4.9760452218088915e-05, + "loss": 0.1419, + "num_input_tokens_seen": 1266824, + "step": 1765 + }, + { + "epoch": 3.67983367983368, + "grad_norm": 0.26692232489585876, + "learning_rate": 4.975909450115663e-05, + "loss": 0.1554, + "num_input_tokens_seen": 1270440, + "step": 1770 + }, + { + "epoch": 3.6902286902286905, + "grad_norm": 0.32916024327278137, + "learning_rate": 4.975773296606084e-05, + "loss": 0.157, + "num_input_tokens_seen": 1274248, + "step": 1775 + }, + { + "epoch": 3.7006237006237006, + "grad_norm": 0.23005615174770355, + "learning_rate": 4.97563676130115e-05, + "loss": 0.1833, + "num_input_tokens_seen": 1277896, + "step": 1780 + }, + { + "epoch": 3.711018711018711, + "grad_norm": 0.6661831140518188, + "learning_rate": 4.9754998442219166e-05, + "loss": 0.1779, + "num_input_tokens_seen": 1281576, + "step": 1785 + }, + { + "epoch": 3.7214137214137213, + "grad_norm": 0.2540232837200165, + "learning_rate": 4.9753625453894984e-05, + "loss": 0.2064, + "num_input_tokens_seen": 1285160, + "step": 1790 + }, + { + "epoch": 3.731808731808732, + "grad_norm": 0.19021828472614288, + "learning_rate": 4.975224864825068e-05, + "loss": 0.1914, + "num_input_tokens_seen": 1288904, + "step": 1795 + }, + { + "epoch": 3.742203742203742, + "grad_norm": 0.2544374167919159, + "learning_rate": 4.9750868025498576e-05, + "loss": 0.1774, + "num_input_tokens_seen": 1292616, + "step": 1800 + }, + { + "epoch": 3.742203742203742, + "eval_loss": 0.174961119890213, + "eval_runtime": 7.7616, + "eval_samples_per_second": 110.286, + "eval_steps_per_second": 27.572, + "num_input_tokens_seen": 1292616, + "step": 1800 + }, + { + "epoch": 3.7525987525987525, + "grad_norm": 0.5310657024383545, + "learning_rate": 4.974948358585158e-05, + "loss": 0.1599, + "num_input_tokens_seen": 1296200, + "step": 1805 + }, + { + "epoch": 3.762993762993763, + "grad_norm": 0.23538221418857574, + "learning_rate": 4.9748095329523205e-05, + "loss": 0.141, + "num_input_tokens_seen": 1299688, + "step": 1810 + }, + { + "epoch": 3.773388773388773, + "grad_norm": 0.18888302147388458, + "learning_rate": 4.974670325672752e-05, + "loss": 0.1473, + "num_input_tokens_seen": 1303208, + "step": 1815 + }, + { + "epoch": 3.7837837837837838, + "grad_norm": 0.3724542558193207, + "learning_rate": 4.974530736767921e-05, + "loss": 0.1738, + "num_input_tokens_seen": 1306760, + "step": 1820 + }, + { + "epoch": 3.7941787941787943, + "grad_norm": 0.2708316445350647, + "learning_rate": 4.9743907662593524e-05, + "loss": 0.136, + "num_input_tokens_seen": 1310344, + "step": 1825 + }, + { + "epoch": 3.8045738045738045, + "grad_norm": 0.5319818258285522, + "learning_rate": 4.974250414168633e-05, + "loss": 0.1618, + "num_input_tokens_seen": 1313864, + "step": 1830 + }, + { + "epoch": 3.814968814968815, + "grad_norm": 0.2133249193429947, + "learning_rate": 4.974109680517407e-05, + "loss": 0.1592, + "num_input_tokens_seen": 1317480, + "step": 1835 + }, + { + "epoch": 3.8253638253638256, + "grad_norm": 0.5073072910308838, + "learning_rate": 4.973968565327376e-05, + "loss": 0.1844, + "num_input_tokens_seen": 1321128, + "step": 1840 + }, + { + "epoch": 3.8357588357588357, + "grad_norm": 0.22831594944000244, + "learning_rate": 4.973827068620303e-05, + "loss": 0.1708, + "num_input_tokens_seen": 1324744, + "step": 1845 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.2408497929573059, + "learning_rate": 4.973685190418008e-05, + "loss": 0.1892, + "num_input_tokens_seen": 1328360, + "step": 1850 + }, + { + "epoch": 3.856548856548857, + "grad_norm": 0.26212984323501587, + "learning_rate": 4.97354293074237e-05, + "loss": 0.1751, + "num_input_tokens_seen": 1331816, + "step": 1855 + }, + { + "epoch": 3.866943866943867, + "grad_norm": 0.1910029798746109, + "learning_rate": 4.9734002896153276e-05, + "loss": 0.1762, + "num_input_tokens_seen": 1335304, + "step": 1860 + }, + { + "epoch": 3.8773388773388775, + "grad_norm": 0.20365580916404724, + "learning_rate": 4.973257267058877e-05, + "loss": 0.1961, + "num_input_tokens_seen": 1338984, + "step": 1865 + }, + { + "epoch": 3.8877338877338876, + "grad_norm": 0.28244492411613464, + "learning_rate": 4.973113863095076e-05, + "loss": 0.1471, + "num_input_tokens_seen": 1342664, + "step": 1870 + }, + { + "epoch": 3.898128898128898, + "grad_norm": 0.6091713905334473, + "learning_rate": 4.9729700777460384e-05, + "loss": 0.188, + "num_input_tokens_seen": 1346440, + "step": 1875 + }, + { + "epoch": 3.9085239085239083, + "grad_norm": 0.5180817246437073, + "learning_rate": 4.972825911033937e-05, + "loss": 0.1791, + "num_input_tokens_seen": 1349896, + "step": 1880 + }, + { + "epoch": 3.918918918918919, + "grad_norm": 0.3358602821826935, + "learning_rate": 4.9726813629810056e-05, + "loss": 0.1345, + "num_input_tokens_seen": 1353448, + "step": 1885 + }, + { + "epoch": 3.9293139293139294, + "grad_norm": 0.19717632234096527, + "learning_rate": 4.9725364336095326e-05, + "loss": 0.1185, + "num_input_tokens_seen": 1356968, + "step": 1890 + }, + { + "epoch": 3.9397089397089395, + "grad_norm": 0.47568491101264954, + "learning_rate": 4.972391122941871e-05, + "loss": 0.189, + "num_input_tokens_seen": 1360584, + "step": 1895 + }, + { + "epoch": 3.95010395010395, + "grad_norm": 0.14795055985450745, + "learning_rate": 4.972245431000428e-05, + "loss": 0.1483, + "num_input_tokens_seen": 1364168, + "step": 1900 + }, + { + "epoch": 3.9604989604989607, + "grad_norm": 0.15135303139686584, + "learning_rate": 4.972099357807671e-05, + "loss": 0.179, + "num_input_tokens_seen": 1367816, + "step": 1905 + }, + { + "epoch": 3.970893970893971, + "grad_norm": 0.35753363370895386, + "learning_rate": 4.971952903386127e-05, + "loss": 0.1894, + "num_input_tokens_seen": 1371400, + "step": 1910 + }, + { + "epoch": 3.9812889812889813, + "grad_norm": 0.4864388704299927, + "learning_rate": 4.971806067758381e-05, + "loss": 0.1846, + "num_input_tokens_seen": 1375080, + "step": 1915 + }, + { + "epoch": 3.991683991683992, + "grad_norm": 0.22188718616962433, + "learning_rate": 4.971658850947076e-05, + "loss": 0.1772, + "num_input_tokens_seen": 1378664, + "step": 1920 + }, + { + "epoch": 4.002079002079002, + "grad_norm": 0.18295426666736603, + "learning_rate": 4.9715112529749165e-05, + "loss": 0.1603, + "num_input_tokens_seen": 1382448, + "step": 1925 + }, + { + "epoch": 4.012474012474012, + "grad_norm": 0.23924142122268677, + "learning_rate": 4.9713632738646624e-05, + "loss": 0.1413, + "num_input_tokens_seen": 1386128, + "step": 1930 + }, + { + "epoch": 4.022869022869023, + "grad_norm": 0.5819074511528015, + "learning_rate": 4.971214913639134e-05, + "loss": 0.1395, + "num_input_tokens_seen": 1389616, + "step": 1935 + }, + { + "epoch": 4.033264033264033, + "grad_norm": 0.3893031179904938, + "learning_rate": 4.9710661723212104e-05, + "loss": 0.2287, + "num_input_tokens_seen": 1393104, + "step": 1940 + }, + { + "epoch": 4.043659043659043, + "grad_norm": 0.34786897897720337, + "learning_rate": 4.9709170499338295e-05, + "loss": 0.1911, + "num_input_tokens_seen": 1396688, + "step": 1945 + }, + { + "epoch": 4.054054054054054, + "grad_norm": 0.1973234862089157, + "learning_rate": 4.9707675464999895e-05, + "loss": 0.1603, + "num_input_tokens_seen": 1400368, + "step": 1950 + }, + { + "epoch": 4.0644490644490645, + "grad_norm": 0.5474634766578674, + "learning_rate": 4.970617662042743e-05, + "loss": 0.1864, + "num_input_tokens_seen": 1403856, + "step": 1955 + }, + { + "epoch": 4.074844074844075, + "grad_norm": 0.17509035766124725, + "learning_rate": 4.970467396585206e-05, + "loss": 0.1552, + "num_input_tokens_seen": 1407568, + "step": 1960 + }, + { + "epoch": 4.085239085239086, + "grad_norm": 0.19517630338668823, + "learning_rate": 4.97031675015055e-05, + "loss": 0.1606, + "num_input_tokens_seen": 1411184, + "step": 1965 + }, + { + "epoch": 4.095634095634096, + "grad_norm": 0.33853253722190857, + "learning_rate": 4.9701657227620075e-05, + "loss": 0.2014, + "num_input_tokens_seen": 1414800, + "step": 1970 + }, + { + "epoch": 4.106029106029106, + "grad_norm": 0.1507486253976822, + "learning_rate": 4.9700143144428685e-05, + "loss": 0.153, + "num_input_tokens_seen": 1418384, + "step": 1975 + }, + { + "epoch": 4.116424116424117, + "grad_norm": 0.16939398646354675, + "learning_rate": 4.969862525216482e-05, + "loss": 0.159, + "num_input_tokens_seen": 1421936, + "step": 1980 + }, + { + "epoch": 4.126819126819127, + "grad_norm": 0.20906953513622284, + "learning_rate": 4.9697103551062556e-05, + "loss": 0.1324, + "num_input_tokens_seen": 1425360, + "step": 1985 + }, + { + "epoch": 4.137214137214137, + "grad_norm": 0.21713018417358398, + "learning_rate": 4.9695578041356565e-05, + "loss": 0.1619, + "num_input_tokens_seen": 1428912, + "step": 1990 + }, + { + "epoch": 4.147609147609147, + "grad_norm": 0.4635274410247803, + "learning_rate": 4.969404872328209e-05, + "loss": 0.2007, + "num_input_tokens_seen": 1432496, + "step": 1995 + }, + { + "epoch": 4.158004158004158, + "grad_norm": 0.3410338759422302, + "learning_rate": 4.969251559707498e-05, + "loss": 0.1937, + "num_input_tokens_seen": 1436240, + "step": 2000 + }, + { + "epoch": 4.158004158004158, + "eval_loss": 0.17497918009757996, + "eval_runtime": 7.7744, + "eval_samples_per_second": 110.105, + "eval_steps_per_second": 27.526, + "num_input_tokens_seen": 1436240, + "step": 2000 + }, + { + "epoch": 4.168399168399168, + "grad_norm": 0.5627536177635193, + "learning_rate": 4.9690978662971674e-05, + "loss": 0.1328, + "num_input_tokens_seen": 1439952, + "step": 2005 + }, + { + "epoch": 4.1787941787941785, + "grad_norm": 0.1963369995355606, + "learning_rate": 4.968943792120916e-05, + "loss": 0.1444, + "num_input_tokens_seen": 1443568, + "step": 2010 + }, + { + "epoch": 4.1891891891891895, + "grad_norm": 0.35104900598526, + "learning_rate": 4.9687893372025046e-05, + "loss": 0.1289, + "num_input_tokens_seen": 1447088, + "step": 2015 + }, + { + "epoch": 4.1995841995842, + "grad_norm": 0.5209385752677917, + "learning_rate": 4.9686345015657535e-05, + "loss": 0.1589, + "num_input_tokens_seen": 1450960, + "step": 2020 + }, + { + "epoch": 4.20997920997921, + "grad_norm": 0.2699461579322815, + "learning_rate": 4.968479285234538e-05, + "loss": 0.1716, + "num_input_tokens_seen": 1454480, + "step": 2025 + }, + { + "epoch": 4.220374220374221, + "grad_norm": 0.19213557243347168, + "learning_rate": 4.9683236882327974e-05, + "loss": 0.1255, + "num_input_tokens_seen": 1458096, + "step": 2030 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 0.24746453762054443, + "learning_rate": 4.968167710584526e-05, + "loss": 0.1256, + "num_input_tokens_seen": 1461584, + "step": 2035 + }, + { + "epoch": 4.241164241164241, + "grad_norm": 0.21941407024860382, + "learning_rate": 4.968011352313775e-05, + "loss": 0.1292, + "num_input_tokens_seen": 1465104, + "step": 2040 + }, + { + "epoch": 4.251559251559252, + "grad_norm": 0.2193700522184372, + "learning_rate": 4.967854613444659e-05, + "loss": 0.183, + "num_input_tokens_seen": 1468592, + "step": 2045 + }, + { + "epoch": 4.261954261954262, + "grad_norm": 0.205315500497818, + "learning_rate": 4.967697494001349e-05, + "loss": 0.1591, + "num_input_tokens_seen": 1472272, + "step": 2050 + }, + { + "epoch": 4.272349272349272, + "grad_norm": 0.43559083342552185, + "learning_rate": 4.9675399940080736e-05, + "loss": 0.1711, + "num_input_tokens_seen": 1475792, + "step": 2055 + }, + { + "epoch": 4.282744282744282, + "grad_norm": 0.5556605458259583, + "learning_rate": 4.9673821134891226e-05, + "loss": 0.1602, + "num_input_tokens_seen": 1479248, + "step": 2060 + }, + { + "epoch": 4.293139293139293, + "grad_norm": 0.16564710438251495, + "learning_rate": 4.967223852468842e-05, + "loss": 0.1693, + "num_input_tokens_seen": 1482768, + "step": 2065 + }, + { + "epoch": 4.303534303534303, + "grad_norm": 0.5886558294296265, + "learning_rate": 4.967065210971639e-05, + "loss": 0.1469, + "num_input_tokens_seen": 1486352, + "step": 2070 + }, + { + "epoch": 4.313929313929314, + "grad_norm": 0.3512877821922302, + "learning_rate": 4.966906189021977e-05, + "loss": 0.18, + "num_input_tokens_seen": 1490000, + "step": 2075 + }, + { + "epoch": 4.324324324324325, + "grad_norm": 0.13137614727020264, + "learning_rate": 4.966746786644379e-05, + "loss": 0.1621, + "num_input_tokens_seen": 1493552, + "step": 2080 + }, + { + "epoch": 4.334719334719335, + "grad_norm": 0.21210062503814697, + "learning_rate": 4.966587003863429e-05, + "loss": 0.1511, + "num_input_tokens_seen": 1497040, + "step": 2085 + }, + { + "epoch": 4.345114345114345, + "grad_norm": 0.19431081414222717, + "learning_rate": 4.966426840703765e-05, + "loss": 0.1698, + "num_input_tokens_seen": 1500528, + "step": 2090 + }, + { + "epoch": 4.355509355509356, + "grad_norm": 0.21403177082538605, + "learning_rate": 4.9662662971900875e-05, + "loss": 0.1899, + "num_input_tokens_seen": 1504016, + "step": 2095 + }, + { + "epoch": 4.365904365904366, + "grad_norm": 0.47135546803474426, + "learning_rate": 4.9661053733471534e-05, + "loss": 0.1786, + "num_input_tokens_seen": 1507440, + "step": 2100 + }, + { + "epoch": 4.376299376299376, + "grad_norm": 0.43856579065322876, + "learning_rate": 4.965944069199781e-05, + "loss": 0.1618, + "num_input_tokens_seen": 1511056, + "step": 2105 + }, + { + "epoch": 4.386694386694387, + "grad_norm": 0.245207279920578, + "learning_rate": 4.965782384772842e-05, + "loss": 0.1932, + "num_input_tokens_seen": 1514768, + "step": 2110 + }, + { + "epoch": 4.397089397089397, + "grad_norm": 0.3389219343662262, + "learning_rate": 4.9656203200912734e-05, + "loss": 0.1467, + "num_input_tokens_seen": 1518384, + "step": 2115 + }, + { + "epoch": 4.407484407484407, + "grad_norm": 0.29376259446144104, + "learning_rate": 4.965457875180067e-05, + "loss": 0.1983, + "num_input_tokens_seen": 1522032, + "step": 2120 + }, + { + "epoch": 4.417879417879418, + "grad_norm": 0.23610325157642365, + "learning_rate": 4.9652950500642724e-05, + "loss": 0.1539, + "num_input_tokens_seen": 1525616, + "step": 2125 + }, + { + "epoch": 4.428274428274428, + "grad_norm": 0.31709957122802734, + "learning_rate": 4.965131844769001e-05, + "loss": 0.1392, + "num_input_tokens_seen": 1529168, + "step": 2130 + }, + { + "epoch": 4.4386694386694385, + "grad_norm": 0.22266900539398193, + "learning_rate": 4.96496825931942e-05, + "loss": 0.165, + "num_input_tokens_seen": 1532880, + "step": 2135 + }, + { + "epoch": 4.4490644490644495, + "grad_norm": 0.3742765486240387, + "learning_rate": 4.9648042937407566e-05, + "loss": 0.1536, + "num_input_tokens_seen": 1536560, + "step": 2140 + }, + { + "epoch": 4.45945945945946, + "grad_norm": 0.4627426564693451, + "learning_rate": 4.964639948058297e-05, + "loss": 0.1404, + "num_input_tokens_seen": 1540304, + "step": 2145 + }, + { + "epoch": 4.46985446985447, + "grad_norm": 0.6231946349143982, + "learning_rate": 4.9644752222973846e-05, + "loss": 0.1767, + "num_input_tokens_seen": 1544048, + "step": 2150 + }, + { + "epoch": 4.48024948024948, + "grad_norm": 0.20790085196495056, + "learning_rate": 4.964310116483422e-05, + "loss": 0.1503, + "num_input_tokens_seen": 1547600, + "step": 2155 + }, + { + "epoch": 4.490644490644491, + "grad_norm": 0.3657433092594147, + "learning_rate": 4.964144630641872e-05, + "loss": 0.1494, + "num_input_tokens_seen": 1551152, + "step": 2160 + }, + { + "epoch": 4.501039501039501, + "grad_norm": 0.3910110890865326, + "learning_rate": 4.9639787647982525e-05, + "loss": 0.1459, + "num_input_tokens_seen": 1554704, + "step": 2165 + }, + { + "epoch": 4.511434511434511, + "grad_norm": 0.19937026500701904, + "learning_rate": 4.963812518978143e-05, + "loss": 0.1667, + "num_input_tokens_seen": 1558256, + "step": 2170 + }, + { + "epoch": 4.521829521829522, + "grad_norm": 0.29105111956596375, + "learning_rate": 4.963645893207182e-05, + "loss": 0.1526, + "num_input_tokens_seen": 1561744, + "step": 2175 + }, + { + "epoch": 4.532224532224532, + "grad_norm": 0.7502774596214294, + "learning_rate": 4.963478887511063e-05, + "loss": 0.1799, + "num_input_tokens_seen": 1565360, + "step": 2180 + }, + { + "epoch": 4.542619542619542, + "grad_norm": 0.3398224711418152, + "learning_rate": 4.963311501915542e-05, + "loss": 0.1329, + "num_input_tokens_seen": 1568944, + "step": 2185 + }, + { + "epoch": 4.553014553014553, + "grad_norm": 0.3443964123725891, + "learning_rate": 4.963143736446432e-05, + "loss": 0.1522, + "num_input_tokens_seen": 1572400, + "step": 2190 + }, + { + "epoch": 4.5634095634095635, + "grad_norm": 0.20074212551116943, + "learning_rate": 4.962975591129603e-05, + "loss": 0.1731, + "num_input_tokens_seen": 1575888, + "step": 2195 + }, + { + "epoch": 4.573804573804574, + "grad_norm": 0.2928250730037689, + "learning_rate": 4.962807065990986e-05, + "loss": 0.115, + "num_input_tokens_seen": 1579408, + "step": 2200 + }, + { + "epoch": 4.573804573804574, + "eval_loss": 0.17297959327697754, + "eval_runtime": 7.7621, + "eval_samples_per_second": 110.28, + "eval_steps_per_second": 27.57, + "num_input_tokens_seen": 1579408, + "step": 2200 + }, + { + "epoch": 4.584199584199585, + "grad_norm": 0.24388423562049866, + "learning_rate": 4.9626381610565714e-05, + "loss": 0.1773, + "num_input_tokens_seen": 1582992, + "step": 2205 + }, + { + "epoch": 4.594594594594595, + "grad_norm": 0.30875271558761597, + "learning_rate": 4.9624688763524043e-05, + "loss": 0.173, + "num_input_tokens_seen": 1586608, + "step": 2210 + }, + { + "epoch": 4.604989604989605, + "grad_norm": 0.46936655044555664, + "learning_rate": 4.962299211904591e-05, + "loss": 0.1602, + "num_input_tokens_seen": 1590064, + "step": 2215 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 0.29174891114234924, + "learning_rate": 4.962129167739296e-05, + "loss": 0.1851, + "num_input_tokens_seen": 1593488, + "step": 2220 + }, + { + "epoch": 4.625779625779626, + "grad_norm": 0.22240759432315826, + "learning_rate": 4.961958743882742e-05, + "loss": 0.1495, + "num_input_tokens_seen": 1597008, + "step": 2225 + }, + { + "epoch": 4.636174636174636, + "grad_norm": 0.471366822719574, + "learning_rate": 4.961787940361211e-05, + "loss": 0.1703, + "num_input_tokens_seen": 1600560, + "step": 2230 + }, + { + "epoch": 4.646569646569646, + "grad_norm": 0.8816280961036682, + "learning_rate": 4.961616757201043e-05, + "loss": 0.2709, + "num_input_tokens_seen": 1604176, + "step": 2235 + }, + { + "epoch": 4.656964656964657, + "grad_norm": 0.2968365252017975, + "learning_rate": 4.961445194428637e-05, + "loss": 0.1398, + "num_input_tokens_seen": 1607568, + "step": 2240 + }, + { + "epoch": 4.667359667359667, + "grad_norm": 0.2635088264942169, + "learning_rate": 4.9612732520704486e-05, + "loss": 0.1752, + "num_input_tokens_seen": 1611120, + "step": 2245 + }, + { + "epoch": 4.6777546777546775, + "grad_norm": 0.19004106521606445, + "learning_rate": 4.961100930152994e-05, + "loss": 0.1837, + "num_input_tokens_seen": 1614864, + "step": 2250 + }, + { + "epoch": 4.6881496881496885, + "grad_norm": 0.4382178783416748, + "learning_rate": 4.960928228702849e-05, + "loss": 0.1916, + "num_input_tokens_seen": 1618544, + "step": 2255 + }, + { + "epoch": 4.698544698544699, + "grad_norm": 0.389707088470459, + "learning_rate": 4.960755147746645e-05, + "loss": 0.1372, + "num_input_tokens_seen": 1622224, + "step": 2260 + }, + { + "epoch": 4.708939708939709, + "grad_norm": 0.31058669090270996, + "learning_rate": 4.9605816873110736e-05, + "loss": 0.1823, + "num_input_tokens_seen": 1625872, + "step": 2265 + }, + { + "epoch": 4.71933471933472, + "grad_norm": 0.19509156048297882, + "learning_rate": 4.960407847422883e-05, + "loss": 0.157, + "num_input_tokens_seen": 1629616, + "step": 2270 + }, + { + "epoch": 4.72972972972973, + "grad_norm": 0.2015274465084076, + "learning_rate": 4.960233628108885e-05, + "loss": 0.1764, + "num_input_tokens_seen": 1633168, + "step": 2275 + }, + { + "epoch": 4.74012474012474, + "grad_norm": 0.20342974364757538, + "learning_rate": 4.960059029395942e-05, + "loss": 0.1781, + "num_input_tokens_seen": 1636816, + "step": 2280 + }, + { + "epoch": 4.75051975051975, + "grad_norm": 0.18851400911808014, + "learning_rate": 4.959884051310983e-05, + "loss": 0.1475, + "num_input_tokens_seen": 1640368, + "step": 2285 + }, + { + "epoch": 4.760914760914761, + "grad_norm": 0.31543904542922974, + "learning_rate": 4.959708693880991e-05, + "loss": 0.1349, + "num_input_tokens_seen": 1643920, + "step": 2290 + }, + { + "epoch": 4.771309771309771, + "grad_norm": 0.5470570921897888, + "learning_rate": 4.9595329571330074e-05, + "loss": 0.1832, + "num_input_tokens_seen": 1647664, + "step": 2295 + }, + { + "epoch": 4.781704781704782, + "grad_norm": 0.29796791076660156, + "learning_rate": 4.9593568410941326e-05, + "loss": 0.1526, + "num_input_tokens_seen": 1651216, + "step": 2300 + }, + { + "epoch": 4.792099792099792, + "grad_norm": 0.24486419558525085, + "learning_rate": 4.959180345791528e-05, + "loss": 0.1623, + "num_input_tokens_seen": 1654864, + "step": 2305 + }, + { + "epoch": 4.802494802494802, + "grad_norm": 0.422421932220459, + "learning_rate": 4.9590034712524086e-05, + "loss": 0.1526, + "num_input_tokens_seen": 1658416, + "step": 2310 + }, + { + "epoch": 4.8128898128898125, + "grad_norm": 0.41836655139923096, + "learning_rate": 4.958826217504053e-05, + "loss": 0.1441, + "num_input_tokens_seen": 1661936, + "step": 2315 + }, + { + "epoch": 4.8232848232848236, + "grad_norm": 0.20271450281143188, + "learning_rate": 4.958648584573795e-05, + "loss": 0.138, + "num_input_tokens_seen": 1665552, + "step": 2320 + }, + { + "epoch": 4.833679833679834, + "grad_norm": 0.17724543809890747, + "learning_rate": 4.958470572489028e-05, + "loss": 0.1621, + "num_input_tokens_seen": 1669264, + "step": 2325 + }, + { + "epoch": 4.844074844074844, + "grad_norm": 0.23096700012683868, + "learning_rate": 4.958292181277203e-05, + "loss": 0.2133, + "num_input_tokens_seen": 1672848, + "step": 2330 + }, + { + "epoch": 4.854469854469855, + "grad_norm": 0.4603653848171234, + "learning_rate": 4.958113410965832e-05, + "loss": 0.1752, + "num_input_tokens_seen": 1676432, + "step": 2335 + }, + { + "epoch": 4.864864864864865, + "grad_norm": 0.24609342217445374, + "learning_rate": 4.957934261582481e-05, + "loss": 0.1466, + "num_input_tokens_seen": 1679952, + "step": 2340 + }, + { + "epoch": 4.875259875259875, + "grad_norm": 0.2397037148475647, + "learning_rate": 4.95775473315478e-05, + "loss": 0.1562, + "num_input_tokens_seen": 1683600, + "step": 2345 + }, + { + "epoch": 4.885654885654886, + "grad_norm": 0.2602541148662567, + "learning_rate": 4.9575748257104124e-05, + "loss": 0.1905, + "num_input_tokens_seen": 1687152, + "step": 2350 + }, + { + "epoch": 4.896049896049896, + "grad_norm": 0.49497923254966736, + "learning_rate": 4.9573945392771224e-05, + "loss": 0.1341, + "num_input_tokens_seen": 1690800, + "step": 2355 + }, + { + "epoch": 4.906444906444906, + "grad_norm": 0.33882054686546326, + "learning_rate": 4.9572138738827134e-05, + "loss": 0.1621, + "num_input_tokens_seen": 1694384, + "step": 2360 + }, + { + "epoch": 4.916839916839917, + "grad_norm": 0.2720268964767456, + "learning_rate": 4.957032829555046e-05, + "loss": 0.1674, + "num_input_tokens_seen": 1698032, + "step": 2365 + }, + { + "epoch": 4.927234927234927, + "grad_norm": 0.19673307240009308, + "learning_rate": 4.956851406322039e-05, + "loss": 0.1842, + "num_input_tokens_seen": 1701648, + "step": 2370 + }, + { + "epoch": 4.9376299376299375, + "grad_norm": 0.30237212777137756, + "learning_rate": 4.9566696042116704e-05, + "loss": 0.1697, + "num_input_tokens_seen": 1705232, + "step": 2375 + }, + { + "epoch": 4.948024948024948, + "grad_norm": 0.29885709285736084, + "learning_rate": 4.9564874232519766e-05, + "loss": 0.1791, + "num_input_tokens_seen": 1708784, + "step": 2380 + }, + { + "epoch": 4.958419958419959, + "grad_norm": 0.23497964441776276, + "learning_rate": 4.9563048634710516e-05, + "loss": 0.1483, + "num_input_tokens_seen": 1712304, + "step": 2385 + }, + { + "epoch": 4.968814968814969, + "grad_norm": 0.530971109867096, + "learning_rate": 4.956121924897049e-05, + "loss": 0.1414, + "num_input_tokens_seen": 1715856, + "step": 2390 + }, + { + "epoch": 4.979209979209979, + "grad_norm": 0.37217479944229126, + "learning_rate": 4.955938607558181e-05, + "loss": 0.1631, + "num_input_tokens_seen": 1719376, + "step": 2395 + }, + { + "epoch": 4.98960498960499, + "grad_norm": 0.3800240755081177, + "learning_rate": 4.955754911482715e-05, + "loss": 0.1229, + "num_input_tokens_seen": 1723056, + "step": 2400 + }, + { + "epoch": 4.98960498960499, + "eval_loss": 0.17434626817703247, + "eval_runtime": 7.7636, + "eval_samples_per_second": 110.259, + "eval_steps_per_second": 27.565, + "num_input_tokens_seen": 1723056, + "step": 2400 + }, + { + "epoch": 5.0, + "grad_norm": 0.2876644432544708, + "learning_rate": 4.9555708366989804e-05, + "loss": 0.1064, + "num_input_tokens_seen": 1726600, + "step": 2405 + }, + { + "epoch": 5.01039501039501, + "grad_norm": 0.2764156460762024, + "learning_rate": 4.9553863832353655e-05, + "loss": 0.2006, + "num_input_tokens_seen": 1730216, + "step": 2410 + }, + { + "epoch": 5.020790020790021, + "grad_norm": 0.5851589441299438, + "learning_rate": 4.955201551120313e-05, + "loss": 0.1489, + "num_input_tokens_seen": 1733672, + "step": 2415 + }, + { + "epoch": 5.031185031185031, + "grad_norm": 0.30559319257736206, + "learning_rate": 4.955016340382328e-05, + "loss": 0.2023, + "num_input_tokens_seen": 1737320, + "step": 2420 + }, + { + "epoch": 5.041580041580041, + "grad_norm": 0.2353772521018982, + "learning_rate": 4.954830751049972e-05, + "loss": 0.1826, + "num_input_tokens_seen": 1741160, + "step": 2425 + }, + { + "epoch": 5.051975051975052, + "grad_norm": 0.19742389023303986, + "learning_rate": 4.954644783151864e-05, + "loss": 0.1701, + "num_input_tokens_seen": 1744936, + "step": 2430 + }, + { + "epoch": 5.0623700623700625, + "grad_norm": 0.21806704998016357, + "learning_rate": 4.954458436716684e-05, + "loss": 0.1628, + "num_input_tokens_seen": 1748616, + "step": 2435 + }, + { + "epoch": 5.072765072765073, + "grad_norm": 0.47250890731811523, + "learning_rate": 4.954271711773168e-05, + "loss": 0.145, + "num_input_tokens_seen": 1752232, + "step": 2440 + }, + { + "epoch": 5.083160083160083, + "grad_norm": 0.32477620244026184, + "learning_rate": 4.9540846083501115e-05, + "loss": 0.1367, + "num_input_tokens_seen": 1755944, + "step": 2445 + }, + { + "epoch": 5.093555093555094, + "grad_norm": 0.24464881420135498, + "learning_rate": 4.953897126476369e-05, + "loss": 0.1656, + "num_input_tokens_seen": 1759464, + "step": 2450 + }, + { + "epoch": 5.103950103950104, + "grad_norm": 0.20243127644062042, + "learning_rate": 4.9537092661808514e-05, + "loss": 0.153, + "num_input_tokens_seen": 1762952, + "step": 2455 + }, + { + "epoch": 5.114345114345114, + "grad_norm": 0.2404347062110901, + "learning_rate": 4.9535210274925306e-05, + "loss": 0.1808, + "num_input_tokens_seen": 1766664, + "step": 2460 + }, + { + "epoch": 5.124740124740125, + "grad_norm": 0.39892804622650146, + "learning_rate": 4.953332410440435e-05, + "loss": 0.1742, + "num_input_tokens_seen": 1770440, + "step": 2465 + }, + { + "epoch": 5.135135135135135, + "grad_norm": 0.21573498845100403, + "learning_rate": 4.9531434150536496e-05, + "loss": 0.2242, + "num_input_tokens_seen": 1773992, + "step": 2470 + }, + { + "epoch": 5.145530145530145, + "grad_norm": 0.45683255791664124, + "learning_rate": 4.952954041361322e-05, + "loss": 0.1635, + "num_input_tokens_seen": 1777544, + "step": 2475 + }, + { + "epoch": 5.155925155925156, + "grad_norm": 0.18466134369373322, + "learning_rate": 4.952764289392655e-05, + "loss": 0.1414, + "num_input_tokens_seen": 1781128, + "step": 2480 + }, + { + "epoch": 5.166320166320166, + "grad_norm": 0.22549167275428772, + "learning_rate": 4.952574159176912e-05, + "loss": 0.1277, + "num_input_tokens_seen": 1784520, + "step": 2485 + }, + { + "epoch": 5.1767151767151764, + "grad_norm": 0.3385047912597656, + "learning_rate": 4.952383650743413e-05, + "loss": 0.168, + "num_input_tokens_seen": 1788232, + "step": 2490 + }, + { + "epoch": 5.1871101871101875, + "grad_norm": 0.5532922148704529, + "learning_rate": 4.952192764121536e-05, + "loss": 0.1716, + "num_input_tokens_seen": 1791848, + "step": 2495 + }, + { + "epoch": 5.197505197505198, + "grad_norm": 0.31314823031425476, + "learning_rate": 4.9520014993407185e-05, + "loss": 0.1403, + "num_input_tokens_seen": 1795336, + "step": 2500 + }, + { + "epoch": 5.207900207900208, + "grad_norm": 0.23620925843715668, + "learning_rate": 4.951809856430456e-05, + "loss": 0.1713, + "num_input_tokens_seen": 1798856, + "step": 2505 + }, + { + "epoch": 5.218295218295219, + "grad_norm": 0.36012592911720276, + "learning_rate": 4.951617835420303e-05, + "loss": 0.1619, + "num_input_tokens_seen": 1802408, + "step": 2510 + }, + { + "epoch": 5.228690228690229, + "grad_norm": 0.4686308801174164, + "learning_rate": 4.951425436339869e-05, + "loss": 0.1837, + "num_input_tokens_seen": 1805960, + "step": 2515 + }, + { + "epoch": 5.239085239085239, + "grad_norm": 0.49547290802001953, + "learning_rate": 4.9512326592188274e-05, + "loss": 0.1666, + "num_input_tokens_seen": 1809416, + "step": 2520 + }, + { + "epoch": 5.24948024948025, + "grad_norm": 0.36740583181381226, + "learning_rate": 4.9510395040869054e-05, + "loss": 0.1419, + "num_input_tokens_seen": 1812872, + "step": 2525 + }, + { + "epoch": 5.25987525987526, + "grad_norm": 0.3465159833431244, + "learning_rate": 4.9508459709738905e-05, + "loss": 0.1132, + "num_input_tokens_seen": 1816328, + "step": 2530 + }, + { + "epoch": 5.27027027027027, + "grad_norm": 0.12562525272369385, + "learning_rate": 4.950652059909627e-05, + "loss": 0.1386, + "num_input_tokens_seen": 1819848, + "step": 2535 + }, + { + "epoch": 5.28066528066528, + "grad_norm": 0.3520587086677551, + "learning_rate": 4.95045777092402e-05, + "loss": 0.1875, + "num_input_tokens_seen": 1823464, + "step": 2540 + }, + { + "epoch": 5.291060291060291, + "grad_norm": 0.6623819470405579, + "learning_rate": 4.950263104047031e-05, + "loss": 0.1754, + "num_input_tokens_seen": 1827016, + "step": 2545 + }, + { + "epoch": 5.301455301455301, + "grad_norm": 0.3305163085460663, + "learning_rate": 4.9500680593086775e-05, + "loss": 0.1561, + "num_input_tokens_seen": 1830920, + "step": 2550 + }, + { + "epoch": 5.3118503118503115, + "grad_norm": 0.20909249782562256, + "learning_rate": 4.94987263673904e-05, + "loss": 0.1417, + "num_input_tokens_seen": 1834632, + "step": 2555 + }, + { + "epoch": 5.3222453222453225, + "grad_norm": 0.2720585763454437, + "learning_rate": 4.949676836368256e-05, + "loss": 0.1477, + "num_input_tokens_seen": 1838024, + "step": 2560 + }, + { + "epoch": 5.332640332640333, + "grad_norm": 0.41749000549316406, + "learning_rate": 4.949480658226518e-05, + "loss": 0.1092, + "num_input_tokens_seen": 1841576, + "step": 2565 + }, + { + "epoch": 5.343035343035343, + "grad_norm": 0.22875700891017914, + "learning_rate": 4.949284102344082e-05, + "loss": 0.1641, + "num_input_tokens_seen": 1845160, + "step": 2570 + }, + { + "epoch": 5.353430353430354, + "grad_norm": 0.40476658940315247, + "learning_rate": 4.9490871687512565e-05, + "loss": 0.1382, + "num_input_tokens_seen": 1848616, + "step": 2575 + }, + { + "epoch": 5.363825363825364, + "grad_norm": 0.7360061407089233, + "learning_rate": 4.948889857478413e-05, + "loss": 0.1809, + "num_input_tokens_seen": 1852136, + "step": 2580 + }, + { + "epoch": 5.374220374220374, + "grad_norm": 0.35413387417793274, + "learning_rate": 4.948692168555978e-05, + "loss": 0.1634, + "num_input_tokens_seen": 1855784, + "step": 2585 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 0.1633775681257248, + "learning_rate": 4.94849410201444e-05, + "loss": 0.1426, + "num_input_tokens_seen": 1859368, + "step": 2590 + }, + { + "epoch": 5.395010395010395, + "grad_norm": 0.2676032483577728, + "learning_rate": 4.948295657884341e-05, + "loss": 0.1711, + "num_input_tokens_seen": 1862984, + "step": 2595 + }, + { + "epoch": 5.405405405405405, + "grad_norm": 0.48169419169425964, + "learning_rate": 4.9480968361962835e-05, + "loss": 0.1039, + "num_input_tokens_seen": 1866504, + "step": 2600 + }, + { + "epoch": 5.405405405405405, + "eval_loss": 0.16545061767101288, + "eval_runtime": 7.7792, + "eval_samples_per_second": 110.037, + "eval_steps_per_second": 27.509, + "num_input_tokens_seen": 1866504, + "step": 2600 + }, + { + "epoch": 5.415800415800415, + "grad_norm": 0.2698635458946228, + "learning_rate": 4.9478976369809305e-05, + "loss": 0.1528, + "num_input_tokens_seen": 1870024, + "step": 2605 + }, + { + "epoch": 5.426195426195426, + "grad_norm": 0.47878125309944153, + "learning_rate": 4.947698060268999e-05, + "loss": 0.1732, + "num_input_tokens_seen": 1873704, + "step": 2610 + }, + { + "epoch": 5.4365904365904365, + "grad_norm": 0.16066096723079681, + "learning_rate": 4.9474981060912665e-05, + "loss": 0.1834, + "num_input_tokens_seen": 1877480, + "step": 2615 + }, + { + "epoch": 5.446985446985447, + "grad_norm": 0.21050839126110077, + "learning_rate": 4.94729777447857e-05, + "loss": 0.1795, + "num_input_tokens_seen": 1881160, + "step": 2620 + }, + { + "epoch": 5.457380457380458, + "grad_norm": 0.42417261004447937, + "learning_rate": 4.947097065461801e-05, + "loss": 0.1405, + "num_input_tokens_seen": 1884680, + "step": 2625 + }, + { + "epoch": 5.467775467775468, + "grad_norm": 0.7349763512611389, + "learning_rate": 4.9468959790719125e-05, + "loss": 0.1507, + "num_input_tokens_seen": 1888200, + "step": 2630 + }, + { + "epoch": 5.478170478170478, + "grad_norm": 0.19738374650478363, + "learning_rate": 4.9466945153399146e-05, + "loss": 0.1453, + "num_input_tokens_seen": 1891720, + "step": 2635 + }, + { + "epoch": 5.488565488565489, + "grad_norm": 0.2536933124065399, + "learning_rate": 4.9464926742968755e-05, + "loss": 0.1324, + "num_input_tokens_seen": 1895336, + "step": 2640 + }, + { + "epoch": 5.498960498960499, + "grad_norm": 0.22575455904006958, + "learning_rate": 4.946290455973921e-05, + "loss": 0.1303, + "num_input_tokens_seen": 1899112, + "step": 2645 + }, + { + "epoch": 5.509355509355509, + "grad_norm": 0.3608172535896301, + "learning_rate": 4.9460878604022365e-05, + "loss": 0.1409, + "num_input_tokens_seen": 1902952, + "step": 2650 + }, + { + "epoch": 5.51975051975052, + "grad_norm": 0.3506828844547272, + "learning_rate": 4.945884887613065e-05, + "loss": 0.1573, + "num_input_tokens_seen": 1906568, + "step": 2655 + }, + { + "epoch": 5.53014553014553, + "grad_norm": 0.29418477416038513, + "learning_rate": 4.9456815376377055e-05, + "loss": 0.1518, + "num_input_tokens_seen": 1910312, + "step": 2660 + }, + { + "epoch": 5.54054054054054, + "grad_norm": 0.5495036840438843, + "learning_rate": 4.9454778105075195e-05, + "loss": 0.139, + "num_input_tokens_seen": 1913896, + "step": 2665 + }, + { + "epoch": 5.5509355509355505, + "grad_norm": 0.30390429496765137, + "learning_rate": 4.945273706253924e-05, + "loss": 0.1677, + "num_input_tokens_seen": 1917416, + "step": 2670 + }, + { + "epoch": 5.5613305613305615, + "grad_norm": 0.16203831136226654, + "learning_rate": 4.9450692249083925e-05, + "loss": 0.1436, + "num_input_tokens_seen": 1921032, + "step": 2675 + }, + { + "epoch": 5.571725571725572, + "grad_norm": 0.2603548467159271, + "learning_rate": 4.9448643665024605e-05, + "loss": 0.1554, + "num_input_tokens_seen": 1924520, + "step": 2680 + }, + { + "epoch": 5.582120582120583, + "grad_norm": 0.19187791645526886, + "learning_rate": 4.944659131067719e-05, + "loss": 0.1078, + "num_input_tokens_seen": 1928040, + "step": 2685 + }, + { + "epoch": 5.592515592515593, + "grad_norm": 0.22277623414993286, + "learning_rate": 4.944453518635818e-05, + "loss": 0.1482, + "num_input_tokens_seen": 1931624, + "step": 2690 + }, + { + "epoch": 5.602910602910603, + "grad_norm": 0.2704801559448242, + "learning_rate": 4.944247529238465e-05, + "loss": 0.1679, + "num_input_tokens_seen": 1935112, + "step": 2695 + }, + { + "epoch": 5.613305613305613, + "grad_norm": 0.33046892285346985, + "learning_rate": 4.944041162907427e-05, + "loss": 0.1632, + "num_input_tokens_seen": 1938792, + "step": 2700 + }, + { + "epoch": 5.623700623700624, + "grad_norm": 0.39902040362358093, + "learning_rate": 4.943834419674529e-05, + "loss": 0.147, + "num_input_tokens_seen": 1942248, + "step": 2705 + }, + { + "epoch": 5.634095634095634, + "grad_norm": 0.4181129038333893, + "learning_rate": 4.9436272995716506e-05, + "loss": 0.159, + "num_input_tokens_seen": 1945672, + "step": 2710 + }, + { + "epoch": 5.644490644490644, + "grad_norm": 0.35147058963775635, + "learning_rate": 4.943419802630735e-05, + "loss": 0.1571, + "num_input_tokens_seen": 1949224, + "step": 2715 + }, + { + "epoch": 5.654885654885655, + "grad_norm": 0.38561272621154785, + "learning_rate": 4.94321192888378e-05, + "loss": 0.1762, + "num_input_tokens_seen": 1952840, + "step": 2720 + }, + { + "epoch": 5.665280665280665, + "grad_norm": 0.15730181336402893, + "learning_rate": 4.943003678362842e-05, + "loss": 0.1514, + "num_input_tokens_seen": 1956552, + "step": 2725 + }, + { + "epoch": 5.675675675675675, + "grad_norm": 0.4665904939174652, + "learning_rate": 4.942795051100036e-05, + "loss": 0.1906, + "num_input_tokens_seen": 1960104, + "step": 2730 + }, + { + "epoch": 5.686070686070686, + "grad_norm": 0.23538681864738464, + "learning_rate": 4.942586047127536e-05, + "loss": 0.1172, + "num_input_tokens_seen": 1963656, + "step": 2735 + }, + { + "epoch": 5.696465696465697, + "grad_norm": 0.3907185196876526, + "learning_rate": 4.942376666477571e-05, + "loss": 0.1681, + "num_input_tokens_seen": 1967336, + "step": 2740 + }, + { + "epoch": 5.706860706860707, + "grad_norm": 0.4109727442264557, + "learning_rate": 4.9421669091824304e-05, + "loss": 0.152, + "num_input_tokens_seen": 1970824, + "step": 2745 + }, + { + "epoch": 5.717255717255718, + "grad_norm": 0.31305447220802307, + "learning_rate": 4.9419567752744634e-05, + "loss": 0.1958, + "num_input_tokens_seen": 1974472, + "step": 2750 + }, + { + "epoch": 5.727650727650728, + "grad_norm": 0.16795653104782104, + "learning_rate": 4.941746264786074e-05, + "loss": 0.1433, + "num_input_tokens_seen": 1977992, + "step": 2755 + }, + { + "epoch": 5.738045738045738, + "grad_norm": 0.39670446515083313, + "learning_rate": 4.9415353777497254e-05, + "loss": 0.1779, + "num_input_tokens_seen": 1981448, + "step": 2760 + }, + { + "epoch": 5.748440748440748, + "grad_norm": 0.24629159271717072, + "learning_rate": 4.9413241141979394e-05, + "loss": 0.1511, + "num_input_tokens_seen": 1985032, + "step": 2765 + }, + { + "epoch": 5.758835758835759, + "grad_norm": 0.557577908039093, + "learning_rate": 4.9411124741632956e-05, + "loss": 0.1733, + "num_input_tokens_seen": 1988488, + "step": 2770 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 0.39969465136528015, + "learning_rate": 4.940900457678431e-05, + "loss": 0.177, + "num_input_tokens_seen": 1991944, + "step": 2775 + }, + { + "epoch": 5.779625779625779, + "grad_norm": 0.3218066990375519, + "learning_rate": 4.9406880647760425e-05, + "loss": 0.1749, + "num_input_tokens_seen": 1995368, + "step": 2780 + }, + { + "epoch": 5.79002079002079, + "grad_norm": 0.2002917230129242, + "learning_rate": 4.9404752954888824e-05, + "loss": 0.123, + "num_input_tokens_seen": 1999016, + "step": 2785 + }, + { + "epoch": 5.8004158004158, + "grad_norm": 0.38256219029426575, + "learning_rate": 4.940262149849762e-05, + "loss": 0.1538, + "num_input_tokens_seen": 2002568, + "step": 2790 + }, + { + "epoch": 5.8108108108108105, + "grad_norm": 0.31599026918411255, + "learning_rate": 4.9400486278915526e-05, + "loss": 0.1014, + "num_input_tokens_seen": 2006248, + "step": 2795 + }, + { + "epoch": 5.8212058212058215, + "grad_norm": 0.46947795152664185, + "learning_rate": 4.939834729647181e-05, + "loss": 0.1567, + "num_input_tokens_seen": 2009832, + "step": 2800 + }, + { + "epoch": 5.8212058212058215, + "eval_loss": 0.1636151224374771, + "eval_runtime": 7.7633, + "eval_samples_per_second": 110.263, + "eval_steps_per_second": 27.566, + "num_input_tokens_seen": 2009832, + "step": 2800 + }, + { + "epoch": 5.831600831600832, + "grad_norm": 0.2453927844762802, + "learning_rate": 4.9396204551496326e-05, + "loss": 0.1426, + "num_input_tokens_seen": 2013352, + "step": 2805 + }, + { + "epoch": 5.841995841995842, + "grad_norm": 0.1769685447216034, + "learning_rate": 4.939405804431952e-05, + "loss": 0.1692, + "num_input_tokens_seen": 2017032, + "step": 2810 + }, + { + "epoch": 5.852390852390853, + "grad_norm": 0.22686851024627686, + "learning_rate": 4.9391907775272414e-05, + "loss": 0.16, + "num_input_tokens_seen": 2020616, + "step": 2815 + }, + { + "epoch": 5.862785862785863, + "grad_norm": 0.36280402541160583, + "learning_rate": 4.9389753744686604e-05, + "loss": 0.1365, + "num_input_tokens_seen": 2024136, + "step": 2820 + }, + { + "epoch": 5.873180873180873, + "grad_norm": 0.3744208812713623, + "learning_rate": 4.938759595289426e-05, + "loss": 0.1443, + "num_input_tokens_seen": 2027752, + "step": 2825 + }, + { + "epoch": 5.883575883575883, + "grad_norm": 0.13817498087882996, + "learning_rate": 4.938543440022815e-05, + "loss": 0.1414, + "num_input_tokens_seen": 2031240, + "step": 2830 + }, + { + "epoch": 5.893970893970894, + "grad_norm": 0.40712109208106995, + "learning_rate": 4.938326908702161e-05, + "loss": 0.1692, + "num_input_tokens_seen": 2034856, + "step": 2835 + }, + { + "epoch": 5.904365904365904, + "grad_norm": 0.3736468553543091, + "learning_rate": 4.9381100013608554e-05, + "loss": 0.1931, + "num_input_tokens_seen": 2038664, + "step": 2840 + }, + { + "epoch": 5.914760914760915, + "grad_norm": 0.277558833360672, + "learning_rate": 4.9378927180323485e-05, + "loss": 0.1584, + "num_input_tokens_seen": 2042216, + "step": 2845 + }, + { + "epoch": 5.925155925155925, + "grad_norm": 0.3797983229160309, + "learning_rate": 4.937675058750148e-05, + "loss": 0.1257, + "num_input_tokens_seen": 2045768, + "step": 2850 + }, + { + "epoch": 5.9355509355509355, + "grad_norm": 0.20416311919689178, + "learning_rate": 4.937457023547819e-05, + "loss": 0.1532, + "num_input_tokens_seen": 2049288, + "step": 2855 + }, + { + "epoch": 5.945945945945946, + "grad_norm": 0.2700265645980835, + "learning_rate": 4.9372386124589876e-05, + "loss": 0.1768, + "num_input_tokens_seen": 2052936, + "step": 2860 + }, + { + "epoch": 5.956340956340957, + "grad_norm": 0.38308262825012207, + "learning_rate": 4.937019825517333e-05, + "loss": 0.1553, + "num_input_tokens_seen": 2056616, + "step": 2865 + }, + { + "epoch": 5.966735966735967, + "grad_norm": 0.5040315985679626, + "learning_rate": 4.9368006627565954e-05, + "loss": 0.1515, + "num_input_tokens_seen": 2060168, + "step": 2870 + }, + { + "epoch": 5.977130977130977, + "grad_norm": 0.3872227072715759, + "learning_rate": 4.936581124210573e-05, + "loss": 0.1806, + "num_input_tokens_seen": 2063784, + "step": 2875 + }, + { + "epoch": 5.987525987525988, + "grad_norm": 0.2920781970024109, + "learning_rate": 4.9363612099131216e-05, + "loss": 0.1632, + "num_input_tokens_seen": 2067336, + "step": 2880 + }, + { + "epoch": 5.997920997920998, + "grad_norm": 0.3611353635787964, + "learning_rate": 4.936140919898155e-05, + "loss": 0.1368, + "num_input_tokens_seen": 2070760, + "step": 2885 + }, + { + "epoch": 6.008316008316008, + "grad_norm": 0.5024675130844116, + "learning_rate": 4.9359202541996426e-05, + "loss": 0.1052, + "num_input_tokens_seen": 2074368, + "step": 2890 + }, + { + "epoch": 6.018711018711019, + "grad_norm": 0.20987863838672638, + "learning_rate": 4.935699212851616e-05, + "loss": 0.1425, + "num_input_tokens_seen": 2077856, + "step": 2895 + }, + { + "epoch": 6.029106029106029, + "grad_norm": 0.16517342627048492, + "learning_rate": 4.935477795888162e-05, + "loss": 0.1503, + "num_input_tokens_seen": 2081408, + "step": 2900 + }, + { + "epoch": 6.039501039501039, + "grad_norm": 0.2599283456802368, + "learning_rate": 4.935256003343426e-05, + "loss": 0.1253, + "num_input_tokens_seen": 2084928, + "step": 2905 + }, + { + "epoch": 6.04989604989605, + "grad_norm": 0.4382787048816681, + "learning_rate": 4.93503383525161e-05, + "loss": 0.1277, + "num_input_tokens_seen": 2088384, + "step": 2910 + }, + { + "epoch": 6.0602910602910605, + "grad_norm": 0.44216886162757874, + "learning_rate": 4.934811291646977e-05, + "loss": 0.1546, + "num_input_tokens_seen": 2092096, + "step": 2915 + }, + { + "epoch": 6.070686070686071, + "grad_norm": 0.18915005028247833, + "learning_rate": 4.934588372563845e-05, + "loss": 0.1579, + "num_input_tokens_seen": 2095552, + "step": 2920 + }, + { + "epoch": 6.081081081081081, + "grad_norm": 0.18479503691196442, + "learning_rate": 4.93436507803659e-05, + "loss": 0.1308, + "num_input_tokens_seen": 2098976, + "step": 2925 + }, + { + "epoch": 6.091476091476092, + "grad_norm": 0.6110181212425232, + "learning_rate": 4.934141408099649e-05, + "loss": 0.1856, + "num_input_tokens_seen": 2102592, + "step": 2930 + }, + { + "epoch": 6.101871101871102, + "grad_norm": 0.5055030584335327, + "learning_rate": 4.9339173627875135e-05, + "loss": 0.1748, + "num_input_tokens_seen": 2106112, + "step": 2935 + }, + { + "epoch": 6.112266112266112, + "grad_norm": 0.19557233154773712, + "learning_rate": 4.9336929421347335e-05, + "loss": 0.1496, + "num_input_tokens_seen": 2110016, + "step": 2940 + }, + { + "epoch": 6.122661122661123, + "grad_norm": 0.28695574402809143, + "learning_rate": 4.933468146175918e-05, + "loss": 0.1223, + "num_input_tokens_seen": 2113792, + "step": 2945 + }, + { + "epoch": 6.133056133056133, + "grad_norm": 0.3579184412956238, + "learning_rate": 4.933242974945734e-05, + "loss": 0.1434, + "num_input_tokens_seen": 2117376, + "step": 2950 + }, + { + "epoch": 6.143451143451143, + "grad_norm": 0.2899359464645386, + "learning_rate": 4.933017428478906e-05, + "loss": 0.1717, + "num_input_tokens_seen": 2121120, + "step": 2955 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 0.6582514643669128, + "learning_rate": 4.932791506810214e-05, + "loss": 0.1499, + "num_input_tokens_seen": 2124608, + "step": 2960 + }, + { + "epoch": 6.164241164241164, + "grad_norm": 0.5584684610366821, + "learning_rate": 4.932565209974499e-05, + "loss": 0.2123, + "num_input_tokens_seen": 2128384, + "step": 2965 + }, + { + "epoch": 6.174636174636174, + "grad_norm": 0.3006340265274048, + "learning_rate": 4.93233853800666e-05, + "loss": 0.1467, + "num_input_tokens_seen": 2132032, + "step": 2970 + }, + { + "epoch": 6.185031185031185, + "grad_norm": 0.34513118863105774, + "learning_rate": 4.932111490941651e-05, + "loss": 0.1545, + "num_input_tokens_seen": 2135520, + "step": 2975 + }, + { + "epoch": 6.1954261954261955, + "grad_norm": 0.39564791321754456, + "learning_rate": 4.9318840688144876e-05, + "loss": 0.1604, + "num_input_tokens_seen": 2139168, + "step": 2980 + }, + { + "epoch": 6.205821205821206, + "grad_norm": 0.21965065598487854, + "learning_rate": 4.9316562716602387e-05, + "loss": 0.1504, + "num_input_tokens_seen": 2142688, + "step": 2985 + }, + { + "epoch": 6.216216216216216, + "grad_norm": 0.5062464475631714, + "learning_rate": 4.9314280995140346e-05, + "loss": 0.1517, + "num_input_tokens_seen": 2146400, + "step": 2990 + }, + { + "epoch": 6.226611226611227, + "grad_norm": 0.22062727808952332, + "learning_rate": 4.931199552411063e-05, + "loss": 0.1663, + "num_input_tokens_seen": 2149856, + "step": 2995 + }, + { + "epoch": 6.237006237006237, + "grad_norm": 0.2140887975692749, + "learning_rate": 4.930970630386568e-05, + "loss": 0.1797, + "num_input_tokens_seen": 2153504, + "step": 3000 + }, + { + "epoch": 6.237006237006237, + "eval_loss": 0.16414405405521393, + "eval_runtime": 7.767, + "eval_samples_per_second": 110.209, + "eval_steps_per_second": 27.552, + "num_input_tokens_seen": 2153504, + "step": 3000 + }, + { + "epoch": 6.247401247401247, + "grad_norm": 0.23492011427879333, + "learning_rate": 4.9307413334758524e-05, + "loss": 0.1327, + "num_input_tokens_seen": 2157120, + "step": 3005 + }, + { + "epoch": 6.257796257796258, + "grad_norm": 0.16624701023101807, + "learning_rate": 4.930511661714276e-05, + "loss": 0.1816, + "num_input_tokens_seen": 2160736, + "step": 3010 + }, + { + "epoch": 6.268191268191268, + "grad_norm": 0.20580601692199707, + "learning_rate": 4.9302816151372576e-05, + "loss": 0.122, + "num_input_tokens_seen": 2164352, + "step": 3015 + }, + { + "epoch": 6.278586278586278, + "grad_norm": 0.49535682797431946, + "learning_rate": 4.930051193780274e-05, + "loss": 0.171, + "num_input_tokens_seen": 2168032, + "step": 3020 + }, + { + "epoch": 6.288981288981289, + "grad_norm": 0.3153409957885742, + "learning_rate": 4.929820397678858e-05, + "loss": 0.1562, + "num_input_tokens_seen": 2171584, + "step": 3025 + }, + { + "epoch": 6.299376299376299, + "grad_norm": 0.20224396884441376, + "learning_rate": 4.9295892268686015e-05, + "loss": 0.1487, + "num_input_tokens_seen": 2175040, + "step": 3030 + }, + { + "epoch": 6.3097713097713095, + "grad_norm": 0.4024536609649658, + "learning_rate": 4.9293576813851536e-05, + "loss": 0.1323, + "num_input_tokens_seen": 2178592, + "step": 3035 + }, + { + "epoch": 6.3201663201663205, + "grad_norm": 0.4277821481227875, + "learning_rate": 4.929125761264223e-05, + "loss": 0.1603, + "num_input_tokens_seen": 2182144, + "step": 3040 + }, + { + "epoch": 6.330561330561331, + "grad_norm": 0.5431337952613831, + "learning_rate": 4.928893466541573e-05, + "loss": 0.138, + "num_input_tokens_seen": 2185824, + "step": 3045 + }, + { + "epoch": 6.340956340956341, + "grad_norm": 0.23914189636707306, + "learning_rate": 4.928660797253027e-05, + "loss": 0.1373, + "num_input_tokens_seen": 2189376, + "step": 3050 + }, + { + "epoch": 6.351351351351352, + "grad_norm": 0.7902743816375732, + "learning_rate": 4.928427753434467e-05, + "loss": 0.1543, + "num_input_tokens_seen": 2192960, + "step": 3055 + }, + { + "epoch": 6.361746361746362, + "grad_norm": 0.563460111618042, + "learning_rate": 4.9281943351218286e-05, + "loss": 0.1542, + "num_input_tokens_seen": 2196480, + "step": 3060 + }, + { + "epoch": 6.372141372141372, + "grad_norm": 0.3082915246486664, + "learning_rate": 4.9279605423511095e-05, + "loss": 0.1497, + "num_input_tokens_seen": 2200128, + "step": 3065 + }, + { + "epoch": 6.382536382536383, + "grad_norm": 0.2649827301502228, + "learning_rate": 4.927726375158363e-05, + "loss": 0.1761, + "num_input_tokens_seen": 2203584, + "step": 3070 + }, + { + "epoch": 6.392931392931393, + "grad_norm": 0.3984677493572235, + "learning_rate": 4.9274918335797004e-05, + "loss": 0.1684, + "num_input_tokens_seen": 2207072, + "step": 3075 + }, + { + "epoch": 6.403326403326403, + "grad_norm": 0.20210276544094086, + "learning_rate": 4.927256917651292e-05, + "loss": 0.1481, + "num_input_tokens_seen": 2210720, + "step": 3080 + }, + { + "epoch": 6.413721413721413, + "grad_norm": 0.6237998008728027, + "learning_rate": 4.927021627409364e-05, + "loss": 0.1958, + "num_input_tokens_seen": 2214208, + "step": 3085 + }, + { + "epoch": 6.424116424116424, + "grad_norm": 0.2447492480278015, + "learning_rate": 4.9267859628902005e-05, + "loss": 0.1742, + "num_input_tokens_seen": 2217760, + "step": 3090 + }, + { + "epoch": 6.4345114345114345, + "grad_norm": 0.412129282951355, + "learning_rate": 4.9265499241301454e-05, + "loss": 0.1412, + "num_input_tokens_seen": 2221344, + "step": 3095 + }, + { + "epoch": 6.444906444906445, + "grad_norm": 0.5116413235664368, + "learning_rate": 4.926313511165598e-05, + "loss": 0.1259, + "num_input_tokens_seen": 2224992, + "step": 3100 + }, + { + "epoch": 6.455301455301456, + "grad_norm": 0.3399733304977417, + "learning_rate": 4.926076724033016e-05, + "loss": 0.1573, + "num_input_tokens_seen": 2228480, + "step": 3105 + }, + { + "epoch": 6.465696465696466, + "grad_norm": 0.26369935274124146, + "learning_rate": 4.9258395627689146e-05, + "loss": 0.1541, + "num_input_tokens_seen": 2232128, + "step": 3110 + }, + { + "epoch": 6.476091476091476, + "grad_norm": 0.538689911365509, + "learning_rate": 4.925602027409868e-05, + "loss": 0.1494, + "num_input_tokens_seen": 2235776, + "step": 3115 + }, + { + "epoch": 6.486486486486487, + "grad_norm": 0.14908485114574432, + "learning_rate": 4.925364117992507e-05, + "loss": 0.1341, + "num_input_tokens_seen": 2239392, + "step": 3120 + }, + { + "epoch": 6.496881496881497, + "grad_norm": 0.23820312321186066, + "learning_rate": 4.92512583455352e-05, + "loss": 0.1287, + "num_input_tokens_seen": 2242720, + "step": 3125 + }, + { + "epoch": 6.507276507276507, + "grad_norm": 0.38057956099510193, + "learning_rate": 4.9248871771296536e-05, + "loss": 0.1588, + "num_input_tokens_seen": 2246272, + "step": 3130 + }, + { + "epoch": 6.517671517671518, + "grad_norm": 0.24204520881175995, + "learning_rate": 4.924648145757711e-05, + "loss": 0.2063, + "num_input_tokens_seen": 2249888, + "step": 3135 + }, + { + "epoch": 6.528066528066528, + "grad_norm": 0.19087541103363037, + "learning_rate": 4.924408740474554e-05, + "loss": 0.1316, + "num_input_tokens_seen": 2253472, + "step": 3140 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 0.1676514595746994, + "learning_rate": 4.924168961317103e-05, + "loss": 0.1057, + "num_input_tokens_seen": 2257088, + "step": 3145 + }, + { + "epoch": 6.548856548856548, + "grad_norm": 0.15919534862041473, + "learning_rate": 4.9239288083223334e-05, + "loss": 0.1453, + "num_input_tokens_seen": 2260608, + "step": 3150 + }, + { + "epoch": 6.5592515592515594, + "grad_norm": 0.1969936639070511, + "learning_rate": 4.9236882815272803e-05, + "loss": 0.1312, + "num_input_tokens_seen": 2264192, + "step": 3155 + }, + { + "epoch": 6.56964656964657, + "grad_norm": 0.5233212113380432, + "learning_rate": 4.9234473809690365e-05, + "loss": 0.136, + "num_input_tokens_seen": 2267872, + "step": 3160 + }, + { + "epoch": 6.58004158004158, + "grad_norm": 0.35122108459472656, + "learning_rate": 4.923206106684752e-05, + "loss": 0.1498, + "num_input_tokens_seen": 2271520, + "step": 3165 + }, + { + "epoch": 6.590436590436591, + "grad_norm": 0.17075306177139282, + "learning_rate": 4.922964458711634e-05, + "loss": 0.1537, + "num_input_tokens_seen": 2275040, + "step": 3170 + }, + { + "epoch": 6.600831600831601, + "grad_norm": 0.4325787127017975, + "learning_rate": 4.9227224370869474e-05, + "loss": 0.1348, + "num_input_tokens_seen": 2278624, + "step": 3175 + }, + { + "epoch": 6.611226611226611, + "grad_norm": 0.39773762226104736, + "learning_rate": 4.9224800418480155e-05, + "loss": 0.1494, + "num_input_tokens_seen": 2282144, + "step": 3180 + }, + { + "epoch": 6.621621621621622, + "grad_norm": 0.43032610416412354, + "learning_rate": 4.9222372730322176e-05, + "loss": 0.1473, + "num_input_tokens_seen": 2285792, + "step": 3185 + }, + { + "epoch": 6.632016632016632, + "grad_norm": 0.2621281147003174, + "learning_rate": 4.921994130676993e-05, + "loss": 0.214, + "num_input_tokens_seen": 2289504, + "step": 3190 + }, + { + "epoch": 6.642411642411642, + "grad_norm": 0.3201293647289276, + "learning_rate": 4.9217506148198366e-05, + "loss": 0.1608, + "num_input_tokens_seen": 2293024, + "step": 3195 + }, + { + "epoch": 6.652806652806653, + "grad_norm": 0.27147868275642395, + "learning_rate": 4.921506725498302e-05, + "loss": 0.1581, + "num_input_tokens_seen": 2296672, + "step": 3200 + }, + { + "epoch": 6.652806652806653, + "eval_loss": 0.16605804860591888, + "eval_runtime": 7.7822, + "eval_samples_per_second": 109.994, + "eval_steps_per_second": 27.499, + "num_input_tokens_seen": 2296672, + "step": 3200 + }, + { + "epoch": 6.663201663201663, + "grad_norm": 0.4219726026058197, + "learning_rate": 4.9212624627499994e-05, + "loss": 0.1728, + "num_input_tokens_seen": 2300224, + "step": 3205 + }, + { + "epoch": 6.673596673596673, + "grad_norm": 0.17060089111328125, + "learning_rate": 4.921017826612597e-05, + "loss": 0.1513, + "num_input_tokens_seen": 2303872, + "step": 3210 + }, + { + "epoch": 6.6839916839916835, + "grad_norm": 0.5995436906814575, + "learning_rate": 4.9207728171238223e-05, + "loss": 0.1364, + "num_input_tokens_seen": 2307552, + "step": 3215 + }, + { + "epoch": 6.6943866943866945, + "grad_norm": 0.2442438006401062, + "learning_rate": 4.920527434321458e-05, + "loss": 0.1595, + "num_input_tokens_seen": 2311168, + "step": 3220 + }, + { + "epoch": 6.704781704781705, + "grad_norm": 0.4440081715583801, + "learning_rate": 4.920281678243345e-05, + "loss": 0.1884, + "num_input_tokens_seen": 2314816, + "step": 3225 + }, + { + "epoch": 6.715176715176716, + "grad_norm": 0.2566767632961273, + "learning_rate": 4.920035548927381e-05, + "loss": 0.1346, + "num_input_tokens_seen": 2318432, + "step": 3230 + }, + { + "epoch": 6.725571725571726, + "grad_norm": 0.17123174667358398, + "learning_rate": 4.919789046411525e-05, + "loss": 0.1656, + "num_input_tokens_seen": 2322080, + "step": 3235 + }, + { + "epoch": 6.735966735966736, + "grad_norm": 0.39199987053871155, + "learning_rate": 4.919542170733787e-05, + "loss": 0.152, + "num_input_tokens_seen": 2325728, + "step": 3240 + }, + { + "epoch": 6.746361746361746, + "grad_norm": 0.28636646270751953, + "learning_rate": 4.919294921932242e-05, + "loss": 0.201, + "num_input_tokens_seen": 2329440, + "step": 3245 + }, + { + "epoch": 6.756756756756757, + "grad_norm": 0.37575775384902954, + "learning_rate": 4.919047300045016e-05, + "loss": 0.1333, + "num_input_tokens_seen": 2333024, + "step": 3250 + }, + { + "epoch": 6.767151767151767, + "grad_norm": 0.7235891819000244, + "learning_rate": 4.918799305110299e-05, + "loss": 0.1548, + "num_input_tokens_seen": 2336448, + "step": 3255 + }, + { + "epoch": 6.777546777546777, + "grad_norm": 0.5033214092254639, + "learning_rate": 4.918550937166331e-05, + "loss": 0.1566, + "num_input_tokens_seen": 2339936, + "step": 3260 + }, + { + "epoch": 6.787941787941788, + "grad_norm": 0.17990820109844208, + "learning_rate": 4.918302196251415e-05, + "loss": 0.1666, + "num_input_tokens_seen": 2343360, + "step": 3265 + }, + { + "epoch": 6.798336798336798, + "grad_norm": 0.3582819700241089, + "learning_rate": 4.91805308240391e-05, + "loss": 0.1416, + "num_input_tokens_seen": 2346848, + "step": 3270 + }, + { + "epoch": 6.8087318087318085, + "grad_norm": 0.22959619760513306, + "learning_rate": 4.9178035956622326e-05, + "loss": 0.1429, + "num_input_tokens_seen": 2350496, + "step": 3275 + }, + { + "epoch": 6.8191268191268195, + "grad_norm": 0.30627739429473877, + "learning_rate": 4.917553736064857e-05, + "loss": 0.1387, + "num_input_tokens_seen": 2353952, + "step": 3280 + }, + { + "epoch": 6.82952182952183, + "grad_norm": 0.2778383493423462, + "learning_rate": 4.917303503650314e-05, + "loss": 0.1427, + "num_input_tokens_seen": 2357664, + "step": 3285 + }, + { + "epoch": 6.83991683991684, + "grad_norm": 0.5767640471458435, + "learning_rate": 4.917052898457194e-05, + "loss": 0.1415, + "num_input_tokens_seen": 2361216, + "step": 3290 + }, + { + "epoch": 6.850311850311851, + "grad_norm": 0.20598681271076202, + "learning_rate": 4.916801920524141e-05, + "loss": 0.1299, + "num_input_tokens_seen": 2365024, + "step": 3295 + }, + { + "epoch": 6.860706860706861, + "grad_norm": 0.2935587465763092, + "learning_rate": 4.916550569889862e-05, + "loss": 0.1902, + "num_input_tokens_seen": 2368480, + "step": 3300 + }, + { + "epoch": 6.871101871101871, + "grad_norm": 0.37763282656669617, + "learning_rate": 4.916298846593116e-05, + "loss": 0.1133, + "num_input_tokens_seen": 2372032, + "step": 3305 + }, + { + "epoch": 6.881496881496881, + "grad_norm": 0.2099263072013855, + "learning_rate": 4.916046750672722e-05, + "loss": 0.1413, + "num_input_tokens_seen": 2375520, + "step": 3310 + }, + { + "epoch": 6.891891891891892, + "grad_norm": 0.3938628137111664, + "learning_rate": 4.915794282167559e-05, + "loss": 0.1225, + "num_input_tokens_seen": 2379104, + "step": 3315 + }, + { + "epoch": 6.902286902286902, + "grad_norm": 0.49298861622810364, + "learning_rate": 4.915541441116558e-05, + "loss": 0.1395, + "num_input_tokens_seen": 2382560, + "step": 3320 + }, + { + "epoch": 6.912681912681912, + "grad_norm": 0.5870598554611206, + "learning_rate": 4.915288227558711e-05, + "loss": 0.1329, + "num_input_tokens_seen": 2386144, + "step": 3325 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 0.2073337882757187, + "learning_rate": 4.915034641533066e-05, + "loss": 0.1112, + "num_input_tokens_seen": 2389728, + "step": 3330 + }, + { + "epoch": 6.9334719334719335, + "grad_norm": 0.3428483307361603, + "learning_rate": 4.914780683078731e-05, + "loss": 0.1321, + "num_input_tokens_seen": 2393344, + "step": 3335 + }, + { + "epoch": 6.943866943866944, + "grad_norm": 0.718245267868042, + "learning_rate": 4.9145263522348695e-05, + "loss": 0.1474, + "num_input_tokens_seen": 2396928, + "step": 3340 + }, + { + "epoch": 6.954261954261955, + "grad_norm": 0.26816630363464355, + "learning_rate": 4.9142716490407e-05, + "loss": 0.1907, + "num_input_tokens_seen": 2400544, + "step": 3345 + }, + { + "epoch": 6.964656964656965, + "grad_norm": 0.29795390367507935, + "learning_rate": 4.914016573535504e-05, + "loss": 0.199, + "num_input_tokens_seen": 2404160, + "step": 3350 + }, + { + "epoch": 6.975051975051975, + "grad_norm": 0.6824160814285278, + "learning_rate": 4.9137611257586154e-05, + "loss": 0.1489, + "num_input_tokens_seen": 2407744, + "step": 3355 + }, + { + "epoch": 6.985446985446986, + "grad_norm": 0.3722708523273468, + "learning_rate": 4.9135053057494274e-05, + "loss": 0.1521, + "num_input_tokens_seen": 2411328, + "step": 3360 + }, + { + "epoch": 6.995841995841996, + "grad_norm": 0.20015670359134674, + "learning_rate": 4.913249113547392e-05, + "loss": 0.1392, + "num_input_tokens_seen": 2414976, + "step": 3365 + }, + { + "epoch": 7.006237006237006, + "grad_norm": 0.5256457924842834, + "learning_rate": 4.912992549192016e-05, + "loss": 0.1393, + "num_input_tokens_seen": 2418448, + "step": 3370 + }, + { + "epoch": 7.016632016632016, + "grad_norm": 0.9522621631622314, + "learning_rate": 4.9127356127228665e-05, + "loss": 0.1646, + "num_input_tokens_seen": 2421968, + "step": 3375 + }, + { + "epoch": 7.027027027027027, + "grad_norm": 0.4730372130870819, + "learning_rate": 4.912478304179564e-05, + "loss": 0.1464, + "num_input_tokens_seen": 2425520, + "step": 3380 + }, + { + "epoch": 7.037422037422037, + "grad_norm": 0.8964717984199524, + "learning_rate": 4.9122206236017896e-05, + "loss": 0.1865, + "num_input_tokens_seen": 2429296, + "step": 3385 + }, + { + "epoch": 7.047817047817047, + "grad_norm": 0.46383213996887207, + "learning_rate": 4.911962571029282e-05, + "loss": 0.1422, + "num_input_tokens_seen": 2432912, + "step": 3390 + }, + { + "epoch": 7.058212058212058, + "grad_norm": 0.34146425127983093, + "learning_rate": 4.9117041465018353e-05, + "loss": 0.1613, + "num_input_tokens_seen": 2436560, + "step": 3395 + }, + { + "epoch": 7.0686070686070686, + "grad_norm": 0.5648390650749207, + "learning_rate": 4.911445350059302e-05, + "loss": 0.1829, + "num_input_tokens_seen": 2440240, + "step": 3400 + }, + { + "epoch": 7.0686070686070686, + "eval_loss": 0.16341079771518707, + "eval_runtime": 7.7641, + "eval_samples_per_second": 110.25, + "eval_steps_per_second": 27.563, + "num_input_tokens_seen": 2440240, + "step": 3400 + }, + { + "epoch": 7.079002079002079, + "grad_norm": 0.6418298482894897, + "learning_rate": 4.9111861817415905e-05, + "loss": 0.1675, + "num_input_tokens_seen": 2443856, + "step": 3405 + }, + { + "epoch": 7.08939708939709, + "grad_norm": 0.4398673474788666, + "learning_rate": 4.91092664158867e-05, + "loss": 0.1519, + "num_input_tokens_seen": 2447568, + "step": 3410 + }, + { + "epoch": 7.0997920997921, + "grad_norm": 0.3375225365161896, + "learning_rate": 4.910666729640563e-05, + "loss": 0.144, + "num_input_tokens_seen": 2450992, + "step": 3415 + }, + { + "epoch": 7.11018711018711, + "grad_norm": 0.3453805446624756, + "learning_rate": 4.910406445937353e-05, + "loss": 0.2037, + "num_input_tokens_seen": 2454512, + "step": 3420 + }, + { + "epoch": 7.120582120582121, + "grad_norm": 0.41904860734939575, + "learning_rate": 4.9101457905191774e-05, + "loss": 0.1511, + "num_input_tokens_seen": 2458064, + "step": 3425 + }, + { + "epoch": 7.130977130977131, + "grad_norm": 0.48081204295158386, + "learning_rate": 4.909884763426233e-05, + "loss": 0.135, + "num_input_tokens_seen": 2461744, + "step": 3430 + }, + { + "epoch": 7.141372141372141, + "grad_norm": 0.18690234422683716, + "learning_rate": 4.9096233646987736e-05, + "loss": 0.1214, + "num_input_tokens_seen": 2465264, + "step": 3435 + }, + { + "epoch": 7.151767151767152, + "grad_norm": 0.22695964574813843, + "learning_rate": 4.9093615943771104e-05, + "loss": 0.1285, + "num_input_tokens_seen": 2468880, + "step": 3440 + }, + { + "epoch": 7.162162162162162, + "grad_norm": 0.29716336727142334, + "learning_rate": 4.909099452501611e-05, + "loss": 0.142, + "num_input_tokens_seen": 2472528, + "step": 3445 + }, + { + "epoch": 7.172557172557172, + "grad_norm": 0.35938960313796997, + "learning_rate": 4.908836939112702e-05, + "loss": 0.1623, + "num_input_tokens_seen": 2476016, + "step": 3450 + }, + { + "epoch": 7.182952182952183, + "grad_norm": 0.18448328971862793, + "learning_rate": 4.908574054250865e-05, + "loss": 0.1223, + "num_input_tokens_seen": 2479568, + "step": 3455 + }, + { + "epoch": 7.1933471933471935, + "grad_norm": 0.3317047655582428, + "learning_rate": 4.9083107979566414e-05, + "loss": 0.1617, + "num_input_tokens_seen": 2483088, + "step": 3460 + }, + { + "epoch": 7.203742203742204, + "grad_norm": 0.37664735317230225, + "learning_rate": 4.908047170270628e-05, + "loss": 0.1398, + "num_input_tokens_seen": 2486544, + "step": 3465 + }, + { + "epoch": 7.214137214137214, + "grad_norm": 0.1882849484682083, + "learning_rate": 4.9077831712334784e-05, + "loss": 0.1158, + "num_input_tokens_seen": 2490224, + "step": 3470 + }, + { + "epoch": 7.224532224532225, + "grad_norm": 0.33475035429000854, + "learning_rate": 4.907518800885907e-05, + "loss": 0.1817, + "num_input_tokens_seen": 2493744, + "step": 3475 + }, + { + "epoch": 7.234927234927235, + "grad_norm": 0.2612738013267517, + "learning_rate": 4.907254059268681e-05, + "loss": 0.1617, + "num_input_tokens_seen": 2497296, + "step": 3480 + }, + { + "epoch": 7.245322245322245, + "grad_norm": 0.3209158778190613, + "learning_rate": 4.906988946422628e-05, + "loss": 0.1445, + "num_input_tokens_seen": 2500944, + "step": 3485 + }, + { + "epoch": 7.255717255717256, + "grad_norm": 0.3997252285480499, + "learning_rate": 4.9067234623886315e-05, + "loss": 0.1545, + "num_input_tokens_seen": 2504432, + "step": 3490 + }, + { + "epoch": 7.266112266112266, + "grad_norm": 0.22156406939029694, + "learning_rate": 4.9064576072076316e-05, + "loss": 0.1459, + "num_input_tokens_seen": 2507984, + "step": 3495 + }, + { + "epoch": 7.276507276507276, + "grad_norm": 0.25088784098625183, + "learning_rate": 4.906191380920628e-05, + "loss": 0.1704, + "num_input_tokens_seen": 2511600, + "step": 3500 + }, + { + "epoch": 7.286902286902287, + "grad_norm": 0.43601635098457336, + "learning_rate": 4.905924783568675e-05, + "loss": 0.1475, + "num_input_tokens_seen": 2515152, + "step": 3505 + }, + { + "epoch": 7.297297297297297, + "grad_norm": 0.5042214393615723, + "learning_rate": 4.905657815192886e-05, + "loss": 0.1833, + "num_input_tokens_seen": 2518640, + "step": 3510 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 0.4192360043525696, + "learning_rate": 4.90539047583443e-05, + "loss": 0.1151, + "num_input_tokens_seen": 2522128, + "step": 3515 + }, + { + "epoch": 7.3180873180873185, + "grad_norm": 0.17778019607067108, + "learning_rate": 4.905122765534534e-05, + "loss": 0.154, + "num_input_tokens_seen": 2525840, + "step": 3520 + }, + { + "epoch": 7.328482328482329, + "grad_norm": 0.2313457578420639, + "learning_rate": 4.9048546843344846e-05, + "loss": 0.1281, + "num_input_tokens_seen": 2529424, + "step": 3525 + }, + { + "epoch": 7.338877338877339, + "grad_norm": 0.524095892906189, + "learning_rate": 4.9045862322756206e-05, + "loss": 0.1663, + "num_input_tokens_seen": 2532944, + "step": 3530 + }, + { + "epoch": 7.349272349272349, + "grad_norm": 0.3069426119327545, + "learning_rate": 4.904317409399342e-05, + "loss": 0.1421, + "num_input_tokens_seen": 2536688, + "step": 3535 + }, + { + "epoch": 7.35966735966736, + "grad_norm": 0.288662314414978, + "learning_rate": 4.904048215747104e-05, + "loss": 0.1177, + "num_input_tokens_seen": 2540336, + "step": 3540 + }, + { + "epoch": 7.37006237006237, + "grad_norm": 0.21741731464862823, + "learning_rate": 4.90377865136042e-05, + "loss": 0.1594, + "num_input_tokens_seen": 2544112, + "step": 3545 + }, + { + "epoch": 7.38045738045738, + "grad_norm": 0.4901990294456482, + "learning_rate": 4.90350871628086e-05, + "loss": 0.141, + "num_input_tokens_seen": 2547568, + "step": 3550 + }, + { + "epoch": 7.390852390852391, + "grad_norm": 0.19238071143627167, + "learning_rate": 4.903238410550052e-05, + "loss": 0.1517, + "num_input_tokens_seen": 2551280, + "step": 3555 + }, + { + "epoch": 7.401247401247401, + "grad_norm": 0.20177605748176575, + "learning_rate": 4.90296773420968e-05, + "loss": 0.1645, + "num_input_tokens_seen": 2554896, + "step": 3560 + }, + { + "epoch": 7.411642411642411, + "grad_norm": 0.2917158007621765, + "learning_rate": 4.902696687301486e-05, + "loss": 0.1575, + "num_input_tokens_seen": 2558480, + "step": 3565 + }, + { + "epoch": 7.422037422037422, + "grad_norm": 0.622026801109314, + "learning_rate": 4.902425269867268e-05, + "loss": 0.2442, + "num_input_tokens_seen": 2562288, + "step": 3570 + }, + { + "epoch": 7.4324324324324325, + "grad_norm": 0.1475946605205536, + "learning_rate": 4.902153481948883e-05, + "loss": 0.1594, + "num_input_tokens_seen": 2565936, + "step": 3575 + }, + { + "epoch": 7.442827442827443, + "grad_norm": 0.19583864510059357, + "learning_rate": 4.901881323588244e-05, + "loss": 0.1528, + "num_input_tokens_seen": 2569424, + "step": 3580 + }, + { + "epoch": 7.453222453222454, + "grad_norm": 0.3018905818462372, + "learning_rate": 4.90160879482732e-05, + "loss": 0.1186, + "num_input_tokens_seen": 2572880, + "step": 3585 + }, + { + "epoch": 7.463617463617464, + "grad_norm": 0.2790660262107849, + "learning_rate": 4.9013358957081405e-05, + "loss": 0.1664, + "num_input_tokens_seen": 2576592, + "step": 3590 + }, + { + "epoch": 7.474012474012474, + "grad_norm": 0.3032047152519226, + "learning_rate": 4.901062626272789e-05, + "loss": 0.159, + "num_input_tokens_seen": 2580112, + "step": 3595 + }, + { + "epoch": 7.484407484407485, + "grad_norm": 0.5670721530914307, + "learning_rate": 4.900788986563406e-05, + "loss": 0.1354, + "num_input_tokens_seen": 2583952, + "step": 3600 + }, + { + "epoch": 7.484407484407485, + "eval_loss": 0.16117407381534576, + "eval_runtime": 7.7626, + "eval_samples_per_second": 110.273, + "eval_steps_per_second": 27.568, + "num_input_tokens_seen": 2583952, + "step": 3600 + }, + { + "epoch": 7.494802494802495, + "grad_norm": 0.2398006170988083, + "learning_rate": 4.9005149766221915e-05, + "loss": 0.1763, + "num_input_tokens_seen": 2587536, + "step": 3605 + }, + { + "epoch": 7.505197505197505, + "grad_norm": 0.3245507478713989, + "learning_rate": 4.9002405964914e-05, + "loss": 0.1539, + "num_input_tokens_seen": 2591024, + "step": 3610 + }, + { + "epoch": 7.515592515592516, + "grad_norm": 0.592323899269104, + "learning_rate": 4.899965846213346e-05, + "loss": 0.1527, + "num_input_tokens_seen": 2594640, + "step": 3615 + }, + { + "epoch": 7.525987525987526, + "grad_norm": 0.16735514998435974, + "learning_rate": 4.899690725830399e-05, + "loss": 0.1642, + "num_input_tokens_seen": 2598288, + "step": 3620 + }, + { + "epoch": 7.536382536382536, + "grad_norm": 0.37229546904563904, + "learning_rate": 4.899415235384985e-05, + "loss": 0.1543, + "num_input_tokens_seen": 2601712, + "step": 3625 + }, + { + "epoch": 7.546777546777546, + "grad_norm": 0.20870840549468994, + "learning_rate": 4.899139374919589e-05, + "loss": 0.1864, + "num_input_tokens_seen": 2605520, + "step": 3630 + }, + { + "epoch": 7.557172557172557, + "grad_norm": 0.1586972326040268, + "learning_rate": 4.898863144476752e-05, + "loss": 0.1301, + "num_input_tokens_seen": 2609072, + "step": 3635 + }, + { + "epoch": 7.5675675675675675, + "grad_norm": 0.18452054262161255, + "learning_rate": 4.898586544099072e-05, + "loss": 0.1345, + "num_input_tokens_seen": 2612560, + "step": 3640 + }, + { + "epoch": 7.577962577962578, + "grad_norm": 0.14044198393821716, + "learning_rate": 4.898309573829204e-05, + "loss": 0.1109, + "num_input_tokens_seen": 2616144, + "step": 3645 + }, + { + "epoch": 7.588357588357589, + "grad_norm": 0.37934476137161255, + "learning_rate": 4.898032233709862e-05, + "loss": 0.162, + "num_input_tokens_seen": 2619760, + "step": 3650 + }, + { + "epoch": 7.598752598752599, + "grad_norm": 0.24433784186840057, + "learning_rate": 4.8977545237838123e-05, + "loss": 0.1196, + "num_input_tokens_seen": 2623216, + "step": 3655 + }, + { + "epoch": 7.609147609147609, + "grad_norm": 0.20140907168388367, + "learning_rate": 4.8974764440938836e-05, + "loss": 0.1328, + "num_input_tokens_seen": 2626704, + "step": 3660 + }, + { + "epoch": 7.61954261954262, + "grad_norm": 0.44581353664398193, + "learning_rate": 4.897197994682959e-05, + "loss": 0.1843, + "num_input_tokens_seen": 2630192, + "step": 3665 + }, + { + "epoch": 7.62993762993763, + "grad_norm": 0.15399235486984253, + "learning_rate": 4.8969191755939786e-05, + "loss": 0.1114, + "num_input_tokens_seen": 2633584, + "step": 3670 + }, + { + "epoch": 7.64033264033264, + "grad_norm": 0.27894458174705505, + "learning_rate": 4.8966399868699396e-05, + "loss": 0.1592, + "num_input_tokens_seen": 2637104, + "step": 3675 + }, + { + "epoch": 7.650727650727651, + "grad_norm": 0.49414366483688354, + "learning_rate": 4.8963604285538965e-05, + "loss": 0.1164, + "num_input_tokens_seen": 2640816, + "step": 3680 + }, + { + "epoch": 7.661122661122661, + "grad_norm": 0.2946232557296753, + "learning_rate": 4.8960805006889604e-05, + "loss": 0.126, + "num_input_tokens_seen": 2644432, + "step": 3685 + }, + { + "epoch": 7.671517671517671, + "grad_norm": 0.5090503096580505, + "learning_rate": 4.8958002033183004e-05, + "loss": 0.1572, + "num_input_tokens_seen": 2647920, + "step": 3690 + }, + { + "epoch": 7.6819126819126815, + "grad_norm": 0.40186959505081177, + "learning_rate": 4.8955195364851414e-05, + "loss": 0.1564, + "num_input_tokens_seen": 2651600, + "step": 3695 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 0.19232776761054993, + "learning_rate": 4.895238500232766e-05, + "loss": 0.1775, + "num_input_tokens_seen": 2655376, + "step": 3700 + }, + { + "epoch": 7.702702702702703, + "grad_norm": 0.2774108946323395, + "learning_rate": 4.8949570946045143e-05, + "loss": 0.1266, + "num_input_tokens_seen": 2659024, + "step": 3705 + }, + { + "epoch": 7.713097713097713, + "grad_norm": 0.37578073143959045, + "learning_rate": 4.89467531964378e-05, + "loss": 0.1374, + "num_input_tokens_seen": 2662704, + "step": 3710 + }, + { + "epoch": 7.723492723492724, + "grad_norm": 0.33899062871932983, + "learning_rate": 4.894393175394019e-05, + "loss": 0.1352, + "num_input_tokens_seen": 2666160, + "step": 3715 + }, + { + "epoch": 7.733887733887734, + "grad_norm": 0.27304303646087646, + "learning_rate": 4.8941106618987406e-05, + "loss": 0.1301, + "num_input_tokens_seen": 2669776, + "step": 3720 + }, + { + "epoch": 7.744282744282744, + "grad_norm": 0.5402517318725586, + "learning_rate": 4.893827779201512e-05, + "loss": 0.146, + "num_input_tokens_seen": 2673328, + "step": 3725 + }, + { + "epoch": 7.754677754677755, + "grad_norm": 0.35070285201072693, + "learning_rate": 4.893544527345957e-05, + "loss": 0.1493, + "num_input_tokens_seen": 2676784, + "step": 3730 + }, + { + "epoch": 7.765072765072765, + "grad_norm": 0.5058175921440125, + "learning_rate": 4.8932609063757563e-05, + "loss": 0.1682, + "num_input_tokens_seen": 2680336, + "step": 3735 + }, + { + "epoch": 7.775467775467775, + "grad_norm": 0.21279636025428772, + "learning_rate": 4.8929769163346484e-05, + "loss": 0.111, + "num_input_tokens_seen": 2684016, + "step": 3740 + }, + { + "epoch": 7.785862785862786, + "grad_norm": 0.4132220149040222, + "learning_rate": 4.892692557266429e-05, + "loss": 0.1916, + "num_input_tokens_seen": 2687664, + "step": 3745 + }, + { + "epoch": 7.796257796257796, + "grad_norm": 0.340943843126297, + "learning_rate": 4.8924078292149464e-05, + "loss": 0.1344, + "num_input_tokens_seen": 2691312, + "step": 3750 + }, + { + "epoch": 7.8066528066528065, + "grad_norm": 0.22081264853477478, + "learning_rate": 4.892122732224114e-05, + "loss": 0.1359, + "num_input_tokens_seen": 2694960, + "step": 3755 + }, + { + "epoch": 7.817047817047817, + "grad_norm": 0.46451136469841003, + "learning_rate": 4.8918372663378944e-05, + "loss": 0.1297, + "num_input_tokens_seen": 2698384, + "step": 3760 + }, + { + "epoch": 7.827442827442828, + "grad_norm": 0.34114211797714233, + "learning_rate": 4.89155143160031e-05, + "loss": 0.16, + "num_input_tokens_seen": 2702256, + "step": 3765 + }, + { + "epoch": 7.837837837837838, + "grad_norm": 0.27815163135528564, + "learning_rate": 4.891265228055441e-05, + "loss": 0.1484, + "num_input_tokens_seen": 2705968, + "step": 3770 + }, + { + "epoch": 7.848232848232849, + "grad_norm": 0.2888100743293762, + "learning_rate": 4.890978655747424e-05, + "loss": 0.1308, + "num_input_tokens_seen": 2709584, + "step": 3775 + }, + { + "epoch": 7.858627858627859, + "grad_norm": 0.2359665483236313, + "learning_rate": 4.89069171472045e-05, + "loss": 0.1262, + "num_input_tokens_seen": 2713104, + "step": 3780 + }, + { + "epoch": 7.869022869022869, + "grad_norm": 0.34461647272109985, + "learning_rate": 4.890404405018772e-05, + "loss": 0.1492, + "num_input_tokens_seen": 2716880, + "step": 3785 + }, + { + "epoch": 7.879417879417879, + "grad_norm": 0.17956596612930298, + "learning_rate": 4.8901167266866934e-05, + "loss": 0.1807, + "num_input_tokens_seen": 2720368, + "step": 3790 + }, + { + "epoch": 7.88981288981289, + "grad_norm": 0.6245737671852112, + "learning_rate": 4.88982867976858e-05, + "loss": 0.176, + "num_input_tokens_seen": 2724016, + "step": 3795 + }, + { + "epoch": 7.9002079002079, + "grad_norm": 0.3481173813343048, + "learning_rate": 4.889540264308852e-05, + "loss": 0.1195, + "num_input_tokens_seen": 2727536, + "step": 3800 + }, + { + "epoch": 7.9002079002079, + "eval_loss": 0.15897950530052185, + "eval_runtime": 7.7504, + "eval_samples_per_second": 110.446, + "eval_steps_per_second": 27.611, + "num_input_tokens_seen": 2727536, + "step": 3800 + }, + { + "epoch": 7.91060291060291, + "grad_norm": 0.15328888595104218, + "learning_rate": 4.889251480351986e-05, + "loss": 0.1473, + "num_input_tokens_seen": 2731120, + "step": 3805 + }, + { + "epoch": 7.920997920997921, + "grad_norm": 0.22506405413150787, + "learning_rate": 4.888962327942517e-05, + "loss": 0.1691, + "num_input_tokens_seen": 2734736, + "step": 3810 + }, + { + "epoch": 7.9313929313929314, + "grad_norm": 0.3574959635734558, + "learning_rate": 4.8886728071250356e-05, + "loss": 0.1607, + "num_input_tokens_seen": 2738352, + "step": 3815 + }, + { + "epoch": 7.941787941787942, + "grad_norm": 0.25159797072410583, + "learning_rate": 4.8883829179441884e-05, + "loss": 0.1407, + "num_input_tokens_seen": 2741904, + "step": 3820 + }, + { + "epoch": 7.952182952182953, + "grad_norm": 0.19287729263305664, + "learning_rate": 4.888092660444682e-05, + "loss": 0.1307, + "num_input_tokens_seen": 2745360, + "step": 3825 + }, + { + "epoch": 7.962577962577963, + "grad_norm": 0.20727689564228058, + "learning_rate": 4.887802034671276e-05, + "loss": 0.1535, + "num_input_tokens_seen": 2748944, + "step": 3830 + }, + { + "epoch": 7.972972972972973, + "grad_norm": 0.1768556386232376, + "learning_rate": 4.88751104066879e-05, + "loss": 0.1325, + "num_input_tokens_seen": 2752592, + "step": 3835 + }, + { + "epoch": 7.983367983367984, + "grad_norm": 0.23327742516994476, + "learning_rate": 4.887219678482098e-05, + "loss": 0.1138, + "num_input_tokens_seen": 2756176, + "step": 3840 + }, + { + "epoch": 7.993762993762994, + "grad_norm": 0.42043912410736084, + "learning_rate": 4.8869279481561316e-05, + "loss": 0.1285, + "num_input_tokens_seen": 2759760, + "step": 3845 + }, + { + "epoch": 8.004158004158004, + "grad_norm": 0.7492491006851196, + "learning_rate": 4.88663584973588e-05, + "loss": 0.1694, + "num_input_tokens_seen": 2763232, + "step": 3850 + }, + { + "epoch": 8.014553014553014, + "grad_norm": 0.21996207535266876, + "learning_rate": 4.8863433832663874e-05, + "loss": 0.128, + "num_input_tokens_seen": 2766720, + "step": 3855 + }, + { + "epoch": 8.024948024948024, + "grad_norm": 0.24666564166545868, + "learning_rate": 4.886050548792757e-05, + "loss": 0.1418, + "num_input_tokens_seen": 2770176, + "step": 3860 + }, + { + "epoch": 8.035343035343036, + "grad_norm": 0.4530559778213501, + "learning_rate": 4.8857573463601465e-05, + "loss": 0.1221, + "num_input_tokens_seen": 2773696, + "step": 3865 + }, + { + "epoch": 8.045738045738046, + "grad_norm": 0.27242955565452576, + "learning_rate": 4.885463776013772e-05, + "loss": 0.1279, + "num_input_tokens_seen": 2777280, + "step": 3870 + }, + { + "epoch": 8.056133056133056, + "grad_norm": 0.9601758718490601, + "learning_rate": 4.8851698377989056e-05, + "loss": 0.1294, + "num_input_tokens_seen": 2780896, + "step": 3875 + }, + { + "epoch": 8.066528066528067, + "grad_norm": 0.15701408684253693, + "learning_rate": 4.884875531760876e-05, + "loss": 0.1097, + "num_input_tokens_seen": 2784288, + "step": 3880 + }, + { + "epoch": 8.076923076923077, + "grad_norm": 0.2423793524503708, + "learning_rate": 4.88458085794507e-05, + "loss": 0.1334, + "num_input_tokens_seen": 2787712, + "step": 3885 + }, + { + "epoch": 8.087318087318087, + "grad_norm": 0.5639021396636963, + "learning_rate": 4.884285816396929e-05, + "loss": 0.1603, + "num_input_tokens_seen": 2791328, + "step": 3890 + }, + { + "epoch": 8.097713097713097, + "grad_norm": 0.23657603561878204, + "learning_rate": 4.8839904071619526e-05, + "loss": 0.1699, + "num_input_tokens_seen": 2794944, + "step": 3895 + }, + { + "epoch": 8.108108108108109, + "grad_norm": 0.1949663609266281, + "learning_rate": 4.8836946302856955e-05, + "loss": 0.0991, + "num_input_tokens_seen": 2798336, + "step": 3900 + }, + { + "epoch": 8.118503118503119, + "grad_norm": 0.2561724781990051, + "learning_rate": 4.8833984858137715e-05, + "loss": 0.1344, + "num_input_tokens_seen": 2801952, + "step": 3905 + }, + { + "epoch": 8.128898128898129, + "grad_norm": 0.4314907193183899, + "learning_rate": 4.8831019737918494e-05, + "loss": 0.1406, + "num_input_tokens_seen": 2805568, + "step": 3910 + }, + { + "epoch": 8.13929313929314, + "grad_norm": 0.702556312084198, + "learning_rate": 4.882805094265655e-05, + "loss": 0.1418, + "num_input_tokens_seen": 2809184, + "step": 3915 + }, + { + "epoch": 8.14968814968815, + "grad_norm": 0.31566575169563293, + "learning_rate": 4.8825078472809706e-05, + "loss": 0.1582, + "num_input_tokens_seen": 2812768, + "step": 3920 + }, + { + "epoch": 8.16008316008316, + "grad_norm": 0.37519025802612305, + "learning_rate": 4.882210232883635e-05, + "loss": 0.1644, + "num_input_tokens_seen": 2816448, + "step": 3925 + }, + { + "epoch": 8.170478170478171, + "grad_norm": 0.8188552856445312, + "learning_rate": 4.881912251119546e-05, + "loss": 0.2119, + "num_input_tokens_seen": 2820096, + "step": 3930 + }, + { + "epoch": 8.180873180873181, + "grad_norm": 0.14023041725158691, + "learning_rate": 4.881613902034654e-05, + "loss": 0.11, + "num_input_tokens_seen": 2823616, + "step": 3935 + }, + { + "epoch": 8.191268191268192, + "grad_norm": 2.7940471172332764, + "learning_rate": 4.88131518567497e-05, + "loss": 0.1921, + "num_input_tokens_seen": 2827200, + "step": 3940 + }, + { + "epoch": 8.201663201663202, + "grad_norm": 0.31606096029281616, + "learning_rate": 4.881016102086558e-05, + "loss": 0.1675, + "num_input_tokens_seen": 2830688, + "step": 3945 + }, + { + "epoch": 8.212058212058212, + "grad_norm": 0.21453985571861267, + "learning_rate": 4.8807166513155425e-05, + "loss": 0.1186, + "num_input_tokens_seen": 2834240, + "step": 3950 + }, + { + "epoch": 8.222453222453222, + "grad_norm": 0.34473100304603577, + "learning_rate": 4.8804168334081004e-05, + "loss": 0.1742, + "num_input_tokens_seen": 2837888, + "step": 3955 + }, + { + "epoch": 8.232848232848234, + "grad_norm": 0.15076828002929688, + "learning_rate": 4.880116648410468e-05, + "loss": 0.1465, + "num_input_tokens_seen": 2841600, + "step": 3960 + }, + { + "epoch": 8.243243243243244, + "grad_norm": 0.23625919222831726, + "learning_rate": 4.879816096368939e-05, + "loss": 0.1414, + "num_input_tokens_seen": 2845152, + "step": 3965 + }, + { + "epoch": 8.253638253638254, + "grad_norm": 0.7830603122711182, + "learning_rate": 4.879515177329861e-05, + "loss": 0.1752, + "num_input_tokens_seen": 2848704, + "step": 3970 + }, + { + "epoch": 8.264033264033264, + "grad_norm": 0.13319696485996246, + "learning_rate": 4.8792138913396394e-05, + "loss": 0.1162, + "num_input_tokens_seen": 2852128, + "step": 3975 + }, + { + "epoch": 8.274428274428274, + "grad_norm": 0.423990398645401, + "learning_rate": 4.8789122384447374e-05, + "loss": 0.2057, + "num_input_tokens_seen": 2855648, + "step": 3980 + }, + { + "epoch": 8.284823284823284, + "grad_norm": 0.286447674036026, + "learning_rate": 4.878610218691673e-05, + "loss": 0.1831, + "num_input_tokens_seen": 2859168, + "step": 3985 + }, + { + "epoch": 8.295218295218294, + "grad_norm": 0.3447604179382324, + "learning_rate": 4.87830783212702e-05, + "loss": 0.1246, + "num_input_tokens_seen": 2862784, + "step": 3990 + }, + { + "epoch": 8.305613305613306, + "grad_norm": 0.6316245198249817, + "learning_rate": 4.878005078797413e-05, + "loss": 0.1206, + "num_input_tokens_seen": 2866400, + "step": 3995 + }, + { + "epoch": 8.316008316008316, + "grad_norm": 0.351954847574234, + "learning_rate": 4.877701958749539e-05, + "loss": 0.1278, + "num_input_tokens_seen": 2870176, + "step": 4000 + }, + { + "epoch": 8.316008316008316, + "eval_loss": 0.15702593326568604, + "eval_runtime": 7.766, + "eval_samples_per_second": 110.224, + "eval_steps_per_second": 27.556, + "num_input_tokens_seen": 2870176, + "step": 4000 + }, + { + "epoch": 8.326403326403327, + "grad_norm": 0.15616875886917114, + "learning_rate": 4.877398472030142e-05, + "loss": 0.1428, + "num_input_tokens_seen": 2873824, + "step": 4005 + }, + { + "epoch": 8.336798336798337, + "grad_norm": 0.31720128655433655, + "learning_rate": 4.877094618686024e-05, + "loss": 0.1223, + "num_input_tokens_seen": 2877280, + "step": 4010 + }, + { + "epoch": 8.347193347193347, + "grad_norm": 0.2656707763671875, + "learning_rate": 4.876790398764045e-05, + "loss": 0.1279, + "num_input_tokens_seen": 2880800, + "step": 4015 + }, + { + "epoch": 8.357588357588357, + "grad_norm": 0.22100241482257843, + "learning_rate": 4.8764858123111167e-05, + "loss": 0.1076, + "num_input_tokens_seen": 2884384, + "step": 4020 + }, + { + "epoch": 8.367983367983369, + "grad_norm": 0.4404619336128235, + "learning_rate": 4.876180859374212e-05, + "loss": 0.1542, + "num_input_tokens_seen": 2887968, + "step": 4025 + }, + { + "epoch": 8.378378378378379, + "grad_norm": 0.4973001778125763, + "learning_rate": 4.875875540000357e-05, + "loss": 0.1368, + "num_input_tokens_seen": 2891488, + "step": 4030 + }, + { + "epoch": 8.388773388773389, + "grad_norm": 0.190226212143898, + "learning_rate": 4.8755698542366376e-05, + "loss": 0.1247, + "num_input_tokens_seen": 2895104, + "step": 4035 + }, + { + "epoch": 8.3991683991684, + "grad_norm": 0.42183825373649597, + "learning_rate": 4.875263802130193e-05, + "loss": 0.1738, + "num_input_tokens_seen": 2898784, + "step": 4040 + }, + { + "epoch": 8.40956340956341, + "grad_norm": 0.2992412745952606, + "learning_rate": 4.8749573837282207e-05, + "loss": 0.1872, + "num_input_tokens_seen": 2902368, + "step": 4045 + }, + { + "epoch": 8.41995841995842, + "grad_norm": 0.21060816943645477, + "learning_rate": 4.874650599077974e-05, + "loss": 0.1363, + "num_input_tokens_seen": 2906080, + "step": 4050 + }, + { + "epoch": 8.43035343035343, + "grad_norm": 0.741044282913208, + "learning_rate": 4.874343448226764e-05, + "loss": 0.2033, + "num_input_tokens_seen": 2909856, + "step": 4055 + }, + { + "epoch": 8.440748440748441, + "grad_norm": 0.24047306180000305, + "learning_rate": 4.874035931221955e-05, + "loss": 0.1104, + "num_input_tokens_seen": 2913568, + "step": 4060 + }, + { + "epoch": 8.451143451143452, + "grad_norm": 0.23941895365715027, + "learning_rate": 4.8737280481109724e-05, + "loss": 0.1254, + "num_input_tokens_seen": 2917088, + "step": 4065 + }, + { + "epoch": 8.461538461538462, + "grad_norm": 0.16839860379695892, + "learning_rate": 4.873419798941294e-05, + "loss": 0.1271, + "num_input_tokens_seen": 2920512, + "step": 4070 + }, + { + "epoch": 8.471933471933472, + "grad_norm": 0.293361097574234, + "learning_rate": 4.873111183760458e-05, + "loss": 0.1382, + "num_input_tokens_seen": 2924128, + "step": 4075 + }, + { + "epoch": 8.482328482328482, + "grad_norm": 0.5209582448005676, + "learning_rate": 4.8728022026160537e-05, + "loss": 0.1344, + "num_input_tokens_seen": 2927680, + "step": 4080 + }, + { + "epoch": 8.492723492723492, + "grad_norm": 0.2857867479324341, + "learning_rate": 4.872492855555732e-05, + "loss": 0.1688, + "num_input_tokens_seen": 2931200, + "step": 4085 + }, + { + "epoch": 8.503118503118504, + "grad_norm": 0.5684158205986023, + "learning_rate": 4.8721831426271956e-05, + "loss": 0.1442, + "num_input_tokens_seen": 2934816, + "step": 4090 + }, + { + "epoch": 8.513513513513514, + "grad_norm": 0.3871106505393982, + "learning_rate": 4.87187306387821e-05, + "loss": 0.1685, + "num_input_tokens_seen": 2938432, + "step": 4095 + }, + { + "epoch": 8.523908523908524, + "grad_norm": 0.2547578811645508, + "learning_rate": 4.87156261935659e-05, + "loss": 0.136, + "num_input_tokens_seen": 2942048, + "step": 4100 + }, + { + "epoch": 8.534303534303534, + "grad_norm": 0.26382309198379517, + "learning_rate": 4.871251809110211e-05, + "loss": 0.1479, + "num_input_tokens_seen": 2945696, + "step": 4105 + }, + { + "epoch": 8.544698544698544, + "grad_norm": 0.27865707874298096, + "learning_rate": 4.8709406331870044e-05, + "loss": 0.1166, + "num_input_tokens_seen": 2949344, + "step": 4110 + }, + { + "epoch": 8.555093555093555, + "grad_norm": 0.24484458565711975, + "learning_rate": 4.8706290916349574e-05, + "loss": 0.1056, + "num_input_tokens_seen": 2952960, + "step": 4115 + }, + { + "epoch": 8.565488565488565, + "grad_norm": 0.27114415168762207, + "learning_rate": 4.8703171845021134e-05, + "loss": 0.1434, + "num_input_tokens_seen": 2956512, + "step": 4120 + }, + { + "epoch": 8.575883575883577, + "grad_norm": 0.43920883536338806, + "learning_rate": 4.870004911836572e-05, + "loss": 0.1347, + "num_input_tokens_seen": 2959904, + "step": 4125 + }, + { + "epoch": 8.586278586278587, + "grad_norm": 0.15606115758419037, + "learning_rate": 4.869692273686489e-05, + "loss": 0.191, + "num_input_tokens_seen": 2963712, + "step": 4130 + }, + { + "epoch": 8.596673596673597, + "grad_norm": 0.27159401774406433, + "learning_rate": 4.869379270100079e-05, + "loss": 0.1343, + "num_input_tokens_seen": 2967328, + "step": 4135 + }, + { + "epoch": 8.607068607068607, + "grad_norm": 0.42932310700416565, + "learning_rate": 4.86906590112561e-05, + "loss": 0.11, + "num_input_tokens_seen": 2971072, + "step": 4140 + }, + { + "epoch": 8.617463617463617, + "grad_norm": 0.2053198218345642, + "learning_rate": 4.8687521668114064e-05, + "loss": 0.1036, + "num_input_tokens_seen": 2974592, + "step": 4145 + }, + { + "epoch": 8.627858627858627, + "grad_norm": 0.22683531045913696, + "learning_rate": 4.868438067205853e-05, + "loss": 0.1455, + "num_input_tokens_seen": 2978176, + "step": 4150 + }, + { + "epoch": 8.638253638253639, + "grad_norm": 0.3232594132423401, + "learning_rate": 4.8681236023573844e-05, + "loss": 0.1194, + "num_input_tokens_seen": 2981632, + "step": 4155 + }, + { + "epoch": 8.64864864864865, + "grad_norm": 0.37288719415664673, + "learning_rate": 4.867808772314497e-05, + "loss": 0.1602, + "num_input_tokens_seen": 2985056, + "step": 4160 + }, + { + "epoch": 8.65904365904366, + "grad_norm": 0.17601755261421204, + "learning_rate": 4.867493577125741e-05, + "loss": 0.1341, + "num_input_tokens_seen": 2988768, + "step": 4165 + }, + { + "epoch": 8.66943866943867, + "grad_norm": 0.219468355178833, + "learning_rate": 4.867178016839725e-05, + "loss": 0.1579, + "num_input_tokens_seen": 2992448, + "step": 4170 + }, + { + "epoch": 8.67983367983368, + "grad_norm": 0.1761338710784912, + "learning_rate": 4.8668620915051094e-05, + "loss": 0.1511, + "num_input_tokens_seen": 2996000, + "step": 4175 + }, + { + "epoch": 8.69022869022869, + "grad_norm": 0.1605132520198822, + "learning_rate": 4.866545801170616e-05, + "loss": 0.1063, + "num_input_tokens_seen": 2999520, + "step": 4180 + }, + { + "epoch": 8.700623700623701, + "grad_norm": 0.28420984745025635, + "learning_rate": 4.86622914588502e-05, + "loss": 0.1505, + "num_input_tokens_seen": 3003200, + "step": 4185 + }, + { + "epoch": 8.711018711018712, + "grad_norm": 0.6203392744064331, + "learning_rate": 4.865912125697154e-05, + "loss": 0.1487, + "num_input_tokens_seen": 3006848, + "step": 4190 + }, + { + "epoch": 8.721413721413722, + "grad_norm": 0.3685559332370758, + "learning_rate": 4.865594740655907e-05, + "loss": 0.1112, + "num_input_tokens_seen": 3010272, + "step": 4195 + }, + { + "epoch": 8.731808731808732, + "grad_norm": 0.2537704408168793, + "learning_rate": 4.865276990810222e-05, + "loss": 0.1559, + "num_input_tokens_seen": 3013792, + "step": 4200 + }, + { + "epoch": 8.731808731808732, + "eval_loss": 0.1622593253850937, + "eval_runtime": 7.7745, + "eval_samples_per_second": 110.104, + "eval_steps_per_second": 27.526, + "num_input_tokens_seen": 3013792, + "step": 4200 + }, + { + "epoch": 8.742203742203742, + "grad_norm": 0.37489786744117737, + "learning_rate": 4.8649588762091016e-05, + "loss": 0.1294, + "num_input_tokens_seen": 3017344, + "step": 4205 + }, + { + "epoch": 8.752598752598752, + "grad_norm": 0.3139389753341675, + "learning_rate": 4.8646403969016016e-05, + "loss": 0.1377, + "num_input_tokens_seen": 3021024, + "step": 4210 + }, + { + "epoch": 8.762993762993762, + "grad_norm": 0.4428741931915283, + "learning_rate": 4.864321552936838e-05, + "loss": 0.1925, + "num_input_tokens_seen": 3024576, + "step": 4215 + }, + { + "epoch": 8.773388773388774, + "grad_norm": 0.3922085464000702, + "learning_rate": 4.864002344363978e-05, + "loss": 0.1769, + "num_input_tokens_seen": 3028160, + "step": 4220 + }, + { + "epoch": 8.783783783783784, + "grad_norm": 0.317539244890213, + "learning_rate": 4.863682771232248e-05, + "loss": 0.1632, + "num_input_tokens_seen": 3031808, + "step": 4225 + }, + { + "epoch": 8.794178794178794, + "grad_norm": 0.241957888007164, + "learning_rate": 4.8633628335909324e-05, + "loss": 0.1515, + "num_input_tokens_seen": 3035424, + "step": 4230 + }, + { + "epoch": 8.804573804573804, + "grad_norm": 0.1821354478597641, + "learning_rate": 4.8630425314893676e-05, + "loss": 0.145, + "num_input_tokens_seen": 3039200, + "step": 4235 + }, + { + "epoch": 8.814968814968815, + "grad_norm": 0.32134705781936646, + "learning_rate": 4.862721864976948e-05, + "loss": 0.1621, + "num_input_tokens_seen": 3042720, + "step": 4240 + }, + { + "epoch": 8.825363825363825, + "grad_norm": 0.4265681207180023, + "learning_rate": 4.862400834103125e-05, + "loss": 0.1549, + "num_input_tokens_seen": 3046400, + "step": 4245 + }, + { + "epoch": 8.835758835758837, + "grad_norm": 0.2649822235107422, + "learning_rate": 4.862079438917406e-05, + "loss": 0.1811, + "num_input_tokens_seen": 3050208, + "step": 4250 + }, + { + "epoch": 8.846153846153847, + "grad_norm": 0.17230741679668427, + "learning_rate": 4.8617576794693536e-05, + "loss": 0.1684, + "num_input_tokens_seen": 3053824, + "step": 4255 + }, + { + "epoch": 8.856548856548857, + "grad_norm": 0.21292369067668915, + "learning_rate": 4.8614355558085875e-05, + "loss": 0.1463, + "num_input_tokens_seen": 3057312, + "step": 4260 + }, + { + "epoch": 8.866943866943867, + "grad_norm": 0.47252988815307617, + "learning_rate": 4.861113067984783e-05, + "loss": 0.1583, + "num_input_tokens_seen": 3061088, + "step": 4265 + }, + { + "epoch": 8.877338877338877, + "grad_norm": 0.2324046492576599, + "learning_rate": 4.860790216047671e-05, + "loss": 0.1347, + "num_input_tokens_seen": 3064736, + "step": 4270 + }, + { + "epoch": 8.887733887733887, + "grad_norm": 0.22694937884807587, + "learning_rate": 4.860467000047041e-05, + "loss": 0.0963, + "num_input_tokens_seen": 3068320, + "step": 4275 + }, + { + "epoch": 8.898128898128899, + "grad_norm": 0.353981077671051, + "learning_rate": 4.860143420032737e-05, + "loss": 0.1797, + "num_input_tokens_seen": 3071968, + "step": 4280 + }, + { + "epoch": 8.90852390852391, + "grad_norm": 0.39937323331832886, + "learning_rate": 4.859819476054657e-05, + "loss": 0.1474, + "num_input_tokens_seen": 3075584, + "step": 4285 + }, + { + "epoch": 8.91891891891892, + "grad_norm": 0.35355886816978455, + "learning_rate": 4.859495168162758e-05, + "loss": 0.1636, + "num_input_tokens_seen": 3079136, + "step": 4290 + }, + { + "epoch": 8.92931392931393, + "grad_norm": 0.2869257628917694, + "learning_rate": 4.859170496407054e-05, + "loss": 0.1733, + "num_input_tokens_seen": 3082816, + "step": 4295 + }, + { + "epoch": 8.93970893970894, + "grad_norm": 0.2125956416130066, + "learning_rate": 4.8588454608376114e-05, + "loss": 0.125, + "num_input_tokens_seen": 3086432, + "step": 4300 + }, + { + "epoch": 8.95010395010395, + "grad_norm": 0.31592243909835815, + "learning_rate": 4.8585200615045555e-05, + "loss": 0.1338, + "num_input_tokens_seen": 3090016, + "step": 4305 + }, + { + "epoch": 8.96049896049896, + "grad_norm": 0.25009801983833313, + "learning_rate": 4.8581942984580674e-05, + "loss": 0.1495, + "num_input_tokens_seen": 3093568, + "step": 4310 + }, + { + "epoch": 8.970893970893972, + "grad_norm": 0.2068023383617401, + "learning_rate": 4.857868171748384e-05, + "loss": 0.1324, + "num_input_tokens_seen": 3097024, + "step": 4315 + }, + { + "epoch": 8.981288981288982, + "grad_norm": 0.5032361149787903, + "learning_rate": 4.8575416814257976e-05, + "loss": 0.1454, + "num_input_tokens_seen": 3100608, + "step": 4320 + }, + { + "epoch": 8.991683991683992, + "grad_norm": 0.3708304166793823, + "learning_rate": 4.857214827540657e-05, + "loss": 0.1441, + "num_input_tokens_seen": 3104256, + "step": 4325 + }, + { + "epoch": 9.002079002079002, + "grad_norm": 0.22054150700569153, + "learning_rate": 4.856887610143367e-05, + "loss": 0.1355, + "num_input_tokens_seen": 3107832, + "step": 4330 + }, + { + "epoch": 9.012474012474012, + "grad_norm": 0.7480131983757019, + "learning_rate": 4.8565600292843896e-05, + "loss": 0.177, + "num_input_tokens_seen": 3111704, + "step": 4335 + }, + { + "epoch": 9.022869022869022, + "grad_norm": 0.39408376812934875, + "learning_rate": 4.856232085014241e-05, + "loss": 0.1176, + "num_input_tokens_seen": 3115288, + "step": 4340 + }, + { + "epoch": 9.033264033264032, + "grad_norm": 0.49258512258529663, + "learning_rate": 4.855903777383495e-05, + "loss": 0.1591, + "num_input_tokens_seen": 3118840, + "step": 4345 + }, + { + "epoch": 9.043659043659044, + "grad_norm": 0.2557823657989502, + "learning_rate": 4.85557510644278e-05, + "loss": 0.1265, + "num_input_tokens_seen": 3122552, + "step": 4350 + }, + { + "epoch": 9.054054054054054, + "grad_norm": 0.2996658384799957, + "learning_rate": 4.855246072242782e-05, + "loss": 0.1454, + "num_input_tokens_seen": 3126040, + "step": 4355 + }, + { + "epoch": 9.064449064449065, + "grad_norm": 0.3078649938106537, + "learning_rate": 4.8549166748342414e-05, + "loss": 0.1175, + "num_input_tokens_seen": 3129720, + "step": 4360 + }, + { + "epoch": 9.074844074844075, + "grad_norm": 0.3677508533000946, + "learning_rate": 4.8545869142679556e-05, + "loss": 0.1411, + "num_input_tokens_seen": 3133144, + "step": 4365 + }, + { + "epoch": 9.085239085239085, + "grad_norm": 0.22188450396060944, + "learning_rate": 4.8542567905947776e-05, + "loss": 0.1648, + "num_input_tokens_seen": 3136664, + "step": 4370 + }, + { + "epoch": 9.095634095634095, + "grad_norm": 0.21790172159671783, + "learning_rate": 4.853926303865618e-05, + "loss": 0.1723, + "num_input_tokens_seen": 3140184, + "step": 4375 + }, + { + "epoch": 9.106029106029107, + "grad_norm": 0.5443068742752075, + "learning_rate": 4.853595454131441e-05, + "loss": 0.1568, + "num_input_tokens_seen": 3143896, + "step": 4380 + }, + { + "epoch": 9.116424116424117, + "grad_norm": 0.2024877965450287, + "learning_rate": 4.8532642414432674e-05, + "loss": 0.1408, + "num_input_tokens_seen": 3147416, + "step": 4385 + }, + { + "epoch": 9.126819126819127, + "grad_norm": 0.3803432285785675, + "learning_rate": 4.8529326658521754e-05, + "loss": 0.1551, + "num_input_tokens_seen": 3151000, + "step": 4390 + }, + { + "epoch": 9.137214137214137, + "grad_norm": 0.16740967333316803, + "learning_rate": 4.8526007274092965e-05, + "loss": 0.1117, + "num_input_tokens_seen": 3154552, + "step": 4395 + }, + { + "epoch": 9.147609147609147, + "grad_norm": 0.2773268222808838, + "learning_rate": 4.852268426165822e-05, + "loss": 0.1162, + "num_input_tokens_seen": 3157976, + "step": 4400 + }, + { + "epoch": 9.147609147609147, + "eval_loss": 0.15863661468029022, + "eval_runtime": 7.7688, + "eval_samples_per_second": 110.185, + "eval_steps_per_second": 27.546, + "num_input_tokens_seen": 3157976, + "step": 4400 + }, + { + "epoch": 9.158004158004157, + "grad_norm": 0.31268075108528137, + "learning_rate": 4.851935762172995e-05, + "loss": 0.1685, + "num_input_tokens_seen": 3161592, + "step": 4405 + }, + { + "epoch": 9.16839916839917, + "grad_norm": 0.5402635931968689, + "learning_rate": 4.8516027354821175e-05, + "loss": 0.1372, + "num_input_tokens_seen": 3165112, + "step": 4410 + }, + { + "epoch": 9.17879417879418, + "grad_norm": 0.18274526298046112, + "learning_rate": 4.851269346144546e-05, + "loss": 0.159, + "num_input_tokens_seen": 3168664, + "step": 4415 + }, + { + "epoch": 9.18918918918919, + "grad_norm": 0.37747547030448914, + "learning_rate": 4.850935594211693e-05, + "loss": 0.1225, + "num_input_tokens_seen": 3172216, + "step": 4420 + }, + { + "epoch": 9.1995841995842, + "grad_norm": 0.28124165534973145, + "learning_rate": 4.850601479735029e-05, + "loss": 0.1919, + "num_input_tokens_seen": 3175896, + "step": 4425 + }, + { + "epoch": 9.20997920997921, + "grad_norm": 0.19472192227840424, + "learning_rate": 4.850267002766076e-05, + "loss": 0.1222, + "num_input_tokens_seen": 3179480, + "step": 4430 + }, + { + "epoch": 9.22037422037422, + "grad_norm": 0.5040925145149231, + "learning_rate": 4.849932163356417e-05, + "loss": 0.1205, + "num_input_tokens_seen": 3183128, + "step": 4435 + }, + { + "epoch": 9.23076923076923, + "grad_norm": 0.4256506860256195, + "learning_rate": 4.8495969615576864e-05, + "loss": 0.1249, + "num_input_tokens_seen": 3186744, + "step": 4440 + }, + { + "epoch": 9.241164241164242, + "grad_norm": 0.19586654007434845, + "learning_rate": 4.849261397421577e-05, + "loss": 0.1281, + "num_input_tokens_seen": 3190424, + "step": 4445 + }, + { + "epoch": 9.251559251559252, + "grad_norm": 0.7807199954986572, + "learning_rate": 4.848925470999839e-05, + "loss": 0.1346, + "num_input_tokens_seen": 3193976, + "step": 4450 + }, + { + "epoch": 9.261954261954262, + "grad_norm": 0.2993946671485901, + "learning_rate": 4.848589182344273e-05, + "loss": 0.1425, + "num_input_tokens_seen": 3197560, + "step": 4455 + }, + { + "epoch": 9.272349272349272, + "grad_norm": 0.3280732333660126, + "learning_rate": 4.848252531506742e-05, + "loss": 0.1227, + "num_input_tokens_seen": 3201176, + "step": 4460 + }, + { + "epoch": 9.282744282744282, + "grad_norm": 0.37443968653678894, + "learning_rate": 4.847915518539161e-05, + "loss": 0.1603, + "num_input_tokens_seen": 3204760, + "step": 4465 + }, + { + "epoch": 9.293139293139292, + "grad_norm": 0.2576562762260437, + "learning_rate": 4.847578143493501e-05, + "loss": 0.1165, + "num_input_tokens_seen": 3208216, + "step": 4470 + }, + { + "epoch": 9.303534303534304, + "grad_norm": 0.2515905499458313, + "learning_rate": 4.847240406421789e-05, + "loss": 0.1467, + "num_input_tokens_seen": 3211704, + "step": 4475 + }, + { + "epoch": 9.313929313929314, + "grad_norm": 0.34945040941238403, + "learning_rate": 4.84690230737611e-05, + "loss": 0.2074, + "num_input_tokens_seen": 3215192, + "step": 4480 + }, + { + "epoch": 9.324324324324325, + "grad_norm": 0.6154036521911621, + "learning_rate": 4.846563846408602e-05, + "loss": 0.1739, + "num_input_tokens_seen": 3218680, + "step": 4485 + }, + { + "epoch": 9.334719334719335, + "grad_norm": 0.1916683167219162, + "learning_rate": 4.84622502357146e-05, + "loss": 0.1352, + "num_input_tokens_seen": 3222232, + "step": 4490 + }, + { + "epoch": 9.345114345114345, + "grad_norm": 0.33855879306793213, + "learning_rate": 4.8458858389169345e-05, + "loss": 0.1257, + "num_input_tokens_seen": 3225976, + "step": 4495 + }, + { + "epoch": 9.355509355509355, + "grad_norm": 0.5435149669647217, + "learning_rate": 4.8455462924973334e-05, + "loss": 0.154, + "num_input_tokens_seen": 3229432, + "step": 4500 + }, + { + "epoch": 9.365904365904367, + "grad_norm": 0.12774546444416046, + "learning_rate": 4.845206384365018e-05, + "loss": 0.1244, + "num_input_tokens_seen": 3233080, + "step": 4505 + }, + { + "epoch": 9.376299376299377, + "grad_norm": 0.6887985467910767, + "learning_rate": 4.844866114572405e-05, + "loss": 0.1152, + "num_input_tokens_seen": 3236600, + "step": 4510 + }, + { + "epoch": 9.386694386694387, + "grad_norm": 0.28375551104545593, + "learning_rate": 4.8445254831719706e-05, + "loss": 0.1056, + "num_input_tokens_seen": 3240120, + "step": 4515 + }, + { + "epoch": 9.397089397089397, + "grad_norm": 0.30952298641204834, + "learning_rate": 4.8441844902162434e-05, + "loss": 0.1419, + "num_input_tokens_seen": 3243736, + "step": 4520 + }, + { + "epoch": 9.407484407484407, + "grad_norm": 0.16129767894744873, + "learning_rate": 4.843843135757809e-05, + "loss": 0.1311, + "num_input_tokens_seen": 3247288, + "step": 4525 + }, + { + "epoch": 9.417879417879417, + "grad_norm": 0.1489826887845993, + "learning_rate": 4.843501419849308e-05, + "loss": 0.1336, + "num_input_tokens_seen": 3250936, + "step": 4530 + }, + { + "epoch": 9.428274428274428, + "grad_norm": 0.1849830150604248, + "learning_rate": 4.8431593425434386e-05, + "loss": 0.1604, + "num_input_tokens_seen": 3254712, + "step": 4535 + }, + { + "epoch": 9.43866943866944, + "grad_norm": 0.20174755156040192, + "learning_rate": 4.8428169038929526e-05, + "loss": 0.14, + "num_input_tokens_seen": 3258104, + "step": 4540 + }, + { + "epoch": 9.44906444906445, + "grad_norm": 0.30683889985084534, + "learning_rate": 4.8424741039506575e-05, + "loss": 0.1092, + "num_input_tokens_seen": 3261624, + "step": 4545 + }, + { + "epoch": 9.45945945945946, + "grad_norm": 0.3024084270000458, + "learning_rate": 4.842130942769419e-05, + "loss": 0.1333, + "num_input_tokens_seen": 3265336, + "step": 4550 + }, + { + "epoch": 9.46985446985447, + "grad_norm": 0.16825638711452484, + "learning_rate": 4.841787420402156e-05, + "loss": 0.1422, + "num_input_tokens_seen": 3269048, + "step": 4555 + }, + { + "epoch": 9.48024948024948, + "grad_norm": 0.38702890276908875, + "learning_rate": 4.841443536901844e-05, + "loss": 0.1823, + "num_input_tokens_seen": 3272760, + "step": 4560 + }, + { + "epoch": 9.49064449064449, + "grad_norm": 0.345951110124588, + "learning_rate": 4.841099292321514e-05, + "loss": 0.1664, + "num_input_tokens_seen": 3276536, + "step": 4565 + }, + { + "epoch": 9.5010395010395, + "grad_norm": 0.47576677799224854, + "learning_rate": 4.8407546867142525e-05, + "loss": 0.1424, + "num_input_tokens_seen": 3280024, + "step": 4570 + }, + { + "epoch": 9.511434511434512, + "grad_norm": 0.3977706730365753, + "learning_rate": 4.840409720133203e-05, + "loss": 0.127, + "num_input_tokens_seen": 3283640, + "step": 4575 + }, + { + "epoch": 9.521829521829522, + "grad_norm": 0.30214396119117737, + "learning_rate": 4.8400643926315634e-05, + "loss": 0.148, + "num_input_tokens_seen": 3287064, + "step": 4580 + }, + { + "epoch": 9.532224532224532, + "grad_norm": 0.20104850828647614, + "learning_rate": 4.839718704262587e-05, + "loss": 0.1128, + "num_input_tokens_seen": 3290584, + "step": 4585 + }, + { + "epoch": 9.542619542619542, + "grad_norm": 0.5940218567848206, + "learning_rate": 4.839372655079585e-05, + "loss": 0.1372, + "num_input_tokens_seen": 3294136, + "step": 4590 + }, + { + "epoch": 9.553014553014552, + "grad_norm": 0.48701754212379456, + "learning_rate": 4.83902624513592e-05, + "loss": 0.1564, + "num_input_tokens_seen": 3297880, + "step": 4595 + }, + { + "epoch": 9.563409563409563, + "grad_norm": 0.30642274022102356, + "learning_rate": 4.838679474485014e-05, + "loss": 0.1551, + "num_input_tokens_seen": 3301400, + "step": 4600 + }, + { + "epoch": 9.563409563409563, + "eval_loss": 0.15909497439861298, + "eval_runtime": 7.7607, + "eval_samples_per_second": 110.3, + "eval_steps_per_second": 27.575, + "num_input_tokens_seen": 3301400, + "step": 4600 + }, + { + "epoch": 9.573804573804575, + "grad_norm": 0.19734421372413635, + "learning_rate": 4.838332343180343e-05, + "loss": 0.0835, + "num_input_tokens_seen": 3304952, + "step": 4605 + }, + { + "epoch": 9.584199584199585, + "grad_norm": 0.41786015033721924, + "learning_rate": 4.83798485127544e-05, + "loss": 0.1214, + "num_input_tokens_seen": 3308504, + "step": 4610 + }, + { + "epoch": 9.594594594594595, + "grad_norm": 0.40348687767982483, + "learning_rate": 4.837636998823892e-05, + "loss": 0.1256, + "num_input_tokens_seen": 3312056, + "step": 4615 + }, + { + "epoch": 9.604989604989605, + "grad_norm": 0.172202929854393, + "learning_rate": 4.8372887858793414e-05, + "loss": 0.1533, + "num_input_tokens_seen": 3315800, + "step": 4620 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 0.27265438437461853, + "learning_rate": 4.836940212495489e-05, + "loss": 0.1398, + "num_input_tokens_seen": 3319384, + "step": 4625 + }, + { + "epoch": 9.625779625779625, + "grad_norm": 0.6148355603218079, + "learning_rate": 4.836591278726087e-05, + "loss": 0.1481, + "num_input_tokens_seen": 3322968, + "step": 4630 + }, + { + "epoch": 9.636174636174637, + "grad_norm": 0.3283523619174957, + "learning_rate": 4.836241984624947e-05, + "loss": 0.1413, + "num_input_tokens_seen": 3326520, + "step": 4635 + }, + { + "epoch": 9.646569646569647, + "grad_norm": 0.5713521838188171, + "learning_rate": 4.8358923302459336e-05, + "loss": 0.2039, + "num_input_tokens_seen": 3330136, + "step": 4640 + }, + { + "epoch": 9.656964656964657, + "grad_norm": 0.3635396957397461, + "learning_rate": 4.835542315642968e-05, + "loss": 0.1651, + "num_input_tokens_seen": 3333688, + "step": 4645 + }, + { + "epoch": 9.667359667359667, + "grad_norm": 0.37131014466285706, + "learning_rate": 4.8351919408700274e-05, + "loss": 0.1562, + "num_input_tokens_seen": 3337176, + "step": 4650 + }, + { + "epoch": 9.677754677754677, + "grad_norm": 0.3076980412006378, + "learning_rate": 4.834841205981144e-05, + "loss": 0.1416, + "num_input_tokens_seen": 3340952, + "step": 4655 + }, + { + "epoch": 9.688149688149688, + "grad_norm": 0.16204164922237396, + "learning_rate": 4.8344901110304054e-05, + "loss": 0.148, + "num_input_tokens_seen": 3344632, + "step": 4660 + }, + { + "epoch": 9.698544698544698, + "grad_norm": 0.22206032276153564, + "learning_rate": 4.8341386560719534e-05, + "loss": 0.1397, + "num_input_tokens_seen": 3348216, + "step": 4665 + }, + { + "epoch": 9.70893970893971, + "grad_norm": 0.17310352623462677, + "learning_rate": 4.833786841159989e-05, + "loss": 0.1521, + "num_input_tokens_seen": 3351736, + "step": 4670 + }, + { + "epoch": 9.71933471933472, + "grad_norm": 0.2817917466163635, + "learning_rate": 4.833434666348765e-05, + "loss": 0.1362, + "num_input_tokens_seen": 3355352, + "step": 4675 + }, + { + "epoch": 9.72972972972973, + "grad_norm": 0.4625668227672577, + "learning_rate": 4.833082131692592e-05, + "loss": 0.1751, + "num_input_tokens_seen": 3358904, + "step": 4680 + }, + { + "epoch": 9.74012474012474, + "grad_norm": 0.3313356041908264, + "learning_rate": 4.832729237245835e-05, + "loss": 0.1411, + "num_input_tokens_seen": 3362552, + "step": 4685 + }, + { + "epoch": 9.75051975051975, + "grad_norm": 0.24874334037303925, + "learning_rate": 4.8323759830629145e-05, + "loss": 0.1208, + "num_input_tokens_seen": 3366232, + "step": 4690 + }, + { + "epoch": 9.76091476091476, + "grad_norm": 0.3763890266418457, + "learning_rate": 4.8320223691983066e-05, + "loss": 0.1665, + "num_input_tokens_seen": 3369816, + "step": 4695 + }, + { + "epoch": 9.771309771309772, + "grad_norm": 0.25151529908180237, + "learning_rate": 4.831668395706544e-05, + "loss": 0.1565, + "num_input_tokens_seen": 3373304, + "step": 4700 + }, + { + "epoch": 9.781704781704782, + "grad_norm": 0.5387188196182251, + "learning_rate": 4.8313140626422125e-05, + "loss": 0.1348, + "num_input_tokens_seen": 3377080, + "step": 4705 + }, + { + "epoch": 9.792099792099792, + "grad_norm": 0.14996860921382904, + "learning_rate": 4.830959370059956e-05, + "loss": 0.1331, + "num_input_tokens_seen": 3380760, + "step": 4710 + }, + { + "epoch": 9.802494802494802, + "grad_norm": 0.44819995760917664, + "learning_rate": 4.830604318014472e-05, + "loss": 0.1751, + "num_input_tokens_seen": 3384408, + "step": 4715 + }, + { + "epoch": 9.812889812889813, + "grad_norm": 0.22961167991161346, + "learning_rate": 4.830248906560514e-05, + "loss": 0.1319, + "num_input_tokens_seen": 3388088, + "step": 4720 + }, + { + "epoch": 9.823284823284823, + "grad_norm": 0.35746681690216064, + "learning_rate": 4.829893135752891e-05, + "loss": 0.1158, + "num_input_tokens_seen": 3391768, + "step": 4725 + }, + { + "epoch": 9.833679833679835, + "grad_norm": 0.3711194097995758, + "learning_rate": 4.829537005646466e-05, + "loss": 0.114, + "num_input_tokens_seen": 3395352, + "step": 4730 + }, + { + "epoch": 9.844074844074845, + "grad_norm": 0.3774109184741974, + "learning_rate": 4.8291805162961615e-05, + "loss": 0.1323, + "num_input_tokens_seen": 3398968, + "step": 4735 + }, + { + "epoch": 9.854469854469855, + "grad_norm": 0.22688761353492737, + "learning_rate": 4.82882366775695e-05, + "loss": 0.1383, + "num_input_tokens_seen": 3402648, + "step": 4740 + }, + { + "epoch": 9.864864864864865, + "grad_norm": 0.3572632074356079, + "learning_rate": 4.828466460083864e-05, + "loss": 0.1271, + "num_input_tokens_seen": 3406168, + "step": 4745 + }, + { + "epoch": 9.875259875259875, + "grad_norm": 0.20554475486278534, + "learning_rate": 4.8281088933319877e-05, + "loss": 0.1439, + "num_input_tokens_seen": 3409688, + "step": 4750 + }, + { + "epoch": 9.885654885654885, + "grad_norm": 0.5609762072563171, + "learning_rate": 4.827750967556464e-05, + "loss": 0.1168, + "num_input_tokens_seen": 3413304, + "step": 4755 + }, + { + "epoch": 9.896049896049895, + "grad_norm": 0.6570037603378296, + "learning_rate": 4.827392682812488e-05, + "loss": 0.1807, + "num_input_tokens_seen": 3417048, + "step": 4760 + }, + { + "epoch": 9.906444906444907, + "grad_norm": 0.13413049280643463, + "learning_rate": 4.827034039155312e-05, + "loss": 0.1073, + "num_input_tokens_seen": 3420696, + "step": 4765 + }, + { + "epoch": 9.916839916839917, + "grad_norm": 0.2095513492822647, + "learning_rate": 4.8266750366402445e-05, + "loss": 0.1212, + "num_input_tokens_seen": 3424280, + "step": 4770 + }, + { + "epoch": 9.927234927234927, + "grad_norm": 0.2507452070713043, + "learning_rate": 4.8263156753226476e-05, + "loss": 0.1772, + "num_input_tokens_seen": 3427768, + "step": 4775 + }, + { + "epoch": 9.937629937629938, + "grad_norm": 0.40432867407798767, + "learning_rate": 4.8259559552579394e-05, + "loss": 0.1921, + "num_input_tokens_seen": 3431192, + "step": 4780 + }, + { + "epoch": 9.948024948024948, + "grad_norm": 0.19768942892551422, + "learning_rate": 4.825595876501593e-05, + "loss": 0.125, + "num_input_tokens_seen": 3434744, + "step": 4785 + }, + { + "epoch": 9.958419958419958, + "grad_norm": 0.25956597924232483, + "learning_rate": 4.825235439109137e-05, + "loss": 0.2331, + "num_input_tokens_seen": 3438296, + "step": 4790 + }, + { + "epoch": 9.96881496881497, + "grad_norm": 0.14911910891532898, + "learning_rate": 4.824874643136156e-05, + "loss": 0.1322, + "num_input_tokens_seen": 3441912, + "step": 4795 + }, + { + "epoch": 9.97920997920998, + "grad_norm": 0.32710108160972595, + "learning_rate": 4.824513488638288e-05, + "loss": 0.146, + "num_input_tokens_seen": 3445528, + "step": 4800 + }, + { + "epoch": 9.97920997920998, + "eval_loss": 0.15505944192409515, + "eval_runtime": 7.7551, + "eval_samples_per_second": 110.379, + "eval_steps_per_second": 27.595, + "num_input_tokens_seen": 3445528, + "step": 4800 + }, + { + "epoch": 9.98960498960499, + "grad_norm": 0.3327561914920807, + "learning_rate": 4.8241519756712293e-05, + "loss": 0.0845, + "num_input_tokens_seen": 3448952, + "step": 4805 + }, + { + "epoch": 10.0, + "grad_norm": 0.47315165400505066, + "learning_rate": 4.8237901042907285e-05, + "loss": 0.1567, + "num_input_tokens_seen": 3452592, + "step": 4810 + }, + { + "epoch": 10.01039501039501, + "grad_norm": 0.3660740852355957, + "learning_rate": 4.823427874552591e-05, + "loss": 0.1381, + "num_input_tokens_seen": 3456208, + "step": 4815 + }, + { + "epoch": 10.02079002079002, + "grad_norm": 0.24818532168865204, + "learning_rate": 4.823065286512677e-05, + "loss": 0.1249, + "num_input_tokens_seen": 3459824, + "step": 4820 + }, + { + "epoch": 10.03118503118503, + "grad_norm": 0.2867206931114197, + "learning_rate": 4.8227023402269025e-05, + "loss": 0.1313, + "num_input_tokens_seen": 3463504, + "step": 4825 + }, + { + "epoch": 10.041580041580042, + "grad_norm": 0.35022681951522827, + "learning_rate": 4.822339035751239e-05, + "loss": 0.1639, + "num_input_tokens_seen": 3467088, + "step": 4830 + }, + { + "epoch": 10.051975051975052, + "grad_norm": 0.44373947381973267, + "learning_rate": 4.8219753731417104e-05, + "loss": 0.1362, + "num_input_tokens_seen": 3470672, + "step": 4835 + }, + { + "epoch": 10.062370062370062, + "grad_norm": 0.2842780351638794, + "learning_rate": 4.821611352454401e-05, + "loss": 0.1414, + "num_input_tokens_seen": 3474224, + "step": 4840 + }, + { + "epoch": 10.072765072765073, + "grad_norm": 0.18302452564239502, + "learning_rate": 4.8212469737454444e-05, + "loss": 0.1351, + "num_input_tokens_seen": 3477936, + "step": 4845 + }, + { + "epoch": 10.083160083160083, + "grad_norm": 0.4312876760959625, + "learning_rate": 4.820882237071035e-05, + "loss": 0.1532, + "num_input_tokens_seen": 3481424, + "step": 4850 + }, + { + "epoch": 10.093555093555093, + "grad_norm": 0.3254040479660034, + "learning_rate": 4.820517142487417e-05, + "loss": 0.1408, + "num_input_tokens_seen": 3484784, + "step": 4855 + }, + { + "epoch": 10.103950103950105, + "grad_norm": 0.4599064290523529, + "learning_rate": 4.8201516900508956e-05, + "loss": 0.1478, + "num_input_tokens_seen": 3488336, + "step": 4860 + }, + { + "epoch": 10.114345114345115, + "grad_norm": 0.23293134570121765, + "learning_rate": 4.819785879817827e-05, + "loss": 0.1383, + "num_input_tokens_seen": 3491856, + "step": 4865 + }, + { + "epoch": 10.124740124740125, + "grad_norm": 0.17379878461360931, + "learning_rate": 4.8194197118446226e-05, + "loss": 0.1191, + "num_input_tokens_seen": 3495312, + "step": 4870 + }, + { + "epoch": 10.135135135135135, + "grad_norm": 0.4681810438632965, + "learning_rate": 4.819053186187752e-05, + "loss": 0.1437, + "num_input_tokens_seen": 3498800, + "step": 4875 + }, + { + "epoch": 10.145530145530145, + "grad_norm": 0.29741370677948, + "learning_rate": 4.818686302903736e-05, + "loss": 0.1316, + "num_input_tokens_seen": 3502352, + "step": 4880 + }, + { + "epoch": 10.155925155925155, + "grad_norm": 0.3211456537246704, + "learning_rate": 4.818319062049154e-05, + "loss": 0.0905, + "num_input_tokens_seen": 3505808, + "step": 4885 + }, + { + "epoch": 10.166320166320165, + "grad_norm": 0.22134575247764587, + "learning_rate": 4.817951463680639e-05, + "loss": 0.1615, + "num_input_tokens_seen": 3509328, + "step": 4890 + }, + { + "epoch": 10.176715176715177, + "grad_norm": 0.09246786683797836, + "learning_rate": 4.817583507854879e-05, + "loss": 0.1509, + "num_input_tokens_seen": 3513040, + "step": 4895 + }, + { + "epoch": 10.187110187110187, + "grad_norm": 0.8970663547515869, + "learning_rate": 4.817215194628617e-05, + "loss": 0.1838, + "num_input_tokens_seen": 3516752, + "step": 4900 + }, + { + "epoch": 10.197505197505198, + "grad_norm": 0.3051356077194214, + "learning_rate": 4.816846524058653e-05, + "loss": 0.145, + "num_input_tokens_seen": 3520400, + "step": 4905 + }, + { + "epoch": 10.207900207900208, + "grad_norm": 0.4676916003227234, + "learning_rate": 4.816477496201839e-05, + "loss": 0.1522, + "num_input_tokens_seen": 3523984, + "step": 4910 + }, + { + "epoch": 10.218295218295218, + "grad_norm": 0.1627822369337082, + "learning_rate": 4.8161081111150845e-05, + "loss": 0.1619, + "num_input_tokens_seen": 3527504, + "step": 4915 + }, + { + "epoch": 10.228690228690228, + "grad_norm": 0.2499762326478958, + "learning_rate": 4.815738368855354e-05, + "loss": 0.1147, + "num_input_tokens_seen": 3530992, + "step": 4920 + }, + { + "epoch": 10.23908523908524, + "grad_norm": 0.4798378348350525, + "learning_rate": 4.815368269479664e-05, + "loss": 0.1004, + "num_input_tokens_seen": 3534544, + "step": 4925 + }, + { + "epoch": 10.24948024948025, + "grad_norm": 0.16795794665813446, + "learning_rate": 4.814997813045092e-05, + "loss": 0.1652, + "num_input_tokens_seen": 3538128, + "step": 4930 + }, + { + "epoch": 10.25987525987526, + "grad_norm": 0.28971678018569946, + "learning_rate": 4.814626999608764e-05, + "loss": 0.1186, + "num_input_tokens_seen": 3541744, + "step": 4935 + }, + { + "epoch": 10.27027027027027, + "grad_norm": 0.41961169242858887, + "learning_rate": 4.814255829227865e-05, + "loss": 0.1241, + "num_input_tokens_seen": 3545360, + "step": 4940 + }, + { + "epoch": 10.28066528066528, + "grad_norm": 0.21111123263835907, + "learning_rate": 4.813884301959635e-05, + "loss": 0.1227, + "num_input_tokens_seen": 3548816, + "step": 4945 + }, + { + "epoch": 10.29106029106029, + "grad_norm": 0.2871948480606079, + "learning_rate": 4.813512417861368e-05, + "loss": 0.2353, + "num_input_tokens_seen": 3552528, + "step": 4950 + }, + { + "epoch": 10.301455301455302, + "grad_norm": 0.28629520535469055, + "learning_rate": 4.813140176990411e-05, + "loss": 0.1997, + "num_input_tokens_seen": 3556048, + "step": 4955 + }, + { + "epoch": 10.311850311850312, + "grad_norm": 0.27501773834228516, + "learning_rate": 4.8127675794041714e-05, + "loss": 0.1323, + "num_input_tokens_seen": 3559728, + "step": 4960 + }, + { + "epoch": 10.322245322245323, + "grad_norm": 0.35303378105163574, + "learning_rate": 4.812394625160107e-05, + "loss": 0.1492, + "num_input_tokens_seen": 3563344, + "step": 4965 + }, + { + "epoch": 10.332640332640333, + "grad_norm": 0.505358099937439, + "learning_rate": 4.812021314315732e-05, + "loss": 0.1691, + "num_input_tokens_seen": 3566896, + "step": 4970 + }, + { + "epoch": 10.343035343035343, + "grad_norm": 0.17586801946163177, + "learning_rate": 4.811647646928616e-05, + "loss": 0.1635, + "num_input_tokens_seen": 3570352, + "step": 4975 + }, + { + "epoch": 10.353430353430353, + "grad_norm": 0.32754093408584595, + "learning_rate": 4.8112736230563814e-05, + "loss": 0.1096, + "num_input_tokens_seen": 3573840, + "step": 4980 + }, + { + "epoch": 10.363825363825363, + "grad_norm": 0.3432084619998932, + "learning_rate": 4.81089924275671e-05, + "loss": 0.111, + "num_input_tokens_seen": 3577264, + "step": 4985 + }, + { + "epoch": 10.374220374220375, + "grad_norm": 0.1561436653137207, + "learning_rate": 4.810524506087335e-05, + "loss": 0.1262, + "num_input_tokens_seen": 3580912, + "step": 4990 + }, + { + "epoch": 10.384615384615385, + "grad_norm": 0.23185205459594727, + "learning_rate": 4.810149413106044e-05, + "loss": 0.1243, + "num_input_tokens_seen": 3584528, + "step": 4995 + }, + { + "epoch": 10.395010395010395, + "grad_norm": 0.16769418120384216, + "learning_rate": 4.809773963870684e-05, + "loss": 0.1104, + "num_input_tokens_seen": 3588176, + "step": 5000 + }, + { + "epoch": 10.395010395010395, + "eval_loss": 0.1561512053012848, + "eval_runtime": 7.7568, + "eval_samples_per_second": 110.355, + "eval_steps_per_second": 27.589, + "num_input_tokens_seen": 3588176, + "step": 5000 + }, + { + "epoch": 10.405405405405405, + "grad_norm": 0.4347681403160095, + "learning_rate": 4.809398158439151e-05, + "loss": 0.1741, + "num_input_tokens_seen": 3591792, + "step": 5005 + }, + { + "epoch": 10.415800415800415, + "grad_norm": 0.46501150727272034, + "learning_rate": 4.8090219968694005e-05, + "loss": 0.1513, + "num_input_tokens_seen": 3595344, + "step": 5010 + }, + { + "epoch": 10.426195426195425, + "grad_norm": 0.7515895366668701, + "learning_rate": 4.808645479219442e-05, + "loss": 0.132, + "num_input_tokens_seen": 3599088, + "step": 5015 + }, + { + "epoch": 10.436590436590437, + "grad_norm": 0.41993534564971924, + "learning_rate": 4.8082686055473375e-05, + "loss": 0.1624, + "num_input_tokens_seen": 3602608, + "step": 5020 + }, + { + "epoch": 10.446985446985448, + "grad_norm": 0.2987852096557617, + "learning_rate": 4.8078913759112066e-05, + "loss": 0.1596, + "num_input_tokens_seen": 3606160, + "step": 5025 + }, + { + "epoch": 10.457380457380458, + "grad_norm": 0.2715710401535034, + "learning_rate": 4.807513790369223e-05, + "loss": 0.139, + "num_input_tokens_seen": 3609776, + "step": 5030 + }, + { + "epoch": 10.467775467775468, + "grad_norm": 0.3032274544239044, + "learning_rate": 4.8071358489796145e-05, + "loss": 0.1389, + "num_input_tokens_seen": 3613296, + "step": 5035 + }, + { + "epoch": 10.478170478170478, + "grad_norm": 0.16615277528762817, + "learning_rate": 4.806757551800665e-05, + "loss": 0.1405, + "num_input_tokens_seen": 3617136, + "step": 5040 + }, + { + "epoch": 10.488565488565488, + "grad_norm": 0.4073868691921234, + "learning_rate": 4.806378898890713e-05, + "loss": 0.121, + "num_input_tokens_seen": 3620720, + "step": 5045 + }, + { + "epoch": 10.4989604989605, + "grad_norm": 0.20629164576530457, + "learning_rate": 4.80599989030815e-05, + "loss": 0.1075, + "num_input_tokens_seen": 3624336, + "step": 5050 + }, + { + "epoch": 10.50935550935551, + "grad_norm": 0.7612397074699402, + "learning_rate": 4.805620526111426e-05, + "loss": 0.1954, + "num_input_tokens_seen": 3627824, + "step": 5055 + }, + { + "epoch": 10.51975051975052, + "grad_norm": 0.4267899990081787, + "learning_rate": 4.805240806359042e-05, + "loss": 0.1242, + "num_input_tokens_seen": 3631504, + "step": 5060 + }, + { + "epoch": 10.53014553014553, + "grad_norm": 0.17109666764736176, + "learning_rate": 4.804860731109557e-05, + "loss": 0.1512, + "num_input_tokens_seen": 3635280, + "step": 5065 + }, + { + "epoch": 10.54054054054054, + "grad_norm": 0.4574415981769562, + "learning_rate": 4.804480300421581e-05, + "loss": 0.1555, + "num_input_tokens_seen": 3638960, + "step": 5070 + }, + { + "epoch": 10.55093555093555, + "grad_norm": 0.17465011775493622, + "learning_rate": 4.804099514353784e-05, + "loss": 0.1735, + "num_input_tokens_seen": 3642480, + "step": 5075 + }, + { + "epoch": 10.56133056133056, + "grad_norm": 0.43005096912384033, + "learning_rate": 4.8037183729648867e-05, + "loss": 0.1132, + "num_input_tokens_seen": 3646000, + "step": 5080 + }, + { + "epoch": 10.571725571725572, + "grad_norm": 0.5462619662284851, + "learning_rate": 4.803336876313666e-05, + "loss": 0.1127, + "num_input_tokens_seen": 3649648, + "step": 5085 + }, + { + "epoch": 10.582120582120583, + "grad_norm": 0.28906112909317017, + "learning_rate": 4.802955024458953e-05, + "loss": 0.1338, + "num_input_tokens_seen": 3653200, + "step": 5090 + }, + { + "epoch": 10.592515592515593, + "grad_norm": 0.8552227020263672, + "learning_rate": 4.802572817459634e-05, + "loss": 0.1733, + "num_input_tokens_seen": 3656880, + "step": 5095 + }, + { + "epoch": 10.602910602910603, + "grad_norm": 0.18689967691898346, + "learning_rate": 4.802190255374651e-05, + "loss": 0.1267, + "num_input_tokens_seen": 3660464, + "step": 5100 + }, + { + "epoch": 10.613305613305613, + "grad_norm": 0.2495257705450058, + "learning_rate": 4.801807338263e-05, + "loss": 0.1336, + "num_input_tokens_seen": 3664080, + "step": 5105 + }, + { + "epoch": 10.623700623700623, + "grad_norm": 0.5591150522232056, + "learning_rate": 4.8014240661837306e-05, + "loss": 0.1561, + "num_input_tokens_seen": 3667504, + "step": 5110 + }, + { + "epoch": 10.634095634095633, + "grad_norm": 0.4785183370113373, + "learning_rate": 4.80104043919595e-05, + "loss": 0.1476, + "num_input_tokens_seen": 3671088, + "step": 5115 + }, + { + "epoch": 10.644490644490645, + "grad_norm": 0.3092675507068634, + "learning_rate": 4.800656457358815e-05, + "loss": 0.1599, + "num_input_tokens_seen": 3674704, + "step": 5120 + }, + { + "epoch": 10.654885654885655, + "grad_norm": 0.20480097830295563, + "learning_rate": 4.800272120731544e-05, + "loss": 0.134, + "num_input_tokens_seen": 3678320, + "step": 5125 + }, + { + "epoch": 10.665280665280665, + "grad_norm": 0.33279576897621155, + "learning_rate": 4.799887429373404e-05, + "loss": 0.1358, + "num_input_tokens_seen": 3681808, + "step": 5130 + }, + { + "epoch": 10.675675675675675, + "grad_norm": 0.47730517387390137, + "learning_rate": 4.79950238334372e-05, + "loss": 0.1625, + "num_input_tokens_seen": 3685456, + "step": 5135 + }, + { + "epoch": 10.686070686070686, + "grad_norm": 0.26659172773361206, + "learning_rate": 4.799116982701872e-05, + "loss": 0.151, + "num_input_tokens_seen": 3689008, + "step": 5140 + }, + { + "epoch": 10.696465696465696, + "grad_norm": 0.12058316171169281, + "learning_rate": 4.7987312275072926e-05, + "loss": 0.1333, + "num_input_tokens_seen": 3692368, + "step": 5145 + }, + { + "epoch": 10.706860706860708, + "grad_norm": 0.23961010575294495, + "learning_rate": 4.79834511781947e-05, + "loss": 0.1, + "num_input_tokens_seen": 3696016, + "step": 5150 + }, + { + "epoch": 10.717255717255718, + "grad_norm": 0.23739805817604065, + "learning_rate": 4.797958653697947e-05, + "loss": 0.1393, + "num_input_tokens_seen": 3699792, + "step": 5155 + }, + { + "epoch": 10.727650727650728, + "grad_norm": 0.1995849758386612, + "learning_rate": 4.7975718352023225e-05, + "loss": 0.1227, + "num_input_tokens_seen": 3703312, + "step": 5160 + }, + { + "epoch": 10.738045738045738, + "grad_norm": 0.4405716061592102, + "learning_rate": 4.7971846623922476e-05, + "loss": 0.1464, + "num_input_tokens_seen": 3706864, + "step": 5165 + }, + { + "epoch": 10.748440748440748, + "grad_norm": 0.2761804759502411, + "learning_rate": 4.7967971353274294e-05, + "loss": 0.1307, + "num_input_tokens_seen": 3710416, + "step": 5170 + }, + { + "epoch": 10.758835758835758, + "grad_norm": 0.3562782108783722, + "learning_rate": 4.79640925406763e-05, + "loss": 0.1419, + "num_input_tokens_seen": 3714000, + "step": 5175 + }, + { + "epoch": 10.76923076923077, + "grad_norm": 0.5390232801437378, + "learning_rate": 4.796021018672664e-05, + "loss": 0.1752, + "num_input_tokens_seen": 3717456, + "step": 5180 + }, + { + "epoch": 10.77962577962578, + "grad_norm": 0.44922956824302673, + "learning_rate": 4.795632429202405e-05, + "loss": 0.1123, + "num_input_tokens_seen": 3721040, + "step": 5185 + }, + { + "epoch": 10.79002079002079, + "grad_norm": 0.25994256138801575, + "learning_rate": 4.795243485716775e-05, + "loss": 0.1631, + "num_input_tokens_seen": 3724528, + "step": 5190 + }, + { + "epoch": 10.8004158004158, + "grad_norm": 0.25348785519599915, + "learning_rate": 4.794854188275757e-05, + "loss": 0.1487, + "num_input_tokens_seen": 3728272, + "step": 5195 + }, + { + "epoch": 10.81081081081081, + "grad_norm": 0.28743988275527954, + "learning_rate": 4.794464536939384e-05, + "loss": 0.15, + "num_input_tokens_seen": 3731888, + "step": 5200 + }, + { + "epoch": 10.81081081081081, + "eval_loss": 0.15689212083816528, + "eval_runtime": 7.7635, + "eval_samples_per_second": 110.26, + "eval_steps_per_second": 27.565, + "num_input_tokens_seen": 3731888, + "step": 5200 + }, + { + "epoch": 10.82120582120582, + "grad_norm": 0.19263814389705658, + "learning_rate": 4.794074531767745e-05, + "loss": 0.1232, + "num_input_tokens_seen": 3735632, + "step": 5205 + }, + { + "epoch": 10.83160083160083, + "grad_norm": 0.31251654028892517, + "learning_rate": 4.7936841728209834e-05, + "loss": 0.1174, + "num_input_tokens_seen": 3739344, + "step": 5210 + }, + { + "epoch": 10.841995841995843, + "grad_norm": 0.21447040140628815, + "learning_rate": 4.7932934601593e-05, + "loss": 0.1461, + "num_input_tokens_seen": 3742960, + "step": 5215 + }, + { + "epoch": 10.852390852390853, + "grad_norm": 0.2845449149608612, + "learning_rate": 4.792902393842943e-05, + "loss": 0.15, + "num_input_tokens_seen": 3746480, + "step": 5220 + }, + { + "epoch": 10.862785862785863, + "grad_norm": 0.213857963681221, + "learning_rate": 4.792510973932225e-05, + "loss": 0.1464, + "num_input_tokens_seen": 3749968, + "step": 5225 + }, + { + "epoch": 10.873180873180873, + "grad_norm": 0.37974244356155396, + "learning_rate": 4.7921192004875036e-05, + "loss": 0.1916, + "num_input_tokens_seen": 3753552, + "step": 5230 + }, + { + "epoch": 10.883575883575883, + "grad_norm": 0.23992890119552612, + "learning_rate": 4.791727073569198e-05, + "loss": 0.1116, + "num_input_tokens_seen": 3756912, + "step": 5235 + }, + { + "epoch": 10.893970893970893, + "grad_norm": 0.16624899208545685, + "learning_rate": 4.7913345932377775e-05, + "loss": 0.1199, + "num_input_tokens_seen": 3760624, + "step": 5240 + }, + { + "epoch": 10.904365904365905, + "grad_norm": 0.2697955369949341, + "learning_rate": 4.790941759553769e-05, + "loss": 0.1289, + "num_input_tokens_seen": 3764304, + "step": 5245 + }, + { + "epoch": 10.914760914760915, + "grad_norm": 0.2570788264274597, + "learning_rate": 4.79054857257775e-05, + "loss": 0.1235, + "num_input_tokens_seen": 3767920, + "step": 5250 + }, + { + "epoch": 10.925155925155925, + "grad_norm": 0.35726502537727356, + "learning_rate": 4.790155032370357e-05, + "loss": 0.1184, + "num_input_tokens_seen": 3771504, + "step": 5255 + }, + { + "epoch": 10.935550935550935, + "grad_norm": 0.20899154245853424, + "learning_rate": 4.789761138992278e-05, + "loss": 0.154, + "num_input_tokens_seen": 3775216, + "step": 5260 + }, + { + "epoch": 10.945945945945946, + "grad_norm": 0.4877677261829376, + "learning_rate": 4.7893668925042565e-05, + "loss": 0.1545, + "num_input_tokens_seen": 3778800, + "step": 5265 + }, + { + "epoch": 10.956340956340956, + "grad_norm": 0.29594680666923523, + "learning_rate": 4.78897229296709e-05, + "loss": 0.111, + "num_input_tokens_seen": 3782320, + "step": 5270 + }, + { + "epoch": 10.966735966735968, + "grad_norm": 0.38610151410102844, + "learning_rate": 4.7885773404416315e-05, + "loss": 0.1761, + "num_input_tokens_seen": 3785936, + "step": 5275 + }, + { + "epoch": 10.977130977130978, + "grad_norm": 0.3024933636188507, + "learning_rate": 4.788182034988786e-05, + "loss": 0.1429, + "num_input_tokens_seen": 3789552, + "step": 5280 + }, + { + "epoch": 10.987525987525988, + "grad_norm": 0.28814566135406494, + "learning_rate": 4.787786376669516e-05, + "loss": 0.1073, + "num_input_tokens_seen": 3793040, + "step": 5285 + }, + { + "epoch": 10.997920997920998, + "grad_norm": 0.15158995985984802, + "learning_rate": 4.787390365544837e-05, + "loss": 0.1285, + "num_input_tokens_seen": 3796528, + "step": 5290 + }, + { + "epoch": 11.008316008316008, + "grad_norm": 0.3045738637447357, + "learning_rate": 4.786994001675818e-05, + "loss": 0.1268, + "num_input_tokens_seen": 3799912, + "step": 5295 + }, + { + "epoch": 11.018711018711018, + "grad_norm": 0.1794101744890213, + "learning_rate": 4.786597285123584e-05, + "loss": 0.1112, + "num_input_tokens_seen": 3803592, + "step": 5300 + }, + { + "epoch": 11.029106029106028, + "grad_norm": 0.47718268632888794, + "learning_rate": 4.7862002159493135e-05, + "loss": 0.1471, + "num_input_tokens_seen": 3807304, + "step": 5305 + }, + { + "epoch": 11.03950103950104, + "grad_norm": 0.34451472759246826, + "learning_rate": 4.785802794214239e-05, + "loss": 0.1349, + "num_input_tokens_seen": 3810792, + "step": 5310 + }, + { + "epoch": 11.04989604989605, + "grad_norm": 0.6450057625770569, + "learning_rate": 4.7854050199796495e-05, + "loss": 0.1462, + "num_input_tokens_seen": 3814312, + "step": 5315 + }, + { + "epoch": 11.06029106029106, + "grad_norm": 0.32756441831588745, + "learning_rate": 4.7850068933068845e-05, + "loss": 0.15, + "num_input_tokens_seen": 3817704, + "step": 5320 + }, + { + "epoch": 11.07068607068607, + "grad_norm": 0.4961933195590973, + "learning_rate": 4.7846084142573425e-05, + "loss": 0.1507, + "num_input_tokens_seen": 3821512, + "step": 5325 + }, + { + "epoch": 11.08108108108108, + "grad_norm": 0.20626065135002136, + "learning_rate": 4.7842095828924725e-05, + "loss": 0.115, + "num_input_tokens_seen": 3825032, + "step": 5330 + }, + { + "epoch": 11.09147609147609, + "grad_norm": 0.3087444603443146, + "learning_rate": 4.783810399273779e-05, + "loss": 0.1543, + "num_input_tokens_seen": 3828712, + "step": 5335 + }, + { + "epoch": 11.101871101871103, + "grad_norm": 0.2824331223964691, + "learning_rate": 4.7834108634628226e-05, + "loss": 0.0947, + "num_input_tokens_seen": 3832488, + "step": 5340 + }, + { + "epoch": 11.112266112266113, + "grad_norm": 0.2672958970069885, + "learning_rate": 4.783010975521216e-05, + "loss": 0.1167, + "num_input_tokens_seen": 3836168, + "step": 5345 + }, + { + "epoch": 11.122661122661123, + "grad_norm": 0.27010437846183777, + "learning_rate": 4.782610735510626e-05, + "loss": 0.1643, + "num_input_tokens_seen": 3839816, + "step": 5350 + }, + { + "epoch": 11.133056133056133, + "grad_norm": 0.2351756989955902, + "learning_rate": 4.782210143492776e-05, + "loss": 0.1498, + "num_input_tokens_seen": 3843432, + "step": 5355 + }, + { + "epoch": 11.143451143451143, + "grad_norm": 0.3706151843070984, + "learning_rate": 4.781809199529442e-05, + "loss": 0.1609, + "num_input_tokens_seen": 3847016, + "step": 5360 + }, + { + "epoch": 11.153846153846153, + "grad_norm": 0.16725996136665344, + "learning_rate": 4.781407903682454e-05, + "loss": 0.1602, + "num_input_tokens_seen": 3850696, + "step": 5365 + }, + { + "epoch": 11.164241164241163, + "grad_norm": 0.3062683343887329, + "learning_rate": 4.781006256013698e-05, + "loss": 0.1303, + "num_input_tokens_seen": 3854376, + "step": 5370 + }, + { + "epoch": 11.174636174636175, + "grad_norm": 0.21912817656993866, + "learning_rate": 4.7806042565851115e-05, + "loss": 0.156, + "num_input_tokens_seen": 3858024, + "step": 5375 + }, + { + "epoch": 11.185031185031185, + "grad_norm": 0.18791697919368744, + "learning_rate": 4.7802019054586895e-05, + "loss": 0.119, + "num_input_tokens_seen": 3861512, + "step": 5380 + }, + { + "epoch": 11.195426195426196, + "grad_norm": 0.22612091898918152, + "learning_rate": 4.779799202696479e-05, + "loss": 0.1486, + "num_input_tokens_seen": 3865064, + "step": 5385 + }, + { + "epoch": 11.205821205821206, + "grad_norm": 0.210261270403862, + "learning_rate": 4.779396148360581e-05, + "loss": 0.1193, + "num_input_tokens_seen": 3868712, + "step": 5390 + }, + { + "epoch": 11.216216216216216, + "grad_norm": 0.6055476069450378, + "learning_rate": 4.7789927425131517e-05, + "loss": 0.1488, + "num_input_tokens_seen": 3872360, + "step": 5395 + }, + { + "epoch": 11.226611226611226, + "grad_norm": 0.2309110015630722, + "learning_rate": 4.778588985216403e-05, + "loss": 0.1356, + "num_input_tokens_seen": 3876072, + "step": 5400 + }, + { + "epoch": 11.226611226611226, + "eval_loss": 0.15539436042308807, + "eval_runtime": 7.7746, + "eval_samples_per_second": 110.102, + "eval_steps_per_second": 27.526, + "num_input_tokens_seen": 3876072, + "step": 5400 + }, + { + "epoch": 11.237006237006238, + "grad_norm": 0.36559823155403137, + "learning_rate": 4.778184876532598e-05, + "loss": 0.127, + "num_input_tokens_seen": 3879464, + "step": 5405 + }, + { + "epoch": 11.247401247401248, + "grad_norm": 0.2958822250366211, + "learning_rate": 4.7777804165240556e-05, + "loss": 0.1442, + "num_input_tokens_seen": 3883272, + "step": 5410 + }, + { + "epoch": 11.257796257796258, + "grad_norm": 0.2005353718996048, + "learning_rate": 4.7773756052531485e-05, + "loss": 0.1358, + "num_input_tokens_seen": 3886728, + "step": 5415 + }, + { + "epoch": 11.268191268191268, + "grad_norm": 0.3047106862068176, + "learning_rate": 4.7769704427823035e-05, + "loss": 0.1384, + "num_input_tokens_seen": 3890280, + "step": 5420 + }, + { + "epoch": 11.278586278586278, + "grad_norm": 0.16788332164287567, + "learning_rate": 4.776564929174003e-05, + "loss": 0.127, + "num_input_tokens_seen": 3893928, + "step": 5425 + }, + { + "epoch": 11.288981288981288, + "grad_norm": 0.4122418463230133, + "learning_rate": 4.7761590644907806e-05, + "loss": 0.2065, + "num_input_tokens_seen": 3897576, + "step": 5430 + }, + { + "epoch": 11.299376299376299, + "grad_norm": 0.30766022205352783, + "learning_rate": 4.7757528487952263e-05, + "loss": 0.1341, + "num_input_tokens_seen": 3900968, + "step": 5435 + }, + { + "epoch": 11.30977130977131, + "grad_norm": 0.3541299104690552, + "learning_rate": 4.7753462821499836e-05, + "loss": 0.0944, + "num_input_tokens_seen": 3904584, + "step": 5440 + }, + { + "epoch": 11.32016632016632, + "grad_norm": 0.22977136075496674, + "learning_rate": 4.774939364617751e-05, + "loss": 0.1101, + "num_input_tokens_seen": 3908072, + "step": 5445 + }, + { + "epoch": 11.33056133056133, + "grad_norm": 0.4370339512825012, + "learning_rate": 4.7745320962612795e-05, + "loss": 0.1633, + "num_input_tokens_seen": 3911752, + "step": 5450 + }, + { + "epoch": 11.34095634095634, + "grad_norm": 1.0429551601409912, + "learning_rate": 4.7741244771433756e-05, + "loss": 0.1864, + "num_input_tokens_seen": 3915304, + "step": 5455 + }, + { + "epoch": 11.35135135135135, + "grad_norm": 0.23671595752239227, + "learning_rate": 4.7737165073268985e-05, + "loss": 0.1472, + "num_input_tokens_seen": 3918920, + "step": 5460 + }, + { + "epoch": 11.361746361746361, + "grad_norm": 0.16985315084457397, + "learning_rate": 4.7733081868747626e-05, + "loss": 0.1565, + "num_input_tokens_seen": 3922536, + "step": 5465 + }, + { + "epoch": 11.372141372141373, + "grad_norm": 0.23275388777256012, + "learning_rate": 4.772899515849936e-05, + "loss": 0.1133, + "num_input_tokens_seen": 3926248, + "step": 5470 + }, + { + "epoch": 11.382536382536383, + "grad_norm": 0.4798728823661804, + "learning_rate": 4.7724904943154414e-05, + "loss": 0.2146, + "num_input_tokens_seen": 3929928, + "step": 5475 + }, + { + "epoch": 11.392931392931393, + "grad_norm": 0.5179730653762817, + "learning_rate": 4.772081122334354e-05, + "loss": 0.1575, + "num_input_tokens_seen": 3933512, + "step": 5480 + }, + { + "epoch": 11.403326403326403, + "grad_norm": 0.31057387590408325, + "learning_rate": 4.771671399969806e-05, + "loss": 0.1176, + "num_input_tokens_seen": 3937096, + "step": 5485 + }, + { + "epoch": 11.413721413721413, + "grad_norm": 0.18224601447582245, + "learning_rate": 4.7712613272849794e-05, + "loss": 0.1424, + "num_input_tokens_seen": 3940648, + "step": 5490 + }, + { + "epoch": 11.424116424116423, + "grad_norm": 0.24161408841609955, + "learning_rate": 4.770850904343114e-05, + "loss": 0.1052, + "num_input_tokens_seen": 3944072, + "step": 5495 + }, + { + "epoch": 11.434511434511435, + "grad_norm": 0.21032166481018066, + "learning_rate": 4.770440131207502e-05, + "loss": 0.1516, + "num_input_tokens_seen": 3947784, + "step": 5500 + }, + { + "epoch": 11.444906444906445, + "grad_norm": 0.21216948330402374, + "learning_rate": 4.7700290079414896e-05, + "loss": 0.1574, + "num_input_tokens_seen": 3951304, + "step": 5505 + }, + { + "epoch": 11.455301455301456, + "grad_norm": 2.8125193119049072, + "learning_rate": 4.769617534608477e-05, + "loss": 0.2108, + "num_input_tokens_seen": 3954728, + "step": 5510 + }, + { + "epoch": 11.465696465696466, + "grad_norm": 0.2686039209365845, + "learning_rate": 4.7692057112719193e-05, + "loss": 0.1592, + "num_input_tokens_seen": 3958280, + "step": 5515 + }, + { + "epoch": 11.476091476091476, + "grad_norm": 0.22052708268165588, + "learning_rate": 4.7687935379953234e-05, + "loss": 0.1313, + "num_input_tokens_seen": 3961736, + "step": 5520 + }, + { + "epoch": 11.486486486486486, + "grad_norm": 0.23148119449615479, + "learning_rate": 4.7683810148422534e-05, + "loss": 0.1396, + "num_input_tokens_seen": 3965448, + "step": 5525 + }, + { + "epoch": 11.496881496881496, + "grad_norm": 0.20441922545433044, + "learning_rate": 4.767968141876324e-05, + "loss": 0.1229, + "num_input_tokens_seen": 3969096, + "step": 5530 + }, + { + "epoch": 11.507276507276508, + "grad_norm": 0.2761306166648865, + "learning_rate": 4.767554919161207e-05, + "loss": 0.1258, + "num_input_tokens_seen": 3972552, + "step": 5535 + }, + { + "epoch": 11.517671517671518, + "grad_norm": 0.2432236522436142, + "learning_rate": 4.767141346760624e-05, + "loss": 0.128, + "num_input_tokens_seen": 3976296, + "step": 5540 + }, + { + "epoch": 11.528066528066528, + "grad_norm": 0.23983845114707947, + "learning_rate": 4.766727424738356e-05, + "loss": 0.111, + "num_input_tokens_seen": 3979912, + "step": 5545 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 1.001966953277588, + "learning_rate": 4.7663131531582325e-05, + "loss": 0.199, + "num_input_tokens_seen": 3983528, + "step": 5550 + }, + { + "epoch": 11.548856548856548, + "grad_norm": 0.23959878087043762, + "learning_rate": 4.765898532084142e-05, + "loss": 0.1179, + "num_input_tokens_seen": 3987400, + "step": 5555 + }, + { + "epoch": 11.559251559251559, + "grad_norm": 0.27654382586479187, + "learning_rate": 4.765483561580022e-05, + "loss": 0.155, + "num_input_tokens_seen": 3990920, + "step": 5560 + }, + { + "epoch": 11.56964656964657, + "grad_norm": 0.293974906206131, + "learning_rate": 4.7650682417098666e-05, + "loss": 0.1479, + "num_input_tokens_seen": 3994504, + "step": 5565 + }, + { + "epoch": 11.58004158004158, + "grad_norm": 0.3372308015823364, + "learning_rate": 4.7646525725377244e-05, + "loss": 0.1515, + "num_input_tokens_seen": 3998216, + "step": 5570 + }, + { + "epoch": 11.59043659043659, + "grad_norm": 0.18945609033107758, + "learning_rate": 4.764236554127696e-05, + "loss": 0.1155, + "num_input_tokens_seen": 4001768, + "step": 5575 + }, + { + "epoch": 11.6008316008316, + "grad_norm": 0.21002203226089478, + "learning_rate": 4.7638201865439356e-05, + "loss": 0.1325, + "num_input_tokens_seen": 4005352, + "step": 5580 + }, + { + "epoch": 11.611226611226611, + "grad_norm": 0.19088757038116455, + "learning_rate": 4.7634034698506545e-05, + "loss": 0.126, + "num_input_tokens_seen": 4008968, + "step": 5585 + }, + { + "epoch": 11.621621621621621, + "grad_norm": 0.3403811752796173, + "learning_rate": 4.762986404112115e-05, + "loss": 0.1862, + "num_input_tokens_seen": 4012648, + "step": 5590 + }, + { + "epoch": 11.632016632016633, + "grad_norm": 0.26428118348121643, + "learning_rate": 4.762568989392633e-05, + "loss": 0.1079, + "num_input_tokens_seen": 4016392, + "step": 5595 + }, + { + "epoch": 11.642411642411643, + "grad_norm": 0.39011770486831665, + "learning_rate": 4.76215122575658e-05, + "loss": 0.2153, + "num_input_tokens_seen": 4020200, + "step": 5600 + }, + { + "epoch": 11.642411642411643, + "eval_loss": 0.15655632317066193, + "eval_runtime": 7.7968, + "eval_samples_per_second": 109.788, + "eval_steps_per_second": 27.447, + "num_input_tokens_seen": 4020200, + "step": 5600 + }, + { + "epoch": 11.652806652806653, + "grad_norm": 0.34753745794296265, + "learning_rate": 4.7617331132683795e-05, + "loss": 0.1158, + "num_input_tokens_seen": 4023848, + "step": 5605 + }, + { + "epoch": 11.663201663201663, + "grad_norm": 0.22542709112167358, + "learning_rate": 4.7613146519925105e-05, + "loss": 0.116, + "num_input_tokens_seen": 4027400, + "step": 5610 + }, + { + "epoch": 11.673596673596673, + "grad_norm": 0.1969311684370041, + "learning_rate": 4.7608958419935045e-05, + "loss": 0.1538, + "num_input_tokens_seen": 4030984, + "step": 5615 + }, + { + "epoch": 11.683991683991684, + "grad_norm": 0.1631045639514923, + "learning_rate": 4.760476683335948e-05, + "loss": 0.1284, + "num_input_tokens_seen": 4034760, + "step": 5620 + }, + { + "epoch": 11.694386694386694, + "grad_norm": 0.46929246187210083, + "learning_rate": 4.760057176084479e-05, + "loss": 0.1278, + "num_input_tokens_seen": 4038280, + "step": 5625 + }, + { + "epoch": 11.704781704781706, + "grad_norm": 0.2803540527820587, + "learning_rate": 4.759637320303793e-05, + "loss": 0.1185, + "num_input_tokens_seen": 4041832, + "step": 5630 + }, + { + "epoch": 11.715176715176716, + "grad_norm": 0.2240537703037262, + "learning_rate": 4.759217116058635e-05, + "loss": 0.1463, + "num_input_tokens_seen": 4045320, + "step": 5635 + }, + { + "epoch": 11.725571725571726, + "grad_norm": 0.4522040784358978, + "learning_rate": 4.758796563413807e-05, + "loss": 0.1065, + "num_input_tokens_seen": 4048904, + "step": 5640 + }, + { + "epoch": 11.735966735966736, + "grad_norm": 0.5009204149246216, + "learning_rate": 4.758375662434163e-05, + "loss": 0.1462, + "num_input_tokens_seen": 4052392, + "step": 5645 + }, + { + "epoch": 11.746361746361746, + "grad_norm": 0.5508689880371094, + "learning_rate": 4.7579544131846114e-05, + "loss": 0.1096, + "num_input_tokens_seen": 4055880, + "step": 5650 + }, + { + "epoch": 11.756756756756756, + "grad_norm": 0.6823063492774963, + "learning_rate": 4.757532815730114e-05, + "loss": 0.1411, + "num_input_tokens_seen": 4059368, + "step": 5655 + }, + { + "epoch": 11.767151767151766, + "grad_norm": 0.21262837946414948, + "learning_rate": 4.7571108701356865e-05, + "loss": 0.1217, + "num_input_tokens_seen": 4062952, + "step": 5660 + }, + { + "epoch": 11.777546777546778, + "grad_norm": 0.24894167482852936, + "learning_rate": 4.756688576466398e-05, + "loss": 0.1335, + "num_input_tokens_seen": 4066536, + "step": 5665 + }, + { + "epoch": 11.787941787941788, + "grad_norm": 0.17928537726402283, + "learning_rate": 4.756265934787372e-05, + "loss": 0.1398, + "num_input_tokens_seen": 4070312, + "step": 5670 + }, + { + "epoch": 11.798336798336798, + "grad_norm": 0.184279665350914, + "learning_rate": 4.755842945163785e-05, + "loss": 0.116, + "num_input_tokens_seen": 4073832, + "step": 5675 + }, + { + "epoch": 11.808731808731808, + "grad_norm": 0.1371917873620987, + "learning_rate": 4.755419607660867e-05, + "loss": 0.1477, + "num_input_tokens_seen": 4077416, + "step": 5680 + }, + { + "epoch": 11.819126819126819, + "grad_norm": 0.26036128401756287, + "learning_rate": 4.7549959223439016e-05, + "loss": 0.1477, + "num_input_tokens_seen": 4081032, + "step": 5685 + }, + { + "epoch": 11.829521829521829, + "grad_norm": 0.3592894375324249, + "learning_rate": 4.754571889278228e-05, + "loss": 0.1438, + "num_input_tokens_seen": 4084424, + "step": 5690 + }, + { + "epoch": 11.83991683991684, + "grad_norm": 0.1753467470407486, + "learning_rate": 4.754147508529235e-05, + "loss": 0.1017, + "num_input_tokens_seen": 4087976, + "step": 5695 + }, + { + "epoch": 11.85031185031185, + "grad_norm": 0.38942602276802063, + "learning_rate": 4.75372278016237e-05, + "loss": 0.1134, + "num_input_tokens_seen": 4091432, + "step": 5700 + }, + { + "epoch": 11.86070686070686, + "grad_norm": 0.21930965781211853, + "learning_rate": 4.753297704243129e-05, + "loss": 0.1455, + "num_input_tokens_seen": 4095016, + "step": 5705 + }, + { + "epoch": 11.871101871101871, + "grad_norm": 0.39353635907173157, + "learning_rate": 4.752872280837066e-05, + "loss": 0.1193, + "num_input_tokens_seen": 4098632, + "step": 5710 + }, + { + "epoch": 11.881496881496881, + "grad_norm": 0.3913963735103607, + "learning_rate": 4.752446510009786e-05, + "loss": 0.1361, + "num_input_tokens_seen": 4102248, + "step": 5715 + }, + { + "epoch": 11.891891891891891, + "grad_norm": 0.22873061895370483, + "learning_rate": 4.7520203918269476e-05, + "loss": 0.1383, + "num_input_tokens_seen": 4105800, + "step": 5720 + }, + { + "epoch": 11.902286902286903, + "grad_norm": 0.20138147473335266, + "learning_rate": 4.751593926354265e-05, + "loss": 0.1658, + "num_input_tokens_seen": 4109352, + "step": 5725 + }, + { + "epoch": 11.912681912681913, + "grad_norm": 0.4566987156867981, + "learning_rate": 4.751167113657503e-05, + "loss": 0.0949, + "num_input_tokens_seen": 4112872, + "step": 5730 + }, + { + "epoch": 11.923076923076923, + "grad_norm": 0.1846582144498825, + "learning_rate": 4.7507399538024834e-05, + "loss": 0.1136, + "num_input_tokens_seen": 4116456, + "step": 5735 + }, + { + "epoch": 11.933471933471933, + "grad_norm": 0.40191638469696045, + "learning_rate": 4.750312446855077e-05, + "loss": 0.132, + "num_input_tokens_seen": 4120104, + "step": 5740 + }, + { + "epoch": 11.943866943866944, + "grad_norm": 0.358822226524353, + "learning_rate": 4.749884592881212e-05, + "loss": 0.1459, + "num_input_tokens_seen": 4123528, + "step": 5745 + }, + { + "epoch": 11.954261954261954, + "grad_norm": 0.1535165160894394, + "learning_rate": 4.74945639194687e-05, + "loss": 0.1465, + "num_input_tokens_seen": 4127144, + "step": 5750 + }, + { + "epoch": 11.964656964656964, + "grad_norm": 0.5015078186988831, + "learning_rate": 4.749027844118083e-05, + "loss": 0.1341, + "num_input_tokens_seen": 4130568, + "step": 5755 + }, + { + "epoch": 11.975051975051976, + "grad_norm": 0.18543292582035065, + "learning_rate": 4.7485989494609395e-05, + "loss": 0.1226, + "num_input_tokens_seen": 4134088, + "step": 5760 + }, + { + "epoch": 11.985446985446986, + "grad_norm": 0.13550105690956116, + "learning_rate": 4.748169708041581e-05, + "loss": 0.1348, + "num_input_tokens_seen": 4137608, + "step": 5765 + }, + { + "epoch": 11.995841995841996, + "grad_norm": 0.3585164546966553, + "learning_rate": 4.7477401199262004e-05, + "loss": 0.1114, + "num_input_tokens_seen": 4141192, + "step": 5770 + }, + { + "epoch": 12.006237006237006, + "grad_norm": 0.5285150408744812, + "learning_rate": 4.747310185181048e-05, + "loss": 0.1231, + "num_input_tokens_seen": 4144768, + "step": 5775 + }, + { + "epoch": 12.016632016632016, + "grad_norm": 0.2875455617904663, + "learning_rate": 4.746879903872422e-05, + "loss": 0.1249, + "num_input_tokens_seen": 4148544, + "step": 5780 + }, + { + "epoch": 12.027027027027026, + "grad_norm": 0.17172716557979584, + "learning_rate": 4.746449276066679e-05, + "loss": 0.1718, + "num_input_tokens_seen": 4152096, + "step": 5785 + }, + { + "epoch": 12.037422037422038, + "grad_norm": 0.3361891508102417, + "learning_rate": 4.746018301830227e-05, + "loss": 0.1552, + "num_input_tokens_seen": 4155680, + "step": 5790 + }, + { + "epoch": 12.047817047817048, + "grad_norm": 0.4423771798610687, + "learning_rate": 4.7455869812295275e-05, + "loss": 0.1474, + "num_input_tokens_seen": 4159264, + "step": 5795 + }, + { + "epoch": 12.058212058212058, + "grad_norm": 0.4050516188144684, + "learning_rate": 4.7451553143310964e-05, + "loss": 0.1705, + "num_input_tokens_seen": 4162880, + "step": 5800 + }, + { + "epoch": 12.058212058212058, + "eval_loss": 0.15653303265571594, + "eval_runtime": 7.7631, + "eval_samples_per_second": 110.266, + "eval_steps_per_second": 27.566, + "num_input_tokens_seen": 4162880, + "step": 5800 + }, + { + "epoch": 12.068607068607069, + "grad_norm": 0.5435624122619629, + "learning_rate": 4.744723301201501e-05, + "loss": 0.1491, + "num_input_tokens_seen": 4166400, + "step": 5805 + }, + { + "epoch": 12.079002079002079, + "grad_norm": 0.2011677473783493, + "learning_rate": 4.744290941907364e-05, + "loss": 0.1461, + "num_input_tokens_seen": 4170048, + "step": 5810 + }, + { + "epoch": 12.089397089397089, + "grad_norm": 0.21525128185749054, + "learning_rate": 4.7438582365153594e-05, + "loss": 0.1569, + "num_input_tokens_seen": 4173728, + "step": 5815 + }, + { + "epoch": 12.0997920997921, + "grad_norm": 0.24194687604904175, + "learning_rate": 4.743425185092217e-05, + "loss": 0.1623, + "num_input_tokens_seen": 4177184, + "step": 5820 + }, + { + "epoch": 12.11018711018711, + "grad_norm": 0.4856181740760803, + "learning_rate": 4.742991787704719e-05, + "loss": 0.1161, + "num_input_tokens_seen": 4180672, + "step": 5825 + }, + { + "epoch": 12.120582120582121, + "grad_norm": 0.2691323459148407, + "learning_rate": 4.7425580444196994e-05, + "loss": 0.1417, + "num_input_tokens_seen": 4184256, + "step": 5830 + }, + { + "epoch": 12.130977130977131, + "grad_norm": 0.3246041536331177, + "learning_rate": 4.742123955304048e-05, + "loss": 0.1719, + "num_input_tokens_seen": 4187840, + "step": 5835 + }, + { + "epoch": 12.141372141372141, + "grad_norm": 0.2996188700199127, + "learning_rate": 4.741689520424706e-05, + "loss": 0.1559, + "num_input_tokens_seen": 4191456, + "step": 5840 + }, + { + "epoch": 12.151767151767151, + "grad_norm": 0.2108037769794464, + "learning_rate": 4.741254739848669e-05, + "loss": 0.1611, + "num_input_tokens_seen": 4195008, + "step": 5845 + }, + { + "epoch": 12.162162162162161, + "grad_norm": 0.20963265001773834, + "learning_rate": 4.740819613642987e-05, + "loss": 0.161, + "num_input_tokens_seen": 4198560, + "step": 5850 + }, + { + "epoch": 12.172557172557173, + "grad_norm": 0.34317460656166077, + "learning_rate": 4.74038414187476e-05, + "loss": 0.1698, + "num_input_tokens_seen": 4202304, + "step": 5855 + }, + { + "epoch": 12.182952182952183, + "grad_norm": 0.23224258422851562, + "learning_rate": 4.739948324611144e-05, + "loss": 0.1452, + "num_input_tokens_seen": 4205920, + "step": 5860 + }, + { + "epoch": 12.193347193347194, + "grad_norm": 0.23998792469501495, + "learning_rate": 4.7395121619193465e-05, + "loss": 0.1613, + "num_input_tokens_seen": 4209632, + "step": 5865 + }, + { + "epoch": 12.203742203742204, + "grad_norm": 0.173082172870636, + "learning_rate": 4.7390756538666313e-05, + "loss": 0.1227, + "num_input_tokens_seen": 4213216, + "step": 5870 + }, + { + "epoch": 12.214137214137214, + "grad_norm": 0.24338971078395844, + "learning_rate": 4.738638800520311e-05, + "loss": 0.1583, + "num_input_tokens_seen": 4216768, + "step": 5875 + }, + { + "epoch": 12.224532224532224, + "grad_norm": 0.35901591181755066, + "learning_rate": 4.738201601947757e-05, + "loss": 0.1659, + "num_input_tokens_seen": 4220416, + "step": 5880 + }, + { + "epoch": 12.234927234927236, + "grad_norm": 0.49406135082244873, + "learning_rate": 4.7377640582163876e-05, + "loss": 0.1292, + "num_input_tokens_seen": 4223872, + "step": 5885 + }, + { + "epoch": 12.245322245322246, + "grad_norm": 0.27330684661865234, + "learning_rate": 4.7373261693936786e-05, + "loss": 0.1439, + "num_input_tokens_seen": 4227520, + "step": 5890 + }, + { + "epoch": 12.255717255717256, + "grad_norm": 0.20275837182998657, + "learning_rate": 4.7368879355471595e-05, + "loss": 0.1196, + "num_input_tokens_seen": 4230976, + "step": 5895 + }, + { + "epoch": 12.266112266112266, + "grad_norm": 0.11405466496944427, + "learning_rate": 4.736449356744409e-05, + "loss": 0.1097, + "num_input_tokens_seen": 4234400, + "step": 5900 + }, + { + "epoch": 12.276507276507276, + "grad_norm": 0.17346976697444916, + "learning_rate": 4.736010433053064e-05, + "loss": 0.133, + "num_input_tokens_seen": 4237920, + "step": 5905 + }, + { + "epoch": 12.286902286902286, + "grad_norm": 0.27947336435317993, + "learning_rate": 4.73557116454081e-05, + "loss": 0.118, + "num_input_tokens_seen": 4241472, + "step": 5910 + }, + { + "epoch": 12.297297297297296, + "grad_norm": 0.6705723404884338, + "learning_rate": 4.735131551275389e-05, + "loss": 0.137, + "num_input_tokens_seen": 4245120, + "step": 5915 + }, + { + "epoch": 12.307692307692308, + "grad_norm": 0.4312475025653839, + "learning_rate": 4.734691593324594e-05, + "loss": 0.124, + "num_input_tokens_seen": 4248544, + "step": 5920 + }, + { + "epoch": 12.318087318087318, + "grad_norm": 0.1711849421262741, + "learning_rate": 4.734251290756272e-05, + "loss": 0.1317, + "num_input_tokens_seen": 4252064, + "step": 5925 + }, + { + "epoch": 12.328482328482329, + "grad_norm": 0.32620930671691895, + "learning_rate": 4.7338106436383246e-05, + "loss": 0.1136, + "num_input_tokens_seen": 4255616, + "step": 5930 + }, + { + "epoch": 12.338877338877339, + "grad_norm": 0.5365301966667175, + "learning_rate": 4.733369652038703e-05, + "loss": 0.1412, + "num_input_tokens_seen": 4259232, + "step": 5935 + }, + { + "epoch": 12.349272349272349, + "grad_norm": 0.20049403607845306, + "learning_rate": 4.7329283160254156e-05, + "loss": 0.1435, + "num_input_tokens_seen": 4262848, + "step": 5940 + }, + { + "epoch": 12.359667359667359, + "grad_norm": 0.553752601146698, + "learning_rate": 4.732486635666521e-05, + "loss": 0.1117, + "num_input_tokens_seen": 4266464, + "step": 5945 + }, + { + "epoch": 12.37006237006237, + "grad_norm": 0.35016995668411255, + "learning_rate": 4.732044611030132e-05, + "loss": 0.1084, + "num_input_tokens_seen": 4269984, + "step": 5950 + }, + { + "epoch": 12.380457380457381, + "grad_norm": 0.1441822648048401, + "learning_rate": 4.731602242184414e-05, + "loss": 0.1699, + "num_input_tokens_seen": 4273600, + "step": 5955 + }, + { + "epoch": 12.390852390852391, + "grad_norm": 0.25327375531196594, + "learning_rate": 4.7311595291975864e-05, + "loss": 0.1219, + "num_input_tokens_seen": 4277344, + "step": 5960 + }, + { + "epoch": 12.401247401247401, + "grad_norm": 0.2755127251148224, + "learning_rate": 4.7307164721379216e-05, + "loss": 0.1522, + "num_input_tokens_seen": 4281024, + "step": 5965 + }, + { + "epoch": 12.411642411642411, + "grad_norm": 0.1971409171819687, + "learning_rate": 4.730273071073743e-05, + "loss": 0.1666, + "num_input_tokens_seen": 4284512, + "step": 5970 + }, + { + "epoch": 12.422037422037421, + "grad_norm": 0.3229224383831024, + "learning_rate": 4.729829326073429e-05, + "loss": 0.1591, + "num_input_tokens_seen": 4288032, + "step": 5975 + }, + { + "epoch": 12.432432432432432, + "grad_norm": 0.4476662874221802, + "learning_rate": 4.7293852372054126e-05, + "loss": 0.121, + "num_input_tokens_seen": 4291488, + "step": 5980 + }, + { + "epoch": 12.442827442827443, + "grad_norm": 0.3350240886211395, + "learning_rate": 4.728940804538176e-05, + "loss": 0.1243, + "num_input_tokens_seen": 4295040, + "step": 5985 + }, + { + "epoch": 12.453222453222454, + "grad_norm": 0.17650891840457916, + "learning_rate": 4.7284960281402556e-05, + "loss": 0.1543, + "num_input_tokens_seen": 4298528, + "step": 5990 + }, + { + "epoch": 12.463617463617464, + "grad_norm": 0.3530198633670807, + "learning_rate": 4.728050908080244e-05, + "loss": 0.1154, + "num_input_tokens_seen": 4301952, + "step": 5995 + }, + { + "epoch": 12.474012474012474, + "grad_norm": 0.13698308169841766, + "learning_rate": 4.727605444426782e-05, + "loss": 0.1616, + "num_input_tokens_seen": 4305664, + "step": 6000 + }, + { + "epoch": 12.474012474012474, + "eval_loss": 0.15231779217720032, + "eval_runtime": 7.7708, + "eval_samples_per_second": 110.156, + "eval_steps_per_second": 27.539, + "num_input_tokens_seen": 4305664, + "step": 6000 + }, + { + "epoch": 12.484407484407484, + "grad_norm": 0.38997986912727356, + "learning_rate": 4.727159637248567e-05, + "loss": 0.1817, + "num_input_tokens_seen": 4309184, + "step": 6005 + }, + { + "epoch": 12.494802494802494, + "grad_norm": 0.2995315194129944, + "learning_rate": 4.7267134866143474e-05, + "loss": 0.1444, + "num_input_tokens_seen": 4312896, + "step": 6010 + }, + { + "epoch": 12.505197505197506, + "grad_norm": 0.20229366421699524, + "learning_rate": 4.726266992592926e-05, + "loss": 0.0783, + "num_input_tokens_seen": 4316352, + "step": 6015 + }, + { + "epoch": 12.515592515592516, + "grad_norm": 0.16632437705993652, + "learning_rate": 4.725820155253157e-05, + "loss": 0.1198, + "num_input_tokens_seen": 4320032, + "step": 6020 + }, + { + "epoch": 12.525987525987526, + "grad_norm": 0.27540573477745056, + "learning_rate": 4.725372974663948e-05, + "loss": 0.1478, + "num_input_tokens_seen": 4323520, + "step": 6025 + }, + { + "epoch": 12.536382536382536, + "grad_norm": 0.7965169548988342, + "learning_rate": 4.724925450894262e-05, + "loss": 0.149, + "num_input_tokens_seen": 4327072, + "step": 6030 + }, + { + "epoch": 12.546777546777546, + "grad_norm": 0.21804364025592804, + "learning_rate": 4.72447758401311e-05, + "loss": 0.1403, + "num_input_tokens_seen": 4330656, + "step": 6035 + }, + { + "epoch": 12.557172557172557, + "grad_norm": 0.23719611763954163, + "learning_rate": 4.7240293740895616e-05, + "loss": 0.1145, + "num_input_tokens_seen": 4334208, + "step": 6040 + }, + { + "epoch": 12.567567567567568, + "grad_norm": 0.24101251363754272, + "learning_rate": 4.723580821192733e-05, + "loss": 0.1118, + "num_input_tokens_seen": 4337760, + "step": 6045 + }, + { + "epoch": 12.577962577962579, + "grad_norm": 0.2534365952014923, + "learning_rate": 4.7231319253917996e-05, + "loss": 0.1619, + "num_input_tokens_seen": 4341440, + "step": 6050 + }, + { + "epoch": 12.588357588357589, + "grad_norm": 0.200970858335495, + "learning_rate": 4.722682686755986e-05, + "loss": 0.1181, + "num_input_tokens_seen": 4344928, + "step": 6055 + }, + { + "epoch": 12.598752598752599, + "grad_norm": 0.45905494689941406, + "learning_rate": 4.722233105354569e-05, + "loss": 0.1352, + "num_input_tokens_seen": 4348672, + "step": 6060 + }, + { + "epoch": 12.609147609147609, + "grad_norm": 0.16777797043323517, + "learning_rate": 4.7217831812568815e-05, + "loss": 0.1874, + "num_input_tokens_seen": 4352288, + "step": 6065 + }, + { + "epoch": 12.619542619542619, + "grad_norm": 0.40942034125328064, + "learning_rate": 4.721332914532307e-05, + "loss": 0.1913, + "num_input_tokens_seen": 4356064, + "step": 6070 + }, + { + "epoch": 12.62993762993763, + "grad_norm": 0.2274274379014969, + "learning_rate": 4.720882305250281e-05, + "loss": 0.1162, + "num_input_tokens_seen": 4359456, + "step": 6075 + }, + { + "epoch": 12.640332640332641, + "grad_norm": 0.26312193274497986, + "learning_rate": 4.720431353480295e-05, + "loss": 0.0869, + "num_input_tokens_seen": 4363040, + "step": 6080 + }, + { + "epoch": 12.650727650727651, + "grad_norm": 0.13952893018722534, + "learning_rate": 4.719980059291891e-05, + "loss": 0.1117, + "num_input_tokens_seen": 4366560, + "step": 6085 + }, + { + "epoch": 12.661122661122661, + "grad_norm": 0.41529375314712524, + "learning_rate": 4.7195284227546634e-05, + "loss": 0.1449, + "num_input_tokens_seen": 4370304, + "step": 6090 + }, + { + "epoch": 12.671517671517671, + "grad_norm": 0.2034720778465271, + "learning_rate": 4.7190764439382604e-05, + "loss": 0.138, + "num_input_tokens_seen": 4373952, + "step": 6095 + }, + { + "epoch": 12.681912681912682, + "grad_norm": 0.3635331094264984, + "learning_rate": 4.7186241229123826e-05, + "loss": 0.1132, + "num_input_tokens_seen": 4377472, + "step": 6100 + }, + { + "epoch": 12.692307692307692, + "grad_norm": 0.7172232270240784, + "learning_rate": 4.718171459746785e-05, + "loss": 0.1267, + "num_input_tokens_seen": 4380928, + "step": 6105 + }, + { + "epoch": 12.702702702702704, + "grad_norm": 0.2846471071243286, + "learning_rate": 4.717718454511273e-05, + "loss": 0.0802, + "num_input_tokens_seen": 4384544, + "step": 6110 + }, + { + "epoch": 12.713097713097714, + "grad_norm": 0.8108124732971191, + "learning_rate": 4.7172651072757056e-05, + "loss": 0.1231, + "num_input_tokens_seen": 4388160, + "step": 6115 + }, + { + "epoch": 12.723492723492724, + "grad_norm": 0.22902259230613708, + "learning_rate": 4.7168114181099945e-05, + "loss": 0.1237, + "num_input_tokens_seen": 4391744, + "step": 6120 + }, + { + "epoch": 12.733887733887734, + "grad_norm": 0.31569352746009827, + "learning_rate": 4.716357387084105e-05, + "loss": 0.1073, + "num_input_tokens_seen": 4395328, + "step": 6125 + }, + { + "epoch": 12.744282744282744, + "grad_norm": 0.42664429545402527, + "learning_rate": 4.715903014268054e-05, + "loss": 0.1252, + "num_input_tokens_seen": 4398816, + "step": 6130 + }, + { + "epoch": 12.754677754677754, + "grad_norm": 0.12942588329315186, + "learning_rate": 4.715448299731911e-05, + "loss": 0.1106, + "num_input_tokens_seen": 4402368, + "step": 6135 + }, + { + "epoch": 12.765072765072766, + "grad_norm": 0.2405804842710495, + "learning_rate": 4.7149932435457986e-05, + "loss": 0.1267, + "num_input_tokens_seen": 4405952, + "step": 6140 + }, + { + "epoch": 12.775467775467776, + "grad_norm": 0.2119690626859665, + "learning_rate": 4.714537845779894e-05, + "loss": 0.0966, + "num_input_tokens_seen": 4409600, + "step": 6145 + }, + { + "epoch": 12.785862785862786, + "grad_norm": 0.39923128485679626, + "learning_rate": 4.714082106504423e-05, + "loss": 0.1529, + "num_input_tokens_seen": 4413216, + "step": 6150 + }, + { + "epoch": 12.796257796257796, + "grad_norm": 0.3444095551967621, + "learning_rate": 4.713626025789667e-05, + "loss": 0.1143, + "num_input_tokens_seen": 4416704, + "step": 6155 + }, + { + "epoch": 12.806652806652806, + "grad_norm": 0.6027960777282715, + "learning_rate": 4.7131696037059606e-05, + "loss": 0.1528, + "num_input_tokens_seen": 4420224, + "step": 6160 + }, + { + "epoch": 12.817047817047817, + "grad_norm": 0.3308596611022949, + "learning_rate": 4.712712840323689e-05, + "loss": 0.116, + "num_input_tokens_seen": 4423744, + "step": 6165 + }, + { + "epoch": 12.827442827442827, + "grad_norm": 0.17304261028766632, + "learning_rate": 4.71225573571329e-05, + "loss": 0.1263, + "num_input_tokens_seen": 4427552, + "step": 6170 + }, + { + "epoch": 12.837837837837839, + "grad_norm": 0.49257418513298035, + "learning_rate": 4.711798289945256e-05, + "loss": 0.1478, + "num_input_tokens_seen": 4431232, + "step": 6175 + }, + { + "epoch": 12.848232848232849, + "grad_norm": 0.26792001724243164, + "learning_rate": 4.71134050309013e-05, + "loss": 0.1418, + "num_input_tokens_seen": 4434880, + "step": 6180 + }, + { + "epoch": 12.858627858627859, + "grad_norm": 0.1546504646539688, + "learning_rate": 4.710882375218509e-05, + "loss": 0.081, + "num_input_tokens_seen": 4438656, + "step": 6185 + }, + { + "epoch": 12.869022869022869, + "grad_norm": 0.38383108377456665, + "learning_rate": 4.7104239064010424e-05, + "loss": 0.1266, + "num_input_tokens_seen": 4442304, + "step": 6190 + }, + { + "epoch": 12.879417879417879, + "grad_norm": 0.5112090110778809, + "learning_rate": 4.709965096708432e-05, + "loss": 0.1557, + "num_input_tokens_seen": 4445856, + "step": 6195 + }, + { + "epoch": 12.88981288981289, + "grad_norm": 0.1764337569475174, + "learning_rate": 4.709505946211431e-05, + "loss": 0.0836, + "num_input_tokens_seen": 4449504, + "step": 6200 + }, + { + "epoch": 12.88981288981289, + "eval_loss": 0.1556951403617859, + "eval_runtime": 7.7628, + "eval_samples_per_second": 110.269, + "eval_steps_per_second": 27.567, + "num_input_tokens_seen": 4449504, + "step": 6200 + }, + { + "epoch": 12.9002079002079, + "grad_norm": 0.301156222820282, + "learning_rate": 4.709046454980846e-05, + "loss": 0.1257, + "num_input_tokens_seen": 4453088, + "step": 6205 + }, + { + "epoch": 12.910602910602911, + "grad_norm": 0.35898593068122864, + "learning_rate": 4.708586623087538e-05, + "loss": 0.1395, + "num_input_tokens_seen": 4456704, + "step": 6210 + }, + { + "epoch": 12.920997920997921, + "grad_norm": 0.25016260147094727, + "learning_rate": 4.708126450602418e-05, + "loss": 0.1943, + "num_input_tokens_seen": 4460352, + "step": 6215 + }, + { + "epoch": 12.931392931392931, + "grad_norm": 0.3658762276172638, + "learning_rate": 4.7076659375964495e-05, + "loss": 0.189, + "num_input_tokens_seen": 4464096, + "step": 6220 + }, + { + "epoch": 12.941787941787942, + "grad_norm": 0.27577346563339233, + "learning_rate": 4.707205084140651e-05, + "loss": 0.1033, + "num_input_tokens_seen": 4467648, + "step": 6225 + }, + { + "epoch": 12.952182952182952, + "grad_norm": 0.31982535123825073, + "learning_rate": 4.7067438903060904e-05, + "loss": 0.1493, + "num_input_tokens_seen": 4471136, + "step": 6230 + }, + { + "epoch": 12.962577962577962, + "grad_norm": 0.33339327573776245, + "learning_rate": 4.70628235616389e-05, + "loss": 0.1056, + "num_input_tokens_seen": 4474720, + "step": 6235 + }, + { + "epoch": 12.972972972972974, + "grad_norm": 0.24781577289104462, + "learning_rate": 4.7058204817852256e-05, + "loss": 0.1153, + "num_input_tokens_seen": 4478144, + "step": 6240 + }, + { + "epoch": 12.983367983367984, + "grad_norm": 0.22278323769569397, + "learning_rate": 4.705358267241322e-05, + "loss": 0.201, + "num_input_tokens_seen": 4481856, + "step": 6245 + }, + { + "epoch": 12.993762993762994, + "grad_norm": 0.18601199984550476, + "learning_rate": 4.704895712603459e-05, + "loss": 0.1618, + "num_input_tokens_seen": 4485536, + "step": 6250 + }, + { + "epoch": 13.004158004158004, + "grad_norm": 0.16532184183597565, + "learning_rate": 4.704432817942969e-05, + "loss": 0.1227, + "num_input_tokens_seen": 4489272, + "step": 6255 + }, + { + "epoch": 13.014553014553014, + "grad_norm": 0.6289628148078918, + "learning_rate": 4.703969583331236e-05, + "loss": 0.1757, + "num_input_tokens_seen": 4492792, + "step": 6260 + }, + { + "epoch": 13.024948024948024, + "grad_norm": 0.31525343656539917, + "learning_rate": 4.7035060088396965e-05, + "loss": 0.1011, + "num_input_tokens_seen": 4496216, + "step": 6265 + }, + { + "epoch": 13.035343035343036, + "grad_norm": 0.15883903205394745, + "learning_rate": 4.703042094539839e-05, + "loss": 0.1016, + "num_input_tokens_seen": 4499864, + "step": 6270 + }, + { + "epoch": 13.045738045738046, + "grad_norm": 0.22852876782417297, + "learning_rate": 4.702577840503206e-05, + "loss": 0.119, + "num_input_tokens_seen": 4503320, + "step": 6275 + }, + { + "epoch": 13.056133056133056, + "grad_norm": 0.5528218746185303, + "learning_rate": 4.70211324680139e-05, + "loss": 0.2078, + "num_input_tokens_seen": 4506904, + "step": 6280 + }, + { + "epoch": 13.066528066528067, + "grad_norm": 0.18913687765598297, + "learning_rate": 4.7016483135060386e-05, + "loss": 0.1099, + "num_input_tokens_seen": 4510520, + "step": 6285 + }, + { + "epoch": 13.076923076923077, + "grad_norm": 0.5122366547584534, + "learning_rate": 4.701183040688849e-05, + "loss": 0.1255, + "num_input_tokens_seen": 4514296, + "step": 6290 + }, + { + "epoch": 13.087318087318087, + "grad_norm": 0.40515682101249695, + "learning_rate": 4.700717428421573e-05, + "loss": 0.1508, + "num_input_tokens_seen": 4517912, + "step": 6295 + }, + { + "epoch": 13.097713097713097, + "grad_norm": 0.18163777887821198, + "learning_rate": 4.700251476776014e-05, + "loss": 0.1499, + "num_input_tokens_seen": 4521496, + "step": 6300 + }, + { + "epoch": 13.108108108108109, + "grad_norm": 0.2672940790653229, + "learning_rate": 4.699785185824026e-05, + "loss": 0.1122, + "num_input_tokens_seen": 4525016, + "step": 6305 + }, + { + "epoch": 13.118503118503119, + "grad_norm": 0.360408216714859, + "learning_rate": 4.699318555637519e-05, + "loss": 0.1398, + "num_input_tokens_seen": 4528536, + "step": 6310 + }, + { + "epoch": 13.128898128898129, + "grad_norm": 0.22988854348659515, + "learning_rate": 4.6988515862884525e-05, + "loss": 0.1229, + "num_input_tokens_seen": 4532216, + "step": 6315 + }, + { + "epoch": 13.13929313929314, + "grad_norm": 0.3408355116844177, + "learning_rate": 4.698384277848838e-05, + "loss": 0.156, + "num_input_tokens_seen": 4535704, + "step": 6320 + }, + { + "epoch": 13.14968814968815, + "grad_norm": 0.2434535026550293, + "learning_rate": 4.6979166303907425e-05, + "loss": 0.123, + "num_input_tokens_seen": 4539384, + "step": 6325 + }, + { + "epoch": 13.16008316008316, + "grad_norm": 0.2887505292892456, + "learning_rate": 4.697448643986281e-05, + "loss": 0.1415, + "num_input_tokens_seen": 4542872, + "step": 6330 + }, + { + "epoch": 13.170478170478171, + "grad_norm": 0.2622537910938263, + "learning_rate": 4.696980318707624e-05, + "loss": 0.1158, + "num_input_tokens_seen": 4546392, + "step": 6335 + }, + { + "epoch": 13.180873180873181, + "grad_norm": 0.35556700825691223, + "learning_rate": 4.6965116546269924e-05, + "loss": 0.1283, + "num_input_tokens_seen": 4549912, + "step": 6340 + }, + { + "epoch": 13.191268191268192, + "grad_norm": 0.6879853010177612, + "learning_rate": 4.6960426518166615e-05, + "loss": 0.1105, + "num_input_tokens_seen": 4553496, + "step": 6345 + }, + { + "epoch": 13.201663201663202, + "grad_norm": 0.24379733204841614, + "learning_rate": 4.6955733103489556e-05, + "loss": 0.1354, + "num_input_tokens_seen": 4557208, + "step": 6350 + }, + { + "epoch": 13.212058212058212, + "grad_norm": 0.1654547154903412, + "learning_rate": 4.695103630296255e-05, + "loss": 0.1433, + "num_input_tokens_seen": 4560728, + "step": 6355 + }, + { + "epoch": 13.222453222453222, + "grad_norm": 0.32699474692344666, + "learning_rate": 4.694633611730988e-05, + "loss": 0.1571, + "num_input_tokens_seen": 4564408, + "step": 6360 + }, + { + "epoch": 13.232848232848234, + "grad_norm": 0.47416120767593384, + "learning_rate": 4.694163254725639e-05, + "loss": 0.1398, + "num_input_tokens_seen": 4567896, + "step": 6365 + }, + { + "epoch": 13.243243243243244, + "grad_norm": 0.2039574235677719, + "learning_rate": 4.693692559352743e-05, + "loss": 0.1515, + "num_input_tokens_seen": 4571256, + "step": 6370 + }, + { + "epoch": 13.253638253638254, + "grad_norm": 0.5388666987419128, + "learning_rate": 4.693221525684886e-05, + "loss": 0.1261, + "num_input_tokens_seen": 4574904, + "step": 6375 + }, + { + "epoch": 13.264033264033264, + "grad_norm": 0.36443331837654114, + "learning_rate": 4.6927501537947084e-05, + "loss": 0.1055, + "num_input_tokens_seen": 4578552, + "step": 6380 + }, + { + "epoch": 13.274428274428274, + "grad_norm": 0.21137557923793793, + "learning_rate": 4.692278443754901e-05, + "loss": 0.1305, + "num_input_tokens_seen": 4582104, + "step": 6385 + }, + { + "epoch": 13.284823284823284, + "grad_norm": 0.2321098893880844, + "learning_rate": 4.691806395638208e-05, + "loss": 0.1557, + "num_input_tokens_seen": 4585816, + "step": 6390 + }, + { + "epoch": 13.295218295218294, + "grad_norm": 0.1822207272052765, + "learning_rate": 4.6913340095174255e-05, + "loss": 0.1865, + "num_input_tokens_seen": 4589368, + "step": 6395 + }, + { + "epoch": 13.305613305613306, + "grad_norm": 0.20048247277736664, + "learning_rate": 4.690861285465399e-05, + "loss": 0.146, + "num_input_tokens_seen": 4592824, + "step": 6400 + }, + { + "epoch": 13.305613305613306, + "eval_loss": 0.14829862117767334, + "eval_runtime": 7.7728, + "eval_samples_per_second": 110.127, + "eval_steps_per_second": 27.532, + "num_input_tokens_seen": 4592824, + "step": 6400 + }, + { + "epoch": 13.316008316008316, + "grad_norm": 0.23779290914535522, + "learning_rate": 4.690388223555031e-05, + "loss": 0.1485, + "num_input_tokens_seen": 4596312, + "step": 6405 + }, + { + "epoch": 13.326403326403327, + "grad_norm": 0.2795467972755432, + "learning_rate": 4.689914823859273e-05, + "loss": 0.112, + "num_input_tokens_seen": 4599864, + "step": 6410 + }, + { + "epoch": 13.336798336798337, + "grad_norm": 0.4631289541721344, + "learning_rate": 4.689441086451129e-05, + "loss": 0.1037, + "num_input_tokens_seen": 4603416, + "step": 6415 + }, + { + "epoch": 13.347193347193347, + "grad_norm": 0.44174936413764954, + "learning_rate": 4.688967011403655e-05, + "loss": 0.1139, + "num_input_tokens_seen": 4607000, + "step": 6420 + }, + { + "epoch": 13.357588357588357, + "grad_norm": 0.3328201174736023, + "learning_rate": 4.68849259878996e-05, + "loss": 0.1034, + "num_input_tokens_seen": 4610648, + "step": 6425 + }, + { + "epoch": 13.367983367983369, + "grad_norm": 0.3980903625488281, + "learning_rate": 4.6880178486832036e-05, + "loss": 0.1314, + "num_input_tokens_seen": 4614264, + "step": 6430 + }, + { + "epoch": 13.378378378378379, + "grad_norm": 0.2921513020992279, + "learning_rate": 4.687542761156598e-05, + "loss": 0.1443, + "num_input_tokens_seen": 4617912, + "step": 6435 + }, + { + "epoch": 13.388773388773389, + "grad_norm": 0.2347293198108673, + "learning_rate": 4.6870673362834096e-05, + "loss": 0.091, + "num_input_tokens_seen": 4621432, + "step": 6440 + }, + { + "epoch": 13.3991683991684, + "grad_norm": 0.2950640320777893, + "learning_rate": 4.6865915741369526e-05, + "loss": 0.1011, + "num_input_tokens_seen": 4625112, + "step": 6445 + }, + { + "epoch": 13.40956340956341, + "grad_norm": 0.33001837134361267, + "learning_rate": 4.686115474790597e-05, + "loss": 0.1312, + "num_input_tokens_seen": 4628664, + "step": 6450 + }, + { + "epoch": 13.41995841995842, + "grad_norm": 0.15870089828968048, + "learning_rate": 4.685639038317762e-05, + "loss": 0.0875, + "num_input_tokens_seen": 4632216, + "step": 6455 + }, + { + "epoch": 13.43035343035343, + "grad_norm": 0.26550060510635376, + "learning_rate": 4.685162264791921e-05, + "loss": 0.1645, + "num_input_tokens_seen": 4635896, + "step": 6460 + }, + { + "epoch": 13.440748440748441, + "grad_norm": 0.4690372943878174, + "learning_rate": 4.684685154286599e-05, + "loss": 0.1367, + "num_input_tokens_seen": 4639512, + "step": 6465 + }, + { + "epoch": 13.451143451143452, + "grad_norm": 0.3423502445220947, + "learning_rate": 4.684207706875371e-05, + "loss": 0.1287, + "num_input_tokens_seen": 4643096, + "step": 6470 + }, + { + "epoch": 13.461538461538462, + "grad_norm": 0.26217886805534363, + "learning_rate": 4.683729922631866e-05, + "loss": 0.1084, + "num_input_tokens_seen": 4646648, + "step": 6475 + }, + { + "epoch": 13.471933471933472, + "grad_norm": 0.2306244820356369, + "learning_rate": 4.683251801629765e-05, + "loss": 0.1538, + "num_input_tokens_seen": 4650200, + "step": 6480 + }, + { + "epoch": 13.482328482328482, + "grad_norm": 0.46668577194213867, + "learning_rate": 4.6827733439428e-05, + "loss": 0.1157, + "num_input_tokens_seen": 4653656, + "step": 6485 + }, + { + "epoch": 13.492723492723492, + "grad_norm": 0.2334165871143341, + "learning_rate": 4.682294549644754e-05, + "loss": 0.1461, + "num_input_tokens_seen": 4657272, + "step": 6490 + }, + { + "epoch": 13.503118503118504, + "grad_norm": 0.43996819853782654, + "learning_rate": 4.681815418809464e-05, + "loss": 0.1532, + "num_input_tokens_seen": 4661080, + "step": 6495 + }, + { + "epoch": 13.513513513513514, + "grad_norm": 0.7447423338890076, + "learning_rate": 4.681335951510819e-05, + "loss": 0.1846, + "num_input_tokens_seen": 4664696, + "step": 6500 + }, + { + "epoch": 13.523908523908524, + "grad_norm": 0.44975078105926514, + "learning_rate": 4.6808561478227576e-05, + "loss": 0.1207, + "num_input_tokens_seen": 4668344, + "step": 6505 + }, + { + "epoch": 13.534303534303534, + "grad_norm": 0.15541210770606995, + "learning_rate": 4.680376007819271e-05, + "loss": 0.1393, + "num_input_tokens_seen": 4671992, + "step": 6510 + }, + { + "epoch": 13.544698544698544, + "grad_norm": 0.39336854219436646, + "learning_rate": 4.679895531574405e-05, + "loss": 0.1216, + "num_input_tokens_seen": 4675576, + "step": 6515 + }, + { + "epoch": 13.555093555093555, + "grad_norm": 0.39494872093200684, + "learning_rate": 4.679414719162253e-05, + "loss": 0.1134, + "num_input_tokens_seen": 4679224, + "step": 6520 + }, + { + "epoch": 13.565488565488565, + "grad_norm": 0.24018438160419464, + "learning_rate": 4.6789335706569635e-05, + "loss": 0.1424, + "num_input_tokens_seen": 4682744, + "step": 6525 + }, + { + "epoch": 13.575883575883577, + "grad_norm": 0.23548544943332672, + "learning_rate": 4.678452086132734e-05, + "loss": 0.1478, + "num_input_tokens_seen": 4686328, + "step": 6530 + }, + { + "epoch": 13.586278586278587, + "grad_norm": 0.26589804887771606, + "learning_rate": 4.677970265663818e-05, + "loss": 0.1273, + "num_input_tokens_seen": 4690040, + "step": 6535 + }, + { + "epoch": 13.596673596673597, + "grad_norm": 0.4111003279685974, + "learning_rate": 4.677488109324517e-05, + "loss": 0.1189, + "num_input_tokens_seen": 4693656, + "step": 6540 + }, + { + "epoch": 13.607068607068607, + "grad_norm": 0.2997049391269684, + "learning_rate": 4.6770056171891846e-05, + "loss": 0.1343, + "num_input_tokens_seen": 4697304, + "step": 6545 + }, + { + "epoch": 13.617463617463617, + "grad_norm": 0.20258161425590515, + "learning_rate": 4.6765227893322286e-05, + "loss": 0.1815, + "num_input_tokens_seen": 4701016, + "step": 6550 + }, + { + "epoch": 13.627858627858627, + "grad_norm": 0.8264641761779785, + "learning_rate": 4.676039625828107e-05, + "loss": 0.1298, + "num_input_tokens_seen": 4704696, + "step": 6555 + }, + { + "epoch": 13.638253638253639, + "grad_norm": 0.25974881649017334, + "learning_rate": 4.675556126751328e-05, + "loss": 0.1499, + "num_input_tokens_seen": 4708344, + "step": 6560 + }, + { + "epoch": 13.64864864864865, + "grad_norm": 0.36490583419799805, + "learning_rate": 4.6750722921764556e-05, + "loss": 0.1696, + "num_input_tokens_seen": 4712056, + "step": 6565 + }, + { + "epoch": 13.65904365904366, + "grad_norm": 0.4593743085861206, + "learning_rate": 4.674588122178102e-05, + "loss": 0.1191, + "num_input_tokens_seen": 4715576, + "step": 6570 + }, + { + "epoch": 13.66943866943867, + "grad_norm": 0.2133546769618988, + "learning_rate": 4.674103616830931e-05, + "loss": 0.1111, + "num_input_tokens_seen": 4719224, + "step": 6575 + }, + { + "epoch": 13.67983367983368, + "grad_norm": 0.4068198502063751, + "learning_rate": 4.673618776209663e-05, + "loss": 0.1566, + "num_input_tokens_seen": 4722840, + "step": 6580 + }, + { + "epoch": 13.69022869022869, + "grad_norm": 0.32288384437561035, + "learning_rate": 4.673133600389063e-05, + "loss": 0.1196, + "num_input_tokens_seen": 4726424, + "step": 6585 + }, + { + "epoch": 13.700623700623701, + "grad_norm": 0.3039219379425049, + "learning_rate": 4.672648089443953e-05, + "loss": 0.1201, + "num_input_tokens_seen": 4729976, + "step": 6590 + }, + { + "epoch": 13.711018711018712, + "grad_norm": 0.3020685315132141, + "learning_rate": 4.672162243449204e-05, + "loss": 0.1567, + "num_input_tokens_seen": 4733592, + "step": 6595 + }, + { + "epoch": 13.721413721413722, + "grad_norm": 0.4467168152332306, + "learning_rate": 4.67167606247974e-05, + "loss": 0.136, + "num_input_tokens_seen": 4737208, + "step": 6600 + }, + { + "epoch": 13.721413721413722, + "eval_loss": 0.15221557021141052, + "eval_runtime": 7.7662, + "eval_samples_per_second": 110.222, + "eval_steps_per_second": 27.555, + "num_input_tokens_seen": 4737208, + "step": 6600 + }, + { + "epoch": 13.731808731808732, + "grad_norm": 0.23119904100894928, + "learning_rate": 4.671189546610536e-05, + "loss": 0.1085, + "num_input_tokens_seen": 4740824, + "step": 6605 + }, + { + "epoch": 13.742203742203742, + "grad_norm": 0.165739968419075, + "learning_rate": 4.67070269591662e-05, + "loss": 0.1402, + "num_input_tokens_seen": 4744152, + "step": 6610 + }, + { + "epoch": 13.752598752598752, + "grad_norm": 0.5048744082450867, + "learning_rate": 4.670215510473068e-05, + "loss": 0.1363, + "num_input_tokens_seen": 4747768, + "step": 6615 + }, + { + "epoch": 13.762993762993762, + "grad_norm": 0.25703054666519165, + "learning_rate": 4.669727990355013e-05, + "loss": 0.126, + "num_input_tokens_seen": 4751320, + "step": 6620 + }, + { + "epoch": 13.773388773388774, + "grad_norm": 0.25536733865737915, + "learning_rate": 4.669240135637635e-05, + "loss": 0.1285, + "num_input_tokens_seen": 4754904, + "step": 6625 + }, + { + "epoch": 13.783783783783784, + "grad_norm": 0.3308922052383423, + "learning_rate": 4.6687519463961675e-05, + "loss": 0.1026, + "num_input_tokens_seen": 4758424, + "step": 6630 + }, + { + "epoch": 13.794178794178794, + "grad_norm": 0.21978645026683807, + "learning_rate": 4.668263422705896e-05, + "loss": 0.1278, + "num_input_tokens_seen": 4761912, + "step": 6635 + }, + { + "epoch": 13.804573804573804, + "grad_norm": 0.25805652141571045, + "learning_rate": 4.667774564642156e-05, + "loss": 0.1065, + "num_input_tokens_seen": 4765496, + "step": 6640 + }, + { + "epoch": 13.814968814968815, + "grad_norm": 0.13856098055839539, + "learning_rate": 4.6672853722803365e-05, + "loss": 0.11, + "num_input_tokens_seen": 4769016, + "step": 6645 + }, + { + "epoch": 13.825363825363825, + "grad_norm": 0.27171194553375244, + "learning_rate": 4.666795845695877e-05, + "loss": 0.1391, + "num_input_tokens_seen": 4772536, + "step": 6650 + }, + { + "epoch": 13.835758835758837, + "grad_norm": 0.33405575156211853, + "learning_rate": 4.666305984964269e-05, + "loss": 0.1222, + "num_input_tokens_seen": 4776152, + "step": 6655 + }, + { + "epoch": 13.846153846153847, + "grad_norm": 0.8149657249450684, + "learning_rate": 4.6658157901610535e-05, + "loss": 0.2293, + "num_input_tokens_seen": 4779832, + "step": 6660 + }, + { + "epoch": 13.856548856548857, + "grad_norm": 0.26354464888572693, + "learning_rate": 4.665325261361826e-05, + "loss": 0.0884, + "num_input_tokens_seen": 4783352, + "step": 6665 + }, + { + "epoch": 13.866943866943867, + "grad_norm": 0.5817683339118958, + "learning_rate": 4.664834398642232e-05, + "loss": 0.1786, + "num_input_tokens_seen": 4786872, + "step": 6670 + }, + { + "epoch": 13.877338877338877, + "grad_norm": 0.29001250863075256, + "learning_rate": 4.6643432020779686e-05, + "loss": 0.1111, + "num_input_tokens_seen": 4790488, + "step": 6675 + }, + { + "epoch": 13.887733887733887, + "grad_norm": 0.5671786665916443, + "learning_rate": 4.663851671744786e-05, + "loss": 0.1544, + "num_input_tokens_seen": 4794136, + "step": 6680 + }, + { + "epoch": 13.898128898128899, + "grad_norm": 0.38097551465034485, + "learning_rate": 4.6633598077184815e-05, + "loss": 0.1657, + "num_input_tokens_seen": 4797752, + "step": 6685 + }, + { + "epoch": 13.90852390852391, + "grad_norm": 0.3397182822227478, + "learning_rate": 4.662867610074908e-05, + "loss": 0.1476, + "num_input_tokens_seen": 4801432, + "step": 6690 + }, + { + "epoch": 13.91891891891892, + "grad_norm": 0.18701551854610443, + "learning_rate": 4.6623750788899696e-05, + "loss": 0.1536, + "num_input_tokens_seen": 4805144, + "step": 6695 + }, + { + "epoch": 13.92931392931393, + "grad_norm": 0.21402184665203094, + "learning_rate": 4.6618822142396195e-05, + "loss": 0.1029, + "num_input_tokens_seen": 4808728, + "step": 6700 + }, + { + "epoch": 13.93970893970894, + "grad_norm": 0.3110713064670563, + "learning_rate": 4.661389016199864e-05, + "loss": 0.1723, + "num_input_tokens_seen": 4812344, + "step": 6705 + }, + { + "epoch": 13.95010395010395, + "grad_norm": 0.2690982222557068, + "learning_rate": 4.660895484846761e-05, + "loss": 0.1079, + "num_input_tokens_seen": 4815800, + "step": 6710 + }, + { + "epoch": 13.96049896049896, + "grad_norm": 0.2620570957660675, + "learning_rate": 4.660401620256418e-05, + "loss": 0.1295, + "num_input_tokens_seen": 4819544, + "step": 6715 + }, + { + "epoch": 13.970893970893972, + "grad_norm": 0.154879629611969, + "learning_rate": 4.659907422504997e-05, + "loss": 0.0877, + "num_input_tokens_seen": 4823064, + "step": 6720 + }, + { + "epoch": 13.981288981288982, + "grad_norm": 0.37390172481536865, + "learning_rate": 4.6594128916687074e-05, + "loss": 0.1971, + "num_input_tokens_seen": 4826648, + "step": 6725 + }, + { + "epoch": 13.991683991683992, + "grad_norm": 0.27629056572914124, + "learning_rate": 4.658918027823813e-05, + "loss": 0.1414, + "num_input_tokens_seen": 4830136, + "step": 6730 + }, + { + "epoch": 14.002079002079002, + "grad_norm": 0.2666909992694855, + "learning_rate": 4.658422831046628e-05, + "loss": 0.1804, + "num_input_tokens_seen": 4833640, + "step": 6735 + }, + { + "epoch": 14.012474012474012, + "grad_norm": 0.3316201865673065, + "learning_rate": 4.657927301413518e-05, + "loss": 0.149, + "num_input_tokens_seen": 4837192, + "step": 6740 + }, + { + "epoch": 14.022869022869022, + "grad_norm": 0.16631370782852173, + "learning_rate": 4.657431439000901e-05, + "loss": 0.111, + "num_input_tokens_seen": 4840680, + "step": 6745 + }, + { + "epoch": 14.033264033264032, + "grad_norm": 0.15110467374324799, + "learning_rate": 4.656935243885243e-05, + "loss": 0.1756, + "num_input_tokens_seen": 4844424, + "step": 6750 + }, + { + "epoch": 14.043659043659044, + "grad_norm": 0.22220613062381744, + "learning_rate": 4.656438716143066e-05, + "loss": 0.1281, + "num_input_tokens_seen": 4848008, + "step": 6755 + }, + { + "epoch": 14.054054054054054, + "grad_norm": 0.28008660674095154, + "learning_rate": 4.6559418558509384e-05, + "loss": 0.1355, + "num_input_tokens_seen": 4851464, + "step": 6760 + }, + { + "epoch": 14.064449064449065, + "grad_norm": 0.1626136749982834, + "learning_rate": 4.6554446630854833e-05, + "loss": 0.1023, + "num_input_tokens_seen": 4854920, + "step": 6765 + }, + { + "epoch": 14.074844074844075, + "grad_norm": 0.8161218166351318, + "learning_rate": 4.654947137923374e-05, + "loss": 0.1903, + "num_input_tokens_seen": 4858536, + "step": 6770 + }, + { + "epoch": 14.085239085239085, + "grad_norm": 0.29074716567993164, + "learning_rate": 4.654449280441335e-05, + "loss": 0.1223, + "num_input_tokens_seen": 4861992, + "step": 6775 + }, + { + "epoch": 14.095634095634095, + "grad_norm": 0.34513989090919495, + "learning_rate": 4.653951090716143e-05, + "loss": 0.1446, + "num_input_tokens_seen": 4865544, + "step": 6780 + }, + { + "epoch": 14.106029106029107, + "grad_norm": 0.28448405861854553, + "learning_rate": 4.653452568824625e-05, + "loss": 0.152, + "num_input_tokens_seen": 4869128, + "step": 6785 + }, + { + "epoch": 14.116424116424117, + "grad_norm": 0.19650989770889282, + "learning_rate": 4.6529537148436585e-05, + "loss": 0.1171, + "num_input_tokens_seen": 4872872, + "step": 6790 + }, + { + "epoch": 14.126819126819127, + "grad_norm": 0.2794972062110901, + "learning_rate": 4.6524545288501734e-05, + "loss": 0.1625, + "num_input_tokens_seen": 4876552, + "step": 6795 + }, + { + "epoch": 14.137214137214137, + "grad_norm": 0.1367776244878769, + "learning_rate": 4.6519550109211506e-05, + "loss": 0.1068, + "num_input_tokens_seen": 4880104, + "step": 6800 + }, + { + "epoch": 14.137214137214137, + "eval_loss": 0.15028069913387299, + "eval_runtime": 7.7914, + "eval_samples_per_second": 109.865, + "eval_steps_per_second": 27.466, + "num_input_tokens_seen": 4880104, + "step": 6800 + }, + { + "epoch": 14.147609147609147, + "grad_norm": 0.31814318895339966, + "learning_rate": 4.651455161133622e-05, + "loss": 0.1159, + "num_input_tokens_seen": 4883624, + "step": 6805 + }, + { + "epoch": 14.158004158004157, + "grad_norm": 0.16287486255168915, + "learning_rate": 4.6509549795646704e-05, + "loss": 0.1214, + "num_input_tokens_seen": 4887208, + "step": 6810 + }, + { + "epoch": 14.16839916839917, + "grad_norm": 0.2411595731973648, + "learning_rate": 4.6504544662914306e-05, + "loss": 0.1234, + "num_input_tokens_seen": 4890824, + "step": 6815 + }, + { + "epoch": 14.17879417879418, + "grad_norm": 0.2892874479293823, + "learning_rate": 4.6499536213910876e-05, + "loss": 0.1444, + "num_input_tokens_seen": 4894312, + "step": 6820 + }, + { + "epoch": 14.18918918918919, + "grad_norm": 0.25414103269577026, + "learning_rate": 4.6494524449408786e-05, + "loss": 0.1671, + "num_input_tokens_seen": 4897928, + "step": 6825 + }, + { + "epoch": 14.1995841995842, + "grad_norm": 0.26235073804855347, + "learning_rate": 4.6489509370180903e-05, + "loss": 0.1229, + "num_input_tokens_seen": 4901512, + "step": 6830 + }, + { + "epoch": 14.20997920997921, + "grad_norm": 0.20430827140808105, + "learning_rate": 4.648449097700063e-05, + "loss": 0.1304, + "num_input_tokens_seen": 4905192, + "step": 6835 + }, + { + "epoch": 14.22037422037422, + "grad_norm": 0.2986273169517517, + "learning_rate": 4.647946927064185e-05, + "loss": 0.1536, + "num_input_tokens_seen": 4908840, + "step": 6840 + }, + { + "epoch": 14.23076923076923, + "grad_norm": 0.460219144821167, + "learning_rate": 4.647444425187898e-05, + "loss": 0.1288, + "num_input_tokens_seen": 4912552, + "step": 6845 + }, + { + "epoch": 14.241164241164242, + "grad_norm": 0.44355952739715576, + "learning_rate": 4.646941592148695e-05, + "loss": 0.1367, + "num_input_tokens_seen": 4916232, + "step": 6850 + }, + { + "epoch": 14.251559251559252, + "grad_norm": 0.31494104862213135, + "learning_rate": 4.646438428024117e-05, + "loss": 0.1269, + "num_input_tokens_seen": 4920072, + "step": 6855 + }, + { + "epoch": 14.261954261954262, + "grad_norm": 0.15355724096298218, + "learning_rate": 4.64593493289176e-05, + "loss": 0.1205, + "num_input_tokens_seen": 4923624, + "step": 6860 + }, + { + "epoch": 14.272349272349272, + "grad_norm": 0.45474618673324585, + "learning_rate": 4.64543110682927e-05, + "loss": 0.1413, + "num_input_tokens_seen": 4927240, + "step": 6865 + }, + { + "epoch": 14.282744282744282, + "grad_norm": 0.2530617117881775, + "learning_rate": 4.644926949914341e-05, + "loss": 0.1552, + "num_input_tokens_seen": 4930760, + "step": 6870 + }, + { + "epoch": 14.293139293139292, + "grad_norm": 0.19724534451961517, + "learning_rate": 4.644422462224722e-05, + "loss": 0.1495, + "num_input_tokens_seen": 4934216, + "step": 6875 + }, + { + "epoch": 14.303534303534304, + "grad_norm": 0.5420207977294922, + "learning_rate": 4.643917643838211e-05, + "loss": 0.1092, + "num_input_tokens_seen": 4937768, + "step": 6880 + }, + { + "epoch": 14.313929313929314, + "grad_norm": 0.1952446550130844, + "learning_rate": 4.6434124948326564e-05, + "loss": 0.1228, + "num_input_tokens_seen": 4941352, + "step": 6885 + }, + { + "epoch": 14.324324324324325, + "grad_norm": 0.3269888162612915, + "learning_rate": 4.6429070152859594e-05, + "loss": 0.1295, + "num_input_tokens_seen": 4944744, + "step": 6890 + }, + { + "epoch": 14.334719334719335, + "grad_norm": 0.2142140120267868, + "learning_rate": 4.6424012052760714e-05, + "loss": 0.1307, + "num_input_tokens_seen": 4948232, + "step": 6895 + }, + { + "epoch": 14.345114345114345, + "grad_norm": 0.20271757245063782, + "learning_rate": 4.6418950648809945e-05, + "loss": 0.1346, + "num_input_tokens_seen": 4951816, + "step": 6900 + }, + { + "epoch": 14.355509355509355, + "grad_norm": 0.26822298765182495, + "learning_rate": 4.641388594178782e-05, + "loss": 0.1564, + "num_input_tokens_seen": 4955432, + "step": 6905 + }, + { + "epoch": 14.365904365904367, + "grad_norm": 0.1582370400428772, + "learning_rate": 4.640881793247538e-05, + "loss": 0.1447, + "num_input_tokens_seen": 4959048, + "step": 6910 + }, + { + "epoch": 14.376299376299377, + "grad_norm": 0.3157139718532562, + "learning_rate": 4.6403746621654173e-05, + "loss": 0.1387, + "num_input_tokens_seen": 4962536, + "step": 6915 + }, + { + "epoch": 14.386694386694387, + "grad_norm": 0.23629382252693176, + "learning_rate": 4.639867201010626e-05, + "loss": 0.1117, + "num_input_tokens_seen": 4966184, + "step": 6920 + }, + { + "epoch": 14.397089397089397, + "grad_norm": 0.31121736764907837, + "learning_rate": 4.6393594098614204e-05, + "loss": 0.144, + "num_input_tokens_seen": 4969704, + "step": 6925 + }, + { + "epoch": 14.407484407484407, + "grad_norm": 0.3077380955219269, + "learning_rate": 4.63885128879611e-05, + "loss": 0.118, + "num_input_tokens_seen": 4973352, + "step": 6930 + }, + { + "epoch": 14.417879417879417, + "grad_norm": 0.1939311921596527, + "learning_rate": 4.638342837893052e-05, + "loss": 0.1486, + "num_input_tokens_seen": 4977224, + "step": 6935 + }, + { + "epoch": 14.428274428274428, + "grad_norm": 0.6368141770362854, + "learning_rate": 4.6378340572306565e-05, + "loss": 0.1397, + "num_input_tokens_seen": 4980936, + "step": 6940 + }, + { + "epoch": 14.43866943866944, + "grad_norm": 0.23949939012527466, + "learning_rate": 4.6373249468873833e-05, + "loss": 0.1016, + "num_input_tokens_seen": 4984456, + "step": 6945 + }, + { + "epoch": 14.44906444906445, + "grad_norm": 0.35903993248939514, + "learning_rate": 4.636815506941744e-05, + "loss": 0.1565, + "num_input_tokens_seen": 4988008, + "step": 6950 + }, + { + "epoch": 14.45945945945946, + "grad_norm": 0.17080722749233246, + "learning_rate": 4.6363057374723004e-05, + "loss": 0.1473, + "num_input_tokens_seen": 4991528, + "step": 6955 + }, + { + "epoch": 14.46985446985447, + "grad_norm": 0.24676482379436493, + "learning_rate": 4.635795638557666e-05, + "loss": 0.1294, + "num_input_tokens_seen": 4994952, + "step": 6960 + }, + { + "epoch": 14.48024948024948, + "grad_norm": 0.34127265214920044, + "learning_rate": 4.635285210276504e-05, + "loss": 0.1551, + "num_input_tokens_seen": 4998504, + "step": 6965 + }, + { + "epoch": 14.49064449064449, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.6347744527075295e-05, + "loss": 0.1396, + "num_input_tokens_seen": 5002024, + "step": 6970 + }, + { + "epoch": 14.5010395010395, + "grad_norm": 0.3758174180984497, + "learning_rate": 4.634263365929506e-05, + "loss": 0.1262, + "num_input_tokens_seen": 5005736, + "step": 6975 + }, + { + "epoch": 14.511434511434512, + "grad_norm": 0.2213446944952011, + "learning_rate": 4.6337519500212515e-05, + "loss": 0.1209, + "num_input_tokens_seen": 5009352, + "step": 6980 + }, + { + "epoch": 14.521829521829522, + "grad_norm": 0.5398876070976257, + "learning_rate": 4.633240205061632e-05, + "loss": 0.1298, + "num_input_tokens_seen": 5013032, + "step": 6985 + }, + { + "epoch": 14.532224532224532, + "grad_norm": 0.33827123045921326, + "learning_rate": 4.632728131129565e-05, + "loss": 0.1315, + "num_input_tokens_seen": 5016808, + "step": 6990 + }, + { + "epoch": 14.542619542619542, + "grad_norm": 0.4199237823486328, + "learning_rate": 4.632215728304018e-05, + "loss": 0.1054, + "num_input_tokens_seen": 5020552, + "step": 6995 + }, + { + "epoch": 14.553014553014552, + "grad_norm": 0.486012727022171, + "learning_rate": 4.63170299666401e-05, + "loss": 0.1307, + "num_input_tokens_seen": 5024232, + "step": 7000 + }, + { + "epoch": 14.553014553014552, + "eval_loss": 0.1544971615076065, + "eval_runtime": 7.7647, + "eval_samples_per_second": 110.243, + "eval_steps_per_second": 27.561, + "num_input_tokens_seen": 5024232, + "step": 7000 + }, + { + "epoch": 14.563409563409563, + "grad_norm": 0.6983143091201782, + "learning_rate": 4.631189936288612e-05, + "loss": 0.1495, + "num_input_tokens_seen": 5028072, + "step": 7005 + }, + { + "epoch": 14.573804573804575, + "grad_norm": 0.12342843413352966, + "learning_rate": 4.630676547256944e-05, + "loss": 0.1158, + "num_input_tokens_seen": 5031592, + "step": 7010 + }, + { + "epoch": 14.584199584199585, + "grad_norm": 0.4448213577270508, + "learning_rate": 4.630162829648176e-05, + "loss": 0.1462, + "num_input_tokens_seen": 5035240, + "step": 7015 + }, + { + "epoch": 14.594594594594595, + "grad_norm": 0.643597424030304, + "learning_rate": 4.629648783541531e-05, + "loss": 0.1489, + "num_input_tokens_seen": 5038952, + "step": 7020 + }, + { + "epoch": 14.604989604989605, + "grad_norm": 0.2179875671863556, + "learning_rate": 4.6291344090162804e-05, + "loss": 0.087, + "num_input_tokens_seen": 5042472, + "step": 7025 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 0.2975456416606903, + "learning_rate": 4.628619706151748e-05, + "loss": 0.136, + "num_input_tokens_seen": 5046152, + "step": 7030 + }, + { + "epoch": 14.625779625779625, + "grad_norm": 0.20607823133468628, + "learning_rate": 4.628104675027306e-05, + "loss": 0.1481, + "num_input_tokens_seen": 5049768, + "step": 7035 + }, + { + "epoch": 14.636174636174637, + "grad_norm": 0.14142075181007385, + "learning_rate": 4.6275893157223805e-05, + "loss": 0.1468, + "num_input_tokens_seen": 5053384, + "step": 7040 + }, + { + "epoch": 14.646569646569647, + "grad_norm": 0.21488161385059357, + "learning_rate": 4.627073628316445e-05, + "loss": 0.1385, + "num_input_tokens_seen": 5057000, + "step": 7045 + }, + { + "epoch": 14.656964656964657, + "grad_norm": 0.40887340903282166, + "learning_rate": 4.626557612889026e-05, + "loss": 0.1012, + "num_input_tokens_seen": 5060712, + "step": 7050 + }, + { + "epoch": 14.667359667359667, + "grad_norm": 0.3617543876171112, + "learning_rate": 4.626041269519699e-05, + "loss": 0.1081, + "num_input_tokens_seen": 5064360, + "step": 7055 + }, + { + "epoch": 14.677754677754677, + "grad_norm": 0.13657481968402863, + "learning_rate": 4.6255245982880905e-05, + "loss": 0.0918, + "num_input_tokens_seen": 5067816, + "step": 7060 + }, + { + "epoch": 14.688149688149688, + "grad_norm": 0.27187755703926086, + "learning_rate": 4.625007599273879e-05, + "loss": 0.1271, + "num_input_tokens_seen": 5071400, + "step": 7065 + }, + { + "epoch": 14.698544698544698, + "grad_norm": 0.28622761368751526, + "learning_rate": 4.6244902725567895e-05, + "loss": 0.1332, + "num_input_tokens_seen": 5074952, + "step": 7070 + }, + { + "epoch": 14.70893970893971, + "grad_norm": 0.32436785101890564, + "learning_rate": 4.6239726182166024e-05, + "loss": 0.138, + "num_input_tokens_seen": 5078536, + "step": 7075 + }, + { + "epoch": 14.71933471933472, + "grad_norm": 0.7786275744438171, + "learning_rate": 4.623454636333147e-05, + "loss": 0.1675, + "num_input_tokens_seen": 5082184, + "step": 7080 + }, + { + "epoch": 14.72972972972973, + "grad_norm": 0.21182557940483093, + "learning_rate": 4.622936326986301e-05, + "loss": 0.1328, + "num_input_tokens_seen": 5085768, + "step": 7085 + }, + { + "epoch": 14.74012474012474, + "grad_norm": 0.2204027622938156, + "learning_rate": 4.6224176902559946e-05, + "loss": 0.0946, + "num_input_tokens_seen": 5089288, + "step": 7090 + }, + { + "epoch": 14.75051975051975, + "grad_norm": 0.4106265902519226, + "learning_rate": 4.621898726222209e-05, + "loss": 0.1011, + "num_input_tokens_seen": 5092904, + "step": 7095 + }, + { + "epoch": 14.76091476091476, + "grad_norm": 0.22767607867717743, + "learning_rate": 4.6213794349649744e-05, + "loss": 0.1389, + "num_input_tokens_seen": 5096456, + "step": 7100 + }, + { + "epoch": 14.771309771309772, + "grad_norm": 0.24449700117111206, + "learning_rate": 4.6208598165643715e-05, + "loss": 0.1258, + "num_input_tokens_seen": 5099944, + "step": 7105 + }, + { + "epoch": 14.781704781704782, + "grad_norm": 0.533557116985321, + "learning_rate": 4.620339871100533e-05, + "loss": 0.1487, + "num_input_tokens_seen": 5103560, + "step": 7110 + }, + { + "epoch": 14.792099792099792, + "grad_norm": 0.34528031945228577, + "learning_rate": 4.6198195986536394e-05, + "loss": 0.1297, + "num_input_tokens_seen": 5106952, + "step": 7115 + }, + { + "epoch": 14.802494802494802, + "grad_norm": 0.4567480981349945, + "learning_rate": 4.619298999303926e-05, + "loss": 0.0955, + "num_input_tokens_seen": 5110472, + "step": 7120 + }, + { + "epoch": 14.812889812889813, + "grad_norm": 0.41013309359550476, + "learning_rate": 4.618778073131673e-05, + "loss": 0.1407, + "num_input_tokens_seen": 5113928, + "step": 7125 + }, + { + "epoch": 14.823284823284823, + "grad_norm": 0.5407742261886597, + "learning_rate": 4.618256820217215e-05, + "loss": 0.1177, + "num_input_tokens_seen": 5117480, + "step": 7130 + }, + { + "epoch": 14.833679833679835, + "grad_norm": 0.35246363282203674, + "learning_rate": 4.617735240640936e-05, + "loss": 0.1565, + "num_input_tokens_seen": 5121160, + "step": 7135 + }, + { + "epoch": 14.844074844074845, + "grad_norm": 0.25698456168174744, + "learning_rate": 4.6172133344832705e-05, + "loss": 0.1379, + "num_input_tokens_seen": 5124776, + "step": 7140 + }, + { + "epoch": 14.854469854469855, + "grad_norm": 0.952627420425415, + "learning_rate": 4.6166911018247004e-05, + "loss": 0.1839, + "num_input_tokens_seen": 5128296, + "step": 7145 + }, + { + "epoch": 14.864864864864865, + "grad_norm": 0.7146037220954895, + "learning_rate": 4.616168542745764e-05, + "loss": 0.1269, + "num_input_tokens_seen": 5131752, + "step": 7150 + }, + { + "epoch": 14.875259875259875, + "grad_norm": 0.2679068446159363, + "learning_rate": 4.6156456573270446e-05, + "loss": 0.2111, + "num_input_tokens_seen": 5135464, + "step": 7155 + }, + { + "epoch": 14.885654885654885, + "grad_norm": 0.3040771484375, + "learning_rate": 4.615122445649177e-05, + "loss": 0.1524, + "num_input_tokens_seen": 5139048, + "step": 7160 + }, + { + "epoch": 14.896049896049895, + "grad_norm": 0.6332122683525085, + "learning_rate": 4.6145989077928486e-05, + "loss": 0.1383, + "num_input_tokens_seen": 5142696, + "step": 7165 + }, + { + "epoch": 14.906444906444907, + "grad_norm": 0.20469626784324646, + "learning_rate": 4.6140750438387953e-05, + "loss": 0.0962, + "num_input_tokens_seen": 5146152, + "step": 7170 + }, + { + "epoch": 14.916839916839917, + "grad_norm": 0.18935923278331757, + "learning_rate": 4.613550853867803e-05, + "loss": 0.1025, + "num_input_tokens_seen": 5149640, + "step": 7175 + }, + { + "epoch": 14.927234927234927, + "grad_norm": 0.27803486585617065, + "learning_rate": 4.613026337960708e-05, + "loss": 0.1136, + "num_input_tokens_seen": 5153128, + "step": 7180 + }, + { + "epoch": 14.937629937629938, + "grad_norm": 0.2589099705219269, + "learning_rate": 4.612501496198398e-05, + "loss": 0.1056, + "num_input_tokens_seen": 5156584, + "step": 7185 + }, + { + "epoch": 14.948024948024948, + "grad_norm": 0.29613059759140015, + "learning_rate": 4.61197632866181e-05, + "loss": 0.1678, + "num_input_tokens_seen": 5160168, + "step": 7190 + }, + { + "epoch": 14.958419958419958, + "grad_norm": 0.13302090764045715, + "learning_rate": 4.611450835431931e-05, + "loss": 0.1296, + "num_input_tokens_seen": 5163752, + "step": 7195 + }, + { + "epoch": 14.96881496881497, + "grad_norm": 0.20021088421344757, + "learning_rate": 4.6109250165898e-05, + "loss": 0.1475, + "num_input_tokens_seen": 5167336, + "step": 7200 + }, + { + "epoch": 14.96881496881497, + "eval_loss": 0.1488584578037262, + "eval_runtime": 7.7565, + "eval_samples_per_second": 110.36, + "eval_steps_per_second": 27.59, + "num_input_tokens_seen": 5167336, + "step": 7200 + }, + { + "epoch": 14.97920997920998, + "grad_norm": 0.25243064761161804, + "learning_rate": 4.610398872216503e-05, + "loss": 0.1191, + "num_input_tokens_seen": 5170856, + "step": 7205 + }, + { + "epoch": 14.98960498960499, + "grad_norm": 0.7697832584381104, + "learning_rate": 4.6098724023931796e-05, + "loss": 0.1092, + "num_input_tokens_seen": 5174600, + "step": 7210 + }, + { + "epoch": 15.0, + "grad_norm": 0.23380087316036224, + "learning_rate": 4.609345607201017e-05, + "loss": 0.187, + "num_input_tokens_seen": 5178136, + "step": 7215 + }, + { + "epoch": 15.01039501039501, + "grad_norm": 0.36638304591178894, + "learning_rate": 4.608818486721254e-05, + "loss": 0.1139, + "num_input_tokens_seen": 5181880, + "step": 7220 + }, + { + "epoch": 15.02079002079002, + "grad_norm": 0.3785219192504883, + "learning_rate": 4.608291041035179e-05, + "loss": 0.108, + "num_input_tokens_seen": 5185560, + "step": 7225 + }, + { + "epoch": 15.03118503118503, + "grad_norm": 0.43873947858810425, + "learning_rate": 4.607763270224132e-05, + "loss": 0.1479, + "num_input_tokens_seen": 5189176, + "step": 7230 + }, + { + "epoch": 15.041580041580042, + "grad_norm": 0.46573692560195923, + "learning_rate": 4.6072351743695e-05, + "loss": 0.1259, + "num_input_tokens_seen": 5192760, + "step": 7235 + }, + { + "epoch": 15.051975051975052, + "grad_norm": 0.1880912482738495, + "learning_rate": 4.606706753552723e-05, + "loss": 0.1509, + "num_input_tokens_seen": 5196344, + "step": 7240 + }, + { + "epoch": 15.062370062370062, + "grad_norm": 0.2793061435222626, + "learning_rate": 4.6061780078552906e-05, + "loss": 0.1389, + "num_input_tokens_seen": 5200024, + "step": 7245 + }, + { + "epoch": 15.072765072765073, + "grad_norm": 0.5034422278404236, + "learning_rate": 4.605648937358742e-05, + "loss": 0.1299, + "num_input_tokens_seen": 5203576, + "step": 7250 + }, + { + "epoch": 15.083160083160083, + "grad_norm": 0.21686549484729767, + "learning_rate": 4.605119542144665e-05, + "loss": 0.093, + "num_input_tokens_seen": 5207096, + "step": 7255 + }, + { + "epoch": 15.093555093555093, + "grad_norm": 0.23500056564807892, + "learning_rate": 4.604589822294701e-05, + "loss": 0.1706, + "num_input_tokens_seen": 5210840, + "step": 7260 + }, + { + "epoch": 15.103950103950105, + "grad_norm": 0.24031919240951538, + "learning_rate": 4.604059777890537e-05, + "loss": 0.1697, + "num_input_tokens_seen": 5214296, + "step": 7265 + }, + { + "epoch": 15.114345114345115, + "grad_norm": 0.2282891720533371, + "learning_rate": 4.6035294090139145e-05, + "loss": 0.1578, + "num_input_tokens_seen": 5217880, + "step": 7270 + }, + { + "epoch": 15.124740124740125, + "grad_norm": 0.5024453997612, + "learning_rate": 4.6029987157466226e-05, + "loss": 0.1433, + "num_input_tokens_seen": 5221688, + "step": 7275 + }, + { + "epoch": 15.135135135135135, + "grad_norm": 0.15691730380058289, + "learning_rate": 4.602467698170502e-05, + "loss": 0.1357, + "num_input_tokens_seen": 5225336, + "step": 7280 + }, + { + "epoch": 15.145530145530145, + "grad_norm": 0.15593189001083374, + "learning_rate": 4.601936356367439e-05, + "loss": 0.1232, + "num_input_tokens_seen": 5228920, + "step": 7285 + }, + { + "epoch": 15.155925155925155, + "grad_norm": 0.230337992310524, + "learning_rate": 4.601404690419377e-05, + "loss": 0.1302, + "num_input_tokens_seen": 5232440, + "step": 7290 + }, + { + "epoch": 15.166320166320165, + "grad_norm": 0.486013263463974, + "learning_rate": 4.600872700408303e-05, + "loss": 0.0997, + "num_input_tokens_seen": 5236088, + "step": 7295 + }, + { + "epoch": 15.176715176715177, + "grad_norm": 0.2885189354419708, + "learning_rate": 4.600340386416258e-05, + "loss": 0.1518, + "num_input_tokens_seen": 5239736, + "step": 7300 + }, + { + "epoch": 15.187110187110187, + "grad_norm": 0.21442818641662598, + "learning_rate": 4.5998077485253296e-05, + "loss": 0.137, + "num_input_tokens_seen": 5243256, + "step": 7305 + }, + { + "epoch": 15.197505197505198, + "grad_norm": 0.20300433039665222, + "learning_rate": 4.59927478681766e-05, + "loss": 0.1243, + "num_input_tokens_seen": 5246936, + "step": 7310 + }, + { + "epoch": 15.207900207900208, + "grad_norm": 0.16474245488643646, + "learning_rate": 4.5987415013754366e-05, + "loss": 0.112, + "num_input_tokens_seen": 5250264, + "step": 7315 + }, + { + "epoch": 15.218295218295218, + "grad_norm": 0.3203970193862915, + "learning_rate": 4.598207892280899e-05, + "loss": 0.1078, + "num_input_tokens_seen": 5253784, + "step": 7320 + }, + { + "epoch": 15.228690228690228, + "grad_norm": 0.2080407589673996, + "learning_rate": 4.597673959616337e-05, + "loss": 0.1337, + "num_input_tokens_seen": 5257528, + "step": 7325 + }, + { + "epoch": 15.23908523908524, + "grad_norm": 0.545668363571167, + "learning_rate": 4.597139703464089e-05, + "loss": 0.1522, + "num_input_tokens_seen": 5261240, + "step": 7330 + }, + { + "epoch": 15.24948024948025, + "grad_norm": 0.2554899752140045, + "learning_rate": 4.596605123906545e-05, + "loss": 0.1279, + "num_input_tokens_seen": 5264760, + "step": 7335 + }, + { + "epoch": 15.25987525987526, + "grad_norm": 0.2633365988731384, + "learning_rate": 4.596070221026143e-05, + "loss": 0.1206, + "num_input_tokens_seen": 5268248, + "step": 7340 + }, + { + "epoch": 15.27027027027027, + "grad_norm": 0.3653009235858917, + "learning_rate": 4.595534994905372e-05, + "loss": 0.1249, + "num_input_tokens_seen": 5271736, + "step": 7345 + }, + { + "epoch": 15.28066528066528, + "grad_norm": 0.7676973938941956, + "learning_rate": 4.594999445626771e-05, + "loss": 0.1157, + "num_input_tokens_seen": 5275448, + "step": 7350 + }, + { + "epoch": 15.29106029106029, + "grad_norm": 0.40899285674095154, + "learning_rate": 4.5944635732729276e-05, + "loss": 0.1211, + "num_input_tokens_seen": 5279064, + "step": 7355 + }, + { + "epoch": 15.301455301455302, + "grad_norm": 0.24404063820838928, + "learning_rate": 4.5939273779264804e-05, + "loss": 0.1183, + "num_input_tokens_seen": 5282744, + "step": 7360 + }, + { + "epoch": 15.311850311850312, + "grad_norm": 0.2042302042245865, + "learning_rate": 4.593390859670118e-05, + "loss": 0.1209, + "num_input_tokens_seen": 5286424, + "step": 7365 + }, + { + "epoch": 15.322245322245323, + "grad_norm": 0.4778387248516083, + "learning_rate": 4.5928540185865776e-05, + "loss": 0.1701, + "num_input_tokens_seen": 5289976, + "step": 7370 + }, + { + "epoch": 15.332640332640333, + "grad_norm": 0.1690654158592224, + "learning_rate": 4.592316854758648e-05, + "loss": 0.1447, + "num_input_tokens_seen": 5293560, + "step": 7375 + }, + { + "epoch": 15.343035343035343, + "grad_norm": 0.2989077866077423, + "learning_rate": 4.5917793682691646e-05, + "loss": 0.1131, + "num_input_tokens_seen": 5297208, + "step": 7380 + }, + { + "epoch": 15.353430353430353, + "grad_norm": 0.3571385145187378, + "learning_rate": 4.5912415592010164e-05, + "loss": 0.1464, + "num_input_tokens_seen": 5300696, + "step": 7385 + }, + { + "epoch": 15.363825363825363, + "grad_norm": 0.2907087206840515, + "learning_rate": 4.5907034276371386e-05, + "loss": 0.1069, + "num_input_tokens_seen": 5304248, + "step": 7390 + }, + { + "epoch": 15.374220374220375, + "grad_norm": 0.3667556047439575, + "learning_rate": 4.5901649736605196e-05, + "loss": 0.1111, + "num_input_tokens_seen": 5307832, + "step": 7395 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 0.3983479142189026, + "learning_rate": 4.589626197354195e-05, + "loss": 0.1159, + "num_input_tokens_seen": 5311512, + "step": 7400 + }, + { + "epoch": 15.384615384615385, + "eval_loss": 0.14758281409740448, + "eval_runtime": 7.7688, + "eval_samples_per_second": 110.184, + "eval_steps_per_second": 27.546, + "num_input_tokens_seen": 5311512, + "step": 7400 + }, + { + "epoch": 15.395010395010395, + "grad_norm": 0.2903364300727844, + "learning_rate": 4.5890870988012504e-05, + "loss": 0.1225, + "num_input_tokens_seen": 5315000, + "step": 7405 + }, + { + "epoch": 15.405405405405405, + "grad_norm": 0.2805035710334778, + "learning_rate": 4.5885476780848226e-05, + "loss": 0.1517, + "num_input_tokens_seen": 5318520, + "step": 7410 + }, + { + "epoch": 15.415800415800415, + "grad_norm": 0.21800467371940613, + "learning_rate": 4.5880079352880964e-05, + "loss": 0.1551, + "num_input_tokens_seen": 5322264, + "step": 7415 + }, + { + "epoch": 15.426195426195425, + "grad_norm": 0.19275489449501038, + "learning_rate": 4.5874678704943065e-05, + "loss": 0.1147, + "num_input_tokens_seen": 5325784, + "step": 7420 + }, + { + "epoch": 15.436590436590437, + "grad_norm": 0.3262617886066437, + "learning_rate": 4.5869274837867394e-05, + "loss": 0.1156, + "num_input_tokens_seen": 5329240, + "step": 7425 + }, + { + "epoch": 15.446985446985448, + "grad_norm": 0.256415456533432, + "learning_rate": 4.5863867752487275e-05, + "loss": 0.1186, + "num_input_tokens_seen": 5332728, + "step": 7430 + }, + { + "epoch": 15.457380457380458, + "grad_norm": 0.18405887484550476, + "learning_rate": 4.5858457449636554e-05, + "loss": 0.1117, + "num_input_tokens_seen": 5336312, + "step": 7435 + }, + { + "epoch": 15.467775467775468, + "grad_norm": 0.3794616460800171, + "learning_rate": 4.5853043930149574e-05, + "loss": 0.1619, + "num_input_tokens_seen": 5339992, + "step": 7440 + }, + { + "epoch": 15.478170478170478, + "grad_norm": 0.43071529269218445, + "learning_rate": 4.584762719486117e-05, + "loss": 0.1431, + "num_input_tokens_seen": 5343672, + "step": 7445 + }, + { + "epoch": 15.488565488565488, + "grad_norm": 0.3968266248703003, + "learning_rate": 4.584220724460665e-05, + "loss": 0.1596, + "num_input_tokens_seen": 5347384, + "step": 7450 + }, + { + "epoch": 15.4989604989605, + "grad_norm": 0.36487314105033875, + "learning_rate": 4.5836784080221865e-05, + "loss": 0.1747, + "num_input_tokens_seen": 5350840, + "step": 7455 + }, + { + "epoch": 15.50935550935551, + "grad_norm": 0.2708618938922882, + "learning_rate": 4.583135770254312e-05, + "loss": 0.1161, + "num_input_tokens_seen": 5354424, + "step": 7460 + }, + { + "epoch": 15.51975051975052, + "grad_norm": 0.18802356719970703, + "learning_rate": 4.5825928112407236e-05, + "loss": 0.1476, + "num_input_tokens_seen": 5358104, + "step": 7465 + }, + { + "epoch": 15.53014553014553, + "grad_norm": 0.20636363327503204, + "learning_rate": 4.582049531065152e-05, + "loss": 0.1326, + "num_input_tokens_seen": 5361720, + "step": 7470 + }, + { + "epoch": 15.54054054054054, + "grad_norm": 0.4212092459201813, + "learning_rate": 4.5815059298113783e-05, + "loss": 0.1501, + "num_input_tokens_seen": 5365272, + "step": 7475 + }, + { + "epoch": 15.55093555093555, + "grad_norm": 0.16625845432281494, + "learning_rate": 4.580962007563232e-05, + "loss": 0.1135, + "num_input_tokens_seen": 5368824, + "step": 7480 + }, + { + "epoch": 15.56133056133056, + "grad_norm": 0.3813585937023163, + "learning_rate": 4.5804177644045935e-05, + "loss": 0.1547, + "num_input_tokens_seen": 5372312, + "step": 7485 + }, + { + "epoch": 15.571725571725572, + "grad_norm": 0.14761626720428467, + "learning_rate": 4.579873200419391e-05, + "loss": 0.1378, + "num_input_tokens_seen": 5375864, + "step": 7490 + }, + { + "epoch": 15.582120582120583, + "grad_norm": 0.2742709517478943, + "learning_rate": 4.5793283156916046e-05, + "loss": 0.1189, + "num_input_tokens_seen": 5379384, + "step": 7495 + }, + { + "epoch": 15.592515592515593, + "grad_norm": 0.36745685338974, + "learning_rate": 4.578783110305261e-05, + "loss": 0.1459, + "num_input_tokens_seen": 5382904, + "step": 7500 + }, + { + "epoch": 15.602910602910603, + "grad_norm": 0.33181479573249817, + "learning_rate": 4.578237584344438e-05, + "loss": 0.1428, + "num_input_tokens_seen": 5386456, + "step": 7505 + }, + { + "epoch": 15.613305613305613, + "grad_norm": 0.26788124442100525, + "learning_rate": 4.577691737893263e-05, + "loss": 0.1612, + "num_input_tokens_seen": 5390232, + "step": 7510 + }, + { + "epoch": 15.623700623700623, + "grad_norm": 0.21255354583263397, + "learning_rate": 4.577145571035912e-05, + "loss": 0.107, + "num_input_tokens_seen": 5393848, + "step": 7515 + }, + { + "epoch": 15.634095634095633, + "grad_norm": 0.3717169165611267, + "learning_rate": 4.576599083856611e-05, + "loss": 0.118, + "num_input_tokens_seen": 5397304, + "step": 7520 + }, + { + "epoch": 15.644490644490645, + "grad_norm": 0.19069357216358185, + "learning_rate": 4.576052276439635e-05, + "loss": 0.1254, + "num_input_tokens_seen": 5400888, + "step": 7525 + }, + { + "epoch": 15.654885654885655, + "grad_norm": 0.3654121160507202, + "learning_rate": 4.575505148869308e-05, + "loss": 0.1462, + "num_input_tokens_seen": 5404312, + "step": 7530 + }, + { + "epoch": 15.665280665280665, + "grad_norm": 0.3157210052013397, + "learning_rate": 4.574957701230006e-05, + "loss": 0.1384, + "num_input_tokens_seen": 5407992, + "step": 7535 + }, + { + "epoch": 15.675675675675675, + "grad_norm": 0.3912818729877472, + "learning_rate": 4.57440993360615e-05, + "loss": 0.1057, + "num_input_tokens_seen": 5411704, + "step": 7540 + }, + { + "epoch": 15.686070686070686, + "grad_norm": 0.46024301648139954, + "learning_rate": 4.5738618460822134e-05, + "loss": 0.1242, + "num_input_tokens_seen": 5415320, + "step": 7545 + }, + { + "epoch": 15.696465696465696, + "grad_norm": 0.2476300448179245, + "learning_rate": 4.573313438742719e-05, + "loss": 0.111, + "num_input_tokens_seen": 5418968, + "step": 7550 + }, + { + "epoch": 15.706860706860708, + "grad_norm": 0.19124750792980194, + "learning_rate": 4.5727647116722374e-05, + "loss": 0.2065, + "num_input_tokens_seen": 5422552, + "step": 7555 + }, + { + "epoch": 15.717255717255718, + "grad_norm": 0.3723098039627075, + "learning_rate": 4.5722156649553884e-05, + "loss": 0.0974, + "num_input_tokens_seen": 5426104, + "step": 7560 + }, + { + "epoch": 15.727650727650728, + "grad_norm": 0.4009799659252167, + "learning_rate": 4.571666298676843e-05, + "loss": 0.1087, + "num_input_tokens_seen": 5429656, + "step": 7565 + }, + { + "epoch": 15.738045738045738, + "grad_norm": 0.18270964920520782, + "learning_rate": 4.571116612921321e-05, + "loss": 0.1347, + "num_input_tokens_seen": 5433304, + "step": 7570 + }, + { + "epoch": 15.748440748440748, + "grad_norm": 0.30291947722435, + "learning_rate": 4.57056660777359e-05, + "loss": 0.1484, + "num_input_tokens_seen": 5436856, + "step": 7575 + }, + { + "epoch": 15.758835758835758, + "grad_norm": 0.1630444973707199, + "learning_rate": 4.5700162833184666e-05, + "loss": 0.129, + "num_input_tokens_seen": 5440472, + "step": 7580 + }, + { + "epoch": 15.76923076923077, + "grad_norm": 0.33605462312698364, + "learning_rate": 4.5694656396408195e-05, + "loss": 0.137, + "num_input_tokens_seen": 5443960, + "step": 7585 + }, + { + "epoch": 15.77962577962578, + "grad_norm": 0.5480735301971436, + "learning_rate": 4.5689146768255646e-05, + "loss": 0.1506, + "num_input_tokens_seen": 5447704, + "step": 7590 + }, + { + "epoch": 15.79002079002079, + "grad_norm": 0.2373262196779251, + "learning_rate": 4.568363394957667e-05, + "loss": 0.1135, + "num_input_tokens_seen": 5451064, + "step": 7595 + }, + { + "epoch": 15.8004158004158, + "grad_norm": 0.1262313276529312, + "learning_rate": 4.567811794122141e-05, + "loss": 0.1145, + "num_input_tokens_seen": 5454712, + "step": 7600 + }, + { + "epoch": 15.8004158004158, + "eval_loss": 0.15012232959270477, + "eval_runtime": 7.7656, + "eval_samples_per_second": 110.229, + "eval_steps_per_second": 27.557, + "num_input_tokens_seen": 5454712, + "step": 7600 + }, + { + "epoch": 15.81081081081081, + "grad_norm": 0.2939552366733551, + "learning_rate": 4.56725987440405e-05, + "loss": 0.1269, + "num_input_tokens_seen": 5458264, + "step": 7605 + }, + { + "epoch": 15.82120582120582, + "grad_norm": 0.3041917383670807, + "learning_rate": 4.566707635888508e-05, + "loss": 0.1227, + "num_input_tokens_seen": 5461784, + "step": 7610 + }, + { + "epoch": 15.83160083160083, + "grad_norm": 0.09942673146724701, + "learning_rate": 4.566155078660677e-05, + "loss": 0.1723, + "num_input_tokens_seen": 5465528, + "step": 7615 + }, + { + "epoch": 15.841995841995843, + "grad_norm": 0.201682910323143, + "learning_rate": 4.565602202805768e-05, + "loss": 0.1104, + "num_input_tokens_seen": 5469080, + "step": 7620 + }, + { + "epoch": 15.852390852390853, + "grad_norm": 0.5534563064575195, + "learning_rate": 4.56504900840904e-05, + "loss": 0.1252, + "num_input_tokens_seen": 5472728, + "step": 7625 + }, + { + "epoch": 15.862785862785863, + "grad_norm": 0.2137930691242218, + "learning_rate": 4.564495495555805e-05, + "loss": 0.0898, + "num_input_tokens_seen": 5476216, + "step": 7630 + }, + { + "epoch": 15.873180873180873, + "grad_norm": 0.13802997767925262, + "learning_rate": 4.5639416643314204e-05, + "loss": 0.1269, + "num_input_tokens_seen": 5479832, + "step": 7635 + }, + { + "epoch": 15.883575883575883, + "grad_norm": 0.16616110503673553, + "learning_rate": 4.5633875148212946e-05, + "loss": 0.1611, + "num_input_tokens_seen": 5483384, + "step": 7640 + }, + { + "epoch": 15.893970893970893, + "grad_norm": 0.20408688485622406, + "learning_rate": 4.562833047110883e-05, + "loss": 0.1367, + "num_input_tokens_seen": 5486936, + "step": 7645 + }, + { + "epoch": 15.904365904365905, + "grad_norm": 0.21497687697410583, + "learning_rate": 4.5622782612856923e-05, + "loss": 0.1252, + "num_input_tokens_seen": 5490360, + "step": 7650 + }, + { + "epoch": 15.914760914760915, + "grad_norm": 0.22410665452480316, + "learning_rate": 4.561723157431278e-05, + "loss": 0.1395, + "num_input_tokens_seen": 5493720, + "step": 7655 + }, + { + "epoch": 15.925155925155925, + "grad_norm": 0.277241975069046, + "learning_rate": 4.5611677356332435e-05, + "loss": 0.1081, + "num_input_tokens_seen": 5497272, + "step": 7660 + }, + { + "epoch": 15.935550935550935, + "grad_norm": 0.579868495464325, + "learning_rate": 4.560611995977242e-05, + "loss": 0.1337, + "num_input_tokens_seen": 5500888, + "step": 7665 + }, + { + "epoch": 15.945945945945946, + "grad_norm": 0.48044201731681824, + "learning_rate": 4.560055938548975e-05, + "loss": 0.1441, + "num_input_tokens_seen": 5504440, + "step": 7670 + }, + { + "epoch": 15.956340956340956, + "grad_norm": 0.26731762290000916, + "learning_rate": 4.5594995634341944e-05, + "loss": 0.116, + "num_input_tokens_seen": 5508056, + "step": 7675 + }, + { + "epoch": 15.966735966735968, + "grad_norm": 0.347577840089798, + "learning_rate": 4.5589428707187e-05, + "loss": 0.1703, + "num_input_tokens_seen": 5511896, + "step": 7680 + }, + { + "epoch": 15.977130977130978, + "grad_norm": 0.4147574007511139, + "learning_rate": 4.55838586048834e-05, + "loss": 0.1233, + "num_input_tokens_seen": 5515608, + "step": 7685 + }, + { + "epoch": 15.987525987525988, + "grad_norm": 0.2657991051673889, + "learning_rate": 4.557828532829013e-05, + "loss": 0.1106, + "num_input_tokens_seen": 5519256, + "step": 7690 + }, + { + "epoch": 15.997920997920998, + "grad_norm": 0.2465164065361023, + "learning_rate": 4.557270887826667e-05, + "loss": 0.1233, + "num_input_tokens_seen": 5523000, + "step": 7695 + }, + { + "epoch": 16.008316008316008, + "grad_norm": 0.3795037567615509, + "learning_rate": 4.556712925567296e-05, + "loss": 0.1127, + "num_input_tokens_seen": 5526736, + "step": 7700 + }, + { + "epoch": 16.01871101871102, + "grad_norm": 0.2679063081741333, + "learning_rate": 4.5561546461369454e-05, + "loss": 0.0972, + "num_input_tokens_seen": 5530608, + "step": 7705 + }, + { + "epoch": 16.02910602910603, + "grad_norm": 0.26509949564933777, + "learning_rate": 4.55559604962171e-05, + "loss": 0.1451, + "num_input_tokens_seen": 5534256, + "step": 7710 + }, + { + "epoch": 16.03950103950104, + "grad_norm": 0.2693500518798828, + "learning_rate": 4.55503713610773e-05, + "loss": 0.1218, + "num_input_tokens_seen": 5537712, + "step": 7715 + }, + { + "epoch": 16.04989604989605, + "grad_norm": 0.361881285905838, + "learning_rate": 4.5544779056812e-05, + "loss": 0.1387, + "num_input_tokens_seen": 5541232, + "step": 7720 + }, + { + "epoch": 16.06029106029106, + "grad_norm": 0.27428871393203735, + "learning_rate": 4.553918358428358e-05, + "loss": 0.146, + "num_input_tokens_seen": 5544752, + "step": 7725 + }, + { + "epoch": 16.070686070686072, + "grad_norm": 0.1578449010848999, + "learning_rate": 4.553358494435494e-05, + "loss": 0.1443, + "num_input_tokens_seen": 5548304, + "step": 7730 + }, + { + "epoch": 16.08108108108108, + "grad_norm": 0.8290319442749023, + "learning_rate": 4.5527983137889464e-05, + "loss": 0.1363, + "num_input_tokens_seen": 5552016, + "step": 7735 + }, + { + "epoch": 16.091476091476093, + "grad_norm": 0.20306752622127533, + "learning_rate": 4.5522378165751015e-05, + "loss": 0.0848, + "num_input_tokens_seen": 5555408, + "step": 7740 + }, + { + "epoch": 16.1018711018711, + "grad_norm": 0.2236398309469223, + "learning_rate": 4.5516770028803954e-05, + "loss": 0.1453, + "num_input_tokens_seen": 5558928, + "step": 7745 + }, + { + "epoch": 16.112266112266113, + "grad_norm": 0.3536190092563629, + "learning_rate": 4.5511158727913116e-05, + "loss": 0.0992, + "num_input_tokens_seen": 5562480, + "step": 7750 + }, + { + "epoch": 16.12266112266112, + "grad_norm": 0.4012061357498169, + "learning_rate": 4.5505544263943856e-05, + "loss": 0.145, + "num_input_tokens_seen": 5566096, + "step": 7755 + }, + { + "epoch": 16.133056133056133, + "grad_norm": 0.33367055654525757, + "learning_rate": 4.549992663776197e-05, + "loss": 0.1429, + "num_input_tokens_seen": 5569744, + "step": 7760 + }, + { + "epoch": 16.143451143451145, + "grad_norm": 0.1987580955028534, + "learning_rate": 4.5494305850233786e-05, + "loss": 0.1164, + "num_input_tokens_seen": 5573264, + "step": 7765 + }, + { + "epoch": 16.153846153846153, + "grad_norm": 0.6922571063041687, + "learning_rate": 4.5488681902226094e-05, + "loss": 0.1259, + "num_input_tokens_seen": 5576848, + "step": 7770 + }, + { + "epoch": 16.164241164241165, + "grad_norm": 0.3358442783355713, + "learning_rate": 4.5483054794606174e-05, + "loss": 0.1312, + "num_input_tokens_seen": 5580496, + "step": 7775 + }, + { + "epoch": 16.174636174636174, + "grad_norm": 0.25079625844955444, + "learning_rate": 4.547742452824179e-05, + "loss": 0.1366, + "num_input_tokens_seen": 5584112, + "step": 7780 + }, + { + "epoch": 16.185031185031185, + "grad_norm": 0.18513938784599304, + "learning_rate": 4.5471791104001215e-05, + "loss": 0.1362, + "num_input_tokens_seen": 5587856, + "step": 7785 + }, + { + "epoch": 16.195426195426194, + "grad_norm": 0.7227036952972412, + "learning_rate": 4.546615452275319e-05, + "loss": 0.157, + "num_input_tokens_seen": 5591344, + "step": 7790 + }, + { + "epoch": 16.205821205821206, + "grad_norm": 0.24080996215343475, + "learning_rate": 4.5460514785366944e-05, + "loss": 0.1608, + "num_input_tokens_seen": 5594928, + "step": 7795 + }, + { + "epoch": 16.216216216216218, + "grad_norm": 0.18569394946098328, + "learning_rate": 4.545487189271219e-05, + "loss": 0.1116, + "num_input_tokens_seen": 5598576, + "step": 7800 + }, + { + "epoch": 16.216216216216218, + "eval_loss": 0.158003568649292, + "eval_runtime": 7.7521, + "eval_samples_per_second": 110.421, + "eval_steps_per_second": 27.605, + "num_input_tokens_seen": 5598576, + "step": 7800 + }, + { + "epoch": 16.226611226611226, + "grad_norm": 0.20471103489398956, + "learning_rate": 4.544922584565914e-05, + "loss": 0.1159, + "num_input_tokens_seen": 5602256, + "step": 7805 + }, + { + "epoch": 16.237006237006238, + "grad_norm": 0.30852505564689636, + "learning_rate": 4.544357664507848e-05, + "loss": 0.11, + "num_input_tokens_seen": 5605776, + "step": 7810 + }, + { + "epoch": 16.247401247401246, + "grad_norm": 0.41783273220062256, + "learning_rate": 4.54379242918414e-05, + "loss": 0.1742, + "num_input_tokens_seen": 5609392, + "step": 7815 + }, + { + "epoch": 16.257796257796258, + "grad_norm": 0.5217536091804504, + "learning_rate": 4.543226878681955e-05, + "loss": 0.15, + "num_input_tokens_seen": 5612912, + "step": 7820 + }, + { + "epoch": 16.26819126819127, + "grad_norm": 0.24963288009166718, + "learning_rate": 4.5426610130885087e-05, + "loss": 0.1223, + "num_input_tokens_seen": 5616528, + "step": 7825 + }, + { + "epoch": 16.27858627858628, + "grad_norm": 0.2086094170808792, + "learning_rate": 4.542094832491064e-05, + "loss": 0.1897, + "num_input_tokens_seen": 5620208, + "step": 7830 + }, + { + "epoch": 16.28898128898129, + "grad_norm": 0.24440467357635498, + "learning_rate": 4.541528336976934e-05, + "loss": 0.1281, + "num_input_tokens_seen": 5623888, + "step": 7835 + }, + { + "epoch": 16.2993762993763, + "grad_norm": 0.23689942061901093, + "learning_rate": 4.540961526633479e-05, + "loss": 0.1051, + "num_input_tokens_seen": 5627600, + "step": 7840 + }, + { + "epoch": 16.30977130977131, + "grad_norm": 0.33180660009384155, + "learning_rate": 4.540394401548108e-05, + "loss": 0.1056, + "num_input_tokens_seen": 5631120, + "step": 7845 + }, + { + "epoch": 16.32016632016632, + "grad_norm": 0.15424764156341553, + "learning_rate": 4.539826961808279e-05, + "loss": 0.1408, + "num_input_tokens_seen": 5634704, + "step": 7850 + }, + { + "epoch": 16.33056133056133, + "grad_norm": 0.3079388439655304, + "learning_rate": 4.5392592075014994e-05, + "loss": 0.1469, + "num_input_tokens_seen": 5638224, + "step": 7855 + }, + { + "epoch": 16.340956340956343, + "grad_norm": 0.46974390745162964, + "learning_rate": 4.538691138715322e-05, + "loss": 0.1746, + "num_input_tokens_seen": 5641744, + "step": 7860 + }, + { + "epoch": 16.35135135135135, + "grad_norm": 0.3071437180042267, + "learning_rate": 4.5381227555373516e-05, + "loss": 0.1228, + "num_input_tokens_seen": 5645360, + "step": 7865 + }, + { + "epoch": 16.361746361746363, + "grad_norm": 0.34040701389312744, + "learning_rate": 4.537554058055239e-05, + "loss": 0.1057, + "num_input_tokens_seen": 5648880, + "step": 7870 + }, + { + "epoch": 16.37214137214137, + "grad_norm": 0.5405741930007935, + "learning_rate": 4.5369850463566865e-05, + "loss": 0.1566, + "num_input_tokens_seen": 5652496, + "step": 7875 + }, + { + "epoch": 16.382536382536383, + "grad_norm": 0.15367886424064636, + "learning_rate": 4.5364157205294404e-05, + "loss": 0.1133, + "num_input_tokens_seen": 5655984, + "step": 7880 + }, + { + "epoch": 16.39293139293139, + "grad_norm": 0.187472864985466, + "learning_rate": 4.5358460806612996e-05, + "loss": 0.1194, + "num_input_tokens_seen": 5659504, + "step": 7885 + }, + { + "epoch": 16.403326403326403, + "grad_norm": 0.26851630210876465, + "learning_rate": 4.535276126840109e-05, + "loss": 0.1432, + "num_input_tokens_seen": 5663056, + "step": 7890 + }, + { + "epoch": 16.413721413721415, + "grad_norm": 0.2202342003583908, + "learning_rate": 4.5347058591537626e-05, + "loss": 0.1619, + "num_input_tokens_seen": 5666736, + "step": 7895 + }, + { + "epoch": 16.424116424116423, + "grad_norm": 0.389209121465683, + "learning_rate": 4.534135277690203e-05, + "loss": 0.1196, + "num_input_tokens_seen": 5670288, + "step": 7900 + }, + { + "epoch": 16.434511434511435, + "grad_norm": 0.15304096043109894, + "learning_rate": 4.533564382537421e-05, + "loss": 0.0988, + "num_input_tokens_seen": 5674032, + "step": 7905 + }, + { + "epoch": 16.444906444906444, + "grad_norm": 0.2746342122554779, + "learning_rate": 4.532993173783456e-05, + "loss": 0.1627, + "num_input_tokens_seen": 5677616, + "step": 7910 + }, + { + "epoch": 16.455301455301456, + "grad_norm": 0.1704639345407486, + "learning_rate": 4.5324216515163954e-05, + "loss": 0.1397, + "num_input_tokens_seen": 5681200, + "step": 7915 + }, + { + "epoch": 16.465696465696467, + "grad_norm": 0.41125622391700745, + "learning_rate": 4.531849815824375e-05, + "loss": 0.1283, + "num_input_tokens_seen": 5684848, + "step": 7920 + }, + { + "epoch": 16.476091476091476, + "grad_norm": 0.37067678570747375, + "learning_rate": 4.5312776667955795e-05, + "loss": 0.1059, + "num_input_tokens_seen": 5688368, + "step": 7925 + }, + { + "epoch": 16.486486486486488, + "grad_norm": 0.3914305865764618, + "learning_rate": 4.5307052045182405e-05, + "loss": 0.1073, + "num_input_tokens_seen": 5691888, + "step": 7930 + }, + { + "epoch": 16.496881496881496, + "grad_norm": 0.21004106104373932, + "learning_rate": 4.53013242908064e-05, + "loss": 0.1202, + "num_input_tokens_seen": 5695600, + "step": 7935 + }, + { + "epoch": 16.507276507276508, + "grad_norm": 0.3609844446182251, + "learning_rate": 4.529559340571107e-05, + "loss": 0.1442, + "num_input_tokens_seen": 5699088, + "step": 7940 + }, + { + "epoch": 16.517671517671516, + "grad_norm": 0.3602500259876251, + "learning_rate": 4.528985939078018e-05, + "loss": 0.1779, + "num_input_tokens_seen": 5702672, + "step": 7945 + }, + { + "epoch": 16.528066528066528, + "grad_norm": 0.6075959205627441, + "learning_rate": 4.5284122246898e-05, + "loss": 0.1422, + "num_input_tokens_seen": 5706256, + "step": 7950 + }, + { + "epoch": 16.53846153846154, + "grad_norm": 0.17445078492164612, + "learning_rate": 4.527838197494926e-05, + "loss": 0.1158, + "num_input_tokens_seen": 5709744, + "step": 7955 + }, + { + "epoch": 16.54885654885655, + "grad_norm": 0.4522636830806732, + "learning_rate": 4.527263857581918e-05, + "loss": 0.1164, + "num_input_tokens_seen": 5713392, + "step": 7960 + }, + { + "epoch": 16.55925155925156, + "grad_norm": 0.22737188637256622, + "learning_rate": 4.526689205039347e-05, + "loss": 0.1221, + "num_input_tokens_seen": 5716816, + "step": 7965 + }, + { + "epoch": 16.56964656964657, + "grad_norm": 0.22274766862392426, + "learning_rate": 4.5261142399558324e-05, + "loss": 0.1405, + "num_input_tokens_seen": 5720464, + "step": 7970 + }, + { + "epoch": 16.58004158004158, + "grad_norm": 0.5564790964126587, + "learning_rate": 4.525538962420041e-05, + "loss": 0.1195, + "num_input_tokens_seen": 5723920, + "step": 7975 + }, + { + "epoch": 16.59043659043659, + "grad_norm": 0.39676564931869507, + "learning_rate": 4.524963372520685e-05, + "loss": 0.1283, + "num_input_tokens_seen": 5727408, + "step": 7980 + }, + { + "epoch": 16.6008316008316, + "grad_norm": 0.1995701640844345, + "learning_rate": 4.524387470346531e-05, + "loss": 0.1415, + "num_input_tokens_seen": 5731024, + "step": 7985 + }, + { + "epoch": 16.611226611226613, + "grad_norm": 0.19246092438697815, + "learning_rate": 4.5238112559863885e-05, + "loss": 0.0905, + "num_input_tokens_seen": 5734608, + "step": 7990 + }, + { + "epoch": 16.62162162162162, + "grad_norm": 0.18629080057144165, + "learning_rate": 4.5232347295291175e-05, + "loss": 0.0909, + "num_input_tokens_seen": 5738288, + "step": 7995 + }, + { + "epoch": 16.632016632016633, + "grad_norm": 0.7035895586013794, + "learning_rate": 4.522657891063626e-05, + "loss": 0.1438, + "num_input_tokens_seen": 5741776, + "step": 8000 + }, + { + "epoch": 16.632016632016633, + "eval_loss": 0.15470466017723083, + "eval_runtime": 7.7516, + "eval_samples_per_second": 110.429, + "eval_steps_per_second": 27.607, + "num_input_tokens_seen": 5741776, + "step": 8000 + }, + { + "epoch": 16.64241164241164, + "grad_norm": 0.43114879727363586, + "learning_rate": 4.52208074067887e-05, + "loss": 0.167, + "num_input_tokens_seen": 5745424, + "step": 8005 + }, + { + "epoch": 16.652806652806653, + "grad_norm": 0.17195121943950653, + "learning_rate": 4.5215032784638516e-05, + "loss": 0.1227, + "num_input_tokens_seen": 5749200, + "step": 8010 + }, + { + "epoch": 16.66320166320166, + "grad_norm": 0.44245895743370056, + "learning_rate": 4.5209255045076245e-05, + "loss": 0.158, + "num_input_tokens_seen": 5752912, + "step": 8015 + }, + { + "epoch": 16.673596673596673, + "grad_norm": 0.18354246020317078, + "learning_rate": 4.5203474188992875e-05, + "loss": 0.1043, + "num_input_tokens_seen": 5756560, + "step": 8020 + }, + { + "epoch": 16.683991683991685, + "grad_norm": 0.3171703815460205, + "learning_rate": 4.51976902172799e-05, + "loss": 0.1036, + "num_input_tokens_seen": 5760144, + "step": 8025 + }, + { + "epoch": 16.694386694386694, + "grad_norm": 0.15507130324840546, + "learning_rate": 4.519190313082927e-05, + "loss": 0.1398, + "num_input_tokens_seen": 5763632, + "step": 8030 + }, + { + "epoch": 16.704781704781706, + "grad_norm": 0.518431544303894, + "learning_rate": 4.518611293053343e-05, + "loss": 0.0926, + "num_input_tokens_seen": 5767184, + "step": 8035 + }, + { + "epoch": 16.715176715176714, + "grad_norm": 0.31999367475509644, + "learning_rate": 4.51803196172853e-05, + "loss": 0.107, + "num_input_tokens_seen": 5770768, + "step": 8040 + }, + { + "epoch": 16.725571725571726, + "grad_norm": 0.3604424297809601, + "learning_rate": 4.517452319197828e-05, + "loss": 0.1589, + "num_input_tokens_seen": 5774384, + "step": 8045 + }, + { + "epoch": 16.735966735966738, + "grad_norm": 0.21960976719856262, + "learning_rate": 4.5168723655506265e-05, + "loss": 0.1479, + "num_input_tokens_seen": 5778160, + "step": 8050 + }, + { + "epoch": 16.746361746361746, + "grad_norm": 0.22522984445095062, + "learning_rate": 4.51629210087636e-05, + "loss": 0.1162, + "num_input_tokens_seen": 5781744, + "step": 8055 + }, + { + "epoch": 16.756756756756758, + "grad_norm": 0.49326038360595703, + "learning_rate": 4.515711525264513e-05, + "loss": 0.1111, + "num_input_tokens_seen": 5785296, + "step": 8060 + }, + { + "epoch": 16.767151767151766, + "grad_norm": 0.3223242461681366, + "learning_rate": 4.5151306388046175e-05, + "loss": 0.1658, + "num_input_tokens_seen": 5788944, + "step": 8065 + }, + { + "epoch": 16.777546777546778, + "grad_norm": 0.2735491991043091, + "learning_rate": 4.514549441586255e-05, + "loss": 0.119, + "num_input_tokens_seen": 5792304, + "step": 8070 + }, + { + "epoch": 16.787941787941786, + "grad_norm": 0.2077500969171524, + "learning_rate": 4.513967933699051e-05, + "loss": 0.1407, + "num_input_tokens_seen": 5796016, + "step": 8075 + }, + { + "epoch": 16.7983367983368, + "grad_norm": 0.1972196102142334, + "learning_rate": 4.513386115232684e-05, + "loss": 0.1636, + "num_input_tokens_seen": 5799696, + "step": 8080 + }, + { + "epoch": 16.80873180873181, + "grad_norm": 0.18948428332805634, + "learning_rate": 4.5128039862768745e-05, + "loss": 0.1502, + "num_input_tokens_seen": 5803440, + "step": 8085 + }, + { + "epoch": 16.81912681912682, + "grad_norm": 0.23113365471363068, + "learning_rate": 4.512221546921397e-05, + "loss": 0.1516, + "num_input_tokens_seen": 5807088, + "step": 8090 + }, + { + "epoch": 16.82952182952183, + "grad_norm": 0.18126651644706726, + "learning_rate": 4.5116387972560694e-05, + "loss": 0.1079, + "num_input_tokens_seen": 5810768, + "step": 8095 + }, + { + "epoch": 16.83991683991684, + "grad_norm": 0.25146523118019104, + "learning_rate": 4.511055737370759e-05, + "loss": 0.1114, + "num_input_tokens_seen": 5814416, + "step": 8100 + }, + { + "epoch": 16.85031185031185, + "grad_norm": 0.5408759117126465, + "learning_rate": 4.510472367355383e-05, + "loss": 0.1178, + "num_input_tokens_seen": 5817840, + "step": 8105 + }, + { + "epoch": 16.86070686070686, + "grad_norm": 0.16442124545574188, + "learning_rate": 4.509888687299901e-05, + "loss": 0.1005, + "num_input_tokens_seen": 5821584, + "step": 8110 + }, + { + "epoch": 16.87110187110187, + "grad_norm": 0.12501128017902374, + "learning_rate": 4.5093046972943266e-05, + "loss": 0.1227, + "num_input_tokens_seen": 5825136, + "step": 8115 + }, + { + "epoch": 16.881496881496883, + "grad_norm": 0.3090190589427948, + "learning_rate": 4.508720397428717e-05, + "loss": 0.0913, + "num_input_tokens_seen": 5828624, + "step": 8120 + }, + { + "epoch": 16.89189189189189, + "grad_norm": 0.3122842311859131, + "learning_rate": 4.508135787793178e-05, + "loss": 0.1035, + "num_input_tokens_seen": 5832112, + "step": 8125 + }, + { + "epoch": 16.902286902286903, + "grad_norm": 0.19587986171245575, + "learning_rate": 4.5075508684778664e-05, + "loss": 0.0991, + "num_input_tokens_seen": 5835760, + "step": 8130 + }, + { + "epoch": 16.91268191268191, + "grad_norm": 0.24527576565742493, + "learning_rate": 4.506965639572982e-05, + "loss": 0.1477, + "num_input_tokens_seen": 5839376, + "step": 8135 + }, + { + "epoch": 16.923076923076923, + "grad_norm": 0.3255420923233032, + "learning_rate": 4.506380101168774e-05, + "loss": 0.127, + "num_input_tokens_seen": 5843088, + "step": 8140 + }, + { + "epoch": 16.933471933471935, + "grad_norm": 0.18673036992549896, + "learning_rate": 4.505794253355542e-05, + "loss": 0.0968, + "num_input_tokens_seen": 5846608, + "step": 8145 + }, + { + "epoch": 16.943866943866944, + "grad_norm": 0.37442290782928467, + "learning_rate": 4.5052080962236286e-05, + "loss": 0.1099, + "num_input_tokens_seen": 5850128, + "step": 8150 + }, + { + "epoch": 16.954261954261955, + "grad_norm": 0.45604372024536133, + "learning_rate": 4.504621629863428e-05, + "loss": 0.1614, + "num_input_tokens_seen": 5853744, + "step": 8155 + }, + { + "epoch": 16.964656964656964, + "grad_norm": 0.272945761680603, + "learning_rate": 4.504034854365381e-05, + "loss": 0.2158, + "num_input_tokens_seen": 5857296, + "step": 8160 + }, + { + "epoch": 16.975051975051976, + "grad_norm": 0.23323193192481995, + "learning_rate": 4.503447769819974e-05, + "loss": 0.1311, + "num_input_tokens_seen": 5860944, + "step": 8165 + }, + { + "epoch": 16.985446985446984, + "grad_norm": 0.18863815069198608, + "learning_rate": 4.502860376317745e-05, + "loss": 0.1247, + "num_input_tokens_seen": 5864496, + "step": 8170 + }, + { + "epoch": 16.995841995841996, + "grad_norm": 0.7487926483154297, + "learning_rate": 4.502272673949276e-05, + "loss": 0.1744, + "num_input_tokens_seen": 5868048, + "step": 8175 + }, + { + "epoch": 17.006237006237008, + "grad_norm": 0.379686564207077, + "learning_rate": 4.501684662805199e-05, + "loss": 0.1446, + "num_input_tokens_seen": 5871496, + "step": 8180 + }, + { + "epoch": 17.016632016632016, + "grad_norm": 0.3587222099304199, + "learning_rate": 4.5010963429761924e-05, + "loss": 0.1512, + "num_input_tokens_seen": 5875304, + "step": 8185 + }, + { + "epoch": 17.027027027027028, + "grad_norm": 0.4435117840766907, + "learning_rate": 4.500507714552982e-05, + "loss": 0.1144, + "num_input_tokens_seen": 5878760, + "step": 8190 + }, + { + "epoch": 17.037422037422036, + "grad_norm": 0.164734348654747, + "learning_rate": 4.499918777626342e-05, + "loss": 0.0981, + "num_input_tokens_seen": 5882440, + "step": 8195 + }, + { + "epoch": 17.04781704781705, + "grad_norm": 0.2558342218399048, + "learning_rate": 4.499329532287093e-05, + "loss": 0.1108, + "num_input_tokens_seen": 5885896, + "step": 8200 + }, + { + "epoch": 17.04781704781705, + "eval_loss": 0.15302522480487823, + "eval_runtime": 7.749, + "eval_samples_per_second": 110.466, + "eval_steps_per_second": 27.616, + "num_input_tokens_seen": 5885896, + "step": 8200 + }, + { + "epoch": 17.058212058212057, + "grad_norm": 0.38229796290397644, + "learning_rate": 4.4987399786261064e-05, + "loss": 0.1171, + "num_input_tokens_seen": 5889544, + "step": 8205 + }, + { + "epoch": 17.06860706860707, + "grad_norm": 0.44392943382263184, + "learning_rate": 4.498150116734297e-05, + "loss": 0.1157, + "num_input_tokens_seen": 5893224, + "step": 8210 + }, + { + "epoch": 17.07900207900208, + "grad_norm": 0.20662528276443481, + "learning_rate": 4.4975599467026294e-05, + "loss": 0.1448, + "num_input_tokens_seen": 5896904, + "step": 8215 + }, + { + "epoch": 17.08939708939709, + "grad_norm": 0.48409250378608704, + "learning_rate": 4.496969468622114e-05, + "loss": 0.1267, + "num_input_tokens_seen": 5900488, + "step": 8220 + }, + { + "epoch": 17.0997920997921, + "grad_norm": 0.1805739551782608, + "learning_rate": 4.496378682583813e-05, + "loss": 0.1207, + "num_input_tokens_seen": 5904136, + "step": 8225 + }, + { + "epoch": 17.11018711018711, + "grad_norm": 0.2706165313720703, + "learning_rate": 4.495787588678829e-05, + "loss": 0.1241, + "num_input_tokens_seen": 5907688, + "step": 8230 + }, + { + "epoch": 17.12058212058212, + "grad_norm": 0.3747027516365051, + "learning_rate": 4.4951961869983196e-05, + "loss": 0.1136, + "num_input_tokens_seen": 5911368, + "step": 8235 + }, + { + "epoch": 17.13097713097713, + "grad_norm": 0.16602662205696106, + "learning_rate": 4.494604477633485e-05, + "loss": 0.0815, + "num_input_tokens_seen": 5915080, + "step": 8240 + }, + { + "epoch": 17.14137214137214, + "grad_norm": 0.45122742652893066, + "learning_rate": 4.4940124606755734e-05, + "loss": 0.1335, + "num_input_tokens_seen": 5918696, + "step": 8245 + }, + { + "epoch": 17.151767151767153, + "grad_norm": 0.3816382884979248, + "learning_rate": 4.493420136215882e-05, + "loss": 0.1114, + "num_input_tokens_seen": 5922280, + "step": 8250 + }, + { + "epoch": 17.16216216216216, + "grad_norm": 0.39621463418006897, + "learning_rate": 4.492827504345756e-05, + "loss": 0.1058, + "num_input_tokens_seen": 5925800, + "step": 8255 + }, + { + "epoch": 17.172557172557173, + "grad_norm": 0.17244690656661987, + "learning_rate": 4.492234565156584e-05, + "loss": 0.1082, + "num_input_tokens_seen": 5929512, + "step": 8260 + }, + { + "epoch": 17.18295218295218, + "grad_norm": 0.16138780117034912, + "learning_rate": 4.491641318739807e-05, + "loss": 0.1267, + "num_input_tokens_seen": 5933192, + "step": 8265 + }, + { + "epoch": 17.193347193347194, + "grad_norm": 0.22833523154258728, + "learning_rate": 4.4910477651869096e-05, + "loss": 0.1147, + "num_input_tokens_seen": 5936776, + "step": 8270 + }, + { + "epoch": 17.203742203742205, + "grad_norm": 0.18812189996242523, + "learning_rate": 4.4904539045894254e-05, + "loss": 0.1212, + "num_input_tokens_seen": 5940488, + "step": 8275 + }, + { + "epoch": 17.214137214137214, + "grad_norm": 0.31602153182029724, + "learning_rate": 4.4898597370389364e-05, + "loss": 0.1471, + "num_input_tokens_seen": 5944200, + "step": 8280 + }, + { + "epoch": 17.224532224532226, + "grad_norm": 0.29228848218917847, + "learning_rate": 4.489265262627069e-05, + "loss": 0.1783, + "num_input_tokens_seen": 5947720, + "step": 8285 + }, + { + "epoch": 17.234927234927234, + "grad_norm": 0.42806485295295715, + "learning_rate": 4.488670481445499e-05, + "loss": 0.1305, + "num_input_tokens_seen": 5951336, + "step": 8290 + }, + { + "epoch": 17.245322245322246, + "grad_norm": 0.23182480037212372, + "learning_rate": 4.488075393585951e-05, + "loss": 0.1403, + "num_input_tokens_seen": 5955080, + "step": 8295 + }, + { + "epoch": 17.255717255717254, + "grad_norm": 0.13901939988136292, + "learning_rate": 4.487479999140193e-05, + "loss": 0.0948, + "num_input_tokens_seen": 5958632, + "step": 8300 + }, + { + "epoch": 17.266112266112266, + "grad_norm": 0.3635881543159485, + "learning_rate": 4.4868842982000425e-05, + "loss": 0.1316, + "num_input_tokens_seen": 5962120, + "step": 8305 + }, + { + "epoch": 17.276507276507278, + "grad_norm": 0.8919298648834229, + "learning_rate": 4.486288290857365e-05, + "loss": 0.1392, + "num_input_tokens_seen": 5965576, + "step": 8310 + }, + { + "epoch": 17.286902286902286, + "grad_norm": 0.13474144041538239, + "learning_rate": 4.4856919772040715e-05, + "loss": 0.1078, + "num_input_tokens_seen": 5969224, + "step": 8315 + }, + { + "epoch": 17.2972972972973, + "grad_norm": 0.1232575848698616, + "learning_rate": 4.485095357332122e-05, + "loss": 0.1341, + "num_input_tokens_seen": 5972936, + "step": 8320 + }, + { + "epoch": 17.307692307692307, + "grad_norm": 0.747658908367157, + "learning_rate": 4.484498431333521e-05, + "loss": 0.1486, + "num_input_tokens_seen": 5976520, + "step": 8325 + }, + { + "epoch": 17.31808731808732, + "grad_norm": 0.7327951788902283, + "learning_rate": 4.4839011993003245e-05, + "loss": 0.1945, + "num_input_tokens_seen": 5980072, + "step": 8330 + }, + { + "epoch": 17.328482328482327, + "grad_norm": 0.21796062588691711, + "learning_rate": 4.4833036613246305e-05, + "loss": 0.1462, + "num_input_tokens_seen": 5983720, + "step": 8335 + }, + { + "epoch": 17.33887733887734, + "grad_norm": 0.3164018988609314, + "learning_rate": 4.482705817498589e-05, + "loss": 0.1348, + "num_input_tokens_seen": 5987208, + "step": 8340 + }, + { + "epoch": 17.34927234927235, + "grad_norm": 0.5389726758003235, + "learning_rate": 4.4821076679143934e-05, + "loss": 0.1422, + "num_input_tokens_seen": 5990888, + "step": 8345 + }, + { + "epoch": 17.35966735966736, + "grad_norm": 0.8575286269187927, + "learning_rate": 4.481509212664288e-05, + "loss": 0.1297, + "num_input_tokens_seen": 5994472, + "step": 8350 + }, + { + "epoch": 17.37006237006237, + "grad_norm": 0.16591615974903107, + "learning_rate": 4.480910451840559e-05, + "loss": 0.0748, + "num_input_tokens_seen": 5998216, + "step": 8355 + }, + { + "epoch": 17.38045738045738, + "grad_norm": 0.45836734771728516, + "learning_rate": 4.480311385535546e-05, + "loss": 0.1579, + "num_input_tokens_seen": 6001672, + "step": 8360 + }, + { + "epoch": 17.39085239085239, + "grad_norm": 0.27935925126075745, + "learning_rate": 4.47971201384163e-05, + "loss": 0.1433, + "num_input_tokens_seen": 6005416, + "step": 8365 + }, + { + "epoch": 17.401247401247403, + "grad_norm": 0.20682895183563232, + "learning_rate": 4.4791123368512446e-05, + "loss": 0.1164, + "num_input_tokens_seen": 6009096, + "step": 8370 + }, + { + "epoch": 17.41164241164241, + "grad_norm": 0.26601865887641907, + "learning_rate": 4.478512354656864e-05, + "loss": 0.1705, + "num_input_tokens_seen": 6012712, + "step": 8375 + }, + { + "epoch": 17.422037422037423, + "grad_norm": 0.6892343759536743, + "learning_rate": 4.477912067351016e-05, + "loss": 0.1515, + "num_input_tokens_seen": 6016392, + "step": 8380 + }, + { + "epoch": 17.43243243243243, + "grad_norm": 0.5331936478614807, + "learning_rate": 4.477311475026271e-05, + "loss": 0.1249, + "num_input_tokens_seen": 6019912, + "step": 8385 + }, + { + "epoch": 17.442827442827443, + "grad_norm": 0.17212870717048645, + "learning_rate": 4.476710577775248e-05, + "loss": 0.0716, + "num_input_tokens_seen": 6023400, + "step": 8390 + }, + { + "epoch": 17.453222453222452, + "grad_norm": 0.49858617782592773, + "learning_rate": 4.476109375690612e-05, + "loss": 0.0901, + "num_input_tokens_seen": 6026920, + "step": 8395 + }, + { + "epoch": 17.463617463617464, + "grad_norm": 0.16159503161907196, + "learning_rate": 4.4755078688650784e-05, + "loss": 0.1097, + "num_input_tokens_seen": 6030472, + "step": 8400 + }, + { + "epoch": 17.463617463617464, + "eval_loss": 0.14560846984386444, + "eval_runtime": 7.7433, + "eval_samples_per_second": 110.547, + "eval_steps_per_second": 27.637, + "num_input_tokens_seen": 6030472, + "step": 8400 + }, + { + "epoch": 17.474012474012476, + "grad_norm": 0.2202703058719635, + "learning_rate": 4.474906057391406e-05, + "loss": 0.1326, + "num_input_tokens_seen": 6034088, + "step": 8405 + }, + { + "epoch": 17.484407484407484, + "grad_norm": 0.23857758939266205, + "learning_rate": 4.4743039413624e-05, + "loss": 0.1422, + "num_input_tokens_seen": 6037608, + "step": 8410 + }, + { + "epoch": 17.494802494802496, + "grad_norm": 0.45182809233665466, + "learning_rate": 4.473701520870916e-05, + "loss": 0.1349, + "num_input_tokens_seen": 6041128, + "step": 8415 + }, + { + "epoch": 17.505197505197504, + "grad_norm": 0.38359132409095764, + "learning_rate": 4.4730987960098544e-05, + "loss": 0.1462, + "num_input_tokens_seen": 6044776, + "step": 8420 + }, + { + "epoch": 17.515592515592516, + "grad_norm": 0.4218614399433136, + "learning_rate": 4.4724957668721635e-05, + "loss": 0.1204, + "num_input_tokens_seen": 6048456, + "step": 8425 + }, + { + "epoch": 17.525987525987524, + "grad_norm": 0.2625064253807068, + "learning_rate": 4.471892433550836e-05, + "loss": 0.1204, + "num_input_tokens_seen": 6051976, + "step": 8430 + }, + { + "epoch": 17.536382536382536, + "grad_norm": 0.18447045981884003, + "learning_rate": 4.471288796138916e-05, + "loss": 0.1221, + "num_input_tokens_seen": 6055592, + "step": 8435 + }, + { + "epoch": 17.546777546777548, + "grad_norm": 0.1446329802274704, + "learning_rate": 4.470684854729491e-05, + "loss": 0.1289, + "num_input_tokens_seen": 6059016, + "step": 8440 + }, + { + "epoch": 17.557172557172557, + "grad_norm": 0.19352523982524872, + "learning_rate": 4.4700806094156955e-05, + "loss": 0.1017, + "num_input_tokens_seen": 6062568, + "step": 8445 + }, + { + "epoch": 17.56756756756757, + "grad_norm": 0.8651833534240723, + "learning_rate": 4.469476060290713e-05, + "loss": 0.1372, + "num_input_tokens_seen": 6066152, + "step": 8450 + }, + { + "epoch": 17.577962577962577, + "grad_norm": 0.2728608548641205, + "learning_rate": 4.468871207447772e-05, + "loss": 0.1571, + "num_input_tokens_seen": 6069672, + "step": 8455 + }, + { + "epoch": 17.58835758835759, + "grad_norm": 0.25445520877838135, + "learning_rate": 4.4682660509801486e-05, + "loss": 0.1339, + "num_input_tokens_seen": 6073128, + "step": 8460 + }, + { + "epoch": 17.598752598752597, + "grad_norm": 0.2624770998954773, + "learning_rate": 4.467660590981165e-05, + "loss": 0.1328, + "num_input_tokens_seen": 6076680, + "step": 8465 + }, + { + "epoch": 17.60914760914761, + "grad_norm": 0.1706918627023697, + "learning_rate": 4.467054827544191e-05, + "loss": 0.1113, + "num_input_tokens_seen": 6080296, + "step": 8470 + }, + { + "epoch": 17.61954261954262, + "grad_norm": 0.28319135308265686, + "learning_rate": 4.4664487607626434e-05, + "loss": 0.1332, + "num_input_tokens_seen": 6083944, + "step": 8475 + }, + { + "epoch": 17.62993762993763, + "grad_norm": 0.23826836049556732, + "learning_rate": 4.4658423907299845e-05, + "loss": 0.1495, + "num_input_tokens_seen": 6087464, + "step": 8480 + }, + { + "epoch": 17.64033264033264, + "grad_norm": 0.2149663269519806, + "learning_rate": 4.465235717539725e-05, + "loss": 0.1184, + "num_input_tokens_seen": 6091112, + "step": 8485 + }, + { + "epoch": 17.65072765072765, + "grad_norm": 0.47250431776046753, + "learning_rate": 4.464628741285421e-05, + "loss": 0.1121, + "num_input_tokens_seen": 6094792, + "step": 8490 + }, + { + "epoch": 17.66112266112266, + "grad_norm": 0.34352099895477295, + "learning_rate": 4.4640214620606754e-05, + "loss": 0.1623, + "num_input_tokens_seen": 6098344, + "step": 8495 + }, + { + "epoch": 17.671517671517673, + "grad_norm": 0.35410434007644653, + "learning_rate": 4.46341387995914e-05, + "loss": 0.1791, + "num_input_tokens_seen": 6101864, + "step": 8500 + }, + { + "epoch": 17.68191268191268, + "grad_norm": 0.13273581862449646, + "learning_rate": 4.4628059950745106e-05, + "loss": 0.1004, + "num_input_tokens_seen": 6105352, + "step": 8505 + }, + { + "epoch": 17.692307692307693, + "grad_norm": 0.2887408435344696, + "learning_rate": 4.4621978075005297e-05, + "loss": 0.1387, + "num_input_tokens_seen": 6108872, + "step": 8510 + }, + { + "epoch": 17.7027027027027, + "grad_norm": 0.6665064096450806, + "learning_rate": 4.461589317330989e-05, + "loss": 0.1268, + "num_input_tokens_seen": 6112488, + "step": 8515 + }, + { + "epoch": 17.713097713097714, + "grad_norm": 0.3766379654407501, + "learning_rate": 4.460980524659724e-05, + "loss": 0.1386, + "num_input_tokens_seen": 6116136, + "step": 8520 + }, + { + "epoch": 17.723492723492722, + "grad_norm": 0.1369980424642563, + "learning_rate": 4.46037142958062e-05, + "loss": 0.0884, + "num_input_tokens_seen": 6119624, + "step": 8525 + }, + { + "epoch": 17.733887733887734, + "grad_norm": 0.5986160635948181, + "learning_rate": 4.4597620321876046e-05, + "loss": 0.1112, + "num_input_tokens_seen": 6123112, + "step": 8530 + }, + { + "epoch": 17.744282744282746, + "grad_norm": 0.2413136512041092, + "learning_rate": 4.459152332574656e-05, + "loss": 0.1368, + "num_input_tokens_seen": 6126728, + "step": 8535 + }, + { + "epoch": 17.754677754677754, + "grad_norm": 0.25971564650535583, + "learning_rate": 4.4585423308357985e-05, + "loss": 0.1008, + "num_input_tokens_seen": 6130312, + "step": 8540 + }, + { + "epoch": 17.765072765072766, + "grad_norm": 0.5555673241615295, + "learning_rate": 4.457932027065102e-05, + "loss": 0.0991, + "num_input_tokens_seen": 6133896, + "step": 8545 + }, + { + "epoch": 17.775467775467774, + "grad_norm": 0.20810282230377197, + "learning_rate": 4.45732142135668e-05, + "loss": 0.1162, + "num_input_tokens_seen": 6137384, + "step": 8550 + }, + { + "epoch": 17.785862785862786, + "grad_norm": 0.26466232538223267, + "learning_rate": 4.4567105138046986e-05, + "loss": 0.138, + "num_input_tokens_seen": 6140968, + "step": 8555 + }, + { + "epoch": 17.796257796257795, + "grad_norm": 0.5082611441612244, + "learning_rate": 4.456099304503365e-05, + "loss": 0.1096, + "num_input_tokens_seen": 6144648, + "step": 8560 + }, + { + "epoch": 17.806652806652806, + "grad_norm": 0.36431723833084106, + "learning_rate": 4.455487793546939e-05, + "loss": 0.127, + "num_input_tokens_seen": 6148232, + "step": 8565 + }, + { + "epoch": 17.81704781704782, + "grad_norm": 0.41230571269989014, + "learning_rate": 4.454875981029719e-05, + "loss": 0.171, + "num_input_tokens_seen": 6151688, + "step": 8570 + }, + { + "epoch": 17.827442827442827, + "grad_norm": 0.2970428168773651, + "learning_rate": 4.454263867046057e-05, + "loss": 0.1591, + "num_input_tokens_seen": 6155240, + "step": 8575 + }, + { + "epoch": 17.83783783783784, + "grad_norm": 0.47316670417785645, + "learning_rate": 4.4536514516903484e-05, + "loss": 0.1172, + "num_input_tokens_seen": 6158728, + "step": 8580 + }, + { + "epoch": 17.848232848232847, + "grad_norm": 0.5816906690597534, + "learning_rate": 4.453038735057034e-05, + "loss": 0.118, + "num_input_tokens_seen": 6162408, + "step": 8585 + }, + { + "epoch": 17.85862785862786, + "grad_norm": 0.3219534754753113, + "learning_rate": 4.4524257172406034e-05, + "loss": 0.1073, + "num_input_tokens_seen": 6165864, + "step": 8590 + }, + { + "epoch": 17.86902286902287, + "grad_norm": 0.38535892963409424, + "learning_rate": 4.451812398335592e-05, + "loss": 0.1179, + "num_input_tokens_seen": 6169384, + "step": 8595 + }, + { + "epoch": 17.87941787941788, + "grad_norm": 0.3004549741744995, + "learning_rate": 4.4511987784365805e-05, + "loss": 0.123, + "num_input_tokens_seen": 6172872, + "step": 8600 + }, + { + "epoch": 17.87941787941788, + "eval_loss": 0.14859724044799805, + "eval_runtime": 7.7605, + "eval_samples_per_second": 110.302, + "eval_steps_per_second": 27.575, + "num_input_tokens_seen": 6172872, + "step": 8600 + }, + { + "epoch": 17.88981288981289, + "grad_norm": 0.2410660684108734, + "learning_rate": 4.450584857638197e-05, + "loss": 0.137, + "num_input_tokens_seen": 6176328, + "step": 8605 + }, + { + "epoch": 17.9002079002079, + "grad_norm": 0.3440147638320923, + "learning_rate": 4.449970636035116e-05, + "loss": 0.1454, + "num_input_tokens_seen": 6179912, + "step": 8610 + }, + { + "epoch": 17.91060291060291, + "grad_norm": 0.2244568020105362, + "learning_rate": 4.4493561137220574e-05, + "loss": 0.1567, + "num_input_tokens_seen": 6183560, + "step": 8615 + }, + { + "epoch": 17.92099792099792, + "grad_norm": 0.16505080461502075, + "learning_rate": 4.44874129079379e-05, + "loss": 0.1211, + "num_input_tokens_seen": 6187080, + "step": 8620 + }, + { + "epoch": 17.93139293139293, + "grad_norm": 0.18361563980579376, + "learning_rate": 4.4481261673451255e-05, + "loss": 0.1138, + "num_input_tokens_seen": 6190696, + "step": 8625 + }, + { + "epoch": 17.941787941787943, + "grad_norm": 0.3314555585384369, + "learning_rate": 4.4475107434709245e-05, + "loss": 0.1709, + "num_input_tokens_seen": 6194440, + "step": 8630 + }, + { + "epoch": 17.95218295218295, + "grad_norm": 0.3860648572444916, + "learning_rate": 4.446895019266093e-05, + "loss": 0.1311, + "num_input_tokens_seen": 6197896, + "step": 8635 + }, + { + "epoch": 17.962577962577964, + "grad_norm": 0.197195366024971, + "learning_rate": 4.446278994825583e-05, + "loss": 0.112, + "num_input_tokens_seen": 6201640, + "step": 8640 + }, + { + "epoch": 17.972972972972972, + "grad_norm": 0.22205880284309387, + "learning_rate": 4.445662670244394e-05, + "loss": 0.1181, + "num_input_tokens_seen": 6205192, + "step": 8645 + }, + { + "epoch": 17.983367983367984, + "grad_norm": 0.6138115525245667, + "learning_rate": 4.44504604561757e-05, + "loss": 0.1682, + "num_input_tokens_seen": 6208776, + "step": 8650 + }, + { + "epoch": 17.993762993762992, + "grad_norm": 0.2987188994884491, + "learning_rate": 4.4444291210402035e-05, + "loss": 0.1573, + "num_input_tokens_seen": 6212392, + "step": 8655 + }, + { + "epoch": 18.004158004158004, + "grad_norm": 0.5324123501777649, + "learning_rate": 4.443811896607431e-05, + "loss": 0.1454, + "num_input_tokens_seen": 6215968, + "step": 8660 + }, + { + "epoch": 18.014553014553016, + "grad_norm": 0.4147639870643616, + "learning_rate": 4.443194372414436e-05, + "loss": 0.1246, + "num_input_tokens_seen": 6219584, + "step": 8665 + }, + { + "epoch": 18.024948024948024, + "grad_norm": 0.10242084413766861, + "learning_rate": 4.442576548556449e-05, + "loss": 0.0982, + "num_input_tokens_seen": 6223296, + "step": 8670 + }, + { + "epoch": 18.035343035343036, + "grad_norm": 0.6527388691902161, + "learning_rate": 4.441958425128747e-05, + "loss": 0.1083, + "num_input_tokens_seen": 6226688, + "step": 8675 + }, + { + "epoch": 18.045738045738045, + "grad_norm": 0.36987438797950745, + "learning_rate": 4.4413400022266515e-05, + "loss": 0.1409, + "num_input_tokens_seen": 6230368, + "step": 8680 + }, + { + "epoch": 18.056133056133056, + "grad_norm": 0.427683562040329, + "learning_rate": 4.4407212799455313e-05, + "loss": 0.1262, + "num_input_tokens_seen": 6234080, + "step": 8685 + }, + { + "epoch": 18.066528066528065, + "grad_norm": 0.16757965087890625, + "learning_rate": 4.4401022583808003e-05, + "loss": 0.1543, + "num_input_tokens_seen": 6237696, + "step": 8690 + }, + { + "epoch": 18.076923076923077, + "grad_norm": 0.24860480427742004, + "learning_rate": 4.439482937627921e-05, + "loss": 0.0968, + "num_input_tokens_seen": 6241344, + "step": 8695 + }, + { + "epoch": 18.08731808731809, + "grad_norm": 0.4224541187286377, + "learning_rate": 4.4388633177824004e-05, + "loss": 0.119, + "num_input_tokens_seen": 6244992, + "step": 8700 + }, + { + "epoch": 18.097713097713097, + "grad_norm": 0.19783371686935425, + "learning_rate": 4.4382433989397895e-05, + "loss": 0.1479, + "num_input_tokens_seen": 6248448, + "step": 8705 + }, + { + "epoch": 18.10810810810811, + "grad_norm": 0.28942644596099854, + "learning_rate": 4.4376231811956895e-05, + "loss": 0.108, + "num_input_tokens_seen": 6251936, + "step": 8710 + }, + { + "epoch": 18.118503118503117, + "grad_norm": 0.16800802946090698, + "learning_rate": 4.437002664645745e-05, + "loss": 0.1266, + "num_input_tokens_seen": 6255584, + "step": 8715 + }, + { + "epoch": 18.12889812889813, + "grad_norm": 0.2271391898393631, + "learning_rate": 4.436381849385649e-05, + "loss": 0.08, + "num_input_tokens_seen": 6259136, + "step": 8720 + }, + { + "epoch": 18.13929313929314, + "grad_norm": 0.17427854239940643, + "learning_rate": 4.435760735511136e-05, + "loss": 0.1121, + "num_input_tokens_seen": 6262848, + "step": 8725 + }, + { + "epoch": 18.14968814968815, + "grad_norm": 0.2385103851556778, + "learning_rate": 4.435139323117992e-05, + "loss": 0.1198, + "num_input_tokens_seen": 6266464, + "step": 8730 + }, + { + "epoch": 18.16008316008316, + "grad_norm": 0.27059847116470337, + "learning_rate": 4.434517612302046e-05, + "loss": 0.1398, + "num_input_tokens_seen": 6269952, + "step": 8735 + }, + { + "epoch": 18.17047817047817, + "grad_norm": 0.1689353734254837, + "learning_rate": 4.433895603159174e-05, + "loss": 0.1099, + "num_input_tokens_seen": 6273472, + "step": 8740 + }, + { + "epoch": 18.18087318087318, + "grad_norm": 0.2938687801361084, + "learning_rate": 4.433273295785296e-05, + "loss": 0.0951, + "num_input_tokens_seen": 6277248, + "step": 8745 + }, + { + "epoch": 18.19126819126819, + "grad_norm": 0.2557108700275421, + "learning_rate": 4.432650690276382e-05, + "loss": 0.1215, + "num_input_tokens_seen": 6280768, + "step": 8750 + }, + { + "epoch": 18.2016632016632, + "grad_norm": 0.12496031820774078, + "learning_rate": 4.4320277867284435e-05, + "loss": 0.1704, + "num_input_tokens_seen": 6284448, + "step": 8755 + }, + { + "epoch": 18.212058212058214, + "grad_norm": 0.6009784936904907, + "learning_rate": 4.431404585237541e-05, + "loss": 0.0889, + "num_input_tokens_seen": 6288032, + "step": 8760 + }, + { + "epoch": 18.222453222453222, + "grad_norm": 0.2032383680343628, + "learning_rate": 4.43078108589978e-05, + "loss": 0.1257, + "num_input_tokens_seen": 6291680, + "step": 8765 + }, + { + "epoch": 18.232848232848234, + "grad_norm": 0.37949204444885254, + "learning_rate": 4.4301572888113116e-05, + "loss": 0.1508, + "num_input_tokens_seen": 6295232, + "step": 8770 + }, + { + "epoch": 18.243243243243242, + "grad_norm": 0.21609343588352203, + "learning_rate": 4.4295331940683337e-05, + "loss": 0.1982, + "num_input_tokens_seen": 6298784, + "step": 8775 + }, + { + "epoch": 18.253638253638254, + "grad_norm": 0.2095288783311844, + "learning_rate": 4.428908801767089e-05, + "loss": 0.1125, + "num_input_tokens_seen": 6302272, + "step": 8780 + }, + { + "epoch": 18.264033264033262, + "grad_norm": 0.27498942613601685, + "learning_rate": 4.428284112003868e-05, + "loss": 0.1581, + "num_input_tokens_seen": 6305824, + "step": 8785 + }, + { + "epoch": 18.274428274428274, + "grad_norm": 0.14394082129001617, + "learning_rate": 4.4276591248750033e-05, + "loss": 0.089, + "num_input_tokens_seen": 6309280, + "step": 8790 + }, + { + "epoch": 18.284823284823286, + "grad_norm": 0.209044948220253, + "learning_rate": 4.4270338404768774e-05, + "loss": 0.1036, + "num_input_tokens_seen": 6312608, + "step": 8795 + }, + { + "epoch": 18.295218295218294, + "grad_norm": 0.4531625807285309, + "learning_rate": 4.426408258905917e-05, + "loss": 0.1569, + "num_input_tokens_seen": 6316224, + "step": 8800 + }, + { + "epoch": 18.295218295218294, + "eval_loss": 0.147343248128891, + "eval_runtime": 7.7813, + "eval_samples_per_second": 110.007, + "eval_steps_per_second": 27.502, + "num_input_tokens_seen": 6316224, + "step": 8800 + }, + { + "epoch": 18.305613305613306, + "grad_norm": 0.3639976382255554, + "learning_rate": 4.425782380258594e-05, + "loss": 0.1352, + "num_input_tokens_seen": 6319808, + "step": 8805 + }, + { + "epoch": 18.316008316008315, + "grad_norm": 0.16918860375881195, + "learning_rate": 4.425156204631427e-05, + "loss": 0.1243, + "num_input_tokens_seen": 6323552, + "step": 8810 + }, + { + "epoch": 18.326403326403327, + "grad_norm": 0.7030711770057678, + "learning_rate": 4.424529732120981e-05, + "loss": 0.1651, + "num_input_tokens_seen": 6327008, + "step": 8815 + }, + { + "epoch": 18.33679833679834, + "grad_norm": 0.343333899974823, + "learning_rate": 4.423902962823864e-05, + "loss": 0.1237, + "num_input_tokens_seen": 6330400, + "step": 8820 + }, + { + "epoch": 18.347193347193347, + "grad_norm": 0.5292505621910095, + "learning_rate": 4.423275896836733e-05, + "loss": 0.1062, + "num_input_tokens_seen": 6333856, + "step": 8825 + }, + { + "epoch": 18.35758835758836, + "grad_norm": 0.46538564562797546, + "learning_rate": 4.42264853425629e-05, + "loss": 0.1729, + "num_input_tokens_seen": 6337664, + "step": 8830 + }, + { + "epoch": 18.367983367983367, + "grad_norm": 0.49598658084869385, + "learning_rate": 4.4220208751792816e-05, + "loss": 0.1372, + "num_input_tokens_seen": 6341312, + "step": 8835 + }, + { + "epoch": 18.37837837837838, + "grad_norm": 0.2725369334220886, + "learning_rate": 4.421392919702499e-05, + "loss": 0.1469, + "num_input_tokens_seen": 6344896, + "step": 8840 + }, + { + "epoch": 18.388773388773387, + "grad_norm": 0.513523280620575, + "learning_rate": 4.4207646679227846e-05, + "loss": 0.1454, + "num_input_tokens_seen": 6348480, + "step": 8845 + }, + { + "epoch": 18.3991683991684, + "grad_norm": 0.3805491328239441, + "learning_rate": 4.42013611993702e-05, + "loss": 0.143, + "num_input_tokens_seen": 6352032, + "step": 8850 + }, + { + "epoch": 18.40956340956341, + "grad_norm": 0.5710710883140564, + "learning_rate": 4.419507275842135e-05, + "loss": 0.1341, + "num_input_tokens_seen": 6355616, + "step": 8855 + }, + { + "epoch": 18.41995841995842, + "grad_norm": 0.32209512591362, + "learning_rate": 4.418878135735106e-05, + "loss": 0.1048, + "num_input_tokens_seen": 6359232, + "step": 8860 + }, + { + "epoch": 18.43035343035343, + "grad_norm": 0.2829788029193878, + "learning_rate": 4.418248699712955e-05, + "loss": 0.1464, + "num_input_tokens_seen": 6362912, + "step": 8865 + }, + { + "epoch": 18.44074844074844, + "grad_norm": 0.2538456320762634, + "learning_rate": 4.417618967872748e-05, + "loss": 0.1086, + "num_input_tokens_seen": 6366592, + "step": 8870 + }, + { + "epoch": 18.45114345114345, + "grad_norm": 0.23190617561340332, + "learning_rate": 4.4169889403115985e-05, + "loss": 0.1173, + "num_input_tokens_seen": 6370048, + "step": 8875 + }, + { + "epoch": 18.46153846153846, + "grad_norm": 0.6558137536048889, + "learning_rate": 4.4163586171266627e-05, + "loss": 0.1125, + "num_input_tokens_seen": 6373536, + "step": 8880 + }, + { + "epoch": 18.471933471933472, + "grad_norm": 0.22940151393413544, + "learning_rate": 4.415727998415147e-05, + "loss": 0.1057, + "num_input_tokens_seen": 6377184, + "step": 8885 + }, + { + "epoch": 18.482328482328484, + "grad_norm": 0.24906572699546814, + "learning_rate": 4.4150970842742985e-05, + "loss": 0.1235, + "num_input_tokens_seen": 6380832, + "step": 8890 + }, + { + "epoch": 18.492723492723492, + "grad_norm": 0.28300538659095764, + "learning_rate": 4.4144658748014134e-05, + "loss": 0.1413, + "num_input_tokens_seen": 6384480, + "step": 8895 + }, + { + "epoch": 18.503118503118504, + "grad_norm": 0.49399539828300476, + "learning_rate": 4.413834370093831e-05, + "loss": 0.1486, + "num_input_tokens_seen": 6388032, + "step": 8900 + }, + { + "epoch": 18.513513513513512, + "grad_norm": 0.2651132643222809, + "learning_rate": 4.413202570248939e-05, + "loss": 0.138, + "num_input_tokens_seen": 6391680, + "step": 8905 + }, + { + "epoch": 18.523908523908524, + "grad_norm": 0.17466899752616882, + "learning_rate": 4.412570475364167e-05, + "loss": 0.1065, + "num_input_tokens_seen": 6395264, + "step": 8910 + }, + { + "epoch": 18.534303534303533, + "grad_norm": 0.19902345538139343, + "learning_rate": 4.411938085536994e-05, + "loss": 0.1088, + "num_input_tokens_seen": 6398880, + "step": 8915 + }, + { + "epoch": 18.544698544698544, + "grad_norm": 0.40631553530693054, + "learning_rate": 4.41130540086494e-05, + "loss": 0.1611, + "num_input_tokens_seen": 6402432, + "step": 8920 + }, + { + "epoch": 18.555093555093556, + "grad_norm": 0.1963224858045578, + "learning_rate": 4.4106724214455754e-05, + "loss": 0.1045, + "num_input_tokens_seen": 6406144, + "step": 8925 + }, + { + "epoch": 18.565488565488565, + "grad_norm": 0.32368969917297363, + "learning_rate": 4.4100391473765115e-05, + "loss": 0.1769, + "num_input_tokens_seen": 6409728, + "step": 8930 + }, + { + "epoch": 18.575883575883577, + "grad_norm": 0.22151774168014526, + "learning_rate": 4.409405578755408e-05, + "loss": 0.0962, + "num_input_tokens_seen": 6413248, + "step": 8935 + }, + { + "epoch": 18.586278586278585, + "grad_norm": 0.165611132979393, + "learning_rate": 4.4087717156799705e-05, + "loss": 0.1443, + "num_input_tokens_seen": 6416864, + "step": 8940 + }, + { + "epoch": 18.596673596673597, + "grad_norm": 0.3351968824863434, + "learning_rate": 4.408137558247946e-05, + "loss": 0.203, + "num_input_tokens_seen": 6420448, + "step": 8945 + }, + { + "epoch": 18.60706860706861, + "grad_norm": 0.5440288782119751, + "learning_rate": 4.4075031065571306e-05, + "loss": 0.0931, + "num_input_tokens_seen": 6424256, + "step": 8950 + }, + { + "epoch": 18.617463617463617, + "grad_norm": 0.3574448525905609, + "learning_rate": 4.406868360705366e-05, + "loss": 0.12, + "num_input_tokens_seen": 6427936, + "step": 8955 + }, + { + "epoch": 18.62785862785863, + "grad_norm": 0.3617769777774811, + "learning_rate": 4.406233320790536e-05, + "loss": 0.1373, + "num_input_tokens_seen": 6431424, + "step": 8960 + }, + { + "epoch": 18.638253638253637, + "grad_norm": 0.31215792894363403, + "learning_rate": 4.4055979869105734e-05, + "loss": 0.1219, + "num_input_tokens_seen": 6434912, + "step": 8965 + }, + { + "epoch": 18.64864864864865, + "grad_norm": 0.3092806339263916, + "learning_rate": 4.404962359163454e-05, + "loss": 0.1517, + "num_input_tokens_seen": 6438656, + "step": 8970 + }, + { + "epoch": 18.659043659043657, + "grad_norm": 0.17771145701408386, + "learning_rate": 4.404326437647199e-05, + "loss": 0.1079, + "num_input_tokens_seen": 6442176, + "step": 8975 + }, + { + "epoch": 18.66943866943867, + "grad_norm": 0.20230183005332947, + "learning_rate": 4.403690222459877e-05, + "loss": 0.1189, + "num_input_tokens_seen": 6445856, + "step": 8980 + }, + { + "epoch": 18.67983367983368, + "grad_norm": 0.1379844695329666, + "learning_rate": 4.4030537136995984e-05, + "loss": 0.128, + "num_input_tokens_seen": 6449376, + "step": 8985 + }, + { + "epoch": 18.69022869022869, + "grad_norm": 0.19630815088748932, + "learning_rate": 4.402416911464523e-05, + "loss": 0.1231, + "num_input_tokens_seen": 6452960, + "step": 8990 + }, + { + "epoch": 18.7006237006237, + "grad_norm": 0.22999607026576996, + "learning_rate": 4.4017798158528516e-05, + "loss": 0.0767, + "num_input_tokens_seen": 6456576, + "step": 8995 + }, + { + "epoch": 18.71101871101871, + "grad_norm": 0.44546055793762207, + "learning_rate": 4.401142426962834e-05, + "loss": 0.1355, + "num_input_tokens_seen": 6460064, + "step": 9000 + }, + { + "epoch": 18.71101871101871, + "eval_loss": 0.15406696498394012, + "eval_runtime": 7.7561, + "eval_samples_per_second": 110.364, + "eval_steps_per_second": 27.591, + "num_input_tokens_seen": 6460064, + "step": 9000 + }, + { + "epoch": 18.72141372141372, + "grad_norm": 0.3101491630077362, + "learning_rate": 4.400504744892763e-05, + "loss": 0.1543, + "num_input_tokens_seen": 6463584, + "step": 9005 + }, + { + "epoch": 18.731808731808734, + "grad_norm": 0.5860581994056702, + "learning_rate": 4.399866769740975e-05, + "loss": 0.0939, + "num_input_tokens_seen": 6467232, + "step": 9010 + }, + { + "epoch": 18.742203742203742, + "grad_norm": 0.39611050486564636, + "learning_rate": 4.399228501605859e-05, + "loss": 0.1029, + "num_input_tokens_seen": 6470592, + "step": 9015 + }, + { + "epoch": 18.752598752598754, + "grad_norm": 0.15609975159168243, + "learning_rate": 4.398589940585839e-05, + "loss": 0.1062, + "num_input_tokens_seen": 6474208, + "step": 9020 + }, + { + "epoch": 18.762993762993762, + "grad_norm": 0.29718974232673645, + "learning_rate": 4.3979510867793917e-05, + "loss": 0.1359, + "num_input_tokens_seen": 6477760, + "step": 9025 + }, + { + "epoch": 18.773388773388774, + "grad_norm": 0.4189716875553131, + "learning_rate": 4.3973119402850346e-05, + "loss": 0.1031, + "num_input_tokens_seen": 6481312, + "step": 9030 + }, + { + "epoch": 18.783783783783782, + "grad_norm": 0.32322943210601807, + "learning_rate": 4.396672501201334e-05, + "loss": 0.1007, + "num_input_tokens_seen": 6484928, + "step": 9035 + }, + { + "epoch": 18.794178794178794, + "grad_norm": 0.1515711396932602, + "learning_rate": 4.396032769626899e-05, + "loss": 0.1286, + "num_input_tokens_seen": 6488512, + "step": 9040 + }, + { + "epoch": 18.804573804573806, + "grad_norm": 0.4215414226055145, + "learning_rate": 4.395392745660384e-05, + "loss": 0.0915, + "num_input_tokens_seen": 6492224, + "step": 9045 + }, + { + "epoch": 18.814968814968815, + "grad_norm": 0.44454237818717957, + "learning_rate": 4.394752429400488e-05, + "loss": 0.1122, + "num_input_tokens_seen": 6495744, + "step": 9050 + }, + { + "epoch": 18.825363825363826, + "grad_norm": 0.6163035035133362, + "learning_rate": 4.394111820945957e-05, + "loss": 0.1361, + "num_input_tokens_seen": 6499296, + "step": 9055 + }, + { + "epoch": 18.835758835758835, + "grad_norm": 0.5488680601119995, + "learning_rate": 4.393470920395579e-05, + "loss": 0.1399, + "num_input_tokens_seen": 6503008, + "step": 9060 + }, + { + "epoch": 18.846153846153847, + "grad_norm": 0.173929363489151, + "learning_rate": 4.392829727848192e-05, + "loss": 0.1018, + "num_input_tokens_seen": 6506624, + "step": 9065 + }, + { + "epoch": 18.856548856548855, + "grad_norm": 0.3080010712146759, + "learning_rate": 4.392188243402673e-05, + "loss": 0.1104, + "num_input_tokens_seen": 6510272, + "step": 9070 + }, + { + "epoch": 18.866943866943867, + "grad_norm": 0.28988897800445557, + "learning_rate": 4.391546467157949e-05, + "loss": 0.1291, + "num_input_tokens_seen": 6513792, + "step": 9075 + }, + { + "epoch": 18.87733887733888, + "grad_norm": 0.40657636523246765, + "learning_rate": 4.390904399212988e-05, + "loss": 0.1472, + "num_input_tokens_seen": 6517312, + "step": 9080 + }, + { + "epoch": 18.887733887733887, + "grad_norm": 0.14858272671699524, + "learning_rate": 4.390262039666807e-05, + "loss": 0.1413, + "num_input_tokens_seen": 6520992, + "step": 9085 + }, + { + "epoch": 18.8981288981289, + "grad_norm": 0.2530178725719452, + "learning_rate": 4.389619388618464e-05, + "loss": 0.1613, + "num_input_tokens_seen": 6524640, + "step": 9090 + }, + { + "epoch": 18.908523908523907, + "grad_norm": 0.2117803990840912, + "learning_rate": 4.3889764461670655e-05, + "loss": 0.1056, + "num_input_tokens_seen": 6528192, + "step": 9095 + }, + { + "epoch": 18.91891891891892, + "grad_norm": 0.8299070596694946, + "learning_rate": 4.38833321241176e-05, + "loss": 0.1644, + "num_input_tokens_seen": 6531744, + "step": 9100 + }, + { + "epoch": 18.929313929313928, + "grad_norm": 0.4169202446937561, + "learning_rate": 4.3876896874517434e-05, + "loss": 0.1531, + "num_input_tokens_seen": 6535296, + "step": 9105 + }, + { + "epoch": 18.93970893970894, + "grad_norm": 0.2151591032743454, + "learning_rate": 4.3870458713862554e-05, + "loss": 0.154, + "num_input_tokens_seen": 6538848, + "step": 9110 + }, + { + "epoch": 18.95010395010395, + "grad_norm": 0.28545942902565, + "learning_rate": 4.386401764314579e-05, + "loss": 0.1324, + "num_input_tokens_seen": 6542464, + "step": 9115 + }, + { + "epoch": 18.96049896049896, + "grad_norm": 0.25723782181739807, + "learning_rate": 4.385757366336045e-05, + "loss": 0.0855, + "num_input_tokens_seen": 6546112, + "step": 9120 + }, + { + "epoch": 18.97089397089397, + "grad_norm": 0.39762017130851746, + "learning_rate": 4.385112677550027e-05, + "loss": 0.1059, + "num_input_tokens_seen": 6549856, + "step": 9125 + }, + { + "epoch": 18.98128898128898, + "grad_norm": 0.3457249701023102, + "learning_rate": 4.384467698055945e-05, + "loss": 0.1707, + "num_input_tokens_seen": 6553472, + "step": 9130 + }, + { + "epoch": 18.991683991683992, + "grad_norm": 0.2792756259441376, + "learning_rate": 4.383822427953261e-05, + "loss": 0.1644, + "num_input_tokens_seen": 6556992, + "step": 9135 + }, + { + "epoch": 19.002079002079004, + "grad_norm": 0.25210368633270264, + "learning_rate": 4.3831768673414864e-05, + "loss": 0.137, + "num_input_tokens_seen": 6560568, + "step": 9140 + }, + { + "epoch": 19.012474012474012, + "grad_norm": 0.13314884901046753, + "learning_rate": 4.382531016320173e-05, + "loss": 0.1259, + "num_input_tokens_seen": 6564152, + "step": 9145 + }, + { + "epoch": 19.022869022869024, + "grad_norm": 0.2218407541513443, + "learning_rate": 4.3818848749889184e-05, + "loss": 0.1025, + "num_input_tokens_seen": 6567544, + "step": 9150 + }, + { + "epoch": 19.033264033264032, + "grad_norm": 0.8314236402511597, + "learning_rate": 4.381238443447368e-05, + "loss": 0.1449, + "num_input_tokens_seen": 6571160, + "step": 9155 + }, + { + "epoch": 19.043659043659044, + "grad_norm": 0.24505122005939484, + "learning_rate": 4.380591721795208e-05, + "loss": 0.1331, + "num_input_tokens_seen": 6574808, + "step": 9160 + }, + { + "epoch": 19.054054054054053, + "grad_norm": 0.6801697015762329, + "learning_rate": 4.3799447101321723e-05, + "loss": 0.0963, + "num_input_tokens_seen": 6578296, + "step": 9165 + }, + { + "epoch": 19.064449064449065, + "grad_norm": 0.13633699715137482, + "learning_rate": 4.379297408558036e-05, + "loss": 0.1454, + "num_input_tokens_seen": 6581944, + "step": 9170 + }, + { + "epoch": 19.074844074844076, + "grad_norm": 0.3190704882144928, + "learning_rate": 4.378649817172624e-05, + "loss": 0.1256, + "num_input_tokens_seen": 6585464, + "step": 9175 + }, + { + "epoch": 19.085239085239085, + "grad_norm": 0.32661595940589905, + "learning_rate": 4.378001936075801e-05, + "loss": 0.1162, + "num_input_tokens_seen": 6589112, + "step": 9180 + }, + { + "epoch": 19.095634095634097, + "grad_norm": 0.1630261391401291, + "learning_rate": 4.377353765367479e-05, + "loss": 0.1304, + "num_input_tokens_seen": 6592728, + "step": 9185 + }, + { + "epoch": 19.106029106029105, + "grad_norm": 0.7156330347061157, + "learning_rate": 4.376705305147614e-05, + "loss": 0.1256, + "num_input_tokens_seen": 6596344, + "step": 9190 + }, + { + "epoch": 19.116424116424117, + "grad_norm": 0.19349242746829987, + "learning_rate": 4.376056555516206e-05, + "loss": 0.099, + "num_input_tokens_seen": 6599928, + "step": 9195 + }, + { + "epoch": 19.126819126819125, + "grad_norm": 0.2816258668899536, + "learning_rate": 4.375407516573302e-05, + "loss": 0.1568, + "num_input_tokens_seen": 6603384, + "step": 9200 + }, + { + "epoch": 19.126819126819125, + "eval_loss": 0.1443786323070526, + "eval_runtime": 7.7714, + "eval_samples_per_second": 110.147, + "eval_steps_per_second": 27.537, + "num_input_tokens_seen": 6603384, + "step": 9200 + }, + { + "epoch": 19.137214137214137, + "grad_norm": 0.3412700593471527, + "learning_rate": 4.3747581884189913e-05, + "loss": 0.1229, + "num_input_tokens_seen": 6607128, + "step": 9205 + }, + { + "epoch": 19.14760914760915, + "grad_norm": 0.16944636404514313, + "learning_rate": 4.374108571153408e-05, + "loss": 0.1025, + "num_input_tokens_seen": 6610776, + "step": 9210 + }, + { + "epoch": 19.158004158004157, + "grad_norm": 0.36532917618751526, + "learning_rate": 4.3734586648767316e-05, + "loss": 0.1265, + "num_input_tokens_seen": 6614360, + "step": 9215 + }, + { + "epoch": 19.16839916839917, + "grad_norm": 0.16063635051250458, + "learning_rate": 4.372808469689186e-05, + "loss": 0.1343, + "num_input_tokens_seen": 6617880, + "step": 9220 + }, + { + "epoch": 19.178794178794178, + "grad_norm": 0.31036701798439026, + "learning_rate": 4.372157985691039e-05, + "loss": 0.1317, + "num_input_tokens_seen": 6621368, + "step": 9225 + }, + { + "epoch": 19.18918918918919, + "grad_norm": 0.32625532150268555, + "learning_rate": 4.371507212982603e-05, + "loss": 0.128, + "num_input_tokens_seen": 6624920, + "step": 9230 + }, + { + "epoch": 19.1995841995842, + "grad_norm": 0.2718980610370636, + "learning_rate": 4.370856151664236e-05, + "loss": 0.1047, + "num_input_tokens_seen": 6628504, + "step": 9235 + }, + { + "epoch": 19.20997920997921, + "grad_norm": 0.45139098167419434, + "learning_rate": 4.3702048018363404e-05, + "loss": 0.1099, + "num_input_tokens_seen": 6632024, + "step": 9240 + }, + { + "epoch": 19.22037422037422, + "grad_norm": 0.1939103901386261, + "learning_rate": 4.369553163599362e-05, + "loss": 0.111, + "num_input_tokens_seen": 6635672, + "step": 9245 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 0.2763054370880127, + "learning_rate": 4.3689012370537904e-05, + "loss": 0.1485, + "num_input_tokens_seen": 6639224, + "step": 9250 + }, + { + "epoch": 19.241164241164242, + "grad_norm": 0.12644119560718536, + "learning_rate": 4.368249022300164e-05, + "loss": 0.1049, + "num_input_tokens_seen": 6642584, + "step": 9255 + }, + { + "epoch": 19.25155925155925, + "grad_norm": 0.23326429724693298, + "learning_rate": 4.367596519439059e-05, + "loss": 0.0814, + "num_input_tokens_seen": 6646328, + "step": 9260 + }, + { + "epoch": 19.261954261954262, + "grad_norm": 0.6031051278114319, + "learning_rate": 4.366943728571101e-05, + "loss": 0.0923, + "num_input_tokens_seen": 6649816, + "step": 9265 + }, + { + "epoch": 19.272349272349274, + "grad_norm": 0.2905048727989197, + "learning_rate": 4.366290649796959e-05, + "loss": 0.1227, + "num_input_tokens_seen": 6653240, + "step": 9270 + }, + { + "epoch": 19.282744282744282, + "grad_norm": 0.9277584552764893, + "learning_rate": 4.3656372832173456e-05, + "loss": 0.1253, + "num_input_tokens_seen": 6656920, + "step": 9275 + }, + { + "epoch": 19.293139293139294, + "grad_norm": 0.8951232433319092, + "learning_rate": 4.364983628933017e-05, + "loss": 0.1564, + "num_input_tokens_seen": 6660536, + "step": 9280 + }, + { + "epoch": 19.303534303534303, + "grad_norm": 0.3852221369743347, + "learning_rate": 4.364329687044777e-05, + "loss": 0.1529, + "num_input_tokens_seen": 6663928, + "step": 9285 + }, + { + "epoch": 19.313929313929314, + "grad_norm": 0.38284367322921753, + "learning_rate": 4.36367545765347e-05, + "loss": 0.1453, + "num_input_tokens_seen": 6667608, + "step": 9290 + }, + { + "epoch": 19.324324324324323, + "grad_norm": 0.24520903825759888, + "learning_rate": 4.363020940859988e-05, + "loss": 0.127, + "num_input_tokens_seen": 6671192, + "step": 9295 + }, + { + "epoch": 19.334719334719335, + "grad_norm": 0.32828718423843384, + "learning_rate": 4.362366136765263e-05, + "loss": 0.1481, + "num_input_tokens_seen": 6674840, + "step": 9300 + }, + { + "epoch": 19.345114345114347, + "grad_norm": 0.6462175846099854, + "learning_rate": 4.361711045470278e-05, + "loss": 0.1387, + "num_input_tokens_seen": 6678424, + "step": 9305 + }, + { + "epoch": 19.355509355509355, + "grad_norm": 0.35450512170791626, + "learning_rate": 4.3610556670760524e-05, + "loss": 0.1169, + "num_input_tokens_seen": 6682072, + "step": 9310 + }, + { + "epoch": 19.365904365904367, + "grad_norm": 0.2739982604980469, + "learning_rate": 4.360400001683657e-05, + "loss": 0.0983, + "num_input_tokens_seen": 6685688, + "step": 9315 + }, + { + "epoch": 19.376299376299375, + "grad_norm": 0.22888554632663727, + "learning_rate": 4.3597440493942e-05, + "loss": 0.1014, + "num_input_tokens_seen": 6689240, + "step": 9320 + }, + { + "epoch": 19.386694386694387, + "grad_norm": 0.33420851826667786, + "learning_rate": 4.3590878103088405e-05, + "loss": 0.1473, + "num_input_tokens_seen": 6692952, + "step": 9325 + }, + { + "epoch": 19.397089397089395, + "grad_norm": 0.28567764163017273, + "learning_rate": 4.358431284528779e-05, + "loss": 0.126, + "num_input_tokens_seen": 6696536, + "step": 9330 + }, + { + "epoch": 19.407484407484407, + "grad_norm": 0.46559056639671326, + "learning_rate": 4.357774472155257e-05, + "loss": 0.1507, + "num_input_tokens_seen": 6700216, + "step": 9335 + }, + { + "epoch": 19.41787941787942, + "grad_norm": 0.4298064708709717, + "learning_rate": 4.3571173732895664e-05, + "loss": 0.1645, + "num_input_tokens_seen": 6703800, + "step": 9340 + }, + { + "epoch": 19.428274428274428, + "grad_norm": 1.0463576316833496, + "learning_rate": 4.356459988033039e-05, + "loss": 0.1425, + "num_input_tokens_seen": 6707352, + "step": 9345 + }, + { + "epoch": 19.43866943866944, + "grad_norm": 0.18749482929706573, + "learning_rate": 4.355802316487051e-05, + "loss": 0.1095, + "num_input_tokens_seen": 6710968, + "step": 9350 + }, + { + "epoch": 19.449064449064448, + "grad_norm": 0.4241730570793152, + "learning_rate": 4.355144358753025e-05, + "loss": 0.1299, + "num_input_tokens_seen": 6714648, + "step": 9355 + }, + { + "epoch": 19.45945945945946, + "grad_norm": 0.23642387986183167, + "learning_rate": 4.354486114932425e-05, + "loss": 0.1351, + "num_input_tokens_seen": 6718136, + "step": 9360 + }, + { + "epoch": 19.46985446985447, + "grad_norm": 0.42245906591415405, + "learning_rate": 4.353827585126762e-05, + "loss": 0.1154, + "num_input_tokens_seen": 6721688, + "step": 9365 + }, + { + "epoch": 19.48024948024948, + "grad_norm": 0.17095519602298737, + "learning_rate": 4.353168769437588e-05, + "loss": 0.1704, + "num_input_tokens_seen": 6725112, + "step": 9370 + }, + { + "epoch": 19.490644490644492, + "grad_norm": 0.1561601459980011, + "learning_rate": 4.3525096679665014e-05, + "loss": 0.1372, + "num_input_tokens_seen": 6728664, + "step": 9375 + }, + { + "epoch": 19.5010395010395, + "grad_norm": 0.1988736391067505, + "learning_rate": 4.351850280815144e-05, + "loss": 0.1189, + "num_input_tokens_seen": 6732376, + "step": 9380 + }, + { + "epoch": 19.511434511434512, + "grad_norm": 0.19084276258945465, + "learning_rate": 4.3511906080852014e-05, + "loss": 0.1262, + "num_input_tokens_seen": 6736024, + "step": 9385 + }, + { + "epoch": 19.52182952182952, + "grad_norm": 0.3442710340023041, + "learning_rate": 4.350530649878404e-05, + "loss": 0.1579, + "num_input_tokens_seen": 6739704, + "step": 9390 + }, + { + "epoch": 19.532224532224532, + "grad_norm": 0.32639846205711365, + "learning_rate": 4.3498704062965246e-05, + "loss": 0.1097, + "num_input_tokens_seen": 6743192, + "step": 9395 + }, + { + "epoch": 19.542619542619544, + "grad_norm": 0.23203881084918976, + "learning_rate": 4.3492098774413815e-05, + "loss": 0.1126, + "num_input_tokens_seen": 6746616, + "step": 9400 + }, + { + "epoch": 19.542619542619544, + "eval_loss": 0.14530201256275177, + "eval_runtime": 7.7712, + "eval_samples_per_second": 110.151, + "eval_steps_per_second": 27.538, + "num_input_tokens_seen": 6746616, + "step": 9400 + }, + { + "epoch": 19.553014553014552, + "grad_norm": 0.192438006401062, + "learning_rate": 4.3485490634148375e-05, + "loss": 0.0977, + "num_input_tokens_seen": 6750136, + "step": 9405 + }, + { + "epoch": 19.563409563409564, + "grad_norm": 0.16430741548538208, + "learning_rate": 4.347887964318797e-05, + "loss": 0.1234, + "num_input_tokens_seen": 6753752, + "step": 9410 + }, + { + "epoch": 19.573804573804573, + "grad_norm": 0.21643473207950592, + "learning_rate": 4.34722658025521e-05, + "loss": 0.1922, + "num_input_tokens_seen": 6757304, + "step": 9415 + }, + { + "epoch": 19.584199584199585, + "grad_norm": 0.25415676832199097, + "learning_rate": 4.346564911326071e-05, + "loss": 0.1638, + "num_input_tokens_seen": 6760952, + "step": 9420 + }, + { + "epoch": 19.594594594594593, + "grad_norm": 0.22749686241149902, + "learning_rate": 4.345902957633418e-05, + "loss": 0.1299, + "num_input_tokens_seen": 6764632, + "step": 9425 + }, + { + "epoch": 19.604989604989605, + "grad_norm": 0.30494722723960876, + "learning_rate": 4.345240719279331e-05, + "loss": 0.1127, + "num_input_tokens_seen": 6768344, + "step": 9430 + }, + { + "epoch": 19.615384615384617, + "grad_norm": 0.19603022933006287, + "learning_rate": 4.3445781963659374e-05, + "loss": 0.1265, + "num_input_tokens_seen": 6771896, + "step": 9435 + }, + { + "epoch": 19.625779625779625, + "grad_norm": 0.3630090057849884, + "learning_rate": 4.3439153889954045e-05, + "loss": 0.1548, + "num_input_tokens_seen": 6775544, + "step": 9440 + }, + { + "epoch": 19.636174636174637, + "grad_norm": 0.16754986345767975, + "learning_rate": 4.343252297269946e-05, + "loss": 0.0822, + "num_input_tokens_seen": 6779000, + "step": 9445 + }, + { + "epoch": 19.646569646569645, + "grad_norm": 0.161770761013031, + "learning_rate": 4.342588921291821e-05, + "loss": 0.1295, + "num_input_tokens_seen": 6782488, + "step": 9450 + }, + { + "epoch": 19.656964656964657, + "grad_norm": 0.2661038339138031, + "learning_rate": 4.341925261163328e-05, + "loss": 0.102, + "num_input_tokens_seen": 6785976, + "step": 9455 + }, + { + "epoch": 19.66735966735967, + "grad_norm": 0.20725271105766296, + "learning_rate": 4.341261316986813e-05, + "loss": 0.1317, + "num_input_tokens_seen": 6789656, + "step": 9460 + }, + { + "epoch": 19.677754677754677, + "grad_norm": 0.43516209721565247, + "learning_rate": 4.340597088864664e-05, + "loss": 0.114, + "num_input_tokens_seen": 6793464, + "step": 9465 + }, + { + "epoch": 19.68814968814969, + "grad_norm": 0.25454089045524597, + "learning_rate": 4.339932576899313e-05, + "loss": 0.1018, + "num_input_tokens_seen": 6797016, + "step": 9470 + }, + { + "epoch": 19.698544698544698, + "grad_norm": 0.46244940161705017, + "learning_rate": 4.3392677811932375e-05, + "loss": 0.1109, + "num_input_tokens_seen": 6800568, + "step": 9475 + }, + { + "epoch": 19.70893970893971, + "grad_norm": 0.19509343802928925, + "learning_rate": 4.338602701848956e-05, + "loss": 0.0804, + "num_input_tokens_seen": 6804216, + "step": 9480 + }, + { + "epoch": 19.719334719334718, + "grad_norm": 0.48462221026420593, + "learning_rate": 4.337937338969033e-05, + "loss": 0.1096, + "num_input_tokens_seen": 6807896, + "step": 9485 + }, + { + "epoch": 19.72972972972973, + "grad_norm": 0.32147467136383057, + "learning_rate": 4.337271692656075e-05, + "loss": 0.116, + "num_input_tokens_seen": 6811352, + "step": 9490 + }, + { + "epoch": 19.74012474012474, + "grad_norm": 0.34697818756103516, + "learning_rate": 4.336605763012733e-05, + "loss": 0.1323, + "num_input_tokens_seen": 6815064, + "step": 9495 + }, + { + "epoch": 19.75051975051975, + "grad_norm": 0.539384663105011, + "learning_rate": 4.3359395501417026e-05, + "loss": 0.1183, + "num_input_tokens_seen": 6818680, + "step": 9500 + }, + { + "epoch": 19.760914760914762, + "grad_norm": 0.2660115361213684, + "learning_rate": 4.335273054145722e-05, + "loss": 0.1292, + "num_input_tokens_seen": 6822296, + "step": 9505 + }, + { + "epoch": 19.77130977130977, + "grad_norm": 0.171153724193573, + "learning_rate": 4.334606275127572e-05, + "loss": 0.1536, + "num_input_tokens_seen": 6825912, + "step": 9510 + }, + { + "epoch": 19.781704781704782, + "grad_norm": 0.4092075228691101, + "learning_rate": 4.33393921319008e-05, + "loss": 0.1164, + "num_input_tokens_seen": 6829496, + "step": 9515 + }, + { + "epoch": 19.79209979209979, + "grad_norm": 0.383002907037735, + "learning_rate": 4.3332718684361146e-05, + "loss": 0.1468, + "num_input_tokens_seen": 6832952, + "step": 9520 + }, + { + "epoch": 19.802494802494802, + "grad_norm": 0.22510424256324768, + "learning_rate": 4.332604240968588e-05, + "loss": 0.0959, + "num_input_tokens_seen": 6836600, + "step": 9525 + }, + { + "epoch": 19.812889812889814, + "grad_norm": 0.4720372259616852, + "learning_rate": 4.331936330890459e-05, + "loss": 0.1366, + "num_input_tokens_seen": 6839992, + "step": 9530 + }, + { + "epoch": 19.823284823284823, + "grad_norm": 0.22849582135677338, + "learning_rate": 4.331268138304725e-05, + "loss": 0.1152, + "num_input_tokens_seen": 6843576, + "step": 9535 + }, + { + "epoch": 19.833679833679835, + "grad_norm": 0.252510130405426, + "learning_rate": 4.330599663314431e-05, + "loss": 0.0998, + "num_input_tokens_seen": 6847192, + "step": 9540 + }, + { + "epoch": 19.844074844074843, + "grad_norm": 0.39608633518218994, + "learning_rate": 4.329930906022665e-05, + "loss": 0.1377, + "num_input_tokens_seen": 6850840, + "step": 9545 + }, + { + "epoch": 19.854469854469855, + "grad_norm": 0.4501285254955292, + "learning_rate": 4.3292618665325564e-05, + "loss": 0.1419, + "num_input_tokens_seen": 6854392, + "step": 9550 + }, + { + "epoch": 19.864864864864863, + "grad_norm": 0.8281221389770508, + "learning_rate": 4.3285925449472796e-05, + "loss": 0.1525, + "num_input_tokens_seen": 6858040, + "step": 9555 + }, + { + "epoch": 19.875259875259875, + "grad_norm": 0.32105785608291626, + "learning_rate": 4.327922941370054e-05, + "loss": 0.1508, + "num_input_tokens_seen": 6861912, + "step": 9560 + }, + { + "epoch": 19.885654885654887, + "grad_norm": 0.46231481432914734, + "learning_rate": 4.3272530559041384e-05, + "loss": 0.1646, + "num_input_tokens_seen": 6865464, + "step": 9565 + }, + { + "epoch": 19.896049896049895, + "grad_norm": 0.20751889050006866, + "learning_rate": 4.32658288865284e-05, + "loss": 0.1278, + "num_input_tokens_seen": 6869048, + "step": 9570 + }, + { + "epoch": 19.906444906444907, + "grad_norm": 0.40630796551704407, + "learning_rate": 4.325912439719505e-05, + "loss": 0.1253, + "num_input_tokens_seen": 6872664, + "step": 9575 + }, + { + "epoch": 19.916839916839916, + "grad_norm": 0.35491111874580383, + "learning_rate": 4.3252417092075266e-05, + "loss": 0.1277, + "num_input_tokens_seen": 6876312, + "step": 9580 + }, + { + "epoch": 19.927234927234927, + "grad_norm": 0.2141173630952835, + "learning_rate": 4.3245706972203385e-05, + "loss": 0.1098, + "num_input_tokens_seen": 6879896, + "step": 9585 + }, + { + "epoch": 19.93762993762994, + "grad_norm": 0.3326916992664337, + "learning_rate": 4.323899403861421e-05, + "loss": 0.1326, + "num_input_tokens_seen": 6883768, + "step": 9590 + }, + { + "epoch": 19.948024948024948, + "grad_norm": 0.28334006667137146, + "learning_rate": 4.3232278292342935e-05, + "loss": 0.1272, + "num_input_tokens_seen": 6887288, + "step": 9595 + }, + { + "epoch": 19.95841995841996, + "grad_norm": 0.4403286874294281, + "learning_rate": 4.322555973442524e-05, + "loss": 0.0971, + "num_input_tokens_seen": 6890808, + "step": 9600 + }, + { + "epoch": 19.95841995841996, + "eval_loss": 0.14585043489933014, + "eval_runtime": 7.7501, + "eval_samples_per_second": 110.45, + "eval_steps_per_second": 27.613, + "num_input_tokens_seen": 6890808, + "step": 9600 + }, + { + "epoch": 19.968814968814968, + "grad_norm": 0.1790160834789276, + "learning_rate": 4.3218838365897184e-05, + "loss": 0.1242, + "num_input_tokens_seen": 6894296, + "step": 9605 + }, + { + "epoch": 19.97920997920998, + "grad_norm": 0.21480779349803925, + "learning_rate": 4.3212114187795306e-05, + "loss": 0.1123, + "num_input_tokens_seen": 6898040, + "step": 9610 + }, + { + "epoch": 19.989604989604988, + "grad_norm": 0.2038034051656723, + "learning_rate": 4.320538720115656e-05, + "loss": 0.1121, + "num_input_tokens_seen": 6901560, + "step": 9615 + }, + { + "epoch": 20.0, + "grad_norm": 0.2702585756778717, + "learning_rate": 4.319865740701831e-05, + "loss": 0.1461, + "num_input_tokens_seen": 6905072, + "step": 9620 + }, + { + "epoch": 20.010395010395012, + "grad_norm": 0.634387731552124, + "learning_rate": 4.3191924806418396e-05, + "loss": 0.1497, + "num_input_tokens_seen": 6908592, + "step": 9625 + }, + { + "epoch": 20.02079002079002, + "grad_norm": 0.28210657835006714, + "learning_rate": 4.318518940039507e-05, + "loss": 0.1151, + "num_input_tokens_seen": 6912144, + "step": 9630 + }, + { + "epoch": 20.031185031185032, + "grad_norm": 0.16668397188186646, + "learning_rate": 4.3178451189987e-05, + "loss": 0.1264, + "num_input_tokens_seen": 6915728, + "step": 9635 + }, + { + "epoch": 20.04158004158004, + "grad_norm": 0.22366730868816376, + "learning_rate": 4.3171710176233315e-05, + "loss": 0.1807, + "num_input_tokens_seen": 6919280, + "step": 9640 + }, + { + "epoch": 20.051975051975052, + "grad_norm": 0.3853430449962616, + "learning_rate": 4.316496636017355e-05, + "loss": 0.1634, + "num_input_tokens_seen": 6922768, + "step": 9645 + }, + { + "epoch": 20.06237006237006, + "grad_norm": 0.7249703407287598, + "learning_rate": 4.315821974284771e-05, + "loss": 0.1073, + "num_input_tokens_seen": 6926480, + "step": 9650 + }, + { + "epoch": 20.072765072765073, + "grad_norm": 0.18319547176361084, + "learning_rate": 4.315147032529619e-05, + "loss": 0.1147, + "num_input_tokens_seen": 6930256, + "step": 9655 + }, + { + "epoch": 20.083160083160084, + "grad_norm": 0.26738440990448, + "learning_rate": 4.3144718108559845e-05, + "loss": 0.1141, + "num_input_tokens_seen": 6933744, + "step": 9660 + }, + { + "epoch": 20.093555093555093, + "grad_norm": 0.22562699019908905, + "learning_rate": 4.3137963093679945e-05, + "loss": 0.1286, + "num_input_tokens_seen": 6937328, + "step": 9665 + }, + { + "epoch": 20.103950103950105, + "grad_norm": 0.14522911608219147, + "learning_rate": 4.31312052816982e-05, + "loss": 0.133, + "num_input_tokens_seen": 6940976, + "step": 9670 + }, + { + "epoch": 20.114345114345113, + "grad_norm": 0.3220185339450836, + "learning_rate": 4.312444467365675e-05, + "loss": 0.1395, + "num_input_tokens_seen": 6944528, + "step": 9675 + }, + { + "epoch": 20.124740124740125, + "grad_norm": 0.42131999135017395, + "learning_rate": 4.311768127059816e-05, + "loss": 0.1179, + "num_input_tokens_seen": 6947984, + "step": 9680 + }, + { + "epoch": 20.135135135135137, + "grad_norm": 0.28592580556869507, + "learning_rate": 4.3110915073565444e-05, + "loss": 0.1243, + "num_input_tokens_seen": 6951504, + "step": 9685 + }, + { + "epoch": 20.145530145530145, + "grad_norm": 0.18347777426242828, + "learning_rate": 4.310414608360203e-05, + "loss": 0.1322, + "num_input_tokens_seen": 6955184, + "step": 9690 + }, + { + "epoch": 20.155925155925157, + "grad_norm": 0.1221843808889389, + "learning_rate": 4.309737430175177e-05, + "loss": 0.1076, + "num_input_tokens_seen": 6958768, + "step": 9695 + }, + { + "epoch": 20.166320166320165, + "grad_norm": 0.12071305513381958, + "learning_rate": 4.309059972905897e-05, + "loss": 0.0891, + "num_input_tokens_seen": 6962256, + "step": 9700 + }, + { + "epoch": 20.176715176715177, + "grad_norm": 0.2335301637649536, + "learning_rate": 4.308382236656836e-05, + "loss": 0.1478, + "num_input_tokens_seen": 6965712, + "step": 9705 + }, + { + "epoch": 20.187110187110186, + "grad_norm": 0.26959121227264404, + "learning_rate": 4.307704221532507e-05, + "loss": 0.1433, + "num_input_tokens_seen": 6969392, + "step": 9710 + }, + { + "epoch": 20.197505197505198, + "grad_norm": 0.1613924652338028, + "learning_rate": 4.307025927637471e-05, + "loss": 0.0953, + "num_input_tokens_seen": 6972912, + "step": 9715 + }, + { + "epoch": 20.20790020790021, + "grad_norm": 0.32943689823150635, + "learning_rate": 4.306347355076328e-05, + "loss": 0.1579, + "num_input_tokens_seen": 6976528, + "step": 9720 + }, + { + "epoch": 20.218295218295218, + "grad_norm": 0.5056285858154297, + "learning_rate": 4.305668503953724e-05, + "loss": 0.1101, + "num_input_tokens_seen": 6980176, + "step": 9725 + }, + { + "epoch": 20.22869022869023, + "grad_norm": 0.6447094082832336, + "learning_rate": 4.3049893743743436e-05, + "loss": 0.1476, + "num_input_tokens_seen": 6983728, + "step": 9730 + }, + { + "epoch": 20.239085239085238, + "grad_norm": 0.35511502623558044, + "learning_rate": 4.304309966442919e-05, + "loss": 0.1766, + "num_input_tokens_seen": 6987280, + "step": 9735 + }, + { + "epoch": 20.24948024948025, + "grad_norm": 0.2605590224266052, + "learning_rate": 4.303630280264224e-05, + "loss": 0.1054, + "num_input_tokens_seen": 6990896, + "step": 9740 + }, + { + "epoch": 20.25987525987526, + "grad_norm": 0.21104717254638672, + "learning_rate": 4.302950315943074e-05, + "loss": 0.1234, + "num_input_tokens_seen": 6994416, + "step": 9745 + }, + { + "epoch": 20.27027027027027, + "grad_norm": 0.4469887912273407, + "learning_rate": 4.3022700735843275e-05, + "loss": 0.1122, + "num_input_tokens_seen": 6998064, + "step": 9750 + }, + { + "epoch": 20.280665280665282, + "grad_norm": 0.385088175535202, + "learning_rate": 4.301589553292887e-05, + "loss": 0.1013, + "num_input_tokens_seen": 7001520, + "step": 9755 + }, + { + "epoch": 20.29106029106029, + "grad_norm": 0.5273288488388062, + "learning_rate": 4.300908755173697e-05, + "loss": 0.1203, + "num_input_tokens_seen": 7005360, + "step": 9760 + }, + { + "epoch": 20.301455301455302, + "grad_norm": 0.20622946321964264, + "learning_rate": 4.300227679331745e-05, + "loss": 0.1635, + "num_input_tokens_seen": 7009040, + "step": 9765 + }, + { + "epoch": 20.31185031185031, + "grad_norm": 0.6026328802108765, + "learning_rate": 4.299546325872063e-05, + "loss": 0.142, + "num_input_tokens_seen": 7012784, + "step": 9770 + }, + { + "epoch": 20.322245322245323, + "grad_norm": 0.5804100036621094, + "learning_rate": 4.2988646948997225e-05, + "loss": 0.0961, + "num_input_tokens_seen": 7016336, + "step": 9775 + }, + { + "epoch": 20.33264033264033, + "grad_norm": 0.4987211227416992, + "learning_rate": 4.29818278651984e-05, + "loss": 0.1271, + "num_input_tokens_seen": 7019856, + "step": 9780 + }, + { + "epoch": 20.343035343035343, + "grad_norm": 0.4349326491355896, + "learning_rate": 4.297500600837574e-05, + "loss": 0.137, + "num_input_tokens_seen": 7023344, + "step": 9785 + }, + { + "epoch": 20.353430353430355, + "grad_norm": 0.2888640761375427, + "learning_rate": 4.2968181379581276e-05, + "loss": 0.1075, + "num_input_tokens_seen": 7026832, + "step": 9790 + }, + { + "epoch": 20.363825363825363, + "grad_norm": 0.24695001542568207, + "learning_rate": 4.296135397986743e-05, + "loss": 0.1412, + "num_input_tokens_seen": 7030320, + "step": 9795 + }, + { + "epoch": 20.374220374220375, + "grad_norm": 0.31802642345428467, + "learning_rate": 4.295452381028709e-05, + "loss": 0.1144, + "num_input_tokens_seen": 7033840, + "step": 9800 + }, + { + "epoch": 20.374220374220375, + "eval_loss": 0.1509167104959488, + "eval_runtime": 7.7527, + "eval_samples_per_second": 110.413, + "eval_steps_per_second": 27.603, + "num_input_tokens_seen": 7033840, + "step": 9800 + }, + { + "epoch": 20.384615384615383, + "grad_norm": 0.4734637439250946, + "learning_rate": 4.294769087189354e-05, + "loss": 0.1132, + "num_input_tokens_seen": 7037296, + "step": 9805 + }, + { + "epoch": 20.395010395010395, + "grad_norm": 0.17939208447933197, + "learning_rate": 4.294085516574052e-05, + "loss": 0.1567, + "num_input_tokens_seen": 7040944, + "step": 9810 + }, + { + "epoch": 20.405405405405407, + "grad_norm": 0.2334163337945938, + "learning_rate": 4.2934016692882176e-05, + "loss": 0.1604, + "num_input_tokens_seen": 7044592, + "step": 9815 + }, + { + "epoch": 20.415800415800415, + "grad_norm": 0.301034539937973, + "learning_rate": 4.292717545437308e-05, + "loss": 0.1561, + "num_input_tokens_seen": 7048048, + "step": 9820 + }, + { + "epoch": 20.426195426195427, + "grad_norm": 0.2292051911354065, + "learning_rate": 4.292033145126825e-05, + "loss": 0.1053, + "num_input_tokens_seen": 7051632, + "step": 9825 + }, + { + "epoch": 20.436590436590436, + "grad_norm": 0.854910135269165, + "learning_rate": 4.29134846846231e-05, + "loss": 0.1558, + "num_input_tokens_seen": 7055216, + "step": 9830 + }, + { + "epoch": 20.446985446985448, + "grad_norm": 0.32050442695617676, + "learning_rate": 4.29066351554935e-05, + "loss": 0.1083, + "num_input_tokens_seen": 7058832, + "step": 9835 + }, + { + "epoch": 20.457380457380456, + "grad_norm": 0.8249935507774353, + "learning_rate": 4.289978286493574e-05, + "loss": 0.1271, + "num_input_tokens_seen": 7062608, + "step": 9840 + }, + { + "epoch": 20.467775467775468, + "grad_norm": 0.24449753761291504, + "learning_rate": 4.28929278140065e-05, + "loss": 0.1092, + "num_input_tokens_seen": 7066448, + "step": 9845 + }, + { + "epoch": 20.47817047817048, + "grad_norm": 0.30164635181427, + "learning_rate": 4.288607000376295e-05, + "loss": 0.1603, + "num_input_tokens_seen": 7070096, + "step": 9850 + }, + { + "epoch": 20.488565488565488, + "grad_norm": 0.23067861795425415, + "learning_rate": 4.2879209435262624e-05, + "loss": 0.1266, + "num_input_tokens_seen": 7073808, + "step": 9855 + }, + { + "epoch": 20.4989604989605, + "grad_norm": 0.33774518966674805, + "learning_rate": 4.287234610956353e-05, + "loss": 0.1241, + "num_input_tokens_seen": 7077360, + "step": 9860 + }, + { + "epoch": 20.509355509355508, + "grad_norm": 0.31871896982192993, + "learning_rate": 4.2865480027724056e-05, + "loss": 0.1085, + "num_input_tokens_seen": 7080912, + "step": 9865 + }, + { + "epoch": 20.51975051975052, + "grad_norm": 0.2298680990934372, + "learning_rate": 4.285861119080306e-05, + "loss": 0.0892, + "num_input_tokens_seen": 7084400, + "step": 9870 + }, + { + "epoch": 20.53014553014553, + "grad_norm": 0.3797898292541504, + "learning_rate": 4.2851739599859784e-05, + "loss": 0.1201, + "num_input_tokens_seen": 7087888, + "step": 9875 + }, + { + "epoch": 20.54054054054054, + "grad_norm": 0.2736720144748688, + "learning_rate": 4.2844865255953934e-05, + "loss": 0.136, + "num_input_tokens_seen": 7091472, + "step": 9880 + }, + { + "epoch": 20.550935550935552, + "grad_norm": 0.1848464012145996, + "learning_rate": 4.2837988160145605e-05, + "loss": 0.118, + "num_input_tokens_seen": 7094928, + "step": 9885 + }, + { + "epoch": 20.56133056133056, + "grad_norm": 0.6259697675704956, + "learning_rate": 4.2831108313495336e-05, + "loss": 0.1002, + "num_input_tokens_seen": 7098576, + "step": 9890 + }, + { + "epoch": 20.571725571725572, + "grad_norm": 0.5459137558937073, + "learning_rate": 4.282422571706408e-05, + "loss": 0.1098, + "num_input_tokens_seen": 7102096, + "step": 9895 + }, + { + "epoch": 20.58212058212058, + "grad_norm": 0.8980330228805542, + "learning_rate": 4.281734037191323e-05, + "loss": 0.1111, + "num_input_tokens_seen": 7105616, + "step": 9900 + }, + { + "epoch": 20.592515592515593, + "grad_norm": 0.4214492440223694, + "learning_rate": 4.281045227910459e-05, + "loss": 0.1636, + "num_input_tokens_seen": 7109168, + "step": 9905 + }, + { + "epoch": 20.602910602910605, + "grad_norm": 0.2937159538269043, + "learning_rate": 4.280356143970038e-05, + "loss": 0.1159, + "num_input_tokens_seen": 7112848, + "step": 9910 + }, + { + "epoch": 20.613305613305613, + "grad_norm": 0.22168298065662384, + "learning_rate": 4.279666785476327e-05, + "loss": 0.102, + "num_input_tokens_seen": 7116432, + "step": 9915 + }, + { + "epoch": 20.623700623700625, + "grad_norm": 0.30024492740631104, + "learning_rate": 4.2789771525356325e-05, + "loss": 0.1393, + "num_input_tokens_seen": 7119952, + "step": 9920 + }, + { + "epoch": 20.634095634095633, + "grad_norm": 0.43162301182746887, + "learning_rate": 4.2782872452543056e-05, + "loss": 0.1299, + "num_input_tokens_seen": 7123568, + "step": 9925 + }, + { + "epoch": 20.644490644490645, + "grad_norm": 0.4080905616283417, + "learning_rate": 4.2775970637387376e-05, + "loss": 0.15, + "num_input_tokens_seen": 7127280, + "step": 9930 + }, + { + "epoch": 20.654885654885653, + "grad_norm": 0.23404955863952637, + "learning_rate": 4.276906608095363e-05, + "loss": 0.0846, + "num_input_tokens_seen": 7130800, + "step": 9935 + }, + { + "epoch": 20.665280665280665, + "grad_norm": 0.5215165615081787, + "learning_rate": 4.276215878430661e-05, + "loss": 0.0964, + "num_input_tokens_seen": 7134288, + "step": 9940 + }, + { + "epoch": 20.675675675675677, + "grad_norm": 0.2337476760149002, + "learning_rate": 4.275524874851149e-05, + "loss": 0.1319, + "num_input_tokens_seen": 7137904, + "step": 9945 + }, + { + "epoch": 20.686070686070686, + "grad_norm": 0.307748943567276, + "learning_rate": 4.274833597463388e-05, + "loss": 0.1093, + "num_input_tokens_seen": 7141520, + "step": 9950 + }, + { + "epoch": 20.696465696465697, + "grad_norm": 0.3471037447452545, + "learning_rate": 4.2741420463739824e-05, + "loss": 0.1203, + "num_input_tokens_seen": 7145072, + "step": 9955 + }, + { + "epoch": 20.706860706860706, + "grad_norm": 0.23517267405986786, + "learning_rate": 4.273450221689578e-05, + "loss": 0.1247, + "num_input_tokens_seen": 7148720, + "step": 9960 + }, + { + "epoch": 20.717255717255718, + "grad_norm": 0.4547646939754486, + "learning_rate": 4.272758123516863e-05, + "loss": 0.1149, + "num_input_tokens_seen": 7152368, + "step": 9965 + }, + { + "epoch": 20.727650727650726, + "grad_norm": 0.3631754219532013, + "learning_rate": 4.272065751962567e-05, + "loss": 0.1164, + "num_input_tokens_seen": 7155856, + "step": 9970 + }, + { + "epoch": 20.738045738045738, + "grad_norm": 0.28920409083366394, + "learning_rate": 4.271373107133464e-05, + "loss": 0.101, + "num_input_tokens_seen": 7159344, + "step": 9975 + }, + { + "epoch": 20.74844074844075, + "grad_norm": 0.20755863189697266, + "learning_rate": 4.270680189136366e-05, + "loss": 0.1486, + "num_input_tokens_seen": 7162960, + "step": 9980 + }, + { + "epoch": 20.758835758835758, + "grad_norm": 0.18205039203166962, + "learning_rate": 4.269986998078132e-05, + "loss": 0.1229, + "num_input_tokens_seen": 7166512, + "step": 9985 + }, + { + "epoch": 20.76923076923077, + "grad_norm": 0.2556726932525635, + "learning_rate": 4.2692935340656595e-05, + "loss": 0.1191, + "num_input_tokens_seen": 7169968, + "step": 9990 + }, + { + "epoch": 20.77962577962578, + "grad_norm": 0.2541142404079437, + "learning_rate": 4.26859979720589e-05, + "loss": 0.1309, + "num_input_tokens_seen": 7173552, + "step": 9995 + }, + { + "epoch": 20.79002079002079, + "grad_norm": 0.38699060678482056, + "learning_rate": 4.267905787605806e-05, + "loss": 0.1154, + "num_input_tokens_seen": 7177136, + "step": 10000 + }, + { + "epoch": 20.79002079002079, + "eval_loss": 0.14724312722682953, + "eval_runtime": 7.7579, + "eval_samples_per_second": 110.339, + "eval_steps_per_second": 27.585, + "num_input_tokens_seen": 7177136, + "step": 10000 + }, + { + "epoch": 20.8004158004158, + "grad_norm": 0.6623563766479492, + "learning_rate": 4.267211505372433e-05, + "loss": 0.1317, + "num_input_tokens_seen": 7180784, + "step": 10005 + }, + { + "epoch": 20.81081081081081, + "grad_norm": 0.21080851554870605, + "learning_rate": 4.266516950612837e-05, + "loss": 0.1497, + "num_input_tokens_seen": 7184304, + "step": 10010 + }, + { + "epoch": 20.821205821205822, + "grad_norm": 0.2572473883628845, + "learning_rate": 4.265822123434128e-05, + "loss": 0.1124, + "num_input_tokens_seen": 7187888, + "step": 10015 + }, + { + "epoch": 20.83160083160083, + "grad_norm": 0.23407183587551117, + "learning_rate": 4.265127023943457e-05, + "loss": 0.1501, + "num_input_tokens_seen": 7191536, + "step": 10020 + }, + { + "epoch": 20.841995841995843, + "grad_norm": 0.3809235095977783, + "learning_rate": 4.2644316522480176e-05, + "loss": 0.1349, + "num_input_tokens_seen": 7195152, + "step": 10025 + }, + { + "epoch": 20.85239085239085, + "grad_norm": 0.4784112572669983, + "learning_rate": 4.263736008455044e-05, + "loss": 0.0998, + "num_input_tokens_seen": 7198832, + "step": 10030 + }, + { + "epoch": 20.862785862785863, + "grad_norm": 0.44024166464805603, + "learning_rate": 4.2630400926718125e-05, + "loss": 0.1398, + "num_input_tokens_seen": 7202480, + "step": 10035 + }, + { + "epoch": 20.873180873180875, + "grad_norm": 0.22741863131523132, + "learning_rate": 4.262343905005644e-05, + "loss": 0.109, + "num_input_tokens_seen": 7206032, + "step": 10040 + }, + { + "epoch": 20.883575883575883, + "grad_norm": 0.17296485602855682, + "learning_rate": 4.261647445563897e-05, + "loss": 0.1585, + "num_input_tokens_seen": 7209712, + "step": 10045 + }, + { + "epoch": 20.893970893970895, + "grad_norm": 0.18708395957946777, + "learning_rate": 4.260950714453976e-05, + "loss": 0.1214, + "num_input_tokens_seen": 7213296, + "step": 10050 + }, + { + "epoch": 20.904365904365903, + "grad_norm": 0.2480963170528412, + "learning_rate": 4.2602537117833266e-05, + "loss": 0.0988, + "num_input_tokens_seen": 7216880, + "step": 10055 + }, + { + "epoch": 20.914760914760915, + "grad_norm": 0.2404201179742813, + "learning_rate": 4.259556437659433e-05, + "loss": 0.1082, + "num_input_tokens_seen": 7220368, + "step": 10060 + }, + { + "epoch": 20.925155925155924, + "grad_norm": 0.3173222839832306, + "learning_rate": 4.258858892189825e-05, + "loss": 0.1363, + "num_input_tokens_seen": 7224112, + "step": 10065 + }, + { + "epoch": 20.935550935550935, + "grad_norm": 0.3636591136455536, + "learning_rate": 4.2581610754820725e-05, + "loss": 0.1304, + "num_input_tokens_seen": 7227600, + "step": 10070 + }, + { + "epoch": 20.945945945945947, + "grad_norm": 0.3072545528411865, + "learning_rate": 4.2574629876437876e-05, + "loss": 0.1015, + "num_input_tokens_seen": 7231216, + "step": 10075 + }, + { + "epoch": 20.956340956340956, + "grad_norm": 0.3391786813735962, + "learning_rate": 4.256764628782625e-05, + "loss": 0.1187, + "num_input_tokens_seen": 7234736, + "step": 10080 + }, + { + "epoch": 20.966735966735968, + "grad_norm": 0.1718297153711319, + "learning_rate": 4.256065999006279e-05, + "loss": 0.0985, + "num_input_tokens_seen": 7238416, + "step": 10085 + }, + { + "epoch": 20.977130977130976, + "grad_norm": 0.22172273695468903, + "learning_rate": 4.2553670984224885e-05, + "loss": 0.149, + "num_input_tokens_seen": 7241872, + "step": 10090 + }, + { + "epoch": 20.987525987525988, + "grad_norm": 0.37720170617103577, + "learning_rate": 4.254667927139032e-05, + "loss": 0.1128, + "num_input_tokens_seen": 7245456, + "step": 10095 + }, + { + "epoch": 20.997920997921, + "grad_norm": 0.1988500952720642, + "learning_rate": 4.2539684852637295e-05, + "loss": 0.1326, + "num_input_tokens_seen": 7249040, + "step": 10100 + }, + { + "epoch": 21.008316008316008, + "grad_norm": 0.17418897151947021, + "learning_rate": 4.253268772904446e-05, + "loss": 0.0847, + "num_input_tokens_seen": 7252488, + "step": 10105 + }, + { + "epoch": 21.01871101871102, + "grad_norm": 0.4755983352661133, + "learning_rate": 4.252568790169085e-05, + "loss": 0.1304, + "num_input_tokens_seen": 7255976, + "step": 10110 + }, + { + "epoch": 21.02910602910603, + "grad_norm": 0.235621377825737, + "learning_rate": 4.251868537165592e-05, + "loss": 0.136, + "num_input_tokens_seen": 7259816, + "step": 10115 + }, + { + "epoch": 21.03950103950104, + "grad_norm": 0.2274366021156311, + "learning_rate": 4.251168014001955e-05, + "loss": 0.0867, + "num_input_tokens_seen": 7263496, + "step": 10120 + }, + { + "epoch": 21.04989604989605, + "grad_norm": 0.25313177704811096, + "learning_rate": 4.250467220786204e-05, + "loss": 0.1166, + "num_input_tokens_seen": 7266920, + "step": 10125 + }, + { + "epoch": 21.06029106029106, + "grad_norm": 0.2673417925834656, + "learning_rate": 4.249766157626409e-05, + "loss": 0.1081, + "num_input_tokens_seen": 7270440, + "step": 10130 + }, + { + "epoch": 21.070686070686072, + "grad_norm": 0.13586574792861938, + "learning_rate": 4.249064824630684e-05, + "loss": 0.1, + "num_input_tokens_seen": 7273960, + "step": 10135 + }, + { + "epoch": 21.08108108108108, + "grad_norm": 0.22333917021751404, + "learning_rate": 4.248363221907183e-05, + "loss": 0.1195, + "num_input_tokens_seen": 7277544, + "step": 10140 + }, + { + "epoch": 21.091476091476093, + "grad_norm": 0.4050735831260681, + "learning_rate": 4.2476613495641026e-05, + "loss": 0.1113, + "num_input_tokens_seen": 7281096, + "step": 10145 + }, + { + "epoch": 21.1018711018711, + "grad_norm": 0.16709256172180176, + "learning_rate": 4.246959207709679e-05, + "loss": 0.0978, + "num_input_tokens_seen": 7284552, + "step": 10150 + }, + { + "epoch": 21.112266112266113, + "grad_norm": 0.3567046821117401, + "learning_rate": 4.246256796452192e-05, + "loss": 0.1237, + "num_input_tokens_seen": 7288264, + "step": 10155 + }, + { + "epoch": 21.12266112266112, + "grad_norm": 0.1911902278661728, + "learning_rate": 4.245554115899962e-05, + "loss": 0.0907, + "num_input_tokens_seen": 7291784, + "step": 10160 + }, + { + "epoch": 21.133056133056133, + "grad_norm": 0.2516063153743744, + "learning_rate": 4.2448511661613514e-05, + "loss": 0.1202, + "num_input_tokens_seen": 7295496, + "step": 10165 + }, + { + "epoch": 21.143451143451145, + "grad_norm": 0.6631391048431396, + "learning_rate": 4.2441479473447635e-05, + "loss": 0.1393, + "num_input_tokens_seen": 7299016, + "step": 10170 + }, + { + "epoch": 21.153846153846153, + "grad_norm": 0.2420254498720169, + "learning_rate": 4.243444459558644e-05, + "loss": 0.132, + "num_input_tokens_seen": 7302632, + "step": 10175 + }, + { + "epoch": 21.164241164241165, + "grad_norm": 0.4776613712310791, + "learning_rate": 4.24274070291148e-05, + "loss": 0.1188, + "num_input_tokens_seen": 7306280, + "step": 10180 + }, + { + "epoch": 21.174636174636174, + "grad_norm": 0.223314106464386, + "learning_rate": 4.242036677511798e-05, + "loss": 0.1675, + "num_input_tokens_seen": 7309832, + "step": 10185 + }, + { + "epoch": 21.185031185031185, + "grad_norm": 0.33540892601013184, + "learning_rate": 4.241332383468169e-05, + "loss": 0.1259, + "num_input_tokens_seen": 7313288, + "step": 10190 + }, + { + "epoch": 21.195426195426194, + "grad_norm": 0.15462151169776917, + "learning_rate": 4.2406278208892034e-05, + "loss": 0.1316, + "num_input_tokens_seen": 7316808, + "step": 10195 + }, + { + "epoch": 21.205821205821206, + "grad_norm": 0.5420085787773132, + "learning_rate": 4.2399229898835536e-05, + "loss": 0.1629, + "num_input_tokens_seen": 7320168, + "step": 10200 + }, + { + "epoch": 21.205821205821206, + "eval_loss": 0.14695754647254944, + "eval_runtime": 7.7538, + "eval_samples_per_second": 110.398, + "eval_steps_per_second": 27.599, + "num_input_tokens_seen": 7320168, + "step": 10200 + }, + { + "epoch": 21.216216216216218, + "grad_norm": 0.5341407060623169, + "learning_rate": 4.239217890559914e-05, + "loss": 0.1068, + "num_input_tokens_seen": 7323912, + "step": 10205 + }, + { + "epoch": 21.226611226611226, + "grad_norm": 0.24962855875492096, + "learning_rate": 4.238512523027019e-05, + "loss": 0.124, + "num_input_tokens_seen": 7327528, + "step": 10210 + }, + { + "epoch": 21.237006237006238, + "grad_norm": 0.45126864314079285, + "learning_rate": 4.237806887393645e-05, + "loss": 0.1048, + "num_input_tokens_seen": 7331112, + "step": 10215 + }, + { + "epoch": 21.247401247401246, + "grad_norm": 0.4890509247779846, + "learning_rate": 4.237100983768611e-05, + "loss": 0.1733, + "num_input_tokens_seen": 7334664, + "step": 10220 + }, + { + "epoch": 21.257796257796258, + "grad_norm": 0.5610995888710022, + "learning_rate": 4.2363948122607756e-05, + "loss": 0.1423, + "num_input_tokens_seen": 7338344, + "step": 10225 + }, + { + "epoch": 21.26819126819127, + "grad_norm": 0.2627185881137848, + "learning_rate": 4.235688372979039e-05, + "loss": 0.0897, + "num_input_tokens_seen": 7341960, + "step": 10230 + }, + { + "epoch": 21.27858627858628, + "grad_norm": 0.4567221701145172, + "learning_rate": 4.234981666032343e-05, + "loss": 0.1166, + "num_input_tokens_seen": 7345576, + "step": 10235 + }, + { + "epoch": 21.28898128898129, + "grad_norm": 0.5607088208198547, + "learning_rate": 4.2342746915296704e-05, + "loss": 0.1193, + "num_input_tokens_seen": 7349096, + "step": 10240 + }, + { + "epoch": 21.2993762993763, + "grad_norm": 0.36236825585365295, + "learning_rate": 4.233567449580047e-05, + "loss": 0.1032, + "num_input_tokens_seen": 7352520, + "step": 10245 + }, + { + "epoch": 21.30977130977131, + "grad_norm": 0.27828142046928406, + "learning_rate": 4.232859940292537e-05, + "loss": 0.1341, + "num_input_tokens_seen": 7356104, + "step": 10250 + }, + { + "epoch": 21.32016632016632, + "grad_norm": 0.34001195430755615, + "learning_rate": 4.232152163776248e-05, + "loss": 0.1431, + "num_input_tokens_seen": 7359592, + "step": 10255 + }, + { + "epoch": 21.33056133056133, + "grad_norm": 0.31307676434516907, + "learning_rate": 4.231444120140328e-05, + "loss": 0.1315, + "num_input_tokens_seen": 7363112, + "step": 10260 + }, + { + "epoch": 21.340956340956343, + "grad_norm": 0.13462428748607635, + "learning_rate": 4.230735809493967e-05, + "loss": 0.0879, + "num_input_tokens_seen": 7366632, + "step": 10265 + }, + { + "epoch": 21.35135135135135, + "grad_norm": 0.14849887788295746, + "learning_rate": 4.2300272319463926e-05, + "loss": 0.1268, + "num_input_tokens_seen": 7370216, + "step": 10270 + }, + { + "epoch": 21.361746361746363, + "grad_norm": 0.2890027165412903, + "learning_rate": 4.2293183876068786e-05, + "loss": 0.1089, + "num_input_tokens_seen": 7373992, + "step": 10275 + }, + { + "epoch": 21.37214137214137, + "grad_norm": 0.2007463127374649, + "learning_rate": 4.228609276584737e-05, + "loss": 0.1115, + "num_input_tokens_seen": 7377480, + "step": 10280 + }, + { + "epoch": 21.382536382536383, + "grad_norm": 0.23131221532821655, + "learning_rate": 4.227899898989323e-05, + "loss": 0.1117, + "num_input_tokens_seen": 7381064, + "step": 10285 + }, + { + "epoch": 21.39293139293139, + "grad_norm": 0.28444257378578186, + "learning_rate": 4.2271902549300293e-05, + "loss": 0.112, + "num_input_tokens_seen": 7384840, + "step": 10290 + }, + { + "epoch": 21.403326403326403, + "grad_norm": 1.0441508293151855, + "learning_rate": 4.226480344516294e-05, + "loss": 0.169, + "num_input_tokens_seen": 7388296, + "step": 10295 + }, + { + "epoch": 21.413721413721415, + "grad_norm": 0.197988361120224, + "learning_rate": 4.2257701678575925e-05, + "loss": 0.1157, + "num_input_tokens_seen": 7391944, + "step": 10300 + }, + { + "epoch": 21.424116424116423, + "grad_norm": 0.467063844203949, + "learning_rate": 4.225059725063444e-05, + "loss": 0.1294, + "num_input_tokens_seen": 7395400, + "step": 10305 + }, + { + "epoch": 21.434511434511435, + "grad_norm": 0.19179581105709076, + "learning_rate": 4.2243490162434074e-05, + "loss": 0.1246, + "num_input_tokens_seen": 7399016, + "step": 10310 + }, + { + "epoch": 21.444906444906444, + "grad_norm": 0.2663164436817169, + "learning_rate": 4.223638041507083e-05, + "loss": 0.1419, + "num_input_tokens_seen": 7402568, + "step": 10315 + }, + { + "epoch": 21.455301455301456, + "grad_norm": 0.19549624621868134, + "learning_rate": 4.2229268009641124e-05, + "loss": 0.1305, + "num_input_tokens_seen": 7406216, + "step": 10320 + }, + { + "epoch": 21.465696465696467, + "grad_norm": 0.18999478220939636, + "learning_rate": 4.222215294724177e-05, + "loss": 0.0994, + "num_input_tokens_seen": 7410120, + "step": 10325 + }, + { + "epoch": 21.476091476091476, + "grad_norm": 0.3753526210784912, + "learning_rate": 4.2215035228970005e-05, + "loss": 0.134, + "num_input_tokens_seen": 7413800, + "step": 10330 + }, + { + "epoch": 21.486486486486488, + "grad_norm": 0.3681851625442505, + "learning_rate": 4.2207914855923464e-05, + "loss": 0.1081, + "num_input_tokens_seen": 7417384, + "step": 10335 + }, + { + "epoch": 21.496881496881496, + "grad_norm": 0.19582325220108032, + "learning_rate": 4.220079182920021e-05, + "loss": 0.0877, + "num_input_tokens_seen": 7421096, + "step": 10340 + }, + { + "epoch": 21.507276507276508, + "grad_norm": 0.2504428029060364, + "learning_rate": 4.2193666149898705e-05, + "loss": 0.1386, + "num_input_tokens_seen": 7424808, + "step": 10345 + }, + { + "epoch": 21.517671517671516, + "grad_norm": 0.2424832284450531, + "learning_rate": 4.21865378191178e-05, + "loss": 0.1403, + "num_input_tokens_seen": 7428360, + "step": 10350 + }, + { + "epoch": 21.528066528066528, + "grad_norm": 0.2231181412935257, + "learning_rate": 4.217940683795678e-05, + "loss": 0.1887, + "num_input_tokens_seen": 7431976, + "step": 10355 + }, + { + "epoch": 21.53846153846154, + "grad_norm": 0.43250179290771484, + "learning_rate": 4.217227320751534e-05, + "loss": 0.159, + "num_input_tokens_seen": 7435464, + "step": 10360 + }, + { + "epoch": 21.54885654885655, + "grad_norm": 0.4152504503726959, + "learning_rate": 4.216513692889358e-05, + "loss": 0.1008, + "num_input_tokens_seen": 7439144, + "step": 10365 + }, + { + "epoch": 21.55925155925156, + "grad_norm": 0.20960572361946106, + "learning_rate": 4.215799800319199e-05, + "loss": 0.1461, + "num_input_tokens_seen": 7442664, + "step": 10370 + }, + { + "epoch": 21.56964656964657, + "grad_norm": 0.20803706347942352, + "learning_rate": 4.2150856431511485e-05, + "loss": 0.0913, + "num_input_tokens_seen": 7446120, + "step": 10375 + }, + { + "epoch": 21.58004158004158, + "grad_norm": 0.41112151741981506, + "learning_rate": 4.214371221495339e-05, + "loss": 0.1415, + "num_input_tokens_seen": 7449768, + "step": 10380 + }, + { + "epoch": 21.59043659043659, + "grad_norm": 0.3832927644252777, + "learning_rate": 4.213656535461942e-05, + "loss": 0.0948, + "num_input_tokens_seen": 7453384, + "step": 10385 + }, + { + "epoch": 21.6008316008316, + "grad_norm": 0.2400759756565094, + "learning_rate": 4.2129415851611734e-05, + "loss": 0.1293, + "num_input_tokens_seen": 7457032, + "step": 10390 + }, + { + "epoch": 21.611226611226613, + "grad_norm": 0.16728521883487701, + "learning_rate": 4.2122263707032855e-05, + "loss": 0.1496, + "num_input_tokens_seen": 7460648, + "step": 10395 + }, + { + "epoch": 21.62162162162162, + "grad_norm": 0.22790572047233582, + "learning_rate": 4.211510892198574e-05, + "loss": 0.114, + "num_input_tokens_seen": 7464136, + "step": 10400 + }, + { + "epoch": 21.62162162162162, + "eval_loss": 0.14996081590652466, + "eval_runtime": 7.7463, + "eval_samples_per_second": 110.504, + "eval_steps_per_second": 27.626, + "num_input_tokens_seen": 7464136, + "step": 10400 + }, + { + "epoch": 21.632016632016633, + "grad_norm": 0.2713649868965149, + "learning_rate": 4.210795149757375e-05, + "loss": 0.1383, + "num_input_tokens_seen": 7467816, + "step": 10405 + }, + { + "epoch": 21.64241164241164, + "grad_norm": 0.18683089315891266, + "learning_rate": 4.210079143490065e-05, + "loss": 0.0772, + "num_input_tokens_seen": 7471464, + "step": 10410 + }, + { + "epoch": 21.652806652806653, + "grad_norm": 0.32271549105644226, + "learning_rate": 4.2093628735070604e-05, + "loss": 0.1111, + "num_input_tokens_seen": 7475176, + "step": 10415 + }, + { + "epoch": 21.66320166320166, + "grad_norm": 0.2997967302799225, + "learning_rate": 4.208646339918819e-05, + "loss": 0.1291, + "num_input_tokens_seen": 7478856, + "step": 10420 + }, + { + "epoch": 21.673596673596673, + "grad_norm": 0.16849663853645325, + "learning_rate": 4.2079295428358414e-05, + "loss": 0.1272, + "num_input_tokens_seen": 7482280, + "step": 10425 + }, + { + "epoch": 21.683991683991685, + "grad_norm": 0.14277009665966034, + "learning_rate": 4.207212482368664e-05, + "loss": 0.1391, + "num_input_tokens_seen": 7485736, + "step": 10430 + }, + { + "epoch": 21.694386694386694, + "grad_norm": 0.5531315207481384, + "learning_rate": 4.206495158627867e-05, + "loss": 0.1123, + "num_input_tokens_seen": 7489288, + "step": 10435 + }, + { + "epoch": 21.704781704781706, + "grad_norm": 0.23673099279403687, + "learning_rate": 4.205777571724073e-05, + "loss": 0.098, + "num_input_tokens_seen": 7492808, + "step": 10440 + }, + { + "epoch": 21.715176715176714, + "grad_norm": 0.19377486407756805, + "learning_rate": 4.20505972176794e-05, + "loss": 0.088, + "num_input_tokens_seen": 7496424, + "step": 10445 + }, + { + "epoch": 21.725571725571726, + "grad_norm": 0.3093172609806061, + "learning_rate": 4.204341608870171e-05, + "loss": 0.1236, + "num_input_tokens_seen": 7500232, + "step": 10450 + }, + { + "epoch": 21.735966735966738, + "grad_norm": 0.2637651562690735, + "learning_rate": 4.203623233141508e-05, + "loss": 0.107, + "num_input_tokens_seen": 7503752, + "step": 10455 + }, + { + "epoch": 21.746361746361746, + "grad_norm": 0.5043386816978455, + "learning_rate": 4.2029045946927334e-05, + "loss": 0.0991, + "num_input_tokens_seen": 7507240, + "step": 10460 + }, + { + "epoch": 21.756756756756758, + "grad_norm": 0.3908558785915375, + "learning_rate": 4.20218569363467e-05, + "loss": 0.1179, + "num_input_tokens_seen": 7510792, + "step": 10465 + }, + { + "epoch": 21.767151767151766, + "grad_norm": 0.23476287722587585, + "learning_rate": 4.2014665300781834e-05, + "loss": 0.1482, + "num_input_tokens_seen": 7514408, + "step": 10470 + }, + { + "epoch": 21.777546777546778, + "grad_norm": 0.4666488468647003, + "learning_rate": 4.200747104134174e-05, + "loss": 0.1658, + "num_input_tokens_seen": 7518024, + "step": 10475 + }, + { + "epoch": 21.787941787941786, + "grad_norm": 0.42379480600357056, + "learning_rate": 4.200027415913588e-05, + "loss": 0.1352, + "num_input_tokens_seen": 7521608, + "step": 10480 + }, + { + "epoch": 21.7983367983368, + "grad_norm": 0.33154696226119995, + "learning_rate": 4.1993074655274126e-05, + "loss": 0.1763, + "num_input_tokens_seen": 7525256, + "step": 10485 + }, + { + "epoch": 21.80873180873181, + "grad_norm": 0.6436675786972046, + "learning_rate": 4.198587253086669e-05, + "loss": 0.1235, + "num_input_tokens_seen": 7528680, + "step": 10490 + }, + { + "epoch": 21.81912681912682, + "grad_norm": 0.5661633610725403, + "learning_rate": 4.197866778702426e-05, + "loss": 0.1035, + "num_input_tokens_seen": 7532360, + "step": 10495 + }, + { + "epoch": 21.82952182952183, + "grad_norm": 0.1942034363746643, + "learning_rate": 4.197146042485789e-05, + "loss": 0.0777, + "num_input_tokens_seen": 7535848, + "step": 10500 + }, + { + "epoch": 21.83991683991684, + "grad_norm": 0.2661137580871582, + "learning_rate": 4.1964250445479046e-05, + "loss": 0.1217, + "num_input_tokens_seen": 7539496, + "step": 10505 + }, + { + "epoch": 21.85031185031185, + "grad_norm": 0.3152462840080261, + "learning_rate": 4.19570378499996e-05, + "loss": 0.1205, + "num_input_tokens_seen": 7543208, + "step": 10510 + }, + { + "epoch": 21.86070686070686, + "grad_norm": 0.24122871458530426, + "learning_rate": 4.194982263953182e-05, + "loss": 0.1496, + "num_input_tokens_seen": 7546792, + "step": 10515 + }, + { + "epoch": 21.87110187110187, + "grad_norm": 0.21301165223121643, + "learning_rate": 4.194260481518838e-05, + "loss": 0.1309, + "num_input_tokens_seen": 7550280, + "step": 10520 + }, + { + "epoch": 21.881496881496883, + "grad_norm": 0.4460885524749756, + "learning_rate": 4.1935384378082366e-05, + "loss": 0.1351, + "num_input_tokens_seen": 7553992, + "step": 10525 + }, + { + "epoch": 21.89189189189189, + "grad_norm": 0.43847405910491943, + "learning_rate": 4.1928161329327267e-05, + "loss": 0.1292, + "num_input_tokens_seen": 7557608, + "step": 10530 + }, + { + "epoch": 21.902286902286903, + "grad_norm": 0.2492506206035614, + "learning_rate": 4.1920935670036945e-05, + "loss": 0.1571, + "num_input_tokens_seen": 7561160, + "step": 10535 + }, + { + "epoch": 21.91268191268191, + "grad_norm": 0.3508757948875427, + "learning_rate": 4.1913707401325705e-05, + "loss": 0.1208, + "num_input_tokens_seen": 7564808, + "step": 10540 + }, + { + "epoch": 21.923076923076923, + "grad_norm": 0.6333933472633362, + "learning_rate": 4.1906476524308235e-05, + "loss": 0.1047, + "num_input_tokens_seen": 7568328, + "step": 10545 + }, + { + "epoch": 21.933471933471935, + "grad_norm": 0.5708673000335693, + "learning_rate": 4.189924304009962e-05, + "loss": 0.1544, + "num_input_tokens_seen": 7571880, + "step": 10550 + }, + { + "epoch": 21.943866943866944, + "grad_norm": 0.3198999762535095, + "learning_rate": 4.189200694981537e-05, + "loss": 0.1531, + "num_input_tokens_seen": 7575688, + "step": 10555 + }, + { + "epoch": 21.954261954261955, + "grad_norm": 0.5458848476409912, + "learning_rate": 4.188476825457136e-05, + "loss": 0.132, + "num_input_tokens_seen": 7579240, + "step": 10560 + }, + { + "epoch": 21.964656964656964, + "grad_norm": 0.20929758250713348, + "learning_rate": 4.18775269554839e-05, + "loss": 0.1118, + "num_input_tokens_seen": 7582792, + "step": 10565 + }, + { + "epoch": 21.975051975051976, + "grad_norm": 0.2882833182811737, + "learning_rate": 4.187028305366969e-05, + "loss": 0.1134, + "num_input_tokens_seen": 7586344, + "step": 10570 + }, + { + "epoch": 21.985446985446984, + "grad_norm": 0.4223215878009796, + "learning_rate": 4.1863036550245824e-05, + "loss": 0.1796, + "num_input_tokens_seen": 7589800, + "step": 10575 + }, + { + "epoch": 21.995841995841996, + "grad_norm": 0.21729148924350739, + "learning_rate": 4.1855787446329806e-05, + "loss": 0.0895, + "num_input_tokens_seen": 7593320, + "step": 10580 + }, + { + "epoch": 22.006237006237008, + "grad_norm": 0.29058995842933655, + "learning_rate": 4.184853574303955e-05, + "loss": 0.1278, + "num_input_tokens_seen": 7596776, + "step": 10585 + }, + { + "epoch": 22.016632016632016, + "grad_norm": 0.5804756283760071, + "learning_rate": 4.184128144149334e-05, + "loss": 0.1582, + "num_input_tokens_seen": 7600456, + "step": 10590 + }, + { + "epoch": 22.027027027027028, + "grad_norm": 0.2905721366405487, + "learning_rate": 4.1834024542809896e-05, + "loss": 0.1451, + "num_input_tokens_seen": 7603944, + "step": 10595 + }, + { + "epoch": 22.037422037422036, + "grad_norm": 0.3304908871650696, + "learning_rate": 4.1826765048108315e-05, + "loss": 0.1185, + "num_input_tokens_seen": 7607816, + "step": 10600 + }, + { + "epoch": 22.037422037422036, + "eval_loss": 0.14494797587394714, + "eval_runtime": 7.7508, + "eval_samples_per_second": 110.44, + "eval_steps_per_second": 27.61, + "num_input_tokens_seen": 7607816, + "step": 10600 + }, + { + "epoch": 22.04781704781705, + "grad_norm": 0.2358291745185852, + "learning_rate": 4.181950295850811e-05, + "loss": 0.0992, + "num_input_tokens_seen": 7611432, + "step": 10605 + }, + { + "epoch": 22.058212058212057, + "grad_norm": 0.1807708889245987, + "learning_rate": 4.181223827512918e-05, + "loss": 0.1452, + "num_input_tokens_seen": 7614920, + "step": 10610 + }, + { + "epoch": 22.06860706860707, + "grad_norm": 0.4907183349132538, + "learning_rate": 4.180497099909183e-05, + "loss": 0.1256, + "num_input_tokens_seen": 7618504, + "step": 10615 + }, + { + "epoch": 22.07900207900208, + "grad_norm": 0.2483902871608734, + "learning_rate": 4.179770113151677e-05, + "loss": 0.1033, + "num_input_tokens_seen": 7622056, + "step": 10620 + }, + { + "epoch": 22.08939708939709, + "grad_norm": 0.2495504766702652, + "learning_rate": 4.179042867352511e-05, + "loss": 0.1258, + "num_input_tokens_seen": 7625896, + "step": 10625 + }, + { + "epoch": 22.0997920997921, + "grad_norm": 0.2173927277326584, + "learning_rate": 4.1783153626238334e-05, + "loss": 0.1269, + "num_input_tokens_seen": 7629448, + "step": 10630 + }, + { + "epoch": 22.11018711018711, + "grad_norm": 0.2672603726387024, + "learning_rate": 4.177587599077836e-05, + "loss": 0.1347, + "num_input_tokens_seen": 7632936, + "step": 10635 + }, + { + "epoch": 22.12058212058212, + "grad_norm": 0.15699008107185364, + "learning_rate": 4.1768595768267494e-05, + "loss": 0.0827, + "num_input_tokens_seen": 7636520, + "step": 10640 + }, + { + "epoch": 22.13097713097713, + "grad_norm": 0.21180064976215363, + "learning_rate": 4.176131295982843e-05, + "loss": 0.097, + "num_input_tokens_seen": 7640200, + "step": 10645 + }, + { + "epoch": 22.14137214137214, + "grad_norm": 0.2012011706829071, + "learning_rate": 4.1754027566584276e-05, + "loss": 0.1129, + "num_input_tokens_seen": 7643784, + "step": 10650 + }, + { + "epoch": 22.151767151767153, + "grad_norm": 0.39720088243484497, + "learning_rate": 4.174673958965852e-05, + "loss": 0.12, + "num_input_tokens_seen": 7647208, + "step": 10655 + }, + { + "epoch": 22.16216216216216, + "grad_norm": 0.2865127623081207, + "learning_rate": 4.173944903017507e-05, + "loss": 0.1054, + "num_input_tokens_seen": 7650760, + "step": 10660 + }, + { + "epoch": 22.172557172557173, + "grad_norm": 0.31624293327331543, + "learning_rate": 4.173215588925822e-05, + "loss": 0.1286, + "num_input_tokens_seen": 7654216, + "step": 10665 + }, + { + "epoch": 22.18295218295218, + "grad_norm": 0.5231801867485046, + "learning_rate": 4.172486016803266e-05, + "loss": 0.1145, + "num_input_tokens_seen": 7657736, + "step": 10670 + }, + { + "epoch": 22.193347193347194, + "grad_norm": 0.3176431953907013, + "learning_rate": 4.171756186762349e-05, + "loss": 0.1646, + "num_input_tokens_seen": 7661448, + "step": 10675 + }, + { + "epoch": 22.203742203742205, + "grad_norm": 0.35102829337120056, + "learning_rate": 4.171026098915619e-05, + "loss": 0.1093, + "num_input_tokens_seen": 7665032, + "step": 10680 + }, + { + "epoch": 22.214137214137214, + "grad_norm": 0.13170452415943146, + "learning_rate": 4.170295753375665e-05, + "loss": 0.1355, + "num_input_tokens_seen": 7668712, + "step": 10685 + }, + { + "epoch": 22.224532224532226, + "grad_norm": 0.22080405056476593, + "learning_rate": 4.169565150255117e-05, + "loss": 0.107, + "num_input_tokens_seen": 7672232, + "step": 10690 + }, + { + "epoch": 22.234927234927234, + "grad_norm": 0.5611045360565186, + "learning_rate": 4.16883428966664e-05, + "loss": 0.1252, + "num_input_tokens_seen": 7675848, + "step": 10695 + }, + { + "epoch": 22.245322245322246, + "grad_norm": 0.139058917760849, + "learning_rate": 4.168103171722944e-05, + "loss": 0.1108, + "num_input_tokens_seen": 7679336, + "step": 10700 + }, + { + "epoch": 22.255717255717254, + "grad_norm": 0.4090261161327362, + "learning_rate": 4.167371796536777e-05, + "loss": 0.1328, + "num_input_tokens_seen": 7683048, + "step": 10705 + }, + { + "epoch": 22.266112266112266, + "grad_norm": 0.26174241304397583, + "learning_rate": 4.166640164220924e-05, + "loss": 0.1559, + "num_input_tokens_seen": 7686600, + "step": 10710 + }, + { + "epoch": 22.276507276507278, + "grad_norm": 0.17435508966445923, + "learning_rate": 4.1659082748882144e-05, + "loss": 0.1128, + "num_input_tokens_seen": 7690216, + "step": 10715 + }, + { + "epoch": 22.286902286902286, + "grad_norm": 0.20880544185638428, + "learning_rate": 4.1651761286515135e-05, + "loss": 0.0726, + "num_input_tokens_seen": 7693800, + "step": 10720 + }, + { + "epoch": 22.2972972972973, + "grad_norm": 0.41896504163742065, + "learning_rate": 4.164443725623728e-05, + "loss": 0.1286, + "num_input_tokens_seen": 7697352, + "step": 10725 + }, + { + "epoch": 22.307692307692307, + "grad_norm": 0.11527625471353531, + "learning_rate": 4.163711065917802e-05, + "loss": 0.1165, + "num_input_tokens_seen": 7700968, + "step": 10730 + }, + { + "epoch": 22.31808731808732, + "grad_norm": 0.428521066904068, + "learning_rate": 4.1629781496467234e-05, + "loss": 0.1286, + "num_input_tokens_seen": 7704776, + "step": 10735 + }, + { + "epoch": 22.328482328482327, + "grad_norm": 0.17147673666477203, + "learning_rate": 4.1622449769235164e-05, + "loss": 0.1335, + "num_input_tokens_seen": 7708424, + "step": 10740 + }, + { + "epoch": 22.33887733887734, + "grad_norm": 0.2785366475582123, + "learning_rate": 4.161511547861243e-05, + "loss": 0.1539, + "num_input_tokens_seen": 7711944, + "step": 10745 + }, + { + "epoch": 22.34927234927235, + "grad_norm": 0.5471400022506714, + "learning_rate": 4.1607778625730104e-05, + "loss": 0.139, + "num_input_tokens_seen": 7715656, + "step": 10750 + }, + { + "epoch": 22.35966735966736, + "grad_norm": 0.13166847825050354, + "learning_rate": 4.160043921171961e-05, + "loss": 0.0746, + "num_input_tokens_seen": 7719176, + "step": 10755 + }, + { + "epoch": 22.37006237006237, + "grad_norm": 0.3018263876438141, + "learning_rate": 4.159309723771276e-05, + "loss": 0.1621, + "num_input_tokens_seen": 7722600, + "step": 10760 + }, + { + "epoch": 22.38045738045738, + "grad_norm": 0.259527325630188, + "learning_rate": 4.158575270484181e-05, + "loss": 0.1391, + "num_input_tokens_seen": 7726248, + "step": 10765 + }, + { + "epoch": 22.39085239085239, + "grad_norm": 0.24468094110488892, + "learning_rate": 4.157840561423936e-05, + "loss": 0.0935, + "num_input_tokens_seen": 7729960, + "step": 10770 + }, + { + "epoch": 22.401247401247403, + "grad_norm": 0.17488443851470947, + "learning_rate": 4.1571055967038416e-05, + "loss": 0.1466, + "num_input_tokens_seen": 7733640, + "step": 10775 + }, + { + "epoch": 22.41164241164241, + "grad_norm": 0.28555747866630554, + "learning_rate": 4.156370376437241e-05, + "loss": 0.1133, + "num_input_tokens_seen": 7737192, + "step": 10780 + }, + { + "epoch": 22.422037422037423, + "grad_norm": 0.4126053750514984, + "learning_rate": 4.155634900737513e-05, + "loss": 0.1237, + "num_input_tokens_seen": 7740808, + "step": 10785 + }, + { + "epoch": 22.43243243243243, + "grad_norm": 0.28007251024246216, + "learning_rate": 4.1548991697180764e-05, + "loss": 0.1369, + "num_input_tokens_seen": 7744392, + "step": 10790 + }, + { + "epoch": 22.442827442827443, + "grad_norm": 0.33332812786102295, + "learning_rate": 4.1541631834923914e-05, + "loss": 0.0976, + "num_input_tokens_seen": 7747944, + "step": 10795 + }, + { + "epoch": 22.453222453222452, + "grad_norm": 0.18908622860908508, + "learning_rate": 4.153426942173956e-05, + "loss": 0.1286, + "num_input_tokens_seen": 7751560, + "step": 10800 + }, + { + "epoch": 22.453222453222452, + "eval_loss": 0.1436823159456253, + "eval_runtime": 7.7535, + "eval_samples_per_second": 110.401, + "eval_steps_per_second": 27.6, + "num_input_tokens_seen": 7751560, + "step": 10800 + }, + { + "epoch": 22.463617463617464, + "grad_norm": 0.6829400658607483, + "learning_rate": 4.152690445876308e-05, + "loss": 0.1177, + "num_input_tokens_seen": 7755144, + "step": 10805 + }, + { + "epoch": 22.474012474012476, + "grad_norm": 0.5448296666145325, + "learning_rate": 4.1519536947130245e-05, + "loss": 0.0896, + "num_input_tokens_seen": 7758760, + "step": 10810 + }, + { + "epoch": 22.484407484407484, + "grad_norm": 0.14242437481880188, + "learning_rate": 4.151216688797722e-05, + "loss": 0.1124, + "num_input_tokens_seen": 7762536, + "step": 10815 + }, + { + "epoch": 22.494802494802496, + "grad_norm": 0.48329856991767883, + "learning_rate": 4.150479428244054e-05, + "loss": 0.1346, + "num_input_tokens_seen": 7766120, + "step": 10820 + }, + { + "epoch": 22.505197505197504, + "grad_norm": 0.29118892550468445, + "learning_rate": 4.1497419131657176e-05, + "loss": 0.1434, + "num_input_tokens_seen": 7769672, + "step": 10825 + }, + { + "epoch": 22.515592515592516, + "grad_norm": 0.14176005125045776, + "learning_rate": 4.149004143676447e-05, + "loss": 0.0887, + "num_input_tokens_seen": 7773352, + "step": 10830 + }, + { + "epoch": 22.525987525987524, + "grad_norm": 0.334581196308136, + "learning_rate": 4.148266119890015e-05, + "loss": 0.1134, + "num_input_tokens_seen": 7776808, + "step": 10835 + }, + { + "epoch": 22.536382536382536, + "grad_norm": 0.35045212507247925, + "learning_rate": 4.1475278419202324e-05, + "loss": 0.1155, + "num_input_tokens_seen": 7780488, + "step": 10840 + }, + { + "epoch": 22.546777546777548, + "grad_norm": 0.15770956873893738, + "learning_rate": 4.146789309880953e-05, + "loss": 0.0888, + "num_input_tokens_seen": 7783944, + "step": 10845 + }, + { + "epoch": 22.557172557172557, + "grad_norm": 0.4913598895072937, + "learning_rate": 4.146050523886068e-05, + "loss": 0.1175, + "num_input_tokens_seen": 7787528, + "step": 10850 + }, + { + "epoch": 22.56756756756757, + "grad_norm": 0.2973099946975708, + "learning_rate": 4.1453114840495055e-05, + "loss": 0.1364, + "num_input_tokens_seen": 7791048, + "step": 10855 + }, + { + "epoch": 22.577962577962577, + "grad_norm": 0.21412886679172516, + "learning_rate": 4.1445721904852364e-05, + "loss": 0.1282, + "num_input_tokens_seen": 7794728, + "step": 10860 + }, + { + "epoch": 22.58835758835759, + "grad_norm": 0.42102041840553284, + "learning_rate": 4.143832643307269e-05, + "loss": 0.0933, + "num_input_tokens_seen": 7798120, + "step": 10865 + }, + { + "epoch": 22.598752598752597, + "grad_norm": 0.36536169052124023, + "learning_rate": 4.1430928426296503e-05, + "loss": 0.0948, + "num_input_tokens_seen": 7801672, + "step": 10870 + }, + { + "epoch": 22.60914760914761, + "grad_norm": 0.24463410675525665, + "learning_rate": 4.142352788566466e-05, + "loss": 0.0877, + "num_input_tokens_seen": 7805224, + "step": 10875 + }, + { + "epoch": 22.61954261954262, + "grad_norm": 0.44084885716438293, + "learning_rate": 4.1416124812318424e-05, + "loss": 0.1213, + "num_input_tokens_seen": 7808808, + "step": 10880 + }, + { + "epoch": 22.62993762993763, + "grad_norm": 0.5530669689178467, + "learning_rate": 4.1408719207399453e-05, + "loss": 0.1194, + "num_input_tokens_seen": 7812392, + "step": 10885 + }, + { + "epoch": 22.64033264033264, + "grad_norm": 0.5976298451423645, + "learning_rate": 4.140131107204978e-05, + "loss": 0.1405, + "num_input_tokens_seen": 7815912, + "step": 10890 + }, + { + "epoch": 22.65072765072765, + "grad_norm": 0.6117827296257019, + "learning_rate": 4.139390040741182e-05, + "loss": 0.1503, + "num_input_tokens_seen": 7819560, + "step": 10895 + }, + { + "epoch": 22.66112266112266, + "grad_norm": 0.21214613318443298, + "learning_rate": 4.1386487214628396e-05, + "loss": 0.1747, + "num_input_tokens_seen": 7823272, + "step": 10900 + }, + { + "epoch": 22.671517671517673, + "grad_norm": 0.22584807872772217, + "learning_rate": 4.137907149484272e-05, + "loss": 0.1008, + "num_input_tokens_seen": 7826728, + "step": 10905 + }, + { + "epoch": 22.68191268191268, + "grad_norm": 0.12605471909046173, + "learning_rate": 4.137165324919839e-05, + "loss": 0.1218, + "num_input_tokens_seen": 7830248, + "step": 10910 + }, + { + "epoch": 22.692307692307693, + "grad_norm": 0.3878920376300812, + "learning_rate": 4.136423247883939e-05, + "loss": 0.1352, + "num_input_tokens_seen": 7834088, + "step": 10915 + }, + { + "epoch": 22.7027027027027, + "grad_norm": 0.2558475136756897, + "learning_rate": 4.135680918491009e-05, + "loss": 0.1193, + "num_input_tokens_seen": 7837608, + "step": 10920 + }, + { + "epoch": 22.713097713097714, + "grad_norm": 0.19979174435138702, + "learning_rate": 4.1349383368555265e-05, + "loss": 0.075, + "num_input_tokens_seen": 7841352, + "step": 10925 + }, + { + "epoch": 22.723492723492722, + "grad_norm": 0.7629387974739075, + "learning_rate": 4.1341955030920065e-05, + "loss": 0.1296, + "num_input_tokens_seen": 7844936, + "step": 10930 + }, + { + "epoch": 22.733887733887734, + "grad_norm": 0.47827354073524475, + "learning_rate": 4.1334524173150036e-05, + "loss": 0.1257, + "num_input_tokens_seen": 7848552, + "step": 10935 + }, + { + "epoch": 22.744282744282746, + "grad_norm": 0.37972521781921387, + "learning_rate": 4.13270907963911e-05, + "loss": 0.1178, + "num_input_tokens_seen": 7852104, + "step": 10940 + }, + { + "epoch": 22.754677754677754, + "grad_norm": 0.41152194142341614, + "learning_rate": 4.131965490178959e-05, + "loss": 0.1189, + "num_input_tokens_seen": 7855848, + "step": 10945 + }, + { + "epoch": 22.765072765072766, + "grad_norm": 0.2093062847852707, + "learning_rate": 4.131221649049222e-05, + "loss": 0.1235, + "num_input_tokens_seen": 7859464, + "step": 10950 + }, + { + "epoch": 22.775467775467774, + "grad_norm": 0.3853543698787689, + "learning_rate": 4.130477556364606e-05, + "loss": 0.1302, + "num_input_tokens_seen": 7863048, + "step": 10955 + }, + { + "epoch": 22.785862785862786, + "grad_norm": 0.19770602881908417, + "learning_rate": 4.129733212239861e-05, + "loss": 0.0969, + "num_input_tokens_seen": 7866664, + "step": 10960 + }, + { + "epoch": 22.796257796257795, + "grad_norm": 0.6566045880317688, + "learning_rate": 4.128988616789774e-05, + "loss": 0.1097, + "num_input_tokens_seen": 7870184, + "step": 10965 + }, + { + "epoch": 22.806652806652806, + "grad_norm": 0.4780392646789551, + "learning_rate": 4.1282437701291724e-05, + "loss": 0.1322, + "num_input_tokens_seen": 7873864, + "step": 10970 + }, + { + "epoch": 22.81704781704782, + "grad_norm": 0.5341290831565857, + "learning_rate": 4.1274986723729184e-05, + "loss": 0.1296, + "num_input_tokens_seen": 7877512, + "step": 10975 + }, + { + "epoch": 22.827442827442827, + "grad_norm": 0.36711645126342773, + "learning_rate": 4.126753323635917e-05, + "loss": 0.1441, + "num_input_tokens_seen": 7881064, + "step": 10980 + }, + { + "epoch": 22.83783783783784, + "grad_norm": 0.4133055508136749, + "learning_rate": 4.12600772403311e-05, + "loss": 0.1624, + "num_input_tokens_seen": 7884680, + "step": 10985 + }, + { + "epoch": 22.848232848232847, + "grad_norm": 0.5607908368110657, + "learning_rate": 4.125261873679479e-05, + "loss": 0.122, + "num_input_tokens_seen": 7888136, + "step": 10990 + }, + { + "epoch": 22.85862785862786, + "grad_norm": 0.26289036870002747, + "learning_rate": 4.124515772690042e-05, + "loss": 0.1192, + "num_input_tokens_seen": 7891848, + "step": 10995 + }, + { + "epoch": 22.86902286902287, + "grad_norm": 0.11784607172012329, + "learning_rate": 4.123769421179858e-05, + "loss": 0.1344, + "num_input_tokens_seen": 7895400, + "step": 11000 + }, + { + "epoch": 22.86902286902287, + "eval_loss": 0.15105149149894714, + "eval_runtime": 7.7607, + "eval_samples_per_second": 110.299, + "eval_steps_per_second": 27.575, + "num_input_tokens_seen": 7895400, + "step": 11000 + }, + { + "epoch": 22.87941787941788, + "grad_norm": 0.16751889884471893, + "learning_rate": 4.1230228192640236e-05, + "loss": 0.1146, + "num_input_tokens_seen": 7898952, + "step": 11005 + }, + { + "epoch": 22.88981288981289, + "grad_norm": 0.2190587967634201, + "learning_rate": 4.122275967057675e-05, + "loss": 0.0982, + "num_input_tokens_seen": 7902472, + "step": 11010 + }, + { + "epoch": 22.9002079002079, + "grad_norm": 0.28377267718315125, + "learning_rate": 4.1215288646759846e-05, + "loss": 0.1735, + "num_input_tokens_seen": 7905960, + "step": 11015 + }, + { + "epoch": 22.91060291060291, + "grad_norm": 0.45189693570137024, + "learning_rate": 4.120781512234166e-05, + "loss": 0.1741, + "num_input_tokens_seen": 7909512, + "step": 11020 + }, + { + "epoch": 22.92099792099792, + "grad_norm": 0.4046567976474762, + "learning_rate": 4.120033909847471e-05, + "loss": 0.1409, + "num_input_tokens_seen": 7912936, + "step": 11025 + }, + { + "epoch": 22.93139293139293, + "grad_norm": 0.22645269334316254, + "learning_rate": 4.119286057631187e-05, + "loss": 0.1306, + "num_input_tokens_seen": 7916488, + "step": 11030 + }, + { + "epoch": 22.941787941787943, + "grad_norm": 0.36600011587142944, + "learning_rate": 4.118537955700646e-05, + "loss": 0.1028, + "num_input_tokens_seen": 7920104, + "step": 11035 + }, + { + "epoch": 22.95218295218295, + "grad_norm": 0.21168603003025055, + "learning_rate": 4.11778960417121e-05, + "loss": 0.1208, + "num_input_tokens_seen": 7923656, + "step": 11040 + }, + { + "epoch": 22.962577962577964, + "grad_norm": 0.15500034391880035, + "learning_rate": 4.117041003158288e-05, + "loss": 0.152, + "num_input_tokens_seen": 7927208, + "step": 11045 + }, + { + "epoch": 22.972972972972972, + "grad_norm": 0.29655590653419495, + "learning_rate": 4.1162921527773215e-05, + "loss": 0.1626, + "num_input_tokens_seen": 7930856, + "step": 11050 + }, + { + "epoch": 22.983367983367984, + "grad_norm": 0.32906970381736755, + "learning_rate": 4.115543053143794e-05, + "loss": 0.1179, + "num_input_tokens_seen": 7934408, + "step": 11055 + }, + { + "epoch": 22.993762993762992, + "grad_norm": 0.5722373723983765, + "learning_rate": 4.114793704373226e-05, + "loss": 0.1373, + "num_input_tokens_seen": 7938216, + "step": 11060 + }, + { + "epoch": 23.004158004158004, + "grad_norm": 0.4721086919307709, + "learning_rate": 4.114044106581175e-05, + "loss": 0.1183, + "num_input_tokens_seen": 7941680, + "step": 11065 + }, + { + "epoch": 23.014553014553016, + "grad_norm": 0.31051647663116455, + "learning_rate": 4.11329425988324e-05, + "loss": 0.1306, + "num_input_tokens_seen": 7945296, + "step": 11070 + }, + { + "epoch": 23.024948024948024, + "grad_norm": 0.28589093685150146, + "learning_rate": 4.112544164395056e-05, + "loss": 0.1199, + "num_input_tokens_seen": 7948880, + "step": 11075 + }, + { + "epoch": 23.035343035343036, + "grad_norm": 0.6116350293159485, + "learning_rate": 4.111793820232297e-05, + "loss": 0.1484, + "num_input_tokens_seen": 7952496, + "step": 11080 + }, + { + "epoch": 23.045738045738045, + "grad_norm": 0.27513840794563293, + "learning_rate": 4.1110432275106767e-05, + "loss": 0.0861, + "num_input_tokens_seen": 7955952, + "step": 11085 + }, + { + "epoch": 23.056133056133056, + "grad_norm": 0.5640369653701782, + "learning_rate": 4.110292386345944e-05, + "loss": 0.0991, + "num_input_tokens_seen": 7959536, + "step": 11090 + }, + { + "epoch": 23.066528066528065, + "grad_norm": 0.21707940101623535, + "learning_rate": 4.109541296853891e-05, + "loss": 0.1246, + "num_input_tokens_seen": 7963312, + "step": 11095 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 0.5328802466392517, + "learning_rate": 4.108789959150341e-05, + "loss": 0.1065, + "num_input_tokens_seen": 7966992, + "step": 11100 + }, + { + "epoch": 23.08731808731809, + "grad_norm": 0.49835893511772156, + "learning_rate": 4.108038373351163e-05, + "loss": 0.1737, + "num_input_tokens_seen": 7970736, + "step": 11105 + }, + { + "epoch": 23.097713097713097, + "grad_norm": 0.1464008390903473, + "learning_rate": 4.10728653957226e-05, + "loss": 0.1361, + "num_input_tokens_seen": 7974320, + "step": 11110 + }, + { + "epoch": 23.10810810810811, + "grad_norm": 0.20138303935527802, + "learning_rate": 4.106534457929575e-05, + "loss": 0.0965, + "num_input_tokens_seen": 7977968, + "step": 11115 + }, + { + "epoch": 23.118503118503117, + "grad_norm": 0.32779058814048767, + "learning_rate": 4.105782128539086e-05, + "loss": 0.1247, + "num_input_tokens_seen": 7981744, + "step": 11120 + }, + { + "epoch": 23.12889812889813, + "grad_norm": 0.463198721408844, + "learning_rate": 4.1050295515168144e-05, + "loss": 0.1103, + "num_input_tokens_seen": 7985392, + "step": 11125 + }, + { + "epoch": 23.13929313929314, + "grad_norm": 0.20002561807632446, + "learning_rate": 4.1042767269788155e-05, + "loss": 0.1147, + "num_input_tokens_seen": 7988816, + "step": 11130 + }, + { + "epoch": 23.14968814968815, + "grad_norm": 0.4661111533641815, + "learning_rate": 4.103523655041185e-05, + "loss": 0.1059, + "num_input_tokens_seen": 7992336, + "step": 11135 + }, + { + "epoch": 23.16008316008316, + "grad_norm": 0.3042159080505371, + "learning_rate": 4.102770335820055e-05, + "loss": 0.1311, + "num_input_tokens_seen": 7995824, + "step": 11140 + }, + { + "epoch": 23.17047817047817, + "grad_norm": 0.1317271888256073, + "learning_rate": 4.1020167694315984e-05, + "loss": 0.1062, + "num_input_tokens_seen": 7999184, + "step": 11145 + }, + { + "epoch": 23.18087318087318, + "grad_norm": 0.38881203532218933, + "learning_rate": 4.101262955992023e-05, + "loss": 0.1227, + "num_input_tokens_seen": 8002736, + "step": 11150 + }, + { + "epoch": 23.19126819126819, + "grad_norm": 0.2425960898399353, + "learning_rate": 4.100508895617578e-05, + "loss": 0.1311, + "num_input_tokens_seen": 8006288, + "step": 11155 + }, + { + "epoch": 23.2016632016632, + "grad_norm": 0.3914942145347595, + "learning_rate": 4.099754588424547e-05, + "loss": 0.1304, + "num_input_tokens_seen": 8009968, + "step": 11160 + }, + { + "epoch": 23.212058212058214, + "grad_norm": 0.4390495717525482, + "learning_rate": 4.0990000345292546e-05, + "loss": 0.1159, + "num_input_tokens_seen": 8013520, + "step": 11165 + }, + { + "epoch": 23.222453222453222, + "grad_norm": 0.4270687699317932, + "learning_rate": 4.098245234048064e-05, + "loss": 0.1499, + "num_input_tokens_seen": 8017104, + "step": 11170 + }, + { + "epoch": 23.232848232848234, + "grad_norm": 0.1303327977657318, + "learning_rate": 4.0974901870973726e-05, + "loss": 0.1335, + "num_input_tokens_seen": 8020656, + "step": 11175 + }, + { + "epoch": 23.243243243243242, + "grad_norm": 0.6990579962730408, + "learning_rate": 4.096734893793619e-05, + "loss": 0.1455, + "num_input_tokens_seen": 8024240, + "step": 11180 + }, + { + "epoch": 23.253638253638254, + "grad_norm": 0.41967928409576416, + "learning_rate": 4.095979354253279e-05, + "loss": 0.1024, + "num_input_tokens_seen": 8027888, + "step": 11185 + }, + { + "epoch": 23.264033264033262, + "grad_norm": 0.16328604519367218, + "learning_rate": 4.0952235685928656e-05, + "loss": 0.1039, + "num_input_tokens_seen": 8031376, + "step": 11190 + }, + { + "epoch": 23.274428274428274, + "grad_norm": 0.11498109996318817, + "learning_rate": 4.094467536928932e-05, + "loss": 0.115, + "num_input_tokens_seen": 8034960, + "step": 11195 + }, + { + "epoch": 23.284823284823286, + "grad_norm": 0.17710953950881958, + "learning_rate": 4.093711259378067e-05, + "loss": 0.0899, + "num_input_tokens_seen": 8038480, + "step": 11200 + }, + { + "epoch": 23.284823284823286, + "eval_loss": 0.14320866763591766, + "eval_runtime": 7.7683, + "eval_samples_per_second": 110.191, + "eval_steps_per_second": 27.548, + "num_input_tokens_seen": 8038480, + "step": 11200 + }, + { + "epoch": 23.295218295218294, + "grad_norm": 0.4673921763896942, + "learning_rate": 4.092954736056897e-05, + "loss": 0.1635, + "num_input_tokens_seen": 8042224, + "step": 11205 + }, + { + "epoch": 23.305613305613306, + "grad_norm": 0.23071230947971344, + "learning_rate": 4.09219796708209e-05, + "loss": 0.1434, + "num_input_tokens_seen": 8045936, + "step": 11210 + }, + { + "epoch": 23.316008316008315, + "grad_norm": 0.2582714259624481, + "learning_rate": 4.0914409525703464e-05, + "loss": 0.1211, + "num_input_tokens_seen": 8049552, + "step": 11215 + }, + { + "epoch": 23.326403326403327, + "grad_norm": 0.6789159178733826, + "learning_rate": 4.090683692638408e-05, + "loss": 0.1121, + "num_input_tokens_seen": 8053168, + "step": 11220 + }, + { + "epoch": 23.33679833679834, + "grad_norm": 0.4480714499950409, + "learning_rate": 4.089926187403056e-05, + "loss": 0.1062, + "num_input_tokens_seen": 8056592, + "step": 11225 + }, + { + "epoch": 23.347193347193347, + "grad_norm": 0.41300255060195923, + "learning_rate": 4.0891684369811044e-05, + "loss": 0.1396, + "num_input_tokens_seen": 8060176, + "step": 11230 + }, + { + "epoch": 23.35758835758836, + "grad_norm": 0.5924111008644104, + "learning_rate": 4.0884104414894107e-05, + "loss": 0.1187, + "num_input_tokens_seen": 8063760, + "step": 11235 + }, + { + "epoch": 23.367983367983367, + "grad_norm": 0.25385189056396484, + "learning_rate": 4.087652201044864e-05, + "loss": 0.1209, + "num_input_tokens_seen": 8067536, + "step": 11240 + }, + { + "epoch": 23.37837837837838, + "grad_norm": 0.7007571458816528, + "learning_rate": 4.086893715764397e-05, + "loss": 0.111, + "num_input_tokens_seen": 8071120, + "step": 11245 + }, + { + "epoch": 23.388773388773387, + "grad_norm": 0.28139355778694153, + "learning_rate": 4.086134985764977e-05, + "loss": 0.0744, + "num_input_tokens_seen": 8074704, + "step": 11250 + }, + { + "epoch": 23.3991683991684, + "grad_norm": 0.2885574996471405, + "learning_rate": 4.0853760111636085e-05, + "loss": 0.124, + "num_input_tokens_seen": 8078352, + "step": 11255 + }, + { + "epoch": 23.40956340956341, + "grad_norm": 0.1268310695886612, + "learning_rate": 4.084616792077337e-05, + "loss": 0.1231, + "num_input_tokens_seen": 8081872, + "step": 11260 + }, + { + "epoch": 23.41995841995842, + "grad_norm": 0.2387133091688156, + "learning_rate": 4.083857328623243e-05, + "loss": 0.1071, + "num_input_tokens_seen": 8085488, + "step": 11265 + }, + { + "epoch": 23.43035343035343, + "grad_norm": 0.6621506214141846, + "learning_rate": 4.083097620918444e-05, + "loss": 0.139, + "num_input_tokens_seen": 8089168, + "step": 11270 + }, + { + "epoch": 23.44074844074844, + "grad_norm": 0.2903144359588623, + "learning_rate": 4.082337669080097e-05, + "loss": 0.1347, + "num_input_tokens_seen": 8092784, + "step": 11275 + }, + { + "epoch": 23.45114345114345, + "grad_norm": 0.15635891258716583, + "learning_rate": 4.081577473225398e-05, + "loss": 0.0885, + "num_input_tokens_seen": 8096368, + "step": 11280 + }, + { + "epoch": 23.46153846153846, + "grad_norm": 0.43389081954956055, + "learning_rate": 4.080817033471577e-05, + "loss": 0.0928, + "num_input_tokens_seen": 8099824, + "step": 11285 + }, + { + "epoch": 23.471933471933472, + "grad_norm": 0.3110330402851105, + "learning_rate": 4.080056349935903e-05, + "loss": 0.0767, + "num_input_tokens_seen": 8103472, + "step": 11290 + }, + { + "epoch": 23.482328482328484, + "grad_norm": 0.31676074862480164, + "learning_rate": 4.079295422735684e-05, + "loss": 0.1477, + "num_input_tokens_seen": 8106992, + "step": 11295 + }, + { + "epoch": 23.492723492723492, + "grad_norm": 0.21122519671916962, + "learning_rate": 4.078534251988264e-05, + "loss": 0.0989, + "num_input_tokens_seen": 8110576, + "step": 11300 + }, + { + "epoch": 23.503118503118504, + "grad_norm": 0.39633023738861084, + "learning_rate": 4.077772837811025e-05, + "loss": 0.1134, + "num_input_tokens_seen": 8114096, + "step": 11305 + }, + { + "epoch": 23.513513513513512, + "grad_norm": 0.3586365878582001, + "learning_rate": 4.0770111803213874e-05, + "loss": 0.1225, + "num_input_tokens_seen": 8117680, + "step": 11310 + }, + { + "epoch": 23.523908523908524, + "grad_norm": 0.597912073135376, + "learning_rate": 4.076249279636807e-05, + "loss": 0.1908, + "num_input_tokens_seen": 8121360, + "step": 11315 + }, + { + "epoch": 23.534303534303533, + "grad_norm": 0.3796900808811188, + "learning_rate": 4.075487135874781e-05, + "loss": 0.1514, + "num_input_tokens_seen": 8124976, + "step": 11320 + }, + { + "epoch": 23.544698544698544, + "grad_norm": 0.15470349788665771, + "learning_rate": 4.074724749152837e-05, + "loss": 0.1188, + "num_input_tokens_seen": 8128656, + "step": 11325 + }, + { + "epoch": 23.555093555093556, + "grad_norm": 0.28445789217948914, + "learning_rate": 4.07396211958855e-05, + "loss": 0.1622, + "num_input_tokens_seen": 8132304, + "step": 11330 + }, + { + "epoch": 23.565488565488565, + "grad_norm": 0.34071439504623413, + "learning_rate": 4.073199247299523e-05, + "loss": 0.1039, + "num_input_tokens_seen": 8135952, + "step": 11335 + }, + { + "epoch": 23.575883575883577, + "grad_norm": 0.35598763823509216, + "learning_rate": 4.072436132403403e-05, + "loss": 0.1004, + "num_input_tokens_seen": 8139568, + "step": 11340 + }, + { + "epoch": 23.586278586278585, + "grad_norm": 0.48047909140586853, + "learning_rate": 4.0716727750178704e-05, + "loss": 0.1237, + "num_input_tokens_seen": 8143216, + "step": 11345 + }, + { + "epoch": 23.596673596673597, + "grad_norm": 0.17925693094730377, + "learning_rate": 4.0709091752606455e-05, + "loss": 0.1116, + "num_input_tokens_seen": 8146800, + "step": 11350 + }, + { + "epoch": 23.60706860706861, + "grad_norm": 0.3082643449306488, + "learning_rate": 4.070145333249484e-05, + "loss": 0.1165, + "num_input_tokens_seen": 8150416, + "step": 11355 + }, + { + "epoch": 23.617463617463617, + "grad_norm": 0.32184526324272156, + "learning_rate": 4.069381249102181e-05, + "loss": 0.0978, + "num_input_tokens_seen": 8154032, + "step": 11360 + }, + { + "epoch": 23.62785862785863, + "grad_norm": 0.7145310640335083, + "learning_rate": 4.0686169229365665e-05, + "loss": 0.1234, + "num_input_tokens_seen": 8157680, + "step": 11365 + }, + { + "epoch": 23.638253638253637, + "grad_norm": 0.23416073620319366, + "learning_rate": 4.067852354870511e-05, + "loss": 0.146, + "num_input_tokens_seen": 8161168, + "step": 11370 + }, + { + "epoch": 23.64864864864865, + "grad_norm": 0.4062473177909851, + "learning_rate": 4.067087545021919e-05, + "loss": 0.1208, + "num_input_tokens_seen": 8164592, + "step": 11375 + }, + { + "epoch": 23.659043659043657, + "grad_norm": 0.19593076407909393, + "learning_rate": 4.066322493508734e-05, + "loss": 0.084, + "num_input_tokens_seen": 8168112, + "step": 11380 + }, + { + "epoch": 23.66943866943867, + "grad_norm": 0.2549164891242981, + "learning_rate": 4.065557200448937e-05, + "loss": 0.0897, + "num_input_tokens_seen": 8171600, + "step": 11385 + }, + { + "epoch": 23.67983367983368, + "grad_norm": 0.7431758642196655, + "learning_rate": 4.064791665960546e-05, + "loss": 0.1583, + "num_input_tokens_seen": 8175184, + "step": 11390 + }, + { + "epoch": 23.69022869022869, + "grad_norm": 0.1878584325313568, + "learning_rate": 4.064025890161615e-05, + "loss": 0.1076, + "num_input_tokens_seen": 8178928, + "step": 11395 + }, + { + "epoch": 23.7006237006237, + "grad_norm": 0.5848361849784851, + "learning_rate": 4.0632598731702373e-05, + "loss": 0.0867, + "num_input_tokens_seen": 8182416, + "step": 11400 + }, + { + "epoch": 23.7006237006237, + "eval_loss": 0.1456780880689621, + "eval_runtime": 7.7576, + "eval_samples_per_second": 110.344, + "eval_steps_per_second": 27.586, + "num_input_tokens_seen": 8182416, + "step": 11400 + }, + { + "epoch": 23.71101871101871, + "grad_norm": 0.23348845541477203, + "learning_rate": 4.0624936151045426e-05, + "loss": 0.1423, + "num_input_tokens_seen": 8186000, + "step": 11405 + }, + { + "epoch": 23.72141372141372, + "grad_norm": 0.21220435202121735, + "learning_rate": 4.061727116082696e-05, + "loss": 0.117, + "num_input_tokens_seen": 8189616, + "step": 11410 + }, + { + "epoch": 23.731808731808734, + "grad_norm": 0.2543031573295593, + "learning_rate": 4.060960376222903e-05, + "loss": 0.1428, + "num_input_tokens_seen": 8193168, + "step": 11415 + }, + { + "epoch": 23.742203742203742, + "grad_norm": 0.25288423895835876, + "learning_rate": 4.0601933956434034e-05, + "loss": 0.1542, + "num_input_tokens_seen": 8196720, + "step": 11420 + }, + { + "epoch": 23.752598752598754, + "grad_norm": 0.24736064672470093, + "learning_rate": 4.059426174462476e-05, + "loss": 0.1291, + "num_input_tokens_seen": 8200400, + "step": 11425 + }, + { + "epoch": 23.762993762993762, + "grad_norm": 0.255851686000824, + "learning_rate": 4.058658712798435e-05, + "loss": 0.0982, + "num_input_tokens_seen": 8203952, + "step": 11430 + }, + { + "epoch": 23.773388773388774, + "grad_norm": 0.3436691462993622, + "learning_rate": 4.0578910107696336e-05, + "loss": 0.1425, + "num_input_tokens_seen": 8207408, + "step": 11435 + }, + { + "epoch": 23.783783783783782, + "grad_norm": 0.49191632866859436, + "learning_rate": 4.05712306849446e-05, + "loss": 0.1236, + "num_input_tokens_seen": 8211120, + "step": 11440 + }, + { + "epoch": 23.794178794178794, + "grad_norm": 0.27660298347473145, + "learning_rate": 4.0563548860913415e-05, + "loss": 0.1213, + "num_input_tokens_seen": 8214704, + "step": 11445 + }, + { + "epoch": 23.804573804573806, + "grad_norm": 0.45018815994262695, + "learning_rate": 4.0555864636787414e-05, + "loss": 0.1686, + "num_input_tokens_seen": 8218288, + "step": 11450 + }, + { + "epoch": 23.814968814968815, + "grad_norm": 0.2144283503293991, + "learning_rate": 4.054817801375159e-05, + "loss": 0.1357, + "num_input_tokens_seen": 8221872, + "step": 11455 + }, + { + "epoch": 23.825363825363826, + "grad_norm": 0.1886749416589737, + "learning_rate": 4.054048899299134e-05, + "loss": 0.104, + "num_input_tokens_seen": 8225392, + "step": 11460 + }, + { + "epoch": 23.835758835758835, + "grad_norm": 0.3989754319190979, + "learning_rate": 4.0532797575692385e-05, + "loss": 0.1211, + "num_input_tokens_seen": 8228944, + "step": 11465 + }, + { + "epoch": 23.846153846153847, + "grad_norm": 0.23517435789108276, + "learning_rate": 4.052510376304085e-05, + "loss": 0.1536, + "num_input_tokens_seen": 8232496, + "step": 11470 + }, + { + "epoch": 23.856548856548855, + "grad_norm": 0.45155197381973267, + "learning_rate": 4.051740755622321e-05, + "loss": 0.1594, + "num_input_tokens_seen": 8236048, + "step": 11475 + }, + { + "epoch": 23.866943866943867, + "grad_norm": 0.4423081576824188, + "learning_rate": 4.050970895642632e-05, + "loss": 0.115, + "num_input_tokens_seen": 8239472, + "step": 11480 + }, + { + "epoch": 23.87733887733888, + "grad_norm": 0.20430028438568115, + "learning_rate": 4.050200796483741e-05, + "loss": 0.1262, + "num_input_tokens_seen": 8243120, + "step": 11485 + }, + { + "epoch": 23.887733887733887, + "grad_norm": 0.3231617510318756, + "learning_rate": 4.049430458264405e-05, + "loss": 0.1342, + "num_input_tokens_seen": 8246640, + "step": 11490 + }, + { + "epoch": 23.8981288981289, + "grad_norm": 0.430331826210022, + "learning_rate": 4.048659881103422e-05, + "loss": 0.1001, + "num_input_tokens_seen": 8250352, + "step": 11495 + }, + { + "epoch": 23.908523908523907, + "grad_norm": 0.33833351731300354, + "learning_rate": 4.0478890651196235e-05, + "loss": 0.1215, + "num_input_tokens_seen": 8253936, + "step": 11500 + }, + { + "epoch": 23.91891891891892, + "grad_norm": 0.17093084752559662, + "learning_rate": 4.047118010431879e-05, + "loss": 0.1491, + "num_input_tokens_seen": 8257520, + "step": 11505 + }, + { + "epoch": 23.929313929313928, + "grad_norm": 0.20659315586090088, + "learning_rate": 4.046346717159094e-05, + "loss": 0.1304, + "num_input_tokens_seen": 8261072, + "step": 11510 + }, + { + "epoch": 23.93970893970894, + "grad_norm": 0.24378854036331177, + "learning_rate": 4.045575185420214e-05, + "loss": 0.1218, + "num_input_tokens_seen": 8264560, + "step": 11515 + }, + { + "epoch": 23.95010395010395, + "grad_norm": 0.18864350020885468, + "learning_rate": 4.0448034153342165e-05, + "loss": 0.1266, + "num_input_tokens_seen": 8268176, + "step": 11520 + }, + { + "epoch": 23.96049896049896, + "grad_norm": 0.14648061990737915, + "learning_rate": 4.0440314070201194e-05, + "loss": 0.0764, + "num_input_tokens_seen": 8271664, + "step": 11525 + }, + { + "epoch": 23.97089397089397, + "grad_norm": 0.19617103040218353, + "learning_rate": 4.043259160596976e-05, + "loss": 0.1578, + "num_input_tokens_seen": 8275248, + "step": 11530 + }, + { + "epoch": 23.98128898128898, + "grad_norm": 0.3222077786922455, + "learning_rate": 4.0424866761838767e-05, + "loss": 0.1342, + "num_input_tokens_seen": 8278800, + "step": 11535 + }, + { + "epoch": 23.991683991683992, + "grad_norm": 0.1512206792831421, + "learning_rate": 4.041713953899948e-05, + "loss": 0.0921, + "num_input_tokens_seen": 8282384, + "step": 11540 + }, + { + "epoch": 24.002079002079004, + "grad_norm": 0.222870334982872, + "learning_rate": 4.0409409938643515e-05, + "loss": 0.1056, + "num_input_tokens_seen": 8286048, + "step": 11545 + }, + { + "epoch": 24.012474012474012, + "grad_norm": 0.2816820442676544, + "learning_rate": 4.0401677961962904e-05, + "loss": 0.148, + "num_input_tokens_seen": 8289632, + "step": 11550 + }, + { + "epoch": 24.022869022869024, + "grad_norm": 0.2354666292667389, + "learning_rate": 4.039394361015001e-05, + "loss": 0.1097, + "num_input_tokens_seen": 8293440, + "step": 11555 + }, + { + "epoch": 24.033264033264032, + "grad_norm": 0.517677366733551, + "learning_rate": 4.038620688439755e-05, + "loss": 0.1371, + "num_input_tokens_seen": 8297152, + "step": 11560 + }, + { + "epoch": 24.043659043659044, + "grad_norm": 0.14465771615505219, + "learning_rate": 4.037846778589862e-05, + "loss": 0.1274, + "num_input_tokens_seen": 8300768, + "step": 11565 + }, + { + "epoch": 24.054054054054053, + "grad_norm": 0.24851368367671967, + "learning_rate": 4.0370726315846715e-05, + "loss": 0.1388, + "num_input_tokens_seen": 8304480, + "step": 11570 + }, + { + "epoch": 24.064449064449065, + "grad_norm": 0.37562209367752075, + "learning_rate": 4.036298247543565e-05, + "loss": 0.0881, + "num_input_tokens_seen": 8308032, + "step": 11575 + }, + { + "epoch": 24.074844074844076, + "grad_norm": 0.38388094305992126, + "learning_rate": 4.035523626585962e-05, + "loss": 0.139, + "num_input_tokens_seen": 8311680, + "step": 11580 + }, + { + "epoch": 24.085239085239085, + "grad_norm": 0.4398118555545807, + "learning_rate": 4.0347487688313194e-05, + "loss": 0.1141, + "num_input_tokens_seen": 8315264, + "step": 11585 + }, + { + "epoch": 24.095634095634097, + "grad_norm": 0.42462649941444397, + "learning_rate": 4.0339736743991296e-05, + "loss": 0.107, + "num_input_tokens_seen": 8318848, + "step": 11590 + }, + { + "epoch": 24.106029106029105, + "grad_norm": 0.2795347571372986, + "learning_rate": 4.0331983434089227e-05, + "loss": 0.0983, + "num_input_tokens_seen": 8322336, + "step": 11595 + }, + { + "epoch": 24.116424116424117, + "grad_norm": 0.3574084937572479, + "learning_rate": 4.032422775980264e-05, + "loss": 0.1388, + "num_input_tokens_seen": 8325888, + "step": 11600 + }, + { + "epoch": 24.116424116424117, + "eval_loss": 0.15014620125293732, + "eval_runtime": 7.7539, + "eval_samples_per_second": 110.395, + "eval_steps_per_second": 27.599, + "num_input_tokens_seen": 8325888, + "step": 11600 + }, + { + "epoch": 24.126819126819125, + "grad_norm": 0.23319557309150696, + "learning_rate": 4.031646972232754e-05, + "loss": 0.1314, + "num_input_tokens_seen": 8329536, + "step": 11605 + }, + { + "epoch": 24.137214137214137, + "grad_norm": 0.11965842545032501, + "learning_rate": 4.0308709322860344e-05, + "loss": 0.1163, + "num_input_tokens_seen": 8333088, + "step": 11610 + }, + { + "epoch": 24.14760914760915, + "grad_norm": 0.2979270815849304, + "learning_rate": 4.0300946562597784e-05, + "loss": 0.1021, + "num_input_tokens_seen": 8336544, + "step": 11615 + }, + { + "epoch": 24.158004158004157, + "grad_norm": 0.43164297938346863, + "learning_rate": 4.029318144273698e-05, + "loss": 0.1562, + "num_input_tokens_seen": 8340224, + "step": 11620 + }, + { + "epoch": 24.16839916839917, + "grad_norm": 0.3243095576763153, + "learning_rate": 4.0285413964475415e-05, + "loss": 0.1179, + "num_input_tokens_seen": 8343776, + "step": 11625 + }, + { + "epoch": 24.178794178794178, + "grad_norm": 0.189182847738266, + "learning_rate": 4.0277644129010927e-05, + "loss": 0.1059, + "num_input_tokens_seen": 8347360, + "step": 11630 + }, + { + "epoch": 24.18918918918919, + "grad_norm": 0.24977438151836395, + "learning_rate": 4.0269871937541724e-05, + "loss": 0.108, + "num_input_tokens_seen": 8350816, + "step": 11635 + }, + { + "epoch": 24.1995841995842, + "grad_norm": 0.2651330828666687, + "learning_rate": 4.026209739126637e-05, + "loss": 0.1471, + "num_input_tokens_seen": 8354368, + "step": 11640 + }, + { + "epoch": 24.20997920997921, + "grad_norm": 0.4678947329521179, + "learning_rate": 4.025432049138381e-05, + "loss": 0.1298, + "num_input_tokens_seen": 8358208, + "step": 11645 + }, + { + "epoch": 24.22037422037422, + "grad_norm": 0.3230912387371063, + "learning_rate": 4.0246541239093325e-05, + "loss": 0.1328, + "num_input_tokens_seen": 8361952, + "step": 11650 + }, + { + "epoch": 24.23076923076923, + "grad_norm": 0.3043174743652344, + "learning_rate": 4.023875963559459e-05, + "loss": 0.1006, + "num_input_tokens_seen": 8365600, + "step": 11655 + }, + { + "epoch": 24.241164241164242, + "grad_norm": 0.239776149392128, + "learning_rate": 4.023097568208761e-05, + "loss": 0.1089, + "num_input_tokens_seen": 8369120, + "step": 11660 + }, + { + "epoch": 24.25155925155925, + "grad_norm": 0.16932319104671478, + "learning_rate": 4.022318937977277e-05, + "loss": 0.1112, + "num_input_tokens_seen": 8372576, + "step": 11665 + }, + { + "epoch": 24.261954261954262, + "grad_norm": 0.20665334165096283, + "learning_rate": 4.021540072985084e-05, + "loss": 0.0933, + "num_input_tokens_seen": 8376256, + "step": 11670 + }, + { + "epoch": 24.272349272349274, + "grad_norm": 0.34423843026161194, + "learning_rate": 4.020760973352289e-05, + "loss": 0.0992, + "num_input_tokens_seen": 8379840, + "step": 11675 + }, + { + "epoch": 24.282744282744282, + "grad_norm": 0.17188376188278198, + "learning_rate": 4.019981639199042e-05, + "loss": 0.1145, + "num_input_tokens_seen": 8383552, + "step": 11680 + }, + { + "epoch": 24.293139293139294, + "grad_norm": 0.6096827983856201, + "learning_rate": 4.0192020706455245e-05, + "loss": 0.1246, + "num_input_tokens_seen": 8387104, + "step": 11685 + }, + { + "epoch": 24.303534303534303, + "grad_norm": 0.2818431258201599, + "learning_rate": 4.018422267811956e-05, + "loss": 0.1199, + "num_input_tokens_seen": 8390720, + "step": 11690 + }, + { + "epoch": 24.313929313929314, + "grad_norm": 0.5720828771591187, + "learning_rate": 4.017642230818592e-05, + "loss": 0.1607, + "num_input_tokens_seen": 8394368, + "step": 11695 + }, + { + "epoch": 24.324324324324323, + "grad_norm": 0.3056250512599945, + "learning_rate": 4.0168619597857246e-05, + "loss": 0.1225, + "num_input_tokens_seen": 8397952, + "step": 11700 + }, + { + "epoch": 24.334719334719335, + "grad_norm": 0.6742897033691406, + "learning_rate": 4.016081454833681e-05, + "loss": 0.1228, + "num_input_tokens_seen": 8401376, + "step": 11705 + }, + { + "epoch": 24.345114345114347, + "grad_norm": 0.13257426023483276, + "learning_rate": 4.0153007160828245e-05, + "loss": 0.108, + "num_input_tokens_seen": 8404832, + "step": 11710 + }, + { + "epoch": 24.355509355509355, + "grad_norm": 0.43669381737709045, + "learning_rate": 4.0145197436535555e-05, + "loss": 0.1192, + "num_input_tokens_seen": 8408352, + "step": 11715 + }, + { + "epoch": 24.365904365904367, + "grad_norm": 0.210834801197052, + "learning_rate": 4.0137385376663095e-05, + "loss": 0.1076, + "num_input_tokens_seen": 8411968, + "step": 11720 + }, + { + "epoch": 24.376299376299375, + "grad_norm": 0.21338263154029846, + "learning_rate": 4.012957098241558e-05, + "loss": 0.1054, + "num_input_tokens_seen": 8415520, + "step": 11725 + }, + { + "epoch": 24.386694386694387, + "grad_norm": 0.23817583918571472, + "learning_rate": 4.0121754254998076e-05, + "loss": 0.1079, + "num_input_tokens_seen": 8419136, + "step": 11730 + }, + { + "epoch": 24.397089397089395, + "grad_norm": 0.6417600512504578, + "learning_rate": 4.011393519561606e-05, + "loss": 0.1169, + "num_input_tokens_seen": 8422688, + "step": 11735 + }, + { + "epoch": 24.407484407484407, + "grad_norm": 0.8246933221817017, + "learning_rate": 4.010611380547529e-05, + "loss": 0.1264, + "num_input_tokens_seen": 8426304, + "step": 11740 + }, + { + "epoch": 24.41787941787942, + "grad_norm": 0.3601992428302765, + "learning_rate": 4.009829008578192e-05, + "loss": 0.1396, + "num_input_tokens_seen": 8429824, + "step": 11745 + }, + { + "epoch": 24.428274428274428, + "grad_norm": 0.7157518863677979, + "learning_rate": 4.00904640377425e-05, + "loss": 0.1721, + "num_input_tokens_seen": 8433440, + "step": 11750 + }, + { + "epoch": 24.43866943866944, + "grad_norm": 0.40297332406044006, + "learning_rate": 4.0082635662563886e-05, + "loss": 0.0988, + "num_input_tokens_seen": 8436992, + "step": 11755 + }, + { + "epoch": 24.449064449064448, + "grad_norm": 0.1718110740184784, + "learning_rate": 4.007480496145331e-05, + "loss": 0.1361, + "num_input_tokens_seen": 8440608, + "step": 11760 + }, + { + "epoch": 24.45945945945946, + "grad_norm": 0.1611831933259964, + "learning_rate": 4.006697193561837e-05, + "loss": 0.1296, + "num_input_tokens_seen": 8444320, + "step": 11765 + }, + { + "epoch": 24.46985446985447, + "grad_norm": 0.6987238526344299, + "learning_rate": 4.005913658626701e-05, + "loss": 0.1656, + "num_input_tokens_seen": 8447808, + "step": 11770 + }, + { + "epoch": 24.48024948024948, + "grad_norm": 0.10867996513843536, + "learning_rate": 4.005129891460754e-05, + "loss": 0.0963, + "num_input_tokens_seen": 8451296, + "step": 11775 + }, + { + "epoch": 24.490644490644492, + "grad_norm": 0.14004568755626678, + "learning_rate": 4.004345892184864e-05, + "loss": 0.1202, + "num_input_tokens_seen": 8454816, + "step": 11780 + }, + { + "epoch": 24.5010395010395, + "grad_norm": 0.1897408813238144, + "learning_rate": 4.003561660919932e-05, + "loss": 0.093, + "num_input_tokens_seen": 8458272, + "step": 11785 + }, + { + "epoch": 24.511434511434512, + "grad_norm": 0.1538175493478775, + "learning_rate": 4.002777197786897e-05, + "loss": 0.127, + "num_input_tokens_seen": 8461760, + "step": 11790 + }, + { + "epoch": 24.52182952182952, + "grad_norm": 0.4451586604118347, + "learning_rate": 4.0019925029067326e-05, + "loss": 0.1073, + "num_input_tokens_seen": 8465440, + "step": 11795 + }, + { + "epoch": 24.532224532224532, + "grad_norm": 0.3361944854259491, + "learning_rate": 4.0012075764004495e-05, + "loss": 0.1396, + "num_input_tokens_seen": 8468992, + "step": 11800 + }, + { + "epoch": 24.532224532224532, + "eval_loss": 0.1526889055967331, + "eval_runtime": 7.7516, + "eval_samples_per_second": 110.429, + "eval_steps_per_second": 27.607, + "num_input_tokens_seen": 8468992, + "step": 11800 + }, + { + "epoch": 24.542619542619544, + "grad_norm": 0.30628135800361633, + "learning_rate": 4.000422418389094e-05, + "loss": 0.1553, + "num_input_tokens_seen": 8472608, + "step": 11805 + }, + { + "epoch": 24.553014553014552, + "grad_norm": 0.5271356105804443, + "learning_rate": 3.999637028993744e-05, + "loss": 0.1568, + "num_input_tokens_seen": 8476320, + "step": 11810 + }, + { + "epoch": 24.563409563409564, + "grad_norm": 0.8748605251312256, + "learning_rate": 3.99885140833552e-05, + "loss": 0.1065, + "num_input_tokens_seen": 8479808, + "step": 11815 + }, + { + "epoch": 24.573804573804573, + "grad_norm": 0.14615380764007568, + "learning_rate": 3.998065556535572e-05, + "loss": 0.0952, + "num_input_tokens_seen": 8483296, + "step": 11820 + }, + { + "epoch": 24.584199584199585, + "grad_norm": 0.14114944636821747, + "learning_rate": 3.9972794737150895e-05, + "loss": 0.1241, + "num_input_tokens_seen": 8486784, + "step": 11825 + }, + { + "epoch": 24.594594594594593, + "grad_norm": 0.2208952158689499, + "learning_rate": 3.996493159995297e-05, + "loss": 0.1296, + "num_input_tokens_seen": 8490304, + "step": 11830 + }, + { + "epoch": 24.604989604989605, + "grad_norm": 0.9681908488273621, + "learning_rate": 3.995706615497453e-05, + "loss": 0.1268, + "num_input_tokens_seen": 8493888, + "step": 11835 + }, + { + "epoch": 24.615384615384617, + "grad_norm": 0.6732742190361023, + "learning_rate": 3.994919840342852e-05, + "loss": 0.0858, + "num_input_tokens_seen": 8497408, + "step": 11840 + }, + { + "epoch": 24.625779625779625, + "grad_norm": 0.4530969262123108, + "learning_rate": 3.994132834652825e-05, + "loss": 0.1245, + "num_input_tokens_seen": 8501088, + "step": 11845 + }, + { + "epoch": 24.636174636174637, + "grad_norm": 0.16238665580749512, + "learning_rate": 3.99334559854874e-05, + "loss": 0.089, + "num_input_tokens_seen": 8504576, + "step": 11850 + }, + { + "epoch": 24.646569646569645, + "grad_norm": 0.4811859726905823, + "learning_rate": 3.9925581321519955e-05, + "loss": 0.1809, + "num_input_tokens_seen": 8508192, + "step": 11855 + }, + { + "epoch": 24.656964656964657, + "grad_norm": 0.3830215036869049, + "learning_rate": 3.991770435584031e-05, + "loss": 0.1309, + "num_input_tokens_seen": 8511776, + "step": 11860 + }, + { + "epoch": 24.66735966735967, + "grad_norm": 0.15398049354553223, + "learning_rate": 3.990982508966319e-05, + "loss": 0.104, + "num_input_tokens_seen": 8515232, + "step": 11865 + }, + { + "epoch": 24.677754677754677, + "grad_norm": 0.3415064811706543, + "learning_rate": 3.990194352420367e-05, + "loss": 0.1186, + "num_input_tokens_seen": 8518720, + "step": 11870 + }, + { + "epoch": 24.68814968814969, + "grad_norm": 0.489666223526001, + "learning_rate": 3.9894059660677184e-05, + "loss": 0.1501, + "num_input_tokens_seen": 8522272, + "step": 11875 + }, + { + "epoch": 24.698544698544698, + "grad_norm": 0.5384767055511475, + "learning_rate": 3.9886173500299526e-05, + "loss": 0.1097, + "num_input_tokens_seen": 8525888, + "step": 11880 + }, + { + "epoch": 24.70893970893971, + "grad_norm": 0.22134734690189362, + "learning_rate": 3.987828504428685e-05, + "loss": 0.1143, + "num_input_tokens_seen": 8529344, + "step": 11885 + }, + { + "epoch": 24.719334719334718, + "grad_norm": 0.27870967984199524, + "learning_rate": 3.987039429385565e-05, + "loss": 0.1059, + "num_input_tokens_seen": 8532768, + "step": 11890 + }, + { + "epoch": 24.72972972972973, + "grad_norm": 0.14534799754619598, + "learning_rate": 3.986250125022277e-05, + "loss": 0.1391, + "num_input_tokens_seen": 8536352, + "step": 11895 + }, + { + "epoch": 24.74012474012474, + "grad_norm": 0.21937650442123413, + "learning_rate": 3.985460591460544e-05, + "loss": 0.1381, + "num_input_tokens_seen": 8540000, + "step": 11900 + }, + { + "epoch": 24.75051975051975, + "grad_norm": 0.3230377733707428, + "learning_rate": 3.984670828822118e-05, + "loss": 0.0937, + "num_input_tokens_seen": 8543584, + "step": 11905 + }, + { + "epoch": 24.760914760914762, + "grad_norm": 0.29278525710105896, + "learning_rate": 3.983880837228794e-05, + "loss": 0.1061, + "num_input_tokens_seen": 8547296, + "step": 11910 + }, + { + "epoch": 24.77130977130977, + "grad_norm": 0.23198312520980835, + "learning_rate": 3.983090616802396e-05, + "loss": 0.1437, + "num_input_tokens_seen": 8550944, + "step": 11915 + }, + { + "epoch": 24.781704781704782, + "grad_norm": 0.3541935086250305, + "learning_rate": 3.982300167664788e-05, + "loss": 0.1201, + "num_input_tokens_seen": 8554496, + "step": 11920 + }, + { + "epoch": 24.79209979209979, + "grad_norm": 0.4197089374065399, + "learning_rate": 3.981509489937868e-05, + "loss": 0.1263, + "num_input_tokens_seen": 8557984, + "step": 11925 + }, + { + "epoch": 24.802494802494802, + "grad_norm": 0.18105828762054443, + "learning_rate": 3.9807185837435643e-05, + "loss": 0.1249, + "num_input_tokens_seen": 8561568, + "step": 11930 + }, + { + "epoch": 24.812889812889814, + "grad_norm": 0.1921226978302002, + "learning_rate": 3.9799274492038484e-05, + "loss": 0.0961, + "num_input_tokens_seen": 8565216, + "step": 11935 + }, + { + "epoch": 24.823284823284823, + "grad_norm": 0.45486804842948914, + "learning_rate": 3.979136086440722e-05, + "loss": 0.1098, + "num_input_tokens_seen": 8568704, + "step": 11940 + }, + { + "epoch": 24.833679833679835, + "grad_norm": 0.28134939074516296, + "learning_rate": 3.9783444955762226e-05, + "loss": 0.1199, + "num_input_tokens_seen": 8572352, + "step": 11945 + }, + { + "epoch": 24.844074844074843, + "grad_norm": 0.2710910439491272, + "learning_rate": 3.977552676732424e-05, + "loss": 0.14, + "num_input_tokens_seen": 8575936, + "step": 11950 + }, + { + "epoch": 24.854469854469855, + "grad_norm": 0.23723334074020386, + "learning_rate": 3.976760630031435e-05, + "loss": 0.0857, + "num_input_tokens_seen": 8579392, + "step": 11955 + }, + { + "epoch": 24.864864864864863, + "grad_norm": 0.5637953877449036, + "learning_rate": 3.975968355595398e-05, + "loss": 0.142, + "num_input_tokens_seen": 8583040, + "step": 11960 + }, + { + "epoch": 24.875259875259875, + "grad_norm": 0.22467008233070374, + "learning_rate": 3.9751758535464935e-05, + "loss": 0.071, + "num_input_tokens_seen": 8586592, + "step": 11965 + }, + { + "epoch": 24.885654885654887, + "grad_norm": 0.195469930768013, + "learning_rate": 3.9743831240069326e-05, + "loss": 0.0758, + "num_input_tokens_seen": 8590112, + "step": 11970 + }, + { + "epoch": 24.896049896049895, + "grad_norm": 0.2125709056854248, + "learning_rate": 3.9735901670989675e-05, + "loss": 0.104, + "num_input_tokens_seen": 8593600, + "step": 11975 + }, + { + "epoch": 24.906444906444907, + "grad_norm": 0.5396873950958252, + "learning_rate": 3.97279698294488e-05, + "loss": 0.1101, + "num_input_tokens_seen": 8597088, + "step": 11980 + }, + { + "epoch": 24.916839916839916, + "grad_norm": 0.6125814318656921, + "learning_rate": 3.9720035716669876e-05, + "loss": 0.1765, + "num_input_tokens_seen": 8600832, + "step": 11985 + }, + { + "epoch": 24.927234927234927, + "grad_norm": 0.339976966381073, + "learning_rate": 3.9712099333876474e-05, + "loss": 0.1283, + "num_input_tokens_seen": 8604544, + "step": 11990 + }, + { + "epoch": 24.93762993762994, + "grad_norm": 0.48398107290267944, + "learning_rate": 3.9704160682292475e-05, + "loss": 0.122, + "num_input_tokens_seen": 8608352, + "step": 11995 + }, + { + "epoch": 24.948024948024948, + "grad_norm": 0.3099190294742584, + "learning_rate": 3.9696219763142106e-05, + "loss": 0.0853, + "num_input_tokens_seen": 8612096, + "step": 12000 + }, + { + "epoch": 24.948024948024948, + "eval_loss": 0.14773821830749512, + "eval_runtime": 7.7732, + "eval_samples_per_second": 110.122, + "eval_steps_per_second": 27.531, + "num_input_tokens_seen": 8612096, + "step": 12000 + }, + { + "epoch": 24.95841995841996, + "grad_norm": 0.18464937806129456, + "learning_rate": 3.968827657764997e-05, + "loss": 0.1071, + "num_input_tokens_seen": 8615584, + "step": 12005 + }, + { + "epoch": 24.968814968814968, + "grad_norm": 0.30967482924461365, + "learning_rate": 3.9680331127041e-05, + "loss": 0.1474, + "num_input_tokens_seen": 8619168, + "step": 12010 + }, + { + "epoch": 24.97920997920998, + "grad_norm": 0.5000686049461365, + "learning_rate": 3.9672383412540495e-05, + "loss": 0.1539, + "num_input_tokens_seen": 8622688, + "step": 12015 + }, + { + "epoch": 24.989604989604988, + "grad_norm": 0.17645204067230225, + "learning_rate": 3.966443343537407e-05, + "loss": 0.1198, + "num_input_tokens_seen": 8626240, + "step": 12020 + }, + { + "epoch": 25.0, + "grad_norm": 0.3010944128036499, + "learning_rate": 3.965648119676772e-05, + "loss": 0.1066, + "num_input_tokens_seen": 8629848, + "step": 12025 + }, + { + "epoch": 25.010395010395012, + "grad_norm": 0.11216054111719131, + "learning_rate": 3.96485266979478e-05, + "loss": 0.0631, + "num_input_tokens_seen": 8633304, + "step": 12030 + }, + { + "epoch": 25.02079002079002, + "grad_norm": 0.19011880457401276, + "learning_rate": 3.9640569940140974e-05, + "loss": 0.1063, + "num_input_tokens_seen": 8637016, + "step": 12035 + }, + { + "epoch": 25.031185031185032, + "grad_norm": 0.41749435663223267, + "learning_rate": 3.963261092457428e-05, + "loss": 0.1103, + "num_input_tokens_seen": 8640568, + "step": 12040 + }, + { + "epoch": 25.04158004158004, + "grad_norm": 0.6509074568748474, + "learning_rate": 3.962464965247509e-05, + "loss": 0.1398, + "num_input_tokens_seen": 8644120, + "step": 12045 + }, + { + "epoch": 25.051975051975052, + "grad_norm": 0.33088594675064087, + "learning_rate": 3.9616686125071135e-05, + "loss": 0.1087, + "num_input_tokens_seen": 8647576, + "step": 12050 + }, + { + "epoch": 25.06237006237006, + "grad_norm": 0.3586499094963074, + "learning_rate": 3.9608720343590506e-05, + "loss": 0.076, + "num_input_tokens_seen": 8651288, + "step": 12055 + }, + { + "epoch": 25.072765072765073, + "grad_norm": 0.19381634891033173, + "learning_rate": 3.960075230926161e-05, + "loss": 0.1409, + "num_input_tokens_seen": 8654808, + "step": 12060 + }, + { + "epoch": 25.083160083160084, + "grad_norm": 0.32925939559936523, + "learning_rate": 3.959278202331322e-05, + "loss": 0.1253, + "num_input_tokens_seen": 8658392, + "step": 12065 + }, + { + "epoch": 25.093555093555093, + "grad_norm": 0.2912203371524811, + "learning_rate": 3.958480948697446e-05, + "loss": 0.1395, + "num_input_tokens_seen": 8661944, + "step": 12070 + }, + { + "epoch": 25.103950103950105, + "grad_norm": 0.6534401178359985, + "learning_rate": 3.95768347014748e-05, + "loss": 0.1605, + "num_input_tokens_seen": 8665720, + "step": 12075 + }, + { + "epoch": 25.114345114345113, + "grad_norm": 0.5371231436729431, + "learning_rate": 3.956885766804404e-05, + "loss": 0.142, + "num_input_tokens_seen": 8669304, + "step": 12080 + }, + { + "epoch": 25.124740124740125, + "grad_norm": 0.38810381293296814, + "learning_rate": 3.956087838791235e-05, + "loss": 0.1083, + "num_input_tokens_seen": 8672856, + "step": 12085 + }, + { + "epoch": 25.135135135135137, + "grad_norm": 0.2202349752187729, + "learning_rate": 3.955289686231022e-05, + "loss": 0.1137, + "num_input_tokens_seen": 8676376, + "step": 12090 + }, + { + "epoch": 25.145530145530145, + "grad_norm": 0.5224432349205017, + "learning_rate": 3.9544913092468504e-05, + "loss": 0.1163, + "num_input_tokens_seen": 8680120, + "step": 12095 + }, + { + "epoch": 25.155925155925157, + "grad_norm": 0.2892228960990906, + "learning_rate": 3.9536927079618425e-05, + "loss": 0.1568, + "num_input_tokens_seen": 8683928, + "step": 12100 + }, + { + "epoch": 25.166320166320165, + "grad_norm": 0.15049366652965546, + "learning_rate": 3.9528938824991494e-05, + "loss": 0.0978, + "num_input_tokens_seen": 8687384, + "step": 12105 + }, + { + "epoch": 25.176715176715177, + "grad_norm": 0.5841385722160339, + "learning_rate": 3.952094832981962e-05, + "loss": 0.1099, + "num_input_tokens_seen": 8691128, + "step": 12110 + }, + { + "epoch": 25.187110187110186, + "grad_norm": 0.28838929533958435, + "learning_rate": 3.951295559533503e-05, + "loss": 0.1428, + "num_input_tokens_seen": 8694840, + "step": 12115 + }, + { + "epoch": 25.197505197505198, + "grad_norm": 0.46196144819259644, + "learning_rate": 3.95049606227703e-05, + "loss": 0.1298, + "num_input_tokens_seen": 8698456, + "step": 12120 + }, + { + "epoch": 25.20790020790021, + "grad_norm": 0.4106258451938629, + "learning_rate": 3.949696341335838e-05, + "loss": 0.1249, + "num_input_tokens_seen": 8702040, + "step": 12125 + }, + { + "epoch": 25.218295218295218, + "grad_norm": 0.12442348152399063, + "learning_rate": 3.9488963968332503e-05, + "loss": 0.0723, + "num_input_tokens_seen": 8705624, + "step": 12130 + }, + { + "epoch": 25.22869022869023, + "grad_norm": 0.30732011795043945, + "learning_rate": 3.948096228892631e-05, + "loss": 0.1107, + "num_input_tokens_seen": 8709080, + "step": 12135 + }, + { + "epoch": 25.239085239085238, + "grad_norm": 0.39597538113594055, + "learning_rate": 3.947295837637375e-05, + "loss": 0.1194, + "num_input_tokens_seen": 8712600, + "step": 12140 + }, + { + "epoch": 25.24948024948025, + "grad_norm": 0.3049348294734955, + "learning_rate": 3.9464952231909135e-05, + "loss": 0.1688, + "num_input_tokens_seen": 8716280, + "step": 12145 + }, + { + "epoch": 25.25987525987526, + "grad_norm": 0.18427211046218872, + "learning_rate": 3.945694385676711e-05, + "loss": 0.1243, + "num_input_tokens_seen": 8719896, + "step": 12150 + }, + { + "epoch": 25.27027027027027, + "grad_norm": 0.4586152732372284, + "learning_rate": 3.944893325218265e-05, + "loss": 0.143, + "num_input_tokens_seen": 8723416, + "step": 12155 + }, + { + "epoch": 25.280665280665282, + "grad_norm": 0.17441946268081665, + "learning_rate": 3.944092041939112e-05, + "loss": 0.104, + "num_input_tokens_seen": 8727160, + "step": 12160 + }, + { + "epoch": 25.29106029106029, + "grad_norm": 0.27531763911247253, + "learning_rate": 3.943290535962818e-05, + "loss": 0.1407, + "num_input_tokens_seen": 8730744, + "step": 12165 + }, + { + "epoch": 25.301455301455302, + "grad_norm": 0.20115144550800323, + "learning_rate": 3.942488807412985e-05, + "loss": 0.1098, + "num_input_tokens_seen": 8734392, + "step": 12170 + }, + { + "epoch": 25.31185031185031, + "grad_norm": 0.18809613585472107, + "learning_rate": 3.941686856413251e-05, + "loss": 0.1145, + "num_input_tokens_seen": 8737976, + "step": 12175 + }, + { + "epoch": 25.322245322245323, + "grad_norm": 0.4466586410999298, + "learning_rate": 3.9408846830872874e-05, + "loss": 0.0936, + "num_input_tokens_seen": 8741688, + "step": 12180 + }, + { + "epoch": 25.33264033264033, + "grad_norm": 0.34045886993408203, + "learning_rate": 3.940082287558798e-05, + "loss": 0.1497, + "num_input_tokens_seen": 8745304, + "step": 12185 + }, + { + "epoch": 25.343035343035343, + "grad_norm": 0.28827568888664246, + "learning_rate": 3.939279669951522e-05, + "loss": 0.133, + "num_input_tokens_seen": 8748888, + "step": 12190 + }, + { + "epoch": 25.353430353430355, + "grad_norm": 0.6226892471313477, + "learning_rate": 3.938476830389234e-05, + "loss": 0.1529, + "num_input_tokens_seen": 8752632, + "step": 12195 + }, + { + "epoch": 25.363825363825363, + "grad_norm": 0.38737836480140686, + "learning_rate": 3.937673768995742e-05, + "loss": 0.098, + "num_input_tokens_seen": 8756152, + "step": 12200 + }, + { + "epoch": 25.363825363825363, + "eval_loss": 0.1427498757839203, + "eval_runtime": 7.7582, + "eval_samples_per_second": 110.336, + "eval_steps_per_second": 27.584, + "num_input_tokens_seen": 8756152, + "step": 12200 + }, + { + "epoch": 25.374220374220375, + "grad_norm": 0.2650088965892792, + "learning_rate": 3.936870485894888e-05, + "loss": 0.1107, + "num_input_tokens_seen": 8759736, + "step": 12205 + }, + { + "epoch": 25.384615384615383, + "grad_norm": 0.6325446367263794, + "learning_rate": 3.9360669812105475e-05, + "loss": 0.1191, + "num_input_tokens_seen": 8763224, + "step": 12210 + }, + { + "epoch": 25.395010395010395, + "grad_norm": 0.20097105205059052, + "learning_rate": 3.9352632550666325e-05, + "loss": 0.1488, + "num_input_tokens_seen": 8766776, + "step": 12215 + }, + { + "epoch": 25.405405405405407, + "grad_norm": 0.35455572605133057, + "learning_rate": 3.9344593075870866e-05, + "loss": 0.0799, + "num_input_tokens_seen": 8770232, + "step": 12220 + }, + { + "epoch": 25.415800415800415, + "grad_norm": 0.23001554608345032, + "learning_rate": 3.933655138895889e-05, + "loss": 0.0902, + "num_input_tokens_seen": 8773912, + "step": 12225 + }, + { + "epoch": 25.426195426195427, + "grad_norm": 0.5531957745552063, + "learning_rate": 3.932850749117053e-05, + "loss": 0.095, + "num_input_tokens_seen": 8777592, + "step": 12230 + }, + { + "epoch": 25.436590436590436, + "grad_norm": 0.5687368512153625, + "learning_rate": 3.932046138374624e-05, + "loss": 0.1534, + "num_input_tokens_seen": 8781080, + "step": 12235 + }, + { + "epoch": 25.446985446985448, + "grad_norm": 0.28444090485572815, + "learning_rate": 3.9312413067926854e-05, + "loss": 0.112, + "num_input_tokens_seen": 8784472, + "step": 12240 + }, + { + "epoch": 25.457380457380456, + "grad_norm": 0.8621196150779724, + "learning_rate": 3.9304362544953506e-05, + "loss": 0.1446, + "num_input_tokens_seen": 8788216, + "step": 12245 + }, + { + "epoch": 25.467775467775468, + "grad_norm": 0.20543818175792694, + "learning_rate": 3.929630981606769e-05, + "loss": 0.0821, + "num_input_tokens_seen": 8791896, + "step": 12250 + }, + { + "epoch": 25.47817047817048, + "grad_norm": 0.3392077684402466, + "learning_rate": 3.928825488251124e-05, + "loss": 0.135, + "num_input_tokens_seen": 8795512, + "step": 12255 + }, + { + "epoch": 25.488565488565488, + "grad_norm": 0.20547401905059814, + "learning_rate": 3.9280197745526344e-05, + "loss": 0.1126, + "num_input_tokens_seen": 8799128, + "step": 12260 + }, + { + "epoch": 25.4989604989605, + "grad_norm": 0.2733428180217743, + "learning_rate": 3.9272138406355495e-05, + "loss": 0.1072, + "num_input_tokens_seen": 8802680, + "step": 12265 + }, + { + "epoch": 25.509355509355508, + "grad_norm": 0.1452403962612152, + "learning_rate": 3.926407686624154e-05, + "loss": 0.1219, + "num_input_tokens_seen": 8806360, + "step": 12270 + }, + { + "epoch": 25.51975051975052, + "grad_norm": 0.14080768823623657, + "learning_rate": 3.9256013126427684e-05, + "loss": 0.1038, + "num_input_tokens_seen": 8809848, + "step": 12275 + }, + { + "epoch": 25.53014553014553, + "grad_norm": 0.7693833112716675, + "learning_rate": 3.9247947188157455e-05, + "loss": 0.1421, + "num_input_tokens_seen": 8813304, + "step": 12280 + }, + { + "epoch": 25.54054054054054, + "grad_norm": 0.5548558831214905, + "learning_rate": 3.9239879052674715e-05, + "loss": 0.1213, + "num_input_tokens_seen": 8816920, + "step": 12285 + }, + { + "epoch": 25.550935550935552, + "grad_norm": 0.18956062197685242, + "learning_rate": 3.9231808721223673e-05, + "loss": 0.1019, + "num_input_tokens_seen": 8820376, + "step": 12290 + }, + { + "epoch": 25.56133056133056, + "grad_norm": 0.4120422601699829, + "learning_rate": 3.9223736195048886e-05, + "loss": 0.1575, + "num_input_tokens_seen": 8823832, + "step": 12295 + }, + { + "epoch": 25.571725571725572, + "grad_norm": 0.3013385534286499, + "learning_rate": 3.921566147539523e-05, + "loss": 0.1469, + "num_input_tokens_seen": 8827480, + "step": 12300 + }, + { + "epoch": 25.58212058212058, + "grad_norm": 0.2105867862701416, + "learning_rate": 3.920758456350792e-05, + "loss": 0.1023, + "num_input_tokens_seen": 8831032, + "step": 12305 + }, + { + "epoch": 25.592515592515593, + "grad_norm": 0.2904379069805145, + "learning_rate": 3.919950546063253e-05, + "loss": 0.1571, + "num_input_tokens_seen": 8834648, + "step": 12310 + }, + { + "epoch": 25.602910602910605, + "grad_norm": 0.1915794163942337, + "learning_rate": 3.919142416801496e-05, + "loss": 0.1096, + "num_input_tokens_seen": 8838200, + "step": 12315 + }, + { + "epoch": 25.613305613305613, + "grad_norm": 0.22032153606414795, + "learning_rate": 3.918334068690144e-05, + "loss": 0.1405, + "num_input_tokens_seen": 8841752, + "step": 12320 + }, + { + "epoch": 25.623700623700625, + "grad_norm": 0.3796781301498413, + "learning_rate": 3.917525501853855e-05, + "loss": 0.1294, + "num_input_tokens_seen": 8845432, + "step": 12325 + }, + { + "epoch": 25.634095634095633, + "grad_norm": 0.30154168605804443, + "learning_rate": 3.916716716417319e-05, + "loss": 0.1337, + "num_input_tokens_seen": 8849240, + "step": 12330 + }, + { + "epoch": 25.644490644490645, + "grad_norm": 0.3739176392555237, + "learning_rate": 3.915907712505263e-05, + "loss": 0.1173, + "num_input_tokens_seen": 8853048, + "step": 12335 + }, + { + "epoch": 25.654885654885653, + "grad_norm": 0.2639774680137634, + "learning_rate": 3.915098490242444e-05, + "loss": 0.1109, + "num_input_tokens_seen": 8856632, + "step": 12340 + }, + { + "epoch": 25.665280665280665, + "grad_norm": 0.31051620841026306, + "learning_rate": 3.914289049753654e-05, + "loss": 0.1501, + "num_input_tokens_seen": 8860184, + "step": 12345 + }, + { + "epoch": 25.675675675675677, + "grad_norm": 0.1761752963066101, + "learning_rate": 3.913479391163719e-05, + "loss": 0.0707, + "num_input_tokens_seen": 8863704, + "step": 12350 + }, + { + "epoch": 25.686070686070686, + "grad_norm": 0.255208820104599, + "learning_rate": 3.9126695145975e-05, + "loss": 0.1201, + "num_input_tokens_seen": 8867416, + "step": 12355 + }, + { + "epoch": 25.696465696465697, + "grad_norm": 0.17642520368099213, + "learning_rate": 3.911859420179889e-05, + "loss": 0.0888, + "num_input_tokens_seen": 8870904, + "step": 12360 + }, + { + "epoch": 25.706860706860706, + "grad_norm": 0.22039315104484558, + "learning_rate": 3.911049108035813e-05, + "loss": 0.142, + "num_input_tokens_seen": 8874456, + "step": 12365 + }, + { + "epoch": 25.717255717255718, + "grad_norm": 0.37042397260665894, + "learning_rate": 3.910238578290232e-05, + "loss": 0.1269, + "num_input_tokens_seen": 8878072, + "step": 12370 + }, + { + "epoch": 25.727650727650726, + "grad_norm": 0.21136553585529327, + "learning_rate": 3.90942783106814e-05, + "loss": 0.1295, + "num_input_tokens_seen": 8881784, + "step": 12375 + }, + { + "epoch": 25.738045738045738, + "grad_norm": 0.20795217156410217, + "learning_rate": 3.908616866494564e-05, + "loss": 0.1133, + "num_input_tokens_seen": 8885368, + "step": 12380 + }, + { + "epoch": 25.74844074844075, + "grad_norm": 0.1615399420261383, + "learning_rate": 3.907805684694566e-05, + "loss": 0.155, + "num_input_tokens_seen": 8888952, + "step": 12385 + }, + { + "epoch": 25.758835758835758, + "grad_norm": 0.16445356607437134, + "learning_rate": 3.90699428579324e-05, + "loss": 0.1014, + "num_input_tokens_seen": 8892600, + "step": 12390 + }, + { + "epoch": 25.76923076923077, + "grad_norm": 0.623841404914856, + "learning_rate": 3.906182669915713e-05, + "loss": 0.0776, + "num_input_tokens_seen": 8896184, + "step": 12395 + }, + { + "epoch": 25.77962577962578, + "grad_norm": 0.2582470774650574, + "learning_rate": 3.9053708371871476e-05, + "loss": 0.1308, + "num_input_tokens_seen": 8899640, + "step": 12400 + }, + { + "epoch": 25.77962577962578, + "eval_loss": 0.14656221866607666, + "eval_runtime": 7.7582, + "eval_samples_per_second": 110.334, + "eval_steps_per_second": 27.584, + "num_input_tokens_seen": 8899640, + "step": 12400 + }, + { + "epoch": 25.79002079002079, + "grad_norm": 0.23504996299743652, + "learning_rate": 3.904558787732738e-05, + "loss": 0.103, + "num_input_tokens_seen": 8903288, + "step": 12405 + }, + { + "epoch": 25.8004158004158, + "grad_norm": 0.20745137333869934, + "learning_rate": 3.9037465216777135e-05, + "loss": 0.1675, + "num_input_tokens_seen": 8906840, + "step": 12410 + }, + { + "epoch": 25.81081081081081, + "grad_norm": 0.3223641812801361, + "learning_rate": 3.902934039147334e-05, + "loss": 0.0853, + "num_input_tokens_seen": 8910232, + "step": 12415 + }, + { + "epoch": 25.821205821205822, + "grad_norm": 0.420799195766449, + "learning_rate": 3.902121340266894e-05, + "loss": 0.1412, + "num_input_tokens_seen": 8913848, + "step": 12420 + }, + { + "epoch": 25.83160083160083, + "grad_norm": 0.24140165746212006, + "learning_rate": 3.9013084251617246e-05, + "loss": 0.1285, + "num_input_tokens_seen": 8917304, + "step": 12425 + }, + { + "epoch": 25.841995841995843, + "grad_norm": 0.46263739466667175, + "learning_rate": 3.9004952939571865e-05, + "loss": 0.1194, + "num_input_tokens_seen": 8920856, + "step": 12430 + }, + { + "epoch": 25.85239085239085, + "grad_norm": 0.307971328496933, + "learning_rate": 3.899681946778673e-05, + "loss": 0.096, + "num_input_tokens_seen": 8924440, + "step": 12435 + }, + { + "epoch": 25.862785862785863, + "grad_norm": 0.336664080619812, + "learning_rate": 3.898868383751615e-05, + "loss": 0.1227, + "num_input_tokens_seen": 8928056, + "step": 12440 + }, + { + "epoch": 25.873180873180875, + "grad_norm": 0.17872802913188934, + "learning_rate": 3.8980546050014724e-05, + "loss": 0.0877, + "num_input_tokens_seen": 8931672, + "step": 12445 + }, + { + "epoch": 25.883575883575883, + "grad_norm": 0.3576180040836334, + "learning_rate": 3.897240610653741e-05, + "loss": 0.1361, + "num_input_tokens_seen": 8935192, + "step": 12450 + }, + { + "epoch": 25.893970893970895, + "grad_norm": 0.14735789597034454, + "learning_rate": 3.896426400833948e-05, + "loss": 0.1322, + "num_input_tokens_seen": 8938808, + "step": 12455 + }, + { + "epoch": 25.904365904365903, + "grad_norm": 0.7055342197418213, + "learning_rate": 3.895611975667656e-05, + "loss": 0.1399, + "num_input_tokens_seen": 8942488, + "step": 12460 + }, + { + "epoch": 25.914760914760915, + "grad_norm": 0.5857628583908081, + "learning_rate": 3.8947973352804584e-05, + "loss": 0.1064, + "num_input_tokens_seen": 8946040, + "step": 12465 + }, + { + "epoch": 25.925155925155924, + "grad_norm": 0.17567887902259827, + "learning_rate": 3.893982479797984e-05, + "loss": 0.0927, + "num_input_tokens_seen": 8949592, + "step": 12470 + }, + { + "epoch": 25.935550935550935, + "grad_norm": 0.503862738609314, + "learning_rate": 3.8931674093458926e-05, + "loss": 0.1004, + "num_input_tokens_seen": 8953112, + "step": 12475 + }, + { + "epoch": 25.945945945945947, + "grad_norm": 0.4004562199115753, + "learning_rate": 3.89235212404988e-05, + "loss": 0.1551, + "num_input_tokens_seen": 8956728, + "step": 12480 + }, + { + "epoch": 25.956340956340956, + "grad_norm": 0.318131685256958, + "learning_rate": 3.891536624035672e-05, + "loss": 0.102, + "num_input_tokens_seen": 8960216, + "step": 12485 + }, + { + "epoch": 25.966735966735968, + "grad_norm": 0.231619194149971, + "learning_rate": 3.8907209094290295e-05, + "loss": 0.1005, + "num_input_tokens_seen": 8963768, + "step": 12490 + }, + { + "epoch": 25.977130977130976, + "grad_norm": 0.3737405240535736, + "learning_rate": 3.8899049803557466e-05, + "loss": 0.1276, + "num_input_tokens_seen": 8967288, + "step": 12495 + }, + { + "epoch": 25.987525987525988, + "grad_norm": 0.2760157883167267, + "learning_rate": 3.889088836941648e-05, + "loss": 0.1223, + "num_input_tokens_seen": 8970872, + "step": 12500 + }, + { + "epoch": 25.997920997921, + "grad_norm": 0.4269867241382599, + "learning_rate": 3.8882724793125946e-05, + "loss": 0.1289, + "num_input_tokens_seen": 8974360, + "step": 12505 + }, + { + "epoch": 26.008316008316008, + "grad_norm": 0.44967183470726013, + "learning_rate": 3.8874559075944794e-05, + "loss": 0.1329, + "num_input_tokens_seen": 8977728, + "step": 12510 + }, + { + "epoch": 26.01871101871102, + "grad_norm": 0.6949271559715271, + "learning_rate": 3.886639121913227e-05, + "loss": 0.1402, + "num_input_tokens_seen": 8981376, + "step": 12515 + }, + { + "epoch": 26.02910602910603, + "grad_norm": 0.2090485692024231, + "learning_rate": 3.885822122394797e-05, + "loss": 0.1178, + "num_input_tokens_seen": 8985088, + "step": 12520 + }, + { + "epoch": 26.03950103950104, + "grad_norm": 0.5210825800895691, + "learning_rate": 3.8850049091651794e-05, + "loss": 0.1199, + "num_input_tokens_seen": 8988608, + "step": 12525 + }, + { + "epoch": 26.04989604989605, + "grad_norm": 0.26455655694007874, + "learning_rate": 3.8841874823504e-05, + "loss": 0.1117, + "num_input_tokens_seen": 8992288, + "step": 12530 + }, + { + "epoch": 26.06029106029106, + "grad_norm": 0.18735326826572418, + "learning_rate": 3.8833698420765157e-05, + "loss": 0.0903, + "num_input_tokens_seen": 8995744, + "step": 12535 + }, + { + "epoch": 26.070686070686072, + "grad_norm": 0.36411669850349426, + "learning_rate": 3.882551988469618e-05, + "loss": 0.1184, + "num_input_tokens_seen": 8999360, + "step": 12540 + }, + { + "epoch": 26.08108108108108, + "grad_norm": 0.31920140981674194, + "learning_rate": 3.881733921655829e-05, + "loss": 0.1045, + "num_input_tokens_seen": 9003104, + "step": 12545 + }, + { + "epoch": 26.091476091476093, + "grad_norm": 0.5474215149879456, + "learning_rate": 3.8809156417613054e-05, + "loss": 0.1516, + "num_input_tokens_seen": 9006752, + "step": 12550 + }, + { + "epoch": 26.1018711018711, + "grad_norm": 0.5632835626602173, + "learning_rate": 3.8800971489122364e-05, + "loss": 0.1077, + "num_input_tokens_seen": 9010432, + "step": 12555 + }, + { + "epoch": 26.112266112266113, + "grad_norm": 0.2830876410007477, + "learning_rate": 3.8792784432348434e-05, + "loss": 0.1144, + "num_input_tokens_seen": 9014048, + "step": 12560 + }, + { + "epoch": 26.12266112266112, + "grad_norm": 0.24031886458396912, + "learning_rate": 3.878459524855381e-05, + "loss": 0.0955, + "num_input_tokens_seen": 9017536, + "step": 12565 + }, + { + "epoch": 26.133056133056133, + "grad_norm": 0.36940091848373413, + "learning_rate": 3.8776403939001384e-05, + "loss": 0.122, + "num_input_tokens_seen": 9021184, + "step": 12570 + }, + { + "epoch": 26.143451143451145, + "grad_norm": 0.9946577548980713, + "learning_rate": 3.876821050495433e-05, + "loss": 0.1249, + "num_input_tokens_seen": 9024608, + "step": 12575 + }, + { + "epoch": 26.153846153846153, + "grad_norm": 0.13052764534950256, + "learning_rate": 3.87600149476762e-05, + "loss": 0.0832, + "num_input_tokens_seen": 9028192, + "step": 12580 + }, + { + "epoch": 26.164241164241165, + "grad_norm": 0.42596668004989624, + "learning_rate": 3.8751817268430843e-05, + "loss": 0.1253, + "num_input_tokens_seen": 9031808, + "step": 12585 + }, + { + "epoch": 26.174636174636174, + "grad_norm": 0.2053380161523819, + "learning_rate": 3.8743617468482464e-05, + "loss": 0.1191, + "num_input_tokens_seen": 9035488, + "step": 12590 + }, + { + "epoch": 26.185031185031185, + "grad_norm": 0.16772659122943878, + "learning_rate": 3.8735415549095535e-05, + "loss": 0.1193, + "num_input_tokens_seen": 9039104, + "step": 12595 + }, + { + "epoch": 26.195426195426194, + "grad_norm": 0.3230827748775482, + "learning_rate": 3.8727211511534934e-05, + "loss": 0.1043, + "num_input_tokens_seen": 9042656, + "step": 12600 + }, + { + "epoch": 26.195426195426194, + "eval_loss": 0.1493704468011856, + "eval_runtime": 7.752, + "eval_samples_per_second": 110.423, + "eval_steps_per_second": 27.606, + "num_input_tokens_seen": 9042656, + "step": 12600 + }, + { + "epoch": 26.205821205821206, + "grad_norm": 0.22390498220920563, + "learning_rate": 3.8719005357065804e-05, + "loss": 0.0739, + "num_input_tokens_seen": 9046112, + "step": 12605 + }, + { + "epoch": 26.216216216216218, + "grad_norm": 0.266574501991272, + "learning_rate": 3.8710797086953645e-05, + "loss": 0.1078, + "num_input_tokens_seen": 9049632, + "step": 12610 + }, + { + "epoch": 26.226611226611226, + "grad_norm": 0.37352800369262695, + "learning_rate": 3.870258670246427e-05, + "loss": 0.1087, + "num_input_tokens_seen": 9053248, + "step": 12615 + }, + { + "epoch": 26.237006237006238, + "grad_norm": 0.30226001143455505, + "learning_rate": 3.869437420486384e-05, + "loss": 0.1232, + "num_input_tokens_seen": 9056864, + "step": 12620 + }, + { + "epoch": 26.247401247401246, + "grad_norm": 0.49574464559555054, + "learning_rate": 3.8686159595418805e-05, + "loss": 0.1328, + "num_input_tokens_seen": 9060608, + "step": 12625 + }, + { + "epoch": 26.257796257796258, + "grad_norm": 0.3110545575618744, + "learning_rate": 3.867794287539597e-05, + "loss": 0.1478, + "num_input_tokens_seen": 9064384, + "step": 12630 + }, + { + "epoch": 26.26819126819127, + "grad_norm": 0.17537406086921692, + "learning_rate": 3.866972404606245e-05, + "loss": 0.1019, + "num_input_tokens_seen": 9067904, + "step": 12635 + }, + { + "epoch": 26.27858627858628, + "grad_norm": 0.16936734318733215, + "learning_rate": 3.866150310868571e-05, + "loss": 0.1375, + "num_input_tokens_seen": 9071712, + "step": 12640 + }, + { + "epoch": 26.28898128898129, + "grad_norm": 0.3532311022281647, + "learning_rate": 3.8653280064533506e-05, + "loss": 0.086, + "num_input_tokens_seen": 9075328, + "step": 12645 + }, + { + "epoch": 26.2993762993763, + "grad_norm": 0.5954794883728027, + "learning_rate": 3.864505491487394e-05, + "loss": 0.1304, + "num_input_tokens_seen": 9078944, + "step": 12650 + }, + { + "epoch": 26.30977130977131, + "grad_norm": 0.23422974348068237, + "learning_rate": 3.8636827660975414e-05, + "loss": 0.1302, + "num_input_tokens_seen": 9082496, + "step": 12655 + }, + { + "epoch": 26.32016632016632, + "grad_norm": 0.7687913775444031, + "learning_rate": 3.862859830410671e-05, + "loss": 0.1509, + "num_input_tokens_seen": 9086240, + "step": 12660 + }, + { + "epoch": 26.33056133056133, + "grad_norm": 0.1919182389974594, + "learning_rate": 3.862036684553688e-05, + "loss": 0.1298, + "num_input_tokens_seen": 9089696, + "step": 12665 + }, + { + "epoch": 26.340956340956343, + "grad_norm": 0.3163926601409912, + "learning_rate": 3.8612133286535314e-05, + "loss": 0.1406, + "num_input_tokens_seen": 9093376, + "step": 12670 + }, + { + "epoch": 26.35135135135135, + "grad_norm": 0.22175268828868866, + "learning_rate": 3.860389762837173e-05, + "loss": 0.1114, + "num_input_tokens_seen": 9096832, + "step": 12675 + }, + { + "epoch": 26.361746361746363, + "grad_norm": 0.33912086486816406, + "learning_rate": 3.859565987231618e-05, + "loss": 0.1242, + "num_input_tokens_seen": 9100480, + "step": 12680 + }, + { + "epoch": 26.37214137214137, + "grad_norm": 0.6159701943397522, + "learning_rate": 3.858742001963902e-05, + "loss": 0.1079, + "num_input_tokens_seen": 9104320, + "step": 12685 + }, + { + "epoch": 26.382536382536383, + "grad_norm": 0.7550447583198547, + "learning_rate": 3.857917807161094e-05, + "loss": 0.1594, + "num_input_tokens_seen": 9107936, + "step": 12690 + }, + { + "epoch": 26.39293139293139, + "grad_norm": 0.1766660362482071, + "learning_rate": 3.857093402950296e-05, + "loss": 0.088, + "num_input_tokens_seen": 9111360, + "step": 12695 + }, + { + "epoch": 26.403326403326403, + "grad_norm": 0.443167120218277, + "learning_rate": 3.8562687894586414e-05, + "loss": 0.104, + "num_input_tokens_seen": 9114976, + "step": 12700 + }, + { + "epoch": 26.413721413721415, + "grad_norm": 0.4858100116252899, + "learning_rate": 3.8554439668132946e-05, + "loss": 0.1434, + "num_input_tokens_seen": 9118528, + "step": 12705 + }, + { + "epoch": 26.424116424116423, + "grad_norm": 0.19291439652442932, + "learning_rate": 3.854618935141455e-05, + "loss": 0.0924, + "num_input_tokens_seen": 9121888, + "step": 12710 + }, + { + "epoch": 26.434511434511435, + "grad_norm": 0.2775370478630066, + "learning_rate": 3.8537936945703525e-05, + "loss": 0.1297, + "num_input_tokens_seen": 9125472, + "step": 12715 + }, + { + "epoch": 26.444906444906444, + "grad_norm": 0.22270467877388, + "learning_rate": 3.852968245227249e-05, + "loss": 0.1225, + "num_input_tokens_seen": 9129184, + "step": 12720 + }, + { + "epoch": 26.455301455301456, + "grad_norm": 0.44220930337905884, + "learning_rate": 3.85214258723944e-05, + "loss": 0.1018, + "num_input_tokens_seen": 9132736, + "step": 12725 + }, + { + "epoch": 26.465696465696467, + "grad_norm": 0.3796941041946411, + "learning_rate": 3.8513167207342524e-05, + "loss": 0.1392, + "num_input_tokens_seen": 9136416, + "step": 12730 + }, + { + "epoch": 26.476091476091476, + "grad_norm": 0.20074909925460815, + "learning_rate": 3.850490645839044e-05, + "loss": 0.1276, + "num_input_tokens_seen": 9139968, + "step": 12735 + }, + { + "epoch": 26.486486486486488, + "grad_norm": 0.2752671539783478, + "learning_rate": 3.849664362681207e-05, + "loss": 0.1275, + "num_input_tokens_seen": 9143584, + "step": 12740 + }, + { + "epoch": 26.496881496881496, + "grad_norm": 0.7049771547317505, + "learning_rate": 3.848837871388165e-05, + "loss": 0.1797, + "num_input_tokens_seen": 9147136, + "step": 12745 + }, + { + "epoch": 26.507276507276508, + "grad_norm": 0.313277006149292, + "learning_rate": 3.848011172087371e-05, + "loss": 0.1268, + "num_input_tokens_seen": 9150784, + "step": 12750 + }, + { + "epoch": 26.517671517671516, + "grad_norm": 0.16029947996139526, + "learning_rate": 3.847184264906315e-05, + "loss": 0.108, + "num_input_tokens_seen": 9154240, + "step": 12755 + }, + { + "epoch": 26.528066528066528, + "grad_norm": 0.3479287624359131, + "learning_rate": 3.846357149972516e-05, + "loss": 0.1021, + "num_input_tokens_seen": 9157856, + "step": 12760 + }, + { + "epoch": 26.53846153846154, + "grad_norm": 0.25229576230049133, + "learning_rate": 3.8455298274135246e-05, + "loss": 0.0794, + "num_input_tokens_seen": 9161376, + "step": 12765 + }, + { + "epoch": 26.54885654885655, + "grad_norm": 0.30558153986930847, + "learning_rate": 3.8447022973569254e-05, + "loss": 0.1027, + "num_input_tokens_seen": 9164992, + "step": 12770 + }, + { + "epoch": 26.55925155925156, + "grad_norm": 0.6063570380210876, + "learning_rate": 3.843874559930332e-05, + "loss": 0.1244, + "num_input_tokens_seen": 9168576, + "step": 12775 + }, + { + "epoch": 26.56964656964657, + "grad_norm": 0.7240133285522461, + "learning_rate": 3.843046615261394e-05, + "loss": 0.1323, + "num_input_tokens_seen": 9172256, + "step": 12780 + }, + { + "epoch": 26.58004158004158, + "grad_norm": 0.4249672293663025, + "learning_rate": 3.842218463477791e-05, + "loss": 0.1083, + "num_input_tokens_seen": 9175808, + "step": 12785 + }, + { + "epoch": 26.59043659043659, + "grad_norm": 0.379672646522522, + "learning_rate": 3.841390104707233e-05, + "loss": 0.1042, + "num_input_tokens_seen": 9179424, + "step": 12790 + }, + { + "epoch": 26.6008316008316, + "grad_norm": 0.2731981873512268, + "learning_rate": 3.8405615390774643e-05, + "loss": 0.1331, + "num_input_tokens_seen": 9182944, + "step": 12795 + }, + { + "epoch": 26.611226611226613, + "grad_norm": 0.2113301306962967, + "learning_rate": 3.839732766716259e-05, + "loss": 0.1072, + "num_input_tokens_seen": 9186656, + "step": 12800 + }, + { + "epoch": 26.611226611226613, + "eval_loss": 0.1438954472541809, + "eval_runtime": 7.745, + "eval_samples_per_second": 110.522, + "eval_steps_per_second": 27.631, + "num_input_tokens_seen": 9186656, + "step": 12800 + }, + { + "epoch": 26.62162162162162, + "grad_norm": 0.24186669290065765, + "learning_rate": 3.838903787751425e-05, + "loss": 0.1442, + "num_input_tokens_seen": 9190240, + "step": 12805 + }, + { + "epoch": 26.632016632016633, + "grad_norm": 0.7322450280189514, + "learning_rate": 3.838074602310802e-05, + "loss": 0.1137, + "num_input_tokens_seen": 9193728, + "step": 12810 + }, + { + "epoch": 26.64241164241164, + "grad_norm": 0.5433211922645569, + "learning_rate": 3.837245210522258e-05, + "loss": 0.114, + "num_input_tokens_seen": 9197408, + "step": 12815 + }, + { + "epoch": 26.652806652806653, + "grad_norm": 0.16580192744731903, + "learning_rate": 3.8364156125136996e-05, + "loss": 0.117, + "num_input_tokens_seen": 9200768, + "step": 12820 + }, + { + "epoch": 26.66320166320166, + "grad_norm": 0.1942148208618164, + "learning_rate": 3.835585808413059e-05, + "loss": 0.1751, + "num_input_tokens_seen": 9204320, + "step": 12825 + }, + { + "epoch": 26.673596673596673, + "grad_norm": 0.4905228912830353, + "learning_rate": 3.8347557983483024e-05, + "loss": 0.1684, + "num_input_tokens_seen": 9207712, + "step": 12830 + }, + { + "epoch": 26.683991683991685, + "grad_norm": 0.22715717554092407, + "learning_rate": 3.833925582447428e-05, + "loss": 0.1186, + "num_input_tokens_seen": 9211360, + "step": 12835 + }, + { + "epoch": 26.694386694386694, + "grad_norm": 0.2637358605861664, + "learning_rate": 3.8330951608384656e-05, + "loss": 0.1198, + "num_input_tokens_seen": 9214880, + "step": 12840 + }, + { + "epoch": 26.704781704781706, + "grad_norm": 0.25020623207092285, + "learning_rate": 3.832264533649477e-05, + "loss": 0.1485, + "num_input_tokens_seen": 9218528, + "step": 12845 + }, + { + "epoch": 26.715176715176714, + "grad_norm": 0.33152323961257935, + "learning_rate": 3.8314337010085555e-05, + "loss": 0.1467, + "num_input_tokens_seen": 9222208, + "step": 12850 + }, + { + "epoch": 26.725571725571726, + "grad_norm": 0.1591457724571228, + "learning_rate": 3.830602663043824e-05, + "loss": 0.083, + "num_input_tokens_seen": 9225856, + "step": 12855 + }, + { + "epoch": 26.735966735966738, + "grad_norm": 0.2138020247220993, + "learning_rate": 3.8297714198834414e-05, + "loss": 0.1043, + "num_input_tokens_seen": 9229728, + "step": 12860 + }, + { + "epoch": 26.746361746361746, + "grad_norm": 0.22609147429466248, + "learning_rate": 3.828939971655595e-05, + "loss": 0.1244, + "num_input_tokens_seen": 9233312, + "step": 12865 + }, + { + "epoch": 26.756756756756758, + "grad_norm": 0.25277623534202576, + "learning_rate": 3.828108318488505e-05, + "loss": 0.0915, + "num_input_tokens_seen": 9236928, + "step": 12870 + }, + { + "epoch": 26.767151767151766, + "grad_norm": 0.4937661588191986, + "learning_rate": 3.8272764605104216e-05, + "loss": 0.1118, + "num_input_tokens_seen": 9240480, + "step": 12875 + }, + { + "epoch": 26.777546777546778, + "grad_norm": 0.35296347737312317, + "learning_rate": 3.826444397849628e-05, + "loss": 0.1112, + "num_input_tokens_seen": 9244000, + "step": 12880 + }, + { + "epoch": 26.787941787941786, + "grad_norm": 0.10462974011898041, + "learning_rate": 3.825612130634439e-05, + "loss": 0.0869, + "num_input_tokens_seen": 9247360, + "step": 12885 + }, + { + "epoch": 26.7983367983368, + "grad_norm": 0.2038564383983612, + "learning_rate": 3.824779658993202e-05, + "loss": 0.1296, + "num_input_tokens_seen": 9250880, + "step": 12890 + }, + { + "epoch": 26.80873180873181, + "grad_norm": 0.48367825150489807, + "learning_rate": 3.823946983054292e-05, + "loss": 0.1523, + "num_input_tokens_seen": 9254432, + "step": 12895 + }, + { + "epoch": 26.81912681912682, + "grad_norm": 0.33669814467430115, + "learning_rate": 3.82311410294612e-05, + "loss": 0.1332, + "num_input_tokens_seen": 9257984, + "step": 12900 + }, + { + "epoch": 26.82952182952183, + "grad_norm": 0.32100024819374084, + "learning_rate": 3.822281018797127e-05, + "loss": 0.1804, + "num_input_tokens_seen": 9261696, + "step": 12905 + }, + { + "epoch": 26.83991683991684, + "grad_norm": 0.7786471843719482, + "learning_rate": 3.821447730735783e-05, + "loss": 0.1181, + "num_input_tokens_seen": 9265312, + "step": 12910 + }, + { + "epoch": 26.85031185031185, + "grad_norm": 0.16041986644268036, + "learning_rate": 3.820614238890592e-05, + "loss": 0.0942, + "num_input_tokens_seen": 9268768, + "step": 12915 + }, + { + "epoch": 26.86070686070686, + "grad_norm": 0.270174503326416, + "learning_rate": 3.819780543390091e-05, + "loss": 0.132, + "num_input_tokens_seen": 9272480, + "step": 12920 + }, + { + "epoch": 26.87110187110187, + "grad_norm": 0.15396326780319214, + "learning_rate": 3.818946644362844e-05, + "loss": 0.0929, + "num_input_tokens_seen": 9275840, + "step": 12925 + }, + { + "epoch": 26.881496881496883, + "grad_norm": 0.5485267043113708, + "learning_rate": 3.81811254193745e-05, + "loss": 0.1217, + "num_input_tokens_seen": 9279488, + "step": 12930 + }, + { + "epoch": 26.89189189189189, + "grad_norm": 0.2786194980144501, + "learning_rate": 3.8172782362425366e-05, + "loss": 0.116, + "num_input_tokens_seen": 9283104, + "step": 12935 + }, + { + "epoch": 26.902286902286903, + "grad_norm": 0.21970440447330475, + "learning_rate": 3.816443727406765e-05, + "loss": 0.105, + "num_input_tokens_seen": 9286592, + "step": 12940 + }, + { + "epoch": 26.91268191268191, + "grad_norm": 0.20234665274620056, + "learning_rate": 3.815609015558829e-05, + "loss": 0.0867, + "num_input_tokens_seen": 9290272, + "step": 12945 + }, + { + "epoch": 26.923076923076923, + "grad_norm": 0.2401823103427887, + "learning_rate": 3.814774100827448e-05, + "loss": 0.1192, + "num_input_tokens_seen": 9293920, + "step": 12950 + }, + { + "epoch": 26.933471933471935, + "grad_norm": 0.22247080504894257, + "learning_rate": 3.813938983341379e-05, + "loss": 0.1043, + "num_input_tokens_seen": 9297504, + "step": 12955 + }, + { + "epoch": 26.943866943866944, + "grad_norm": 0.5823832750320435, + "learning_rate": 3.813103663229407e-05, + "loss": 0.1076, + "num_input_tokens_seen": 9301056, + "step": 12960 + }, + { + "epoch": 26.954261954261955, + "grad_norm": 0.2822021543979645, + "learning_rate": 3.812268140620349e-05, + "loss": 0.1111, + "num_input_tokens_seen": 9304704, + "step": 12965 + }, + { + "epoch": 26.964656964656964, + "grad_norm": 0.5521904826164246, + "learning_rate": 3.811432415643051e-05, + "loss": 0.1087, + "num_input_tokens_seen": 9308320, + "step": 12970 + }, + { + "epoch": 26.975051975051976, + "grad_norm": 0.23270517587661743, + "learning_rate": 3.8105964884263954e-05, + "loss": 0.1345, + "num_input_tokens_seen": 9311872, + "step": 12975 + }, + { + "epoch": 26.985446985446984, + "grad_norm": 0.18383753299713135, + "learning_rate": 3.809760359099291e-05, + "loss": 0.1021, + "num_input_tokens_seen": 9315520, + "step": 12980 + }, + { + "epoch": 26.995841995841996, + "grad_norm": 0.16933444142341614, + "learning_rate": 3.8089240277906804e-05, + "loss": 0.0979, + "num_input_tokens_seen": 9318944, + "step": 12985 + }, + { + "epoch": 27.006237006237008, + "grad_norm": 0.3870989978313446, + "learning_rate": 3.808087494629535e-05, + "loss": 0.0955, + "num_input_tokens_seen": 9322584, + "step": 12990 + }, + { + "epoch": 27.016632016632016, + "grad_norm": 0.632149875164032, + "learning_rate": 3.8072507597448595e-05, + "loss": 0.1251, + "num_input_tokens_seen": 9325976, + "step": 12995 + }, + { + "epoch": 27.027027027027028, + "grad_norm": 0.2717894911766052, + "learning_rate": 3.806413823265689e-05, + "loss": 0.1031, + "num_input_tokens_seen": 9329688, + "step": 13000 + }, + { + "epoch": 27.027027027027028, + "eval_loss": 0.14755728840827942, + "eval_runtime": 7.7467, + "eval_samples_per_second": 110.499, + "eval_steps_per_second": 27.625, + "num_input_tokens_seen": 9329688, + "step": 13000 + }, + { + "epoch": 27.037422037422036, + "grad_norm": 0.7700693607330322, + "learning_rate": 3.805576685321089e-05, + "loss": 0.1224, + "num_input_tokens_seen": 9333272, + "step": 13005 + }, + { + "epoch": 27.04781704781705, + "grad_norm": 0.2731897532939911, + "learning_rate": 3.804739346040158e-05, + "loss": 0.0517, + "num_input_tokens_seen": 9336792, + "step": 13010 + }, + { + "epoch": 27.058212058212057, + "grad_norm": 0.40932396054267883, + "learning_rate": 3.8039018055520234e-05, + "loss": 0.0934, + "num_input_tokens_seen": 9340312, + "step": 13015 + }, + { + "epoch": 27.06860706860707, + "grad_norm": 0.322160542011261, + "learning_rate": 3.803064063985844e-05, + "loss": 0.1097, + "num_input_tokens_seen": 9343736, + "step": 13020 + }, + { + "epoch": 27.07900207900208, + "grad_norm": 0.4280484616756439, + "learning_rate": 3.802226121470811e-05, + "loss": 0.1205, + "num_input_tokens_seen": 9347160, + "step": 13025 + }, + { + "epoch": 27.08939708939709, + "grad_norm": 0.6212530136108398, + "learning_rate": 3.801387978136145e-05, + "loss": 0.1083, + "num_input_tokens_seen": 9350520, + "step": 13030 + }, + { + "epoch": 27.0997920997921, + "grad_norm": 0.24507179856300354, + "learning_rate": 3.800549634111099e-05, + "loss": 0.1613, + "num_input_tokens_seen": 9354200, + "step": 13035 + }, + { + "epoch": 27.11018711018711, + "grad_norm": 0.2836575210094452, + "learning_rate": 3.799711089524955e-05, + "loss": 0.1311, + "num_input_tokens_seen": 9357784, + "step": 13040 + }, + { + "epoch": 27.12058212058212, + "grad_norm": 0.2847013771533966, + "learning_rate": 3.7988723445070285e-05, + "loss": 0.1008, + "num_input_tokens_seen": 9361208, + "step": 13045 + }, + { + "epoch": 27.13097713097713, + "grad_norm": 0.6408721208572388, + "learning_rate": 3.798033399186663e-05, + "loss": 0.1381, + "num_input_tokens_seen": 9365048, + "step": 13050 + }, + { + "epoch": 27.14137214137214, + "grad_norm": 0.4490075409412384, + "learning_rate": 3.797194253693237e-05, + "loss": 0.0899, + "num_input_tokens_seen": 9368504, + "step": 13055 + }, + { + "epoch": 27.151767151767153, + "grad_norm": 0.384026437997818, + "learning_rate": 3.796354908156153e-05, + "loss": 0.1342, + "num_input_tokens_seen": 9371992, + "step": 13060 + }, + { + "epoch": 27.16216216216216, + "grad_norm": 0.303545743227005, + "learning_rate": 3.795515362704853e-05, + "loss": 0.0866, + "num_input_tokens_seen": 9375544, + "step": 13065 + }, + { + "epoch": 27.172557172557173, + "grad_norm": 0.32037118077278137, + "learning_rate": 3.794675617468803e-05, + "loss": 0.1036, + "num_input_tokens_seen": 9379064, + "step": 13070 + }, + { + "epoch": 27.18295218295218, + "grad_norm": 0.1989498883485794, + "learning_rate": 3.793835672577503e-05, + "loss": 0.1207, + "num_input_tokens_seen": 9382680, + "step": 13075 + }, + { + "epoch": 27.193347193347194, + "grad_norm": 0.13509266078472137, + "learning_rate": 3.7929955281604826e-05, + "loss": 0.0799, + "num_input_tokens_seen": 9386232, + "step": 13080 + }, + { + "epoch": 27.203742203742205, + "grad_norm": 0.2728486955165863, + "learning_rate": 3.7921551843473036e-05, + "loss": 0.1047, + "num_input_tokens_seen": 9390040, + "step": 13085 + }, + { + "epoch": 27.214137214137214, + "grad_norm": 0.18181191384792328, + "learning_rate": 3.791314641267557e-05, + "loss": 0.1308, + "num_input_tokens_seen": 9393720, + "step": 13090 + }, + { + "epoch": 27.224532224532226, + "grad_norm": 0.3601495921611786, + "learning_rate": 3.790473899050864e-05, + "loss": 0.1055, + "num_input_tokens_seen": 9397400, + "step": 13095 + }, + { + "epoch": 27.234927234927234, + "grad_norm": 0.6299542188644409, + "learning_rate": 3.7896329578268794e-05, + "loss": 0.1384, + "num_input_tokens_seen": 9401016, + "step": 13100 + }, + { + "epoch": 27.245322245322246, + "grad_norm": 0.5686300992965698, + "learning_rate": 3.7887918177252855e-05, + "loss": 0.1198, + "num_input_tokens_seen": 9404728, + "step": 13105 + }, + { + "epoch": 27.255717255717254, + "grad_norm": 0.24861347675323486, + "learning_rate": 3.787950478875798e-05, + "loss": 0.1052, + "num_input_tokens_seen": 9408248, + "step": 13110 + }, + { + "epoch": 27.266112266112266, + "grad_norm": 0.10165547579526901, + "learning_rate": 3.787108941408162e-05, + "loss": 0.0951, + "num_input_tokens_seen": 9411640, + "step": 13115 + }, + { + "epoch": 27.276507276507278, + "grad_norm": 0.29663196206092834, + "learning_rate": 3.786267205452151e-05, + "loss": 0.1592, + "num_input_tokens_seen": 9415384, + "step": 13120 + }, + { + "epoch": 27.286902286902286, + "grad_norm": 0.22142300009727478, + "learning_rate": 3.785425271137573e-05, + "loss": 0.1541, + "num_input_tokens_seen": 9419000, + "step": 13125 + }, + { + "epoch": 27.2972972972973, + "grad_norm": 0.3654574453830719, + "learning_rate": 3.7845831385942655e-05, + "loss": 0.1248, + "num_input_tokens_seen": 9422456, + "step": 13130 + }, + { + "epoch": 27.307692307692307, + "grad_norm": 0.2901967167854309, + "learning_rate": 3.7837408079520944e-05, + "loss": 0.1135, + "num_input_tokens_seen": 9425944, + "step": 13135 + }, + { + "epoch": 27.31808731808732, + "grad_norm": 0.28227463364601135, + "learning_rate": 3.782898279340957e-05, + "loss": 0.1229, + "num_input_tokens_seen": 9429720, + "step": 13140 + }, + { + "epoch": 27.328482328482327, + "grad_norm": 0.17370961606502533, + "learning_rate": 3.782055552890784e-05, + "loss": 0.1123, + "num_input_tokens_seen": 9433400, + "step": 13145 + }, + { + "epoch": 27.33887733887734, + "grad_norm": 0.2742220163345337, + "learning_rate": 3.781212628731534e-05, + "loss": 0.1374, + "num_input_tokens_seen": 9436984, + "step": 13150 + }, + { + "epoch": 27.34927234927235, + "grad_norm": 0.2842203378677368, + "learning_rate": 3.7803695069931946e-05, + "loss": 0.114, + "num_input_tokens_seen": 9440408, + "step": 13155 + }, + { + "epoch": 27.35966735966736, + "grad_norm": 0.21423961222171783, + "learning_rate": 3.779526187805789e-05, + "loss": 0.1291, + "num_input_tokens_seen": 9443960, + "step": 13160 + }, + { + "epoch": 27.37006237006237, + "grad_norm": 0.46721047163009644, + "learning_rate": 3.778682671299364e-05, + "loss": 0.1204, + "num_input_tokens_seen": 9447512, + "step": 13165 + }, + { + "epoch": 27.38045738045738, + "grad_norm": 0.7109936475753784, + "learning_rate": 3.777838957604003e-05, + "loss": 0.151, + "num_input_tokens_seen": 9451096, + "step": 13170 + }, + { + "epoch": 27.39085239085239, + "grad_norm": 0.18904416263103485, + "learning_rate": 3.776995046849816e-05, + "loss": 0.1376, + "num_input_tokens_seen": 9454712, + "step": 13175 + }, + { + "epoch": 27.401247401247403, + "grad_norm": 0.526328980922699, + "learning_rate": 3.776150939166945e-05, + "loss": 0.1197, + "num_input_tokens_seen": 9458264, + "step": 13180 + }, + { + "epoch": 27.41164241164241, + "grad_norm": 0.11064225435256958, + "learning_rate": 3.775306634685562e-05, + "loss": 0.1507, + "num_input_tokens_seen": 9461784, + "step": 13185 + }, + { + "epoch": 27.422037422037423, + "grad_norm": 0.31032276153564453, + "learning_rate": 3.7744621335358696e-05, + "loss": 0.0928, + "num_input_tokens_seen": 9465144, + "step": 13190 + }, + { + "epoch": 27.43243243243243, + "grad_norm": 0.2304452359676361, + "learning_rate": 3.7736174358481e-05, + "loss": 0.2147, + "num_input_tokens_seen": 9468696, + "step": 13195 + }, + { + "epoch": 27.442827442827443, + "grad_norm": 0.6314427852630615, + "learning_rate": 3.7727725417525175e-05, + "loss": 0.1083, + "num_input_tokens_seen": 9472184, + "step": 13200 + }, + { + "epoch": 27.442827442827443, + "eval_loss": 0.14200371503829956, + "eval_runtime": 7.7636, + "eval_samples_per_second": 110.258, + "eval_steps_per_second": 27.565, + "num_input_tokens_seen": 9472184, + "step": 13200 + }, + { + "epoch": 27.453222453222452, + "grad_norm": 0.2525376081466675, + "learning_rate": 3.771927451379414e-05, + "loss": 0.1091, + "num_input_tokens_seen": 9475672, + "step": 13205 + }, + { + "epoch": 27.463617463617464, + "grad_norm": 0.17531165480613708, + "learning_rate": 3.7710821648591135e-05, + "loss": 0.1161, + "num_input_tokens_seen": 9479256, + "step": 13210 + }, + { + "epoch": 27.474012474012476, + "grad_norm": 0.5026007890701294, + "learning_rate": 3.7702366823219694e-05, + "loss": 0.127, + "num_input_tokens_seen": 9482872, + "step": 13215 + }, + { + "epoch": 27.484407484407484, + "grad_norm": 0.37198135256767273, + "learning_rate": 3.769391003898366e-05, + "loss": 0.1227, + "num_input_tokens_seen": 9486648, + "step": 13220 + }, + { + "epoch": 27.494802494802496, + "grad_norm": 0.11145754903554916, + "learning_rate": 3.768545129718718e-05, + "loss": 0.1409, + "num_input_tokens_seen": 9490264, + "step": 13225 + }, + { + "epoch": 27.505197505197504, + "grad_norm": 0.47773924469947815, + "learning_rate": 3.7676990599134686e-05, + "loss": 0.1205, + "num_input_tokens_seen": 9493816, + "step": 13230 + }, + { + "epoch": 27.515592515592516, + "grad_norm": 0.20587584376335144, + "learning_rate": 3.766852794613095e-05, + "loss": 0.1148, + "num_input_tokens_seen": 9497400, + "step": 13235 + }, + { + "epoch": 27.525987525987524, + "grad_norm": 0.2348073422908783, + "learning_rate": 3.766006333948099e-05, + "loss": 0.1145, + "num_input_tokens_seen": 9500952, + "step": 13240 + }, + { + "epoch": 27.536382536382536, + "grad_norm": 0.7123271226882935, + "learning_rate": 3.765159678049017e-05, + "loss": 0.1792, + "num_input_tokens_seen": 9504440, + "step": 13245 + }, + { + "epoch": 27.546777546777548, + "grad_norm": 0.21312038600444794, + "learning_rate": 3.7643128270464134e-05, + "loss": 0.0982, + "num_input_tokens_seen": 9508024, + "step": 13250 + }, + { + "epoch": 27.557172557172557, + "grad_norm": 0.6061801910400391, + "learning_rate": 3.763465781070884e-05, + "loss": 0.1309, + "num_input_tokens_seen": 9511704, + "step": 13255 + }, + { + "epoch": 27.56756756756757, + "grad_norm": 0.20178738236427307, + "learning_rate": 3.762618540253052e-05, + "loss": 0.1304, + "num_input_tokens_seen": 9515192, + "step": 13260 + }, + { + "epoch": 27.577962577962577, + "grad_norm": 0.19288034737110138, + "learning_rate": 3.761771104723576e-05, + "loss": 0.1221, + "num_input_tokens_seen": 9518712, + "step": 13265 + }, + { + "epoch": 27.58835758835759, + "grad_norm": 0.20463456213474274, + "learning_rate": 3.7609234746131386e-05, + "loss": 0.1245, + "num_input_tokens_seen": 9522360, + "step": 13270 + }, + { + "epoch": 27.598752598752597, + "grad_norm": 0.44329550862312317, + "learning_rate": 3.7600756500524556e-05, + "loss": 0.1408, + "num_input_tokens_seen": 9525880, + "step": 13275 + }, + { + "epoch": 27.60914760914761, + "grad_norm": 0.3488925099372864, + "learning_rate": 3.759227631172271e-05, + "loss": 0.0854, + "num_input_tokens_seen": 9529528, + "step": 13280 + }, + { + "epoch": 27.61954261954262, + "grad_norm": 0.19754759967327118, + "learning_rate": 3.758379418103363e-05, + "loss": 0.1351, + "num_input_tokens_seen": 9532984, + "step": 13285 + }, + { + "epoch": 27.62993762993763, + "grad_norm": 0.23186874389648438, + "learning_rate": 3.757531010976534e-05, + "loss": 0.0977, + "num_input_tokens_seen": 9536600, + "step": 13290 + }, + { + "epoch": 27.64033264033264, + "grad_norm": 0.32712769508361816, + "learning_rate": 3.75668240992262e-05, + "loss": 0.1189, + "num_input_tokens_seen": 9540408, + "step": 13295 + }, + { + "epoch": 27.65072765072765, + "grad_norm": 0.18482495844364166, + "learning_rate": 3.7558336150724865e-05, + "loss": 0.1069, + "num_input_tokens_seen": 9544248, + "step": 13300 + }, + { + "epoch": 27.66112266112266, + "grad_norm": 0.43494468927383423, + "learning_rate": 3.754984626557028e-05, + "loss": 0.1485, + "num_input_tokens_seen": 9547832, + "step": 13305 + }, + { + "epoch": 27.671517671517673, + "grad_norm": 0.19976025819778442, + "learning_rate": 3.754135444507168e-05, + "loss": 0.116, + "num_input_tokens_seen": 9551672, + "step": 13310 + }, + { + "epoch": 27.68191268191268, + "grad_norm": 0.43536949157714844, + "learning_rate": 3.753286069053863e-05, + "loss": 0.1055, + "num_input_tokens_seen": 9555096, + "step": 13315 + }, + { + "epoch": 27.692307692307693, + "grad_norm": 0.41814592480659485, + "learning_rate": 3.7524365003280945e-05, + "loss": 0.1005, + "num_input_tokens_seen": 9558584, + "step": 13320 + }, + { + "epoch": 27.7027027027027, + "grad_norm": 0.42255860567092896, + "learning_rate": 3.75158673846088e-05, + "loss": 0.0832, + "num_input_tokens_seen": 9562232, + "step": 13325 + }, + { + "epoch": 27.713097713097714, + "grad_norm": 0.4064202904701233, + "learning_rate": 3.750736783583262e-05, + "loss": 0.0924, + "num_input_tokens_seen": 9565624, + "step": 13330 + }, + { + "epoch": 27.723492723492722, + "grad_norm": 0.3265465199947357, + "learning_rate": 3.7498866358263144e-05, + "loss": 0.1731, + "num_input_tokens_seen": 9569048, + "step": 13335 + }, + { + "epoch": 27.733887733887734, + "grad_norm": 0.30166298151016235, + "learning_rate": 3.74903629532114e-05, + "loss": 0.1332, + "num_input_tokens_seen": 9572696, + "step": 13340 + }, + { + "epoch": 27.744282744282746, + "grad_norm": 0.3361913859844208, + "learning_rate": 3.748185762198873e-05, + "loss": 0.1282, + "num_input_tokens_seen": 9576408, + "step": 13345 + }, + { + "epoch": 27.754677754677754, + "grad_norm": 0.16627229750156403, + "learning_rate": 3.747335036590676e-05, + "loss": 0.091, + "num_input_tokens_seen": 9580088, + "step": 13350 + }, + { + "epoch": 27.765072765072766, + "grad_norm": 0.3743651807308197, + "learning_rate": 3.7464841186277405e-05, + "loss": 0.1075, + "num_input_tokens_seen": 9583800, + "step": 13355 + }, + { + "epoch": 27.775467775467774, + "grad_norm": 0.39945876598358154, + "learning_rate": 3.7456330084412896e-05, + "loss": 0.1526, + "num_input_tokens_seen": 9587384, + "step": 13360 + }, + { + "epoch": 27.785862785862786, + "grad_norm": 0.30259984731674194, + "learning_rate": 3.744781706162576e-05, + "loss": 0.1117, + "num_input_tokens_seen": 9591064, + "step": 13365 + }, + { + "epoch": 27.796257796257795, + "grad_norm": 0.25384417176246643, + "learning_rate": 3.743930211922879e-05, + "loss": 0.0996, + "num_input_tokens_seen": 9594584, + "step": 13370 + }, + { + "epoch": 27.806652806652806, + "grad_norm": 0.15578363835811615, + "learning_rate": 3.743078525853513e-05, + "loss": 0.1026, + "num_input_tokens_seen": 9598104, + "step": 13375 + }, + { + "epoch": 27.81704781704782, + "grad_norm": 0.20182979106903076, + "learning_rate": 3.7422266480858154e-05, + "loss": 0.0933, + "num_input_tokens_seen": 9601720, + "step": 13380 + }, + { + "epoch": 27.827442827442827, + "grad_norm": 0.2797125279903412, + "learning_rate": 3.741374578751158e-05, + "loss": 0.094, + "num_input_tokens_seen": 9605240, + "step": 13385 + }, + { + "epoch": 27.83783783783784, + "grad_norm": 0.20333334803581238, + "learning_rate": 3.740522317980941e-05, + "loss": 0.1049, + "num_input_tokens_seen": 9608888, + "step": 13390 + }, + { + "epoch": 27.848232848232847, + "grad_norm": 0.2504636347293854, + "learning_rate": 3.739669865906593e-05, + "loss": 0.1094, + "num_input_tokens_seen": 9612472, + "step": 13395 + }, + { + "epoch": 27.85862785862786, + "grad_norm": 0.4510945975780487, + "learning_rate": 3.738817222659573e-05, + "loss": 0.1044, + "num_input_tokens_seen": 9616056, + "step": 13400 + }, + { + "epoch": 27.85862785862786, + "eval_loss": 0.15101295709609985, + "eval_runtime": 7.762, + "eval_samples_per_second": 110.28, + "eval_steps_per_second": 27.57, + "num_input_tokens_seen": 9616056, + "step": 13400 + }, + { + "epoch": 27.86902286902287, + "grad_norm": 0.1640130579471588, + "learning_rate": 3.73796438837137e-05, + "loss": 0.1303, + "num_input_tokens_seen": 9619736, + "step": 13405 + }, + { + "epoch": 27.87941787941788, + "grad_norm": 0.667628288269043, + "learning_rate": 3.7371113631735e-05, + "loss": 0.0905, + "num_input_tokens_seen": 9623224, + "step": 13410 + }, + { + "epoch": 27.88981288981289, + "grad_norm": 0.6575522422790527, + "learning_rate": 3.736258147197512e-05, + "loss": 0.0965, + "num_input_tokens_seen": 9626904, + "step": 13415 + }, + { + "epoch": 27.9002079002079, + "grad_norm": 1.0003674030303955, + "learning_rate": 3.735404740574981e-05, + "loss": 0.129, + "num_input_tokens_seen": 9630584, + "step": 13420 + }, + { + "epoch": 27.91060291060291, + "grad_norm": 0.18846416473388672, + "learning_rate": 3.7345511434375145e-05, + "loss": 0.0973, + "num_input_tokens_seen": 9634104, + "step": 13425 + }, + { + "epoch": 27.92099792099792, + "grad_norm": 0.2249540537595749, + "learning_rate": 3.733697355916748e-05, + "loss": 0.1318, + "num_input_tokens_seen": 9637720, + "step": 13430 + }, + { + "epoch": 27.93139293139293, + "grad_norm": 0.3493776023387909, + "learning_rate": 3.732843378144345e-05, + "loss": 0.0986, + "num_input_tokens_seen": 9641304, + "step": 13435 + }, + { + "epoch": 27.941787941787943, + "grad_norm": 0.1976539045572281, + "learning_rate": 3.7319892102519995e-05, + "loss": 0.1009, + "num_input_tokens_seen": 9644856, + "step": 13440 + }, + { + "epoch": 27.95218295218295, + "grad_norm": 0.32584309577941895, + "learning_rate": 3.731134852371436e-05, + "loss": 0.1213, + "num_input_tokens_seen": 9648536, + "step": 13445 + }, + { + "epoch": 27.962577962577964, + "grad_norm": 0.23433032631874084, + "learning_rate": 3.730280304634408e-05, + "loss": 0.0831, + "num_input_tokens_seen": 9652056, + "step": 13450 + }, + { + "epoch": 27.972972972972972, + "grad_norm": 0.24342992901802063, + "learning_rate": 3.729425567172696e-05, + "loss": 0.1377, + "num_input_tokens_seen": 9655672, + "step": 13455 + }, + { + "epoch": 27.983367983367984, + "grad_norm": 0.29406923055648804, + "learning_rate": 3.728570640118111e-05, + "loss": 0.0953, + "num_input_tokens_seen": 9659224, + "step": 13460 + }, + { + "epoch": 27.993762993762992, + "grad_norm": 0.3409288823604584, + "learning_rate": 3.727715523602494e-05, + "loss": 0.1352, + "num_input_tokens_seen": 9662904, + "step": 13465 + }, + { + "epoch": 28.004158004158004, + "grad_norm": 0.3473464250564575, + "learning_rate": 3.726860217757715e-05, + "loss": 0.1341, + "num_input_tokens_seen": 9666416, + "step": 13470 + }, + { + "epoch": 28.014553014553016, + "grad_norm": 0.5756229162216187, + "learning_rate": 3.726004722715673e-05, + "loss": 0.1064, + "num_input_tokens_seen": 9669712, + "step": 13475 + }, + { + "epoch": 28.024948024948024, + "grad_norm": 0.4163796007633209, + "learning_rate": 3.725149038608296e-05, + "loss": 0.1179, + "num_input_tokens_seen": 9673232, + "step": 13480 + }, + { + "epoch": 28.035343035343036, + "grad_norm": 0.38472169637680054, + "learning_rate": 3.7242931655675404e-05, + "loss": 0.0963, + "num_input_tokens_seen": 9676752, + "step": 13485 + }, + { + "epoch": 28.045738045738045, + "grad_norm": 0.32984426617622375, + "learning_rate": 3.7234371037253937e-05, + "loss": 0.0822, + "num_input_tokens_seen": 9680208, + "step": 13490 + }, + { + "epoch": 28.056133056133056, + "grad_norm": 0.1693108081817627, + "learning_rate": 3.7225808532138705e-05, + "loss": 0.0882, + "num_input_tokens_seen": 9683696, + "step": 13495 + }, + { + "epoch": 28.066528066528065, + "grad_norm": 0.501010537147522, + "learning_rate": 3.721724414165016e-05, + "loss": 0.1106, + "num_input_tokens_seen": 9687184, + "step": 13500 + }, + { + "epoch": 28.076923076923077, + "grad_norm": 0.3097720444202423, + "learning_rate": 3.720867786710904e-05, + "loss": 0.1046, + "num_input_tokens_seen": 9691024, + "step": 13505 + }, + { + "epoch": 28.08731808731809, + "grad_norm": 0.39457517862319946, + "learning_rate": 3.7200109709836366e-05, + "loss": 0.089, + "num_input_tokens_seen": 9694608, + "step": 13510 + }, + { + "epoch": 28.097713097713097, + "grad_norm": 0.2579469680786133, + "learning_rate": 3.7191539671153465e-05, + "loss": 0.0824, + "num_input_tokens_seen": 9698032, + "step": 13515 + }, + { + "epoch": 28.10810810810811, + "grad_norm": 0.9464858770370483, + "learning_rate": 3.718296775238193e-05, + "loss": 0.1156, + "num_input_tokens_seen": 9701584, + "step": 13520 + }, + { + "epoch": 28.118503118503117, + "grad_norm": 0.5151515007019043, + "learning_rate": 3.7174393954843675e-05, + "loss": 0.1285, + "num_input_tokens_seen": 9705168, + "step": 13525 + }, + { + "epoch": 28.12889812889813, + "grad_norm": 0.44493764638900757, + "learning_rate": 3.716581827986087e-05, + "loss": 0.1765, + "num_input_tokens_seen": 9709104, + "step": 13530 + }, + { + "epoch": 28.13929313929314, + "grad_norm": 0.39053723216056824, + "learning_rate": 3.7157240728756004e-05, + "loss": 0.0896, + "num_input_tokens_seen": 9712784, + "step": 13535 + }, + { + "epoch": 28.14968814968815, + "grad_norm": 0.8415313363075256, + "learning_rate": 3.714866130285184e-05, + "loss": 0.1485, + "num_input_tokens_seen": 9716592, + "step": 13540 + }, + { + "epoch": 28.16008316008316, + "grad_norm": 0.4272253215312958, + "learning_rate": 3.714008000347143e-05, + "loss": 0.1579, + "num_input_tokens_seen": 9720080, + "step": 13545 + }, + { + "epoch": 28.17047817047817, + "grad_norm": 0.5421146154403687, + "learning_rate": 3.7131496831938126e-05, + "loss": 0.1458, + "num_input_tokens_seen": 9723600, + "step": 13550 + }, + { + "epoch": 28.18087318087318, + "grad_norm": 0.5529787540435791, + "learning_rate": 3.7122911789575565e-05, + "loss": 0.1574, + "num_input_tokens_seen": 9727408, + "step": 13555 + }, + { + "epoch": 28.19126819126819, + "grad_norm": 0.39070454239845276, + "learning_rate": 3.711432487770765e-05, + "loss": 0.116, + "num_input_tokens_seen": 9731024, + "step": 13560 + }, + { + "epoch": 28.2016632016632, + "grad_norm": 0.22910194098949432, + "learning_rate": 3.710573609765861e-05, + "loss": 0.1322, + "num_input_tokens_seen": 9734736, + "step": 13565 + }, + { + "epoch": 28.212058212058214, + "grad_norm": 0.22874057292938232, + "learning_rate": 3.709714545075292e-05, + "loss": 0.0801, + "num_input_tokens_seen": 9738256, + "step": 13570 + }, + { + "epoch": 28.222453222453222, + "grad_norm": 0.5329493284225464, + "learning_rate": 3.708855293831538e-05, + "loss": 0.1111, + "num_input_tokens_seen": 9741840, + "step": 13575 + }, + { + "epoch": 28.232848232848234, + "grad_norm": 0.22176098823547363, + "learning_rate": 3.707995856167107e-05, + "loss": 0.0894, + "num_input_tokens_seen": 9745424, + "step": 13580 + }, + { + "epoch": 28.243243243243242, + "grad_norm": 0.15640242397785187, + "learning_rate": 3.707136232214534e-05, + "loss": 0.1158, + "num_input_tokens_seen": 9749168, + "step": 13585 + }, + { + "epoch": 28.253638253638254, + "grad_norm": 0.6389886140823364, + "learning_rate": 3.7062764221063844e-05, + "loss": 0.0983, + "num_input_tokens_seen": 9752720, + "step": 13590 + }, + { + "epoch": 28.264033264033262, + "grad_norm": 0.7853202223777771, + "learning_rate": 3.705416425975252e-05, + "loss": 0.1212, + "num_input_tokens_seen": 9756272, + "step": 13595 + }, + { + "epoch": 28.274428274428274, + "grad_norm": 0.2545016407966614, + "learning_rate": 3.704556243953758e-05, + "loss": 0.0876, + "num_input_tokens_seen": 9759824, + "step": 13600 + }, + { + "epoch": 28.274428274428274, + "eval_loss": 0.1451934427022934, + "eval_runtime": 7.7542, + "eval_samples_per_second": 110.392, + "eval_steps_per_second": 27.598, + "num_input_tokens_seen": 9759824, + "step": 13600 + }, + { + "epoch": 28.284823284823286, + "grad_norm": 0.3048713803291321, + "learning_rate": 3.7036958761745535e-05, + "loss": 0.1369, + "num_input_tokens_seen": 9763376, + "step": 13605 + }, + { + "epoch": 28.295218295218294, + "grad_norm": 0.37224119901657104, + "learning_rate": 3.702835322770318e-05, + "loss": 0.1006, + "num_input_tokens_seen": 9766928, + "step": 13610 + }, + { + "epoch": 28.305613305613306, + "grad_norm": 0.4921845495700836, + "learning_rate": 3.701974583873761e-05, + "loss": 0.0868, + "num_input_tokens_seen": 9770544, + "step": 13615 + }, + { + "epoch": 28.316008316008315, + "grad_norm": 0.3046557605266571, + "learning_rate": 3.701113659617618e-05, + "loss": 0.1235, + "num_input_tokens_seen": 9774224, + "step": 13620 + }, + { + "epoch": 28.326403326403327, + "grad_norm": 0.2732904255390167, + "learning_rate": 3.7002525501346535e-05, + "loss": 0.1402, + "num_input_tokens_seen": 9777808, + "step": 13625 + }, + { + "epoch": 28.33679833679834, + "grad_norm": 0.494414359331131, + "learning_rate": 3.699391255557664e-05, + "loss": 0.1249, + "num_input_tokens_seen": 9781360, + "step": 13630 + }, + { + "epoch": 28.347193347193347, + "grad_norm": 0.2771608829498291, + "learning_rate": 3.69852977601947e-05, + "loss": 0.1021, + "num_input_tokens_seen": 9785008, + "step": 13635 + }, + { + "epoch": 28.35758835758836, + "grad_norm": 0.2765471339225769, + "learning_rate": 3.697668111652922e-05, + "loss": 0.1467, + "num_input_tokens_seen": 9788624, + "step": 13640 + }, + { + "epoch": 28.367983367983367, + "grad_norm": 0.16484582424163818, + "learning_rate": 3.6968062625909005e-05, + "loss": 0.1223, + "num_input_tokens_seen": 9792336, + "step": 13645 + }, + { + "epoch": 28.37837837837838, + "grad_norm": 0.17534935474395752, + "learning_rate": 3.6959442289663135e-05, + "loss": 0.1353, + "num_input_tokens_seen": 9796048, + "step": 13650 + }, + { + "epoch": 28.388773388773387, + "grad_norm": 0.3146221935749054, + "learning_rate": 3.695082010912098e-05, + "loss": 0.087, + "num_input_tokens_seen": 9799760, + "step": 13655 + }, + { + "epoch": 28.3991683991684, + "grad_norm": 0.59608393907547, + "learning_rate": 3.694219608561217e-05, + "loss": 0.1291, + "num_input_tokens_seen": 9803280, + "step": 13660 + }, + { + "epoch": 28.40956340956341, + "grad_norm": 0.3322247564792633, + "learning_rate": 3.693357022046665e-05, + "loss": 0.0827, + "num_input_tokens_seen": 9806864, + "step": 13665 + }, + { + "epoch": 28.41995841995842, + "grad_norm": 0.6191847920417786, + "learning_rate": 3.6924942515014644e-05, + "loss": 0.1147, + "num_input_tokens_seen": 9810384, + "step": 13670 + }, + { + "epoch": 28.43035343035343, + "grad_norm": 0.4637325406074524, + "learning_rate": 3.691631297058664e-05, + "loss": 0.1383, + "num_input_tokens_seen": 9814032, + "step": 13675 + }, + { + "epoch": 28.44074844074844, + "grad_norm": 0.6023021936416626, + "learning_rate": 3.6907681588513424e-05, + "loss": 0.1228, + "num_input_tokens_seen": 9817840, + "step": 13680 + }, + { + "epoch": 28.45114345114345, + "grad_norm": 0.11291900277137756, + "learning_rate": 3.689904837012606e-05, + "loss": 0.1044, + "num_input_tokens_seen": 9821296, + "step": 13685 + }, + { + "epoch": 28.46153846153846, + "grad_norm": 0.19960124790668488, + "learning_rate": 3.689041331675591e-05, + "loss": 0.0906, + "num_input_tokens_seen": 9824784, + "step": 13690 + }, + { + "epoch": 28.471933471933472, + "grad_norm": 0.7673927545547485, + "learning_rate": 3.688177642973461e-05, + "loss": 0.1582, + "num_input_tokens_seen": 9828368, + "step": 13695 + }, + { + "epoch": 28.482328482328484, + "grad_norm": 0.2768864929676056, + "learning_rate": 3.687313771039406e-05, + "loss": 0.1055, + "num_input_tokens_seen": 9831952, + "step": 13700 + }, + { + "epoch": 28.492723492723492, + "grad_norm": 0.4082251191139221, + "learning_rate": 3.686449716006647e-05, + "loss": 0.1275, + "num_input_tokens_seen": 9835536, + "step": 13705 + }, + { + "epoch": 28.503118503118504, + "grad_norm": 0.9707143306732178, + "learning_rate": 3.685585478008432e-05, + "loss": 0.1877, + "num_input_tokens_seen": 9839056, + "step": 13710 + }, + { + "epoch": 28.513513513513512, + "grad_norm": 0.2680625915527344, + "learning_rate": 3.6847210571780364e-05, + "loss": 0.1042, + "num_input_tokens_seen": 9842576, + "step": 13715 + }, + { + "epoch": 28.523908523908524, + "grad_norm": 0.15738509595394135, + "learning_rate": 3.683856453648767e-05, + "loss": 0.0874, + "num_input_tokens_seen": 9846256, + "step": 13720 + }, + { + "epoch": 28.534303534303533, + "grad_norm": 0.18216317892074585, + "learning_rate": 3.682991667553954e-05, + "loss": 0.108, + "num_input_tokens_seen": 9849744, + "step": 13725 + }, + { + "epoch": 28.544698544698544, + "grad_norm": 0.3248346149921417, + "learning_rate": 3.6821266990269606e-05, + "loss": 0.149, + "num_input_tokens_seen": 9853392, + "step": 13730 + }, + { + "epoch": 28.555093555093556, + "grad_norm": 0.2425808161497116, + "learning_rate": 3.681261548201174e-05, + "loss": 0.1168, + "num_input_tokens_seen": 9856976, + "step": 13735 + }, + { + "epoch": 28.565488565488565, + "grad_norm": 0.8484617471694946, + "learning_rate": 3.6803962152100125e-05, + "loss": 0.1612, + "num_input_tokens_seen": 9860528, + "step": 13740 + }, + { + "epoch": 28.575883575883577, + "grad_norm": 0.47785142064094543, + "learning_rate": 3.67953070018692e-05, + "loss": 0.1076, + "num_input_tokens_seen": 9864208, + "step": 13745 + }, + { + "epoch": 28.586278586278585, + "grad_norm": 0.5499173402786255, + "learning_rate": 3.678665003265371e-05, + "loss": 0.112, + "num_input_tokens_seen": 9867760, + "step": 13750 + }, + { + "epoch": 28.596673596673597, + "grad_norm": 0.2020009607076645, + "learning_rate": 3.677799124578867e-05, + "loss": 0.1049, + "num_input_tokens_seen": 9871216, + "step": 13755 + }, + { + "epoch": 28.60706860706861, + "grad_norm": 0.9639396667480469, + "learning_rate": 3.676933064260937e-05, + "loss": 0.1187, + "num_input_tokens_seen": 9874928, + "step": 13760 + }, + { + "epoch": 28.617463617463617, + "grad_norm": 0.8455857038497925, + "learning_rate": 3.6760668224451365e-05, + "loss": 0.0946, + "num_input_tokens_seen": 9878480, + "step": 13765 + }, + { + "epoch": 28.62785862785863, + "grad_norm": 0.16178233921527863, + "learning_rate": 3.675200399265054e-05, + "loss": 0.0592, + "num_input_tokens_seen": 9882064, + "step": 13770 + }, + { + "epoch": 28.638253638253637, + "grad_norm": 0.2794564962387085, + "learning_rate": 3.6743337948543014e-05, + "loss": 0.1311, + "num_input_tokens_seen": 9885744, + "step": 13775 + }, + { + "epoch": 28.64864864864865, + "grad_norm": 0.6003010272979736, + "learning_rate": 3.6734670093465204e-05, + "loss": 0.1247, + "num_input_tokens_seen": 9889392, + "step": 13780 + }, + { + "epoch": 28.659043659043657, + "grad_norm": 0.47712811827659607, + "learning_rate": 3.672600042875379e-05, + "loss": 0.1105, + "num_input_tokens_seen": 9893008, + "step": 13785 + }, + { + "epoch": 28.66943866943867, + "grad_norm": 0.1697029173374176, + "learning_rate": 3.671732895574575e-05, + "loss": 0.0893, + "num_input_tokens_seen": 9896656, + "step": 13790 + }, + { + "epoch": 28.67983367983368, + "grad_norm": 0.1752805858850479, + "learning_rate": 3.670865567577834e-05, + "loss": 0.1078, + "num_input_tokens_seen": 9900240, + "step": 13795 + }, + { + "epoch": 28.69022869022869, + "grad_norm": 0.31295573711395264, + "learning_rate": 3.669998059018909e-05, + "loss": 0.0652, + "num_input_tokens_seen": 9903824, + "step": 13800 + }, + { + "epoch": 28.69022869022869, + "eval_loss": 0.14634433388710022, + "eval_runtime": 7.7478, + "eval_samples_per_second": 110.483, + "eval_steps_per_second": 27.621, + "num_input_tokens_seen": 9903824, + "step": 13800 + }, + { + "epoch": 28.7006237006237, + "grad_norm": 0.12073328346014023, + "learning_rate": 3.6691303700315796e-05, + "loss": 0.1032, + "num_input_tokens_seen": 9907472, + "step": 13805 + }, + { + "epoch": 28.71101871101871, + "grad_norm": 0.315909206867218, + "learning_rate": 3.668262500749655e-05, + "loss": 0.1471, + "num_input_tokens_seen": 9911184, + "step": 13810 + }, + { + "epoch": 28.72141372141372, + "grad_norm": 0.43676838278770447, + "learning_rate": 3.667394451306971e-05, + "loss": 0.1165, + "num_input_tokens_seen": 9914736, + "step": 13815 + }, + { + "epoch": 28.731808731808734, + "grad_norm": 0.23348002135753632, + "learning_rate": 3.666526221837393e-05, + "loss": 0.0864, + "num_input_tokens_seen": 9918256, + "step": 13820 + }, + { + "epoch": 28.742203742203742, + "grad_norm": 0.5817322731018066, + "learning_rate": 3.665657812474812e-05, + "loss": 0.1351, + "num_input_tokens_seen": 9921776, + "step": 13825 + }, + { + "epoch": 28.752598752598754, + "grad_norm": 0.1322726011276245, + "learning_rate": 3.664789223353147e-05, + "loss": 0.0979, + "num_input_tokens_seen": 9925296, + "step": 13830 + }, + { + "epoch": 28.762993762993762, + "grad_norm": 0.3500148355960846, + "learning_rate": 3.663920454606347e-05, + "loss": 0.1189, + "num_input_tokens_seen": 9928912, + "step": 13835 + }, + { + "epoch": 28.773388773388774, + "grad_norm": 0.6728733777999878, + "learning_rate": 3.6630515063683856e-05, + "loss": 0.136, + "num_input_tokens_seen": 9932464, + "step": 13840 + }, + { + "epoch": 28.783783783783782, + "grad_norm": 0.4349099397659302, + "learning_rate": 3.662182378773267e-05, + "loss": 0.1233, + "num_input_tokens_seen": 9936016, + "step": 13845 + }, + { + "epoch": 28.794178794178794, + "grad_norm": 0.23740610480308533, + "learning_rate": 3.66131307195502e-05, + "loss": 0.0842, + "num_input_tokens_seen": 9939696, + "step": 13850 + }, + { + "epoch": 28.804573804573806, + "grad_norm": 0.20585393905639648, + "learning_rate": 3.6604435860477034e-05, + "loss": 0.0842, + "num_input_tokens_seen": 9943248, + "step": 13855 + }, + { + "epoch": 28.814968814968815, + "grad_norm": 0.4002854824066162, + "learning_rate": 3.6595739211854025e-05, + "loss": 0.1413, + "num_input_tokens_seen": 9946768, + "step": 13860 + }, + { + "epoch": 28.825363825363826, + "grad_norm": 0.2759881317615509, + "learning_rate": 3.658704077502231e-05, + "loss": 0.1302, + "num_input_tokens_seen": 9950352, + "step": 13865 + }, + { + "epoch": 28.835758835758835, + "grad_norm": 0.44040465354919434, + "learning_rate": 3.65783405513233e-05, + "loss": 0.1213, + "num_input_tokens_seen": 9954192, + "step": 13870 + }, + { + "epoch": 28.846153846153847, + "grad_norm": 0.262378990650177, + "learning_rate": 3.656963854209867e-05, + "loss": 0.124, + "num_input_tokens_seen": 9957648, + "step": 13875 + }, + { + "epoch": 28.856548856548855, + "grad_norm": 0.40471330285072327, + "learning_rate": 3.656093474869038e-05, + "loss": 0.1171, + "num_input_tokens_seen": 9961232, + "step": 13880 + }, + { + "epoch": 28.866943866943867, + "grad_norm": 0.24068868160247803, + "learning_rate": 3.655222917244068e-05, + "loss": 0.1497, + "num_input_tokens_seen": 9964912, + "step": 13885 + }, + { + "epoch": 28.87733887733888, + "grad_norm": 0.2954504191875458, + "learning_rate": 3.6543521814692054e-05, + "loss": 0.1509, + "num_input_tokens_seen": 9968560, + "step": 13890 + }, + { + "epoch": 28.887733887733887, + "grad_norm": 0.19453680515289307, + "learning_rate": 3.653481267678731e-05, + "loss": 0.0935, + "num_input_tokens_seen": 9972208, + "step": 13895 + }, + { + "epoch": 28.8981288981289, + "grad_norm": 0.4740135371685028, + "learning_rate": 3.652610176006949e-05, + "loss": 0.1236, + "num_input_tokens_seen": 9975696, + "step": 13900 + }, + { + "epoch": 28.908523908523907, + "grad_norm": 0.1376517415046692, + "learning_rate": 3.6517389065881925e-05, + "loss": 0.1117, + "num_input_tokens_seen": 9979216, + "step": 13905 + }, + { + "epoch": 28.91891891891892, + "grad_norm": 0.21827377378940582, + "learning_rate": 3.650867459556824e-05, + "loss": 0.116, + "num_input_tokens_seen": 9982736, + "step": 13910 + }, + { + "epoch": 28.929313929313928, + "grad_norm": 0.3131656348705292, + "learning_rate": 3.64999583504723e-05, + "loss": 0.1163, + "num_input_tokens_seen": 9986256, + "step": 13915 + }, + { + "epoch": 28.93970893970894, + "grad_norm": 0.9463793635368347, + "learning_rate": 3.649124033193827e-05, + "loss": 0.1265, + "num_input_tokens_seen": 9989872, + "step": 13920 + }, + { + "epoch": 28.95010395010395, + "grad_norm": 0.5447056293487549, + "learning_rate": 3.648252054131057e-05, + "loss": 0.1219, + "num_input_tokens_seen": 9993328, + "step": 13925 + }, + { + "epoch": 28.96049896049896, + "grad_norm": 0.2931716740131378, + "learning_rate": 3.647379897993391e-05, + "loss": 0.1013, + "num_input_tokens_seen": 9996752, + "step": 13930 + }, + { + "epoch": 28.97089397089397, + "grad_norm": 0.792118489742279, + "learning_rate": 3.646507564915325e-05, + "loss": 0.1534, + "num_input_tokens_seen": 10000240, + "step": 13935 + }, + { + "epoch": 28.98128898128898, + "grad_norm": 0.534467875957489, + "learning_rate": 3.645635055031385e-05, + "loss": 0.105, + "num_input_tokens_seen": 10003856, + "step": 13940 + }, + { + "epoch": 28.991683991683992, + "grad_norm": 0.3017849624156952, + "learning_rate": 3.6447623684761224e-05, + "loss": 0.1355, + "num_input_tokens_seen": 10007408, + "step": 13945 + }, + { + "epoch": 29.002079002079004, + "grad_norm": 0.5558481216430664, + "learning_rate": 3.643889505384117e-05, + "loss": 0.1571, + "num_input_tokens_seen": 10010936, + "step": 13950 + }, + { + "epoch": 29.012474012474012, + "grad_norm": 0.3312845826148987, + "learning_rate": 3.6430164658899744e-05, + "loss": 0.1032, + "num_input_tokens_seen": 10014392, + "step": 13955 + }, + { + "epoch": 29.022869022869024, + "grad_norm": 0.2904507517814636, + "learning_rate": 3.642143250128329e-05, + "loss": 0.0977, + "num_input_tokens_seen": 10017944, + "step": 13960 + }, + { + "epoch": 29.033264033264032, + "grad_norm": 0.5718663334846497, + "learning_rate": 3.641269858233841e-05, + "loss": 0.1236, + "num_input_tokens_seen": 10021560, + "step": 13965 + }, + { + "epoch": 29.043659043659044, + "grad_norm": 0.17059046030044556, + "learning_rate": 3.640396290341199e-05, + "loss": 0.1222, + "num_input_tokens_seen": 10025080, + "step": 13970 + }, + { + "epoch": 29.054054054054053, + "grad_norm": 0.19379547238349915, + "learning_rate": 3.639522546585118e-05, + "loss": 0.1349, + "num_input_tokens_seen": 10028664, + "step": 13975 + }, + { + "epoch": 29.064449064449065, + "grad_norm": 0.19841302931308746, + "learning_rate": 3.6386486271003404e-05, + "loss": 0.0943, + "num_input_tokens_seen": 10032312, + "step": 13980 + }, + { + "epoch": 29.074844074844076, + "grad_norm": 0.50764000415802, + "learning_rate": 3.6377745320216346e-05, + "loss": 0.1092, + "num_input_tokens_seen": 10035896, + "step": 13985 + }, + { + "epoch": 29.085239085239085, + "grad_norm": 0.3378729820251465, + "learning_rate": 3.636900261483798e-05, + "loss": 0.1165, + "num_input_tokens_seen": 10039416, + "step": 13990 + }, + { + "epoch": 29.095634095634097, + "grad_norm": 0.5471135377883911, + "learning_rate": 3.636025815621654e-05, + "loss": 0.1235, + "num_input_tokens_seen": 10043256, + "step": 13995 + }, + { + "epoch": 29.106029106029105, + "grad_norm": 0.5877634882926941, + "learning_rate": 3.635151194570054e-05, + "loss": 0.1238, + "num_input_tokens_seen": 10046680, + "step": 14000 + }, + { + "epoch": 29.106029106029105, + "eval_loss": 0.14382043480873108, + "eval_runtime": 7.7493, + "eval_samples_per_second": 110.462, + "eval_steps_per_second": 27.616, + "num_input_tokens_seen": 10046680, + "step": 14000 + }, + { + "epoch": 29.116424116424117, + "grad_norm": 0.31057167053222656, + "learning_rate": 3.634276398463873e-05, + "loss": 0.1305, + "num_input_tokens_seen": 10050168, + "step": 14005 + }, + { + "epoch": 29.126819126819125, + "grad_norm": 0.23526953160762787, + "learning_rate": 3.633401427438018e-05, + "loss": 0.0871, + "num_input_tokens_seen": 10053912, + "step": 14010 + }, + { + "epoch": 29.137214137214137, + "grad_norm": 0.31524696946144104, + "learning_rate": 3.63252628162742e-05, + "loss": 0.1633, + "num_input_tokens_seen": 10057528, + "step": 14015 + }, + { + "epoch": 29.14760914760915, + "grad_norm": 0.3054753243923187, + "learning_rate": 3.6316509611670364e-05, + "loss": 0.1141, + "num_input_tokens_seen": 10061016, + "step": 14020 + }, + { + "epoch": 29.158004158004157, + "grad_norm": 0.29838770627975464, + "learning_rate": 3.630775466191854e-05, + "loss": 0.1098, + "num_input_tokens_seen": 10064632, + "step": 14025 + }, + { + "epoch": 29.16839916839917, + "grad_norm": 0.19790971279144287, + "learning_rate": 3.629899796836884e-05, + "loss": 0.089, + "num_input_tokens_seen": 10068088, + "step": 14030 + }, + { + "epoch": 29.178794178794178, + "grad_norm": 0.1458965539932251, + "learning_rate": 3.6290239532371666e-05, + "loss": 0.0884, + "num_input_tokens_seen": 10071704, + "step": 14035 + }, + { + "epoch": 29.18918918918919, + "grad_norm": 0.16238242387771606, + "learning_rate": 3.628147935527767e-05, + "loss": 0.1081, + "num_input_tokens_seen": 10075352, + "step": 14040 + }, + { + "epoch": 29.1995841995842, + "grad_norm": 0.19004903733730316, + "learning_rate": 3.627271743843779e-05, + "loss": 0.109, + "num_input_tokens_seen": 10078904, + "step": 14045 + }, + { + "epoch": 29.20997920997921, + "grad_norm": 0.31967681646347046, + "learning_rate": 3.626395378320321e-05, + "loss": 0.0726, + "num_input_tokens_seen": 10082456, + "step": 14050 + }, + { + "epoch": 29.22037422037422, + "grad_norm": 0.5472121834754944, + "learning_rate": 3.625518839092541e-05, + "loss": 0.1322, + "num_input_tokens_seen": 10086168, + "step": 14055 + }, + { + "epoch": 29.23076923076923, + "grad_norm": 0.5462338924407959, + "learning_rate": 3.624642126295612e-05, + "loss": 0.1019, + "num_input_tokens_seen": 10089848, + "step": 14060 + }, + { + "epoch": 29.241164241164242, + "grad_norm": 0.48722773790359497, + "learning_rate": 3.6237652400647345e-05, + "loss": 0.118, + "num_input_tokens_seen": 10093304, + "step": 14065 + }, + { + "epoch": 29.25155925155925, + "grad_norm": 0.21530497074127197, + "learning_rate": 3.622888180535134e-05, + "loss": 0.0875, + "num_input_tokens_seen": 10096824, + "step": 14070 + }, + { + "epoch": 29.261954261954262, + "grad_norm": 0.31604182720184326, + "learning_rate": 3.6220109478420655e-05, + "loss": 0.1463, + "num_input_tokens_seen": 10100408, + "step": 14075 + }, + { + "epoch": 29.272349272349274, + "grad_norm": 0.4186307191848755, + "learning_rate": 3.6211335421208084e-05, + "loss": 0.1117, + "num_input_tokens_seen": 10103928, + "step": 14080 + }, + { + "epoch": 29.282744282744282, + "grad_norm": 0.30533745884895325, + "learning_rate": 3.62025596350667e-05, + "loss": 0.1204, + "num_input_tokens_seen": 10107576, + "step": 14085 + }, + { + "epoch": 29.293139293139294, + "grad_norm": 0.14642786979675293, + "learning_rate": 3.619378212134984e-05, + "loss": 0.0932, + "num_input_tokens_seen": 10111224, + "step": 14090 + }, + { + "epoch": 29.303534303534303, + "grad_norm": 0.32563474774360657, + "learning_rate": 3.618500288141111e-05, + "loss": 0.1382, + "num_input_tokens_seen": 10114744, + "step": 14095 + }, + { + "epoch": 29.313929313929314, + "grad_norm": 0.3530136048793793, + "learning_rate": 3.617622191660438e-05, + "loss": 0.0893, + "num_input_tokens_seen": 10118200, + "step": 14100 + }, + { + "epoch": 29.324324324324323, + "grad_norm": 0.2016209363937378, + "learning_rate": 3.616743922828377e-05, + "loss": 0.1175, + "num_input_tokens_seen": 10121848, + "step": 14105 + }, + { + "epoch": 29.334719334719335, + "grad_norm": 0.5264164805412292, + "learning_rate": 3.615865481780371e-05, + "loss": 0.1353, + "num_input_tokens_seen": 10125464, + "step": 14110 + }, + { + "epoch": 29.345114345114347, + "grad_norm": 0.4925020933151245, + "learning_rate": 3.614986868651883e-05, + "loss": 0.0921, + "num_input_tokens_seen": 10129016, + "step": 14115 + }, + { + "epoch": 29.355509355509355, + "grad_norm": 0.20269165933132172, + "learning_rate": 3.614108083578409e-05, + "loss": 0.11, + "num_input_tokens_seen": 10132696, + "step": 14120 + }, + { + "epoch": 29.365904365904367, + "grad_norm": 0.30835291743278503, + "learning_rate": 3.613229126695467e-05, + "loss": 0.1571, + "num_input_tokens_seen": 10136312, + "step": 14125 + }, + { + "epoch": 29.376299376299375, + "grad_norm": 0.22261670231819153, + "learning_rate": 3.612349998138605e-05, + "loss": 0.1041, + "num_input_tokens_seen": 10139896, + "step": 14130 + }, + { + "epoch": 29.386694386694387, + "grad_norm": 1.2794238328933716, + "learning_rate": 3.6114706980433946e-05, + "loss": 0.0914, + "num_input_tokens_seen": 10143416, + "step": 14135 + }, + { + "epoch": 29.397089397089395, + "grad_norm": 0.29885032773017883, + "learning_rate": 3.610591226545435e-05, + "loss": 0.095, + "num_input_tokens_seen": 10147064, + "step": 14140 + }, + { + "epoch": 29.407484407484407, + "grad_norm": 0.3789476454257965, + "learning_rate": 3.6097115837803505e-05, + "loss": 0.1184, + "num_input_tokens_seen": 10150584, + "step": 14145 + }, + { + "epoch": 29.41787941787942, + "grad_norm": 0.276438444852829, + "learning_rate": 3.608831769883795e-05, + "loss": 0.1149, + "num_input_tokens_seen": 10154232, + "step": 14150 + }, + { + "epoch": 29.428274428274428, + "grad_norm": 0.1814860999584198, + "learning_rate": 3.607951784991446e-05, + "loss": 0.1, + "num_input_tokens_seen": 10157688, + "step": 14155 + }, + { + "epoch": 29.43866943866944, + "grad_norm": 0.38682448863983154, + "learning_rate": 3.6070716292390085e-05, + "loss": 0.0933, + "num_input_tokens_seen": 10161304, + "step": 14160 + }, + { + "epoch": 29.449064449064448, + "grad_norm": 0.2975340485572815, + "learning_rate": 3.606191302762213e-05, + "loss": 0.1064, + "num_input_tokens_seen": 10164984, + "step": 14165 + }, + { + "epoch": 29.45945945945946, + "grad_norm": 0.43463200330734253, + "learning_rate": 3.605310805696818e-05, + "loss": 0.1236, + "num_input_tokens_seen": 10168632, + "step": 14170 + }, + { + "epoch": 29.46985446985447, + "grad_norm": 0.2943912744522095, + "learning_rate": 3.6044301381786067e-05, + "loss": 0.1188, + "num_input_tokens_seen": 10172056, + "step": 14175 + }, + { + "epoch": 29.48024948024948, + "grad_norm": 0.7049693465232849, + "learning_rate": 3.6035493003433883e-05, + "loss": 0.1124, + "num_input_tokens_seen": 10175640, + "step": 14180 + }, + { + "epoch": 29.490644490644492, + "grad_norm": 0.5914971232414246, + "learning_rate": 3.6026682923269994e-05, + "loss": 0.1166, + "num_input_tokens_seen": 10179192, + "step": 14185 + }, + { + "epoch": 29.5010395010395, + "grad_norm": 0.4770030379295349, + "learning_rate": 3.6017871142653034e-05, + "loss": 0.1369, + "num_input_tokens_seen": 10182808, + "step": 14190 + }, + { + "epoch": 29.511434511434512, + "grad_norm": 0.2199256718158722, + "learning_rate": 3.600905766294189e-05, + "loss": 0.0864, + "num_input_tokens_seen": 10186360, + "step": 14195 + }, + { + "epoch": 29.52182952182952, + "grad_norm": 0.20971940457820892, + "learning_rate": 3.60002424854957e-05, + "loss": 0.0927, + "num_input_tokens_seen": 10190040, + "step": 14200 + }, + { + "epoch": 29.52182952182952, + "eval_loss": 0.1438344419002533, + "eval_runtime": 7.7579, + "eval_samples_per_second": 110.339, + "eval_steps_per_second": 27.585, + "num_input_tokens_seen": 10190040, + "step": 14200 + }, + { + "epoch": 29.532224532224532, + "grad_norm": 0.7146796584129333, + "learning_rate": 3.5991425611673876e-05, + "loss": 0.1258, + "num_input_tokens_seen": 10193848, + "step": 14205 + }, + { + "epoch": 29.542619542619544, + "grad_norm": 0.39811599254608154, + "learning_rate": 3.5982607042836105e-05, + "loss": 0.1031, + "num_input_tokens_seen": 10197496, + "step": 14210 + }, + { + "epoch": 29.553014553014552, + "grad_norm": 0.17997698485851288, + "learning_rate": 3.597378678034231e-05, + "loss": 0.1323, + "num_input_tokens_seen": 10201080, + "step": 14215 + }, + { + "epoch": 29.563409563409564, + "grad_norm": 0.19174101948738098, + "learning_rate": 3.596496482555269e-05, + "loss": 0.1053, + "num_input_tokens_seen": 10204632, + "step": 14220 + }, + { + "epoch": 29.573804573804573, + "grad_norm": 0.3940986394882202, + "learning_rate": 3.595614117982769e-05, + "loss": 0.1695, + "num_input_tokens_seen": 10208312, + "step": 14225 + }, + { + "epoch": 29.584199584199585, + "grad_norm": 0.31745508313179016, + "learning_rate": 3.594731584452805e-05, + "loss": 0.1352, + "num_input_tokens_seen": 10211800, + "step": 14230 + }, + { + "epoch": 29.594594594594593, + "grad_norm": 0.18073780834674835, + "learning_rate": 3.593848882101472e-05, + "loss": 0.1178, + "num_input_tokens_seen": 10215416, + "step": 14235 + }, + { + "epoch": 29.604989604989605, + "grad_norm": 0.5129613280296326, + "learning_rate": 3.592966011064896e-05, + "loss": 0.1168, + "num_input_tokens_seen": 10219224, + "step": 14240 + }, + { + "epoch": 29.615384615384617, + "grad_norm": 0.3010083734989166, + "learning_rate": 3.592082971479226e-05, + "loss": 0.1351, + "num_input_tokens_seen": 10222712, + "step": 14245 + }, + { + "epoch": 29.625779625779625, + "grad_norm": 0.2013571858406067, + "learning_rate": 3.5911997634806385e-05, + "loss": 0.0894, + "num_input_tokens_seen": 10226424, + "step": 14250 + }, + { + "epoch": 29.636174636174637, + "grad_norm": 0.4220842719078064, + "learning_rate": 3.5903163872053336e-05, + "loss": 0.0902, + "num_input_tokens_seen": 10230072, + "step": 14255 + }, + { + "epoch": 29.646569646569645, + "grad_norm": 0.5420276522636414, + "learning_rate": 3.58943284278954e-05, + "loss": 0.0866, + "num_input_tokens_seen": 10233624, + "step": 14260 + }, + { + "epoch": 29.656964656964657, + "grad_norm": 0.48698267340660095, + "learning_rate": 3.588549130369512e-05, + "loss": 0.132, + "num_input_tokens_seen": 10237176, + "step": 14265 + }, + { + "epoch": 29.66735966735967, + "grad_norm": 0.23074068129062653, + "learning_rate": 3.5876652500815274e-05, + "loss": 0.1359, + "num_input_tokens_seen": 10240760, + "step": 14270 + }, + { + "epoch": 29.677754677754677, + "grad_norm": 0.21006543934345245, + "learning_rate": 3.586781202061894e-05, + "loss": 0.1292, + "num_input_tokens_seen": 10244248, + "step": 14275 + }, + { + "epoch": 29.68814968814969, + "grad_norm": 0.3281995356082916, + "learning_rate": 3.585896986446942e-05, + "loss": 0.12, + "num_input_tokens_seen": 10247896, + "step": 14280 + }, + { + "epoch": 29.698544698544698, + "grad_norm": 0.29469141364097595, + "learning_rate": 3.585012603373028e-05, + "loss": 0.1141, + "num_input_tokens_seen": 10251608, + "step": 14285 + }, + { + "epoch": 29.70893970893971, + "grad_norm": 0.5912255048751831, + "learning_rate": 3.584128052976535e-05, + "loss": 0.1202, + "num_input_tokens_seen": 10255032, + "step": 14290 + }, + { + "epoch": 29.719334719334718, + "grad_norm": 0.23623916506767273, + "learning_rate": 3.5832433353938724e-05, + "loss": 0.1692, + "num_input_tokens_seen": 10258552, + "step": 14295 + }, + { + "epoch": 29.72972972972973, + "grad_norm": 0.1421988159418106, + "learning_rate": 3.5823584507614746e-05, + "loss": 0.0973, + "num_input_tokens_seen": 10261880, + "step": 14300 + }, + { + "epoch": 29.74012474012474, + "grad_norm": 0.23423603177070618, + "learning_rate": 3.581473399215802e-05, + "loss": 0.109, + "num_input_tokens_seen": 10265560, + "step": 14305 + }, + { + "epoch": 29.75051975051975, + "grad_norm": 0.2652464210987091, + "learning_rate": 3.580588180893341e-05, + "loss": 0.1196, + "num_input_tokens_seen": 10269144, + "step": 14310 + }, + { + "epoch": 29.760914760914762, + "grad_norm": 0.3182503581047058, + "learning_rate": 3.579702795930602e-05, + "loss": 0.1107, + "num_input_tokens_seen": 10272696, + "step": 14315 + }, + { + "epoch": 29.77130977130977, + "grad_norm": 0.1886649876832962, + "learning_rate": 3.578817244464125e-05, + "loss": 0.1241, + "num_input_tokens_seen": 10276280, + "step": 14320 + }, + { + "epoch": 29.781704781704782, + "grad_norm": 0.41449791193008423, + "learning_rate": 3.577931526630471e-05, + "loss": 0.088, + "num_input_tokens_seen": 10279928, + "step": 14325 + }, + { + "epoch": 29.79209979209979, + "grad_norm": 0.19249007105827332, + "learning_rate": 3.577045642566229e-05, + "loss": 0.0982, + "num_input_tokens_seen": 10283416, + "step": 14330 + }, + { + "epoch": 29.802494802494802, + "grad_norm": 0.40965330600738525, + "learning_rate": 3.576159592408014e-05, + "loss": 0.1149, + "num_input_tokens_seen": 10287064, + "step": 14335 + }, + { + "epoch": 29.812889812889814, + "grad_norm": 0.36207154393196106, + "learning_rate": 3.575273376292466e-05, + "loss": 0.1316, + "num_input_tokens_seen": 10290680, + "step": 14340 + }, + { + "epoch": 29.823284823284823, + "grad_norm": 0.38431596755981445, + "learning_rate": 3.574386994356251e-05, + "loss": 0.1047, + "num_input_tokens_seen": 10294232, + "step": 14345 + }, + { + "epoch": 29.833679833679835, + "grad_norm": 0.335923433303833, + "learning_rate": 3.573500446736059e-05, + "loss": 0.1488, + "num_input_tokens_seen": 10297752, + "step": 14350 + }, + { + "epoch": 29.844074844074843, + "grad_norm": 0.3270483613014221, + "learning_rate": 3.5726137335686094e-05, + "loss": 0.1206, + "num_input_tokens_seen": 10301240, + "step": 14355 + }, + { + "epoch": 29.854469854469855, + "grad_norm": 0.44211307168006897, + "learning_rate": 3.571726854990642e-05, + "loss": 0.1265, + "num_input_tokens_seen": 10304760, + "step": 14360 + }, + { + "epoch": 29.864864864864863, + "grad_norm": 0.2268085479736328, + "learning_rate": 3.570839811138925e-05, + "loss": 0.1524, + "num_input_tokens_seen": 10308280, + "step": 14365 + }, + { + "epoch": 29.875259875259875, + "grad_norm": 0.3938945531845093, + "learning_rate": 3.569952602150252e-05, + "loss": 0.1052, + "num_input_tokens_seen": 10311928, + "step": 14370 + }, + { + "epoch": 29.885654885654887, + "grad_norm": 0.6083272099494934, + "learning_rate": 3.569065228161442e-05, + "loss": 0.1035, + "num_input_tokens_seen": 10315608, + "step": 14375 + }, + { + "epoch": 29.896049896049895, + "grad_norm": 0.289206862449646, + "learning_rate": 3.5681776893093395e-05, + "loss": 0.201, + "num_input_tokens_seen": 10319448, + "step": 14380 + }, + { + "epoch": 29.906444906444907, + "grad_norm": 0.43858927488327026, + "learning_rate": 3.5672899857308134e-05, + "loss": 0.1244, + "num_input_tokens_seen": 10323160, + "step": 14385 + }, + { + "epoch": 29.916839916839916, + "grad_norm": 0.6338398456573486, + "learning_rate": 3.566402117562759e-05, + "loss": 0.0829, + "num_input_tokens_seen": 10326712, + "step": 14390 + }, + { + "epoch": 29.927234927234927, + "grad_norm": 0.1581788808107376, + "learning_rate": 3.565514084942097e-05, + "loss": 0.1086, + "num_input_tokens_seen": 10330232, + "step": 14395 + }, + { + "epoch": 29.93762993762994, + "grad_norm": 0.2984342575073242, + "learning_rate": 3.564625888005773e-05, + "loss": 0.1054, + "num_input_tokens_seen": 10333816, + "step": 14400 + }, + { + "epoch": 29.93762993762994, + "eval_loss": 0.14919009804725647, + "eval_runtime": 7.7663, + "eval_samples_per_second": 110.22, + "eval_steps_per_second": 27.555, + "num_input_tokens_seen": 10333816, + "step": 14400 + }, + { + "epoch": 29.948024948024948, + "grad_norm": 0.30241507291793823, + "learning_rate": 3.563737526890759e-05, + "loss": 0.1317, + "num_input_tokens_seen": 10337368, + "step": 14405 + }, + { + "epoch": 29.95841995841996, + "grad_norm": 0.20186862349510193, + "learning_rate": 3.562849001734049e-05, + "loss": 0.1192, + "num_input_tokens_seen": 10340984, + "step": 14410 + }, + { + "epoch": 29.968814968814968, + "grad_norm": 0.2673865556716919, + "learning_rate": 3.561960312672667e-05, + "loss": 0.1222, + "num_input_tokens_seen": 10344664, + "step": 14415 + }, + { + "epoch": 29.97920997920998, + "grad_norm": 0.21180547773838043, + "learning_rate": 3.5610714598436596e-05, + "loss": 0.1312, + "num_input_tokens_seen": 10348152, + "step": 14420 + }, + { + "epoch": 29.989604989604988, + "grad_norm": 0.7006992101669312, + "learning_rate": 3.5601824433840986e-05, + "loss": 0.1349, + "num_input_tokens_seen": 10351768, + "step": 14425 + }, + { + "epoch": 30.0, + "grad_norm": 0.6912689208984375, + "learning_rate": 3.559293263431082e-05, + "loss": 0.1149, + "num_input_tokens_seen": 10355248, + "step": 14430 + }, + { + "epoch": 30.010395010395012, + "grad_norm": 0.2909433841705322, + "learning_rate": 3.558403920121732e-05, + "loss": 0.0842, + "num_input_tokens_seen": 10358960, + "step": 14435 + }, + { + "epoch": 30.02079002079002, + "grad_norm": 0.29303982853889465, + "learning_rate": 3.557514413593197e-05, + "loss": 0.1106, + "num_input_tokens_seen": 10362576, + "step": 14440 + }, + { + "epoch": 30.031185031185032, + "grad_norm": 0.20046326518058777, + "learning_rate": 3.55662474398265e-05, + "loss": 0.0942, + "num_input_tokens_seen": 10365936, + "step": 14445 + }, + { + "epoch": 30.04158004158004, + "grad_norm": 0.5848463177680969, + "learning_rate": 3.555734911427288e-05, + "loss": 0.1316, + "num_input_tokens_seen": 10369392, + "step": 14450 + }, + { + "epoch": 30.051975051975052, + "grad_norm": 0.25948604941368103, + "learning_rate": 3.5548449160643363e-05, + "loss": 0.0789, + "num_input_tokens_seen": 10373168, + "step": 14455 + }, + { + "epoch": 30.06237006237006, + "grad_norm": 0.2856426537036896, + "learning_rate": 3.553954758031043e-05, + "loss": 0.1223, + "num_input_tokens_seen": 10376592, + "step": 14460 + }, + { + "epoch": 30.072765072765073, + "grad_norm": 0.28777027130126953, + "learning_rate": 3.5530644374646815e-05, + "loss": 0.1147, + "num_input_tokens_seen": 10380240, + "step": 14465 + }, + { + "epoch": 30.083160083160084, + "grad_norm": 0.9127349853515625, + "learning_rate": 3.552173954502549e-05, + "loss": 0.1343, + "num_input_tokens_seen": 10383824, + "step": 14470 + }, + { + "epoch": 30.093555093555093, + "grad_norm": 0.30400940775871277, + "learning_rate": 3.55128330928197e-05, + "loss": 0.1233, + "num_input_tokens_seen": 10387408, + "step": 14475 + }, + { + "epoch": 30.103950103950105, + "grad_norm": 0.31602075695991516, + "learning_rate": 3.550392501940294e-05, + "loss": 0.1043, + "num_input_tokens_seen": 10391088, + "step": 14480 + }, + { + "epoch": 30.114345114345113, + "grad_norm": 0.3357231914997101, + "learning_rate": 3.5495015326148945e-05, + "loss": 0.122, + "num_input_tokens_seen": 10394800, + "step": 14485 + }, + { + "epoch": 30.124740124740125, + "grad_norm": 0.191120445728302, + "learning_rate": 3.548610401443169e-05, + "loss": 0.0931, + "num_input_tokens_seen": 10398352, + "step": 14490 + }, + { + "epoch": 30.135135135135137, + "grad_norm": 0.32968321442604065, + "learning_rate": 3.547719108562543e-05, + "loss": 0.0923, + "num_input_tokens_seen": 10401840, + "step": 14495 + }, + { + "epoch": 30.145530145530145, + "grad_norm": 0.2560751438140869, + "learning_rate": 3.546827654110464e-05, + "loss": 0.116, + "num_input_tokens_seen": 10405392, + "step": 14500 + }, + { + "epoch": 30.155925155925157, + "grad_norm": 0.30772724747657776, + "learning_rate": 3.545936038224405e-05, + "loss": 0.0958, + "num_input_tokens_seen": 10408912, + "step": 14505 + }, + { + "epoch": 30.166320166320165, + "grad_norm": 0.2769724130630493, + "learning_rate": 3.545044261041864e-05, + "loss": 0.1213, + "num_input_tokens_seen": 10412400, + "step": 14510 + }, + { + "epoch": 30.176715176715177, + "grad_norm": 0.20189522206783295, + "learning_rate": 3.5441523227003657e-05, + "loss": 0.1548, + "num_input_tokens_seen": 10416080, + "step": 14515 + }, + { + "epoch": 30.187110187110186, + "grad_norm": 0.6106767654418945, + "learning_rate": 3.543260223337459e-05, + "loss": 0.1239, + "num_input_tokens_seen": 10419696, + "step": 14520 + }, + { + "epoch": 30.197505197505198, + "grad_norm": 0.19480961561203003, + "learning_rate": 3.542367963090714e-05, + "loss": 0.121, + "num_input_tokens_seen": 10423248, + "step": 14525 + }, + { + "epoch": 30.20790020790021, + "grad_norm": 0.2492566704750061, + "learning_rate": 3.5414755420977295e-05, + "loss": 0.0942, + "num_input_tokens_seen": 10426832, + "step": 14530 + }, + { + "epoch": 30.218295218295218, + "grad_norm": 0.4809865355491638, + "learning_rate": 3.54058296049613e-05, + "loss": 0.1283, + "num_input_tokens_seen": 10430448, + "step": 14535 + }, + { + "epoch": 30.22869022869023, + "grad_norm": 0.3608897030353546, + "learning_rate": 3.53969021842356e-05, + "loss": 0.1097, + "num_input_tokens_seen": 10433968, + "step": 14540 + }, + { + "epoch": 30.239085239085238, + "grad_norm": 0.09528955817222595, + "learning_rate": 3.5387973160176926e-05, + "loss": 0.0855, + "num_input_tokens_seen": 10437520, + "step": 14545 + }, + { + "epoch": 30.24948024948025, + "grad_norm": 0.9664185643196106, + "learning_rate": 3.537904253416224e-05, + "loss": 0.1812, + "num_input_tokens_seen": 10441136, + "step": 14550 + }, + { + "epoch": 30.25987525987526, + "grad_norm": 0.24702423810958862, + "learning_rate": 3.537011030756878e-05, + "loss": 0.1219, + "num_input_tokens_seen": 10444752, + "step": 14555 + }, + { + "epoch": 30.27027027027027, + "grad_norm": 0.17659270763397217, + "learning_rate": 3.536117648177399e-05, + "loss": 0.104, + "num_input_tokens_seen": 10448272, + "step": 14560 + }, + { + "epoch": 30.280665280665282, + "grad_norm": 0.493634968996048, + "learning_rate": 3.535224105815558e-05, + "loss": 0.1669, + "num_input_tokens_seen": 10451792, + "step": 14565 + }, + { + "epoch": 30.29106029106029, + "grad_norm": 0.42208290100097656, + "learning_rate": 3.5343304038091494e-05, + "loss": 0.1009, + "num_input_tokens_seen": 10455344, + "step": 14570 + }, + { + "epoch": 30.301455301455302, + "grad_norm": 0.18277622759342194, + "learning_rate": 3.5334365422959955e-05, + "loss": 0.0981, + "num_input_tokens_seen": 10458832, + "step": 14575 + }, + { + "epoch": 30.31185031185031, + "grad_norm": 0.2746672034263611, + "learning_rate": 3.5325425214139396e-05, + "loss": 0.133, + "num_input_tokens_seen": 10462352, + "step": 14580 + }, + { + "epoch": 30.322245322245323, + "grad_norm": 0.2381458282470703, + "learning_rate": 3.531648341300851e-05, + "loss": 0.1057, + "num_input_tokens_seen": 10465968, + "step": 14585 + }, + { + "epoch": 30.33264033264033, + "grad_norm": 0.7174548506736755, + "learning_rate": 3.530754002094623e-05, + "loss": 0.1563, + "num_input_tokens_seen": 10469616, + "step": 14590 + }, + { + "epoch": 30.343035343035343, + "grad_norm": 0.1529906690120697, + "learning_rate": 3.529859503933175e-05, + "loss": 0.125, + "num_input_tokens_seen": 10473168, + "step": 14595 + }, + { + "epoch": 30.353430353430355, + "grad_norm": 0.16388556361198425, + "learning_rate": 3.52896484695445e-05, + "loss": 0.1422, + "num_input_tokens_seen": 10476752, + "step": 14600 + }, + { + "epoch": 30.353430353430355, + "eval_loss": 0.14470389485359192, + "eval_runtime": 7.759, + "eval_samples_per_second": 110.323, + "eval_steps_per_second": 27.581, + "num_input_tokens_seen": 10476752, + "step": 14600 + }, + { + "epoch": 30.363825363825363, + "grad_norm": 0.20836178958415985, + "learning_rate": 3.528070031296414e-05, + "loss": 0.0919, + "num_input_tokens_seen": 10480368, + "step": 14605 + }, + { + "epoch": 30.374220374220375, + "grad_norm": 0.2419501096010208, + "learning_rate": 3.5271750570970605e-05, + "loss": 0.1132, + "num_input_tokens_seen": 10483952, + "step": 14610 + }, + { + "epoch": 30.384615384615383, + "grad_norm": 0.24322941899299622, + "learning_rate": 3.526279924494405e-05, + "loss": 0.1187, + "num_input_tokens_seen": 10487696, + "step": 14615 + }, + { + "epoch": 30.395010395010395, + "grad_norm": 0.7927371859550476, + "learning_rate": 3.5253846336264874e-05, + "loss": 0.0939, + "num_input_tokens_seen": 10491344, + "step": 14620 + }, + { + "epoch": 30.405405405405407, + "grad_norm": 0.3474263846874237, + "learning_rate": 3.5244891846313736e-05, + "loss": 0.0874, + "num_input_tokens_seen": 10494864, + "step": 14625 + }, + { + "epoch": 30.415800415800415, + "grad_norm": 0.8429543972015381, + "learning_rate": 3.5235935776471527e-05, + "loss": 0.1383, + "num_input_tokens_seen": 10498352, + "step": 14630 + }, + { + "epoch": 30.426195426195427, + "grad_norm": 0.1522154062986374, + "learning_rate": 3.522697812811939e-05, + "loss": 0.1352, + "num_input_tokens_seen": 10502128, + "step": 14635 + }, + { + "epoch": 30.436590436590436, + "grad_norm": 0.3286820650100708, + "learning_rate": 3.521801890263871e-05, + "loss": 0.14, + "num_input_tokens_seen": 10505712, + "step": 14640 + }, + { + "epoch": 30.446985446985448, + "grad_norm": 0.2642168402671814, + "learning_rate": 3.5209058101411114e-05, + "loss": 0.131, + "num_input_tokens_seen": 10509168, + "step": 14645 + }, + { + "epoch": 30.457380457380456, + "grad_norm": 0.5120266079902649, + "learning_rate": 3.520009572581845e-05, + "loss": 0.0952, + "num_input_tokens_seen": 10512784, + "step": 14650 + }, + { + "epoch": 30.467775467775468, + "grad_norm": 0.2883051335811615, + "learning_rate": 3.519113177724285e-05, + "loss": 0.0954, + "num_input_tokens_seen": 10516272, + "step": 14655 + }, + { + "epoch": 30.47817047817048, + "grad_norm": 0.9212605357170105, + "learning_rate": 3.5182166257066656e-05, + "loss": 0.1565, + "num_input_tokens_seen": 10519856, + "step": 14660 + }, + { + "epoch": 30.488565488565488, + "grad_norm": 0.28790122270584106, + "learning_rate": 3.517319916667247e-05, + "loss": 0.1063, + "num_input_tokens_seen": 10523536, + "step": 14665 + }, + { + "epoch": 30.4989604989605, + "grad_norm": 0.20356160402297974, + "learning_rate": 3.516423050744313e-05, + "loss": 0.0885, + "num_input_tokens_seen": 10526992, + "step": 14670 + }, + { + "epoch": 30.509355509355508, + "grad_norm": 0.20931176841259003, + "learning_rate": 3.5155260280761704e-05, + "loss": 0.078, + "num_input_tokens_seen": 10530448, + "step": 14675 + }, + { + "epoch": 30.51975051975052, + "grad_norm": 0.6492215991020203, + "learning_rate": 3.514628848801154e-05, + "loss": 0.113, + "num_input_tokens_seen": 10534032, + "step": 14680 + }, + { + "epoch": 30.53014553014553, + "grad_norm": 0.2828420102596283, + "learning_rate": 3.5137315130576174e-05, + "loss": 0.0968, + "num_input_tokens_seen": 10537808, + "step": 14685 + }, + { + "epoch": 30.54054054054054, + "grad_norm": 0.2436804324388504, + "learning_rate": 3.512834020983942e-05, + "loss": 0.1112, + "num_input_tokens_seen": 10541232, + "step": 14690 + }, + { + "epoch": 30.550935550935552, + "grad_norm": 0.3425699472427368, + "learning_rate": 3.5119363727185334e-05, + "loss": 0.1237, + "num_input_tokens_seen": 10544848, + "step": 14695 + }, + { + "epoch": 30.56133056133056, + "grad_norm": 0.6399596333503723, + "learning_rate": 3.511038568399819e-05, + "loss": 0.1159, + "num_input_tokens_seen": 10548560, + "step": 14700 + }, + { + "epoch": 30.571725571725572, + "grad_norm": 0.5634114742279053, + "learning_rate": 3.510140608166251e-05, + "loss": 0.1533, + "num_input_tokens_seen": 10552272, + "step": 14705 + }, + { + "epoch": 30.58212058212058, + "grad_norm": 0.4119117856025696, + "learning_rate": 3.509242492156308e-05, + "loss": 0.0873, + "num_input_tokens_seen": 10555824, + "step": 14710 + }, + { + "epoch": 30.592515592515593, + "grad_norm": 0.22877736389636993, + "learning_rate": 3.5083442205084896e-05, + "loss": 0.1185, + "num_input_tokens_seen": 10559344, + "step": 14715 + }, + { + "epoch": 30.602910602910605, + "grad_norm": 0.2956638038158417, + "learning_rate": 3.507445793361321e-05, + "loss": 0.0992, + "num_input_tokens_seen": 10562864, + "step": 14720 + }, + { + "epoch": 30.613305613305613, + "grad_norm": 0.21734921634197235, + "learning_rate": 3.5065472108533505e-05, + "loss": 0.1135, + "num_input_tokens_seen": 10566768, + "step": 14725 + }, + { + "epoch": 30.623700623700625, + "grad_norm": 0.6527221202850342, + "learning_rate": 3.5056484731231504e-05, + "loss": 0.1639, + "num_input_tokens_seen": 10570320, + "step": 14730 + }, + { + "epoch": 30.634095634095633, + "grad_norm": 0.24573355913162231, + "learning_rate": 3.504749580309319e-05, + "loss": 0.1036, + "num_input_tokens_seen": 10573776, + "step": 14735 + }, + { + "epoch": 30.644490644490645, + "grad_norm": 0.4684932827949524, + "learning_rate": 3.5038505325504753e-05, + "loss": 0.1638, + "num_input_tokens_seen": 10577392, + "step": 14740 + }, + { + "epoch": 30.654885654885653, + "grad_norm": 0.335530549287796, + "learning_rate": 3.502951329985264e-05, + "loss": 0.1305, + "num_input_tokens_seen": 10581040, + "step": 14745 + }, + { + "epoch": 30.665280665280665, + "grad_norm": 0.26669278740882874, + "learning_rate": 3.502051972752354e-05, + "loss": 0.1143, + "num_input_tokens_seen": 10584592, + "step": 14750 + }, + { + "epoch": 30.675675675675677, + "grad_norm": 0.28082039952278137, + "learning_rate": 3.5011524609904374e-05, + "loss": 0.106, + "num_input_tokens_seen": 10588208, + "step": 14755 + }, + { + "epoch": 30.686070686070686, + "grad_norm": 0.5389371514320374, + "learning_rate": 3.50025279483823e-05, + "loss": 0.1038, + "num_input_tokens_seen": 10591664, + "step": 14760 + }, + { + "epoch": 30.696465696465697, + "grad_norm": 0.48583829402923584, + "learning_rate": 3.499352974434472e-05, + "loss": 0.1579, + "num_input_tokens_seen": 10595376, + "step": 14765 + }, + { + "epoch": 30.706860706860706, + "grad_norm": 0.3642211854457855, + "learning_rate": 3.498452999917926e-05, + "loss": 0.1407, + "num_input_tokens_seen": 10598864, + "step": 14770 + }, + { + "epoch": 30.717255717255718, + "grad_norm": 0.22043922543525696, + "learning_rate": 3.4975528714273795e-05, + "loss": 0.087, + "num_input_tokens_seen": 10602448, + "step": 14775 + }, + { + "epoch": 30.727650727650726, + "grad_norm": 0.1937326043844223, + "learning_rate": 3.4966525891016454e-05, + "loss": 0.0903, + "num_input_tokens_seen": 10605904, + "step": 14780 + }, + { + "epoch": 30.738045738045738, + "grad_norm": 0.19009476900100708, + "learning_rate": 3.495752153079557e-05, + "loss": 0.1005, + "num_input_tokens_seen": 10609424, + "step": 14785 + }, + { + "epoch": 30.74844074844075, + "grad_norm": 0.32358741760253906, + "learning_rate": 3.494851563499974e-05, + "loss": 0.0724, + "num_input_tokens_seen": 10612976, + "step": 14790 + }, + { + "epoch": 30.758835758835758, + "grad_norm": 0.21116545796394348, + "learning_rate": 3.493950820501777e-05, + "loss": 0.1161, + "num_input_tokens_seen": 10616560, + "step": 14795 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 0.5251761078834534, + "learning_rate": 3.493049924223872e-05, + "loss": 0.1203, + "num_input_tokens_seen": 10620240, + "step": 14800 + }, + { + "epoch": 30.76923076923077, + "eval_loss": 0.15012438595294952, + "eval_runtime": 7.7467, + "eval_samples_per_second": 110.499, + "eval_steps_per_second": 27.625, + "num_input_tokens_seen": 10620240, + "step": 14800 + }, + { + "epoch": 30.77962577962578, + "grad_norm": 0.16350018978118896, + "learning_rate": 3.49214887480519e-05, + "loss": 0.136, + "num_input_tokens_seen": 10624080, + "step": 14805 + }, + { + "epoch": 30.79002079002079, + "grad_norm": 0.25528255105018616, + "learning_rate": 3.4912476723846834e-05, + "loss": 0.1001, + "num_input_tokens_seen": 10627632, + "step": 14810 + }, + { + "epoch": 30.8004158004158, + "grad_norm": 0.24967047572135925, + "learning_rate": 3.490346317101328e-05, + "loss": 0.1295, + "num_input_tokens_seen": 10631152, + "step": 14815 + }, + { + "epoch": 30.81081081081081, + "grad_norm": 0.5651196241378784, + "learning_rate": 3.4894448090941266e-05, + "loss": 0.0924, + "num_input_tokens_seen": 10634672, + "step": 14820 + }, + { + "epoch": 30.821205821205822, + "grad_norm": 0.2851337194442749, + "learning_rate": 3.488543148502101e-05, + "loss": 0.1393, + "num_input_tokens_seen": 10638192, + "step": 14825 + }, + { + "epoch": 30.83160083160083, + "grad_norm": 0.36312761902809143, + "learning_rate": 3.487641335464299e-05, + "loss": 0.1291, + "num_input_tokens_seen": 10641680, + "step": 14830 + }, + { + "epoch": 30.841995841995843, + "grad_norm": 0.21224071085453033, + "learning_rate": 3.4867393701197914e-05, + "loss": 0.1184, + "num_input_tokens_seen": 10645232, + "step": 14835 + }, + { + "epoch": 30.85239085239085, + "grad_norm": 0.3845854103565216, + "learning_rate": 3.485837252607673e-05, + "loss": 0.0962, + "num_input_tokens_seen": 10648944, + "step": 14840 + }, + { + "epoch": 30.862785862785863, + "grad_norm": 0.5235536098480225, + "learning_rate": 3.4849349830670615e-05, + "loss": 0.1055, + "num_input_tokens_seen": 10652464, + "step": 14845 + }, + { + "epoch": 30.873180873180875, + "grad_norm": 0.6270237565040588, + "learning_rate": 3.4840325616370976e-05, + "loss": 0.1488, + "num_input_tokens_seen": 10656016, + "step": 14850 + }, + { + "epoch": 30.883575883575883, + "grad_norm": 0.30217957496643066, + "learning_rate": 3.483129988456947e-05, + "loss": 0.0993, + "num_input_tokens_seen": 10659824, + "step": 14855 + }, + { + "epoch": 30.893970893970895, + "grad_norm": 0.283278226852417, + "learning_rate": 3.482227263665797e-05, + "loss": 0.1425, + "num_input_tokens_seen": 10663408, + "step": 14860 + }, + { + "epoch": 30.904365904365903, + "grad_norm": 0.27974531054496765, + "learning_rate": 3.48132438740286e-05, + "loss": 0.0974, + "num_input_tokens_seen": 10666960, + "step": 14865 + }, + { + "epoch": 30.914760914760915, + "grad_norm": 0.15868103504180908, + "learning_rate": 3.48042135980737e-05, + "loss": 0.1186, + "num_input_tokens_seen": 10670544, + "step": 14870 + }, + { + "epoch": 30.925155925155924, + "grad_norm": 0.20149606466293335, + "learning_rate": 3.479518181018586e-05, + "loss": 0.0994, + "num_input_tokens_seen": 10674064, + "step": 14875 + }, + { + "epoch": 30.935550935550935, + "grad_norm": 0.332630455493927, + "learning_rate": 3.4786148511757886e-05, + "loss": 0.1186, + "num_input_tokens_seen": 10677584, + "step": 14880 + }, + { + "epoch": 30.945945945945947, + "grad_norm": 0.3998797535896301, + "learning_rate": 3.477711370418284e-05, + "loss": 0.1142, + "num_input_tokens_seen": 10681200, + "step": 14885 + }, + { + "epoch": 30.956340956340956, + "grad_norm": 0.41857457160949707, + "learning_rate": 3.476807738885399e-05, + "loss": 0.1221, + "num_input_tokens_seen": 10684720, + "step": 14890 + }, + { + "epoch": 30.966735966735968, + "grad_norm": 0.2516253888607025, + "learning_rate": 3.475903956716485e-05, + "loss": 0.0613, + "num_input_tokens_seen": 10688368, + "step": 14895 + }, + { + "epoch": 30.977130977130976, + "grad_norm": 0.3234640657901764, + "learning_rate": 3.475000024050917e-05, + "loss": 0.1239, + "num_input_tokens_seen": 10691920, + "step": 14900 + }, + { + "epoch": 30.987525987525988, + "grad_norm": 0.09748966991901398, + "learning_rate": 3.4740959410280926e-05, + "loss": 0.0833, + "num_input_tokens_seen": 10695408, + "step": 14905 + }, + { + "epoch": 30.997920997921, + "grad_norm": 0.4792934060096741, + "learning_rate": 3.4731917077874324e-05, + "loss": 0.1218, + "num_input_tokens_seen": 10698960, + "step": 14910 + }, + { + "epoch": 31.008316008316008, + "grad_norm": 0.2935567796230316, + "learning_rate": 3.4722873244683816e-05, + "loss": 0.144, + "num_input_tokens_seen": 10702408, + "step": 14915 + }, + { + "epoch": 31.01871101871102, + "grad_norm": 0.22986318171024323, + "learning_rate": 3.4713827912104065e-05, + "loss": 0.1327, + "num_input_tokens_seen": 10705832, + "step": 14920 + }, + { + "epoch": 31.02910602910603, + "grad_norm": 0.30495455861091614, + "learning_rate": 3.470478108152998e-05, + "loss": 0.1099, + "num_input_tokens_seen": 10709352, + "step": 14925 + }, + { + "epoch": 31.03950103950104, + "grad_norm": 0.3560434579849243, + "learning_rate": 3.4695732754356695e-05, + "loss": 0.1891, + "num_input_tokens_seen": 10712808, + "step": 14930 + }, + { + "epoch": 31.04989604989605, + "grad_norm": 0.15488538146018982, + "learning_rate": 3.4686682931979576e-05, + "loss": 0.1141, + "num_input_tokens_seen": 10716488, + "step": 14935 + }, + { + "epoch": 31.06029106029106, + "grad_norm": 0.28609052300453186, + "learning_rate": 3.467763161579422e-05, + "loss": 0.1051, + "num_input_tokens_seen": 10720104, + "step": 14940 + }, + { + "epoch": 31.070686070686072, + "grad_norm": 0.4141143262386322, + "learning_rate": 3.466857880719645e-05, + "loss": 0.1623, + "num_input_tokens_seen": 10723816, + "step": 14945 + }, + { + "epoch": 31.08108108108108, + "grad_norm": 0.2666873335838318, + "learning_rate": 3.465952450758233e-05, + "loss": 0.1483, + "num_input_tokens_seen": 10727560, + "step": 14950 + }, + { + "epoch": 31.091476091476093, + "grad_norm": 0.4089498221874237, + "learning_rate": 3.4650468718348126e-05, + "loss": 0.1267, + "num_input_tokens_seen": 10731112, + "step": 14955 + }, + { + "epoch": 31.1018711018711, + "grad_norm": 0.33417239785194397, + "learning_rate": 3.464141144089038e-05, + "loss": 0.1254, + "num_input_tokens_seen": 10734824, + "step": 14960 + }, + { + "epoch": 31.112266112266113, + "grad_norm": 1.2560709714889526, + "learning_rate": 3.463235267660583e-05, + "loss": 0.1886, + "num_input_tokens_seen": 10738376, + "step": 14965 + }, + { + "epoch": 31.12266112266112, + "grad_norm": 0.43549928069114685, + "learning_rate": 3.462329242689145e-05, + "loss": 0.0833, + "num_input_tokens_seen": 10742024, + "step": 14970 + }, + { + "epoch": 31.133056133056133, + "grad_norm": 0.7448317408561707, + "learning_rate": 3.461423069314444e-05, + "loss": 0.1598, + "num_input_tokens_seen": 10745544, + "step": 14975 + }, + { + "epoch": 31.143451143451145, + "grad_norm": 0.31752926111221313, + "learning_rate": 3.460516747676224e-05, + "loss": 0.1117, + "num_input_tokens_seen": 10749288, + "step": 14980 + }, + { + "epoch": 31.153846153846153, + "grad_norm": 0.1337926685810089, + "learning_rate": 3.459610277914251e-05, + "loss": 0.0953, + "num_input_tokens_seen": 10752840, + "step": 14985 + }, + { + "epoch": 31.164241164241165, + "grad_norm": 0.4484148621559143, + "learning_rate": 3.458703660168314e-05, + "loss": 0.1158, + "num_input_tokens_seen": 10756360, + "step": 14990 + }, + { + "epoch": 31.174636174636174, + "grad_norm": 0.24174322187900543, + "learning_rate": 3.457796894578224e-05, + "loss": 0.1305, + "num_input_tokens_seen": 10759880, + "step": 14995 + }, + { + "epoch": 31.185031185031185, + "grad_norm": 0.24621064960956573, + "learning_rate": 3.456889981283817e-05, + "loss": 0.1145, + "num_input_tokens_seen": 10763368, + "step": 15000 + }, + { + "epoch": 31.185031185031185, + "eval_loss": 0.14170345664024353, + "eval_runtime": 7.7703, + "eval_samples_per_second": 110.163, + "eval_steps_per_second": 27.541, + "num_input_tokens_seen": 10763368, + "step": 15000 + }, + { + "epoch": 31.195426195426194, + "grad_norm": 0.199260875582695, + "learning_rate": 3.45598292042495e-05, + "loss": 0.1279, + "num_input_tokens_seen": 10766984, + "step": 15005 + }, + { + "epoch": 31.205821205821206, + "grad_norm": 0.4472009837627411, + "learning_rate": 3.4550757121415035e-05, + "loss": 0.0852, + "num_input_tokens_seen": 10770632, + "step": 15010 + }, + { + "epoch": 31.216216216216218, + "grad_norm": 0.3341779112815857, + "learning_rate": 3.454168356573378e-05, + "loss": 0.1301, + "num_input_tokens_seen": 10774248, + "step": 15015 + }, + { + "epoch": 31.226611226611226, + "grad_norm": 0.3393239676952362, + "learning_rate": 3.453260853860503e-05, + "loss": 0.1407, + "num_input_tokens_seen": 10777896, + "step": 15020 + }, + { + "epoch": 31.237006237006238, + "grad_norm": 0.29653218388557434, + "learning_rate": 3.452353204142824e-05, + "loss": 0.1004, + "num_input_tokens_seen": 10781480, + "step": 15025 + }, + { + "epoch": 31.247401247401246, + "grad_norm": 0.38970816135406494, + "learning_rate": 3.4514454075603136e-05, + "loss": 0.1239, + "num_input_tokens_seen": 10785224, + "step": 15030 + }, + { + "epoch": 31.257796257796258, + "grad_norm": 0.18550996482372284, + "learning_rate": 3.450537464252964e-05, + "loss": 0.1309, + "num_input_tokens_seen": 10788680, + "step": 15035 + }, + { + "epoch": 31.26819126819127, + "grad_norm": 0.3917528986930847, + "learning_rate": 3.4496293743607925e-05, + "loss": 0.14, + "num_input_tokens_seen": 10792168, + "step": 15040 + }, + { + "epoch": 31.27858627858628, + "grad_norm": 0.20565447211265564, + "learning_rate": 3.448721138023838e-05, + "loss": 0.0756, + "num_input_tokens_seen": 10795816, + "step": 15045 + }, + { + "epoch": 31.28898128898129, + "grad_norm": 0.23069798946380615, + "learning_rate": 3.447812755382162e-05, + "loss": 0.0992, + "num_input_tokens_seen": 10799464, + "step": 15050 + }, + { + "epoch": 31.2993762993763, + "grad_norm": 0.1854415237903595, + "learning_rate": 3.446904226575847e-05, + "loss": 0.0774, + "num_input_tokens_seen": 10802984, + "step": 15055 + }, + { + "epoch": 31.30977130977131, + "grad_norm": 0.6562426686286926, + "learning_rate": 3.445995551745002e-05, + "loss": 0.1437, + "num_input_tokens_seen": 10806504, + "step": 15060 + }, + { + "epoch": 31.32016632016632, + "grad_norm": 0.22031264007091522, + "learning_rate": 3.445086731029753e-05, + "loss": 0.1137, + "num_input_tokens_seen": 10809896, + "step": 15065 + }, + { + "epoch": 31.33056133056133, + "grad_norm": 0.6435470581054688, + "learning_rate": 3.444177764570255e-05, + "loss": 0.1342, + "num_input_tokens_seen": 10813480, + "step": 15070 + }, + { + "epoch": 31.340956340956343, + "grad_norm": 0.1688508838415146, + "learning_rate": 3.44326865250668e-05, + "loss": 0.117, + "num_input_tokens_seen": 10817032, + "step": 15075 + }, + { + "epoch": 31.35135135135135, + "grad_norm": 0.2112405002117157, + "learning_rate": 3.442359394979225e-05, + "loss": 0.0847, + "num_input_tokens_seen": 10820552, + "step": 15080 + }, + { + "epoch": 31.361746361746363, + "grad_norm": 0.4648202657699585, + "learning_rate": 3.441449992128108e-05, + "loss": 0.0988, + "num_input_tokens_seen": 10824008, + "step": 15085 + }, + { + "epoch": 31.37214137214137, + "grad_norm": 0.25837886333465576, + "learning_rate": 3.440540444093573e-05, + "loss": 0.109, + "num_input_tokens_seen": 10827656, + "step": 15090 + }, + { + "epoch": 31.382536382536383, + "grad_norm": 0.24449419975280762, + "learning_rate": 3.43963075101588e-05, + "loss": 0.0969, + "num_input_tokens_seen": 10831400, + "step": 15095 + }, + { + "epoch": 31.39293139293139, + "grad_norm": 0.19265064597129822, + "learning_rate": 3.438720913035318e-05, + "loss": 0.1114, + "num_input_tokens_seen": 10834920, + "step": 15100 + }, + { + "epoch": 31.403326403326403, + "grad_norm": 0.5099555850028992, + "learning_rate": 3.437810930292195e-05, + "loss": 0.0974, + "num_input_tokens_seen": 10838408, + "step": 15105 + }, + { + "epoch": 31.413721413721415, + "grad_norm": 0.4742102324962616, + "learning_rate": 3.43690080292684e-05, + "loss": 0.1121, + "num_input_tokens_seen": 10841960, + "step": 15110 + }, + { + "epoch": 31.424116424116423, + "grad_norm": 0.27809950709342957, + "learning_rate": 3.435990531079608e-05, + "loss": 0.0954, + "num_input_tokens_seen": 10845640, + "step": 15115 + }, + { + "epoch": 31.434511434511435, + "grad_norm": 0.2242393046617508, + "learning_rate": 3.435080114890874e-05, + "loss": 0.0911, + "num_input_tokens_seen": 10849320, + "step": 15120 + }, + { + "epoch": 31.444906444906444, + "grad_norm": 0.2847963869571686, + "learning_rate": 3.434169554501035e-05, + "loss": 0.115, + "num_input_tokens_seen": 10852968, + "step": 15125 + }, + { + "epoch": 31.455301455301456, + "grad_norm": 0.7997297048568726, + "learning_rate": 3.433258850050511e-05, + "loss": 0.131, + "num_input_tokens_seen": 10856552, + "step": 15130 + }, + { + "epoch": 31.465696465696467, + "grad_norm": 0.1955518275499344, + "learning_rate": 3.4323480016797446e-05, + "loss": 0.0855, + "num_input_tokens_seen": 10860040, + "step": 15135 + }, + { + "epoch": 31.476091476091476, + "grad_norm": 0.569369375705719, + "learning_rate": 3.4314370095291995e-05, + "loss": 0.1343, + "num_input_tokens_seen": 10863624, + "step": 15140 + }, + { + "epoch": 31.486486486486488, + "grad_norm": 0.32658159732818604, + "learning_rate": 3.430525873739363e-05, + "loss": 0.0941, + "num_input_tokens_seen": 10867240, + "step": 15145 + }, + { + "epoch": 31.496881496881496, + "grad_norm": 0.26486900448799133, + "learning_rate": 3.429614594450743e-05, + "loss": 0.1184, + "num_input_tokens_seen": 10870792, + "step": 15150 + }, + { + "epoch": 31.507276507276508, + "grad_norm": 0.4448113739490509, + "learning_rate": 3.428703171803869e-05, + "loss": 0.1393, + "num_input_tokens_seen": 10874280, + "step": 15155 + }, + { + "epoch": 31.517671517671516, + "grad_norm": 0.24841134250164032, + "learning_rate": 3.4277916059392964e-05, + "loss": 0.1134, + "num_input_tokens_seen": 10877864, + "step": 15160 + }, + { + "epoch": 31.528066528066528, + "grad_norm": 0.36461734771728516, + "learning_rate": 3.426879896997598e-05, + "loss": 0.1092, + "num_input_tokens_seen": 10881576, + "step": 15165 + }, + { + "epoch": 31.53846153846154, + "grad_norm": 0.9862841367721558, + "learning_rate": 3.425968045119372e-05, + "loss": 0.1318, + "num_input_tokens_seen": 10885128, + "step": 15170 + }, + { + "epoch": 31.54885654885655, + "grad_norm": 0.5096420645713806, + "learning_rate": 3.425056050445237e-05, + "loss": 0.1004, + "num_input_tokens_seen": 10888648, + "step": 15175 + }, + { + "epoch": 31.55925155925156, + "grad_norm": 0.24924375116825104, + "learning_rate": 3.4241439131158336e-05, + "loss": 0.1257, + "num_input_tokens_seen": 10892296, + "step": 15180 + }, + { + "epoch": 31.56964656964657, + "grad_norm": 0.4950539171695709, + "learning_rate": 3.423231633271825e-05, + "loss": 0.1207, + "num_input_tokens_seen": 10895944, + "step": 15185 + }, + { + "epoch": 31.58004158004158, + "grad_norm": 0.32401788234710693, + "learning_rate": 3.4223192110538985e-05, + "loss": 0.0924, + "num_input_tokens_seen": 10899464, + "step": 15190 + }, + { + "epoch": 31.59043659043659, + "grad_norm": 0.3788856267929077, + "learning_rate": 3.4214066466027575e-05, + "loss": 0.1012, + "num_input_tokens_seen": 10902920, + "step": 15195 + }, + { + "epoch": 31.6008316008316, + "grad_norm": 0.25095412135124207, + "learning_rate": 3.4204939400591325e-05, + "loss": 0.0727, + "num_input_tokens_seen": 10906568, + "step": 15200 + }, + { + "epoch": 31.6008316008316, + "eval_loss": 0.1447717696428299, + "eval_runtime": 7.756, + "eval_samples_per_second": 110.366, + "eval_steps_per_second": 27.591, + "num_input_tokens_seen": 10906568, + "step": 15200 + }, + { + "epoch": 31.611226611226613, + "grad_norm": 0.23912803828716278, + "learning_rate": 3.419581091563775e-05, + "loss": 0.0849, + "num_input_tokens_seen": 10910120, + "step": 15205 + }, + { + "epoch": 31.62162162162162, + "grad_norm": 0.6851762533187866, + "learning_rate": 3.418668101257456e-05, + "loss": 0.1329, + "num_input_tokens_seen": 10913544, + "step": 15210 + }, + { + "epoch": 31.632016632016633, + "grad_norm": 0.31959107518196106, + "learning_rate": 3.417754969280971e-05, + "loss": 0.1065, + "num_input_tokens_seen": 10917128, + "step": 15215 + }, + { + "epoch": 31.64241164241164, + "grad_norm": 0.15929371118545532, + "learning_rate": 3.416841695775137e-05, + "loss": 0.0927, + "num_input_tokens_seen": 10920712, + "step": 15220 + }, + { + "epoch": 31.652806652806653, + "grad_norm": 0.3355207145214081, + "learning_rate": 3.415928280880792e-05, + "loss": 0.1643, + "num_input_tokens_seen": 10924296, + "step": 15225 + }, + { + "epoch": 31.66320166320166, + "grad_norm": 1.0838857889175415, + "learning_rate": 3.4150147247387965e-05, + "loss": 0.0928, + "num_input_tokens_seen": 10928104, + "step": 15230 + }, + { + "epoch": 31.673596673596673, + "grad_norm": 0.23599933087825775, + "learning_rate": 3.4141010274900306e-05, + "loss": 0.1243, + "num_input_tokens_seen": 10931784, + "step": 15235 + }, + { + "epoch": 31.683991683991685, + "grad_norm": 0.47703078389167786, + "learning_rate": 3.413187189275399e-05, + "loss": 0.1195, + "num_input_tokens_seen": 10935368, + "step": 15240 + }, + { + "epoch": 31.694386694386694, + "grad_norm": 0.12481817603111267, + "learning_rate": 3.4122732102358265e-05, + "loss": 0.0899, + "num_input_tokens_seen": 10938824, + "step": 15245 + }, + { + "epoch": 31.704781704781706, + "grad_norm": 0.2491186261177063, + "learning_rate": 3.411359090512261e-05, + "loss": 0.0912, + "num_input_tokens_seen": 10942472, + "step": 15250 + }, + { + "epoch": 31.715176715176714, + "grad_norm": 0.19679787755012512, + "learning_rate": 3.410444830245672e-05, + "loss": 0.1354, + "num_input_tokens_seen": 10946024, + "step": 15255 + }, + { + "epoch": 31.725571725571726, + "grad_norm": 0.20485955476760864, + "learning_rate": 3.409530429577048e-05, + "loss": 0.0781, + "num_input_tokens_seen": 10949608, + "step": 15260 + }, + { + "epoch": 31.735966735966738, + "grad_norm": 0.4422963559627533, + "learning_rate": 3.408615888647402e-05, + "loss": 0.1053, + "num_input_tokens_seen": 10953096, + "step": 15265 + }, + { + "epoch": 31.746361746361746, + "grad_norm": 0.30721327662467957, + "learning_rate": 3.4077012075977675e-05, + "loss": 0.0756, + "num_input_tokens_seen": 10956744, + "step": 15270 + }, + { + "epoch": 31.756756756756758, + "grad_norm": 0.42067626118659973, + "learning_rate": 3.4067863865692e-05, + "loss": 0.0941, + "num_input_tokens_seen": 10960168, + "step": 15275 + }, + { + "epoch": 31.767151767151766, + "grad_norm": 0.43333300948143005, + "learning_rate": 3.4058714257027755e-05, + "loss": 0.1114, + "num_input_tokens_seen": 10963912, + "step": 15280 + }, + { + "epoch": 31.777546777546778, + "grad_norm": 0.4050144553184509, + "learning_rate": 3.404956325139594e-05, + "loss": 0.0784, + "num_input_tokens_seen": 10967400, + "step": 15285 + }, + { + "epoch": 31.787941787941786, + "grad_norm": 0.30172351002693176, + "learning_rate": 3.404041085020775e-05, + "loss": 0.0953, + "num_input_tokens_seen": 10971112, + "step": 15290 + }, + { + "epoch": 31.7983367983368, + "grad_norm": 0.17736545205116272, + "learning_rate": 3.403125705487459e-05, + "loss": 0.0941, + "num_input_tokens_seen": 10974536, + "step": 15295 + }, + { + "epoch": 31.80873180873181, + "grad_norm": 0.27804747223854065, + "learning_rate": 3.402210186680811e-05, + "loss": 0.1381, + "num_input_tokens_seen": 10978088, + "step": 15300 + }, + { + "epoch": 31.81912681912682, + "grad_norm": 0.2741745114326477, + "learning_rate": 3.4012945287420137e-05, + "loss": 0.0918, + "num_input_tokens_seen": 10981640, + "step": 15305 + }, + { + "epoch": 31.82952182952183, + "grad_norm": 0.6909206509590149, + "learning_rate": 3.400378731812274e-05, + "loss": 0.0823, + "num_input_tokens_seen": 10985160, + "step": 15310 + }, + { + "epoch": 31.83991683991684, + "grad_norm": 0.2382153421640396, + "learning_rate": 3.399462796032817e-05, + "loss": 0.0894, + "num_input_tokens_seen": 10988648, + "step": 15315 + }, + { + "epoch": 31.85031185031185, + "grad_norm": 0.3029731214046478, + "learning_rate": 3.3985467215448954e-05, + "loss": 0.1371, + "num_input_tokens_seen": 10992104, + "step": 15320 + }, + { + "epoch": 31.86070686070686, + "grad_norm": 0.333990216255188, + "learning_rate": 3.3976305084897776e-05, + "loss": 0.1162, + "num_input_tokens_seen": 10995560, + "step": 15325 + }, + { + "epoch": 31.87110187110187, + "grad_norm": 0.22686298191547394, + "learning_rate": 3.3967141570087544e-05, + "loss": 0.1054, + "num_input_tokens_seen": 10999208, + "step": 15330 + }, + { + "epoch": 31.881496881496883, + "grad_norm": 0.3401951193809509, + "learning_rate": 3.39579766724314e-05, + "loss": 0.109, + "num_input_tokens_seen": 11002952, + "step": 15335 + }, + { + "epoch": 31.89189189189189, + "grad_norm": 0.2399640828371048, + "learning_rate": 3.3948810393342677e-05, + "loss": 0.1033, + "num_input_tokens_seen": 11006472, + "step": 15340 + }, + { + "epoch": 31.902286902286903, + "grad_norm": 0.6455249786376953, + "learning_rate": 3.3939642734234936e-05, + "loss": 0.0921, + "num_input_tokens_seen": 11010024, + "step": 15345 + }, + { + "epoch": 31.91268191268191, + "grad_norm": 0.24325300753116608, + "learning_rate": 3.393047369652194e-05, + "loss": 0.1701, + "num_input_tokens_seen": 11013864, + "step": 15350 + }, + { + "epoch": 31.923076923076923, + "grad_norm": 0.1991899311542511, + "learning_rate": 3.3921303281617664e-05, + "loss": 0.1397, + "num_input_tokens_seen": 11017416, + "step": 15355 + }, + { + "epoch": 31.933471933471935, + "grad_norm": 0.24266503751277924, + "learning_rate": 3.391213149093632e-05, + "loss": 0.1126, + "num_input_tokens_seen": 11020968, + "step": 15360 + }, + { + "epoch": 31.943866943866944, + "grad_norm": 0.3139474391937256, + "learning_rate": 3.3902958325892303e-05, + "loss": 0.1094, + "num_input_tokens_seen": 11024616, + "step": 15365 + }, + { + "epoch": 31.954261954261955, + "grad_norm": 0.7523162961006165, + "learning_rate": 3.389378378790023e-05, + "loss": 0.1398, + "num_input_tokens_seen": 11028264, + "step": 15370 + }, + { + "epoch": 31.964656964656964, + "grad_norm": 0.2566222548484802, + "learning_rate": 3.388460787837493e-05, + "loss": 0.0758, + "num_input_tokens_seen": 11031912, + "step": 15375 + }, + { + "epoch": 31.975051975051976, + "grad_norm": 0.17023424804210663, + "learning_rate": 3.387543059873145e-05, + "loss": 0.1293, + "num_input_tokens_seen": 11035528, + "step": 15380 + }, + { + "epoch": 31.985446985446984, + "grad_norm": 0.9409210681915283, + "learning_rate": 3.386625195038503e-05, + "loss": 0.1621, + "num_input_tokens_seen": 11039208, + "step": 15385 + }, + { + "epoch": 31.995841995841996, + "grad_norm": 1.027238130569458, + "learning_rate": 3.3857071934751136e-05, + "loss": 0.1111, + "num_input_tokens_seen": 11042632, + "step": 15390 + }, + { + "epoch": 32.00623700623701, + "grad_norm": 0.3631748855113983, + "learning_rate": 3.384789055324544e-05, + "loss": 0.0969, + "num_input_tokens_seen": 11046216, + "step": 15395 + }, + { + "epoch": 32.016632016632016, + "grad_norm": 0.3262808918952942, + "learning_rate": 3.3838707807283843e-05, + "loss": 0.1571, + "num_input_tokens_seen": 11049768, + "step": 15400 + }, + { + "epoch": 32.016632016632016, + "eval_loss": 0.14941611886024475, + "eval_runtime": 7.7551, + "eval_samples_per_second": 110.379, + "eval_steps_per_second": 27.595, + "num_input_tokens_seen": 11049768, + "step": 15400 + }, + { + "epoch": 32.027027027027025, + "grad_norm": 0.3880894184112549, + "learning_rate": 3.382952369828243e-05, + "loss": 0.0743, + "num_input_tokens_seen": 11053192, + "step": 15405 + }, + { + "epoch": 32.03742203742204, + "grad_norm": 1.3735435009002686, + "learning_rate": 3.38203382276575e-05, + "loss": 0.1681, + "num_input_tokens_seen": 11056680, + "step": 15410 + }, + { + "epoch": 32.04781704781705, + "grad_norm": 0.30646440386772156, + "learning_rate": 3.381115139682557e-05, + "loss": 0.1241, + "num_input_tokens_seen": 11060328, + "step": 15415 + }, + { + "epoch": 32.05821205821206, + "grad_norm": 0.23132893443107605, + "learning_rate": 3.3801963207203366e-05, + "loss": 0.1278, + "num_input_tokens_seen": 11063848, + "step": 15420 + }, + { + "epoch": 32.06860706860707, + "grad_norm": 0.37274670600891113, + "learning_rate": 3.379277366020782e-05, + "loss": 0.1248, + "num_input_tokens_seen": 11067528, + "step": 15425 + }, + { + "epoch": 32.07900207900208, + "grad_norm": 0.17682713270187378, + "learning_rate": 3.3783582757256085e-05, + "loss": 0.0954, + "num_input_tokens_seen": 11071176, + "step": 15430 + }, + { + "epoch": 32.08939708939709, + "grad_norm": 0.843623161315918, + "learning_rate": 3.3774390499765504e-05, + "loss": 0.1275, + "num_input_tokens_seen": 11074920, + "step": 15435 + }, + { + "epoch": 32.0997920997921, + "grad_norm": 0.5125965476036072, + "learning_rate": 3.376519688915364e-05, + "loss": 0.0911, + "num_input_tokens_seen": 11078568, + "step": 15440 + }, + { + "epoch": 32.11018711018711, + "grad_norm": 0.22901779413223267, + "learning_rate": 3.3756001926838273e-05, + "loss": 0.0827, + "num_input_tokens_seen": 11082120, + "step": 15445 + }, + { + "epoch": 32.12058212058212, + "grad_norm": 0.28577736020088196, + "learning_rate": 3.374680561423737e-05, + "loss": 0.1281, + "num_input_tokens_seen": 11085672, + "step": 15450 + }, + { + "epoch": 32.13097713097713, + "grad_norm": 0.2770964205265045, + "learning_rate": 3.373760795276912e-05, + "loss": 0.1407, + "num_input_tokens_seen": 11089288, + "step": 15455 + }, + { + "epoch": 32.141372141372145, + "grad_norm": 0.22049829363822937, + "learning_rate": 3.372840894385192e-05, + "loss": 0.0849, + "num_input_tokens_seen": 11092808, + "step": 15460 + }, + { + "epoch": 32.15176715176715, + "grad_norm": 0.17528513073921204, + "learning_rate": 3.3719208588904375e-05, + "loss": 0.0482, + "num_input_tokens_seen": 11096328, + "step": 15465 + }, + { + "epoch": 32.16216216216216, + "grad_norm": 0.18365280330181122, + "learning_rate": 3.371000688934529e-05, + "loss": 0.0715, + "num_input_tokens_seen": 11099944, + "step": 15470 + }, + { + "epoch": 32.17255717255717, + "grad_norm": 0.24587863683700562, + "learning_rate": 3.370080384659369e-05, + "loss": 0.0975, + "num_input_tokens_seen": 11103464, + "step": 15475 + }, + { + "epoch": 32.182952182952185, + "grad_norm": 0.37632304430007935, + "learning_rate": 3.36915994620688e-05, + "loss": 0.1342, + "num_input_tokens_seen": 11107112, + "step": 15480 + }, + { + "epoch": 32.19334719334719, + "grad_norm": 0.24020545184612274, + "learning_rate": 3.3682393737190035e-05, + "loss": 0.1004, + "num_input_tokens_seen": 11110760, + "step": 15485 + }, + { + "epoch": 32.2037422037422, + "grad_norm": 0.43841415643692017, + "learning_rate": 3.3673186673377054e-05, + "loss": 0.1585, + "num_input_tokens_seen": 11114216, + "step": 15490 + }, + { + "epoch": 32.21413721413722, + "grad_norm": 0.2684189975261688, + "learning_rate": 3.366397827204969e-05, + "loss": 0.1284, + "num_input_tokens_seen": 11117832, + "step": 15495 + }, + { + "epoch": 32.224532224532226, + "grad_norm": 0.3127928078174591, + "learning_rate": 3.3654768534628e-05, + "loss": 0.0859, + "num_input_tokens_seen": 11121288, + "step": 15500 + }, + { + "epoch": 32.234927234927234, + "grad_norm": 0.9920690655708313, + "learning_rate": 3.3645557462532245e-05, + "loss": 0.0861, + "num_input_tokens_seen": 11124872, + "step": 15505 + }, + { + "epoch": 32.24532224532224, + "grad_norm": 0.22074908018112183, + "learning_rate": 3.363634505718288e-05, + "loss": 0.076, + "num_input_tokens_seen": 11128584, + "step": 15510 + }, + { + "epoch": 32.25571725571726, + "grad_norm": 0.10265649855136871, + "learning_rate": 3.362713132000057e-05, + "loss": 0.0852, + "num_input_tokens_seen": 11132104, + "step": 15515 + }, + { + "epoch": 32.266112266112266, + "grad_norm": 0.4646231532096863, + "learning_rate": 3.36179162524062e-05, + "loss": 0.1476, + "num_input_tokens_seen": 11135656, + "step": 15520 + }, + { + "epoch": 32.276507276507274, + "grad_norm": 0.8655553460121155, + "learning_rate": 3.3608699855820846e-05, + "loss": 0.117, + "num_input_tokens_seen": 11139272, + "step": 15525 + }, + { + "epoch": 32.28690228690229, + "grad_norm": 0.6602957844734192, + "learning_rate": 3.359948213166578e-05, + "loss": 0.136, + "num_input_tokens_seen": 11142888, + "step": 15530 + }, + { + "epoch": 32.2972972972973, + "grad_norm": 0.6213573813438416, + "learning_rate": 3.359026308136252e-05, + "loss": 0.1087, + "num_input_tokens_seen": 11146408, + "step": 15535 + }, + { + "epoch": 32.30769230769231, + "grad_norm": 0.3003048896789551, + "learning_rate": 3.358104270633272e-05, + "loss": 0.0887, + "num_input_tokens_seen": 11149928, + "step": 15540 + }, + { + "epoch": 32.318087318087315, + "grad_norm": 0.33454424142837524, + "learning_rate": 3.357182100799831e-05, + "loss": 0.1021, + "num_input_tokens_seen": 11153800, + "step": 15545 + }, + { + "epoch": 32.32848232848233, + "grad_norm": 0.6750790476799011, + "learning_rate": 3.3562597987781384e-05, + "loss": 0.111, + "num_input_tokens_seen": 11157320, + "step": 15550 + }, + { + "epoch": 32.33887733887734, + "grad_norm": 0.28157612681388855, + "learning_rate": 3.355337364710424e-05, + "loss": 0.0985, + "num_input_tokens_seen": 11160968, + "step": 15555 + }, + { + "epoch": 32.34927234927235, + "grad_norm": 0.28646594285964966, + "learning_rate": 3.354414798738939e-05, + "loss": 0.1482, + "num_input_tokens_seen": 11164872, + "step": 15560 + }, + { + "epoch": 32.35966735966736, + "grad_norm": 0.3084107041358948, + "learning_rate": 3.353492101005955e-05, + "loss": 0.1156, + "num_input_tokens_seen": 11168712, + "step": 15565 + }, + { + "epoch": 32.37006237006237, + "grad_norm": 0.22229881584644318, + "learning_rate": 3.352569271653763e-05, + "loss": 0.1249, + "num_input_tokens_seen": 11172200, + "step": 15570 + }, + { + "epoch": 32.38045738045738, + "grad_norm": 0.45681485533714294, + "learning_rate": 3.351646310824675e-05, + "loss": 0.1154, + "num_input_tokens_seen": 11175720, + "step": 15575 + }, + { + "epoch": 32.39085239085239, + "grad_norm": 0.4266888499259949, + "learning_rate": 3.350723218661023e-05, + "loss": 0.1096, + "num_input_tokens_seen": 11179240, + "step": 15580 + }, + { + "epoch": 32.4012474012474, + "grad_norm": 0.17321327328681946, + "learning_rate": 3.349799995305162e-05, + "loss": 0.0607, + "num_input_tokens_seen": 11182824, + "step": 15585 + }, + { + "epoch": 32.41164241164241, + "grad_norm": 0.3986856937408447, + "learning_rate": 3.348876640899461e-05, + "loss": 0.1205, + "num_input_tokens_seen": 11186344, + "step": 15590 + }, + { + "epoch": 32.42203742203742, + "grad_norm": 0.225809246301651, + "learning_rate": 3.3479531555863144e-05, + "loss": 0.0901, + "num_input_tokens_seen": 11189800, + "step": 15595 + }, + { + "epoch": 32.432432432432435, + "grad_norm": 0.49829837679862976, + "learning_rate": 3.3470295395081344e-05, + "loss": 0.0968, + "num_input_tokens_seen": 11193256, + "step": 15600 + }, + { + "epoch": 32.432432432432435, + "eval_loss": 0.1503501832485199, + "eval_runtime": 7.7696, + "eval_samples_per_second": 110.173, + "eval_steps_per_second": 27.543, + "num_input_tokens_seen": 11193256, + "step": 15600 + }, + { + "epoch": 32.44282744282744, + "grad_norm": 0.42874979972839355, + "learning_rate": 3.3461057928073556e-05, + "loss": 0.1131, + "num_input_tokens_seen": 11196936, + "step": 15605 + }, + { + "epoch": 32.45322245322245, + "grad_norm": 0.24726137518882751, + "learning_rate": 3.345181915626431e-05, + "loss": 0.1056, + "num_input_tokens_seen": 11200488, + "step": 15610 + }, + { + "epoch": 32.46361746361746, + "grad_norm": 0.4352732002735138, + "learning_rate": 3.344257908107834e-05, + "loss": 0.1317, + "num_input_tokens_seen": 11204072, + "step": 15615 + }, + { + "epoch": 32.474012474012476, + "grad_norm": 0.444985032081604, + "learning_rate": 3.343333770394058e-05, + "loss": 0.1051, + "num_input_tokens_seen": 11207592, + "step": 15620 + }, + { + "epoch": 32.484407484407484, + "grad_norm": 0.49676796793937683, + "learning_rate": 3.342409502627616e-05, + "loss": 0.1147, + "num_input_tokens_seen": 11211208, + "step": 15625 + }, + { + "epoch": 32.49480249480249, + "grad_norm": 0.3115270733833313, + "learning_rate": 3.341485104951043e-05, + "loss": 0.127, + "num_input_tokens_seen": 11214824, + "step": 15630 + }, + { + "epoch": 32.50519750519751, + "grad_norm": 0.5946514010429382, + "learning_rate": 3.340560577506892e-05, + "loss": 0.1433, + "num_input_tokens_seen": 11218344, + "step": 15635 + }, + { + "epoch": 32.515592515592516, + "grad_norm": 0.49604707956314087, + "learning_rate": 3.339635920437735e-05, + "loss": 0.1485, + "num_input_tokens_seen": 11221928, + "step": 15640 + }, + { + "epoch": 32.525987525987524, + "grad_norm": 0.28011026978492737, + "learning_rate": 3.338711133886169e-05, + "loss": 0.1383, + "num_input_tokens_seen": 11225384, + "step": 15645 + }, + { + "epoch": 32.53638253638254, + "grad_norm": 0.4889337122440338, + "learning_rate": 3.3377862179948064e-05, + "loss": 0.1055, + "num_input_tokens_seen": 11229064, + "step": 15650 + }, + { + "epoch": 32.54677754677755, + "grad_norm": 0.45862218737602234, + "learning_rate": 3.336861172906281e-05, + "loss": 0.1193, + "num_input_tokens_seen": 11232648, + "step": 15655 + }, + { + "epoch": 32.55717255717256, + "grad_norm": 0.2866257429122925, + "learning_rate": 3.335935998763245e-05, + "loss": 0.1257, + "num_input_tokens_seen": 11236200, + "step": 15660 + }, + { + "epoch": 32.567567567567565, + "grad_norm": 0.2877812087535858, + "learning_rate": 3.3350106957083744e-05, + "loss": 0.1245, + "num_input_tokens_seen": 11239848, + "step": 15665 + }, + { + "epoch": 32.57796257796258, + "grad_norm": 0.7417880296707153, + "learning_rate": 3.33408526388436e-05, + "loss": 0.1617, + "num_input_tokens_seen": 11243464, + "step": 15670 + }, + { + "epoch": 32.58835758835759, + "grad_norm": 0.831821620464325, + "learning_rate": 3.3331597034339166e-05, + "loss": 0.1311, + "num_input_tokens_seen": 11247272, + "step": 15675 + }, + { + "epoch": 32.5987525987526, + "grad_norm": 0.16141104698181152, + "learning_rate": 3.3322340144997764e-05, + "loss": 0.1119, + "num_input_tokens_seen": 11250984, + "step": 15680 + }, + { + "epoch": 32.60914760914761, + "grad_norm": 0.14285215735435486, + "learning_rate": 3.331308197224693e-05, + "loss": 0.0865, + "num_input_tokens_seen": 11254536, + "step": 15685 + }, + { + "epoch": 32.61954261954262, + "grad_norm": 0.17524965107440948, + "learning_rate": 3.330382251751438e-05, + "loss": 0.0891, + "num_input_tokens_seen": 11257992, + "step": 15690 + }, + { + "epoch": 32.62993762993763, + "grad_norm": 0.4169473350048065, + "learning_rate": 3.3294561782228054e-05, + "loss": 0.0859, + "num_input_tokens_seen": 11261544, + "step": 15695 + }, + { + "epoch": 32.64033264033264, + "grad_norm": 0.17457488179206848, + "learning_rate": 3.328529976781607e-05, + "loss": 0.0689, + "num_input_tokens_seen": 11265096, + "step": 15700 + }, + { + "epoch": 32.65072765072765, + "grad_norm": 0.5901395082473755, + "learning_rate": 3.327603647570673e-05, + "loss": 0.1381, + "num_input_tokens_seen": 11268840, + "step": 15705 + }, + { + "epoch": 32.66112266112266, + "grad_norm": 0.297249972820282, + "learning_rate": 3.326677190732857e-05, + "loss": 0.1496, + "num_input_tokens_seen": 11272456, + "step": 15710 + }, + { + "epoch": 32.67151767151767, + "grad_norm": 0.2101944088935852, + "learning_rate": 3.325750606411029e-05, + "loss": 0.0978, + "num_input_tokens_seen": 11276008, + "step": 15715 + }, + { + "epoch": 32.681912681912685, + "grad_norm": 0.33530086278915405, + "learning_rate": 3.3248238947480804e-05, + "loss": 0.1172, + "num_input_tokens_seen": 11279720, + "step": 15720 + }, + { + "epoch": 32.69230769230769, + "grad_norm": 0.21197867393493652, + "learning_rate": 3.323897055886922e-05, + "loss": 0.1143, + "num_input_tokens_seen": 11283368, + "step": 15725 + }, + { + "epoch": 32.7027027027027, + "grad_norm": 0.24303144216537476, + "learning_rate": 3.322970089970484e-05, + "loss": 0.1195, + "num_input_tokens_seen": 11286856, + "step": 15730 + }, + { + "epoch": 32.71309771309771, + "grad_norm": 0.2894599735736847, + "learning_rate": 3.3220429971417165e-05, + "loss": 0.1295, + "num_input_tokens_seen": 11290344, + "step": 15735 + }, + { + "epoch": 32.723492723492726, + "grad_norm": 0.45852604508399963, + "learning_rate": 3.321115777543588e-05, + "loss": 0.0769, + "num_input_tokens_seen": 11293832, + "step": 15740 + }, + { + "epoch": 32.733887733887734, + "grad_norm": 0.20934465527534485, + "learning_rate": 3.320188431319088e-05, + "loss": 0.076, + "num_input_tokens_seen": 11297448, + "step": 15745 + }, + { + "epoch": 32.74428274428274, + "grad_norm": 0.46664589643478394, + "learning_rate": 3.319260958611224e-05, + "loss": 0.1385, + "num_input_tokens_seen": 11300808, + "step": 15750 + }, + { + "epoch": 32.75467775467776, + "grad_norm": 0.242249995470047, + "learning_rate": 3.3183333595630256e-05, + "loss": 0.1096, + "num_input_tokens_seen": 11304680, + "step": 15755 + }, + { + "epoch": 32.765072765072766, + "grad_norm": 0.7814558744430542, + "learning_rate": 3.317405634317538e-05, + "loss": 0.1578, + "num_input_tokens_seen": 11308296, + "step": 15760 + }, + { + "epoch": 32.775467775467774, + "grad_norm": 0.25899738073349, + "learning_rate": 3.3164777830178315e-05, + "loss": 0.1577, + "num_input_tokens_seen": 11311720, + "step": 15765 + }, + { + "epoch": 32.78586278586278, + "grad_norm": 0.4329071640968323, + "learning_rate": 3.315549805806989e-05, + "loss": 0.0985, + "num_input_tokens_seen": 11315336, + "step": 15770 + }, + { + "epoch": 32.7962577962578, + "grad_norm": 0.2871430814266205, + "learning_rate": 3.314621702828118e-05, + "loss": 0.1028, + "num_input_tokens_seen": 11318888, + "step": 15775 + }, + { + "epoch": 32.80665280665281, + "grad_norm": 0.16076844930648804, + "learning_rate": 3.313693474224342e-05, + "loss": 0.138, + "num_input_tokens_seen": 11322344, + "step": 15780 + }, + { + "epoch": 32.817047817047815, + "grad_norm": 0.3413338363170624, + "learning_rate": 3.312765120138809e-05, + "loss": 0.1099, + "num_input_tokens_seen": 11325864, + "step": 15785 + }, + { + "epoch": 32.82744282744283, + "grad_norm": 0.3219384551048279, + "learning_rate": 3.311836640714679e-05, + "loss": 0.1259, + "num_input_tokens_seen": 11329320, + "step": 15790 + }, + { + "epoch": 32.83783783783784, + "grad_norm": 0.42214828729629517, + "learning_rate": 3.310908036095137e-05, + "loss": 0.1085, + "num_input_tokens_seen": 11333000, + "step": 15795 + }, + { + "epoch": 32.84823284823285, + "grad_norm": 0.14682762324810028, + "learning_rate": 3.309979306423386e-05, + "loss": 0.0854, + "num_input_tokens_seen": 11336648, + "step": 15800 + }, + { + "epoch": 32.84823284823285, + "eval_loss": 0.14463010430335999, + "eval_runtime": 7.7633, + "eval_samples_per_second": 110.263, + "eval_steps_per_second": 27.566, + "num_input_tokens_seen": 11336648, + "step": 15800 + }, + { + "epoch": 32.858627858627855, + "grad_norm": 0.1476428359746933, + "learning_rate": 3.309050451842647e-05, + "loss": 0.1058, + "num_input_tokens_seen": 11340200, + "step": 15805 + }, + { + "epoch": 32.86902286902287, + "grad_norm": 0.25208452343940735, + "learning_rate": 3.3081214724961604e-05, + "loss": 0.1127, + "num_input_tokens_seen": 11343752, + "step": 15810 + }, + { + "epoch": 32.87941787941788, + "grad_norm": 0.20949850976467133, + "learning_rate": 3.307192368527188e-05, + "loss": 0.101, + "num_input_tokens_seen": 11347272, + "step": 15815 + }, + { + "epoch": 32.88981288981289, + "grad_norm": 0.468916654586792, + "learning_rate": 3.306263140079008e-05, + "loss": 0.1142, + "num_input_tokens_seen": 11350984, + "step": 15820 + }, + { + "epoch": 32.9002079002079, + "grad_norm": 0.9215754270553589, + "learning_rate": 3.30533378729492e-05, + "loss": 0.1653, + "num_input_tokens_seen": 11354664, + "step": 15825 + }, + { + "epoch": 32.91060291060291, + "grad_norm": 0.3284206688404083, + "learning_rate": 3.304404310318242e-05, + "loss": 0.0888, + "num_input_tokens_seen": 11358024, + "step": 15830 + }, + { + "epoch": 32.92099792099792, + "grad_norm": 0.19996018707752228, + "learning_rate": 3.3034747092923105e-05, + "loss": 0.1559, + "num_input_tokens_seen": 11361672, + "step": 15835 + }, + { + "epoch": 32.931392931392935, + "grad_norm": 0.17097575962543488, + "learning_rate": 3.3025449843604806e-05, + "loss": 0.1336, + "num_input_tokens_seen": 11365224, + "step": 15840 + }, + { + "epoch": 32.94178794178794, + "grad_norm": 0.2244734764099121, + "learning_rate": 3.30161513566613e-05, + "loss": 0.111, + "num_input_tokens_seen": 11368872, + "step": 15845 + }, + { + "epoch": 32.95218295218295, + "grad_norm": 0.4077819585800171, + "learning_rate": 3.3006851633526506e-05, + "loss": 0.1018, + "num_input_tokens_seen": 11372424, + "step": 15850 + }, + { + "epoch": 32.96257796257796, + "grad_norm": 0.22133323550224304, + "learning_rate": 3.2997550675634584e-05, + "loss": 0.1064, + "num_input_tokens_seen": 11376040, + "step": 15855 + }, + { + "epoch": 32.972972972972975, + "grad_norm": 0.23004476726055145, + "learning_rate": 3.2988248484419825e-05, + "loss": 0.0958, + "num_input_tokens_seen": 11379624, + "step": 15860 + }, + { + "epoch": 32.983367983367984, + "grad_norm": 0.29541853070259094, + "learning_rate": 3.2978945061316776e-05, + "loss": 0.086, + "num_input_tokens_seen": 11383368, + "step": 15865 + }, + { + "epoch": 32.99376299376299, + "grad_norm": 0.19123277068138123, + "learning_rate": 3.296964040776013e-05, + "loss": 0.1126, + "num_input_tokens_seen": 11386920, + "step": 15870 + }, + { + "epoch": 33.00415800415801, + "grad_norm": 0.2596879303455353, + "learning_rate": 3.296033452518478e-05, + "loss": 0.1411, + "num_input_tokens_seen": 11390616, + "step": 15875 + }, + { + "epoch": 33.014553014553016, + "grad_norm": 0.259531706571579, + "learning_rate": 3.2951027415025806e-05, + "loss": 0.0916, + "num_input_tokens_seen": 11394296, + "step": 15880 + }, + { + "epoch": 33.024948024948024, + "grad_norm": 0.3113299012184143, + "learning_rate": 3.294171907871849e-05, + "loss": 0.0786, + "num_input_tokens_seen": 11397976, + "step": 15885 + }, + { + "epoch": 33.03534303534303, + "grad_norm": 0.37494683265686035, + "learning_rate": 3.293240951769828e-05, + "loss": 0.1133, + "num_input_tokens_seen": 11401400, + "step": 15890 + }, + { + "epoch": 33.04573804573805, + "grad_norm": 0.4049595296382904, + "learning_rate": 3.2923098733400846e-05, + "loss": 0.147, + "num_input_tokens_seen": 11404888, + "step": 15895 + }, + { + "epoch": 33.056133056133056, + "grad_norm": 0.4369422495365143, + "learning_rate": 3.291378672726202e-05, + "loss": 0.0839, + "num_input_tokens_seen": 11408472, + "step": 15900 + }, + { + "epoch": 33.066528066528065, + "grad_norm": 0.4820076823234558, + "learning_rate": 3.2904473500717824e-05, + "loss": 0.1658, + "num_input_tokens_seen": 11411896, + "step": 15905 + }, + { + "epoch": 33.07692307692308, + "grad_norm": 0.17815333604812622, + "learning_rate": 3.289515905520449e-05, + "loss": 0.1132, + "num_input_tokens_seen": 11415672, + "step": 15910 + }, + { + "epoch": 33.08731808731809, + "grad_norm": 0.2098708599805832, + "learning_rate": 3.288584339215841e-05, + "loss": 0.0967, + "num_input_tokens_seen": 11419352, + "step": 15915 + }, + { + "epoch": 33.0977130977131, + "grad_norm": 0.43450936675071716, + "learning_rate": 3.287652651301617e-05, + "loss": 0.1708, + "num_input_tokens_seen": 11423032, + "step": 15920 + }, + { + "epoch": 33.108108108108105, + "grad_norm": 0.36527538299560547, + "learning_rate": 3.286720841921457e-05, + "loss": 0.0971, + "num_input_tokens_seen": 11426616, + "step": 15925 + }, + { + "epoch": 33.11850311850312, + "grad_norm": 0.20888946950435638, + "learning_rate": 3.285788911219056e-05, + "loss": 0.1143, + "num_input_tokens_seen": 11430232, + "step": 15930 + }, + { + "epoch": 33.12889812889813, + "grad_norm": 0.29336848855018616, + "learning_rate": 3.284856859338131e-05, + "loss": 0.1148, + "num_input_tokens_seen": 11433784, + "step": 15935 + }, + { + "epoch": 33.13929313929314, + "grad_norm": 0.6112812757492065, + "learning_rate": 3.283924686422414e-05, + "loss": 0.1008, + "num_input_tokens_seen": 11437400, + "step": 15940 + }, + { + "epoch": 33.14968814968815, + "grad_norm": 0.2118680328130722, + "learning_rate": 3.282992392615659e-05, + "loss": 0.1073, + "num_input_tokens_seen": 11440984, + "step": 15945 + }, + { + "epoch": 33.16008316008316, + "grad_norm": 0.5280019044876099, + "learning_rate": 3.282059978061638e-05, + "loss": 0.1248, + "num_input_tokens_seen": 11444568, + "step": 15950 + }, + { + "epoch": 33.17047817047817, + "grad_norm": 0.3425813317298889, + "learning_rate": 3.28112744290414e-05, + "loss": 0.1197, + "num_input_tokens_seen": 11448280, + "step": 15955 + }, + { + "epoch": 33.18087318087318, + "grad_norm": 0.3413282632827759, + "learning_rate": 3.280194787286974e-05, + "loss": 0.1286, + "num_input_tokens_seen": 11451960, + "step": 15960 + }, + { + "epoch": 33.19126819126819, + "grad_norm": 0.1330193728208542, + "learning_rate": 3.2792620113539674e-05, + "loss": 0.0752, + "num_input_tokens_seen": 11455480, + "step": 15965 + }, + { + "epoch": 33.2016632016632, + "grad_norm": 0.26091253757476807, + "learning_rate": 3.278329115248966e-05, + "loss": 0.1077, + "num_input_tokens_seen": 11459160, + "step": 15970 + }, + { + "epoch": 33.21205821205821, + "grad_norm": 0.2659338414669037, + "learning_rate": 3.277396099115834e-05, + "loss": 0.0935, + "num_input_tokens_seen": 11462904, + "step": 15975 + }, + { + "epoch": 33.222453222453225, + "grad_norm": 0.14724574983119965, + "learning_rate": 3.276462963098454e-05, + "loss": 0.124, + "num_input_tokens_seen": 11466552, + "step": 15980 + }, + { + "epoch": 33.232848232848234, + "grad_norm": 0.19727779924869537, + "learning_rate": 3.275529707340728e-05, + "loss": 0.1251, + "num_input_tokens_seen": 11470136, + "step": 15985 + }, + { + "epoch": 33.24324324324324, + "grad_norm": 0.18859586119651794, + "learning_rate": 3.274596331986574e-05, + "loss": 0.1191, + "num_input_tokens_seen": 11473816, + "step": 15990 + }, + { + "epoch": 33.25363825363825, + "grad_norm": 0.5611175894737244, + "learning_rate": 3.273662837179932e-05, + "loss": 0.0686, + "num_input_tokens_seen": 11477592, + "step": 15995 + }, + { + "epoch": 33.264033264033266, + "grad_norm": 0.41490817070007324, + "learning_rate": 3.272729223064758e-05, + "loss": 0.0739, + "num_input_tokens_seen": 11481080, + "step": 16000 + }, + { + "epoch": 33.264033264033266, + "eval_loss": 0.1453711837530136, + "eval_runtime": 7.7499, + "eval_samples_per_second": 110.454, + "eval_steps_per_second": 27.613, + "num_input_tokens_seen": 11481080, + "step": 16000 + }, + { + "epoch": 33.274428274428274, + "grad_norm": 0.9718169569969177, + "learning_rate": 3.2717954897850264e-05, + "loss": 0.1503, + "num_input_tokens_seen": 11484664, + "step": 16005 + }, + { + "epoch": 33.28482328482328, + "grad_norm": 0.24440810084342957, + "learning_rate": 3.270861637484733e-05, + "loss": 0.1056, + "num_input_tokens_seen": 11488120, + "step": 16010 + }, + { + "epoch": 33.2952182952183, + "grad_norm": 0.27634376287460327, + "learning_rate": 3.2699276663078867e-05, + "loss": 0.1214, + "num_input_tokens_seen": 11491480, + "step": 16015 + }, + { + "epoch": 33.305613305613306, + "grad_norm": 0.2158578336238861, + "learning_rate": 3.268993576398519e-05, + "loss": 0.091, + "num_input_tokens_seen": 11495064, + "step": 16020 + }, + { + "epoch": 33.316008316008315, + "grad_norm": 0.2431740015745163, + "learning_rate": 3.268059367900678e-05, + "loss": 0.1111, + "num_input_tokens_seen": 11498648, + "step": 16025 + }, + { + "epoch": 33.32640332640332, + "grad_norm": 0.4671292304992676, + "learning_rate": 3.26712504095843e-05, + "loss": 0.1272, + "num_input_tokens_seen": 11502296, + "step": 16030 + }, + { + "epoch": 33.33679833679834, + "grad_norm": 0.2081066071987152, + "learning_rate": 3.2661905957158615e-05, + "loss": 0.1449, + "num_input_tokens_seen": 11505848, + "step": 16035 + }, + { + "epoch": 33.34719334719335, + "grad_norm": 0.20414593815803528, + "learning_rate": 3.2652560323170734e-05, + "loss": 0.0959, + "num_input_tokens_seen": 11509240, + "step": 16040 + }, + { + "epoch": 33.357588357588355, + "grad_norm": 0.27395400404930115, + "learning_rate": 3.264321350906189e-05, + "loss": 0.0651, + "num_input_tokens_seen": 11512632, + "step": 16045 + }, + { + "epoch": 33.36798336798337, + "grad_norm": 0.3776133954524994, + "learning_rate": 3.263386551627346e-05, + "loss": 0.1297, + "num_input_tokens_seen": 11516184, + "step": 16050 + }, + { + "epoch": 33.37837837837838, + "grad_norm": 1.0116713047027588, + "learning_rate": 3.2624516346247055e-05, + "loss": 0.1465, + "num_input_tokens_seen": 11519736, + "step": 16055 + }, + { + "epoch": 33.38877338877339, + "grad_norm": 0.1352328360080719, + "learning_rate": 3.2615166000424404e-05, + "loss": 0.0824, + "num_input_tokens_seen": 11523160, + "step": 16060 + }, + { + "epoch": 33.3991683991684, + "grad_norm": 0.4552645981311798, + "learning_rate": 3.260581448024745e-05, + "loss": 0.0843, + "num_input_tokens_seen": 11526712, + "step": 16065 + }, + { + "epoch": 33.40956340956341, + "grad_norm": 0.41140270233154297, + "learning_rate": 3.2596461787158335e-05, + "loss": 0.1269, + "num_input_tokens_seen": 11530456, + "step": 16070 + }, + { + "epoch": 33.41995841995842, + "grad_norm": 0.3531512916088104, + "learning_rate": 3.258710792259934e-05, + "loss": 0.1057, + "num_input_tokens_seen": 11533976, + "step": 16075 + }, + { + "epoch": 33.43035343035343, + "grad_norm": 0.3321774899959564, + "learning_rate": 3.257775288801296e-05, + "loss": 0.0948, + "num_input_tokens_seen": 11537592, + "step": 16080 + }, + { + "epoch": 33.44074844074844, + "grad_norm": 0.196044459939003, + "learning_rate": 3.256839668484186e-05, + "loss": 0.1079, + "num_input_tokens_seen": 11541208, + "step": 16085 + }, + { + "epoch": 33.45114345114345, + "grad_norm": 0.7077928781509399, + "learning_rate": 3.255903931452888e-05, + "loss": 0.1207, + "num_input_tokens_seen": 11544792, + "step": 16090 + }, + { + "epoch": 33.46153846153846, + "grad_norm": 0.3917582333087921, + "learning_rate": 3.2549680778517045e-05, + "loss": 0.1504, + "num_input_tokens_seen": 11548376, + "step": 16095 + }, + { + "epoch": 33.471933471933475, + "grad_norm": 0.405590295791626, + "learning_rate": 3.2540321078249556e-05, + "loss": 0.1048, + "num_input_tokens_seen": 11552024, + "step": 16100 + }, + { + "epoch": 33.482328482328484, + "grad_norm": 0.4040853679180145, + "learning_rate": 3.2530960215169795e-05, + "loss": 0.1066, + "num_input_tokens_seen": 11555704, + "step": 16105 + }, + { + "epoch": 33.49272349272349, + "grad_norm": 0.2759716808795929, + "learning_rate": 3.2521598190721345e-05, + "loss": 0.1184, + "num_input_tokens_seen": 11559192, + "step": 16110 + }, + { + "epoch": 33.5031185031185, + "grad_norm": 0.5027992725372314, + "learning_rate": 3.251223500634792e-05, + "loss": 0.1149, + "num_input_tokens_seen": 11562712, + "step": 16115 + }, + { + "epoch": 33.513513513513516, + "grad_norm": 0.24815762042999268, + "learning_rate": 3.2502870663493445e-05, + "loss": 0.0983, + "num_input_tokens_seen": 11566456, + "step": 16120 + }, + { + "epoch": 33.523908523908524, + "grad_norm": 0.5599642395973206, + "learning_rate": 3.249350516360203e-05, + "loss": 0.0778, + "num_input_tokens_seen": 11570008, + "step": 16125 + }, + { + "epoch": 33.53430353430353, + "grad_norm": 0.2622649371623993, + "learning_rate": 3.248413850811797e-05, + "loss": 0.1101, + "num_input_tokens_seen": 11573656, + "step": 16130 + }, + { + "epoch": 33.54469854469855, + "grad_norm": 0.5164893269538879, + "learning_rate": 3.2474770698485677e-05, + "loss": 0.1462, + "num_input_tokens_seen": 11577272, + "step": 16135 + }, + { + "epoch": 33.555093555093556, + "grad_norm": 0.28538697957992554, + "learning_rate": 3.246540173614983e-05, + "loss": 0.1111, + "num_input_tokens_seen": 11580952, + "step": 16140 + }, + { + "epoch": 33.565488565488565, + "grad_norm": 0.3097423315048218, + "learning_rate": 3.2456031622555197e-05, + "loss": 0.1063, + "num_input_tokens_seen": 11584504, + "step": 16145 + }, + { + "epoch": 33.57588357588357, + "grad_norm": 0.3848194181919098, + "learning_rate": 3.2446660359146794e-05, + "loss": 0.1318, + "num_input_tokens_seen": 11588152, + "step": 16150 + }, + { + "epoch": 33.58627858627859, + "grad_norm": 0.424970805644989, + "learning_rate": 3.2437287947369786e-05, + "loss": 0.1199, + "num_input_tokens_seen": 11591736, + "step": 16155 + }, + { + "epoch": 33.5966735966736, + "grad_norm": 0.26935482025146484, + "learning_rate": 3.2427914388669525e-05, + "loss": 0.1257, + "num_input_tokens_seen": 11595352, + "step": 16160 + }, + { + "epoch": 33.607068607068605, + "grad_norm": 0.6728416681289673, + "learning_rate": 3.241853968449151e-05, + "loss": 0.117, + "num_input_tokens_seen": 11598872, + "step": 16165 + }, + { + "epoch": 33.61746361746362, + "grad_norm": 0.1378238946199417, + "learning_rate": 3.240916383628144e-05, + "loss": 0.1288, + "num_input_tokens_seen": 11602616, + "step": 16170 + }, + { + "epoch": 33.62785862785863, + "grad_norm": 0.32399728894233704, + "learning_rate": 3.239978684548521e-05, + "loss": 0.1195, + "num_input_tokens_seen": 11606328, + "step": 16175 + }, + { + "epoch": 33.63825363825364, + "grad_norm": 0.2977190911769867, + "learning_rate": 3.239040871354885e-05, + "loss": 0.1363, + "num_input_tokens_seen": 11610104, + "step": 16180 + }, + { + "epoch": 33.648648648648646, + "grad_norm": 0.4575028121471405, + "learning_rate": 3.2381029441918596e-05, + "loss": 0.1024, + "num_input_tokens_seen": 11613880, + "step": 16185 + }, + { + "epoch": 33.65904365904366, + "grad_norm": 0.24857573211193085, + "learning_rate": 3.2371649032040845e-05, + "loss": 0.0991, + "num_input_tokens_seen": 11617400, + "step": 16190 + }, + { + "epoch": 33.66943866943867, + "grad_norm": 0.20964136719703674, + "learning_rate": 3.2362267485362174e-05, + "loss": 0.1127, + "num_input_tokens_seen": 11620824, + "step": 16195 + }, + { + "epoch": 33.67983367983368, + "grad_norm": 0.3688659965991974, + "learning_rate": 3.235288480332934e-05, + "loss": 0.0903, + "num_input_tokens_seen": 11624376, + "step": 16200 + }, + { + "epoch": 33.67983367983368, + "eval_loss": 0.14391747117042542, + "eval_runtime": 7.7456, + "eval_samples_per_second": 110.515, + "eval_steps_per_second": 27.629, + "num_input_tokens_seen": 11624376, + "step": 16200 + }, + { + "epoch": 33.69022869022869, + "grad_norm": 0.20503611862659454, + "learning_rate": 3.234350098738927e-05, + "loss": 0.0957, + "num_input_tokens_seen": 11627992, + "step": 16205 + }, + { + "epoch": 33.7006237006237, + "grad_norm": 0.21510061621665955, + "learning_rate": 3.233411603898906e-05, + "loss": 0.0712, + "num_input_tokens_seen": 11631320, + "step": 16210 + }, + { + "epoch": 33.71101871101871, + "grad_norm": 0.24153146147727966, + "learning_rate": 3.232472995957599e-05, + "loss": 0.1104, + "num_input_tokens_seen": 11634968, + "step": 16215 + }, + { + "epoch": 33.72141372141372, + "grad_norm": 0.7379464507102966, + "learning_rate": 3.231534275059751e-05, + "loss": 0.1115, + "num_input_tokens_seen": 11638424, + "step": 16220 + }, + { + "epoch": 33.731808731808734, + "grad_norm": 0.23059949278831482, + "learning_rate": 3.230595441350125e-05, + "loss": 0.1238, + "num_input_tokens_seen": 11642072, + "step": 16225 + }, + { + "epoch": 33.74220374220374, + "grad_norm": 0.2468891739845276, + "learning_rate": 3.2296564949735e-05, + "loss": 0.1138, + "num_input_tokens_seen": 11645592, + "step": 16230 + }, + { + "epoch": 33.75259875259875, + "grad_norm": 0.3916254937648773, + "learning_rate": 3.228717436074675e-05, + "loss": 0.1014, + "num_input_tokens_seen": 11649112, + "step": 16235 + }, + { + "epoch": 33.762993762993766, + "grad_norm": 0.20077654719352722, + "learning_rate": 3.227778264798463e-05, + "loss": 0.08, + "num_input_tokens_seen": 11652728, + "step": 16240 + }, + { + "epoch": 33.773388773388774, + "grad_norm": 0.3969309628009796, + "learning_rate": 3.226838981289698e-05, + "loss": 0.1099, + "num_input_tokens_seen": 11656376, + "step": 16245 + }, + { + "epoch": 33.78378378378378, + "grad_norm": 0.3344154953956604, + "learning_rate": 3.225899585693227e-05, + "loss": 0.1332, + "num_input_tokens_seen": 11659928, + "step": 16250 + }, + { + "epoch": 33.79417879417879, + "grad_norm": 0.19982142746448517, + "learning_rate": 3.224960078153918e-05, + "loss": 0.0996, + "num_input_tokens_seen": 11663544, + "step": 16255 + }, + { + "epoch": 33.804573804573806, + "grad_norm": 0.42078515887260437, + "learning_rate": 3.224020458816655e-05, + "loss": 0.132, + "num_input_tokens_seen": 11667224, + "step": 16260 + }, + { + "epoch": 33.814968814968815, + "grad_norm": 0.3675118386745453, + "learning_rate": 3.223080727826337e-05, + "loss": 0.1073, + "num_input_tokens_seen": 11670744, + "step": 16265 + }, + { + "epoch": 33.82536382536382, + "grad_norm": 0.18993282318115234, + "learning_rate": 3.222140885327885e-05, + "loss": 0.1033, + "num_input_tokens_seen": 11674328, + "step": 16270 + }, + { + "epoch": 33.83575883575884, + "grad_norm": 0.35988494753837585, + "learning_rate": 3.221200931466234e-05, + "loss": 0.0707, + "num_input_tokens_seen": 11677880, + "step": 16275 + }, + { + "epoch": 33.84615384615385, + "grad_norm": 0.2219291627407074, + "learning_rate": 3.220260866386336e-05, + "loss": 0.1167, + "num_input_tokens_seen": 11681464, + "step": 16280 + }, + { + "epoch": 33.856548856548855, + "grad_norm": 0.3653600513935089, + "learning_rate": 3.21932069023316e-05, + "loss": 0.1197, + "num_input_tokens_seen": 11685016, + "step": 16285 + }, + { + "epoch": 33.86694386694387, + "grad_norm": 0.3620465397834778, + "learning_rate": 3.218380403151695e-05, + "loss": 0.1283, + "num_input_tokens_seen": 11688632, + "step": 16290 + }, + { + "epoch": 33.87733887733888, + "grad_norm": 0.5196066498756409, + "learning_rate": 3.217440005286943e-05, + "loss": 0.1221, + "num_input_tokens_seen": 11692216, + "step": 16295 + }, + { + "epoch": 33.88773388773389, + "grad_norm": 0.43824881315231323, + "learning_rate": 3.216499496783928e-05, + "loss": 0.1491, + "num_input_tokens_seen": 11695800, + "step": 16300 + }, + { + "epoch": 33.898128898128896, + "grad_norm": 0.5104526281356812, + "learning_rate": 3.2155588777876856e-05, + "loss": 0.1434, + "num_input_tokens_seen": 11699416, + "step": 16305 + }, + { + "epoch": 33.90852390852391, + "grad_norm": 0.9445152878761292, + "learning_rate": 3.214618148443273e-05, + "loss": 0.155, + "num_input_tokens_seen": 11703000, + "step": 16310 + }, + { + "epoch": 33.91891891891892, + "grad_norm": 0.19147184491157532, + "learning_rate": 3.2136773088957595e-05, + "loss": 0.1212, + "num_input_tokens_seen": 11706584, + "step": 16315 + }, + { + "epoch": 33.92931392931393, + "grad_norm": 0.13145920634269714, + "learning_rate": 3.2127363592902374e-05, + "loss": 0.0948, + "num_input_tokens_seen": 11710072, + "step": 16320 + }, + { + "epoch": 33.93970893970894, + "grad_norm": 0.3328855037689209, + "learning_rate": 3.211795299771812e-05, + "loss": 0.1018, + "num_input_tokens_seen": 11713656, + "step": 16325 + }, + { + "epoch": 33.95010395010395, + "grad_norm": 0.29434895515441895, + "learning_rate": 3.210854130485605e-05, + "loss": 0.1242, + "num_input_tokens_seen": 11717240, + "step": 16330 + }, + { + "epoch": 33.96049896049896, + "grad_norm": 0.1639203578233719, + "learning_rate": 3.209912851576759e-05, + "loss": 0.1099, + "num_input_tokens_seen": 11720856, + "step": 16335 + }, + { + "epoch": 33.97089397089397, + "grad_norm": 0.12746815383434296, + "learning_rate": 3.208971463190431e-05, + "loss": 0.1177, + "num_input_tokens_seen": 11724248, + "step": 16340 + }, + { + "epoch": 33.981288981288984, + "grad_norm": 0.4865598678588867, + "learning_rate": 3.208029965471793e-05, + "loss": 0.1001, + "num_input_tokens_seen": 11727736, + "step": 16345 + }, + { + "epoch": 33.99168399168399, + "grad_norm": 0.6253346800804138, + "learning_rate": 3.2070883585660364e-05, + "loss": 0.1215, + "num_input_tokens_seen": 11731288, + "step": 16350 + }, + { + "epoch": 34.002079002079, + "grad_norm": 0.16306735575199127, + "learning_rate": 3.20614664261837e-05, + "loss": 0.0901, + "num_input_tokens_seen": 11734736, + "step": 16355 + }, + { + "epoch": 34.012474012474016, + "grad_norm": 0.36569687724113464, + "learning_rate": 3.205204817774016e-05, + "loss": 0.1438, + "num_input_tokens_seen": 11738288, + "step": 16360 + }, + { + "epoch": 34.022869022869024, + "grad_norm": 0.25897714495658875, + "learning_rate": 3.204262884178218e-05, + "loss": 0.106, + "num_input_tokens_seen": 11742000, + "step": 16365 + }, + { + "epoch": 34.03326403326403, + "grad_norm": 0.21272824704647064, + "learning_rate": 3.2033208419762314e-05, + "loss": 0.1411, + "num_input_tokens_seen": 11745648, + "step": 16370 + }, + { + "epoch": 34.04365904365904, + "grad_norm": 0.17943842709064484, + "learning_rate": 3.2023786913133344e-05, + "loss": 0.1528, + "num_input_tokens_seen": 11749296, + "step": 16375 + }, + { + "epoch": 34.054054054054056, + "grad_norm": 0.6140493750572205, + "learning_rate": 3.201436432334816e-05, + "loss": 0.1496, + "num_input_tokens_seen": 11752752, + "step": 16380 + }, + { + "epoch": 34.064449064449065, + "grad_norm": 0.41507771611213684, + "learning_rate": 3.2004940651859844e-05, + "loss": 0.1072, + "num_input_tokens_seen": 11756272, + "step": 16385 + }, + { + "epoch": 34.07484407484407, + "grad_norm": 0.25816723704338074, + "learning_rate": 3.1995515900121655e-05, + "loss": 0.0978, + "num_input_tokens_seen": 11759824, + "step": 16390 + }, + { + "epoch": 34.08523908523909, + "grad_norm": 0.3261186182498932, + "learning_rate": 3.1986090069587e-05, + "loss": 0.1276, + "num_input_tokens_seen": 11763408, + "step": 16395 + }, + { + "epoch": 34.0956340956341, + "grad_norm": 0.46925705671310425, + "learning_rate": 3.1976663161709466e-05, + "loss": 0.0906, + "num_input_tokens_seen": 11766832, + "step": 16400 + }, + { + "epoch": 34.0956340956341, + "eval_loss": 0.14292724430561066, + "eval_runtime": 7.7551, + "eval_samples_per_second": 110.38, + "eval_steps_per_second": 27.595, + "num_input_tokens_seen": 11766832, + "step": 16400 + }, + { + "epoch": 34.106029106029105, + "grad_norm": 0.17847660183906555, + "learning_rate": 3.196723517794279e-05, + "loss": 0.0922, + "num_input_tokens_seen": 11770352, + "step": 16405 + }, + { + "epoch": 34.11642411642411, + "grad_norm": 0.15077881515026093, + "learning_rate": 3.19578061197409e-05, + "loss": 0.0745, + "num_input_tokens_seen": 11773904, + "step": 16410 + }, + { + "epoch": 34.12681912681913, + "grad_norm": 0.20353026688098907, + "learning_rate": 3.194837598855787e-05, + "loss": 0.0828, + "num_input_tokens_seen": 11777488, + "step": 16415 + }, + { + "epoch": 34.13721413721414, + "grad_norm": 0.5568561553955078, + "learning_rate": 3.193894478584794e-05, + "loss": 0.1128, + "num_input_tokens_seen": 11781072, + "step": 16420 + }, + { + "epoch": 34.147609147609145, + "grad_norm": 0.5172379612922668, + "learning_rate": 3.192951251306553e-05, + "loss": 0.1198, + "num_input_tokens_seen": 11784656, + "step": 16425 + }, + { + "epoch": 34.15800415800416, + "grad_norm": 0.19558382034301758, + "learning_rate": 3.192007917166521e-05, + "loss": 0.101, + "num_input_tokens_seen": 11788208, + "step": 16430 + }, + { + "epoch": 34.16839916839917, + "grad_norm": 0.38089263439178467, + "learning_rate": 3.191064476310171e-05, + "loss": 0.0968, + "num_input_tokens_seen": 11791792, + "step": 16435 + }, + { + "epoch": 34.17879417879418, + "grad_norm": 0.3612632155418396, + "learning_rate": 3.1901209288829944e-05, + "loss": 0.1236, + "num_input_tokens_seen": 11795504, + "step": 16440 + }, + { + "epoch": 34.189189189189186, + "grad_norm": 0.33688050508499146, + "learning_rate": 3.1891772750304985e-05, + "loss": 0.1474, + "num_input_tokens_seen": 11799344, + "step": 16445 + }, + { + "epoch": 34.1995841995842, + "grad_norm": 0.19785204529762268, + "learning_rate": 3.188233514898206e-05, + "loss": 0.1028, + "num_input_tokens_seen": 11802736, + "step": 16450 + }, + { + "epoch": 34.20997920997921, + "grad_norm": 0.3304867744445801, + "learning_rate": 3.187289648631657e-05, + "loss": 0.1257, + "num_input_tokens_seen": 11806352, + "step": 16455 + }, + { + "epoch": 34.22037422037422, + "grad_norm": 0.5315737128257751, + "learning_rate": 3.186345676376406e-05, + "loss": 0.1751, + "num_input_tokens_seen": 11810192, + "step": 16460 + }, + { + "epoch": 34.23076923076923, + "grad_norm": 0.4822412431240082, + "learning_rate": 3.1854015982780275e-05, + "loss": 0.0992, + "num_input_tokens_seen": 11813616, + "step": 16465 + }, + { + "epoch": 34.24116424116424, + "grad_norm": 0.4345518946647644, + "learning_rate": 3.1844574144821084e-05, + "loss": 0.082, + "num_input_tokens_seen": 11817168, + "step": 16470 + }, + { + "epoch": 34.25155925155925, + "grad_norm": 0.6092364192008972, + "learning_rate": 3.1835131251342554e-05, + "loss": 0.0883, + "num_input_tokens_seen": 11820784, + "step": 16475 + }, + { + "epoch": 34.26195426195426, + "grad_norm": 0.38851219415664673, + "learning_rate": 3.182568730380089e-05, + "loss": 0.1429, + "num_input_tokens_seen": 11824560, + "step": 16480 + }, + { + "epoch": 34.272349272349274, + "grad_norm": 0.2748623192310333, + "learning_rate": 3.181624230365245e-05, + "loss": 0.1075, + "num_input_tokens_seen": 11828208, + "step": 16485 + }, + { + "epoch": 34.28274428274428, + "grad_norm": 0.1873142123222351, + "learning_rate": 3.180679625235381e-05, + "loss": 0.0817, + "num_input_tokens_seen": 11831792, + "step": 16490 + }, + { + "epoch": 34.29313929313929, + "grad_norm": 0.5428599715232849, + "learning_rate": 3.1797349151361646e-05, + "loss": 0.0822, + "num_input_tokens_seen": 11835376, + "step": 16495 + }, + { + "epoch": 34.303534303534306, + "grad_norm": 0.4415946304798126, + "learning_rate": 3.178790100213281e-05, + "loss": 0.1396, + "num_input_tokens_seen": 11838800, + "step": 16500 + }, + { + "epoch": 34.313929313929314, + "grad_norm": 0.23185963928699493, + "learning_rate": 3.1778451806124346e-05, + "loss": 0.1068, + "num_input_tokens_seen": 11842448, + "step": 16505 + }, + { + "epoch": 34.32432432432432, + "grad_norm": 0.21504971385002136, + "learning_rate": 3.176900156479342e-05, + "loss": 0.0943, + "num_input_tokens_seen": 11846352, + "step": 16510 + }, + { + "epoch": 34.33471933471934, + "grad_norm": 0.24072033166885376, + "learning_rate": 3.17595502795974e-05, + "loss": 0.0981, + "num_input_tokens_seen": 11849936, + "step": 16515 + }, + { + "epoch": 34.34511434511435, + "grad_norm": 0.6524914503097534, + "learning_rate": 3.175009795199377e-05, + "loss": 0.1363, + "num_input_tokens_seen": 11853552, + "step": 16520 + }, + { + "epoch": 34.355509355509355, + "grad_norm": 0.32358551025390625, + "learning_rate": 3.1740644583440224e-05, + "loss": 0.1191, + "num_input_tokens_seen": 11857040, + "step": 16525 + }, + { + "epoch": 34.36590436590436, + "grad_norm": 0.5050221085548401, + "learning_rate": 3.173119017539457e-05, + "loss": 0.0859, + "num_input_tokens_seen": 11860464, + "step": 16530 + }, + { + "epoch": 34.37629937629938, + "grad_norm": 0.17022691667079926, + "learning_rate": 3.172173472931479e-05, + "loss": 0.0912, + "num_input_tokens_seen": 11864112, + "step": 16535 + }, + { + "epoch": 34.38669438669439, + "grad_norm": 0.22415763139724731, + "learning_rate": 3.1712278246659055e-05, + "loss": 0.0953, + "num_input_tokens_seen": 11867664, + "step": 16540 + }, + { + "epoch": 34.397089397089395, + "grad_norm": 0.4232676029205322, + "learning_rate": 3.170282072888566e-05, + "loss": 0.1406, + "num_input_tokens_seen": 11871312, + "step": 16545 + }, + { + "epoch": 34.40748440748441, + "grad_norm": 0.22804692387580872, + "learning_rate": 3.169336217745307e-05, + "loss": 0.1177, + "num_input_tokens_seen": 11874928, + "step": 16550 + }, + { + "epoch": 34.41787941787942, + "grad_norm": 0.2257530391216278, + "learning_rate": 3.1683902593819924e-05, + "loss": 0.0951, + "num_input_tokens_seen": 11878544, + "step": 16555 + }, + { + "epoch": 34.42827442827443, + "grad_norm": 0.5373679995536804, + "learning_rate": 3.1674441979445e-05, + "loss": 0.1333, + "num_input_tokens_seen": 11882000, + "step": 16560 + }, + { + "epoch": 34.438669438669436, + "grad_norm": 0.44481173157691956, + "learning_rate": 3.166498033578725e-05, + "loss": 0.1193, + "num_input_tokens_seen": 11885680, + "step": 16565 + }, + { + "epoch": 34.44906444906445, + "grad_norm": 0.4380831718444824, + "learning_rate": 3.165551766430578e-05, + "loss": 0.1234, + "num_input_tokens_seen": 11889264, + "step": 16570 + }, + { + "epoch": 34.45945945945946, + "grad_norm": 0.41426563262939453, + "learning_rate": 3.164605396645984e-05, + "loss": 0.1282, + "num_input_tokens_seen": 11892720, + "step": 16575 + }, + { + "epoch": 34.46985446985447, + "grad_norm": 0.6311758756637573, + "learning_rate": 3.163658924370886e-05, + "loss": 0.1336, + "num_input_tokens_seen": 11896240, + "step": 16580 + }, + { + "epoch": 34.48024948024948, + "grad_norm": 0.15821188688278198, + "learning_rate": 3.1627123497512415e-05, + "loss": 0.1029, + "num_input_tokens_seen": 11900016, + "step": 16585 + }, + { + "epoch": 34.49064449064449, + "grad_norm": 0.2982153594493866, + "learning_rate": 3.1617656729330245e-05, + "loss": 0.0694, + "num_input_tokens_seen": 11903600, + "step": 16590 + }, + { + "epoch": 34.5010395010395, + "grad_norm": 0.5365673899650574, + "learning_rate": 3.1608188940622255e-05, + "loss": 0.0959, + "num_input_tokens_seen": 11907088, + "step": 16595 + }, + { + "epoch": 34.51143451143451, + "grad_norm": 0.5774056315422058, + "learning_rate": 3.159872013284847e-05, + "loss": 0.1062, + "num_input_tokens_seen": 11910672, + "step": 16600 + }, + { + "epoch": 34.51143451143451, + "eval_loss": 0.14633497595787048, + "eval_runtime": 7.7604, + "eval_samples_per_second": 110.304, + "eval_steps_per_second": 27.576, + "num_input_tokens_seen": 11910672, + "step": 16600 + }, + { + "epoch": 34.521829521829524, + "grad_norm": 0.16955579817295074, + "learning_rate": 3.1589250307469134e-05, + "loss": 0.1571, + "num_input_tokens_seen": 11914352, + "step": 16605 + }, + { + "epoch": 34.53222453222453, + "grad_norm": 0.27418920397758484, + "learning_rate": 3.1579779465944586e-05, + "loss": 0.1126, + "num_input_tokens_seen": 11917904, + "step": 16610 + }, + { + "epoch": 34.54261954261954, + "grad_norm": 0.4652021825313568, + "learning_rate": 3.1570307609735363e-05, + "loss": 0.1222, + "num_input_tokens_seen": 11921488, + "step": 16615 + }, + { + "epoch": 34.553014553014556, + "grad_norm": 0.848962664604187, + "learning_rate": 3.156083474030213e-05, + "loss": 0.0936, + "num_input_tokens_seen": 11925232, + "step": 16620 + }, + { + "epoch": 34.563409563409564, + "grad_norm": 0.5306152701377869, + "learning_rate": 3.155136085910573e-05, + "loss": 0.1295, + "num_input_tokens_seen": 11928912, + "step": 16625 + }, + { + "epoch": 34.57380457380457, + "grad_norm": 0.19796594977378845, + "learning_rate": 3.154188596760717e-05, + "loss": 0.1473, + "num_input_tokens_seen": 11932688, + "step": 16630 + }, + { + "epoch": 34.58419958419958, + "grad_norm": 0.2561175525188446, + "learning_rate": 3.153241006726757e-05, + "loss": 0.0934, + "num_input_tokens_seen": 11936208, + "step": 16635 + }, + { + "epoch": 34.5945945945946, + "grad_norm": 0.35158249735832214, + "learning_rate": 3.152293315954825e-05, + "loss": 0.1265, + "num_input_tokens_seen": 11940048, + "step": 16640 + }, + { + "epoch": 34.604989604989605, + "grad_norm": 0.3250681757926941, + "learning_rate": 3.1513455245910666e-05, + "loss": 0.1206, + "num_input_tokens_seen": 11943728, + "step": 16645 + }, + { + "epoch": 34.61538461538461, + "grad_norm": 0.20912082493305206, + "learning_rate": 3.150397632781643e-05, + "loss": 0.1107, + "num_input_tokens_seen": 11947312, + "step": 16650 + }, + { + "epoch": 34.62577962577963, + "grad_norm": 0.33906036615371704, + "learning_rate": 3.149449640672731e-05, + "loss": 0.116, + "num_input_tokens_seen": 11950960, + "step": 16655 + }, + { + "epoch": 34.63617463617464, + "grad_norm": 0.24979056417942047, + "learning_rate": 3.148501548410523e-05, + "loss": 0.1059, + "num_input_tokens_seen": 11954512, + "step": 16660 + }, + { + "epoch": 34.646569646569645, + "grad_norm": 0.5931586027145386, + "learning_rate": 3.1475533561412256e-05, + "loss": 0.0982, + "num_input_tokens_seen": 11958128, + "step": 16665 + }, + { + "epoch": 34.656964656964654, + "grad_norm": 0.4365948438644409, + "learning_rate": 3.146605064011065e-05, + "loss": 0.1288, + "num_input_tokens_seen": 11961744, + "step": 16670 + }, + { + "epoch": 34.66735966735967, + "grad_norm": 0.17474058270454407, + "learning_rate": 3.145656672166277e-05, + "loss": 0.0885, + "num_input_tokens_seen": 11965360, + "step": 16675 + }, + { + "epoch": 34.67775467775468, + "grad_norm": 0.28069743514060974, + "learning_rate": 3.144708180753116e-05, + "loss": 0.0717, + "num_input_tokens_seen": 11968880, + "step": 16680 + }, + { + "epoch": 34.688149688149686, + "grad_norm": 0.19185276329517365, + "learning_rate": 3.143759589917851e-05, + "loss": 0.128, + "num_input_tokens_seen": 11972464, + "step": 16685 + }, + { + "epoch": 34.6985446985447, + "grad_norm": 0.19686517119407654, + "learning_rate": 3.142810899806768e-05, + "loss": 0.1152, + "num_input_tokens_seen": 11975984, + "step": 16690 + }, + { + "epoch": 34.70893970893971, + "grad_norm": 0.3279672861099243, + "learning_rate": 3.141862110566166e-05, + "loss": 0.1079, + "num_input_tokens_seen": 11979504, + "step": 16695 + }, + { + "epoch": 34.71933471933472, + "grad_norm": 0.38214996457099915, + "learning_rate": 3.1409132223423606e-05, + "loss": 0.093, + "num_input_tokens_seen": 11983024, + "step": 16700 + }, + { + "epoch": 34.729729729729726, + "grad_norm": 0.1579667627811432, + "learning_rate": 3.139964235281682e-05, + "loss": 0.115, + "num_input_tokens_seen": 11986768, + "step": 16705 + }, + { + "epoch": 34.74012474012474, + "grad_norm": 0.1671779602766037, + "learning_rate": 3.139015149530476e-05, + "loss": 0.1079, + "num_input_tokens_seen": 11990320, + "step": 16710 + }, + { + "epoch": 34.75051975051975, + "grad_norm": 0.3891982436180115, + "learning_rate": 3.1380659652351034e-05, + "loss": 0.1097, + "num_input_tokens_seen": 11993872, + "step": 16715 + }, + { + "epoch": 34.76091476091476, + "grad_norm": 0.9436984658241272, + "learning_rate": 3.137116682541941e-05, + "loss": 0.1034, + "num_input_tokens_seen": 11997456, + "step": 16720 + }, + { + "epoch": 34.771309771309774, + "grad_norm": 0.2554793655872345, + "learning_rate": 3.136167301597379e-05, + "loss": 0.1193, + "num_input_tokens_seen": 12001104, + "step": 16725 + }, + { + "epoch": 34.78170478170478, + "grad_norm": 0.3739221394062042, + "learning_rate": 3.1352178225478254e-05, + "loss": 0.1186, + "num_input_tokens_seen": 12004816, + "step": 16730 + }, + { + "epoch": 34.79209979209979, + "grad_norm": 0.23404249548912048, + "learning_rate": 3.1342682455396996e-05, + "loss": 0.1053, + "num_input_tokens_seen": 12008176, + "step": 16735 + }, + { + "epoch": 34.802494802494806, + "grad_norm": 0.24432958662509918, + "learning_rate": 3.133318570719441e-05, + "loss": 0.1011, + "num_input_tokens_seen": 12011792, + "step": 16740 + }, + { + "epoch": 34.812889812889814, + "grad_norm": 0.2016676515340805, + "learning_rate": 3.132368798233499e-05, + "loss": 0.0813, + "num_input_tokens_seen": 12015376, + "step": 16745 + }, + { + "epoch": 34.82328482328482, + "grad_norm": 0.2216041386127472, + "learning_rate": 3.131418928228342e-05, + "loss": 0.0757, + "num_input_tokens_seen": 12018832, + "step": 16750 + }, + { + "epoch": 34.83367983367983, + "grad_norm": 0.7938005924224854, + "learning_rate": 3.1304689608504514e-05, + "loss": 0.1025, + "num_input_tokens_seen": 12022352, + "step": 16755 + }, + { + "epoch": 34.84407484407485, + "grad_norm": 0.49888351559638977, + "learning_rate": 3.129518896246324e-05, + "loss": 0.1094, + "num_input_tokens_seen": 12025968, + "step": 16760 + }, + { + "epoch": 34.854469854469855, + "grad_norm": 0.49620378017425537, + "learning_rate": 3.128568734562472e-05, + "loss": 0.1596, + "num_input_tokens_seen": 12029488, + "step": 16765 + }, + { + "epoch": 34.86486486486486, + "grad_norm": 0.3750350773334503, + "learning_rate": 3.127618475945421e-05, + "loss": 0.1241, + "num_input_tokens_seen": 12033136, + "step": 16770 + }, + { + "epoch": 34.87525987525988, + "grad_norm": 0.4983413517475128, + "learning_rate": 3.126668120541715e-05, + "loss": 0.0805, + "num_input_tokens_seen": 12036624, + "step": 16775 + }, + { + "epoch": 34.88565488565489, + "grad_norm": 0.5366518497467041, + "learning_rate": 3.1257176684979096e-05, + "loss": 0.1428, + "num_input_tokens_seen": 12040272, + "step": 16780 + }, + { + "epoch": 34.896049896049895, + "grad_norm": 0.2454678863286972, + "learning_rate": 3.124767119960576e-05, + "loss": 0.0817, + "num_input_tokens_seen": 12043824, + "step": 16785 + }, + { + "epoch": 34.906444906444904, + "grad_norm": 0.3153742253780365, + "learning_rate": 3.123816475076301e-05, + "loss": 0.1151, + "num_input_tokens_seen": 12047472, + "step": 16790 + }, + { + "epoch": 34.91683991683992, + "grad_norm": 0.6404836773872375, + "learning_rate": 3.122865733991687e-05, + "loss": 0.1272, + "num_input_tokens_seen": 12050928, + "step": 16795 + }, + { + "epoch": 34.92723492723493, + "grad_norm": 0.464626669883728, + "learning_rate": 3.1219148968533486e-05, + "loss": 0.1066, + "num_input_tokens_seen": 12054512, + "step": 16800 + }, + { + "epoch": 34.92723492723493, + "eval_loss": 0.1443641483783722, + "eval_runtime": 7.767, + "eval_samples_per_second": 110.21, + "eval_steps_per_second": 27.552, + "num_input_tokens_seen": 12054512, + "step": 16800 + }, + { + "epoch": 34.937629937629936, + "grad_norm": 0.2559433877468109, + "learning_rate": 3.120963963807918e-05, + "loss": 0.0627, + "num_input_tokens_seen": 12058000, + "step": 16805 + }, + { + "epoch": 34.94802494802495, + "grad_norm": 0.27599120140075684, + "learning_rate": 3.12001293500204e-05, + "loss": 0.1473, + "num_input_tokens_seen": 12061552, + "step": 16810 + }, + { + "epoch": 34.95841995841996, + "grad_norm": 0.3040160536766052, + "learning_rate": 3.1190618105823765e-05, + "loss": 0.1181, + "num_input_tokens_seen": 12065200, + "step": 16815 + }, + { + "epoch": 34.96881496881497, + "grad_norm": 0.2756327688694, + "learning_rate": 3.118110590695603e-05, + "loss": 0.1332, + "num_input_tokens_seen": 12068816, + "step": 16820 + }, + { + "epoch": 34.979209979209976, + "grad_norm": 0.319839209318161, + "learning_rate": 3.117159275488407e-05, + "loss": 0.0939, + "num_input_tokens_seen": 12072400, + "step": 16825 + }, + { + "epoch": 34.98960498960499, + "grad_norm": 0.1742393672466278, + "learning_rate": 3.1162078651074956e-05, + "loss": 0.1431, + "num_input_tokens_seen": 12075952, + "step": 16830 + }, + { + "epoch": 35.0, + "grad_norm": 0.9999896883964539, + "learning_rate": 3.1152563596995885e-05, + "loss": 0.1259, + "num_input_tokens_seen": 12079520, + "step": 16835 + }, + { + "epoch": 35.01039501039501, + "grad_norm": 0.5885186195373535, + "learning_rate": 3.1143047594114186e-05, + "loss": 0.1006, + "num_input_tokens_seen": 12083264, + "step": 16840 + }, + { + "epoch": 35.020790020790024, + "grad_norm": 0.28173840045928955, + "learning_rate": 3.113353064389734e-05, + "loss": 0.0907, + "num_input_tokens_seen": 12086816, + "step": 16845 + }, + { + "epoch": 35.03118503118503, + "grad_norm": 0.7134169340133667, + "learning_rate": 3.1124012747812993e-05, + "loss": 0.0843, + "num_input_tokens_seen": 12090304, + "step": 16850 + }, + { + "epoch": 35.04158004158004, + "grad_norm": 0.41464313864707947, + "learning_rate": 3.1114493907328936e-05, + "loss": 0.0898, + "num_input_tokens_seen": 12094144, + "step": 16855 + }, + { + "epoch": 35.05197505197505, + "grad_norm": 0.4626343548297882, + "learning_rate": 3.110497412391306e-05, + "loss": 0.1365, + "num_input_tokens_seen": 12097792, + "step": 16860 + }, + { + "epoch": 35.062370062370064, + "grad_norm": 0.574844241142273, + "learning_rate": 3.1095453399033466e-05, + "loss": 0.1287, + "num_input_tokens_seen": 12101408, + "step": 16865 + }, + { + "epoch": 35.07276507276507, + "grad_norm": 0.23174327611923218, + "learning_rate": 3.108593173415835e-05, + "loss": 0.0866, + "num_input_tokens_seen": 12105120, + "step": 16870 + }, + { + "epoch": 35.08316008316008, + "grad_norm": 0.28843557834625244, + "learning_rate": 3.107640913075609e-05, + "loss": 0.1168, + "num_input_tokens_seen": 12108896, + "step": 16875 + }, + { + "epoch": 35.093555093555096, + "grad_norm": 0.5208300948143005, + "learning_rate": 3.106688559029517e-05, + "loss": 0.1266, + "num_input_tokens_seen": 12112416, + "step": 16880 + }, + { + "epoch": 35.103950103950105, + "grad_norm": 0.27068912982940674, + "learning_rate": 3.105736111424425e-05, + "loss": 0.108, + "num_input_tokens_seen": 12116032, + "step": 16885 + }, + { + "epoch": 35.11434511434511, + "grad_norm": 0.22064220905303955, + "learning_rate": 3.1047835704072136e-05, + "loss": 0.1007, + "num_input_tokens_seen": 12119616, + "step": 16890 + }, + { + "epoch": 35.12474012474012, + "grad_norm": 0.18159961700439453, + "learning_rate": 3.103830936124775e-05, + "loss": 0.1009, + "num_input_tokens_seen": 12123328, + "step": 16895 + }, + { + "epoch": 35.13513513513514, + "grad_norm": 0.2728988230228424, + "learning_rate": 3.102878208724018e-05, + "loss": 0.092, + "num_input_tokens_seen": 12126976, + "step": 16900 + }, + { + "epoch": 35.145530145530145, + "grad_norm": 0.22906115651130676, + "learning_rate": 3.101925388351865e-05, + "loss": 0.0772, + "num_input_tokens_seen": 12130656, + "step": 16905 + }, + { + "epoch": 35.15592515592515, + "grad_norm": 0.2911921441555023, + "learning_rate": 3.1009724751552515e-05, + "loss": 0.1574, + "num_input_tokens_seen": 12134240, + "step": 16910 + }, + { + "epoch": 35.16632016632017, + "grad_norm": 0.23126950860023499, + "learning_rate": 3.100019469281131e-05, + "loss": 0.1108, + "num_input_tokens_seen": 12137920, + "step": 16915 + }, + { + "epoch": 35.17671517671518, + "grad_norm": 0.3433573544025421, + "learning_rate": 3.0990663708764685e-05, + "loss": 0.1109, + "num_input_tokens_seen": 12141504, + "step": 16920 + }, + { + "epoch": 35.187110187110186, + "grad_norm": 0.38242051005363464, + "learning_rate": 3.098113180088243e-05, + "loss": 0.1148, + "num_input_tokens_seen": 12145152, + "step": 16925 + }, + { + "epoch": 35.197505197505194, + "grad_norm": 0.4367828667163849, + "learning_rate": 3.097159897063448e-05, + "loss": 0.1886, + "num_input_tokens_seen": 12148672, + "step": 16930 + }, + { + "epoch": 35.20790020790021, + "grad_norm": 0.3464427590370178, + "learning_rate": 3.096206521949094e-05, + "loss": 0.1204, + "num_input_tokens_seen": 12152192, + "step": 16935 + }, + { + "epoch": 35.21829521829522, + "grad_norm": 0.280970960855484, + "learning_rate": 3.0952530548922006e-05, + "loss": 0.0697, + "num_input_tokens_seen": 12155552, + "step": 16940 + }, + { + "epoch": 35.228690228690226, + "grad_norm": 0.2884570062160492, + "learning_rate": 3.0942994960398064e-05, + "loss": 0.0925, + "num_input_tokens_seen": 12159008, + "step": 16945 + }, + { + "epoch": 35.23908523908524, + "grad_norm": 0.18330760300159454, + "learning_rate": 3.093345845538961e-05, + "loss": 0.076, + "num_input_tokens_seen": 12162496, + "step": 16950 + }, + { + "epoch": 35.24948024948025, + "grad_norm": 0.41974717378616333, + "learning_rate": 3.09239210353673e-05, + "loss": 0.1309, + "num_input_tokens_seen": 12166112, + "step": 16955 + }, + { + "epoch": 35.25987525987526, + "grad_norm": 0.15869757533073425, + "learning_rate": 3.0914382701801926e-05, + "loss": 0.0834, + "num_input_tokens_seen": 12169600, + "step": 16960 + }, + { + "epoch": 35.270270270270274, + "grad_norm": 0.15942206978797913, + "learning_rate": 3.090484345616441e-05, + "loss": 0.1324, + "num_input_tokens_seen": 12173248, + "step": 16965 + }, + { + "epoch": 35.28066528066528, + "grad_norm": 0.11775801330804825, + "learning_rate": 3.0895303299925825e-05, + "loss": 0.0872, + "num_input_tokens_seen": 12176960, + "step": 16970 + }, + { + "epoch": 35.29106029106029, + "grad_norm": 0.4241228997707367, + "learning_rate": 3.0885762234557393e-05, + "loss": 0.0962, + "num_input_tokens_seen": 12180416, + "step": 16975 + }, + { + "epoch": 35.3014553014553, + "grad_norm": 0.27875640988349915, + "learning_rate": 3.087622026153045e-05, + "loss": 0.109, + "num_input_tokens_seen": 12184128, + "step": 16980 + }, + { + "epoch": 35.311850311850314, + "grad_norm": 0.6513495445251465, + "learning_rate": 3.086667738231651e-05, + "loss": 0.0502, + "num_input_tokens_seen": 12187584, + "step": 16985 + }, + { + "epoch": 35.32224532224532, + "grad_norm": 0.5029211044311523, + "learning_rate": 3.085713359838718e-05, + "loss": 0.1033, + "num_input_tokens_seen": 12191264, + "step": 16990 + }, + { + "epoch": 35.33264033264033, + "grad_norm": 0.38164469599723816, + "learning_rate": 3.084758891121425e-05, + "loss": 0.1257, + "num_input_tokens_seen": 12194848, + "step": 16995 + }, + { + "epoch": 35.343035343035346, + "grad_norm": 0.23661264777183533, + "learning_rate": 3.083804332226963e-05, + "loss": 0.1179, + "num_input_tokens_seen": 12198464, + "step": 17000 + }, + { + "epoch": 35.343035343035346, + "eval_loss": 0.1450623869895935, + "eval_runtime": 7.7495, + "eval_samples_per_second": 110.458, + "eval_steps_per_second": 27.615, + "num_input_tokens_seen": 12198464, + "step": 17000 + }, + { + "epoch": 35.353430353430355, + "grad_norm": 0.28209224343299866, + "learning_rate": 3.082849683302536e-05, + "loss": 0.0782, + "num_input_tokens_seen": 12202048, + "step": 17005 + }, + { + "epoch": 35.36382536382536, + "grad_norm": 0.21476268768310547, + "learning_rate": 3.081894944495363e-05, + "loss": 0.0811, + "num_input_tokens_seen": 12205600, + "step": 17010 + }, + { + "epoch": 35.37422037422037, + "grad_norm": 0.23135089874267578, + "learning_rate": 3.080940115952677e-05, + "loss": 0.1163, + "num_input_tokens_seen": 12208992, + "step": 17015 + }, + { + "epoch": 35.38461538461539, + "grad_norm": 0.1716577112674713, + "learning_rate": 3.0799851978217245e-05, + "loss": 0.0931, + "num_input_tokens_seen": 12212416, + "step": 17020 + }, + { + "epoch": 35.395010395010395, + "grad_norm": 0.24885274469852448, + "learning_rate": 3.0790301902497666e-05, + "loss": 0.1329, + "num_input_tokens_seen": 12215872, + "step": 17025 + }, + { + "epoch": 35.4054054054054, + "grad_norm": 0.2773771286010742, + "learning_rate": 3.078075093384076e-05, + "loss": 0.0942, + "num_input_tokens_seen": 12219552, + "step": 17030 + }, + { + "epoch": 35.41580041580042, + "grad_norm": 0.38298720121383667, + "learning_rate": 3.077119907371942e-05, + "loss": 0.1349, + "num_input_tokens_seen": 12223072, + "step": 17035 + }, + { + "epoch": 35.42619542619543, + "grad_norm": 0.2768319547176361, + "learning_rate": 3.076164632360666e-05, + "loss": 0.0967, + "num_input_tokens_seen": 12226592, + "step": 17040 + }, + { + "epoch": 35.436590436590436, + "grad_norm": 0.514074981212616, + "learning_rate": 3.075209268497563e-05, + "loss": 0.1265, + "num_input_tokens_seen": 12230144, + "step": 17045 + }, + { + "epoch": 35.446985446985444, + "grad_norm": 0.3226972818374634, + "learning_rate": 3.074253815929961e-05, + "loss": 0.1378, + "num_input_tokens_seen": 12233792, + "step": 17050 + }, + { + "epoch": 35.45738045738046, + "grad_norm": 0.20544008910655975, + "learning_rate": 3.0732982748052054e-05, + "loss": 0.1085, + "num_input_tokens_seen": 12237472, + "step": 17055 + }, + { + "epoch": 35.46777546777547, + "grad_norm": 0.4395405650138855, + "learning_rate": 3.072342645270651e-05, + "loss": 0.1504, + "num_input_tokens_seen": 12241088, + "step": 17060 + }, + { + "epoch": 35.478170478170476, + "grad_norm": 0.2647678852081299, + "learning_rate": 3.071386927473668e-05, + "loss": 0.1445, + "num_input_tokens_seen": 12244672, + "step": 17065 + }, + { + "epoch": 35.48856548856549, + "grad_norm": 0.2801402807235718, + "learning_rate": 3.0704311215616404e-05, + "loss": 0.0985, + "num_input_tokens_seen": 12248224, + "step": 17070 + }, + { + "epoch": 35.4989604989605, + "grad_norm": 0.25204047560691833, + "learning_rate": 3.0694752276819656e-05, + "loss": 0.1081, + "num_input_tokens_seen": 12252032, + "step": 17075 + }, + { + "epoch": 35.50935550935551, + "grad_norm": 0.21269242465496063, + "learning_rate": 3.068519245982054e-05, + "loss": 0.0861, + "num_input_tokens_seen": 12255424, + "step": 17080 + }, + { + "epoch": 35.51975051975052, + "grad_norm": 0.23634923994541168, + "learning_rate": 3.0675631766093304e-05, + "loss": 0.167, + "num_input_tokens_seen": 12259232, + "step": 17085 + }, + { + "epoch": 35.53014553014553, + "grad_norm": 0.4244726002216339, + "learning_rate": 3.066607019711232e-05, + "loss": 0.1275, + "num_input_tokens_seen": 12262976, + "step": 17090 + }, + { + "epoch": 35.54054054054054, + "grad_norm": 0.1847052276134491, + "learning_rate": 3.065650775435211e-05, + "loss": 0.0828, + "num_input_tokens_seen": 12266496, + "step": 17095 + }, + { + "epoch": 35.55093555093555, + "grad_norm": 0.23721055686473846, + "learning_rate": 3.0646944439287326e-05, + "loss": 0.1024, + "num_input_tokens_seen": 12270016, + "step": 17100 + }, + { + "epoch": 35.561330561330564, + "grad_norm": 0.14792963862419128, + "learning_rate": 3.0637380253392736e-05, + "loss": 0.0887, + "num_input_tokens_seen": 12273664, + "step": 17105 + }, + { + "epoch": 35.57172557172557, + "grad_norm": 0.24041229486465454, + "learning_rate": 3.062781519814327e-05, + "loss": 0.1049, + "num_input_tokens_seen": 12277152, + "step": 17110 + }, + { + "epoch": 35.58212058212058, + "grad_norm": 0.17810271680355072, + "learning_rate": 3.0618249275013985e-05, + "loss": 0.1011, + "num_input_tokens_seen": 12280640, + "step": 17115 + }, + { + "epoch": 35.59251559251559, + "grad_norm": 0.3872023820877075, + "learning_rate": 3.060868248548005e-05, + "loss": 0.1398, + "num_input_tokens_seen": 12284256, + "step": 17120 + }, + { + "epoch": 35.602910602910605, + "grad_norm": 0.841284453868866, + "learning_rate": 3.0599114831016796e-05, + "loss": 0.1225, + "num_input_tokens_seen": 12287936, + "step": 17125 + }, + { + "epoch": 35.61330561330561, + "grad_norm": 0.4201371669769287, + "learning_rate": 3.0589546313099666e-05, + "loss": 0.1086, + "num_input_tokens_seen": 12291520, + "step": 17130 + }, + { + "epoch": 35.62370062370062, + "grad_norm": 0.6237224340438843, + "learning_rate": 3.0579976933204255e-05, + "loss": 0.1086, + "num_input_tokens_seen": 12295072, + "step": 17135 + }, + { + "epoch": 35.63409563409564, + "grad_norm": 0.3305570185184479, + "learning_rate": 3.0570406692806284e-05, + "loss": 0.0879, + "num_input_tokens_seen": 12298688, + "step": 17140 + }, + { + "epoch": 35.644490644490645, + "grad_norm": 0.3736273944377899, + "learning_rate": 3.05608355933816e-05, + "loss": 0.1074, + "num_input_tokens_seen": 12302272, + "step": 17145 + }, + { + "epoch": 35.65488565488565, + "grad_norm": 0.40262874960899353, + "learning_rate": 3.055126363640618e-05, + "loss": 0.1019, + "num_input_tokens_seen": 12305888, + "step": 17150 + }, + { + "epoch": 35.66528066528066, + "grad_norm": 0.15754733979701996, + "learning_rate": 3.0541690823356146e-05, + "loss": 0.1009, + "num_input_tokens_seen": 12309600, + "step": 17155 + }, + { + "epoch": 35.67567567567568, + "grad_norm": 0.19088102877140045, + "learning_rate": 3.053211715570775e-05, + "loss": 0.142, + "num_input_tokens_seen": 12313120, + "step": 17160 + }, + { + "epoch": 35.686070686070686, + "grad_norm": 0.7188622951507568, + "learning_rate": 3.052254263493736e-05, + "loss": 0.1205, + "num_input_tokens_seen": 12316672, + "step": 17165 + }, + { + "epoch": 35.696465696465694, + "grad_norm": 0.2590081989765167, + "learning_rate": 3.0512967262521498e-05, + "loss": 0.1226, + "num_input_tokens_seen": 12320224, + "step": 17170 + }, + { + "epoch": 35.70686070686071, + "grad_norm": 0.8459123969078064, + "learning_rate": 3.0503391039936803e-05, + "loss": 0.1374, + "num_input_tokens_seen": 12323808, + "step": 17175 + }, + { + "epoch": 35.71725571725572, + "grad_norm": 0.29741111397743225, + "learning_rate": 3.0493813968660056e-05, + "loss": 0.1398, + "num_input_tokens_seen": 12327424, + "step": 17180 + }, + { + "epoch": 35.727650727650726, + "grad_norm": 0.6160571575164795, + "learning_rate": 3.0484236050168153e-05, + "loss": 0.1218, + "num_input_tokens_seen": 12331040, + "step": 17185 + }, + { + "epoch": 35.73804573804574, + "grad_norm": 0.20199526846408844, + "learning_rate": 3.0474657285938123e-05, + "loss": 0.0781, + "num_input_tokens_seen": 12334528, + "step": 17190 + }, + { + "epoch": 35.74844074844075, + "grad_norm": 0.13445399701595306, + "learning_rate": 3.046507767744715e-05, + "loss": 0.1084, + "num_input_tokens_seen": 12337984, + "step": 17195 + }, + { + "epoch": 35.75883575883576, + "grad_norm": 0.5576385259628296, + "learning_rate": 3.045549722617252e-05, + "loss": 0.1434, + "num_input_tokens_seen": 12341536, + "step": 17200 + }, + { + "epoch": 35.75883575883576, + "eval_loss": 0.14383302628993988, + "eval_runtime": 7.7498, + "eval_samples_per_second": 110.454, + "eval_steps_per_second": 27.614, + "num_input_tokens_seen": 12341536, + "step": 17200 + }, + { + "epoch": 35.76923076923077, + "grad_norm": 0.1633196771144867, + "learning_rate": 3.0445915933591658e-05, + "loss": 0.1448, + "num_input_tokens_seen": 12345184, + "step": 17205 + }, + { + "epoch": 35.77962577962578, + "grad_norm": 0.1729091852903366, + "learning_rate": 3.0436333801182114e-05, + "loss": 0.1027, + "num_input_tokens_seen": 12348768, + "step": 17210 + }, + { + "epoch": 35.79002079002079, + "grad_norm": 0.25993314385414124, + "learning_rate": 3.0426750830421596e-05, + "loss": 0.1306, + "num_input_tokens_seen": 12352352, + "step": 17215 + }, + { + "epoch": 35.8004158004158, + "grad_norm": 0.31021562218666077, + "learning_rate": 3.0417167022787897e-05, + "loss": 0.1337, + "num_input_tokens_seen": 12355904, + "step": 17220 + }, + { + "epoch": 35.810810810810814, + "grad_norm": 0.1291225701570511, + "learning_rate": 3.0407582379758966e-05, + "loss": 0.1051, + "num_input_tokens_seen": 12359296, + "step": 17225 + }, + { + "epoch": 35.82120582120582, + "grad_norm": 0.675135612487793, + "learning_rate": 3.039799690281287e-05, + "loss": 0.098, + "num_input_tokens_seen": 12362944, + "step": 17230 + }, + { + "epoch": 35.83160083160083, + "grad_norm": 0.3411877155303955, + "learning_rate": 3.0388410593427823e-05, + "loss": 0.1206, + "num_input_tokens_seen": 12366592, + "step": 17235 + }, + { + "epoch": 35.84199584199584, + "grad_norm": 0.28708577156066895, + "learning_rate": 3.0378823453082146e-05, + "loss": 0.1196, + "num_input_tokens_seen": 12370272, + "step": 17240 + }, + { + "epoch": 35.852390852390855, + "grad_norm": 0.2346109002828598, + "learning_rate": 3.03692354832543e-05, + "loss": 0.1012, + "num_input_tokens_seen": 12373952, + "step": 17245 + }, + { + "epoch": 35.86278586278586, + "grad_norm": 0.27167314291000366, + "learning_rate": 3.0359646685422865e-05, + "loss": 0.1133, + "num_input_tokens_seen": 12377472, + "step": 17250 + }, + { + "epoch": 35.87318087318087, + "grad_norm": 0.2762356102466583, + "learning_rate": 3.035005706106656e-05, + "loss": 0.1176, + "num_input_tokens_seen": 12381056, + "step": 17255 + }, + { + "epoch": 35.88357588357589, + "grad_norm": 0.19703546166419983, + "learning_rate": 3.034046661166422e-05, + "loss": 0.1011, + "num_input_tokens_seen": 12384544, + "step": 17260 + }, + { + "epoch": 35.893970893970895, + "grad_norm": 0.40994617342948914, + "learning_rate": 3.033087533869482e-05, + "loss": 0.0958, + "num_input_tokens_seen": 12388032, + "step": 17265 + }, + { + "epoch": 35.9043659043659, + "grad_norm": 0.5644000172615051, + "learning_rate": 3.0321283243637444e-05, + "loss": 0.0994, + "num_input_tokens_seen": 12391616, + "step": 17270 + }, + { + "epoch": 35.91476091476091, + "grad_norm": 0.3316878378391266, + "learning_rate": 3.0311690327971326e-05, + "loss": 0.1085, + "num_input_tokens_seen": 12395232, + "step": 17275 + }, + { + "epoch": 35.92515592515593, + "grad_norm": 0.7684923410415649, + "learning_rate": 3.030209659317581e-05, + "loss": 0.1314, + "num_input_tokens_seen": 12398944, + "step": 17280 + }, + { + "epoch": 35.935550935550935, + "grad_norm": 0.8021697998046875, + "learning_rate": 3.0292502040730362e-05, + "loss": 0.0993, + "num_input_tokens_seen": 12402528, + "step": 17285 + }, + { + "epoch": 35.945945945945944, + "grad_norm": 0.4530142545700073, + "learning_rate": 3.0282906672114597e-05, + "loss": 0.0924, + "num_input_tokens_seen": 12406016, + "step": 17290 + }, + { + "epoch": 35.95634095634096, + "grad_norm": 0.16788169741630554, + "learning_rate": 3.027331048880823e-05, + "loss": 0.1165, + "num_input_tokens_seen": 12409664, + "step": 17295 + }, + { + "epoch": 35.96673596673597, + "grad_norm": 0.21490849554538727, + "learning_rate": 3.0263713492291123e-05, + "loss": 0.123, + "num_input_tokens_seen": 12413440, + "step": 17300 + }, + { + "epoch": 35.977130977130976, + "grad_norm": 0.2901933789253235, + "learning_rate": 3.0254115684043242e-05, + "loss": 0.0976, + "num_input_tokens_seen": 12416832, + "step": 17305 + }, + { + "epoch": 35.987525987525984, + "grad_norm": 0.3500523567199707, + "learning_rate": 3.024451706554469e-05, + "loss": 0.1112, + "num_input_tokens_seen": 12420384, + "step": 17310 + }, + { + "epoch": 35.997920997921, + "grad_norm": 0.4518726170063019, + "learning_rate": 3.0234917638275705e-05, + "loss": 0.1422, + "num_input_tokens_seen": 12424000, + "step": 17315 + }, + { + "epoch": 36.00831600831601, + "grad_norm": 0.38009974360466003, + "learning_rate": 3.0225317403716635e-05, + "loss": 0.073, + "num_input_tokens_seen": 12427512, + "step": 17320 + }, + { + "epoch": 36.018711018711016, + "grad_norm": 0.4628983736038208, + "learning_rate": 3.0215716363347956e-05, + "loss": 0.0775, + "num_input_tokens_seen": 12431032, + "step": 17325 + }, + { + "epoch": 36.02910602910603, + "grad_norm": 0.2937721312046051, + "learning_rate": 3.0206114518650275e-05, + "loss": 0.0963, + "num_input_tokens_seen": 12434584, + "step": 17330 + }, + { + "epoch": 36.03950103950104, + "grad_norm": 0.31000542640686035, + "learning_rate": 3.0196511871104304e-05, + "loss": 0.0734, + "num_input_tokens_seen": 12438040, + "step": 17335 + }, + { + "epoch": 36.04989604989605, + "grad_norm": 0.30612969398498535, + "learning_rate": 3.01869084221909e-05, + "loss": 0.1202, + "num_input_tokens_seen": 12441624, + "step": 17340 + }, + { + "epoch": 36.06029106029106, + "grad_norm": 0.27952951192855835, + "learning_rate": 3.0177304173391037e-05, + "loss": 0.1198, + "num_input_tokens_seen": 12445336, + "step": 17345 + }, + { + "epoch": 36.07068607068607, + "grad_norm": 0.6731916069984436, + "learning_rate": 3.01676991261858e-05, + "loss": 0.0947, + "num_input_tokens_seen": 12448888, + "step": 17350 + }, + { + "epoch": 36.08108108108108, + "grad_norm": 0.3465827405452728, + "learning_rate": 3.015809328205642e-05, + "loss": 0.0936, + "num_input_tokens_seen": 12452536, + "step": 17355 + }, + { + "epoch": 36.09147609147609, + "grad_norm": 0.6786193251609802, + "learning_rate": 3.0148486642484248e-05, + "loss": 0.0764, + "num_input_tokens_seen": 12456024, + "step": 17360 + }, + { + "epoch": 36.101871101871104, + "grad_norm": 0.3604417145252228, + "learning_rate": 3.0138879208950722e-05, + "loss": 0.1016, + "num_input_tokens_seen": 12459736, + "step": 17365 + }, + { + "epoch": 36.11226611226611, + "grad_norm": 0.36818963289260864, + "learning_rate": 3.012927098293744e-05, + "loss": 0.1395, + "num_input_tokens_seen": 12463480, + "step": 17370 + }, + { + "epoch": 36.12266112266112, + "grad_norm": 0.8490399718284607, + "learning_rate": 3.0119661965926123e-05, + "loss": 0.1218, + "num_input_tokens_seen": 12467256, + "step": 17375 + }, + { + "epoch": 36.13305613305613, + "grad_norm": 0.1886049509048462, + "learning_rate": 3.0110052159398587e-05, + "loss": 0.0949, + "num_input_tokens_seen": 12470904, + "step": 17380 + }, + { + "epoch": 36.143451143451145, + "grad_norm": 0.24522921442985535, + "learning_rate": 3.0100441564836802e-05, + "loss": 0.1418, + "num_input_tokens_seen": 12474520, + "step": 17385 + }, + { + "epoch": 36.15384615384615, + "grad_norm": 0.580901026725769, + "learning_rate": 3.0090830183722817e-05, + "loss": 0.0863, + "num_input_tokens_seen": 12478136, + "step": 17390 + }, + { + "epoch": 36.16424116424116, + "grad_norm": 0.3398225009441376, + "learning_rate": 3.0081218017538852e-05, + "loss": 0.104, + "num_input_tokens_seen": 12481816, + "step": 17395 + }, + { + "epoch": 36.17463617463618, + "grad_norm": 0.2239653617143631, + "learning_rate": 3.0071605067767212e-05, + "loss": 0.1222, + "num_input_tokens_seen": 12485368, + "step": 17400 + }, + { + "epoch": 36.17463617463618, + "eval_loss": 0.1430988609790802, + "eval_runtime": 7.7561, + "eval_samples_per_second": 110.365, + "eval_steps_per_second": 27.591, + "num_input_tokens_seen": 12485368, + "step": 17400 + }, + { + "epoch": 36.185031185031185, + "grad_norm": 0.4831000566482544, + "learning_rate": 3.006199133589034e-05, + "loss": 0.1051, + "num_input_tokens_seen": 12489112, + "step": 17405 + }, + { + "epoch": 36.195426195426194, + "grad_norm": 0.22460579872131348, + "learning_rate": 3.005237682339079e-05, + "loss": 0.1122, + "num_input_tokens_seen": 12492760, + "step": 17410 + }, + { + "epoch": 36.20582120582121, + "grad_norm": 0.1706618368625641, + "learning_rate": 3.0042761531751228e-05, + "loss": 0.1336, + "num_input_tokens_seen": 12496440, + "step": 17415 + }, + { + "epoch": 36.21621621621622, + "grad_norm": 0.19577045738697052, + "learning_rate": 3.0033145462454482e-05, + "loss": 0.0699, + "num_input_tokens_seen": 12499864, + "step": 17420 + }, + { + "epoch": 36.226611226611226, + "grad_norm": 0.5727338790893555, + "learning_rate": 3.002352861698345e-05, + "loss": 0.1818, + "num_input_tokens_seen": 12503512, + "step": 17425 + }, + { + "epoch": 36.237006237006234, + "grad_norm": 0.25462016463279724, + "learning_rate": 3.0013910996821178e-05, + "loss": 0.0921, + "num_input_tokens_seen": 12507128, + "step": 17430 + }, + { + "epoch": 36.24740124740125, + "grad_norm": 0.5625225305557251, + "learning_rate": 3.0004292603450817e-05, + "loss": 0.1244, + "num_input_tokens_seen": 12510712, + "step": 17435 + }, + { + "epoch": 36.25779625779626, + "grad_norm": 0.22668379545211792, + "learning_rate": 2.9994673438355653e-05, + "loss": 0.1226, + "num_input_tokens_seen": 12514392, + "step": 17440 + }, + { + "epoch": 36.268191268191266, + "grad_norm": 0.13273702561855316, + "learning_rate": 2.9985053503019078e-05, + "loss": 0.0788, + "num_input_tokens_seen": 12517912, + "step": 17445 + }, + { + "epoch": 36.27858627858628, + "grad_norm": 0.5195407271385193, + "learning_rate": 2.99754327989246e-05, + "loss": 0.118, + "num_input_tokens_seen": 12521528, + "step": 17450 + }, + { + "epoch": 36.28898128898129, + "grad_norm": 0.1852436661720276, + "learning_rate": 2.9965811327555864e-05, + "loss": 0.1273, + "num_input_tokens_seen": 12525176, + "step": 17455 + }, + { + "epoch": 36.2993762993763, + "grad_norm": 0.1784590631723404, + "learning_rate": 2.995618909039662e-05, + "loss": 0.1031, + "num_input_tokens_seen": 12528824, + "step": 17460 + }, + { + "epoch": 36.30977130977131, + "grad_norm": 0.35336825251579285, + "learning_rate": 2.9946566088930727e-05, + "loss": 0.0995, + "num_input_tokens_seen": 12532440, + "step": 17465 + }, + { + "epoch": 36.32016632016632, + "grad_norm": 0.3190956711769104, + "learning_rate": 2.9936942324642192e-05, + "loss": 0.1045, + "num_input_tokens_seen": 12535928, + "step": 17470 + }, + { + "epoch": 36.33056133056133, + "grad_norm": 0.14613161981105804, + "learning_rate": 2.9927317799015097e-05, + "loss": 0.0703, + "num_input_tokens_seen": 12539416, + "step": 17475 + }, + { + "epoch": 36.34095634095634, + "grad_norm": 0.53785640001297, + "learning_rate": 2.9917692513533685e-05, + "loss": 0.0969, + "num_input_tokens_seen": 12542936, + "step": 17480 + }, + { + "epoch": 36.351351351351354, + "grad_norm": 0.2780667245388031, + "learning_rate": 2.990806646968229e-05, + "loss": 0.0828, + "num_input_tokens_seen": 12546488, + "step": 17485 + }, + { + "epoch": 36.36174636174636, + "grad_norm": 0.6028905510902405, + "learning_rate": 2.989843966894536e-05, + "loss": 0.1451, + "num_input_tokens_seen": 12550392, + "step": 17490 + }, + { + "epoch": 36.37214137214137, + "grad_norm": 0.40607649087905884, + "learning_rate": 2.9888812112807472e-05, + "loss": 0.0817, + "num_input_tokens_seen": 12553912, + "step": 17495 + }, + { + "epoch": 36.38253638253638, + "grad_norm": 1.0194228887557983, + "learning_rate": 2.987918380275333e-05, + "loss": 0.1341, + "num_input_tokens_seen": 12557656, + "step": 17500 + }, + { + "epoch": 36.392931392931395, + "grad_norm": 0.6599639654159546, + "learning_rate": 2.9869554740267724e-05, + "loss": 0.141, + "num_input_tokens_seen": 12561208, + "step": 17505 + }, + { + "epoch": 36.4033264033264, + "grad_norm": 0.5329135060310364, + "learning_rate": 2.9859924926835585e-05, + "loss": 0.0882, + "num_input_tokens_seen": 12564824, + "step": 17510 + }, + { + "epoch": 36.41372141372141, + "grad_norm": 0.4757513999938965, + "learning_rate": 2.9850294363941944e-05, + "loss": 0.1233, + "num_input_tokens_seen": 12568632, + "step": 17515 + }, + { + "epoch": 36.42411642411643, + "grad_norm": 0.9347524046897888, + "learning_rate": 2.9840663053071967e-05, + "loss": 0.1029, + "num_input_tokens_seen": 12572216, + "step": 17520 + }, + { + "epoch": 36.434511434511435, + "grad_norm": 0.4026625156402588, + "learning_rate": 2.983103099571091e-05, + "loss": 0.071, + "num_input_tokens_seen": 12575800, + "step": 17525 + }, + { + "epoch": 36.444906444906444, + "grad_norm": 0.33527764678001404, + "learning_rate": 2.9821398193344164e-05, + "loss": 0.095, + "num_input_tokens_seen": 12579384, + "step": 17530 + }, + { + "epoch": 36.45530145530145, + "grad_norm": 0.23302574455738068, + "learning_rate": 2.9811764647457226e-05, + "loss": 0.0788, + "num_input_tokens_seen": 12582872, + "step": 17535 + }, + { + "epoch": 36.46569646569647, + "grad_norm": 0.559134304523468, + "learning_rate": 2.9802130359535714e-05, + "loss": 0.1014, + "num_input_tokens_seen": 12586488, + "step": 17540 + }, + { + "epoch": 36.476091476091476, + "grad_norm": 0.9121450185775757, + "learning_rate": 2.979249533106535e-05, + "loss": 0.1321, + "num_input_tokens_seen": 12589976, + "step": 17545 + }, + { + "epoch": 36.486486486486484, + "grad_norm": 0.19882194697856903, + "learning_rate": 2.9782859563531986e-05, + "loss": 0.1059, + "num_input_tokens_seen": 12593560, + "step": 17550 + }, + { + "epoch": 36.4968814968815, + "grad_norm": 0.34820157289505005, + "learning_rate": 2.977322305842156e-05, + "loss": 0.1206, + "num_input_tokens_seen": 12597240, + "step": 17555 + }, + { + "epoch": 36.50727650727651, + "grad_norm": 0.2860790491104126, + "learning_rate": 2.9763585817220162e-05, + "loss": 0.0887, + "num_input_tokens_seen": 12601016, + "step": 17560 + }, + { + "epoch": 36.517671517671516, + "grad_norm": 0.5966243743896484, + "learning_rate": 2.975394784141397e-05, + "loss": 0.0853, + "num_input_tokens_seen": 12604664, + "step": 17565 + }, + { + "epoch": 36.528066528066525, + "grad_norm": 0.6140239238739014, + "learning_rate": 2.974430913248928e-05, + "loss": 0.1623, + "num_input_tokens_seen": 12608216, + "step": 17570 + }, + { + "epoch": 36.53846153846154, + "grad_norm": 0.6576801538467407, + "learning_rate": 2.9734669691932497e-05, + "loss": 0.1177, + "num_input_tokens_seen": 12611832, + "step": 17575 + }, + { + "epoch": 36.54885654885655, + "grad_norm": 0.278925359249115, + "learning_rate": 2.9725029521230147e-05, + "loss": 0.0991, + "num_input_tokens_seen": 12615384, + "step": 17580 + }, + { + "epoch": 36.55925155925156, + "grad_norm": 0.48333844542503357, + "learning_rate": 2.9715388621868873e-05, + "loss": 0.1212, + "num_input_tokens_seen": 12618904, + "step": 17585 + }, + { + "epoch": 36.56964656964657, + "grad_norm": 0.26785358786582947, + "learning_rate": 2.970574699533541e-05, + "loss": 0.0884, + "num_input_tokens_seen": 12622456, + "step": 17590 + }, + { + "epoch": 36.58004158004158, + "grad_norm": 0.5851180553436279, + "learning_rate": 2.969610464311662e-05, + "loss": 0.1329, + "num_input_tokens_seen": 12625944, + "step": 17595 + }, + { + "epoch": 36.59043659043659, + "grad_norm": 0.1610933095216751, + "learning_rate": 2.9686461566699487e-05, + "loss": 0.1897, + "num_input_tokens_seen": 12629496, + "step": 17600 + }, + { + "epoch": 36.59043659043659, + "eval_loss": 0.1428898125886917, + "eval_runtime": 7.7586, + "eval_samples_per_second": 110.329, + "eval_steps_per_second": 27.582, + "num_input_tokens_seen": 12629496, + "step": 17600 + }, + { + "epoch": 36.6008316008316, + "grad_norm": 0.22844158113002777, + "learning_rate": 2.9676817767571086e-05, + "loss": 0.0925, + "num_input_tokens_seen": 12633176, + "step": 17605 + }, + { + "epoch": 36.61122661122661, + "grad_norm": 0.15519925951957703, + "learning_rate": 2.966717324721861e-05, + "loss": 0.1367, + "num_input_tokens_seen": 12636920, + "step": 17610 + }, + { + "epoch": 36.62162162162162, + "grad_norm": 0.3242643475532532, + "learning_rate": 2.9657528007129366e-05, + "loss": 0.1035, + "num_input_tokens_seen": 12640312, + "step": 17615 + }, + { + "epoch": 36.63201663201663, + "grad_norm": 0.5905215740203857, + "learning_rate": 2.9647882048790777e-05, + "loss": 0.1187, + "num_input_tokens_seen": 12643928, + "step": 17620 + }, + { + "epoch": 36.642411642411645, + "grad_norm": 0.2117292732000351, + "learning_rate": 2.963823537369037e-05, + "loss": 0.1223, + "num_input_tokens_seen": 12647480, + "step": 17625 + }, + { + "epoch": 36.65280665280665, + "grad_norm": 0.3206782042980194, + "learning_rate": 2.9628587983315775e-05, + "loss": 0.0907, + "num_input_tokens_seen": 12651000, + "step": 17630 + }, + { + "epoch": 36.66320166320166, + "grad_norm": 0.32815131545066833, + "learning_rate": 2.9618939879154746e-05, + "loss": 0.1065, + "num_input_tokens_seen": 12654520, + "step": 17635 + }, + { + "epoch": 36.67359667359668, + "grad_norm": 0.4164120554924011, + "learning_rate": 2.9609291062695143e-05, + "loss": 0.0809, + "num_input_tokens_seen": 12658040, + "step": 17640 + }, + { + "epoch": 36.683991683991685, + "grad_norm": 0.42596495151519775, + "learning_rate": 2.9599641535424938e-05, + "loss": 0.1004, + "num_input_tokens_seen": 12661656, + "step": 17645 + }, + { + "epoch": 36.694386694386694, + "grad_norm": 0.21815930306911469, + "learning_rate": 2.9589991298832202e-05, + "loss": 0.111, + "num_input_tokens_seen": 12665272, + "step": 17650 + }, + { + "epoch": 36.7047817047817, + "grad_norm": 0.24635475873947144, + "learning_rate": 2.958034035440513e-05, + "loss": 0.1473, + "num_input_tokens_seen": 12668856, + "step": 17655 + }, + { + "epoch": 36.71517671517672, + "grad_norm": 0.27290141582489014, + "learning_rate": 2.957068870363201e-05, + "loss": 0.1054, + "num_input_tokens_seen": 12672472, + "step": 17660 + }, + { + "epoch": 36.725571725571726, + "grad_norm": 0.3183286190032959, + "learning_rate": 2.956103634800126e-05, + "loss": 0.0952, + "num_input_tokens_seen": 12675864, + "step": 17665 + }, + { + "epoch": 36.735966735966734, + "grad_norm": 0.6970550417900085, + "learning_rate": 2.9551383289001384e-05, + "loss": 0.1394, + "num_input_tokens_seen": 12679640, + "step": 17670 + }, + { + "epoch": 36.74636174636175, + "grad_norm": 0.18463164567947388, + "learning_rate": 2.9541729528121005e-05, + "loss": 0.081, + "num_input_tokens_seen": 12683320, + "step": 17675 + }, + { + "epoch": 36.75675675675676, + "grad_norm": 0.33363860845565796, + "learning_rate": 2.9532075066848856e-05, + "loss": 0.1182, + "num_input_tokens_seen": 12686840, + "step": 17680 + }, + { + "epoch": 36.767151767151766, + "grad_norm": 0.42062902450561523, + "learning_rate": 2.9522419906673786e-05, + "loss": 0.0726, + "num_input_tokens_seen": 12690296, + "step": 17685 + }, + { + "epoch": 36.777546777546775, + "grad_norm": 0.5715202689170837, + "learning_rate": 2.951276404908474e-05, + "loss": 0.1192, + "num_input_tokens_seen": 12693752, + "step": 17690 + }, + { + "epoch": 36.78794178794179, + "grad_norm": 0.1871034801006317, + "learning_rate": 2.9503107495570752e-05, + "loss": 0.0714, + "num_input_tokens_seen": 12697272, + "step": 17695 + }, + { + "epoch": 36.7983367983368, + "grad_norm": 0.5410982966423035, + "learning_rate": 2.9493450247621003e-05, + "loss": 0.0842, + "num_input_tokens_seen": 12700856, + "step": 17700 + }, + { + "epoch": 36.80873180873181, + "grad_norm": 0.19792570173740387, + "learning_rate": 2.948379230672476e-05, + "loss": 0.1263, + "num_input_tokens_seen": 12704312, + "step": 17705 + }, + { + "epoch": 36.81912681912682, + "grad_norm": 0.2370944768190384, + "learning_rate": 2.9474133674371396e-05, + "loss": 0.1321, + "num_input_tokens_seen": 12707896, + "step": 17710 + }, + { + "epoch": 36.82952182952183, + "grad_norm": 0.5563426613807678, + "learning_rate": 2.9464474352050387e-05, + "loss": 0.157, + "num_input_tokens_seen": 12711608, + "step": 17715 + }, + { + "epoch": 36.83991683991684, + "grad_norm": 0.2858503460884094, + "learning_rate": 2.9454814341251336e-05, + "loss": 0.1595, + "num_input_tokens_seen": 12715160, + "step": 17720 + }, + { + "epoch": 36.85031185031185, + "grad_norm": 0.2558605968952179, + "learning_rate": 2.9445153643463942e-05, + "loss": 0.1022, + "num_input_tokens_seen": 12718552, + "step": 17725 + }, + { + "epoch": 36.86070686070686, + "grad_norm": 0.5729241371154785, + "learning_rate": 2.943549226017798e-05, + "loss": 0.0956, + "num_input_tokens_seen": 12722424, + "step": 17730 + }, + { + "epoch": 36.87110187110187, + "grad_norm": 0.27765384316444397, + "learning_rate": 2.942583019288337e-05, + "loss": 0.1012, + "num_input_tokens_seen": 12725976, + "step": 17735 + }, + { + "epoch": 36.88149688149688, + "grad_norm": 0.2694315016269684, + "learning_rate": 2.9416167443070132e-05, + "loss": 0.0966, + "num_input_tokens_seen": 12729464, + "step": 17740 + }, + { + "epoch": 36.891891891891895, + "grad_norm": 0.3776600658893585, + "learning_rate": 2.9406504012228375e-05, + "loss": 0.094, + "num_input_tokens_seen": 12733112, + "step": 17745 + }, + { + "epoch": 36.9022869022869, + "grad_norm": 0.1924627423286438, + "learning_rate": 2.939683990184832e-05, + "loss": 0.0994, + "num_input_tokens_seen": 12736632, + "step": 17750 + }, + { + "epoch": 36.91268191268191, + "grad_norm": 0.23893873393535614, + "learning_rate": 2.93871751134203e-05, + "loss": 0.1196, + "num_input_tokens_seen": 12740344, + "step": 17755 + }, + { + "epoch": 36.92307692307692, + "grad_norm": 0.8336755633354187, + "learning_rate": 2.9377509648434752e-05, + "loss": 0.1406, + "num_input_tokens_seen": 12743896, + "step": 17760 + }, + { + "epoch": 36.933471933471935, + "grad_norm": 0.23505854606628418, + "learning_rate": 2.9367843508382203e-05, + "loss": 0.0699, + "num_input_tokens_seen": 12747448, + "step": 17765 + }, + { + "epoch": 36.943866943866944, + "grad_norm": 0.5281694531440735, + "learning_rate": 2.9358176694753293e-05, + "loss": 0.1652, + "num_input_tokens_seen": 12751032, + "step": 17770 + }, + { + "epoch": 36.95426195426195, + "grad_norm": 0.5391123294830322, + "learning_rate": 2.9348509209038766e-05, + "loss": 0.1515, + "num_input_tokens_seen": 12754648, + "step": 17775 + }, + { + "epoch": 36.96465696465697, + "grad_norm": 0.22085367143154144, + "learning_rate": 2.933884105272947e-05, + "loss": 0.0923, + "num_input_tokens_seen": 12758200, + "step": 17780 + }, + { + "epoch": 36.975051975051976, + "grad_norm": 0.6947952508926392, + "learning_rate": 2.9329172227316366e-05, + "loss": 0.1163, + "num_input_tokens_seen": 12761592, + "step": 17785 + }, + { + "epoch": 36.985446985446984, + "grad_norm": 1.1609631776809692, + "learning_rate": 2.93195027342905e-05, + "loss": 0.1131, + "num_input_tokens_seen": 12765176, + "step": 17790 + }, + { + "epoch": 36.99584199584199, + "grad_norm": 0.2978915572166443, + "learning_rate": 2.9309832575143024e-05, + "loss": 0.1303, + "num_input_tokens_seen": 12768728, + "step": 17795 + }, + { + "epoch": 37.00623700623701, + "grad_norm": 0.2690447270870209, + "learning_rate": 2.930016175136521e-05, + "loss": 0.1307, + "num_input_tokens_seen": 12772208, + "step": 17800 + }, + { + "epoch": 37.00623700623701, + "eval_loss": 0.14246784150600433, + "eval_runtime": 7.7665, + "eval_samples_per_second": 110.216, + "eval_steps_per_second": 27.554, + "num_input_tokens_seen": 12772208, + "step": 17800 + }, + { + "epoch": 37.016632016632016, + "grad_norm": 0.2798997759819031, + "learning_rate": 2.9290490264448412e-05, + "loss": 0.1395, + "num_input_tokens_seen": 12775856, + "step": 17805 + }, + { + "epoch": 37.027027027027025, + "grad_norm": 0.5096025466918945, + "learning_rate": 2.9280818115884094e-05, + "loss": 0.1308, + "num_input_tokens_seen": 12779472, + "step": 17810 + }, + { + "epoch": 37.03742203742204, + "grad_norm": 0.2083929032087326, + "learning_rate": 2.9271145307163828e-05, + "loss": 0.0846, + "num_input_tokens_seen": 12783120, + "step": 17815 + }, + { + "epoch": 37.04781704781705, + "grad_norm": 1.0078794956207275, + "learning_rate": 2.9261471839779287e-05, + "loss": 0.0873, + "num_input_tokens_seen": 12786704, + "step": 17820 + }, + { + "epoch": 37.05821205821206, + "grad_norm": 0.20980605483055115, + "learning_rate": 2.925179771522223e-05, + "loss": 0.0815, + "num_input_tokens_seen": 12790160, + "step": 17825 + }, + { + "epoch": 37.06860706860707, + "grad_norm": 0.21558330953121185, + "learning_rate": 2.9242122934984535e-05, + "loss": 0.0909, + "num_input_tokens_seen": 12793616, + "step": 17830 + }, + { + "epoch": 37.07900207900208, + "grad_norm": 0.2577698528766632, + "learning_rate": 2.9232447500558176e-05, + "loss": 0.0913, + "num_input_tokens_seen": 12797264, + "step": 17835 + }, + { + "epoch": 37.08939708939709, + "grad_norm": 0.2778533399105072, + "learning_rate": 2.9222771413435225e-05, + "loss": 0.1033, + "num_input_tokens_seen": 12800816, + "step": 17840 + }, + { + "epoch": 37.0997920997921, + "grad_norm": 0.3112342655658722, + "learning_rate": 2.9213094675107848e-05, + "loss": 0.1193, + "num_input_tokens_seen": 12804432, + "step": 17845 + }, + { + "epoch": 37.11018711018711, + "grad_norm": 0.7777373790740967, + "learning_rate": 2.9203417287068335e-05, + "loss": 0.1551, + "num_input_tokens_seen": 12808016, + "step": 17850 + }, + { + "epoch": 37.12058212058212, + "grad_norm": 0.231193408370018, + "learning_rate": 2.9193739250809042e-05, + "loss": 0.1063, + "num_input_tokens_seen": 12811984, + "step": 17855 + }, + { + "epoch": 37.13097713097713, + "grad_norm": 0.7241834998130798, + "learning_rate": 2.9184060567822463e-05, + "loss": 0.1119, + "num_input_tokens_seen": 12815568, + "step": 17860 + }, + { + "epoch": 37.141372141372145, + "grad_norm": 0.5215422511100769, + "learning_rate": 2.9174381239601166e-05, + "loss": 0.1373, + "num_input_tokens_seen": 12819248, + "step": 17865 + }, + { + "epoch": 37.15176715176715, + "grad_norm": 0.3767201900482178, + "learning_rate": 2.916470126763783e-05, + "loss": 0.1009, + "num_input_tokens_seen": 12823056, + "step": 17870 + }, + { + "epoch": 37.16216216216216, + "grad_norm": 0.3658088743686676, + "learning_rate": 2.9155020653425203e-05, + "loss": 0.1344, + "num_input_tokens_seen": 12826608, + "step": 17875 + }, + { + "epoch": 37.17255717255717, + "grad_norm": 0.2615048587322235, + "learning_rate": 2.9145339398456184e-05, + "loss": 0.1171, + "num_input_tokens_seen": 12830000, + "step": 17880 + }, + { + "epoch": 37.182952182952185, + "grad_norm": 0.24590632319450378, + "learning_rate": 2.913565750422374e-05, + "loss": 0.0963, + "num_input_tokens_seen": 12833584, + "step": 17885 + }, + { + "epoch": 37.19334719334719, + "grad_norm": 0.1553317755460739, + "learning_rate": 2.9125974972220938e-05, + "loss": 0.1256, + "num_input_tokens_seen": 12837104, + "step": 17890 + }, + { + "epoch": 37.2037422037422, + "grad_norm": 0.35087284445762634, + "learning_rate": 2.9116291803940932e-05, + "loss": 0.1253, + "num_input_tokens_seen": 12840816, + "step": 17895 + }, + { + "epoch": 37.21413721413722, + "grad_norm": 0.1457904726266861, + "learning_rate": 2.910660800087701e-05, + "loss": 0.0953, + "num_input_tokens_seen": 12844464, + "step": 17900 + }, + { + "epoch": 37.224532224532226, + "grad_norm": 0.35152843594551086, + "learning_rate": 2.909692356452254e-05, + "loss": 0.0902, + "num_input_tokens_seen": 12848144, + "step": 17905 + }, + { + "epoch": 37.234927234927234, + "grad_norm": 0.1949710100889206, + "learning_rate": 2.9087238496370962e-05, + "loss": 0.1175, + "num_input_tokens_seen": 12851792, + "step": 17910 + }, + { + "epoch": 37.24532224532224, + "grad_norm": 0.2325606644153595, + "learning_rate": 2.907755279791583e-05, + "loss": 0.1058, + "num_input_tokens_seen": 12855312, + "step": 17915 + }, + { + "epoch": 37.25571725571726, + "grad_norm": 0.29999181628227234, + "learning_rate": 2.906786647065083e-05, + "loss": 0.1401, + "num_input_tokens_seen": 12858960, + "step": 17920 + }, + { + "epoch": 37.266112266112266, + "grad_norm": 0.43964868783950806, + "learning_rate": 2.9058179516069695e-05, + "loss": 0.1105, + "num_input_tokens_seen": 12862512, + "step": 17925 + }, + { + "epoch": 37.276507276507274, + "grad_norm": 0.3294840157032013, + "learning_rate": 2.9048491935666282e-05, + "loss": 0.1181, + "num_input_tokens_seen": 12866032, + "step": 17930 + }, + { + "epoch": 37.28690228690229, + "grad_norm": 0.19387514889240265, + "learning_rate": 2.9038803730934534e-05, + "loss": 0.1239, + "num_input_tokens_seen": 12869680, + "step": 17935 + }, + { + "epoch": 37.2972972972973, + "grad_norm": 0.5678212642669678, + "learning_rate": 2.9029114903368503e-05, + "loss": 0.0997, + "num_input_tokens_seen": 12873264, + "step": 17940 + }, + { + "epoch": 37.30769230769231, + "grad_norm": 0.30787307024002075, + "learning_rate": 2.9019425454462318e-05, + "loss": 0.0985, + "num_input_tokens_seen": 12876944, + "step": 17945 + }, + { + "epoch": 37.318087318087315, + "grad_norm": 0.24651730060577393, + "learning_rate": 2.9009735385710212e-05, + "loss": 0.0909, + "num_input_tokens_seen": 12880400, + "step": 17950 + }, + { + "epoch": 37.32848232848233, + "grad_norm": 0.35717159509658813, + "learning_rate": 2.900004469860652e-05, + "loss": 0.1271, + "num_input_tokens_seen": 12883824, + "step": 17955 + }, + { + "epoch": 37.33887733887734, + "grad_norm": 0.20598043501377106, + "learning_rate": 2.8990353394645668e-05, + "loss": 0.1021, + "num_input_tokens_seen": 12887408, + "step": 17960 + }, + { + "epoch": 37.34927234927235, + "grad_norm": 0.8474673628807068, + "learning_rate": 2.8980661475322186e-05, + "loss": 0.1313, + "num_input_tokens_seen": 12890928, + "step": 17965 + }, + { + "epoch": 37.35966735966736, + "grad_norm": 0.18497510254383087, + "learning_rate": 2.897096894213067e-05, + "loss": 0.1398, + "num_input_tokens_seen": 12894544, + "step": 17970 + }, + { + "epoch": 37.37006237006237, + "grad_norm": 0.36082226037979126, + "learning_rate": 2.8961275796565845e-05, + "loss": 0.0924, + "num_input_tokens_seen": 12898128, + "step": 17975 + }, + { + "epoch": 37.38045738045738, + "grad_norm": 0.535905659198761, + "learning_rate": 2.8951582040122517e-05, + "loss": 0.1141, + "num_input_tokens_seen": 12901680, + "step": 17980 + }, + { + "epoch": 37.39085239085239, + "grad_norm": 0.3055116534233093, + "learning_rate": 2.894188767429557e-05, + "loss": 0.1299, + "num_input_tokens_seen": 12905264, + "step": 17985 + }, + { + "epoch": 37.4012474012474, + "grad_norm": 0.1656157672405243, + "learning_rate": 2.8932192700580014e-05, + "loss": 0.0833, + "num_input_tokens_seen": 12908816, + "step": 17990 + }, + { + "epoch": 37.41164241164241, + "grad_norm": 0.24881431460380554, + "learning_rate": 2.8922497120470916e-05, + "loss": 0.0908, + "num_input_tokens_seen": 12912304, + "step": 17995 + }, + { + "epoch": 37.42203742203742, + "grad_norm": 0.1690702885389328, + "learning_rate": 2.891280093546348e-05, + "loss": 0.1357, + "num_input_tokens_seen": 12915888, + "step": 18000 + }, + { + "epoch": 37.42203742203742, + "eval_loss": 0.14392614364624023, + "eval_runtime": 7.7666, + "eval_samples_per_second": 110.216, + "eval_steps_per_second": 27.554, + "num_input_tokens_seen": 12915888, + "step": 18000 + }, + { + "epoch": 37.432432432432435, + "grad_norm": 0.15624667704105377, + "learning_rate": 2.890310414705297e-05, + "loss": 0.1233, + "num_input_tokens_seen": 12919472, + "step": 18005 + }, + { + "epoch": 37.44282744282744, + "grad_norm": 0.24960923194885254, + "learning_rate": 2.8893406756734742e-05, + "loss": 0.1002, + "num_input_tokens_seen": 12923120, + "step": 18010 + }, + { + "epoch": 37.45322245322245, + "grad_norm": 0.23241843283176422, + "learning_rate": 2.888370876600427e-05, + "loss": 0.0756, + "num_input_tokens_seen": 12926800, + "step": 18015 + }, + { + "epoch": 37.46361746361746, + "grad_norm": 0.29353100061416626, + "learning_rate": 2.8874010176357104e-05, + "loss": 0.0901, + "num_input_tokens_seen": 12930320, + "step": 18020 + }, + { + "epoch": 37.474012474012476, + "grad_norm": 0.24277083575725555, + "learning_rate": 2.886431098928888e-05, + "loss": 0.0861, + "num_input_tokens_seen": 12933744, + "step": 18025 + }, + { + "epoch": 37.484407484407484, + "grad_norm": 0.4020705819129944, + "learning_rate": 2.885461120629534e-05, + "loss": 0.1311, + "num_input_tokens_seen": 12937360, + "step": 18030 + }, + { + "epoch": 37.49480249480249, + "grad_norm": 0.8695703148841858, + "learning_rate": 2.8844910828872317e-05, + "loss": 0.114, + "num_input_tokens_seen": 12940784, + "step": 18035 + }, + { + "epoch": 37.50519750519751, + "grad_norm": 0.6502944827079773, + "learning_rate": 2.8835209858515715e-05, + "loss": 0.1252, + "num_input_tokens_seen": 12944624, + "step": 18040 + }, + { + "epoch": 37.515592515592516, + "grad_norm": 0.5825732350349426, + "learning_rate": 2.8825508296721566e-05, + "loss": 0.1314, + "num_input_tokens_seen": 12948368, + "step": 18045 + }, + { + "epoch": 37.525987525987524, + "grad_norm": 0.42592841386795044, + "learning_rate": 2.881580614498596e-05, + "loss": 0.122, + "num_input_tokens_seen": 12951888, + "step": 18050 + }, + { + "epoch": 37.53638253638254, + "grad_norm": 0.4540844261646271, + "learning_rate": 2.8806103404805103e-05, + "loss": 0.115, + "num_input_tokens_seen": 12955504, + "step": 18055 + }, + { + "epoch": 37.54677754677755, + "grad_norm": 0.3242596685886383, + "learning_rate": 2.8796400077675257e-05, + "loss": 0.0996, + "num_input_tokens_seen": 12959248, + "step": 18060 + }, + { + "epoch": 37.55717255717256, + "grad_norm": 0.16550979018211365, + "learning_rate": 2.8786696165092812e-05, + "loss": 0.1108, + "num_input_tokens_seen": 12962864, + "step": 18065 + }, + { + "epoch": 37.567567567567565, + "grad_norm": 0.5624067783355713, + "learning_rate": 2.8776991668554236e-05, + "loss": 0.1085, + "num_input_tokens_seen": 12966352, + "step": 18070 + }, + { + "epoch": 37.57796257796258, + "grad_norm": 0.7278993725776672, + "learning_rate": 2.876728658955608e-05, + "loss": 0.1015, + "num_input_tokens_seen": 12970000, + "step": 18075 + }, + { + "epoch": 37.58835758835759, + "grad_norm": 0.15504641830921173, + "learning_rate": 2.8757580929594986e-05, + "loss": 0.07, + "num_input_tokens_seen": 12973488, + "step": 18080 + }, + { + "epoch": 37.5987525987526, + "grad_norm": 0.16618601977825165, + "learning_rate": 2.87478746901677e-05, + "loss": 0.08, + "num_input_tokens_seen": 12976880, + "step": 18085 + }, + { + "epoch": 37.60914760914761, + "grad_norm": 0.37767788767814636, + "learning_rate": 2.873816787277103e-05, + "loss": 0.0905, + "num_input_tokens_seen": 12980336, + "step": 18090 + }, + { + "epoch": 37.61954261954262, + "grad_norm": 0.30543267726898193, + "learning_rate": 2.8728460478901903e-05, + "loss": 0.1135, + "num_input_tokens_seen": 12984016, + "step": 18095 + }, + { + "epoch": 37.62993762993763, + "grad_norm": 0.7662866115570068, + "learning_rate": 2.8718752510057307e-05, + "loss": 0.1303, + "num_input_tokens_seen": 12987664, + "step": 18100 + }, + { + "epoch": 37.64033264033264, + "grad_norm": 0.6743650436401367, + "learning_rate": 2.870904396773435e-05, + "loss": 0.1252, + "num_input_tokens_seen": 12991344, + "step": 18105 + }, + { + "epoch": 37.65072765072765, + "grad_norm": 0.3256584703922272, + "learning_rate": 2.86993348534302e-05, + "loss": 0.105, + "num_input_tokens_seen": 12994832, + "step": 18110 + }, + { + "epoch": 37.66112266112266, + "grad_norm": 0.18070247769355774, + "learning_rate": 2.868962516864212e-05, + "loss": 0.0678, + "num_input_tokens_seen": 12998416, + "step": 18115 + }, + { + "epoch": 37.67151767151767, + "grad_norm": 0.27721428871154785, + "learning_rate": 2.8679914914867477e-05, + "loss": 0.085, + "num_input_tokens_seen": 13001936, + "step": 18120 + }, + { + "epoch": 37.681912681912685, + "grad_norm": 0.38360169529914856, + "learning_rate": 2.8670204093603713e-05, + "loss": 0.1418, + "num_input_tokens_seen": 13005520, + "step": 18125 + }, + { + "epoch": 37.69230769230769, + "grad_norm": 0.16437934339046478, + "learning_rate": 2.8660492706348357e-05, + "loss": 0.1052, + "num_input_tokens_seen": 13009104, + "step": 18130 + }, + { + "epoch": 37.7027027027027, + "grad_norm": 0.29475685954093933, + "learning_rate": 2.8650780754599022e-05, + "loss": 0.0921, + "num_input_tokens_seen": 13012528, + "step": 18135 + }, + { + "epoch": 37.71309771309771, + "grad_norm": 0.15947051346302032, + "learning_rate": 2.8641068239853407e-05, + "loss": 0.1061, + "num_input_tokens_seen": 13016048, + "step": 18140 + }, + { + "epoch": 37.723492723492726, + "grad_norm": 0.2756950259208679, + "learning_rate": 2.863135516360932e-05, + "loss": 0.1082, + "num_input_tokens_seen": 13019568, + "step": 18145 + }, + { + "epoch": 37.733887733887734, + "grad_norm": 0.385649710893631, + "learning_rate": 2.8621641527364633e-05, + "loss": 0.1038, + "num_input_tokens_seen": 13023152, + "step": 18150 + }, + { + "epoch": 37.74428274428274, + "grad_norm": 0.2682144045829773, + "learning_rate": 2.8611927332617313e-05, + "loss": 0.1021, + "num_input_tokens_seen": 13026640, + "step": 18155 + }, + { + "epoch": 37.75467775467776, + "grad_norm": 0.19330337643623352, + "learning_rate": 2.8602212580865405e-05, + "loss": 0.102, + "num_input_tokens_seen": 13030448, + "step": 18160 + }, + { + "epoch": 37.765072765072766, + "grad_norm": 0.14626379311084747, + "learning_rate": 2.859249727360705e-05, + "loss": 0.1008, + "num_input_tokens_seen": 13034064, + "step": 18165 + }, + { + "epoch": 37.775467775467774, + "grad_norm": 1.0421607494354248, + "learning_rate": 2.8582781412340465e-05, + "loss": 0.095, + "num_input_tokens_seen": 13037584, + "step": 18170 + }, + { + "epoch": 37.78586278586278, + "grad_norm": 0.34299802780151367, + "learning_rate": 2.857306499856397e-05, + "loss": 0.098, + "num_input_tokens_seen": 13041136, + "step": 18175 + }, + { + "epoch": 37.7962577962578, + "grad_norm": 0.5907062292098999, + "learning_rate": 2.856334803377594e-05, + "loss": 0.1208, + "num_input_tokens_seen": 13044656, + "step": 18180 + }, + { + "epoch": 37.80665280665281, + "grad_norm": 0.43207913637161255, + "learning_rate": 2.8553630519474867e-05, + "loss": 0.1173, + "num_input_tokens_seen": 13048208, + "step": 18185 + }, + { + "epoch": 37.817047817047815, + "grad_norm": 0.6227362155914307, + "learning_rate": 2.8543912457159317e-05, + "loss": 0.1337, + "num_input_tokens_seen": 13051728, + "step": 18190 + }, + { + "epoch": 37.82744282744283, + "grad_norm": 1.096488356590271, + "learning_rate": 2.853419384832792e-05, + "loss": 0.1341, + "num_input_tokens_seen": 13055312, + "step": 18195 + }, + { + "epoch": 37.83783783783784, + "grad_norm": 0.2650856375694275, + "learning_rate": 2.8524474694479423e-05, + "loss": 0.151, + "num_input_tokens_seen": 13058896, + "step": 18200 + }, + { + "epoch": 37.83783783783784, + "eval_loss": 0.1416320949792862, + "eval_runtime": 7.7603, + "eval_samples_per_second": 110.305, + "eval_steps_per_second": 27.576, + "num_input_tokens_seen": 13058896, + "step": 18200 + }, + { + "epoch": 37.84823284823285, + "grad_norm": 0.6838926672935486, + "learning_rate": 2.851475499711264e-05, + "loss": 0.1122, + "num_input_tokens_seen": 13062416, + "step": 18205 + }, + { + "epoch": 37.858627858627855, + "grad_norm": 0.10969207435846329, + "learning_rate": 2.8505034757726468e-05, + "loss": 0.0843, + "num_input_tokens_seen": 13066128, + "step": 18210 + }, + { + "epoch": 37.86902286902287, + "grad_norm": 0.6436992883682251, + "learning_rate": 2.8495313977819886e-05, + "loss": 0.114, + "num_input_tokens_seen": 13069680, + "step": 18215 + }, + { + "epoch": 37.87941787941788, + "grad_norm": 0.47545555233955383, + "learning_rate": 2.8485592658891956e-05, + "loss": 0.0847, + "num_input_tokens_seen": 13073360, + "step": 18220 + }, + { + "epoch": 37.88981288981289, + "grad_norm": 0.3479813039302826, + "learning_rate": 2.8475870802441844e-05, + "loss": 0.108, + "num_input_tokens_seen": 13076816, + "step": 18225 + }, + { + "epoch": 37.9002079002079, + "grad_norm": 0.32622507214546204, + "learning_rate": 2.8466148409968774e-05, + "loss": 0.0755, + "num_input_tokens_seen": 13080432, + "step": 18230 + }, + { + "epoch": 37.91060291060291, + "grad_norm": 0.4272646903991699, + "learning_rate": 2.8456425482972067e-05, + "loss": 0.1416, + "num_input_tokens_seen": 13084048, + "step": 18235 + }, + { + "epoch": 37.92099792099792, + "grad_norm": 0.6420887112617493, + "learning_rate": 2.84467020229511e-05, + "loss": 0.1112, + "num_input_tokens_seen": 13087632, + "step": 18240 + }, + { + "epoch": 37.931392931392935, + "grad_norm": 0.32833218574523926, + "learning_rate": 2.8436978031405375e-05, + "loss": 0.1051, + "num_input_tokens_seen": 13091088, + "step": 18245 + }, + { + "epoch": 37.94178794178794, + "grad_norm": 0.6261723637580872, + "learning_rate": 2.842725350983445e-05, + "loss": 0.1023, + "num_input_tokens_seen": 13094608, + "step": 18250 + }, + { + "epoch": 37.95218295218295, + "grad_norm": 0.41341641545295715, + "learning_rate": 2.8417528459737957e-05, + "loss": 0.105, + "num_input_tokens_seen": 13098448, + "step": 18255 + }, + { + "epoch": 37.96257796257796, + "grad_norm": 0.28442734479904175, + "learning_rate": 2.8407802882615624e-05, + "loss": 0.1169, + "num_input_tokens_seen": 13101872, + "step": 18260 + }, + { + "epoch": 37.972972972972975, + "grad_norm": 0.32687637209892273, + "learning_rate": 2.8398076779967277e-05, + "loss": 0.1041, + "num_input_tokens_seen": 13105392, + "step": 18265 + }, + { + "epoch": 37.983367983367984, + "grad_norm": 0.1741325706243515, + "learning_rate": 2.8388350153292774e-05, + "loss": 0.0924, + "num_input_tokens_seen": 13108880, + "step": 18270 + }, + { + "epoch": 37.99376299376299, + "grad_norm": 0.2663373649120331, + "learning_rate": 2.8378623004092103e-05, + "loss": 0.1797, + "num_input_tokens_seen": 13112336, + "step": 18275 + }, + { + "epoch": 38.00415800415801, + "grad_norm": 0.5323374271392822, + "learning_rate": 2.8368895333865302e-05, + "loss": 0.0925, + "num_input_tokens_seen": 13116000, + "step": 18280 + }, + { + "epoch": 38.014553014553016, + "grad_norm": 0.21696430444717407, + "learning_rate": 2.835916714411251e-05, + "loss": 0.1254, + "num_input_tokens_seen": 13119552, + "step": 18285 + }, + { + "epoch": 38.024948024948024, + "grad_norm": 0.35251423716545105, + "learning_rate": 2.8349438436333926e-05, + "loss": 0.0941, + "num_input_tokens_seen": 13123136, + "step": 18290 + }, + { + "epoch": 38.03534303534303, + "grad_norm": 0.5515832901000977, + "learning_rate": 2.833970921202984e-05, + "loss": 0.1215, + "num_input_tokens_seen": 13126720, + "step": 18295 + }, + { + "epoch": 38.04573804573805, + "grad_norm": 0.5191999673843384, + "learning_rate": 2.8329979472700628e-05, + "loss": 0.1391, + "num_input_tokens_seen": 13130464, + "step": 18300 + }, + { + "epoch": 38.056133056133056, + "grad_norm": 0.21367651224136353, + "learning_rate": 2.832024921984674e-05, + "loss": 0.0772, + "num_input_tokens_seen": 13133984, + "step": 18305 + }, + { + "epoch": 38.066528066528065, + "grad_norm": 0.24610523879528046, + "learning_rate": 2.8310518454968693e-05, + "loss": 0.145, + "num_input_tokens_seen": 13137440, + "step": 18310 + }, + { + "epoch": 38.07692307692308, + "grad_norm": 0.18704912066459656, + "learning_rate": 2.8300787179567095e-05, + "loss": 0.1499, + "num_input_tokens_seen": 13141056, + "step": 18315 + }, + { + "epoch": 38.08731808731809, + "grad_norm": 0.22063247859477997, + "learning_rate": 2.8291055395142636e-05, + "loss": 0.1405, + "num_input_tokens_seen": 13144640, + "step": 18320 + }, + { + "epoch": 38.0977130977131, + "grad_norm": 0.21718178689479828, + "learning_rate": 2.8281323103196073e-05, + "loss": 0.092, + "num_input_tokens_seen": 13148064, + "step": 18325 + }, + { + "epoch": 38.108108108108105, + "grad_norm": 0.38638561964035034, + "learning_rate": 2.8271590305228256e-05, + "loss": 0.1171, + "num_input_tokens_seen": 13151776, + "step": 18330 + }, + { + "epoch": 38.11850311850312, + "grad_norm": 0.5782386064529419, + "learning_rate": 2.82618570027401e-05, + "loss": 0.1111, + "num_input_tokens_seen": 13155488, + "step": 18335 + }, + { + "epoch": 38.12889812889813, + "grad_norm": 0.213387131690979, + "learning_rate": 2.8252123197232604e-05, + "loss": 0.1026, + "num_input_tokens_seen": 13159168, + "step": 18340 + }, + { + "epoch": 38.13929313929314, + "grad_norm": 0.25434741377830505, + "learning_rate": 2.8242388890206843e-05, + "loss": 0.0758, + "num_input_tokens_seen": 13162720, + "step": 18345 + }, + { + "epoch": 38.14968814968815, + "grad_norm": 0.20512154698371887, + "learning_rate": 2.8232654083163967e-05, + "loss": 0.1196, + "num_input_tokens_seen": 13166080, + "step": 18350 + }, + { + "epoch": 38.16008316008316, + "grad_norm": 0.6836459040641785, + "learning_rate": 2.822291877760521e-05, + "loss": 0.1376, + "num_input_tokens_seen": 13169632, + "step": 18355 + }, + { + "epoch": 38.17047817047817, + "grad_norm": 0.42475801706314087, + "learning_rate": 2.8213182975031864e-05, + "loss": 0.143, + "num_input_tokens_seen": 13173280, + "step": 18360 + }, + { + "epoch": 38.18087318087318, + "grad_norm": 0.20524288713932037, + "learning_rate": 2.8203446676945337e-05, + "loss": 0.0943, + "num_input_tokens_seen": 13176672, + "step": 18365 + }, + { + "epoch": 38.19126819126819, + "grad_norm": 0.29030683636665344, + "learning_rate": 2.8193709884847075e-05, + "loss": 0.0907, + "num_input_tokens_seen": 13180224, + "step": 18370 + }, + { + "epoch": 38.2016632016632, + "grad_norm": 0.5264385938644409, + "learning_rate": 2.8183972600238605e-05, + "loss": 0.1545, + "num_input_tokens_seen": 13183904, + "step": 18375 + }, + { + "epoch": 38.21205821205821, + "grad_norm": 0.3932039141654968, + "learning_rate": 2.817423482462156e-05, + "loss": 0.0909, + "num_input_tokens_seen": 13187552, + "step": 18380 + }, + { + "epoch": 38.222453222453225, + "grad_norm": 0.5574740171432495, + "learning_rate": 2.8164496559497605e-05, + "loss": 0.0857, + "num_input_tokens_seen": 13191136, + "step": 18385 + }, + { + "epoch": 38.232848232848234, + "grad_norm": 0.47347933053970337, + "learning_rate": 2.815475780636852e-05, + "loss": 0.151, + "num_input_tokens_seen": 13194752, + "step": 18390 + }, + { + "epoch": 38.24324324324324, + "grad_norm": 0.2966447174549103, + "learning_rate": 2.814501856673613e-05, + "loss": 0.1079, + "num_input_tokens_seen": 13198400, + "step": 18395 + }, + { + "epoch": 38.25363825363825, + "grad_norm": 0.32544779777526855, + "learning_rate": 2.8135278842102353e-05, + "loss": 0.102, + "num_input_tokens_seen": 13201856, + "step": 18400 + }, + { + "epoch": 38.25363825363825, + "eval_loss": 0.1415577083826065, + "eval_runtime": 7.7468, + "eval_samples_per_second": 110.497, + "eval_steps_per_second": 27.624, + "num_input_tokens_seen": 13201856, + "step": 18400 + }, + { + "epoch": 38.264033264033266, + "grad_norm": 0.23799654841423035, + "learning_rate": 2.8125538633969183e-05, + "loss": 0.1127, + "num_input_tokens_seen": 13205440, + "step": 18405 + }, + { + "epoch": 38.274428274428274, + "grad_norm": 0.17257994413375854, + "learning_rate": 2.8115797943838677e-05, + "loss": 0.1149, + "num_input_tokens_seen": 13208992, + "step": 18410 + }, + { + "epoch": 38.28482328482328, + "grad_norm": 0.7631309628486633, + "learning_rate": 2.810605677321298e-05, + "loss": 0.1273, + "num_input_tokens_seen": 13212544, + "step": 18415 + }, + { + "epoch": 38.2952182952183, + "grad_norm": 0.17880821228027344, + "learning_rate": 2.809631512359428e-05, + "loss": 0.096, + "num_input_tokens_seen": 13216064, + "step": 18420 + }, + { + "epoch": 38.305613305613306, + "grad_norm": 0.144456148147583, + "learning_rate": 2.8086572996484884e-05, + "loss": 0.1108, + "num_input_tokens_seen": 13219840, + "step": 18425 + }, + { + "epoch": 38.316008316008315, + "grad_norm": 0.4208962321281433, + "learning_rate": 2.8076830393387143e-05, + "loss": 0.1231, + "num_input_tokens_seen": 13223488, + "step": 18430 + }, + { + "epoch": 38.32640332640332, + "grad_norm": 0.5036035776138306, + "learning_rate": 2.8067087315803497e-05, + "loss": 0.12, + "num_input_tokens_seen": 13226880, + "step": 18435 + }, + { + "epoch": 38.33679833679834, + "grad_norm": 0.42446404695510864, + "learning_rate": 2.8057343765236433e-05, + "loss": 0.1454, + "num_input_tokens_seen": 13230400, + "step": 18440 + }, + { + "epoch": 38.34719334719335, + "grad_norm": 0.29014280438423157, + "learning_rate": 2.804759974318854e-05, + "loss": 0.1315, + "num_input_tokens_seen": 13233952, + "step": 18445 + }, + { + "epoch": 38.357588357588355, + "grad_norm": 0.3616035282611847, + "learning_rate": 2.8037855251162482e-05, + "loss": 0.1156, + "num_input_tokens_seen": 13237504, + "step": 18450 + }, + { + "epoch": 38.36798336798337, + "grad_norm": 0.5070623755455017, + "learning_rate": 2.802811029066096e-05, + "loss": 0.0904, + "num_input_tokens_seen": 13241088, + "step": 18455 + }, + { + "epoch": 38.37837837837838, + "grad_norm": 0.24113622307777405, + "learning_rate": 2.8018364863186764e-05, + "loss": 0.1097, + "num_input_tokens_seen": 13244640, + "step": 18460 + }, + { + "epoch": 38.38877338877339, + "grad_norm": 0.40092721581459045, + "learning_rate": 2.800861897024279e-05, + "loss": 0.1254, + "num_input_tokens_seen": 13248256, + "step": 18465 + }, + { + "epoch": 38.3991683991684, + "grad_norm": 0.1800389587879181, + "learning_rate": 2.799887261333196e-05, + "loss": 0.0871, + "num_input_tokens_seen": 13251904, + "step": 18470 + }, + { + "epoch": 38.40956340956341, + "grad_norm": 0.3256418704986572, + "learning_rate": 2.798912579395728e-05, + "loss": 0.0906, + "num_input_tokens_seen": 13255456, + "step": 18475 + }, + { + "epoch": 38.41995841995842, + "grad_norm": 0.4427424669265747, + "learning_rate": 2.797937851362185e-05, + "loss": 0.1059, + "num_input_tokens_seen": 13258976, + "step": 18480 + }, + { + "epoch": 38.43035343035343, + "grad_norm": 0.6519160866737366, + "learning_rate": 2.7969630773828802e-05, + "loss": 0.1052, + "num_input_tokens_seen": 13262560, + "step": 18485 + }, + { + "epoch": 38.44074844074844, + "grad_norm": 0.2421419769525528, + "learning_rate": 2.7959882576081382e-05, + "loss": 0.0957, + "num_input_tokens_seen": 13265952, + "step": 18490 + }, + { + "epoch": 38.45114345114345, + "grad_norm": 0.24922232329845428, + "learning_rate": 2.795013392188286e-05, + "loss": 0.1311, + "num_input_tokens_seen": 13269504, + "step": 18495 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 0.427341490983963, + "learning_rate": 2.7940384812736614e-05, + "loss": 0.1398, + "num_input_tokens_seen": 13272960, + "step": 18500 + }, + { + "epoch": 38.471933471933475, + "grad_norm": 0.30533111095428467, + "learning_rate": 2.7930635250146087e-05, + "loss": 0.1354, + "num_input_tokens_seen": 13276640, + "step": 18505 + }, + { + "epoch": 38.482328482328484, + "grad_norm": 0.2457328885793686, + "learning_rate": 2.792088523561477e-05, + "loss": 0.0983, + "num_input_tokens_seen": 13280192, + "step": 18510 + }, + { + "epoch": 38.49272349272349, + "grad_norm": 0.2083563357591629, + "learning_rate": 2.7911134770646246e-05, + "loss": 0.0944, + "num_input_tokens_seen": 13283776, + "step": 18515 + }, + { + "epoch": 38.5031185031185, + "grad_norm": 0.3790026009082794, + "learning_rate": 2.7901383856744157e-05, + "loss": 0.1101, + "num_input_tokens_seen": 13287392, + "step": 18520 + }, + { + "epoch": 38.513513513513516, + "grad_norm": 0.21367190778255463, + "learning_rate": 2.7891632495412217e-05, + "loss": 0.0981, + "num_input_tokens_seen": 13290976, + "step": 18525 + }, + { + "epoch": 38.523908523908524, + "grad_norm": 0.1935194432735443, + "learning_rate": 2.7881880688154205e-05, + "loss": 0.0772, + "num_input_tokens_seen": 13294560, + "step": 18530 + }, + { + "epoch": 38.53430353430353, + "grad_norm": 0.3557822108268738, + "learning_rate": 2.7872128436473977e-05, + "loss": 0.1397, + "num_input_tokens_seen": 13298176, + "step": 18535 + }, + { + "epoch": 38.54469854469855, + "grad_norm": 0.20163516700267792, + "learning_rate": 2.7862375741875448e-05, + "loss": 0.071, + "num_input_tokens_seen": 13301856, + "step": 18540 + }, + { + "epoch": 38.555093555093556, + "grad_norm": 0.20924687385559082, + "learning_rate": 2.785262260586261e-05, + "loss": 0.1002, + "num_input_tokens_seen": 13305440, + "step": 18545 + }, + { + "epoch": 38.565488565488565, + "grad_norm": 0.33655110001564026, + "learning_rate": 2.7842869029939517e-05, + "loss": 0.1154, + "num_input_tokens_seen": 13309024, + "step": 18550 + }, + { + "epoch": 38.57588357588357, + "grad_norm": 0.23892654478549957, + "learning_rate": 2.7833115015610296e-05, + "loss": 0.0954, + "num_input_tokens_seen": 13312576, + "step": 18555 + }, + { + "epoch": 38.58627858627859, + "grad_norm": 0.34927359223365784, + "learning_rate": 2.7823360564379136e-05, + "loss": 0.1159, + "num_input_tokens_seen": 13316192, + "step": 18560 + }, + { + "epoch": 38.5966735966736, + "grad_norm": 0.4826953411102295, + "learning_rate": 2.7813605677750297e-05, + "loss": 0.0902, + "num_input_tokens_seen": 13319712, + "step": 18565 + }, + { + "epoch": 38.607068607068605, + "grad_norm": 1.055915355682373, + "learning_rate": 2.7803850357228102e-05, + "loss": 0.1141, + "num_input_tokens_seen": 13323360, + "step": 18570 + }, + { + "epoch": 38.61746361746362, + "grad_norm": 0.38968512415885925, + "learning_rate": 2.779409460431695e-05, + "loss": 0.1058, + "num_input_tokens_seen": 13326944, + "step": 18575 + }, + { + "epoch": 38.62785862785863, + "grad_norm": 0.2288217842578888, + "learning_rate": 2.778433842052129e-05, + "loss": 0.1242, + "num_input_tokens_seen": 13330400, + "step": 18580 + }, + { + "epoch": 38.63825363825364, + "grad_norm": 0.4467751979827881, + "learning_rate": 2.7774581807345664e-05, + "loss": 0.1359, + "num_input_tokens_seen": 13333984, + "step": 18585 + }, + { + "epoch": 38.648648648648646, + "grad_norm": 0.44520488381385803, + "learning_rate": 2.776482476629465e-05, + "loss": 0.1032, + "num_input_tokens_seen": 13337504, + "step": 18590 + }, + { + "epoch": 38.65904365904366, + "grad_norm": 0.24758648872375488, + "learning_rate": 2.7755067298872924e-05, + "loss": 0.1275, + "num_input_tokens_seen": 13341120, + "step": 18595 + }, + { + "epoch": 38.66943866943867, + "grad_norm": 0.20674562454223633, + "learning_rate": 2.774530940658518e-05, + "loss": 0.1296, + "num_input_tokens_seen": 13344736, + "step": 18600 + }, + { + "epoch": 38.66943866943867, + "eval_loss": 0.14556938409805298, + "eval_runtime": 7.7506, + "eval_samples_per_second": 110.444, + "eval_steps_per_second": 27.611, + "num_input_tokens_seen": 13344736, + "step": 18600 + }, + { + "epoch": 38.67983367983368, + "grad_norm": 0.25741007924079895, + "learning_rate": 2.7735551090936236e-05, + "loss": 0.0772, + "num_input_tokens_seen": 13348352, + "step": 18605 + }, + { + "epoch": 38.69022869022869, + "grad_norm": 0.16405698657035828, + "learning_rate": 2.7725792353430934e-05, + "loss": 0.0857, + "num_input_tokens_seen": 13351904, + "step": 18610 + }, + { + "epoch": 38.7006237006237, + "grad_norm": 0.2283054143190384, + "learning_rate": 2.77160331955742e-05, + "loss": 0.0676, + "num_input_tokens_seen": 13355520, + "step": 18615 + }, + { + "epoch": 38.71101871101871, + "grad_norm": 0.30671948194503784, + "learning_rate": 2.7706273618871008e-05, + "loss": 0.0772, + "num_input_tokens_seen": 13359328, + "step": 18620 + }, + { + "epoch": 38.72141372141372, + "grad_norm": 0.27427926659584045, + "learning_rate": 2.769651362482642e-05, + "loss": 0.0679, + "num_input_tokens_seen": 13362912, + "step": 18625 + }, + { + "epoch": 38.731808731808734, + "grad_norm": 0.5642015337944031, + "learning_rate": 2.768675321494555e-05, + "loss": 0.1009, + "num_input_tokens_seen": 13366560, + "step": 18630 + }, + { + "epoch": 38.74220374220374, + "grad_norm": 0.5908440947532654, + "learning_rate": 2.7676992390733565e-05, + "loss": 0.1225, + "num_input_tokens_seen": 13370272, + "step": 18635 + }, + { + "epoch": 38.75259875259875, + "grad_norm": 0.5268304347991943, + "learning_rate": 2.766723115369571e-05, + "loss": 0.0877, + "num_input_tokens_seen": 13373632, + "step": 18640 + }, + { + "epoch": 38.762993762993766, + "grad_norm": 0.2677420973777771, + "learning_rate": 2.765746950533729e-05, + "loss": 0.106, + "num_input_tokens_seen": 13377248, + "step": 18645 + }, + { + "epoch": 38.773388773388774, + "grad_norm": 0.3907186686992645, + "learning_rate": 2.7647707447163684e-05, + "loss": 0.0884, + "num_input_tokens_seen": 13380896, + "step": 18650 + }, + { + "epoch": 38.78378378378378, + "grad_norm": 0.4555759131908417, + "learning_rate": 2.7637944980680315e-05, + "loss": 0.09, + "num_input_tokens_seen": 13384576, + "step": 18655 + }, + { + "epoch": 38.79417879417879, + "grad_norm": 0.22884206473827362, + "learning_rate": 2.762818210739268e-05, + "loss": 0.1163, + "num_input_tokens_seen": 13388256, + "step": 18660 + }, + { + "epoch": 38.804573804573806, + "grad_norm": 0.9975020885467529, + "learning_rate": 2.7618418828806332e-05, + "loss": 0.107, + "num_input_tokens_seen": 13391904, + "step": 18665 + }, + { + "epoch": 38.814968814968815, + "grad_norm": 0.4043629765510559, + "learning_rate": 2.76086551464269e-05, + "loss": 0.1171, + "num_input_tokens_seen": 13395456, + "step": 18670 + }, + { + "epoch": 38.82536382536382, + "grad_norm": 0.4191618263721466, + "learning_rate": 2.759889106176006e-05, + "loss": 0.1165, + "num_input_tokens_seen": 13399200, + "step": 18675 + }, + { + "epoch": 38.83575883575884, + "grad_norm": 0.26396751403808594, + "learning_rate": 2.758912657631156e-05, + "loss": 0.0892, + "num_input_tokens_seen": 13403168, + "step": 18680 + }, + { + "epoch": 38.84615384615385, + "grad_norm": 0.5252792835235596, + "learning_rate": 2.7579361691587198e-05, + "loss": 0.0744, + "num_input_tokens_seen": 13406592, + "step": 18685 + }, + { + "epoch": 38.856548856548855, + "grad_norm": 0.3402622640132904, + "learning_rate": 2.756959640909285e-05, + "loss": 0.153, + "num_input_tokens_seen": 13410176, + "step": 18690 + }, + { + "epoch": 38.86694386694387, + "grad_norm": 0.7749537229537964, + "learning_rate": 2.7559830730334452e-05, + "loss": 0.1113, + "num_input_tokens_seen": 13414144, + "step": 18695 + }, + { + "epoch": 38.87733887733888, + "grad_norm": 0.28600430488586426, + "learning_rate": 2.7550064656817988e-05, + "loss": 0.0905, + "num_input_tokens_seen": 13417824, + "step": 18700 + }, + { + "epoch": 38.88773388773389, + "grad_norm": 0.21459560096263885, + "learning_rate": 2.7540298190049503e-05, + "loss": 0.0901, + "num_input_tokens_seen": 13421408, + "step": 18705 + }, + { + "epoch": 38.898128898128896, + "grad_norm": 0.10967829823493958, + "learning_rate": 2.7530531331535107e-05, + "loss": 0.0789, + "num_input_tokens_seen": 13424928, + "step": 18710 + }, + { + "epoch": 38.90852390852391, + "grad_norm": 0.5766472816467285, + "learning_rate": 2.752076408278099e-05, + "loss": 0.1009, + "num_input_tokens_seen": 13428416, + "step": 18715 + }, + { + "epoch": 38.91891891891892, + "grad_norm": 0.28281450271606445, + "learning_rate": 2.751099644529337e-05, + "loss": 0.133, + "num_input_tokens_seen": 13432192, + "step": 18720 + }, + { + "epoch": 38.92931392931393, + "grad_norm": 0.5721721649169922, + "learning_rate": 2.7501228420578533e-05, + "loss": 0.0651, + "num_input_tokens_seen": 13435840, + "step": 18725 + }, + { + "epoch": 38.93970893970894, + "grad_norm": 0.2234087884426117, + "learning_rate": 2.7491460010142857e-05, + "loss": 0.1105, + "num_input_tokens_seen": 13439360, + "step": 18730 + }, + { + "epoch": 38.95010395010395, + "grad_norm": 0.30369803309440613, + "learning_rate": 2.7481691215492727e-05, + "loss": 0.1015, + "num_input_tokens_seen": 13442912, + "step": 18735 + }, + { + "epoch": 38.96049896049896, + "grad_norm": 0.6661486625671387, + "learning_rate": 2.747192203813463e-05, + "loss": 0.1063, + "num_input_tokens_seen": 13446400, + "step": 18740 + }, + { + "epoch": 38.97089397089397, + "grad_norm": 0.7525142431259155, + "learning_rate": 2.7462152479575087e-05, + "loss": 0.1474, + "num_input_tokens_seen": 13449888, + "step": 18745 + }, + { + "epoch": 38.981288981288984, + "grad_norm": 0.536411702632904, + "learning_rate": 2.7452382541320697e-05, + "loss": 0.1251, + "num_input_tokens_seen": 13453344, + "step": 18750 + }, + { + "epoch": 38.99168399168399, + "grad_norm": 0.31303322315216064, + "learning_rate": 2.7442612224878096e-05, + "loss": 0.1242, + "num_input_tokens_seen": 13456864, + "step": 18755 + }, + { + "epoch": 39.002079002079, + "grad_norm": 0.725500226020813, + "learning_rate": 2.7432841531753994e-05, + "loss": 0.1306, + "num_input_tokens_seen": 13460472, + "step": 18760 + }, + { + "epoch": 39.012474012474016, + "grad_norm": 0.4169811010360718, + "learning_rate": 2.7423070463455147e-05, + "loss": 0.0817, + "num_input_tokens_seen": 13463928, + "step": 18765 + }, + { + "epoch": 39.022869022869024, + "grad_norm": 0.2900410294532776, + "learning_rate": 2.7413299021488397e-05, + "loss": 0.138, + "num_input_tokens_seen": 13467608, + "step": 18770 + }, + { + "epoch": 39.03326403326403, + "grad_norm": 0.20371031761169434, + "learning_rate": 2.7403527207360615e-05, + "loss": 0.125, + "num_input_tokens_seen": 13471096, + "step": 18775 + }, + { + "epoch": 39.04365904365904, + "grad_norm": 0.6260344982147217, + "learning_rate": 2.7393755022578722e-05, + "loss": 0.1478, + "num_input_tokens_seen": 13474616, + "step": 18780 + }, + { + "epoch": 39.054054054054056, + "grad_norm": 0.46917250752449036, + "learning_rate": 2.7383982468649714e-05, + "loss": 0.1133, + "num_input_tokens_seen": 13478232, + "step": 18785 + }, + { + "epoch": 39.064449064449065, + "grad_norm": 0.2741759717464447, + "learning_rate": 2.7374209547080665e-05, + "loss": 0.0755, + "num_input_tokens_seen": 13481816, + "step": 18790 + }, + { + "epoch": 39.07484407484407, + "grad_norm": 0.3211381137371063, + "learning_rate": 2.7364436259378663e-05, + "loss": 0.0764, + "num_input_tokens_seen": 13485400, + "step": 18795 + }, + { + "epoch": 39.08523908523909, + "grad_norm": 0.6670470237731934, + "learning_rate": 2.735466260705088e-05, + "loss": 0.142, + "num_input_tokens_seen": 13489016, + "step": 18800 + }, + { + "epoch": 39.08523908523909, + "eval_loss": 0.14676378667354584, + "eval_runtime": 7.7638, + "eval_samples_per_second": 110.255, + "eval_steps_per_second": 27.564, + "num_input_tokens_seen": 13489016, + "step": 18800 + }, + { + "epoch": 39.0956340956341, + "grad_norm": 0.2200082391500473, + "learning_rate": 2.7344888591604524e-05, + "loss": 0.1292, + "num_input_tokens_seen": 13492536, + "step": 18805 + }, + { + "epoch": 39.106029106029105, + "grad_norm": 0.21819844841957092, + "learning_rate": 2.7335114214546893e-05, + "loss": 0.1347, + "num_input_tokens_seen": 13495992, + "step": 18810 + }, + { + "epoch": 39.11642411642411, + "grad_norm": 0.48828908801078796, + "learning_rate": 2.7325339477385293e-05, + "loss": 0.1409, + "num_input_tokens_seen": 13499608, + "step": 18815 + }, + { + "epoch": 39.12681912681913, + "grad_norm": 0.2926994264125824, + "learning_rate": 2.7315564381627128e-05, + "loss": 0.1043, + "num_input_tokens_seen": 13503288, + "step": 18820 + }, + { + "epoch": 39.13721413721414, + "grad_norm": 0.322451114654541, + "learning_rate": 2.7305788928779835e-05, + "loss": 0.105, + "num_input_tokens_seen": 13506744, + "step": 18825 + }, + { + "epoch": 39.147609147609145, + "grad_norm": 0.3500359356403351, + "learning_rate": 2.729601312035091e-05, + "loss": 0.104, + "num_input_tokens_seen": 13510296, + "step": 18830 + }, + { + "epoch": 39.15800415800416, + "grad_norm": 0.14534810185432434, + "learning_rate": 2.7286236957847915e-05, + "loss": 0.0639, + "num_input_tokens_seen": 13513784, + "step": 18835 + }, + { + "epoch": 39.16839916839917, + "grad_norm": 0.2640881836414337, + "learning_rate": 2.7276460442778446e-05, + "loss": 0.1037, + "num_input_tokens_seen": 13517304, + "step": 18840 + }, + { + "epoch": 39.17879417879418, + "grad_norm": 0.4699813425540924, + "learning_rate": 2.726668357665017e-05, + "loss": 0.0865, + "num_input_tokens_seen": 13520920, + "step": 18845 + }, + { + "epoch": 39.189189189189186, + "grad_norm": 0.23816809058189392, + "learning_rate": 2.7256906360970808e-05, + "loss": 0.1205, + "num_input_tokens_seen": 13524568, + "step": 18850 + }, + { + "epoch": 39.1995841995842, + "grad_norm": 0.1386617124080658, + "learning_rate": 2.7247128797248117e-05, + "loss": 0.095, + "num_input_tokens_seen": 13528152, + "step": 18855 + }, + { + "epoch": 39.20997920997921, + "grad_norm": 0.45245566964149475, + "learning_rate": 2.7237350886989925e-05, + "loss": 0.1066, + "num_input_tokens_seen": 13531736, + "step": 18860 + }, + { + "epoch": 39.22037422037422, + "grad_norm": 0.2577979266643524, + "learning_rate": 2.7227572631704107e-05, + "loss": 0.0989, + "num_input_tokens_seen": 13535544, + "step": 18865 + }, + { + "epoch": 39.23076923076923, + "grad_norm": 0.6509095430374146, + "learning_rate": 2.7217794032898596e-05, + "loss": 0.1174, + "num_input_tokens_seen": 13539000, + "step": 18870 + }, + { + "epoch": 39.24116424116424, + "grad_norm": 0.5377166271209717, + "learning_rate": 2.7208015092081384e-05, + "loss": 0.1077, + "num_input_tokens_seen": 13542520, + "step": 18875 + }, + { + "epoch": 39.25155925155925, + "grad_norm": 0.24357253313064575, + "learning_rate": 2.719823581076049e-05, + "loss": 0.0871, + "num_input_tokens_seen": 13546104, + "step": 18880 + }, + { + "epoch": 39.26195426195426, + "grad_norm": 0.4206794798374176, + "learning_rate": 2.718845619044401e-05, + "loss": 0.1372, + "num_input_tokens_seen": 13549592, + "step": 18885 + }, + { + "epoch": 39.272349272349274, + "grad_norm": 0.5362982749938965, + "learning_rate": 2.7178676232640088e-05, + "loss": 0.1159, + "num_input_tokens_seen": 13552984, + "step": 18890 + }, + { + "epoch": 39.28274428274428, + "grad_norm": 0.6524403095245361, + "learning_rate": 2.716889593885691e-05, + "loss": 0.0935, + "num_input_tokens_seen": 13556600, + "step": 18895 + }, + { + "epoch": 39.29313929313929, + "grad_norm": 0.1820514053106308, + "learning_rate": 2.7159115310602716e-05, + "loss": 0.1113, + "num_input_tokens_seen": 13560088, + "step": 18900 + }, + { + "epoch": 39.303534303534306, + "grad_norm": 0.3208516538143158, + "learning_rate": 2.7149334349385814e-05, + "loss": 0.116, + "num_input_tokens_seen": 13563736, + "step": 18905 + }, + { + "epoch": 39.313929313929314, + "grad_norm": 0.18236517906188965, + "learning_rate": 2.713955305671454e-05, + "loss": 0.0845, + "num_input_tokens_seen": 13567192, + "step": 18910 + }, + { + "epoch": 39.32432432432432, + "grad_norm": 0.43107178807258606, + "learning_rate": 2.71297714340973e-05, + "loss": 0.0972, + "num_input_tokens_seen": 13570904, + "step": 18915 + }, + { + "epoch": 39.33471933471934, + "grad_norm": 0.1703558713197708, + "learning_rate": 2.7119989483042545e-05, + "loss": 0.0862, + "num_input_tokens_seen": 13574680, + "step": 18920 + }, + { + "epoch": 39.34511434511435, + "grad_norm": 0.15334926545619965, + "learning_rate": 2.7110207205058768e-05, + "loss": 0.0656, + "num_input_tokens_seen": 13578104, + "step": 18925 + }, + { + "epoch": 39.355509355509355, + "grad_norm": 0.6369320154190063, + "learning_rate": 2.7100424601654517e-05, + "loss": 0.1057, + "num_input_tokens_seen": 13581656, + "step": 18930 + }, + { + "epoch": 39.36590436590436, + "grad_norm": 0.8743125796318054, + "learning_rate": 2.7090641674338403e-05, + "loss": 0.0813, + "num_input_tokens_seen": 13585272, + "step": 18935 + }, + { + "epoch": 39.37629937629938, + "grad_norm": 0.23565562069416046, + "learning_rate": 2.7080858424619072e-05, + "loss": 0.1124, + "num_input_tokens_seen": 13588856, + "step": 18940 + }, + { + "epoch": 39.38669438669439, + "grad_norm": 0.2727449834346771, + "learning_rate": 2.707107485400521e-05, + "loss": 0.0921, + "num_input_tokens_seen": 13592632, + "step": 18945 + }, + { + "epoch": 39.397089397089395, + "grad_norm": 0.31364718079566956, + "learning_rate": 2.7061290964005586e-05, + "loss": 0.0853, + "num_input_tokens_seen": 13596184, + "step": 18950 + }, + { + "epoch": 39.40748440748441, + "grad_norm": 0.7101359963417053, + "learning_rate": 2.7051506756129e-05, + "loss": 0.1432, + "num_input_tokens_seen": 13599704, + "step": 18955 + }, + { + "epoch": 39.41787941787942, + "grad_norm": 0.2840200662612915, + "learning_rate": 2.704172223188428e-05, + "loss": 0.1144, + "num_input_tokens_seen": 13603256, + "step": 18960 + }, + { + "epoch": 39.42827442827443, + "grad_norm": 0.2365512251853943, + "learning_rate": 2.7031937392780334e-05, + "loss": 0.093, + "num_input_tokens_seen": 13606904, + "step": 18965 + }, + { + "epoch": 39.438669438669436, + "grad_norm": 0.5383256077766418, + "learning_rate": 2.702215224032611e-05, + "loss": 0.1162, + "num_input_tokens_seen": 13610552, + "step": 18970 + }, + { + "epoch": 39.44906444906445, + "grad_norm": 0.37030941247940063, + "learning_rate": 2.70123667760306e-05, + "loss": 0.0948, + "num_input_tokens_seen": 13614104, + "step": 18975 + }, + { + "epoch": 39.45945945945946, + "grad_norm": 0.24539335072040558, + "learning_rate": 2.7002581001402845e-05, + "loss": 0.1015, + "num_input_tokens_seen": 13617752, + "step": 18980 + }, + { + "epoch": 39.46985446985447, + "grad_norm": 0.32840263843536377, + "learning_rate": 2.6992794917951923e-05, + "loss": 0.1155, + "num_input_tokens_seen": 13621432, + "step": 18985 + }, + { + "epoch": 39.48024948024948, + "grad_norm": 0.258226603269577, + "learning_rate": 2.6983008527187e-05, + "loss": 0.0657, + "num_input_tokens_seen": 13625240, + "step": 18990 + }, + { + "epoch": 39.49064449064449, + "grad_norm": 1.0472519397735596, + "learning_rate": 2.697322183061723e-05, + "loss": 0.1133, + "num_input_tokens_seen": 13628792, + "step": 18995 + }, + { + "epoch": 39.5010395010395, + "grad_norm": 0.15410037338733673, + "learning_rate": 2.696343482975186e-05, + "loss": 0.0924, + "num_input_tokens_seen": 13632312, + "step": 19000 + }, + { + "epoch": 39.5010395010395, + "eval_loss": 0.15101297199726105, + "eval_runtime": 7.7485, + "eval_samples_per_second": 110.473, + "eval_steps_per_second": 27.618, + "num_input_tokens_seen": 13632312, + "step": 19000 + }, + { + "epoch": 39.51143451143451, + "grad_norm": 0.38208332657814026, + "learning_rate": 2.695364752610016e-05, + "loss": 0.1194, + "num_input_tokens_seen": 13635960, + "step": 19005 + }, + { + "epoch": 39.521829521829524, + "grad_norm": 0.23682591319084167, + "learning_rate": 2.6943859921171467e-05, + "loss": 0.0814, + "num_input_tokens_seen": 13639576, + "step": 19010 + }, + { + "epoch": 39.53222453222453, + "grad_norm": 0.21776853501796722, + "learning_rate": 2.6934072016475143e-05, + "loss": 0.1216, + "num_input_tokens_seen": 13643192, + "step": 19015 + }, + { + "epoch": 39.54261954261954, + "grad_norm": 0.3274799883365631, + "learning_rate": 2.6924283813520606e-05, + "loss": 0.1295, + "num_input_tokens_seen": 13646744, + "step": 19020 + }, + { + "epoch": 39.553014553014556, + "grad_norm": 0.8191629648208618, + "learning_rate": 2.691449531381733e-05, + "loss": 0.1063, + "num_input_tokens_seen": 13650296, + "step": 19025 + }, + { + "epoch": 39.563409563409564, + "grad_norm": 0.369937539100647, + "learning_rate": 2.6904706518874816e-05, + "loss": 0.1029, + "num_input_tokens_seen": 13654072, + "step": 19030 + }, + { + "epoch": 39.57380457380457, + "grad_norm": 0.29539239406585693, + "learning_rate": 2.6894917430202615e-05, + "loss": 0.0712, + "num_input_tokens_seen": 13657496, + "step": 19035 + }, + { + "epoch": 39.58419958419958, + "grad_norm": 0.19756238162517548, + "learning_rate": 2.6885128049310343e-05, + "loss": 0.1692, + "num_input_tokens_seen": 13661208, + "step": 19040 + }, + { + "epoch": 39.5945945945946, + "grad_norm": 0.31692934036254883, + "learning_rate": 2.687533837770762e-05, + "loss": 0.0986, + "num_input_tokens_seen": 13664728, + "step": 19045 + }, + { + "epoch": 39.604989604989605, + "grad_norm": 0.16209359467029572, + "learning_rate": 2.6865548416904162e-05, + "loss": 0.0918, + "num_input_tokens_seen": 13668152, + "step": 19050 + }, + { + "epoch": 39.61538461538461, + "grad_norm": 0.2238561064004898, + "learning_rate": 2.68557581684097e-05, + "loss": 0.0888, + "num_input_tokens_seen": 13671800, + "step": 19055 + }, + { + "epoch": 39.62577962577963, + "grad_norm": 0.47628289461135864, + "learning_rate": 2.6845967633733998e-05, + "loss": 0.0989, + "num_input_tokens_seen": 13675288, + "step": 19060 + }, + { + "epoch": 39.63617463617464, + "grad_norm": 0.4011562466621399, + "learning_rate": 2.683617681438689e-05, + "loss": 0.151, + "num_input_tokens_seen": 13678936, + "step": 19065 + }, + { + "epoch": 39.646569646569645, + "grad_norm": 0.23329804837703705, + "learning_rate": 2.682638571187825e-05, + "loss": 0.1056, + "num_input_tokens_seen": 13682424, + "step": 19070 + }, + { + "epoch": 39.656964656964654, + "grad_norm": 0.5390228033065796, + "learning_rate": 2.6816594327717976e-05, + "loss": 0.1187, + "num_input_tokens_seen": 13685976, + "step": 19075 + }, + { + "epoch": 39.66735966735967, + "grad_norm": 0.2615775763988495, + "learning_rate": 2.680680266341603e-05, + "loss": 0.0772, + "num_input_tokens_seen": 13689464, + "step": 19080 + }, + { + "epoch": 39.67775467775468, + "grad_norm": 0.9655698537826538, + "learning_rate": 2.67970107204824e-05, + "loss": 0.1361, + "num_input_tokens_seen": 13693080, + "step": 19085 + }, + { + "epoch": 39.688149688149686, + "grad_norm": 0.3267953395843506, + "learning_rate": 2.6787218500427142e-05, + "loss": 0.1084, + "num_input_tokens_seen": 13696792, + "step": 19090 + }, + { + "epoch": 39.6985446985447, + "grad_norm": 0.7002673745155334, + "learning_rate": 2.6777426004760332e-05, + "loss": 0.109, + "num_input_tokens_seen": 13700216, + "step": 19095 + }, + { + "epoch": 39.70893970893971, + "grad_norm": 0.3019634485244751, + "learning_rate": 2.6767633234992094e-05, + "loss": 0.0998, + "num_input_tokens_seen": 13703832, + "step": 19100 + }, + { + "epoch": 39.71933471933472, + "grad_norm": 0.3805224299430847, + "learning_rate": 2.6757840192632598e-05, + "loss": 0.1243, + "num_input_tokens_seen": 13707480, + "step": 19105 + }, + { + "epoch": 39.729729729729726, + "grad_norm": 0.1825021505355835, + "learning_rate": 2.6748046879192052e-05, + "loss": 0.093, + "num_input_tokens_seen": 13711064, + "step": 19110 + }, + { + "epoch": 39.74012474012474, + "grad_norm": 0.5096988677978516, + "learning_rate": 2.673825329618071e-05, + "loss": 0.1385, + "num_input_tokens_seen": 13714648, + "step": 19115 + }, + { + "epoch": 39.75051975051975, + "grad_norm": 0.15979798138141632, + "learning_rate": 2.6728459445108866e-05, + "loss": 0.1064, + "num_input_tokens_seen": 13718296, + "step": 19120 + }, + { + "epoch": 39.76091476091476, + "grad_norm": 0.20986820757389069, + "learning_rate": 2.6718665327486854e-05, + "loss": 0.0915, + "num_input_tokens_seen": 13721752, + "step": 19125 + }, + { + "epoch": 39.771309771309774, + "grad_norm": 0.7251355648040771, + "learning_rate": 2.6708870944825048e-05, + "loss": 0.0974, + "num_input_tokens_seen": 13725368, + "step": 19130 + }, + { + "epoch": 39.78170478170478, + "grad_norm": 0.5911964178085327, + "learning_rate": 2.6699076298633874e-05, + "loss": 0.0899, + "num_input_tokens_seen": 13728984, + "step": 19135 + }, + { + "epoch": 39.79209979209979, + "grad_norm": 0.2774468958377838, + "learning_rate": 2.6689281390423788e-05, + "loss": 0.0963, + "num_input_tokens_seen": 13732472, + "step": 19140 + }, + { + "epoch": 39.802494802494806, + "grad_norm": 0.3365897536277771, + "learning_rate": 2.667948622170527e-05, + "loss": 0.1101, + "num_input_tokens_seen": 13736184, + "step": 19145 + }, + { + "epoch": 39.812889812889814, + "grad_norm": 0.2604829967021942, + "learning_rate": 2.6669690793988873e-05, + "loss": 0.0918, + "num_input_tokens_seen": 13739864, + "step": 19150 + }, + { + "epoch": 39.82328482328482, + "grad_norm": 0.40162450075149536, + "learning_rate": 2.665989510878518e-05, + "loss": 0.1319, + "num_input_tokens_seen": 13743480, + "step": 19155 + }, + { + "epoch": 39.83367983367983, + "grad_norm": 0.4891359806060791, + "learning_rate": 2.6650099167604793e-05, + "loss": 0.1482, + "num_input_tokens_seen": 13746968, + "step": 19160 + }, + { + "epoch": 39.84407484407485, + "grad_norm": 0.6398388743400574, + "learning_rate": 2.6640302971958376e-05, + "loss": 0.1294, + "num_input_tokens_seen": 13750520, + "step": 19165 + }, + { + "epoch": 39.854469854469855, + "grad_norm": 0.8660076856613159, + "learning_rate": 2.6630506523356635e-05, + "loss": 0.1394, + "num_input_tokens_seen": 13754264, + "step": 19170 + }, + { + "epoch": 39.86486486486486, + "grad_norm": 0.4148049056529999, + "learning_rate": 2.6620709823310297e-05, + "loss": 0.1773, + "num_input_tokens_seen": 13758040, + "step": 19175 + }, + { + "epoch": 39.87525987525988, + "grad_norm": 0.48064959049224854, + "learning_rate": 2.661091287333014e-05, + "loss": 0.1296, + "num_input_tokens_seen": 13761592, + "step": 19180 + }, + { + "epoch": 39.88565488565489, + "grad_norm": 0.13363926112651825, + "learning_rate": 2.660111567492696e-05, + "loss": 0.1148, + "num_input_tokens_seen": 13765176, + "step": 19185 + }, + { + "epoch": 39.896049896049895, + "grad_norm": 0.4519987106323242, + "learning_rate": 2.6591318229611635e-05, + "loss": 0.1166, + "num_input_tokens_seen": 13768792, + "step": 19190 + }, + { + "epoch": 39.906444906444904, + "grad_norm": 0.3668392598628998, + "learning_rate": 2.6581520538895037e-05, + "loss": 0.1219, + "num_input_tokens_seen": 13772344, + "step": 19195 + }, + { + "epoch": 39.91683991683992, + "grad_norm": 0.25380775332450867, + "learning_rate": 2.6571722604288102e-05, + "loss": 0.0935, + "num_input_tokens_seen": 13775960, + "step": 19200 + }, + { + "epoch": 39.91683991683992, + "eval_loss": 0.14543850719928741, + "eval_runtime": 7.7508, + "eval_samples_per_second": 110.441, + "eval_steps_per_second": 27.61, + "num_input_tokens_seen": 13775960, + "step": 19200 + }, + { + "epoch": 39.92723492723493, + "grad_norm": 0.39195799827575684, + "learning_rate": 2.656192442730179e-05, + "loss": 0.1081, + "num_input_tokens_seen": 13779512, + "step": 19205 + }, + { + "epoch": 39.937629937629936, + "grad_norm": 0.27053239941596985, + "learning_rate": 2.6552126009447098e-05, + "loss": 0.0861, + "num_input_tokens_seen": 13783160, + "step": 19210 + }, + { + "epoch": 39.94802494802495, + "grad_norm": 0.5998237133026123, + "learning_rate": 2.654232735223507e-05, + "loss": 0.1141, + "num_input_tokens_seen": 13786744, + "step": 19215 + }, + { + "epoch": 39.95841995841996, + "grad_norm": 0.5168777704238892, + "learning_rate": 2.6532528457176787e-05, + "loss": 0.097, + "num_input_tokens_seen": 13790328, + "step": 19220 + }, + { + "epoch": 39.96881496881497, + "grad_norm": 0.6233017444610596, + "learning_rate": 2.6522729325783348e-05, + "loss": 0.0943, + "num_input_tokens_seen": 13793720, + "step": 19225 + }, + { + "epoch": 39.979209979209976, + "grad_norm": 0.2992507517337799, + "learning_rate": 2.6512929959565914e-05, + "loss": 0.0924, + "num_input_tokens_seen": 13797496, + "step": 19230 + }, + { + "epoch": 39.98960498960499, + "grad_norm": 0.5613521933555603, + "learning_rate": 2.6503130360035673e-05, + "loss": 0.1029, + "num_input_tokens_seen": 13801144, + "step": 19235 + }, + { + "epoch": 40.0, + "grad_norm": 0.8074992895126343, + "learning_rate": 2.6493330528703835e-05, + "loss": 0.1111, + "num_input_tokens_seen": 13804680, + "step": 19240 + }, + { + "epoch": 40.01039501039501, + "grad_norm": 0.8635570406913757, + "learning_rate": 2.648353046708167e-05, + "loss": 0.1114, + "num_input_tokens_seen": 13808264, + "step": 19245 + }, + { + "epoch": 40.020790020790024, + "grad_norm": 0.3425300419330597, + "learning_rate": 2.647373017668046e-05, + "loss": 0.103, + "num_input_tokens_seen": 13811784, + "step": 19250 + }, + { + "epoch": 40.03118503118503, + "grad_norm": 0.5144860744476318, + "learning_rate": 2.6463929659011537e-05, + "loss": 0.1039, + "num_input_tokens_seen": 13815272, + "step": 19255 + }, + { + "epoch": 40.04158004158004, + "grad_norm": 0.3795711398124695, + "learning_rate": 2.6454128915586262e-05, + "loss": 0.0722, + "num_input_tokens_seen": 13818728, + "step": 19260 + }, + { + "epoch": 40.05197505197505, + "grad_norm": 0.5127480030059814, + "learning_rate": 2.6444327947916036e-05, + "loss": 0.1197, + "num_input_tokens_seen": 13822216, + "step": 19265 + }, + { + "epoch": 40.062370062370064, + "grad_norm": 1.0084748268127441, + "learning_rate": 2.6434526757512292e-05, + "loss": 0.0901, + "num_input_tokens_seen": 13825640, + "step": 19270 + }, + { + "epoch": 40.07276507276507, + "grad_norm": 0.35611018538475037, + "learning_rate": 2.6424725345886486e-05, + "loss": 0.0754, + "num_input_tokens_seen": 13829256, + "step": 19275 + }, + { + "epoch": 40.08316008316008, + "grad_norm": 0.45444023609161377, + "learning_rate": 2.641492371455014e-05, + "loss": 0.1014, + "num_input_tokens_seen": 13832872, + "step": 19280 + }, + { + "epoch": 40.093555093555096, + "grad_norm": 0.4959760010242462, + "learning_rate": 2.640512186501477e-05, + "loss": 0.094, + "num_input_tokens_seen": 13836488, + "step": 19285 + }, + { + "epoch": 40.103950103950105, + "grad_norm": 0.20182380080223083, + "learning_rate": 2.639531979879195e-05, + "loss": 0.0853, + "num_input_tokens_seen": 13840072, + "step": 19290 + }, + { + "epoch": 40.11434511434511, + "grad_norm": 0.5750865936279297, + "learning_rate": 2.638551751739328e-05, + "loss": 0.105, + "num_input_tokens_seen": 13843496, + "step": 19295 + }, + { + "epoch": 40.12474012474012, + "grad_norm": 0.17416013777256012, + "learning_rate": 2.6375715022330404e-05, + "loss": 0.0953, + "num_input_tokens_seen": 13847240, + "step": 19300 + }, + { + "epoch": 40.13513513513514, + "grad_norm": 1.008286952972412, + "learning_rate": 2.6365912315114976e-05, + "loss": 0.1113, + "num_input_tokens_seen": 13850664, + "step": 19305 + }, + { + "epoch": 40.145530145530145, + "grad_norm": 0.27142825722694397, + "learning_rate": 2.6356109397258704e-05, + "loss": 0.0824, + "num_input_tokens_seen": 13854216, + "step": 19310 + }, + { + "epoch": 40.15592515592515, + "grad_norm": 0.2582736909389496, + "learning_rate": 2.6346306270273325e-05, + "loss": 0.1041, + "num_input_tokens_seen": 13857832, + "step": 19315 + }, + { + "epoch": 40.16632016632017, + "grad_norm": 0.343018501996994, + "learning_rate": 2.6336502935670608e-05, + "loss": 0.1112, + "num_input_tokens_seen": 13861480, + "step": 19320 + }, + { + "epoch": 40.17671517671518, + "grad_norm": 0.20227810740470886, + "learning_rate": 2.6326699394962333e-05, + "loss": 0.1444, + "num_input_tokens_seen": 13865160, + "step": 19325 + }, + { + "epoch": 40.187110187110186, + "grad_norm": 0.2282029390335083, + "learning_rate": 2.6316895649660334e-05, + "loss": 0.091, + "num_input_tokens_seen": 13868680, + "step": 19330 + }, + { + "epoch": 40.197505197505194, + "grad_norm": 0.26079249382019043, + "learning_rate": 2.6307091701276486e-05, + "loss": 0.0652, + "num_input_tokens_seen": 13872424, + "step": 19335 + }, + { + "epoch": 40.20790020790021, + "grad_norm": 0.3828830420970917, + "learning_rate": 2.629728755132267e-05, + "loss": 0.1016, + "num_input_tokens_seen": 13875848, + "step": 19340 + }, + { + "epoch": 40.21829521829522, + "grad_norm": 0.41819965839385986, + "learning_rate": 2.628748320131081e-05, + "loss": 0.0995, + "num_input_tokens_seen": 13879464, + "step": 19345 + }, + { + "epoch": 40.228690228690226, + "grad_norm": 0.2567567229270935, + "learning_rate": 2.6277678652752856e-05, + "loss": 0.0963, + "num_input_tokens_seen": 13883272, + "step": 19350 + }, + { + "epoch": 40.23908523908524, + "grad_norm": 0.7785451412200928, + "learning_rate": 2.6267873907160807e-05, + "loss": 0.1121, + "num_input_tokens_seen": 13886696, + "step": 19355 + }, + { + "epoch": 40.24948024948025, + "grad_norm": 0.20946885645389557, + "learning_rate": 2.6258068966046668e-05, + "loss": 0.0851, + "num_input_tokens_seen": 13890440, + "step": 19360 + }, + { + "epoch": 40.25987525987526, + "grad_norm": 0.21804362535476685, + "learning_rate": 2.6248263830922475e-05, + "loss": 0.1416, + "num_input_tokens_seen": 13893992, + "step": 19365 + }, + { + "epoch": 40.270270270270274, + "grad_norm": 0.4832150638103485, + "learning_rate": 2.6238458503300318e-05, + "loss": 0.0884, + "num_input_tokens_seen": 13897512, + "step": 19370 + }, + { + "epoch": 40.28066528066528, + "grad_norm": 0.278629869222641, + "learning_rate": 2.6228652984692292e-05, + "loss": 0.1275, + "num_input_tokens_seen": 13901000, + "step": 19375 + }, + { + "epoch": 40.29106029106029, + "grad_norm": 0.09765106439590454, + "learning_rate": 2.621884727661054e-05, + "loss": 0.0541, + "num_input_tokens_seen": 13904488, + "step": 19380 + }, + { + "epoch": 40.3014553014553, + "grad_norm": 0.33133378624916077, + "learning_rate": 2.6209041380567222e-05, + "loss": 0.1199, + "num_input_tokens_seen": 13908168, + "step": 19385 + }, + { + "epoch": 40.311850311850314, + "grad_norm": 0.12150298058986664, + "learning_rate": 2.6199235298074527e-05, + "loss": 0.0885, + "num_input_tokens_seen": 13911784, + "step": 19390 + }, + { + "epoch": 40.32224532224532, + "grad_norm": 0.4616077244281769, + "learning_rate": 2.618942903064468e-05, + "loss": 0.0966, + "num_input_tokens_seen": 13915208, + "step": 19395 + }, + { + "epoch": 40.33264033264033, + "grad_norm": 0.28204861283302307, + "learning_rate": 2.6179622579789932e-05, + "loss": 0.118, + "num_input_tokens_seen": 13918888, + "step": 19400 + }, + { + "epoch": 40.33264033264033, + "eval_loss": 0.1424219161272049, + "eval_runtime": 7.7425, + "eval_samples_per_second": 110.559, + "eval_steps_per_second": 27.64, + "num_input_tokens_seen": 13918888, + "step": 19400 + }, + { + "epoch": 40.343035343035346, + "grad_norm": 0.7913638353347778, + "learning_rate": 2.6169815947022553e-05, + "loss": 0.1246, + "num_input_tokens_seen": 13922408, + "step": 19405 + }, + { + "epoch": 40.353430353430355, + "grad_norm": 0.2164427638053894, + "learning_rate": 2.6160009133854853e-05, + "loss": 0.0928, + "num_input_tokens_seen": 13926216, + "step": 19410 + }, + { + "epoch": 40.36382536382536, + "grad_norm": 0.6421902775764465, + "learning_rate": 2.6150202141799168e-05, + "loss": 0.115, + "num_input_tokens_seen": 13929928, + "step": 19415 + }, + { + "epoch": 40.37422037422037, + "grad_norm": 0.20930540561676025, + "learning_rate": 2.614039497236786e-05, + "loss": 0.118, + "num_input_tokens_seen": 13933416, + "step": 19420 + }, + { + "epoch": 40.38461538461539, + "grad_norm": 0.6493310332298279, + "learning_rate": 2.6130587627073315e-05, + "loss": 0.1508, + "num_input_tokens_seen": 13937032, + "step": 19425 + }, + { + "epoch": 40.395010395010395, + "grad_norm": 0.40842822194099426, + "learning_rate": 2.6120780107427956e-05, + "loss": 0.1276, + "num_input_tokens_seen": 13940616, + "step": 19430 + }, + { + "epoch": 40.4054054054054, + "grad_norm": 0.25941330194473267, + "learning_rate": 2.6110972414944214e-05, + "loss": 0.0924, + "num_input_tokens_seen": 13944136, + "step": 19435 + }, + { + "epoch": 40.41580041580042, + "grad_norm": 0.28494539856910706, + "learning_rate": 2.6101164551134565e-05, + "loss": 0.1002, + "num_input_tokens_seen": 13947720, + "step": 19440 + }, + { + "epoch": 40.42619542619543, + "grad_norm": 0.5772368907928467, + "learning_rate": 2.6091356517511505e-05, + "loss": 0.0716, + "num_input_tokens_seen": 13951240, + "step": 19445 + }, + { + "epoch": 40.436590436590436, + "grad_norm": 0.43955880403518677, + "learning_rate": 2.608154831558755e-05, + "loss": 0.1369, + "num_input_tokens_seen": 13954792, + "step": 19450 + }, + { + "epoch": 40.446985446985444, + "grad_norm": 0.35871103405952454, + "learning_rate": 2.607173994687526e-05, + "loss": 0.1043, + "num_input_tokens_seen": 13958440, + "step": 19455 + }, + { + "epoch": 40.45738045738046, + "grad_norm": 0.20987293124198914, + "learning_rate": 2.6061931412887196e-05, + "loss": 0.118, + "num_input_tokens_seen": 13962088, + "step": 19460 + }, + { + "epoch": 40.46777546777547, + "grad_norm": 0.4345960021018982, + "learning_rate": 2.6052122715135973e-05, + "loss": 0.1121, + "num_input_tokens_seen": 13965704, + "step": 19465 + }, + { + "epoch": 40.478170478170476, + "grad_norm": 0.5768040418624878, + "learning_rate": 2.60423138551342e-05, + "loss": 0.1191, + "num_input_tokens_seen": 13969192, + "step": 19470 + }, + { + "epoch": 40.48856548856549, + "grad_norm": 0.3944752514362335, + "learning_rate": 2.6032504834394527e-05, + "loss": 0.1087, + "num_input_tokens_seen": 13972648, + "step": 19475 + }, + { + "epoch": 40.4989604989605, + "grad_norm": 0.9003592729568481, + "learning_rate": 2.602269565442964e-05, + "loss": 0.1245, + "num_input_tokens_seen": 13976232, + "step": 19480 + }, + { + "epoch": 40.50935550935551, + "grad_norm": 0.35804814100265503, + "learning_rate": 2.6012886316752227e-05, + "loss": 0.0825, + "num_input_tokens_seen": 13979880, + "step": 19485 + }, + { + "epoch": 40.51975051975052, + "grad_norm": 0.18165366351604462, + "learning_rate": 2.6003076822875018e-05, + "loss": 0.114, + "num_input_tokens_seen": 13983496, + "step": 19490 + }, + { + "epoch": 40.53014553014553, + "grad_norm": 0.36806806921958923, + "learning_rate": 2.5993267174310755e-05, + "loss": 0.1237, + "num_input_tokens_seen": 13986952, + "step": 19495 + }, + { + "epoch": 40.54054054054054, + "grad_norm": 0.4343177378177643, + "learning_rate": 2.5983457372572218e-05, + "loss": 0.1106, + "num_input_tokens_seen": 13990504, + "step": 19500 + }, + { + "epoch": 40.55093555093555, + "grad_norm": 0.18776041269302368, + "learning_rate": 2.597364741917219e-05, + "loss": 0.1305, + "num_input_tokens_seen": 13994088, + "step": 19505 + }, + { + "epoch": 40.561330561330564, + "grad_norm": 0.3604666292667389, + "learning_rate": 2.5963837315623492e-05, + "loss": 0.1149, + "num_input_tokens_seen": 13997608, + "step": 19510 + }, + { + "epoch": 40.57172557172557, + "grad_norm": 0.2176283895969391, + "learning_rate": 2.595402706343897e-05, + "loss": 0.1288, + "num_input_tokens_seen": 14001224, + "step": 19515 + }, + { + "epoch": 40.58212058212058, + "grad_norm": 0.3004068434238434, + "learning_rate": 2.594421666413148e-05, + "loss": 0.1199, + "num_input_tokens_seen": 14004968, + "step": 19520 + }, + { + "epoch": 40.59251559251559, + "grad_norm": 0.2583160400390625, + "learning_rate": 2.5934406119213928e-05, + "loss": 0.1351, + "num_input_tokens_seen": 14008680, + "step": 19525 + }, + { + "epoch": 40.602910602910605, + "grad_norm": 0.17249628901481628, + "learning_rate": 2.5924595430199193e-05, + "loss": 0.0759, + "num_input_tokens_seen": 14012200, + "step": 19530 + }, + { + "epoch": 40.61330561330561, + "grad_norm": 0.4501660168170929, + "learning_rate": 2.5914784598600238e-05, + "loss": 0.1067, + "num_input_tokens_seen": 14015560, + "step": 19535 + }, + { + "epoch": 40.62370062370062, + "grad_norm": 0.26225486397743225, + "learning_rate": 2.5904973625930002e-05, + "loss": 0.0805, + "num_input_tokens_seen": 14019144, + "step": 19540 + }, + { + "epoch": 40.63409563409564, + "grad_norm": 0.2015170454978943, + "learning_rate": 2.5895162513701456e-05, + "loss": 0.0871, + "num_input_tokens_seen": 14022600, + "step": 19545 + }, + { + "epoch": 40.644490644490645, + "grad_norm": 0.3625563085079193, + "learning_rate": 2.5885351263427593e-05, + "loss": 0.1298, + "num_input_tokens_seen": 14026184, + "step": 19550 + }, + { + "epoch": 40.65488565488565, + "grad_norm": 0.31193023920059204, + "learning_rate": 2.5875539876621448e-05, + "loss": 0.0962, + "num_input_tokens_seen": 14029768, + "step": 19555 + }, + { + "epoch": 40.66528066528066, + "grad_norm": 0.6131166219711304, + "learning_rate": 2.586572835479605e-05, + "loss": 0.0874, + "num_input_tokens_seen": 14033352, + "step": 19560 + }, + { + "epoch": 40.67567567567568, + "grad_norm": 0.45870256423950195, + "learning_rate": 2.585591669946446e-05, + "loss": 0.1269, + "num_input_tokens_seen": 14036936, + "step": 19565 + }, + { + "epoch": 40.686070686070686, + "grad_norm": 0.17191468179225922, + "learning_rate": 2.5846104912139756e-05, + "loss": 0.1075, + "num_input_tokens_seen": 14040520, + "step": 19570 + }, + { + "epoch": 40.696465696465694, + "grad_norm": 1.013159155845642, + "learning_rate": 2.583629299433505e-05, + "loss": 0.1186, + "num_input_tokens_seen": 14044136, + "step": 19575 + }, + { + "epoch": 40.70686070686071, + "grad_norm": 0.2216557115316391, + "learning_rate": 2.582648094756345e-05, + "loss": 0.0955, + "num_input_tokens_seen": 14047816, + "step": 19580 + }, + { + "epoch": 40.71725571725572, + "grad_norm": 0.25772911310195923, + "learning_rate": 2.5816668773338098e-05, + "loss": 0.1218, + "num_input_tokens_seen": 14051336, + "step": 19585 + }, + { + "epoch": 40.727650727650726, + "grad_norm": 0.18427696824073792, + "learning_rate": 2.580685647317216e-05, + "loss": 0.0855, + "num_input_tokens_seen": 14055112, + "step": 19590 + }, + { + "epoch": 40.73804573804574, + "grad_norm": 0.39436182379722595, + "learning_rate": 2.5797044048578818e-05, + "loss": 0.1172, + "num_input_tokens_seen": 14058728, + "step": 19595 + }, + { + "epoch": 40.74844074844075, + "grad_norm": 0.5472216010093689, + "learning_rate": 2.5787231501071262e-05, + "loss": 0.0833, + "num_input_tokens_seen": 14062184, + "step": 19600 + }, + { + "epoch": 40.74844074844075, + "eval_loss": 0.14985831081867218, + "eval_runtime": 7.7472, + "eval_samples_per_second": 110.492, + "eval_steps_per_second": 27.623, + "num_input_tokens_seen": 14062184, + "step": 19600 + }, + { + "epoch": 40.75883575883576, + "grad_norm": 0.4063683748245239, + "learning_rate": 2.577741883216272e-05, + "loss": 0.1031, + "num_input_tokens_seen": 14065640, + "step": 19605 + }, + { + "epoch": 40.76923076923077, + "grad_norm": 0.5220100283622742, + "learning_rate": 2.576760604336642e-05, + "loss": 0.1213, + "num_input_tokens_seen": 14069480, + "step": 19610 + }, + { + "epoch": 40.77962577962578, + "grad_norm": 0.54262375831604, + "learning_rate": 2.575779313619563e-05, + "loss": 0.1251, + "num_input_tokens_seen": 14073160, + "step": 19615 + }, + { + "epoch": 40.79002079002079, + "grad_norm": 0.37337443232536316, + "learning_rate": 2.5747980112163605e-05, + "loss": 0.1094, + "num_input_tokens_seen": 14076936, + "step": 19620 + }, + { + "epoch": 40.8004158004158, + "grad_norm": 0.2975703477859497, + "learning_rate": 2.5738166972783656e-05, + "loss": 0.1263, + "num_input_tokens_seen": 14080520, + "step": 19625 + }, + { + "epoch": 40.810810810810814, + "grad_norm": 0.3265359401702881, + "learning_rate": 2.5728353719569075e-05, + "loss": 0.1057, + "num_input_tokens_seen": 14084264, + "step": 19630 + }, + { + "epoch": 40.82120582120582, + "grad_norm": 0.3872709572315216, + "learning_rate": 2.57185403540332e-05, + "loss": 0.1588, + "num_input_tokens_seen": 14087848, + "step": 19635 + }, + { + "epoch": 40.83160083160083, + "grad_norm": 0.40704602003097534, + "learning_rate": 2.5708726877689375e-05, + "loss": 0.1101, + "num_input_tokens_seen": 14091432, + "step": 19640 + }, + { + "epoch": 40.84199584199584, + "grad_norm": 0.24919229745864868, + "learning_rate": 2.5698913292050964e-05, + "loss": 0.1132, + "num_input_tokens_seen": 14094984, + "step": 19645 + }, + { + "epoch": 40.852390852390855, + "grad_norm": 0.2843540906906128, + "learning_rate": 2.568909959863133e-05, + "loss": 0.0819, + "num_input_tokens_seen": 14098536, + "step": 19650 + }, + { + "epoch": 40.86278586278586, + "grad_norm": 0.24741551280021667, + "learning_rate": 2.5679285798943887e-05, + "loss": 0.1112, + "num_input_tokens_seen": 14101992, + "step": 19655 + }, + { + "epoch": 40.87318087318087, + "grad_norm": 0.4627975523471832, + "learning_rate": 2.5669471894502035e-05, + "loss": 0.1011, + "num_input_tokens_seen": 14105640, + "step": 19660 + }, + { + "epoch": 40.88357588357589, + "grad_norm": 0.13452991843223572, + "learning_rate": 2.56596578868192e-05, + "loss": 0.0526, + "num_input_tokens_seen": 14109224, + "step": 19665 + }, + { + "epoch": 40.893970893970895, + "grad_norm": 0.6691818833351135, + "learning_rate": 2.564984377740883e-05, + "loss": 0.1065, + "num_input_tokens_seen": 14112680, + "step": 19670 + }, + { + "epoch": 40.9043659043659, + "grad_norm": 0.20730078220367432, + "learning_rate": 2.564002956778438e-05, + "loss": 0.1169, + "num_input_tokens_seen": 14116360, + "step": 19675 + }, + { + "epoch": 40.91476091476091, + "grad_norm": 0.16619811952114105, + "learning_rate": 2.563021525945934e-05, + "loss": 0.1043, + "num_input_tokens_seen": 14119912, + "step": 19680 + }, + { + "epoch": 40.92515592515593, + "grad_norm": 0.6913954019546509, + "learning_rate": 2.562040085394718e-05, + "loss": 0.1243, + "num_input_tokens_seen": 14123432, + "step": 19685 + }, + { + "epoch": 40.935550935550935, + "grad_norm": 0.7154430747032166, + "learning_rate": 2.56105863527614e-05, + "loss": 0.1767, + "num_input_tokens_seen": 14127048, + "step": 19690 + }, + { + "epoch": 40.945945945945944, + "grad_norm": 0.6216135025024414, + "learning_rate": 2.5600771757415548e-05, + "loss": 0.0934, + "num_input_tokens_seen": 14130664, + "step": 19695 + }, + { + "epoch": 40.95634095634096, + "grad_norm": 0.33651596307754517, + "learning_rate": 2.5590957069423134e-05, + "loss": 0.0775, + "num_input_tokens_seen": 14134472, + "step": 19700 + }, + { + "epoch": 40.96673596673597, + "grad_norm": 0.4824076294898987, + "learning_rate": 2.5581142290297716e-05, + "loss": 0.1645, + "num_input_tokens_seen": 14138120, + "step": 19705 + }, + { + "epoch": 40.977130977130976, + "grad_norm": 0.20306406915187836, + "learning_rate": 2.557132742155285e-05, + "loss": 0.1498, + "num_input_tokens_seen": 14141672, + "step": 19710 + }, + { + "epoch": 40.987525987525984, + "grad_norm": 0.23502567410469055, + "learning_rate": 2.556151246470212e-05, + "loss": 0.1048, + "num_input_tokens_seen": 14145256, + "step": 19715 + }, + { + "epoch": 40.997920997921, + "grad_norm": 0.32236745953559875, + "learning_rate": 2.5551697421259114e-05, + "loss": 0.0807, + "num_input_tokens_seen": 14148936, + "step": 19720 + }, + { + "epoch": 41.00831600831601, + "grad_norm": 0.3181665539741516, + "learning_rate": 2.554188229273743e-05, + "loss": 0.0874, + "num_input_tokens_seen": 14152552, + "step": 19725 + }, + { + "epoch": 41.018711018711016, + "grad_norm": 0.5975819826126099, + "learning_rate": 2.5532067080650678e-05, + "loss": 0.1037, + "num_input_tokens_seen": 14156072, + "step": 19730 + }, + { + "epoch": 41.02910602910603, + "grad_norm": 0.10531169921159744, + "learning_rate": 2.55222517865125e-05, + "loss": 0.0711, + "num_input_tokens_seen": 14159624, + "step": 19735 + }, + { + "epoch": 41.03950103950104, + "grad_norm": 0.29332584142684937, + "learning_rate": 2.5512436411836538e-05, + "loss": 0.1206, + "num_input_tokens_seen": 14163176, + "step": 19740 + }, + { + "epoch": 41.04989604989605, + "grad_norm": 0.238509401679039, + "learning_rate": 2.5502620958136443e-05, + "loss": 0.1334, + "num_input_tokens_seen": 14166696, + "step": 19745 + }, + { + "epoch": 41.06029106029106, + "grad_norm": 0.725173830986023, + "learning_rate": 2.5492805426925874e-05, + "loss": 0.1249, + "num_input_tokens_seen": 14170312, + "step": 19750 + }, + { + "epoch": 41.07068607068607, + "grad_norm": 0.3818502128124237, + "learning_rate": 2.5482989819718523e-05, + "loss": 0.096, + "num_input_tokens_seen": 14173768, + "step": 19755 + }, + { + "epoch": 41.08108108108108, + "grad_norm": 0.4680292010307312, + "learning_rate": 2.5473174138028065e-05, + "loss": 0.112, + "num_input_tokens_seen": 14177384, + "step": 19760 + }, + { + "epoch": 41.09147609147609, + "grad_norm": 0.5346848964691162, + "learning_rate": 2.5463358383368212e-05, + "loss": 0.1113, + "num_input_tokens_seen": 14181000, + "step": 19765 + }, + { + "epoch": 41.101871101871104, + "grad_norm": 0.15050996840000153, + "learning_rate": 2.545354255725267e-05, + "loss": 0.1015, + "num_input_tokens_seen": 14184616, + "step": 19770 + }, + { + "epoch": 41.11226611226611, + "grad_norm": 0.2223748117685318, + "learning_rate": 2.5443726661195165e-05, + "loss": 0.0955, + "num_input_tokens_seen": 14188328, + "step": 19775 + }, + { + "epoch": 41.12266112266112, + "grad_norm": 0.29449009895324707, + "learning_rate": 2.543391069670944e-05, + "loss": 0.0877, + "num_input_tokens_seen": 14192168, + "step": 19780 + }, + { + "epoch": 41.13305613305613, + "grad_norm": 0.3132009506225586, + "learning_rate": 2.5424094665309228e-05, + "loss": 0.0923, + "num_input_tokens_seen": 14195912, + "step": 19785 + }, + { + "epoch": 41.143451143451145, + "grad_norm": 0.34734466671943665, + "learning_rate": 2.5414278568508292e-05, + "loss": 0.0996, + "num_input_tokens_seen": 14199560, + "step": 19790 + }, + { + "epoch": 41.15384615384615, + "grad_norm": 0.2993946969509125, + "learning_rate": 2.540446240782039e-05, + "loss": 0.1257, + "num_input_tokens_seen": 14203240, + "step": 19795 + }, + { + "epoch": 41.16424116424116, + "grad_norm": 0.21708260476589203, + "learning_rate": 2.5394646184759307e-05, + "loss": 0.1225, + "num_input_tokens_seen": 14206632, + "step": 19800 + }, + { + "epoch": 41.16424116424116, + "eval_loss": 0.14177359640598297, + "eval_runtime": 7.7524, + "eval_samples_per_second": 110.418, + "eval_steps_per_second": 27.604, + "num_input_tokens_seen": 14206632, + "step": 19800 + }, + { + "epoch": 41.17463617463618, + "grad_norm": 0.3955796957015991, + "learning_rate": 2.538482990083882e-05, + "loss": 0.116, + "num_input_tokens_seen": 14210280, + "step": 19805 + }, + { + "epoch": 41.185031185031185, + "grad_norm": 0.7009750604629517, + "learning_rate": 2.5375013557572725e-05, + "loss": 0.1144, + "num_input_tokens_seen": 14213864, + "step": 19810 + }, + { + "epoch": 41.195426195426194, + "grad_norm": 0.13556092977523804, + "learning_rate": 2.536519715647483e-05, + "loss": 0.0823, + "num_input_tokens_seen": 14217576, + "step": 19815 + }, + { + "epoch": 41.20582120582121, + "grad_norm": 0.6561002135276794, + "learning_rate": 2.535538069905894e-05, + "loss": 0.0784, + "num_input_tokens_seen": 14221064, + "step": 19820 + }, + { + "epoch": 41.21621621621622, + "grad_norm": 0.3688555657863617, + "learning_rate": 2.534556418683888e-05, + "loss": 0.1212, + "num_input_tokens_seen": 14224616, + "step": 19825 + }, + { + "epoch": 41.226611226611226, + "grad_norm": 0.2559059262275696, + "learning_rate": 2.5335747621328486e-05, + "loss": 0.1145, + "num_input_tokens_seen": 14228168, + "step": 19830 + }, + { + "epoch": 41.237006237006234, + "grad_norm": 0.6115173697471619, + "learning_rate": 2.5325931004041586e-05, + "loss": 0.1731, + "num_input_tokens_seen": 14231592, + "step": 19835 + }, + { + "epoch": 41.24740124740125, + "grad_norm": 0.16178083419799805, + "learning_rate": 2.5316114336492032e-05, + "loss": 0.0622, + "num_input_tokens_seen": 14235208, + "step": 19840 + }, + { + "epoch": 41.25779625779626, + "grad_norm": 0.19360151886940002, + "learning_rate": 2.530629762019367e-05, + "loss": 0.0827, + "num_input_tokens_seen": 14238984, + "step": 19845 + }, + { + "epoch": 41.268191268191266, + "grad_norm": 0.22946445643901825, + "learning_rate": 2.5296480856660364e-05, + "loss": 0.0933, + "num_input_tokens_seen": 14242344, + "step": 19850 + }, + { + "epoch": 41.27858627858628, + "grad_norm": 0.9402499198913574, + "learning_rate": 2.528666404740599e-05, + "loss": 0.1189, + "num_input_tokens_seen": 14245864, + "step": 19855 + }, + { + "epoch": 41.28898128898129, + "grad_norm": 0.2666162848472595, + "learning_rate": 2.527684719394442e-05, + "loss": 0.0764, + "num_input_tokens_seen": 14249512, + "step": 19860 + }, + { + "epoch": 41.2993762993763, + "grad_norm": 0.28479862213134766, + "learning_rate": 2.526703029778953e-05, + "loss": 0.1293, + "num_input_tokens_seen": 14252936, + "step": 19865 + }, + { + "epoch": 41.30977130977131, + "grad_norm": 0.21400600671768188, + "learning_rate": 2.5257213360455208e-05, + "loss": 0.1024, + "num_input_tokens_seen": 14256264, + "step": 19870 + }, + { + "epoch": 41.32016632016632, + "grad_norm": 0.2561366856098175, + "learning_rate": 2.5247396383455353e-05, + "loss": 0.1009, + "num_input_tokens_seen": 14259784, + "step": 19875 + }, + { + "epoch": 41.33056133056133, + "grad_norm": 0.47869500517845154, + "learning_rate": 2.523757936830387e-05, + "loss": 0.1154, + "num_input_tokens_seen": 14263400, + "step": 19880 + }, + { + "epoch": 41.34095634095634, + "grad_norm": 0.3302435874938965, + "learning_rate": 2.5227762316514662e-05, + "loss": 0.0957, + "num_input_tokens_seen": 14267080, + "step": 19885 + }, + { + "epoch": 41.351351351351354, + "grad_norm": 0.4462837278842926, + "learning_rate": 2.5217945229601648e-05, + "loss": 0.1074, + "num_input_tokens_seen": 14270632, + "step": 19890 + }, + { + "epoch": 41.36174636174636, + "grad_norm": 0.3197474181652069, + "learning_rate": 2.5208128109078738e-05, + "loss": 0.0806, + "num_input_tokens_seen": 14274312, + "step": 19895 + }, + { + "epoch": 41.37214137214137, + "grad_norm": 0.1958031952381134, + "learning_rate": 2.5198310956459853e-05, + "loss": 0.0875, + "num_input_tokens_seen": 14278088, + "step": 19900 + }, + { + "epoch": 41.38253638253638, + "grad_norm": 0.19941715896129608, + "learning_rate": 2.518849377325893e-05, + "loss": 0.1189, + "num_input_tokens_seen": 14281640, + "step": 19905 + }, + { + "epoch": 41.392931392931395, + "grad_norm": 0.2657510042190552, + "learning_rate": 2.51786765609899e-05, + "loss": 0.1073, + "num_input_tokens_seen": 14285064, + "step": 19910 + }, + { + "epoch": 41.4033264033264, + "grad_norm": 0.6484656929969788, + "learning_rate": 2.5168859321166694e-05, + "loss": 0.1283, + "num_input_tokens_seen": 14288584, + "step": 19915 + }, + { + "epoch": 41.41372141372141, + "grad_norm": 0.21557164192199707, + "learning_rate": 2.515904205530326e-05, + "loss": 0.0877, + "num_input_tokens_seen": 14292104, + "step": 19920 + }, + { + "epoch": 41.42411642411643, + "grad_norm": 0.22848935425281525, + "learning_rate": 2.514922476491355e-05, + "loss": 0.0891, + "num_input_tokens_seen": 14295720, + "step": 19925 + }, + { + "epoch": 41.434511434511435, + "grad_norm": 0.4000210464000702, + "learning_rate": 2.51394074515115e-05, + "loss": 0.1466, + "num_input_tokens_seen": 14299432, + "step": 19930 + }, + { + "epoch": 41.444906444906444, + "grad_norm": 0.6227059960365295, + "learning_rate": 2.5129590116611067e-05, + "loss": 0.0683, + "num_input_tokens_seen": 14303112, + "step": 19935 + }, + { + "epoch": 41.45530145530145, + "grad_norm": 0.37655484676361084, + "learning_rate": 2.5119772761726212e-05, + "loss": 0.161, + "num_input_tokens_seen": 14306728, + "step": 19940 + }, + { + "epoch": 41.46569646569647, + "grad_norm": 0.5280719995498657, + "learning_rate": 2.5109955388370893e-05, + "loss": 0.108, + "num_input_tokens_seen": 14310376, + "step": 19945 + }, + { + "epoch": 41.476091476091476, + "grad_norm": 0.2470412701368332, + "learning_rate": 2.510013799805907e-05, + "loss": 0.0978, + "num_input_tokens_seen": 14314024, + "step": 19950 + }, + { + "epoch": 41.486486486486484, + "grad_norm": 0.2935231328010559, + "learning_rate": 2.5090320592304706e-05, + "loss": 0.127, + "num_input_tokens_seen": 14317640, + "step": 19955 + }, + { + "epoch": 41.4968814968815, + "grad_norm": 1.0749189853668213, + "learning_rate": 2.5080503172621777e-05, + "loss": 0.1159, + "num_input_tokens_seen": 14321192, + "step": 19960 + }, + { + "epoch": 41.50727650727651, + "grad_norm": 0.3525862395763397, + "learning_rate": 2.5070685740524246e-05, + "loss": 0.0988, + "num_input_tokens_seen": 14324808, + "step": 19965 + }, + { + "epoch": 41.517671517671516, + "grad_norm": 0.15637996792793274, + "learning_rate": 2.5060868297526084e-05, + "loss": 0.0463, + "num_input_tokens_seen": 14328296, + "step": 19970 + }, + { + "epoch": 41.528066528066525, + "grad_norm": 0.20142562687397003, + "learning_rate": 2.5051050845141267e-05, + "loss": 0.1119, + "num_input_tokens_seen": 14331784, + "step": 19975 + }, + { + "epoch": 41.53846153846154, + "grad_norm": 0.3442474901676178, + "learning_rate": 2.5041233384883765e-05, + "loss": 0.1058, + "num_input_tokens_seen": 14335368, + "step": 19980 + }, + { + "epoch": 41.54885654885655, + "grad_norm": 0.4611093997955322, + "learning_rate": 2.5031415918267564e-05, + "loss": 0.1045, + "num_input_tokens_seen": 14338920, + "step": 19985 + }, + { + "epoch": 41.55925155925156, + "grad_norm": 0.46915122866630554, + "learning_rate": 2.5021598446806626e-05, + "loss": 0.0831, + "num_input_tokens_seen": 14342344, + "step": 19990 + }, + { + "epoch": 41.56964656964657, + "grad_norm": 0.8812874555587769, + "learning_rate": 2.5011780972014937e-05, + "loss": 0.0998, + "num_input_tokens_seen": 14346280, + "step": 19995 + }, + { + "epoch": 41.58004158004158, + "grad_norm": 0.3036177456378937, + "learning_rate": 2.5001963495406478e-05, + "loss": 0.1059, + "num_input_tokens_seen": 14349800, + "step": 20000 + }, + { + "epoch": 41.58004158004158, + "eval_loss": 0.14877921342849731, + "eval_runtime": 7.7572, + "eval_samples_per_second": 110.349, + "eval_steps_per_second": 27.587, + "num_input_tokens_seen": 14349800, + "step": 20000 + }, + { + "epoch": 41.59043659043659, + "grad_norm": 0.173417329788208, + "learning_rate": 2.499214601849522e-05, + "loss": 0.0657, + "num_input_tokens_seen": 14353288, + "step": 20005 + }, + { + "epoch": 41.6008316008316, + "grad_norm": 0.2924911677837372, + "learning_rate": 2.4982328542795148e-05, + "loss": 0.1217, + "num_input_tokens_seen": 14357096, + "step": 20010 + }, + { + "epoch": 41.61122661122661, + "grad_norm": 0.12828238308429718, + "learning_rate": 2.497251106982024e-05, + "loss": 0.0884, + "num_input_tokens_seen": 14360744, + "step": 20015 + }, + { + "epoch": 41.62162162162162, + "grad_norm": 0.22078098356723785, + "learning_rate": 2.4962693601084458e-05, + "loss": 0.0816, + "num_input_tokens_seen": 14364264, + "step": 20020 + }, + { + "epoch": 41.63201663201663, + "grad_norm": 0.3396976590156555, + "learning_rate": 2.4952876138101794e-05, + "loss": 0.0975, + "num_input_tokens_seen": 14367752, + "step": 20025 + }, + { + "epoch": 41.642411642411645, + "grad_norm": 0.2632692754268646, + "learning_rate": 2.4943058682386233e-05, + "loss": 0.1077, + "num_input_tokens_seen": 14371336, + "step": 20030 + }, + { + "epoch": 41.65280665280665, + "grad_norm": 0.1517001837491989, + "learning_rate": 2.493324123545173e-05, + "loss": 0.082, + "num_input_tokens_seen": 14374920, + "step": 20035 + }, + { + "epoch": 41.66320166320166, + "grad_norm": 0.24038222432136536, + "learning_rate": 2.4923423798812272e-05, + "loss": 0.0976, + "num_input_tokens_seen": 14378472, + "step": 20040 + }, + { + "epoch": 41.67359667359668, + "grad_norm": 0.30391600728034973, + "learning_rate": 2.4913606373981825e-05, + "loss": 0.1261, + "num_input_tokens_seen": 14381960, + "step": 20045 + }, + { + "epoch": 41.683991683991685, + "grad_norm": 0.6312097311019897, + "learning_rate": 2.4903788962474357e-05, + "loss": 0.0845, + "num_input_tokens_seen": 14385384, + "step": 20050 + }, + { + "epoch": 41.694386694386694, + "grad_norm": 1.0501385927200317, + "learning_rate": 2.489397156580385e-05, + "loss": 0.1474, + "num_input_tokens_seen": 14389032, + "step": 20055 + }, + { + "epoch": 41.7047817047817, + "grad_norm": 0.8787755370140076, + "learning_rate": 2.4884154185484246e-05, + "loss": 0.1773, + "num_input_tokens_seen": 14392776, + "step": 20060 + }, + { + "epoch": 41.71517671517672, + "grad_norm": 0.25355157256126404, + "learning_rate": 2.4874336823029526e-05, + "loss": 0.1164, + "num_input_tokens_seen": 14396392, + "step": 20065 + }, + { + "epoch": 41.725571725571726, + "grad_norm": 0.44221416115760803, + "learning_rate": 2.4864519479953656e-05, + "loss": 0.112, + "num_input_tokens_seen": 14399816, + "step": 20070 + }, + { + "epoch": 41.735966735966734, + "grad_norm": 0.3966493010520935, + "learning_rate": 2.485470215777058e-05, + "loss": 0.1067, + "num_input_tokens_seen": 14403592, + "step": 20075 + }, + { + "epoch": 41.74636174636175, + "grad_norm": 0.26086854934692383, + "learning_rate": 2.4844884857994258e-05, + "loss": 0.1364, + "num_input_tokens_seen": 14407272, + "step": 20080 + }, + { + "epoch": 41.75675675675676, + "grad_norm": 0.36126992106437683, + "learning_rate": 2.4835067582138638e-05, + "loss": 0.0767, + "num_input_tokens_seen": 14410760, + "step": 20085 + }, + { + "epoch": 41.767151767151766, + "grad_norm": 0.17406466603279114, + "learning_rate": 2.4825250331717666e-05, + "loss": 0.0936, + "num_input_tokens_seen": 14414408, + "step": 20090 + }, + { + "epoch": 41.777546777546775, + "grad_norm": 0.6351653337478638, + "learning_rate": 2.4815433108245298e-05, + "loss": 0.1419, + "num_input_tokens_seen": 14417992, + "step": 20095 + }, + { + "epoch": 41.78794178794179, + "grad_norm": 0.4067232012748718, + "learning_rate": 2.4805615913235456e-05, + "loss": 0.106, + "num_input_tokens_seen": 14421544, + "step": 20100 + }, + { + "epoch": 41.7983367983368, + "grad_norm": 0.185044065117836, + "learning_rate": 2.479579874820208e-05, + "loss": 0.1103, + "num_input_tokens_seen": 14425064, + "step": 20105 + }, + { + "epoch": 41.80873180873181, + "grad_norm": 0.28080475330352783, + "learning_rate": 2.4785981614659115e-05, + "loss": 0.1449, + "num_input_tokens_seen": 14428552, + "step": 20110 + }, + { + "epoch": 41.81912681912682, + "grad_norm": 0.2404882162809372, + "learning_rate": 2.477616451412047e-05, + "loss": 0.1164, + "num_input_tokens_seen": 14432072, + "step": 20115 + }, + { + "epoch": 41.82952182952183, + "grad_norm": 0.32969167828559875, + "learning_rate": 2.476634744810007e-05, + "loss": 0.0574, + "num_input_tokens_seen": 14435624, + "step": 20120 + }, + { + "epoch": 41.83991683991684, + "grad_norm": 0.47260573506355286, + "learning_rate": 2.475653041811183e-05, + "loss": 0.0878, + "num_input_tokens_seen": 14439144, + "step": 20125 + }, + { + "epoch": 41.85031185031185, + "grad_norm": 0.4875951409339905, + "learning_rate": 2.4746713425669652e-05, + "loss": 0.1448, + "num_input_tokens_seen": 14442632, + "step": 20130 + }, + { + "epoch": 41.86070686070686, + "grad_norm": 0.2073792815208435, + "learning_rate": 2.4736896472287458e-05, + "loss": 0.1563, + "num_input_tokens_seen": 14446408, + "step": 20135 + }, + { + "epoch": 41.87110187110187, + "grad_norm": 0.5393523573875427, + "learning_rate": 2.4727079559479124e-05, + "loss": 0.0934, + "num_input_tokens_seen": 14450152, + "step": 20140 + }, + { + "epoch": 41.88149688149688, + "grad_norm": 0.24641883373260498, + "learning_rate": 2.4717262688758557e-05, + "loss": 0.117, + "num_input_tokens_seen": 14453704, + "step": 20145 + }, + { + "epoch": 41.891891891891895, + "grad_norm": 0.3116382360458374, + "learning_rate": 2.4707445861639637e-05, + "loss": 0.1158, + "num_input_tokens_seen": 14457352, + "step": 20150 + }, + { + "epoch": 41.9022869022869, + "grad_norm": 0.21338213980197906, + "learning_rate": 2.4697629079636244e-05, + "loss": 0.0856, + "num_input_tokens_seen": 14460968, + "step": 20155 + }, + { + "epoch": 41.91268191268191, + "grad_norm": 0.1650713086128235, + "learning_rate": 2.4687812344262244e-05, + "loss": 0.0886, + "num_input_tokens_seen": 14464328, + "step": 20160 + }, + { + "epoch": 41.92307692307692, + "grad_norm": 0.16223302483558655, + "learning_rate": 2.46779956570315e-05, + "loss": 0.1592, + "num_input_tokens_seen": 14467912, + "step": 20165 + }, + { + "epoch": 41.933471933471935, + "grad_norm": 0.6677025556564331, + "learning_rate": 2.466817901945787e-05, + "loss": 0.1605, + "num_input_tokens_seen": 14471336, + "step": 20170 + }, + { + "epoch": 41.943866943866944, + "grad_norm": 0.42056408524513245, + "learning_rate": 2.4658362433055217e-05, + "loss": 0.0829, + "num_input_tokens_seen": 14474920, + "step": 20175 + }, + { + "epoch": 41.95426195426195, + "grad_norm": 0.49364325404167175, + "learning_rate": 2.4648545899337356e-05, + "loss": 0.1055, + "num_input_tokens_seen": 14478568, + "step": 20180 + }, + { + "epoch": 41.96465696465697, + "grad_norm": 0.26650479435920715, + "learning_rate": 2.4638729419818143e-05, + "loss": 0.0906, + "num_input_tokens_seen": 14482376, + "step": 20185 + }, + { + "epoch": 41.975051975051976, + "grad_norm": 0.22683526575565338, + "learning_rate": 2.46289129960114e-05, + "loss": 0.0978, + "num_input_tokens_seen": 14485960, + "step": 20190 + }, + { + "epoch": 41.985446985446984, + "grad_norm": 0.6211627721786499, + "learning_rate": 2.4619096629430924e-05, + "loss": 0.1086, + "num_input_tokens_seen": 14489576, + "step": 20195 + }, + { + "epoch": 41.99584199584199, + "grad_norm": 0.2051703929901123, + "learning_rate": 2.4609280321590543e-05, + "loss": 0.1191, + "num_input_tokens_seen": 14493096, + "step": 20200 + }, + { + "epoch": 41.99584199584199, + "eval_loss": 0.14557473361492157, + "eval_runtime": 7.7612, + "eval_samples_per_second": 110.293, + "eval_steps_per_second": 27.573, + "num_input_tokens_seen": 14493096, + "step": 20200 + }, + { + "epoch": 42.00623700623701, + "grad_norm": 0.5810629725456238, + "learning_rate": 2.4599464074004037e-05, + "loss": 0.1255, + "num_input_tokens_seen": 14496696, + "step": 20205 + }, + { + "epoch": 42.016632016632016, + "grad_norm": 0.18402321636676788, + "learning_rate": 2.4589647888185204e-05, + "loss": 0.0832, + "num_input_tokens_seen": 14500280, + "step": 20210 + }, + { + "epoch": 42.027027027027025, + "grad_norm": 0.5301243662834167, + "learning_rate": 2.4579831765647836e-05, + "loss": 0.1136, + "num_input_tokens_seen": 14503768, + "step": 20215 + }, + { + "epoch": 42.03742203742204, + "grad_norm": 0.5616496801376343, + "learning_rate": 2.4570015707905676e-05, + "loss": 0.1349, + "num_input_tokens_seen": 14507352, + "step": 20220 + }, + { + "epoch": 42.04781704781705, + "grad_norm": 0.2916930317878723, + "learning_rate": 2.4560199716472508e-05, + "loss": 0.1272, + "num_input_tokens_seen": 14511064, + "step": 20225 + }, + { + "epoch": 42.05821205821206, + "grad_norm": 0.3257554769515991, + "learning_rate": 2.455038379286207e-05, + "loss": 0.0906, + "num_input_tokens_seen": 14514808, + "step": 20230 + }, + { + "epoch": 42.06860706860707, + "grad_norm": 0.3326234519481659, + "learning_rate": 2.4540567938588095e-05, + "loss": 0.0993, + "num_input_tokens_seen": 14518296, + "step": 20235 + }, + { + "epoch": 42.07900207900208, + "grad_norm": 0.19148418307304382, + "learning_rate": 2.4530752155164328e-05, + "loss": 0.143, + "num_input_tokens_seen": 14522040, + "step": 20240 + }, + { + "epoch": 42.08939708939709, + "grad_norm": 0.42942821979522705, + "learning_rate": 2.4520936444104463e-05, + "loss": 0.1421, + "num_input_tokens_seen": 14525656, + "step": 20245 + }, + { + "epoch": 42.0997920997921, + "grad_norm": 0.3060411214828491, + "learning_rate": 2.4511120806922218e-05, + "loss": 0.115, + "num_input_tokens_seen": 14529240, + "step": 20250 + }, + { + "epoch": 42.11018711018711, + "grad_norm": 0.22840383648872375, + "learning_rate": 2.45013052451313e-05, + "loss": 0.0748, + "num_input_tokens_seen": 14532760, + "step": 20255 + }, + { + "epoch": 42.12058212058212, + "grad_norm": 0.43203604221343994, + "learning_rate": 2.4491489760245376e-05, + "loss": 0.1127, + "num_input_tokens_seen": 14536376, + "step": 20260 + }, + { + "epoch": 42.13097713097713, + "grad_norm": 0.4423207640647888, + "learning_rate": 2.4481674353778115e-05, + "loss": 0.0639, + "num_input_tokens_seen": 14539928, + "step": 20265 + }, + { + "epoch": 42.141372141372145, + "grad_norm": 0.20514510571956635, + "learning_rate": 2.447185902724319e-05, + "loss": 0.087, + "num_input_tokens_seen": 14543448, + "step": 20270 + }, + { + "epoch": 42.15176715176715, + "grad_norm": 0.4876963496208191, + "learning_rate": 2.4462043782154233e-05, + "loss": 0.0869, + "num_input_tokens_seen": 14547160, + "step": 20275 + }, + { + "epoch": 42.16216216216216, + "grad_norm": 0.20063959062099457, + "learning_rate": 2.4452228620024895e-05, + "loss": 0.0755, + "num_input_tokens_seen": 14550776, + "step": 20280 + }, + { + "epoch": 42.17255717255717, + "grad_norm": 0.19721323251724243, + "learning_rate": 2.4442413542368776e-05, + "loss": 0.0858, + "num_input_tokens_seen": 14554552, + "step": 20285 + }, + { + "epoch": 42.182952182952185, + "grad_norm": 0.152627095580101, + "learning_rate": 2.4432598550699502e-05, + "loss": 0.1202, + "num_input_tokens_seen": 14558200, + "step": 20290 + }, + { + "epoch": 42.19334719334719, + "grad_norm": 0.2879612147808075, + "learning_rate": 2.4422783646530663e-05, + "loss": 0.1043, + "num_input_tokens_seen": 14561816, + "step": 20295 + }, + { + "epoch": 42.2037422037422, + "grad_norm": 0.2354881465435028, + "learning_rate": 2.441296883137584e-05, + "loss": 0.0872, + "num_input_tokens_seen": 14565464, + "step": 20300 + }, + { + "epoch": 42.21413721413722, + "grad_norm": 0.5070115923881531, + "learning_rate": 2.4403154106748592e-05, + "loss": 0.1058, + "num_input_tokens_seen": 14569144, + "step": 20305 + }, + { + "epoch": 42.224532224532226, + "grad_norm": 0.259208083152771, + "learning_rate": 2.4393339474162494e-05, + "loss": 0.1257, + "num_input_tokens_seen": 14572824, + "step": 20310 + }, + { + "epoch": 42.234927234927234, + "grad_norm": 0.5698161125183105, + "learning_rate": 2.4383524935131062e-05, + "loss": 0.1553, + "num_input_tokens_seen": 14576504, + "step": 20315 + }, + { + "epoch": 42.24532224532224, + "grad_norm": 0.27325209975242615, + "learning_rate": 2.437371049116784e-05, + "loss": 0.1092, + "num_input_tokens_seen": 14579992, + "step": 20320 + }, + { + "epoch": 42.25571725571726, + "grad_norm": 0.3169372081756592, + "learning_rate": 2.436389614378632e-05, + "loss": 0.1101, + "num_input_tokens_seen": 14583480, + "step": 20325 + }, + { + "epoch": 42.266112266112266, + "grad_norm": 0.4268248975276947, + "learning_rate": 2.435408189450002e-05, + "loss": 0.1007, + "num_input_tokens_seen": 14587032, + "step": 20330 + }, + { + "epoch": 42.276507276507274, + "grad_norm": 0.3564453125, + "learning_rate": 2.4344267744822406e-05, + "loss": 0.0564, + "num_input_tokens_seen": 14590424, + "step": 20335 + }, + { + "epoch": 42.28690228690229, + "grad_norm": 0.18504516780376434, + "learning_rate": 2.4334453696266944e-05, + "loss": 0.0855, + "num_input_tokens_seen": 14594200, + "step": 20340 + }, + { + "epoch": 42.2972972972973, + "grad_norm": 0.49176767468452454, + "learning_rate": 2.432463975034708e-05, + "loss": 0.063, + "num_input_tokens_seen": 14597656, + "step": 20345 + }, + { + "epoch": 42.30769230769231, + "grad_norm": 0.28995272517204285, + "learning_rate": 2.4314825908576265e-05, + "loss": 0.1336, + "num_input_tokens_seen": 14601240, + "step": 20350 + }, + { + "epoch": 42.318087318087315, + "grad_norm": 0.3541010916233063, + "learning_rate": 2.4305012172467897e-05, + "loss": 0.09, + "num_input_tokens_seen": 14604600, + "step": 20355 + }, + { + "epoch": 42.32848232848233, + "grad_norm": 0.6911720633506775, + "learning_rate": 2.4295198543535393e-05, + "loss": 0.111, + "num_input_tokens_seen": 14608152, + "step": 20360 + }, + { + "epoch": 42.33887733887734, + "grad_norm": 0.6932796835899353, + "learning_rate": 2.4285385023292124e-05, + "loss": 0.1055, + "num_input_tokens_seen": 14611768, + "step": 20365 + }, + { + "epoch": 42.34927234927235, + "grad_norm": 0.17226341366767883, + "learning_rate": 2.427557161325147e-05, + "loss": 0.0937, + "num_input_tokens_seen": 14615384, + "step": 20370 + }, + { + "epoch": 42.35966735966736, + "grad_norm": 0.2326127141714096, + "learning_rate": 2.4265758314926778e-05, + "loss": 0.1179, + "num_input_tokens_seen": 14618968, + "step": 20375 + }, + { + "epoch": 42.37006237006237, + "grad_norm": 0.27838948369026184, + "learning_rate": 2.4255945129831373e-05, + "loss": 0.0855, + "num_input_tokens_seen": 14622424, + "step": 20380 + }, + { + "epoch": 42.38045738045738, + "grad_norm": 0.4001257121562958, + "learning_rate": 2.4246132059478578e-05, + "loss": 0.1096, + "num_input_tokens_seen": 14626008, + "step": 20385 + }, + { + "epoch": 42.39085239085239, + "grad_norm": 0.3123578727245331, + "learning_rate": 2.4236319105381706e-05, + "loss": 0.0828, + "num_input_tokens_seen": 14629624, + "step": 20390 + }, + { + "epoch": 42.4012474012474, + "grad_norm": 0.2988993525505066, + "learning_rate": 2.422650626905401e-05, + "loss": 0.1429, + "num_input_tokens_seen": 14633208, + "step": 20395 + }, + { + "epoch": 42.41164241164241, + "grad_norm": 0.26568564772605896, + "learning_rate": 2.4216693552008785e-05, + "loss": 0.0844, + "num_input_tokens_seen": 14636824, + "step": 20400 + }, + { + "epoch": 42.41164241164241, + "eval_loss": 0.14238493144512177, + "eval_runtime": 7.7456, + "eval_samples_per_second": 110.514, + "eval_steps_per_second": 27.628, + "num_input_tokens_seen": 14636824, + "step": 20400 + }, + { + "epoch": 42.42203742203742, + "grad_norm": 0.20907644927501678, + "learning_rate": 2.4206880955759247e-05, + "loss": 0.0977, + "num_input_tokens_seen": 14640408, + "step": 20405 + }, + { + "epoch": 42.432432432432435, + "grad_norm": 0.36180558800697327, + "learning_rate": 2.419706848181863e-05, + "loss": 0.0856, + "num_input_tokens_seen": 14643960, + "step": 20410 + }, + { + "epoch": 42.44282744282744, + "grad_norm": 0.454194575548172, + "learning_rate": 2.4187256131700153e-05, + "loss": 0.096, + "num_input_tokens_seen": 14647672, + "step": 20415 + }, + { + "epoch": 42.45322245322245, + "grad_norm": 0.3683416545391083, + "learning_rate": 2.4177443906916985e-05, + "loss": 0.1125, + "num_input_tokens_seen": 14651224, + "step": 20420 + }, + { + "epoch": 42.46361746361746, + "grad_norm": 0.31437984108924866, + "learning_rate": 2.4167631808982303e-05, + "loss": 0.108, + "num_input_tokens_seen": 14654904, + "step": 20425 + }, + { + "epoch": 42.474012474012476, + "grad_norm": 0.8338416814804077, + "learning_rate": 2.4157819839409264e-05, + "loss": 0.1267, + "num_input_tokens_seen": 14658616, + "step": 20430 + }, + { + "epoch": 42.484407484407484, + "grad_norm": 0.5639057159423828, + "learning_rate": 2.414800799971098e-05, + "loss": 0.1245, + "num_input_tokens_seen": 14662168, + "step": 20435 + }, + { + "epoch": 42.49480249480249, + "grad_norm": 0.44891971349716187, + "learning_rate": 2.4138196291400582e-05, + "loss": 0.1308, + "num_input_tokens_seen": 14665752, + "step": 20440 + }, + { + "epoch": 42.50519750519751, + "grad_norm": 0.1643853783607483, + "learning_rate": 2.412838471599114e-05, + "loss": 0.0795, + "num_input_tokens_seen": 14669336, + "step": 20445 + }, + { + "epoch": 42.515592515592516, + "grad_norm": 0.38678857684135437, + "learning_rate": 2.411857327499572e-05, + "loss": 0.1746, + "num_input_tokens_seen": 14673016, + "step": 20450 + }, + { + "epoch": 42.525987525987524, + "grad_norm": 0.6718041300773621, + "learning_rate": 2.410876196992739e-05, + "loss": 0.1113, + "num_input_tokens_seen": 14676600, + "step": 20455 + }, + { + "epoch": 42.53638253638254, + "grad_norm": 1.201128602027893, + "learning_rate": 2.4098950802299156e-05, + "loss": 0.1263, + "num_input_tokens_seen": 14680152, + "step": 20460 + }, + { + "epoch": 42.54677754677755, + "grad_norm": 0.1779109388589859, + "learning_rate": 2.4089139773624027e-05, + "loss": 0.1279, + "num_input_tokens_seen": 14683736, + "step": 20465 + }, + { + "epoch": 42.55717255717256, + "grad_norm": 0.5510916113853455, + "learning_rate": 2.4079328885415007e-05, + "loss": 0.0975, + "num_input_tokens_seen": 14687224, + "step": 20470 + }, + { + "epoch": 42.567567567567565, + "grad_norm": 0.2793324291706085, + "learning_rate": 2.4069518139185036e-05, + "loss": 0.0952, + "num_input_tokens_seen": 14690840, + "step": 20475 + }, + { + "epoch": 42.57796257796258, + "grad_norm": 0.21373853087425232, + "learning_rate": 2.405970753644706e-05, + "loss": 0.087, + "num_input_tokens_seen": 14694360, + "step": 20480 + }, + { + "epoch": 42.58835758835759, + "grad_norm": 0.7522479295730591, + "learning_rate": 2.4049897078714e-05, + "loss": 0.1262, + "num_input_tokens_seen": 14697976, + "step": 20485 + }, + { + "epoch": 42.5987525987526, + "grad_norm": 0.21600918471813202, + "learning_rate": 2.404008676749874e-05, + "loss": 0.1155, + "num_input_tokens_seen": 14701624, + "step": 20490 + }, + { + "epoch": 42.60914760914761, + "grad_norm": 0.22104518115520477, + "learning_rate": 2.403027660431418e-05, + "loss": 0.0823, + "num_input_tokens_seen": 14705080, + "step": 20495 + }, + { + "epoch": 42.61954261954262, + "grad_norm": 0.28825080394744873, + "learning_rate": 2.402046659067314e-05, + "loss": 0.0607, + "num_input_tokens_seen": 14708792, + "step": 20500 + }, + { + "epoch": 42.62993762993763, + "grad_norm": 0.2549653947353363, + "learning_rate": 2.401065672808847e-05, + "loss": 0.1795, + "num_input_tokens_seen": 14712472, + "step": 20505 + }, + { + "epoch": 42.64033264033264, + "grad_norm": 0.2999918460845947, + "learning_rate": 2.400084701807296e-05, + "loss": 0.1133, + "num_input_tokens_seen": 14716120, + "step": 20510 + }, + { + "epoch": 42.65072765072765, + "grad_norm": 0.37285536527633667, + "learning_rate": 2.39910374621394e-05, + "loss": 0.1196, + "num_input_tokens_seen": 14719512, + "step": 20515 + }, + { + "epoch": 42.66112266112266, + "grad_norm": 0.4497453272342682, + "learning_rate": 2.3981228061800544e-05, + "loss": 0.0832, + "num_input_tokens_seen": 14723032, + "step": 20520 + }, + { + "epoch": 42.67151767151767, + "grad_norm": 0.5943066477775574, + "learning_rate": 2.3971418818569115e-05, + "loss": 0.0988, + "num_input_tokens_seen": 14726712, + "step": 20525 + }, + { + "epoch": 42.681912681912685, + "grad_norm": 0.8048247694969177, + "learning_rate": 2.3961609733957832e-05, + "loss": 0.1039, + "num_input_tokens_seen": 14730296, + "step": 20530 + }, + { + "epoch": 42.69230769230769, + "grad_norm": 0.29046085476875305, + "learning_rate": 2.395180080947939e-05, + "loss": 0.1036, + "num_input_tokens_seen": 14733816, + "step": 20535 + }, + { + "epoch": 42.7027027027027, + "grad_norm": 0.4337664544582367, + "learning_rate": 2.394199204664642e-05, + "loss": 0.0926, + "num_input_tokens_seen": 14737432, + "step": 20540 + }, + { + "epoch": 42.71309771309771, + "grad_norm": 0.18926696479320526, + "learning_rate": 2.3932183446971583e-05, + "loss": 0.1345, + "num_input_tokens_seen": 14740984, + "step": 20545 + }, + { + "epoch": 42.723492723492726, + "grad_norm": 0.4473419189453125, + "learning_rate": 2.3922375011967473e-05, + "loss": 0.1149, + "num_input_tokens_seen": 14744504, + "step": 20550 + }, + { + "epoch": 42.733887733887734, + "grad_norm": 0.9803493022918701, + "learning_rate": 2.3912566743146676e-05, + "loss": 0.1215, + "num_input_tokens_seen": 14747992, + "step": 20555 + }, + { + "epoch": 42.74428274428274, + "grad_norm": 0.47590142488479614, + "learning_rate": 2.390275864202176e-05, + "loss": 0.0961, + "num_input_tokens_seen": 14751672, + "step": 20560 + }, + { + "epoch": 42.75467775467776, + "grad_norm": 0.2169932723045349, + "learning_rate": 2.3892950710105243e-05, + "loss": 0.0866, + "num_input_tokens_seen": 14755416, + "step": 20565 + }, + { + "epoch": 42.765072765072766, + "grad_norm": 0.17054630815982819, + "learning_rate": 2.3883142948909635e-05, + "loss": 0.0895, + "num_input_tokens_seen": 14758808, + "step": 20570 + }, + { + "epoch": 42.775467775467774, + "grad_norm": 0.40838295221328735, + "learning_rate": 2.3873335359947433e-05, + "loss": 0.1133, + "num_input_tokens_seen": 14762168, + "step": 20575 + }, + { + "epoch": 42.78586278586278, + "grad_norm": 0.2055441290140152, + "learning_rate": 2.3863527944731066e-05, + "loss": 0.0936, + "num_input_tokens_seen": 14765752, + "step": 20580 + }, + { + "epoch": 42.7962577962578, + "grad_norm": 0.40772855281829834, + "learning_rate": 2.385372070477298e-05, + "loss": 0.147, + "num_input_tokens_seen": 14769272, + "step": 20585 + }, + { + "epoch": 42.80665280665281, + "grad_norm": 0.2710825800895691, + "learning_rate": 2.384391364158556e-05, + "loss": 0.0678, + "num_input_tokens_seen": 14772792, + "step": 20590 + }, + { + "epoch": 42.817047817047815, + "grad_norm": 0.31841111183166504, + "learning_rate": 2.3834106756681185e-05, + "loss": 0.1187, + "num_input_tokens_seen": 14776536, + "step": 20595 + }, + { + "epoch": 42.82744282744283, + "grad_norm": 0.29197511076927185, + "learning_rate": 2.3824300051572206e-05, + "loss": 0.094, + "num_input_tokens_seen": 14780056, + "step": 20600 + }, + { + "epoch": 42.82744282744283, + "eval_loss": 0.14453601837158203, + "eval_runtime": 7.7433, + "eval_samples_per_second": 110.547, + "eval_steps_per_second": 27.637, + "num_input_tokens_seen": 14780056, + "step": 20600 + }, + { + "epoch": 42.83783783783784, + "grad_norm": 0.7036839723587036, + "learning_rate": 2.3814493527770923e-05, + "loss": 0.1534, + "num_input_tokens_seen": 14783608, + "step": 20605 + }, + { + "epoch": 42.84823284823285, + "grad_norm": 0.38559702038764954, + "learning_rate": 2.3804687186789637e-05, + "loss": 0.068, + "num_input_tokens_seen": 14787288, + "step": 20610 + }, + { + "epoch": 42.858627858627855, + "grad_norm": 0.25413525104522705, + "learning_rate": 2.379488103014062e-05, + "loss": 0.1119, + "num_input_tokens_seen": 14790904, + "step": 20615 + }, + { + "epoch": 42.86902286902287, + "grad_norm": 0.37757235765457153, + "learning_rate": 2.3785075059336086e-05, + "loss": 0.086, + "num_input_tokens_seen": 14794520, + "step": 20620 + }, + { + "epoch": 42.87941787941788, + "grad_norm": 0.27874618768692017, + "learning_rate": 2.3775269275888248e-05, + "loss": 0.1115, + "num_input_tokens_seen": 14798136, + "step": 20625 + }, + { + "epoch": 42.88981288981289, + "grad_norm": 0.34888771176338196, + "learning_rate": 2.3765463681309274e-05, + "loss": 0.1311, + "num_input_tokens_seen": 14801720, + "step": 20630 + }, + { + "epoch": 42.9002079002079, + "grad_norm": 0.38308823108673096, + "learning_rate": 2.3755658277111313e-05, + "loss": 0.1501, + "num_input_tokens_seen": 14805400, + "step": 20635 + }, + { + "epoch": 42.91060291060291, + "grad_norm": 0.34878304600715637, + "learning_rate": 2.374585306480649e-05, + "loss": 0.0913, + "num_input_tokens_seen": 14809048, + "step": 20640 + }, + { + "epoch": 42.92099792099792, + "grad_norm": 0.357803612947464, + "learning_rate": 2.3736048045906877e-05, + "loss": 0.1184, + "num_input_tokens_seen": 14812600, + "step": 20645 + }, + { + "epoch": 42.931392931392935, + "grad_norm": 0.283480167388916, + "learning_rate": 2.372624322192454e-05, + "loss": 0.0751, + "num_input_tokens_seen": 14816024, + "step": 20650 + }, + { + "epoch": 42.94178794178794, + "grad_norm": 0.6611958742141724, + "learning_rate": 2.3716438594371516e-05, + "loss": 0.1239, + "num_input_tokens_seen": 14819544, + "step": 20655 + }, + { + "epoch": 42.95218295218295, + "grad_norm": 0.4963371157646179, + "learning_rate": 2.3706634164759784e-05, + "loss": 0.0983, + "num_input_tokens_seen": 14823096, + "step": 20660 + }, + { + "epoch": 42.96257796257796, + "grad_norm": 0.2584529519081116, + "learning_rate": 2.3696829934601323e-05, + "loss": 0.1131, + "num_input_tokens_seen": 14826680, + "step": 20665 + }, + { + "epoch": 42.972972972972975, + "grad_norm": 0.303194522857666, + "learning_rate": 2.3687025905408053e-05, + "loss": 0.1039, + "num_input_tokens_seen": 14830296, + "step": 20670 + }, + { + "epoch": 42.983367983367984, + "grad_norm": 0.1953074038028717, + "learning_rate": 2.3677222078691886e-05, + "loss": 0.078, + "num_input_tokens_seen": 14833816, + "step": 20675 + }, + { + "epoch": 42.99376299376299, + "grad_norm": 0.22282034158706665, + "learning_rate": 2.366741845596471e-05, + "loss": 0.0973, + "num_input_tokens_seen": 14837304, + "step": 20680 + }, + { + "epoch": 43.00415800415801, + "grad_norm": 0.2954413890838623, + "learning_rate": 2.3657615038738343e-05, + "loss": 0.0872, + "num_input_tokens_seen": 14840744, + "step": 20685 + }, + { + "epoch": 43.014553014553016, + "grad_norm": 0.21497638523578644, + "learning_rate": 2.3647811828524614e-05, + "loss": 0.111, + "num_input_tokens_seen": 14844296, + "step": 20690 + }, + { + "epoch": 43.024948024948024, + "grad_norm": 0.5654615163803101, + "learning_rate": 2.363800882683529e-05, + "loss": 0.0795, + "num_input_tokens_seen": 14847816, + "step": 20695 + }, + { + "epoch": 43.03534303534303, + "grad_norm": 0.21136274933815002, + "learning_rate": 2.3628206035182125e-05, + "loss": 0.1298, + "num_input_tokens_seen": 14851496, + "step": 20700 + }, + { + "epoch": 43.04573804573805, + "grad_norm": 0.33222803473472595, + "learning_rate": 2.361840345507683e-05, + "loss": 0.1039, + "num_input_tokens_seen": 14855016, + "step": 20705 + }, + { + "epoch": 43.056133056133056, + "grad_norm": 0.7219361066818237, + "learning_rate": 2.3608601088031073e-05, + "loss": 0.1193, + "num_input_tokens_seen": 14858632, + "step": 20710 + }, + { + "epoch": 43.066528066528065, + "grad_norm": 0.6034374237060547, + "learning_rate": 2.3598798935556516e-05, + "loss": 0.0999, + "num_input_tokens_seen": 14862248, + "step": 20715 + }, + { + "epoch": 43.07692307692308, + "grad_norm": 0.22876209020614624, + "learning_rate": 2.3588996999164784e-05, + "loss": 0.0942, + "num_input_tokens_seen": 14865864, + "step": 20720 + }, + { + "epoch": 43.08731808731809, + "grad_norm": 0.6025635004043579, + "learning_rate": 2.3579195280367434e-05, + "loss": 0.0955, + "num_input_tokens_seen": 14869320, + "step": 20725 + }, + { + "epoch": 43.0977130977131, + "grad_norm": 0.38507771492004395, + "learning_rate": 2.356939378067603e-05, + "loss": 0.0779, + "num_input_tokens_seen": 14872808, + "step": 20730 + }, + { + "epoch": 43.108108108108105, + "grad_norm": 0.2673186659812927, + "learning_rate": 2.3559592501602092e-05, + "loss": 0.1008, + "num_input_tokens_seen": 14876456, + "step": 20735 + }, + { + "epoch": 43.11850311850312, + "grad_norm": 0.46434286236763, + "learning_rate": 2.3549791444657076e-05, + "loss": 0.1014, + "num_input_tokens_seen": 14880008, + "step": 20740 + }, + { + "epoch": 43.12889812889813, + "grad_norm": 0.42303624749183655, + "learning_rate": 2.353999061135246e-05, + "loss": 0.1058, + "num_input_tokens_seen": 14883560, + "step": 20745 + }, + { + "epoch": 43.13929313929314, + "grad_norm": 0.21352027356624603, + "learning_rate": 2.3530190003199626e-05, + "loss": 0.1094, + "num_input_tokens_seen": 14887176, + "step": 20750 + }, + { + "epoch": 43.14968814968815, + "grad_norm": 0.5585854053497314, + "learning_rate": 2.3520389621709965e-05, + "loss": 0.0778, + "num_input_tokens_seen": 14890696, + "step": 20755 + }, + { + "epoch": 43.16008316008316, + "grad_norm": 0.2958071231842041, + "learning_rate": 2.351058946839483e-05, + "loss": 0.11, + "num_input_tokens_seen": 14894280, + "step": 20760 + }, + { + "epoch": 43.17047817047817, + "grad_norm": 0.4372217655181885, + "learning_rate": 2.350078954476551e-05, + "loss": 0.1321, + "num_input_tokens_seen": 14897736, + "step": 20765 + }, + { + "epoch": 43.18087318087318, + "grad_norm": 0.2783046364784241, + "learning_rate": 2.3490989852333272e-05, + "loss": 0.0939, + "num_input_tokens_seen": 14901320, + "step": 20770 + }, + { + "epoch": 43.19126819126819, + "grad_norm": 0.36543047428131104, + "learning_rate": 2.3481190392609377e-05, + "loss": 0.1112, + "num_input_tokens_seen": 14904872, + "step": 20775 + }, + { + "epoch": 43.2016632016632, + "grad_norm": 0.23342975974082947, + "learning_rate": 2.3471391167105e-05, + "loss": 0.0977, + "num_input_tokens_seen": 14908360, + "step": 20780 + }, + { + "epoch": 43.21205821205821, + "grad_norm": 0.48830631375312805, + "learning_rate": 2.3461592177331325e-05, + "loss": 0.1034, + "num_input_tokens_seen": 14912072, + "step": 20785 + }, + { + "epoch": 43.222453222453225, + "grad_norm": 0.7551466226577759, + "learning_rate": 2.345179342479946e-05, + "loss": 0.1459, + "num_input_tokens_seen": 14915688, + "step": 20790 + }, + { + "epoch": 43.232848232848234, + "grad_norm": 0.6859937310218811, + "learning_rate": 2.3441994911020503e-05, + "loss": 0.0957, + "num_input_tokens_seen": 14919240, + "step": 20795 + }, + { + "epoch": 43.24324324324324, + "grad_norm": 0.5200982093811035, + "learning_rate": 2.3432196637505522e-05, + "loss": 0.0911, + "num_input_tokens_seen": 14922952, + "step": 20800 + }, + { + "epoch": 43.24324324324324, + "eval_loss": 0.14698730409145355, + "eval_runtime": 7.7484, + "eval_samples_per_second": 110.475, + "eval_steps_per_second": 27.619, + "num_input_tokens_seen": 14922952, + "step": 20800 + }, + { + "epoch": 43.25363825363825, + "grad_norm": 0.9065433740615845, + "learning_rate": 2.3422398605765515e-05, + "loss": 0.1221, + "num_input_tokens_seen": 14926568, + "step": 20805 + }, + { + "epoch": 43.264033264033266, + "grad_norm": 0.2566598355770111, + "learning_rate": 2.3412600817311462e-05, + "loss": 0.1344, + "num_input_tokens_seen": 14930120, + "step": 20810 + }, + { + "epoch": 43.274428274428274, + "grad_norm": 0.4835755228996277, + "learning_rate": 2.3402803273654326e-05, + "loss": 0.112, + "num_input_tokens_seen": 14933736, + "step": 20815 + }, + { + "epoch": 43.28482328482328, + "grad_norm": 0.37506982684135437, + "learning_rate": 2.3393005976304983e-05, + "loss": 0.104, + "num_input_tokens_seen": 14937480, + "step": 20820 + }, + { + "epoch": 43.2952182952183, + "grad_norm": 0.6328285336494446, + "learning_rate": 2.338320892677432e-05, + "loss": 0.0904, + "num_input_tokens_seen": 14941064, + "step": 20825 + }, + { + "epoch": 43.305613305613306, + "grad_norm": 0.11774607002735138, + "learning_rate": 2.3373412126573155e-05, + "loss": 0.0922, + "num_input_tokens_seen": 14944552, + "step": 20830 + }, + { + "epoch": 43.316008316008315, + "grad_norm": 0.28549015522003174, + "learning_rate": 2.3363615577212285e-05, + "loss": 0.098, + "num_input_tokens_seen": 14948232, + "step": 20835 + }, + { + "epoch": 43.32640332640332, + "grad_norm": 0.5451861023902893, + "learning_rate": 2.3353819280202455e-05, + "loss": 0.0598, + "num_input_tokens_seen": 14951656, + "step": 20840 + }, + { + "epoch": 43.33679833679834, + "grad_norm": 0.2317715734243393, + "learning_rate": 2.334402323705438e-05, + "loss": 0.1069, + "num_input_tokens_seen": 14955144, + "step": 20845 + }, + { + "epoch": 43.34719334719335, + "grad_norm": 0.34678059816360474, + "learning_rate": 2.3334227449278725e-05, + "loss": 0.1195, + "num_input_tokens_seen": 14958664, + "step": 20850 + }, + { + "epoch": 43.357588357588355, + "grad_norm": 0.31263652443885803, + "learning_rate": 2.3324431918386143e-05, + "loss": 0.1382, + "num_input_tokens_seen": 14962248, + "step": 20855 + }, + { + "epoch": 43.36798336798337, + "grad_norm": 0.3413008153438568, + "learning_rate": 2.3314636645887207e-05, + "loss": 0.1554, + "num_input_tokens_seen": 14966024, + "step": 20860 + }, + { + "epoch": 43.37837837837838, + "grad_norm": 0.6248111128807068, + "learning_rate": 2.3304841633292487e-05, + "loss": 0.0886, + "num_input_tokens_seen": 14969672, + "step": 20865 + }, + { + "epoch": 43.38877338877339, + "grad_norm": 0.8884491324424744, + "learning_rate": 2.329504688211248e-05, + "loss": 0.0753, + "num_input_tokens_seen": 14973160, + "step": 20870 + }, + { + "epoch": 43.3991683991684, + "grad_norm": 0.8133902549743652, + "learning_rate": 2.3285252393857677e-05, + "loss": 0.0955, + "num_input_tokens_seen": 14976776, + "step": 20875 + }, + { + "epoch": 43.40956340956341, + "grad_norm": 0.4287045896053314, + "learning_rate": 2.327545817003851e-05, + "loss": 0.1006, + "num_input_tokens_seen": 14980328, + "step": 20880 + }, + { + "epoch": 43.41995841995842, + "grad_norm": 0.46220898628234863, + "learning_rate": 2.326566421216535e-05, + "loss": 0.1365, + "num_input_tokens_seen": 14983912, + "step": 20885 + }, + { + "epoch": 43.43035343035343, + "grad_norm": 0.11444266885519028, + "learning_rate": 2.3255870521748565e-05, + "loss": 0.0961, + "num_input_tokens_seen": 14987368, + "step": 20890 + }, + { + "epoch": 43.44074844074844, + "grad_norm": 0.48282110691070557, + "learning_rate": 2.3246077100298474e-05, + "loss": 0.1266, + "num_input_tokens_seen": 14990856, + "step": 20895 + }, + { + "epoch": 43.45114345114345, + "grad_norm": 0.41112852096557617, + "learning_rate": 2.3236283949325328e-05, + "loss": 0.1051, + "num_input_tokens_seen": 14994344, + "step": 20900 + }, + { + "epoch": 43.46153846153846, + "grad_norm": 0.3909340500831604, + "learning_rate": 2.3226491070339368e-05, + "loss": 0.1324, + "num_input_tokens_seen": 14997832, + "step": 20905 + }, + { + "epoch": 43.471933471933475, + "grad_norm": 0.3684174120426178, + "learning_rate": 2.3216698464850762e-05, + "loss": 0.1178, + "num_input_tokens_seen": 15001224, + "step": 20910 + }, + { + "epoch": 43.482328482328484, + "grad_norm": 0.20651273429393768, + "learning_rate": 2.320690613436967e-05, + "loss": 0.1013, + "num_input_tokens_seen": 15004808, + "step": 20915 + }, + { + "epoch": 43.49272349272349, + "grad_norm": 0.4083767533302307, + "learning_rate": 2.3197114080406192e-05, + "loss": 0.1114, + "num_input_tokens_seen": 15008360, + "step": 20920 + }, + { + "epoch": 43.5031185031185, + "grad_norm": 0.22275158762931824, + "learning_rate": 2.3187322304470365e-05, + "loss": 0.0874, + "num_input_tokens_seen": 15011912, + "step": 20925 + }, + { + "epoch": 43.513513513513516, + "grad_norm": 0.361848384141922, + "learning_rate": 2.3177530808072222e-05, + "loss": 0.0888, + "num_input_tokens_seen": 15015432, + "step": 20930 + }, + { + "epoch": 43.523908523908524, + "grad_norm": 0.6179969906806946, + "learning_rate": 2.316773959272174e-05, + "loss": 0.1198, + "num_input_tokens_seen": 15019176, + "step": 20935 + }, + { + "epoch": 43.53430353430353, + "grad_norm": 0.23831211030483246, + "learning_rate": 2.3157948659928823e-05, + "loss": 0.0835, + "num_input_tokens_seen": 15022792, + "step": 20940 + }, + { + "epoch": 43.54469854469855, + "grad_norm": 0.63591068983078, + "learning_rate": 2.3148158011203388e-05, + "loss": 0.0949, + "num_input_tokens_seen": 15026632, + "step": 20945 + }, + { + "epoch": 43.555093555093556, + "grad_norm": 0.5971193313598633, + "learning_rate": 2.3138367648055253e-05, + "loss": 0.1204, + "num_input_tokens_seen": 15030184, + "step": 20950 + }, + { + "epoch": 43.565488565488565, + "grad_norm": 0.25127682089805603, + "learning_rate": 2.312857757199422e-05, + "loss": 0.069, + "num_input_tokens_seen": 15033832, + "step": 20955 + }, + { + "epoch": 43.57588357588357, + "grad_norm": 0.3380781412124634, + "learning_rate": 2.3118787784530048e-05, + "loss": 0.1138, + "num_input_tokens_seen": 15037448, + "step": 20960 + }, + { + "epoch": 43.58627858627859, + "grad_norm": 0.1540568321943283, + "learning_rate": 2.310899828717243e-05, + "loss": 0.0969, + "num_input_tokens_seen": 15041000, + "step": 20965 + }, + { + "epoch": 43.5966735966736, + "grad_norm": 0.18210415542125702, + "learning_rate": 2.309920908143104e-05, + "loss": 0.1185, + "num_input_tokens_seen": 15044648, + "step": 20970 + }, + { + "epoch": 43.607068607068605, + "grad_norm": 0.12792453169822693, + "learning_rate": 2.308942016881551e-05, + "loss": 0.0573, + "num_input_tokens_seen": 15048072, + "step": 20975 + }, + { + "epoch": 43.61746361746362, + "grad_norm": 0.5226095914840698, + "learning_rate": 2.307963155083539e-05, + "loss": 0.1046, + "num_input_tokens_seen": 15051688, + "step": 20980 + }, + { + "epoch": 43.62785862785863, + "grad_norm": 0.1556711494922638, + "learning_rate": 2.306984322900022e-05, + "loss": 0.0846, + "num_input_tokens_seen": 15055368, + "step": 20985 + }, + { + "epoch": 43.63825363825364, + "grad_norm": 0.2747315764427185, + "learning_rate": 2.3060055204819482e-05, + "loss": 0.1135, + "num_input_tokens_seen": 15059048, + "step": 20990 + }, + { + "epoch": 43.648648648648646, + "grad_norm": 0.5666902661323547, + "learning_rate": 2.3050267479802604e-05, + "loss": 0.0855, + "num_input_tokens_seen": 15062600, + "step": 20995 + }, + { + "epoch": 43.65904365904366, + "grad_norm": 0.18228112161159515, + "learning_rate": 2.304048005545899e-05, + "loss": 0.1289, + "num_input_tokens_seen": 15066120, + "step": 21000 + }, + { + "epoch": 43.65904365904366, + "eval_loss": 0.14687246084213257, + "eval_runtime": 7.757, + "eval_samples_per_second": 110.352, + "eval_steps_per_second": 27.588, + "num_input_tokens_seen": 15066120, + "step": 21000 + }, + { + "epoch": 43.66943866943867, + "grad_norm": 0.4026024341583252, + "learning_rate": 2.3030692933297972e-05, + "loss": 0.112, + "num_input_tokens_seen": 15069576, + "step": 21005 + }, + { + "epoch": 43.67983367983368, + "grad_norm": 0.8957807421684265, + "learning_rate": 2.3020906114828843e-05, + "loss": 0.0865, + "num_input_tokens_seen": 15073192, + "step": 21010 + }, + { + "epoch": 43.69022869022869, + "grad_norm": 0.25900998711586, + "learning_rate": 2.301111960156088e-05, + "loss": 0.1117, + "num_input_tokens_seen": 15076904, + "step": 21015 + }, + { + "epoch": 43.7006237006237, + "grad_norm": 0.1534496247768402, + "learning_rate": 2.300133339500326e-05, + "loss": 0.1092, + "num_input_tokens_seen": 15080456, + "step": 21020 + }, + { + "epoch": 43.71101871101871, + "grad_norm": 0.23831987380981445, + "learning_rate": 2.2991547496665148e-05, + "loss": 0.094, + "num_input_tokens_seen": 15084104, + "step": 21025 + }, + { + "epoch": 43.72141372141372, + "grad_norm": 0.36557310819625854, + "learning_rate": 2.298176190805565e-05, + "loss": 0.141, + "num_input_tokens_seen": 15087592, + "step": 21030 + }, + { + "epoch": 43.731808731808734, + "grad_norm": 0.6931670308113098, + "learning_rate": 2.2971976630683826e-05, + "loss": 0.1127, + "num_input_tokens_seen": 15091272, + "step": 21035 + }, + { + "epoch": 43.74220374220374, + "grad_norm": 0.1931036412715912, + "learning_rate": 2.29621916660587e-05, + "loss": 0.1158, + "num_input_tokens_seen": 15094760, + "step": 21040 + }, + { + "epoch": 43.75259875259875, + "grad_norm": 0.5772156119346619, + "learning_rate": 2.295240701568922e-05, + "loss": 0.064, + "num_input_tokens_seen": 15098312, + "step": 21045 + }, + { + "epoch": 43.762993762993766, + "grad_norm": 0.28117796778678894, + "learning_rate": 2.2942622681084312e-05, + "loss": 0.1313, + "num_input_tokens_seen": 15101800, + "step": 21050 + }, + { + "epoch": 43.773388773388774, + "grad_norm": 0.6003273129463196, + "learning_rate": 2.293283866375284e-05, + "loss": 0.1103, + "num_input_tokens_seen": 15105352, + "step": 21055 + }, + { + "epoch": 43.78378378378378, + "grad_norm": 0.2401362657546997, + "learning_rate": 2.2923054965203627e-05, + "loss": 0.1159, + "num_input_tokens_seen": 15108808, + "step": 21060 + }, + { + "epoch": 43.79417879417879, + "grad_norm": 0.712719202041626, + "learning_rate": 2.2913271586945443e-05, + "loss": 0.113, + "num_input_tokens_seen": 15112296, + "step": 21065 + }, + { + "epoch": 43.804573804573806, + "grad_norm": 0.41449570655822754, + "learning_rate": 2.290348853048699e-05, + "loss": 0.0991, + "num_input_tokens_seen": 15115752, + "step": 21070 + }, + { + "epoch": 43.814968814968815, + "grad_norm": 0.25554507970809937, + "learning_rate": 2.2893705797336956e-05, + "loss": 0.1343, + "num_input_tokens_seen": 15119272, + "step": 21075 + }, + { + "epoch": 43.82536382536382, + "grad_norm": 0.4453192949295044, + "learning_rate": 2.288392338900397e-05, + "loss": 0.1483, + "num_input_tokens_seen": 15123048, + "step": 21080 + }, + { + "epoch": 43.83575883575884, + "grad_norm": 0.19687651097774506, + "learning_rate": 2.2874141306996576e-05, + "loss": 0.0934, + "num_input_tokens_seen": 15126760, + "step": 21085 + }, + { + "epoch": 43.84615384615385, + "grad_norm": 0.7939815521240234, + "learning_rate": 2.2864359552823312e-05, + "loss": 0.1022, + "num_input_tokens_seen": 15130440, + "step": 21090 + }, + { + "epoch": 43.856548856548855, + "grad_norm": 0.7244030833244324, + "learning_rate": 2.2854578127992648e-05, + "loss": 0.1072, + "num_input_tokens_seen": 15133992, + "step": 21095 + }, + { + "epoch": 43.86694386694387, + "grad_norm": 0.33808863162994385, + "learning_rate": 2.2844797034012988e-05, + "loss": 0.0853, + "num_input_tokens_seen": 15137544, + "step": 21100 + }, + { + "epoch": 43.87733887733888, + "grad_norm": 0.2505565881729126, + "learning_rate": 2.2835016272392722e-05, + "loss": 0.1387, + "num_input_tokens_seen": 15141352, + "step": 21105 + }, + { + "epoch": 43.88773388773389, + "grad_norm": 0.19562825560569763, + "learning_rate": 2.2825235844640142e-05, + "loss": 0.0978, + "num_input_tokens_seen": 15144968, + "step": 21110 + }, + { + "epoch": 43.898128898128896, + "grad_norm": 0.3252008259296417, + "learning_rate": 2.2815455752263522e-05, + "loss": 0.0975, + "num_input_tokens_seen": 15148680, + "step": 21115 + }, + { + "epoch": 43.90852390852391, + "grad_norm": 0.1901642382144928, + "learning_rate": 2.2805675996771092e-05, + "loss": 0.0746, + "num_input_tokens_seen": 15152136, + "step": 21120 + }, + { + "epoch": 43.91891891891892, + "grad_norm": 0.45024123787879944, + "learning_rate": 2.2795896579670987e-05, + "loss": 0.0867, + "num_input_tokens_seen": 15155912, + "step": 21125 + }, + { + "epoch": 43.92931392931393, + "grad_norm": 0.35201898217201233, + "learning_rate": 2.2786117502471337e-05, + "loss": 0.0727, + "num_input_tokens_seen": 15159720, + "step": 21130 + }, + { + "epoch": 43.93970893970894, + "grad_norm": 0.389907568693161, + "learning_rate": 2.2776338766680185e-05, + "loss": 0.1049, + "num_input_tokens_seen": 15163432, + "step": 21135 + }, + { + "epoch": 43.95010395010395, + "grad_norm": 0.41656097769737244, + "learning_rate": 2.2766560373805533e-05, + "loss": 0.1028, + "num_input_tokens_seen": 15167016, + "step": 21140 + }, + { + "epoch": 43.96049896049896, + "grad_norm": 0.2185751050710678, + "learning_rate": 2.2756782325355353e-05, + "loss": 0.0987, + "num_input_tokens_seen": 15170632, + "step": 21145 + }, + { + "epoch": 43.97089397089397, + "grad_norm": 0.29687419533729553, + "learning_rate": 2.2747004622837514e-05, + "loss": 0.1158, + "num_input_tokens_seen": 15174248, + "step": 21150 + }, + { + "epoch": 43.981288981288984, + "grad_norm": 0.28411152958869934, + "learning_rate": 2.2737227267759878e-05, + "loss": 0.1185, + "num_input_tokens_seen": 15177928, + "step": 21155 + }, + { + "epoch": 43.99168399168399, + "grad_norm": 0.4970964789390564, + "learning_rate": 2.272745026163024e-05, + "loss": 0.1008, + "num_input_tokens_seen": 15181480, + "step": 21160 + }, + { + "epoch": 44.002079002079, + "grad_norm": 0.4499903619289398, + "learning_rate": 2.271767360595633e-05, + "loss": 0.1148, + "num_input_tokens_seen": 15184928, + "step": 21165 + }, + { + "epoch": 44.012474012474016, + "grad_norm": 0.6879810094833374, + "learning_rate": 2.270789730224583e-05, + "loss": 0.1141, + "num_input_tokens_seen": 15188416, + "step": 21170 + }, + { + "epoch": 44.022869022869024, + "grad_norm": 0.46370452642440796, + "learning_rate": 2.2698121352006367e-05, + "loss": 0.1049, + "num_input_tokens_seen": 15191968, + "step": 21175 + }, + { + "epoch": 44.03326403326403, + "grad_norm": 0.24539989233016968, + "learning_rate": 2.2688345756745517e-05, + "loss": 0.0793, + "num_input_tokens_seen": 15195584, + "step": 21180 + }, + { + "epoch": 44.04365904365904, + "grad_norm": 0.5069547891616821, + "learning_rate": 2.267857051797081e-05, + "loss": 0.1139, + "num_input_tokens_seen": 15199072, + "step": 21185 + }, + { + "epoch": 44.054054054054056, + "grad_norm": 0.24444366991519928, + "learning_rate": 2.2668795637189695e-05, + "loss": 0.0678, + "num_input_tokens_seen": 15202496, + "step": 21190 + }, + { + "epoch": 44.064449064449065, + "grad_norm": 0.3427337110042572, + "learning_rate": 2.2659021115909586e-05, + "loss": 0.0659, + "num_input_tokens_seen": 15206016, + "step": 21195 + }, + { + "epoch": 44.07484407484407, + "grad_norm": 0.6840268969535828, + "learning_rate": 2.2649246955637847e-05, + "loss": 0.1489, + "num_input_tokens_seen": 15209536, + "step": 21200 + }, + { + "epoch": 44.07484407484407, + "eval_loss": 0.14363740384578705, + "eval_runtime": 7.7543, + "eval_samples_per_second": 110.39, + "eval_steps_per_second": 27.598, + "num_input_tokens_seen": 15209536, + "step": 21200 + }, + { + "epoch": 44.08523908523909, + "grad_norm": 0.5429663062095642, + "learning_rate": 2.2639473157881766e-05, + "loss": 0.1803, + "num_input_tokens_seen": 15213216, + "step": 21205 + }, + { + "epoch": 44.0956340956341, + "grad_norm": 0.3996959626674652, + "learning_rate": 2.2629699724148594e-05, + "loss": 0.1031, + "num_input_tokens_seen": 15217120, + "step": 21210 + }, + { + "epoch": 44.106029106029105, + "grad_norm": 0.35653960704803467, + "learning_rate": 2.26199266559455e-05, + "loss": 0.0941, + "num_input_tokens_seen": 15220704, + "step": 21215 + }, + { + "epoch": 44.11642411642411, + "grad_norm": 0.3067396581172943, + "learning_rate": 2.2610153954779625e-05, + "loss": 0.112, + "num_input_tokens_seen": 15224448, + "step": 21220 + }, + { + "epoch": 44.12681912681913, + "grad_norm": 0.27700328826904297, + "learning_rate": 2.2600381622158056e-05, + "loss": 0.0958, + "num_input_tokens_seen": 15228128, + "step": 21225 + }, + { + "epoch": 44.13721413721414, + "grad_norm": 0.22520798444747925, + "learning_rate": 2.2590609659587783e-05, + "loss": 0.0957, + "num_input_tokens_seen": 15231648, + "step": 21230 + }, + { + "epoch": 44.147609147609145, + "grad_norm": 0.3148146867752075, + "learning_rate": 2.2580838068575787e-05, + "loss": 0.1071, + "num_input_tokens_seen": 15235296, + "step": 21235 + }, + { + "epoch": 44.15800415800416, + "grad_norm": 0.27368173003196716, + "learning_rate": 2.257106685062896e-05, + "loss": 0.1205, + "num_input_tokens_seen": 15239040, + "step": 21240 + }, + { + "epoch": 44.16839916839917, + "grad_norm": 0.2557937204837799, + "learning_rate": 2.256129600725415e-05, + "loss": 0.1013, + "num_input_tokens_seen": 15242432, + "step": 21245 + }, + { + "epoch": 44.17879417879418, + "grad_norm": 0.14084333181381226, + "learning_rate": 2.2551525539958145e-05, + "loss": 0.109, + "num_input_tokens_seen": 15245984, + "step": 21250 + }, + { + "epoch": 44.189189189189186, + "grad_norm": 0.2742539346218109, + "learning_rate": 2.2541755450247663e-05, + "loss": 0.0869, + "num_input_tokens_seen": 15249536, + "step": 21255 + }, + { + "epoch": 44.1995841995842, + "grad_norm": 0.2853770852088928, + "learning_rate": 2.2531985739629382e-05, + "loss": 0.0966, + "num_input_tokens_seen": 15253248, + "step": 21260 + }, + { + "epoch": 44.20997920997921, + "grad_norm": 0.9243318438529968, + "learning_rate": 2.2522216409609924e-05, + "loss": 0.1144, + "num_input_tokens_seen": 15256832, + "step": 21265 + }, + { + "epoch": 44.22037422037422, + "grad_norm": 0.5635542869567871, + "learning_rate": 2.2512447461695826e-05, + "loss": 0.079, + "num_input_tokens_seen": 15260352, + "step": 21270 + }, + { + "epoch": 44.23076923076923, + "grad_norm": 0.15177816152572632, + "learning_rate": 2.2502678897393593e-05, + "loss": 0.1011, + "num_input_tokens_seen": 15263904, + "step": 21275 + }, + { + "epoch": 44.24116424116424, + "grad_norm": 0.18405839800834656, + "learning_rate": 2.2492910718209665e-05, + "loss": 0.1356, + "num_input_tokens_seen": 15267680, + "step": 21280 + }, + { + "epoch": 44.25155925155925, + "grad_norm": 0.4182160198688507, + "learning_rate": 2.2483142925650398e-05, + "loss": 0.1038, + "num_input_tokens_seen": 15271072, + "step": 21285 + }, + { + "epoch": 44.26195426195426, + "grad_norm": 0.2747960686683655, + "learning_rate": 2.247337552122213e-05, + "loss": 0.0968, + "num_input_tokens_seen": 15274560, + "step": 21290 + }, + { + "epoch": 44.272349272349274, + "grad_norm": 0.2523753046989441, + "learning_rate": 2.24636085064311e-05, + "loss": 0.0907, + "num_input_tokens_seen": 15278144, + "step": 21295 + }, + { + "epoch": 44.28274428274428, + "grad_norm": 1.177970290184021, + "learning_rate": 2.245384188278351e-05, + "loss": 0.1551, + "num_input_tokens_seen": 15281888, + "step": 21300 + }, + { + "epoch": 44.29313929313929, + "grad_norm": 0.39132487773895264, + "learning_rate": 2.2444075651785513e-05, + "loss": 0.1138, + "num_input_tokens_seen": 15285408, + "step": 21305 + }, + { + "epoch": 44.303534303534306, + "grad_norm": 0.46609073877334595, + "learning_rate": 2.243430981494316e-05, + "loss": 0.1075, + "num_input_tokens_seen": 15288800, + "step": 21310 + }, + { + "epoch": 44.313929313929314, + "grad_norm": 0.33513426780700684, + "learning_rate": 2.2424544373762475e-05, + "loss": 0.0831, + "num_input_tokens_seen": 15292384, + "step": 21315 + }, + { + "epoch": 44.32432432432432, + "grad_norm": 0.2515997886657715, + "learning_rate": 2.2414779329749418e-05, + "loss": 0.0769, + "num_input_tokens_seen": 15296032, + "step": 21320 + }, + { + "epoch": 44.33471933471934, + "grad_norm": 0.4956669509410858, + "learning_rate": 2.2405014684409873e-05, + "loss": 0.12, + "num_input_tokens_seen": 15299584, + "step": 21325 + }, + { + "epoch": 44.34511434511435, + "grad_norm": 0.11778823286294937, + "learning_rate": 2.239525043924968e-05, + "loss": 0.0609, + "num_input_tokens_seen": 15302944, + "step": 21330 + }, + { + "epoch": 44.355509355509355, + "grad_norm": 0.5970571041107178, + "learning_rate": 2.2385486595774592e-05, + "loss": 0.142, + "num_input_tokens_seen": 15306560, + "step": 21335 + }, + { + "epoch": 44.36590436590436, + "grad_norm": 0.46541112661361694, + "learning_rate": 2.237572315549033e-05, + "loss": 0.0962, + "num_input_tokens_seen": 15310176, + "step": 21340 + }, + { + "epoch": 44.37629937629938, + "grad_norm": 0.4874017834663391, + "learning_rate": 2.2365960119902545e-05, + "loss": 0.0739, + "num_input_tokens_seen": 15313824, + "step": 21345 + }, + { + "epoch": 44.38669438669439, + "grad_norm": 0.23368456959724426, + "learning_rate": 2.2356197490516806e-05, + "loss": 0.1241, + "num_input_tokens_seen": 15317504, + "step": 21350 + }, + { + "epoch": 44.397089397089395, + "grad_norm": 0.38399943709373474, + "learning_rate": 2.234643526883863e-05, + "loss": 0.1262, + "num_input_tokens_seen": 15321184, + "step": 21355 + }, + { + "epoch": 44.40748440748441, + "grad_norm": 0.10395118594169617, + "learning_rate": 2.2336673456373497e-05, + "loss": 0.0726, + "num_input_tokens_seen": 15324800, + "step": 21360 + }, + { + "epoch": 44.41787941787942, + "grad_norm": 0.2166614681482315, + "learning_rate": 2.2326912054626772e-05, + "loss": 0.0852, + "num_input_tokens_seen": 15328320, + "step": 21365 + }, + { + "epoch": 44.42827442827443, + "grad_norm": 0.2172916829586029, + "learning_rate": 2.2317151065103813e-05, + "loss": 0.0793, + "num_input_tokens_seen": 15332064, + "step": 21370 + }, + { + "epoch": 44.438669438669436, + "grad_norm": 0.21661807596683502, + "learning_rate": 2.2307390489309865e-05, + "loss": 0.1328, + "num_input_tokens_seen": 15335712, + "step": 21375 + }, + { + "epoch": 44.44906444906445, + "grad_norm": 0.40014365315437317, + "learning_rate": 2.2297630328750146e-05, + "loss": 0.1003, + "num_input_tokens_seen": 15339392, + "step": 21380 + }, + { + "epoch": 44.45945945945946, + "grad_norm": 0.4184725284576416, + "learning_rate": 2.228787058492979e-05, + "loss": 0.0941, + "num_input_tokens_seen": 15343136, + "step": 21385 + }, + { + "epoch": 44.46985446985447, + "grad_norm": 0.5087153315544128, + "learning_rate": 2.2278111259353875e-05, + "loss": 0.1516, + "num_input_tokens_seen": 15346784, + "step": 21390 + }, + { + "epoch": 44.48024948024948, + "grad_norm": 0.2611673176288605, + "learning_rate": 2.2268352353527395e-05, + "loss": 0.0883, + "num_input_tokens_seen": 15350272, + "step": 21395 + }, + { + "epoch": 44.49064449064449, + "grad_norm": 0.35910242795944214, + "learning_rate": 2.225859386895533e-05, + "loss": 0.094, + "num_input_tokens_seen": 15353920, + "step": 21400 + }, + { + "epoch": 44.49064449064449, + "eval_loss": 0.14329685270786285, + "eval_runtime": 7.7434, + "eval_samples_per_second": 110.546, + "eval_steps_per_second": 27.637, + "num_input_tokens_seen": 15353920, + "step": 21400 + }, + { + "epoch": 44.5010395010395, + "grad_norm": 0.22536958754062653, + "learning_rate": 2.2248835807142525e-05, + "loss": 0.0937, + "num_input_tokens_seen": 15357600, + "step": 21405 + }, + { + "epoch": 44.51143451143451, + "grad_norm": 0.16174185276031494, + "learning_rate": 2.2239078169593826e-05, + "loss": 0.0596, + "num_input_tokens_seen": 15361152, + "step": 21410 + }, + { + "epoch": 44.521829521829524, + "grad_norm": 0.39027732610702515, + "learning_rate": 2.222932095781396e-05, + "loss": 0.0912, + "num_input_tokens_seen": 15364832, + "step": 21415 + }, + { + "epoch": 44.53222453222453, + "grad_norm": 0.43955060839653015, + "learning_rate": 2.221956417330762e-05, + "loss": 0.1051, + "num_input_tokens_seen": 15368352, + "step": 21420 + }, + { + "epoch": 44.54261954261954, + "grad_norm": 0.3067525327205658, + "learning_rate": 2.2209807817579438e-05, + "loss": 0.1178, + "num_input_tokens_seen": 15371872, + "step": 21425 + }, + { + "epoch": 44.553014553014556, + "grad_norm": 0.5519826412200928, + "learning_rate": 2.220005189213394e-05, + "loss": 0.1293, + "num_input_tokens_seen": 15375584, + "step": 21430 + }, + { + "epoch": 44.563409563409564, + "grad_norm": 0.6044353246688843, + "learning_rate": 2.2190296398475624e-05, + "loss": 0.1142, + "num_input_tokens_seen": 15379040, + "step": 21435 + }, + { + "epoch": 44.57380457380457, + "grad_norm": 0.25406453013420105, + "learning_rate": 2.2180541338108926e-05, + "loss": 0.0559, + "num_input_tokens_seen": 15382560, + "step": 21440 + }, + { + "epoch": 44.58419958419958, + "grad_norm": 0.2272670716047287, + "learning_rate": 2.2170786712538176e-05, + "loss": 0.0879, + "num_input_tokens_seen": 15386048, + "step": 21445 + }, + { + "epoch": 44.5945945945946, + "grad_norm": 0.14831110835075378, + "learning_rate": 2.216103252326768e-05, + "loss": 0.1016, + "num_input_tokens_seen": 15389568, + "step": 21450 + }, + { + "epoch": 44.604989604989605, + "grad_norm": 0.29288044571876526, + "learning_rate": 2.2151278771801635e-05, + "loss": 0.1121, + "num_input_tokens_seen": 15393184, + "step": 21455 + }, + { + "epoch": 44.61538461538461, + "grad_norm": 0.37880197167396545, + "learning_rate": 2.21415254596442e-05, + "loss": 0.0853, + "num_input_tokens_seen": 15396640, + "step": 21460 + }, + { + "epoch": 44.62577962577963, + "grad_norm": 0.3217269480228424, + "learning_rate": 2.213177258829947e-05, + "loss": 0.1551, + "num_input_tokens_seen": 15400160, + "step": 21465 + }, + { + "epoch": 44.63617463617464, + "grad_norm": 0.3151080012321472, + "learning_rate": 2.2122020159271445e-05, + "loss": 0.1132, + "num_input_tokens_seen": 15403616, + "step": 21470 + }, + { + "epoch": 44.646569646569645, + "grad_norm": 0.5308640599250793, + "learning_rate": 2.2112268174064075e-05, + "loss": 0.0872, + "num_input_tokens_seen": 15407040, + "step": 21475 + }, + { + "epoch": 44.656964656964654, + "grad_norm": 0.3085906505584717, + "learning_rate": 2.2102516634181253e-05, + "loss": 0.0822, + "num_input_tokens_seen": 15410656, + "step": 21480 + }, + { + "epoch": 44.66735966735967, + "grad_norm": 0.41246944665908813, + "learning_rate": 2.209276554112677e-05, + "loss": 0.1078, + "num_input_tokens_seen": 15414208, + "step": 21485 + }, + { + "epoch": 44.67775467775468, + "grad_norm": 0.33304116129875183, + "learning_rate": 2.2083014896404384e-05, + "loss": 0.0976, + "num_input_tokens_seen": 15417888, + "step": 21490 + }, + { + "epoch": 44.688149688149686, + "grad_norm": 0.14864715933799744, + "learning_rate": 2.207326470151775e-05, + "loss": 0.0812, + "num_input_tokens_seen": 15421344, + "step": 21495 + }, + { + "epoch": 44.6985446985447, + "grad_norm": 0.38787442445755005, + "learning_rate": 2.2063514957970477e-05, + "loss": 0.1551, + "num_input_tokens_seen": 15424896, + "step": 21500 + }, + { + "epoch": 44.70893970893971, + "grad_norm": 0.4545612037181854, + "learning_rate": 2.205376566726611e-05, + "loss": 0.1275, + "num_input_tokens_seen": 15428544, + "step": 21505 + }, + { + "epoch": 44.71933471933472, + "grad_norm": 0.41664180159568787, + "learning_rate": 2.204401683090809e-05, + "loss": 0.1126, + "num_input_tokens_seen": 15432192, + "step": 21510 + }, + { + "epoch": 44.729729729729726, + "grad_norm": 0.5166271924972534, + "learning_rate": 2.203426845039982e-05, + "loss": 0.1381, + "num_input_tokens_seen": 15436032, + "step": 21515 + }, + { + "epoch": 44.74012474012474, + "grad_norm": 0.49872297048568726, + "learning_rate": 2.202452052724464e-05, + "loss": 0.1041, + "num_input_tokens_seen": 15439680, + "step": 21520 + }, + { + "epoch": 44.75051975051975, + "grad_norm": 0.5061789155006409, + "learning_rate": 2.2014773062945777e-05, + "loss": 0.1757, + "num_input_tokens_seen": 15443328, + "step": 21525 + }, + { + "epoch": 44.76091476091476, + "grad_norm": 0.36489611864089966, + "learning_rate": 2.2005026059006427e-05, + "loss": 0.078, + "num_input_tokens_seen": 15447072, + "step": 21530 + }, + { + "epoch": 44.771309771309774, + "grad_norm": 0.22897560894489288, + "learning_rate": 2.1995279516929695e-05, + "loss": 0.1127, + "num_input_tokens_seen": 15450752, + "step": 21535 + }, + { + "epoch": 44.78170478170478, + "grad_norm": 0.368352472782135, + "learning_rate": 2.1985533438218613e-05, + "loss": 0.0731, + "num_input_tokens_seen": 15454304, + "step": 21540 + }, + { + "epoch": 44.79209979209979, + "grad_norm": 0.25527217984199524, + "learning_rate": 2.197578782437617e-05, + "loss": 0.0723, + "num_input_tokens_seen": 15457888, + "step": 21545 + }, + { + "epoch": 44.802494802494806, + "grad_norm": 0.3137395977973938, + "learning_rate": 2.196604267690524e-05, + "loss": 0.1287, + "num_input_tokens_seen": 15461600, + "step": 21550 + }, + { + "epoch": 44.812889812889814, + "grad_norm": 0.19629377126693726, + "learning_rate": 2.195629799730865e-05, + "loss": 0.0995, + "num_input_tokens_seen": 15465184, + "step": 21555 + }, + { + "epoch": 44.82328482328482, + "grad_norm": 0.42879343032836914, + "learning_rate": 2.1946553787089173e-05, + "loss": 0.0798, + "num_input_tokens_seen": 15468832, + "step": 21560 + }, + { + "epoch": 44.83367983367983, + "grad_norm": 0.20250160992145538, + "learning_rate": 2.193681004774947e-05, + "loss": 0.1013, + "num_input_tokens_seen": 15472544, + "step": 21565 + }, + { + "epoch": 44.84407484407485, + "grad_norm": 0.2624375820159912, + "learning_rate": 2.1927066780792154e-05, + "loss": 0.0779, + "num_input_tokens_seen": 15476000, + "step": 21570 + }, + { + "epoch": 44.854469854469855, + "grad_norm": 0.4991358816623688, + "learning_rate": 2.191732398771975e-05, + "loss": 0.1529, + "num_input_tokens_seen": 15479616, + "step": 21575 + }, + { + "epoch": 44.86486486486486, + "grad_norm": 0.7504229545593262, + "learning_rate": 2.1907581670034725e-05, + "loss": 0.1419, + "num_input_tokens_seen": 15483360, + "step": 21580 + }, + { + "epoch": 44.87525987525988, + "grad_norm": 0.30532971024513245, + "learning_rate": 2.189783982923948e-05, + "loss": 0.0818, + "num_input_tokens_seen": 15486688, + "step": 21585 + }, + { + "epoch": 44.88565488565489, + "grad_norm": 0.9003528356552124, + "learning_rate": 2.1888098466836303e-05, + "loss": 0.1317, + "num_input_tokens_seen": 15490304, + "step": 21590 + }, + { + "epoch": 44.896049896049895, + "grad_norm": 0.4625099301338196, + "learning_rate": 2.1878357584327457e-05, + "loss": 0.0958, + "num_input_tokens_seen": 15493728, + "step": 21595 + }, + { + "epoch": 44.906444906444904, + "grad_norm": 0.7176742553710938, + "learning_rate": 2.1868617183215103e-05, + "loss": 0.1047, + "num_input_tokens_seen": 15497376, + "step": 21600 + }, + { + "epoch": 44.906444906444904, + "eval_loss": 0.14304064214229584, + "eval_runtime": 7.749, + "eval_samples_per_second": 110.466, + "eval_steps_per_second": 27.616, + "num_input_tokens_seen": 15497376, + "step": 21600 + }, + { + "epoch": 44.91683991683992, + "grad_norm": 0.26747143268585205, + "learning_rate": 2.1858877265001327e-05, + "loss": 0.0775, + "num_input_tokens_seen": 15500992, + "step": 21605 + }, + { + "epoch": 44.92723492723493, + "grad_norm": 0.4704034626483917, + "learning_rate": 2.184913783118816e-05, + "loss": 0.1049, + "num_input_tokens_seen": 15504416, + "step": 21610 + }, + { + "epoch": 44.937629937629936, + "grad_norm": 0.24264231324195862, + "learning_rate": 2.1839398883277522e-05, + "loss": 0.1316, + "num_input_tokens_seen": 15508064, + "step": 21615 + }, + { + "epoch": 44.94802494802495, + "grad_norm": 0.44918403029441833, + "learning_rate": 2.182966042277129e-05, + "loss": 0.1599, + "num_input_tokens_seen": 15511872, + "step": 21620 + }, + { + "epoch": 44.95841995841996, + "grad_norm": 0.1701088547706604, + "learning_rate": 2.181992245117128e-05, + "loss": 0.0942, + "num_input_tokens_seen": 15515488, + "step": 21625 + }, + { + "epoch": 44.96881496881497, + "grad_norm": 0.2860129177570343, + "learning_rate": 2.181018496997918e-05, + "loss": 0.0824, + "num_input_tokens_seen": 15519104, + "step": 21630 + }, + { + "epoch": 44.979209979209976, + "grad_norm": 0.27449172735214233, + "learning_rate": 2.1800447980696648e-05, + "loss": 0.1262, + "num_input_tokens_seen": 15522656, + "step": 21635 + }, + { + "epoch": 44.98960498960499, + "grad_norm": 0.4405285716056824, + "learning_rate": 2.1790711484825248e-05, + "loss": 0.1121, + "num_input_tokens_seen": 15526208, + "step": 21640 + }, + { + "epoch": 45.0, + "grad_norm": 0.18015526235103607, + "learning_rate": 2.178097548386646e-05, + "loss": 0.0805, + "num_input_tokens_seen": 15529752, + "step": 21645 + }, + { + "epoch": 45.01039501039501, + "grad_norm": 0.4362819492816925, + "learning_rate": 2.1771239979321712e-05, + "loss": 0.1123, + "num_input_tokens_seen": 15533240, + "step": 21650 + }, + { + "epoch": 45.020790020790024, + "grad_norm": 0.5028203129768372, + "learning_rate": 2.1761504972692327e-05, + "loss": 0.0955, + "num_input_tokens_seen": 15536824, + "step": 21655 + }, + { + "epoch": 45.03118503118503, + "grad_norm": 0.22163653373718262, + "learning_rate": 2.1751770465479572e-05, + "loss": 0.0992, + "num_input_tokens_seen": 15540344, + "step": 21660 + }, + { + "epoch": 45.04158004158004, + "grad_norm": 0.4307694733142853, + "learning_rate": 2.174203645918464e-05, + "loss": 0.0881, + "num_input_tokens_seen": 15543928, + "step": 21665 + }, + { + "epoch": 45.05197505197505, + "grad_norm": 0.7421835660934448, + "learning_rate": 2.1732302955308624e-05, + "loss": 0.1487, + "num_input_tokens_seen": 15547480, + "step": 21670 + }, + { + "epoch": 45.062370062370064, + "grad_norm": 0.7514640688896179, + "learning_rate": 2.172256995535255e-05, + "loss": 0.0788, + "num_input_tokens_seen": 15551032, + "step": 21675 + }, + { + "epoch": 45.07276507276507, + "grad_norm": 0.276593953371048, + "learning_rate": 2.171283746081739e-05, + "loss": 0.1118, + "num_input_tokens_seen": 15554616, + "step": 21680 + }, + { + "epoch": 45.08316008316008, + "grad_norm": 0.2506631016731262, + "learning_rate": 2.1703105473203988e-05, + "loss": 0.1032, + "num_input_tokens_seen": 15558296, + "step": 21685 + }, + { + "epoch": 45.093555093555096, + "grad_norm": 0.24773705005645752, + "learning_rate": 2.1693373994013168e-05, + "loss": 0.084, + "num_input_tokens_seen": 15561976, + "step": 21690 + }, + { + "epoch": 45.103950103950105, + "grad_norm": 0.5698941946029663, + "learning_rate": 2.168364302474562e-05, + "loss": 0.1313, + "num_input_tokens_seen": 15565496, + "step": 21695 + }, + { + "epoch": 45.11434511434511, + "grad_norm": 0.6212994456291199, + "learning_rate": 2.167391256690199e-05, + "loss": 0.12, + "num_input_tokens_seen": 15569240, + "step": 21700 + }, + { + "epoch": 45.12474012474012, + "grad_norm": 0.4913616180419922, + "learning_rate": 2.1664182621982855e-05, + "loss": 0.0904, + "num_input_tokens_seen": 15572888, + "step": 21705 + }, + { + "epoch": 45.13513513513514, + "grad_norm": 0.638789713382721, + "learning_rate": 2.1654453191488673e-05, + "loss": 0.1289, + "num_input_tokens_seen": 15576536, + "step": 21710 + }, + { + "epoch": 45.145530145530145, + "grad_norm": 0.2964896857738495, + "learning_rate": 2.1644724276919846e-05, + "loss": 0.125, + "num_input_tokens_seen": 15580056, + "step": 21715 + }, + { + "epoch": 45.15592515592515, + "grad_norm": 0.1696990728378296, + "learning_rate": 2.1634995879776715e-05, + "loss": 0.1077, + "num_input_tokens_seen": 15583640, + "step": 21720 + }, + { + "epoch": 45.16632016632017, + "grad_norm": 0.34226512908935547, + "learning_rate": 2.162526800155949e-05, + "loss": 0.1481, + "num_input_tokens_seen": 15587384, + "step": 21725 + }, + { + "epoch": 45.17671517671518, + "grad_norm": 0.1576625406742096, + "learning_rate": 2.1615540643768363e-05, + "loss": 0.1275, + "num_input_tokens_seen": 15590968, + "step": 21730 + }, + { + "epoch": 45.187110187110186, + "grad_norm": 0.41918379068374634, + "learning_rate": 2.160581380790339e-05, + "loss": 0.1399, + "num_input_tokens_seen": 15594520, + "step": 21735 + }, + { + "epoch": 45.197505197505194, + "grad_norm": 0.7163155674934387, + "learning_rate": 2.1596087495464586e-05, + "loss": 0.1211, + "num_input_tokens_seen": 15597944, + "step": 21740 + }, + { + "epoch": 45.20790020790021, + "grad_norm": 0.15659886598587036, + "learning_rate": 2.1586361707951866e-05, + "loss": 0.1096, + "num_input_tokens_seen": 15601528, + "step": 21745 + }, + { + "epoch": 45.21829521829522, + "grad_norm": 0.48612624406814575, + "learning_rate": 2.157663644686507e-05, + "loss": 0.0893, + "num_input_tokens_seen": 15605144, + "step": 21750 + }, + { + "epoch": 45.228690228690226, + "grad_norm": 0.1320328563451767, + "learning_rate": 2.156691171370396e-05, + "loss": 0.1024, + "num_input_tokens_seen": 15608824, + "step": 21755 + }, + { + "epoch": 45.23908523908524, + "grad_norm": 0.5935676097869873, + "learning_rate": 2.1557187509968195e-05, + "loss": 0.1048, + "num_input_tokens_seen": 15612312, + "step": 21760 + }, + { + "epoch": 45.24948024948025, + "grad_norm": 0.20281599462032318, + "learning_rate": 2.1547463837157382e-05, + "loss": 0.1004, + "num_input_tokens_seen": 15616056, + "step": 21765 + }, + { + "epoch": 45.25987525987526, + "grad_norm": 0.2936372458934784, + "learning_rate": 2.1537740696771045e-05, + "loss": 0.0936, + "num_input_tokens_seen": 15619480, + "step": 21770 + }, + { + "epoch": 45.270270270270274, + "grad_norm": 0.32151031494140625, + "learning_rate": 2.1528018090308587e-05, + "loss": 0.1004, + "num_input_tokens_seen": 15623096, + "step": 21775 + }, + { + "epoch": 45.28066528066528, + "grad_norm": 0.5480704307556152, + "learning_rate": 2.151829601926938e-05, + "loss": 0.0899, + "num_input_tokens_seen": 15626808, + "step": 21780 + }, + { + "epoch": 45.29106029106029, + "grad_norm": 0.4617905616760254, + "learning_rate": 2.1508574485152684e-05, + "loss": 0.1024, + "num_input_tokens_seen": 15630296, + "step": 21785 + }, + { + "epoch": 45.3014553014553, + "grad_norm": 0.4643784165382385, + "learning_rate": 2.1498853489457667e-05, + "loss": 0.0955, + "num_input_tokens_seen": 15634040, + "step": 21790 + }, + { + "epoch": 45.311850311850314, + "grad_norm": 0.14904063940048218, + "learning_rate": 2.1489133033683455e-05, + "loss": 0.0947, + "num_input_tokens_seen": 15637560, + "step": 21795 + }, + { + "epoch": 45.32224532224532, + "grad_norm": 0.770622730255127, + "learning_rate": 2.1479413119329038e-05, + "loss": 0.1176, + "num_input_tokens_seen": 15641208, + "step": 21800 + }, + { + "epoch": 45.32224532224532, + "eval_loss": 0.14183759689331055, + "eval_runtime": 7.7438, + "eval_samples_per_second": 110.54, + "eval_steps_per_second": 27.635, + "num_input_tokens_seen": 15641208, + "step": 21800 + }, + { + "epoch": 45.33264033264033, + "grad_norm": 0.31915128231048584, + "learning_rate": 2.1469693747893355e-05, + "loss": 0.131, + "num_input_tokens_seen": 15645016, + "step": 21805 + }, + { + "epoch": 45.343035343035346, + "grad_norm": 0.44242870807647705, + "learning_rate": 2.1459974920875274e-05, + "loss": 0.1055, + "num_input_tokens_seen": 15648760, + "step": 21810 + }, + { + "epoch": 45.353430353430355, + "grad_norm": 0.26276278495788574, + "learning_rate": 2.145025663977354e-05, + "loss": 0.1228, + "num_input_tokens_seen": 15652248, + "step": 21815 + }, + { + "epoch": 45.36382536382536, + "grad_norm": 0.36128464341163635, + "learning_rate": 2.1440538906086844e-05, + "loss": 0.0883, + "num_input_tokens_seen": 15655768, + "step": 21820 + }, + { + "epoch": 45.37422037422037, + "grad_norm": 0.38507702946662903, + "learning_rate": 2.1430821721313782e-05, + "loss": 0.0853, + "num_input_tokens_seen": 15659224, + "step": 21825 + }, + { + "epoch": 45.38461538461539, + "grad_norm": 1.358605146408081, + "learning_rate": 2.142110508695286e-05, + "loss": 0.0984, + "num_input_tokens_seen": 15662712, + "step": 21830 + }, + { + "epoch": 45.395010395010395, + "grad_norm": 0.18198032677173615, + "learning_rate": 2.1411389004502515e-05, + "loss": 0.1148, + "num_input_tokens_seen": 15666104, + "step": 21835 + }, + { + "epoch": 45.4054054054054, + "grad_norm": 0.232199028134346, + "learning_rate": 2.140167347546107e-05, + "loss": 0.0999, + "num_input_tokens_seen": 15669624, + "step": 21840 + }, + { + "epoch": 45.41580041580042, + "grad_norm": 0.6061207056045532, + "learning_rate": 2.1391958501326793e-05, + "loss": 0.108, + "num_input_tokens_seen": 15673304, + "step": 21845 + }, + { + "epoch": 45.42619542619543, + "grad_norm": 0.39271050691604614, + "learning_rate": 2.1382244083597873e-05, + "loss": 0.101, + "num_input_tokens_seen": 15676888, + "step": 21850 + }, + { + "epoch": 45.436590436590436, + "grad_norm": 0.6138716340065002, + "learning_rate": 2.137253022377237e-05, + "loss": 0.1306, + "num_input_tokens_seen": 15680472, + "step": 21855 + }, + { + "epoch": 45.446985446985444, + "grad_norm": 0.21904781460762024, + "learning_rate": 2.136281692334829e-05, + "loss": 0.0991, + "num_input_tokens_seen": 15684184, + "step": 21860 + }, + { + "epoch": 45.45738045738046, + "grad_norm": 0.5989813208580017, + "learning_rate": 2.135310418382356e-05, + "loss": 0.1059, + "num_input_tokens_seen": 15687800, + "step": 21865 + }, + { + "epoch": 45.46777546777547, + "grad_norm": 0.2102815955877304, + "learning_rate": 2.134339200669598e-05, + "loss": 0.0809, + "num_input_tokens_seen": 15691288, + "step": 21870 + }, + { + "epoch": 45.478170478170476, + "grad_norm": 0.30701756477355957, + "learning_rate": 2.133368039346331e-05, + "loss": 0.1323, + "num_input_tokens_seen": 15694840, + "step": 21875 + }, + { + "epoch": 45.48856548856549, + "grad_norm": 0.1585642248392105, + "learning_rate": 2.1323969345623195e-05, + "loss": 0.106, + "num_input_tokens_seen": 15698648, + "step": 21880 + }, + { + "epoch": 45.4989604989605, + "grad_norm": 0.26612767577171326, + "learning_rate": 2.1314258864673207e-05, + "loss": 0.0658, + "num_input_tokens_seen": 15702264, + "step": 21885 + }, + { + "epoch": 45.50935550935551, + "grad_norm": 0.528351366519928, + "learning_rate": 2.130454895211082e-05, + "loss": 0.1235, + "num_input_tokens_seen": 15705944, + "step": 21890 + }, + { + "epoch": 45.51975051975052, + "grad_norm": 0.32771384716033936, + "learning_rate": 2.129483960943342e-05, + "loss": 0.1386, + "num_input_tokens_seen": 15709464, + "step": 21895 + }, + { + "epoch": 45.53014553014553, + "grad_norm": 0.3834046423435211, + "learning_rate": 2.128513083813831e-05, + "loss": 0.0828, + "num_input_tokens_seen": 15713048, + "step": 21900 + }, + { + "epoch": 45.54054054054054, + "grad_norm": 0.2437812238931656, + "learning_rate": 2.1275422639722724e-05, + "loss": 0.0938, + "num_input_tokens_seen": 15716600, + "step": 21905 + }, + { + "epoch": 45.55093555093555, + "grad_norm": 0.189383864402771, + "learning_rate": 2.126571501568376e-05, + "loss": 0.0603, + "num_input_tokens_seen": 15720280, + "step": 21910 + }, + { + "epoch": 45.561330561330564, + "grad_norm": 0.16722023487091064, + "learning_rate": 2.1256007967518478e-05, + "loss": 0.093, + "num_input_tokens_seen": 15723896, + "step": 21915 + }, + { + "epoch": 45.57172557172557, + "grad_norm": 0.2514719069004059, + "learning_rate": 2.124630149672381e-05, + "loss": 0.0929, + "num_input_tokens_seen": 15727544, + "step": 21920 + }, + { + "epoch": 45.58212058212058, + "grad_norm": 0.14282868802547455, + "learning_rate": 2.1236595604796624e-05, + "loss": 0.1147, + "num_input_tokens_seen": 15731096, + "step": 21925 + }, + { + "epoch": 45.59251559251559, + "grad_norm": 0.30594897270202637, + "learning_rate": 2.1226890293233693e-05, + "loss": 0.0999, + "num_input_tokens_seen": 15734648, + "step": 21930 + }, + { + "epoch": 45.602910602910605, + "grad_norm": 0.13027560710906982, + "learning_rate": 2.1217185563531694e-05, + "loss": 0.0749, + "num_input_tokens_seen": 15738200, + "step": 21935 + }, + { + "epoch": 45.61330561330561, + "grad_norm": 0.5739872455596924, + "learning_rate": 2.120748141718721e-05, + "loss": 0.1063, + "num_input_tokens_seen": 15741752, + "step": 21940 + }, + { + "epoch": 45.62370062370062, + "grad_norm": 0.46864888072013855, + "learning_rate": 2.1197777855696765e-05, + "loss": 0.1062, + "num_input_tokens_seen": 15745208, + "step": 21945 + }, + { + "epoch": 45.63409563409564, + "grad_norm": 0.4322369396686554, + "learning_rate": 2.1188074880556746e-05, + "loss": 0.1107, + "num_input_tokens_seen": 15748856, + "step": 21950 + }, + { + "epoch": 45.644490644490645, + "grad_norm": 0.1388307511806488, + "learning_rate": 2.1178372493263495e-05, + "loss": 0.1282, + "num_input_tokens_seen": 15752408, + "step": 21955 + }, + { + "epoch": 45.65488565488565, + "grad_norm": 0.36025184392929077, + "learning_rate": 2.116867069531322e-05, + "loss": 0.1326, + "num_input_tokens_seen": 15756056, + "step": 21960 + }, + { + "epoch": 45.66528066528066, + "grad_norm": 0.5335524082183838, + "learning_rate": 2.1158969488202073e-05, + "loss": 0.1091, + "num_input_tokens_seen": 15759672, + "step": 21965 + }, + { + "epoch": 45.67567567567568, + "grad_norm": 0.27270910143852234, + "learning_rate": 2.114926887342611e-05, + "loss": 0.085, + "num_input_tokens_seen": 15763256, + "step": 21970 + }, + { + "epoch": 45.686070686070686, + "grad_norm": 0.24215811491012573, + "learning_rate": 2.113956885248127e-05, + "loss": 0.067, + "num_input_tokens_seen": 15766712, + "step": 21975 + }, + { + "epoch": 45.696465696465694, + "grad_norm": 0.39377114176750183, + "learning_rate": 2.112986942686342e-05, + "loss": 0.0808, + "num_input_tokens_seen": 15770200, + "step": 21980 + }, + { + "epoch": 45.70686070686071, + "grad_norm": 0.5577604174613953, + "learning_rate": 2.112017059806835e-05, + "loss": 0.1276, + "num_input_tokens_seen": 15773784, + "step": 21985 + }, + { + "epoch": 45.71725571725572, + "grad_norm": 0.5248695611953735, + "learning_rate": 2.1110472367591724e-05, + "loss": 0.1371, + "num_input_tokens_seen": 15777304, + "step": 21990 + }, + { + "epoch": 45.727650727650726, + "grad_norm": 0.22350221872329712, + "learning_rate": 2.1100774736929145e-05, + "loss": 0.061, + "num_input_tokens_seen": 15780824, + "step": 21995 + }, + { + "epoch": 45.73804573804574, + "grad_norm": 0.3684428930282593, + "learning_rate": 2.10910777075761e-05, + "loss": 0.0974, + "num_input_tokens_seen": 15784536, + "step": 22000 + }, + { + "epoch": 45.73804573804574, + "eval_loss": 0.14436741173267365, + "eval_runtime": 7.7635, + "eval_samples_per_second": 110.26, + "eval_steps_per_second": 27.565, + "num_input_tokens_seen": 15784536, + "step": 22000 + }, + { + "epoch": 45.74844074844075, + "grad_norm": 0.7044190764427185, + "learning_rate": 2.108138128102799e-05, + "loss": 0.0809, + "num_input_tokens_seen": 15787992, + "step": 22005 + }, + { + "epoch": 45.75883575883576, + "grad_norm": 0.40222683548927307, + "learning_rate": 2.107168545878014e-05, + "loss": 0.085, + "num_input_tokens_seen": 15791640, + "step": 22010 + }, + { + "epoch": 45.76923076923077, + "grad_norm": 0.13372057676315308, + "learning_rate": 2.106199024232775e-05, + "loss": 0.0847, + "num_input_tokens_seen": 15795256, + "step": 22015 + }, + { + "epoch": 45.77962577962578, + "grad_norm": 0.29446953535079956, + "learning_rate": 2.105229563316595e-05, + "loss": 0.0884, + "num_input_tokens_seen": 15798872, + "step": 22020 + }, + { + "epoch": 45.79002079002079, + "grad_norm": 0.304277241230011, + "learning_rate": 2.1042601632789784e-05, + "loss": 0.1298, + "num_input_tokens_seen": 15802584, + "step": 22025 + }, + { + "epoch": 45.8004158004158, + "grad_norm": 0.38235244154930115, + "learning_rate": 2.103290824269417e-05, + "loss": 0.0963, + "num_input_tokens_seen": 15806296, + "step": 22030 + }, + { + "epoch": 45.810810810810814, + "grad_norm": 0.30193039774894714, + "learning_rate": 2.1023215464373965e-05, + "loss": 0.1328, + "num_input_tokens_seen": 15809880, + "step": 22035 + }, + { + "epoch": 45.82120582120582, + "grad_norm": 0.38330206274986267, + "learning_rate": 2.1013523299323908e-05, + "loss": 0.0989, + "num_input_tokens_seen": 15813400, + "step": 22040 + }, + { + "epoch": 45.83160083160083, + "grad_norm": 0.24037526547908783, + "learning_rate": 2.1003831749038654e-05, + "loss": 0.1207, + "num_input_tokens_seen": 15816984, + "step": 22045 + }, + { + "epoch": 45.84199584199584, + "grad_norm": 0.5685610175132751, + "learning_rate": 2.099414081501277e-05, + "loss": 0.1191, + "num_input_tokens_seen": 15820696, + "step": 22050 + }, + { + "epoch": 45.852390852390855, + "grad_norm": 0.5045654773712158, + "learning_rate": 2.09844504987407e-05, + "loss": 0.0735, + "num_input_tokens_seen": 15824312, + "step": 22055 + }, + { + "epoch": 45.86278586278586, + "grad_norm": 0.1556316763162613, + "learning_rate": 2.097476080171683e-05, + "loss": 0.1, + "num_input_tokens_seen": 15827832, + "step": 22060 + }, + { + "epoch": 45.87318087318087, + "grad_norm": 0.3612091839313507, + "learning_rate": 2.0965071725435436e-05, + "loss": 0.0849, + "num_input_tokens_seen": 15831384, + "step": 22065 + }, + { + "epoch": 45.88357588357589, + "grad_norm": 0.23848484456539154, + "learning_rate": 2.0955383271390684e-05, + "loss": 0.0856, + "num_input_tokens_seen": 15834936, + "step": 22070 + }, + { + "epoch": 45.893970893970895, + "grad_norm": 0.3834417462348938, + "learning_rate": 2.094569544107666e-05, + "loss": 0.1365, + "num_input_tokens_seen": 15838520, + "step": 22075 + }, + { + "epoch": 45.9043659043659, + "grad_norm": 0.1837914139032364, + "learning_rate": 2.093600823598735e-05, + "loss": 0.1156, + "num_input_tokens_seen": 15842200, + "step": 22080 + }, + { + "epoch": 45.91476091476091, + "grad_norm": 0.19110405445098877, + "learning_rate": 2.092632165761663e-05, + "loss": 0.1011, + "num_input_tokens_seen": 15845848, + "step": 22085 + }, + { + "epoch": 45.92515592515593, + "grad_norm": 0.33203551173210144, + "learning_rate": 2.091663570745832e-05, + "loss": 0.13, + "num_input_tokens_seen": 15849656, + "step": 22090 + }, + { + "epoch": 45.935550935550935, + "grad_norm": 0.18122564256191254, + "learning_rate": 2.0906950387006086e-05, + "loss": 0.0773, + "num_input_tokens_seen": 15853112, + "step": 22095 + }, + { + "epoch": 45.945945945945944, + "grad_norm": 0.34572720527648926, + "learning_rate": 2.0897265697753543e-05, + "loss": 0.0972, + "num_input_tokens_seen": 15856600, + "step": 22100 + }, + { + "epoch": 45.95634095634096, + "grad_norm": 0.10803062468767166, + "learning_rate": 2.088758164119419e-05, + "loss": 0.0802, + "num_input_tokens_seen": 15860056, + "step": 22105 + }, + { + "epoch": 45.96673596673597, + "grad_norm": 0.32027536630630493, + "learning_rate": 2.0877898218821428e-05, + "loss": 0.1316, + "num_input_tokens_seen": 15863672, + "step": 22110 + }, + { + "epoch": 45.977130977130976, + "grad_norm": 0.5292006134986877, + "learning_rate": 2.0868215432128565e-05, + "loss": 0.0922, + "num_input_tokens_seen": 15867256, + "step": 22115 + }, + { + "epoch": 45.987525987525984, + "grad_norm": 0.39965125918388367, + "learning_rate": 2.0858533282608796e-05, + "loss": 0.1034, + "num_input_tokens_seen": 15870808, + "step": 22120 + }, + { + "epoch": 45.997920997921, + "grad_norm": 1.1297757625579834, + "learning_rate": 2.084885177175524e-05, + "loss": 0.1359, + "num_input_tokens_seen": 15874360, + "step": 22125 + }, + { + "epoch": 46.00831600831601, + "grad_norm": 0.3094165325164795, + "learning_rate": 2.0839170901060917e-05, + "loss": 0.0589, + "num_input_tokens_seen": 15877936, + "step": 22130 + }, + { + "epoch": 46.018711018711016, + "grad_norm": 0.38554930686950684, + "learning_rate": 2.082949067201872e-05, + "loss": 0.0713, + "num_input_tokens_seen": 15881488, + "step": 22135 + }, + { + "epoch": 46.02910602910603, + "grad_norm": 0.23190228641033173, + "learning_rate": 2.0819811086121475e-05, + "loss": 0.0854, + "num_input_tokens_seen": 15885104, + "step": 22140 + }, + { + "epoch": 46.03950103950104, + "grad_norm": 0.5361478328704834, + "learning_rate": 2.08101321448619e-05, + "loss": 0.0868, + "num_input_tokens_seen": 15888816, + "step": 22145 + }, + { + "epoch": 46.04989604989605, + "grad_norm": 0.28925028443336487, + "learning_rate": 2.080045384973259e-05, + "loss": 0.0823, + "num_input_tokens_seen": 15892336, + "step": 22150 + }, + { + "epoch": 46.06029106029106, + "grad_norm": 0.7664822936058044, + "learning_rate": 2.0790776202226082e-05, + "loss": 0.1044, + "num_input_tokens_seen": 15895920, + "step": 22155 + }, + { + "epoch": 46.07068607068607, + "grad_norm": 0.27852630615234375, + "learning_rate": 2.078109920383477e-05, + "loss": 0.1253, + "num_input_tokens_seen": 15899632, + "step": 22160 + }, + { + "epoch": 46.08108108108108, + "grad_norm": 0.5831912755966187, + "learning_rate": 2.0771422856050978e-05, + "loss": 0.1235, + "num_input_tokens_seen": 15903312, + "step": 22165 + }, + { + "epoch": 46.09147609147609, + "grad_norm": 1.7949802875518799, + "learning_rate": 2.076174716036693e-05, + "loss": 0.1273, + "num_input_tokens_seen": 15906832, + "step": 22170 + }, + { + "epoch": 46.101871101871104, + "grad_norm": 0.5230218768119812, + "learning_rate": 2.075207211827472e-05, + "loss": 0.0892, + "num_input_tokens_seen": 15910384, + "step": 22175 + }, + { + "epoch": 46.11226611226611, + "grad_norm": 0.3244509994983673, + "learning_rate": 2.074239773126638e-05, + "loss": 0.1041, + "num_input_tokens_seen": 15914064, + "step": 22180 + }, + { + "epoch": 46.12266112266112, + "grad_norm": 0.3469657897949219, + "learning_rate": 2.073272400083382e-05, + "loss": 0.1015, + "num_input_tokens_seen": 15917776, + "step": 22185 + }, + { + "epoch": 46.13305613305613, + "grad_norm": 0.5926372408866882, + "learning_rate": 2.072305092846883e-05, + "loss": 0.092, + "num_input_tokens_seen": 15921296, + "step": 22190 + }, + { + "epoch": 46.143451143451145, + "grad_norm": 0.32404717803001404, + "learning_rate": 2.0713378515663152e-05, + "loss": 0.0746, + "num_input_tokens_seen": 15924944, + "step": 22195 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 0.225862517952919, + "learning_rate": 2.070370676390836e-05, + "loss": 0.0903, + "num_input_tokens_seen": 15928528, + "step": 22200 + }, + { + "epoch": 46.15384615384615, + "eval_loss": 0.14569991827011108, + "eval_runtime": 7.756, + "eval_samples_per_second": 110.367, + "eval_steps_per_second": 27.592, + "num_input_tokens_seen": 15928528, + "step": 22200 + }, + { + "epoch": 46.16424116424116, + "grad_norm": 0.2057313174009323, + "learning_rate": 2.0694035674695974e-05, + "loss": 0.1177, + "num_input_tokens_seen": 15932144, + "step": 22205 + }, + { + "epoch": 46.17463617463618, + "grad_norm": 0.41035225987434387, + "learning_rate": 2.0684365249517416e-05, + "loss": 0.0709, + "num_input_tokens_seen": 15935824, + "step": 22210 + }, + { + "epoch": 46.185031185031185, + "grad_norm": 0.6951080560684204, + "learning_rate": 2.067469548986396e-05, + "loss": 0.0931, + "num_input_tokens_seen": 15939632, + "step": 22215 + }, + { + "epoch": 46.195426195426194, + "grad_norm": 0.2311473786830902, + "learning_rate": 2.066502639722681e-05, + "loss": 0.1337, + "num_input_tokens_seen": 15943248, + "step": 22220 + }, + { + "epoch": 46.20582120582121, + "grad_norm": 0.29863473773002625, + "learning_rate": 2.065535797309708e-05, + "loss": 0.0849, + "num_input_tokens_seen": 15946736, + "step": 22225 + }, + { + "epoch": 46.21621621621622, + "grad_norm": 0.19225908815860748, + "learning_rate": 2.0645690218965736e-05, + "loss": 0.0947, + "num_input_tokens_seen": 15950288, + "step": 22230 + }, + { + "epoch": 46.226611226611226, + "grad_norm": 0.24555465579032898, + "learning_rate": 2.063602313632369e-05, + "loss": 0.0819, + "num_input_tokens_seen": 15954032, + "step": 22235 + }, + { + "epoch": 46.237006237006234, + "grad_norm": 0.3508809208869934, + "learning_rate": 2.0626356726661704e-05, + "loss": 0.1345, + "num_input_tokens_seen": 15957552, + "step": 22240 + }, + { + "epoch": 46.24740124740125, + "grad_norm": 0.6313157081604004, + "learning_rate": 2.0616690991470477e-05, + "loss": 0.0977, + "num_input_tokens_seen": 15961136, + "step": 22245 + }, + { + "epoch": 46.25779625779626, + "grad_norm": 0.2909218966960907, + "learning_rate": 2.0607025932240595e-05, + "loss": 0.0926, + "num_input_tokens_seen": 15964912, + "step": 22250 + }, + { + "epoch": 46.268191268191266, + "grad_norm": 0.15083792805671692, + "learning_rate": 2.059736155046251e-05, + "loss": 0.0982, + "num_input_tokens_seen": 15968464, + "step": 22255 + }, + { + "epoch": 46.27858627858628, + "grad_norm": 0.5259668827056885, + "learning_rate": 2.0587697847626603e-05, + "loss": 0.161, + "num_input_tokens_seen": 15972048, + "step": 22260 + }, + { + "epoch": 46.28898128898129, + "grad_norm": 1.1711900234222412, + "learning_rate": 2.057803482522314e-05, + "loss": 0.1308, + "num_input_tokens_seen": 15975536, + "step": 22265 + }, + { + "epoch": 46.2993762993763, + "grad_norm": 0.2422715425491333, + "learning_rate": 2.056837248474227e-05, + "loss": 0.0807, + "num_input_tokens_seen": 15979024, + "step": 22270 + }, + { + "epoch": 46.30977130977131, + "grad_norm": 0.2193056344985962, + "learning_rate": 2.0558710827674064e-05, + "loss": 0.101, + "num_input_tokens_seen": 15982576, + "step": 22275 + }, + { + "epoch": 46.32016632016632, + "grad_norm": 0.340973824262619, + "learning_rate": 2.054904985550845e-05, + "loss": 0.1112, + "num_input_tokens_seen": 15986096, + "step": 22280 + }, + { + "epoch": 46.33056133056133, + "grad_norm": 0.5795242786407471, + "learning_rate": 2.0539389569735287e-05, + "loss": 0.1122, + "num_input_tokens_seen": 15989840, + "step": 22285 + }, + { + "epoch": 46.34095634095634, + "grad_norm": 1.050465703010559, + "learning_rate": 2.052972997184431e-05, + "loss": 0.1057, + "num_input_tokens_seen": 15993296, + "step": 22290 + }, + { + "epoch": 46.351351351351354, + "grad_norm": 0.29817792773246765, + "learning_rate": 2.0520071063325146e-05, + "loss": 0.0894, + "num_input_tokens_seen": 15996784, + "step": 22295 + }, + { + "epoch": 46.36174636174636, + "grad_norm": 0.1873096227645874, + "learning_rate": 2.051041284566732e-05, + "loss": 0.089, + "num_input_tokens_seen": 16000368, + "step": 22300 + }, + { + "epoch": 46.37214137214137, + "grad_norm": 0.6913887858390808, + "learning_rate": 2.050075532036026e-05, + "loss": 0.0959, + "num_input_tokens_seen": 16004112, + "step": 22305 + }, + { + "epoch": 46.38253638253638, + "grad_norm": 0.24429331719875336, + "learning_rate": 2.0491098488893264e-05, + "loss": 0.0959, + "num_input_tokens_seen": 16007632, + "step": 22310 + }, + { + "epoch": 46.392931392931395, + "grad_norm": 0.1969098448753357, + "learning_rate": 2.0481442352755546e-05, + "loss": 0.0911, + "num_input_tokens_seen": 16011184, + "step": 22315 + }, + { + "epoch": 46.4033264033264, + "grad_norm": 0.35576745867729187, + "learning_rate": 2.0471786913436198e-05, + "loss": 0.1087, + "num_input_tokens_seen": 16014832, + "step": 22320 + }, + { + "epoch": 46.41372141372141, + "grad_norm": 0.3288601338863373, + "learning_rate": 2.0462132172424218e-05, + "loss": 0.1264, + "num_input_tokens_seen": 16018448, + "step": 22325 + }, + { + "epoch": 46.42411642411643, + "grad_norm": 0.3043188750743866, + "learning_rate": 2.0452478131208484e-05, + "loss": 0.1001, + "num_input_tokens_seen": 16021872, + "step": 22330 + }, + { + "epoch": 46.434511434511435, + "grad_norm": 0.23932835459709167, + "learning_rate": 2.0442824791277765e-05, + "loss": 0.1109, + "num_input_tokens_seen": 16025392, + "step": 22335 + }, + { + "epoch": 46.444906444906444, + "grad_norm": 0.542212724685669, + "learning_rate": 2.0433172154120727e-05, + "loss": 0.129, + "num_input_tokens_seen": 16029104, + "step": 22340 + }, + { + "epoch": 46.45530145530145, + "grad_norm": 0.25330981612205505, + "learning_rate": 2.0423520221225947e-05, + "loss": 0.084, + "num_input_tokens_seen": 16032592, + "step": 22345 + }, + { + "epoch": 46.46569646569647, + "grad_norm": 0.3125578463077545, + "learning_rate": 2.0413868994081848e-05, + "loss": 0.1495, + "num_input_tokens_seen": 16036176, + "step": 22350 + }, + { + "epoch": 46.476091476091476, + "grad_norm": 0.4703114628791809, + "learning_rate": 2.0404218474176795e-05, + "loss": 0.1036, + "num_input_tokens_seen": 16039760, + "step": 22355 + }, + { + "epoch": 46.486486486486484, + "grad_norm": 0.23683443665504456, + "learning_rate": 2.0394568662999002e-05, + "loss": 0.1045, + "num_input_tokens_seen": 16043184, + "step": 22360 + }, + { + "epoch": 46.4968814968815, + "grad_norm": 0.3181452751159668, + "learning_rate": 2.0384919562036593e-05, + "loss": 0.1041, + "num_input_tokens_seen": 16046736, + "step": 22365 + }, + { + "epoch": 46.50727650727651, + "grad_norm": 0.45823514461517334, + "learning_rate": 2.0375271172777593e-05, + "loss": 0.0873, + "num_input_tokens_seen": 16050416, + "step": 22370 + }, + { + "epoch": 46.517671517671516, + "grad_norm": 0.8139141798019409, + "learning_rate": 2.0365623496709885e-05, + "loss": 0.0853, + "num_input_tokens_seen": 16053936, + "step": 22375 + }, + { + "epoch": 46.528066528066525, + "grad_norm": 0.2683866322040558, + "learning_rate": 2.0355976535321283e-05, + "loss": 0.1191, + "num_input_tokens_seen": 16057520, + "step": 22380 + }, + { + "epoch": 46.53846153846154, + "grad_norm": 0.5449771881103516, + "learning_rate": 2.034633029009945e-05, + "loss": 0.1449, + "num_input_tokens_seen": 16061296, + "step": 22385 + }, + { + "epoch": 46.54885654885655, + "grad_norm": 0.2515023350715637, + "learning_rate": 2.0336684762531972e-05, + "loss": 0.1032, + "num_input_tokens_seen": 16064784, + "step": 22390 + }, + { + "epoch": 46.55925155925156, + "grad_norm": 0.598454475402832, + "learning_rate": 2.032703995410631e-05, + "loss": 0.101, + "num_input_tokens_seen": 16068496, + "step": 22395 + }, + { + "epoch": 46.56964656964657, + "grad_norm": 0.2833031713962555, + "learning_rate": 2.031739586630981e-05, + "loss": 0.0802, + "num_input_tokens_seen": 16072048, + "step": 22400 + }, + { + "epoch": 46.56964656964657, + "eval_loss": 0.14224855601787567, + "eval_runtime": 7.7532, + "eval_samples_per_second": 110.406, + "eval_steps_per_second": 27.601, + "num_input_tokens_seen": 16072048, + "step": 22400 + }, + { + "epoch": 46.58004158004158, + "grad_norm": 0.20884226262569427, + "learning_rate": 2.0307752500629707e-05, + "loss": 0.0817, + "num_input_tokens_seen": 16075632, + "step": 22405 + }, + { + "epoch": 46.59043659043659, + "grad_norm": 0.8484315872192383, + "learning_rate": 2.0298109858553144e-05, + "loss": 0.1259, + "num_input_tokens_seen": 16079216, + "step": 22410 + }, + { + "epoch": 46.6008316008316, + "grad_norm": 0.3730219602584839, + "learning_rate": 2.028846794156712e-05, + "loss": 0.0698, + "num_input_tokens_seen": 16082736, + "step": 22415 + }, + { + "epoch": 46.61122661122661, + "grad_norm": 0.2080150991678238, + "learning_rate": 2.027882675115856e-05, + "loss": 0.0798, + "num_input_tokens_seen": 16086288, + "step": 22420 + }, + { + "epoch": 46.62162162162162, + "grad_norm": 0.7226110696792603, + "learning_rate": 2.026918628881423e-05, + "loss": 0.1346, + "num_input_tokens_seen": 16089840, + "step": 22425 + }, + { + "epoch": 46.63201663201663, + "grad_norm": 0.8578528761863708, + "learning_rate": 2.0259546556020833e-05, + "loss": 0.1467, + "num_input_tokens_seen": 16093296, + "step": 22430 + }, + { + "epoch": 46.642411642411645, + "grad_norm": 0.44115689396858215, + "learning_rate": 2.024990755426493e-05, + "loss": 0.1106, + "num_input_tokens_seen": 16096848, + "step": 22435 + }, + { + "epoch": 46.65280665280665, + "grad_norm": 0.5230351686477661, + "learning_rate": 2.0240269285032975e-05, + "loss": 0.1448, + "num_input_tokens_seen": 16100496, + "step": 22440 + }, + { + "epoch": 46.66320166320166, + "grad_norm": 0.6867393255233765, + "learning_rate": 2.0230631749811306e-05, + "loss": 0.0926, + "num_input_tokens_seen": 16104080, + "step": 22445 + }, + { + "epoch": 46.67359667359668, + "grad_norm": 0.4893712103366852, + "learning_rate": 2.0220994950086162e-05, + "loss": 0.11, + "num_input_tokens_seen": 16107632, + "step": 22450 + }, + { + "epoch": 46.683991683991685, + "grad_norm": 0.4337364137172699, + "learning_rate": 2.021135888734365e-05, + "loss": 0.1363, + "num_input_tokens_seen": 16111120, + "step": 22455 + }, + { + "epoch": 46.694386694386694, + "grad_norm": 0.3501876890659332, + "learning_rate": 2.0201723563069783e-05, + "loss": 0.1205, + "num_input_tokens_seen": 16114704, + "step": 22460 + }, + { + "epoch": 46.7047817047817, + "grad_norm": 0.5972571969032288, + "learning_rate": 2.0192088978750433e-05, + "loss": 0.078, + "num_input_tokens_seen": 16118256, + "step": 22465 + }, + { + "epoch": 46.71517671517672, + "grad_norm": 0.1433982253074646, + "learning_rate": 2.0182455135871385e-05, + "loss": 0.0765, + "num_input_tokens_seen": 16121680, + "step": 22470 + }, + { + "epoch": 46.725571725571726, + "grad_norm": 0.7630627751350403, + "learning_rate": 2.0172822035918305e-05, + "loss": 0.1105, + "num_input_tokens_seen": 16125168, + "step": 22475 + }, + { + "epoch": 46.735966735966734, + "grad_norm": 0.4514373540878296, + "learning_rate": 2.016318968037671e-05, + "loss": 0.1278, + "num_input_tokens_seen": 16128976, + "step": 22480 + }, + { + "epoch": 46.74636174636175, + "grad_norm": 0.27580755949020386, + "learning_rate": 2.015355807073206e-05, + "loss": 0.0973, + "num_input_tokens_seen": 16132624, + "step": 22485 + }, + { + "epoch": 46.75675675675676, + "grad_norm": 0.4302560091018677, + "learning_rate": 2.0143927208469664e-05, + "loss": 0.1021, + "num_input_tokens_seen": 16136112, + "step": 22490 + }, + { + "epoch": 46.767151767151766, + "grad_norm": 0.39146387577056885, + "learning_rate": 2.0134297095074708e-05, + "loss": 0.0912, + "num_input_tokens_seen": 16139664, + "step": 22495 + }, + { + "epoch": 46.777546777546775, + "grad_norm": 0.19706234335899353, + "learning_rate": 2.0124667732032297e-05, + "loss": 0.1331, + "num_input_tokens_seen": 16143280, + "step": 22500 + }, + { + "epoch": 46.78794178794179, + "grad_norm": 0.8101821541786194, + "learning_rate": 2.011503912082738e-05, + "loss": 0.1089, + "num_input_tokens_seen": 16146832, + "step": 22505 + }, + { + "epoch": 46.7983367983368, + "grad_norm": 0.30666467547416687, + "learning_rate": 2.0105411262944823e-05, + "loss": 0.0904, + "num_input_tokens_seen": 16150416, + "step": 22510 + }, + { + "epoch": 46.80873180873181, + "grad_norm": 0.432606965303421, + "learning_rate": 2.0095784159869366e-05, + "loss": 0.1034, + "num_input_tokens_seen": 16154064, + "step": 22515 + }, + { + "epoch": 46.81912681912682, + "grad_norm": 0.47634628415107727, + "learning_rate": 2.0086157813085608e-05, + "loss": 0.1185, + "num_input_tokens_seen": 16157808, + "step": 22520 + }, + { + "epoch": 46.82952182952183, + "grad_norm": 0.2434937208890915, + "learning_rate": 2.0076532224078068e-05, + "loss": 0.135, + "num_input_tokens_seen": 16161488, + "step": 22525 + }, + { + "epoch": 46.83991683991684, + "grad_norm": 0.5495229363441467, + "learning_rate": 2.0066907394331142e-05, + "loss": 0.1073, + "num_input_tokens_seen": 16164944, + "step": 22530 + }, + { + "epoch": 46.85031185031185, + "grad_norm": 0.24175278842449188, + "learning_rate": 2.0057283325329077e-05, + "loss": 0.1146, + "num_input_tokens_seen": 16168304, + "step": 22535 + }, + { + "epoch": 46.86070686070686, + "grad_norm": 0.25858932733535767, + "learning_rate": 2.0047660018556047e-05, + "loss": 0.095, + "num_input_tokens_seen": 16171920, + "step": 22540 + }, + { + "epoch": 46.87110187110187, + "grad_norm": 0.2839689254760742, + "learning_rate": 2.0038037475496075e-05, + "loss": 0.1346, + "num_input_tokens_seen": 16175568, + "step": 22545 + }, + { + "epoch": 46.88149688149688, + "grad_norm": 0.38980498909950256, + "learning_rate": 2.0028415697633073e-05, + "loss": 0.081, + "num_input_tokens_seen": 16179088, + "step": 22550 + }, + { + "epoch": 46.891891891891895, + "grad_norm": 0.26613911986351013, + "learning_rate": 2.0018794686450858e-05, + "loss": 0.0933, + "num_input_tokens_seen": 16182704, + "step": 22555 + }, + { + "epoch": 46.9022869022869, + "grad_norm": 0.486787885427475, + "learning_rate": 2.0009174443433088e-05, + "loss": 0.1072, + "num_input_tokens_seen": 16186288, + "step": 22560 + }, + { + "epoch": 46.91268191268191, + "grad_norm": 0.5159199833869934, + "learning_rate": 1.999955497006334e-05, + "loss": 0.1196, + "num_input_tokens_seen": 16189808, + "step": 22565 + }, + { + "epoch": 46.92307692307692, + "grad_norm": 0.18767093122005463, + "learning_rate": 1.9989936267825067e-05, + "loss": 0.09, + "num_input_tokens_seen": 16193200, + "step": 22570 + }, + { + "epoch": 46.933471933471935, + "grad_norm": 0.33961039781570435, + "learning_rate": 1.9980318338201572e-05, + "loss": 0.0963, + "num_input_tokens_seen": 16196976, + "step": 22575 + }, + { + "epoch": 46.943866943866944, + "grad_norm": 0.25200992822647095, + "learning_rate": 1.997070118267607e-05, + "loss": 0.104, + "num_input_tokens_seen": 16200464, + "step": 22580 + }, + { + "epoch": 46.95426195426195, + "grad_norm": 0.33510372042655945, + "learning_rate": 1.9961084802731654e-05, + "loss": 0.09, + "num_input_tokens_seen": 16203888, + "step": 22585 + }, + { + "epoch": 46.96465696465697, + "grad_norm": 0.1791493147611618, + "learning_rate": 1.9951469199851273e-05, + "loss": 0.0944, + "num_input_tokens_seen": 16207440, + "step": 22590 + }, + { + "epoch": 46.975051975051976, + "grad_norm": 0.3725089430809021, + "learning_rate": 1.99418543755178e-05, + "loss": 0.1066, + "num_input_tokens_seen": 16211248, + "step": 22595 + }, + { + "epoch": 46.985446985446984, + "grad_norm": 0.23997199535369873, + "learning_rate": 1.9932240331213936e-05, + "loss": 0.0948, + "num_input_tokens_seen": 16214832, + "step": 22600 + }, + { + "epoch": 46.985446985446984, + "eval_loss": 0.14365126192569733, + "eval_runtime": 7.7526, + "eval_samples_per_second": 110.414, + "eval_steps_per_second": 27.604, + "num_input_tokens_seen": 16214832, + "step": 22600 + }, + { + "epoch": 46.99584199584199, + "grad_norm": 0.36791032552719116, + "learning_rate": 1.9922627068422297e-05, + "loss": 0.1071, + "num_input_tokens_seen": 16218480, + "step": 22605 + }, + { + "epoch": 47.00623700623701, + "grad_norm": 0.7064952850341797, + "learning_rate": 1.991301458862538e-05, + "loss": 0.0959, + "num_input_tokens_seen": 16221952, + "step": 22610 + }, + { + "epoch": 47.016632016632016, + "grad_norm": 0.22344309091567993, + "learning_rate": 1.9903402893305536e-05, + "loss": 0.1431, + "num_input_tokens_seen": 16225536, + "step": 22615 + }, + { + "epoch": 47.027027027027025, + "grad_norm": 0.2197193056344986, + "learning_rate": 1.9893791983945016e-05, + "loss": 0.1071, + "num_input_tokens_seen": 16229344, + "step": 22620 + }, + { + "epoch": 47.03742203742204, + "grad_norm": 0.22311872243881226, + "learning_rate": 1.988418186202594e-05, + "loss": 0.0773, + "num_input_tokens_seen": 16232960, + "step": 22625 + }, + { + "epoch": 47.04781704781705, + "grad_norm": 0.4409863352775574, + "learning_rate": 1.98745725290303e-05, + "loss": 0.1343, + "num_input_tokens_seen": 16236480, + "step": 22630 + }, + { + "epoch": 47.05821205821206, + "grad_norm": 0.5807729363441467, + "learning_rate": 1.986496398644e-05, + "loss": 0.1024, + "num_input_tokens_seen": 16240032, + "step": 22635 + }, + { + "epoch": 47.06860706860707, + "grad_norm": 0.5429359078407288, + "learning_rate": 1.9855356235736777e-05, + "loss": 0.1222, + "num_input_tokens_seen": 16243680, + "step": 22640 + }, + { + "epoch": 47.07900207900208, + "grad_norm": 0.3928830921649933, + "learning_rate": 1.9845749278402277e-05, + "loss": 0.0891, + "num_input_tokens_seen": 16247360, + "step": 22645 + }, + { + "epoch": 47.08939708939709, + "grad_norm": 0.4890878200531006, + "learning_rate": 1.9836143115918006e-05, + "loss": 0.1363, + "num_input_tokens_seen": 16251008, + "step": 22650 + }, + { + "epoch": 47.0997920997921, + "grad_norm": 0.2505497932434082, + "learning_rate": 1.9826537749765367e-05, + "loss": 0.081, + "num_input_tokens_seen": 16254592, + "step": 22655 + }, + { + "epoch": 47.11018711018711, + "grad_norm": 0.16003300249576569, + "learning_rate": 1.9816933181425625e-05, + "loss": 0.122, + "num_input_tokens_seen": 16258208, + "step": 22660 + }, + { + "epoch": 47.12058212058212, + "grad_norm": 0.2336069643497467, + "learning_rate": 1.9807329412379903e-05, + "loss": 0.1475, + "num_input_tokens_seen": 16261728, + "step": 22665 + }, + { + "epoch": 47.13097713097713, + "grad_norm": 0.31901460886001587, + "learning_rate": 1.9797726444109247e-05, + "loss": 0.0926, + "num_input_tokens_seen": 16265152, + "step": 22670 + }, + { + "epoch": 47.141372141372145, + "grad_norm": 0.1306157261133194, + "learning_rate": 1.9788124278094557e-05, + "loss": 0.0793, + "num_input_tokens_seen": 16268608, + "step": 22675 + }, + { + "epoch": 47.15176715176715, + "grad_norm": 0.413842111825943, + "learning_rate": 1.9778522915816594e-05, + "loss": 0.116, + "num_input_tokens_seen": 16272320, + "step": 22680 + }, + { + "epoch": 47.16216216216216, + "grad_norm": 0.3963124752044678, + "learning_rate": 1.9768922358756014e-05, + "loss": 0.1016, + "num_input_tokens_seen": 16275904, + "step": 22685 + }, + { + "epoch": 47.17255717255717, + "grad_norm": 0.35928037762641907, + "learning_rate": 1.9759322608393353e-05, + "loss": 0.1478, + "num_input_tokens_seen": 16279360, + "step": 22690 + }, + { + "epoch": 47.182952182952185, + "grad_norm": 0.26167914271354675, + "learning_rate": 1.9749723666208992e-05, + "loss": 0.1261, + "num_input_tokens_seen": 16283168, + "step": 22695 + }, + { + "epoch": 47.19334719334719, + "grad_norm": 0.2820509374141693, + "learning_rate": 1.9740125533683235e-05, + "loss": 0.0843, + "num_input_tokens_seen": 16286752, + "step": 22700 + }, + { + "epoch": 47.2037422037422, + "grad_norm": 0.37354129552841187, + "learning_rate": 1.9730528212296208e-05, + "loss": 0.0966, + "num_input_tokens_seen": 16290432, + "step": 22705 + }, + { + "epoch": 47.21413721413722, + "grad_norm": 0.26285794377326965, + "learning_rate": 1.9720931703527945e-05, + "loss": 0.0855, + "num_input_tokens_seen": 16293920, + "step": 22710 + }, + { + "epoch": 47.224532224532226, + "grad_norm": 0.4201022982597351, + "learning_rate": 1.9711336008858373e-05, + "loss": 0.0862, + "num_input_tokens_seen": 16297472, + "step": 22715 + }, + { + "epoch": 47.234927234927234, + "grad_norm": 0.6402037143707275, + "learning_rate": 1.9701741129767233e-05, + "loss": 0.0997, + "num_input_tokens_seen": 16301184, + "step": 22720 + }, + { + "epoch": 47.24532224532224, + "grad_norm": 0.3694855868816376, + "learning_rate": 1.9692147067734202e-05, + "loss": 0.0745, + "num_input_tokens_seen": 16304768, + "step": 22725 + }, + { + "epoch": 47.25571725571726, + "grad_norm": 0.24184457957744598, + "learning_rate": 1.96825538242388e-05, + "loss": 0.0901, + "num_input_tokens_seen": 16308416, + "step": 22730 + }, + { + "epoch": 47.266112266112266, + "grad_norm": 0.38498857617378235, + "learning_rate": 1.967296140076041e-05, + "loss": 0.1112, + "num_input_tokens_seen": 16312064, + "step": 22735 + }, + { + "epoch": 47.276507276507274, + "grad_norm": 0.4736252427101135, + "learning_rate": 1.966336979877833e-05, + "loss": 0.0742, + "num_input_tokens_seen": 16315680, + "step": 22740 + }, + { + "epoch": 47.28690228690229, + "grad_norm": 0.35557451844215393, + "learning_rate": 1.9653779019771678e-05, + "loss": 0.1204, + "num_input_tokens_seen": 16319392, + "step": 22745 + }, + { + "epoch": 47.2972972972973, + "grad_norm": 0.40065690875053406, + "learning_rate": 1.9644189065219488e-05, + "loss": 0.1006, + "num_input_tokens_seen": 16322944, + "step": 22750 + }, + { + "epoch": 47.30769230769231, + "grad_norm": 0.23516607284545898, + "learning_rate": 1.9634599936600655e-05, + "loss": 0.1335, + "num_input_tokens_seen": 16326496, + "step": 22755 + }, + { + "epoch": 47.318087318087315, + "grad_norm": 0.46030452847480774, + "learning_rate": 1.9625011635393935e-05, + "loss": 0.1125, + "num_input_tokens_seen": 16329984, + "step": 22760 + }, + { + "epoch": 47.32848232848233, + "grad_norm": 0.3827848732471466, + "learning_rate": 1.9615424163077963e-05, + "loss": 0.1051, + "num_input_tokens_seen": 16333504, + "step": 22765 + }, + { + "epoch": 47.33887733887734, + "grad_norm": 0.23792754113674164, + "learning_rate": 1.9605837521131263e-05, + "loss": 0.1016, + "num_input_tokens_seen": 16336960, + "step": 22770 + }, + { + "epoch": 47.34927234927235, + "grad_norm": 0.9271147847175598, + "learning_rate": 1.9596251711032192e-05, + "loss": 0.0938, + "num_input_tokens_seen": 16340640, + "step": 22775 + }, + { + "epoch": 47.35966735966736, + "grad_norm": 0.9476851224899292, + "learning_rate": 1.958666673425903e-05, + "loss": 0.0893, + "num_input_tokens_seen": 16344064, + "step": 22780 + }, + { + "epoch": 47.37006237006237, + "grad_norm": 0.23148328065872192, + "learning_rate": 1.957708259228987e-05, + "loss": 0.1065, + "num_input_tokens_seen": 16347648, + "step": 22785 + }, + { + "epoch": 47.38045738045738, + "grad_norm": 0.21758756041526794, + "learning_rate": 1.956749928660273e-05, + "loss": 0.0823, + "num_input_tokens_seen": 16351136, + "step": 22790 + }, + { + "epoch": 47.39085239085239, + "grad_norm": 0.26135116815567017, + "learning_rate": 1.955791681867547e-05, + "loss": 0.0845, + "num_input_tokens_seen": 16354592, + "step": 22795 + }, + { + "epoch": 47.4012474012474, + "grad_norm": 0.09963449090719223, + "learning_rate": 1.9548335189985824e-05, + "loss": 0.0711, + "num_input_tokens_seen": 16358208, + "step": 22800 + }, + { + "epoch": 47.4012474012474, + "eval_loss": 0.1447683870792389, + "eval_runtime": 7.7477, + "eval_samples_per_second": 110.484, + "eval_steps_per_second": 27.621, + "num_input_tokens_seen": 16358208, + "step": 22800 + }, + { + "epoch": 47.41164241164241, + "grad_norm": 0.176260843873024, + "learning_rate": 1.9538754402011396e-05, + "loss": 0.0791, + "num_input_tokens_seen": 16361824, + "step": 22805 + }, + { + "epoch": 47.42203742203742, + "grad_norm": 0.46361806988716125, + "learning_rate": 1.952917445622968e-05, + "loss": 0.1007, + "num_input_tokens_seen": 16365248, + "step": 22810 + }, + { + "epoch": 47.432432432432435, + "grad_norm": 0.3584257960319519, + "learning_rate": 1.9519595354118005e-05, + "loss": 0.0962, + "num_input_tokens_seen": 16368896, + "step": 22815 + }, + { + "epoch": 47.44282744282744, + "grad_norm": 0.4889318645000458, + "learning_rate": 1.951001709715361e-05, + "loss": 0.1023, + "num_input_tokens_seen": 16372448, + "step": 22820 + }, + { + "epoch": 47.45322245322245, + "grad_norm": 0.6878665685653687, + "learning_rate": 1.9500439686813556e-05, + "loss": 0.1078, + "num_input_tokens_seen": 16375872, + "step": 22825 + }, + { + "epoch": 47.46361746361746, + "grad_norm": 0.3098893165588379, + "learning_rate": 1.949086312457482e-05, + "loss": 0.1185, + "num_input_tokens_seen": 16379488, + "step": 22830 + }, + { + "epoch": 47.474012474012476, + "grad_norm": 0.3884323835372925, + "learning_rate": 1.9481287411914223e-05, + "loss": 0.1039, + "num_input_tokens_seen": 16383136, + "step": 22835 + }, + { + "epoch": 47.484407484407484, + "grad_norm": 0.296234130859375, + "learning_rate": 1.9471712550308457e-05, + "loss": 0.1114, + "num_input_tokens_seen": 16386816, + "step": 22840 + }, + { + "epoch": 47.49480249480249, + "grad_norm": 0.40882408618927, + "learning_rate": 1.946213854123409e-05, + "loss": 0.1111, + "num_input_tokens_seen": 16390560, + "step": 22845 + }, + { + "epoch": 47.50519750519751, + "grad_norm": 0.16997195780277252, + "learning_rate": 1.9452565386167554e-05, + "loss": 0.0776, + "num_input_tokens_seen": 16394080, + "step": 22850 + }, + { + "epoch": 47.515592515592516, + "grad_norm": 0.4250870645046234, + "learning_rate": 1.9442993086585142e-05, + "loss": 0.1127, + "num_input_tokens_seen": 16397664, + "step": 22855 + }, + { + "epoch": 47.525987525987524, + "grad_norm": 0.32754406332969666, + "learning_rate": 1.9433421643963043e-05, + "loss": 0.1225, + "num_input_tokens_seen": 16401056, + "step": 22860 + }, + { + "epoch": 47.53638253638254, + "grad_norm": 0.23513163626194, + "learning_rate": 1.942385105977727e-05, + "loss": 0.0952, + "num_input_tokens_seen": 16404544, + "step": 22865 + }, + { + "epoch": 47.54677754677755, + "grad_norm": 0.25947311520576477, + "learning_rate": 1.9414281335503743e-05, + "loss": 0.1066, + "num_input_tokens_seen": 16408256, + "step": 22870 + }, + { + "epoch": 47.55717255717256, + "grad_norm": 0.36151009798049927, + "learning_rate": 1.9404712472618232e-05, + "loss": 0.1225, + "num_input_tokens_seen": 16411872, + "step": 22875 + }, + { + "epoch": 47.567567567567565, + "grad_norm": 0.19758351147174835, + "learning_rate": 1.939514447259636e-05, + "loss": 0.0897, + "num_input_tokens_seen": 16415456, + "step": 22880 + }, + { + "epoch": 47.57796257796258, + "grad_norm": 0.2662288546562195, + "learning_rate": 1.938557733691365e-05, + "loss": 0.0729, + "num_input_tokens_seen": 16419008, + "step": 22885 + }, + { + "epoch": 47.58835758835759, + "grad_norm": 0.3822391629219055, + "learning_rate": 1.9376011067045476e-05, + "loss": 0.1091, + "num_input_tokens_seen": 16422752, + "step": 22890 + }, + { + "epoch": 47.5987525987526, + "grad_norm": 0.3035261631011963, + "learning_rate": 1.9366445664467065e-05, + "loss": 0.098, + "num_input_tokens_seen": 16426368, + "step": 22895 + }, + { + "epoch": 47.60914760914761, + "grad_norm": 0.2072167694568634, + "learning_rate": 1.9356881130653533e-05, + "loss": 0.1105, + "num_input_tokens_seen": 16429760, + "step": 22900 + }, + { + "epoch": 47.61954261954262, + "grad_norm": 0.399370014667511, + "learning_rate": 1.9347317467079846e-05, + "loss": 0.1212, + "num_input_tokens_seen": 16433376, + "step": 22905 + }, + { + "epoch": 47.62993762993763, + "grad_norm": 0.2964155375957489, + "learning_rate": 1.9337754675220836e-05, + "loss": 0.1117, + "num_input_tokens_seen": 16436928, + "step": 22910 + }, + { + "epoch": 47.64033264033264, + "grad_norm": 0.18278780579566956, + "learning_rate": 1.9328192756551218e-05, + "loss": 0.087, + "num_input_tokens_seen": 16440640, + "step": 22915 + }, + { + "epoch": 47.65072765072765, + "grad_norm": 0.3018403947353363, + "learning_rate": 1.931863171254555e-05, + "loss": 0.0751, + "num_input_tokens_seen": 16444224, + "step": 22920 + }, + { + "epoch": 47.66112266112266, + "grad_norm": 0.24818480014801025, + "learning_rate": 1.930907154467826e-05, + "loss": 0.0895, + "num_input_tokens_seen": 16447712, + "step": 22925 + }, + { + "epoch": 47.67151767151767, + "grad_norm": 0.2450265884399414, + "learning_rate": 1.9299512254423673e-05, + "loss": 0.0948, + "num_input_tokens_seen": 16451232, + "step": 22930 + }, + { + "epoch": 47.681912681912685, + "grad_norm": 0.21107344329357147, + "learning_rate": 1.9289953843255914e-05, + "loss": 0.0935, + "num_input_tokens_seen": 16454784, + "step": 22935 + }, + { + "epoch": 47.69230769230769, + "grad_norm": 0.48154935240745544, + "learning_rate": 1.9280396312649048e-05, + "loss": 0.1004, + "num_input_tokens_seen": 16458368, + "step": 22940 + }, + { + "epoch": 47.7027027027027, + "grad_norm": 0.8471273183822632, + "learning_rate": 1.9270839664076936e-05, + "loss": 0.1285, + "num_input_tokens_seen": 16461952, + "step": 22945 + }, + { + "epoch": 47.71309771309771, + "grad_norm": 0.3591873049736023, + "learning_rate": 1.9261283899013345e-05, + "loss": 0.1123, + "num_input_tokens_seen": 16465568, + "step": 22950 + }, + { + "epoch": 47.723492723492726, + "grad_norm": 0.25720787048339844, + "learning_rate": 1.92517290189319e-05, + "loss": 0.0788, + "num_input_tokens_seen": 16469184, + "step": 22955 + }, + { + "epoch": 47.733887733887734, + "grad_norm": 0.3820970952510834, + "learning_rate": 1.924217502530607e-05, + "loss": 0.1286, + "num_input_tokens_seen": 16472736, + "step": 22960 + }, + { + "epoch": 47.74428274428274, + "grad_norm": 0.4603177011013031, + "learning_rate": 1.9232621919609207e-05, + "loss": 0.093, + "num_input_tokens_seen": 16476480, + "step": 22965 + }, + { + "epoch": 47.75467775467776, + "grad_norm": 0.4235835075378418, + "learning_rate": 1.9223069703314534e-05, + "loss": 0.1153, + "num_input_tokens_seen": 16480128, + "step": 22970 + }, + { + "epoch": 47.765072765072766, + "grad_norm": 0.5454571843147278, + "learning_rate": 1.92135183778951e-05, + "loss": 0.093, + "num_input_tokens_seen": 16483744, + "step": 22975 + }, + { + "epoch": 47.775467775467774, + "grad_norm": 0.937752366065979, + "learning_rate": 1.9203967944823857e-05, + "loss": 0.1392, + "num_input_tokens_seen": 16487232, + "step": 22980 + }, + { + "epoch": 47.78586278586278, + "grad_norm": 0.2682749032974243, + "learning_rate": 1.9194418405573588e-05, + "loss": 0.1003, + "num_input_tokens_seen": 16490816, + "step": 22985 + }, + { + "epoch": 47.7962577962578, + "grad_norm": 0.48371222615242004, + "learning_rate": 1.9184869761616954e-05, + "loss": 0.1105, + "num_input_tokens_seen": 16494432, + "step": 22990 + }, + { + "epoch": 47.80665280665281, + "grad_norm": 0.586957573890686, + "learning_rate": 1.9175322014426495e-05, + "loss": 0.0707, + "num_input_tokens_seen": 16497984, + "step": 22995 + }, + { + "epoch": 47.817047817047815, + "grad_norm": 0.17046642303466797, + "learning_rate": 1.9165775165474565e-05, + "loss": 0.1001, + "num_input_tokens_seen": 16501568, + "step": 23000 + }, + { + "epoch": 47.817047817047815, + "eval_loss": 0.14482107758522034, + "eval_runtime": 7.7539, + "eval_samples_per_second": 110.396, + "eval_steps_per_second": 27.599, + "num_input_tokens_seen": 16501568, + "step": 23000 + }, + { + "epoch": 47.82744282744283, + "grad_norm": 0.8452812433242798, + "learning_rate": 1.9156229216233434e-05, + "loss": 0.073, + "num_input_tokens_seen": 16505152, + "step": 23005 + }, + { + "epoch": 47.83783783783784, + "grad_norm": 0.6118340492248535, + "learning_rate": 1.9146684168175184e-05, + "loss": 0.0955, + "num_input_tokens_seen": 16508704, + "step": 23010 + }, + { + "epoch": 47.84823284823285, + "grad_norm": 0.29551979899406433, + "learning_rate": 1.9137140022771796e-05, + "loss": 0.0905, + "num_input_tokens_seen": 16512192, + "step": 23015 + }, + { + "epoch": 47.858627858627855, + "grad_norm": 0.20206737518310547, + "learning_rate": 1.9127596781495103e-05, + "loss": 0.1106, + "num_input_tokens_seen": 16515872, + "step": 23020 + }, + { + "epoch": 47.86902286902287, + "grad_norm": 0.3415992259979248, + "learning_rate": 1.9118054445816767e-05, + "loss": 0.0712, + "num_input_tokens_seen": 16519456, + "step": 23025 + }, + { + "epoch": 47.87941787941788, + "grad_norm": 0.345868855714798, + "learning_rate": 1.9108513017208356e-05, + "loss": 0.1062, + "num_input_tokens_seen": 16523040, + "step": 23030 + }, + { + "epoch": 47.88981288981289, + "grad_norm": 0.4550297260284424, + "learning_rate": 1.9098972497141287e-05, + "loss": 0.0967, + "num_input_tokens_seen": 16526464, + "step": 23035 + }, + { + "epoch": 47.9002079002079, + "grad_norm": 0.31720831990242004, + "learning_rate": 1.9089432887086806e-05, + "loss": 0.1316, + "num_input_tokens_seen": 16530144, + "step": 23040 + }, + { + "epoch": 47.91060291060291, + "grad_norm": 0.3876282572746277, + "learning_rate": 1.9079894188516056e-05, + "loss": 0.1042, + "num_input_tokens_seen": 16533632, + "step": 23045 + }, + { + "epoch": 47.92099792099792, + "grad_norm": 0.20602092146873474, + "learning_rate": 1.907035640290002e-05, + "loss": 0.1667, + "num_input_tokens_seen": 16537216, + "step": 23050 + }, + { + "epoch": 47.931392931392935, + "grad_norm": 0.40927234292030334, + "learning_rate": 1.9060819531709534e-05, + "loss": 0.1378, + "num_input_tokens_seen": 16540992, + "step": 23055 + }, + { + "epoch": 47.94178794178794, + "grad_norm": 0.26868200302124023, + "learning_rate": 1.9051283576415325e-05, + "loss": 0.125, + "num_input_tokens_seen": 16544736, + "step": 23060 + }, + { + "epoch": 47.95218295218295, + "grad_norm": 0.21744146943092346, + "learning_rate": 1.904174853848793e-05, + "loss": 0.0698, + "num_input_tokens_seen": 16548288, + "step": 23065 + }, + { + "epoch": 47.96257796257796, + "grad_norm": 0.5637621283531189, + "learning_rate": 1.903221441939779e-05, + "loss": 0.0932, + "num_input_tokens_seen": 16551808, + "step": 23070 + }, + { + "epoch": 47.972972972972975, + "grad_norm": 0.5380772352218628, + "learning_rate": 1.9022681220615194e-05, + "loss": 0.113, + "num_input_tokens_seen": 16555296, + "step": 23075 + }, + { + "epoch": 47.983367983367984, + "grad_norm": 0.9681912660598755, + "learning_rate": 1.9013148943610255e-05, + "loss": 0.1127, + "num_input_tokens_seen": 16559104, + "step": 23080 + }, + { + "epoch": 47.99376299376299, + "grad_norm": 0.8614906072616577, + "learning_rate": 1.9003617589852998e-05, + "loss": 0.1393, + "num_input_tokens_seen": 16562816, + "step": 23085 + }, + { + "epoch": 48.00415800415801, + "grad_norm": 0.18139779567718506, + "learning_rate": 1.899408716081326e-05, + "loss": 0.0785, + "num_input_tokens_seen": 16566472, + "step": 23090 + }, + { + "epoch": 48.014553014553016, + "grad_norm": 0.5481164455413818, + "learning_rate": 1.898455765796075e-05, + "loss": 0.1132, + "num_input_tokens_seen": 16570152, + "step": 23095 + }, + { + "epoch": 48.024948024948024, + "grad_norm": 0.23302847146987915, + "learning_rate": 1.8975029082765053e-05, + "loss": 0.1175, + "num_input_tokens_seen": 16573832, + "step": 23100 + }, + { + "epoch": 48.03534303534303, + "grad_norm": 0.2135060578584671, + "learning_rate": 1.8965501436695577e-05, + "loss": 0.1313, + "num_input_tokens_seen": 16577320, + "step": 23105 + }, + { + "epoch": 48.04573804573805, + "grad_norm": 0.23775851726531982, + "learning_rate": 1.895597472122161e-05, + "loss": 0.0947, + "num_input_tokens_seen": 16580968, + "step": 23110 + }, + { + "epoch": 48.056133056133056, + "grad_norm": 0.2113768756389618, + "learning_rate": 1.894644893781231e-05, + "loss": 0.0604, + "num_input_tokens_seen": 16584584, + "step": 23115 + }, + { + "epoch": 48.066528066528065, + "grad_norm": 0.2936844825744629, + "learning_rate": 1.893692408793665e-05, + "loss": 0.1193, + "num_input_tokens_seen": 16588168, + "step": 23120 + }, + { + "epoch": 48.07692307692308, + "grad_norm": 0.27794769406318665, + "learning_rate": 1.8927400173063493e-05, + "loss": 0.1125, + "num_input_tokens_seen": 16591720, + "step": 23125 + }, + { + "epoch": 48.08731808731809, + "grad_norm": 0.5332415699958801, + "learning_rate": 1.891787719466154e-05, + "loss": 0.1138, + "num_input_tokens_seen": 16595464, + "step": 23130 + }, + { + "epoch": 48.0977130977131, + "grad_norm": 0.3917006850242615, + "learning_rate": 1.8908355154199346e-05, + "loss": 0.103, + "num_input_tokens_seen": 16599144, + "step": 23135 + }, + { + "epoch": 48.108108108108105, + "grad_norm": 0.1765415519475937, + "learning_rate": 1.8898834053145357e-05, + "loss": 0.0871, + "num_input_tokens_seen": 16602696, + "step": 23140 + }, + { + "epoch": 48.11850311850312, + "grad_norm": 0.2869004011154175, + "learning_rate": 1.8889313892967813e-05, + "loss": 0.0935, + "num_input_tokens_seen": 16606408, + "step": 23145 + }, + { + "epoch": 48.12889812889813, + "grad_norm": 0.3609766662120819, + "learning_rate": 1.8879794675134863e-05, + "loss": 0.129, + "num_input_tokens_seen": 16609928, + "step": 23150 + }, + { + "epoch": 48.13929313929314, + "grad_norm": 0.4358021914958954, + "learning_rate": 1.8870276401114494e-05, + "loss": 0.0844, + "num_input_tokens_seen": 16613480, + "step": 23155 + }, + { + "epoch": 48.14968814968815, + "grad_norm": 1.1857504844665527, + "learning_rate": 1.886075907237453e-05, + "loss": 0.0961, + "num_input_tokens_seen": 16616904, + "step": 23160 + }, + { + "epoch": 48.16008316008316, + "grad_norm": 0.22260840237140656, + "learning_rate": 1.8851242690382672e-05, + "loss": 0.0751, + "num_input_tokens_seen": 16620520, + "step": 23165 + }, + { + "epoch": 48.17047817047817, + "grad_norm": 0.3602939546108246, + "learning_rate": 1.884172725660645e-05, + "loss": 0.0911, + "num_input_tokens_seen": 16624136, + "step": 23170 + }, + { + "epoch": 48.18087318087318, + "grad_norm": 0.09682425111532211, + "learning_rate": 1.8832212772513277e-05, + "loss": 0.1107, + "num_input_tokens_seen": 16627560, + "step": 23175 + }, + { + "epoch": 48.19126819126819, + "grad_norm": 0.42621174454689026, + "learning_rate": 1.8822699239570414e-05, + "loss": 0.1469, + "num_input_tokens_seen": 16631336, + "step": 23180 + }, + { + "epoch": 48.2016632016632, + "grad_norm": 0.8205127120018005, + "learning_rate": 1.8813186659244943e-05, + "loss": 0.0934, + "num_input_tokens_seen": 16634920, + "step": 23185 + }, + { + "epoch": 48.21205821205821, + "grad_norm": 0.1317511796951294, + "learning_rate": 1.880367503300385e-05, + "loss": 0.0978, + "num_input_tokens_seen": 16638536, + "step": 23190 + }, + { + "epoch": 48.222453222453225, + "grad_norm": 0.15936876833438873, + "learning_rate": 1.8794164362313927e-05, + "loss": 0.0819, + "num_input_tokens_seen": 16642024, + "step": 23195 + }, + { + "epoch": 48.232848232848234, + "grad_norm": 0.13250286877155304, + "learning_rate": 1.878465464864185e-05, + "loss": 0.0753, + "num_input_tokens_seen": 16645480, + "step": 23200 + }, + { + "epoch": 48.232848232848234, + "eval_loss": 0.1460937112569809, + "eval_runtime": 7.7555, + "eval_samples_per_second": 110.373, + "eval_steps_per_second": 27.593, + "num_input_tokens_seen": 16645480, + "step": 23200 + }, + { + "epoch": 48.24324324324324, + "grad_norm": 0.3660391569137573, + "learning_rate": 1.877514589345414e-05, + "loss": 0.1008, + "num_input_tokens_seen": 16649096, + "step": 23205 + }, + { + "epoch": 48.25363825363825, + "grad_norm": 0.30430731177330017, + "learning_rate": 1.876563809821715e-05, + "loss": 0.1124, + "num_input_tokens_seen": 16652584, + "step": 23210 + }, + { + "epoch": 48.264033264033266, + "grad_norm": 0.1963377147912979, + "learning_rate": 1.8756131264397106e-05, + "loss": 0.0808, + "num_input_tokens_seen": 16656328, + "step": 23215 + }, + { + "epoch": 48.274428274428274, + "grad_norm": 0.32325199246406555, + "learning_rate": 1.87466253934601e-05, + "loss": 0.1138, + "num_input_tokens_seen": 16659880, + "step": 23220 + }, + { + "epoch": 48.28482328482328, + "grad_norm": 0.3335878849029541, + "learning_rate": 1.8737120486872033e-05, + "loss": 0.1023, + "num_input_tokens_seen": 16663592, + "step": 23225 + }, + { + "epoch": 48.2952182952183, + "grad_norm": 0.42121896147727966, + "learning_rate": 1.8727616546098696e-05, + "loss": 0.0881, + "num_input_tokens_seen": 16667176, + "step": 23230 + }, + { + "epoch": 48.305613305613306, + "grad_norm": 0.47637036442756653, + "learning_rate": 1.8718113572605716e-05, + "loss": 0.0961, + "num_input_tokens_seen": 16670728, + "step": 23235 + }, + { + "epoch": 48.316008316008315, + "grad_norm": 0.4075010120868683, + "learning_rate": 1.8708611567858554e-05, + "loss": 0.1148, + "num_input_tokens_seen": 16674312, + "step": 23240 + }, + { + "epoch": 48.32640332640332, + "grad_norm": 0.20326220989227295, + "learning_rate": 1.8699110533322565e-05, + "loss": 0.0746, + "num_input_tokens_seen": 16677864, + "step": 23245 + }, + { + "epoch": 48.33679833679834, + "grad_norm": 0.3082646131515503, + "learning_rate": 1.8689610470462897e-05, + "loss": 0.1051, + "num_input_tokens_seen": 16681576, + "step": 23250 + }, + { + "epoch": 48.34719334719335, + "grad_norm": 0.3745791018009186, + "learning_rate": 1.8680111380744604e-05, + "loss": 0.0783, + "num_input_tokens_seen": 16685320, + "step": 23255 + }, + { + "epoch": 48.357588357588355, + "grad_norm": 0.4512580633163452, + "learning_rate": 1.8670613265632564e-05, + "loss": 0.1089, + "num_input_tokens_seen": 16688968, + "step": 23260 + }, + { + "epoch": 48.36798336798337, + "grad_norm": 0.3190629482269287, + "learning_rate": 1.866111612659149e-05, + "loss": 0.1113, + "num_input_tokens_seen": 16692360, + "step": 23265 + }, + { + "epoch": 48.37837837837838, + "grad_norm": 0.836298942565918, + "learning_rate": 1.8651619965085967e-05, + "loss": 0.1342, + "num_input_tokens_seen": 16695848, + "step": 23270 + }, + { + "epoch": 48.38877338877339, + "grad_norm": 0.514756977558136, + "learning_rate": 1.8642124782580433e-05, + "loss": 0.1179, + "num_input_tokens_seen": 16699304, + "step": 23275 + }, + { + "epoch": 48.3991683991684, + "grad_norm": 0.21228276193141937, + "learning_rate": 1.8632630580539144e-05, + "loss": 0.1193, + "num_input_tokens_seen": 16702792, + "step": 23280 + }, + { + "epoch": 48.40956340956341, + "grad_norm": 0.2387125939130783, + "learning_rate": 1.862313736042625e-05, + "loss": 0.1166, + "num_input_tokens_seen": 16706408, + "step": 23285 + }, + { + "epoch": 48.41995841995842, + "grad_norm": 0.42140838503837585, + "learning_rate": 1.8613645123705703e-05, + "loss": 0.1125, + "num_input_tokens_seen": 16710024, + "step": 23290 + }, + { + "epoch": 48.43035343035343, + "grad_norm": 0.3542710244655609, + "learning_rate": 1.8604153871841328e-05, + "loss": 0.0894, + "num_input_tokens_seen": 16713768, + "step": 23295 + }, + { + "epoch": 48.44074844074844, + "grad_norm": 0.3579420745372772, + "learning_rate": 1.859466360629682e-05, + "loss": 0.1145, + "num_input_tokens_seen": 16717256, + "step": 23300 + }, + { + "epoch": 48.45114345114345, + "grad_norm": 0.17193956673145294, + "learning_rate": 1.8585174328535666e-05, + "loss": 0.0853, + "num_input_tokens_seen": 16720936, + "step": 23305 + }, + { + "epoch": 48.46153846153846, + "grad_norm": 0.5878743529319763, + "learning_rate": 1.857568604002124e-05, + "loss": 0.1238, + "num_input_tokens_seen": 16724584, + "step": 23310 + }, + { + "epoch": 48.471933471933475, + "grad_norm": 0.12283790111541748, + "learning_rate": 1.8566198742216774e-05, + "loss": 0.12, + "num_input_tokens_seen": 16728232, + "step": 23315 + }, + { + "epoch": 48.482328482328484, + "grad_norm": 0.2949460446834564, + "learning_rate": 1.85567124365853e-05, + "loss": 0.1175, + "num_input_tokens_seen": 16731784, + "step": 23320 + }, + { + "epoch": 48.49272349272349, + "grad_norm": 0.4209847152233124, + "learning_rate": 1.854722712458975e-05, + "loss": 0.1394, + "num_input_tokens_seen": 16735272, + "step": 23325 + }, + { + "epoch": 48.5031185031185, + "grad_norm": 0.41142910718917847, + "learning_rate": 1.853774280769286e-05, + "loss": 0.1106, + "num_input_tokens_seen": 16738888, + "step": 23330 + }, + { + "epoch": 48.513513513513516, + "grad_norm": 0.6124179363250732, + "learning_rate": 1.852825948735724e-05, + "loss": 0.1029, + "num_input_tokens_seen": 16742568, + "step": 23335 + }, + { + "epoch": 48.523908523908524, + "grad_norm": 0.20287039875984192, + "learning_rate": 1.851877716504534e-05, + "loss": 0.0837, + "num_input_tokens_seen": 16746312, + "step": 23340 + }, + { + "epoch": 48.53430353430353, + "grad_norm": 0.27053606510162354, + "learning_rate": 1.8509295842219448e-05, + "loss": 0.0848, + "num_input_tokens_seen": 16749928, + "step": 23345 + }, + { + "epoch": 48.54469854469855, + "grad_norm": 1.0417410135269165, + "learning_rate": 1.8499815520341697e-05, + "loss": 0.1323, + "num_input_tokens_seen": 16753512, + "step": 23350 + }, + { + "epoch": 48.555093555093556, + "grad_norm": 0.286956250667572, + "learning_rate": 1.8490336200874094e-05, + "loss": 0.0775, + "num_input_tokens_seen": 16757032, + "step": 23355 + }, + { + "epoch": 48.565488565488565, + "grad_norm": 0.33047008514404297, + "learning_rate": 1.848085788527844e-05, + "loss": 0.1107, + "num_input_tokens_seen": 16760552, + "step": 23360 + }, + { + "epoch": 48.57588357588357, + "grad_norm": 0.3035888969898224, + "learning_rate": 1.847138057501644e-05, + "loss": 0.1105, + "num_input_tokens_seen": 16764232, + "step": 23365 + }, + { + "epoch": 48.58627858627859, + "grad_norm": 0.36203354597091675, + "learning_rate": 1.8461904271549582e-05, + "loss": 0.0836, + "num_input_tokens_seen": 16767848, + "step": 23370 + }, + { + "epoch": 48.5966735966736, + "grad_norm": 0.13800521194934845, + "learning_rate": 1.845242897633926e-05, + "loss": 0.0683, + "num_input_tokens_seen": 16771400, + "step": 23375 + }, + { + "epoch": 48.607068607068605, + "grad_norm": 0.3409271836280823, + "learning_rate": 1.844295469084667e-05, + "loss": 0.1211, + "num_input_tokens_seen": 16775144, + "step": 23380 + }, + { + "epoch": 48.61746361746362, + "grad_norm": 0.25940969586372375, + "learning_rate": 1.843348141653286e-05, + "loss": 0.1012, + "num_input_tokens_seen": 16778696, + "step": 23385 + }, + { + "epoch": 48.62785862785863, + "grad_norm": 0.7696616649627686, + "learning_rate": 1.842400915485874e-05, + "loss": 0.1056, + "num_input_tokens_seen": 16782248, + "step": 23390 + }, + { + "epoch": 48.63825363825364, + "grad_norm": 0.5881581902503967, + "learning_rate": 1.8414537907285053e-05, + "loss": 0.1446, + "num_input_tokens_seen": 16785736, + "step": 23395 + }, + { + "epoch": 48.648648648648646, + "grad_norm": 0.1725214719772339, + "learning_rate": 1.840506767527237e-05, + "loss": 0.1133, + "num_input_tokens_seen": 16789224, + "step": 23400 + }, + { + "epoch": 48.648648648648646, + "eval_loss": 0.1431451290845871, + "eval_runtime": 7.7536, + "eval_samples_per_second": 110.401, + "eval_steps_per_second": 27.6, + "num_input_tokens_seen": 16789224, + "step": 23400 + }, + { + "epoch": 48.65904365904366, + "grad_norm": 0.4190114736557007, + "learning_rate": 1.8395598460281137e-05, + "loss": 0.1134, + "num_input_tokens_seen": 16793096, + "step": 23405 + }, + { + "epoch": 48.66943866943867, + "grad_norm": 0.5192102789878845, + "learning_rate": 1.838613026377161e-05, + "loss": 0.1128, + "num_input_tokens_seen": 16796744, + "step": 23410 + }, + { + "epoch": 48.67983367983368, + "grad_norm": 0.20543386042118073, + "learning_rate": 1.8376663087203917e-05, + "loss": 0.0823, + "num_input_tokens_seen": 16800200, + "step": 23415 + }, + { + "epoch": 48.69022869022869, + "grad_norm": 0.348117470741272, + "learning_rate": 1.8367196932038014e-05, + "loss": 0.0922, + "num_input_tokens_seen": 16803848, + "step": 23420 + }, + { + "epoch": 48.7006237006237, + "grad_norm": 0.7742477655410767, + "learning_rate": 1.8357731799733686e-05, + "loss": 0.0788, + "num_input_tokens_seen": 16807400, + "step": 23425 + }, + { + "epoch": 48.71101871101871, + "grad_norm": 0.2730797529220581, + "learning_rate": 1.8348267691750586e-05, + "loss": 0.0959, + "num_input_tokens_seen": 16810984, + "step": 23430 + }, + { + "epoch": 48.72141372141372, + "grad_norm": 0.29115328192710876, + "learning_rate": 1.833880460954821e-05, + "loss": 0.0941, + "num_input_tokens_seen": 16814536, + "step": 23435 + }, + { + "epoch": 48.731808731808734, + "grad_norm": 0.45363467931747437, + "learning_rate": 1.8329342554585866e-05, + "loss": 0.0594, + "num_input_tokens_seen": 16817928, + "step": 23440 + }, + { + "epoch": 48.74220374220374, + "grad_norm": 0.30985110998153687, + "learning_rate": 1.8319881528322735e-05, + "loss": 0.0773, + "num_input_tokens_seen": 16821416, + "step": 23445 + }, + { + "epoch": 48.75259875259875, + "grad_norm": 0.5077912211418152, + "learning_rate": 1.8310421532217815e-05, + "loss": 0.1054, + "num_input_tokens_seen": 16825064, + "step": 23450 + }, + { + "epoch": 48.762993762993766, + "grad_norm": 0.6015976667404175, + "learning_rate": 1.8300962567729958e-05, + "loss": 0.1489, + "num_input_tokens_seen": 16828712, + "step": 23455 + }, + { + "epoch": 48.773388773388774, + "grad_norm": 0.12155133485794067, + "learning_rate": 1.8291504636317866e-05, + "loss": 0.072, + "num_input_tokens_seen": 16832232, + "step": 23460 + }, + { + "epoch": 48.78378378378378, + "grad_norm": 0.17061609029769897, + "learning_rate": 1.8282047739440055e-05, + "loss": 0.075, + "num_input_tokens_seen": 16835688, + "step": 23465 + }, + { + "epoch": 48.79417879417879, + "grad_norm": 0.5264384150505066, + "learning_rate": 1.8272591878554903e-05, + "loss": 0.1284, + "num_input_tokens_seen": 16839432, + "step": 23470 + }, + { + "epoch": 48.804573804573806, + "grad_norm": 0.31635722517967224, + "learning_rate": 1.8263137055120638e-05, + "loss": 0.1031, + "num_input_tokens_seen": 16843016, + "step": 23475 + }, + { + "epoch": 48.814968814968815, + "grad_norm": 0.3440271317958832, + "learning_rate": 1.8253683270595295e-05, + "loss": 0.1262, + "num_input_tokens_seen": 16846504, + "step": 23480 + }, + { + "epoch": 48.82536382536382, + "grad_norm": 0.258174329996109, + "learning_rate": 1.824423052643677e-05, + "loss": 0.0776, + "num_input_tokens_seen": 16850120, + "step": 23485 + }, + { + "epoch": 48.83575883575884, + "grad_norm": 0.34955719113349915, + "learning_rate": 1.82347788241028e-05, + "loss": 0.0982, + "num_input_tokens_seen": 16853640, + "step": 23490 + }, + { + "epoch": 48.84615384615385, + "grad_norm": 0.271145761013031, + "learning_rate": 1.8225328165050942e-05, + "loss": 0.1354, + "num_input_tokens_seen": 16857352, + "step": 23495 + }, + { + "epoch": 48.856548856548855, + "grad_norm": 0.8477951288223267, + "learning_rate": 1.821587855073863e-05, + "loss": 0.1031, + "num_input_tokens_seen": 16861096, + "step": 23500 + }, + { + "epoch": 48.86694386694387, + "grad_norm": 0.32745254039764404, + "learning_rate": 1.8206429982623086e-05, + "loss": 0.1086, + "num_input_tokens_seen": 16864712, + "step": 23505 + }, + { + "epoch": 48.87733887733888, + "grad_norm": 0.2692011594772339, + "learning_rate": 1.8196982462161416e-05, + "loss": 0.1154, + "num_input_tokens_seen": 16868296, + "step": 23510 + }, + { + "epoch": 48.88773388773389, + "grad_norm": 0.4816019535064697, + "learning_rate": 1.818753599081055e-05, + "loss": 0.0921, + "num_input_tokens_seen": 16871848, + "step": 23515 + }, + { + "epoch": 48.898128898128896, + "grad_norm": 0.3274303674697876, + "learning_rate": 1.817809057002724e-05, + "loss": 0.1007, + "num_input_tokens_seen": 16875368, + "step": 23520 + }, + { + "epoch": 48.90852390852391, + "grad_norm": 0.3830528259277344, + "learning_rate": 1.8168646201268096e-05, + "loss": 0.1468, + "num_input_tokens_seen": 16878984, + "step": 23525 + }, + { + "epoch": 48.91891891891892, + "grad_norm": 0.39923396706581116, + "learning_rate": 1.8159202885989557e-05, + "loss": 0.1038, + "num_input_tokens_seen": 16882696, + "step": 23530 + }, + { + "epoch": 48.92931392931393, + "grad_norm": 0.9103984832763672, + "learning_rate": 1.814976062564789e-05, + "loss": 0.0745, + "num_input_tokens_seen": 16886088, + "step": 23535 + }, + { + "epoch": 48.93970893970894, + "grad_norm": 0.25726085901260376, + "learning_rate": 1.8140319421699234e-05, + "loss": 0.1393, + "num_input_tokens_seen": 16889832, + "step": 23540 + }, + { + "epoch": 48.95010395010395, + "grad_norm": 0.45541059970855713, + "learning_rate": 1.8130879275599515e-05, + "loss": 0.1311, + "num_input_tokens_seen": 16893320, + "step": 23545 + }, + { + "epoch": 48.96049896049896, + "grad_norm": 0.16900920867919922, + "learning_rate": 1.8121440188804544e-05, + "loss": 0.0762, + "num_input_tokens_seen": 16896936, + "step": 23550 + }, + { + "epoch": 48.97089397089397, + "grad_norm": 0.37086406350135803, + "learning_rate": 1.811200216276993e-05, + "loss": 0.0767, + "num_input_tokens_seen": 16900456, + "step": 23555 + }, + { + "epoch": 48.981288981288984, + "grad_norm": 0.2995770275592804, + "learning_rate": 1.810256519895115e-05, + "loss": 0.099, + "num_input_tokens_seen": 16904072, + "step": 23560 + }, + { + "epoch": 48.99168399168399, + "grad_norm": 0.321452796459198, + "learning_rate": 1.8093129298803494e-05, + "loss": 0.1007, + "num_input_tokens_seen": 16907528, + "step": 23565 + }, + { + "epoch": 49.002079002079, + "grad_norm": 0.26364830136299133, + "learning_rate": 1.808369446378209e-05, + "loss": 0.1059, + "num_input_tokens_seen": 16911136, + "step": 23570 + }, + { + "epoch": 49.012474012474016, + "grad_norm": 0.17232388257980347, + "learning_rate": 1.8074260695341914e-05, + "loss": 0.0901, + "num_input_tokens_seen": 16914784, + "step": 23575 + }, + { + "epoch": 49.022869022869024, + "grad_norm": 0.2687737047672272, + "learning_rate": 1.8064827994937782e-05, + "loss": 0.0631, + "num_input_tokens_seen": 16918272, + "step": 23580 + }, + { + "epoch": 49.03326403326403, + "grad_norm": 0.30420517921447754, + "learning_rate": 1.8055396364024317e-05, + "loss": 0.1175, + "num_input_tokens_seen": 16921728, + "step": 23585 + }, + { + "epoch": 49.04365904365904, + "grad_norm": 0.31535404920578003, + "learning_rate": 1.804596580405601e-05, + "loss": 0.1091, + "num_input_tokens_seen": 16925440, + "step": 23590 + }, + { + "epoch": 49.054054054054056, + "grad_norm": 0.5160621404647827, + "learning_rate": 1.8036536316487174e-05, + "loss": 0.0905, + "num_input_tokens_seen": 16929120, + "step": 23595 + }, + { + "epoch": 49.064449064449065, + "grad_norm": 0.27671676874160767, + "learning_rate": 1.802710790277193e-05, + "loss": 0.1046, + "num_input_tokens_seen": 16932768, + "step": 23600 + }, + { + "epoch": 49.064449064449065, + "eval_loss": 0.15088996291160583, + "eval_runtime": 7.7451, + "eval_samples_per_second": 110.522, + "eval_steps_per_second": 27.631, + "num_input_tokens_seen": 16932768, + "step": 23600 + }, + { + "epoch": 49.07484407484407, + "grad_norm": 0.20447884500026703, + "learning_rate": 1.801768056436429e-05, + "loss": 0.0651, + "num_input_tokens_seen": 16936384, + "step": 23605 + }, + { + "epoch": 49.08523908523909, + "grad_norm": 0.740466296672821, + "learning_rate": 1.8008254302718035e-05, + "loss": 0.1247, + "num_input_tokens_seen": 16940032, + "step": 23610 + }, + { + "epoch": 49.0956340956341, + "grad_norm": 0.1999683678150177, + "learning_rate": 1.7998829119286837e-05, + "loss": 0.1085, + "num_input_tokens_seen": 16943712, + "step": 23615 + }, + { + "epoch": 49.106029106029105, + "grad_norm": 0.1834011673927307, + "learning_rate": 1.798940501552418e-05, + "loss": 0.0804, + "num_input_tokens_seen": 16947136, + "step": 23620 + }, + { + "epoch": 49.11642411642411, + "grad_norm": 0.1280381679534912, + "learning_rate": 1.797998199288336e-05, + "loss": 0.1261, + "num_input_tokens_seen": 16950720, + "step": 23625 + }, + { + "epoch": 49.12681912681913, + "grad_norm": 0.5009818077087402, + "learning_rate": 1.7970560052817543e-05, + "loss": 0.106, + "num_input_tokens_seen": 16954208, + "step": 23630 + }, + { + "epoch": 49.13721413721414, + "grad_norm": 0.5743697285652161, + "learning_rate": 1.7961139196779702e-05, + "loss": 0.1059, + "num_input_tokens_seen": 16957920, + "step": 23635 + }, + { + "epoch": 49.147609147609145, + "grad_norm": 0.9801022410392761, + "learning_rate": 1.7951719426222647e-05, + "loss": 0.1223, + "num_input_tokens_seen": 16961568, + "step": 23640 + }, + { + "epoch": 49.15800415800416, + "grad_norm": 0.27176472544670105, + "learning_rate": 1.794230074259904e-05, + "loss": 0.097, + "num_input_tokens_seen": 16965312, + "step": 23645 + }, + { + "epoch": 49.16839916839917, + "grad_norm": 0.21662943065166473, + "learning_rate": 1.7932883147361336e-05, + "loss": 0.0912, + "num_input_tokens_seen": 16968960, + "step": 23650 + }, + { + "epoch": 49.17879417879418, + "grad_norm": 0.702857494354248, + "learning_rate": 1.7923466641961865e-05, + "loss": 0.0879, + "num_input_tokens_seen": 16972512, + "step": 23655 + }, + { + "epoch": 49.189189189189186, + "grad_norm": 0.4271685779094696, + "learning_rate": 1.791405122785278e-05, + "loss": 0.0929, + "num_input_tokens_seen": 16976160, + "step": 23660 + }, + { + "epoch": 49.1995841995842, + "grad_norm": 0.2744866907596588, + "learning_rate": 1.7904636906486037e-05, + "loss": 0.1231, + "num_input_tokens_seen": 16979712, + "step": 23665 + }, + { + "epoch": 49.20997920997921, + "grad_norm": 0.20985066890716553, + "learning_rate": 1.7895223679313448e-05, + "loss": 0.1681, + "num_input_tokens_seen": 16983264, + "step": 23670 + }, + { + "epoch": 49.22037422037422, + "grad_norm": 0.24708670377731323, + "learning_rate": 1.7885811547786653e-05, + "loss": 0.1046, + "num_input_tokens_seen": 16986944, + "step": 23675 + }, + { + "epoch": 49.23076923076923, + "grad_norm": 0.38245970010757446, + "learning_rate": 1.7876400513357115e-05, + "loss": 0.0941, + "num_input_tokens_seen": 16990496, + "step": 23680 + }, + { + "epoch": 49.24116424116424, + "grad_norm": 0.5475528240203857, + "learning_rate": 1.7866990577476146e-05, + "loss": 0.1125, + "num_input_tokens_seen": 16994048, + "step": 23685 + }, + { + "epoch": 49.25155925155925, + "grad_norm": 0.3339974284172058, + "learning_rate": 1.7857581741594863e-05, + "loss": 0.1144, + "num_input_tokens_seen": 16997600, + "step": 23690 + }, + { + "epoch": 49.26195426195426, + "grad_norm": 0.7404349446296692, + "learning_rate": 1.7848174007164237e-05, + "loss": 0.0925, + "num_input_tokens_seen": 17001248, + "step": 23695 + }, + { + "epoch": 49.272349272349274, + "grad_norm": 0.2729557454586029, + "learning_rate": 1.7838767375635052e-05, + "loss": 0.1443, + "num_input_tokens_seen": 17004800, + "step": 23700 + }, + { + "epoch": 49.28274428274428, + "grad_norm": 0.2671215534210205, + "learning_rate": 1.782936184845793e-05, + "loss": 0.1065, + "num_input_tokens_seen": 17008352, + "step": 23705 + }, + { + "epoch": 49.29313929313929, + "grad_norm": 0.4320538341999054, + "learning_rate": 1.7819957427083334e-05, + "loss": 0.1232, + "num_input_tokens_seen": 17011936, + "step": 23710 + }, + { + "epoch": 49.303534303534306, + "grad_norm": 0.6581394672393799, + "learning_rate": 1.7810554112961516e-05, + "loss": 0.0707, + "num_input_tokens_seen": 17015552, + "step": 23715 + }, + { + "epoch": 49.313929313929314, + "grad_norm": 0.15737107396125793, + "learning_rate": 1.7801151907542607e-05, + "loss": 0.0967, + "num_input_tokens_seen": 17019104, + "step": 23720 + }, + { + "epoch": 49.32432432432432, + "grad_norm": 0.34847113490104675, + "learning_rate": 1.7791750812276547e-05, + "loss": 0.0818, + "num_input_tokens_seen": 17022688, + "step": 23725 + }, + { + "epoch": 49.33471933471934, + "grad_norm": 0.4781322181224823, + "learning_rate": 1.778235082861309e-05, + "loss": 0.1044, + "num_input_tokens_seen": 17026272, + "step": 23730 + }, + { + "epoch": 49.34511434511435, + "grad_norm": 0.576609194278717, + "learning_rate": 1.777295195800184e-05, + "loss": 0.1312, + "num_input_tokens_seen": 17029824, + "step": 23735 + }, + { + "epoch": 49.355509355509355, + "grad_norm": 0.15792764723300934, + "learning_rate": 1.7763554201892215e-05, + "loss": 0.0937, + "num_input_tokens_seen": 17033344, + "step": 23740 + }, + { + "epoch": 49.36590436590436, + "grad_norm": 0.20499420166015625, + "learning_rate": 1.7754157561733476e-05, + "loss": 0.0848, + "num_input_tokens_seen": 17037024, + "step": 23745 + }, + { + "epoch": 49.37629937629938, + "grad_norm": 0.3514021039009094, + "learning_rate": 1.7744762038974702e-05, + "loss": 0.1069, + "num_input_tokens_seen": 17040768, + "step": 23750 + }, + { + "epoch": 49.38669438669439, + "grad_norm": 0.35418516397476196, + "learning_rate": 1.7735367635064788e-05, + "loss": 0.09, + "num_input_tokens_seen": 17044480, + "step": 23755 + }, + { + "epoch": 49.397089397089395, + "grad_norm": 0.5877873301506042, + "learning_rate": 1.7725974351452474e-05, + "loss": 0.136, + "num_input_tokens_seen": 17048000, + "step": 23760 + }, + { + "epoch": 49.40748440748441, + "grad_norm": 0.2773090898990631, + "learning_rate": 1.771658218958634e-05, + "loss": 0.0863, + "num_input_tokens_seen": 17051520, + "step": 23765 + }, + { + "epoch": 49.41787941787942, + "grad_norm": 0.3519362509250641, + "learning_rate": 1.770719115091475e-05, + "loss": 0.1322, + "num_input_tokens_seen": 17055104, + "step": 23770 + }, + { + "epoch": 49.42827442827443, + "grad_norm": 0.26410216093063354, + "learning_rate": 1.7697801236885935e-05, + "loss": 0.112, + "num_input_tokens_seen": 17058912, + "step": 23775 + }, + { + "epoch": 49.438669438669436, + "grad_norm": 0.22539843618869781, + "learning_rate": 1.7688412448947944e-05, + "loss": 0.0993, + "num_input_tokens_seen": 17062336, + "step": 23780 + }, + { + "epoch": 49.44906444906445, + "grad_norm": 0.278808057308197, + "learning_rate": 1.767902478854862e-05, + "loss": 0.0931, + "num_input_tokens_seen": 17065984, + "step": 23785 + }, + { + "epoch": 49.45945945945946, + "grad_norm": 0.30005767941474915, + "learning_rate": 1.766963825713569e-05, + "loss": 0.0815, + "num_input_tokens_seen": 17069568, + "step": 23790 + }, + { + "epoch": 49.46985446985447, + "grad_norm": 0.17188218235969543, + "learning_rate": 1.766025285615665e-05, + "loss": 0.0614, + "num_input_tokens_seen": 17073088, + "step": 23795 + }, + { + "epoch": 49.48024948024948, + "grad_norm": 0.10836770385503769, + "learning_rate": 1.7650868587058854e-05, + "loss": 0.0668, + "num_input_tokens_seen": 17076672, + "step": 23800 + }, + { + "epoch": 49.48024948024948, + "eval_loss": 0.1451115608215332, + "eval_runtime": 7.7448, + "eval_samples_per_second": 110.525, + "eval_steps_per_second": 27.631, + "num_input_tokens_seen": 17076672, + "step": 23800 + }, + { + "epoch": 49.49064449064449, + "grad_norm": 0.17813198268413544, + "learning_rate": 1.7641485451289484e-05, + "loss": 0.0798, + "num_input_tokens_seen": 17080224, + "step": 23805 + }, + { + "epoch": 49.5010395010395, + "grad_norm": 0.40337124466896057, + "learning_rate": 1.7632103450295534e-05, + "loss": 0.1216, + "num_input_tokens_seen": 17084000, + "step": 23810 + }, + { + "epoch": 49.51143451143451, + "grad_norm": 0.19979070127010345, + "learning_rate": 1.762272258552381e-05, + "loss": 0.0782, + "num_input_tokens_seen": 17087584, + "step": 23815 + }, + { + "epoch": 49.521829521829524, + "grad_norm": 0.4345114231109619, + "learning_rate": 1.7613342858420988e-05, + "loss": 0.1242, + "num_input_tokens_seen": 17091168, + "step": 23820 + }, + { + "epoch": 49.53222453222453, + "grad_norm": 0.615875244140625, + "learning_rate": 1.760396427043351e-05, + "loss": 0.0972, + "num_input_tokens_seen": 17094880, + "step": 23825 + }, + { + "epoch": 49.54261954261954, + "grad_norm": 0.36117029190063477, + "learning_rate": 1.7594586823007696e-05, + "loss": 0.0914, + "num_input_tokens_seen": 17098528, + "step": 23830 + }, + { + "epoch": 49.553014553014556, + "grad_norm": 0.5395487546920776, + "learning_rate": 1.7585210517589646e-05, + "loss": 0.0669, + "num_input_tokens_seen": 17102112, + "step": 23835 + }, + { + "epoch": 49.563409563409564, + "grad_norm": 0.28160360455513, + "learning_rate": 1.7575835355625314e-05, + "loss": 0.1049, + "num_input_tokens_seen": 17105696, + "step": 23840 + }, + { + "epoch": 49.57380457380457, + "grad_norm": 0.4405117332935333, + "learning_rate": 1.756646133856048e-05, + "loss": 0.0714, + "num_input_tokens_seen": 17109056, + "step": 23845 + }, + { + "epoch": 49.58419958419958, + "grad_norm": 0.31261274218559265, + "learning_rate": 1.7557088467840714e-05, + "loss": 0.0699, + "num_input_tokens_seen": 17112512, + "step": 23850 + }, + { + "epoch": 49.5945945945946, + "grad_norm": 0.3727013170719147, + "learning_rate": 1.7547716744911438e-05, + "loss": 0.0769, + "num_input_tokens_seen": 17116064, + "step": 23855 + }, + { + "epoch": 49.604989604989605, + "grad_norm": 0.3611781597137451, + "learning_rate": 1.7538346171217902e-05, + "loss": 0.1055, + "num_input_tokens_seen": 17119616, + "step": 23860 + }, + { + "epoch": 49.61538461538461, + "grad_norm": 0.45822781324386597, + "learning_rate": 1.7528976748205146e-05, + "loss": 0.1221, + "num_input_tokens_seen": 17123136, + "step": 23865 + }, + { + "epoch": 49.62577962577963, + "grad_norm": 0.3237990736961365, + "learning_rate": 1.751960847731807e-05, + "loss": 0.0985, + "num_input_tokens_seen": 17126752, + "step": 23870 + }, + { + "epoch": 49.63617463617464, + "grad_norm": 0.3630797266960144, + "learning_rate": 1.7510241360001362e-05, + "loss": 0.1064, + "num_input_tokens_seen": 17130240, + "step": 23875 + }, + { + "epoch": 49.646569646569645, + "grad_norm": 0.21204552054405212, + "learning_rate": 1.7500875397699562e-05, + "loss": 0.0949, + "num_input_tokens_seen": 17134048, + "step": 23880 + }, + { + "epoch": 49.656964656964654, + "grad_norm": 0.6667659878730774, + "learning_rate": 1.7491510591857015e-05, + "loss": 0.0867, + "num_input_tokens_seen": 17137824, + "step": 23885 + }, + { + "epoch": 49.66735966735967, + "grad_norm": 0.4009595215320587, + "learning_rate": 1.7482146943917896e-05, + "loss": 0.1249, + "num_input_tokens_seen": 17141504, + "step": 23890 + }, + { + "epoch": 49.67775467775468, + "grad_norm": 0.5101662874221802, + "learning_rate": 1.7472784455326185e-05, + "loss": 0.0863, + "num_input_tokens_seen": 17144992, + "step": 23895 + }, + { + "epoch": 49.688149688149686, + "grad_norm": 1.091907262802124, + "learning_rate": 1.746342312752572e-05, + "loss": 0.146, + "num_input_tokens_seen": 17148480, + "step": 23900 + }, + { + "epoch": 49.6985446985447, + "grad_norm": 0.3591326177120209, + "learning_rate": 1.74540629619601e-05, + "loss": 0.0716, + "num_input_tokens_seen": 17152032, + "step": 23905 + }, + { + "epoch": 49.70893970893971, + "grad_norm": 0.20886562764644623, + "learning_rate": 1.7444703960072815e-05, + "loss": 0.106, + "num_input_tokens_seen": 17155456, + "step": 23910 + }, + { + "epoch": 49.71933471933472, + "grad_norm": 0.4601130485534668, + "learning_rate": 1.7435346123307118e-05, + "loss": 0.1072, + "num_input_tokens_seen": 17159072, + "step": 23915 + }, + { + "epoch": 49.729729729729726, + "grad_norm": 0.37777528166770935, + "learning_rate": 1.742598945310611e-05, + "loss": 0.1094, + "num_input_tokens_seen": 17162560, + "step": 23920 + }, + { + "epoch": 49.74012474012474, + "grad_norm": 0.28043127059936523, + "learning_rate": 1.741663395091272e-05, + "loss": 0.0929, + "num_input_tokens_seen": 17166272, + "step": 23925 + }, + { + "epoch": 49.75051975051975, + "grad_norm": 0.1864326447248459, + "learning_rate": 1.7407279618169657e-05, + "loss": 0.0886, + "num_input_tokens_seen": 17169824, + "step": 23930 + }, + { + "epoch": 49.76091476091476, + "grad_norm": 0.2699935734272003, + "learning_rate": 1.73979264563195e-05, + "loss": 0.0718, + "num_input_tokens_seen": 17173440, + "step": 23935 + }, + { + "epoch": 49.771309771309774, + "grad_norm": 0.44331517815589905, + "learning_rate": 1.7388574466804625e-05, + "loss": 0.111, + "num_input_tokens_seen": 17176960, + "step": 23940 + }, + { + "epoch": 49.78170478170478, + "grad_norm": 0.930926501750946, + "learning_rate": 1.7379223651067207e-05, + "loss": 0.1063, + "num_input_tokens_seen": 17180576, + "step": 23945 + }, + { + "epoch": 49.79209979209979, + "grad_norm": 0.565575897693634, + "learning_rate": 1.736987401054928e-05, + "loss": 0.1104, + "num_input_tokens_seen": 17184192, + "step": 23950 + }, + { + "epoch": 49.802494802494806, + "grad_norm": 0.2356468141078949, + "learning_rate": 1.736052554669266e-05, + "loss": 0.1327, + "num_input_tokens_seen": 17187840, + "step": 23955 + }, + { + "epoch": 49.812889812889814, + "grad_norm": 0.2077520191669464, + "learning_rate": 1.7351178260939007e-05, + "loss": 0.0888, + "num_input_tokens_seen": 17191392, + "step": 23960 + }, + { + "epoch": 49.82328482328482, + "grad_norm": 0.6653088331222534, + "learning_rate": 1.7341832154729794e-05, + "loss": 0.1084, + "num_input_tokens_seen": 17195040, + "step": 23965 + }, + { + "epoch": 49.83367983367983, + "grad_norm": 0.39607110619544983, + "learning_rate": 1.7332487229506286e-05, + "loss": 0.1132, + "num_input_tokens_seen": 17198560, + "step": 23970 + }, + { + "epoch": 49.84407484407485, + "grad_norm": 0.2674051821231842, + "learning_rate": 1.732314348670961e-05, + "loss": 0.0811, + "num_input_tokens_seen": 17202176, + "step": 23975 + }, + { + "epoch": 49.854469854469855, + "grad_norm": 0.1798103004693985, + "learning_rate": 1.7313800927780686e-05, + "loss": 0.1016, + "num_input_tokens_seen": 17205760, + "step": 23980 + }, + { + "epoch": 49.86486486486486, + "grad_norm": 0.9418762922286987, + "learning_rate": 1.7304459554160245e-05, + "loss": 0.1303, + "num_input_tokens_seen": 17209248, + "step": 23985 + }, + { + "epoch": 49.87525987525988, + "grad_norm": 0.1660027652978897, + "learning_rate": 1.7295119367288853e-05, + "loss": 0.099, + "num_input_tokens_seen": 17212832, + "step": 23990 + }, + { + "epoch": 49.88565488565489, + "grad_norm": 0.31478968262672424, + "learning_rate": 1.728578036860688e-05, + "loss": 0.092, + "num_input_tokens_seen": 17216512, + "step": 23995 + }, + { + "epoch": 49.896049896049895, + "grad_norm": 0.42795008420944214, + "learning_rate": 1.7276442559554513e-05, + "loss": 0.1376, + "num_input_tokens_seen": 17220000, + "step": 24000 + }, + { + "epoch": 49.896049896049895, + "eval_loss": 0.14429092407226562, + "eval_runtime": 7.7546, + "eval_samples_per_second": 110.386, + "eval_steps_per_second": 27.597, + "num_input_tokens_seen": 17220000, + "step": 24000 + }, + { + "epoch": 49.906444906444904, + "grad_norm": 0.2619156837463379, + "learning_rate": 1.726710594157177e-05, + "loss": 0.1257, + "num_input_tokens_seen": 17223648, + "step": 24005 + }, + { + "epoch": 49.91683991683992, + "grad_norm": 0.2857290804386139, + "learning_rate": 1.725777051609846e-05, + "loss": 0.1333, + "num_input_tokens_seen": 17227200, + "step": 24010 + }, + { + "epoch": 49.92723492723493, + "grad_norm": 0.5566674470901489, + "learning_rate": 1.7248436284574228e-05, + "loss": 0.1365, + "num_input_tokens_seen": 17230912, + "step": 24015 + }, + { + "epoch": 49.937629937629936, + "grad_norm": 0.7890837788581848, + "learning_rate": 1.723910324843855e-05, + "loss": 0.1047, + "num_input_tokens_seen": 17234432, + "step": 24020 + }, + { + "epoch": 49.94802494802495, + "grad_norm": 0.38685789704322815, + "learning_rate": 1.722977140913067e-05, + "loss": 0.1314, + "num_input_tokens_seen": 17238112, + "step": 24025 + }, + { + "epoch": 49.95841995841996, + "grad_norm": 0.22148878872394562, + "learning_rate": 1.7220440768089688e-05, + "loss": 0.08, + "num_input_tokens_seen": 17241504, + "step": 24030 + }, + { + "epoch": 49.96881496881497, + "grad_norm": 0.4310178756713867, + "learning_rate": 1.7211111326754505e-05, + "loss": 0.1234, + "num_input_tokens_seen": 17245280, + "step": 24035 + }, + { + "epoch": 49.979209979209976, + "grad_norm": 0.7268690466880798, + "learning_rate": 1.720178308656383e-05, + "loss": 0.1141, + "num_input_tokens_seen": 17248768, + "step": 24040 + }, + { + "epoch": 49.98960498960499, + "grad_norm": 0.4127441942691803, + "learning_rate": 1.719245604895621e-05, + "loss": 0.0964, + "num_input_tokens_seen": 17252448, + "step": 24045 + }, + { + "epoch": 50.0, + "grad_norm": 0.3228384554386139, + "learning_rate": 1.7183130215369972e-05, + "loss": 0.1178, + "num_input_tokens_seen": 17256040, + "step": 24050 + }, + { + "epoch": 50.01039501039501, + "grad_norm": 0.4973011910915375, + "learning_rate": 1.7173805587243292e-05, + "loss": 0.1045, + "num_input_tokens_seen": 17259656, + "step": 24055 + }, + { + "epoch": 50.020790020790024, + "grad_norm": 0.719908595085144, + "learning_rate": 1.7164482166014147e-05, + "loss": 0.118, + "num_input_tokens_seen": 17263336, + "step": 24060 + }, + { + "epoch": 50.03118503118503, + "grad_norm": 0.13695016503334045, + "learning_rate": 1.7155159953120313e-05, + "loss": 0.1018, + "num_input_tokens_seen": 17266888, + "step": 24065 + }, + { + "epoch": 50.04158004158004, + "grad_norm": 0.34338313341140747, + "learning_rate": 1.714583894999941e-05, + "loss": 0.0956, + "num_input_tokens_seen": 17270440, + "step": 24070 + }, + { + "epoch": 50.05197505197505, + "grad_norm": 0.19823403656482697, + "learning_rate": 1.7136519158088826e-05, + "loss": 0.1196, + "num_input_tokens_seen": 17274024, + "step": 24075 + }, + { + "epoch": 50.062370062370064, + "grad_norm": 0.7314260005950928, + "learning_rate": 1.712720057882581e-05, + "loss": 0.092, + "num_input_tokens_seen": 17277640, + "step": 24080 + }, + { + "epoch": 50.07276507276507, + "grad_norm": 0.4718901216983795, + "learning_rate": 1.7117883213647413e-05, + "loss": 0.1389, + "num_input_tokens_seen": 17281352, + "step": 24085 + }, + { + "epoch": 50.08316008316008, + "grad_norm": 0.18274755775928497, + "learning_rate": 1.710856706399046e-05, + "loss": 0.077, + "num_input_tokens_seen": 17284904, + "step": 24090 + }, + { + "epoch": 50.093555093555096, + "grad_norm": 0.2578517198562622, + "learning_rate": 1.7099252131291648e-05, + "loss": 0.1185, + "num_input_tokens_seen": 17288360, + "step": 24095 + }, + { + "epoch": 50.103950103950105, + "grad_norm": 0.5924055576324463, + "learning_rate": 1.708993841698744e-05, + "loss": 0.0879, + "num_input_tokens_seen": 17292040, + "step": 24100 + }, + { + "epoch": 50.11434511434511, + "grad_norm": 0.4801981747150421, + "learning_rate": 1.7080625922514132e-05, + "loss": 0.1028, + "num_input_tokens_seen": 17295592, + "step": 24105 + }, + { + "epoch": 50.12474012474012, + "grad_norm": 0.29243502020835876, + "learning_rate": 1.7071314649307836e-05, + "loss": 0.0938, + "num_input_tokens_seen": 17299272, + "step": 24110 + }, + { + "epoch": 50.13513513513514, + "grad_norm": 0.20679281651973724, + "learning_rate": 1.7062004598804448e-05, + "loss": 0.1092, + "num_input_tokens_seen": 17303112, + "step": 24115 + }, + { + "epoch": 50.145530145530145, + "grad_norm": 0.23995034396648407, + "learning_rate": 1.7052695772439702e-05, + "loss": 0.0993, + "num_input_tokens_seen": 17306664, + "step": 24120 + }, + { + "epoch": 50.15592515592515, + "grad_norm": 0.26262062788009644, + "learning_rate": 1.7043388171649154e-05, + "loss": 0.1257, + "num_input_tokens_seen": 17310152, + "step": 24125 + }, + { + "epoch": 50.16632016632017, + "grad_norm": 0.7092837691307068, + "learning_rate": 1.7034081797868127e-05, + "loss": 0.0932, + "num_input_tokens_seen": 17313736, + "step": 24130 + }, + { + "epoch": 50.17671517671518, + "grad_norm": 0.4018847942352295, + "learning_rate": 1.70247766525318e-05, + "loss": 0.1259, + "num_input_tokens_seen": 17317320, + "step": 24135 + }, + { + "epoch": 50.187110187110186, + "grad_norm": 0.27437126636505127, + "learning_rate": 1.701547273707514e-05, + "loss": 0.0694, + "num_input_tokens_seen": 17320840, + "step": 24140 + }, + { + "epoch": 50.197505197505194, + "grad_norm": 0.20417103171348572, + "learning_rate": 1.7006170052932916e-05, + "loss": 0.0653, + "num_input_tokens_seen": 17324488, + "step": 24145 + }, + { + "epoch": 50.20790020790021, + "grad_norm": 0.5473166108131409, + "learning_rate": 1.6996868601539735e-05, + "loss": 0.1077, + "num_input_tokens_seen": 17327944, + "step": 24150 + }, + { + "epoch": 50.21829521829522, + "grad_norm": 0.7432544231414795, + "learning_rate": 1.6987568384329977e-05, + "loss": 0.0838, + "num_input_tokens_seen": 17331432, + "step": 24155 + }, + { + "epoch": 50.228690228690226, + "grad_norm": 0.18991857767105103, + "learning_rate": 1.6978269402737866e-05, + "loss": 0.1221, + "num_input_tokens_seen": 17334984, + "step": 24160 + }, + { + "epoch": 50.23908523908524, + "grad_norm": 0.5821559429168701, + "learning_rate": 1.696897165819743e-05, + "loss": 0.1426, + "num_input_tokens_seen": 17338664, + "step": 24165 + }, + { + "epoch": 50.24948024948025, + "grad_norm": 0.6488886475563049, + "learning_rate": 1.6959675152142487e-05, + "loss": 0.0938, + "num_input_tokens_seen": 17342408, + "step": 24170 + }, + { + "epoch": 50.25987525987526, + "grad_norm": 0.27209195494651794, + "learning_rate": 1.6950379886006667e-05, + "loss": 0.088, + "num_input_tokens_seen": 17345992, + "step": 24175 + }, + { + "epoch": 50.270270270270274, + "grad_norm": 0.22815600037574768, + "learning_rate": 1.6941085861223438e-05, + "loss": 0.0589, + "num_input_tokens_seen": 17349672, + "step": 24180 + }, + { + "epoch": 50.28066528066528, + "grad_norm": 0.27376788854599, + "learning_rate": 1.6931793079226034e-05, + "loss": 0.0862, + "num_input_tokens_seen": 17353160, + "step": 24185 + }, + { + "epoch": 50.29106029106029, + "grad_norm": 0.9608116149902344, + "learning_rate": 1.692250154144754e-05, + "loss": 0.0844, + "num_input_tokens_seen": 17356712, + "step": 24190 + }, + { + "epoch": 50.3014553014553, + "grad_norm": 0.8501760959625244, + "learning_rate": 1.6913211249320807e-05, + "loss": 0.1029, + "num_input_tokens_seen": 17360232, + "step": 24195 + }, + { + "epoch": 50.311850311850314, + "grad_norm": 0.2271548956632614, + "learning_rate": 1.6903922204278522e-05, + "loss": 0.0919, + "num_input_tokens_seen": 17363816, + "step": 24200 + }, + { + "epoch": 50.311850311850314, + "eval_loss": 0.14261610805988312, + "eval_runtime": 7.7528, + "eval_samples_per_second": 110.411, + "eval_steps_per_second": 27.603, + "num_input_tokens_seen": 17363816, + "step": 24200 + }, + { + "epoch": 50.32224532224532, + "grad_norm": 0.24383720755577087, + "learning_rate": 1.6894634407753186e-05, + "loss": 0.1027, + "num_input_tokens_seen": 17367432, + "step": 24205 + }, + { + "epoch": 50.33264033264033, + "grad_norm": 0.36341091990470886, + "learning_rate": 1.6885347861177077e-05, + "loss": 0.1032, + "num_input_tokens_seen": 17371176, + "step": 24210 + }, + { + "epoch": 50.343035343035346, + "grad_norm": 0.228022962808609, + "learning_rate": 1.6876062565982298e-05, + "loss": 0.101, + "num_input_tokens_seen": 17374856, + "step": 24215 + }, + { + "epoch": 50.353430353430355, + "grad_norm": 0.23655946552753448, + "learning_rate": 1.6866778523600774e-05, + "loss": 0.0735, + "num_input_tokens_seen": 17378344, + "step": 24220 + }, + { + "epoch": 50.36382536382536, + "grad_norm": 0.5246792435646057, + "learning_rate": 1.6857495735464195e-05, + "loss": 0.1189, + "num_input_tokens_seen": 17382024, + "step": 24225 + }, + { + "epoch": 50.37422037422037, + "grad_norm": 0.15179891884326935, + "learning_rate": 1.6848214203004115e-05, + "loss": 0.0742, + "num_input_tokens_seen": 17385576, + "step": 24230 + }, + { + "epoch": 50.38461538461539, + "grad_norm": 0.25929179787635803, + "learning_rate": 1.6838933927651835e-05, + "loss": 0.0665, + "num_input_tokens_seen": 17389128, + "step": 24235 + }, + { + "epoch": 50.395010395010395, + "grad_norm": 0.6450269222259521, + "learning_rate": 1.6829654910838506e-05, + "loss": 0.1423, + "num_input_tokens_seen": 17392808, + "step": 24240 + }, + { + "epoch": 50.4054054054054, + "grad_norm": 0.7243478894233704, + "learning_rate": 1.6820377153995065e-05, + "loss": 0.1344, + "num_input_tokens_seen": 17396488, + "step": 24245 + }, + { + "epoch": 50.41580041580042, + "grad_norm": 0.34593814611434937, + "learning_rate": 1.681110065855226e-05, + "loss": 0.1182, + "num_input_tokens_seen": 17400072, + "step": 24250 + }, + { + "epoch": 50.42619542619543, + "grad_norm": 0.5557970404624939, + "learning_rate": 1.6801825425940642e-05, + "loss": 0.1254, + "num_input_tokens_seen": 17403720, + "step": 24255 + }, + { + "epoch": 50.436590436590436, + "grad_norm": 0.21575894951820374, + "learning_rate": 1.679255145759056e-05, + "loss": 0.113, + "num_input_tokens_seen": 17407432, + "step": 24260 + }, + { + "epoch": 50.446985446985444, + "grad_norm": 0.3749825060367584, + "learning_rate": 1.6783278754932187e-05, + "loss": 0.1443, + "num_input_tokens_seen": 17410888, + "step": 24265 + }, + { + "epoch": 50.45738045738046, + "grad_norm": 0.48770707845687866, + "learning_rate": 1.6774007319395496e-05, + "loss": 0.0999, + "num_input_tokens_seen": 17414472, + "step": 24270 + }, + { + "epoch": 50.46777546777547, + "grad_norm": 0.47909849882125854, + "learning_rate": 1.6764737152410243e-05, + "loss": 0.0759, + "num_input_tokens_seen": 17417928, + "step": 24275 + }, + { + "epoch": 50.478170478170476, + "grad_norm": 0.500942051410675, + "learning_rate": 1.6755468255406016e-05, + "loss": 0.1221, + "num_input_tokens_seen": 17421544, + "step": 24280 + }, + { + "epoch": 50.48856548856549, + "grad_norm": 0.2337067574262619, + "learning_rate": 1.674620062981219e-05, + "loss": 0.1337, + "num_input_tokens_seen": 17425384, + "step": 24285 + }, + { + "epoch": 50.4989604989605, + "grad_norm": 0.861142635345459, + "learning_rate": 1.6736934277057947e-05, + "loss": 0.0911, + "num_input_tokens_seen": 17428872, + "step": 24290 + }, + { + "epoch": 50.50935550935551, + "grad_norm": 0.13161814212799072, + "learning_rate": 1.6727669198572286e-05, + "loss": 0.0609, + "num_input_tokens_seen": 17432488, + "step": 24295 + }, + { + "epoch": 50.51975051975052, + "grad_norm": 0.6264936327934265, + "learning_rate": 1.6718405395783984e-05, + "loss": 0.1485, + "num_input_tokens_seen": 17436008, + "step": 24300 + }, + { + "epoch": 50.53014553014553, + "grad_norm": 0.23091170191764832, + "learning_rate": 1.6709142870121643e-05, + "loss": 0.1036, + "num_input_tokens_seen": 17439592, + "step": 24305 + }, + { + "epoch": 50.54054054054054, + "grad_norm": 0.2788962721824646, + "learning_rate": 1.669988162301367e-05, + "loss": 0.0778, + "num_input_tokens_seen": 17443144, + "step": 24310 + }, + { + "epoch": 50.55093555093555, + "grad_norm": 0.21968615055084229, + "learning_rate": 1.6690621655888243e-05, + "loss": 0.1092, + "num_input_tokens_seen": 17446824, + "step": 24315 + }, + { + "epoch": 50.561330561330564, + "grad_norm": 0.9077258706092834, + "learning_rate": 1.6681362970173386e-05, + "loss": 0.1704, + "num_input_tokens_seen": 17450408, + "step": 24320 + }, + { + "epoch": 50.57172557172557, + "grad_norm": 0.5070129036903381, + "learning_rate": 1.6672105567296904e-05, + "loss": 0.1225, + "num_input_tokens_seen": 17454088, + "step": 24325 + }, + { + "epoch": 50.58212058212058, + "grad_norm": 0.5205569863319397, + "learning_rate": 1.666284944868639e-05, + "loss": 0.0778, + "num_input_tokens_seen": 17457768, + "step": 24330 + }, + { + "epoch": 50.59251559251559, + "grad_norm": 0.24920086562633514, + "learning_rate": 1.665359461576927e-05, + "loss": 0.0995, + "num_input_tokens_seen": 17461544, + "step": 24335 + }, + { + "epoch": 50.602910602910605, + "grad_norm": 0.2302383929491043, + "learning_rate": 1.6644341069972736e-05, + "loss": 0.1198, + "num_input_tokens_seen": 17465096, + "step": 24340 + }, + { + "epoch": 50.61330561330561, + "grad_norm": 0.19026409089565277, + "learning_rate": 1.6635088812723813e-05, + "loss": 0.08, + "num_input_tokens_seen": 17468648, + "step": 24345 + }, + { + "epoch": 50.62370062370062, + "grad_norm": 0.5046247839927673, + "learning_rate": 1.6625837845449328e-05, + "loss": 0.139, + "num_input_tokens_seen": 17472264, + "step": 24350 + }, + { + "epoch": 50.63409563409564, + "grad_norm": 0.8036573529243469, + "learning_rate": 1.6616588169575874e-05, + "loss": 0.0978, + "num_input_tokens_seen": 17475656, + "step": 24355 + }, + { + "epoch": 50.644490644490645, + "grad_norm": 0.5786957144737244, + "learning_rate": 1.6607339786529878e-05, + "loss": 0.1413, + "num_input_tokens_seen": 17479272, + "step": 24360 + }, + { + "epoch": 50.65488565488565, + "grad_norm": 0.7022925019264221, + "learning_rate": 1.659809269773756e-05, + "loss": 0.0999, + "num_input_tokens_seen": 17482888, + "step": 24365 + }, + { + "epoch": 50.66528066528066, + "grad_norm": 0.1695869117975235, + "learning_rate": 1.658884690462493e-05, + "loss": 0.0854, + "num_input_tokens_seen": 17486376, + "step": 24370 + }, + { + "epoch": 50.67567567567568, + "grad_norm": 0.39019957184791565, + "learning_rate": 1.6579602408617813e-05, + "loss": 0.1149, + "num_input_tokens_seen": 17490056, + "step": 24375 + }, + { + "epoch": 50.686070686070686, + "grad_norm": 0.8127732276916504, + "learning_rate": 1.657035921114181e-05, + "loss": 0.1141, + "num_input_tokens_seen": 17493672, + "step": 24380 + }, + { + "epoch": 50.696465696465694, + "grad_norm": 0.9827529788017273, + "learning_rate": 1.656111731362236e-05, + "loss": 0.1326, + "num_input_tokens_seen": 17497288, + "step": 24385 + }, + { + "epoch": 50.70686070686071, + "grad_norm": 0.18708297610282898, + "learning_rate": 1.6551876717484666e-05, + "loss": 0.1259, + "num_input_tokens_seen": 17500904, + "step": 24390 + }, + { + "epoch": 50.71725571725572, + "grad_norm": 0.2084142118692398, + "learning_rate": 1.6542637424153752e-05, + "loss": 0.086, + "num_input_tokens_seen": 17504456, + "step": 24395 + }, + { + "epoch": 50.727650727650726, + "grad_norm": 0.24157801270484924, + "learning_rate": 1.6533399435054418e-05, + "loss": 0.0665, + "num_input_tokens_seen": 17508072, + "step": 24400 + }, + { + "epoch": 50.727650727650726, + "eval_loss": 0.14225232601165771, + "eval_runtime": 7.7548, + "eval_samples_per_second": 110.384, + "eval_steps_per_second": 27.596, + "num_input_tokens_seen": 17508072, + "step": 24400 + }, + { + "epoch": 50.73804573804574, + "grad_norm": 0.6398531198501587, + "learning_rate": 1.6524162751611304e-05, + "loss": 0.0804, + "num_input_tokens_seen": 17511752, + "step": 24405 + }, + { + "epoch": 50.74844074844075, + "grad_norm": 0.614774763584137, + "learning_rate": 1.6514927375248796e-05, + "loss": 0.1075, + "num_input_tokens_seen": 17515272, + "step": 24410 + }, + { + "epoch": 50.75883575883576, + "grad_norm": 0.18001598119735718, + "learning_rate": 1.6505693307391127e-05, + "loss": 0.0845, + "num_input_tokens_seen": 17518824, + "step": 24415 + }, + { + "epoch": 50.76923076923077, + "grad_norm": 0.20859333872795105, + "learning_rate": 1.6496460549462288e-05, + "loss": 0.1055, + "num_input_tokens_seen": 17522408, + "step": 24420 + }, + { + "epoch": 50.77962577962578, + "grad_norm": 0.7850275039672852, + "learning_rate": 1.6487229102886097e-05, + "loss": 0.1154, + "num_input_tokens_seen": 17526088, + "step": 24425 + }, + { + "epoch": 50.79002079002079, + "grad_norm": 0.4494384527206421, + "learning_rate": 1.6477998969086155e-05, + "loss": 0.0694, + "num_input_tokens_seen": 17529704, + "step": 24430 + }, + { + "epoch": 50.8004158004158, + "grad_norm": 0.208136647939682, + "learning_rate": 1.646877014948587e-05, + "loss": 0.0684, + "num_input_tokens_seen": 17533192, + "step": 24435 + }, + { + "epoch": 50.810810810810814, + "grad_norm": 0.1722421795129776, + "learning_rate": 1.6459542645508433e-05, + "loss": 0.0681, + "num_input_tokens_seen": 17537000, + "step": 24440 + }, + { + "epoch": 50.82120582120582, + "grad_norm": 0.1931123286485672, + "learning_rate": 1.6450316458576852e-05, + "loss": 0.1235, + "num_input_tokens_seen": 17540584, + "step": 24445 + }, + { + "epoch": 50.83160083160083, + "grad_norm": 0.6029858589172363, + "learning_rate": 1.6441091590113912e-05, + "loss": 0.1135, + "num_input_tokens_seen": 17544136, + "step": 24450 + }, + { + "epoch": 50.84199584199584, + "grad_norm": 0.2741427421569824, + "learning_rate": 1.6431868041542213e-05, + "loss": 0.0884, + "num_input_tokens_seen": 17547688, + "step": 24455 + }, + { + "epoch": 50.852390852390855, + "grad_norm": 0.5236220955848694, + "learning_rate": 1.6422645814284123e-05, + "loss": 0.09, + "num_input_tokens_seen": 17551272, + "step": 24460 + }, + { + "epoch": 50.86278586278586, + "grad_norm": 0.2877378761768341, + "learning_rate": 1.6413424909761846e-05, + "loss": 0.0964, + "num_input_tokens_seen": 17554888, + "step": 24465 + }, + { + "epoch": 50.87318087318087, + "grad_norm": 0.1691024899482727, + "learning_rate": 1.640420532939736e-05, + "loss": 0.106, + "num_input_tokens_seen": 17558376, + "step": 24470 + }, + { + "epoch": 50.88357588357589, + "grad_norm": 0.21730269491672516, + "learning_rate": 1.639498707461242e-05, + "loss": 0.0879, + "num_input_tokens_seen": 17561896, + "step": 24475 + }, + { + "epoch": 50.893970893970895, + "grad_norm": 0.31430262327194214, + "learning_rate": 1.6385770146828614e-05, + "loss": 0.1034, + "num_input_tokens_seen": 17565384, + "step": 24480 + }, + { + "epoch": 50.9043659043659, + "grad_norm": 0.11266147345304489, + "learning_rate": 1.637655454746731e-05, + "loss": 0.0732, + "num_input_tokens_seen": 17569000, + "step": 24485 + }, + { + "epoch": 50.91476091476091, + "grad_norm": 0.38576552271842957, + "learning_rate": 1.6367340277949658e-05, + "loss": 0.087, + "num_input_tokens_seen": 17572488, + "step": 24490 + }, + { + "epoch": 50.92515592515593, + "grad_norm": 0.29229220747947693, + "learning_rate": 1.635812733969663e-05, + "loss": 0.069, + "num_input_tokens_seen": 17576104, + "step": 24495 + }, + { + "epoch": 50.935550935550935, + "grad_norm": 0.24684908986091614, + "learning_rate": 1.634891573412896e-05, + "loss": 0.1207, + "num_input_tokens_seen": 17579720, + "step": 24500 + }, + { + "epoch": 50.945945945945944, + "grad_norm": 0.2995191216468811, + "learning_rate": 1.6339705462667196e-05, + "loss": 0.0852, + "num_input_tokens_seen": 17583336, + "step": 24505 + }, + { + "epoch": 50.95634095634096, + "grad_norm": 0.5262821912765503, + "learning_rate": 1.633049652673169e-05, + "loss": 0.0827, + "num_input_tokens_seen": 17586824, + "step": 24510 + }, + { + "epoch": 50.96673596673597, + "grad_norm": 0.7413138151168823, + "learning_rate": 1.632128892774256e-05, + "loss": 0.1182, + "num_input_tokens_seen": 17590376, + "step": 24515 + }, + { + "epoch": 50.977130977130976, + "grad_norm": 0.250557005405426, + "learning_rate": 1.6312082667119737e-05, + "loss": 0.0547, + "num_input_tokens_seen": 17593800, + "step": 24520 + }, + { + "epoch": 50.987525987525984, + "grad_norm": 0.6753647923469543, + "learning_rate": 1.630287774628296e-05, + "loss": 0.1132, + "num_input_tokens_seen": 17597416, + "step": 24525 + }, + { + "epoch": 50.997920997921, + "grad_norm": 0.3306092917919159, + "learning_rate": 1.6293674166651718e-05, + "loss": 0.12, + "num_input_tokens_seen": 17600936, + "step": 24530 + }, + { + "epoch": 51.00831600831601, + "grad_norm": 0.17918488383293152, + "learning_rate": 1.6284471929645338e-05, + "loss": 0.1413, + "num_input_tokens_seen": 17604448, + "step": 24535 + }, + { + "epoch": 51.018711018711016, + "grad_norm": 0.383983850479126, + "learning_rate": 1.627527103668291e-05, + "loss": 0.1064, + "num_input_tokens_seen": 17608000, + "step": 24540 + }, + { + "epoch": 51.02910602910603, + "grad_norm": 0.5478870272636414, + "learning_rate": 1.6266071489183327e-05, + "loss": 0.091, + "num_input_tokens_seen": 17611616, + "step": 24545 + }, + { + "epoch": 51.03950103950104, + "grad_norm": 0.6276240348815918, + "learning_rate": 1.6256873288565283e-05, + "loss": 0.0866, + "num_input_tokens_seen": 17615328, + "step": 24550 + }, + { + "epoch": 51.04989604989605, + "grad_norm": 0.5359646081924438, + "learning_rate": 1.6247676436247245e-05, + "loss": 0.1174, + "num_input_tokens_seen": 17619072, + "step": 24555 + }, + { + "epoch": 51.06029106029106, + "grad_norm": 0.6981486678123474, + "learning_rate": 1.6238480933647486e-05, + "loss": 0.1103, + "num_input_tokens_seen": 17622752, + "step": 24560 + }, + { + "epoch": 51.07068607068607, + "grad_norm": 0.45708170533180237, + "learning_rate": 1.6229286782184083e-05, + "loss": 0.0816, + "num_input_tokens_seen": 17626208, + "step": 24565 + }, + { + "epoch": 51.08108108108108, + "grad_norm": 0.3157943785190582, + "learning_rate": 1.622009398327487e-05, + "loss": 0.055, + "num_input_tokens_seen": 17629824, + "step": 24570 + }, + { + "epoch": 51.09147609147609, + "grad_norm": 0.6297403573989868, + "learning_rate": 1.6210902538337502e-05, + "loss": 0.0945, + "num_input_tokens_seen": 17633440, + "step": 24575 + }, + { + "epoch": 51.101871101871104, + "grad_norm": 0.3721967935562134, + "learning_rate": 1.6201712448789413e-05, + "loss": 0.0823, + "num_input_tokens_seen": 17637056, + "step": 24580 + }, + { + "epoch": 51.11226611226611, + "grad_norm": 0.7513974905014038, + "learning_rate": 1.6192523716047827e-05, + "loss": 0.1482, + "num_input_tokens_seen": 17640768, + "step": 24585 + }, + { + "epoch": 51.12266112266112, + "grad_norm": 0.39708268642425537, + "learning_rate": 1.6183336341529776e-05, + "loss": 0.0825, + "num_input_tokens_seen": 17644448, + "step": 24590 + }, + { + "epoch": 51.13305613305613, + "grad_norm": 0.6079825162887573, + "learning_rate": 1.6174150326652047e-05, + "loss": 0.1337, + "num_input_tokens_seen": 17647904, + "step": 24595 + }, + { + "epoch": 51.143451143451145, + "grad_norm": 0.21138787269592285, + "learning_rate": 1.6164965672831256e-05, + "loss": 0.117, + "num_input_tokens_seen": 17651488, + "step": 24600 + }, + { + "epoch": 51.143451143451145, + "eval_loss": 0.1500862091779709, + "eval_runtime": 7.7528, + "eval_samples_per_second": 110.412, + "eval_steps_per_second": 27.603, + "num_input_tokens_seen": 17651488, + "step": 24600 + }, + { + "epoch": 51.15384615384615, + "grad_norm": 0.41435912251472473, + "learning_rate": 1.6155782381483784e-05, + "loss": 0.101, + "num_input_tokens_seen": 17655008, + "step": 24605 + }, + { + "epoch": 51.16424116424116, + "grad_norm": 0.29341819882392883, + "learning_rate": 1.6146600454025813e-05, + "loss": 0.0601, + "num_input_tokens_seen": 17658592, + "step": 24610 + }, + { + "epoch": 51.17463617463618, + "grad_norm": 0.278536319732666, + "learning_rate": 1.6137419891873317e-05, + "loss": 0.0946, + "num_input_tokens_seen": 17662144, + "step": 24615 + }, + { + "epoch": 51.185031185031185, + "grad_norm": 0.4551237225532532, + "learning_rate": 1.6128240696442038e-05, + "loss": 0.1326, + "num_input_tokens_seen": 17665952, + "step": 24620 + }, + { + "epoch": 51.195426195426194, + "grad_norm": 0.6675313115119934, + "learning_rate": 1.611906286914753e-05, + "loss": 0.0867, + "num_input_tokens_seen": 17669536, + "step": 24625 + }, + { + "epoch": 51.20582120582121, + "grad_norm": 0.3369758427143097, + "learning_rate": 1.6109886411405144e-05, + "loss": 0.0948, + "num_input_tokens_seen": 17672992, + "step": 24630 + }, + { + "epoch": 51.21621621621622, + "grad_norm": 0.2554379403591156, + "learning_rate": 1.6100711324629985e-05, + "loss": 0.0787, + "num_input_tokens_seen": 17676704, + "step": 24635 + }, + { + "epoch": 51.226611226611226, + "grad_norm": 0.33462393283843994, + "learning_rate": 1.609153761023698e-05, + "loss": 0.1216, + "num_input_tokens_seen": 17680352, + "step": 24640 + }, + { + "epoch": 51.237006237006234, + "grad_norm": 0.35443049669265747, + "learning_rate": 1.608236526964083e-05, + "loss": 0.1258, + "num_input_tokens_seen": 17683808, + "step": 24645 + }, + { + "epoch": 51.24740124740125, + "grad_norm": 0.39808207750320435, + "learning_rate": 1.607319430425601e-05, + "loss": 0.0963, + "num_input_tokens_seen": 17687552, + "step": 24650 + }, + { + "epoch": 51.25779625779626, + "grad_norm": 0.4316663146018982, + "learning_rate": 1.606402471549682e-05, + "loss": 0.1077, + "num_input_tokens_seen": 17691200, + "step": 24655 + }, + { + "epoch": 51.268191268191266, + "grad_norm": 0.3013499677181244, + "learning_rate": 1.6054856504777312e-05, + "loss": 0.0848, + "num_input_tokens_seen": 17694784, + "step": 24660 + }, + { + "epoch": 51.27858627858628, + "grad_norm": 0.4021044373512268, + "learning_rate": 1.6045689673511334e-05, + "loss": 0.1, + "num_input_tokens_seen": 17698400, + "step": 24665 + }, + { + "epoch": 51.28898128898129, + "grad_norm": 0.3703979253768921, + "learning_rate": 1.6036524223112548e-05, + "loss": 0.105, + "num_input_tokens_seen": 17702048, + "step": 24670 + }, + { + "epoch": 51.2993762993763, + "grad_norm": 0.3169595003128052, + "learning_rate": 1.602736015499436e-05, + "loss": 0.1364, + "num_input_tokens_seen": 17705760, + "step": 24675 + }, + { + "epoch": 51.30977130977131, + "grad_norm": 0.19502955675125122, + "learning_rate": 1.601819747057e-05, + "loss": 0.0895, + "num_input_tokens_seen": 17709344, + "step": 24680 + }, + { + "epoch": 51.32016632016632, + "grad_norm": 0.24135194718837738, + "learning_rate": 1.6009036171252465e-05, + "loss": 0.0745, + "num_input_tokens_seen": 17712800, + "step": 24685 + }, + { + "epoch": 51.33056133056133, + "grad_norm": 0.17093008756637573, + "learning_rate": 1.599987625845453e-05, + "loss": 0.066, + "num_input_tokens_seen": 17716512, + "step": 24690 + }, + { + "epoch": 51.34095634095634, + "grad_norm": 0.38585564494132996, + "learning_rate": 1.599071773358879e-05, + "loss": 0.113, + "num_input_tokens_seen": 17720128, + "step": 24695 + }, + { + "epoch": 51.351351351351354, + "grad_norm": 0.16482144594192505, + "learning_rate": 1.598156059806758e-05, + "loss": 0.0736, + "num_input_tokens_seen": 17723552, + "step": 24700 + }, + { + "epoch": 51.36174636174636, + "grad_norm": 0.23718175292015076, + "learning_rate": 1.5972404853303062e-05, + "loss": 0.0852, + "num_input_tokens_seen": 17726976, + "step": 24705 + }, + { + "epoch": 51.37214137214137, + "grad_norm": 0.25980767607688904, + "learning_rate": 1.5963250500707172e-05, + "loss": 0.0917, + "num_input_tokens_seen": 17730336, + "step": 24710 + }, + { + "epoch": 51.38253638253638, + "grad_norm": 0.6803204417228699, + "learning_rate": 1.5954097541691612e-05, + "loss": 0.1036, + "num_input_tokens_seen": 17733888, + "step": 24715 + }, + { + "epoch": 51.392931392931395, + "grad_norm": 0.2617872655391693, + "learning_rate": 1.5944945977667884e-05, + "loss": 0.059, + "num_input_tokens_seen": 17737408, + "step": 24720 + }, + { + "epoch": 51.4033264033264, + "grad_norm": 0.48639222979545593, + "learning_rate": 1.593579581004729e-05, + "loss": 0.1261, + "num_input_tokens_seen": 17741088, + "step": 24725 + }, + { + "epoch": 51.41372141372141, + "grad_norm": 0.38783857226371765, + "learning_rate": 1.592664704024088e-05, + "loss": 0.1258, + "num_input_tokens_seen": 17744704, + "step": 24730 + }, + { + "epoch": 51.42411642411643, + "grad_norm": 0.4086344838142395, + "learning_rate": 1.591749966965953e-05, + "loss": 0.0872, + "num_input_tokens_seen": 17748224, + "step": 24735 + }, + { + "epoch": 51.434511434511435, + "grad_norm": 0.48095381259918213, + "learning_rate": 1.5908353699713856e-05, + "loss": 0.1236, + "num_input_tokens_seen": 17751680, + "step": 24740 + }, + { + "epoch": 51.444906444906444, + "grad_norm": 0.2558630704879761, + "learning_rate": 1.5899209131814298e-05, + "loss": 0.0817, + "num_input_tokens_seen": 17755296, + "step": 24745 + }, + { + "epoch": 51.45530145530145, + "grad_norm": 0.3303031623363495, + "learning_rate": 1.5890065967371067e-05, + "loss": 0.1136, + "num_input_tokens_seen": 17758912, + "step": 24750 + }, + { + "epoch": 51.46569646569647, + "grad_norm": 0.4720833897590637, + "learning_rate": 1.5880924207794144e-05, + "loss": 0.1041, + "num_input_tokens_seen": 17762624, + "step": 24755 + }, + { + "epoch": 51.476091476091476, + "grad_norm": 0.6899962425231934, + "learning_rate": 1.5871783854493298e-05, + "loss": 0.0963, + "num_input_tokens_seen": 17766368, + "step": 24760 + }, + { + "epoch": 51.486486486486484, + "grad_norm": 0.3600723445415497, + "learning_rate": 1.5862644908878106e-05, + "loss": 0.1438, + "num_input_tokens_seen": 17769952, + "step": 24765 + }, + { + "epoch": 51.4968814968815, + "grad_norm": 0.2947799861431122, + "learning_rate": 1.5853507372357885e-05, + "loss": 0.088, + "num_input_tokens_seen": 17773632, + "step": 24770 + }, + { + "epoch": 51.50727650727651, + "grad_norm": 0.3693509101867676, + "learning_rate": 1.5844371246341776e-05, + "loss": 0.1102, + "num_input_tokens_seen": 17777184, + "step": 24775 + }, + { + "epoch": 51.517671517671516, + "grad_norm": 0.4810337424278259, + "learning_rate": 1.5835236532238674e-05, + "loss": 0.1235, + "num_input_tokens_seen": 17780992, + "step": 24780 + }, + { + "epoch": 51.528066528066525, + "grad_norm": 0.2067720890045166, + "learning_rate": 1.582610323145727e-05, + "loss": 0.0781, + "num_input_tokens_seen": 17784640, + "step": 24785 + }, + { + "epoch": 51.53846153846154, + "grad_norm": 0.5391148328781128, + "learning_rate": 1.5816971345406035e-05, + "loss": 0.0784, + "num_input_tokens_seen": 17788224, + "step": 24790 + }, + { + "epoch": 51.54885654885655, + "grad_norm": 0.3464488983154297, + "learning_rate": 1.5807840875493225e-05, + "loss": 0.0794, + "num_input_tokens_seen": 17791840, + "step": 24795 + }, + { + "epoch": 51.55925155925156, + "grad_norm": 0.43798547983169556, + "learning_rate": 1.5798711823126854e-05, + "loss": 0.0967, + "num_input_tokens_seen": 17795328, + "step": 24800 + }, + { + "epoch": 51.55925155925156, + "eval_loss": 0.1452546864748001, + "eval_runtime": 7.742, + "eval_samples_per_second": 110.566, + "eval_steps_per_second": 27.642, + "num_input_tokens_seen": 17795328, + "step": 24800 + }, + { + "epoch": 51.56964656964657, + "grad_norm": 0.6114146113395691, + "learning_rate": 1.578958418971477e-05, + "loss": 0.1096, + "num_input_tokens_seen": 17798912, + "step": 24805 + }, + { + "epoch": 51.58004158004158, + "grad_norm": 0.48277151584625244, + "learning_rate": 1.578045797666453e-05, + "loss": 0.0964, + "num_input_tokens_seen": 17802400, + "step": 24810 + }, + { + "epoch": 51.59043659043659, + "grad_norm": 0.42948052287101746, + "learning_rate": 1.5771333185383548e-05, + "loss": 0.0835, + "num_input_tokens_seen": 17806144, + "step": 24815 + }, + { + "epoch": 51.6008316008316, + "grad_norm": 0.48721015453338623, + "learning_rate": 1.576220981727895e-05, + "loss": 0.1141, + "num_input_tokens_seen": 17809696, + "step": 24820 + }, + { + "epoch": 51.61122661122661, + "grad_norm": 0.20711083710193634, + "learning_rate": 1.575308787375769e-05, + "loss": 0.0974, + "num_input_tokens_seen": 17813280, + "step": 24825 + }, + { + "epoch": 51.62162162162162, + "grad_norm": 0.29027336835861206, + "learning_rate": 1.5743967356226492e-05, + "loss": 0.116, + "num_input_tokens_seen": 17816832, + "step": 24830 + }, + { + "epoch": 51.63201663201663, + "grad_norm": 0.4124643802642822, + "learning_rate": 1.5734848266091835e-05, + "loss": 0.0716, + "num_input_tokens_seen": 17820448, + "step": 24835 + }, + { + "epoch": 51.642411642411645, + "grad_norm": 0.6114290952682495, + "learning_rate": 1.572573060476001e-05, + "loss": 0.1056, + "num_input_tokens_seen": 17824096, + "step": 24840 + }, + { + "epoch": 51.65280665280665, + "grad_norm": 0.5315593481063843, + "learning_rate": 1.5716614373637085e-05, + "loss": 0.1138, + "num_input_tokens_seen": 17827808, + "step": 24845 + }, + { + "epoch": 51.66320166320166, + "grad_norm": 0.15469039976596832, + "learning_rate": 1.570749957412887e-05, + "loss": 0.1371, + "num_input_tokens_seen": 17831296, + "step": 24850 + }, + { + "epoch": 51.67359667359668, + "grad_norm": 0.2142082303762436, + "learning_rate": 1.5698386207641013e-05, + "loss": 0.1107, + "num_input_tokens_seen": 17834688, + "step": 24855 + }, + { + "epoch": 51.683991683991685, + "grad_norm": 0.7025565505027771, + "learning_rate": 1.5689274275578884e-05, + "loss": 0.0696, + "num_input_tokens_seen": 17838208, + "step": 24860 + }, + { + "epoch": 51.694386694386694, + "grad_norm": 0.5273837447166443, + "learning_rate": 1.5680163779347667e-05, + "loss": 0.0982, + "num_input_tokens_seen": 17841824, + "step": 24865 + }, + { + "epoch": 51.7047817047817, + "grad_norm": 0.15333996713161469, + "learning_rate": 1.5671054720352327e-05, + "loss": 0.1063, + "num_input_tokens_seen": 17845472, + "step": 24870 + }, + { + "epoch": 51.71517671517672, + "grad_norm": 0.3917558789253235, + "learning_rate": 1.566194709999757e-05, + "loss": 0.1348, + "num_input_tokens_seen": 17848960, + "step": 24875 + }, + { + "epoch": 51.725571725571726, + "grad_norm": 0.9187566041946411, + "learning_rate": 1.5652840919687933e-05, + "loss": 0.0906, + "num_input_tokens_seen": 17852480, + "step": 24880 + }, + { + "epoch": 51.735966735966734, + "grad_norm": 0.28284189105033875, + "learning_rate": 1.5643736180827676e-05, + "loss": 0.1191, + "num_input_tokens_seen": 17856320, + "step": 24885 + }, + { + "epoch": 51.74636174636175, + "grad_norm": 0.26890984177589417, + "learning_rate": 1.5634632884820878e-05, + "loss": 0.0914, + "num_input_tokens_seen": 17859776, + "step": 24890 + }, + { + "epoch": 51.75675675675676, + "grad_norm": 0.5487330555915833, + "learning_rate": 1.5625531033071395e-05, + "loss": 0.0989, + "num_input_tokens_seen": 17863360, + "step": 24895 + }, + { + "epoch": 51.767151767151766, + "grad_norm": 0.7186499238014221, + "learning_rate": 1.5616430626982828e-05, + "loss": 0.1027, + "num_input_tokens_seen": 17866976, + "step": 24900 + }, + { + "epoch": 51.777546777546775, + "grad_norm": 0.23873180150985718, + "learning_rate": 1.5607331667958575e-05, + "loss": 0.1012, + "num_input_tokens_seen": 17870592, + "step": 24905 + }, + { + "epoch": 51.78794178794179, + "grad_norm": 0.7259894609451294, + "learning_rate": 1.5598234157401824e-05, + "loss": 0.0931, + "num_input_tokens_seen": 17874528, + "step": 24910 + }, + { + "epoch": 51.7983367983368, + "grad_norm": 0.3523949384689331, + "learning_rate": 1.5589138096715503e-05, + "loss": 0.1126, + "num_input_tokens_seen": 17878080, + "step": 24915 + }, + { + "epoch": 51.80873180873181, + "grad_norm": 0.228169783949852, + "learning_rate": 1.5580043487302365e-05, + "loss": 0.0971, + "num_input_tokens_seen": 17881728, + "step": 24920 + }, + { + "epoch": 51.81912681912682, + "grad_norm": 0.2048962414264679, + "learning_rate": 1.5570950330564888e-05, + "loss": 0.1284, + "num_input_tokens_seen": 17885504, + "step": 24925 + }, + { + "epoch": 51.82952182952183, + "grad_norm": 0.6124534606933594, + "learning_rate": 1.5561858627905367e-05, + "loss": 0.1113, + "num_input_tokens_seen": 17889088, + "step": 24930 + }, + { + "epoch": 51.83991683991684, + "grad_norm": 0.28364211320877075, + "learning_rate": 1.5552768380725857e-05, + "loss": 0.1362, + "num_input_tokens_seen": 17892576, + "step": 24935 + }, + { + "epoch": 51.85031185031185, + "grad_norm": 0.31703823804855347, + "learning_rate": 1.5543679590428183e-05, + "loss": 0.1462, + "num_input_tokens_seen": 17896032, + "step": 24940 + }, + { + "epoch": 51.86070686070686, + "grad_norm": 0.28799280524253845, + "learning_rate": 1.5534592258413943e-05, + "loss": 0.1078, + "num_input_tokens_seen": 17899616, + "step": 24945 + }, + { + "epoch": 51.87110187110187, + "grad_norm": 0.3314480781555176, + "learning_rate": 1.5525506386084538e-05, + "loss": 0.0832, + "num_input_tokens_seen": 17903136, + "step": 24950 + }, + { + "epoch": 51.88149688149688, + "grad_norm": 0.6531597971916199, + "learning_rate": 1.55164219748411e-05, + "loss": 0.1504, + "num_input_tokens_seen": 17906784, + "step": 24955 + }, + { + "epoch": 51.891891891891895, + "grad_norm": 0.6573678255081177, + "learning_rate": 1.550733902608459e-05, + "loss": 0.1002, + "num_input_tokens_seen": 17910400, + "step": 24960 + }, + { + "epoch": 51.9022869022869, + "grad_norm": 0.2948640286922455, + "learning_rate": 1.549825754121568e-05, + "loss": 0.0936, + "num_input_tokens_seen": 17913920, + "step": 24965 + }, + { + "epoch": 51.91268191268191, + "grad_norm": 0.28343063592910767, + "learning_rate": 1.5489177521634864e-05, + "loss": 0.1083, + "num_input_tokens_seen": 17917376, + "step": 24970 + }, + { + "epoch": 51.92307692307692, + "grad_norm": 0.6478380560874939, + "learning_rate": 1.5480098968742402e-05, + "loss": 0.1193, + "num_input_tokens_seen": 17920896, + "step": 24975 + }, + { + "epoch": 51.933471933471935, + "grad_norm": 0.4757753908634186, + "learning_rate": 1.5471021883938304e-05, + "loss": 0.124, + "num_input_tokens_seen": 17924448, + "step": 24980 + }, + { + "epoch": 51.943866943866944, + "grad_norm": 0.2506638467311859, + "learning_rate": 1.546194626862238e-05, + "loss": 0.0676, + "num_input_tokens_seen": 17927872, + "step": 24985 + }, + { + "epoch": 51.95426195426195, + "grad_norm": 0.3840346336364746, + "learning_rate": 1.5452872124194216e-05, + "loss": 0.0886, + "num_input_tokens_seen": 17931424, + "step": 24990 + }, + { + "epoch": 51.96465696465697, + "grad_norm": 0.31712907552719116, + "learning_rate": 1.5443799452053136e-05, + "loss": 0.0516, + "num_input_tokens_seen": 17934880, + "step": 24995 + }, + { + "epoch": 51.975051975051976, + "grad_norm": 0.3448769450187683, + "learning_rate": 1.543472825359828e-05, + "loss": 0.1266, + "num_input_tokens_seen": 17938368, + "step": 25000 + }, + { + "epoch": 51.975051975051976, + "eval_loss": 0.1447323113679886, + "eval_runtime": 7.7613, + "eval_samples_per_second": 110.291, + "eval_steps_per_second": 27.573, + "num_input_tokens_seen": 17938368, + "step": 25000 + }, + { + "epoch": 51.985446985446984, + "grad_norm": 0.8386533856391907, + "learning_rate": 1.5425658530228522e-05, + "loss": 0.0949, + "num_input_tokens_seen": 17941920, + "step": 25005 + }, + { + "epoch": 51.99584199584199, + "grad_norm": 0.23491285741329193, + "learning_rate": 1.5416590283342546e-05, + "loss": 0.0901, + "num_input_tokens_seen": 17945728, + "step": 25010 + }, + { + "epoch": 52.00623700623701, + "grad_norm": 0.7457054853439331, + "learning_rate": 1.5407523514338783e-05, + "loss": 0.1349, + "num_input_tokens_seen": 17949112, + "step": 25015 + }, + { + "epoch": 52.016632016632016, + "grad_norm": 0.37891191244125366, + "learning_rate": 1.539845822461543e-05, + "loss": 0.1126, + "num_input_tokens_seen": 17952760, + "step": 25020 + }, + { + "epoch": 52.027027027027025, + "grad_norm": 0.5152024030685425, + "learning_rate": 1.538939441557048e-05, + "loss": 0.0773, + "num_input_tokens_seen": 17956248, + "step": 25025 + }, + { + "epoch": 52.03742203742204, + "grad_norm": 0.26143333315849304, + "learning_rate": 1.5380332088601696e-05, + "loss": 0.0989, + "num_input_tokens_seen": 17959896, + "step": 25030 + }, + { + "epoch": 52.04781704781705, + "grad_norm": 0.3897336721420288, + "learning_rate": 1.537127124510658e-05, + "loss": 0.1358, + "num_input_tokens_seen": 17963544, + "step": 25035 + }, + { + "epoch": 52.05821205821206, + "grad_norm": 0.12037506699562073, + "learning_rate": 1.5362211886482457e-05, + "loss": 0.0783, + "num_input_tokens_seen": 17966936, + "step": 25040 + }, + { + "epoch": 52.06860706860707, + "grad_norm": 0.47271186113357544, + "learning_rate": 1.5353154014126363e-05, + "loss": 0.088, + "num_input_tokens_seen": 17970424, + "step": 25045 + }, + { + "epoch": 52.07900207900208, + "grad_norm": 0.3276069462299347, + "learning_rate": 1.534409762943515e-05, + "loss": 0.1139, + "num_input_tokens_seen": 17974008, + "step": 25050 + }, + { + "epoch": 52.08939708939709, + "grad_norm": 0.6147036552429199, + "learning_rate": 1.5335042733805438e-05, + "loss": 0.0838, + "num_input_tokens_seen": 17977752, + "step": 25055 + }, + { + "epoch": 52.0997920997921, + "grad_norm": 0.1900138258934021, + "learning_rate": 1.532598932863358e-05, + "loss": 0.0693, + "num_input_tokens_seen": 17981240, + "step": 25060 + }, + { + "epoch": 52.11018711018711, + "grad_norm": 0.23668593168258667, + "learning_rate": 1.531693741531574e-05, + "loss": 0.1478, + "num_input_tokens_seen": 17984920, + "step": 25065 + }, + { + "epoch": 52.12058212058212, + "grad_norm": 0.6837069392204285, + "learning_rate": 1.5307886995247844e-05, + "loss": 0.1415, + "num_input_tokens_seen": 17988568, + "step": 25070 + }, + { + "epoch": 52.13097713097713, + "grad_norm": 0.23225858807563782, + "learning_rate": 1.529883806982557e-05, + "loss": 0.0873, + "num_input_tokens_seen": 17992056, + "step": 25075 + }, + { + "epoch": 52.141372141372145, + "grad_norm": 0.25741109251976013, + "learning_rate": 1.5289790640444376e-05, + "loss": 0.116, + "num_input_tokens_seen": 17995736, + "step": 25080 + }, + { + "epoch": 52.15176715176715, + "grad_norm": 0.24623416364192963, + "learning_rate": 1.5280744708499494e-05, + "loss": 0.0576, + "num_input_tokens_seen": 17999288, + "step": 25085 + }, + { + "epoch": 52.16216216216216, + "grad_norm": 0.40185248851776123, + "learning_rate": 1.527170027538591e-05, + "loss": 0.0809, + "num_input_tokens_seen": 18002904, + "step": 25090 + }, + { + "epoch": 52.17255717255717, + "grad_norm": 0.32745349407196045, + "learning_rate": 1.5262657342498407e-05, + "loss": 0.0812, + "num_input_tokens_seen": 18006488, + "step": 25095 + }, + { + "epoch": 52.182952182952185, + "grad_norm": 0.24560721218585968, + "learning_rate": 1.52536159112315e-05, + "loss": 0.1247, + "num_input_tokens_seen": 18009912, + "step": 25100 + }, + { + "epoch": 52.19334719334719, + "grad_norm": 0.6159741282463074, + "learning_rate": 1.5244575982979497e-05, + "loss": 0.1003, + "num_input_tokens_seen": 18013464, + "step": 25105 + }, + { + "epoch": 52.2037422037422, + "grad_norm": 0.38627713918685913, + "learning_rate": 1.5235537559136487e-05, + "loss": 0.1026, + "num_input_tokens_seen": 18017048, + "step": 25110 + }, + { + "epoch": 52.21413721413722, + "grad_norm": 0.5212156772613525, + "learning_rate": 1.5226500641096286e-05, + "loss": 0.1218, + "num_input_tokens_seen": 18020536, + "step": 25115 + }, + { + "epoch": 52.224532224532226, + "grad_norm": 0.43091973662376404, + "learning_rate": 1.5217465230252509e-05, + "loss": 0.0719, + "num_input_tokens_seen": 18024088, + "step": 25120 + }, + { + "epoch": 52.234927234927234, + "grad_norm": 0.9940339922904968, + "learning_rate": 1.5208431327998523e-05, + "loss": 0.1144, + "num_input_tokens_seen": 18027576, + "step": 25125 + }, + { + "epoch": 52.24532224532224, + "grad_norm": 0.1848049759864807, + "learning_rate": 1.5199398935727477e-05, + "loss": 0.109, + "num_input_tokens_seen": 18031000, + "step": 25130 + }, + { + "epoch": 52.25571725571726, + "grad_norm": 0.3305843770503998, + "learning_rate": 1.5190368054832282e-05, + "loss": 0.0909, + "num_input_tokens_seen": 18034744, + "step": 25135 + }, + { + "epoch": 52.266112266112266, + "grad_norm": 0.22333474457263947, + "learning_rate": 1.5181338686705601e-05, + "loss": 0.0863, + "num_input_tokens_seen": 18038296, + "step": 25140 + }, + { + "epoch": 52.276507276507274, + "grad_norm": 0.28540587425231934, + "learning_rate": 1.5172310832739889e-05, + "loss": 0.0802, + "num_input_tokens_seen": 18042104, + "step": 25145 + }, + { + "epoch": 52.28690228690229, + "grad_norm": 0.40301352739334106, + "learning_rate": 1.5163284494327346e-05, + "loss": 0.075, + "num_input_tokens_seen": 18045688, + "step": 25150 + }, + { + "epoch": 52.2972972972973, + "grad_norm": 0.4672083854675293, + "learning_rate": 1.5154259672859952e-05, + "loss": 0.1074, + "num_input_tokens_seen": 18049240, + "step": 25155 + }, + { + "epoch": 52.30769230769231, + "grad_norm": 0.6232144832611084, + "learning_rate": 1.5145236369729452e-05, + "loss": 0.0866, + "num_input_tokens_seen": 18052824, + "step": 25160 + }, + { + "epoch": 52.318087318087315, + "grad_norm": 0.18213781714439392, + "learning_rate": 1.5136214586327335e-05, + "loss": 0.0675, + "num_input_tokens_seen": 18056312, + "step": 25165 + }, + { + "epoch": 52.32848232848233, + "grad_norm": 0.27362290024757385, + "learning_rate": 1.5127194324044885e-05, + "loss": 0.1103, + "num_input_tokens_seen": 18059896, + "step": 25170 + }, + { + "epoch": 52.33887733887734, + "grad_norm": 0.2881198525428772, + "learning_rate": 1.5118175584273148e-05, + "loss": 0.1029, + "num_input_tokens_seen": 18063480, + "step": 25175 + }, + { + "epoch": 52.34927234927235, + "grad_norm": 0.3159498870372772, + "learning_rate": 1.5109158368402909e-05, + "loss": 0.1006, + "num_input_tokens_seen": 18067160, + "step": 25180 + }, + { + "epoch": 52.35966735966736, + "grad_norm": 0.3127395212650299, + "learning_rate": 1.5100142677824753e-05, + "loss": 0.122, + "num_input_tokens_seen": 18070616, + "step": 25185 + }, + { + "epoch": 52.37006237006237, + "grad_norm": 0.40331295132637024, + "learning_rate": 1.509112851392901e-05, + "loss": 0.1243, + "num_input_tokens_seen": 18074168, + "step": 25190 + }, + { + "epoch": 52.38045738045738, + "grad_norm": 0.36328279972076416, + "learning_rate": 1.5082115878105763e-05, + "loss": 0.1051, + "num_input_tokens_seen": 18077720, + "step": 25195 + }, + { + "epoch": 52.39085239085239, + "grad_norm": 0.27500271797180176, + "learning_rate": 1.5073104771744892e-05, + "loss": 0.0748, + "num_input_tokens_seen": 18081176, + "step": 25200 + }, + { + "epoch": 52.39085239085239, + "eval_loss": 0.14433473348617554, + "eval_runtime": 7.7578, + "eval_samples_per_second": 110.341, + "eval_steps_per_second": 27.585, + "num_input_tokens_seen": 18081176, + "step": 25200 + }, + { + "epoch": 52.4012474012474, + "grad_norm": 0.6571601629257202, + "learning_rate": 1.5064095196236006e-05, + "loss": 0.1049, + "num_input_tokens_seen": 18084792, + "step": 25205 + }, + { + "epoch": 52.41164241164241, + "grad_norm": 0.8545929193496704, + "learning_rate": 1.50550871529685e-05, + "loss": 0.1306, + "num_input_tokens_seen": 18088312, + "step": 25210 + }, + { + "epoch": 52.42203742203742, + "grad_norm": 0.520564079284668, + "learning_rate": 1.5046080643331546e-05, + "loss": 0.093, + "num_input_tokens_seen": 18091992, + "step": 25215 + }, + { + "epoch": 52.432432432432435, + "grad_norm": 0.4829094111919403, + "learning_rate": 1.5037075668714028e-05, + "loss": 0.1013, + "num_input_tokens_seen": 18095512, + "step": 25220 + }, + { + "epoch": 52.44282744282744, + "grad_norm": 0.16999821364879608, + "learning_rate": 1.5028072230504656e-05, + "loss": 0.1085, + "num_input_tokens_seen": 18099064, + "step": 25225 + }, + { + "epoch": 52.45322245322245, + "grad_norm": 0.38882875442504883, + "learning_rate": 1.5019070330091861e-05, + "loss": 0.1184, + "num_input_tokens_seen": 18102616, + "step": 25230 + }, + { + "epoch": 52.46361746361746, + "grad_norm": 0.32254862785339355, + "learning_rate": 1.5010069968863843e-05, + "loss": 0.1338, + "num_input_tokens_seen": 18106392, + "step": 25235 + }, + { + "epoch": 52.474012474012476, + "grad_norm": 0.2605293393135071, + "learning_rate": 1.5001071148208584e-05, + "loss": 0.0627, + "num_input_tokens_seen": 18109880, + "step": 25240 + }, + { + "epoch": 52.484407484407484, + "grad_norm": 0.3365158140659332, + "learning_rate": 1.49920738695138e-05, + "loss": 0.0827, + "num_input_tokens_seen": 18113464, + "step": 25245 + }, + { + "epoch": 52.49480249480249, + "grad_norm": 0.43753883242607117, + "learning_rate": 1.4983078134166995e-05, + "loss": 0.0993, + "num_input_tokens_seen": 18117176, + "step": 25250 + }, + { + "epoch": 52.50519750519751, + "grad_norm": 0.25141191482543945, + "learning_rate": 1.4974083943555428e-05, + "loss": 0.0778, + "num_input_tokens_seen": 18120920, + "step": 25255 + }, + { + "epoch": 52.515592515592516, + "grad_norm": 0.20311962068080902, + "learning_rate": 1.496509129906611e-05, + "loss": 0.0805, + "num_input_tokens_seen": 18124408, + "step": 25260 + }, + { + "epoch": 52.525987525987524, + "grad_norm": 0.8803796768188477, + "learning_rate": 1.4956100202085809e-05, + "loss": 0.1451, + "num_input_tokens_seen": 18127896, + "step": 25265 + }, + { + "epoch": 52.53638253638254, + "grad_norm": 0.43688085675239563, + "learning_rate": 1.4947110654001093e-05, + "loss": 0.1076, + "num_input_tokens_seen": 18131480, + "step": 25270 + }, + { + "epoch": 52.54677754677755, + "grad_norm": 0.37782803177833557, + "learning_rate": 1.4938122656198234e-05, + "loss": 0.0796, + "num_input_tokens_seen": 18135128, + "step": 25275 + }, + { + "epoch": 52.55717255717256, + "grad_norm": 0.23198740184307098, + "learning_rate": 1.4929136210063316e-05, + "loss": 0.0951, + "num_input_tokens_seen": 18138680, + "step": 25280 + }, + { + "epoch": 52.567567567567565, + "grad_norm": 0.8592529296875, + "learning_rate": 1.4920151316982146e-05, + "loss": 0.1239, + "num_input_tokens_seen": 18142360, + "step": 25285 + }, + { + "epoch": 52.57796257796258, + "grad_norm": 0.25890061259269714, + "learning_rate": 1.4911167978340312e-05, + "loss": 0.0921, + "num_input_tokens_seen": 18145880, + "step": 25290 + }, + { + "epoch": 52.58835758835759, + "grad_norm": 0.1504083126783371, + "learning_rate": 1.4902186195523166e-05, + "loss": 0.1393, + "num_input_tokens_seen": 18149496, + "step": 25295 + }, + { + "epoch": 52.5987525987526, + "grad_norm": 0.19181741774082184, + "learning_rate": 1.4893205969915805e-05, + "loss": 0.1167, + "num_input_tokens_seen": 18153016, + "step": 25300 + }, + { + "epoch": 52.60914760914761, + "grad_norm": 0.33036383986473083, + "learning_rate": 1.4884227302903086e-05, + "loss": 0.0985, + "num_input_tokens_seen": 18156600, + "step": 25305 + }, + { + "epoch": 52.61954261954262, + "grad_norm": 0.38994744420051575, + "learning_rate": 1.4875250195869653e-05, + "loss": 0.0753, + "num_input_tokens_seen": 18160216, + "step": 25310 + }, + { + "epoch": 52.62993762993763, + "grad_norm": 0.30836057662963867, + "learning_rate": 1.4866274650199862e-05, + "loss": 0.1108, + "num_input_tokens_seen": 18163768, + "step": 25315 + }, + { + "epoch": 52.64033264033264, + "grad_norm": 0.3129492402076721, + "learning_rate": 1.485730066727788e-05, + "loss": 0.103, + "num_input_tokens_seen": 18167512, + "step": 25320 + }, + { + "epoch": 52.65072765072765, + "grad_norm": 0.20893824100494385, + "learning_rate": 1.4848328248487586e-05, + "loss": 0.0872, + "num_input_tokens_seen": 18171032, + "step": 25325 + }, + { + "epoch": 52.66112266112266, + "grad_norm": 0.2831841707229614, + "learning_rate": 1.4839357395212656e-05, + "loss": 0.1212, + "num_input_tokens_seen": 18174520, + "step": 25330 + }, + { + "epoch": 52.67151767151767, + "grad_norm": 0.2846580743789673, + "learning_rate": 1.4830388108836502e-05, + "loss": 0.1387, + "num_input_tokens_seen": 18178200, + "step": 25335 + }, + { + "epoch": 52.681912681912685, + "grad_norm": 0.19500409066677094, + "learning_rate": 1.4821420390742299e-05, + "loss": 0.1029, + "num_input_tokens_seen": 18181752, + "step": 25340 + }, + { + "epoch": 52.69230769230769, + "grad_norm": 0.5922791957855225, + "learning_rate": 1.4812454242312979e-05, + "loss": 0.1376, + "num_input_tokens_seen": 18185432, + "step": 25345 + }, + { + "epoch": 52.7027027027027, + "grad_norm": 1.2330307960510254, + "learning_rate": 1.4803489664931253e-05, + "loss": 0.1343, + "num_input_tokens_seen": 18189112, + "step": 25350 + }, + { + "epoch": 52.71309771309771, + "grad_norm": 0.5090454816818237, + "learning_rate": 1.4794526659979544e-05, + "loss": 0.0771, + "num_input_tokens_seen": 18192760, + "step": 25355 + }, + { + "epoch": 52.723492723492726, + "grad_norm": 0.520143449306488, + "learning_rate": 1.4785565228840086e-05, + "loss": 0.1048, + "num_input_tokens_seen": 18196504, + "step": 25360 + }, + { + "epoch": 52.733887733887734, + "grad_norm": 0.26179251074790955, + "learning_rate": 1.4776605372894819e-05, + "loss": 0.0941, + "num_input_tokens_seen": 18200056, + "step": 25365 + }, + { + "epoch": 52.74428274428274, + "grad_norm": 0.8981397747993469, + "learning_rate": 1.4767647093525488e-05, + "loss": 0.1221, + "num_input_tokens_seen": 18203576, + "step": 25370 + }, + { + "epoch": 52.75467775467776, + "grad_norm": 0.3984142243862152, + "learning_rate": 1.4758690392113566e-05, + "loss": 0.094, + "num_input_tokens_seen": 18207032, + "step": 25375 + }, + { + "epoch": 52.765072765072766, + "grad_norm": 0.5357096195220947, + "learning_rate": 1.4749735270040276e-05, + "loss": 0.0753, + "num_input_tokens_seen": 18210648, + "step": 25380 + }, + { + "epoch": 52.775467775467774, + "grad_norm": 0.5975549817085266, + "learning_rate": 1.4740781728686623e-05, + "loss": 0.1275, + "num_input_tokens_seen": 18214200, + "step": 25385 + }, + { + "epoch": 52.78586278586278, + "grad_norm": 0.3221113681793213, + "learning_rate": 1.4731829769433358e-05, + "loss": 0.1133, + "num_input_tokens_seen": 18217624, + "step": 25390 + }, + { + "epoch": 52.7962577962578, + "grad_norm": 0.23464681208133698, + "learning_rate": 1.4722879393660976e-05, + "loss": 0.095, + "num_input_tokens_seen": 18221208, + "step": 25395 + }, + { + "epoch": 52.80665280665281, + "grad_norm": 0.29513412714004517, + "learning_rate": 1.4713930602749748e-05, + "loss": 0.1336, + "num_input_tokens_seen": 18224696, + "step": 25400 + }, + { + "epoch": 52.80665280665281, + "eval_loss": 0.14528189599514008, + "eval_runtime": 7.7508, + "eval_samples_per_second": 110.441, + "eval_steps_per_second": 27.61, + "num_input_tokens_seen": 18224696, + "step": 25400 + }, + { + "epoch": 52.817047817047815, + "grad_norm": 0.47988784313201904, + "learning_rate": 1.470498339807968e-05, + "loss": 0.0779, + "num_input_tokens_seen": 18228504, + "step": 25405 + }, + { + "epoch": 52.82744282744283, + "grad_norm": 0.2117346078157425, + "learning_rate": 1.4696037781030542e-05, + "loss": 0.0806, + "num_input_tokens_seen": 18232152, + "step": 25410 + }, + { + "epoch": 52.83783783783784, + "grad_norm": 0.4493228495121002, + "learning_rate": 1.4687093752981876e-05, + "loss": 0.0879, + "num_input_tokens_seen": 18235576, + "step": 25415 + }, + { + "epoch": 52.84823284823285, + "grad_norm": 0.23342546820640564, + "learning_rate": 1.4678151315312943e-05, + "loss": 0.1117, + "num_input_tokens_seen": 18239064, + "step": 25420 + }, + { + "epoch": 52.858627858627855, + "grad_norm": 0.30869725346565247, + "learning_rate": 1.4669210469402789e-05, + "loss": 0.0982, + "num_input_tokens_seen": 18242712, + "step": 25425 + }, + { + "epoch": 52.86902286902287, + "grad_norm": 0.34584251046180725, + "learning_rate": 1.4660271216630218e-05, + "loss": 0.1168, + "num_input_tokens_seen": 18246328, + "step": 25430 + }, + { + "epoch": 52.87941787941788, + "grad_norm": 0.55291748046875, + "learning_rate": 1.4651333558373748e-05, + "loss": 0.1044, + "num_input_tokens_seen": 18249944, + "step": 25435 + }, + { + "epoch": 52.88981288981289, + "grad_norm": 0.43440568447113037, + "learning_rate": 1.4642397496011707e-05, + "loss": 0.1306, + "num_input_tokens_seen": 18253688, + "step": 25440 + }, + { + "epoch": 52.9002079002079, + "grad_norm": 0.23819375038146973, + "learning_rate": 1.4633463030922129e-05, + "loss": 0.1248, + "num_input_tokens_seen": 18257208, + "step": 25445 + }, + { + "epoch": 52.91060291060291, + "grad_norm": 0.7804007530212402, + "learning_rate": 1.462453016448282e-05, + "loss": 0.094, + "num_input_tokens_seen": 18260888, + "step": 25450 + }, + { + "epoch": 52.92099792099792, + "grad_norm": 0.18383647501468658, + "learning_rate": 1.4615598898071354e-05, + "loss": 0.0841, + "num_input_tokens_seen": 18264504, + "step": 25455 + }, + { + "epoch": 52.931392931392935, + "grad_norm": 0.2175324410200119, + "learning_rate": 1.4606669233065026e-05, + "loss": 0.1032, + "num_input_tokens_seen": 18268216, + "step": 25460 + }, + { + "epoch": 52.94178794178794, + "grad_norm": 0.24152034521102905, + "learning_rate": 1.4597741170840914e-05, + "loss": 0.0791, + "num_input_tokens_seen": 18271768, + "step": 25465 + }, + { + "epoch": 52.95218295218295, + "grad_norm": 0.34612980484962463, + "learning_rate": 1.4588814712775853e-05, + "loss": 0.0834, + "num_input_tokens_seen": 18275320, + "step": 25470 + }, + { + "epoch": 52.96257796257796, + "grad_norm": 0.5453028082847595, + "learning_rate": 1.4579889860246382e-05, + "loss": 0.1155, + "num_input_tokens_seen": 18278776, + "step": 25475 + }, + { + "epoch": 52.972972972972975, + "grad_norm": 0.2673281133174896, + "learning_rate": 1.457096661462885e-05, + "loss": 0.0912, + "num_input_tokens_seen": 18282360, + "step": 25480 + }, + { + "epoch": 52.983367983367984, + "grad_norm": 0.24492275714874268, + "learning_rate": 1.4562044977299322e-05, + "loss": 0.0709, + "num_input_tokens_seen": 18285880, + "step": 25485 + }, + { + "epoch": 52.99376299376299, + "grad_norm": 0.4889525771141052, + "learning_rate": 1.4553124949633623e-05, + "loss": 0.0755, + "num_input_tokens_seen": 18289496, + "step": 25490 + }, + { + "epoch": 53.00415800415801, + "grad_norm": 0.2024327516555786, + "learning_rate": 1.4544206533007354e-05, + "loss": 0.1132, + "num_input_tokens_seen": 18293008, + "step": 25495 + }, + { + "epoch": 53.014553014553016, + "grad_norm": 0.29719865322113037, + "learning_rate": 1.4535289728795821e-05, + "loss": 0.089, + "num_input_tokens_seen": 18296528, + "step": 25500 + }, + { + "epoch": 53.024948024948024, + "grad_norm": 0.2658323347568512, + "learning_rate": 1.4526374538374132e-05, + "loss": 0.0876, + "num_input_tokens_seen": 18300240, + "step": 25505 + }, + { + "epoch": 53.03534303534303, + "grad_norm": 0.27382826805114746, + "learning_rate": 1.4517460963117097e-05, + "loss": 0.0672, + "num_input_tokens_seen": 18303984, + "step": 25510 + }, + { + "epoch": 53.04573804573805, + "grad_norm": 0.39459848403930664, + "learning_rate": 1.4508549004399314e-05, + "loss": 0.0943, + "num_input_tokens_seen": 18307472, + "step": 25515 + }, + { + "epoch": 53.056133056133056, + "grad_norm": 0.4216907024383545, + "learning_rate": 1.449963866359513e-05, + "loss": 0.1145, + "num_input_tokens_seen": 18311056, + "step": 25520 + }, + { + "epoch": 53.066528066528065, + "grad_norm": 0.22479918599128723, + "learning_rate": 1.4490729942078607e-05, + "loss": 0.117, + "num_input_tokens_seen": 18314704, + "step": 25525 + }, + { + "epoch": 53.07692307692308, + "grad_norm": 0.32860100269317627, + "learning_rate": 1.4481822841223608e-05, + "loss": 0.0991, + "num_input_tokens_seen": 18318352, + "step": 25530 + }, + { + "epoch": 53.08731808731809, + "grad_norm": 0.270430326461792, + "learning_rate": 1.4472917362403704e-05, + "loss": 0.0871, + "num_input_tokens_seen": 18322064, + "step": 25535 + }, + { + "epoch": 53.0977130977131, + "grad_norm": 0.3906628489494324, + "learning_rate": 1.4464013506992224e-05, + "loss": 0.0824, + "num_input_tokens_seen": 18325680, + "step": 25540 + }, + { + "epoch": 53.108108108108105, + "grad_norm": 0.4548051953315735, + "learning_rate": 1.4455111276362277e-05, + "loss": 0.0967, + "num_input_tokens_seen": 18329232, + "step": 25545 + }, + { + "epoch": 53.11850311850312, + "grad_norm": 0.7143222093582153, + "learning_rate": 1.4446210671886676e-05, + "loss": 0.0919, + "num_input_tokens_seen": 18332912, + "step": 25550 + }, + { + "epoch": 53.12889812889813, + "grad_norm": 0.24520577490329742, + "learning_rate": 1.4437311694938015e-05, + "loss": 0.0929, + "num_input_tokens_seen": 18336464, + "step": 25555 + }, + { + "epoch": 53.13929313929314, + "grad_norm": 0.29481950402259827, + "learning_rate": 1.442841434688864e-05, + "loss": 0.1178, + "num_input_tokens_seen": 18340080, + "step": 25560 + }, + { + "epoch": 53.14968814968815, + "grad_norm": 0.21104826033115387, + "learning_rate": 1.4419518629110615e-05, + "loss": 0.1037, + "num_input_tokens_seen": 18343792, + "step": 25565 + }, + { + "epoch": 53.16008316008316, + "grad_norm": 0.22903504967689514, + "learning_rate": 1.4410624542975778e-05, + "loss": 0.1271, + "num_input_tokens_seen": 18347536, + "step": 25570 + }, + { + "epoch": 53.17047817047817, + "grad_norm": 0.15418054163455963, + "learning_rate": 1.4401732089855724e-05, + "loss": 0.0849, + "num_input_tokens_seen": 18351248, + "step": 25575 + }, + { + "epoch": 53.18087318087318, + "grad_norm": 0.6365135312080383, + "learning_rate": 1.4392841271121754e-05, + "loss": 0.1071, + "num_input_tokens_seen": 18354800, + "step": 25580 + }, + { + "epoch": 53.19126819126819, + "grad_norm": 0.2273550033569336, + "learning_rate": 1.438395208814497e-05, + "loss": 0.0846, + "num_input_tokens_seen": 18358352, + "step": 25585 + }, + { + "epoch": 53.2016632016632, + "grad_norm": 0.204119473695755, + "learning_rate": 1.4375064542296174e-05, + "loss": 0.1293, + "num_input_tokens_seen": 18362032, + "step": 25590 + }, + { + "epoch": 53.21205821205821, + "grad_norm": 0.5769681334495544, + "learning_rate": 1.4366178634945946e-05, + "loss": 0.0803, + "num_input_tokens_seen": 18365456, + "step": 25595 + }, + { + "epoch": 53.222453222453225, + "grad_norm": 0.22980152070522308, + "learning_rate": 1.4357294367464616e-05, + "loss": 0.0805, + "num_input_tokens_seen": 18369136, + "step": 25600 + }, + { + "epoch": 53.222453222453225, + "eval_loss": 0.1441621482372284, + "eval_runtime": 7.7475, + "eval_samples_per_second": 110.487, + "eval_steps_per_second": 27.622, + "num_input_tokens_seen": 18369136, + "step": 25600 + }, + { + "epoch": 53.232848232848234, + "grad_norm": 0.8152276277542114, + "learning_rate": 1.434841174122224e-05, + "loss": 0.1375, + "num_input_tokens_seen": 18372752, + "step": 25605 + }, + { + "epoch": 53.24324324324324, + "grad_norm": 0.23302936553955078, + "learning_rate": 1.4339530757588615e-05, + "loss": 0.1104, + "num_input_tokens_seen": 18376336, + "step": 25610 + }, + { + "epoch": 53.25363825363825, + "grad_norm": 0.766192615032196, + "learning_rate": 1.433065141793333e-05, + "loss": 0.1077, + "num_input_tokens_seen": 18380080, + "step": 25615 + }, + { + "epoch": 53.264033264033266, + "grad_norm": 0.565602719783783, + "learning_rate": 1.4321773723625665e-05, + "loss": 0.1024, + "num_input_tokens_seen": 18383696, + "step": 25620 + }, + { + "epoch": 53.274428274428274, + "grad_norm": 0.3404081463813782, + "learning_rate": 1.4312897676034693e-05, + "loss": 0.1054, + "num_input_tokens_seen": 18387376, + "step": 25625 + }, + { + "epoch": 53.28482328482328, + "grad_norm": 0.22960391640663147, + "learning_rate": 1.4304023276529188e-05, + "loss": 0.0942, + "num_input_tokens_seen": 18390832, + "step": 25630 + }, + { + "epoch": 53.2952182952183, + "grad_norm": 0.1460946500301361, + "learning_rate": 1.4295150526477712e-05, + "loss": 0.0457, + "num_input_tokens_seen": 18394416, + "step": 25635 + }, + { + "epoch": 53.305613305613306, + "grad_norm": 0.45010611414909363, + "learning_rate": 1.4286279427248562e-05, + "loss": 0.1155, + "num_input_tokens_seen": 18398032, + "step": 25640 + }, + { + "epoch": 53.316008316008315, + "grad_norm": 0.5812288522720337, + "learning_rate": 1.4277409980209747e-05, + "loss": 0.1091, + "num_input_tokens_seen": 18401520, + "step": 25645 + }, + { + "epoch": 53.32640332640332, + "grad_norm": 0.2668590545654297, + "learning_rate": 1.4268542186729061e-05, + "loss": 0.1092, + "num_input_tokens_seen": 18405168, + "step": 25650 + }, + { + "epoch": 53.33679833679834, + "grad_norm": 0.26264408230781555, + "learning_rate": 1.4259676048174043e-05, + "loss": 0.1115, + "num_input_tokens_seen": 18408592, + "step": 25655 + }, + { + "epoch": 53.34719334719335, + "grad_norm": 0.4433216154575348, + "learning_rate": 1.4250811565911937e-05, + "loss": 0.1215, + "num_input_tokens_seen": 18412176, + "step": 25660 + }, + { + "epoch": 53.357588357588355, + "grad_norm": 0.3071039319038391, + "learning_rate": 1.4241948741309782e-05, + "loss": 0.1167, + "num_input_tokens_seen": 18415792, + "step": 25665 + }, + { + "epoch": 53.36798336798337, + "grad_norm": 0.4170471429824829, + "learning_rate": 1.4233087575734317e-05, + "loss": 0.1158, + "num_input_tokens_seen": 18419408, + "step": 25670 + }, + { + "epoch": 53.37837837837838, + "grad_norm": 0.5833674669265747, + "learning_rate": 1.422422807055206e-05, + "loss": 0.141, + "num_input_tokens_seen": 18422864, + "step": 25675 + }, + { + "epoch": 53.38877338877339, + "grad_norm": 1.0418870449066162, + "learning_rate": 1.4215370227129243e-05, + "loss": 0.0801, + "num_input_tokens_seen": 18426416, + "step": 25680 + }, + { + "epoch": 53.3991683991684, + "grad_norm": 0.3962218165397644, + "learning_rate": 1.4206514046831876e-05, + "loss": 0.1542, + "num_input_tokens_seen": 18430000, + "step": 25685 + }, + { + "epoch": 53.40956340956341, + "grad_norm": 0.48769500851631165, + "learning_rate": 1.419765953102567e-05, + "loss": 0.0884, + "num_input_tokens_seen": 18433488, + "step": 25690 + }, + { + "epoch": 53.41995841995842, + "grad_norm": 0.26770198345184326, + "learning_rate": 1.4188806681076125e-05, + "loss": 0.089, + "num_input_tokens_seen": 18437008, + "step": 25695 + }, + { + "epoch": 53.43035343035343, + "grad_norm": 0.3065144121646881, + "learning_rate": 1.4179955498348443e-05, + "loss": 0.0819, + "num_input_tokens_seen": 18440496, + "step": 25700 + }, + { + "epoch": 53.44074844074844, + "grad_norm": 1.0413639545440674, + "learning_rate": 1.4171105984207605e-05, + "loss": 0.1405, + "num_input_tokens_seen": 18444048, + "step": 25705 + }, + { + "epoch": 53.45114345114345, + "grad_norm": 0.1891666054725647, + "learning_rate": 1.4162258140018304e-05, + "loss": 0.0617, + "num_input_tokens_seen": 18447600, + "step": 25710 + }, + { + "epoch": 53.46153846153846, + "grad_norm": 0.6892057657241821, + "learning_rate": 1.4153411967144986e-05, + "loss": 0.1572, + "num_input_tokens_seen": 18451440, + "step": 25715 + }, + { + "epoch": 53.471933471933475, + "grad_norm": 0.9248117804527283, + "learning_rate": 1.4144567466951864e-05, + "loss": 0.1449, + "num_input_tokens_seen": 18454864, + "step": 25720 + }, + { + "epoch": 53.482328482328484, + "grad_norm": 0.5397647023200989, + "learning_rate": 1.4135724640802844e-05, + "loss": 0.1092, + "num_input_tokens_seen": 18458320, + "step": 25725 + }, + { + "epoch": 53.49272349272349, + "grad_norm": 0.35685819387435913, + "learning_rate": 1.4126883490061615e-05, + "loss": 0.102, + "num_input_tokens_seen": 18461744, + "step": 25730 + }, + { + "epoch": 53.5031185031185, + "grad_norm": 0.37613096833229065, + "learning_rate": 1.4118044016091603e-05, + "loss": 0.0935, + "num_input_tokens_seen": 18465264, + "step": 25735 + }, + { + "epoch": 53.513513513513516, + "grad_norm": 0.40370747447013855, + "learning_rate": 1.410920622025594e-05, + "loss": 0.0967, + "num_input_tokens_seen": 18468720, + "step": 25740 + }, + { + "epoch": 53.523908523908524, + "grad_norm": 0.7623723149299622, + "learning_rate": 1.4100370103917554e-05, + "loss": 0.1243, + "num_input_tokens_seen": 18472336, + "step": 25745 + }, + { + "epoch": 53.53430353430353, + "grad_norm": 0.4327532947063446, + "learning_rate": 1.409153566843907e-05, + "loss": 0.1382, + "num_input_tokens_seen": 18476016, + "step": 25750 + }, + { + "epoch": 53.54469854469855, + "grad_norm": 0.6323249340057373, + "learning_rate": 1.408270291518286e-05, + "loss": 0.1113, + "num_input_tokens_seen": 18479536, + "step": 25755 + }, + { + "epoch": 53.555093555093556, + "grad_norm": 0.312122106552124, + "learning_rate": 1.407387184551107e-05, + "loss": 0.087, + "num_input_tokens_seen": 18483120, + "step": 25760 + }, + { + "epoch": 53.565488565488565, + "grad_norm": 0.5054182410240173, + "learning_rate": 1.4065042460785532e-05, + "loss": 0.094, + "num_input_tokens_seen": 18486768, + "step": 25765 + }, + { + "epoch": 53.57588357588357, + "grad_norm": 0.6245802640914917, + "learning_rate": 1.405621476236787e-05, + "loss": 0.1273, + "num_input_tokens_seen": 18490352, + "step": 25770 + }, + { + "epoch": 53.58627858627859, + "grad_norm": 0.289102703332901, + "learning_rate": 1.4047388751619423e-05, + "loss": 0.0853, + "num_input_tokens_seen": 18493872, + "step": 25775 + }, + { + "epoch": 53.5966735966736, + "grad_norm": 0.29544734954833984, + "learning_rate": 1.4038564429901264e-05, + "loss": 0.1321, + "num_input_tokens_seen": 18497584, + "step": 25780 + }, + { + "epoch": 53.607068607068605, + "grad_norm": 0.3447859287261963, + "learning_rate": 1.4029741798574227e-05, + "loss": 0.0869, + "num_input_tokens_seen": 18501008, + "step": 25785 + }, + { + "epoch": 53.61746361746362, + "grad_norm": 0.19515126943588257, + "learning_rate": 1.402092085899886e-05, + "loss": 0.0566, + "num_input_tokens_seen": 18504560, + "step": 25790 + }, + { + "epoch": 53.62785862785863, + "grad_norm": 0.17681504786014557, + "learning_rate": 1.4012101612535464e-05, + "loss": 0.08, + "num_input_tokens_seen": 18508144, + "step": 25795 + }, + { + "epoch": 53.63825363825364, + "grad_norm": 0.47777095437049866, + "learning_rate": 1.4003284060544092e-05, + "loss": 0.0733, + "num_input_tokens_seen": 18511824, + "step": 25800 + }, + { + "epoch": 53.63825363825364, + "eval_loss": 0.14365240931510925, + "eval_runtime": 7.7451, + "eval_samples_per_second": 110.522, + "eval_steps_per_second": 27.63, + "num_input_tokens_seen": 18511824, + "step": 25800 + }, + { + "epoch": 53.648648648648646, + "grad_norm": 0.2774130403995514, + "learning_rate": 1.3994468204384504e-05, + "loss": 0.0852, + "num_input_tokens_seen": 18515312, + "step": 25805 + }, + { + "epoch": 53.65904365904366, + "grad_norm": 0.30511674284935, + "learning_rate": 1.398565404541622e-05, + "loss": 0.0698, + "num_input_tokens_seen": 18518832, + "step": 25810 + }, + { + "epoch": 53.66943866943867, + "grad_norm": 0.35246846079826355, + "learning_rate": 1.3976841584998513e-05, + "loss": 0.1289, + "num_input_tokens_seen": 18522448, + "step": 25815 + }, + { + "epoch": 53.67983367983368, + "grad_norm": 0.45653030276298523, + "learning_rate": 1.3968030824490352e-05, + "loss": 0.102, + "num_input_tokens_seen": 18526096, + "step": 25820 + }, + { + "epoch": 53.69022869022869, + "grad_norm": 0.2928633689880371, + "learning_rate": 1.3959221765250469e-05, + "loss": 0.079, + "num_input_tokens_seen": 18529616, + "step": 25825 + }, + { + "epoch": 53.7006237006237, + "grad_norm": 0.4995104968547821, + "learning_rate": 1.3950414408637343e-05, + "loss": 0.1457, + "num_input_tokens_seen": 18533264, + "step": 25830 + }, + { + "epoch": 53.71101871101871, + "grad_norm": 0.36203572154045105, + "learning_rate": 1.3941608756009166e-05, + "loss": 0.0807, + "num_input_tokens_seen": 18536624, + "step": 25835 + }, + { + "epoch": 53.72141372141372, + "grad_norm": 0.2296677827835083, + "learning_rate": 1.3932804808723898e-05, + "loss": 0.1375, + "num_input_tokens_seen": 18540336, + "step": 25840 + }, + { + "epoch": 53.731808731808734, + "grad_norm": 0.8881204128265381, + "learning_rate": 1.3924002568139194e-05, + "loss": 0.1185, + "num_input_tokens_seen": 18543952, + "step": 25845 + }, + { + "epoch": 53.74220374220374, + "grad_norm": 0.1781667321920395, + "learning_rate": 1.3915202035612485e-05, + "loss": 0.0788, + "num_input_tokens_seen": 18547632, + "step": 25850 + }, + { + "epoch": 53.75259875259875, + "grad_norm": 0.3723093569278717, + "learning_rate": 1.3906403212500935e-05, + "loss": 0.1103, + "num_input_tokens_seen": 18551280, + "step": 25855 + }, + { + "epoch": 53.762993762993766, + "grad_norm": 0.3386898934841156, + "learning_rate": 1.3897606100161409e-05, + "loss": 0.1175, + "num_input_tokens_seen": 18554736, + "step": 25860 + }, + { + "epoch": 53.773388773388774, + "grad_norm": 0.13114729523658752, + "learning_rate": 1.388881069995055e-05, + "loss": 0.0965, + "num_input_tokens_seen": 18558416, + "step": 25865 + }, + { + "epoch": 53.78378378378378, + "grad_norm": 0.4107949733734131, + "learning_rate": 1.3880017013224708e-05, + "loss": 0.0687, + "num_input_tokens_seen": 18562096, + "step": 25870 + }, + { + "epoch": 53.79417879417879, + "grad_norm": 0.3187226355075836, + "learning_rate": 1.3871225041339984e-05, + "loss": 0.0876, + "num_input_tokens_seen": 18565680, + "step": 25875 + }, + { + "epoch": 53.804573804573806, + "grad_norm": 0.4027169942855835, + "learning_rate": 1.386243478565222e-05, + "loss": 0.1109, + "num_input_tokens_seen": 18569232, + "step": 25880 + }, + { + "epoch": 53.814968814968815, + "grad_norm": 0.2230677604675293, + "learning_rate": 1.3853646247516966e-05, + "loss": 0.0876, + "num_input_tokens_seen": 18572720, + "step": 25885 + }, + { + "epoch": 53.82536382536382, + "grad_norm": 0.3305194079875946, + "learning_rate": 1.3844859428289545e-05, + "loss": 0.092, + "num_input_tokens_seen": 18576432, + "step": 25890 + }, + { + "epoch": 53.83575883575884, + "grad_norm": 0.231941357254982, + "learning_rate": 1.3836074329324984e-05, + "loss": 0.0597, + "num_input_tokens_seen": 18579920, + "step": 25895 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 0.20517581701278687, + "learning_rate": 1.3827290951978044e-05, + "loss": 0.0794, + "num_input_tokens_seen": 18583632, + "step": 25900 + }, + { + "epoch": 53.856548856548855, + "grad_norm": 0.5921528935432434, + "learning_rate": 1.381850929760326e-05, + "loss": 0.1161, + "num_input_tokens_seen": 18587280, + "step": 25905 + }, + { + "epoch": 53.86694386694387, + "grad_norm": 0.27819496393203735, + "learning_rate": 1.3809729367554842e-05, + "loss": 0.1075, + "num_input_tokens_seen": 18590832, + "step": 25910 + }, + { + "epoch": 53.87733887733888, + "grad_norm": 0.2580745220184326, + "learning_rate": 1.3800951163186784e-05, + "loss": 0.1415, + "num_input_tokens_seen": 18594416, + "step": 25915 + }, + { + "epoch": 53.88773388773389, + "grad_norm": 0.08132121711969376, + "learning_rate": 1.3792174685852801e-05, + "loss": 0.1032, + "num_input_tokens_seen": 18598064, + "step": 25920 + }, + { + "epoch": 53.898128898128896, + "grad_norm": 0.3580944240093231, + "learning_rate": 1.378339993690632e-05, + "loss": 0.0917, + "num_input_tokens_seen": 18601648, + "step": 25925 + }, + { + "epoch": 53.90852390852391, + "grad_norm": 0.3932356834411621, + "learning_rate": 1.3774626917700523e-05, + "loss": 0.1299, + "num_input_tokens_seen": 18605168, + "step": 25930 + }, + { + "epoch": 53.91891891891892, + "grad_norm": 0.1711324006319046, + "learning_rate": 1.3765855629588334e-05, + "loss": 0.0971, + "num_input_tokens_seen": 18608688, + "step": 25935 + }, + { + "epoch": 53.92931392931393, + "grad_norm": 0.17672502994537354, + "learning_rate": 1.3757086073922374e-05, + "loss": 0.0714, + "num_input_tokens_seen": 18612240, + "step": 25940 + }, + { + "epoch": 53.93970893970894, + "grad_norm": 0.15304607152938843, + "learning_rate": 1.3748318252055038e-05, + "loss": 0.1022, + "num_input_tokens_seen": 18615856, + "step": 25945 + }, + { + "epoch": 53.95010395010395, + "grad_norm": 0.37480926513671875, + "learning_rate": 1.3739552165338416e-05, + "loss": 0.0771, + "num_input_tokens_seen": 18619408, + "step": 25950 + }, + { + "epoch": 53.96049896049896, + "grad_norm": 0.18976086378097534, + "learning_rate": 1.3730787815124354e-05, + "loss": 0.0577, + "num_input_tokens_seen": 18622992, + "step": 25955 + }, + { + "epoch": 53.97089397089397, + "grad_norm": 0.24481938779354095, + "learning_rate": 1.3722025202764443e-05, + "loss": 0.0944, + "num_input_tokens_seen": 18626544, + "step": 25960 + }, + { + "epoch": 53.981288981288984, + "grad_norm": 0.40666714310646057, + "learning_rate": 1.371326432960997e-05, + "loss": 0.0746, + "num_input_tokens_seen": 18630224, + "step": 25965 + }, + { + "epoch": 53.99168399168399, + "grad_norm": 0.27780550718307495, + "learning_rate": 1.3704505197011969e-05, + "loss": 0.1012, + "num_input_tokens_seen": 18633776, + "step": 25970 + }, + { + "epoch": 54.002079002079, + "grad_norm": 0.22492697834968567, + "learning_rate": 1.3695747806321224e-05, + "loss": 0.0865, + "num_input_tokens_seen": 18637184, + "step": 25975 + }, + { + "epoch": 54.012474012474016, + "grad_norm": 0.27788305282592773, + "learning_rate": 1.3686992158888212e-05, + "loss": 0.1043, + "num_input_tokens_seen": 18640832, + "step": 25980 + }, + { + "epoch": 54.022869022869024, + "grad_norm": 0.5916475653648376, + "learning_rate": 1.367823825606319e-05, + "loss": 0.0936, + "num_input_tokens_seen": 18644480, + "step": 25985 + }, + { + "epoch": 54.03326403326403, + "grad_norm": 0.5402680039405823, + "learning_rate": 1.36694860991961e-05, + "loss": 0.1459, + "num_input_tokens_seen": 18648128, + "step": 25990 + }, + { + "epoch": 54.04365904365904, + "grad_norm": 0.2552429139614105, + "learning_rate": 1.3660735689636636e-05, + "loss": 0.0748, + "num_input_tokens_seen": 18651488, + "step": 25995 + }, + { + "epoch": 54.054054054054056, + "grad_norm": 0.22043557465076447, + "learning_rate": 1.365198702873424e-05, + "loss": 0.0814, + "num_input_tokens_seen": 18655008, + "step": 26000 + }, + { + "epoch": 54.054054054054056, + "eval_loss": 0.14321190118789673, + "eval_runtime": 7.7438, + "eval_samples_per_second": 110.54, + "eval_steps_per_second": 27.635, + "num_input_tokens_seen": 18655008, + "step": 26000 + }, + { + "epoch": 54.064449064449065, + "grad_norm": 0.28962451219558716, + "learning_rate": 1.364324011783804e-05, + "loss": 0.1104, + "num_input_tokens_seen": 18658624, + "step": 26005 + }, + { + "epoch": 54.07484407484407, + "grad_norm": 0.3510928153991699, + "learning_rate": 1.3634494958296934e-05, + "loss": 0.0768, + "num_input_tokens_seen": 18662208, + "step": 26010 + }, + { + "epoch": 54.08523908523909, + "grad_norm": 0.3609924018383026, + "learning_rate": 1.3625751551459542e-05, + "loss": 0.1173, + "num_input_tokens_seen": 18665824, + "step": 26015 + }, + { + "epoch": 54.0956340956341, + "grad_norm": 0.1855699121952057, + "learning_rate": 1.3617009898674188e-05, + "loss": 0.1092, + "num_input_tokens_seen": 18669440, + "step": 26020 + }, + { + "epoch": 54.106029106029105, + "grad_norm": 0.47523975372314453, + "learning_rate": 1.3608270001288967e-05, + "loss": 0.0834, + "num_input_tokens_seen": 18673024, + "step": 26025 + }, + { + "epoch": 54.11642411642411, + "grad_norm": 0.20926418900489807, + "learning_rate": 1.359953186065166e-05, + "loss": 0.1109, + "num_input_tokens_seen": 18676608, + "step": 26030 + }, + { + "epoch": 54.12681912681913, + "grad_norm": 0.8423270583152771, + "learning_rate": 1.3590795478109814e-05, + "loss": 0.0937, + "num_input_tokens_seen": 18680224, + "step": 26035 + }, + { + "epoch": 54.13721413721414, + "grad_norm": 0.2562529444694519, + "learning_rate": 1.3582060855010675e-05, + "loss": 0.0675, + "num_input_tokens_seen": 18683872, + "step": 26040 + }, + { + "epoch": 54.147609147609145, + "grad_norm": 0.30598434805870056, + "learning_rate": 1.3573327992701245e-05, + "loss": 0.1254, + "num_input_tokens_seen": 18687584, + "step": 26045 + }, + { + "epoch": 54.15800415800416, + "grad_norm": 0.6976395845413208, + "learning_rate": 1.356459689252823e-05, + "loss": 0.0992, + "num_input_tokens_seen": 18691136, + "step": 26050 + }, + { + "epoch": 54.16839916839917, + "grad_norm": 0.4506864845752716, + "learning_rate": 1.3555867555838087e-05, + "loss": 0.0781, + "num_input_tokens_seen": 18694720, + "step": 26055 + }, + { + "epoch": 54.17879417879418, + "grad_norm": 0.467037558555603, + "learning_rate": 1.3547139983976975e-05, + "loss": 0.1311, + "num_input_tokens_seen": 18698560, + "step": 26060 + }, + { + "epoch": 54.189189189189186, + "grad_norm": 0.4350576102733612, + "learning_rate": 1.3538414178290815e-05, + "loss": 0.1327, + "num_input_tokens_seen": 18702080, + "step": 26065 + }, + { + "epoch": 54.1995841995842, + "grad_norm": 0.6572515964508057, + "learning_rate": 1.3529690140125209e-05, + "loss": 0.1175, + "num_input_tokens_seen": 18705536, + "step": 26070 + }, + { + "epoch": 54.20997920997921, + "grad_norm": 0.4005175828933716, + "learning_rate": 1.352096787082553e-05, + "loss": 0.0909, + "num_input_tokens_seen": 18709056, + "step": 26075 + }, + { + "epoch": 54.22037422037422, + "grad_norm": 0.5242418050765991, + "learning_rate": 1.3512247371736871e-05, + "loss": 0.1063, + "num_input_tokens_seen": 18712608, + "step": 26080 + }, + { + "epoch": 54.23076923076923, + "grad_norm": 0.206784188747406, + "learning_rate": 1.3503528644204022e-05, + "loss": 0.0521, + "num_input_tokens_seen": 18716224, + "step": 26085 + }, + { + "epoch": 54.24116424116424, + "grad_norm": 0.4103730320930481, + "learning_rate": 1.349481168957153e-05, + "loss": 0.109, + "num_input_tokens_seen": 18719776, + "step": 26090 + }, + { + "epoch": 54.25155925155925, + "grad_norm": 0.5634032487869263, + "learning_rate": 1.3486096509183665e-05, + "loss": 0.0815, + "num_input_tokens_seen": 18723264, + "step": 26095 + }, + { + "epoch": 54.26195426195426, + "grad_norm": 0.4116443395614624, + "learning_rate": 1.3477383104384406e-05, + "loss": 0.1365, + "num_input_tokens_seen": 18726912, + "step": 26100 + }, + { + "epoch": 54.272349272349274, + "grad_norm": 0.42525237798690796, + "learning_rate": 1.3468671476517481e-05, + "loss": 0.1704, + "num_input_tokens_seen": 18730368, + "step": 26105 + }, + { + "epoch": 54.28274428274428, + "grad_norm": 0.4351467490196228, + "learning_rate": 1.3459961626926326e-05, + "loss": 0.1294, + "num_input_tokens_seen": 18734016, + "step": 26110 + }, + { + "epoch": 54.29313929313929, + "grad_norm": 0.15654048323631287, + "learning_rate": 1.3451253556954101e-05, + "loss": 0.0432, + "num_input_tokens_seen": 18737760, + "step": 26115 + }, + { + "epoch": 54.303534303534306, + "grad_norm": 0.5966565608978271, + "learning_rate": 1.3442547267943717e-05, + "loss": 0.0993, + "num_input_tokens_seen": 18741312, + "step": 26120 + }, + { + "epoch": 54.313929313929314, + "grad_norm": 0.3064225912094116, + "learning_rate": 1.3433842761237774e-05, + "loss": 0.1166, + "num_input_tokens_seen": 18744896, + "step": 26125 + }, + { + "epoch": 54.32432432432432, + "grad_norm": 0.1560082882642746, + "learning_rate": 1.3425140038178639e-05, + "loss": 0.0891, + "num_input_tokens_seen": 18748512, + "step": 26130 + }, + { + "epoch": 54.33471933471934, + "grad_norm": 0.26956599950790405, + "learning_rate": 1.3416439100108358e-05, + "loss": 0.1144, + "num_input_tokens_seen": 18752192, + "step": 26135 + }, + { + "epoch": 54.34511434511435, + "grad_norm": 0.46974411606788635, + "learning_rate": 1.3407739948368734e-05, + "loss": 0.0802, + "num_input_tokens_seen": 18755680, + "step": 26140 + }, + { + "epoch": 54.355509355509355, + "grad_norm": 0.5162402391433716, + "learning_rate": 1.3399042584301298e-05, + "loss": 0.105, + "num_input_tokens_seen": 18759264, + "step": 26145 + }, + { + "epoch": 54.36590436590436, + "grad_norm": 0.6186916828155518, + "learning_rate": 1.3390347009247272e-05, + "loss": 0.0973, + "num_input_tokens_seen": 18763008, + "step": 26150 + }, + { + "epoch": 54.37629937629938, + "grad_norm": 0.5125771760940552, + "learning_rate": 1.3381653224547635e-05, + "loss": 0.1432, + "num_input_tokens_seen": 18766848, + "step": 26155 + }, + { + "epoch": 54.38669438669439, + "grad_norm": 0.4840444326400757, + "learning_rate": 1.3372961231543086e-05, + "loss": 0.0837, + "num_input_tokens_seen": 18770432, + "step": 26160 + }, + { + "epoch": 54.397089397089395, + "grad_norm": 0.20891296863555908, + "learning_rate": 1.3364271031574016e-05, + "loss": 0.0523, + "num_input_tokens_seen": 18773856, + "step": 26165 + }, + { + "epoch": 54.40748440748441, + "grad_norm": 0.3776865005493164, + "learning_rate": 1.335558262598059e-05, + "loss": 0.1022, + "num_input_tokens_seen": 18777376, + "step": 26170 + }, + { + "epoch": 54.41787941787942, + "grad_norm": 0.3590381443500519, + "learning_rate": 1.3346896016102645e-05, + "loss": 0.0926, + "num_input_tokens_seen": 18780832, + "step": 26175 + }, + { + "epoch": 54.42827442827443, + "grad_norm": 0.22648896276950836, + "learning_rate": 1.3338211203279788e-05, + "loss": 0.117, + "num_input_tokens_seen": 18784448, + "step": 26180 + }, + { + "epoch": 54.438669438669436, + "grad_norm": 0.3773688077926636, + "learning_rate": 1.3329528188851303e-05, + "loss": 0.113, + "num_input_tokens_seen": 18788032, + "step": 26185 + }, + { + "epoch": 54.44906444906445, + "grad_norm": 0.4957881569862366, + "learning_rate": 1.3320846974156242e-05, + "loss": 0.1053, + "num_input_tokens_seen": 18791520, + "step": 26190 + }, + { + "epoch": 54.45945945945946, + "grad_norm": 0.3231875002384186, + "learning_rate": 1.3312167560533337e-05, + "loss": 0.1124, + "num_input_tokens_seen": 18795136, + "step": 26195 + }, + { + "epoch": 54.46985446985447, + "grad_norm": 0.33977553248405457, + "learning_rate": 1.3303489949321082e-05, + "loss": 0.0856, + "num_input_tokens_seen": 18798592, + "step": 26200 + }, + { + "epoch": 54.46985446985447, + "eval_loss": 0.1490035355091095, + "eval_runtime": 7.7531, + "eval_samples_per_second": 110.407, + "eval_steps_per_second": 27.602, + "num_input_tokens_seen": 18798592, + "step": 26200 + }, + { + "epoch": 54.48024948024948, + "grad_norm": 0.24871008098125458, + "learning_rate": 1.3294814141857653e-05, + "loss": 0.1237, + "num_input_tokens_seen": 18802144, + "step": 26205 + }, + { + "epoch": 54.49064449064449, + "grad_norm": 0.36864998936653137, + "learning_rate": 1.3286140139480992e-05, + "loss": 0.0856, + "num_input_tokens_seen": 18805632, + "step": 26210 + }, + { + "epoch": 54.5010395010395, + "grad_norm": 1.1187022924423218, + "learning_rate": 1.3277467943528719e-05, + "loss": 0.0846, + "num_input_tokens_seen": 18809088, + "step": 26215 + }, + { + "epoch": 54.51143451143451, + "grad_norm": 0.3041554093360901, + "learning_rate": 1.3268797555338203e-05, + "loss": 0.0865, + "num_input_tokens_seen": 18812928, + "step": 26220 + }, + { + "epoch": 54.521829521829524, + "grad_norm": 0.2579669654369354, + "learning_rate": 1.3260128976246533e-05, + "loss": 0.0794, + "num_input_tokens_seen": 18816512, + "step": 26225 + }, + { + "epoch": 54.53222453222453, + "grad_norm": 0.17703752219676971, + "learning_rate": 1.32514622075905e-05, + "loss": 0.1356, + "num_input_tokens_seen": 18820032, + "step": 26230 + }, + { + "epoch": 54.54261954261954, + "grad_norm": 0.31566503643989563, + "learning_rate": 1.3242797250706638e-05, + "loss": 0.0814, + "num_input_tokens_seen": 18823392, + "step": 26235 + }, + { + "epoch": 54.553014553014556, + "grad_norm": 0.21956129372119904, + "learning_rate": 1.3234134106931195e-05, + "loss": 0.1145, + "num_input_tokens_seen": 18826944, + "step": 26240 + }, + { + "epoch": 54.563409563409564, + "grad_norm": 0.5572912096977234, + "learning_rate": 1.322547277760013e-05, + "loss": 0.1092, + "num_input_tokens_seen": 18830464, + "step": 26245 + }, + { + "epoch": 54.57380457380457, + "grad_norm": 0.30161696672439575, + "learning_rate": 1.3216813264049132e-05, + "loss": 0.1013, + "num_input_tokens_seen": 18833984, + "step": 26250 + }, + { + "epoch": 54.58419958419958, + "grad_norm": 0.325289785861969, + "learning_rate": 1.32081555676136e-05, + "loss": 0.0979, + "num_input_tokens_seen": 18837536, + "step": 26255 + }, + { + "epoch": 54.5945945945946, + "grad_norm": 0.5142970681190491, + "learning_rate": 1.3199499689628674e-05, + "loss": 0.0985, + "num_input_tokens_seen": 18841088, + "step": 26260 + }, + { + "epoch": 54.604989604989605, + "grad_norm": 0.11784176528453827, + "learning_rate": 1.3190845631429192e-05, + "loss": 0.1063, + "num_input_tokens_seen": 18844608, + "step": 26265 + }, + { + "epoch": 54.61538461538461, + "grad_norm": 0.5691631436347961, + "learning_rate": 1.3182193394349704e-05, + "loss": 0.1173, + "num_input_tokens_seen": 18848256, + "step": 26270 + }, + { + "epoch": 54.62577962577963, + "grad_norm": 0.2222696989774704, + "learning_rate": 1.3173542979724507e-05, + "loss": 0.1077, + "num_input_tokens_seen": 18851872, + "step": 26275 + }, + { + "epoch": 54.63617463617464, + "grad_norm": 0.38427743315696716, + "learning_rate": 1.3164894388887617e-05, + "loss": 0.0862, + "num_input_tokens_seen": 18855392, + "step": 26280 + }, + { + "epoch": 54.646569646569645, + "grad_norm": 0.21479696035385132, + "learning_rate": 1.3156247623172727e-05, + "loss": 0.1045, + "num_input_tokens_seen": 18858880, + "step": 26285 + }, + { + "epoch": 54.656964656964654, + "grad_norm": 0.1678524762392044, + "learning_rate": 1.3147602683913302e-05, + "loss": 0.0903, + "num_input_tokens_seen": 18862464, + "step": 26290 + }, + { + "epoch": 54.66735966735967, + "grad_norm": 0.2661856412887573, + "learning_rate": 1.3138959572442481e-05, + "loss": 0.0898, + "num_input_tokens_seen": 18866016, + "step": 26295 + }, + { + "epoch": 54.67775467775468, + "grad_norm": 0.19913162291049957, + "learning_rate": 1.3130318290093146e-05, + "loss": 0.1081, + "num_input_tokens_seen": 18869600, + "step": 26300 + }, + { + "epoch": 54.688149688149686, + "grad_norm": 0.22699354588985443, + "learning_rate": 1.3121678838197909e-05, + "loss": 0.1424, + "num_input_tokens_seen": 18873440, + "step": 26305 + }, + { + "epoch": 54.6985446985447, + "grad_norm": 0.19648414850234985, + "learning_rate": 1.3113041218089056e-05, + "loss": 0.1013, + "num_input_tokens_seen": 18876832, + "step": 26310 + }, + { + "epoch": 54.70893970893971, + "grad_norm": 0.18894028663635254, + "learning_rate": 1.3104405431098626e-05, + "loss": 0.08, + "num_input_tokens_seen": 18880288, + "step": 26315 + }, + { + "epoch": 54.71933471933472, + "grad_norm": 0.21901488304138184, + "learning_rate": 1.3095771478558377e-05, + "loss": 0.0862, + "num_input_tokens_seen": 18884128, + "step": 26320 + }, + { + "epoch": 54.729729729729726, + "grad_norm": 0.15809312462806702, + "learning_rate": 1.3087139361799766e-05, + "loss": 0.0571, + "num_input_tokens_seen": 18887776, + "step": 26325 + }, + { + "epoch": 54.74012474012474, + "grad_norm": 0.19233736395835876, + "learning_rate": 1.3078509082153964e-05, + "loss": 0.0751, + "num_input_tokens_seen": 18891616, + "step": 26330 + }, + { + "epoch": 54.75051975051975, + "grad_norm": 0.49205195903778076, + "learning_rate": 1.3069880640951885e-05, + "loss": 0.1125, + "num_input_tokens_seen": 18895232, + "step": 26335 + }, + { + "epoch": 54.76091476091476, + "grad_norm": 0.22503872215747833, + "learning_rate": 1.3061254039524123e-05, + "loss": 0.1099, + "num_input_tokens_seen": 18898816, + "step": 26340 + }, + { + "epoch": 54.771309771309774, + "grad_norm": 0.17514951527118683, + "learning_rate": 1.3052629279201028e-05, + "loss": 0.0812, + "num_input_tokens_seen": 18902560, + "step": 26345 + }, + { + "epoch": 54.78170478170478, + "grad_norm": 0.2035159468650818, + "learning_rate": 1.3044006361312633e-05, + "loss": 0.0645, + "num_input_tokens_seen": 18906112, + "step": 26350 + }, + { + "epoch": 54.79209979209979, + "grad_norm": 0.5120819807052612, + "learning_rate": 1.30353852871887e-05, + "loss": 0.0863, + "num_input_tokens_seen": 18909696, + "step": 26355 + }, + { + "epoch": 54.802494802494806, + "grad_norm": 0.5194357633590698, + "learning_rate": 1.302676605815873e-05, + "loss": 0.0815, + "num_input_tokens_seen": 18913248, + "step": 26360 + }, + { + "epoch": 54.812889812889814, + "grad_norm": 0.5224544405937195, + "learning_rate": 1.3018148675551884e-05, + "loss": 0.1094, + "num_input_tokens_seen": 18916800, + "step": 26365 + }, + { + "epoch": 54.82328482328482, + "grad_norm": 0.20700275897979736, + "learning_rate": 1.3009533140697094e-05, + "loss": 0.1144, + "num_input_tokens_seen": 18920384, + "step": 26370 + }, + { + "epoch": 54.83367983367983, + "grad_norm": 0.2903614342212677, + "learning_rate": 1.3000919454922966e-05, + "loss": 0.0741, + "num_input_tokens_seen": 18923968, + "step": 26375 + }, + { + "epoch": 54.84407484407485, + "grad_norm": 0.29904043674468994, + "learning_rate": 1.299230761955785e-05, + "loss": 0.0976, + "num_input_tokens_seen": 18927488, + "step": 26380 + }, + { + "epoch": 54.854469854469855, + "grad_norm": 0.19587811827659607, + "learning_rate": 1.2983697635929807e-05, + "loss": 0.1076, + "num_input_tokens_seen": 18931264, + "step": 26385 + }, + { + "epoch": 54.86486486486486, + "grad_norm": 0.6538382768630981, + "learning_rate": 1.2975089505366584e-05, + "loss": 0.1064, + "num_input_tokens_seen": 18934944, + "step": 26390 + }, + { + "epoch": 54.87525987525988, + "grad_norm": 0.36332911252975464, + "learning_rate": 1.2966483229195683e-05, + "loss": 0.108, + "num_input_tokens_seen": 18938560, + "step": 26395 + }, + { + "epoch": 54.88565488565489, + "grad_norm": 0.3632287383079529, + "learning_rate": 1.2957878808744283e-05, + "loss": 0.1183, + "num_input_tokens_seen": 18942016, + "step": 26400 + }, + { + "epoch": 54.88565488565489, + "eval_loss": 0.1462591141462326, + "eval_runtime": 7.7525, + "eval_samples_per_second": 110.416, + "eval_steps_per_second": 27.604, + "num_input_tokens_seen": 18942016, + "step": 26400 + }, + { + "epoch": 54.896049896049895, + "grad_norm": 0.33095940947532654, + "learning_rate": 1.294927624533931e-05, + "loss": 0.0813, + "num_input_tokens_seen": 18945536, + "step": 26405 + }, + { + "epoch": 54.906444906444904, + "grad_norm": 0.09668830782175064, + "learning_rate": 1.2940675540307378e-05, + "loss": 0.0811, + "num_input_tokens_seen": 18949056, + "step": 26410 + }, + { + "epoch": 54.91683991683992, + "grad_norm": 0.4437720775604248, + "learning_rate": 1.2932076694974814e-05, + "loss": 0.0807, + "num_input_tokens_seen": 18952800, + "step": 26415 + }, + { + "epoch": 54.92723492723493, + "grad_norm": 0.48925554752349854, + "learning_rate": 1.2923479710667682e-05, + "loss": 0.1818, + "num_input_tokens_seen": 18956608, + "step": 26420 + }, + { + "epoch": 54.937629937629936, + "grad_norm": 0.35759350657463074, + "learning_rate": 1.2914884588711751e-05, + "loss": 0.0851, + "num_input_tokens_seen": 18960064, + "step": 26425 + }, + { + "epoch": 54.94802494802495, + "grad_norm": 1.0253862142562866, + "learning_rate": 1.2906291330432475e-05, + "loss": 0.0768, + "num_input_tokens_seen": 18963456, + "step": 26430 + }, + { + "epoch": 54.95841995841996, + "grad_norm": 0.5011595487594604, + "learning_rate": 1.2897699937155055e-05, + "loss": 0.1026, + "num_input_tokens_seen": 18967104, + "step": 26435 + }, + { + "epoch": 54.96881496881497, + "grad_norm": 0.18532119691371918, + "learning_rate": 1.2889110410204403e-05, + "loss": 0.0963, + "num_input_tokens_seen": 18970752, + "step": 26440 + }, + { + "epoch": 54.979209979209976, + "grad_norm": 0.28381988406181335, + "learning_rate": 1.2880522750905111e-05, + "loss": 0.0952, + "num_input_tokens_seen": 18974432, + "step": 26445 + }, + { + "epoch": 54.98960498960499, + "grad_norm": 0.2436131238937378, + "learning_rate": 1.2871936960581523e-05, + "loss": 0.1336, + "num_input_tokens_seen": 18977920, + "step": 26450 + }, + { + "epoch": 55.0, + "grad_norm": 0.1935163140296936, + "learning_rate": 1.2863353040557658e-05, + "loss": 0.0982, + "num_input_tokens_seen": 18981360, + "step": 26455 + }, + { + "epoch": 55.01039501039501, + "grad_norm": 0.6493567228317261, + "learning_rate": 1.2854770992157273e-05, + "loss": 0.1076, + "num_input_tokens_seen": 18984976, + "step": 26460 + }, + { + "epoch": 55.020790020790024, + "grad_norm": 0.4745434820652008, + "learning_rate": 1.2846190816703835e-05, + "loss": 0.0833, + "num_input_tokens_seen": 18988560, + "step": 26465 + }, + { + "epoch": 55.03118503118503, + "grad_norm": 0.21929018199443817, + "learning_rate": 1.2837612515520498e-05, + "loss": 0.0772, + "num_input_tokens_seen": 18992112, + "step": 26470 + }, + { + "epoch": 55.04158004158004, + "grad_norm": 0.1792660355567932, + "learning_rate": 1.2829036089930163e-05, + "loss": 0.1164, + "num_input_tokens_seen": 18995664, + "step": 26475 + }, + { + "epoch": 55.05197505197505, + "grad_norm": 0.3653794229030609, + "learning_rate": 1.2820461541255412e-05, + "loss": 0.1054, + "num_input_tokens_seen": 18999312, + "step": 26480 + }, + { + "epoch": 55.062370062370064, + "grad_norm": 0.5764927864074707, + "learning_rate": 1.2811888870818543e-05, + "loss": 0.1101, + "num_input_tokens_seen": 19002864, + "step": 26485 + }, + { + "epoch": 55.07276507276507, + "grad_norm": 0.26965197920799255, + "learning_rate": 1.2803318079941581e-05, + "loss": 0.0696, + "num_input_tokens_seen": 19006416, + "step": 26490 + }, + { + "epoch": 55.08316008316008, + "grad_norm": 0.2341703176498413, + "learning_rate": 1.2794749169946235e-05, + "loss": 0.0793, + "num_input_tokens_seen": 19009808, + "step": 26495 + }, + { + "epoch": 55.093555093555096, + "grad_norm": 0.2625722885131836, + "learning_rate": 1.2786182142153952e-05, + "loss": 0.0961, + "num_input_tokens_seen": 19013520, + "step": 26500 + }, + { + "epoch": 55.103950103950105, + "grad_norm": 0.2719261944293976, + "learning_rate": 1.2777616997885878e-05, + "loss": 0.1011, + "num_input_tokens_seen": 19016944, + "step": 26505 + }, + { + "epoch": 55.11434511434511, + "grad_norm": 0.48555535078048706, + "learning_rate": 1.2769053738462847e-05, + "loss": 0.0605, + "num_input_tokens_seen": 19020432, + "step": 26510 + }, + { + "epoch": 55.12474012474012, + "grad_norm": 0.4801546335220337, + "learning_rate": 1.2760492365205434e-05, + "loss": 0.1169, + "num_input_tokens_seen": 19023888, + "step": 26515 + }, + { + "epoch": 55.13513513513514, + "grad_norm": 0.7343693375587463, + "learning_rate": 1.2751932879433919e-05, + "loss": 0.0851, + "num_input_tokens_seen": 19027344, + "step": 26520 + }, + { + "epoch": 55.145530145530145, + "grad_norm": 0.2366095781326294, + "learning_rate": 1.2743375282468267e-05, + "loss": 0.0631, + "num_input_tokens_seen": 19030832, + "step": 26525 + }, + { + "epoch": 55.15592515592515, + "grad_norm": 0.516289234161377, + "learning_rate": 1.2734819575628182e-05, + "loss": 0.1054, + "num_input_tokens_seen": 19034544, + "step": 26530 + }, + { + "epoch": 55.16632016632017, + "grad_norm": 0.8062862157821655, + "learning_rate": 1.2726265760233039e-05, + "loss": 0.1132, + "num_input_tokens_seen": 19038000, + "step": 26535 + }, + { + "epoch": 55.17671517671518, + "grad_norm": 0.2967926263809204, + "learning_rate": 1.271771383760197e-05, + "loss": 0.0692, + "num_input_tokens_seen": 19041456, + "step": 26540 + }, + { + "epoch": 55.187110187110186, + "grad_norm": 0.19567129015922546, + "learning_rate": 1.2709163809053764e-05, + "loss": 0.1017, + "num_input_tokens_seen": 19045232, + "step": 26545 + }, + { + "epoch": 55.197505197505194, + "grad_norm": 0.2531607151031494, + "learning_rate": 1.2700615675906963e-05, + "loss": 0.0808, + "num_input_tokens_seen": 19049168, + "step": 26550 + }, + { + "epoch": 55.20790020790021, + "grad_norm": 0.45368292927742004, + "learning_rate": 1.269206943947978e-05, + "loss": 0.1098, + "num_input_tokens_seen": 19052880, + "step": 26555 + }, + { + "epoch": 55.21829521829522, + "grad_norm": 0.47770798206329346, + "learning_rate": 1.2683525101090177e-05, + "loss": 0.1372, + "num_input_tokens_seen": 19056592, + "step": 26560 + }, + { + "epoch": 55.228690228690226, + "grad_norm": 0.1824411004781723, + "learning_rate": 1.2674982662055765e-05, + "loss": 0.0487, + "num_input_tokens_seen": 19060240, + "step": 26565 + }, + { + "epoch": 55.23908523908524, + "grad_norm": 0.37812283635139465, + "learning_rate": 1.2666442123693922e-05, + "loss": 0.1306, + "num_input_tokens_seen": 19063728, + "step": 26570 + }, + { + "epoch": 55.24948024948025, + "grad_norm": 0.6074833273887634, + "learning_rate": 1.265790348732169e-05, + "loss": 0.0842, + "num_input_tokens_seen": 19067184, + "step": 26575 + }, + { + "epoch": 55.25987525987526, + "grad_norm": 0.2371913492679596, + "learning_rate": 1.264936675425584e-05, + "loss": 0.1244, + "num_input_tokens_seen": 19070768, + "step": 26580 + }, + { + "epoch": 55.270270270270274, + "grad_norm": 0.8183914422988892, + "learning_rate": 1.2640831925812852e-05, + "loss": 0.0897, + "num_input_tokens_seen": 19074416, + "step": 26585 + }, + { + "epoch": 55.28066528066528, + "grad_norm": 0.6147729754447937, + "learning_rate": 1.263229900330889e-05, + "loss": 0.1499, + "num_input_tokens_seen": 19078000, + "step": 26590 + }, + { + "epoch": 55.29106029106029, + "grad_norm": 0.34422019124031067, + "learning_rate": 1.2623767988059843e-05, + "loss": 0.1021, + "num_input_tokens_seen": 19081712, + "step": 26595 + }, + { + "epoch": 55.3014553014553, + "grad_norm": 0.6409936547279358, + "learning_rate": 1.2615238881381309e-05, + "loss": 0.1266, + "num_input_tokens_seen": 19085296, + "step": 26600 + }, + { + "epoch": 55.3014553014553, + "eval_loss": 0.1465408205986023, + "eval_runtime": 7.7531, + "eval_samples_per_second": 110.408, + "eval_steps_per_second": 27.602, + "num_input_tokens_seen": 19085296, + "step": 26600 + }, + { + "epoch": 55.311850311850314, + "grad_norm": 0.5293716788291931, + "learning_rate": 1.2606711684588568e-05, + "loss": 0.1264, + "num_input_tokens_seen": 19088848, + "step": 26605 + }, + { + "epoch": 55.32224532224532, + "grad_norm": 0.1905166357755661, + "learning_rate": 1.2598186398996636e-05, + "loss": 0.0804, + "num_input_tokens_seen": 19092496, + "step": 26610 + }, + { + "epoch": 55.33264033264033, + "grad_norm": 0.3131033182144165, + "learning_rate": 1.2589663025920207e-05, + "loss": 0.0939, + "num_input_tokens_seen": 19096016, + "step": 26615 + }, + { + "epoch": 55.343035343035346, + "grad_norm": 0.3312907814979553, + "learning_rate": 1.2581141566673705e-05, + "loss": 0.1371, + "num_input_tokens_seen": 19099632, + "step": 26620 + }, + { + "epoch": 55.353430353430355, + "grad_norm": 0.7588814496994019, + "learning_rate": 1.257262202257124e-05, + "loss": 0.0683, + "num_input_tokens_seen": 19103184, + "step": 26625 + }, + { + "epoch": 55.36382536382536, + "grad_norm": 0.1826421618461609, + "learning_rate": 1.2564104394926618e-05, + "loss": 0.0905, + "num_input_tokens_seen": 19106864, + "step": 26630 + }, + { + "epoch": 55.37422037422037, + "grad_norm": 0.6760432720184326, + "learning_rate": 1.2555588685053383e-05, + "loss": 0.0779, + "num_input_tokens_seen": 19110448, + "step": 26635 + }, + { + "epoch": 55.38461538461539, + "grad_norm": 0.287313848733902, + "learning_rate": 1.2547074894264762e-05, + "loss": 0.1207, + "num_input_tokens_seen": 19114096, + "step": 26640 + }, + { + "epoch": 55.395010395010395, + "grad_norm": 0.3319702446460724, + "learning_rate": 1.2538563023873679e-05, + "loss": 0.0813, + "num_input_tokens_seen": 19117648, + "step": 26645 + }, + { + "epoch": 55.4054054054054, + "grad_norm": 0.37067171931266785, + "learning_rate": 1.2530053075192789e-05, + "loss": 0.1317, + "num_input_tokens_seen": 19121392, + "step": 26650 + }, + { + "epoch": 55.41580041580042, + "grad_norm": 0.1611899584531784, + "learning_rate": 1.252154504953441e-05, + "loss": 0.0752, + "num_input_tokens_seen": 19124912, + "step": 26655 + }, + { + "epoch": 55.42619542619543, + "grad_norm": 0.20363889634609222, + "learning_rate": 1.25130389482106e-05, + "loss": 0.074, + "num_input_tokens_seen": 19128496, + "step": 26660 + }, + { + "epoch": 55.436590436590436, + "grad_norm": 0.2610194981098175, + "learning_rate": 1.2504534772533116e-05, + "loss": 0.0669, + "num_input_tokens_seen": 19131952, + "step": 26665 + }, + { + "epoch": 55.446985446985444, + "grad_norm": 0.3009481728076935, + "learning_rate": 1.2496032523813387e-05, + "loss": 0.0933, + "num_input_tokens_seen": 19135376, + "step": 26670 + }, + { + "epoch": 55.45738045738046, + "grad_norm": 0.306559681892395, + "learning_rate": 1.2487532203362576e-05, + "loss": 0.1049, + "num_input_tokens_seen": 19138992, + "step": 26675 + }, + { + "epoch": 55.46777546777547, + "grad_norm": 0.8921437859535217, + "learning_rate": 1.247903381249155e-05, + "loss": 0.0872, + "num_input_tokens_seen": 19142832, + "step": 26680 + }, + { + "epoch": 55.478170478170476, + "grad_norm": 0.16921450197696686, + "learning_rate": 1.2470537352510853e-05, + "loss": 0.0839, + "num_input_tokens_seen": 19146384, + "step": 26685 + }, + { + "epoch": 55.48856548856549, + "grad_norm": 0.2908928096294403, + "learning_rate": 1.2462042824730758e-05, + "loss": 0.0662, + "num_input_tokens_seen": 19149936, + "step": 26690 + }, + { + "epoch": 55.4989604989605, + "grad_norm": 0.3583665192127228, + "learning_rate": 1.245355023046122e-05, + "loss": 0.1172, + "num_input_tokens_seen": 19153776, + "step": 26695 + }, + { + "epoch": 55.50935550935551, + "grad_norm": 0.4490628242492676, + "learning_rate": 1.2445059571011896e-05, + "loss": 0.1002, + "num_input_tokens_seen": 19157488, + "step": 26700 + }, + { + "epoch": 55.51975051975052, + "grad_norm": 0.21092993021011353, + "learning_rate": 1.2436570847692173e-05, + "loss": 0.1025, + "num_input_tokens_seen": 19161168, + "step": 26705 + }, + { + "epoch": 55.53014553014553, + "grad_norm": 0.26253801584243774, + "learning_rate": 1.2428084061811096e-05, + "loss": 0.11, + "num_input_tokens_seen": 19164880, + "step": 26710 + }, + { + "epoch": 55.54054054054054, + "grad_norm": 1.5042873620986938, + "learning_rate": 1.2419599214677447e-05, + "loss": 0.1064, + "num_input_tokens_seen": 19168528, + "step": 26715 + }, + { + "epoch": 55.55093555093555, + "grad_norm": 0.6034490466117859, + "learning_rate": 1.2411116307599702e-05, + "loss": 0.1383, + "num_input_tokens_seen": 19172176, + "step": 26720 + }, + { + "epoch": 55.561330561330564, + "grad_norm": 0.3130498230457306, + "learning_rate": 1.2402635341886016e-05, + "loss": 0.1278, + "num_input_tokens_seen": 19175824, + "step": 26725 + }, + { + "epoch": 55.57172557172557, + "grad_norm": 0.39866676926612854, + "learning_rate": 1.2394156318844278e-05, + "loss": 0.092, + "num_input_tokens_seen": 19179344, + "step": 26730 + }, + { + "epoch": 55.58212058212058, + "grad_norm": 0.3061556816101074, + "learning_rate": 1.2385679239782039e-05, + "loss": 0.1168, + "num_input_tokens_seen": 19182832, + "step": 26735 + }, + { + "epoch": 55.59251559251559, + "grad_norm": 0.8165110349655151, + "learning_rate": 1.2377204106006585e-05, + "loss": 0.089, + "num_input_tokens_seen": 19186288, + "step": 26740 + }, + { + "epoch": 55.602910602910605, + "grad_norm": 0.5573088526725769, + "learning_rate": 1.2368730918824891e-05, + "loss": 0.1171, + "num_input_tokens_seen": 19189904, + "step": 26745 + }, + { + "epoch": 55.61330561330561, + "grad_norm": 0.3069259524345398, + "learning_rate": 1.236025967954362e-05, + "loss": 0.0837, + "num_input_tokens_seen": 19193616, + "step": 26750 + }, + { + "epoch": 55.62370062370062, + "grad_norm": 0.6967535018920898, + "learning_rate": 1.2351790389469153e-05, + "loss": 0.1223, + "num_input_tokens_seen": 19197264, + "step": 26755 + }, + { + "epoch": 55.63409563409564, + "grad_norm": 0.335216224193573, + "learning_rate": 1.234332304990755e-05, + "loss": 0.0705, + "num_input_tokens_seen": 19200912, + "step": 26760 + }, + { + "epoch": 55.644490644490645, + "grad_norm": 0.3069547414779663, + "learning_rate": 1.2334857662164593e-05, + "loss": 0.0699, + "num_input_tokens_seen": 19204400, + "step": 26765 + }, + { + "epoch": 55.65488565488565, + "grad_norm": 0.31539997458457947, + "learning_rate": 1.2326394227545743e-05, + "loss": 0.0806, + "num_input_tokens_seen": 19208016, + "step": 26770 + }, + { + "epoch": 55.66528066528066, + "grad_norm": 0.4414525032043457, + "learning_rate": 1.2317932747356162e-05, + "loss": 0.1291, + "num_input_tokens_seen": 19211696, + "step": 26775 + }, + { + "epoch": 55.67567567567568, + "grad_norm": 0.6206480860710144, + "learning_rate": 1.2309473222900726e-05, + "loss": 0.1009, + "num_input_tokens_seen": 19215248, + "step": 26780 + }, + { + "epoch": 55.686070686070686, + "grad_norm": 0.15904174745082855, + "learning_rate": 1.2301015655484006e-05, + "loss": 0.0656, + "num_input_tokens_seen": 19218768, + "step": 26785 + }, + { + "epoch": 55.696465696465694, + "grad_norm": 0.2518208622932434, + "learning_rate": 1.2292560046410245e-05, + "loss": 0.1512, + "num_input_tokens_seen": 19222416, + "step": 26790 + }, + { + "epoch": 55.70686070686071, + "grad_norm": 0.6217053532600403, + "learning_rate": 1.228410639698343e-05, + "loss": 0.1198, + "num_input_tokens_seen": 19226096, + "step": 26795 + }, + { + "epoch": 55.71725571725572, + "grad_norm": 0.28279587626457214, + "learning_rate": 1.2275654708507195e-05, + "loss": 0.0854, + "num_input_tokens_seen": 19229616, + "step": 26800 + }, + { + "epoch": 55.71725571725572, + "eval_loss": 0.14578643441200256, + "eval_runtime": 7.7467, + "eval_samples_per_second": 110.498, + "eval_steps_per_second": 27.625, + "num_input_tokens_seen": 19229616, + "step": 26800 + }, + { + "epoch": 55.727650727650726, + "grad_norm": 0.18260139226913452, + "learning_rate": 1.2267204982284908e-05, + "loss": 0.1334, + "num_input_tokens_seen": 19233360, + "step": 26805 + }, + { + "epoch": 55.73804573804574, + "grad_norm": 0.18841812014579773, + "learning_rate": 1.2258757219619635e-05, + "loss": 0.1416, + "num_input_tokens_seen": 19237200, + "step": 26810 + }, + { + "epoch": 55.74844074844075, + "grad_norm": 0.8737364411354065, + "learning_rate": 1.2250311421814104e-05, + "loss": 0.1003, + "num_input_tokens_seen": 19240880, + "step": 26815 + }, + { + "epoch": 55.75883575883576, + "grad_norm": 0.18094004690647125, + "learning_rate": 1.2241867590170772e-05, + "loss": 0.0731, + "num_input_tokens_seen": 19244496, + "step": 26820 + }, + { + "epoch": 55.76923076923077, + "grad_norm": 0.8055041432380676, + "learning_rate": 1.2233425725991799e-05, + "loss": 0.0814, + "num_input_tokens_seen": 19248016, + "step": 26825 + }, + { + "epoch": 55.77962577962578, + "grad_norm": 0.24681444466114044, + "learning_rate": 1.2224985830579003e-05, + "loss": 0.0959, + "num_input_tokens_seen": 19251568, + "step": 26830 + }, + { + "epoch": 55.79002079002079, + "grad_norm": 0.3735385835170746, + "learning_rate": 1.2216547905233944e-05, + "loss": 0.0614, + "num_input_tokens_seen": 19254992, + "step": 26835 + }, + { + "epoch": 55.8004158004158, + "grad_norm": 0.5134844183921814, + "learning_rate": 1.2208111951257842e-05, + "loss": 0.1454, + "num_input_tokens_seen": 19258704, + "step": 26840 + }, + { + "epoch": 55.810810810810814, + "grad_norm": 0.7022000551223755, + "learning_rate": 1.2199677969951622e-05, + "loss": 0.1257, + "num_input_tokens_seen": 19262128, + "step": 26845 + }, + { + "epoch": 55.82120582120582, + "grad_norm": 0.34126371145248413, + "learning_rate": 1.2191245962615927e-05, + "loss": 0.0619, + "num_input_tokens_seen": 19265616, + "step": 26850 + }, + { + "epoch": 55.83160083160083, + "grad_norm": 0.5854537487030029, + "learning_rate": 1.218281593055106e-05, + "loss": 0.1169, + "num_input_tokens_seen": 19269232, + "step": 26855 + }, + { + "epoch": 55.84199584199584, + "grad_norm": 0.3824353516101837, + "learning_rate": 1.217438787505705e-05, + "loss": 0.0985, + "num_input_tokens_seen": 19272784, + "step": 26860 + }, + { + "epoch": 55.852390852390855, + "grad_norm": 0.7196846604347229, + "learning_rate": 1.2165961797433615e-05, + "loss": 0.0911, + "num_input_tokens_seen": 19276144, + "step": 26865 + }, + { + "epoch": 55.86278586278586, + "grad_norm": 0.3817700445652008, + "learning_rate": 1.215753769898014e-05, + "loss": 0.0883, + "num_input_tokens_seen": 19279696, + "step": 26870 + }, + { + "epoch": 55.87318087318087, + "grad_norm": 0.26585981249809265, + "learning_rate": 1.2149115580995755e-05, + "loss": 0.1081, + "num_input_tokens_seen": 19283280, + "step": 26875 + }, + { + "epoch": 55.88357588357589, + "grad_norm": 0.28420165181159973, + "learning_rate": 1.2140695444779227e-05, + "loss": 0.0837, + "num_input_tokens_seen": 19286992, + "step": 26880 + }, + { + "epoch": 55.893970893970895, + "grad_norm": 0.5386560559272766, + "learning_rate": 1.2132277291629066e-05, + "loss": 0.0958, + "num_input_tokens_seen": 19290480, + "step": 26885 + }, + { + "epoch": 55.9043659043659, + "grad_norm": 0.6422737836837769, + "learning_rate": 1.2123861122843458e-05, + "loss": 0.1438, + "num_input_tokens_seen": 19293968, + "step": 26890 + }, + { + "epoch": 55.91476091476091, + "grad_norm": 0.6009212136268616, + "learning_rate": 1.2115446939720271e-05, + "loss": 0.107, + "num_input_tokens_seen": 19297680, + "step": 26895 + }, + { + "epoch": 55.92515592515593, + "grad_norm": 0.3543996214866638, + "learning_rate": 1.210703474355708e-05, + "loss": 0.1222, + "num_input_tokens_seen": 19301328, + "step": 26900 + }, + { + "epoch": 55.935550935550935, + "grad_norm": 0.25861477851867676, + "learning_rate": 1.2098624535651164e-05, + "loss": 0.1355, + "num_input_tokens_seen": 19304784, + "step": 26905 + }, + { + "epoch": 55.945945945945944, + "grad_norm": 0.32875165343284607, + "learning_rate": 1.2090216317299477e-05, + "loss": 0.1046, + "num_input_tokens_seen": 19308336, + "step": 26910 + }, + { + "epoch": 55.95634095634096, + "grad_norm": 0.30833470821380615, + "learning_rate": 1.2081810089798668e-05, + "loss": 0.1062, + "num_input_tokens_seen": 19311984, + "step": 26915 + }, + { + "epoch": 55.96673596673597, + "grad_norm": 0.3439143896102905, + "learning_rate": 1.2073405854445072e-05, + "loss": 0.078, + "num_input_tokens_seen": 19315632, + "step": 26920 + }, + { + "epoch": 55.977130977130976, + "grad_norm": 0.20572640001773834, + "learning_rate": 1.206500361253474e-05, + "loss": 0.1064, + "num_input_tokens_seen": 19319184, + "step": 26925 + }, + { + "epoch": 55.987525987525984, + "grad_norm": 0.9125043749809265, + "learning_rate": 1.2056603365363409e-05, + "loss": 0.1061, + "num_input_tokens_seen": 19322800, + "step": 26930 + }, + { + "epoch": 55.997920997921, + "grad_norm": 0.3181654214859009, + "learning_rate": 1.2048205114226487e-05, + "loss": 0.1089, + "num_input_tokens_seen": 19326384, + "step": 26935 + }, + { + "epoch": 56.00831600831601, + "grad_norm": 0.17113585770130157, + "learning_rate": 1.2039808860419102e-05, + "loss": 0.1073, + "num_input_tokens_seen": 19329800, + "step": 26940 + }, + { + "epoch": 56.018711018711016, + "grad_norm": 0.7813407778739929, + "learning_rate": 1.2031414605236066e-05, + "loss": 0.1106, + "num_input_tokens_seen": 19333544, + "step": 26945 + }, + { + "epoch": 56.02910602910603, + "grad_norm": 0.385400652885437, + "learning_rate": 1.2023022349971862e-05, + "loss": 0.0809, + "num_input_tokens_seen": 19337096, + "step": 26950 + }, + { + "epoch": 56.03950103950104, + "grad_norm": 0.2753855288028717, + "learning_rate": 1.20146320959207e-05, + "loss": 0.0786, + "num_input_tokens_seen": 19340712, + "step": 26955 + }, + { + "epoch": 56.04989604989605, + "grad_norm": 0.23131580650806427, + "learning_rate": 1.2006243844376445e-05, + "loss": 0.1112, + "num_input_tokens_seen": 19344328, + "step": 26960 + }, + { + "epoch": 56.06029106029106, + "grad_norm": 0.2308371365070343, + "learning_rate": 1.1997857596632678e-05, + "loss": 0.0665, + "num_input_tokens_seen": 19347880, + "step": 26965 + }, + { + "epoch": 56.07068607068607, + "grad_norm": 1.0211541652679443, + "learning_rate": 1.1989473353982672e-05, + "loss": 0.1184, + "num_input_tokens_seen": 19351656, + "step": 26970 + }, + { + "epoch": 56.08108108108108, + "grad_norm": 0.24787116050720215, + "learning_rate": 1.198109111771937e-05, + "loss": 0.0834, + "num_input_tokens_seen": 19355144, + "step": 26975 + }, + { + "epoch": 56.09147609147609, + "grad_norm": 0.18490321934223175, + "learning_rate": 1.197271088913543e-05, + "loss": 0.1089, + "num_input_tokens_seen": 19358696, + "step": 26980 + }, + { + "epoch": 56.101871101871104, + "grad_norm": 0.20097669959068298, + "learning_rate": 1.1964332669523182e-05, + "loss": 0.069, + "num_input_tokens_seen": 19362440, + "step": 26985 + }, + { + "epoch": 56.11226611226611, + "grad_norm": 0.27389341592788696, + "learning_rate": 1.1955956460174645e-05, + "loss": 0.1189, + "num_input_tokens_seen": 19366024, + "step": 26990 + }, + { + "epoch": 56.12266112266112, + "grad_norm": 0.23268121480941772, + "learning_rate": 1.1947582262381552e-05, + "loss": 0.0737, + "num_input_tokens_seen": 19369640, + "step": 26995 + }, + { + "epoch": 56.13305613305613, + "grad_norm": 0.3576192557811737, + "learning_rate": 1.1939210077435293e-05, + "loss": 0.0836, + "num_input_tokens_seen": 19373160, + "step": 27000 + }, + { + "epoch": 56.13305613305613, + "eval_loss": 0.14537420868873596, + "eval_runtime": 7.7447, + "eval_samples_per_second": 110.528, + "eval_steps_per_second": 27.632, + "num_input_tokens_seen": 19373160, + "step": 27000 + }, + { + "epoch": 56.143451143451145, + "grad_norm": 0.1381990611553192, + "learning_rate": 1.193083990662697e-05, + "loss": 0.1015, + "num_input_tokens_seen": 19376584, + "step": 27005 + }, + { + "epoch": 56.15384615384615, + "grad_norm": 0.5475471615791321, + "learning_rate": 1.192247175124738e-05, + "loss": 0.1684, + "num_input_tokens_seen": 19380168, + "step": 27010 + }, + { + "epoch": 56.16424116424116, + "grad_norm": 0.24123041331768036, + "learning_rate": 1.191410561258698e-05, + "loss": 0.1528, + "num_input_tokens_seen": 19383816, + "step": 27015 + }, + { + "epoch": 56.17463617463618, + "grad_norm": 0.8879287242889404, + "learning_rate": 1.1905741491935944e-05, + "loss": 0.1064, + "num_input_tokens_seen": 19387240, + "step": 27020 + }, + { + "epoch": 56.185031185031185, + "grad_norm": 0.5347837209701538, + "learning_rate": 1.1897379390584129e-05, + "loss": 0.1341, + "num_input_tokens_seen": 19390696, + "step": 27025 + }, + { + "epoch": 56.195426195426194, + "grad_norm": 0.3695860803127289, + "learning_rate": 1.1889019309821062e-05, + "loss": 0.1221, + "num_input_tokens_seen": 19394408, + "step": 27030 + }, + { + "epoch": 56.20582120582121, + "grad_norm": 0.29511216282844543, + "learning_rate": 1.188066125093599e-05, + "loss": 0.1106, + "num_input_tokens_seen": 19398184, + "step": 27035 + }, + { + "epoch": 56.21621621621622, + "grad_norm": 0.13702307641506195, + "learning_rate": 1.1872305215217811e-05, + "loss": 0.071, + "num_input_tokens_seen": 19401704, + "step": 27040 + }, + { + "epoch": 56.226611226611226, + "grad_norm": 0.16380207240581512, + "learning_rate": 1.186395120395514e-05, + "loss": 0.0814, + "num_input_tokens_seen": 19405320, + "step": 27045 + }, + { + "epoch": 56.237006237006234, + "grad_norm": 0.20207446813583374, + "learning_rate": 1.1855599218436283e-05, + "loss": 0.1233, + "num_input_tokens_seen": 19408936, + "step": 27050 + }, + { + "epoch": 56.24740124740125, + "grad_norm": 0.2022283673286438, + "learning_rate": 1.1847249259949209e-05, + "loss": 0.1274, + "num_input_tokens_seen": 19412552, + "step": 27055 + }, + { + "epoch": 56.25779625779626, + "grad_norm": 0.20611879229545593, + "learning_rate": 1.1838901329781574e-05, + "loss": 0.0935, + "num_input_tokens_seen": 19416072, + "step": 27060 + }, + { + "epoch": 56.268191268191266, + "grad_norm": 0.18686510622501373, + "learning_rate": 1.1830555429220758e-05, + "loss": 0.0908, + "num_input_tokens_seen": 19419624, + "step": 27065 + }, + { + "epoch": 56.27858627858628, + "grad_norm": 0.18924085795879364, + "learning_rate": 1.1822211559553784e-05, + "loss": 0.0821, + "num_input_tokens_seen": 19423112, + "step": 27070 + }, + { + "epoch": 56.28898128898129, + "grad_norm": 0.3975139558315277, + "learning_rate": 1.18138697220674e-05, + "loss": 0.0877, + "num_input_tokens_seen": 19426600, + "step": 27075 + }, + { + "epoch": 56.2993762993763, + "grad_norm": 0.2523653209209442, + "learning_rate": 1.1805529918048e-05, + "loss": 0.1179, + "num_input_tokens_seen": 19430152, + "step": 27080 + }, + { + "epoch": 56.30977130977131, + "grad_norm": 0.46479156613349915, + "learning_rate": 1.1797192148781702e-05, + "loss": 0.0937, + "num_input_tokens_seen": 19433576, + "step": 27085 + }, + { + "epoch": 56.32016632016632, + "grad_norm": 0.4560575485229492, + "learning_rate": 1.1788856415554297e-05, + "loss": 0.0953, + "num_input_tokens_seen": 19437352, + "step": 27090 + }, + { + "epoch": 56.33056133056133, + "grad_norm": 0.5335686802864075, + "learning_rate": 1.1780522719651249e-05, + "loss": 0.1464, + "num_input_tokens_seen": 19440968, + "step": 27095 + }, + { + "epoch": 56.34095634095634, + "grad_norm": 0.15247514843940735, + "learning_rate": 1.1772191062357721e-05, + "loss": 0.0677, + "num_input_tokens_seen": 19444584, + "step": 27100 + }, + { + "epoch": 56.351351351351354, + "grad_norm": 0.1709967702627182, + "learning_rate": 1.1763861444958573e-05, + "loss": 0.0662, + "num_input_tokens_seen": 19448104, + "step": 27105 + }, + { + "epoch": 56.36174636174636, + "grad_norm": 0.8119792938232422, + "learning_rate": 1.1755533868738317e-05, + "loss": 0.1259, + "num_input_tokens_seen": 19451720, + "step": 27110 + }, + { + "epoch": 56.37214137214137, + "grad_norm": 0.173781618475914, + "learning_rate": 1.1747208334981185e-05, + "loss": 0.1131, + "num_input_tokens_seen": 19455304, + "step": 27115 + }, + { + "epoch": 56.38253638253638, + "grad_norm": 0.32526883482933044, + "learning_rate": 1.1738884844971067e-05, + "loss": 0.134, + "num_input_tokens_seen": 19459048, + "step": 27120 + }, + { + "epoch": 56.392931392931395, + "grad_norm": 0.18782028555870056, + "learning_rate": 1.1730563399991563e-05, + "loss": 0.1213, + "num_input_tokens_seen": 19462600, + "step": 27125 + }, + { + "epoch": 56.4033264033264, + "grad_norm": 0.49605610966682434, + "learning_rate": 1.1722244001325938e-05, + "loss": 0.0744, + "num_input_tokens_seen": 19466184, + "step": 27130 + }, + { + "epoch": 56.41372141372141, + "grad_norm": 0.263546884059906, + "learning_rate": 1.1713926650257137e-05, + "loss": 0.1571, + "num_input_tokens_seen": 19469704, + "step": 27135 + }, + { + "epoch": 56.42411642411643, + "grad_norm": 0.2739405930042267, + "learning_rate": 1.170561134806781e-05, + "loss": 0.1057, + "num_input_tokens_seen": 19473256, + "step": 27140 + }, + { + "epoch": 56.434511434511435, + "grad_norm": 0.2360701560974121, + "learning_rate": 1.1697298096040287e-05, + "loss": 0.0936, + "num_input_tokens_seen": 19476872, + "step": 27145 + }, + { + "epoch": 56.444906444906444, + "grad_norm": 0.8526264429092407, + "learning_rate": 1.1688986895456567e-05, + "loss": 0.1247, + "num_input_tokens_seen": 19480520, + "step": 27150 + }, + { + "epoch": 56.45530145530145, + "grad_norm": 0.8272066116333008, + "learning_rate": 1.1680677747598349e-05, + "loss": 0.0836, + "num_input_tokens_seen": 19483912, + "step": 27155 + }, + { + "epoch": 56.46569646569647, + "grad_norm": 0.5525550246238708, + "learning_rate": 1.1672370653746995e-05, + "loss": 0.0849, + "num_input_tokens_seen": 19487400, + "step": 27160 + }, + { + "epoch": 56.476091476091476, + "grad_norm": 0.3285768926143646, + "learning_rate": 1.166406561518357e-05, + "loss": 0.1109, + "num_input_tokens_seen": 19491048, + "step": 27165 + }, + { + "epoch": 56.486486486486484, + "grad_norm": 0.5517465472221375, + "learning_rate": 1.1655762633188826e-05, + "loss": 0.0875, + "num_input_tokens_seen": 19494600, + "step": 27170 + }, + { + "epoch": 56.4968814968815, + "grad_norm": 1.106143593788147, + "learning_rate": 1.1647461709043172e-05, + "loss": 0.1463, + "num_input_tokens_seen": 19498184, + "step": 27175 + }, + { + "epoch": 56.50727650727651, + "grad_norm": 0.2513379752635956, + "learning_rate": 1.1639162844026722e-05, + "loss": 0.1743, + "num_input_tokens_seen": 19501736, + "step": 27180 + }, + { + "epoch": 56.517671517671516, + "grad_norm": 0.48494991660118103, + "learning_rate": 1.163086603941927e-05, + "loss": 0.104, + "num_input_tokens_seen": 19505320, + "step": 27185 + }, + { + "epoch": 56.528066528066525, + "grad_norm": 0.25819092988967896, + "learning_rate": 1.1622571296500273e-05, + "loss": 0.0902, + "num_input_tokens_seen": 19508776, + "step": 27190 + }, + { + "epoch": 56.53846153846154, + "grad_norm": 0.16373248398303986, + "learning_rate": 1.1614278616548904e-05, + "loss": 0.0742, + "num_input_tokens_seen": 19512712, + "step": 27195 + }, + { + "epoch": 56.54885654885655, + "grad_norm": 0.5574084520339966, + "learning_rate": 1.1605988000843986e-05, + "loss": 0.1123, + "num_input_tokens_seen": 19516200, + "step": 27200 + }, + { + "epoch": 56.54885654885655, + "eval_loss": 0.14311620593070984, + "eval_runtime": 7.7461, + "eval_samples_per_second": 110.507, + "eval_steps_per_second": 27.627, + "num_input_tokens_seen": 19516200, + "step": 27200 + }, + { + "epoch": 56.55925155925156, + "grad_norm": 0.33148613572120667, + "learning_rate": 1.1597699450664028e-05, + "loss": 0.0924, + "num_input_tokens_seen": 19519720, + "step": 27205 + }, + { + "epoch": 56.56964656964657, + "grad_norm": 0.3420139253139496, + "learning_rate": 1.1589412967287252e-05, + "loss": 0.1235, + "num_input_tokens_seen": 19523272, + "step": 27210 + }, + { + "epoch": 56.58004158004158, + "grad_norm": 0.29668405652046204, + "learning_rate": 1.1581128551991514e-05, + "loss": 0.067, + "num_input_tokens_seen": 19526696, + "step": 27215 + }, + { + "epoch": 56.59043659043659, + "grad_norm": 0.5230351090431213, + "learning_rate": 1.1572846206054383e-05, + "loss": 0.0858, + "num_input_tokens_seen": 19530248, + "step": 27220 + }, + { + "epoch": 56.6008316008316, + "grad_norm": 0.17273478209972382, + "learning_rate": 1.1564565930753113e-05, + "loss": 0.1223, + "num_input_tokens_seen": 19533992, + "step": 27225 + }, + { + "epoch": 56.61122661122661, + "grad_norm": 0.35894107818603516, + "learning_rate": 1.1556287727364606e-05, + "loss": 0.1037, + "num_input_tokens_seen": 19537608, + "step": 27230 + }, + { + "epoch": 56.62162162162162, + "grad_norm": 0.3413147032260895, + "learning_rate": 1.1548011597165489e-05, + "loss": 0.0858, + "num_input_tokens_seen": 19541448, + "step": 27235 + }, + { + "epoch": 56.63201663201663, + "grad_norm": 0.17305785417556763, + "learning_rate": 1.1539737541432019e-05, + "loss": 0.0794, + "num_input_tokens_seen": 19545192, + "step": 27240 + }, + { + "epoch": 56.642411642411645, + "grad_norm": 0.7978172898292542, + "learning_rate": 1.1531465561440174e-05, + "loss": 0.1302, + "num_input_tokens_seen": 19548808, + "step": 27245 + }, + { + "epoch": 56.65280665280665, + "grad_norm": 0.5918890833854675, + "learning_rate": 1.1523195658465605e-05, + "loss": 0.1135, + "num_input_tokens_seen": 19552584, + "step": 27250 + }, + { + "epoch": 56.66320166320166, + "grad_norm": 0.3734618127346039, + "learning_rate": 1.1514927833783618e-05, + "loss": 0.0945, + "num_input_tokens_seen": 19556136, + "step": 27255 + }, + { + "epoch": 56.67359667359668, + "grad_norm": 0.7612844109535217, + "learning_rate": 1.150666208866922e-05, + "loss": 0.114, + "num_input_tokens_seen": 19559496, + "step": 27260 + }, + { + "epoch": 56.683991683991685, + "grad_norm": 0.35041841864585876, + "learning_rate": 1.1498398424397106e-05, + "loss": 0.0837, + "num_input_tokens_seen": 19563144, + "step": 27265 + }, + { + "epoch": 56.694386694386694, + "grad_norm": 0.4353625476360321, + "learning_rate": 1.1490136842241628e-05, + "loss": 0.0966, + "num_input_tokens_seen": 19566696, + "step": 27270 + }, + { + "epoch": 56.7047817047817, + "grad_norm": 0.17132242023944855, + "learning_rate": 1.1481877343476813e-05, + "loss": 0.0765, + "num_input_tokens_seen": 19570312, + "step": 27275 + }, + { + "epoch": 56.71517671517672, + "grad_norm": 0.4406109154224396, + "learning_rate": 1.14736199293764e-05, + "loss": 0.1024, + "num_input_tokens_seen": 19573864, + "step": 27280 + }, + { + "epoch": 56.725571725571726, + "grad_norm": 0.44528788328170776, + "learning_rate": 1.1465364601213771e-05, + "loss": 0.084, + "num_input_tokens_seen": 19577288, + "step": 27285 + }, + { + "epoch": 56.735966735966734, + "grad_norm": 0.4723668098449707, + "learning_rate": 1.1457111360262012e-05, + "loss": 0.1072, + "num_input_tokens_seen": 19580904, + "step": 27290 + }, + { + "epoch": 56.74636174636175, + "grad_norm": 0.20810508728027344, + "learning_rate": 1.1448860207793869e-05, + "loss": 0.1044, + "num_input_tokens_seen": 19584488, + "step": 27295 + }, + { + "epoch": 56.75675675675676, + "grad_norm": 0.6170647144317627, + "learning_rate": 1.144061114508177e-05, + "loss": 0.0868, + "num_input_tokens_seen": 19587976, + "step": 27300 + }, + { + "epoch": 56.767151767151766, + "grad_norm": 0.337593674659729, + "learning_rate": 1.1432364173397842e-05, + "loss": 0.1248, + "num_input_tokens_seen": 19591496, + "step": 27305 + }, + { + "epoch": 56.777546777546775, + "grad_norm": 0.36169904470443726, + "learning_rate": 1.1424119294013852e-05, + "loss": 0.0985, + "num_input_tokens_seen": 19595144, + "step": 27310 + }, + { + "epoch": 56.78794178794179, + "grad_norm": 0.21849758923053741, + "learning_rate": 1.1415876508201279e-05, + "loss": 0.0945, + "num_input_tokens_seen": 19598728, + "step": 27315 + }, + { + "epoch": 56.7983367983368, + "grad_norm": 0.4774802625179291, + "learning_rate": 1.140763581723125e-05, + "loss": 0.1062, + "num_input_tokens_seen": 19602312, + "step": 27320 + }, + { + "epoch": 56.80873180873181, + "grad_norm": 0.48440682888031006, + "learning_rate": 1.1399397222374588e-05, + "loss": 0.0695, + "num_input_tokens_seen": 19605928, + "step": 27325 + }, + { + "epoch": 56.81912681912682, + "grad_norm": 0.26771116256713867, + "learning_rate": 1.1391160724901804e-05, + "loss": 0.1112, + "num_input_tokens_seen": 19609544, + "step": 27330 + }, + { + "epoch": 56.82952182952183, + "grad_norm": 0.5513423085212708, + "learning_rate": 1.138292632608304e-05, + "loss": 0.1016, + "num_input_tokens_seen": 19613128, + "step": 27335 + }, + { + "epoch": 56.83991683991684, + "grad_norm": 0.5568011403083801, + "learning_rate": 1.1374694027188174e-05, + "loss": 0.0942, + "num_input_tokens_seen": 19616872, + "step": 27340 + }, + { + "epoch": 56.85031185031185, + "grad_norm": 0.5543996691703796, + "learning_rate": 1.1366463829486711e-05, + "loss": 0.0628, + "num_input_tokens_seen": 19620392, + "step": 27345 + }, + { + "epoch": 56.86070686070686, + "grad_norm": 0.33418118953704834, + "learning_rate": 1.1358235734247849e-05, + "loss": 0.1014, + "num_input_tokens_seen": 19623944, + "step": 27350 + }, + { + "epoch": 56.87110187110187, + "grad_norm": 0.995024561882019, + "learning_rate": 1.1350009742740478e-05, + "loss": 0.0971, + "num_input_tokens_seen": 19627624, + "step": 27355 + }, + { + "epoch": 56.88149688149688, + "grad_norm": 0.23644952476024628, + "learning_rate": 1.134178585623313e-05, + "loss": 0.0858, + "num_input_tokens_seen": 19631272, + "step": 27360 + }, + { + "epoch": 56.891891891891895, + "grad_norm": 0.3467412292957306, + "learning_rate": 1.1333564075994047e-05, + "loss": 0.086, + "num_input_tokens_seen": 19634888, + "step": 27365 + }, + { + "epoch": 56.9022869022869, + "grad_norm": 0.22057761251926422, + "learning_rate": 1.1325344403291133e-05, + "loss": 0.0692, + "num_input_tokens_seen": 19638536, + "step": 27370 + }, + { + "epoch": 56.91268191268191, + "grad_norm": 0.3488706946372986, + "learning_rate": 1.1317126839391951e-05, + "loss": 0.0837, + "num_input_tokens_seen": 19642056, + "step": 27375 + }, + { + "epoch": 56.92307692307692, + "grad_norm": 0.16611604392528534, + "learning_rate": 1.1308911385563766e-05, + "loss": 0.055, + "num_input_tokens_seen": 19645576, + "step": 27380 + }, + { + "epoch": 56.933471933471935, + "grad_norm": 0.24118849635124207, + "learning_rate": 1.1300698043073494e-05, + "loss": 0.0907, + "num_input_tokens_seen": 19649128, + "step": 27385 + }, + { + "epoch": 56.943866943866944, + "grad_norm": 0.2824009656906128, + "learning_rate": 1.1292486813187736e-05, + "loss": 0.1141, + "num_input_tokens_seen": 19652680, + "step": 27390 + }, + { + "epoch": 56.95426195426195, + "grad_norm": 0.2031206339597702, + "learning_rate": 1.1284277697172782e-05, + "loss": 0.0883, + "num_input_tokens_seen": 19656264, + "step": 27395 + }, + { + "epoch": 56.96465696465697, + "grad_norm": 0.298296719789505, + "learning_rate": 1.127607069629456e-05, + "loss": 0.1217, + "num_input_tokens_seen": 19659656, + "step": 27400 + }, + { + "epoch": 56.96465696465697, + "eval_loss": 0.14629071950912476, + "eval_runtime": 7.7582, + "eval_samples_per_second": 110.335, + "eval_steps_per_second": 27.584, + "num_input_tokens_seen": 19659656, + "step": 27400 + }, + { + "epoch": 56.975051975051976, + "grad_norm": 0.21964718401432037, + "learning_rate": 1.1267865811818701e-05, + "loss": 0.078, + "num_input_tokens_seen": 19663304, + "step": 27405 + }, + { + "epoch": 56.985446985446984, + "grad_norm": 0.2576998770236969, + "learning_rate": 1.1259663045010513e-05, + "loss": 0.1006, + "num_input_tokens_seen": 19666728, + "step": 27410 + }, + { + "epoch": 56.99584199584199, + "grad_norm": 0.39065229892730713, + "learning_rate": 1.1251462397134957e-05, + "loss": 0.0466, + "num_input_tokens_seen": 19670536, + "step": 27415 + }, + { + "epoch": 57.00623700623701, + "grad_norm": 0.6318016648292542, + "learning_rate": 1.1243263869456664e-05, + "loss": 0.0971, + "num_input_tokens_seen": 19674136, + "step": 27420 + }, + { + "epoch": 57.016632016632016, + "grad_norm": 0.5844587683677673, + "learning_rate": 1.1235067463239967e-05, + "loss": 0.1229, + "num_input_tokens_seen": 19677720, + "step": 27425 + }, + { + "epoch": 57.027027027027025, + "grad_norm": 0.19862566888332367, + "learning_rate": 1.122687317974884e-05, + "loss": 0.0703, + "num_input_tokens_seen": 19681240, + "step": 27430 + }, + { + "epoch": 57.03742203742204, + "grad_norm": 0.4271068871021271, + "learning_rate": 1.1218681020246963e-05, + "loss": 0.1109, + "num_input_tokens_seen": 19684952, + "step": 27435 + }, + { + "epoch": 57.04781704781705, + "grad_norm": 0.37197446823120117, + "learning_rate": 1.1210490985997652e-05, + "loss": 0.1462, + "num_input_tokens_seen": 19688504, + "step": 27440 + }, + { + "epoch": 57.05821205821206, + "grad_norm": 0.4787125289440155, + "learning_rate": 1.1202303078263917e-05, + "loss": 0.1179, + "num_input_tokens_seen": 19691992, + "step": 27445 + }, + { + "epoch": 57.06860706860707, + "grad_norm": 0.1310122162103653, + "learning_rate": 1.1194117298308451e-05, + "loss": 0.0814, + "num_input_tokens_seen": 19695736, + "step": 27450 + }, + { + "epoch": 57.07900207900208, + "grad_norm": 0.48540711402893066, + "learning_rate": 1.1185933647393585e-05, + "loss": 0.0873, + "num_input_tokens_seen": 19699384, + "step": 27455 + }, + { + "epoch": 57.08939708939709, + "grad_norm": 0.18493884801864624, + "learning_rate": 1.1177752126781354e-05, + "loss": 0.0819, + "num_input_tokens_seen": 19703128, + "step": 27460 + }, + { + "epoch": 57.0997920997921, + "grad_norm": 0.382730633020401, + "learning_rate": 1.1169572737733441e-05, + "loss": 0.0906, + "num_input_tokens_seen": 19706776, + "step": 27465 + }, + { + "epoch": 57.11018711018711, + "grad_norm": 0.36940622329711914, + "learning_rate": 1.1161395481511216e-05, + "loss": 0.1086, + "num_input_tokens_seen": 19710424, + "step": 27470 + }, + { + "epoch": 57.12058212058212, + "grad_norm": 0.6350100040435791, + "learning_rate": 1.1153220359375722e-05, + "loss": 0.1606, + "num_input_tokens_seen": 19713944, + "step": 27475 + }, + { + "epoch": 57.13097713097713, + "grad_norm": 0.47939595580101013, + "learning_rate": 1.114504737258765e-05, + "loss": 0.0939, + "num_input_tokens_seen": 19717464, + "step": 27480 + }, + { + "epoch": 57.141372141372145, + "grad_norm": 0.4011693000793457, + "learning_rate": 1.1136876522407393e-05, + "loss": 0.0746, + "num_input_tokens_seen": 19720984, + "step": 27485 + }, + { + "epoch": 57.15176715176715, + "grad_norm": 0.8368695974349976, + "learning_rate": 1.1128707810094985e-05, + "loss": 0.0554, + "num_input_tokens_seen": 19724504, + "step": 27490 + }, + { + "epoch": 57.16216216216216, + "grad_norm": 0.24388428032398224, + "learning_rate": 1.1120541236910157e-05, + "loss": 0.0821, + "num_input_tokens_seen": 19728056, + "step": 27495 + }, + { + "epoch": 57.17255717255717, + "grad_norm": 0.18676280975341797, + "learning_rate": 1.111237680411229e-05, + "loss": 0.0775, + "num_input_tokens_seen": 19731768, + "step": 27500 + }, + { + "epoch": 57.182952182952185, + "grad_norm": 0.39065003395080566, + "learning_rate": 1.1104214512960433e-05, + "loss": 0.1025, + "num_input_tokens_seen": 19735320, + "step": 27505 + }, + { + "epoch": 57.19334719334719, + "grad_norm": 0.3577984869480133, + "learning_rate": 1.1096054364713327e-05, + "loss": 0.0723, + "num_input_tokens_seen": 19738872, + "step": 27510 + }, + { + "epoch": 57.2037422037422, + "grad_norm": 0.1581403911113739, + "learning_rate": 1.1087896360629371e-05, + "loss": 0.0698, + "num_input_tokens_seen": 19742456, + "step": 27515 + }, + { + "epoch": 57.21413721413722, + "grad_norm": 0.42251941561698914, + "learning_rate": 1.107974050196662e-05, + "loss": 0.1322, + "num_input_tokens_seen": 19745976, + "step": 27520 + }, + { + "epoch": 57.224532224532226, + "grad_norm": 0.3622702360153198, + "learning_rate": 1.1071586789982816e-05, + "loss": 0.1079, + "num_input_tokens_seen": 19749624, + "step": 27525 + }, + { + "epoch": 57.234927234927234, + "grad_norm": 0.689855694770813, + "learning_rate": 1.1063435225935373e-05, + "loss": 0.1256, + "num_input_tokens_seen": 19753304, + "step": 27530 + }, + { + "epoch": 57.24532224532224, + "grad_norm": 0.33352115750312805, + "learning_rate": 1.1055285811081348e-05, + "loss": 0.1246, + "num_input_tokens_seen": 19756952, + "step": 27535 + }, + { + "epoch": 57.25571725571726, + "grad_norm": 0.2521042823791504, + "learning_rate": 1.1047138546677499e-05, + "loss": 0.127, + "num_input_tokens_seen": 19760696, + "step": 27540 + }, + { + "epoch": 57.266112266112266, + "grad_norm": 0.7606266736984253, + "learning_rate": 1.1038993433980219e-05, + "loss": 0.0877, + "num_input_tokens_seen": 19764120, + "step": 27545 + }, + { + "epoch": 57.276507276507274, + "grad_norm": 0.423366904258728, + "learning_rate": 1.1030850474245597e-05, + "loss": 0.0774, + "num_input_tokens_seen": 19767704, + "step": 27550 + }, + { + "epoch": 57.28690228690229, + "grad_norm": 0.3500116169452667, + "learning_rate": 1.102270966872939e-05, + "loss": 0.1157, + "num_input_tokens_seen": 19771288, + "step": 27555 + }, + { + "epoch": 57.2972972972973, + "grad_norm": 0.3353980779647827, + "learning_rate": 1.1014571018687e-05, + "loss": 0.1028, + "num_input_tokens_seen": 19774808, + "step": 27560 + }, + { + "epoch": 57.30769230769231, + "grad_norm": 0.32142379879951477, + "learning_rate": 1.1006434525373502e-05, + "loss": 0.112, + "num_input_tokens_seen": 19778456, + "step": 27565 + }, + { + "epoch": 57.318087318087315, + "grad_norm": 0.6238855719566345, + "learning_rate": 1.0998300190043664e-05, + "loss": 0.0849, + "num_input_tokens_seen": 19782136, + "step": 27570 + }, + { + "epoch": 57.32848232848233, + "grad_norm": 0.3624633550643921, + "learning_rate": 1.0990168013951882e-05, + "loss": 0.1365, + "num_input_tokens_seen": 19785592, + "step": 27575 + }, + { + "epoch": 57.33887733887734, + "grad_norm": 0.473384290933609, + "learning_rate": 1.0982037998352263e-05, + "loss": 0.1084, + "num_input_tokens_seen": 19789208, + "step": 27580 + }, + { + "epoch": 57.34927234927235, + "grad_norm": 0.365855872631073, + "learning_rate": 1.0973910144498534e-05, + "loss": 0.0892, + "num_input_tokens_seen": 19792792, + "step": 27585 + }, + { + "epoch": 57.35966735966736, + "grad_norm": 0.21997028589248657, + "learning_rate": 1.0965784453644123e-05, + "loss": 0.077, + "num_input_tokens_seen": 19796504, + "step": 27590 + }, + { + "epoch": 57.37006237006237, + "grad_norm": 0.47580263018608093, + "learning_rate": 1.0957660927042127e-05, + "loss": 0.0932, + "num_input_tokens_seen": 19800152, + "step": 27595 + }, + { + "epoch": 57.38045738045738, + "grad_norm": 0.2568626403808594, + "learning_rate": 1.094953956594527e-05, + "loss": 0.1149, + "num_input_tokens_seen": 19803672, + "step": 27600 + }, + { + "epoch": 57.38045738045738, + "eval_loss": 0.1426340937614441, + "eval_runtime": 7.758, + "eval_samples_per_second": 110.337, + "eval_steps_per_second": 27.584, + "num_input_tokens_seen": 19803672, + "step": 27600 + }, + { + "epoch": 57.39085239085239, + "grad_norm": 0.3755069077014923, + "learning_rate": 1.0941420371605981e-05, + "loss": 0.1129, + "num_input_tokens_seen": 19807192, + "step": 27605 + }, + { + "epoch": 57.4012474012474, + "grad_norm": 0.3825225830078125, + "learning_rate": 1.0933303345276354e-05, + "loss": 0.1042, + "num_input_tokens_seen": 19810744, + "step": 27610 + }, + { + "epoch": 57.41164241164241, + "grad_norm": 0.2315979152917862, + "learning_rate": 1.0925188488208112e-05, + "loss": 0.1054, + "num_input_tokens_seen": 19814296, + "step": 27615 + }, + { + "epoch": 57.42203742203742, + "grad_norm": 0.20032159984111786, + "learning_rate": 1.0917075801652694e-05, + "loss": 0.1021, + "num_input_tokens_seen": 19817784, + "step": 27620 + }, + { + "epoch": 57.432432432432435, + "grad_norm": 0.39823779463768005, + "learning_rate": 1.0908965286861151e-05, + "loss": 0.1051, + "num_input_tokens_seen": 19821272, + "step": 27625 + }, + { + "epoch": 57.44282744282744, + "grad_norm": 0.5279890894889832, + "learning_rate": 1.090085694508425e-05, + "loss": 0.1434, + "num_input_tokens_seen": 19825016, + "step": 27630 + }, + { + "epoch": 57.45322245322245, + "grad_norm": 1.3462010622024536, + "learning_rate": 1.089275077757238e-05, + "loss": 0.1594, + "num_input_tokens_seen": 19828632, + "step": 27635 + }, + { + "epoch": 57.46361746361746, + "grad_norm": 0.31417369842529297, + "learning_rate": 1.0884646785575633e-05, + "loss": 0.1207, + "num_input_tokens_seen": 19832248, + "step": 27640 + }, + { + "epoch": 57.474012474012476, + "grad_norm": 0.35536691546440125, + "learning_rate": 1.0876544970343728e-05, + "loss": 0.104, + "num_input_tokens_seen": 19835864, + "step": 27645 + }, + { + "epoch": 57.484407484407484, + "grad_norm": 0.3148348033428192, + "learning_rate": 1.0868445333126082e-05, + "loss": 0.0767, + "num_input_tokens_seen": 19839608, + "step": 27650 + }, + { + "epoch": 57.49480249480249, + "grad_norm": 0.6813603639602661, + "learning_rate": 1.0860347875171745e-05, + "loss": 0.0934, + "num_input_tokens_seen": 19843384, + "step": 27655 + }, + { + "epoch": 57.50519750519751, + "grad_norm": 0.635286271572113, + "learning_rate": 1.0852252597729465e-05, + "loss": 0.1781, + "num_input_tokens_seen": 19846968, + "step": 27660 + }, + { + "epoch": 57.515592515592516, + "grad_norm": 1.2227035760879517, + "learning_rate": 1.0844159502047615e-05, + "loss": 0.1053, + "num_input_tokens_seen": 19850680, + "step": 27665 + }, + { + "epoch": 57.525987525987524, + "grad_norm": 0.21599939465522766, + "learning_rate": 1.0836068589374265e-05, + "loss": 0.0692, + "num_input_tokens_seen": 19854296, + "step": 27670 + }, + { + "epoch": 57.53638253638254, + "grad_norm": 0.4040634334087372, + "learning_rate": 1.0827979860957144e-05, + "loss": 0.0884, + "num_input_tokens_seen": 19857880, + "step": 27675 + }, + { + "epoch": 57.54677754677755, + "grad_norm": 0.2979867160320282, + "learning_rate": 1.0819893318043615e-05, + "loss": 0.1176, + "num_input_tokens_seen": 19861560, + "step": 27680 + }, + { + "epoch": 57.55717255717256, + "grad_norm": 0.2821402847766876, + "learning_rate": 1.0811808961880734e-05, + "loss": 0.1046, + "num_input_tokens_seen": 19865368, + "step": 27685 + }, + { + "epoch": 57.567567567567565, + "grad_norm": 0.7785196304321289, + "learning_rate": 1.080372679371522e-05, + "loss": 0.0449, + "num_input_tokens_seen": 19868888, + "step": 27690 + }, + { + "epoch": 57.57796257796258, + "grad_norm": 0.5032058954238892, + "learning_rate": 1.0795646814793428e-05, + "loss": 0.1741, + "num_input_tokens_seen": 19872344, + "step": 27695 + }, + { + "epoch": 57.58835758835759, + "grad_norm": 0.19737227261066437, + "learning_rate": 1.078756902636141e-05, + "loss": 0.0869, + "num_input_tokens_seen": 19875928, + "step": 27700 + }, + { + "epoch": 57.5987525987526, + "grad_norm": 0.4568435251712799, + "learning_rate": 1.077949342966485e-05, + "loss": 0.1559, + "num_input_tokens_seen": 19879800, + "step": 27705 + }, + { + "epoch": 57.60914760914761, + "grad_norm": 0.3568611443042755, + "learning_rate": 1.0771420025949103e-05, + "loss": 0.1391, + "num_input_tokens_seen": 19883416, + "step": 27710 + }, + { + "epoch": 57.61954261954262, + "grad_norm": 0.26993680000305176, + "learning_rate": 1.0763348816459204e-05, + "loss": 0.073, + "num_input_tokens_seen": 19887000, + "step": 27715 + }, + { + "epoch": 57.62993762993763, + "grad_norm": 0.22161145508289337, + "learning_rate": 1.0755279802439816e-05, + "loss": 0.0801, + "num_input_tokens_seen": 19890488, + "step": 27720 + }, + { + "epoch": 57.64033264033264, + "grad_norm": 0.2870972454547882, + "learning_rate": 1.0747212985135293e-05, + "loss": 0.1022, + "num_input_tokens_seen": 19894072, + "step": 27725 + }, + { + "epoch": 57.65072765072765, + "grad_norm": 0.15772996842861176, + "learning_rate": 1.073914836578965e-05, + "loss": 0.0885, + "num_input_tokens_seen": 19897688, + "step": 27730 + }, + { + "epoch": 57.66112266112266, + "grad_norm": 0.20758523046970367, + "learning_rate": 1.0731085945646529e-05, + "loss": 0.1005, + "num_input_tokens_seen": 19901336, + "step": 27735 + }, + { + "epoch": 57.67151767151767, + "grad_norm": 0.23966898024082184, + "learning_rate": 1.0723025725949285e-05, + "loss": 0.0854, + "num_input_tokens_seen": 19905016, + "step": 27740 + }, + { + "epoch": 57.681912681912685, + "grad_norm": 0.777538537979126, + "learning_rate": 1.0714967707940875e-05, + "loss": 0.1016, + "num_input_tokens_seen": 19908600, + "step": 27745 + }, + { + "epoch": 57.69230769230769, + "grad_norm": 0.45111095905303955, + "learning_rate": 1.0706911892863963e-05, + "loss": 0.0698, + "num_input_tokens_seen": 19912120, + "step": 27750 + }, + { + "epoch": 57.7027027027027, + "grad_norm": 0.3500806987285614, + "learning_rate": 1.0698858281960866e-05, + "loss": 0.0921, + "num_input_tokens_seen": 19915800, + "step": 27755 + }, + { + "epoch": 57.71309771309771, + "grad_norm": 0.29367220401763916, + "learning_rate": 1.069080687647353e-05, + "loss": 0.1221, + "num_input_tokens_seen": 19919384, + "step": 27760 + }, + { + "epoch": 57.723492723492726, + "grad_norm": 0.2125980705022812, + "learning_rate": 1.0682757677643596e-05, + "loss": 0.0819, + "num_input_tokens_seen": 19923000, + "step": 27765 + }, + { + "epoch": 57.733887733887734, + "grad_norm": 0.18427154421806335, + "learning_rate": 1.0674710686712359e-05, + "loss": 0.055, + "num_input_tokens_seen": 19926456, + "step": 27770 + }, + { + "epoch": 57.74428274428274, + "grad_norm": 0.28662553429603577, + "learning_rate": 1.0666665904920756e-05, + "loss": 0.0711, + "num_input_tokens_seen": 19929912, + "step": 27775 + }, + { + "epoch": 57.75467775467776, + "grad_norm": 0.11729701608419418, + "learning_rate": 1.0658623333509385e-05, + "loss": 0.0664, + "num_input_tokens_seen": 19933432, + "step": 27780 + }, + { + "epoch": 57.765072765072766, + "grad_norm": 0.22821103036403656, + "learning_rate": 1.0650582973718532e-05, + "loss": 0.0836, + "num_input_tokens_seen": 19937112, + "step": 27785 + }, + { + "epoch": 57.775467775467774, + "grad_norm": 0.3604688346385956, + "learning_rate": 1.0642544826788098e-05, + "loss": 0.0633, + "num_input_tokens_seen": 19940664, + "step": 27790 + }, + { + "epoch": 57.78586278586278, + "grad_norm": 0.4078180491924286, + "learning_rate": 1.063450889395769e-05, + "loss": 0.1087, + "num_input_tokens_seen": 19944280, + "step": 27795 + }, + { + "epoch": 57.7962577962578, + "grad_norm": 0.22506961226463318, + "learning_rate": 1.062647517646653e-05, + "loss": 0.0753, + "num_input_tokens_seen": 19947800, + "step": 27800 + }, + { + "epoch": 57.7962577962578, + "eval_loss": 0.14563506841659546, + "eval_runtime": 7.7687, + "eval_samples_per_second": 110.185, + "eval_steps_per_second": 27.546, + "num_input_tokens_seen": 19947800, + "step": 27800 + }, + { + "epoch": 57.80665280665281, + "grad_norm": 0.3019694983959198, + "learning_rate": 1.0618443675553527e-05, + "loss": 0.0778, + "num_input_tokens_seen": 19951192, + "step": 27805 + }, + { + "epoch": 57.817047817047815, + "grad_norm": 0.5946360230445862, + "learning_rate": 1.0610414392457247e-05, + "loss": 0.1064, + "num_input_tokens_seen": 19954648, + "step": 27810 + }, + { + "epoch": 57.82744282744283, + "grad_norm": 0.2546789050102234, + "learning_rate": 1.0602387328415888e-05, + "loss": 0.0919, + "num_input_tokens_seen": 19958232, + "step": 27815 + }, + { + "epoch": 57.83783783783784, + "grad_norm": 0.23403304815292358, + "learning_rate": 1.0594362484667347e-05, + "loss": 0.0769, + "num_input_tokens_seen": 19961912, + "step": 27820 + }, + { + "epoch": 57.84823284823285, + "grad_norm": 0.6020116209983826, + "learning_rate": 1.0586339862449132e-05, + "loss": 0.0884, + "num_input_tokens_seen": 19965752, + "step": 27825 + }, + { + "epoch": 57.858627858627855, + "grad_norm": 0.8696747422218323, + "learning_rate": 1.0578319462998445e-05, + "loss": 0.0927, + "num_input_tokens_seen": 19969336, + "step": 27830 + }, + { + "epoch": 57.86902286902287, + "grad_norm": 0.6892639398574829, + "learning_rate": 1.057030128755214e-05, + "loss": 0.1278, + "num_input_tokens_seen": 19972920, + "step": 27835 + }, + { + "epoch": 57.87941787941788, + "grad_norm": 0.3772810399532318, + "learning_rate": 1.0562285337346703e-05, + "loss": 0.0844, + "num_input_tokens_seen": 19976536, + "step": 27840 + }, + { + "epoch": 57.88981288981289, + "grad_norm": 0.42482346296310425, + "learning_rate": 1.0554271613618308e-05, + "loss": 0.0976, + "num_input_tokens_seen": 19980024, + "step": 27845 + }, + { + "epoch": 57.9002079002079, + "grad_norm": 0.2059994488954544, + "learning_rate": 1.054626011760276e-05, + "loss": 0.105, + "num_input_tokens_seen": 19983832, + "step": 27850 + }, + { + "epoch": 57.91060291060291, + "grad_norm": 0.3500617742538452, + "learning_rate": 1.0538250850535549e-05, + "loss": 0.1045, + "num_input_tokens_seen": 19987384, + "step": 27855 + }, + { + "epoch": 57.92099792099792, + "grad_norm": 0.4673311710357666, + "learning_rate": 1.0530243813651794e-05, + "loss": 0.106, + "num_input_tokens_seen": 19990872, + "step": 27860 + }, + { + "epoch": 57.931392931392935, + "grad_norm": 0.4459138810634613, + "learning_rate": 1.0522239008186271e-05, + "loss": 0.1149, + "num_input_tokens_seen": 19994424, + "step": 27865 + }, + { + "epoch": 57.94178794178794, + "grad_norm": 0.3849537670612335, + "learning_rate": 1.0514236435373434e-05, + "loss": 0.0854, + "num_input_tokens_seen": 19997976, + "step": 27870 + }, + { + "epoch": 57.95218295218295, + "grad_norm": 0.24970562756061554, + "learning_rate": 1.0506236096447386e-05, + "loss": 0.118, + "num_input_tokens_seen": 20001528, + "step": 27875 + }, + { + "epoch": 57.96257796257796, + "grad_norm": 0.14877524971961975, + "learning_rate": 1.049823799264186e-05, + "loss": 0.0871, + "num_input_tokens_seen": 20005208, + "step": 27880 + }, + { + "epoch": 57.972972972972975, + "grad_norm": 0.3423635959625244, + "learning_rate": 1.049024212519028e-05, + "loss": 0.092, + "num_input_tokens_seen": 20008792, + "step": 27885 + }, + { + "epoch": 57.983367983367984, + "grad_norm": 0.5090219974517822, + "learning_rate": 1.0482248495325713e-05, + "loss": 0.087, + "num_input_tokens_seen": 20012280, + "step": 27890 + }, + { + "epoch": 57.99376299376299, + "grad_norm": 0.3545365035533905, + "learning_rate": 1.047425710428086e-05, + "loss": 0.0776, + "num_input_tokens_seen": 20015832, + "step": 27895 + }, + { + "epoch": 58.00415800415801, + "grad_norm": 0.18955104053020477, + "learning_rate": 1.0466267953288114e-05, + "loss": 0.0888, + "num_input_tokens_seen": 20019184, + "step": 27900 + }, + { + "epoch": 58.014553014553016, + "grad_norm": 0.21819645166397095, + "learning_rate": 1.0458281043579482e-05, + "loss": 0.075, + "num_input_tokens_seen": 20022928, + "step": 27905 + }, + { + "epoch": 58.024948024948024, + "grad_norm": 0.42108669877052307, + "learning_rate": 1.0450296376386657e-05, + "loss": 0.1222, + "num_input_tokens_seen": 20026512, + "step": 27910 + }, + { + "epoch": 58.03534303534303, + "grad_norm": 0.6488506197929382, + "learning_rate": 1.044231395294098e-05, + "loss": 0.0688, + "num_input_tokens_seen": 20030128, + "step": 27915 + }, + { + "epoch": 58.04573804573805, + "grad_norm": 0.26560571789741516, + "learning_rate": 1.0434333774473435e-05, + "loss": 0.0874, + "num_input_tokens_seen": 20033616, + "step": 27920 + }, + { + "epoch": 58.056133056133056, + "grad_norm": 0.29597634077072144, + "learning_rate": 1.0426355842214657e-05, + "loss": 0.1431, + "num_input_tokens_seen": 20037392, + "step": 27925 + }, + { + "epoch": 58.066528066528065, + "grad_norm": 0.5683363080024719, + "learning_rate": 1.0418380157394963e-05, + "loss": 0.1471, + "num_input_tokens_seen": 20040880, + "step": 27930 + }, + { + "epoch": 58.07692307692308, + "grad_norm": 0.7469246983528137, + "learning_rate": 1.0410406721244281e-05, + "loss": 0.0889, + "num_input_tokens_seen": 20044432, + "step": 27935 + }, + { + "epoch": 58.08731808731809, + "grad_norm": 0.28763532638549805, + "learning_rate": 1.0402435534992238e-05, + "loss": 0.1446, + "num_input_tokens_seen": 20048048, + "step": 27940 + }, + { + "epoch": 58.0977130977131, + "grad_norm": 0.7003993391990662, + "learning_rate": 1.0394466599868071e-05, + "loss": 0.1099, + "num_input_tokens_seen": 20051696, + "step": 27945 + }, + { + "epoch": 58.108108108108105, + "grad_norm": 0.2602629065513611, + "learning_rate": 1.0386499917100697e-05, + "loss": 0.1196, + "num_input_tokens_seen": 20055216, + "step": 27950 + }, + { + "epoch": 58.11850311850312, + "grad_norm": 0.850204348564148, + "learning_rate": 1.0378535487918692e-05, + "loss": 0.1113, + "num_input_tokens_seen": 20058800, + "step": 27955 + }, + { + "epoch": 58.12889812889813, + "grad_norm": 0.42032721638679504, + "learning_rate": 1.037057331355025e-05, + "loss": 0.0797, + "num_input_tokens_seen": 20062288, + "step": 27960 + }, + { + "epoch": 58.13929313929314, + "grad_norm": 0.2515554428100586, + "learning_rate": 1.0362613395223247e-05, + "loss": 0.0937, + "num_input_tokens_seen": 20065776, + "step": 27965 + }, + { + "epoch": 58.14968814968815, + "grad_norm": 0.2556575536727905, + "learning_rate": 1.0354655734165212e-05, + "loss": 0.0719, + "num_input_tokens_seen": 20069424, + "step": 27970 + }, + { + "epoch": 58.16008316008316, + "grad_norm": 0.5144910216331482, + "learning_rate": 1.03467003316033e-05, + "loss": 0.0786, + "num_input_tokens_seen": 20072848, + "step": 27975 + }, + { + "epoch": 58.17047817047817, + "grad_norm": 0.5839266777038574, + "learning_rate": 1.033874718876435e-05, + "loss": 0.0732, + "num_input_tokens_seen": 20076592, + "step": 27980 + }, + { + "epoch": 58.18087318087318, + "grad_norm": 0.35235583782196045, + "learning_rate": 1.0330796306874818e-05, + "loss": 0.1193, + "num_input_tokens_seen": 20080144, + "step": 27985 + }, + { + "epoch": 58.19126819126819, + "grad_norm": 0.8943825960159302, + "learning_rate": 1.032284768716085e-05, + "loss": 0.119, + "num_input_tokens_seen": 20083696, + "step": 27990 + }, + { + "epoch": 58.2016632016632, + "grad_norm": 0.35261186957359314, + "learning_rate": 1.0314901330848206e-05, + "loss": 0.0832, + "num_input_tokens_seen": 20087248, + "step": 27995 + }, + { + "epoch": 58.21205821205821, + "grad_norm": 0.1771162748336792, + "learning_rate": 1.030695723916233e-05, + "loss": 0.0848, + "num_input_tokens_seen": 20090864, + "step": 28000 + }, + { + "epoch": 58.21205821205821, + "eval_loss": 0.14920523762702942, + "eval_runtime": 7.7472, + "eval_samples_per_second": 110.491, + "eval_steps_per_second": 27.623, + "num_input_tokens_seen": 20090864, + "step": 28000 + }, + { + "epoch": 58.222453222453225, + "grad_norm": 0.2668056786060333, + "learning_rate": 1.0299015413328289e-05, + "loss": 0.0744, + "num_input_tokens_seen": 20094384, + "step": 28005 + }, + { + "epoch": 58.232848232848234, + "grad_norm": 0.675990641117096, + "learning_rate": 1.0291075854570809e-05, + "loss": 0.0607, + "num_input_tokens_seen": 20097840, + "step": 28010 + }, + { + "epoch": 58.24324324324324, + "grad_norm": 0.3917974829673767, + "learning_rate": 1.0283138564114275e-05, + "loss": 0.1255, + "num_input_tokens_seen": 20101488, + "step": 28015 + }, + { + "epoch": 58.25363825363825, + "grad_norm": 0.35413485765457153, + "learning_rate": 1.027520354318273e-05, + "loss": 0.0865, + "num_input_tokens_seen": 20105008, + "step": 28020 + }, + { + "epoch": 58.264033264033266, + "grad_norm": 0.5403872132301331, + "learning_rate": 1.0267270792999828e-05, + "loss": 0.0725, + "num_input_tokens_seen": 20108688, + "step": 28025 + }, + { + "epoch": 58.274428274428274, + "grad_norm": 0.133209228515625, + "learning_rate": 1.0259340314788919e-05, + "loss": 0.077, + "num_input_tokens_seen": 20112272, + "step": 28030 + }, + { + "epoch": 58.28482328482328, + "grad_norm": 0.23654869198799133, + "learning_rate": 1.0251412109772979e-05, + "loss": 0.1075, + "num_input_tokens_seen": 20115696, + "step": 28035 + }, + { + "epoch": 58.2952182952183, + "grad_norm": 0.49034085869789124, + "learning_rate": 1.0243486179174627e-05, + "loss": 0.1076, + "num_input_tokens_seen": 20119408, + "step": 28040 + }, + { + "epoch": 58.305613305613306, + "grad_norm": 0.1406560242176056, + "learning_rate": 1.0235562524216158e-05, + "loss": 0.0766, + "num_input_tokens_seen": 20123056, + "step": 28045 + }, + { + "epoch": 58.316008316008315, + "grad_norm": 0.4440496563911438, + "learning_rate": 1.022764114611948e-05, + "loss": 0.0806, + "num_input_tokens_seen": 20126704, + "step": 28050 + }, + { + "epoch": 58.32640332640332, + "grad_norm": 0.3490053713321686, + "learning_rate": 1.0219722046106178e-05, + "loss": 0.1163, + "num_input_tokens_seen": 20130352, + "step": 28055 + }, + { + "epoch": 58.33679833679834, + "grad_norm": 0.19129815697669983, + "learning_rate": 1.0211805225397486e-05, + "loss": 0.0676, + "num_input_tokens_seen": 20133840, + "step": 28060 + }, + { + "epoch": 58.34719334719335, + "grad_norm": 0.31546446681022644, + "learning_rate": 1.020389068521426e-05, + "loss": 0.0856, + "num_input_tokens_seen": 20137360, + "step": 28065 + }, + { + "epoch": 58.357588357588355, + "grad_norm": 0.27153944969177246, + "learning_rate": 1.0195978426777039e-05, + "loss": 0.0861, + "num_input_tokens_seen": 20140880, + "step": 28070 + }, + { + "epoch": 58.36798336798337, + "grad_norm": 0.7556456327438354, + "learning_rate": 1.0188068451305982e-05, + "loss": 0.0814, + "num_input_tokens_seen": 20144368, + "step": 28075 + }, + { + "epoch": 58.37837837837838, + "grad_norm": 0.32711997628211975, + "learning_rate": 1.0180160760020902e-05, + "loss": 0.0666, + "num_input_tokens_seen": 20148048, + "step": 28080 + }, + { + "epoch": 58.38877338877339, + "grad_norm": 0.33156904578208923, + "learning_rate": 1.0172255354141278e-05, + "loss": 0.1151, + "num_input_tokens_seen": 20151696, + "step": 28085 + }, + { + "epoch": 58.3991683991684, + "grad_norm": 0.914862334728241, + "learning_rate": 1.0164352234886205e-05, + "loss": 0.0855, + "num_input_tokens_seen": 20155184, + "step": 28090 + }, + { + "epoch": 58.40956340956341, + "grad_norm": 0.34230461716651917, + "learning_rate": 1.0156451403474454e-05, + "loss": 0.0922, + "num_input_tokens_seen": 20158704, + "step": 28095 + }, + { + "epoch": 58.41995841995842, + "grad_norm": 0.43213364481925964, + "learning_rate": 1.0148552861124443e-05, + "loss": 0.0918, + "num_input_tokens_seen": 20162192, + "step": 28100 + }, + { + "epoch": 58.43035343035343, + "grad_norm": 0.48236075043678284, + "learning_rate": 1.0140656609054205e-05, + "loss": 0.0844, + "num_input_tokens_seen": 20165776, + "step": 28105 + }, + { + "epoch": 58.44074844074844, + "grad_norm": 0.4513307213783264, + "learning_rate": 1.0132762648481455e-05, + "loss": 0.108, + "num_input_tokens_seen": 20169456, + "step": 28110 + }, + { + "epoch": 58.45114345114345, + "grad_norm": 0.4381714463233948, + "learning_rate": 1.0124870980623543e-05, + "loss": 0.1028, + "num_input_tokens_seen": 20173008, + "step": 28115 + }, + { + "epoch": 58.46153846153846, + "grad_norm": 0.5351822376251221, + "learning_rate": 1.0116981606697453e-05, + "loss": 0.134, + "num_input_tokens_seen": 20176624, + "step": 28120 + }, + { + "epoch": 58.471933471933475, + "grad_norm": 0.5490631461143494, + "learning_rate": 1.0109094527919838e-05, + "loss": 0.1557, + "num_input_tokens_seen": 20180240, + "step": 28125 + }, + { + "epoch": 58.482328482328484, + "grad_norm": 0.37051984667778015, + "learning_rate": 1.010120974550697e-05, + "loss": 0.0954, + "num_input_tokens_seen": 20184048, + "step": 28130 + }, + { + "epoch": 58.49272349272349, + "grad_norm": 0.37642985582351685, + "learning_rate": 1.0093327260674795e-05, + "loss": 0.0716, + "num_input_tokens_seen": 20187632, + "step": 28135 + }, + { + "epoch": 58.5031185031185, + "grad_norm": 0.3242715001106262, + "learning_rate": 1.0085447074638878e-05, + "loss": 0.0843, + "num_input_tokens_seen": 20191184, + "step": 28140 + }, + { + "epoch": 58.513513513513516, + "grad_norm": 0.4912489056587219, + "learning_rate": 1.0077569188614461e-05, + "loss": 0.1027, + "num_input_tokens_seen": 20194928, + "step": 28145 + }, + { + "epoch": 58.523908523908524, + "grad_norm": 0.21558260917663574, + "learning_rate": 1.0069693603816393e-05, + "loss": 0.1079, + "num_input_tokens_seen": 20198512, + "step": 28150 + }, + { + "epoch": 58.53430353430353, + "grad_norm": 0.09944023936986923, + "learning_rate": 1.0061820321459204e-05, + "loss": 0.0809, + "num_input_tokens_seen": 20201904, + "step": 28155 + }, + { + "epoch": 58.54469854469855, + "grad_norm": 0.20899903774261475, + "learning_rate": 1.0053949342757038e-05, + "loss": 0.0811, + "num_input_tokens_seen": 20205360, + "step": 28160 + }, + { + "epoch": 58.555093555093556, + "grad_norm": 0.35712137818336487, + "learning_rate": 1.0046080668923717e-05, + "loss": 0.082, + "num_input_tokens_seen": 20208976, + "step": 28165 + }, + { + "epoch": 58.565488565488565, + "grad_norm": 0.3019368648529053, + "learning_rate": 1.003821430117267e-05, + "loss": 0.0726, + "num_input_tokens_seen": 20212560, + "step": 28170 + }, + { + "epoch": 58.57588357588357, + "grad_norm": 0.36989033222198486, + "learning_rate": 1.0030350240716999e-05, + "loss": 0.1229, + "num_input_tokens_seen": 20216336, + "step": 28175 + }, + { + "epoch": 58.58627858627859, + "grad_norm": 0.5740208625793457, + "learning_rate": 1.0022488488769449e-05, + "loss": 0.1046, + "num_input_tokens_seen": 20219696, + "step": 28180 + }, + { + "epoch": 58.5966735966736, + "grad_norm": 0.42158210277557373, + "learning_rate": 1.0014629046542387e-05, + "loss": 0.1231, + "num_input_tokens_seen": 20223440, + "step": 28185 + }, + { + "epoch": 58.607068607068605, + "grad_norm": 0.45937493443489075, + "learning_rate": 1.0006771915247842e-05, + "loss": 0.1132, + "num_input_tokens_seen": 20227056, + "step": 28190 + }, + { + "epoch": 58.61746361746362, + "grad_norm": 0.2574765682220459, + "learning_rate": 9.998917096097495e-06, + "loss": 0.114, + "num_input_tokens_seen": 20230672, + "step": 28195 + }, + { + "epoch": 58.62785862785863, + "grad_norm": 0.16189225018024445, + "learning_rate": 9.991064590302638e-06, + "loss": 0.0713, + "num_input_tokens_seen": 20234160, + "step": 28200 + }, + { + "epoch": 58.62785862785863, + "eval_loss": 0.1444818526506424, + "eval_runtime": 7.7409, + "eval_samples_per_second": 110.582, + "eval_steps_per_second": 27.645, + "num_input_tokens_seen": 20234160, + "step": 28200 + }, + { + "epoch": 58.63825363825364, + "grad_norm": 0.34269803762435913, + "learning_rate": 9.983214399074241e-06, + "loss": 0.0705, + "num_input_tokens_seen": 20237712, + "step": 28205 + }, + { + "epoch": 58.648648648648646, + "grad_norm": 0.24718596041202545, + "learning_rate": 9.975366523622893e-06, + "loss": 0.1137, + "num_input_tokens_seen": 20241488, + "step": 28210 + }, + { + "epoch": 58.65904365904366, + "grad_norm": 0.28592339158058167, + "learning_rate": 9.967520965158841e-06, + "loss": 0.0786, + "num_input_tokens_seen": 20245008, + "step": 28215 + }, + { + "epoch": 58.66943866943867, + "grad_norm": 0.29751890897750854, + "learning_rate": 9.95967772489197e-06, + "loss": 0.1046, + "num_input_tokens_seen": 20248656, + "step": 28220 + }, + { + "epoch": 58.67983367983368, + "grad_norm": 0.4392874538898468, + "learning_rate": 9.951836804031794e-06, + "loss": 0.1069, + "num_input_tokens_seen": 20252208, + "step": 28225 + }, + { + "epoch": 58.69022869022869, + "grad_norm": 0.3234981596469879, + "learning_rate": 9.943998203787489e-06, + "loss": 0.1122, + "num_input_tokens_seen": 20255888, + "step": 28230 + }, + { + "epoch": 58.7006237006237, + "grad_norm": 0.31904417276382446, + "learning_rate": 9.936161925367874e-06, + "loss": 0.1137, + "num_input_tokens_seen": 20259472, + "step": 28235 + }, + { + "epoch": 58.71101871101871, + "grad_norm": 0.3056570291519165, + "learning_rate": 9.928327969981386e-06, + "loss": 0.1623, + "num_input_tokens_seen": 20263152, + "step": 28240 + }, + { + "epoch": 58.72141372141372, + "grad_norm": 0.6642711758613586, + "learning_rate": 9.920496338836135e-06, + "loss": 0.0786, + "num_input_tokens_seen": 20266640, + "step": 28245 + }, + { + "epoch": 58.731808731808734, + "grad_norm": 0.21090568602085114, + "learning_rate": 9.912667033139844e-06, + "loss": 0.0839, + "num_input_tokens_seen": 20270224, + "step": 28250 + }, + { + "epoch": 58.74220374220374, + "grad_norm": 0.7637282609939575, + "learning_rate": 9.904840054099893e-06, + "loss": 0.093, + "num_input_tokens_seen": 20273776, + "step": 28255 + }, + { + "epoch": 58.75259875259875, + "grad_norm": 0.4189085364341736, + "learning_rate": 9.897015402923312e-06, + "loss": 0.0934, + "num_input_tokens_seen": 20277232, + "step": 28260 + }, + { + "epoch": 58.762993762993766, + "grad_norm": 0.2672889828681946, + "learning_rate": 9.889193080816744e-06, + "loss": 0.106, + "num_input_tokens_seen": 20280880, + "step": 28265 + }, + { + "epoch": 58.773388773388774, + "grad_norm": 0.3183135688304901, + "learning_rate": 9.881373088986498e-06, + "loss": 0.0727, + "num_input_tokens_seen": 20284560, + "step": 28270 + }, + { + "epoch": 58.78378378378378, + "grad_norm": 0.30153799057006836, + "learning_rate": 9.873555428638523e-06, + "loss": 0.1125, + "num_input_tokens_seen": 20288048, + "step": 28275 + }, + { + "epoch": 58.79417879417879, + "grad_norm": 0.26062726974487305, + "learning_rate": 9.865740100978383e-06, + "loss": 0.148, + "num_input_tokens_seen": 20291824, + "step": 28280 + }, + { + "epoch": 58.804573804573806, + "grad_norm": 0.30893152952194214, + "learning_rate": 9.857927107211315e-06, + "loss": 0.105, + "num_input_tokens_seen": 20295504, + "step": 28285 + }, + { + "epoch": 58.814968814968815, + "grad_norm": 0.34006285667419434, + "learning_rate": 9.850116448542177e-06, + "loss": 0.1212, + "num_input_tokens_seen": 20299088, + "step": 28290 + }, + { + "epoch": 58.82536382536382, + "grad_norm": 0.6462916731834412, + "learning_rate": 9.842308126175457e-06, + "loss": 0.13, + "num_input_tokens_seen": 20302608, + "step": 28295 + }, + { + "epoch": 58.83575883575884, + "grad_norm": 0.24858351051807404, + "learning_rate": 9.834502141315315e-06, + "loss": 0.1013, + "num_input_tokens_seen": 20306160, + "step": 28300 + }, + { + "epoch": 58.84615384615385, + "grad_norm": 0.3036775588989258, + "learning_rate": 9.82669849516552e-06, + "loss": 0.1724, + "num_input_tokens_seen": 20309744, + "step": 28305 + }, + { + "epoch": 58.856548856548855, + "grad_norm": 0.18416990339756012, + "learning_rate": 9.818897188929493e-06, + "loss": 0.1095, + "num_input_tokens_seen": 20313424, + "step": 28310 + }, + { + "epoch": 58.86694386694387, + "grad_norm": 0.21064649522304535, + "learning_rate": 9.811098223810309e-06, + "loss": 0.1076, + "num_input_tokens_seen": 20316848, + "step": 28315 + }, + { + "epoch": 58.87733887733888, + "grad_norm": 0.4637925624847412, + "learning_rate": 9.803301601010641e-06, + "loss": 0.1111, + "num_input_tokens_seen": 20320496, + "step": 28320 + }, + { + "epoch": 58.88773388773389, + "grad_norm": 0.23302039504051208, + "learning_rate": 9.795507321732853e-06, + "loss": 0.1038, + "num_input_tokens_seen": 20324208, + "step": 28325 + }, + { + "epoch": 58.898128898128896, + "grad_norm": 0.1621675342321396, + "learning_rate": 9.787715387178898e-06, + "loss": 0.0758, + "num_input_tokens_seen": 20327856, + "step": 28330 + }, + { + "epoch": 58.90852390852391, + "grad_norm": 0.19474264979362488, + "learning_rate": 9.779925798550399e-06, + "loss": 0.1274, + "num_input_tokens_seen": 20331504, + "step": 28335 + }, + { + "epoch": 58.91891891891892, + "grad_norm": 0.2888123095035553, + "learning_rate": 9.772138557048619e-06, + "loss": 0.0982, + "num_input_tokens_seen": 20335120, + "step": 28340 + }, + { + "epoch": 58.92931392931393, + "grad_norm": 0.5270482897758484, + "learning_rate": 9.764353663874426e-06, + "loss": 0.0998, + "num_input_tokens_seen": 20338768, + "step": 28345 + }, + { + "epoch": 58.93970893970894, + "grad_norm": 0.15282534062862396, + "learning_rate": 9.756571120228375e-06, + "loss": 0.0574, + "num_input_tokens_seen": 20342224, + "step": 28350 + }, + { + "epoch": 58.95010395010395, + "grad_norm": 0.46955713629722595, + "learning_rate": 9.748790927310605e-06, + "loss": 0.0978, + "num_input_tokens_seen": 20345744, + "step": 28355 + }, + { + "epoch": 58.96049896049896, + "grad_norm": 0.495496928691864, + "learning_rate": 9.741013086320946e-06, + "loss": 0.0915, + "num_input_tokens_seen": 20349456, + "step": 28360 + }, + { + "epoch": 58.97089397089397, + "grad_norm": 0.18462204933166504, + "learning_rate": 9.733237598458821e-06, + "loss": 0.0812, + "num_input_tokens_seen": 20353072, + "step": 28365 + }, + { + "epoch": 58.981288981288984, + "grad_norm": 0.31288018822669983, + "learning_rate": 9.725464464923308e-06, + "loss": 0.099, + "num_input_tokens_seen": 20356752, + "step": 28370 + }, + { + "epoch": 58.99168399168399, + "grad_norm": 0.3464248478412628, + "learning_rate": 9.717693686913123e-06, + "loss": 0.078, + "num_input_tokens_seen": 20360176, + "step": 28375 + }, + { + "epoch": 59.002079002079, + "grad_norm": 0.6716737151145935, + "learning_rate": 9.709925265626632e-06, + "loss": 0.1362, + "num_input_tokens_seen": 20363720, + "step": 28380 + }, + { + "epoch": 59.012474012474016, + "grad_norm": 0.8004176616668701, + "learning_rate": 9.702159202261801e-06, + "loss": 0.136, + "num_input_tokens_seen": 20367368, + "step": 28385 + }, + { + "epoch": 59.022869022869024, + "grad_norm": 0.2573763132095337, + "learning_rate": 9.694395498016268e-06, + "loss": 0.0945, + "num_input_tokens_seen": 20370984, + "step": 28390 + }, + { + "epoch": 59.03326403326403, + "grad_norm": 0.35781654715538025, + "learning_rate": 9.686634154087298e-06, + "loss": 0.1052, + "num_input_tokens_seen": 20374568, + "step": 28395 + }, + { + "epoch": 59.04365904365904, + "grad_norm": 0.246760755777359, + "learning_rate": 9.678875171671776e-06, + "loss": 0.1056, + "num_input_tokens_seen": 20378152, + "step": 28400 + }, + { + "epoch": 59.04365904365904, + "eval_loss": 0.14727121591567993, + "eval_runtime": 7.7492, + "eval_samples_per_second": 110.463, + "eval_steps_per_second": 27.616, + "num_input_tokens_seen": 20378152, + "step": 28400 + }, + { + "epoch": 59.054054054054056, + "grad_norm": 0.3539092540740967, + "learning_rate": 9.671118551966246e-06, + "loss": 0.1289, + "num_input_tokens_seen": 20381768, + "step": 28405 + }, + { + "epoch": 59.064449064449065, + "grad_norm": 0.2572536766529083, + "learning_rate": 9.66336429616686e-06, + "loss": 0.0773, + "num_input_tokens_seen": 20385384, + "step": 28410 + }, + { + "epoch": 59.07484407484407, + "grad_norm": 0.4285736083984375, + "learning_rate": 9.655612405469436e-06, + "loss": 0.1204, + "num_input_tokens_seen": 20389256, + "step": 28415 + }, + { + "epoch": 59.08523908523909, + "grad_norm": 0.8515718579292297, + "learning_rate": 9.647862881069413e-06, + "loss": 0.0978, + "num_input_tokens_seen": 20392840, + "step": 28420 + }, + { + "epoch": 59.0956340956341, + "grad_norm": 0.22301068902015686, + "learning_rate": 9.640115724161855e-06, + "loss": 0.0669, + "num_input_tokens_seen": 20396424, + "step": 28425 + }, + { + "epoch": 59.106029106029105, + "grad_norm": 0.6347056031227112, + "learning_rate": 9.632370935941483e-06, + "loss": 0.0957, + "num_input_tokens_seen": 20400040, + "step": 28430 + }, + { + "epoch": 59.11642411642411, + "grad_norm": 0.7154555916786194, + "learning_rate": 9.624628517602634e-06, + "loss": 0.0846, + "num_input_tokens_seen": 20403560, + "step": 28435 + }, + { + "epoch": 59.12681912681913, + "grad_norm": 0.2254319041967392, + "learning_rate": 9.61688847033928e-06, + "loss": 0.082, + "num_input_tokens_seen": 20407016, + "step": 28440 + }, + { + "epoch": 59.13721413721414, + "grad_norm": 0.26598823070526123, + "learning_rate": 9.609150795345051e-06, + "loss": 0.0685, + "num_input_tokens_seen": 20410472, + "step": 28445 + }, + { + "epoch": 59.147609147609145, + "grad_norm": 0.5287303924560547, + "learning_rate": 9.601415493813171e-06, + "loss": 0.0704, + "num_input_tokens_seen": 20413960, + "step": 28450 + }, + { + "epoch": 59.15800415800416, + "grad_norm": 0.7343593239784241, + "learning_rate": 9.593682566936533e-06, + "loss": 0.1159, + "num_input_tokens_seen": 20417384, + "step": 28455 + }, + { + "epoch": 59.16839916839917, + "grad_norm": 0.1874227225780487, + "learning_rate": 9.58595201590766e-06, + "loss": 0.1163, + "num_input_tokens_seen": 20421000, + "step": 28460 + }, + { + "epoch": 59.17879417879418, + "grad_norm": 0.5498369336128235, + "learning_rate": 9.578223841918681e-06, + "loss": 0.1107, + "num_input_tokens_seen": 20424456, + "step": 28465 + }, + { + "epoch": 59.189189189189186, + "grad_norm": 0.1393674910068512, + "learning_rate": 9.570498046161389e-06, + "loss": 0.0844, + "num_input_tokens_seen": 20428136, + "step": 28470 + }, + { + "epoch": 59.1995841995842, + "grad_norm": 0.44658225774765015, + "learning_rate": 9.562774629827206e-06, + "loss": 0.0907, + "num_input_tokens_seen": 20431528, + "step": 28475 + }, + { + "epoch": 59.20997920997921, + "grad_norm": 0.395891934633255, + "learning_rate": 9.555053594107163e-06, + "loss": 0.0814, + "num_input_tokens_seen": 20435080, + "step": 28480 + }, + { + "epoch": 59.22037422037422, + "grad_norm": 0.7941494584083557, + "learning_rate": 9.547334940191957e-06, + "loss": 0.0949, + "num_input_tokens_seen": 20438632, + "step": 28485 + }, + { + "epoch": 59.23076923076923, + "grad_norm": 0.23545344173908234, + "learning_rate": 9.539618669271886e-06, + "loss": 0.092, + "num_input_tokens_seen": 20442312, + "step": 28490 + }, + { + "epoch": 59.24116424116424, + "grad_norm": 0.3967248201370239, + "learning_rate": 9.531904782536904e-06, + "loss": 0.1477, + "num_input_tokens_seen": 20445800, + "step": 28495 + }, + { + "epoch": 59.25155925155925, + "grad_norm": 0.3337422013282776, + "learning_rate": 9.524193281176597e-06, + "loss": 0.1175, + "num_input_tokens_seen": 20449512, + "step": 28500 + }, + { + "epoch": 59.26195426195426, + "grad_norm": 0.16198311746120453, + "learning_rate": 9.516484166380165e-06, + "loss": 0.0822, + "num_input_tokens_seen": 20453160, + "step": 28505 + }, + { + "epoch": 59.272349272349274, + "grad_norm": 0.392705500125885, + "learning_rate": 9.508777439336447e-06, + "loss": 0.0859, + "num_input_tokens_seen": 20456680, + "step": 28510 + }, + { + "epoch": 59.28274428274428, + "grad_norm": 0.23264838755130768, + "learning_rate": 9.50107310123393e-06, + "loss": 0.0811, + "num_input_tokens_seen": 20460232, + "step": 28515 + }, + { + "epoch": 59.29313929313929, + "grad_norm": 0.20364555716514587, + "learning_rate": 9.493371153260702e-06, + "loss": 0.1005, + "num_input_tokens_seen": 20463848, + "step": 28520 + }, + { + "epoch": 59.303534303534306, + "grad_norm": 0.4003064036369324, + "learning_rate": 9.485671596604523e-06, + "loss": 0.108, + "num_input_tokens_seen": 20467400, + "step": 28525 + }, + { + "epoch": 59.313929313929314, + "grad_norm": 0.20641762018203735, + "learning_rate": 9.477974432452738e-06, + "loss": 0.1009, + "num_input_tokens_seen": 20470984, + "step": 28530 + }, + { + "epoch": 59.32432432432432, + "grad_norm": 0.4258367419242859, + "learning_rate": 9.470279661992356e-06, + "loss": 0.1425, + "num_input_tokens_seen": 20474600, + "step": 28535 + }, + { + "epoch": 59.33471933471934, + "grad_norm": 0.554241955280304, + "learning_rate": 9.462587286410021e-06, + "loss": 0.085, + "num_input_tokens_seen": 20478184, + "step": 28540 + }, + { + "epoch": 59.34511434511435, + "grad_norm": 0.27428027987480164, + "learning_rate": 9.454897306891972e-06, + "loss": 0.0738, + "num_input_tokens_seen": 20481800, + "step": 28545 + }, + { + "epoch": 59.355509355509355, + "grad_norm": 0.1631852090358734, + "learning_rate": 9.44720972462411e-06, + "loss": 0.0819, + "num_input_tokens_seen": 20485320, + "step": 28550 + }, + { + "epoch": 59.36590436590436, + "grad_norm": 0.48763465881347656, + "learning_rate": 9.439524540791964e-06, + "loss": 0.135, + "num_input_tokens_seen": 20488936, + "step": 28555 + }, + { + "epoch": 59.37629937629938, + "grad_norm": 0.5260617733001709, + "learning_rate": 9.431841756580673e-06, + "loss": 0.0707, + "num_input_tokens_seen": 20492360, + "step": 28560 + }, + { + "epoch": 59.38669438669439, + "grad_norm": 0.39581218361854553, + "learning_rate": 9.42416137317503e-06, + "loss": 0.1245, + "num_input_tokens_seen": 20495912, + "step": 28565 + }, + { + "epoch": 59.397089397089395, + "grad_norm": 0.2728823721408844, + "learning_rate": 9.416483391759437e-06, + "loss": 0.0887, + "num_input_tokens_seen": 20499528, + "step": 28570 + }, + { + "epoch": 59.40748440748441, + "grad_norm": 0.34887298941612244, + "learning_rate": 9.408807813517945e-06, + "loss": 0.097, + "num_input_tokens_seen": 20503144, + "step": 28575 + }, + { + "epoch": 59.41787941787942, + "grad_norm": 0.7549943923950195, + "learning_rate": 9.401134639634221e-06, + "loss": 0.0741, + "num_input_tokens_seen": 20506664, + "step": 28580 + }, + { + "epoch": 59.42827442827443, + "grad_norm": 0.48479127883911133, + "learning_rate": 9.393463871291555e-06, + "loss": 0.0957, + "num_input_tokens_seen": 20510088, + "step": 28585 + }, + { + "epoch": 59.438669438669436, + "grad_norm": 0.4906701445579529, + "learning_rate": 9.385795509672881e-06, + "loss": 0.1046, + "num_input_tokens_seen": 20513608, + "step": 28590 + }, + { + "epoch": 59.44906444906445, + "grad_norm": 0.2644090950489044, + "learning_rate": 9.378129555960771e-06, + "loss": 0.1117, + "num_input_tokens_seen": 20517352, + "step": 28595 + }, + { + "epoch": 59.45945945945946, + "grad_norm": 0.253360390663147, + "learning_rate": 9.370466011337392e-06, + "loss": 0.0931, + "num_input_tokens_seen": 20521096, + "step": 28600 + }, + { + "epoch": 59.45945945945946, + "eval_loss": 0.14587536454200745, + "eval_runtime": 7.752, + "eval_samples_per_second": 110.423, + "eval_steps_per_second": 27.606, + "num_input_tokens_seen": 20521096, + "step": 28600 + }, + { + "epoch": 59.46985446985447, + "grad_norm": 0.5048637390136719, + "learning_rate": 9.362804876984573e-06, + "loss": 0.1336, + "num_input_tokens_seen": 20524712, + "step": 28605 + }, + { + "epoch": 59.48024948024948, + "grad_norm": 0.37770557403564453, + "learning_rate": 9.355146154083747e-06, + "loss": 0.1391, + "num_input_tokens_seen": 20528392, + "step": 28610 + }, + { + "epoch": 59.49064449064449, + "grad_norm": 0.29795414209365845, + "learning_rate": 9.347489843815987e-06, + "loss": 0.1029, + "num_input_tokens_seen": 20532008, + "step": 28615 + }, + { + "epoch": 59.5010395010395, + "grad_norm": 0.6926735043525696, + "learning_rate": 9.339835947362002e-06, + "loss": 0.1109, + "num_input_tokens_seen": 20535592, + "step": 28620 + }, + { + "epoch": 59.51143451143451, + "grad_norm": 0.23445576429367065, + "learning_rate": 9.332184465902105e-06, + "loss": 0.0718, + "num_input_tokens_seen": 20539368, + "step": 28625 + }, + { + "epoch": 59.521829521829524, + "grad_norm": 0.19444194436073303, + "learning_rate": 9.324535400616266e-06, + "loss": 0.0602, + "num_input_tokens_seen": 20542888, + "step": 28630 + }, + { + "epoch": 59.53222453222453, + "grad_norm": 0.29120635986328125, + "learning_rate": 9.31688875268405e-06, + "loss": 0.0876, + "num_input_tokens_seen": 20546472, + "step": 28635 + }, + { + "epoch": 59.54261954261954, + "grad_norm": 0.20014941692352295, + "learning_rate": 9.309244523284674e-06, + "loss": 0.1274, + "num_input_tokens_seen": 20550280, + "step": 28640 + }, + { + "epoch": 59.553014553014556, + "grad_norm": 0.24765563011169434, + "learning_rate": 9.301602713596982e-06, + "loss": 0.0967, + "num_input_tokens_seen": 20553736, + "step": 28645 + }, + { + "epoch": 59.563409563409564, + "grad_norm": 0.24495799839496613, + "learning_rate": 9.293963324799432e-06, + "loss": 0.0693, + "num_input_tokens_seen": 20557448, + "step": 28650 + }, + { + "epoch": 59.57380457380457, + "grad_norm": 0.21016213297843933, + "learning_rate": 9.286326358070104e-06, + "loss": 0.0891, + "num_input_tokens_seen": 20561160, + "step": 28655 + }, + { + "epoch": 59.58419958419958, + "grad_norm": 0.6252123713493347, + "learning_rate": 9.278691814586729e-06, + "loss": 0.0972, + "num_input_tokens_seen": 20564680, + "step": 28660 + }, + { + "epoch": 59.5945945945946, + "grad_norm": 0.4535446763038635, + "learning_rate": 9.271059695526635e-06, + "loss": 0.0757, + "num_input_tokens_seen": 20568136, + "step": 28665 + }, + { + "epoch": 59.604989604989605, + "grad_norm": 0.2857076823711395, + "learning_rate": 9.263430002066805e-06, + "loss": 0.0674, + "num_input_tokens_seen": 20571720, + "step": 28670 + }, + { + "epoch": 59.61538461538461, + "grad_norm": 0.6161196231842041, + "learning_rate": 9.25580273538382e-06, + "loss": 0.1354, + "num_input_tokens_seen": 20575304, + "step": 28675 + }, + { + "epoch": 59.62577962577963, + "grad_norm": 0.527831494808197, + "learning_rate": 9.248177896653907e-06, + "loss": 0.0829, + "num_input_tokens_seen": 20578664, + "step": 28680 + }, + { + "epoch": 59.63617463617464, + "grad_norm": 0.2916470468044281, + "learning_rate": 9.240555487052918e-06, + "loss": 0.07, + "num_input_tokens_seen": 20582280, + "step": 28685 + }, + { + "epoch": 59.646569646569645, + "grad_norm": 0.3868626356124878, + "learning_rate": 9.232935507756313e-06, + "loss": 0.0675, + "num_input_tokens_seen": 20585800, + "step": 28690 + }, + { + "epoch": 59.656964656964654, + "grad_norm": 0.3656567335128784, + "learning_rate": 9.225317959939193e-06, + "loss": 0.1419, + "num_input_tokens_seen": 20589384, + "step": 28695 + }, + { + "epoch": 59.66735966735967, + "grad_norm": 0.15318003296852112, + "learning_rate": 9.217702844776287e-06, + "loss": 0.075, + "num_input_tokens_seen": 20593128, + "step": 28700 + }, + { + "epoch": 59.67775467775468, + "grad_norm": 0.2122807800769806, + "learning_rate": 9.210090163441929e-06, + "loss": 0.0846, + "num_input_tokens_seen": 20596872, + "step": 28705 + }, + { + "epoch": 59.688149688149686, + "grad_norm": 0.39513862133026123, + "learning_rate": 9.202479917110105e-06, + "loss": 0.1488, + "num_input_tokens_seen": 20600520, + "step": 28710 + }, + { + "epoch": 59.6985446985447, + "grad_norm": 0.423727422952652, + "learning_rate": 9.194872106954392e-06, + "loss": 0.0917, + "num_input_tokens_seen": 20604168, + "step": 28715 + }, + { + "epoch": 59.70893970893971, + "grad_norm": 0.38413336873054504, + "learning_rate": 9.187266734148029e-06, + "loss": 0.0977, + "num_input_tokens_seen": 20607752, + "step": 28720 + }, + { + "epoch": 59.71933471933472, + "grad_norm": 0.15879254043102264, + "learning_rate": 9.179663799863849e-06, + "loss": 0.0881, + "num_input_tokens_seen": 20611368, + "step": 28725 + }, + { + "epoch": 59.729729729729726, + "grad_norm": 0.33007416129112244, + "learning_rate": 9.172063305274317e-06, + "loss": 0.0803, + "num_input_tokens_seen": 20614824, + "step": 28730 + }, + { + "epoch": 59.74012474012474, + "grad_norm": 0.3934159576892853, + "learning_rate": 9.164465251551527e-06, + "loss": 0.1018, + "num_input_tokens_seen": 20618248, + "step": 28735 + }, + { + "epoch": 59.75051975051975, + "grad_norm": 0.3682906925678253, + "learning_rate": 9.156869639867205e-06, + "loss": 0.0671, + "num_input_tokens_seen": 20621736, + "step": 28740 + }, + { + "epoch": 59.76091476091476, + "grad_norm": 0.3139001131057739, + "learning_rate": 9.149276471392677e-06, + "loss": 0.0896, + "num_input_tokens_seen": 20625384, + "step": 28745 + }, + { + "epoch": 59.771309771309774, + "grad_norm": 0.240043044090271, + "learning_rate": 9.141685747298914e-06, + "loss": 0.0815, + "num_input_tokens_seen": 20629064, + "step": 28750 + }, + { + "epoch": 59.78170478170478, + "grad_norm": 0.21802125871181488, + "learning_rate": 9.13409746875649e-06, + "loss": 0.0683, + "num_input_tokens_seen": 20632712, + "step": 28755 + }, + { + "epoch": 59.79209979209979, + "grad_norm": 0.24295072257518768, + "learning_rate": 9.12651163693562e-06, + "loss": 0.1195, + "num_input_tokens_seen": 20636424, + "step": 28760 + }, + { + "epoch": 59.802494802494806, + "grad_norm": 0.5262408256530762, + "learning_rate": 9.11892825300614e-06, + "loss": 0.1362, + "num_input_tokens_seen": 20640008, + "step": 28765 + }, + { + "epoch": 59.812889812889814, + "grad_norm": 0.32681435346603394, + "learning_rate": 9.111347318137491e-06, + "loss": 0.1149, + "num_input_tokens_seen": 20643624, + "step": 28770 + }, + { + "epoch": 59.82328482328482, + "grad_norm": 0.5452118515968323, + "learning_rate": 9.103768833498755e-06, + "loss": 0.0877, + "num_input_tokens_seen": 20647016, + "step": 28775 + }, + { + "epoch": 59.83367983367983, + "grad_norm": 0.3927593529224396, + "learning_rate": 9.096192800258639e-06, + "loss": 0.1527, + "num_input_tokens_seen": 20650600, + "step": 28780 + }, + { + "epoch": 59.84407484407485, + "grad_norm": 0.3968268632888794, + "learning_rate": 9.088619219585443e-06, + "loss": 0.1202, + "num_input_tokens_seen": 20654248, + "step": 28785 + }, + { + "epoch": 59.854469854469855, + "grad_norm": 0.17741483449935913, + "learning_rate": 9.081048092647127e-06, + "loss": 0.0755, + "num_input_tokens_seen": 20657800, + "step": 28790 + }, + { + "epoch": 59.86486486486486, + "grad_norm": 0.4784325361251831, + "learning_rate": 9.073479420611245e-06, + "loss": 0.0962, + "num_input_tokens_seen": 20661320, + "step": 28795 + }, + { + "epoch": 59.87525987525988, + "grad_norm": 0.4152628183364868, + "learning_rate": 9.065913204644974e-06, + "loss": 0.0841, + "num_input_tokens_seen": 20664744, + "step": 28800 + }, + { + "epoch": 59.87525987525988, + "eval_loss": 0.14579591155052185, + "eval_runtime": 7.7586, + "eval_samples_per_second": 110.329, + "eval_steps_per_second": 27.582, + "num_input_tokens_seen": 20664744, + "step": 28800 + }, + { + "epoch": 59.88565488565489, + "grad_norm": 0.749068558216095, + "learning_rate": 9.058349445915135e-06, + "loss": 0.1579, + "num_input_tokens_seen": 20668328, + "step": 28805 + }, + { + "epoch": 59.896049896049895, + "grad_norm": 0.2742447555065155, + "learning_rate": 9.050788145588138e-06, + "loss": 0.0945, + "num_input_tokens_seen": 20671880, + "step": 28810 + }, + { + "epoch": 59.906444906444904, + "grad_norm": 0.35365697741508484, + "learning_rate": 9.043229304830039e-06, + "loss": 0.09, + "num_input_tokens_seen": 20675368, + "step": 28815 + }, + { + "epoch": 59.91683991683992, + "grad_norm": 0.27971944212913513, + "learning_rate": 9.035672924806515e-06, + "loss": 0.1502, + "num_input_tokens_seen": 20679048, + "step": 28820 + }, + { + "epoch": 59.92723492723493, + "grad_norm": 0.47812172770500183, + "learning_rate": 9.028119006682839e-06, + "loss": 0.0775, + "num_input_tokens_seen": 20682600, + "step": 28825 + }, + { + "epoch": 59.937629937629936, + "grad_norm": 0.40554070472717285, + "learning_rate": 9.020567551623935e-06, + "loss": 0.1045, + "num_input_tokens_seen": 20686216, + "step": 28830 + }, + { + "epoch": 59.94802494802495, + "grad_norm": 0.49652099609375, + "learning_rate": 9.013018560794318e-06, + "loss": 0.1474, + "num_input_tokens_seen": 20689896, + "step": 28835 + }, + { + "epoch": 59.95841995841996, + "grad_norm": 0.2921479046344757, + "learning_rate": 9.005472035358139e-06, + "loss": 0.0728, + "num_input_tokens_seen": 20693512, + "step": 28840 + }, + { + "epoch": 59.96881496881497, + "grad_norm": 0.40649986267089844, + "learning_rate": 8.997927976479185e-06, + "loss": 0.0978, + "num_input_tokens_seen": 20697096, + "step": 28845 + }, + { + "epoch": 59.979209979209976, + "grad_norm": 0.6524130702018738, + "learning_rate": 8.99038638532082e-06, + "loss": 0.0709, + "num_input_tokens_seen": 20700904, + "step": 28850 + }, + { + "epoch": 59.98960498960499, + "grad_norm": 0.5394526720046997, + "learning_rate": 8.982847263046065e-06, + "loss": 0.1165, + "num_input_tokens_seen": 20704392, + "step": 28855 + }, + { + "epoch": 60.0, + "grad_norm": 0.2383957952260971, + "learning_rate": 8.975310610817555e-06, + "loss": 0.1166, + "num_input_tokens_seen": 20708032, + "step": 28860 + }, + { + "epoch": 60.01039501039501, + "grad_norm": 0.17646105587482452, + "learning_rate": 8.967776429797528e-06, + "loss": 0.1197, + "num_input_tokens_seen": 20711680, + "step": 28865 + }, + { + "epoch": 60.020790020790024, + "grad_norm": 0.4373851716518402, + "learning_rate": 8.960244721147842e-06, + "loss": 0.1051, + "num_input_tokens_seen": 20715136, + "step": 28870 + }, + { + "epoch": 60.03118503118503, + "grad_norm": 0.13095584511756897, + "learning_rate": 8.952715486029995e-06, + "loss": 0.0865, + "num_input_tokens_seen": 20718592, + "step": 28875 + }, + { + "epoch": 60.04158004158004, + "grad_norm": 0.31612128019332886, + "learning_rate": 8.945188725605075e-06, + "loss": 0.0846, + "num_input_tokens_seen": 20722240, + "step": 28880 + }, + { + "epoch": 60.05197505197505, + "grad_norm": 0.16077904403209686, + "learning_rate": 8.937664441033817e-06, + "loss": 0.0876, + "num_input_tokens_seen": 20725888, + "step": 28885 + }, + { + "epoch": 60.062370062370064, + "grad_norm": 0.37222006916999817, + "learning_rate": 8.930142633476549e-06, + "loss": 0.0866, + "num_input_tokens_seen": 20729472, + "step": 28890 + }, + { + "epoch": 60.07276507276507, + "grad_norm": 0.5636044144630432, + "learning_rate": 8.92262330409323e-06, + "loss": 0.1612, + "num_input_tokens_seen": 20733024, + "step": 28895 + }, + { + "epoch": 60.08316008316008, + "grad_norm": 0.2662052810192108, + "learning_rate": 8.915106454043448e-06, + "loss": 0.1011, + "num_input_tokens_seen": 20736512, + "step": 28900 + }, + { + "epoch": 60.093555093555096, + "grad_norm": 0.09003788232803345, + "learning_rate": 8.90759208448638e-06, + "loss": 0.098, + "num_input_tokens_seen": 20740128, + "step": 28905 + }, + { + "epoch": 60.103950103950105, + "grad_norm": 0.5266367197036743, + "learning_rate": 8.900080196580848e-06, + "loss": 0.1018, + "num_input_tokens_seen": 20743776, + "step": 28910 + }, + { + "epoch": 60.11434511434511, + "grad_norm": 0.5340911149978638, + "learning_rate": 8.892570791485267e-06, + "loss": 0.0901, + "num_input_tokens_seen": 20747392, + "step": 28915 + }, + { + "epoch": 60.12474012474012, + "grad_norm": 0.33885788917541504, + "learning_rate": 8.885063870357688e-06, + "loss": 0.09, + "num_input_tokens_seen": 20750944, + "step": 28920 + }, + { + "epoch": 60.13513513513514, + "grad_norm": 0.2501963675022125, + "learning_rate": 8.87755943435578e-06, + "loss": 0.062, + "num_input_tokens_seen": 20754592, + "step": 28925 + }, + { + "epoch": 60.145530145530145, + "grad_norm": 0.2551519572734833, + "learning_rate": 8.87005748463681e-06, + "loss": 0.067, + "num_input_tokens_seen": 20758240, + "step": 28930 + }, + { + "epoch": 60.15592515592515, + "grad_norm": 0.16382251679897308, + "learning_rate": 8.862558022357681e-06, + "loss": 0.1071, + "num_input_tokens_seen": 20761696, + "step": 28935 + }, + { + "epoch": 60.16632016632017, + "grad_norm": 0.38219472765922546, + "learning_rate": 8.855061048674903e-06, + "loss": 0.0942, + "num_input_tokens_seen": 20765344, + "step": 28940 + }, + { + "epoch": 60.17671517671518, + "grad_norm": 0.4749150574207306, + "learning_rate": 8.847566564744595e-06, + "loss": 0.1605, + "num_input_tokens_seen": 20768896, + "step": 28945 + }, + { + "epoch": 60.187110187110186, + "grad_norm": 0.48542267084121704, + "learning_rate": 8.840074571722512e-06, + "loss": 0.1243, + "num_input_tokens_seen": 20772480, + "step": 28950 + }, + { + "epoch": 60.197505197505194, + "grad_norm": 0.23302185535430908, + "learning_rate": 8.832585070764002e-06, + "loss": 0.1205, + "num_input_tokens_seen": 20776224, + "step": 28955 + }, + { + "epoch": 60.20790020790021, + "grad_norm": 0.7348693013191223, + "learning_rate": 8.825098063024045e-06, + "loss": 0.1083, + "num_input_tokens_seen": 20779840, + "step": 28960 + }, + { + "epoch": 60.21829521829522, + "grad_norm": 0.22721417248249054, + "learning_rate": 8.817613549657244e-06, + "loss": 0.0742, + "num_input_tokens_seen": 20783520, + "step": 28965 + }, + { + "epoch": 60.228690228690226, + "grad_norm": 0.16945481300354004, + "learning_rate": 8.810131531817783e-06, + "loss": 0.1606, + "num_input_tokens_seen": 20787072, + "step": 28970 + }, + { + "epoch": 60.23908523908524, + "grad_norm": 0.2827686667442322, + "learning_rate": 8.802652010659496e-06, + "loss": 0.0806, + "num_input_tokens_seen": 20790688, + "step": 28975 + }, + { + "epoch": 60.24948024948025, + "grad_norm": 0.3579750061035156, + "learning_rate": 8.795174987335827e-06, + "loss": 0.1129, + "num_input_tokens_seen": 20794272, + "step": 28980 + }, + { + "epoch": 60.25987525987526, + "grad_norm": 0.1970498114824295, + "learning_rate": 8.787700462999807e-06, + "loss": 0.116, + "num_input_tokens_seen": 20797856, + "step": 28985 + }, + { + "epoch": 60.270270270270274, + "grad_norm": 0.629378616809845, + "learning_rate": 8.780228438804122e-06, + "loss": 0.091, + "num_input_tokens_seen": 20801408, + "step": 28990 + }, + { + "epoch": 60.28066528066528, + "grad_norm": 0.2726498544216156, + "learning_rate": 8.772758915901032e-06, + "loss": 0.0685, + "num_input_tokens_seen": 20804896, + "step": 28995 + }, + { + "epoch": 60.29106029106029, + "grad_norm": 0.3019360303878784, + "learning_rate": 8.765291895442443e-06, + "loss": 0.1066, + "num_input_tokens_seen": 20808544, + "step": 29000 + }, + { + "epoch": 60.29106029106029, + "eval_loss": 0.14497844874858856, + "eval_runtime": 7.7548, + "eval_samples_per_second": 110.383, + "eval_steps_per_second": 27.596, + "num_input_tokens_seen": 20808544, + "step": 29000 + }, + { + "epoch": 60.3014553014553, + "grad_norm": 0.21355925500392914, + "learning_rate": 8.75782737857987e-06, + "loss": 0.1028, + "num_input_tokens_seen": 20812096, + "step": 29005 + }, + { + "epoch": 60.311850311850314, + "grad_norm": 0.1659429967403412, + "learning_rate": 8.750365366464425e-06, + "loss": 0.1033, + "num_input_tokens_seen": 20815808, + "step": 29010 + }, + { + "epoch": 60.32224532224532, + "grad_norm": 0.789692223072052, + "learning_rate": 8.742905860246838e-06, + "loss": 0.0595, + "num_input_tokens_seen": 20819360, + "step": 29015 + }, + { + "epoch": 60.33264033264033, + "grad_norm": 0.4614737927913666, + "learning_rate": 8.735448861077478e-06, + "loss": 0.1174, + "num_input_tokens_seen": 20822848, + "step": 29020 + }, + { + "epoch": 60.343035343035346, + "grad_norm": 0.4885205626487732, + "learning_rate": 8.727994370106288e-06, + "loss": 0.0942, + "num_input_tokens_seen": 20826112, + "step": 29025 + }, + { + "epoch": 60.353430353430355, + "grad_norm": 0.5253474116325378, + "learning_rate": 8.720542388482861e-06, + "loss": 0.1345, + "num_input_tokens_seen": 20829664, + "step": 29030 + }, + { + "epoch": 60.36382536382536, + "grad_norm": 0.13872244954109192, + "learning_rate": 8.71309291735637e-06, + "loss": 0.0887, + "num_input_tokens_seen": 20833280, + "step": 29035 + }, + { + "epoch": 60.37422037422037, + "grad_norm": 0.6807723641395569, + "learning_rate": 8.705645957875621e-06, + "loss": 0.1296, + "num_input_tokens_seen": 20836896, + "step": 29040 + }, + { + "epoch": 60.38461538461539, + "grad_norm": 0.19141389429569244, + "learning_rate": 8.698201511189048e-06, + "loss": 0.0773, + "num_input_tokens_seen": 20840544, + "step": 29045 + }, + { + "epoch": 60.395010395010395, + "grad_norm": 0.2167888730764389, + "learning_rate": 8.690759578444649e-06, + "loss": 0.0687, + "num_input_tokens_seen": 20844192, + "step": 29050 + }, + { + "epoch": 60.4054054054054, + "grad_norm": 0.2482570856809616, + "learning_rate": 8.68332016079008e-06, + "loss": 0.1268, + "num_input_tokens_seen": 20847872, + "step": 29055 + }, + { + "epoch": 60.41580041580042, + "grad_norm": 0.5441279411315918, + "learning_rate": 8.6758832593726e-06, + "loss": 0.099, + "num_input_tokens_seen": 20851424, + "step": 29060 + }, + { + "epoch": 60.42619542619543, + "grad_norm": 0.804174542427063, + "learning_rate": 8.668448875339053e-06, + "loss": 0.2183, + "num_input_tokens_seen": 20854944, + "step": 29065 + }, + { + "epoch": 60.436590436590436, + "grad_norm": 0.7633329629898071, + "learning_rate": 8.661017009835933e-06, + "loss": 0.0754, + "num_input_tokens_seen": 20858400, + "step": 29070 + }, + { + "epoch": 60.446985446985444, + "grad_norm": 0.4310770630836487, + "learning_rate": 8.653587664009311e-06, + "loss": 0.1337, + "num_input_tokens_seen": 20862048, + "step": 29075 + }, + { + "epoch": 60.45738045738046, + "grad_norm": 0.2924729287624359, + "learning_rate": 8.646160839004902e-06, + "loss": 0.0703, + "num_input_tokens_seen": 20865600, + "step": 29080 + }, + { + "epoch": 60.46777546777547, + "grad_norm": 0.437860906124115, + "learning_rate": 8.638736535967998e-06, + "loss": 0.1098, + "num_input_tokens_seen": 20869184, + "step": 29085 + }, + { + "epoch": 60.478170478170476, + "grad_norm": 0.24204730987548828, + "learning_rate": 8.631314756043535e-06, + "loss": 0.112, + "num_input_tokens_seen": 20872736, + "step": 29090 + }, + { + "epoch": 60.48856548856549, + "grad_norm": 0.22708682715892792, + "learning_rate": 8.62389550037603e-06, + "loss": 0.0772, + "num_input_tokens_seen": 20876352, + "step": 29095 + }, + { + "epoch": 60.4989604989605, + "grad_norm": 0.4884282052516937, + "learning_rate": 8.616478770109646e-06, + "loss": 0.0681, + "num_input_tokens_seen": 20879872, + "step": 29100 + }, + { + "epoch": 60.50935550935551, + "grad_norm": 0.15083734691143036, + "learning_rate": 8.609064566388111e-06, + "loss": 0.0786, + "num_input_tokens_seen": 20883552, + "step": 29105 + }, + { + "epoch": 60.51975051975052, + "grad_norm": 0.31436869502067566, + "learning_rate": 8.601652890354815e-06, + "loss": 0.0783, + "num_input_tokens_seen": 20887104, + "step": 29110 + }, + { + "epoch": 60.53014553014553, + "grad_norm": 0.4346787929534912, + "learning_rate": 8.594243743152705e-06, + "loss": 0.0802, + "num_input_tokens_seen": 20890720, + "step": 29115 + }, + { + "epoch": 60.54054054054054, + "grad_norm": 0.5228874683380127, + "learning_rate": 8.58683712592438e-06, + "loss": 0.1004, + "num_input_tokens_seen": 20894304, + "step": 29120 + }, + { + "epoch": 60.55093555093555, + "grad_norm": 0.5228676795959473, + "learning_rate": 8.579433039812037e-06, + "loss": 0.1161, + "num_input_tokens_seen": 20897920, + "step": 29125 + }, + { + "epoch": 60.561330561330564, + "grad_norm": 0.27483081817626953, + "learning_rate": 8.572031485957466e-06, + "loss": 0.0897, + "num_input_tokens_seen": 20901504, + "step": 29130 + }, + { + "epoch": 60.57172557172557, + "grad_norm": 0.11755423247814178, + "learning_rate": 8.564632465502084e-06, + "loss": 0.0758, + "num_input_tokens_seen": 20905312, + "step": 29135 + }, + { + "epoch": 60.58212058212058, + "grad_norm": 0.44243112206459045, + "learning_rate": 8.557235979586928e-06, + "loss": 0.0794, + "num_input_tokens_seen": 20908736, + "step": 29140 + }, + { + "epoch": 60.59251559251559, + "grad_norm": 0.22510744631290436, + "learning_rate": 8.549842029352606e-06, + "loss": 0.0959, + "num_input_tokens_seen": 20912320, + "step": 29145 + }, + { + "epoch": 60.602910602910605, + "grad_norm": 0.18366490304470062, + "learning_rate": 8.542450615939376e-06, + "loss": 0.1182, + "num_input_tokens_seen": 20915968, + "step": 29150 + }, + { + "epoch": 60.61330561330561, + "grad_norm": 0.3031461238861084, + "learning_rate": 8.535061740487082e-06, + "loss": 0.1074, + "num_input_tokens_seen": 20919584, + "step": 29155 + }, + { + "epoch": 60.62370062370062, + "grad_norm": 0.1848813146352768, + "learning_rate": 8.527675404135168e-06, + "loss": 0.0802, + "num_input_tokens_seen": 20923264, + "step": 29160 + }, + { + "epoch": 60.63409563409564, + "grad_norm": 0.3066861927509308, + "learning_rate": 8.520291608022724e-06, + "loss": 0.0988, + "num_input_tokens_seen": 20926848, + "step": 29165 + }, + { + "epoch": 60.644490644490645, + "grad_norm": 0.31105443835258484, + "learning_rate": 8.512910353288398e-06, + "loss": 0.0868, + "num_input_tokens_seen": 20930400, + "step": 29170 + }, + { + "epoch": 60.65488565488565, + "grad_norm": 0.5183464288711548, + "learning_rate": 8.505531641070486e-06, + "loss": 0.0723, + "num_input_tokens_seen": 20934016, + "step": 29175 + }, + { + "epoch": 60.66528066528066, + "grad_norm": 0.22952374815940857, + "learning_rate": 8.498155472506885e-06, + "loss": 0.0661, + "num_input_tokens_seen": 20937568, + "step": 29180 + }, + { + "epoch": 60.67567567567568, + "grad_norm": 0.15272709727287292, + "learning_rate": 8.49078184873508e-06, + "loss": 0.0678, + "num_input_tokens_seen": 20941152, + "step": 29185 + }, + { + "epoch": 60.686070686070686, + "grad_norm": 0.48880425095558167, + "learning_rate": 8.483410770892188e-06, + "loss": 0.1131, + "num_input_tokens_seen": 20944704, + "step": 29190 + }, + { + "epoch": 60.696465696465694, + "grad_norm": 0.6067901849746704, + "learning_rate": 8.476042240114909e-06, + "loss": 0.0834, + "num_input_tokens_seen": 20948352, + "step": 29195 + }, + { + "epoch": 60.70686070686071, + "grad_norm": 0.5927879810333252, + "learning_rate": 8.468676257539568e-06, + "loss": 0.0863, + "num_input_tokens_seen": 20952064, + "step": 29200 + }, + { + "epoch": 60.70686070686071, + "eval_loss": 0.14335104823112488, + "eval_runtime": 7.7458, + "eval_samples_per_second": 110.512, + "eval_steps_per_second": 27.628, + "num_input_tokens_seen": 20952064, + "step": 29200 + }, + { + "epoch": 60.71725571725572, + "grad_norm": 0.9012395739555359, + "learning_rate": 8.4613128243021e-06, + "loss": 0.1068, + "num_input_tokens_seen": 20955744, + "step": 29205 + }, + { + "epoch": 60.727650727650726, + "grad_norm": 0.3551384210586548, + "learning_rate": 8.453951941538028e-06, + "loss": 0.1118, + "num_input_tokens_seen": 20959360, + "step": 29210 + }, + { + "epoch": 60.73804573804574, + "grad_norm": 0.6783225536346436, + "learning_rate": 8.446593610382495e-06, + "loss": 0.1118, + "num_input_tokens_seen": 20962880, + "step": 29215 + }, + { + "epoch": 60.74844074844075, + "grad_norm": 0.3850801885128021, + "learning_rate": 8.439237831970259e-06, + "loss": 0.0959, + "num_input_tokens_seen": 20966208, + "step": 29220 + }, + { + "epoch": 60.75883575883576, + "grad_norm": 0.426845908164978, + "learning_rate": 8.431884607435667e-06, + "loss": 0.0961, + "num_input_tokens_seen": 20969984, + "step": 29225 + }, + { + "epoch": 60.76923076923077, + "grad_norm": 0.28509286046028137, + "learning_rate": 8.424533937912665e-06, + "loss": 0.0651, + "num_input_tokens_seen": 20973568, + "step": 29230 + }, + { + "epoch": 60.77962577962578, + "grad_norm": 0.5644401907920837, + "learning_rate": 8.41718582453484e-06, + "loss": 0.1001, + "num_input_tokens_seen": 20977152, + "step": 29235 + }, + { + "epoch": 60.79002079002079, + "grad_norm": 0.20297272503376007, + "learning_rate": 8.409840268435346e-06, + "loss": 0.0937, + "num_input_tokens_seen": 20980864, + "step": 29240 + }, + { + "epoch": 60.8004158004158, + "grad_norm": 0.21104247868061066, + "learning_rate": 8.402497270746976e-06, + "loss": 0.0769, + "num_input_tokens_seen": 20984480, + "step": 29245 + }, + { + "epoch": 60.810810810810814, + "grad_norm": 0.4726376235485077, + "learning_rate": 8.395156832602095e-06, + "loss": 0.1091, + "num_input_tokens_seen": 20988096, + "step": 29250 + }, + { + "epoch": 60.82120582120582, + "grad_norm": 0.3258678615093231, + "learning_rate": 8.387818955132707e-06, + "loss": 0.0992, + "num_input_tokens_seen": 20991776, + "step": 29255 + }, + { + "epoch": 60.83160083160083, + "grad_norm": 0.23731747269630432, + "learning_rate": 8.38048363947039e-06, + "loss": 0.071, + "num_input_tokens_seen": 20995360, + "step": 29260 + }, + { + "epoch": 60.84199584199584, + "grad_norm": 0.17279604077339172, + "learning_rate": 8.373150886746351e-06, + "loss": 0.0817, + "num_input_tokens_seen": 20998944, + "step": 29265 + }, + { + "epoch": 60.852390852390855, + "grad_norm": 0.17682383954524994, + "learning_rate": 8.365820698091397e-06, + "loss": 0.1137, + "num_input_tokens_seen": 21002560, + "step": 29270 + }, + { + "epoch": 60.86278586278586, + "grad_norm": 0.7227201461791992, + "learning_rate": 8.358493074635922e-06, + "loss": 0.1403, + "num_input_tokens_seen": 21006208, + "step": 29275 + }, + { + "epoch": 60.87318087318087, + "grad_norm": 0.5530303716659546, + "learning_rate": 8.351168017509948e-06, + "loss": 0.0984, + "num_input_tokens_seen": 21009888, + "step": 29280 + }, + { + "epoch": 60.88357588357589, + "grad_norm": 0.3103157877922058, + "learning_rate": 8.343845527843094e-06, + "loss": 0.1446, + "num_input_tokens_seen": 21013440, + "step": 29285 + }, + { + "epoch": 60.893970893970895, + "grad_norm": 0.6725053787231445, + "learning_rate": 8.336525606764566e-06, + "loss": 0.0856, + "num_input_tokens_seen": 21017024, + "step": 29290 + }, + { + "epoch": 60.9043659043659, + "grad_norm": 0.472800612449646, + "learning_rate": 8.329208255403204e-06, + "loss": 0.0755, + "num_input_tokens_seen": 21020864, + "step": 29295 + }, + { + "epoch": 60.91476091476091, + "grad_norm": 0.7760940194129944, + "learning_rate": 8.321893474887426e-06, + "loss": 0.1012, + "num_input_tokens_seen": 21024416, + "step": 29300 + }, + { + "epoch": 60.92515592515593, + "grad_norm": 0.32815003395080566, + "learning_rate": 8.31458126634526e-06, + "loss": 0.104, + "num_input_tokens_seen": 21027808, + "step": 29305 + }, + { + "epoch": 60.935550935550935, + "grad_norm": 0.3878173530101776, + "learning_rate": 8.30727163090435e-06, + "loss": 0.091, + "num_input_tokens_seen": 21031392, + "step": 29310 + }, + { + "epoch": 60.945945945945944, + "grad_norm": 0.6396980285644531, + "learning_rate": 8.29996456969192e-06, + "loss": 0.1024, + "num_input_tokens_seen": 21034784, + "step": 29315 + }, + { + "epoch": 60.95634095634096, + "grad_norm": 0.21699823439121246, + "learning_rate": 8.292660083834818e-06, + "loss": 0.0851, + "num_input_tokens_seen": 21038336, + "step": 29320 + }, + { + "epoch": 60.96673596673597, + "grad_norm": 0.2577188313007355, + "learning_rate": 8.2853581744595e-06, + "loss": 0.0848, + "num_input_tokens_seen": 21042112, + "step": 29325 + }, + { + "epoch": 60.977130977130976, + "grad_norm": 0.7206210494041443, + "learning_rate": 8.278058842691991e-06, + "loss": 0.1371, + "num_input_tokens_seen": 21045536, + "step": 29330 + }, + { + "epoch": 60.987525987525984, + "grad_norm": 0.5821517705917358, + "learning_rate": 8.27076208965796e-06, + "loss": 0.114, + "num_input_tokens_seen": 21049088, + "step": 29335 + }, + { + "epoch": 60.997920997921, + "grad_norm": 0.5774378180503845, + "learning_rate": 8.263467916482637e-06, + "loss": 0.1086, + "num_input_tokens_seen": 21052704, + "step": 29340 + }, + { + "epoch": 61.00831600831601, + "grad_norm": 0.38700979948043823, + "learning_rate": 8.256176324290885e-06, + "loss": 0.0654, + "num_input_tokens_seen": 21056240, + "step": 29345 + }, + { + "epoch": 61.018711018711016, + "grad_norm": 0.3596930503845215, + "learning_rate": 8.248887314207168e-06, + "loss": 0.1509, + "num_input_tokens_seen": 21059856, + "step": 29350 + }, + { + "epoch": 61.02910602910603, + "grad_norm": 0.5911754369735718, + "learning_rate": 8.24160088735553e-06, + "loss": 0.1254, + "num_input_tokens_seen": 21063344, + "step": 29355 + }, + { + "epoch": 61.03950103950104, + "grad_norm": 0.39094266295433044, + "learning_rate": 8.234317044859629e-06, + "loss": 0.094, + "num_input_tokens_seen": 21066960, + "step": 29360 + }, + { + "epoch": 61.04989604989605, + "grad_norm": 0.22677475214004517, + "learning_rate": 8.227035787842744e-06, + "loss": 0.0979, + "num_input_tokens_seen": 21070576, + "step": 29365 + }, + { + "epoch": 61.06029106029106, + "grad_norm": 0.28975480794906616, + "learning_rate": 8.219757117427721e-06, + "loss": 0.0798, + "num_input_tokens_seen": 21074096, + "step": 29370 + }, + { + "epoch": 61.07068607068607, + "grad_norm": 0.1861444115638733, + "learning_rate": 8.212481034737014e-06, + "loss": 0.099, + "num_input_tokens_seen": 21077584, + "step": 29375 + }, + { + "epoch": 61.08108108108108, + "grad_norm": 0.2881191670894623, + "learning_rate": 8.205207540892707e-06, + "loss": 0.1131, + "num_input_tokens_seen": 21081104, + "step": 29380 + }, + { + "epoch": 61.09147609147609, + "grad_norm": 0.23176899552345276, + "learning_rate": 8.197936637016442e-06, + "loss": 0.1138, + "num_input_tokens_seen": 21084752, + "step": 29385 + }, + { + "epoch": 61.101871101871104, + "grad_norm": 0.4262154698371887, + "learning_rate": 8.190668324229508e-06, + "loss": 0.0882, + "num_input_tokens_seen": 21088304, + "step": 29390 + }, + { + "epoch": 61.11226611226611, + "grad_norm": 0.4962862730026245, + "learning_rate": 8.183402603652749e-06, + "loss": 0.1214, + "num_input_tokens_seen": 21091984, + "step": 29395 + }, + { + "epoch": 61.12266112266112, + "grad_norm": 0.9773277044296265, + "learning_rate": 8.176139476406635e-06, + "loss": 0.1233, + "num_input_tokens_seen": 21095536, + "step": 29400 + }, + { + "epoch": 61.12266112266112, + "eval_loss": 0.14702700078487396, + "eval_runtime": 7.7485, + "eval_samples_per_second": 110.472, + "eval_steps_per_second": 27.618, + "num_input_tokens_seen": 21095536, + "step": 29400 + }, + { + "epoch": 61.13305613305613, + "grad_norm": 0.5169813632965088, + "learning_rate": 8.16887894361125e-06, + "loss": 0.1405, + "num_input_tokens_seen": 21099152, + "step": 29405 + }, + { + "epoch": 61.143451143451145, + "grad_norm": 0.17783832550048828, + "learning_rate": 8.161621006386233e-06, + "loss": 0.0938, + "num_input_tokens_seen": 21102896, + "step": 29410 + }, + { + "epoch": 61.15384615384615, + "grad_norm": 0.3919041156768799, + "learning_rate": 8.154365665850869e-06, + "loss": 0.0679, + "num_input_tokens_seen": 21106480, + "step": 29415 + }, + { + "epoch": 61.16424116424116, + "grad_norm": 0.60132896900177, + "learning_rate": 8.147112923124005e-06, + "loss": 0.0896, + "num_input_tokens_seen": 21110064, + "step": 29420 + }, + { + "epoch": 61.17463617463618, + "grad_norm": 0.2821182906627655, + "learning_rate": 8.13986277932412e-06, + "loss": 0.1075, + "num_input_tokens_seen": 21113712, + "step": 29425 + }, + { + "epoch": 61.185031185031185, + "grad_norm": 0.2957124412059784, + "learning_rate": 8.132615235569277e-06, + "loss": 0.0936, + "num_input_tokens_seen": 21117360, + "step": 29430 + }, + { + "epoch": 61.195426195426194, + "grad_norm": 0.3318851590156555, + "learning_rate": 8.125370292977124e-06, + "loss": 0.0767, + "num_input_tokens_seen": 21120784, + "step": 29435 + }, + { + "epoch": 61.20582120582121, + "grad_norm": 0.2523352801799774, + "learning_rate": 8.118127952664944e-06, + "loss": 0.1036, + "num_input_tokens_seen": 21124240, + "step": 29440 + }, + { + "epoch": 61.21621621621622, + "grad_norm": 0.4909975528717041, + "learning_rate": 8.110888215749574e-06, + "loss": 0.115, + "num_input_tokens_seen": 21127728, + "step": 29445 + }, + { + "epoch": 61.226611226611226, + "grad_norm": 0.5990318655967712, + "learning_rate": 8.10365108334749e-06, + "loss": 0.1881, + "num_input_tokens_seen": 21131312, + "step": 29450 + }, + { + "epoch": 61.237006237006234, + "grad_norm": 0.1650487333536148, + "learning_rate": 8.096416556574743e-06, + "loss": 0.0816, + "num_input_tokens_seen": 21135024, + "step": 29455 + }, + { + "epoch": 61.24740124740125, + "grad_norm": 0.1801476627588272, + "learning_rate": 8.08918463654698e-06, + "loss": 0.0938, + "num_input_tokens_seen": 21138608, + "step": 29460 + }, + { + "epoch": 61.25779625779626, + "grad_norm": 0.39674127101898193, + "learning_rate": 8.081955324379458e-06, + "loss": 0.0837, + "num_input_tokens_seen": 21142192, + "step": 29465 + }, + { + "epoch": 61.268191268191266, + "grad_norm": 0.2521074414253235, + "learning_rate": 8.074728621187039e-06, + "loss": 0.0579, + "num_input_tokens_seen": 21145840, + "step": 29470 + }, + { + "epoch": 61.27858627858628, + "grad_norm": 0.19448518753051758, + "learning_rate": 8.067504528084158e-06, + "loss": 0.0654, + "num_input_tokens_seen": 21149552, + "step": 29475 + }, + { + "epoch": 61.28898128898129, + "grad_norm": 0.7854188084602356, + "learning_rate": 8.060283046184861e-06, + "loss": 0.1108, + "num_input_tokens_seen": 21153200, + "step": 29480 + }, + { + "epoch": 61.2993762993763, + "grad_norm": 0.36780044436454773, + "learning_rate": 8.053064176602806e-06, + "loss": 0.0826, + "num_input_tokens_seen": 21156784, + "step": 29485 + }, + { + "epoch": 61.30977130977131, + "grad_norm": 0.5289041996002197, + "learning_rate": 8.045847920451216e-06, + "loss": 0.0885, + "num_input_tokens_seen": 21160304, + "step": 29490 + }, + { + "epoch": 61.32016632016632, + "grad_norm": 0.20177115499973297, + "learning_rate": 8.038634278842944e-06, + "loss": 0.1168, + "num_input_tokens_seen": 21163984, + "step": 29495 + }, + { + "epoch": 61.33056133056133, + "grad_norm": 0.66547691822052, + "learning_rate": 8.031423252890408e-06, + "loss": 0.1016, + "num_input_tokens_seen": 21167632, + "step": 29500 + }, + { + "epoch": 61.34095634095634, + "grad_norm": 0.4583335220813751, + "learning_rate": 8.024214843705646e-06, + "loss": 0.0778, + "num_input_tokens_seen": 21171088, + "step": 29505 + }, + { + "epoch": 61.351351351351354, + "grad_norm": 0.34719663858413696, + "learning_rate": 8.017009052400295e-06, + "loss": 0.1438, + "num_input_tokens_seen": 21174768, + "step": 29510 + }, + { + "epoch": 61.36174636174636, + "grad_norm": 0.4565470218658447, + "learning_rate": 8.00980588008557e-06, + "loss": 0.1252, + "num_input_tokens_seen": 21178320, + "step": 29515 + }, + { + "epoch": 61.37214137214137, + "grad_norm": 0.5055521726608276, + "learning_rate": 8.002605327872282e-06, + "loss": 0.1087, + "num_input_tokens_seen": 21182160, + "step": 29520 + }, + { + "epoch": 61.38253638253638, + "grad_norm": 0.8913677930831909, + "learning_rate": 7.995407396870862e-06, + "loss": 0.0837, + "num_input_tokens_seen": 21185584, + "step": 29525 + }, + { + "epoch": 61.392931392931395, + "grad_norm": 0.6857107877731323, + "learning_rate": 7.988212088191307e-06, + "loss": 0.1202, + "num_input_tokens_seen": 21189264, + "step": 29530 + }, + { + "epoch": 61.4033264033264, + "grad_norm": 0.2119128257036209, + "learning_rate": 7.98101940294324e-06, + "loss": 0.0823, + "num_input_tokens_seen": 21192752, + "step": 29535 + }, + { + "epoch": 61.41372141372141, + "grad_norm": 0.5168678164482117, + "learning_rate": 7.973829342235847e-06, + "loss": 0.1065, + "num_input_tokens_seen": 21196240, + "step": 29540 + }, + { + "epoch": 61.42411642411643, + "grad_norm": 0.44698360562324524, + "learning_rate": 7.966641907177936e-06, + "loss": 0.0956, + "num_input_tokens_seen": 21199792, + "step": 29545 + }, + { + "epoch": 61.434511434511435, + "grad_norm": 0.3678644001483917, + "learning_rate": 7.959457098877901e-06, + "loss": 0.073, + "num_input_tokens_seen": 21203248, + "step": 29550 + }, + { + "epoch": 61.444906444906444, + "grad_norm": 0.4528106451034546, + "learning_rate": 7.952274918443719e-06, + "loss": 0.1018, + "num_input_tokens_seen": 21206864, + "step": 29555 + }, + { + "epoch": 61.45530145530145, + "grad_norm": 0.670876681804657, + "learning_rate": 7.945095366982983e-06, + "loss": 0.091, + "num_input_tokens_seen": 21210512, + "step": 29560 + }, + { + "epoch": 61.46569646569647, + "grad_norm": 0.3483893573284149, + "learning_rate": 7.937918445602871e-06, + "loss": 0.0823, + "num_input_tokens_seen": 21214160, + "step": 29565 + }, + { + "epoch": 61.476091476091476, + "grad_norm": 0.49786192178726196, + "learning_rate": 7.930744155410145e-06, + "loss": 0.0951, + "num_input_tokens_seen": 21217616, + "step": 29570 + }, + { + "epoch": 61.486486486486484, + "grad_norm": 0.48345887660980225, + "learning_rate": 7.923572497511181e-06, + "loss": 0.1018, + "num_input_tokens_seen": 21221200, + "step": 29575 + }, + { + "epoch": 61.4968814968815, + "grad_norm": 0.7340675592422485, + "learning_rate": 7.916403473011927e-06, + "loss": 0.1243, + "num_input_tokens_seen": 21224880, + "step": 29580 + }, + { + "epoch": 61.50727650727651, + "grad_norm": 0.6534602642059326, + "learning_rate": 7.909237083017953e-06, + "loss": 0.1243, + "num_input_tokens_seen": 21228464, + "step": 29585 + }, + { + "epoch": 61.517671517671516, + "grad_norm": 0.2597734034061432, + "learning_rate": 7.902073328634389e-06, + "loss": 0.0767, + "num_input_tokens_seen": 21232080, + "step": 29590 + }, + { + "epoch": 61.528066528066525, + "grad_norm": 0.4583474099636078, + "learning_rate": 7.894912210965987e-06, + "loss": 0.0751, + "num_input_tokens_seen": 21235728, + "step": 29595 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 1.2191187143325806, + "learning_rate": 7.887753731117075e-06, + "loss": 0.1196, + "num_input_tokens_seen": 21239216, + "step": 29600 + }, + { + "epoch": 61.53846153846154, + "eval_loss": 0.14368727803230286, + "eval_runtime": 7.76, + "eval_samples_per_second": 110.31, + "eval_steps_per_second": 27.577, + "num_input_tokens_seen": 21239216, + "step": 29600 + }, + { + "epoch": 61.54885654885655, + "grad_norm": 0.29472145438194275, + "learning_rate": 7.880597890191587e-06, + "loss": 0.0503, + "num_input_tokens_seen": 21242608, + "step": 29605 + }, + { + "epoch": 61.55925155925156, + "grad_norm": 0.7453294396400452, + "learning_rate": 7.873444689293036e-06, + "loss": 0.1205, + "num_input_tokens_seen": 21246256, + "step": 29610 + }, + { + "epoch": 61.56964656964657, + "grad_norm": 0.39794766902923584, + "learning_rate": 7.866294129524548e-06, + "loss": 0.1129, + "num_input_tokens_seen": 21249904, + "step": 29615 + }, + { + "epoch": 61.58004158004158, + "grad_norm": 0.4990961253643036, + "learning_rate": 7.859146211988811e-06, + "loss": 0.1057, + "num_input_tokens_seen": 21253424, + "step": 29620 + }, + { + "epoch": 61.59043659043659, + "grad_norm": 0.6060383319854736, + "learning_rate": 7.852000937788134e-06, + "loss": 0.1164, + "num_input_tokens_seen": 21257040, + "step": 29625 + }, + { + "epoch": 61.6008316008316, + "grad_norm": 0.3472544252872467, + "learning_rate": 7.844858308024416e-06, + "loss": 0.0978, + "num_input_tokens_seen": 21260496, + "step": 29630 + }, + { + "epoch": 61.61122661122661, + "grad_norm": 0.18219788372516632, + "learning_rate": 7.837718323799122e-06, + "loss": 0.1214, + "num_input_tokens_seen": 21264080, + "step": 29635 + }, + { + "epoch": 61.62162162162162, + "grad_norm": 0.29993873834609985, + "learning_rate": 7.83058098621334e-06, + "loss": 0.087, + "num_input_tokens_seen": 21267728, + "step": 29640 + }, + { + "epoch": 61.63201663201663, + "grad_norm": 0.34165680408477783, + "learning_rate": 7.823446296367739e-06, + "loss": 0.0641, + "num_input_tokens_seen": 21271344, + "step": 29645 + }, + { + "epoch": 61.642411642411645, + "grad_norm": 0.35337257385253906, + "learning_rate": 7.81631425536257e-06, + "loss": 0.1142, + "num_input_tokens_seen": 21274864, + "step": 29650 + }, + { + "epoch": 61.65280665280665, + "grad_norm": 0.4934418797492981, + "learning_rate": 7.809184864297689e-06, + "loss": 0.0931, + "num_input_tokens_seen": 21278448, + "step": 29655 + }, + { + "epoch": 61.66320166320166, + "grad_norm": 0.41056832671165466, + "learning_rate": 7.802058124272532e-06, + "loss": 0.0867, + "num_input_tokens_seen": 21281872, + "step": 29660 + }, + { + "epoch": 61.67359667359668, + "grad_norm": 0.39806991815567017, + "learning_rate": 7.79493403638614e-06, + "loss": 0.113, + "num_input_tokens_seen": 21285456, + "step": 29665 + }, + { + "epoch": 61.683991683991685, + "grad_norm": 0.30097782611846924, + "learning_rate": 7.787812601737132e-06, + "loss": 0.0553, + "num_input_tokens_seen": 21288944, + "step": 29670 + }, + { + "epoch": 61.694386694386694, + "grad_norm": 0.21595461666584015, + "learning_rate": 7.780693821423715e-06, + "loss": 0.0811, + "num_input_tokens_seen": 21292592, + "step": 29675 + }, + { + "epoch": 61.7047817047817, + "grad_norm": 0.38876357674598694, + "learning_rate": 7.773577696543705e-06, + "loss": 0.0899, + "num_input_tokens_seen": 21296208, + "step": 29680 + }, + { + "epoch": 61.71517671517672, + "grad_norm": 0.23947925865650177, + "learning_rate": 7.7664642281945e-06, + "loss": 0.0712, + "num_input_tokens_seen": 21299792, + "step": 29685 + }, + { + "epoch": 61.725571725571726, + "grad_norm": 0.46568575501441956, + "learning_rate": 7.759353417473072e-06, + "loss": 0.0999, + "num_input_tokens_seen": 21303408, + "step": 29690 + }, + { + "epoch": 61.735966735966734, + "grad_norm": 0.29096299409866333, + "learning_rate": 7.752245265476016e-06, + "loss": 0.0858, + "num_input_tokens_seen": 21306864, + "step": 29695 + }, + { + "epoch": 61.74636174636175, + "grad_norm": 0.7422049045562744, + "learning_rate": 7.745139773299481e-06, + "loss": 0.0898, + "num_input_tokens_seen": 21310384, + "step": 29700 + }, + { + "epoch": 61.75675675675676, + "grad_norm": 0.2541833221912384, + "learning_rate": 7.738036942039232e-06, + "loss": 0.0624, + "num_input_tokens_seen": 21314032, + "step": 29705 + }, + { + "epoch": 61.767151767151766, + "grad_norm": 0.4245069622993469, + "learning_rate": 7.73093677279062e-06, + "loss": 0.1321, + "num_input_tokens_seen": 21317712, + "step": 29710 + }, + { + "epoch": 61.777546777546775, + "grad_norm": 0.2716270983219147, + "learning_rate": 7.72383926664857e-06, + "loss": 0.0776, + "num_input_tokens_seen": 21321232, + "step": 29715 + }, + { + "epoch": 61.78794178794179, + "grad_norm": 0.23219925165176392, + "learning_rate": 7.716744424707606e-06, + "loss": 0.0767, + "num_input_tokens_seen": 21324848, + "step": 29720 + }, + { + "epoch": 61.7983367983368, + "grad_norm": 0.16447094082832336, + "learning_rate": 7.709652248061858e-06, + "loss": 0.0741, + "num_input_tokens_seen": 21328400, + "step": 29725 + }, + { + "epoch": 61.80873180873181, + "grad_norm": 0.2582675516605377, + "learning_rate": 7.702562737805017e-06, + "loss": 0.0908, + "num_input_tokens_seen": 21332208, + "step": 29730 + }, + { + "epoch": 61.81912681912682, + "grad_norm": 0.29227396845817566, + "learning_rate": 7.695475895030365e-06, + "loss": 0.0965, + "num_input_tokens_seen": 21335760, + "step": 29735 + }, + { + "epoch": 61.82952182952183, + "grad_norm": 0.3639666438102722, + "learning_rate": 7.6883917208308e-06, + "loss": 0.114, + "num_input_tokens_seen": 21339472, + "step": 29740 + }, + { + "epoch": 61.83991683991684, + "grad_norm": 0.1557289958000183, + "learning_rate": 7.681310216298778e-06, + "loss": 0.1222, + "num_input_tokens_seen": 21342960, + "step": 29745 + }, + { + "epoch": 61.85031185031185, + "grad_norm": 0.44421836733818054, + "learning_rate": 7.674231382526367e-06, + "loss": 0.1385, + "num_input_tokens_seen": 21346608, + "step": 29750 + }, + { + "epoch": 61.86070686070686, + "grad_norm": 0.38872185349464417, + "learning_rate": 7.667155220605198e-06, + "loss": 0.1003, + "num_input_tokens_seen": 21350064, + "step": 29755 + }, + { + "epoch": 61.87110187110187, + "grad_norm": 0.13828006386756897, + "learning_rate": 7.660081731626515e-06, + "loss": 0.1145, + "num_input_tokens_seen": 21353872, + "step": 29760 + }, + { + "epoch": 61.88149688149688, + "grad_norm": 0.21674661338329315, + "learning_rate": 7.653010916681141e-06, + "loss": 0.1115, + "num_input_tokens_seen": 21357456, + "step": 29765 + }, + { + "epoch": 61.891891891891895, + "grad_norm": 0.30576378107070923, + "learning_rate": 7.645942776859472e-06, + "loss": 0.0731, + "num_input_tokens_seen": 21360880, + "step": 29770 + }, + { + "epoch": 61.9022869022869, + "grad_norm": 0.17596928775310516, + "learning_rate": 7.63887731325152e-06, + "loss": 0.0737, + "num_input_tokens_seen": 21364400, + "step": 29775 + }, + { + "epoch": 61.91268191268191, + "grad_norm": 0.27015525102615356, + "learning_rate": 7.63181452694685e-06, + "loss": 0.0565, + "num_input_tokens_seen": 21368048, + "step": 29780 + }, + { + "epoch": 61.92307692307692, + "grad_norm": 0.43337082862854004, + "learning_rate": 7.624754419034644e-06, + "loss": 0.0892, + "num_input_tokens_seen": 21371568, + "step": 29785 + }, + { + "epoch": 61.933471933471935, + "grad_norm": 0.3579196035861969, + "learning_rate": 7.6176969906036645e-06, + "loss": 0.1192, + "num_input_tokens_seen": 21375280, + "step": 29790 + }, + { + "epoch": 61.943866943866944, + "grad_norm": 0.4317478537559509, + "learning_rate": 7.610642242742242e-06, + "loss": 0.1148, + "num_input_tokens_seen": 21378960, + "step": 29795 + }, + { + "epoch": 61.95426195426195, + "grad_norm": 0.38485458493232727, + "learning_rate": 7.603590176538322e-06, + "loss": 0.0911, + "num_input_tokens_seen": 21382704, + "step": 29800 + }, + { + "epoch": 61.95426195426195, + "eval_loss": 0.1447734236717224, + "eval_runtime": 7.7643, + "eval_samples_per_second": 110.248, + "eval_steps_per_second": 27.562, + "num_input_tokens_seen": 21382704, + "step": 29800 + }, + { + "epoch": 61.96465696465697, + "grad_norm": 0.27605339884757996, + "learning_rate": 7.596540793079404e-06, + "loss": 0.0833, + "num_input_tokens_seen": 21386128, + "step": 29805 + }, + { + "epoch": 61.975051975051976, + "grad_norm": 0.14270302653312683, + "learning_rate": 7.5894940934526125e-06, + "loss": 0.1096, + "num_input_tokens_seen": 21389808, + "step": 29810 + }, + { + "epoch": 61.985446985446984, + "grad_norm": 0.5408069491386414, + "learning_rate": 7.582450078744621e-06, + "loss": 0.0798, + "num_input_tokens_seen": 21393360, + "step": 29815 + }, + { + "epoch": 61.99584199584199, + "grad_norm": 0.5287277698516846, + "learning_rate": 7.575408750041707e-06, + "loss": 0.1662, + "num_input_tokens_seen": 21397008, + "step": 29820 + }, + { + "epoch": 62.00623700623701, + "grad_norm": 0.26828533411026, + "learning_rate": 7.568370108429732e-06, + "loss": 0.0749, + "num_input_tokens_seen": 21400728, + "step": 29825 + }, + { + "epoch": 62.016632016632016, + "grad_norm": 0.4641673266887665, + "learning_rate": 7.561334154994154e-06, + "loss": 0.1033, + "num_input_tokens_seen": 21404376, + "step": 29830 + }, + { + "epoch": 62.027027027027025, + "grad_norm": 0.39788150787353516, + "learning_rate": 7.55430089081999e-06, + "loss": 0.0671, + "num_input_tokens_seen": 21407896, + "step": 29835 + }, + { + "epoch": 62.03742203742204, + "grad_norm": 0.1976262629032135, + "learning_rate": 7.547270316991864e-06, + "loss": 0.0721, + "num_input_tokens_seen": 21411736, + "step": 29840 + }, + { + "epoch": 62.04781704781705, + "grad_norm": 0.40271157026290894, + "learning_rate": 7.5402424345939884e-06, + "loss": 0.1135, + "num_input_tokens_seen": 21415384, + "step": 29845 + }, + { + "epoch": 62.05821205821206, + "grad_norm": 0.3247649073600769, + "learning_rate": 7.533217244710133e-06, + "loss": 0.1865, + "num_input_tokens_seen": 21419000, + "step": 29850 + }, + { + "epoch": 62.06860706860707, + "grad_norm": 0.30751267075538635, + "learning_rate": 7.52619474842369e-06, + "loss": 0.0601, + "num_input_tokens_seen": 21422616, + "step": 29855 + }, + { + "epoch": 62.07900207900208, + "grad_norm": 0.4817673861980438, + "learning_rate": 7.519174946817597e-06, + "loss": 0.0999, + "num_input_tokens_seen": 21426264, + "step": 29860 + }, + { + "epoch": 62.08939708939709, + "grad_norm": 0.29595693945884705, + "learning_rate": 7.512157840974407e-06, + "loss": 0.1474, + "num_input_tokens_seen": 21429912, + "step": 29865 + }, + { + "epoch": 62.0997920997921, + "grad_norm": 0.503731906414032, + "learning_rate": 7.5051434319762496e-06, + "loss": 0.1099, + "num_input_tokens_seen": 21433464, + "step": 29870 + }, + { + "epoch": 62.11018711018711, + "grad_norm": 0.7117611765861511, + "learning_rate": 7.498131720904822e-06, + "loss": 0.1188, + "num_input_tokens_seen": 21437048, + "step": 29875 + }, + { + "epoch": 62.12058212058212, + "grad_norm": 0.5302498936653137, + "learning_rate": 7.491122708841433e-06, + "loss": 0.0953, + "num_input_tokens_seen": 21440568, + "step": 29880 + }, + { + "epoch": 62.13097713097713, + "grad_norm": 0.24497413635253906, + "learning_rate": 7.4841163968669524e-06, + "loss": 0.0947, + "num_input_tokens_seen": 21444024, + "step": 29885 + }, + { + "epoch": 62.141372141372145, + "grad_norm": 0.45229703187942505, + "learning_rate": 7.4771127860618355e-06, + "loss": 0.1255, + "num_input_tokens_seen": 21447640, + "step": 29890 + }, + { + "epoch": 62.15176715176715, + "grad_norm": 0.5349270701408386, + "learning_rate": 7.470111877506139e-06, + "loss": 0.0863, + "num_input_tokens_seen": 21451224, + "step": 29895 + }, + { + "epoch": 62.16216216216216, + "grad_norm": 0.19701361656188965, + "learning_rate": 7.463113672279479e-06, + "loss": 0.084, + "num_input_tokens_seen": 21454712, + "step": 29900 + }, + { + "epoch": 62.17255717255717, + "grad_norm": 0.3397116959095001, + "learning_rate": 7.456118171461071e-06, + "loss": 0.0843, + "num_input_tokens_seen": 21458392, + "step": 29905 + }, + { + "epoch": 62.182952182952185, + "grad_norm": 0.2195369303226471, + "learning_rate": 7.449125376129721e-06, + "loss": 0.0776, + "num_input_tokens_seen": 21461976, + "step": 29910 + }, + { + "epoch": 62.19334719334719, + "grad_norm": 0.32013052701950073, + "learning_rate": 7.442135287363788e-06, + "loss": 0.0766, + "num_input_tokens_seen": 21465656, + "step": 29915 + }, + { + "epoch": 62.2037422037422, + "grad_norm": 0.7011340856552124, + "learning_rate": 7.435147906241247e-06, + "loss": 0.076, + "num_input_tokens_seen": 21469400, + "step": 29920 + }, + { + "epoch": 62.21413721413722, + "grad_norm": 0.3893817365169525, + "learning_rate": 7.428163233839624e-06, + "loss": 0.083, + "num_input_tokens_seen": 21472984, + "step": 29925 + }, + { + "epoch": 62.224532224532226, + "grad_norm": 0.26402485370635986, + "learning_rate": 7.4211812712360525e-06, + "loss": 0.1144, + "num_input_tokens_seen": 21476408, + "step": 29930 + }, + { + "epoch": 62.234927234927234, + "grad_norm": 0.9250020980834961, + "learning_rate": 7.4142020195072464e-06, + "loss": 0.0758, + "num_input_tokens_seen": 21479960, + "step": 29935 + }, + { + "epoch": 62.24532224532224, + "grad_norm": 0.244293212890625, + "learning_rate": 7.407225479729479e-06, + "loss": 0.0997, + "num_input_tokens_seen": 21483576, + "step": 29940 + }, + { + "epoch": 62.25571725571726, + "grad_norm": 0.26045265793800354, + "learning_rate": 7.400251652978632e-06, + "loss": 0.1054, + "num_input_tokens_seen": 21487032, + "step": 29945 + }, + { + "epoch": 62.266112266112266, + "grad_norm": 0.6906828880310059, + "learning_rate": 7.393280540330147e-06, + "loss": 0.1113, + "num_input_tokens_seen": 21490712, + "step": 29950 + }, + { + "epoch": 62.276507276507274, + "grad_norm": 0.3275722861289978, + "learning_rate": 7.386312142859069e-06, + "loss": 0.1059, + "num_input_tokens_seen": 21494488, + "step": 29955 + }, + { + "epoch": 62.28690228690229, + "grad_norm": 0.4833945631980896, + "learning_rate": 7.379346461640008e-06, + "loss": 0.128, + "num_input_tokens_seen": 21498008, + "step": 29960 + }, + { + "epoch": 62.2972972972973, + "grad_norm": 0.46263718605041504, + "learning_rate": 7.372383497747149e-06, + "loss": 0.1142, + "num_input_tokens_seen": 21501592, + "step": 29965 + }, + { + "epoch": 62.30769230769231, + "grad_norm": 0.21232473850250244, + "learning_rate": 7.3654232522542775e-06, + "loss": 0.1256, + "num_input_tokens_seen": 21505144, + "step": 29970 + }, + { + "epoch": 62.318087318087315, + "grad_norm": 0.38139811158180237, + "learning_rate": 7.358465726234756e-06, + "loss": 0.1141, + "num_input_tokens_seen": 21508664, + "step": 29975 + }, + { + "epoch": 62.32848232848233, + "grad_norm": 1.2101967334747314, + "learning_rate": 7.351510920761512e-06, + "loss": 0.0845, + "num_input_tokens_seen": 21512152, + "step": 29980 + }, + { + "epoch": 62.33887733887734, + "grad_norm": 0.1709677278995514, + "learning_rate": 7.344558836907067e-06, + "loss": 0.1106, + "num_input_tokens_seen": 21515736, + "step": 29985 + }, + { + "epoch": 62.34927234927235, + "grad_norm": 0.4353572130203247, + "learning_rate": 7.3376094757435285e-06, + "loss": 0.0988, + "num_input_tokens_seen": 21519384, + "step": 29990 + }, + { + "epoch": 62.35966735966736, + "grad_norm": 0.33812540769577026, + "learning_rate": 7.330662838342561e-06, + "loss": 0.1202, + "num_input_tokens_seen": 21523000, + "step": 29995 + }, + { + "epoch": 62.37006237006237, + "grad_norm": 0.4068968892097473, + "learning_rate": 7.323718925775438e-06, + "loss": 0.0734, + "num_input_tokens_seen": 21526584, + "step": 30000 + }, + { + "epoch": 62.37006237006237, + "eval_loss": 0.14422526955604553, + "eval_runtime": 7.7549, + "eval_samples_per_second": 110.382, + "eval_steps_per_second": 27.595, + "num_input_tokens_seen": 21526584, + "step": 30000 + }, + { + "epoch": 62.38045738045738, + "grad_norm": 0.4271976053714752, + "learning_rate": 7.316777739112985e-06, + "loss": 0.0832, + "num_input_tokens_seen": 21529912, + "step": 30005 + }, + { + "epoch": 62.39085239085239, + "grad_norm": 0.18290133774280548, + "learning_rate": 7.309839279425626e-06, + "loss": 0.1245, + "num_input_tokens_seen": 21533592, + "step": 30010 + }, + { + "epoch": 62.4012474012474, + "grad_norm": 0.5432464480400085, + "learning_rate": 7.302903547783366e-06, + "loss": 0.0832, + "num_input_tokens_seen": 21537240, + "step": 30015 + }, + { + "epoch": 62.41164241164241, + "grad_norm": 0.37612685561180115, + "learning_rate": 7.2959705452557644e-06, + "loss": 0.0991, + "num_input_tokens_seen": 21540888, + "step": 30020 + }, + { + "epoch": 62.42203742203742, + "grad_norm": 0.21517512202262878, + "learning_rate": 7.289040272911996e-06, + "loss": 0.108, + "num_input_tokens_seen": 21544536, + "step": 30025 + }, + { + "epoch": 62.432432432432435, + "grad_norm": 0.5744320154190063, + "learning_rate": 7.282112731820789e-06, + "loss": 0.1036, + "num_input_tokens_seen": 21548088, + "step": 30030 + }, + { + "epoch": 62.44282744282744, + "grad_norm": 0.22669608891010284, + "learning_rate": 7.275187923050447e-06, + "loss": 0.1122, + "num_input_tokens_seen": 21551544, + "step": 30035 + }, + { + "epoch": 62.45322245322245, + "grad_norm": 0.2302623838186264, + "learning_rate": 7.268265847668879e-06, + "loss": 0.0803, + "num_input_tokens_seen": 21555320, + "step": 30040 + }, + { + "epoch": 62.46361746361746, + "grad_norm": 0.2086804211139679, + "learning_rate": 7.261346506743538e-06, + "loss": 0.0608, + "num_input_tokens_seen": 21558840, + "step": 30045 + }, + { + "epoch": 62.474012474012476, + "grad_norm": 0.41470250487327576, + "learning_rate": 7.254429901341486e-06, + "loss": 0.0751, + "num_input_tokens_seen": 21562456, + "step": 30050 + }, + { + "epoch": 62.484407484407484, + "grad_norm": 0.23143932223320007, + "learning_rate": 7.247516032529356e-06, + "loss": 0.1184, + "num_input_tokens_seen": 21566136, + "step": 30055 + }, + { + "epoch": 62.49480249480249, + "grad_norm": 0.2350008338689804, + "learning_rate": 7.240604901373338e-06, + "loss": 0.0878, + "num_input_tokens_seen": 21569816, + "step": 30060 + }, + { + "epoch": 62.50519750519751, + "grad_norm": 0.3575727343559265, + "learning_rate": 7.233696508939223e-06, + "loss": 0.1031, + "num_input_tokens_seen": 21573240, + "step": 30065 + }, + { + "epoch": 62.515592515592516, + "grad_norm": 0.6114826202392578, + "learning_rate": 7.226790856292376e-06, + "loss": 0.0991, + "num_input_tokens_seen": 21576792, + "step": 30070 + }, + { + "epoch": 62.525987525987524, + "grad_norm": 0.1934897005558014, + "learning_rate": 7.219887944497727e-06, + "loss": 0.1114, + "num_input_tokens_seen": 21580248, + "step": 30075 + }, + { + "epoch": 62.53638253638254, + "grad_norm": 0.3372589349746704, + "learning_rate": 7.2129877746198e-06, + "loss": 0.1107, + "num_input_tokens_seen": 21583960, + "step": 30080 + }, + { + "epoch": 62.54677754677755, + "grad_norm": 0.14173148572444916, + "learning_rate": 7.20609034772268e-06, + "loss": 0.0974, + "num_input_tokens_seen": 21587320, + "step": 30085 + }, + { + "epoch": 62.55717255717256, + "grad_norm": 0.6013827919960022, + "learning_rate": 7.19919566487004e-06, + "loss": 0.1035, + "num_input_tokens_seen": 21590936, + "step": 30090 + }, + { + "epoch": 62.567567567567565, + "grad_norm": 0.36612024903297424, + "learning_rate": 7.192303727125132e-06, + "loss": 0.0607, + "num_input_tokens_seen": 21594392, + "step": 30095 + }, + { + "epoch": 62.57796257796258, + "grad_norm": 0.39457935094833374, + "learning_rate": 7.185414535550777e-06, + "loss": 0.0897, + "num_input_tokens_seen": 21597944, + "step": 30100 + }, + { + "epoch": 62.58835758835759, + "grad_norm": 0.35972779989242554, + "learning_rate": 7.178528091209363e-06, + "loss": 0.1128, + "num_input_tokens_seen": 21601528, + "step": 30105 + }, + { + "epoch": 62.5987525987526, + "grad_norm": 0.33076024055480957, + "learning_rate": 7.171644395162888e-06, + "loss": 0.0963, + "num_input_tokens_seen": 21605176, + "step": 30110 + }, + { + "epoch": 62.60914760914761, + "grad_norm": 0.3249669373035431, + "learning_rate": 7.164763448472881e-06, + "loss": 0.1174, + "num_input_tokens_seen": 21608888, + "step": 30115 + }, + { + "epoch": 62.61954261954262, + "grad_norm": 0.3050781786441803, + "learning_rate": 7.157885252200491e-06, + "loss": 0.0901, + "num_input_tokens_seen": 21612504, + "step": 30120 + }, + { + "epoch": 62.62993762993763, + "grad_norm": 0.345174103975296, + "learning_rate": 7.151009807406403e-06, + "loss": 0.0929, + "num_input_tokens_seen": 21616152, + "step": 30125 + }, + { + "epoch": 62.64033264033264, + "grad_norm": 0.33202099800109863, + "learning_rate": 7.144137115150909e-06, + "loss": 0.0947, + "num_input_tokens_seen": 21619640, + "step": 30130 + }, + { + "epoch": 62.65072765072765, + "grad_norm": 0.36021775007247925, + "learning_rate": 7.1372671764938725e-06, + "loss": 0.0895, + "num_input_tokens_seen": 21623192, + "step": 30135 + }, + { + "epoch": 62.66112266112266, + "grad_norm": 0.35306426882743835, + "learning_rate": 7.130399992494705e-06, + "loss": 0.0742, + "num_input_tokens_seen": 21626744, + "step": 30140 + }, + { + "epoch": 62.67151767151767, + "grad_norm": 0.737252414226532, + "learning_rate": 7.123535564212419e-06, + "loss": 0.0973, + "num_input_tokens_seen": 21630424, + "step": 30145 + }, + { + "epoch": 62.681912681912685, + "grad_norm": 0.44248008728027344, + "learning_rate": 7.116673892705611e-06, + "loss": 0.1306, + "num_input_tokens_seen": 21633944, + "step": 30150 + }, + { + "epoch": 62.69230769230769, + "grad_norm": 0.39018192887306213, + "learning_rate": 7.109814979032415e-06, + "loss": 0.0831, + "num_input_tokens_seen": 21637496, + "step": 30155 + }, + { + "epoch": 62.7027027027027, + "grad_norm": 0.45066288113594055, + "learning_rate": 7.102958824250577e-06, + "loss": 0.1073, + "num_input_tokens_seen": 21641112, + "step": 30160 + }, + { + "epoch": 62.71309771309771, + "grad_norm": 0.5397529602050781, + "learning_rate": 7.096105429417393e-06, + "loss": 0.0733, + "num_input_tokens_seen": 21644728, + "step": 30165 + }, + { + "epoch": 62.723492723492726, + "grad_norm": 0.580613911151886, + "learning_rate": 7.0892547955897506e-06, + "loss": 0.1062, + "num_input_tokens_seen": 21648504, + "step": 30170 + }, + { + "epoch": 62.733887733887734, + "grad_norm": 0.7884788513183594, + "learning_rate": 7.0824069238241e-06, + "loss": 0.0833, + "num_input_tokens_seen": 21652120, + "step": 30175 + }, + { + "epoch": 62.74428274428274, + "grad_norm": 0.42459017038345337, + "learning_rate": 7.075561815176462e-06, + "loss": 0.1441, + "num_input_tokens_seen": 21655928, + "step": 30180 + }, + { + "epoch": 62.75467775467776, + "grad_norm": 0.3115615248680115, + "learning_rate": 7.068719470702445e-06, + "loss": 0.1021, + "num_input_tokens_seen": 21659576, + "step": 30185 + }, + { + "epoch": 62.765072765072766, + "grad_norm": 0.3635050058364868, + "learning_rate": 7.061879891457229e-06, + "loss": 0.104, + "num_input_tokens_seen": 21663448, + "step": 30190 + }, + { + "epoch": 62.775467775467774, + "grad_norm": 0.45912304520606995, + "learning_rate": 7.0550430784955515e-06, + "loss": 0.14, + "num_input_tokens_seen": 21667064, + "step": 30195 + }, + { + "epoch": 62.78586278586278, + "grad_norm": 0.35901933908462524, + "learning_rate": 7.048209032871752e-06, + "loss": 0.143, + "num_input_tokens_seen": 21670744, + "step": 30200 + }, + { + "epoch": 62.78586278586278, + "eval_loss": 0.14548081159591675, + "eval_runtime": 7.746, + "eval_samples_per_second": 110.509, + "eval_steps_per_second": 27.627, + "num_input_tokens_seen": 21670744, + "step": 30200 + }, + { + "epoch": 62.7962577962578, + "grad_norm": 0.5232723951339722, + "learning_rate": 7.0413777556397055e-06, + "loss": 0.1057, + "num_input_tokens_seen": 21674392, + "step": 30205 + }, + { + "epoch": 62.80665280665281, + "grad_norm": 0.33505457639694214, + "learning_rate": 7.0345492478528925e-06, + "loss": 0.1023, + "num_input_tokens_seen": 21678104, + "step": 30210 + }, + { + "epoch": 62.817047817047815, + "grad_norm": 0.5277310013771057, + "learning_rate": 7.02772351056436e-06, + "loss": 0.0897, + "num_input_tokens_seen": 21681752, + "step": 30215 + }, + { + "epoch": 62.82744282744283, + "grad_norm": 0.35861489176750183, + "learning_rate": 7.020900544826709e-06, + "loss": 0.1057, + "num_input_tokens_seen": 21685400, + "step": 30220 + }, + { + "epoch": 62.83783783783784, + "grad_norm": 0.4743371903896332, + "learning_rate": 7.014080351692134e-06, + "loss": 0.1437, + "num_input_tokens_seen": 21689048, + "step": 30225 + }, + { + "epoch": 62.84823284823285, + "grad_norm": 0.5475440621376038, + "learning_rate": 7.0072629322124024e-06, + "loss": 0.1281, + "num_input_tokens_seen": 21692440, + "step": 30230 + }, + { + "epoch": 62.858627858627855, + "grad_norm": 0.16949115693569183, + "learning_rate": 7.000448287438827e-06, + "loss": 0.0819, + "num_input_tokens_seen": 21695960, + "step": 30235 + }, + { + "epoch": 62.86902286902287, + "grad_norm": 0.5480917096138, + "learning_rate": 6.993636418422331e-06, + "loss": 0.0857, + "num_input_tokens_seen": 21699576, + "step": 30240 + }, + { + "epoch": 62.87941787941788, + "grad_norm": 0.482022762298584, + "learning_rate": 6.986827326213383e-06, + "loss": 0.0816, + "num_input_tokens_seen": 21703064, + "step": 30245 + }, + { + "epoch": 62.88981288981289, + "grad_norm": 0.11545027792453766, + "learning_rate": 6.9800210118620205e-06, + "loss": 0.0752, + "num_input_tokens_seen": 21706616, + "step": 30250 + }, + { + "epoch": 62.9002079002079, + "grad_norm": 0.2824226915836334, + "learning_rate": 6.973217476417876e-06, + "loss": 0.0627, + "num_input_tokens_seen": 21710072, + "step": 30255 + }, + { + "epoch": 62.91060291060291, + "grad_norm": 0.5481848120689392, + "learning_rate": 6.96641672093013e-06, + "loss": 0.0906, + "num_input_tokens_seen": 21713560, + "step": 30260 + }, + { + "epoch": 62.92099792099792, + "grad_norm": 1.2874269485473633, + "learning_rate": 6.95961874644755e-06, + "loss": 0.0976, + "num_input_tokens_seen": 21717016, + "step": 30265 + }, + { + "epoch": 62.931392931392935, + "grad_norm": 0.30993178486824036, + "learning_rate": 6.952823554018476e-06, + "loss": 0.07, + "num_input_tokens_seen": 21720600, + "step": 30270 + }, + { + "epoch": 62.94178794178794, + "grad_norm": 0.19704236090183258, + "learning_rate": 6.946031144690798e-06, + "loss": 0.0787, + "num_input_tokens_seen": 21724184, + "step": 30275 + }, + { + "epoch": 62.95218295218295, + "grad_norm": 1.1689963340759277, + "learning_rate": 6.939241519512005e-06, + "loss": 0.0822, + "num_input_tokens_seen": 21727704, + "step": 30280 + }, + { + "epoch": 62.96257796257796, + "grad_norm": 0.1680075079202652, + "learning_rate": 6.932454679529129e-06, + "loss": 0.0845, + "num_input_tokens_seen": 21731256, + "step": 30285 + }, + { + "epoch": 62.972972972972975, + "grad_norm": 0.46348536014556885, + "learning_rate": 6.925670625788791e-06, + "loss": 0.1035, + "num_input_tokens_seen": 21734872, + "step": 30290 + }, + { + "epoch": 62.983367983367984, + "grad_norm": 0.4294303357601166, + "learning_rate": 6.918889359337186e-06, + "loss": 0.0853, + "num_input_tokens_seen": 21738552, + "step": 30295 + }, + { + "epoch": 62.99376299376299, + "grad_norm": 0.42760732769966125, + "learning_rate": 6.912110881220058e-06, + "loss": 0.1195, + "num_input_tokens_seen": 21742168, + "step": 30300 + }, + { + "epoch": 63.00415800415801, + "grad_norm": 0.4040086567401886, + "learning_rate": 6.905335192482735e-06, + "loss": 0.0797, + "num_input_tokens_seen": 21745920, + "step": 30305 + }, + { + "epoch": 63.014553014553016, + "grad_norm": 0.25227630138397217, + "learning_rate": 6.8985622941701275e-06, + "loss": 0.0807, + "num_input_tokens_seen": 21749312, + "step": 30310 + }, + { + "epoch": 63.024948024948024, + "grad_norm": 0.21109001338481903, + "learning_rate": 6.89179218732669e-06, + "loss": 0.1455, + "num_input_tokens_seen": 21752992, + "step": 30315 + }, + { + "epoch": 63.03534303534303, + "grad_norm": 0.2111741453409195, + "learning_rate": 6.8850248729964595e-06, + "loss": 0.086, + "num_input_tokens_seen": 21756480, + "step": 30320 + }, + { + "epoch": 63.04573804573805, + "grad_norm": 0.610285758972168, + "learning_rate": 6.8782603522230314e-06, + "loss": 0.0755, + "num_input_tokens_seen": 21759904, + "step": 30325 + }, + { + "epoch": 63.056133056133056, + "grad_norm": 0.25443723797798157, + "learning_rate": 6.871498626049591e-06, + "loss": 0.0959, + "num_input_tokens_seen": 21763488, + "step": 30330 + }, + { + "epoch": 63.066528066528065, + "grad_norm": 0.168515145778656, + "learning_rate": 6.8647396955188875e-06, + "loss": 0.1051, + "num_input_tokens_seen": 21766976, + "step": 30335 + }, + { + "epoch": 63.07692307692308, + "grad_norm": 0.6043838858604431, + "learning_rate": 6.857983561673218e-06, + "loss": 0.1208, + "num_input_tokens_seen": 21770496, + "step": 30340 + }, + { + "epoch": 63.08731808731809, + "grad_norm": 0.5803593993186951, + "learning_rate": 6.851230225554467e-06, + "loss": 0.0847, + "num_input_tokens_seen": 21774176, + "step": 30345 + }, + { + "epoch": 63.0977130977131, + "grad_norm": 0.1764269769191742, + "learning_rate": 6.8444796882040946e-06, + "loss": 0.0995, + "num_input_tokens_seen": 21777696, + "step": 30350 + }, + { + "epoch": 63.108108108108105, + "grad_norm": 0.1623394638299942, + "learning_rate": 6.837731950663106e-06, + "loss": 0.0694, + "num_input_tokens_seen": 21781472, + "step": 30355 + }, + { + "epoch": 63.11850311850312, + "grad_norm": 0.18555571138858795, + "learning_rate": 6.830987013972098e-06, + "loss": 0.0768, + "num_input_tokens_seen": 21785056, + "step": 30360 + }, + { + "epoch": 63.12889812889813, + "grad_norm": 0.26957616209983826, + "learning_rate": 6.82424487917121e-06, + "loss": 0.0805, + "num_input_tokens_seen": 21788672, + "step": 30365 + }, + { + "epoch": 63.13929313929314, + "grad_norm": 0.9323917031288147, + "learning_rate": 6.8175055473001735e-06, + "loss": 0.0794, + "num_input_tokens_seen": 21792352, + "step": 30370 + }, + { + "epoch": 63.14968814968815, + "grad_norm": 0.3089073896408081, + "learning_rate": 6.8107690193982855e-06, + "loss": 0.0909, + "num_input_tokens_seen": 21796032, + "step": 30375 + }, + { + "epoch": 63.16008316008316, + "grad_norm": 0.29122740030288696, + "learning_rate": 6.804035296504385e-06, + "loss": 0.1143, + "num_input_tokens_seen": 21799616, + "step": 30380 + }, + { + "epoch": 63.17047817047817, + "grad_norm": 0.3409477472305298, + "learning_rate": 6.797304379656916e-06, + "loss": 0.1135, + "num_input_tokens_seen": 21803264, + "step": 30385 + }, + { + "epoch": 63.18087318087318, + "grad_norm": 0.16898049414157867, + "learning_rate": 6.790576269893861e-06, + "loss": 0.0885, + "num_input_tokens_seen": 21806784, + "step": 30390 + }, + { + "epoch": 63.19126819126819, + "grad_norm": 0.8519042134284973, + "learning_rate": 6.783850968252772e-06, + "loss": 0.0694, + "num_input_tokens_seen": 21810432, + "step": 30395 + }, + { + "epoch": 63.2016632016632, + "grad_norm": 0.439696729183197, + "learning_rate": 6.777128475770789e-06, + "loss": 0.0983, + "num_input_tokens_seen": 21813952, + "step": 30400 + }, + { + "epoch": 63.2016632016632, + "eval_loss": 0.1442524939775467, + "eval_runtime": 7.7458, + "eval_samples_per_second": 110.512, + "eval_steps_per_second": 27.628, + "num_input_tokens_seen": 21813952, + "step": 30400 + }, + { + "epoch": 63.21205821205821, + "grad_norm": 0.39900150895118713, + "learning_rate": 6.77040879348459e-06, + "loss": 0.1176, + "num_input_tokens_seen": 21817568, + "step": 30405 + }, + { + "epoch": 63.222453222453225, + "grad_norm": 0.6462161540985107, + "learning_rate": 6.763691922430443e-06, + "loss": 0.0984, + "num_input_tokens_seen": 21821216, + "step": 30410 + }, + { + "epoch": 63.232848232848234, + "grad_norm": 0.4307728707790375, + "learning_rate": 6.756977863644178e-06, + "loss": 0.1059, + "num_input_tokens_seen": 21824736, + "step": 30415 + }, + { + "epoch": 63.24324324324324, + "grad_norm": 1.1105272769927979, + "learning_rate": 6.7502666181611804e-06, + "loss": 0.1249, + "num_input_tokens_seen": 21828384, + "step": 30420 + }, + { + "epoch": 63.25363825363825, + "grad_norm": 0.28460627794265747, + "learning_rate": 6.743558187016405e-06, + "loss": 0.1223, + "num_input_tokens_seen": 21831872, + "step": 30425 + }, + { + "epoch": 63.264033264033266, + "grad_norm": 0.5766275525093079, + "learning_rate": 6.7368525712443925e-06, + "loss": 0.0989, + "num_input_tokens_seen": 21835360, + "step": 30430 + }, + { + "epoch": 63.274428274428274, + "grad_norm": 0.3616970479488373, + "learning_rate": 6.7301497718792155e-06, + "loss": 0.1119, + "num_input_tokens_seen": 21839168, + "step": 30435 + }, + { + "epoch": 63.28482328482328, + "grad_norm": 0.33256208896636963, + "learning_rate": 6.723449789954544e-06, + "loss": 0.0826, + "num_input_tokens_seen": 21842656, + "step": 30440 + }, + { + "epoch": 63.2952182952183, + "grad_norm": 0.5143681764602661, + "learning_rate": 6.716752626503586e-06, + "loss": 0.111, + "num_input_tokens_seen": 21846112, + "step": 30445 + }, + { + "epoch": 63.305613305613306, + "grad_norm": 0.2945709228515625, + "learning_rate": 6.710058282559131e-06, + "loss": 0.0774, + "num_input_tokens_seen": 21849728, + "step": 30450 + }, + { + "epoch": 63.316008316008315, + "grad_norm": 0.3661181628704071, + "learning_rate": 6.703366759153545e-06, + "loss": 0.0776, + "num_input_tokens_seen": 21853248, + "step": 30455 + }, + { + "epoch": 63.32640332640332, + "grad_norm": 0.32543104887008667, + "learning_rate": 6.6966780573187335e-06, + "loss": 0.0819, + "num_input_tokens_seen": 21856768, + "step": 30460 + }, + { + "epoch": 63.33679833679834, + "grad_norm": 0.19103051722049713, + "learning_rate": 6.689992178086174e-06, + "loss": 0.1112, + "num_input_tokens_seen": 21860320, + "step": 30465 + }, + { + "epoch": 63.34719334719335, + "grad_norm": 0.11932847648859024, + "learning_rate": 6.683309122486925e-06, + "loss": 0.1147, + "num_input_tokens_seen": 21863968, + "step": 30470 + }, + { + "epoch": 63.357588357588355, + "grad_norm": 0.24023166298866272, + "learning_rate": 6.676628891551584e-06, + "loss": 0.088, + "num_input_tokens_seen": 21867456, + "step": 30475 + }, + { + "epoch": 63.36798336798337, + "grad_norm": 0.6074498295783997, + "learning_rate": 6.6699514863103385e-06, + "loss": 0.1005, + "num_input_tokens_seen": 21871136, + "step": 30480 + }, + { + "epoch": 63.37837837837838, + "grad_norm": 0.8822551965713501, + "learning_rate": 6.663276907792921e-06, + "loss": 0.1111, + "num_input_tokens_seen": 21874560, + "step": 30485 + }, + { + "epoch": 63.38877338877339, + "grad_norm": 0.43880945444107056, + "learning_rate": 6.656605157028634e-06, + "loss": 0.1041, + "num_input_tokens_seen": 21878048, + "step": 30490 + }, + { + "epoch": 63.3991683991684, + "grad_norm": 0.40620356798171997, + "learning_rate": 6.649936235046358e-06, + "loss": 0.1138, + "num_input_tokens_seen": 21881536, + "step": 30495 + }, + { + "epoch": 63.40956340956341, + "grad_norm": 0.3743766248226166, + "learning_rate": 6.643270142874508e-06, + "loss": 0.1118, + "num_input_tokens_seen": 21885120, + "step": 30500 + }, + { + "epoch": 63.41995841995842, + "grad_norm": 0.8289491534233093, + "learning_rate": 6.636606881541094e-06, + "loss": 0.0751, + "num_input_tokens_seen": 21888640, + "step": 30505 + }, + { + "epoch": 63.43035343035343, + "grad_norm": 0.34978342056274414, + "learning_rate": 6.629946452073662e-06, + "loss": 0.1104, + "num_input_tokens_seen": 21892288, + "step": 30510 + }, + { + "epoch": 63.44074844074844, + "grad_norm": 0.3095940351486206, + "learning_rate": 6.6232888554993375e-06, + "loss": 0.0714, + "num_input_tokens_seen": 21895936, + "step": 30515 + }, + { + "epoch": 63.45114345114345, + "grad_norm": 0.22568973898887634, + "learning_rate": 6.616634092844817e-06, + "loss": 0.0821, + "num_input_tokens_seen": 21899488, + "step": 30520 + }, + { + "epoch": 63.46153846153846, + "grad_norm": 0.2847702205181122, + "learning_rate": 6.609982165136331e-06, + "loss": 0.0621, + "num_input_tokens_seen": 21903072, + "step": 30525 + }, + { + "epoch": 63.471933471933475, + "grad_norm": 0.3069413900375366, + "learning_rate": 6.603333073399706e-06, + "loss": 0.1341, + "num_input_tokens_seen": 21906592, + "step": 30530 + }, + { + "epoch": 63.482328482328484, + "grad_norm": 0.6030454039573669, + "learning_rate": 6.596686818660308e-06, + "loss": 0.1376, + "num_input_tokens_seen": 21910400, + "step": 30535 + }, + { + "epoch": 63.49272349272349, + "grad_norm": 0.5543022751808167, + "learning_rate": 6.590043401943066e-06, + "loss": 0.1222, + "num_input_tokens_seen": 21914144, + "step": 30540 + }, + { + "epoch": 63.5031185031185, + "grad_norm": 0.5457144975662231, + "learning_rate": 6.583402824272494e-06, + "loss": 0.0758, + "num_input_tokens_seen": 21917632, + "step": 30545 + }, + { + "epoch": 63.513513513513516, + "grad_norm": 0.6052541732788086, + "learning_rate": 6.576765086672634e-06, + "loss": 0.1005, + "num_input_tokens_seen": 21921216, + "step": 30550 + }, + { + "epoch": 63.523908523908524, + "grad_norm": 0.3055388033390045, + "learning_rate": 6.57013019016712e-06, + "loss": 0.0945, + "num_input_tokens_seen": 21924800, + "step": 30555 + }, + { + "epoch": 63.53430353430353, + "grad_norm": 0.22087013721466064, + "learning_rate": 6.563498135779142e-06, + "loss": 0.0851, + "num_input_tokens_seen": 21928448, + "step": 30560 + }, + { + "epoch": 63.54469854469855, + "grad_norm": 0.19636639952659607, + "learning_rate": 6.556868924531431e-06, + "loss": 0.1075, + "num_input_tokens_seen": 21931968, + "step": 30565 + }, + { + "epoch": 63.555093555093556, + "grad_norm": 0.6034240126609802, + "learning_rate": 6.550242557446304e-06, + "loss": 0.0472, + "num_input_tokens_seen": 21935616, + "step": 30570 + }, + { + "epoch": 63.565488565488565, + "grad_norm": 0.3807307183742523, + "learning_rate": 6.543619035545634e-06, + "loss": 0.0481, + "num_input_tokens_seen": 21939168, + "step": 30575 + }, + { + "epoch": 63.57588357588357, + "grad_norm": 0.3494590222835541, + "learning_rate": 6.53699835985084e-06, + "loss": 0.1075, + "num_input_tokens_seen": 21942816, + "step": 30580 + }, + { + "epoch": 63.58627858627859, + "grad_norm": 0.789337694644928, + "learning_rate": 6.530380531382927e-06, + "loss": 0.0965, + "num_input_tokens_seen": 21946304, + "step": 30585 + }, + { + "epoch": 63.5966735966736, + "grad_norm": 1.5430736541748047, + "learning_rate": 6.523765551162433e-06, + "loss": 0.1237, + "num_input_tokens_seen": 21949888, + "step": 30590 + }, + { + "epoch": 63.607068607068605, + "grad_norm": 0.4154430627822876, + "learning_rate": 6.517153420209476e-06, + "loss": 0.0939, + "num_input_tokens_seen": 21953344, + "step": 30595 + }, + { + "epoch": 63.61746361746362, + "grad_norm": 0.4154554307460785, + "learning_rate": 6.510544139543739e-06, + "loss": 0.1579, + "num_input_tokens_seen": 21956992, + "step": 30600 + }, + { + "epoch": 63.61746361746362, + "eval_loss": 0.14402000606060028, + "eval_runtime": 7.747, + "eval_samples_per_second": 110.494, + "eval_steps_per_second": 27.623, + "num_input_tokens_seen": 21956992, + "step": 30600 + }, + { + "epoch": 63.62785862785863, + "grad_norm": 0.33599525690078735, + "learning_rate": 6.503937710184452e-06, + "loss": 0.0836, + "num_input_tokens_seen": 21960640, + "step": 30605 + }, + { + "epoch": 63.63825363825364, + "grad_norm": 0.19278891384601593, + "learning_rate": 6.4973341331503954e-06, + "loss": 0.0775, + "num_input_tokens_seen": 21964192, + "step": 30610 + }, + { + "epoch": 63.648648648648646, + "grad_norm": 0.3908211290836334, + "learning_rate": 6.490733409459942e-06, + "loss": 0.1024, + "num_input_tokens_seen": 21967840, + "step": 30615 + }, + { + "epoch": 63.65904365904366, + "grad_norm": 0.4422494173049927, + "learning_rate": 6.484135540130995e-06, + "loss": 0.1076, + "num_input_tokens_seen": 21971424, + "step": 30620 + }, + { + "epoch": 63.66943866943867, + "grad_norm": 0.2895282506942749, + "learning_rate": 6.4775405261810364e-06, + "loss": 0.0574, + "num_input_tokens_seen": 21975040, + "step": 30625 + }, + { + "epoch": 63.67983367983368, + "grad_norm": 0.3589029014110565, + "learning_rate": 6.470948368627092e-06, + "loss": 0.0924, + "num_input_tokens_seen": 21978464, + "step": 30630 + }, + { + "epoch": 63.69022869022869, + "grad_norm": 0.1955741047859192, + "learning_rate": 6.464359068485756e-06, + "loss": 0.1015, + "num_input_tokens_seen": 21981952, + "step": 30635 + }, + { + "epoch": 63.7006237006237, + "grad_norm": 0.8267438411712646, + "learning_rate": 6.457772626773195e-06, + "loss": 0.1312, + "num_input_tokens_seen": 21985504, + "step": 30640 + }, + { + "epoch": 63.71101871101871, + "grad_norm": 0.9242731332778931, + "learning_rate": 6.451189044505104e-06, + "loss": 0.0825, + "num_input_tokens_seen": 21988928, + "step": 30645 + }, + { + "epoch": 63.72141372141372, + "grad_norm": 0.24995484948158264, + "learning_rate": 6.44460832269676e-06, + "loss": 0.0919, + "num_input_tokens_seen": 21992640, + "step": 30650 + }, + { + "epoch": 63.731808731808734, + "grad_norm": 0.3056824207305908, + "learning_rate": 6.438030462363001e-06, + "loss": 0.1247, + "num_input_tokens_seen": 21996288, + "step": 30655 + }, + { + "epoch": 63.74220374220374, + "grad_norm": 0.19954630732536316, + "learning_rate": 6.431455464518205e-06, + "loss": 0.0626, + "num_input_tokens_seen": 21999776, + "step": 30660 + }, + { + "epoch": 63.75259875259875, + "grad_norm": 0.5984325408935547, + "learning_rate": 6.424883330176326e-06, + "loss": 0.0956, + "num_input_tokens_seen": 22003392, + "step": 30665 + }, + { + "epoch": 63.762993762993766, + "grad_norm": 0.14792034029960632, + "learning_rate": 6.418314060350864e-06, + "loss": 0.0874, + "num_input_tokens_seen": 22007104, + "step": 30670 + }, + { + "epoch": 63.773388773388774, + "grad_norm": 0.20753388106822968, + "learning_rate": 6.4117476560548895e-06, + "loss": 0.0959, + "num_input_tokens_seen": 22010624, + "step": 30675 + }, + { + "epoch": 63.78378378378378, + "grad_norm": 0.16695857048034668, + "learning_rate": 6.405184118301016e-06, + "loss": 0.0594, + "num_input_tokens_seen": 22014304, + "step": 30680 + }, + { + "epoch": 63.79417879417879, + "grad_norm": 0.2040882259607315, + "learning_rate": 6.398623448101434e-06, + "loss": 0.0918, + "num_input_tokens_seen": 22018112, + "step": 30685 + }, + { + "epoch": 63.804573804573806, + "grad_norm": 0.5429984927177429, + "learning_rate": 6.392065646467871e-06, + "loss": 0.14, + "num_input_tokens_seen": 22021632, + "step": 30690 + }, + { + "epoch": 63.814968814968815, + "grad_norm": 0.7670748233795166, + "learning_rate": 6.385510714411632e-06, + "loss": 0.0892, + "num_input_tokens_seen": 22025280, + "step": 30695 + }, + { + "epoch": 63.82536382536382, + "grad_norm": 0.2698380947113037, + "learning_rate": 6.378958652943559e-06, + "loss": 0.1026, + "num_input_tokens_seen": 22028896, + "step": 30700 + }, + { + "epoch": 63.83575883575884, + "grad_norm": 0.2077121138572693, + "learning_rate": 6.3724094630740776e-06, + "loss": 0.104, + "num_input_tokens_seen": 22032704, + "step": 30705 + }, + { + "epoch": 63.84615384615385, + "grad_norm": 0.155582994222641, + "learning_rate": 6.365863145813136e-06, + "loss": 0.1048, + "num_input_tokens_seen": 22036320, + "step": 30710 + }, + { + "epoch": 63.856548856548855, + "grad_norm": 0.26782509684562683, + "learning_rate": 6.359319702170269e-06, + "loss": 0.1188, + "num_input_tokens_seen": 22039936, + "step": 30715 + }, + { + "epoch": 63.86694386694387, + "grad_norm": 0.21097970008850098, + "learning_rate": 6.352779133154566e-06, + "loss": 0.1245, + "num_input_tokens_seen": 22043520, + "step": 30720 + }, + { + "epoch": 63.87733887733888, + "grad_norm": 0.14239107072353363, + "learning_rate": 6.346241439774648e-06, + "loss": 0.0832, + "num_input_tokens_seen": 22047040, + "step": 30725 + }, + { + "epoch": 63.88773388773389, + "grad_norm": 0.729770302772522, + "learning_rate": 6.339706623038716e-06, + "loss": 0.1067, + "num_input_tokens_seen": 22050560, + "step": 30730 + }, + { + "epoch": 63.898128898128896, + "grad_norm": 0.28314876556396484, + "learning_rate": 6.333174683954532e-06, + "loss": 0.0886, + "num_input_tokens_seen": 22054144, + "step": 30735 + }, + { + "epoch": 63.90852390852391, + "grad_norm": 0.3011552095413208, + "learning_rate": 6.326645623529387e-06, + "loss": 0.0601, + "num_input_tokens_seen": 22057600, + "step": 30740 + }, + { + "epoch": 63.91891891891892, + "grad_norm": 0.5693212151527405, + "learning_rate": 6.320119442770156e-06, + "loss": 0.0953, + "num_input_tokens_seen": 22061152, + "step": 30745 + }, + { + "epoch": 63.92931392931393, + "grad_norm": 0.8781076073646545, + "learning_rate": 6.313596142683254e-06, + "loss": 0.1966, + "num_input_tokens_seen": 22064768, + "step": 30750 + }, + { + "epoch": 63.93970893970894, + "grad_norm": 0.3348020315170288, + "learning_rate": 6.307075724274647e-06, + "loss": 0.0664, + "num_input_tokens_seen": 22068448, + "step": 30755 + }, + { + "epoch": 63.95010395010395, + "grad_norm": 0.25939279794692993, + "learning_rate": 6.300558188549882e-06, + "loss": 0.1163, + "num_input_tokens_seen": 22072064, + "step": 30760 + }, + { + "epoch": 63.96049896049896, + "grad_norm": 0.7102640867233276, + "learning_rate": 6.29404353651403e-06, + "loss": 0.1089, + "num_input_tokens_seen": 22075520, + "step": 30765 + }, + { + "epoch": 63.97089397089397, + "grad_norm": 0.23627127707004547, + "learning_rate": 6.287531769171737e-06, + "loss": 0.1073, + "num_input_tokens_seen": 22079168, + "step": 30770 + }, + { + "epoch": 63.981288981288984, + "grad_norm": 0.15077197551727295, + "learning_rate": 6.2810228875272045e-06, + "loss": 0.1015, + "num_input_tokens_seen": 22082624, + "step": 30775 + }, + { + "epoch": 63.99168399168399, + "grad_norm": 0.2168017029762268, + "learning_rate": 6.274516892584179e-06, + "loss": 0.0913, + "num_input_tokens_seen": 22086368, + "step": 30780 + }, + { + "epoch": 64.002079002079, + "grad_norm": 0.3294331729412079, + "learning_rate": 6.268013785345969e-06, + "loss": 0.0911, + "num_input_tokens_seen": 22090000, + "step": 30785 + }, + { + "epoch": 64.01247401247402, + "grad_norm": 0.44355112314224243, + "learning_rate": 6.26151356681543e-06, + "loss": 0.0869, + "num_input_tokens_seen": 22093680, + "step": 30790 + }, + { + "epoch": 64.02286902286902, + "grad_norm": 0.15831290185451508, + "learning_rate": 6.255016237994981e-06, + "loss": 0.1287, + "num_input_tokens_seen": 22097104, + "step": 30795 + }, + { + "epoch": 64.03326403326403, + "grad_norm": 0.4730667173862457, + "learning_rate": 6.248521799886603e-06, + "loss": 0.0536, + "num_input_tokens_seen": 22100720, + "step": 30800 + }, + { + "epoch": 64.03326403326403, + "eval_loss": 0.1432715654373169, + "eval_runtime": 7.7548, + "eval_samples_per_second": 110.383, + "eval_steps_per_second": 27.596, + "num_input_tokens_seen": 22100720, + "step": 30800 + }, + { + "epoch": 64.04365904365905, + "grad_norm": 0.2545302212238312, + "learning_rate": 6.242030253491798e-06, + "loss": 0.084, + "num_input_tokens_seen": 22104272, + "step": 30805 + }, + { + "epoch": 64.05405405405405, + "grad_norm": 0.26733970642089844, + "learning_rate": 6.235541599811656e-06, + "loss": 0.0875, + "num_input_tokens_seen": 22108080, + "step": 30810 + }, + { + "epoch": 64.06444906444906, + "grad_norm": 0.42659205198287964, + "learning_rate": 6.229055839846814e-06, + "loss": 0.1005, + "num_input_tokens_seen": 22111632, + "step": 30815 + }, + { + "epoch": 64.07484407484408, + "grad_norm": 0.5164576172828674, + "learning_rate": 6.222572974597455e-06, + "loss": 0.058, + "num_input_tokens_seen": 22115184, + "step": 30820 + }, + { + "epoch": 64.08523908523908, + "grad_norm": 0.15844927728176117, + "learning_rate": 6.216093005063306e-06, + "loss": 0.0564, + "num_input_tokens_seen": 22118704, + "step": 30825 + }, + { + "epoch": 64.0956340956341, + "grad_norm": 0.4685628414154053, + "learning_rate": 6.209615932243678e-06, + "loss": 0.1222, + "num_input_tokens_seen": 22122320, + "step": 30830 + }, + { + "epoch": 64.10602910602911, + "grad_norm": 0.3342475891113281, + "learning_rate": 6.203141757137399e-06, + "loss": 0.1217, + "num_input_tokens_seen": 22126000, + "step": 30835 + }, + { + "epoch": 64.11642411642411, + "grad_norm": 0.4711781144142151, + "learning_rate": 6.196670480742886e-06, + "loss": 0.0933, + "num_input_tokens_seen": 22129616, + "step": 30840 + }, + { + "epoch": 64.12681912681913, + "grad_norm": 0.17871195077896118, + "learning_rate": 6.190202104058074e-06, + "loss": 0.0906, + "num_input_tokens_seen": 22132976, + "step": 30845 + }, + { + "epoch": 64.13721413721414, + "grad_norm": 0.23022831976413727, + "learning_rate": 6.183736628080475e-06, + "loss": 0.0853, + "num_input_tokens_seen": 22136464, + "step": 30850 + }, + { + "epoch": 64.14760914760915, + "grad_norm": 0.3935551345348358, + "learning_rate": 6.177274053807155e-06, + "loss": 0.1243, + "num_input_tokens_seen": 22139952, + "step": 30855 + }, + { + "epoch": 64.15800415800416, + "grad_norm": 0.6350350379943848, + "learning_rate": 6.170814382234713e-06, + "loss": 0.0724, + "num_input_tokens_seen": 22143440, + "step": 30860 + }, + { + "epoch": 64.16839916839916, + "grad_norm": 0.23224104940891266, + "learning_rate": 6.16435761435932e-06, + "loss": 0.097, + "num_input_tokens_seen": 22147152, + "step": 30865 + }, + { + "epoch": 64.17879417879418, + "grad_norm": 0.43672582507133484, + "learning_rate": 6.157903751176681e-06, + "loss": 0.1138, + "num_input_tokens_seen": 22150800, + "step": 30870 + }, + { + "epoch": 64.1891891891892, + "grad_norm": 0.1828576624393463, + "learning_rate": 6.151452793682066e-06, + "loss": 0.0694, + "num_input_tokens_seen": 22154320, + "step": 30875 + }, + { + "epoch": 64.1995841995842, + "grad_norm": 0.42674532532691956, + "learning_rate": 6.145004742870305e-06, + "loss": 0.1241, + "num_input_tokens_seen": 22157840, + "step": 30880 + }, + { + "epoch": 64.20997920997921, + "grad_norm": 0.25954440236091614, + "learning_rate": 6.138559599735752e-06, + "loss": 0.1425, + "num_input_tokens_seen": 22161424, + "step": 30885 + }, + { + "epoch": 64.22037422037423, + "grad_norm": 0.15427692234516144, + "learning_rate": 6.132117365272344e-06, + "loss": 0.0951, + "num_input_tokens_seen": 22165104, + "step": 30890 + }, + { + "epoch": 64.23076923076923, + "grad_norm": 0.1581757664680481, + "learning_rate": 6.125678040473545e-06, + "loss": 0.092, + "num_input_tokens_seen": 22168560, + "step": 30895 + }, + { + "epoch": 64.24116424116424, + "grad_norm": 0.7690258622169495, + "learning_rate": 6.1192416263323755e-06, + "loss": 0.1077, + "num_input_tokens_seen": 22172112, + "step": 30900 + }, + { + "epoch": 64.25155925155926, + "grad_norm": 0.4073503017425537, + "learning_rate": 6.112808123841424e-06, + "loss": 0.1306, + "num_input_tokens_seen": 22175696, + "step": 30905 + }, + { + "epoch": 64.26195426195426, + "grad_norm": 0.6100408434867859, + "learning_rate": 6.106377533992805e-06, + "loss": 0.0736, + "num_input_tokens_seen": 22179280, + "step": 30910 + }, + { + "epoch": 64.27234927234927, + "grad_norm": 0.35026806592941284, + "learning_rate": 6.099949857778204e-06, + "loss": 0.1014, + "num_input_tokens_seen": 22182640, + "step": 30915 + }, + { + "epoch": 64.28274428274429, + "grad_norm": 0.4497988522052765, + "learning_rate": 6.093525096188852e-06, + "loss": 0.1015, + "num_input_tokens_seen": 22186096, + "step": 30920 + }, + { + "epoch": 64.29313929313929, + "grad_norm": 0.26462453603744507, + "learning_rate": 6.087103250215518e-06, + "loss": 0.0604, + "num_input_tokens_seen": 22189776, + "step": 30925 + }, + { + "epoch": 64.3035343035343, + "grad_norm": 1.0523523092269897, + "learning_rate": 6.080684320848537e-06, + "loss": 0.1464, + "num_input_tokens_seen": 22193424, + "step": 30930 + }, + { + "epoch": 64.31392931392931, + "grad_norm": 0.46885138750076294, + "learning_rate": 6.074268309077794e-06, + "loss": 0.1697, + "num_input_tokens_seen": 22197040, + "step": 30935 + }, + { + "epoch": 64.32432432432432, + "grad_norm": 0.26437631249427795, + "learning_rate": 6.067855215892709e-06, + "loss": 0.0783, + "num_input_tokens_seen": 22200656, + "step": 30940 + }, + { + "epoch": 64.33471933471934, + "grad_norm": 0.4724324345588684, + "learning_rate": 6.061445042282271e-06, + "loss": 0.1088, + "num_input_tokens_seen": 22204176, + "step": 30945 + }, + { + "epoch": 64.34511434511434, + "grad_norm": 0.518401026725769, + "learning_rate": 6.055037789234999e-06, + "loss": 0.0767, + "num_input_tokens_seen": 22207792, + "step": 30950 + }, + { + "epoch": 64.35550935550935, + "grad_norm": 0.27327024936676025, + "learning_rate": 6.048633457738975e-06, + "loss": 0.1048, + "num_input_tokens_seen": 22211504, + "step": 30955 + }, + { + "epoch": 64.36590436590437, + "grad_norm": 0.3219030499458313, + "learning_rate": 6.042232048781837e-06, + "loss": 0.0986, + "num_input_tokens_seen": 22215120, + "step": 30960 + }, + { + "epoch": 64.37629937629937, + "grad_norm": 0.1856164187192917, + "learning_rate": 6.035833563350757e-06, + "loss": 0.079, + "num_input_tokens_seen": 22218768, + "step": 30965 + }, + { + "epoch": 64.38669438669439, + "grad_norm": 0.344462513923645, + "learning_rate": 6.0294380024324525e-06, + "loss": 0.1039, + "num_input_tokens_seen": 22222416, + "step": 30970 + }, + { + "epoch": 64.3970893970894, + "grad_norm": 0.5719214677810669, + "learning_rate": 6.023045367013213e-06, + "loss": 0.1182, + "num_input_tokens_seen": 22226256, + "step": 30975 + }, + { + "epoch": 64.4074844074844, + "grad_norm": 0.5844897627830505, + "learning_rate": 6.016655658078851e-06, + "loss": 0.1459, + "num_input_tokens_seen": 22229936, + "step": 30980 + }, + { + "epoch": 64.41787941787942, + "grad_norm": 0.2524600923061371, + "learning_rate": 6.010268876614753e-06, + "loss": 0.1384, + "num_input_tokens_seen": 22233520, + "step": 30985 + }, + { + "epoch": 64.42827442827443, + "grad_norm": 0.44305944442749023, + "learning_rate": 6.0038850236058266e-06, + "loss": 0.1009, + "num_input_tokens_seen": 22237072, + "step": 30990 + }, + { + "epoch": 64.43866943866944, + "grad_norm": 0.7480392456054688, + "learning_rate": 5.997504100036549e-06, + "loss": 0.118, + "num_input_tokens_seen": 22240688, + "step": 30995 + }, + { + "epoch": 64.44906444906445, + "grad_norm": 0.27015456557273865, + "learning_rate": 5.991126106890949e-06, + "loss": 0.1065, + "num_input_tokens_seen": 22244240, + "step": 31000 + }, + { + "epoch": 64.44906444906445, + "eval_loss": 0.145252525806427, + "eval_runtime": 7.7535, + "eval_samples_per_second": 110.401, + "eval_steps_per_second": 27.6, + "num_input_tokens_seen": 22244240, + "step": 31000 + }, + { + "epoch": 64.45945945945945, + "grad_norm": 0.23121777176856995, + "learning_rate": 5.984751045152576e-06, + "loss": 0.0792, + "num_input_tokens_seen": 22248048, + "step": 31005 + }, + { + "epoch": 64.46985446985447, + "grad_norm": 0.31599587202072144, + "learning_rate": 5.978378915804553e-06, + "loss": 0.0567, + "num_input_tokens_seen": 22251632, + "step": 31010 + }, + { + "epoch": 64.48024948024948, + "grad_norm": 0.5254318118095398, + "learning_rate": 5.972009719829547e-06, + "loss": 0.0539, + "num_input_tokens_seen": 22255120, + "step": 31015 + }, + { + "epoch": 64.49064449064448, + "grad_norm": 0.22334371507167816, + "learning_rate": 5.965643458209755e-06, + "loss": 0.0813, + "num_input_tokens_seen": 22258608, + "step": 31020 + }, + { + "epoch": 64.5010395010395, + "grad_norm": 0.6121246814727783, + "learning_rate": 5.95928013192695e-06, + "loss": 0.089, + "num_input_tokens_seen": 22262416, + "step": 31025 + }, + { + "epoch": 64.51143451143452, + "grad_norm": 0.4600532352924347, + "learning_rate": 5.952919741962423e-06, + "loss": 0.1037, + "num_input_tokens_seen": 22265840, + "step": 31030 + }, + { + "epoch": 64.52182952182952, + "grad_norm": 0.594806969165802, + "learning_rate": 5.946562289297042e-06, + "loss": 0.0842, + "num_input_tokens_seen": 22269552, + "step": 31035 + }, + { + "epoch": 64.53222453222453, + "grad_norm": 0.4159550964832306, + "learning_rate": 5.9402077749111855e-06, + "loss": 0.0897, + "num_input_tokens_seen": 22273232, + "step": 31040 + }, + { + "epoch": 64.54261954261955, + "grad_norm": 0.22521790862083435, + "learning_rate": 5.933856199784821e-06, + "loss": 0.122, + "num_input_tokens_seen": 22276720, + "step": 31045 + }, + { + "epoch": 64.55301455301455, + "grad_norm": 0.27470195293426514, + "learning_rate": 5.927507564897419e-06, + "loss": 0.1396, + "num_input_tokens_seen": 22280496, + "step": 31050 + }, + { + "epoch": 64.56340956340956, + "grad_norm": 0.2053939700126648, + "learning_rate": 5.9211618712280395e-06, + "loss": 0.1167, + "num_input_tokens_seen": 22284112, + "step": 31055 + }, + { + "epoch": 64.57380457380458, + "grad_norm": 0.21069200336933136, + "learning_rate": 5.914819119755255e-06, + "loss": 0.083, + "num_input_tokens_seen": 22287728, + "step": 31060 + }, + { + "epoch": 64.58419958419958, + "grad_norm": 0.2478482574224472, + "learning_rate": 5.908479311457205e-06, + "loss": 0.083, + "num_input_tokens_seen": 22291472, + "step": 31065 + }, + { + "epoch": 64.5945945945946, + "grad_norm": 0.44609901309013367, + "learning_rate": 5.902142447311559e-06, + "loss": 0.0871, + "num_input_tokens_seen": 22295088, + "step": 31070 + }, + { + "epoch": 64.60498960498961, + "grad_norm": 0.30240991711616516, + "learning_rate": 5.895808528295546e-06, + "loss": 0.0615, + "num_input_tokens_seen": 22298800, + "step": 31075 + }, + { + "epoch": 64.61538461538461, + "grad_norm": 0.7106569409370422, + "learning_rate": 5.889477555385941e-06, + "loss": 0.1283, + "num_input_tokens_seen": 22302544, + "step": 31080 + }, + { + "epoch": 64.62577962577963, + "grad_norm": 0.4522416591644287, + "learning_rate": 5.883149529559051e-06, + "loss": 0.1456, + "num_input_tokens_seen": 22306192, + "step": 31085 + }, + { + "epoch": 64.63617463617463, + "grad_norm": 0.5210926532745361, + "learning_rate": 5.876824451790738e-06, + "loss": 0.0851, + "num_input_tokens_seen": 22309680, + "step": 31090 + }, + { + "epoch": 64.64656964656965, + "grad_norm": 0.6468263864517212, + "learning_rate": 5.87050232305642e-06, + "loss": 0.0998, + "num_input_tokens_seen": 22313104, + "step": 31095 + }, + { + "epoch": 64.65696465696466, + "grad_norm": 0.8572673797607422, + "learning_rate": 5.864183144331034e-06, + "loss": 0.0931, + "num_input_tokens_seen": 22316784, + "step": 31100 + }, + { + "epoch": 64.66735966735966, + "grad_norm": 0.2004685252904892, + "learning_rate": 5.857866916589089e-06, + "loss": 0.0832, + "num_input_tokens_seen": 22320336, + "step": 31105 + }, + { + "epoch": 64.67775467775468, + "grad_norm": 0.3470945954322815, + "learning_rate": 5.8515536408046216e-06, + "loss": 0.0913, + "num_input_tokens_seen": 22324048, + "step": 31110 + }, + { + "epoch": 64.6881496881497, + "grad_norm": 0.21919851005077362, + "learning_rate": 5.845243317951208e-06, + "loss": 0.1064, + "num_input_tokens_seen": 22327600, + "step": 31115 + }, + { + "epoch": 64.6985446985447, + "grad_norm": 0.323873907327652, + "learning_rate": 5.838935949001997e-06, + "loss": 0.121, + "num_input_tokens_seen": 22331184, + "step": 31120 + }, + { + "epoch": 64.70893970893971, + "grad_norm": 0.4524122178554535, + "learning_rate": 5.8326315349296476e-06, + "loss": 0.1472, + "num_input_tokens_seen": 22334768, + "step": 31125 + }, + { + "epoch": 64.71933471933473, + "grad_norm": 0.5544092059135437, + "learning_rate": 5.826330076706396e-06, + "loss": 0.0914, + "num_input_tokens_seen": 22338384, + "step": 31130 + }, + { + "epoch": 64.72972972972973, + "grad_norm": 0.20957474410533905, + "learning_rate": 5.820031575303988e-06, + "loss": 0.0663, + "num_input_tokens_seen": 22341872, + "step": 31135 + }, + { + "epoch": 64.74012474012474, + "grad_norm": 0.1917864978313446, + "learning_rate": 5.813736031693745e-06, + "loss": 0.1189, + "num_input_tokens_seen": 22345488, + "step": 31140 + }, + { + "epoch": 64.75051975051976, + "grad_norm": 0.2843709886074066, + "learning_rate": 5.807443446846522e-06, + "loss": 0.1163, + "num_input_tokens_seen": 22348976, + "step": 31145 + }, + { + "epoch": 64.76091476091476, + "grad_norm": 0.642810583114624, + "learning_rate": 5.801153821732699e-06, + "loss": 0.0628, + "num_input_tokens_seen": 22352400, + "step": 31150 + }, + { + "epoch": 64.77130977130977, + "grad_norm": 0.9129586219787598, + "learning_rate": 5.794867157322229e-06, + "loss": 0.1022, + "num_input_tokens_seen": 22355888, + "step": 31155 + }, + { + "epoch": 64.78170478170478, + "grad_norm": 0.13451385498046875, + "learning_rate": 5.788583454584593e-06, + "loss": 0.0807, + "num_input_tokens_seen": 22359600, + "step": 31160 + }, + { + "epoch": 64.79209979209979, + "grad_norm": 0.31292659044265747, + "learning_rate": 5.7823027144888075e-06, + "loss": 0.1034, + "num_input_tokens_seen": 22363216, + "step": 31165 + }, + { + "epoch": 64.8024948024948, + "grad_norm": 0.6263284683227539, + "learning_rate": 5.776024938003455e-06, + "loss": 0.0723, + "num_input_tokens_seen": 22366576, + "step": 31170 + }, + { + "epoch": 64.81288981288981, + "grad_norm": 0.2181500345468521, + "learning_rate": 5.7697501260966345e-06, + "loss": 0.0878, + "num_input_tokens_seen": 22370128, + "step": 31175 + }, + { + "epoch": 64.82328482328482, + "grad_norm": 0.1977342665195465, + "learning_rate": 5.7634782797360145e-06, + "loss": 0.087, + "num_input_tokens_seen": 22373808, + "step": 31180 + }, + { + "epoch": 64.83367983367984, + "grad_norm": 0.21400795876979828, + "learning_rate": 5.757209399888777e-06, + "loss": 0.1663, + "num_input_tokens_seen": 22377488, + "step": 31185 + }, + { + "epoch": 64.84407484407484, + "grad_norm": 0.1762186884880066, + "learning_rate": 5.750943487521679e-06, + "loss": 0.078, + "num_input_tokens_seen": 22381104, + "step": 31190 + }, + { + "epoch": 64.85446985446985, + "grad_norm": 0.5694141387939453, + "learning_rate": 5.744680543600986e-06, + "loss": 0.0931, + "num_input_tokens_seen": 22384688, + "step": 31195 + }, + { + "epoch": 64.86486486486487, + "grad_norm": 0.597856879234314, + "learning_rate": 5.738420569092537e-06, + "loss": 0.1196, + "num_input_tokens_seen": 22388368, + "step": 31200 + }, + { + "epoch": 64.86486486486487, + "eval_loss": 0.14403758943080902, + "eval_runtime": 7.7666, + "eval_samples_per_second": 110.215, + "eval_steps_per_second": 27.554, + "num_input_tokens_seen": 22388368, + "step": 31200 + }, + { + "epoch": 64.87525987525987, + "grad_norm": 0.44694241881370544, + "learning_rate": 5.732163564961684e-06, + "loss": 0.1151, + "num_input_tokens_seen": 22392048, + "step": 31205 + }, + { + "epoch": 64.88565488565489, + "grad_norm": 0.25048717856407166, + "learning_rate": 5.725909532173354e-06, + "loss": 0.0735, + "num_input_tokens_seen": 22395664, + "step": 31210 + }, + { + "epoch": 64.8960498960499, + "grad_norm": 0.23767684400081635, + "learning_rate": 5.719658471691977e-06, + "loss": 0.1004, + "num_input_tokens_seen": 22399344, + "step": 31215 + }, + { + "epoch": 64.9064449064449, + "grad_norm": 0.21028539538383484, + "learning_rate": 5.71341038448156e-06, + "loss": 0.0832, + "num_input_tokens_seen": 22402960, + "step": 31220 + }, + { + "epoch": 64.91683991683992, + "grad_norm": 0.3245115578174591, + "learning_rate": 5.707165271505635e-06, + "loss": 0.0952, + "num_input_tokens_seen": 22406352, + "step": 31225 + }, + { + "epoch": 64.92723492723492, + "grad_norm": 0.22870755195617676, + "learning_rate": 5.700923133727271e-06, + "loss": 0.1221, + "num_input_tokens_seen": 22409872, + "step": 31230 + }, + { + "epoch": 64.93762993762994, + "grad_norm": 0.16415204107761383, + "learning_rate": 5.694683972109083e-06, + "loss": 0.057, + "num_input_tokens_seen": 22413488, + "step": 31235 + }, + { + "epoch": 64.94802494802495, + "grad_norm": 0.19942109286785126, + "learning_rate": 5.688447787613241e-06, + "loss": 0.0827, + "num_input_tokens_seen": 22417136, + "step": 31240 + }, + { + "epoch": 64.95841995841995, + "grad_norm": 0.4831061363220215, + "learning_rate": 5.6822145812014285e-06, + "loss": 0.1227, + "num_input_tokens_seen": 22420560, + "step": 31245 + }, + { + "epoch": 64.96881496881497, + "grad_norm": 0.9763786196708679, + "learning_rate": 5.675984353834896e-06, + "loss": 0.1237, + "num_input_tokens_seen": 22424080, + "step": 31250 + }, + { + "epoch": 64.97920997920998, + "grad_norm": 0.18612323701381683, + "learning_rate": 5.66975710647441e-06, + "loss": 0.0474, + "num_input_tokens_seen": 22427856, + "step": 31255 + }, + { + "epoch": 64.98960498960498, + "grad_norm": 0.796837568283081, + "learning_rate": 5.663532840080304e-06, + "loss": 0.1228, + "num_input_tokens_seen": 22431472, + "step": 31260 + }, + { + "epoch": 65.0, + "grad_norm": 0.14446064829826355, + "learning_rate": 5.6573115556124325e-06, + "loss": 0.0787, + "num_input_tokens_seen": 22435072, + "step": 31265 + }, + { + "epoch": 65.01039501039502, + "grad_norm": 0.42022284865379333, + "learning_rate": 5.651093254030185e-06, + "loss": 0.0679, + "num_input_tokens_seen": 22438784, + "step": 31270 + }, + { + "epoch": 65.02079002079002, + "grad_norm": 0.1922328919172287, + "learning_rate": 5.644877936292514e-06, + "loss": 0.0482, + "num_input_tokens_seen": 22442240, + "step": 31275 + }, + { + "epoch": 65.03118503118503, + "grad_norm": 0.36154937744140625, + "learning_rate": 5.638665603357901e-06, + "loss": 0.1205, + "num_input_tokens_seen": 22445920, + "step": 31280 + }, + { + "epoch": 65.04158004158005, + "grad_norm": 0.3890257179737091, + "learning_rate": 5.632456256184357e-06, + "loss": 0.1133, + "num_input_tokens_seen": 22449536, + "step": 31285 + }, + { + "epoch": 65.05197505197505, + "grad_norm": 0.24894601106643677, + "learning_rate": 5.626249895729452e-06, + "loss": 0.0634, + "num_input_tokens_seen": 22453088, + "step": 31290 + }, + { + "epoch": 65.06237006237006, + "grad_norm": 0.23736120760440826, + "learning_rate": 5.620046522950273e-06, + "loss": 0.1024, + "num_input_tokens_seen": 22456576, + "step": 31295 + }, + { + "epoch": 65.07276507276508, + "grad_norm": 0.8543118238449097, + "learning_rate": 5.613846138803464e-06, + "loss": 0.0722, + "num_input_tokens_seen": 22459936, + "step": 31300 + }, + { + "epoch": 65.08316008316008, + "grad_norm": 0.6942669749259949, + "learning_rate": 5.607648744245206e-06, + "loss": 0.1303, + "num_input_tokens_seen": 22463520, + "step": 31305 + }, + { + "epoch": 65.0935550935551, + "grad_norm": 0.26574915647506714, + "learning_rate": 5.601454340231207e-06, + "loss": 0.0748, + "num_input_tokens_seen": 22467104, + "step": 31310 + }, + { + "epoch": 65.1039501039501, + "grad_norm": 0.6695547103881836, + "learning_rate": 5.595262927716724e-06, + "loss": 0.0807, + "num_input_tokens_seen": 22470720, + "step": 31315 + }, + { + "epoch": 65.11434511434511, + "grad_norm": 0.6030182242393494, + "learning_rate": 5.589074507656561e-06, + "loss": 0.0688, + "num_input_tokens_seen": 22474368, + "step": 31320 + }, + { + "epoch": 65.12474012474013, + "grad_norm": 0.5250293612480164, + "learning_rate": 5.582889081005044e-06, + "loss": 0.1108, + "num_input_tokens_seen": 22478016, + "step": 31325 + }, + { + "epoch": 65.13513513513513, + "grad_norm": 0.3680535554885864, + "learning_rate": 5.5767066487160316e-06, + "loss": 0.0883, + "num_input_tokens_seen": 22481568, + "step": 31330 + }, + { + "epoch": 65.14553014553015, + "grad_norm": 0.2297213226556778, + "learning_rate": 5.570527211742949e-06, + "loss": 0.0829, + "num_input_tokens_seen": 22485120, + "step": 31335 + }, + { + "epoch": 65.15592515592516, + "grad_norm": 0.3868819773197174, + "learning_rate": 5.564350771038731e-06, + "loss": 0.0976, + "num_input_tokens_seen": 22488704, + "step": 31340 + }, + { + "epoch": 65.16632016632016, + "grad_norm": 0.38897261023521423, + "learning_rate": 5.558177327555875e-06, + "loss": 0.0929, + "num_input_tokens_seen": 22492352, + "step": 31345 + }, + { + "epoch": 65.17671517671518, + "grad_norm": 0.25179797410964966, + "learning_rate": 5.552006882246388e-06, + "loss": 0.0823, + "num_input_tokens_seen": 22496000, + "step": 31350 + }, + { + "epoch": 65.18711018711019, + "grad_norm": 0.4013758599758148, + "learning_rate": 5.545839436061839e-06, + "loss": 0.072, + "num_input_tokens_seen": 22499584, + "step": 31355 + }, + { + "epoch": 65.1975051975052, + "grad_norm": 0.22494763135910034, + "learning_rate": 5.539674989953331e-06, + "loss": 0.0722, + "num_input_tokens_seen": 22503200, + "step": 31360 + }, + { + "epoch": 65.20790020790021, + "grad_norm": 0.42611604928970337, + "learning_rate": 5.533513544871488e-06, + "loss": 0.1176, + "num_input_tokens_seen": 22506880, + "step": 31365 + }, + { + "epoch": 65.21829521829522, + "grad_norm": 0.5576034188270569, + "learning_rate": 5.527355101766493e-06, + "loss": 0.0965, + "num_input_tokens_seen": 22510400, + "step": 31370 + }, + { + "epoch": 65.22869022869023, + "grad_norm": 0.27529919147491455, + "learning_rate": 5.521199661588044e-06, + "loss": 0.0874, + "num_input_tokens_seen": 22514016, + "step": 31375 + }, + { + "epoch": 65.23908523908524, + "grad_norm": 0.4386137127876282, + "learning_rate": 5.5150472252853944e-06, + "loss": 0.0954, + "num_input_tokens_seen": 22517440, + "step": 31380 + }, + { + "epoch": 65.24948024948024, + "grad_norm": 0.48523351550102234, + "learning_rate": 5.50889779380733e-06, + "loss": 0.0755, + "num_input_tokens_seen": 22520960, + "step": 31385 + }, + { + "epoch": 65.25987525987526, + "grad_norm": 1.1746044158935547, + "learning_rate": 5.5027513681021605e-06, + "loss": 0.1118, + "num_input_tokens_seen": 22524544, + "step": 31390 + }, + { + "epoch": 65.27027027027027, + "grad_norm": 0.5120991468429565, + "learning_rate": 5.4966079491177545e-06, + "loss": 0.1444, + "num_input_tokens_seen": 22528256, + "step": 31395 + }, + { + "epoch": 65.28066528066527, + "grad_norm": 0.5731244087219238, + "learning_rate": 5.490467537801491e-06, + "loss": 0.132, + "num_input_tokens_seen": 22531840, + "step": 31400 + }, + { + "epoch": 65.28066528066527, + "eval_loss": 0.14439475536346436, + "eval_runtime": 7.7557, + "eval_samples_per_second": 110.37, + "eval_steps_per_second": 27.593, + "num_input_tokens_seen": 22531840, + "step": 31400 + }, + { + "epoch": 65.29106029106029, + "grad_norm": 0.6499722003936768, + "learning_rate": 5.484330135100313e-06, + "loss": 0.0949, + "num_input_tokens_seen": 22535456, + "step": 31405 + }, + { + "epoch": 65.3014553014553, + "grad_norm": 0.785932183265686, + "learning_rate": 5.4781957419606785e-06, + "loss": 0.1163, + "num_input_tokens_seen": 22538944, + "step": 31410 + }, + { + "epoch": 65.3118503118503, + "grad_norm": 0.31053829193115234, + "learning_rate": 5.472064359328577e-06, + "loss": 0.1057, + "num_input_tokens_seen": 22542496, + "step": 31415 + }, + { + "epoch": 65.32224532224532, + "grad_norm": 0.23045235872268677, + "learning_rate": 5.4659359881495565e-06, + "loss": 0.1011, + "num_input_tokens_seen": 22545888, + "step": 31420 + }, + { + "epoch": 65.33264033264034, + "grad_norm": 0.29562440514564514, + "learning_rate": 5.4598106293686916e-06, + "loss": 0.0844, + "num_input_tokens_seen": 22549312, + "step": 31425 + }, + { + "epoch": 65.34303534303534, + "grad_norm": 0.24387982487678528, + "learning_rate": 5.45368828393058e-06, + "loss": 0.0761, + "num_input_tokens_seen": 22552800, + "step": 31430 + }, + { + "epoch": 65.35343035343035, + "grad_norm": 0.7969462275505066, + "learning_rate": 5.44756895277937e-06, + "loss": 0.0912, + "num_input_tokens_seen": 22556320, + "step": 31435 + }, + { + "epoch": 65.36382536382537, + "grad_norm": 0.11400920897722244, + "learning_rate": 5.441452636858746e-06, + "loss": 0.0714, + "num_input_tokens_seen": 22559936, + "step": 31440 + }, + { + "epoch": 65.37422037422037, + "grad_norm": 0.3470611274242401, + "learning_rate": 5.435339337111905e-06, + "loss": 0.1107, + "num_input_tokens_seen": 22563520, + "step": 31445 + }, + { + "epoch": 65.38461538461539, + "grad_norm": 0.7369604110717773, + "learning_rate": 5.42922905448161e-06, + "loss": 0.097, + "num_input_tokens_seen": 22567040, + "step": 31450 + }, + { + "epoch": 65.39501039501039, + "grad_norm": 0.3575364649295807, + "learning_rate": 5.423121789910129e-06, + "loss": 0.0944, + "num_input_tokens_seen": 22570592, + "step": 31455 + }, + { + "epoch": 65.4054054054054, + "grad_norm": 0.21631038188934326, + "learning_rate": 5.417017544339287e-06, + "loss": 0.0652, + "num_input_tokens_seen": 22574080, + "step": 31460 + }, + { + "epoch": 65.41580041580042, + "grad_norm": 0.1997499018907547, + "learning_rate": 5.410916318710443e-06, + "loss": 0.0711, + "num_input_tokens_seen": 22577568, + "step": 31465 + }, + { + "epoch": 65.42619542619542, + "grad_norm": 0.3892296552658081, + "learning_rate": 5.404818113964466e-06, + "loss": 0.0856, + "num_input_tokens_seen": 22581152, + "step": 31470 + }, + { + "epoch": 65.43659043659044, + "grad_norm": 0.3111460208892822, + "learning_rate": 5.398722931041792e-06, + "loss": 0.0944, + "num_input_tokens_seen": 22584704, + "step": 31475 + }, + { + "epoch": 65.44698544698545, + "grad_norm": 0.21274176239967346, + "learning_rate": 5.392630770882367e-06, + "loss": 0.0807, + "num_input_tokens_seen": 22588224, + "step": 31480 + }, + { + "epoch": 65.45738045738045, + "grad_norm": 0.20206262171268463, + "learning_rate": 5.3865416344256705e-06, + "loss": 0.0892, + "num_input_tokens_seen": 22591936, + "step": 31485 + }, + { + "epoch": 65.46777546777547, + "grad_norm": 0.2551477253437042, + "learning_rate": 5.380455522610742e-06, + "loss": 0.0735, + "num_input_tokens_seen": 22595584, + "step": 31490 + }, + { + "epoch": 65.47817047817048, + "grad_norm": 0.593043863773346, + "learning_rate": 5.374372436376116e-06, + "loss": 0.1127, + "num_input_tokens_seen": 22599104, + "step": 31495 + }, + { + "epoch": 65.48856548856548, + "grad_norm": 0.7351667881011963, + "learning_rate": 5.368292376659895e-06, + "loss": 0.1597, + "num_input_tokens_seen": 22602944, + "step": 31500 + }, + { + "epoch": 65.4989604989605, + "grad_norm": 0.3836387097835541, + "learning_rate": 5.362215344399701e-06, + "loss": 0.0985, + "num_input_tokens_seen": 22606592, + "step": 31505 + }, + { + "epoch": 65.50935550935552, + "grad_norm": 0.40231671929359436, + "learning_rate": 5.356141340532678e-06, + "loss": 0.0821, + "num_input_tokens_seen": 22610208, + "step": 31510 + }, + { + "epoch": 65.51975051975052, + "grad_norm": 0.2371170073747635, + "learning_rate": 5.350070365995522e-06, + "loss": 0.0657, + "num_input_tokens_seen": 22613728, + "step": 31515 + }, + { + "epoch": 65.53014553014553, + "grad_norm": 0.5376015901565552, + "learning_rate": 5.344002421724459e-06, + "loss": 0.1152, + "num_input_tokens_seen": 22617312, + "step": 31520 + }, + { + "epoch": 65.54054054054055, + "grad_norm": 0.889509916305542, + "learning_rate": 5.337937508655228e-06, + "loss": 0.094, + "num_input_tokens_seen": 22620896, + "step": 31525 + }, + { + "epoch": 65.55093555093555, + "grad_norm": 0.4362793564796448, + "learning_rate": 5.331875627723126e-06, + "loss": 0.0941, + "num_input_tokens_seen": 22624512, + "step": 31530 + }, + { + "epoch": 65.56133056133056, + "grad_norm": 0.5806521773338318, + "learning_rate": 5.325816779862963e-06, + "loss": 0.0594, + "num_input_tokens_seen": 22628192, + "step": 31535 + }, + { + "epoch": 65.57172557172557, + "grad_norm": 0.2627500593662262, + "learning_rate": 5.319760966009102e-06, + "loss": 0.0992, + "num_input_tokens_seen": 22631744, + "step": 31540 + }, + { + "epoch": 65.58212058212058, + "grad_norm": 0.2512948215007782, + "learning_rate": 5.3137081870954096e-06, + "loss": 0.0882, + "num_input_tokens_seen": 22635392, + "step": 31545 + }, + { + "epoch": 65.5925155925156, + "grad_norm": 0.48615434765815735, + "learning_rate": 5.307658444055313e-06, + "loss": 0.0819, + "num_input_tokens_seen": 22639008, + "step": 31550 + }, + { + "epoch": 65.6029106029106, + "grad_norm": 1.1034198999404907, + "learning_rate": 5.301611737821749e-06, + "loss": 0.1106, + "num_input_tokens_seen": 22642592, + "step": 31555 + }, + { + "epoch": 65.61330561330561, + "grad_norm": 0.6793763041496277, + "learning_rate": 5.295568069327206e-06, + "loss": 0.1336, + "num_input_tokens_seen": 22646208, + "step": 31560 + }, + { + "epoch": 65.62370062370063, + "grad_norm": 0.27983951568603516, + "learning_rate": 5.289527439503683e-06, + "loss": 0.0779, + "num_input_tokens_seen": 22649696, + "step": 31565 + }, + { + "epoch": 65.63409563409563, + "grad_norm": 0.2597922682762146, + "learning_rate": 5.28348984928273e-06, + "loss": 0.1354, + "num_input_tokens_seen": 22653152, + "step": 31570 + }, + { + "epoch": 65.64449064449065, + "grad_norm": 0.5724174380302429, + "learning_rate": 5.27745529959541e-06, + "loss": 0.1181, + "num_input_tokens_seen": 22656768, + "step": 31575 + }, + { + "epoch": 65.65488565488566, + "grad_norm": 0.3869037628173828, + "learning_rate": 5.271423791372335e-06, + "loss": 0.0991, + "num_input_tokens_seen": 22660352, + "step": 31580 + }, + { + "epoch": 65.66528066528066, + "grad_norm": 0.18997223675251007, + "learning_rate": 5.26539532554364e-06, + "loss": 0.0755, + "num_input_tokens_seen": 22664000, + "step": 31585 + }, + { + "epoch": 65.67567567567568, + "grad_norm": 0.37229084968566895, + "learning_rate": 5.25936990303898e-06, + "loss": 0.1172, + "num_input_tokens_seen": 22667616, + "step": 31590 + }, + { + "epoch": 65.68607068607069, + "grad_norm": 0.15559294819831848, + "learning_rate": 5.253347524787555e-06, + "loss": 0.0791, + "num_input_tokens_seen": 22671296, + "step": 31595 + }, + { + "epoch": 65.6964656964657, + "grad_norm": 0.24294370412826538, + "learning_rate": 5.2473281917181035e-06, + "loss": 0.0858, + "num_input_tokens_seen": 22674688, + "step": 31600 + }, + { + "epoch": 65.6964656964657, + "eval_loss": 0.1459350436925888, + "eval_runtime": 7.7436, + "eval_samples_per_second": 110.543, + "eval_steps_per_second": 27.636, + "num_input_tokens_seen": 22674688, + "step": 31600 + }, + { + "epoch": 65.70686070686071, + "grad_norm": 0.5911042094230652, + "learning_rate": 5.241311904758864e-06, + "loss": 0.1304, + "num_input_tokens_seen": 22678432, + "step": 31605 + }, + { + "epoch": 65.71725571725571, + "grad_norm": 1.1058721542358398, + "learning_rate": 5.23529866483764e-06, + "loss": 0.1609, + "num_input_tokens_seen": 22682048, + "step": 31610 + }, + { + "epoch": 65.72765072765073, + "grad_norm": 0.21576948463916779, + "learning_rate": 5.229288472881732e-06, + "loss": 0.1049, + "num_input_tokens_seen": 22685664, + "step": 31615 + }, + { + "epoch": 65.73804573804574, + "grad_norm": 0.31117576360702515, + "learning_rate": 5.2232813298180025e-06, + "loss": 0.0997, + "num_input_tokens_seen": 22689216, + "step": 31620 + }, + { + "epoch": 65.74844074844074, + "grad_norm": 0.36411136388778687, + "learning_rate": 5.217277236572824e-06, + "loss": 0.0845, + "num_input_tokens_seen": 22692736, + "step": 31625 + }, + { + "epoch": 65.75883575883576, + "grad_norm": 0.1850099116563797, + "learning_rate": 5.211276194072093e-06, + "loss": 0.0931, + "num_input_tokens_seen": 22696352, + "step": 31630 + }, + { + "epoch": 65.76923076923077, + "grad_norm": 0.7429699301719666, + "learning_rate": 5.205278203241254e-06, + "loss": 0.1185, + "num_input_tokens_seen": 22700064, + "step": 31635 + }, + { + "epoch": 65.77962577962577, + "grad_norm": 0.2999574542045593, + "learning_rate": 5.199283265005278e-06, + "loss": 0.0971, + "num_input_tokens_seen": 22703808, + "step": 31640 + }, + { + "epoch": 65.79002079002079, + "grad_norm": 0.4975307881832123, + "learning_rate": 5.193291380288648e-06, + "loss": 0.0819, + "num_input_tokens_seen": 22707424, + "step": 31645 + }, + { + "epoch": 65.8004158004158, + "grad_norm": 0.194259375333786, + "learning_rate": 5.1873025500153995e-06, + "loss": 0.0926, + "num_input_tokens_seen": 22711072, + "step": 31650 + }, + { + "epoch": 65.8108108108108, + "grad_norm": 0.7576483488082886, + "learning_rate": 5.181316775109071e-06, + "loss": 0.1519, + "num_input_tokens_seen": 22714688, + "step": 31655 + }, + { + "epoch": 65.82120582120582, + "grad_norm": 0.9128345251083374, + "learning_rate": 5.1753340564927564e-06, + "loss": 0.1117, + "num_input_tokens_seen": 22718176, + "step": 31660 + }, + { + "epoch": 65.83160083160084, + "grad_norm": 0.17707955837249756, + "learning_rate": 5.169354395089068e-06, + "loss": 0.0941, + "num_input_tokens_seen": 22721696, + "step": 31665 + }, + { + "epoch": 65.84199584199584, + "grad_norm": 0.2873551845550537, + "learning_rate": 5.1633777918201346e-06, + "loss": 0.1481, + "num_input_tokens_seen": 22725248, + "step": 31670 + }, + { + "epoch": 65.85239085239085, + "grad_norm": 0.3504568338394165, + "learning_rate": 5.157404247607625e-06, + "loss": 0.0673, + "num_input_tokens_seen": 22728832, + "step": 31675 + }, + { + "epoch": 65.86278586278586, + "grad_norm": 0.2688019275665283, + "learning_rate": 5.1514337633727454e-06, + "loss": 0.0952, + "num_input_tokens_seen": 22732288, + "step": 31680 + }, + { + "epoch": 65.87318087318087, + "grad_norm": 0.21848447620868683, + "learning_rate": 5.145466340036206e-06, + "loss": 0.1009, + "num_input_tokens_seen": 22735840, + "step": 31685 + }, + { + "epoch": 65.88357588357589, + "grad_norm": 0.3991180658340454, + "learning_rate": 5.139501978518274e-06, + "loss": 0.097, + "num_input_tokens_seen": 22739456, + "step": 31690 + }, + { + "epoch": 65.89397089397089, + "grad_norm": 0.552785336971283, + "learning_rate": 5.133540679738716e-06, + "loss": 0.1365, + "num_input_tokens_seen": 22743072, + "step": 31695 + }, + { + "epoch": 65.9043659043659, + "grad_norm": 0.17024102807044983, + "learning_rate": 5.127582444616838e-06, + "loss": 0.1018, + "num_input_tokens_seen": 22746720, + "step": 31700 + }, + { + "epoch": 65.91476091476092, + "grad_norm": 0.48275938630104065, + "learning_rate": 5.121627274071486e-06, + "loss": 0.101, + "num_input_tokens_seen": 22750240, + "step": 31705 + }, + { + "epoch": 65.92515592515592, + "grad_norm": 0.4515998065471649, + "learning_rate": 5.115675169021009e-06, + "loss": 0.1453, + "num_input_tokens_seen": 22753792, + "step": 31710 + }, + { + "epoch": 65.93555093555094, + "grad_norm": 0.2683252692222595, + "learning_rate": 5.1097261303832994e-06, + "loss": 0.1237, + "num_input_tokens_seen": 22757312, + "step": 31715 + }, + { + "epoch": 65.94594594594595, + "grad_norm": 0.17823943495750427, + "learning_rate": 5.103780159075788e-06, + "loss": 0.093, + "num_input_tokens_seen": 22760992, + "step": 31720 + }, + { + "epoch": 65.95634095634095, + "grad_norm": 0.276814341545105, + "learning_rate": 5.0978372560154e-06, + "loss": 0.0748, + "num_input_tokens_seen": 22764480, + "step": 31725 + }, + { + "epoch": 65.96673596673597, + "grad_norm": 0.2759748697280884, + "learning_rate": 5.091897422118619e-06, + "loss": 0.1246, + "num_input_tokens_seen": 22768192, + "step": 31730 + }, + { + "epoch": 65.97713097713098, + "grad_norm": 0.206235870718956, + "learning_rate": 5.0859606583014305e-06, + "loss": 0.0791, + "num_input_tokens_seen": 22771680, + "step": 31735 + }, + { + "epoch": 65.98752598752598, + "grad_norm": 0.20266839861869812, + "learning_rate": 5.080026965479365e-06, + "loss": 0.1266, + "num_input_tokens_seen": 22775168, + "step": 31740 + }, + { + "epoch": 65.997920997921, + "grad_norm": 0.40908658504486084, + "learning_rate": 5.074096344567475e-06, + "loss": 0.0927, + "num_input_tokens_seen": 22778880, + "step": 31745 + }, + { + "epoch": 66.00831600831602, + "grad_norm": 0.2277100682258606, + "learning_rate": 5.0681687964803294e-06, + "loss": 0.083, + "num_input_tokens_seen": 22782328, + "step": 31750 + }, + { + "epoch": 66.01871101871102, + "grad_norm": 0.4339008033275604, + "learning_rate": 5.06224432213204e-06, + "loss": 0.1127, + "num_input_tokens_seen": 22785976, + "step": 31755 + }, + { + "epoch": 66.02910602910603, + "grad_norm": 0.2573918402194977, + "learning_rate": 5.056322922436224e-06, + "loss": 0.0745, + "num_input_tokens_seen": 22789528, + "step": 31760 + }, + { + "epoch": 66.03950103950103, + "grad_norm": 0.4023597240447998, + "learning_rate": 5.0504045983060465e-06, + "loss": 0.1024, + "num_input_tokens_seen": 22793176, + "step": 31765 + }, + { + "epoch": 66.04989604989605, + "grad_norm": 0.2186843752861023, + "learning_rate": 5.044489350654183e-06, + "loss": 0.0695, + "num_input_tokens_seen": 22796664, + "step": 31770 + }, + { + "epoch": 66.06029106029106, + "grad_norm": 0.3447563946247101, + "learning_rate": 5.038577180392831e-06, + "loss": 0.0902, + "num_input_tokens_seen": 22800184, + "step": 31775 + }, + { + "epoch": 66.07068607068607, + "grad_norm": 0.22903630137443542, + "learning_rate": 5.032668088433729e-06, + "loss": 0.1122, + "num_input_tokens_seen": 22803608, + "step": 31780 + }, + { + "epoch": 66.08108108108108, + "grad_norm": 0.3262785077095032, + "learning_rate": 5.02676207568814e-06, + "loss": 0.0915, + "num_input_tokens_seen": 22807224, + "step": 31785 + }, + { + "epoch": 66.0914760914761, + "grad_norm": 0.18877442181110382, + "learning_rate": 5.02085914306683e-06, + "loss": 0.0779, + "num_input_tokens_seen": 22810776, + "step": 31790 + }, + { + "epoch": 66.1018711018711, + "grad_norm": 0.22951726615428925, + "learning_rate": 5.014959291480123e-06, + "loss": 0.1003, + "num_input_tokens_seen": 22814264, + "step": 31795 + }, + { + "epoch": 66.11226611226611, + "grad_norm": 0.6777262687683105, + "learning_rate": 5.009062521837835e-06, + "loss": 0.0828, + "num_input_tokens_seen": 22817880, + "step": 31800 + }, + { + "epoch": 66.11226611226611, + "eval_loss": 0.14334900677204132, + "eval_runtime": 7.7556, + "eval_samples_per_second": 110.372, + "eval_steps_per_second": 27.593, + "num_input_tokens_seen": 22817880, + "step": 31800 + }, + { + "epoch": 66.12266112266113, + "grad_norm": 0.2153942734003067, + "learning_rate": 5.003168835049324e-06, + "loss": 0.0602, + "num_input_tokens_seen": 22821336, + "step": 31805 + }, + { + "epoch": 66.13305613305613, + "grad_norm": 0.3409616947174072, + "learning_rate": 4.997278232023483e-06, + "loss": 0.074, + "num_input_tokens_seen": 22824952, + "step": 31810 + }, + { + "epoch": 66.14345114345114, + "grad_norm": 0.18530087172985077, + "learning_rate": 4.9913907136687036e-06, + "loss": 0.1032, + "num_input_tokens_seen": 22828536, + "step": 31815 + }, + { + "epoch": 66.15384615384616, + "grad_norm": 0.32681283354759216, + "learning_rate": 4.985506280892918e-06, + "loss": 0.0825, + "num_input_tokens_seen": 22832280, + "step": 31820 + }, + { + "epoch": 66.16424116424116, + "grad_norm": 0.5747520923614502, + "learning_rate": 4.979624934603589e-06, + "loss": 0.0805, + "num_input_tokens_seen": 22835928, + "step": 31825 + }, + { + "epoch": 66.17463617463618, + "grad_norm": 0.5912531018257141, + "learning_rate": 4.97374667570768e-06, + "loss": 0.0858, + "num_input_tokens_seen": 22839416, + "step": 31830 + }, + { + "epoch": 66.18503118503118, + "grad_norm": 0.10625794529914856, + "learning_rate": 4.967871505111704e-06, + "loss": 0.0588, + "num_input_tokens_seen": 22843032, + "step": 31835 + }, + { + "epoch": 66.1954261954262, + "grad_norm": 0.17993037402629852, + "learning_rate": 4.961999423721686e-06, + "loss": 0.072, + "num_input_tokens_seen": 22846584, + "step": 31840 + }, + { + "epoch": 66.20582120582121, + "grad_norm": 0.28817933797836304, + "learning_rate": 4.956130432443159e-06, + "loss": 0.0642, + "num_input_tokens_seen": 22850328, + "step": 31845 + }, + { + "epoch": 66.21621621621621, + "grad_norm": 0.45197880268096924, + "learning_rate": 4.950264532181215e-06, + "loss": 0.0871, + "num_input_tokens_seen": 22854072, + "step": 31850 + }, + { + "epoch": 66.22661122661123, + "grad_norm": 0.5044910311698914, + "learning_rate": 4.944401723840433e-06, + "loss": 0.1289, + "num_input_tokens_seen": 22857880, + "step": 31855 + }, + { + "epoch": 66.23700623700624, + "grad_norm": 0.7405793070793152, + "learning_rate": 4.938542008324942e-06, + "loss": 0.1563, + "num_input_tokens_seen": 22861496, + "step": 31860 + }, + { + "epoch": 66.24740124740124, + "grad_norm": 0.18063178658485413, + "learning_rate": 4.9326853865383855e-06, + "loss": 0.1123, + "num_input_tokens_seen": 22865272, + "step": 31865 + }, + { + "epoch": 66.25779625779626, + "grad_norm": 0.1684969812631607, + "learning_rate": 4.926831859383918e-06, + "loss": 0.1145, + "num_input_tokens_seen": 22869080, + "step": 31870 + }, + { + "epoch": 66.26819126819127, + "grad_norm": 0.7190316319465637, + "learning_rate": 4.92098142776424e-06, + "loss": 0.1102, + "num_input_tokens_seen": 22872600, + "step": 31875 + }, + { + "epoch": 66.27858627858627, + "grad_norm": 0.45360225439071655, + "learning_rate": 4.91513409258155e-06, + "loss": 0.1321, + "num_input_tokens_seen": 22875992, + "step": 31880 + }, + { + "epoch": 66.28898128898129, + "grad_norm": 0.2598673701286316, + "learning_rate": 4.909289854737581e-06, + "loss": 0.1191, + "num_input_tokens_seen": 22879480, + "step": 31885 + }, + { + "epoch": 66.2993762993763, + "grad_norm": 0.3931884169578552, + "learning_rate": 4.903448715133602e-06, + "loss": 0.1161, + "num_input_tokens_seen": 22882968, + "step": 31890 + }, + { + "epoch": 66.3097713097713, + "grad_norm": 0.5987743735313416, + "learning_rate": 4.897610674670372e-06, + "loss": 0.1062, + "num_input_tokens_seen": 22886488, + "step": 31895 + }, + { + "epoch": 66.32016632016632, + "grad_norm": 0.365413099527359, + "learning_rate": 4.8917757342482e-06, + "loss": 0.1027, + "num_input_tokens_seen": 22890008, + "step": 31900 + }, + { + "epoch": 66.33056133056132, + "grad_norm": 0.4778042733669281, + "learning_rate": 4.885943894766909e-06, + "loss": 0.1338, + "num_input_tokens_seen": 22893784, + "step": 31905 + }, + { + "epoch": 66.34095634095634, + "grad_norm": 0.21968160569667816, + "learning_rate": 4.880115157125842e-06, + "loss": 0.1208, + "num_input_tokens_seen": 22897432, + "step": 31910 + }, + { + "epoch": 66.35135135135135, + "grad_norm": 0.20962084829807281, + "learning_rate": 4.874289522223857e-06, + "loss": 0.0937, + "num_input_tokens_seen": 22900984, + "step": 31915 + }, + { + "epoch": 66.36174636174636, + "grad_norm": 0.32096949219703674, + "learning_rate": 4.868466990959339e-06, + "loss": 0.104, + "num_input_tokens_seen": 22904472, + "step": 31920 + }, + { + "epoch": 66.37214137214137, + "grad_norm": 0.40723109245300293, + "learning_rate": 4.8626475642301964e-06, + "loss": 0.0731, + "num_input_tokens_seen": 22907960, + "step": 31925 + }, + { + "epoch": 66.38253638253639, + "grad_norm": 0.37112703919410706, + "learning_rate": 4.856831242933871e-06, + "loss": 0.1105, + "num_input_tokens_seen": 22911480, + "step": 31930 + }, + { + "epoch": 66.39293139293139, + "grad_norm": 0.35966014862060547, + "learning_rate": 4.851018027967294e-06, + "loss": 0.1144, + "num_input_tokens_seen": 22915192, + "step": 31935 + }, + { + "epoch": 66.4033264033264, + "grad_norm": 1.0162370204925537, + "learning_rate": 4.845207920226946e-06, + "loss": 0.0952, + "num_input_tokens_seen": 22918776, + "step": 31940 + }, + { + "epoch": 66.41372141372142, + "grad_norm": 0.36901766061782837, + "learning_rate": 4.839400920608825e-06, + "loss": 0.1301, + "num_input_tokens_seen": 22922296, + "step": 31945 + }, + { + "epoch": 66.42411642411642, + "grad_norm": 0.22890816628932953, + "learning_rate": 4.83359703000843e-06, + "loss": 0.0828, + "num_input_tokens_seen": 22925880, + "step": 31950 + }, + { + "epoch": 66.43451143451144, + "grad_norm": 0.16886121034622192, + "learning_rate": 4.827796249320804e-06, + "loss": 0.0965, + "num_input_tokens_seen": 22929400, + "step": 31955 + }, + { + "epoch": 66.44490644490645, + "grad_norm": 0.6429018974304199, + "learning_rate": 4.82199857944049e-06, + "loss": 0.1024, + "num_input_tokens_seen": 22933176, + "step": 31960 + }, + { + "epoch": 66.45530145530145, + "grad_norm": 0.6069443821907043, + "learning_rate": 4.8162040212615695e-06, + "loss": 0.1094, + "num_input_tokens_seen": 22936792, + "step": 31965 + }, + { + "epoch": 66.46569646569647, + "grad_norm": 0.1400240808725357, + "learning_rate": 4.810412575677639e-06, + "loss": 0.1236, + "num_input_tokens_seen": 22940440, + "step": 31970 + }, + { + "epoch": 66.47609147609148, + "grad_norm": 0.24317888915538788, + "learning_rate": 4.804624243581801e-06, + "loss": 0.1121, + "num_input_tokens_seen": 22944216, + "step": 31975 + }, + { + "epoch": 66.48648648648648, + "grad_norm": 0.1391100138425827, + "learning_rate": 4.798839025866703e-06, + "loss": 0.1095, + "num_input_tokens_seen": 22947864, + "step": 31980 + }, + { + "epoch": 66.4968814968815, + "grad_norm": 0.3812251091003418, + "learning_rate": 4.793056923424491e-06, + "loss": 0.0675, + "num_input_tokens_seen": 22951448, + "step": 31985 + }, + { + "epoch": 66.5072765072765, + "grad_norm": 0.6488828659057617, + "learning_rate": 4.78727793714683e-06, + "loss": 0.1225, + "num_input_tokens_seen": 22955256, + "step": 31990 + }, + { + "epoch": 66.51767151767152, + "grad_norm": 0.2994897663593292, + "learning_rate": 4.7815020679249285e-06, + "loss": 0.101, + "num_input_tokens_seen": 22958872, + "step": 31995 + }, + { + "epoch": 66.52806652806653, + "grad_norm": 0.9852686524391174, + "learning_rate": 4.775729316649483e-06, + "loss": 0.1095, + "num_input_tokens_seen": 22962360, + "step": 32000 + }, + { + "epoch": 66.52806652806653, + "eval_loss": 0.14417335391044617, + "eval_runtime": 7.7446, + "eval_samples_per_second": 110.529, + "eval_steps_per_second": 27.632, + "num_input_tokens_seen": 22962360, + "step": 32000 + }, + { + "epoch": 66.53846153846153, + "grad_norm": 0.3315780460834503, + "learning_rate": 4.769959684210728e-06, + "loss": 0.1074, + "num_input_tokens_seen": 22966168, + "step": 32005 + }, + { + "epoch": 66.54885654885655, + "grad_norm": 1.481986403465271, + "learning_rate": 4.764193171498426e-06, + "loss": 0.1328, + "num_input_tokens_seen": 22969592, + "step": 32010 + }, + { + "epoch": 66.55925155925156, + "grad_norm": 0.27668312191963196, + "learning_rate": 4.75842977940183e-06, + "loss": 0.107, + "num_input_tokens_seen": 22972984, + "step": 32015 + }, + { + "epoch": 66.56964656964657, + "grad_norm": 0.1697724163532257, + "learning_rate": 4.752669508809729e-06, + "loss": 0.0967, + "num_input_tokens_seen": 22976664, + "step": 32020 + }, + { + "epoch": 66.58004158004158, + "grad_norm": 0.7839643955230713, + "learning_rate": 4.746912360610445e-06, + "loss": 0.1037, + "num_input_tokens_seen": 22980280, + "step": 32025 + }, + { + "epoch": 66.5904365904366, + "grad_norm": 0.2587629556655884, + "learning_rate": 4.741158335691781e-06, + "loss": 0.1012, + "num_input_tokens_seen": 22983672, + "step": 32030 + }, + { + "epoch": 66.6008316008316, + "grad_norm": 0.5428156852722168, + "learning_rate": 4.7354074349410994e-06, + "loss": 0.0932, + "num_input_tokens_seen": 22987064, + "step": 32035 + }, + { + "epoch": 66.61122661122661, + "grad_norm": 0.6792365312576294, + "learning_rate": 4.729659659245245e-06, + "loss": 0.1318, + "num_input_tokens_seen": 22990744, + "step": 32040 + }, + { + "epoch": 66.62162162162163, + "grad_norm": 0.6303175687789917, + "learning_rate": 4.723915009490601e-06, + "loss": 0.0576, + "num_input_tokens_seen": 22994200, + "step": 32045 + }, + { + "epoch": 66.63201663201663, + "grad_norm": 0.3483058214187622, + "learning_rate": 4.718173486563077e-06, + "loss": 0.1075, + "num_input_tokens_seen": 22998040, + "step": 32050 + }, + { + "epoch": 66.64241164241164, + "grad_norm": 0.3470064699649811, + "learning_rate": 4.71243509134808e-06, + "loss": 0.0717, + "num_input_tokens_seen": 23001560, + "step": 32055 + }, + { + "epoch": 66.65280665280665, + "grad_norm": 0.5922716856002808, + "learning_rate": 4.706699824730532e-06, + "loss": 0.0938, + "num_input_tokens_seen": 23005272, + "step": 32060 + }, + { + "epoch": 66.66320166320166, + "grad_norm": 0.45968976616859436, + "learning_rate": 4.700967687594901e-06, + "loss": 0.0817, + "num_input_tokens_seen": 23008760, + "step": 32065 + }, + { + "epoch": 66.67359667359668, + "grad_norm": 0.2090291827917099, + "learning_rate": 4.69523868082514e-06, + "loss": 0.0866, + "num_input_tokens_seen": 23012280, + "step": 32070 + }, + { + "epoch": 66.68399168399168, + "grad_norm": 0.5240241289138794, + "learning_rate": 4.689512805304747e-06, + "loss": 0.1013, + "num_input_tokens_seen": 23015864, + "step": 32075 + }, + { + "epoch": 66.6943866943867, + "grad_norm": 0.34827864170074463, + "learning_rate": 4.683790061916707e-06, + "loss": 0.0728, + "num_input_tokens_seen": 23019448, + "step": 32080 + }, + { + "epoch": 66.70478170478171, + "grad_norm": 0.6395851373672485, + "learning_rate": 4.678070451543551e-06, + "loss": 0.0784, + "num_input_tokens_seen": 23023000, + "step": 32085 + }, + { + "epoch": 66.71517671517671, + "grad_norm": 0.4553733468055725, + "learning_rate": 4.6723539750673204e-06, + "loss": 0.0878, + "num_input_tokens_seen": 23026616, + "step": 32090 + }, + { + "epoch": 66.72557172557173, + "grad_norm": 0.451742559671402, + "learning_rate": 4.666640633369551e-06, + "loss": 0.1063, + "num_input_tokens_seen": 23030104, + "step": 32095 + }, + { + "epoch": 66.73596673596674, + "grad_norm": 0.19700632989406586, + "learning_rate": 4.660930427331323e-06, + "loss": 0.0761, + "num_input_tokens_seen": 23033720, + "step": 32100 + }, + { + "epoch": 66.74636174636174, + "grad_norm": 0.3300938606262207, + "learning_rate": 4.6552233578332244e-06, + "loss": 0.0779, + "num_input_tokens_seen": 23037208, + "step": 32105 + }, + { + "epoch": 66.75675675675676, + "grad_norm": 0.24689306318759918, + "learning_rate": 4.649519425755347e-06, + "loss": 0.1038, + "num_input_tokens_seen": 23041112, + "step": 32110 + }, + { + "epoch": 66.76715176715177, + "grad_norm": 0.5937468409538269, + "learning_rate": 4.64381863197732e-06, + "loss": 0.1315, + "num_input_tokens_seen": 23044856, + "step": 32115 + }, + { + "epoch": 66.77754677754677, + "grad_norm": 0.38962069153785706, + "learning_rate": 4.638120977378269e-06, + "loss": 0.0959, + "num_input_tokens_seen": 23048376, + "step": 32120 + }, + { + "epoch": 66.78794178794179, + "grad_norm": 0.3505648076534271, + "learning_rate": 4.632426462836848e-06, + "loss": 0.0801, + "num_input_tokens_seen": 23051928, + "step": 32125 + }, + { + "epoch": 66.7983367983368, + "grad_norm": 0.6826577186584473, + "learning_rate": 4.626735089231224e-06, + "loss": 0.0769, + "num_input_tokens_seen": 23055512, + "step": 32130 + }, + { + "epoch": 66.8087318087318, + "grad_norm": 1.0104484558105469, + "learning_rate": 4.621046857439068e-06, + "loss": 0.0961, + "num_input_tokens_seen": 23059192, + "step": 32135 + }, + { + "epoch": 66.81912681912682, + "grad_norm": 0.29984787106513977, + "learning_rate": 4.615361768337587e-06, + "loss": 0.1258, + "num_input_tokens_seen": 23062648, + "step": 32140 + }, + { + "epoch": 66.82952182952182, + "grad_norm": 0.35471630096435547, + "learning_rate": 4.6096798228034946e-06, + "loss": 0.0817, + "num_input_tokens_seen": 23066200, + "step": 32145 + }, + { + "epoch": 66.83991683991684, + "grad_norm": 0.7108748555183411, + "learning_rate": 4.604001021713008e-06, + "loss": 0.1078, + "num_input_tokens_seen": 23069752, + "step": 32150 + }, + { + "epoch": 66.85031185031185, + "grad_norm": 0.29195693135261536, + "learning_rate": 4.598325365941883e-06, + "loss": 0.1225, + "num_input_tokens_seen": 23073304, + "step": 32155 + }, + { + "epoch": 66.86070686070686, + "grad_norm": 0.30633020401000977, + "learning_rate": 4.5926528563653645e-06, + "loss": 0.0837, + "num_input_tokens_seen": 23076952, + "step": 32160 + }, + { + "epoch": 66.87110187110187, + "grad_norm": 0.5762143135070801, + "learning_rate": 4.5869834938582295e-06, + "loss": 0.0932, + "num_input_tokens_seen": 23080344, + "step": 32165 + }, + { + "epoch": 66.88149688149689, + "grad_norm": 0.3018818795681, + "learning_rate": 4.581317279294772e-06, + "loss": 0.0979, + "num_input_tokens_seen": 23084024, + "step": 32170 + }, + { + "epoch": 66.89189189189189, + "grad_norm": 0.3781896233558655, + "learning_rate": 4.57565421354878e-06, + "loss": 0.1005, + "num_input_tokens_seen": 23087512, + "step": 32175 + }, + { + "epoch": 66.9022869022869, + "grad_norm": 0.2984575927257538, + "learning_rate": 4.569994297493579e-06, + "loss": 0.0792, + "num_input_tokens_seen": 23091160, + "step": 32180 + }, + { + "epoch": 66.91268191268192, + "grad_norm": 0.3169272840023041, + "learning_rate": 4.564337532002002e-06, + "loss": 0.1426, + "num_input_tokens_seen": 23094808, + "step": 32185 + }, + { + "epoch": 66.92307692307692, + "grad_norm": 0.6329285502433777, + "learning_rate": 4.55868391794638e-06, + "loss": 0.111, + "num_input_tokens_seen": 23098328, + "step": 32190 + }, + { + "epoch": 66.93347193347194, + "grad_norm": 0.3923807740211487, + "learning_rate": 4.553033456198588e-06, + "loss": 0.0767, + "num_input_tokens_seen": 23101976, + "step": 32195 + }, + { + "epoch": 66.94386694386695, + "grad_norm": 0.8384453654289246, + "learning_rate": 4.54738614762999e-06, + "loss": 0.0726, + "num_input_tokens_seen": 23105624, + "step": 32200 + }, + { + "epoch": 66.94386694386695, + "eval_loss": 0.14488768577575684, + "eval_runtime": 7.7624, + "eval_samples_per_second": 110.276, + "eval_steps_per_second": 27.569, + "num_input_tokens_seen": 23105624, + "step": 32200 + }, + { + "epoch": 66.95426195426195, + "grad_norm": 0.26257503032684326, + "learning_rate": 4.541741993111465e-06, + "loss": 0.0742, + "num_input_tokens_seen": 23109048, + "step": 32205 + }, + { + "epoch": 66.96465696465697, + "grad_norm": 0.5934534072875977, + "learning_rate": 4.536100993513423e-06, + "loss": 0.1223, + "num_input_tokens_seen": 23112664, + "step": 32210 + }, + { + "epoch": 66.97505197505197, + "grad_norm": 0.2224658876657486, + "learning_rate": 4.530463149705768e-06, + "loss": 0.0937, + "num_input_tokens_seen": 23116152, + "step": 32215 + }, + { + "epoch": 66.98544698544698, + "grad_norm": 0.23722806572914124, + "learning_rate": 4.524828462557934e-06, + "loss": 0.0853, + "num_input_tokens_seen": 23119768, + "step": 32220 + }, + { + "epoch": 66.995841995842, + "grad_norm": 0.6002709269523621, + "learning_rate": 4.5191969329388625e-06, + "loss": 0.0986, + "num_input_tokens_seen": 23123352, + "step": 32225 + }, + { + "epoch": 67.006237006237, + "grad_norm": 0.538020670413971, + "learning_rate": 4.5135685617169965e-06, + "loss": 0.1084, + "num_input_tokens_seen": 23126736, + "step": 32230 + }, + { + "epoch": 67.01663201663202, + "grad_norm": 0.47488662600517273, + "learning_rate": 4.507943349760313e-06, + "loss": 0.1342, + "num_input_tokens_seen": 23130288, + "step": 32235 + }, + { + "epoch": 67.02702702702703, + "grad_norm": 0.41633981466293335, + "learning_rate": 4.502321297936277e-06, + "loss": 0.0923, + "num_input_tokens_seen": 23133808, + "step": 32240 + }, + { + "epoch": 67.03742203742203, + "grad_norm": 0.32148897647857666, + "learning_rate": 4.496702407111888e-06, + "loss": 0.0774, + "num_input_tokens_seen": 23137392, + "step": 32245 + }, + { + "epoch": 67.04781704781705, + "grad_norm": 0.31444767117500305, + "learning_rate": 4.491086678153653e-06, + "loss": 0.1342, + "num_input_tokens_seen": 23141136, + "step": 32250 + }, + { + "epoch": 67.05821205821206, + "grad_norm": 0.2519046366214752, + "learning_rate": 4.485474111927579e-06, + "loss": 0.1219, + "num_input_tokens_seen": 23144880, + "step": 32255 + }, + { + "epoch": 67.06860706860707, + "grad_norm": 0.536316454410553, + "learning_rate": 4.479864709299197e-06, + "loss": 0.1503, + "num_input_tokens_seen": 23148336, + "step": 32260 + }, + { + "epoch": 67.07900207900208, + "grad_norm": 0.8055493235588074, + "learning_rate": 4.474258471133555e-06, + "loss": 0.1057, + "num_input_tokens_seen": 23152016, + "step": 32265 + }, + { + "epoch": 67.0893970893971, + "grad_norm": 0.32917165756225586, + "learning_rate": 4.4686553982952014e-06, + "loss": 0.1422, + "num_input_tokens_seen": 23155536, + "step": 32270 + }, + { + "epoch": 67.0997920997921, + "grad_norm": 0.3622051477432251, + "learning_rate": 4.463055491648191e-06, + "loss": 0.0847, + "num_input_tokens_seen": 23159216, + "step": 32275 + }, + { + "epoch": 67.11018711018711, + "grad_norm": 0.4869806170463562, + "learning_rate": 4.457458752056112e-06, + "loss": 0.1014, + "num_input_tokens_seen": 23162832, + "step": 32280 + }, + { + "epoch": 67.12058212058211, + "grad_norm": 0.317974328994751, + "learning_rate": 4.451865180382042e-06, + "loss": 0.084, + "num_input_tokens_seen": 23166416, + "step": 32285 + }, + { + "epoch": 67.13097713097713, + "grad_norm": 0.4905342161655426, + "learning_rate": 4.4462747774885936e-06, + "loss": 0.0951, + "num_input_tokens_seen": 23170128, + "step": 32290 + }, + { + "epoch": 67.14137214137214, + "grad_norm": 0.2149980366230011, + "learning_rate": 4.440687544237859e-06, + "loss": 0.0958, + "num_input_tokens_seen": 23173840, + "step": 32295 + }, + { + "epoch": 67.15176715176715, + "grad_norm": 1.2807193994522095, + "learning_rate": 4.435103481491471e-06, + "loss": 0.1091, + "num_input_tokens_seen": 23177296, + "step": 32300 + }, + { + "epoch": 67.16216216216216, + "grad_norm": 0.33662402629852295, + "learning_rate": 4.429522590110569e-06, + "loss": 0.1178, + "num_input_tokens_seen": 23180880, + "step": 32305 + }, + { + "epoch": 67.17255717255718, + "grad_norm": 0.21816453337669373, + "learning_rate": 4.423944870955779e-06, + "loss": 0.0924, + "num_input_tokens_seen": 23184400, + "step": 32310 + }, + { + "epoch": 67.18295218295218, + "grad_norm": 0.17237602174282074, + "learning_rate": 4.418370324887272e-06, + "loss": 0.0757, + "num_input_tokens_seen": 23187984, + "step": 32315 + }, + { + "epoch": 67.1933471933472, + "grad_norm": 0.47503241896629333, + "learning_rate": 4.412798952764699e-06, + "loss": 0.0711, + "num_input_tokens_seen": 23191600, + "step": 32320 + }, + { + "epoch": 67.20374220374221, + "grad_norm": 0.3566131889820099, + "learning_rate": 4.407230755447245e-06, + "loss": 0.087, + "num_input_tokens_seen": 23195056, + "step": 32325 + }, + { + "epoch": 67.21413721413721, + "grad_norm": 0.3711945712566376, + "learning_rate": 4.401665733793598e-06, + "loss": 0.08, + "num_input_tokens_seen": 23198640, + "step": 32330 + }, + { + "epoch": 67.22453222453223, + "grad_norm": 0.8118568062782288, + "learning_rate": 4.3961038886619425e-06, + "loss": 0.0915, + "num_input_tokens_seen": 23202128, + "step": 32335 + }, + { + "epoch": 67.23492723492724, + "grad_norm": 0.3471272885799408, + "learning_rate": 4.39054522091e-06, + "loss": 0.0924, + "num_input_tokens_seen": 23205648, + "step": 32340 + }, + { + "epoch": 67.24532224532224, + "grad_norm": 0.5766470432281494, + "learning_rate": 4.384989731394979e-06, + "loss": 0.1294, + "num_input_tokens_seen": 23209040, + "step": 32345 + }, + { + "epoch": 67.25571725571726, + "grad_norm": 0.19424521923065186, + "learning_rate": 4.379437420973598e-06, + "loss": 0.0886, + "num_input_tokens_seen": 23212656, + "step": 32350 + }, + { + "epoch": 67.26611226611226, + "grad_norm": 0.25618183612823486, + "learning_rate": 4.373888290502107e-06, + "loss": 0.0709, + "num_input_tokens_seen": 23216336, + "step": 32355 + }, + { + "epoch": 67.27650727650727, + "grad_norm": 0.3984967768192291, + "learning_rate": 4.36834234083624e-06, + "loss": 0.0715, + "num_input_tokens_seen": 23219920, + "step": 32360 + }, + { + "epoch": 67.28690228690229, + "grad_norm": 0.4974159300327301, + "learning_rate": 4.362799572831258e-06, + "loss": 0.1075, + "num_input_tokens_seen": 23223440, + "step": 32365 + }, + { + "epoch": 67.29729729729729, + "grad_norm": 0.9484031796455383, + "learning_rate": 4.35725998734193e-06, + "loss": 0.0879, + "num_input_tokens_seen": 23226864, + "step": 32370 + }, + { + "epoch": 67.3076923076923, + "grad_norm": 0.4625803232192993, + "learning_rate": 4.3517235852225195e-06, + "loss": 0.0805, + "num_input_tokens_seen": 23230352, + "step": 32375 + }, + { + "epoch": 67.31808731808732, + "grad_norm": 0.5820983052253723, + "learning_rate": 4.346190367326822e-06, + "loss": 0.1134, + "num_input_tokens_seen": 23233840, + "step": 32380 + }, + { + "epoch": 67.32848232848232, + "grad_norm": 0.2625286281108856, + "learning_rate": 4.340660334508115e-06, + "loss": 0.1006, + "num_input_tokens_seen": 23237424, + "step": 32385 + }, + { + "epoch": 67.33887733887734, + "grad_norm": 0.48022645711898804, + "learning_rate": 4.335133487619206e-06, + "loss": 0.0812, + "num_input_tokens_seen": 23240976, + "step": 32390 + }, + { + "epoch": 67.34927234927235, + "grad_norm": 0.35136929154396057, + "learning_rate": 4.329609827512409e-06, + "loss": 0.0505, + "num_input_tokens_seen": 23244464, + "step": 32395 + }, + { + "epoch": 67.35966735966736, + "grad_norm": 0.17491552233695984, + "learning_rate": 4.324089355039531e-06, + "loss": 0.1103, + "num_input_tokens_seen": 23248272, + "step": 32400 + }, + { + "epoch": 67.35966735966736, + "eval_loss": 0.1467909812927246, + "eval_runtime": 7.766, + "eval_samples_per_second": 110.224, + "eval_steps_per_second": 27.556, + "num_input_tokens_seen": 23248272, + "step": 32400 + }, + { + "epoch": 67.37006237006237, + "grad_norm": 0.7570604681968689, + "learning_rate": 4.3185720710519075e-06, + "loss": 0.1049, + "num_input_tokens_seen": 23251920, + "step": 32405 + }, + { + "epoch": 67.38045738045739, + "grad_norm": 0.255689412355423, + "learning_rate": 4.3130579764003724e-06, + "loss": 0.0922, + "num_input_tokens_seen": 23255536, + "step": 32410 + }, + { + "epoch": 67.39085239085239, + "grad_norm": 0.2034936249256134, + "learning_rate": 4.307547071935267e-06, + "loss": 0.0785, + "num_input_tokens_seen": 23259056, + "step": 32415 + }, + { + "epoch": 67.4012474012474, + "grad_norm": 0.3157929480075836, + "learning_rate": 4.302039358506435e-06, + "loss": 0.0706, + "num_input_tokens_seen": 23262640, + "step": 32420 + }, + { + "epoch": 67.41164241164242, + "grad_norm": 0.17938604950904846, + "learning_rate": 4.296534836963245e-06, + "loss": 0.0658, + "num_input_tokens_seen": 23266160, + "step": 32425 + }, + { + "epoch": 67.42203742203742, + "grad_norm": 1.0368329286575317, + "learning_rate": 4.291033508154555e-06, + "loss": 0.1284, + "num_input_tokens_seen": 23269712, + "step": 32430 + }, + { + "epoch": 67.43243243243244, + "grad_norm": 0.2976546585559845, + "learning_rate": 4.285535372928748e-06, + "loss": 0.1315, + "num_input_tokens_seen": 23273232, + "step": 32435 + }, + { + "epoch": 67.44282744282744, + "grad_norm": 0.594944179058075, + "learning_rate": 4.280040432133695e-06, + "loss": 0.1033, + "num_input_tokens_seen": 23276816, + "step": 32440 + }, + { + "epoch": 67.45322245322245, + "grad_norm": 0.5323029160499573, + "learning_rate": 4.274548686616789e-06, + "loss": 0.069, + "num_input_tokens_seen": 23280464, + "step": 32445 + }, + { + "epoch": 67.46361746361747, + "grad_norm": 0.2028026431798935, + "learning_rate": 4.2690601372249364e-06, + "loss": 0.0971, + "num_input_tokens_seen": 23284304, + "step": 32450 + }, + { + "epoch": 67.47401247401247, + "grad_norm": 0.5364720821380615, + "learning_rate": 4.263574784804525e-06, + "loss": 0.0711, + "num_input_tokens_seen": 23287792, + "step": 32455 + }, + { + "epoch": 67.48440748440748, + "grad_norm": 0.4865758419036865, + "learning_rate": 4.258092630201479e-06, + "loss": 0.0753, + "num_input_tokens_seen": 23291248, + "step": 32460 + }, + { + "epoch": 67.4948024948025, + "grad_norm": 0.32517021894454956, + "learning_rate": 4.252613674261202e-06, + "loss": 0.1562, + "num_input_tokens_seen": 23294736, + "step": 32465 + }, + { + "epoch": 67.5051975051975, + "grad_norm": 0.3830026388168335, + "learning_rate": 4.2471379178286224e-06, + "loss": 0.1102, + "num_input_tokens_seen": 23298288, + "step": 32470 + }, + { + "epoch": 67.51559251559252, + "grad_norm": 0.39033231139183044, + "learning_rate": 4.241665361748181e-06, + "loss": 0.0766, + "num_input_tokens_seen": 23301904, + "step": 32475 + }, + { + "epoch": 67.52598752598753, + "grad_norm": 0.4486124813556671, + "learning_rate": 4.2361960068637994e-06, + "loss": 0.1053, + "num_input_tokens_seen": 23305648, + "step": 32480 + }, + { + "epoch": 67.53638253638253, + "grad_norm": 0.3387245237827301, + "learning_rate": 4.230729854018933e-06, + "loss": 0.1118, + "num_input_tokens_seen": 23309296, + "step": 32485 + }, + { + "epoch": 67.54677754677755, + "grad_norm": 0.2472207397222519, + "learning_rate": 4.225266904056521e-06, + "loss": 0.0594, + "num_input_tokens_seen": 23312912, + "step": 32490 + }, + { + "epoch": 67.55717255717256, + "grad_norm": 0.33658698201179504, + "learning_rate": 4.21980715781903e-06, + "loss": 0.0563, + "num_input_tokens_seen": 23316400, + "step": 32495 + }, + { + "epoch": 67.56756756756756, + "grad_norm": 0.1934880167245865, + "learning_rate": 4.214350616148416e-06, + "loss": 0.069, + "num_input_tokens_seen": 23319856, + "step": 32500 + }, + { + "epoch": 67.57796257796258, + "grad_norm": 0.6198192238807678, + "learning_rate": 4.20889727988614e-06, + "loss": 0.1, + "num_input_tokens_seen": 23323472, + "step": 32505 + }, + { + "epoch": 67.58835758835758, + "grad_norm": 0.28691357374191284, + "learning_rate": 4.20344714987318e-06, + "loss": 0.0788, + "num_input_tokens_seen": 23327056, + "step": 32510 + }, + { + "epoch": 67.5987525987526, + "grad_norm": 0.5690732598304749, + "learning_rate": 4.198000226950022e-06, + "loss": 0.0993, + "num_input_tokens_seen": 23330736, + "step": 32515 + }, + { + "epoch": 67.60914760914761, + "grad_norm": 0.7566962242126465, + "learning_rate": 4.192556511956635e-06, + "loss": 0.1061, + "num_input_tokens_seen": 23334480, + "step": 32520 + }, + { + "epoch": 67.61954261954261, + "grad_norm": 0.35335221886634827, + "learning_rate": 4.18711600573252e-06, + "loss": 0.1264, + "num_input_tokens_seen": 23338320, + "step": 32525 + }, + { + "epoch": 67.62993762993763, + "grad_norm": 0.19134393334388733, + "learning_rate": 4.181678709116671e-06, + "loss": 0.0715, + "num_input_tokens_seen": 23341744, + "step": 32530 + }, + { + "epoch": 67.64033264033264, + "grad_norm": 0.6293647885322571, + "learning_rate": 4.1762446229475785e-06, + "loss": 0.1214, + "num_input_tokens_seen": 23345424, + "step": 32535 + }, + { + "epoch": 67.65072765072765, + "grad_norm": 0.3022599220275879, + "learning_rate": 4.17081374806326e-06, + "loss": 0.0852, + "num_input_tokens_seen": 23348880, + "step": 32540 + }, + { + "epoch": 67.66112266112266, + "grad_norm": 0.35395002365112305, + "learning_rate": 4.165386085301212e-06, + "loss": 0.0897, + "num_input_tokens_seen": 23352400, + "step": 32545 + }, + { + "epoch": 67.67151767151768, + "grad_norm": 0.19989635050296783, + "learning_rate": 4.1599616354984525e-06, + "loss": 0.1186, + "num_input_tokens_seen": 23355984, + "step": 32550 + }, + { + "epoch": 67.68191268191268, + "grad_norm": 0.19540609419345856, + "learning_rate": 4.154540399491508e-06, + "loss": 0.0773, + "num_input_tokens_seen": 23359472, + "step": 32555 + }, + { + "epoch": 67.6923076923077, + "grad_norm": 0.3187265694141388, + "learning_rate": 4.149122378116394e-06, + "loss": 0.1093, + "num_input_tokens_seen": 23362992, + "step": 32560 + }, + { + "epoch": 67.70270270270271, + "grad_norm": 0.39279213547706604, + "learning_rate": 4.14370757220863e-06, + "loss": 0.1203, + "num_input_tokens_seen": 23366512, + "step": 32565 + }, + { + "epoch": 67.71309771309771, + "grad_norm": 0.6156867146492004, + "learning_rate": 4.138295982603263e-06, + "loss": 0.1088, + "num_input_tokens_seen": 23370160, + "step": 32570 + }, + { + "epoch": 67.72349272349273, + "grad_norm": 0.23119351267814636, + "learning_rate": 4.132887610134814e-06, + "loss": 0.0693, + "num_input_tokens_seen": 23373872, + "step": 32575 + }, + { + "epoch": 67.73388773388774, + "grad_norm": 0.9206162095069885, + "learning_rate": 4.127482455637335e-06, + "loss": 0.1139, + "num_input_tokens_seen": 23377392, + "step": 32580 + }, + { + "epoch": 67.74428274428274, + "grad_norm": 0.4048621952533722, + "learning_rate": 4.1220805199443545e-06, + "loss": 0.0967, + "num_input_tokens_seen": 23381040, + "step": 32585 + }, + { + "epoch": 67.75467775467776, + "grad_norm": 0.26007768511772156, + "learning_rate": 4.116681803888925e-06, + "loss": 0.119, + "num_input_tokens_seen": 23384528, + "step": 32590 + }, + { + "epoch": 67.76507276507276, + "grad_norm": 0.6930503845214844, + "learning_rate": 4.111286308303605e-06, + "loss": 0.0998, + "num_input_tokens_seen": 23388144, + "step": 32595 + }, + { + "epoch": 67.77546777546777, + "grad_norm": 0.3782118856906891, + "learning_rate": 4.105894034020433e-06, + "loss": 0.086, + "num_input_tokens_seen": 23391888, + "step": 32600 + }, + { + "epoch": 67.77546777546777, + "eval_loss": 0.14481522142887115, + "eval_runtime": 7.7597, + "eval_samples_per_second": 110.314, + "eval_steps_per_second": 27.579, + "num_input_tokens_seen": 23391888, + "step": 32600 + }, + { + "epoch": 67.78586278586279, + "grad_norm": 0.17932450771331787, + "learning_rate": 4.100504981870975e-06, + "loss": 0.0939, + "num_input_tokens_seen": 23395600, + "step": 32605 + }, + { + "epoch": 67.79625779625779, + "grad_norm": 0.2751418948173523, + "learning_rate": 4.0951191526862915e-06, + "loss": 0.0576, + "num_input_tokens_seen": 23399056, + "step": 32610 + }, + { + "epoch": 67.8066528066528, + "grad_norm": 0.5109661221504211, + "learning_rate": 4.089736547296938e-06, + "loss": 0.0764, + "num_input_tokens_seen": 23402544, + "step": 32615 + }, + { + "epoch": 67.81704781704782, + "grad_norm": 0.6928071975708008, + "learning_rate": 4.08435716653299e-06, + "loss": 0.0901, + "num_input_tokens_seen": 23406288, + "step": 32620 + }, + { + "epoch": 67.82744282744282, + "grad_norm": 0.20805320143699646, + "learning_rate": 4.0789810112240005e-06, + "loss": 0.099, + "num_input_tokens_seen": 23409744, + "step": 32625 + }, + { + "epoch": 67.83783783783784, + "grad_norm": 0.1729089915752411, + "learning_rate": 4.073608082199057e-06, + "loss": 0.087, + "num_input_tokens_seen": 23413488, + "step": 32630 + }, + { + "epoch": 67.84823284823285, + "grad_norm": 0.45124223828315735, + "learning_rate": 4.068238380286718e-06, + "loss": 0.1297, + "num_input_tokens_seen": 23417040, + "step": 32635 + }, + { + "epoch": 67.85862785862786, + "grad_norm": 0.3097591698169708, + "learning_rate": 4.062871906315072e-06, + "loss": 0.0866, + "num_input_tokens_seen": 23420784, + "step": 32640 + }, + { + "epoch": 67.86902286902287, + "grad_norm": 0.4904481768608093, + "learning_rate": 4.057508661111686e-06, + "loss": 0.0994, + "num_input_tokens_seen": 23424272, + "step": 32645 + }, + { + "epoch": 67.87941787941789, + "grad_norm": 0.43604397773742676, + "learning_rate": 4.052148645503648e-06, + "loss": 0.0595, + "num_input_tokens_seen": 23427856, + "step": 32650 + }, + { + "epoch": 67.88981288981289, + "grad_norm": 0.4226151704788208, + "learning_rate": 4.046791860317531e-06, + "loss": 0.1325, + "num_input_tokens_seen": 23431536, + "step": 32655 + }, + { + "epoch": 67.9002079002079, + "grad_norm": 0.3588193953037262, + "learning_rate": 4.041438306379431e-06, + "loss": 0.0899, + "num_input_tokens_seen": 23435152, + "step": 32660 + }, + { + "epoch": 67.9106029106029, + "grad_norm": 0.5184590816497803, + "learning_rate": 4.036087984514916e-06, + "loss": 0.1431, + "num_input_tokens_seen": 23438768, + "step": 32665 + }, + { + "epoch": 67.92099792099792, + "grad_norm": 0.36105799674987793, + "learning_rate": 4.030740895549084e-06, + "loss": 0.1199, + "num_input_tokens_seen": 23442352, + "step": 32670 + }, + { + "epoch": 67.93139293139293, + "grad_norm": 0.5339873433113098, + "learning_rate": 4.025397040306531e-06, + "loss": 0.1608, + "num_input_tokens_seen": 23445904, + "step": 32675 + }, + { + "epoch": 67.94178794178794, + "grad_norm": 0.3393480181694031, + "learning_rate": 4.0200564196113285e-06, + "loss": 0.1041, + "num_input_tokens_seen": 23449712, + "step": 32680 + }, + { + "epoch": 67.95218295218295, + "grad_norm": 0.22011911869049072, + "learning_rate": 4.014719034287079e-06, + "loss": 0.1034, + "num_input_tokens_seen": 23453136, + "step": 32685 + }, + { + "epoch": 67.96257796257797, + "grad_norm": 0.16086040437221527, + "learning_rate": 4.0093848851568775e-06, + "loss": 0.0815, + "num_input_tokens_seen": 23456688, + "step": 32690 + }, + { + "epoch": 67.97297297297297, + "grad_norm": 0.47869250178337097, + "learning_rate": 4.004053973043304e-06, + "loss": 0.0989, + "num_input_tokens_seen": 23460240, + "step": 32695 + }, + { + "epoch": 67.98336798336798, + "grad_norm": 0.3795604109764099, + "learning_rate": 3.998726298768465e-06, + "loss": 0.1007, + "num_input_tokens_seen": 23463696, + "step": 32700 + }, + { + "epoch": 67.993762993763, + "grad_norm": 0.20185214281082153, + "learning_rate": 3.99340186315395e-06, + "loss": 0.101, + "num_input_tokens_seen": 23467248, + "step": 32705 + }, + { + "epoch": 68.004158004158, + "grad_norm": 0.8066232204437256, + "learning_rate": 3.988080667020849e-06, + "loss": 0.0761, + "num_input_tokens_seen": 23470944, + "step": 32710 + }, + { + "epoch": 68.01455301455302, + "grad_norm": 0.22186902165412903, + "learning_rate": 3.982762711189766e-06, + "loss": 0.1075, + "num_input_tokens_seen": 23474464, + "step": 32715 + }, + { + "epoch": 68.02494802494803, + "grad_norm": 0.46507057547569275, + "learning_rate": 3.977447996480785e-06, + "loss": 0.0999, + "num_input_tokens_seen": 23478080, + "step": 32720 + }, + { + "epoch": 68.03534303534303, + "grad_norm": 0.2310633510351181, + "learning_rate": 3.97213652371351e-06, + "loss": 0.1548, + "num_input_tokens_seen": 23481664, + "step": 32725 + }, + { + "epoch": 68.04573804573805, + "grad_norm": 0.17090308666229248, + "learning_rate": 3.966828293707042e-06, + "loss": 0.1263, + "num_input_tokens_seen": 23485120, + "step": 32730 + }, + { + "epoch": 68.05613305613305, + "grad_norm": 0.3373596966266632, + "learning_rate": 3.961523307279963e-06, + "loss": 0.1114, + "num_input_tokens_seen": 23488640, + "step": 32735 + }, + { + "epoch": 68.06652806652806, + "grad_norm": 0.45425930619239807, + "learning_rate": 3.956221565250382e-06, + "loss": 0.0776, + "num_input_tokens_seen": 23492224, + "step": 32740 + }, + { + "epoch": 68.07692307692308, + "grad_norm": 0.1609480232000351, + "learning_rate": 3.950923068435883e-06, + "loss": 0.0947, + "num_input_tokens_seen": 23495904, + "step": 32745 + }, + { + "epoch": 68.08731808731808, + "grad_norm": 0.34807729721069336, + "learning_rate": 3.945627817653566e-06, + "loss": 0.0584, + "num_input_tokens_seen": 23499488, + "step": 32750 + }, + { + "epoch": 68.0977130977131, + "grad_norm": 0.5575540065765381, + "learning_rate": 3.9403358137200335e-06, + "loss": 0.1406, + "num_input_tokens_seen": 23503232, + "step": 32755 + }, + { + "epoch": 68.10810810810811, + "grad_norm": 0.5272672772407532, + "learning_rate": 3.9350470574513605e-06, + "loss": 0.1089, + "num_input_tokens_seen": 23506848, + "step": 32760 + }, + { + "epoch": 68.11850311850311, + "grad_norm": 0.194215327501297, + "learning_rate": 3.9297615496631525e-06, + "loss": 0.0572, + "num_input_tokens_seen": 23510336, + "step": 32765 + }, + { + "epoch": 68.12889812889813, + "grad_norm": 0.3283161520957947, + "learning_rate": 3.924479291170505e-06, + "loss": 0.0954, + "num_input_tokens_seen": 23513856, + "step": 32770 + }, + { + "epoch": 68.13929313929314, + "grad_norm": 0.22929544746875763, + "learning_rate": 3.919200282788002e-06, + "loss": 0.1215, + "num_input_tokens_seen": 23517472, + "step": 32775 + }, + { + "epoch": 68.14968814968815, + "grad_norm": 0.5617566704750061, + "learning_rate": 3.913924525329726e-06, + "loss": 0.1105, + "num_input_tokens_seen": 23521088, + "step": 32780 + }, + { + "epoch": 68.16008316008316, + "grad_norm": 0.326650470495224, + "learning_rate": 3.908652019609279e-06, + "loss": 0.0596, + "num_input_tokens_seen": 23524544, + "step": 32785 + }, + { + "epoch": 68.17047817047818, + "grad_norm": 0.1905907392501831, + "learning_rate": 3.9033827664397364e-06, + "loss": 0.0896, + "num_input_tokens_seen": 23528224, + "step": 32790 + }, + { + "epoch": 68.18087318087318, + "grad_norm": 0.19267074763774872, + "learning_rate": 3.898116766633694e-06, + "loss": 0.1178, + "num_input_tokens_seen": 23531840, + "step": 32795 + }, + { + "epoch": 68.1912681912682, + "grad_norm": 0.3927420973777771, + "learning_rate": 3.8928540210032225e-06, + "loss": 0.1045, + "num_input_tokens_seen": 23535616, + "step": 32800 + }, + { + "epoch": 68.1912681912682, + "eval_loss": 0.14289934933185577, + "eval_runtime": 7.7486, + "eval_samples_per_second": 110.471, + "eval_steps_per_second": 27.618, + "num_input_tokens_seen": 23535616, + "step": 32800 + }, + { + "epoch": 68.20166320166321, + "grad_norm": 0.2739170789718628, + "learning_rate": 3.887594530359909e-06, + "loss": 0.0667, + "num_input_tokens_seen": 23539296, + "step": 32805 + }, + { + "epoch": 68.21205821205821, + "grad_norm": 0.9480540156364441, + "learning_rate": 3.88233829551484e-06, + "loss": 0.0961, + "num_input_tokens_seen": 23543104, + "step": 32810 + }, + { + "epoch": 68.22245322245323, + "grad_norm": 0.6554191708564758, + "learning_rate": 3.877085317278581e-06, + "loss": 0.1258, + "num_input_tokens_seen": 23546624, + "step": 32815 + }, + { + "epoch": 68.23284823284823, + "grad_norm": 0.5345251560211182, + "learning_rate": 3.87183559646122e-06, + "loss": 0.0769, + "num_input_tokens_seen": 23550144, + "step": 32820 + }, + { + "epoch": 68.24324324324324, + "grad_norm": 0.47525089979171753, + "learning_rate": 3.866589133872317e-06, + "loss": 0.1569, + "num_input_tokens_seen": 23553792, + "step": 32825 + }, + { + "epoch": 68.25363825363826, + "grad_norm": 0.229600727558136, + "learning_rate": 3.861345930320948e-06, + "loss": 0.1092, + "num_input_tokens_seen": 23557472, + "step": 32830 + }, + { + "epoch": 68.26403326403326, + "grad_norm": 0.5546121597290039, + "learning_rate": 3.856105986615688e-06, + "loss": 0.0835, + "num_input_tokens_seen": 23561184, + "step": 32835 + }, + { + "epoch": 68.27442827442827, + "grad_norm": 0.2896835207939148, + "learning_rate": 3.850869303564589e-06, + "loss": 0.0779, + "num_input_tokens_seen": 23564736, + "step": 32840 + }, + { + "epoch": 68.28482328482329, + "grad_norm": 0.18487517535686493, + "learning_rate": 3.845635881975226e-06, + "loss": 0.0557, + "num_input_tokens_seen": 23568384, + "step": 32845 + }, + { + "epoch": 68.29521829521829, + "grad_norm": 0.4317780137062073, + "learning_rate": 3.840405722654647e-06, + "loss": 0.0736, + "num_input_tokens_seen": 23571968, + "step": 32850 + }, + { + "epoch": 68.3056133056133, + "grad_norm": 0.5467671751976013, + "learning_rate": 3.835178826409419e-06, + "loss": 0.137, + "num_input_tokens_seen": 23575552, + "step": 32855 + }, + { + "epoch": 68.31600831600832, + "grad_norm": 0.3140923082828522, + "learning_rate": 3.8299551940455895e-06, + "loss": 0.068, + "num_input_tokens_seen": 23579008, + "step": 32860 + }, + { + "epoch": 68.32640332640332, + "grad_norm": 0.2119879573583603, + "learning_rate": 3.824734826368703e-06, + "loss": 0.0893, + "num_input_tokens_seen": 23582656, + "step": 32865 + }, + { + "epoch": 68.33679833679834, + "grad_norm": 1.059798240661621, + "learning_rate": 3.819517724183813e-06, + "loss": 0.156, + "num_input_tokens_seen": 23586336, + "step": 32870 + }, + { + "epoch": 68.34719334719335, + "grad_norm": 0.29512372612953186, + "learning_rate": 3.8143038882954648e-06, + "loss": 0.0741, + "num_input_tokens_seen": 23589952, + "step": 32875 + }, + { + "epoch": 68.35758835758836, + "grad_norm": 0.20586737990379333, + "learning_rate": 3.8090933195076867e-06, + "loss": 0.1135, + "num_input_tokens_seen": 23593472, + "step": 32880 + }, + { + "epoch": 68.36798336798337, + "grad_norm": 0.4088411331176758, + "learning_rate": 3.8038860186240198e-06, + "loss": 0.1232, + "num_input_tokens_seen": 23596992, + "step": 32885 + }, + { + "epoch": 68.37837837837837, + "grad_norm": 0.2381114959716797, + "learning_rate": 3.7986819864475026e-06, + "loss": 0.0559, + "num_input_tokens_seen": 23600512, + "step": 32890 + }, + { + "epoch": 68.38877338877339, + "grad_norm": 0.17713162302970886, + "learning_rate": 3.793481223780651e-06, + "loss": 0.0819, + "num_input_tokens_seen": 23604128, + "step": 32895 + }, + { + "epoch": 68.3991683991684, + "grad_norm": 0.22380563616752625, + "learning_rate": 3.788283731425496e-06, + "loss": 0.0699, + "num_input_tokens_seen": 23607712, + "step": 32900 + }, + { + "epoch": 68.4095634095634, + "grad_norm": 0.22428227961063385, + "learning_rate": 3.7830895101835488e-06, + "loss": 0.0935, + "num_input_tokens_seen": 23611360, + "step": 32905 + }, + { + "epoch": 68.41995841995842, + "grad_norm": 0.47811374068260193, + "learning_rate": 3.7778985608558274e-06, + "loss": 0.1214, + "num_input_tokens_seen": 23615104, + "step": 32910 + }, + { + "epoch": 68.43035343035343, + "grad_norm": 0.3385631740093231, + "learning_rate": 3.7727108842428443e-06, + "loss": 0.0896, + "num_input_tokens_seen": 23618624, + "step": 32915 + }, + { + "epoch": 68.44074844074844, + "grad_norm": 0.4375978112220764, + "learning_rate": 3.7675264811446065e-06, + "loss": 0.1006, + "num_input_tokens_seen": 23622176, + "step": 32920 + }, + { + "epoch": 68.45114345114345, + "grad_norm": 0.9544196128845215, + "learning_rate": 3.7623453523605994e-06, + "loss": 0.0916, + "num_input_tokens_seen": 23625792, + "step": 32925 + }, + { + "epoch": 68.46153846153847, + "grad_norm": 0.18905475735664368, + "learning_rate": 3.757167498689834e-06, + "loss": 0.1035, + "num_input_tokens_seen": 23629408, + "step": 32930 + }, + { + "epoch": 68.47193347193347, + "grad_norm": 0.17619849741458893, + "learning_rate": 3.7519929209307914e-06, + "loss": 0.0736, + "num_input_tokens_seen": 23632864, + "step": 32935 + }, + { + "epoch": 68.48232848232848, + "grad_norm": 0.24240711331367493, + "learning_rate": 3.746821619881463e-06, + "loss": 0.096, + "num_input_tokens_seen": 23636576, + "step": 32940 + }, + { + "epoch": 68.4927234927235, + "grad_norm": 0.20268353819847107, + "learning_rate": 3.74165359633932e-06, + "loss": 0.1167, + "num_input_tokens_seen": 23640096, + "step": 32945 + }, + { + "epoch": 68.5031185031185, + "grad_norm": 0.7135939598083496, + "learning_rate": 3.736488851101341e-06, + "loss": 0.0731, + "num_input_tokens_seen": 23643616, + "step": 32950 + }, + { + "epoch": 68.51351351351352, + "grad_norm": 0.2321529984474182, + "learning_rate": 3.7313273849640035e-06, + "loss": 0.0697, + "num_input_tokens_seen": 23647232, + "step": 32955 + }, + { + "epoch": 68.52390852390852, + "grad_norm": 0.3033941090106964, + "learning_rate": 3.7261691987232533e-06, + "loss": 0.0993, + "num_input_tokens_seen": 23650816, + "step": 32960 + }, + { + "epoch": 68.53430353430353, + "grad_norm": 0.2655241787433624, + "learning_rate": 3.7210142931745575e-06, + "loss": 0.1021, + "num_input_tokens_seen": 23654240, + "step": 32965 + }, + { + "epoch": 68.54469854469855, + "grad_norm": 0.3500001132488251, + "learning_rate": 3.7158626691128712e-06, + "loss": 0.0991, + "num_input_tokens_seen": 23657856, + "step": 32970 + }, + { + "epoch": 68.55509355509355, + "grad_norm": 0.6117265224456787, + "learning_rate": 3.710714327332629e-06, + "loss": 0.102, + "num_input_tokens_seen": 23661216, + "step": 32975 + }, + { + "epoch": 68.56548856548856, + "grad_norm": 0.17868034541606903, + "learning_rate": 3.7055692686277815e-06, + "loss": 0.1121, + "num_input_tokens_seen": 23664864, + "step": 32980 + }, + { + "epoch": 68.57588357588358, + "grad_norm": 0.41213536262512207, + "learning_rate": 3.70042749379175e-06, + "loss": 0.1124, + "num_input_tokens_seen": 23668384, + "step": 32985 + }, + { + "epoch": 68.58627858627858, + "grad_norm": 0.6771590709686279, + "learning_rate": 3.6952890036174693e-06, + "loss": 0.0737, + "num_input_tokens_seen": 23671904, + "step": 32990 + }, + { + "epoch": 68.5966735966736, + "grad_norm": 0.5952103137969971, + "learning_rate": 3.690153798897353e-06, + "loss": 0.0934, + "num_input_tokens_seen": 23675488, + "step": 32995 + }, + { + "epoch": 68.60706860706861, + "grad_norm": 0.22405877709388733, + "learning_rate": 3.6850218804233225e-06, + "loss": 0.0687, + "num_input_tokens_seen": 23678976, + "step": 33000 + }, + { + "epoch": 68.60706860706861, + "eval_loss": 0.14473214745521545, + "eval_runtime": 7.7513, + "eval_samples_per_second": 110.434, + "eval_steps_per_second": 27.608, + "num_input_tokens_seen": 23678976, + "step": 33000 + }, + { + "epoch": 68.61746361746361, + "grad_norm": 0.7343098521232605, + "learning_rate": 3.679893248986779e-06, + "loss": 0.0893, + "num_input_tokens_seen": 23682592, + "step": 33005 + }, + { + "epoch": 68.62785862785863, + "grad_norm": 0.29941633343696594, + "learning_rate": 3.6747679053786147e-06, + "loss": 0.056, + "num_input_tokens_seen": 23686016, + "step": 33010 + }, + { + "epoch": 68.63825363825364, + "grad_norm": 0.28974097967147827, + "learning_rate": 3.669645850389228e-06, + "loss": 0.1335, + "num_input_tokens_seen": 23689568, + "step": 33015 + }, + { + "epoch": 68.64864864864865, + "grad_norm": 0.4096536636352539, + "learning_rate": 3.664527084808514e-06, + "loss": 0.0868, + "num_input_tokens_seen": 23693280, + "step": 33020 + }, + { + "epoch": 68.65904365904366, + "grad_norm": 0.19827863574028015, + "learning_rate": 3.6594116094258337e-06, + "loss": 0.1003, + "num_input_tokens_seen": 23696928, + "step": 33025 + }, + { + "epoch": 68.66943866943868, + "grad_norm": 0.6014654636383057, + "learning_rate": 3.6542994250300665e-06, + "loss": 0.0817, + "num_input_tokens_seen": 23700480, + "step": 33030 + }, + { + "epoch": 68.67983367983368, + "grad_norm": 0.41789835691452026, + "learning_rate": 3.6491905324095825e-06, + "loss": 0.0955, + "num_input_tokens_seen": 23704064, + "step": 33035 + }, + { + "epoch": 68.6902286902287, + "grad_norm": 0.25790145993232727, + "learning_rate": 3.644084932352221e-06, + "loss": 0.1275, + "num_input_tokens_seen": 23707744, + "step": 33040 + }, + { + "epoch": 68.7006237006237, + "grad_norm": 0.16905125975608826, + "learning_rate": 3.6389826256453457e-06, + "loss": 0.0635, + "num_input_tokens_seen": 23711264, + "step": 33045 + }, + { + "epoch": 68.71101871101871, + "grad_norm": 0.5886226892471313, + "learning_rate": 3.633883613075781e-06, + "loss": 0.0749, + "num_input_tokens_seen": 23714656, + "step": 33050 + }, + { + "epoch": 68.72141372141373, + "grad_norm": 0.2522538900375366, + "learning_rate": 3.6287878954298693e-06, + "loss": 0.109, + "num_input_tokens_seen": 23718240, + "step": 33055 + }, + { + "epoch": 68.73180873180873, + "grad_norm": 0.2651035487651825, + "learning_rate": 3.6236954734934354e-06, + "loss": 0.0996, + "num_input_tokens_seen": 23721856, + "step": 33060 + }, + { + "epoch": 68.74220374220374, + "grad_norm": 0.3974784016609192, + "learning_rate": 3.618606348051784e-06, + "loss": 0.0944, + "num_input_tokens_seen": 23725696, + "step": 33065 + }, + { + "epoch": 68.75259875259876, + "grad_norm": 0.8644559979438782, + "learning_rate": 3.6135205198897376e-06, + "loss": 0.0957, + "num_input_tokens_seen": 23729280, + "step": 33070 + }, + { + "epoch": 68.76299376299376, + "grad_norm": 0.48177021741867065, + "learning_rate": 3.6084379897915854e-06, + "loss": 0.0888, + "num_input_tokens_seen": 23732896, + "step": 33075 + }, + { + "epoch": 68.77338877338877, + "grad_norm": 0.7086395025253296, + "learning_rate": 3.6033587585411115e-06, + "loss": 0.1106, + "num_input_tokens_seen": 23736608, + "step": 33080 + }, + { + "epoch": 68.78378378378379, + "grad_norm": 0.24471011757850647, + "learning_rate": 3.5982828269216117e-06, + "loss": 0.0909, + "num_input_tokens_seen": 23740096, + "step": 33085 + }, + { + "epoch": 68.79417879417879, + "grad_norm": 0.18475733697414398, + "learning_rate": 3.593210195715843e-06, + "loss": 0.0916, + "num_input_tokens_seen": 23743872, + "step": 33090 + }, + { + "epoch": 68.8045738045738, + "grad_norm": 0.5500102639198303, + "learning_rate": 3.5881408657060773e-06, + "loss": 0.106, + "num_input_tokens_seen": 23747552, + "step": 33095 + }, + { + "epoch": 68.81496881496882, + "grad_norm": 0.29869696497917175, + "learning_rate": 3.583074837674075e-06, + "loss": 0.1356, + "num_input_tokens_seen": 23751264, + "step": 33100 + }, + { + "epoch": 68.82536382536382, + "grad_norm": 0.3221561908721924, + "learning_rate": 3.578012112401069e-06, + "loss": 0.1315, + "num_input_tokens_seen": 23754848, + "step": 33105 + }, + { + "epoch": 68.83575883575884, + "grad_norm": 0.7596297860145569, + "learning_rate": 3.5729526906677996e-06, + "loss": 0.1036, + "num_input_tokens_seen": 23758400, + "step": 33110 + }, + { + "epoch": 68.84615384615384, + "grad_norm": 0.16770869493484497, + "learning_rate": 3.5678965732545007e-06, + "loss": 0.085, + "num_input_tokens_seen": 23761920, + "step": 33115 + }, + { + "epoch": 68.85654885654886, + "grad_norm": 0.3728848993778229, + "learning_rate": 3.562843760940876e-06, + "loss": 0.1129, + "num_input_tokens_seen": 23765472, + "step": 33120 + }, + { + "epoch": 68.86694386694387, + "grad_norm": 0.4742347002029419, + "learning_rate": 3.5577942545061473e-06, + "loss": 0.0946, + "num_input_tokens_seen": 23769184, + "step": 33125 + }, + { + "epoch": 68.87733887733887, + "grad_norm": 0.6371216773986816, + "learning_rate": 3.5527480547289967e-06, + "loss": 0.0775, + "num_input_tokens_seen": 23772672, + "step": 33130 + }, + { + "epoch": 68.88773388773389, + "grad_norm": 0.18406203389167786, + "learning_rate": 3.547705162387624e-06, + "loss": 0.1063, + "num_input_tokens_seen": 23776320, + "step": 33135 + }, + { + "epoch": 68.8981288981289, + "grad_norm": 0.3331185281276703, + "learning_rate": 3.542665578259699e-06, + "loss": 0.0811, + "num_input_tokens_seen": 23779968, + "step": 33140 + }, + { + "epoch": 68.9085239085239, + "grad_norm": 1.0069175958633423, + "learning_rate": 3.5376293031223945e-06, + "loss": 0.1474, + "num_input_tokens_seen": 23783552, + "step": 33145 + }, + { + "epoch": 68.91891891891892, + "grad_norm": 0.3217298686504364, + "learning_rate": 3.5325963377523614e-06, + "loss": 0.1215, + "num_input_tokens_seen": 23787168, + "step": 33150 + }, + { + "epoch": 68.92931392931393, + "grad_norm": 0.3364819586277008, + "learning_rate": 3.5275666829257536e-06, + "loss": 0.0947, + "num_input_tokens_seen": 23790656, + "step": 33155 + }, + { + "epoch": 68.93970893970894, + "grad_norm": 0.22702892124652863, + "learning_rate": 3.5225403394181955e-06, + "loss": 0.0748, + "num_input_tokens_seen": 23794304, + "step": 33160 + }, + { + "epoch": 68.95010395010395, + "grad_norm": 0.49680572748184204, + "learning_rate": 3.517517308004828e-06, + "loss": 0.0683, + "num_input_tokens_seen": 23797696, + "step": 33165 + }, + { + "epoch": 68.96049896049897, + "grad_norm": 1.1101053953170776, + "learning_rate": 3.512497589460251e-06, + "loss": 0.1191, + "num_input_tokens_seen": 23801248, + "step": 33170 + }, + { + "epoch": 68.97089397089397, + "grad_norm": 0.4313296377658844, + "learning_rate": 3.5074811845585727e-06, + "loss": 0.0763, + "num_input_tokens_seen": 23804704, + "step": 33175 + }, + { + "epoch": 68.98128898128898, + "grad_norm": 0.2183256596326828, + "learning_rate": 3.5024680940733937e-06, + "loss": 0.1043, + "num_input_tokens_seen": 23808224, + "step": 33180 + }, + { + "epoch": 68.99168399168398, + "grad_norm": 0.10175895690917969, + "learning_rate": 3.4974583187777852e-06, + "loss": 0.1092, + "num_input_tokens_seen": 23812000, + "step": 33185 + }, + { + "epoch": 69.002079002079, + "grad_norm": 0.19928283989429474, + "learning_rate": 3.4924518594443204e-06, + "loss": 0.0985, + "num_input_tokens_seen": 23815640, + "step": 33190 + }, + { + "epoch": 69.01247401247402, + "grad_norm": 0.4038469195365906, + "learning_rate": 3.4874487168450682e-06, + "loss": 0.1365, + "num_input_tokens_seen": 23819320, + "step": 33195 + }, + { + "epoch": 69.02286902286902, + "grad_norm": 0.19650810956954956, + "learning_rate": 3.482448891751558e-06, + "loss": 0.0791, + "num_input_tokens_seen": 23823128, + "step": 33200 + }, + { + "epoch": 69.02286902286902, + "eval_loss": 0.14532963931560516, + "eval_runtime": 7.7478, + "eval_samples_per_second": 110.482, + "eval_steps_per_second": 27.621, + "num_input_tokens_seen": 23823128, + "step": 33200 + }, + { + "epoch": 69.03326403326403, + "grad_norm": 0.8164228796958923, + "learning_rate": 3.477452384934843e-06, + "loss": 0.0615, + "num_input_tokens_seen": 23826648, + "step": 33205 + }, + { + "epoch": 69.04365904365905, + "grad_norm": 0.15115255117416382, + "learning_rate": 3.472459197165434e-06, + "loss": 0.0791, + "num_input_tokens_seen": 23830040, + "step": 33210 + }, + { + "epoch": 69.05405405405405, + "grad_norm": 0.4089012145996094, + "learning_rate": 3.4674693292133518e-06, + "loss": 0.1247, + "num_input_tokens_seen": 23833784, + "step": 33215 + }, + { + "epoch": 69.06444906444906, + "grad_norm": 0.311212956905365, + "learning_rate": 3.4624827818480977e-06, + "loss": 0.0896, + "num_input_tokens_seen": 23837432, + "step": 33220 + }, + { + "epoch": 69.07484407484408, + "grad_norm": 0.16397182643413544, + "learning_rate": 3.4574995558386474e-06, + "loss": 0.1176, + "num_input_tokens_seen": 23841016, + "step": 33225 + }, + { + "epoch": 69.08523908523908, + "grad_norm": 0.5661638975143433, + "learning_rate": 3.452519651953487e-06, + "loss": 0.0703, + "num_input_tokens_seen": 23844632, + "step": 33230 + }, + { + "epoch": 69.0956340956341, + "grad_norm": 0.34130486845970154, + "learning_rate": 3.447543070960585e-06, + "loss": 0.0958, + "num_input_tokens_seen": 23848312, + "step": 33235 + }, + { + "epoch": 69.10602910602911, + "grad_norm": 0.27342620491981506, + "learning_rate": 3.4425698136273778e-06, + "loss": 0.0731, + "num_input_tokens_seen": 23851704, + "step": 33240 + }, + { + "epoch": 69.11642411642411, + "grad_norm": 0.15867623686790466, + "learning_rate": 3.437599880720821e-06, + "loss": 0.0867, + "num_input_tokens_seen": 23855384, + "step": 33245 + }, + { + "epoch": 69.12681912681913, + "grad_norm": 0.1330285221338272, + "learning_rate": 3.4326332730073267e-06, + "loss": 0.062, + "num_input_tokens_seen": 23858840, + "step": 33250 + }, + { + "epoch": 69.13721413721414, + "grad_norm": 0.367430716753006, + "learning_rate": 3.427669991252813e-06, + "loss": 0.0996, + "num_input_tokens_seen": 23862360, + "step": 33255 + }, + { + "epoch": 69.14760914760915, + "grad_norm": 0.5326129794120789, + "learning_rate": 3.42271003622269e-06, + "loss": 0.082, + "num_input_tokens_seen": 23865720, + "step": 33260 + }, + { + "epoch": 69.15800415800416, + "grad_norm": 0.2897026538848877, + "learning_rate": 3.4177534086818286e-06, + "loss": 0.1014, + "num_input_tokens_seen": 23869528, + "step": 33265 + }, + { + "epoch": 69.16839916839916, + "grad_norm": 0.30456840991973877, + "learning_rate": 3.412800109394612e-06, + "loss": 0.1473, + "num_input_tokens_seen": 23873080, + "step": 33270 + }, + { + "epoch": 69.17879417879418, + "grad_norm": 0.43727871775627136, + "learning_rate": 3.4078501391249044e-06, + "loss": 0.1182, + "num_input_tokens_seen": 23876664, + "step": 33275 + }, + { + "epoch": 69.1891891891892, + "grad_norm": 0.12401145696640015, + "learning_rate": 3.4029034986360453e-06, + "loss": 0.1152, + "num_input_tokens_seen": 23880344, + "step": 33280 + }, + { + "epoch": 69.1995841995842, + "grad_norm": 0.4536912739276886, + "learning_rate": 3.397960188690877e-06, + "loss": 0.11, + "num_input_tokens_seen": 23884024, + "step": 33285 + }, + { + "epoch": 69.20997920997921, + "grad_norm": 0.4105953276157379, + "learning_rate": 3.393020210051717e-06, + "loss": 0.0703, + "num_input_tokens_seen": 23887608, + "step": 33290 + }, + { + "epoch": 69.22037422037423, + "grad_norm": 0.2482646256685257, + "learning_rate": 3.3880835634803655e-06, + "loss": 0.08, + "num_input_tokens_seen": 23891160, + "step": 33295 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 0.30147746205329895, + "learning_rate": 3.383150249738126e-06, + "loss": 0.0682, + "num_input_tokens_seen": 23894744, + "step": 33300 + }, + { + "epoch": 69.24116424116424, + "grad_norm": 0.300275593996048, + "learning_rate": 3.3782202695857663e-06, + "loss": 0.1095, + "num_input_tokens_seen": 23898264, + "step": 33305 + }, + { + "epoch": 69.25155925155926, + "grad_norm": 0.2300473004579544, + "learning_rate": 3.373293623783558e-06, + "loss": 0.0887, + "num_input_tokens_seen": 23901784, + "step": 33310 + }, + { + "epoch": 69.26195426195426, + "grad_norm": 0.27984872460365295, + "learning_rate": 3.368370313091257e-06, + "loss": 0.1135, + "num_input_tokens_seen": 23905368, + "step": 33315 + }, + { + "epoch": 69.27234927234927, + "grad_norm": 0.4389328360557556, + "learning_rate": 3.363450338268087e-06, + "loss": 0.1264, + "num_input_tokens_seen": 23908920, + "step": 33320 + }, + { + "epoch": 69.28274428274429, + "grad_norm": 0.5335094332695007, + "learning_rate": 3.358533700072783e-06, + "loss": 0.1287, + "num_input_tokens_seen": 23912408, + "step": 33325 + }, + { + "epoch": 69.29313929313929, + "grad_norm": 0.3292142152786255, + "learning_rate": 3.3536203992635377e-06, + "loss": 0.0834, + "num_input_tokens_seen": 23916024, + "step": 33330 + }, + { + "epoch": 69.3035343035343, + "grad_norm": 0.8849036693572998, + "learning_rate": 3.348710436598057e-06, + "loss": 0.1337, + "num_input_tokens_seen": 23919704, + "step": 33335 + }, + { + "epoch": 69.31392931392931, + "grad_norm": 0.15561597049236298, + "learning_rate": 3.3438038128335155e-06, + "loss": 0.0906, + "num_input_tokens_seen": 23923384, + "step": 33340 + }, + { + "epoch": 69.32432432432432, + "grad_norm": 0.38698986172676086, + "learning_rate": 3.338900528726571e-06, + "loss": 0.1083, + "num_input_tokens_seen": 23926808, + "step": 33345 + }, + { + "epoch": 69.33471933471934, + "grad_norm": 0.19532571732997894, + "learning_rate": 3.3340005850333812e-06, + "loss": 0.0771, + "num_input_tokens_seen": 23930520, + "step": 33350 + }, + { + "epoch": 69.34511434511434, + "grad_norm": 0.5781756043434143, + "learning_rate": 3.329103982509568e-06, + "loss": 0.1472, + "num_input_tokens_seen": 23934072, + "step": 33355 + }, + { + "epoch": 69.35550935550935, + "grad_norm": 0.24211253225803375, + "learning_rate": 3.324210721910259e-06, + "loss": 0.0925, + "num_input_tokens_seen": 23937688, + "step": 33360 + }, + { + "epoch": 69.36590436590437, + "grad_norm": 0.3247150778770447, + "learning_rate": 3.319320803990053e-06, + "loss": 0.0975, + "num_input_tokens_seen": 23941240, + "step": 33365 + }, + { + "epoch": 69.37629937629937, + "grad_norm": 0.7700656056404114, + "learning_rate": 3.3144342295030274e-06, + "loss": 0.0776, + "num_input_tokens_seen": 23944952, + "step": 33370 + }, + { + "epoch": 69.38669438669439, + "grad_norm": 0.6330753564834595, + "learning_rate": 3.309550999202765e-06, + "loss": 0.0859, + "num_input_tokens_seen": 23948504, + "step": 33375 + }, + { + "epoch": 69.3970893970894, + "grad_norm": 0.3429119288921356, + "learning_rate": 3.3046711138423197e-06, + "loss": 0.0751, + "num_input_tokens_seen": 23952120, + "step": 33380 + }, + { + "epoch": 69.4074844074844, + "grad_norm": 0.20803506672382355, + "learning_rate": 3.2997945741742255e-06, + "loss": 0.0829, + "num_input_tokens_seen": 23955480, + "step": 33385 + }, + { + "epoch": 69.41787941787942, + "grad_norm": 0.15241248905658722, + "learning_rate": 3.2949213809505082e-06, + "loss": 0.1204, + "num_input_tokens_seen": 23959320, + "step": 33390 + }, + { + "epoch": 69.42827442827443, + "grad_norm": 0.6954514384269714, + "learning_rate": 3.2900515349226834e-06, + "loss": 0.1748, + "num_input_tokens_seen": 23962872, + "step": 33395 + }, + { + "epoch": 69.43866943866944, + "grad_norm": 0.3128257095813751, + "learning_rate": 3.285185036841731e-06, + "loss": 0.0906, + "num_input_tokens_seen": 23966488, + "step": 33400 + }, + { + "epoch": 69.43866943866944, + "eval_loss": 0.14461161196231842, + "eval_runtime": 7.7445, + "eval_samples_per_second": 110.53, + "eval_steps_per_second": 27.632, + "num_input_tokens_seen": 23966488, + "step": 33400 + }, + { + "epoch": 69.44906444906445, + "grad_norm": 0.45061570405960083, + "learning_rate": 3.2803218874581377e-06, + "loss": 0.0977, + "num_input_tokens_seen": 23970136, + "step": 33405 + }, + { + "epoch": 69.45945945945945, + "grad_norm": 0.35254669189453125, + "learning_rate": 3.2754620875218494e-06, + "loss": 0.1049, + "num_input_tokens_seen": 23973816, + "step": 33410 + }, + { + "epoch": 69.46985446985447, + "grad_norm": 0.5274953842163086, + "learning_rate": 3.2706056377823146e-06, + "loss": 0.111, + "num_input_tokens_seen": 23977400, + "step": 33415 + }, + { + "epoch": 69.48024948024948, + "grad_norm": 0.7905495166778564, + "learning_rate": 3.2657525389884647e-06, + "loss": 0.0801, + "num_input_tokens_seen": 23980952, + "step": 33420 + }, + { + "epoch": 69.49064449064448, + "grad_norm": 0.49916401505470276, + "learning_rate": 3.260902791888698e-06, + "loss": 0.0653, + "num_input_tokens_seen": 23984312, + "step": 33425 + }, + { + "epoch": 69.5010395010395, + "grad_norm": 0.2531995177268982, + "learning_rate": 3.2560563972309166e-06, + "loss": 0.0911, + "num_input_tokens_seen": 23987704, + "step": 33430 + }, + { + "epoch": 69.51143451143452, + "grad_norm": 0.30526429414749146, + "learning_rate": 3.251213355762489e-06, + "loss": 0.0998, + "num_input_tokens_seen": 23991320, + "step": 33435 + }, + { + "epoch": 69.52182952182952, + "grad_norm": 0.17856252193450928, + "learning_rate": 3.2463736682302707e-06, + "loss": 0.0838, + "num_input_tokens_seen": 23994872, + "step": 33440 + }, + { + "epoch": 69.53222453222453, + "grad_norm": 0.24608542025089264, + "learning_rate": 3.2415373353806124e-06, + "loss": 0.0934, + "num_input_tokens_seen": 23998456, + "step": 33445 + }, + { + "epoch": 69.54261954261955, + "grad_norm": 0.17549669742584229, + "learning_rate": 3.236704357959322e-06, + "loss": 0.0635, + "num_input_tokens_seen": 24002040, + "step": 33450 + }, + { + "epoch": 69.55301455301455, + "grad_norm": 0.22720734775066376, + "learning_rate": 3.2318747367117154e-06, + "loss": 0.1007, + "num_input_tokens_seen": 24005688, + "step": 33455 + }, + { + "epoch": 69.56340956340956, + "grad_norm": 0.3580119013786316, + "learning_rate": 3.227048472382585e-06, + "loss": 0.138, + "num_input_tokens_seen": 24009368, + "step": 33460 + }, + { + "epoch": 69.57380457380458, + "grad_norm": 0.2734532952308655, + "learning_rate": 3.2222255657161915e-06, + "loss": 0.1073, + "num_input_tokens_seen": 24013016, + "step": 33465 + }, + { + "epoch": 69.58419958419958, + "grad_norm": 0.3418670892715454, + "learning_rate": 3.2174060174562924e-06, + "loss": 0.0774, + "num_input_tokens_seen": 24016408, + "step": 33470 + }, + { + "epoch": 69.5945945945946, + "grad_norm": 0.29307860136032104, + "learning_rate": 3.2125898283461298e-06, + "loss": 0.0969, + "num_input_tokens_seen": 24020216, + "step": 33475 + }, + { + "epoch": 69.60498960498961, + "grad_norm": 0.39232829213142395, + "learning_rate": 3.207776999128406e-06, + "loss": 0.155, + "num_input_tokens_seen": 24023832, + "step": 33480 + }, + { + "epoch": 69.61538461538461, + "grad_norm": 0.18275690078735352, + "learning_rate": 3.202967530545331e-06, + "loss": 0.09, + "num_input_tokens_seen": 24027320, + "step": 33485 + }, + { + "epoch": 69.62577962577963, + "grad_norm": 0.2068556845188141, + "learning_rate": 3.1981614233385778e-06, + "loss": 0.0869, + "num_input_tokens_seen": 24031160, + "step": 33490 + }, + { + "epoch": 69.63617463617463, + "grad_norm": 0.36270496249198914, + "learning_rate": 3.1933586782493115e-06, + "loss": 0.0961, + "num_input_tokens_seen": 24034712, + "step": 33495 + }, + { + "epoch": 69.64656964656965, + "grad_norm": 0.3475678861141205, + "learning_rate": 3.188559296018184e-06, + "loss": 0.1405, + "num_input_tokens_seen": 24038264, + "step": 33500 + }, + { + "epoch": 69.65696465696466, + "grad_norm": 0.32438385486602783, + "learning_rate": 3.1837632773853098e-06, + "loss": 0.1016, + "num_input_tokens_seen": 24041912, + "step": 33505 + }, + { + "epoch": 69.66735966735966, + "grad_norm": 0.16253039240837097, + "learning_rate": 3.178970623090294e-06, + "loss": 0.0908, + "num_input_tokens_seen": 24045400, + "step": 33510 + }, + { + "epoch": 69.67775467775468, + "grad_norm": 0.4066813290119171, + "learning_rate": 3.174181333872234e-06, + "loss": 0.1355, + "num_input_tokens_seen": 24048952, + "step": 33515 + }, + { + "epoch": 69.6881496881497, + "grad_norm": 0.1504625529050827, + "learning_rate": 3.169395410469686e-06, + "loss": 0.1236, + "num_input_tokens_seen": 24052664, + "step": 33520 + }, + { + "epoch": 69.6985446985447, + "grad_norm": 0.1968923807144165, + "learning_rate": 3.164612853620713e-06, + "loss": 0.0947, + "num_input_tokens_seen": 24056312, + "step": 33525 + }, + { + "epoch": 69.70893970893971, + "grad_norm": 0.13062264025211334, + "learning_rate": 3.1598336640628333e-06, + "loss": 0.0771, + "num_input_tokens_seen": 24059832, + "step": 33530 + }, + { + "epoch": 69.71933471933473, + "grad_norm": 0.43124309182167053, + "learning_rate": 3.155057842533063e-06, + "loss": 0.078, + "num_input_tokens_seen": 24063416, + "step": 33535 + }, + { + "epoch": 69.72972972972973, + "grad_norm": 0.3462887406349182, + "learning_rate": 3.1502853897678984e-06, + "loss": 0.0548, + "num_input_tokens_seen": 24067128, + "step": 33540 + }, + { + "epoch": 69.74012474012474, + "grad_norm": 0.5019291639328003, + "learning_rate": 3.1455163065033017e-06, + "loss": 0.1061, + "num_input_tokens_seen": 24070552, + "step": 33545 + }, + { + "epoch": 69.75051975051976, + "grad_norm": 0.5756253600120544, + "learning_rate": 3.140750593474734e-06, + "loss": 0.0623, + "num_input_tokens_seen": 24074456, + "step": 33550 + }, + { + "epoch": 69.76091476091476, + "grad_norm": 0.13128240406513214, + "learning_rate": 3.1359882514171294e-06, + "loss": 0.0998, + "num_input_tokens_seen": 24078072, + "step": 33555 + }, + { + "epoch": 69.77130977130977, + "grad_norm": 0.5247302651405334, + "learning_rate": 3.1312292810648903e-06, + "loss": 0.0683, + "num_input_tokens_seen": 24081816, + "step": 33560 + }, + { + "epoch": 69.78170478170478, + "grad_norm": 0.45464494824409485, + "learning_rate": 3.1264736831519204e-06, + "loss": 0.085, + "num_input_tokens_seen": 24085368, + "step": 33565 + }, + { + "epoch": 69.79209979209979, + "grad_norm": 0.496171236038208, + "learning_rate": 3.1217214584115863e-06, + "loss": 0.1366, + "num_input_tokens_seen": 24089048, + "step": 33570 + }, + { + "epoch": 69.8024948024948, + "grad_norm": 0.18308956921100616, + "learning_rate": 3.116972607576746e-06, + "loss": 0.0746, + "num_input_tokens_seen": 24092600, + "step": 33575 + }, + { + "epoch": 69.81288981288981, + "grad_norm": 0.36030635237693787, + "learning_rate": 3.1122271313797303e-06, + "loss": 0.1149, + "num_input_tokens_seen": 24096312, + "step": 33580 + }, + { + "epoch": 69.82328482328482, + "grad_norm": 0.7623839974403381, + "learning_rate": 3.107485030552343e-06, + "loss": 0.1023, + "num_input_tokens_seen": 24099800, + "step": 33585 + }, + { + "epoch": 69.83367983367984, + "grad_norm": 0.3028676211833954, + "learning_rate": 3.1027463058258848e-06, + "loss": 0.104, + "num_input_tokens_seen": 24103544, + "step": 33590 + }, + { + "epoch": 69.84407484407484, + "grad_norm": 0.4228315055370331, + "learning_rate": 3.0980109579311273e-06, + "loss": 0.0682, + "num_input_tokens_seen": 24107128, + "step": 33595 + }, + { + "epoch": 69.85446985446985, + "grad_norm": 0.1823892891407013, + "learning_rate": 3.093278987598314e-06, + "loss": 0.1076, + "num_input_tokens_seen": 24110648, + "step": 33600 + }, + { + "epoch": 69.85446985446985, + "eval_loss": 0.1447829306125641, + "eval_runtime": 7.7509, + "eval_samples_per_second": 110.439, + "eval_steps_per_second": 27.61, + "num_input_tokens_seen": 24110648, + "step": 33600 + }, + { + "epoch": 69.86486486486487, + "grad_norm": 0.322345495223999, + "learning_rate": 3.0885503955571826e-06, + "loss": 0.0744, + "num_input_tokens_seen": 24114072, + "step": 33605 + }, + { + "epoch": 69.87525987525987, + "grad_norm": 0.1661974936723709, + "learning_rate": 3.0838251825369313e-06, + "loss": 0.0612, + "num_input_tokens_seen": 24117528, + "step": 33610 + }, + { + "epoch": 69.88565488565489, + "grad_norm": 0.2593821883201599, + "learning_rate": 3.0791033492662517e-06, + "loss": 0.0824, + "num_input_tokens_seen": 24121080, + "step": 33615 + }, + { + "epoch": 69.8960498960499, + "grad_norm": 0.458308607339859, + "learning_rate": 3.0743848964733203e-06, + "loss": 0.0737, + "num_input_tokens_seen": 24124824, + "step": 33620 + }, + { + "epoch": 69.9064449064449, + "grad_norm": 0.23093704879283905, + "learning_rate": 3.0696698248857625e-06, + "loss": 0.0978, + "num_input_tokens_seen": 24128248, + "step": 33625 + }, + { + "epoch": 69.91683991683992, + "grad_norm": 0.4466875195503235, + "learning_rate": 3.0649581352307192e-06, + "loss": 0.0883, + "num_input_tokens_seen": 24131736, + "step": 33630 + }, + { + "epoch": 69.92723492723492, + "grad_norm": 0.6976781487464905, + "learning_rate": 3.060249828234776e-06, + "loss": 0.1457, + "num_input_tokens_seen": 24135160, + "step": 33635 + }, + { + "epoch": 69.93762993762994, + "grad_norm": 0.6104515194892883, + "learning_rate": 3.055544904624025e-06, + "loss": 0.116, + "num_input_tokens_seen": 24138648, + "step": 33640 + }, + { + "epoch": 69.94802494802495, + "grad_norm": 0.39349231123924255, + "learning_rate": 3.050843365124026e-06, + "loss": 0.1067, + "num_input_tokens_seen": 24142200, + "step": 33645 + }, + { + "epoch": 69.95841995841995, + "grad_norm": 0.28949445486068726, + "learning_rate": 3.0461452104598083e-06, + "loss": 0.1158, + "num_input_tokens_seen": 24145752, + "step": 33650 + }, + { + "epoch": 69.96881496881497, + "grad_norm": 0.5031801462173462, + "learning_rate": 3.0414504413558836e-06, + "loss": 0.1072, + "num_input_tokens_seen": 24149432, + "step": 33655 + }, + { + "epoch": 69.97920997920998, + "grad_norm": 0.24488045275211334, + "learning_rate": 3.0367590585362564e-06, + "loss": 0.0994, + "num_input_tokens_seen": 24152888, + "step": 33660 + }, + { + "epoch": 69.98960498960498, + "grad_norm": 0.22002187371253967, + "learning_rate": 3.0320710627243813e-06, + "loss": 0.0925, + "num_input_tokens_seen": 24156376, + "step": 33665 + }, + { + "epoch": 70.0, + "grad_norm": 0.3070632815361023, + "learning_rate": 3.027386454643222e-06, + "loss": 0.0844, + "num_input_tokens_seen": 24159728, + "step": 33670 + }, + { + "epoch": 70.01039501039502, + "grad_norm": 0.43762150406837463, + "learning_rate": 3.0227052350151914e-06, + "loss": 0.1041, + "num_input_tokens_seen": 24163184, + "step": 33675 + }, + { + "epoch": 70.02079002079002, + "grad_norm": 0.19828765094280243, + "learning_rate": 3.0180274045621957e-06, + "loss": 0.0861, + "num_input_tokens_seen": 24166800, + "step": 33680 + }, + { + "epoch": 70.03118503118503, + "grad_norm": 0.39565953612327576, + "learning_rate": 3.013352964005625e-06, + "loss": 0.1037, + "num_input_tokens_seen": 24170512, + "step": 33685 + }, + { + "epoch": 70.04158004158005, + "grad_norm": 0.3983089029788971, + "learning_rate": 3.0086819140663218e-06, + "loss": 0.118, + "num_input_tokens_seen": 24174160, + "step": 33690 + }, + { + "epoch": 70.05197505197505, + "grad_norm": 0.2998315095901489, + "learning_rate": 3.0040142554646265e-06, + "loss": 0.0808, + "num_input_tokens_seen": 24177648, + "step": 33695 + }, + { + "epoch": 70.06237006237006, + "grad_norm": 0.5078734755516052, + "learning_rate": 2.999349988920361e-06, + "loss": 0.0622, + "num_input_tokens_seen": 24181264, + "step": 33700 + }, + { + "epoch": 70.07276507276508, + "grad_norm": 1.0298786163330078, + "learning_rate": 2.994689115152796e-06, + "loss": 0.0696, + "num_input_tokens_seen": 24184976, + "step": 33705 + }, + { + "epoch": 70.08316008316008, + "grad_norm": 0.43920931220054626, + "learning_rate": 2.9900316348807105e-06, + "loss": 0.0896, + "num_input_tokens_seen": 24188368, + "step": 33710 + }, + { + "epoch": 70.0935550935551, + "grad_norm": 0.2889860272407532, + "learning_rate": 2.985377548822338e-06, + "loss": 0.1149, + "num_input_tokens_seen": 24192112, + "step": 33715 + }, + { + "epoch": 70.1039501039501, + "grad_norm": 0.3635784089565277, + "learning_rate": 2.980726857695404e-06, + "loss": 0.1338, + "num_input_tokens_seen": 24195728, + "step": 33720 + }, + { + "epoch": 70.11434511434511, + "grad_norm": 0.36613887548446655, + "learning_rate": 2.9760795622171017e-06, + "loss": 0.0905, + "num_input_tokens_seen": 24199408, + "step": 33725 + }, + { + "epoch": 70.12474012474013, + "grad_norm": 0.25033849477767944, + "learning_rate": 2.971435663104094e-06, + "loss": 0.1191, + "num_input_tokens_seen": 24202896, + "step": 33730 + }, + { + "epoch": 70.13513513513513, + "grad_norm": 0.46459826827049255, + "learning_rate": 2.9667951610725385e-06, + "loss": 0.1028, + "num_input_tokens_seen": 24206384, + "step": 33735 + }, + { + "epoch": 70.14553014553015, + "grad_norm": 0.21432596445083618, + "learning_rate": 2.9621580568380575e-06, + "loss": 0.0901, + "num_input_tokens_seen": 24209776, + "step": 33740 + }, + { + "epoch": 70.15592515592516, + "grad_norm": 0.34621986746788025, + "learning_rate": 2.9575243511157453e-06, + "loss": 0.0967, + "num_input_tokens_seen": 24213296, + "step": 33745 + }, + { + "epoch": 70.16632016632016, + "grad_norm": 0.2257993072271347, + "learning_rate": 2.952894044620186e-06, + "loss": 0.1388, + "num_input_tokens_seen": 24216944, + "step": 33750 + }, + { + "epoch": 70.17671517671518, + "grad_norm": 0.5848538875579834, + "learning_rate": 2.948267138065419e-06, + "loss": 0.1003, + "num_input_tokens_seen": 24220592, + "step": 33755 + }, + { + "epoch": 70.18711018711019, + "grad_norm": 0.325182169675827, + "learning_rate": 2.943643632164983e-06, + "loss": 0.0872, + "num_input_tokens_seen": 24224304, + "step": 33760 + }, + { + "epoch": 70.1975051975052, + "grad_norm": 0.4826953709125519, + "learning_rate": 2.939023527631879e-06, + "loss": 0.1343, + "num_input_tokens_seen": 24227952, + "step": 33765 + }, + { + "epoch": 70.20790020790021, + "grad_norm": 0.20498643815517426, + "learning_rate": 2.934406825178576e-06, + "loss": 0.1213, + "num_input_tokens_seen": 24231632, + "step": 33770 + }, + { + "epoch": 70.21829521829522, + "grad_norm": 0.3131744861602783, + "learning_rate": 2.9297935255170357e-06, + "loss": 0.0921, + "num_input_tokens_seen": 24235056, + "step": 33775 + }, + { + "epoch": 70.22869022869023, + "grad_norm": 0.16645056009292603, + "learning_rate": 2.925183629358691e-06, + "loss": 0.1418, + "num_input_tokens_seen": 24238800, + "step": 33780 + }, + { + "epoch": 70.23908523908524, + "grad_norm": 0.29078182578086853, + "learning_rate": 2.9205771374144346e-06, + "loss": 0.0968, + "num_input_tokens_seen": 24242416, + "step": 33785 + }, + { + "epoch": 70.24948024948024, + "grad_norm": 0.3150053024291992, + "learning_rate": 2.915974050394657e-06, + "loss": 0.0715, + "num_input_tokens_seen": 24246000, + "step": 33790 + }, + { + "epoch": 70.25987525987526, + "grad_norm": 0.1229243054986, + "learning_rate": 2.9113743690092067e-06, + "loss": 0.0964, + "num_input_tokens_seen": 24249648, + "step": 33795 + }, + { + "epoch": 70.27027027027027, + "grad_norm": 0.3698353171348572, + "learning_rate": 2.906778093967402e-06, + "loss": 0.0866, + "num_input_tokens_seen": 24253072, + "step": 33800 + }, + { + "epoch": 70.27027027027027, + "eval_loss": 0.14345505833625793, + "eval_runtime": 7.7596, + "eval_samples_per_second": 110.315, + "eval_steps_per_second": 27.579, + "num_input_tokens_seen": 24253072, + "step": 33800 + }, + { + "epoch": 70.28066528066527, + "grad_norm": 0.6366651654243469, + "learning_rate": 2.9021852259780656e-06, + "loss": 0.1037, + "num_input_tokens_seen": 24256432, + "step": 33805 + }, + { + "epoch": 70.29106029106029, + "grad_norm": 0.6677504181861877, + "learning_rate": 2.8975957657494583e-06, + "loss": 0.083, + "num_input_tokens_seen": 24259920, + "step": 33810 + }, + { + "epoch": 70.3014553014553, + "grad_norm": 0.4397392272949219, + "learning_rate": 2.8930097139893417e-06, + "loss": 0.1153, + "num_input_tokens_seen": 24263600, + "step": 33815 + }, + { + "epoch": 70.3118503118503, + "grad_norm": 0.4947119951248169, + "learning_rate": 2.888427071404945e-06, + "loss": 0.0608, + "num_input_tokens_seen": 24267408, + "step": 33820 + }, + { + "epoch": 70.32224532224532, + "grad_norm": 0.769153356552124, + "learning_rate": 2.8838478387029606e-06, + "loss": 0.1205, + "num_input_tokens_seen": 24271056, + "step": 33825 + }, + { + "epoch": 70.33264033264034, + "grad_norm": 0.17611289024353027, + "learning_rate": 2.8792720165895737e-06, + "loss": 0.0827, + "num_input_tokens_seen": 24274608, + "step": 33830 + }, + { + "epoch": 70.34303534303534, + "grad_norm": 0.3423021137714386, + "learning_rate": 2.874699605770423e-06, + "loss": 0.1238, + "num_input_tokens_seen": 24278160, + "step": 33835 + }, + { + "epoch": 70.35343035343035, + "grad_norm": 0.3774571716785431, + "learning_rate": 2.8701306069506383e-06, + "loss": 0.0919, + "num_input_tokens_seen": 24281744, + "step": 33840 + }, + { + "epoch": 70.36382536382537, + "grad_norm": 0.4305661916732788, + "learning_rate": 2.8655650208348178e-06, + "loss": 0.0688, + "num_input_tokens_seen": 24285424, + "step": 33845 + }, + { + "epoch": 70.37422037422037, + "grad_norm": 0.26511892676353455, + "learning_rate": 2.8610028481270257e-06, + "loss": 0.1188, + "num_input_tokens_seen": 24289072, + "step": 33850 + }, + { + "epoch": 70.38461538461539, + "grad_norm": 0.288813978433609, + "learning_rate": 2.856444089530813e-06, + "loss": 0.0674, + "num_input_tokens_seen": 24292688, + "step": 33855 + }, + { + "epoch": 70.39501039501039, + "grad_norm": 0.39542871713638306, + "learning_rate": 2.8518887457491955e-06, + "loss": 0.0805, + "num_input_tokens_seen": 24296368, + "step": 33860 + }, + { + "epoch": 70.4054054054054, + "grad_norm": 0.37450137734413147, + "learning_rate": 2.8473368174846666e-06, + "loss": 0.134, + "num_input_tokens_seen": 24299856, + "step": 33865 + }, + { + "epoch": 70.41580041580042, + "grad_norm": 0.3233131468296051, + "learning_rate": 2.842788305439184e-06, + "loss": 0.089, + "num_input_tokens_seen": 24303312, + "step": 33870 + }, + { + "epoch": 70.42619542619542, + "grad_norm": 0.38416406512260437, + "learning_rate": 2.8382432103141925e-06, + "loss": 0.0943, + "num_input_tokens_seen": 24306832, + "step": 33875 + }, + { + "epoch": 70.43659043659044, + "grad_norm": 0.38880839943885803, + "learning_rate": 2.833701532810598e-06, + "loss": 0.1046, + "num_input_tokens_seen": 24310640, + "step": 33880 + }, + { + "epoch": 70.44698544698545, + "grad_norm": 0.21138763427734375, + "learning_rate": 2.8291632736287877e-06, + "loss": 0.1089, + "num_input_tokens_seen": 24314256, + "step": 33885 + }, + { + "epoch": 70.45738045738045, + "grad_norm": 0.27442309260368347, + "learning_rate": 2.824628433468615e-06, + "loss": 0.1007, + "num_input_tokens_seen": 24317840, + "step": 33890 + }, + { + "epoch": 70.46777546777547, + "grad_norm": 0.407774418592453, + "learning_rate": 2.8200970130294073e-06, + "loss": 0.1005, + "num_input_tokens_seen": 24321424, + "step": 33895 + }, + { + "epoch": 70.47817047817048, + "grad_norm": 0.27181360125541687, + "learning_rate": 2.8155690130099775e-06, + "loss": 0.0853, + "num_input_tokens_seen": 24325072, + "step": 33900 + }, + { + "epoch": 70.48856548856548, + "grad_norm": 0.12862101197242737, + "learning_rate": 2.8110444341085895e-06, + "loss": 0.0866, + "num_input_tokens_seen": 24328464, + "step": 33905 + }, + { + "epoch": 70.4989604989605, + "grad_norm": 0.18986250460147858, + "learning_rate": 2.806523277022996e-06, + "loss": 0.0916, + "num_input_tokens_seen": 24332144, + "step": 33910 + }, + { + "epoch": 70.50935550935552, + "grad_norm": 0.14788846671581268, + "learning_rate": 2.802005542450409e-06, + "loss": 0.0727, + "num_input_tokens_seen": 24335696, + "step": 33915 + }, + { + "epoch": 70.51975051975052, + "grad_norm": 0.20961254835128784, + "learning_rate": 2.797491231087526e-06, + "loss": 0.1105, + "num_input_tokens_seen": 24339088, + "step": 33920 + }, + { + "epoch": 70.53014553014553, + "grad_norm": 0.27708351612091064, + "learning_rate": 2.7929803436305137e-06, + "loss": 0.101, + "num_input_tokens_seen": 24342736, + "step": 33925 + }, + { + "epoch": 70.54054054054055, + "grad_norm": 0.22808514535427094, + "learning_rate": 2.788472880774998e-06, + "loss": 0.1281, + "num_input_tokens_seen": 24346480, + "step": 33930 + }, + { + "epoch": 70.55093555093555, + "grad_norm": 0.2535615563392639, + "learning_rate": 2.7839688432160977e-06, + "loss": 0.0883, + "num_input_tokens_seen": 24350032, + "step": 33935 + }, + { + "epoch": 70.56133056133056, + "grad_norm": 0.329489529132843, + "learning_rate": 2.779468231648383e-06, + "loss": 0.0815, + "num_input_tokens_seen": 24353552, + "step": 33940 + }, + { + "epoch": 70.57172557172557, + "grad_norm": 0.22032903134822845, + "learning_rate": 2.774971046765906e-06, + "loss": 0.1068, + "num_input_tokens_seen": 24357296, + "step": 33945 + }, + { + "epoch": 70.58212058212058, + "grad_norm": 0.2905738353729248, + "learning_rate": 2.770477289262194e-06, + "loss": 0.1065, + "num_input_tokens_seen": 24360976, + "step": 33950 + }, + { + "epoch": 70.5925155925156, + "grad_norm": 0.6121920943260193, + "learning_rate": 2.765986959830233e-06, + "loss": 0.0965, + "num_input_tokens_seen": 24364656, + "step": 33955 + }, + { + "epoch": 70.6029106029106, + "grad_norm": 0.2538926899433136, + "learning_rate": 2.761500059162492e-06, + "loss": 0.0827, + "num_input_tokens_seen": 24368208, + "step": 33960 + }, + { + "epoch": 70.61330561330561, + "grad_norm": 0.7853887677192688, + "learning_rate": 2.757016587950914e-06, + "loss": 0.1161, + "num_input_tokens_seen": 24371664, + "step": 33965 + }, + { + "epoch": 70.62370062370063, + "grad_norm": 0.5981056690216064, + "learning_rate": 2.752536546886897e-06, + "loss": 0.1569, + "num_input_tokens_seen": 24375280, + "step": 33970 + }, + { + "epoch": 70.63409563409563, + "grad_norm": 0.5081318616867065, + "learning_rate": 2.7480599366613234e-06, + "loss": 0.1019, + "num_input_tokens_seen": 24378832, + "step": 33975 + }, + { + "epoch": 70.64449064449065, + "grad_norm": 0.21112492680549622, + "learning_rate": 2.7435867579645473e-06, + "loss": 0.0941, + "num_input_tokens_seen": 24382320, + "step": 33980 + }, + { + "epoch": 70.65488565488566, + "grad_norm": 0.23516137897968292, + "learning_rate": 2.739117011486378e-06, + "loss": 0.0877, + "num_input_tokens_seen": 24385968, + "step": 33985 + }, + { + "epoch": 70.66528066528066, + "grad_norm": 0.4137808084487915, + "learning_rate": 2.7346506979161216e-06, + "loss": 0.0853, + "num_input_tokens_seen": 24389520, + "step": 33990 + }, + { + "epoch": 70.67567567567568, + "grad_norm": 0.3391462564468384, + "learning_rate": 2.7301878179425227e-06, + "loss": 0.1002, + "num_input_tokens_seen": 24392976, + "step": 33995 + }, + { + "epoch": 70.68607068607069, + "grad_norm": 0.14335514605045319, + "learning_rate": 2.7257283722538244e-06, + "loss": 0.1197, + "num_input_tokens_seen": 24396528, + "step": 34000 + }, + { + "epoch": 70.68607068607069, + "eval_loss": 0.1447901874780655, + "eval_runtime": 7.7578, + "eval_samples_per_second": 110.34, + "eval_steps_per_second": 27.585, + "num_input_tokens_seen": 24396528, + "step": 34000 + }, + { + "epoch": 70.6964656964657, + "grad_norm": 0.241583913564682, + "learning_rate": 2.7212723615377326e-06, + "loss": 0.0782, + "num_input_tokens_seen": 24400208, + "step": 34005 + }, + { + "epoch": 70.70686070686071, + "grad_norm": 0.3206329345703125, + "learning_rate": 2.7168197864814145e-06, + "loss": 0.1134, + "num_input_tokens_seen": 24403664, + "step": 34010 + }, + { + "epoch": 70.71725571725571, + "grad_norm": 0.6881839632987976, + "learning_rate": 2.712370647771509e-06, + "loss": 0.1301, + "num_input_tokens_seen": 24407216, + "step": 34015 + }, + { + "epoch": 70.72765072765073, + "grad_norm": 0.273602694272995, + "learning_rate": 2.707924946094137e-06, + "loss": 0.0903, + "num_input_tokens_seen": 24410640, + "step": 34020 + }, + { + "epoch": 70.73804573804574, + "grad_norm": 0.5589106678962708, + "learning_rate": 2.7034826821348723e-06, + "loss": 0.0791, + "num_input_tokens_seen": 24414256, + "step": 34025 + }, + { + "epoch": 70.74844074844074, + "grad_norm": 0.3881967067718506, + "learning_rate": 2.6990438565787786e-06, + "loss": 0.104, + "num_input_tokens_seen": 24417776, + "step": 34030 + }, + { + "epoch": 70.75883575883576, + "grad_norm": 0.4284743368625641, + "learning_rate": 2.6946084701103714e-06, + "loss": 0.0805, + "num_input_tokens_seen": 24421264, + "step": 34035 + }, + { + "epoch": 70.76923076923077, + "grad_norm": 0.522456169128418, + "learning_rate": 2.6901765234136428e-06, + "loss": 0.1088, + "num_input_tokens_seen": 24424944, + "step": 34040 + }, + { + "epoch": 70.77962577962577, + "grad_norm": 0.45047691464424133, + "learning_rate": 2.685748017172063e-06, + "loss": 0.1039, + "num_input_tokens_seen": 24428688, + "step": 34045 + }, + { + "epoch": 70.79002079002079, + "grad_norm": 0.4172815680503845, + "learning_rate": 2.681322952068549e-06, + "loss": 0.0678, + "num_input_tokens_seen": 24432368, + "step": 34050 + }, + { + "epoch": 70.8004158004158, + "grad_norm": 0.7288976311683655, + "learning_rate": 2.6769013287855137e-06, + "loss": 0.1035, + "num_input_tokens_seen": 24435856, + "step": 34055 + }, + { + "epoch": 70.8108108108108, + "grad_norm": 0.6432005167007446, + "learning_rate": 2.6724831480048286e-06, + "loss": 0.0918, + "num_input_tokens_seen": 24439344, + "step": 34060 + }, + { + "epoch": 70.82120582120582, + "grad_norm": 0.19901449978351593, + "learning_rate": 2.66806841040782e-06, + "loss": 0.114, + "num_input_tokens_seen": 24443120, + "step": 34065 + }, + { + "epoch": 70.83160083160084, + "grad_norm": 0.27967017889022827, + "learning_rate": 2.6636571166753083e-06, + "loss": 0.0682, + "num_input_tokens_seen": 24446544, + "step": 34070 + }, + { + "epoch": 70.84199584199584, + "grad_norm": 0.6557608246803284, + "learning_rate": 2.6592492674875598e-06, + "loss": 0.1017, + "num_input_tokens_seen": 24450256, + "step": 34075 + }, + { + "epoch": 70.85239085239085, + "grad_norm": 0.43726134300231934, + "learning_rate": 2.6548448635243305e-06, + "loss": 0.0708, + "num_input_tokens_seen": 24453808, + "step": 34080 + }, + { + "epoch": 70.86278586278586, + "grad_norm": 0.4753997325897217, + "learning_rate": 2.650443905464828e-06, + "loss": 0.1406, + "num_input_tokens_seen": 24457360, + "step": 34085 + }, + { + "epoch": 70.87318087318087, + "grad_norm": 0.36695054173469543, + "learning_rate": 2.646046393987739e-06, + "loss": 0.0784, + "num_input_tokens_seen": 24460944, + "step": 34090 + }, + { + "epoch": 70.88357588357589, + "grad_norm": 0.23879240453243256, + "learning_rate": 2.64165232977121e-06, + "loss": 0.0644, + "num_input_tokens_seen": 24464560, + "step": 34095 + }, + { + "epoch": 70.89397089397089, + "grad_norm": 0.7172359824180603, + "learning_rate": 2.6372617134928695e-06, + "loss": 0.1042, + "num_input_tokens_seen": 24468272, + "step": 34100 + }, + { + "epoch": 70.9043659043659, + "grad_norm": 0.31560322642326355, + "learning_rate": 2.6328745458297943e-06, + "loss": 0.0788, + "num_input_tokens_seen": 24471952, + "step": 34105 + }, + { + "epoch": 70.91476091476092, + "grad_norm": 0.874816358089447, + "learning_rate": 2.6284908274585546e-06, + "loss": 0.1118, + "num_input_tokens_seen": 24475664, + "step": 34110 + }, + { + "epoch": 70.92515592515592, + "grad_norm": 0.31719163060188293, + "learning_rate": 2.6241105590551595e-06, + "loss": 0.0907, + "num_input_tokens_seen": 24479344, + "step": 34115 + }, + { + "epoch": 70.93555093555094, + "grad_norm": 0.17059510946273804, + "learning_rate": 2.6197337412951105e-06, + "loss": 0.0746, + "num_input_tokens_seen": 24482992, + "step": 34120 + }, + { + "epoch": 70.94594594594595, + "grad_norm": 0.216191828250885, + "learning_rate": 2.6153603748533705e-06, + "loss": 0.08, + "num_input_tokens_seen": 24486480, + "step": 34125 + }, + { + "epoch": 70.95634095634095, + "grad_norm": 0.29609233140945435, + "learning_rate": 2.6109904604043585e-06, + "loss": 0.0871, + "num_input_tokens_seen": 24490000, + "step": 34130 + }, + { + "epoch": 70.96673596673597, + "grad_norm": 0.6851346492767334, + "learning_rate": 2.6066239986219765e-06, + "loss": 0.0861, + "num_input_tokens_seen": 24493520, + "step": 34135 + }, + { + "epoch": 70.97713097713098, + "grad_norm": 0.19994525611400604, + "learning_rate": 2.602260990179592e-06, + "loss": 0.0845, + "num_input_tokens_seen": 24497040, + "step": 34140 + }, + { + "epoch": 70.98752598752598, + "grad_norm": 0.41434720158576965, + "learning_rate": 2.5979014357500248e-06, + "loss": 0.0999, + "num_input_tokens_seen": 24500560, + "step": 34145 + }, + { + "epoch": 70.997920997921, + "grad_norm": 0.14336584508419037, + "learning_rate": 2.5935453360055844e-06, + "loss": 0.097, + "num_input_tokens_seen": 24504176, + "step": 34150 + }, + { + "epoch": 71.00831600831602, + "grad_norm": 0.1906377375125885, + "learning_rate": 2.5891926916180283e-06, + "loss": 0.097, + "num_input_tokens_seen": 24507624, + "step": 34155 + }, + { + "epoch": 71.01871101871102, + "grad_norm": 0.3917270302772522, + "learning_rate": 2.5848435032585883e-06, + "loss": 0.1045, + "num_input_tokens_seen": 24511112, + "step": 34160 + }, + { + "epoch": 71.02910602910603, + "grad_norm": 0.18827971816062927, + "learning_rate": 2.58049777159797e-06, + "loss": 0.0919, + "num_input_tokens_seen": 24514696, + "step": 34165 + }, + { + "epoch": 71.03950103950103, + "grad_norm": 0.34238889813423157, + "learning_rate": 2.576155497306332e-06, + "loss": 0.081, + "num_input_tokens_seen": 24518280, + "step": 34170 + }, + { + "epoch": 71.04989604989605, + "grad_norm": 0.24219195544719696, + "learning_rate": 2.57181668105331e-06, + "loss": 0.0908, + "num_input_tokens_seen": 24521864, + "step": 34175 + }, + { + "epoch": 71.06029106029106, + "grad_norm": 0.4914005994796753, + "learning_rate": 2.567481323508014e-06, + "loss": 0.0864, + "num_input_tokens_seen": 24525448, + "step": 34180 + }, + { + "epoch": 71.07068607068607, + "grad_norm": 0.22819045186042786, + "learning_rate": 2.5631494253389954e-06, + "loss": 0.0933, + "num_input_tokens_seen": 24529064, + "step": 34185 + }, + { + "epoch": 71.08108108108108, + "grad_norm": 0.43767493963241577, + "learning_rate": 2.5588209872142997e-06, + "loss": 0.0986, + "num_input_tokens_seen": 24532680, + "step": 34190 + }, + { + "epoch": 71.0914760914761, + "grad_norm": 0.13295476138591766, + "learning_rate": 2.5544960098014186e-06, + "loss": 0.1048, + "num_input_tokens_seen": 24536424, + "step": 34195 + }, + { + "epoch": 71.1018711018711, + "grad_norm": 0.5171935558319092, + "learning_rate": 2.550174493767318e-06, + "loss": 0.1497, + "num_input_tokens_seen": 24540040, + "step": 34200 + }, + { + "epoch": 71.1018711018711, + "eval_loss": 0.14526554942131042, + "eval_runtime": 7.7556, + "eval_samples_per_second": 110.371, + "eval_steps_per_second": 27.593, + "num_input_tokens_seen": 24540040, + "step": 34200 + }, + { + "epoch": 71.11226611226611, + "grad_norm": 0.8297691345214844, + "learning_rate": 2.545856439778438e-06, + "loss": 0.1501, + "num_input_tokens_seen": 24543560, + "step": 34205 + }, + { + "epoch": 71.12266112266113, + "grad_norm": 0.42994415760040283, + "learning_rate": 2.541541848500667e-06, + "loss": 0.1031, + "num_input_tokens_seen": 24547048, + "step": 34210 + }, + { + "epoch": 71.13305613305613, + "grad_norm": 0.43549954891204834, + "learning_rate": 2.5372307205993733e-06, + "loss": 0.1121, + "num_input_tokens_seen": 24550568, + "step": 34215 + }, + { + "epoch": 71.14345114345114, + "grad_norm": 0.22640030086040497, + "learning_rate": 2.5329230567393917e-06, + "loss": 0.0622, + "num_input_tokens_seen": 24554024, + "step": 34220 + }, + { + "epoch": 71.15384615384616, + "grad_norm": 0.486319363117218, + "learning_rate": 2.5286188575850164e-06, + "loss": 0.1388, + "num_input_tokens_seen": 24557768, + "step": 34225 + }, + { + "epoch": 71.16424116424116, + "grad_norm": 0.4653950035572052, + "learning_rate": 2.5243181237999984e-06, + "loss": 0.0806, + "num_input_tokens_seen": 24561160, + "step": 34230 + }, + { + "epoch": 71.17463617463618, + "grad_norm": 0.4044608473777771, + "learning_rate": 2.520020856047578e-06, + "loss": 0.0996, + "num_input_tokens_seen": 24564712, + "step": 34235 + }, + { + "epoch": 71.18503118503118, + "grad_norm": 0.6521154642105103, + "learning_rate": 2.515727054990438e-06, + "loss": 0.0654, + "num_input_tokens_seen": 24568200, + "step": 34240 + }, + { + "epoch": 71.1954261954262, + "grad_norm": 0.3743376135826111, + "learning_rate": 2.511436721290747e-06, + "loss": 0.1062, + "num_input_tokens_seen": 24571688, + "step": 34245 + }, + { + "epoch": 71.20582120582121, + "grad_norm": 0.28419631719589233, + "learning_rate": 2.5071498556101164e-06, + "loss": 0.0915, + "num_input_tokens_seen": 24575304, + "step": 34250 + }, + { + "epoch": 71.21621621621621, + "grad_norm": 0.10830137878656387, + "learning_rate": 2.5028664586096485e-06, + "loss": 0.049, + "num_input_tokens_seen": 24578792, + "step": 34255 + }, + { + "epoch": 71.22661122661123, + "grad_norm": 0.3344177305698395, + "learning_rate": 2.498586530949881e-06, + "loss": 0.0885, + "num_input_tokens_seen": 24582472, + "step": 34260 + }, + { + "epoch": 71.23700623700624, + "grad_norm": 0.7113394141197205, + "learning_rate": 2.4943100732908427e-06, + "loss": 0.1073, + "num_input_tokens_seen": 24586088, + "step": 34265 + }, + { + "epoch": 71.24740124740124, + "grad_norm": 0.4729563891887665, + "learning_rate": 2.4900370862920188e-06, + "loss": 0.0979, + "num_input_tokens_seen": 24589544, + "step": 34270 + }, + { + "epoch": 71.25779625779626, + "grad_norm": 0.6211997270584106, + "learning_rate": 2.4857675706123518e-06, + "loss": 0.0582, + "num_input_tokens_seen": 24593096, + "step": 34275 + }, + { + "epoch": 71.26819126819127, + "grad_norm": 0.1974535882472992, + "learning_rate": 2.4815015269102543e-06, + "loss": 0.0711, + "num_input_tokens_seen": 24596776, + "step": 34280 + }, + { + "epoch": 71.27858627858627, + "grad_norm": 0.3873634338378906, + "learning_rate": 2.477238955843611e-06, + "loss": 0.0788, + "num_input_tokens_seen": 24600296, + "step": 34285 + }, + { + "epoch": 71.28898128898129, + "grad_norm": 0.5760968923568726, + "learning_rate": 2.4729798580697573e-06, + "loss": 0.1276, + "num_input_tokens_seen": 24603912, + "step": 34290 + }, + { + "epoch": 71.2993762993763, + "grad_norm": 0.40480837225914, + "learning_rate": 2.4687242342455034e-06, + "loss": 0.0796, + "num_input_tokens_seen": 24607528, + "step": 34295 + }, + { + "epoch": 71.3097713097713, + "grad_norm": 0.4317091405391693, + "learning_rate": 2.4644720850271196e-06, + "loss": 0.1171, + "num_input_tokens_seen": 24611112, + "step": 34300 + }, + { + "epoch": 71.32016632016632, + "grad_norm": 0.7568615078926086, + "learning_rate": 2.4602234110703364e-06, + "loss": 0.0742, + "num_input_tokens_seen": 24614728, + "step": 34305 + }, + { + "epoch": 71.33056133056132, + "grad_norm": 0.3994470238685608, + "learning_rate": 2.4559782130303576e-06, + "loss": 0.1376, + "num_input_tokens_seen": 24618376, + "step": 34310 + }, + { + "epoch": 71.34095634095634, + "grad_norm": 0.32711607217788696, + "learning_rate": 2.451736491561843e-06, + "loss": 0.0831, + "num_input_tokens_seen": 24621928, + "step": 34315 + }, + { + "epoch": 71.35135135135135, + "grad_norm": 0.7378814220428467, + "learning_rate": 2.4474982473189163e-06, + "loss": 0.0994, + "num_input_tokens_seen": 24625512, + "step": 34320 + }, + { + "epoch": 71.36174636174636, + "grad_norm": 0.588908314704895, + "learning_rate": 2.4432634809551796e-06, + "loss": 0.1095, + "num_input_tokens_seen": 24629288, + "step": 34325 + }, + { + "epoch": 71.37214137214137, + "grad_norm": 0.21179592609405518, + "learning_rate": 2.439032193123675e-06, + "loss": 0.1132, + "num_input_tokens_seen": 24632936, + "step": 34330 + }, + { + "epoch": 71.38253638253639, + "grad_norm": 0.16794411838054657, + "learning_rate": 2.4348043844769297e-06, + "loss": 0.1546, + "num_input_tokens_seen": 24636552, + "step": 34335 + }, + { + "epoch": 71.39293139293139, + "grad_norm": 0.16011683642864227, + "learning_rate": 2.4305800556669146e-06, + "loss": 0.0585, + "num_input_tokens_seen": 24640136, + "step": 34340 + }, + { + "epoch": 71.4033264033264, + "grad_norm": 0.3828398585319519, + "learning_rate": 2.426359207345083e-06, + "loss": 0.0635, + "num_input_tokens_seen": 24643688, + "step": 34345 + }, + { + "epoch": 71.41372141372142, + "grad_norm": 0.25398552417755127, + "learning_rate": 2.4221418401623396e-06, + "loss": 0.103, + "num_input_tokens_seen": 24647272, + "step": 34350 + }, + { + "epoch": 71.42411642411642, + "grad_norm": 0.1916150003671646, + "learning_rate": 2.4179279547690557e-06, + "loss": 0.0659, + "num_input_tokens_seen": 24650696, + "step": 34355 + }, + { + "epoch": 71.43451143451144, + "grad_norm": 0.3008512854576111, + "learning_rate": 2.413717551815062e-06, + "loss": 0.1055, + "num_input_tokens_seen": 24654312, + "step": 34360 + }, + { + "epoch": 71.44490644490645, + "grad_norm": 0.2931464612483978, + "learning_rate": 2.409510631949666e-06, + "loss": 0.1074, + "num_input_tokens_seen": 24657960, + "step": 34365 + }, + { + "epoch": 71.45530145530145, + "grad_norm": 0.619350790977478, + "learning_rate": 2.405307195821618e-06, + "loss": 0.0825, + "num_input_tokens_seen": 24661576, + "step": 34370 + }, + { + "epoch": 71.46569646569647, + "grad_norm": 0.4923252761363983, + "learning_rate": 2.4011072440791372e-06, + "loss": 0.1058, + "num_input_tokens_seen": 24665096, + "step": 34375 + }, + { + "epoch": 71.47609147609148, + "grad_norm": 0.2547106444835663, + "learning_rate": 2.3969107773699233e-06, + "loss": 0.0885, + "num_input_tokens_seen": 24668776, + "step": 34380 + }, + { + "epoch": 71.48648648648648, + "grad_norm": 0.16433578729629517, + "learning_rate": 2.3927177963411096e-06, + "loss": 0.0646, + "num_input_tokens_seen": 24672392, + "step": 34385 + }, + { + "epoch": 71.4968814968815, + "grad_norm": 0.3008168339729309, + "learning_rate": 2.3885283016393144e-06, + "loss": 0.1031, + "num_input_tokens_seen": 24676040, + "step": 34390 + }, + { + "epoch": 71.5072765072765, + "grad_norm": 0.18070077896118164, + "learning_rate": 2.3843422939106076e-06, + "loss": 0.101, + "num_input_tokens_seen": 24679560, + "step": 34395 + }, + { + "epoch": 71.51767151767152, + "grad_norm": 0.5171475410461426, + "learning_rate": 2.380159773800525e-06, + "loss": 0.1028, + "num_input_tokens_seen": 24683144, + "step": 34400 + }, + { + "epoch": 71.51767151767152, + "eval_loss": 0.14505870640277863, + "eval_runtime": 7.7462, + "eval_samples_per_second": 110.506, + "eval_steps_per_second": 27.626, + "num_input_tokens_seen": 24683144, + "step": 34400 + }, + { + "epoch": 71.52806652806653, + "grad_norm": 0.18347632884979248, + "learning_rate": 2.3759807419540675e-06, + "loss": 0.062, + "num_input_tokens_seen": 24686760, + "step": 34405 + }, + { + "epoch": 71.53846153846153, + "grad_norm": 1.0086153745651245, + "learning_rate": 2.3718051990156835e-06, + "loss": 0.1132, + "num_input_tokens_seen": 24690216, + "step": 34410 + }, + { + "epoch": 71.54885654885655, + "grad_norm": 0.17964713275432587, + "learning_rate": 2.367633145629311e-06, + "loss": 0.0537, + "num_input_tokens_seen": 24693800, + "step": 34415 + }, + { + "epoch": 71.55925155925156, + "grad_norm": 0.5211385488510132, + "learning_rate": 2.363464582438316e-06, + "loss": 0.0973, + "num_input_tokens_seen": 24697288, + "step": 34420 + }, + { + "epoch": 71.56964656964657, + "grad_norm": 0.3672832250595093, + "learning_rate": 2.3592995100855526e-06, + "loss": 0.0943, + "num_input_tokens_seen": 24700840, + "step": 34425 + }, + { + "epoch": 71.58004158004158, + "grad_norm": 0.19795668125152588, + "learning_rate": 2.3551379292133273e-06, + "loss": 0.1044, + "num_input_tokens_seen": 24704456, + "step": 34430 + }, + { + "epoch": 71.5904365904366, + "grad_norm": 0.30556485056877136, + "learning_rate": 2.3509798404634047e-06, + "loss": 0.1189, + "num_input_tokens_seen": 24708136, + "step": 34435 + }, + { + "epoch": 71.6008316008316, + "grad_norm": 0.24251899123191833, + "learning_rate": 2.346825244477019e-06, + "loss": 0.095, + "num_input_tokens_seen": 24711560, + "step": 34440 + }, + { + "epoch": 71.61122661122661, + "grad_norm": 0.3202005624771118, + "learning_rate": 2.3426741418948545e-06, + "loss": 0.1192, + "num_input_tokens_seen": 24715272, + "step": 34445 + }, + { + "epoch": 71.62162162162163, + "grad_norm": 0.15312695503234863, + "learning_rate": 2.3385265333570715e-06, + "loss": 0.1127, + "num_input_tokens_seen": 24718792, + "step": 34450 + }, + { + "epoch": 71.63201663201663, + "grad_norm": 0.3670141398906708, + "learning_rate": 2.334382419503278e-06, + "loss": 0.0854, + "num_input_tokens_seen": 24722280, + "step": 34455 + }, + { + "epoch": 71.64241164241164, + "grad_norm": 0.2374100387096405, + "learning_rate": 2.3302418009725465e-06, + "loss": 0.09, + "num_input_tokens_seen": 24725864, + "step": 34460 + }, + { + "epoch": 71.65280665280665, + "grad_norm": 0.20128706097602844, + "learning_rate": 2.326104678403415e-06, + "loss": 0.0907, + "num_input_tokens_seen": 24729544, + "step": 34465 + }, + { + "epoch": 71.66320166320166, + "grad_norm": 0.23086324334144592, + "learning_rate": 2.321971052433883e-06, + "loss": 0.0795, + "num_input_tokens_seen": 24733288, + "step": 34470 + }, + { + "epoch": 71.67359667359668, + "grad_norm": 0.3451364040374756, + "learning_rate": 2.3178409237014004e-06, + "loss": 0.1152, + "num_input_tokens_seen": 24736872, + "step": 34475 + }, + { + "epoch": 71.68399168399168, + "grad_norm": 0.8078809380531311, + "learning_rate": 2.313714292842889e-06, + "loss": 0.1118, + "num_input_tokens_seen": 24740392, + "step": 34480 + }, + { + "epoch": 71.6943866943867, + "grad_norm": 0.4019432067871094, + "learning_rate": 2.309591160494734e-06, + "loss": 0.1078, + "num_input_tokens_seen": 24743976, + "step": 34485 + }, + { + "epoch": 71.70478170478171, + "grad_norm": 0.2138061225414276, + "learning_rate": 2.305471527292763e-06, + "loss": 0.1227, + "num_input_tokens_seen": 24747464, + "step": 34490 + }, + { + "epoch": 71.71517671517671, + "grad_norm": 0.41063281893730164, + "learning_rate": 2.3013553938722817e-06, + "loss": 0.1124, + "num_input_tokens_seen": 24751112, + "step": 34495 + }, + { + "epoch": 71.72557172557173, + "grad_norm": 0.950740396976471, + "learning_rate": 2.297242760868043e-06, + "loss": 0.0725, + "num_input_tokens_seen": 24754920, + "step": 34500 + }, + { + "epoch": 71.73596673596674, + "grad_norm": 0.38189297914505005, + "learning_rate": 2.2931336289142735e-06, + "loss": 0.1076, + "num_input_tokens_seen": 24758600, + "step": 34505 + }, + { + "epoch": 71.74636174636174, + "grad_norm": 0.18034528195858002, + "learning_rate": 2.289027998644655e-06, + "loss": 0.0703, + "num_input_tokens_seen": 24762152, + "step": 34510 + }, + { + "epoch": 71.75675675675676, + "grad_norm": 0.26124557852745056, + "learning_rate": 2.2849258706923228e-06, + "loss": 0.0748, + "num_input_tokens_seen": 24765992, + "step": 34515 + }, + { + "epoch": 71.76715176715177, + "grad_norm": 0.16062182188034058, + "learning_rate": 2.2808272456898705e-06, + "loss": 0.0919, + "num_input_tokens_seen": 24769672, + "step": 34520 + }, + { + "epoch": 71.77754677754677, + "grad_norm": 0.3260864317417145, + "learning_rate": 2.2767321242693707e-06, + "loss": 0.1503, + "num_input_tokens_seen": 24773352, + "step": 34525 + }, + { + "epoch": 71.78794178794179, + "grad_norm": 0.3509436249732971, + "learning_rate": 2.272640507062329e-06, + "loss": 0.132, + "num_input_tokens_seen": 24777096, + "step": 34530 + }, + { + "epoch": 71.7983367983368, + "grad_norm": 0.42399531602859497, + "learning_rate": 2.2685523946997382e-06, + "loss": 0.1207, + "num_input_tokens_seen": 24780776, + "step": 34535 + }, + { + "epoch": 71.8087318087318, + "grad_norm": 0.46016231179237366, + "learning_rate": 2.2644677878120245e-06, + "loss": 0.0902, + "num_input_tokens_seen": 24784360, + "step": 34540 + }, + { + "epoch": 71.81912681912682, + "grad_norm": 0.5392308235168457, + "learning_rate": 2.2603866870290897e-06, + "loss": 0.1027, + "num_input_tokens_seen": 24788008, + "step": 34545 + }, + { + "epoch": 71.82952182952182, + "grad_norm": 0.3949224054813385, + "learning_rate": 2.256309092980294e-06, + "loss": 0.0721, + "num_input_tokens_seen": 24791624, + "step": 34550 + }, + { + "epoch": 71.83991683991684, + "grad_norm": 0.5244479179382324, + "learning_rate": 2.252235006294448e-06, + "loss": 0.0857, + "num_input_tokens_seen": 24795272, + "step": 34555 + }, + { + "epoch": 71.85031185031185, + "grad_norm": 0.3812003433704376, + "learning_rate": 2.2481644275998333e-06, + "loss": 0.062, + "num_input_tokens_seen": 24798760, + "step": 34560 + }, + { + "epoch": 71.86070686070686, + "grad_norm": 0.1947496086359024, + "learning_rate": 2.2440973575241832e-06, + "loss": 0.1044, + "num_input_tokens_seen": 24802248, + "step": 34565 + }, + { + "epoch": 71.87110187110187, + "grad_norm": 0.7617414593696594, + "learning_rate": 2.240033796694685e-06, + "loss": 0.0809, + "num_input_tokens_seen": 24805736, + "step": 34570 + }, + { + "epoch": 71.88149688149689, + "grad_norm": 0.41451308131217957, + "learning_rate": 2.235973745737999e-06, + "loss": 0.1251, + "num_input_tokens_seen": 24809256, + "step": 34575 + }, + { + "epoch": 71.89189189189189, + "grad_norm": 0.7465566396713257, + "learning_rate": 2.2319172052802263e-06, + "loss": 0.0971, + "num_input_tokens_seen": 24812776, + "step": 34580 + }, + { + "epoch": 71.9022869022869, + "grad_norm": 0.5537139177322388, + "learning_rate": 2.2278641759469477e-06, + "loss": 0.0674, + "num_input_tokens_seen": 24816264, + "step": 34585 + }, + { + "epoch": 71.91268191268192, + "grad_norm": 0.4524502754211426, + "learning_rate": 2.2238146583631825e-06, + "loss": 0.1138, + "num_input_tokens_seen": 24819880, + "step": 34590 + }, + { + "epoch": 71.92307692307692, + "grad_norm": 0.23480333387851715, + "learning_rate": 2.2197686531534256e-06, + "loss": 0.0738, + "num_input_tokens_seen": 24823624, + "step": 34595 + }, + { + "epoch": 71.93347193347194, + "grad_norm": 0.3917766809463501, + "learning_rate": 2.2157261609416087e-06, + "loss": 0.0874, + "num_input_tokens_seen": 24827048, + "step": 34600 + }, + { + "epoch": 71.93347193347194, + "eval_loss": 0.14579492807388306, + "eval_runtime": 7.7441, + "eval_samples_per_second": 110.536, + "eval_steps_per_second": 27.634, + "num_input_tokens_seen": 24827048, + "step": 34600 + }, + { + "epoch": 71.94386694386695, + "grad_norm": 0.21865254640579224, + "learning_rate": 2.211687182351149e-06, + "loss": 0.0927, + "num_input_tokens_seen": 24830536, + "step": 34605 + }, + { + "epoch": 71.95426195426195, + "grad_norm": 0.2794295847415924, + "learning_rate": 2.2076517180048993e-06, + "loss": 0.1117, + "num_input_tokens_seen": 24834120, + "step": 34610 + }, + { + "epoch": 71.96465696465697, + "grad_norm": 0.7019571661949158, + "learning_rate": 2.2036197685251834e-06, + "loss": 0.1205, + "num_input_tokens_seen": 24837640, + "step": 34615 + }, + { + "epoch": 71.97505197505197, + "grad_norm": 0.4584653675556183, + "learning_rate": 2.199591334533771e-06, + "loss": 0.1277, + "num_input_tokens_seen": 24841256, + "step": 34620 + }, + { + "epoch": 71.98544698544698, + "grad_norm": 0.6429937481880188, + "learning_rate": 2.1955664166519036e-06, + "loss": 0.1361, + "num_input_tokens_seen": 24844776, + "step": 34625 + }, + { + "epoch": 71.995841995842, + "grad_norm": 0.7246837615966797, + "learning_rate": 2.1915450155002793e-06, + "loss": 0.0753, + "num_input_tokens_seen": 24848520, + "step": 34630 + }, + { + "epoch": 72.006237006237, + "grad_norm": 0.17882856726646423, + "learning_rate": 2.187527131699038e-06, + "loss": 0.0969, + "num_input_tokens_seen": 24851992, + "step": 34635 + }, + { + "epoch": 72.01663201663202, + "grad_norm": 0.2803903818130493, + "learning_rate": 2.18351276586779e-06, + "loss": 0.1161, + "num_input_tokens_seen": 24855480, + "step": 34640 + }, + { + "epoch": 72.02702702702703, + "grad_norm": 1.0578473806381226, + "learning_rate": 2.1795019186256092e-06, + "loss": 0.1494, + "num_input_tokens_seen": 24859256, + "step": 34645 + }, + { + "epoch": 72.03742203742203, + "grad_norm": 0.2712996006011963, + "learning_rate": 2.1754945905910094e-06, + "loss": 0.1012, + "num_input_tokens_seen": 24863064, + "step": 34650 + }, + { + "epoch": 72.04781704781705, + "grad_norm": 0.808577299118042, + "learning_rate": 2.171490782381977e-06, + "loss": 0.0951, + "num_input_tokens_seen": 24866616, + "step": 34655 + }, + { + "epoch": 72.05821205821206, + "grad_norm": 0.23224377632141113, + "learning_rate": 2.1674904946159425e-06, + "loss": 0.1031, + "num_input_tokens_seen": 24870200, + "step": 34660 + }, + { + "epoch": 72.06860706860707, + "grad_norm": 0.22909285128116608, + "learning_rate": 2.16349372790981e-06, + "loss": 0.1209, + "num_input_tokens_seen": 24873816, + "step": 34665 + }, + { + "epoch": 72.07900207900208, + "grad_norm": 0.2302735447883606, + "learning_rate": 2.159500482879928e-06, + "loss": 0.0838, + "num_input_tokens_seen": 24877368, + "step": 34670 + }, + { + "epoch": 72.0893970893971, + "grad_norm": 0.5071144700050354, + "learning_rate": 2.155510760142096e-06, + "loss": 0.1307, + "num_input_tokens_seen": 24880952, + "step": 34675 + }, + { + "epoch": 72.0997920997921, + "grad_norm": 0.23956649005413055, + "learning_rate": 2.151524560311588e-06, + "loss": 0.1411, + "num_input_tokens_seen": 24884728, + "step": 34680 + }, + { + "epoch": 72.11018711018711, + "grad_norm": 0.3713880181312561, + "learning_rate": 2.147541884003129e-06, + "loss": 0.0714, + "num_input_tokens_seen": 24888504, + "step": 34685 + }, + { + "epoch": 72.12058212058211, + "grad_norm": 0.1521829515695572, + "learning_rate": 2.1435627318308895e-06, + "loss": 0.0609, + "num_input_tokens_seen": 24892280, + "step": 34690 + }, + { + "epoch": 72.13097713097713, + "grad_norm": 0.576556921005249, + "learning_rate": 2.139587104408511e-06, + "loss": 0.1313, + "num_input_tokens_seen": 24895832, + "step": 34695 + }, + { + "epoch": 72.14137214137214, + "grad_norm": 0.9603627324104309, + "learning_rate": 2.1356150023490783e-06, + "loss": 0.1202, + "num_input_tokens_seen": 24899576, + "step": 34700 + }, + { + "epoch": 72.15176715176715, + "grad_norm": 0.45298755168914795, + "learning_rate": 2.1316464262651464e-06, + "loss": 0.0965, + "num_input_tokens_seen": 24903064, + "step": 34705 + }, + { + "epoch": 72.16216216216216, + "grad_norm": 1.133068323135376, + "learning_rate": 2.1276813767687224e-06, + "loss": 0.0755, + "num_input_tokens_seen": 24906584, + "step": 34710 + }, + { + "epoch": 72.17255717255718, + "grad_norm": 0.2114570438861847, + "learning_rate": 2.123719854471254e-06, + "loss": 0.1057, + "num_input_tokens_seen": 24910136, + "step": 34715 + }, + { + "epoch": 72.18295218295218, + "grad_norm": 0.44796231389045715, + "learning_rate": 2.119761859983668e-06, + "loss": 0.1006, + "num_input_tokens_seen": 24913656, + "step": 34720 + }, + { + "epoch": 72.1933471933472, + "grad_norm": 0.34282106161117554, + "learning_rate": 2.1158073939163386e-06, + "loss": 0.096, + "num_input_tokens_seen": 24917144, + "step": 34725 + }, + { + "epoch": 72.20374220374221, + "grad_norm": 0.34475672245025635, + "learning_rate": 2.111856456879088e-06, + "loss": 0.0775, + "num_input_tokens_seen": 24920792, + "step": 34730 + }, + { + "epoch": 72.21413721413721, + "grad_norm": 0.18240977823734283, + "learning_rate": 2.1079090494811993e-06, + "loss": 0.0704, + "num_input_tokens_seen": 24924568, + "step": 34735 + }, + { + "epoch": 72.22453222453223, + "grad_norm": 0.4325333833694458, + "learning_rate": 2.103965172331418e-06, + "loss": 0.0803, + "num_input_tokens_seen": 24928024, + "step": 34740 + }, + { + "epoch": 72.23492723492724, + "grad_norm": 0.2161497324705124, + "learning_rate": 2.100024826037933e-06, + "loss": 0.0875, + "num_input_tokens_seen": 24931704, + "step": 34745 + }, + { + "epoch": 72.24532224532224, + "grad_norm": 0.3332388699054718, + "learning_rate": 2.0960880112084027e-06, + "loss": 0.1021, + "num_input_tokens_seen": 24935256, + "step": 34750 + }, + { + "epoch": 72.25571725571726, + "grad_norm": 0.09849938005208969, + "learning_rate": 2.092154728449927e-06, + "loss": 0.0692, + "num_input_tokens_seen": 24938680, + "step": 34755 + }, + { + "epoch": 72.26611226611226, + "grad_norm": 0.4246688783168793, + "learning_rate": 2.0882249783690687e-06, + "loss": 0.0512, + "num_input_tokens_seen": 24942136, + "step": 34760 + }, + { + "epoch": 72.27650727650727, + "grad_norm": 0.275747686624527, + "learning_rate": 2.084298761571851e-06, + "loss": 0.0661, + "num_input_tokens_seen": 24945944, + "step": 34765 + }, + { + "epoch": 72.28690228690229, + "grad_norm": 0.40383487939834595, + "learning_rate": 2.080376078663737e-06, + "loss": 0.0811, + "num_input_tokens_seen": 24949496, + "step": 34770 + }, + { + "epoch": 72.29729729729729, + "grad_norm": 0.48921653628349304, + "learning_rate": 2.0764569302496593e-06, + "loss": 0.1197, + "num_input_tokens_seen": 24953144, + "step": 34775 + }, + { + "epoch": 72.3076923076923, + "grad_norm": 0.37298619747161865, + "learning_rate": 2.0725413169339957e-06, + "loss": 0.1238, + "num_input_tokens_seen": 24956536, + "step": 34780 + }, + { + "epoch": 72.31808731808732, + "grad_norm": 0.4039616584777832, + "learning_rate": 2.068629239320588e-06, + "loss": 0.1066, + "num_input_tokens_seen": 24960024, + "step": 34785 + }, + { + "epoch": 72.32848232848232, + "grad_norm": 0.5145837068557739, + "learning_rate": 2.064720698012726e-06, + "loss": 0.0576, + "num_input_tokens_seen": 24963544, + "step": 34790 + }, + { + "epoch": 72.33887733887734, + "grad_norm": 0.6645298600196838, + "learning_rate": 2.0608156936131522e-06, + "loss": 0.0988, + "num_input_tokens_seen": 24967064, + "step": 34795 + }, + { + "epoch": 72.34927234927235, + "grad_norm": 0.16015592217445374, + "learning_rate": 2.056914226724074e-06, + "loss": 0.1154, + "num_input_tokens_seen": 24970840, + "step": 34800 + }, + { + "epoch": 72.34927234927235, + "eval_loss": 0.14512072503566742, + "eval_runtime": 7.7604, + "eval_samples_per_second": 110.303, + "eval_steps_per_second": 27.576, + "num_input_tokens_seen": 24970840, + "step": 34800 + }, + { + "epoch": 72.35966735966736, + "grad_norm": 0.13192863762378693, + "learning_rate": 2.0530162979471385e-06, + "loss": 0.0899, + "num_input_tokens_seen": 24974424, + "step": 34805 + }, + { + "epoch": 72.37006237006237, + "grad_norm": 0.465313583612442, + "learning_rate": 2.0491219078834667e-06, + "loss": 0.0809, + "num_input_tokens_seen": 24978168, + "step": 34810 + }, + { + "epoch": 72.38045738045739, + "grad_norm": 0.23391470313072205, + "learning_rate": 2.045231057133612e-06, + "loss": 0.1006, + "num_input_tokens_seen": 24981560, + "step": 34815 + }, + { + "epoch": 72.39085239085239, + "grad_norm": 1.2855805158615112, + "learning_rate": 2.0413437462975944e-06, + "loss": 0.0998, + "num_input_tokens_seen": 24985176, + "step": 34820 + }, + { + "epoch": 72.4012474012474, + "grad_norm": 0.4719844460487366, + "learning_rate": 2.0374599759748843e-06, + "loss": 0.103, + "num_input_tokens_seen": 24988952, + "step": 34825 + }, + { + "epoch": 72.41164241164242, + "grad_norm": 0.3119743764400482, + "learning_rate": 2.033579746764419e-06, + "loss": 0.0732, + "num_input_tokens_seen": 24992504, + "step": 34830 + }, + { + "epoch": 72.42203742203742, + "grad_norm": 0.5387187600135803, + "learning_rate": 2.029703059264565e-06, + "loss": 0.0673, + "num_input_tokens_seen": 24996152, + "step": 34835 + }, + { + "epoch": 72.43243243243244, + "grad_norm": 0.5006306767463684, + "learning_rate": 2.02582991407316e-06, + "loss": 0.0864, + "num_input_tokens_seen": 24999672, + "step": 34840 + }, + { + "epoch": 72.44282744282744, + "grad_norm": 0.7044507265090942, + "learning_rate": 2.0219603117874992e-06, + "loss": 0.1793, + "num_input_tokens_seen": 25003512, + "step": 34845 + }, + { + "epoch": 72.45322245322245, + "grad_norm": 0.5148358941078186, + "learning_rate": 2.0180942530043156e-06, + "loss": 0.0805, + "num_input_tokens_seen": 25007288, + "step": 34850 + }, + { + "epoch": 72.46361746361747, + "grad_norm": 0.5614880919456482, + "learning_rate": 2.0142317383198107e-06, + "loss": 0.0945, + "num_input_tokens_seen": 25010904, + "step": 34855 + }, + { + "epoch": 72.47401247401247, + "grad_norm": 0.4163569211959839, + "learning_rate": 2.0103727683296243e-06, + "loss": 0.1207, + "num_input_tokens_seen": 25014520, + "step": 34860 + }, + { + "epoch": 72.48440748440748, + "grad_norm": 0.42683905363082886, + "learning_rate": 2.0065173436288636e-06, + "loss": 0.0739, + "num_input_tokens_seen": 25018200, + "step": 34865 + }, + { + "epoch": 72.4948024948025, + "grad_norm": 0.4297613203525543, + "learning_rate": 2.002665464812087e-06, + "loss": 0.0993, + "num_input_tokens_seen": 25021880, + "step": 34870 + }, + { + "epoch": 72.5051975051975, + "grad_norm": 0.6755818724632263, + "learning_rate": 1.998817132473291e-06, + "loss": 0.0979, + "num_input_tokens_seen": 25025560, + "step": 34875 + }, + { + "epoch": 72.51559251559252, + "grad_norm": 0.3178645372390747, + "learning_rate": 1.9949723472059507e-06, + "loss": 0.0875, + "num_input_tokens_seen": 25029080, + "step": 34880 + }, + { + "epoch": 72.52598752598753, + "grad_norm": 0.33778446912765503, + "learning_rate": 1.9911311096029726e-06, + "loss": 0.1031, + "num_input_tokens_seen": 25032600, + "step": 34885 + }, + { + "epoch": 72.53638253638253, + "grad_norm": 0.29210418462753296, + "learning_rate": 1.9872934202567224e-06, + "loss": 0.108, + "num_input_tokens_seen": 25036152, + "step": 34890 + }, + { + "epoch": 72.54677754677755, + "grad_norm": 0.15637445449829102, + "learning_rate": 1.9834592797590257e-06, + "loss": 0.1029, + "num_input_tokens_seen": 25039960, + "step": 34895 + }, + { + "epoch": 72.55717255717256, + "grad_norm": 0.73127281665802, + "learning_rate": 1.979628688701149e-06, + "loss": 0.0841, + "num_input_tokens_seen": 25043608, + "step": 34900 + }, + { + "epoch": 72.56756756756756, + "grad_norm": 0.4005572199821472, + "learning_rate": 1.9758016476738193e-06, + "loss": 0.0838, + "num_input_tokens_seen": 25047160, + "step": 34905 + }, + { + "epoch": 72.57796257796258, + "grad_norm": 0.3378797769546509, + "learning_rate": 1.971978157267221e-06, + "loss": 0.0963, + "num_input_tokens_seen": 25050776, + "step": 34910 + }, + { + "epoch": 72.58835758835758, + "grad_norm": 0.23470133543014526, + "learning_rate": 1.968158218070973e-06, + "loss": 0.0863, + "num_input_tokens_seen": 25054360, + "step": 34915 + }, + { + "epoch": 72.5987525987526, + "grad_norm": 1.219054102897644, + "learning_rate": 1.9643418306741682e-06, + "loss": 0.1747, + "num_input_tokens_seen": 25057848, + "step": 34920 + }, + { + "epoch": 72.60914760914761, + "grad_norm": 0.5702738761901855, + "learning_rate": 1.9605289956653337e-06, + "loss": 0.1063, + "num_input_tokens_seen": 25061432, + "step": 34925 + }, + { + "epoch": 72.61954261954261, + "grad_norm": 0.8029894232749939, + "learning_rate": 1.9567197136324626e-06, + "loss": 0.0724, + "num_input_tokens_seen": 25065112, + "step": 34930 + }, + { + "epoch": 72.62993762993763, + "grad_norm": 0.5911422371864319, + "learning_rate": 1.9529139851629935e-06, + "loss": 0.1145, + "num_input_tokens_seen": 25068600, + "step": 34935 + }, + { + "epoch": 72.64033264033264, + "grad_norm": 0.14647628366947174, + "learning_rate": 1.949111810843812e-06, + "loss": 0.1303, + "num_input_tokens_seen": 25072152, + "step": 34940 + }, + { + "epoch": 72.65072765072765, + "grad_norm": 0.19437769055366516, + "learning_rate": 1.9453131912612694e-06, + "loss": 0.0855, + "num_input_tokens_seen": 25075800, + "step": 34945 + }, + { + "epoch": 72.66112266112266, + "grad_norm": 0.2314624786376953, + "learning_rate": 1.941518127001149e-06, + "loss": 0.0888, + "num_input_tokens_seen": 25079416, + "step": 34950 + }, + { + "epoch": 72.67151767151768, + "grad_norm": 0.7104864716529846, + "learning_rate": 1.9377266186487107e-06, + "loss": 0.0912, + "num_input_tokens_seen": 25082904, + "step": 34955 + }, + { + "epoch": 72.68191268191268, + "grad_norm": 0.2599247395992279, + "learning_rate": 1.9339386667886483e-06, + "loss": 0.0798, + "num_input_tokens_seen": 25086552, + "step": 34960 + }, + { + "epoch": 72.6923076923077, + "grad_norm": 0.24610185623168945, + "learning_rate": 1.9301542720051024e-06, + "loss": 0.0961, + "num_input_tokens_seen": 25090168, + "step": 34965 + }, + { + "epoch": 72.70270270270271, + "grad_norm": 0.4189920723438263, + "learning_rate": 1.926373434881684e-06, + "loss": 0.1208, + "num_input_tokens_seen": 25093976, + "step": 34970 + }, + { + "epoch": 72.71309771309771, + "grad_norm": 0.19931532442569733, + "learning_rate": 1.9225961560014468e-06, + "loss": 0.0927, + "num_input_tokens_seen": 25097560, + "step": 34975 + }, + { + "epoch": 72.72349272349273, + "grad_norm": 0.18467596173286438, + "learning_rate": 1.918822435946885e-06, + "loss": 0.0822, + "num_input_tokens_seen": 25101208, + "step": 34980 + }, + { + "epoch": 72.73388773388774, + "grad_norm": 0.1449846774339676, + "learning_rate": 1.915052275299961e-06, + "loss": 0.0863, + "num_input_tokens_seen": 25104760, + "step": 34985 + }, + { + "epoch": 72.74428274428274, + "grad_norm": 0.3823544681072235, + "learning_rate": 1.9112856746420854e-06, + "loss": 0.0703, + "num_input_tokens_seen": 25108376, + "step": 34990 + }, + { + "epoch": 72.75467775467776, + "grad_norm": 0.6406662464141846, + "learning_rate": 1.907522634554104e-06, + "loss": 0.0924, + "num_input_tokens_seen": 25112088, + "step": 34995 + }, + { + "epoch": 72.76507276507276, + "grad_norm": 0.2932813763618469, + "learning_rate": 1.9037631556163337e-06, + "loss": 0.0979, + "num_input_tokens_seen": 25115672, + "step": 35000 + }, + { + "epoch": 72.76507276507276, + "eval_loss": 0.14554910361766815, + "eval_runtime": 7.7568, + "eval_samples_per_second": 110.355, + "eval_steps_per_second": 27.589, + "num_input_tokens_seen": 25115672, + "step": 35000 + }, + { + "epoch": 72.77546777546777, + "grad_norm": 0.39172232151031494, + "learning_rate": 1.9000072384085272e-06, + "loss": 0.1055, + "num_input_tokens_seen": 25119160, + "step": 35005 + }, + { + "epoch": 72.78586278586279, + "grad_norm": 0.38963842391967773, + "learning_rate": 1.8962548835098987e-06, + "loss": 0.056, + "num_input_tokens_seen": 25122712, + "step": 35010 + }, + { + "epoch": 72.79625779625779, + "grad_norm": 0.49729427695274353, + "learning_rate": 1.8925060914991077e-06, + "loss": 0.1262, + "num_input_tokens_seen": 25126168, + "step": 35015 + }, + { + "epoch": 72.8066528066528, + "grad_norm": 0.47831228375434875, + "learning_rate": 1.888760862954264e-06, + "loss": 0.1063, + "num_input_tokens_seen": 25129784, + "step": 35020 + }, + { + "epoch": 72.81704781704782, + "grad_norm": 0.2526967227458954, + "learning_rate": 1.8850191984529309e-06, + "loss": 0.0933, + "num_input_tokens_seen": 25133240, + "step": 35025 + }, + { + "epoch": 72.82744282744282, + "grad_norm": 0.26288658380508423, + "learning_rate": 1.8812810985721186e-06, + "loss": 0.0964, + "num_input_tokens_seen": 25136824, + "step": 35030 + }, + { + "epoch": 72.83783783783784, + "grad_norm": 0.8575473427772522, + "learning_rate": 1.8775465638882856e-06, + "loss": 0.1041, + "num_input_tokens_seen": 25140376, + "step": 35035 + }, + { + "epoch": 72.84823284823285, + "grad_norm": 0.1441861242055893, + "learning_rate": 1.8738155949773517e-06, + "loss": 0.0843, + "num_input_tokens_seen": 25143864, + "step": 35040 + }, + { + "epoch": 72.85862785862786, + "grad_norm": 0.49529528617858887, + "learning_rate": 1.8700881924146707e-06, + "loss": 0.1611, + "num_input_tokens_seen": 25147416, + "step": 35045 + }, + { + "epoch": 72.86902286902287, + "grad_norm": 0.2766762971878052, + "learning_rate": 1.8663643567750577e-06, + "loss": 0.1078, + "num_input_tokens_seen": 25150840, + "step": 35050 + }, + { + "epoch": 72.87941787941789, + "grad_norm": 0.16845181584358215, + "learning_rate": 1.8626440886327813e-06, + "loss": 0.056, + "num_input_tokens_seen": 25154328, + "step": 35055 + }, + { + "epoch": 72.88981288981289, + "grad_norm": 0.3207899332046509, + "learning_rate": 1.8589273885615432e-06, + "loss": 0.1202, + "num_input_tokens_seen": 25157816, + "step": 35060 + }, + { + "epoch": 72.9002079002079, + "grad_norm": 0.24580928683280945, + "learning_rate": 1.8552142571345133e-06, + "loss": 0.0583, + "num_input_tokens_seen": 25161432, + "step": 35065 + }, + { + "epoch": 72.9106029106029, + "grad_norm": 0.39110320806503296, + "learning_rate": 1.8515046949243025e-06, + "loss": 0.0577, + "num_input_tokens_seen": 25165144, + "step": 35070 + }, + { + "epoch": 72.92099792099792, + "grad_norm": 0.5580404996871948, + "learning_rate": 1.8477987025029674e-06, + "loss": 0.1347, + "num_input_tokens_seen": 25168728, + "step": 35075 + }, + { + "epoch": 72.93139293139293, + "grad_norm": 0.3609335422515869, + "learning_rate": 1.8440962804420232e-06, + "loss": 0.0635, + "num_input_tokens_seen": 25172312, + "step": 35080 + }, + { + "epoch": 72.94178794178794, + "grad_norm": 0.6500345468521118, + "learning_rate": 1.8403974293124265e-06, + "loss": 0.0785, + "num_input_tokens_seen": 25175896, + "step": 35085 + }, + { + "epoch": 72.95218295218295, + "grad_norm": 0.37331488728523254, + "learning_rate": 1.8367021496845854e-06, + "loss": 0.0816, + "num_input_tokens_seen": 25179416, + "step": 35090 + }, + { + "epoch": 72.96257796257797, + "grad_norm": 0.3652925193309784, + "learning_rate": 1.8330104421283662e-06, + "loss": 0.1184, + "num_input_tokens_seen": 25183000, + "step": 35095 + }, + { + "epoch": 72.97297297297297, + "grad_norm": 0.9193481802940369, + "learning_rate": 1.8293223072130717e-06, + "loss": 0.1546, + "num_input_tokens_seen": 25186552, + "step": 35100 + }, + { + "epoch": 72.98336798336798, + "grad_norm": 0.3232559263706207, + "learning_rate": 1.8256377455074525e-06, + "loss": 0.0766, + "num_input_tokens_seen": 25190200, + "step": 35105 + }, + { + "epoch": 72.993762993763, + "grad_norm": 0.42751628160476685, + "learning_rate": 1.8219567575797263e-06, + "loss": 0.109, + "num_input_tokens_seen": 25193688, + "step": 35110 + }, + { + "epoch": 73.004158004158, + "grad_norm": 0.2742270231246948, + "learning_rate": 1.8182793439975365e-06, + "loss": 0.1231, + "num_input_tokens_seen": 25197072, + "step": 35115 + }, + { + "epoch": 73.01455301455302, + "grad_norm": 0.4428613781929016, + "learning_rate": 1.8146055053279958e-06, + "loss": 0.1018, + "num_input_tokens_seen": 25200656, + "step": 35120 + }, + { + "epoch": 73.02494802494803, + "grad_norm": 0.6905648112297058, + "learning_rate": 1.8109352421376486e-06, + "loss": 0.0997, + "num_input_tokens_seen": 25204208, + "step": 35125 + }, + { + "epoch": 73.03534303534303, + "grad_norm": 0.8178213834762573, + "learning_rate": 1.8072685549924972e-06, + "loss": 0.1724, + "num_input_tokens_seen": 25207824, + "step": 35130 + }, + { + "epoch": 73.04573804573805, + "grad_norm": 0.5243039131164551, + "learning_rate": 1.8036054444579982e-06, + "loss": 0.1708, + "num_input_tokens_seen": 25211472, + "step": 35135 + }, + { + "epoch": 73.05613305613305, + "grad_norm": 0.5861940979957581, + "learning_rate": 1.7999459110990407e-06, + "loss": 0.0952, + "num_input_tokens_seen": 25215120, + "step": 35140 + }, + { + "epoch": 73.06652806652806, + "grad_norm": 0.5847077965736389, + "learning_rate": 1.7962899554799712e-06, + "loss": 0.0842, + "num_input_tokens_seen": 25218768, + "step": 35145 + }, + { + "epoch": 73.07692307692308, + "grad_norm": 0.1564604938030243, + "learning_rate": 1.7926375781645937e-06, + "loss": 0.1056, + "num_input_tokens_seen": 25222352, + "step": 35150 + }, + { + "epoch": 73.08731808731808, + "grad_norm": 0.21870587766170502, + "learning_rate": 1.7889887797161359e-06, + "loss": 0.0927, + "num_input_tokens_seen": 25226064, + "step": 35155 + }, + { + "epoch": 73.0977130977131, + "grad_norm": 0.37879130244255066, + "learning_rate": 1.7853435606973028e-06, + "loss": 0.1012, + "num_input_tokens_seen": 25229712, + "step": 35160 + }, + { + "epoch": 73.10810810810811, + "grad_norm": 0.43199506402015686, + "learning_rate": 1.781701921670223e-06, + "loss": 0.0502, + "num_input_tokens_seen": 25233488, + "step": 35165 + }, + { + "epoch": 73.11850311850311, + "grad_norm": 0.27852770686149597, + "learning_rate": 1.7780638631964886e-06, + "loss": 0.0753, + "num_input_tokens_seen": 25237040, + "step": 35170 + }, + { + "epoch": 73.12889812889813, + "grad_norm": 0.4938666820526123, + "learning_rate": 1.7744293858371314e-06, + "loss": 0.1171, + "num_input_tokens_seen": 25240560, + "step": 35175 + }, + { + "epoch": 73.13929313929314, + "grad_norm": 0.35998761653900146, + "learning_rate": 1.770798490152631e-06, + "loss": 0.0852, + "num_input_tokens_seen": 25244176, + "step": 35180 + }, + { + "epoch": 73.14968814968815, + "grad_norm": 0.1357109248638153, + "learning_rate": 1.767171176702917e-06, + "loss": 0.0804, + "num_input_tokens_seen": 25247728, + "step": 35185 + }, + { + "epoch": 73.16008316008316, + "grad_norm": 0.3027086853981018, + "learning_rate": 1.7635474460473755e-06, + "loss": 0.0881, + "num_input_tokens_seen": 25251376, + "step": 35190 + }, + { + "epoch": 73.17047817047818, + "grad_norm": 0.7018564939498901, + "learning_rate": 1.7599272987448206e-06, + "loss": 0.0651, + "num_input_tokens_seen": 25254864, + "step": 35195 + }, + { + "epoch": 73.18087318087318, + "grad_norm": 0.2692483067512512, + "learning_rate": 1.7563107353535362e-06, + "loss": 0.0703, + "num_input_tokens_seen": 25258416, + "step": 35200 + }, + { + "epoch": 73.18087318087318, + "eval_loss": 0.144135519862175, + "eval_runtime": 7.7678, + "eval_samples_per_second": 110.199, + "eval_steps_per_second": 27.55, + "num_input_tokens_seen": 25258416, + "step": 35200 + }, + { + "epoch": 73.1912681912682, + "grad_norm": 0.7585045099258423, + "learning_rate": 1.7526977564312263e-06, + "loss": 0.0786, + "num_input_tokens_seen": 25261936, + "step": 35205 + }, + { + "epoch": 73.20166320166321, + "grad_norm": 0.31180623173713684, + "learning_rate": 1.7490883625350701e-06, + "loss": 0.0873, + "num_input_tokens_seen": 25265456, + "step": 35210 + }, + { + "epoch": 73.21205821205821, + "grad_norm": 0.32433784008026123, + "learning_rate": 1.7454825542216807e-06, + "loss": 0.0719, + "num_input_tokens_seen": 25269136, + "step": 35215 + }, + { + "epoch": 73.22245322245323, + "grad_norm": 0.4136248528957367, + "learning_rate": 1.7418803320471105e-06, + "loss": 0.0665, + "num_input_tokens_seen": 25272496, + "step": 35220 + }, + { + "epoch": 73.23284823284823, + "grad_norm": 0.6315106153488159, + "learning_rate": 1.7382816965668737e-06, + "loss": 0.103, + "num_input_tokens_seen": 25275888, + "step": 35225 + }, + { + "epoch": 73.24324324324324, + "grad_norm": 0.45238184928894043, + "learning_rate": 1.7346866483359285e-06, + "loss": 0.1002, + "num_input_tokens_seen": 25279536, + "step": 35230 + }, + { + "epoch": 73.25363825363826, + "grad_norm": 0.509468674659729, + "learning_rate": 1.7310951879086657e-06, + "loss": 0.1171, + "num_input_tokens_seen": 25283152, + "step": 35235 + }, + { + "epoch": 73.26403326403326, + "grad_norm": 0.5899872183799744, + "learning_rate": 1.7275073158389471e-06, + "loss": 0.1455, + "num_input_tokens_seen": 25286928, + "step": 35240 + }, + { + "epoch": 73.27442827442827, + "grad_norm": 0.3442595899105072, + "learning_rate": 1.723923032680061e-06, + "loss": 0.0993, + "num_input_tokens_seen": 25290704, + "step": 35245 + }, + { + "epoch": 73.28482328482329, + "grad_norm": 0.23202143609523773, + "learning_rate": 1.7203423389847428e-06, + "loss": 0.1317, + "num_input_tokens_seen": 25294416, + "step": 35250 + }, + { + "epoch": 73.29521829521829, + "grad_norm": 0.20462630689144135, + "learning_rate": 1.7167652353051928e-06, + "loss": 0.0941, + "num_input_tokens_seen": 25298064, + "step": 35255 + }, + { + "epoch": 73.3056133056133, + "grad_norm": 0.3536522388458252, + "learning_rate": 1.7131917221930333e-06, + "loss": 0.0698, + "num_input_tokens_seen": 25301552, + "step": 35260 + }, + { + "epoch": 73.31600831600832, + "grad_norm": 0.32011082768440247, + "learning_rate": 1.7096218001993513e-06, + "loss": 0.1045, + "num_input_tokens_seen": 25305264, + "step": 35265 + }, + { + "epoch": 73.32640332640332, + "grad_norm": 0.4650821089744568, + "learning_rate": 1.706055469874676e-06, + "loss": 0.1072, + "num_input_tokens_seen": 25308848, + "step": 35270 + }, + { + "epoch": 73.33679833679834, + "grad_norm": 0.2494688630104065, + "learning_rate": 1.702492731768976e-06, + "loss": 0.099, + "num_input_tokens_seen": 25312400, + "step": 35275 + }, + { + "epoch": 73.34719334719335, + "grad_norm": 0.531369686126709, + "learning_rate": 1.6989335864316724e-06, + "loss": 0.0793, + "num_input_tokens_seen": 25316080, + "step": 35280 + }, + { + "epoch": 73.35758835758836, + "grad_norm": 0.39191409945487976, + "learning_rate": 1.6953780344116265e-06, + "loss": 0.098, + "num_input_tokens_seen": 25319728, + "step": 35285 + }, + { + "epoch": 73.36798336798337, + "grad_norm": 0.2593550384044647, + "learning_rate": 1.6918260762571497e-06, + "loss": 0.0676, + "num_input_tokens_seen": 25323376, + "step": 35290 + }, + { + "epoch": 73.37837837837837, + "grad_norm": 0.274150550365448, + "learning_rate": 1.6882777125160093e-06, + "loss": 0.0816, + "num_input_tokens_seen": 25327088, + "step": 35295 + }, + { + "epoch": 73.38877338877339, + "grad_norm": 0.34924039244651794, + "learning_rate": 1.6847329437353899e-06, + "loss": 0.0756, + "num_input_tokens_seen": 25330480, + "step": 35300 + }, + { + "epoch": 73.3991683991684, + "grad_norm": 0.3097463846206665, + "learning_rate": 1.6811917704619511e-06, + "loss": 0.1107, + "num_input_tokens_seen": 25334224, + "step": 35305 + }, + { + "epoch": 73.4095634095634, + "grad_norm": 0.4186091423034668, + "learning_rate": 1.67765419324179e-06, + "loss": 0.1164, + "num_input_tokens_seen": 25337872, + "step": 35310 + }, + { + "epoch": 73.41995841995842, + "grad_norm": 0.2782677412033081, + "learning_rate": 1.6741202126204364e-06, + "loss": 0.1262, + "num_input_tokens_seen": 25341328, + "step": 35315 + }, + { + "epoch": 73.43035343035343, + "grad_norm": 0.7586347460746765, + "learning_rate": 1.6705898291428767e-06, + "loss": 0.1029, + "num_input_tokens_seen": 25344912, + "step": 35320 + }, + { + "epoch": 73.44074844074844, + "grad_norm": 0.5067979693412781, + "learning_rate": 1.6670630433535395e-06, + "loss": 0.0885, + "num_input_tokens_seen": 25348592, + "step": 35325 + }, + { + "epoch": 73.45114345114345, + "grad_norm": 0.5439221262931824, + "learning_rate": 1.6635398557962979e-06, + "loss": 0.0709, + "num_input_tokens_seen": 25352208, + "step": 35330 + }, + { + "epoch": 73.46153846153847, + "grad_norm": 0.41558703780174255, + "learning_rate": 1.660020267014481e-06, + "loss": 0.1108, + "num_input_tokens_seen": 25355760, + "step": 35335 + }, + { + "epoch": 73.47193347193347, + "grad_norm": 0.7510110139846802, + "learning_rate": 1.6565042775508438e-06, + "loss": 0.0888, + "num_input_tokens_seen": 25359376, + "step": 35340 + }, + { + "epoch": 73.48232848232848, + "grad_norm": 0.09672967344522476, + "learning_rate": 1.6529918879475997e-06, + "loss": 0.1095, + "num_input_tokens_seen": 25362960, + "step": 35345 + }, + { + "epoch": 73.4927234927235, + "grad_norm": 0.4188491106033325, + "learning_rate": 1.6494830987464043e-06, + "loss": 0.0699, + "num_input_tokens_seen": 25366448, + "step": 35350 + }, + { + "epoch": 73.5031185031185, + "grad_norm": 0.2637839615345001, + "learning_rate": 1.6459779104883555e-06, + "loss": 0.0704, + "num_input_tokens_seen": 25370000, + "step": 35355 + }, + { + "epoch": 73.51351351351352, + "grad_norm": 0.5567712187767029, + "learning_rate": 1.6424763237140013e-06, + "loss": 0.0697, + "num_input_tokens_seen": 25373552, + "step": 35360 + }, + { + "epoch": 73.52390852390852, + "grad_norm": 0.4379713237285614, + "learning_rate": 1.6389783389633207e-06, + "loss": 0.1259, + "num_input_tokens_seen": 25377136, + "step": 35365 + }, + { + "epoch": 73.53430353430353, + "grad_norm": 0.25532710552215576, + "learning_rate": 1.6354839567757546e-06, + "loss": 0.104, + "num_input_tokens_seen": 25380592, + "step": 35370 + }, + { + "epoch": 73.54469854469855, + "grad_norm": 0.253697007894516, + "learning_rate": 1.6319931776901831e-06, + "loss": 0.0987, + "num_input_tokens_seen": 25384368, + "step": 35375 + }, + { + "epoch": 73.55509355509355, + "grad_norm": 0.19702933728694916, + "learning_rate": 1.6285060022449229e-06, + "loss": 0.1085, + "num_input_tokens_seen": 25387952, + "step": 35380 + }, + { + "epoch": 73.56548856548856, + "grad_norm": 0.24571894109249115, + "learning_rate": 1.6250224309777434e-06, + "loss": 0.0872, + "num_input_tokens_seen": 25391472, + "step": 35385 + }, + { + "epoch": 73.57588357588358, + "grad_norm": 0.36259591579437256, + "learning_rate": 1.6215424644258515e-06, + "loss": 0.1126, + "num_input_tokens_seen": 25395056, + "step": 35390 + }, + { + "epoch": 73.58627858627858, + "grad_norm": 0.19399996101856232, + "learning_rate": 1.6180661031259036e-06, + "loss": 0.0931, + "num_input_tokens_seen": 25398768, + "step": 35395 + }, + { + "epoch": 73.5966735966736, + "grad_norm": 0.4292515516281128, + "learning_rate": 1.614593347613999e-06, + "loss": 0.1256, + "num_input_tokens_seen": 25402448, + "step": 35400 + }, + { + "epoch": 73.5966735966736, + "eval_loss": 0.14428985118865967, + "eval_runtime": 7.7539, + "eval_samples_per_second": 110.396, + "eval_steps_per_second": 27.599, + "num_input_tokens_seen": 25402448, + "step": 35400 + }, + { + "epoch": 73.60706860706861, + "grad_norm": 0.740165114402771, + "learning_rate": 1.6111241984256758e-06, + "loss": 0.0807, + "num_input_tokens_seen": 25406032, + "step": 35405 + }, + { + "epoch": 73.61746361746361, + "grad_norm": 0.8791202306747437, + "learning_rate": 1.6076586560959257e-06, + "loss": 0.0981, + "num_input_tokens_seen": 25409424, + "step": 35410 + }, + { + "epoch": 73.62785862785863, + "grad_norm": 0.6431408524513245, + "learning_rate": 1.604196721159182e-06, + "loss": 0.0845, + "num_input_tokens_seen": 25412944, + "step": 35415 + }, + { + "epoch": 73.63825363825364, + "grad_norm": 0.17678521573543549, + "learning_rate": 1.6007383941493092e-06, + "loss": 0.1097, + "num_input_tokens_seen": 25416368, + "step": 35420 + }, + { + "epoch": 73.64864864864865, + "grad_norm": 0.40937039256095886, + "learning_rate": 1.5972836755996285e-06, + "loss": 0.1229, + "num_input_tokens_seen": 25419856, + "step": 35425 + }, + { + "epoch": 73.65904365904366, + "grad_norm": 0.5008718967437744, + "learning_rate": 1.5938325660429076e-06, + "loss": 0.1057, + "num_input_tokens_seen": 25423248, + "step": 35430 + }, + { + "epoch": 73.66943866943868, + "grad_norm": 0.16772280633449554, + "learning_rate": 1.5903850660113378e-06, + "loss": 0.0758, + "num_input_tokens_seen": 25426864, + "step": 35435 + }, + { + "epoch": 73.67983367983368, + "grad_norm": 0.4282515347003937, + "learning_rate": 1.5869411760365826e-06, + "loss": 0.1045, + "num_input_tokens_seen": 25430384, + "step": 35440 + }, + { + "epoch": 73.6902286902287, + "grad_norm": 0.2195986956357956, + "learning_rate": 1.58350089664972e-06, + "loss": 0.0828, + "num_input_tokens_seen": 25433968, + "step": 35445 + }, + { + "epoch": 73.7006237006237, + "grad_norm": 0.7628039121627808, + "learning_rate": 1.5800642283812865e-06, + "loss": 0.1125, + "num_input_tokens_seen": 25437520, + "step": 35450 + }, + { + "epoch": 73.71101871101871, + "grad_norm": 0.7825194001197815, + "learning_rate": 1.5766311717612698e-06, + "loss": 0.0857, + "num_input_tokens_seen": 25441104, + "step": 35455 + }, + { + "epoch": 73.72141372141373, + "grad_norm": 0.22934921085834503, + "learning_rate": 1.5732017273190818e-06, + "loss": 0.0955, + "num_input_tokens_seen": 25444624, + "step": 35460 + }, + { + "epoch": 73.73180873180873, + "grad_norm": 0.2378111034631729, + "learning_rate": 1.5697758955835806e-06, + "loss": 0.1212, + "num_input_tokens_seen": 25448400, + "step": 35465 + }, + { + "epoch": 73.74220374220374, + "grad_norm": 0.583976149559021, + "learning_rate": 1.566353677083085e-06, + "loss": 0.1211, + "num_input_tokens_seen": 25452016, + "step": 35470 + }, + { + "epoch": 73.75259875259876, + "grad_norm": 0.17179980874061584, + "learning_rate": 1.562935072345334e-06, + "loss": 0.1182, + "num_input_tokens_seen": 25455728, + "step": 35475 + }, + { + "epoch": 73.76299376299376, + "grad_norm": 0.25902095437049866, + "learning_rate": 1.5595200818975281e-06, + "loss": 0.0722, + "num_input_tokens_seen": 25459120, + "step": 35480 + }, + { + "epoch": 73.77338877338877, + "grad_norm": 0.9900491237640381, + "learning_rate": 1.5561087062662905e-06, + "loss": 0.1141, + "num_input_tokens_seen": 25462768, + "step": 35485 + }, + { + "epoch": 73.78378378378379, + "grad_norm": 0.24884870648384094, + "learning_rate": 1.5527009459777087e-06, + "loss": 0.0767, + "num_input_tokens_seen": 25466320, + "step": 35490 + }, + { + "epoch": 73.79417879417879, + "grad_norm": 0.33033278584480286, + "learning_rate": 1.5492968015572984e-06, + "loss": 0.1358, + "num_input_tokens_seen": 25470128, + "step": 35495 + }, + { + "epoch": 73.8045738045738, + "grad_norm": 0.44821488857269287, + "learning_rate": 1.5458962735300203e-06, + "loss": 0.0895, + "num_input_tokens_seen": 25473744, + "step": 35500 + }, + { + "epoch": 73.81496881496882, + "grad_norm": 0.17152291536331177, + "learning_rate": 1.54249936242028e-06, + "loss": 0.0798, + "num_input_tokens_seen": 25477232, + "step": 35505 + }, + { + "epoch": 73.82536382536382, + "grad_norm": 0.599461019039154, + "learning_rate": 1.5391060687519222e-06, + "loss": 0.1139, + "num_input_tokens_seen": 25480752, + "step": 35510 + }, + { + "epoch": 73.83575883575884, + "grad_norm": 0.2509579062461853, + "learning_rate": 1.5357163930482367e-06, + "loss": 0.108, + "num_input_tokens_seen": 25484400, + "step": 35515 + }, + { + "epoch": 73.84615384615384, + "grad_norm": 0.3431266248226166, + "learning_rate": 1.532330335831955e-06, + "loss": 0.1309, + "num_input_tokens_seen": 25487984, + "step": 35520 + }, + { + "epoch": 73.85654885654886, + "grad_norm": 0.8043636679649353, + "learning_rate": 1.5289478976252491e-06, + "loss": 0.0797, + "num_input_tokens_seen": 25491536, + "step": 35525 + }, + { + "epoch": 73.86694386694387, + "grad_norm": 0.2538159489631653, + "learning_rate": 1.5255690789497345e-06, + "loss": 0.0897, + "num_input_tokens_seen": 25495056, + "step": 35530 + }, + { + "epoch": 73.87733887733887, + "grad_norm": 0.3766998052597046, + "learning_rate": 1.5221938803264641e-06, + "loss": 0.0784, + "num_input_tokens_seen": 25498736, + "step": 35535 + }, + { + "epoch": 73.88773388773389, + "grad_norm": 1.4961750507354736, + "learning_rate": 1.518822302275938e-06, + "loss": 0.0963, + "num_input_tokens_seen": 25502320, + "step": 35540 + }, + { + "epoch": 73.8981288981289, + "grad_norm": 0.12521354854106903, + "learning_rate": 1.5154543453180958e-06, + "loss": 0.0693, + "num_input_tokens_seen": 25505840, + "step": 35545 + }, + { + "epoch": 73.9085239085239, + "grad_norm": 0.17507268488407135, + "learning_rate": 1.5120900099723167e-06, + "loss": 0.0495, + "num_input_tokens_seen": 25509264, + "step": 35550 + }, + { + "epoch": 73.91891891891892, + "grad_norm": 0.257232129573822, + "learning_rate": 1.5087292967574273e-06, + "loss": 0.1329, + "num_input_tokens_seen": 25512752, + "step": 35555 + }, + { + "epoch": 73.92931392931393, + "grad_norm": 0.5699002146720886, + "learning_rate": 1.5053722061916908e-06, + "loss": 0.0824, + "num_input_tokens_seen": 25516240, + "step": 35560 + }, + { + "epoch": 73.93970893970894, + "grad_norm": 0.15162748098373413, + "learning_rate": 1.5020187387928124e-06, + "loss": 0.0827, + "num_input_tokens_seen": 25519920, + "step": 35565 + }, + { + "epoch": 73.95010395010395, + "grad_norm": 0.5897905230522156, + "learning_rate": 1.4986688950779343e-06, + "loss": 0.101, + "num_input_tokens_seen": 25523568, + "step": 35570 + }, + { + "epoch": 73.96049896049897, + "grad_norm": 0.7508299946784973, + "learning_rate": 1.495322675563654e-06, + "loss": 0.1114, + "num_input_tokens_seen": 25527376, + "step": 35575 + }, + { + "epoch": 73.97089397089397, + "grad_norm": 0.31654369831085205, + "learning_rate": 1.4919800807659922e-06, + "loss": 0.1562, + "num_input_tokens_seen": 25530960, + "step": 35580 + }, + { + "epoch": 73.98128898128898, + "grad_norm": 1.0889211893081665, + "learning_rate": 1.4886411112004255e-06, + "loss": 0.0789, + "num_input_tokens_seen": 25534608, + "step": 35585 + }, + { + "epoch": 73.99168399168398, + "grad_norm": 0.5232087969779968, + "learning_rate": 1.4853057673818588e-06, + "loss": 0.0822, + "num_input_tokens_seen": 25538160, + "step": 35590 + }, + { + "epoch": 74.002079002079, + "grad_norm": 0.135267436504364, + "learning_rate": 1.481974049824647e-06, + "loss": 0.0659, + "num_input_tokens_seen": 25541608, + "step": 35595 + }, + { + "epoch": 74.01247401247402, + "grad_norm": 0.5607352256774902, + "learning_rate": 1.4786459590425849e-06, + "loss": 0.1286, + "num_input_tokens_seen": 25545128, + "step": 35600 + }, + { + "epoch": 74.01247401247402, + "eval_loss": 0.1445075124502182, + "eval_runtime": 7.7465, + "eval_samples_per_second": 110.502, + "eval_steps_per_second": 27.625, + "num_input_tokens_seen": 25545128, + "step": 35600 + }, + { + "epoch": 74.02286902286902, + "grad_norm": 0.3032344877719879, + "learning_rate": 1.4753214955489036e-06, + "loss": 0.0843, + "num_input_tokens_seen": 25548744, + "step": 35605 + }, + { + "epoch": 74.03326403326403, + "grad_norm": 0.4270707368850708, + "learning_rate": 1.4720006598562737e-06, + "loss": 0.1316, + "num_input_tokens_seen": 25552328, + "step": 35610 + }, + { + "epoch": 74.04365904365905, + "grad_norm": 0.328992635011673, + "learning_rate": 1.4686834524768185e-06, + "loss": 0.0862, + "num_input_tokens_seen": 25555912, + "step": 35615 + }, + { + "epoch": 74.05405405405405, + "grad_norm": 1.073219895362854, + "learning_rate": 1.4653698739220844e-06, + "loss": 0.0757, + "num_input_tokens_seen": 25559400, + "step": 35620 + }, + { + "epoch": 74.06444906444906, + "grad_norm": 0.25810733437538147, + "learning_rate": 1.4620599247030715e-06, + "loss": 0.0879, + "num_input_tokens_seen": 25563048, + "step": 35625 + }, + { + "epoch": 74.07484407484408, + "grad_norm": 0.339842826128006, + "learning_rate": 1.4587536053302125e-06, + "loss": 0.1011, + "num_input_tokens_seen": 25566632, + "step": 35630 + }, + { + "epoch": 74.08523908523908, + "grad_norm": 0.19333377480506897, + "learning_rate": 1.4554509163133862e-06, + "loss": 0.0773, + "num_input_tokens_seen": 25570376, + "step": 35635 + }, + { + "epoch": 74.0956340956341, + "grad_norm": 0.40666428208351135, + "learning_rate": 1.4521518581619098e-06, + "loss": 0.1138, + "num_input_tokens_seen": 25573896, + "step": 35640 + }, + { + "epoch": 74.10602910602911, + "grad_norm": 0.2616701126098633, + "learning_rate": 1.4488564313845348e-06, + "loss": 0.0634, + "num_input_tokens_seen": 25577416, + "step": 35645 + }, + { + "epoch": 74.11642411642411, + "grad_norm": 0.14945422112941742, + "learning_rate": 1.4455646364894603e-06, + "loss": 0.1054, + "num_input_tokens_seen": 25581064, + "step": 35650 + }, + { + "epoch": 74.12681912681913, + "grad_norm": 0.522936999797821, + "learning_rate": 1.4422764739843247e-06, + "loss": 0.0723, + "num_input_tokens_seen": 25584712, + "step": 35655 + }, + { + "epoch": 74.13721413721414, + "grad_norm": 0.9500348567962646, + "learning_rate": 1.4389919443762e-06, + "loss": 0.1423, + "num_input_tokens_seen": 25588392, + "step": 35660 + }, + { + "epoch": 74.14760914760915, + "grad_norm": 0.7418513298034668, + "learning_rate": 1.4357110481716063e-06, + "loss": 0.0952, + "num_input_tokens_seen": 25592072, + "step": 35665 + }, + { + "epoch": 74.15800415800416, + "grad_norm": 0.1978106051683426, + "learning_rate": 1.4324337858764941e-06, + "loss": 0.1153, + "num_input_tokens_seen": 25595560, + "step": 35670 + }, + { + "epoch": 74.16839916839916, + "grad_norm": 0.22974249720573425, + "learning_rate": 1.4291601579962622e-06, + "loss": 0.0971, + "num_input_tokens_seen": 25599080, + "step": 35675 + }, + { + "epoch": 74.17879417879418, + "grad_norm": 0.3084312677383423, + "learning_rate": 1.42589016503574e-06, + "loss": 0.11, + "num_input_tokens_seen": 25602696, + "step": 35680 + }, + { + "epoch": 74.1891891891892, + "grad_norm": 0.26686152815818787, + "learning_rate": 1.4226238074992099e-06, + "loss": 0.0699, + "num_input_tokens_seen": 25606056, + "step": 35685 + }, + { + "epoch": 74.1995841995842, + "grad_norm": 0.7290120124816895, + "learning_rate": 1.4193610858903778e-06, + "loss": 0.094, + "num_input_tokens_seen": 25609704, + "step": 35690 + }, + { + "epoch": 74.20997920997921, + "grad_norm": 1.0175933837890625, + "learning_rate": 1.416102000712402e-06, + "loss": 0.0694, + "num_input_tokens_seen": 25613416, + "step": 35695 + }, + { + "epoch": 74.22037422037423, + "grad_norm": 0.4348863661289215, + "learning_rate": 1.4128465524678668e-06, + "loss": 0.0579, + "num_input_tokens_seen": 25616936, + "step": 35700 + }, + { + "epoch": 74.23076923076923, + "grad_norm": 1.039973497390747, + "learning_rate": 1.4095947416588124e-06, + "loss": 0.098, + "num_input_tokens_seen": 25620552, + "step": 35705 + }, + { + "epoch": 74.24116424116424, + "grad_norm": 0.41715550422668457, + "learning_rate": 1.4063465687866983e-06, + "loss": 0.1043, + "num_input_tokens_seen": 25624296, + "step": 35710 + }, + { + "epoch": 74.25155925155926, + "grad_norm": 0.4261985421180725, + "learning_rate": 1.4031020343524438e-06, + "loss": 0.117, + "num_input_tokens_seen": 25627944, + "step": 35715 + }, + { + "epoch": 74.26195426195426, + "grad_norm": 0.4004250466823578, + "learning_rate": 1.3998611388563926e-06, + "loss": 0.1263, + "num_input_tokens_seen": 25631528, + "step": 35720 + }, + { + "epoch": 74.27234927234927, + "grad_norm": 0.39366719126701355, + "learning_rate": 1.3966238827983314e-06, + "loss": 0.127, + "num_input_tokens_seen": 25635208, + "step": 35725 + }, + { + "epoch": 74.28274428274429, + "grad_norm": 0.5442487001419067, + "learning_rate": 1.393390266677483e-06, + "loss": 0.076, + "num_input_tokens_seen": 25638792, + "step": 35730 + }, + { + "epoch": 74.29313929313929, + "grad_norm": 0.2136932760477066, + "learning_rate": 1.3901602909925204e-06, + "loss": 0.0773, + "num_input_tokens_seen": 25642376, + "step": 35735 + }, + { + "epoch": 74.3035343035343, + "grad_norm": 0.6369443535804749, + "learning_rate": 1.3869339562415373e-06, + "loss": 0.1142, + "num_input_tokens_seen": 25645960, + "step": 35740 + }, + { + "epoch": 74.31392931392931, + "grad_norm": 1.2187749147415161, + "learning_rate": 1.38371126292208e-06, + "loss": 0.1115, + "num_input_tokens_seen": 25649576, + "step": 35745 + }, + { + "epoch": 74.32432432432432, + "grad_norm": 0.1367160826921463, + "learning_rate": 1.3804922115311286e-06, + "loss": 0.0716, + "num_input_tokens_seen": 25653288, + "step": 35750 + }, + { + "epoch": 74.33471933471934, + "grad_norm": 0.16954150795936584, + "learning_rate": 1.3772768025650945e-06, + "loss": 0.1081, + "num_input_tokens_seen": 25656648, + "step": 35755 + }, + { + "epoch": 74.34511434511434, + "grad_norm": 0.8760733008384705, + "learning_rate": 1.3740650365198448e-06, + "loss": 0.0746, + "num_input_tokens_seen": 25660264, + "step": 35760 + }, + { + "epoch": 74.35550935550935, + "grad_norm": 0.5306768417358398, + "learning_rate": 1.3708569138906612e-06, + "loss": 0.0901, + "num_input_tokens_seen": 25663784, + "step": 35765 + }, + { + "epoch": 74.36590436590437, + "grad_norm": 0.5067792534828186, + "learning_rate": 1.367652435172287e-06, + "loss": 0.1132, + "num_input_tokens_seen": 25667400, + "step": 35770 + }, + { + "epoch": 74.37629937629937, + "grad_norm": 0.9828143119812012, + "learning_rate": 1.364451600858893e-06, + "loss": 0.0996, + "num_input_tokens_seen": 25670888, + "step": 35775 + }, + { + "epoch": 74.38669438669439, + "grad_norm": 0.6633570194244385, + "learning_rate": 1.3612544114440823e-06, + "loss": 0.1301, + "num_input_tokens_seen": 25674408, + "step": 35780 + }, + { + "epoch": 74.3970893970894, + "grad_norm": 0.7320827841758728, + "learning_rate": 1.3580608674209072e-06, + "loss": 0.0907, + "num_input_tokens_seen": 25677960, + "step": 35785 + }, + { + "epoch": 74.4074844074844, + "grad_norm": 0.2737039029598236, + "learning_rate": 1.3548709692818434e-06, + "loss": 0.0811, + "num_input_tokens_seen": 25681448, + "step": 35790 + }, + { + "epoch": 74.41787941787942, + "grad_norm": 0.6349795460700989, + "learning_rate": 1.3516847175188223e-06, + "loss": 0.1107, + "num_input_tokens_seen": 25684872, + "step": 35795 + }, + { + "epoch": 74.42827442827443, + "grad_norm": 0.4216265082359314, + "learning_rate": 1.348502112623204e-06, + "loss": 0.1168, + "num_input_tokens_seen": 25688392, + "step": 35800 + }, + { + "epoch": 74.42827442827443, + "eval_loss": 0.1452970653772354, + "eval_runtime": 7.7513, + "eval_samples_per_second": 110.433, + "eval_steps_per_second": 27.608, + "num_input_tokens_seen": 25688392, + "step": 35800 + }, + { + "epoch": 74.43866943866944, + "grad_norm": 0.5513052344322205, + "learning_rate": 1.3453231550857787e-06, + "loss": 0.0451, + "num_input_tokens_seen": 25691784, + "step": 35805 + }, + { + "epoch": 74.44906444906445, + "grad_norm": 0.8163275718688965, + "learning_rate": 1.3421478453967878e-06, + "loss": 0.1182, + "num_input_tokens_seen": 25695368, + "step": 35810 + }, + { + "epoch": 74.45945945945945, + "grad_norm": 0.7885404825210571, + "learning_rate": 1.3389761840459065e-06, + "loss": 0.1108, + "num_input_tokens_seen": 25699208, + "step": 35815 + }, + { + "epoch": 74.46985446985447, + "grad_norm": 0.2267576903104782, + "learning_rate": 1.3358081715222376e-06, + "loss": 0.0716, + "num_input_tokens_seen": 25702600, + "step": 35820 + }, + { + "epoch": 74.48024948024948, + "grad_norm": 0.2864640951156616, + "learning_rate": 1.3326438083143295e-06, + "loss": 0.0733, + "num_input_tokens_seen": 25706120, + "step": 35825 + }, + { + "epoch": 74.49064449064448, + "grad_norm": 0.334145724773407, + "learning_rate": 1.3294830949101723e-06, + "loss": 0.0904, + "num_input_tokens_seen": 25709736, + "step": 35830 + }, + { + "epoch": 74.5010395010395, + "grad_norm": 0.30602768063545227, + "learning_rate": 1.3263260317971815e-06, + "loss": 0.1142, + "num_input_tokens_seen": 25713192, + "step": 35835 + }, + { + "epoch": 74.51143451143452, + "grad_norm": 0.31057819724082947, + "learning_rate": 1.3231726194622208e-06, + "loss": 0.0908, + "num_input_tokens_seen": 25716648, + "step": 35840 + }, + { + "epoch": 74.52182952182952, + "grad_norm": 0.47777479887008667, + "learning_rate": 1.3200228583915814e-06, + "loss": 0.0603, + "num_input_tokens_seen": 25720200, + "step": 35845 + }, + { + "epoch": 74.53222453222453, + "grad_norm": 0.25621214509010315, + "learning_rate": 1.3168767490709971e-06, + "loss": 0.0797, + "num_input_tokens_seen": 25723912, + "step": 35850 + }, + { + "epoch": 74.54261954261955, + "grad_norm": 0.796735942363739, + "learning_rate": 1.3137342919856437e-06, + "loss": 0.084, + "num_input_tokens_seen": 25727592, + "step": 35855 + }, + { + "epoch": 74.55301455301455, + "grad_norm": 0.12648163735866547, + "learning_rate": 1.310595487620117e-06, + "loss": 0.0492, + "num_input_tokens_seen": 25730888, + "step": 35860 + }, + { + "epoch": 74.56340956340956, + "grad_norm": 0.20072618126869202, + "learning_rate": 1.3074603364584715e-06, + "loss": 0.0872, + "num_input_tokens_seen": 25734568, + "step": 35865 + }, + { + "epoch": 74.57380457380458, + "grad_norm": 0.2522907853126526, + "learning_rate": 1.3043288389841758e-06, + "loss": 0.1069, + "num_input_tokens_seen": 25738216, + "step": 35870 + }, + { + "epoch": 74.58419958419958, + "grad_norm": 0.62708580493927, + "learning_rate": 1.3012009956801546e-06, + "loss": 0.0716, + "num_input_tokens_seen": 25741864, + "step": 35875 + }, + { + "epoch": 74.5945945945946, + "grad_norm": 0.3587857782840729, + "learning_rate": 1.2980768070287586e-06, + "loss": 0.1093, + "num_input_tokens_seen": 25745448, + "step": 35880 + }, + { + "epoch": 74.60498960498961, + "grad_norm": 0.2985992133617401, + "learning_rate": 1.2949562735117716e-06, + "loss": 0.1085, + "num_input_tokens_seen": 25749032, + "step": 35885 + }, + { + "epoch": 74.61538461538461, + "grad_norm": 0.17433921992778778, + "learning_rate": 1.291839395610428e-06, + "loss": 0.056, + "num_input_tokens_seen": 25752456, + "step": 35890 + }, + { + "epoch": 74.62577962577963, + "grad_norm": 0.18642163276672363, + "learning_rate": 1.2887261738053852e-06, + "loss": 0.1311, + "num_input_tokens_seen": 25756072, + "step": 35895 + }, + { + "epoch": 74.63617463617463, + "grad_norm": 0.5804616808891296, + "learning_rate": 1.2856166085767396e-06, + "loss": 0.123, + "num_input_tokens_seen": 25759720, + "step": 35900 + }, + { + "epoch": 74.64656964656965, + "grad_norm": 0.3641791045665741, + "learning_rate": 1.2825107004040272e-06, + "loss": 0.1118, + "num_input_tokens_seen": 25763176, + "step": 35905 + }, + { + "epoch": 74.65696465696466, + "grad_norm": 0.36912432312965393, + "learning_rate": 1.2794084497662146e-06, + "loss": 0.1012, + "num_input_tokens_seen": 25766760, + "step": 35910 + }, + { + "epoch": 74.66735966735966, + "grad_norm": 0.4160557985305786, + "learning_rate": 1.276309857141711e-06, + "loss": 0.0871, + "num_input_tokens_seen": 25770280, + "step": 35915 + }, + { + "epoch": 74.67775467775468, + "grad_norm": 0.5582438707351685, + "learning_rate": 1.273214923008359e-06, + "loss": 0.0944, + "num_input_tokens_seen": 25773896, + "step": 35920 + }, + { + "epoch": 74.6881496881497, + "grad_norm": 0.244964599609375, + "learning_rate": 1.2701236478434352e-06, + "loss": 0.0979, + "num_input_tokens_seen": 25777640, + "step": 35925 + }, + { + "epoch": 74.6985446985447, + "grad_norm": 0.6298336982727051, + "learning_rate": 1.2670360321236502e-06, + "loss": 0.1196, + "num_input_tokens_seen": 25781320, + "step": 35930 + }, + { + "epoch": 74.70893970893971, + "grad_norm": 0.5382012724876404, + "learning_rate": 1.2639520763251617e-06, + "loss": 0.0937, + "num_input_tokens_seen": 25785032, + "step": 35935 + }, + { + "epoch": 74.71933471933473, + "grad_norm": 0.3652869164943695, + "learning_rate": 1.2608717809235448e-06, + "loss": 0.1161, + "num_input_tokens_seen": 25788648, + "step": 35940 + }, + { + "epoch": 74.72972972972973, + "grad_norm": 0.7426539659500122, + "learning_rate": 1.2577951463938282e-06, + "loss": 0.0983, + "num_input_tokens_seen": 25792136, + "step": 35945 + }, + { + "epoch": 74.74012474012474, + "grad_norm": 1.1784684658050537, + "learning_rate": 1.2547221732104569e-06, + "loss": 0.0858, + "num_input_tokens_seen": 25795752, + "step": 35950 + }, + { + "epoch": 74.75051975051976, + "grad_norm": 1.3427826166152954, + "learning_rate": 1.25165286184733e-06, + "loss": 0.0751, + "num_input_tokens_seen": 25799464, + "step": 35955 + }, + { + "epoch": 74.76091476091476, + "grad_norm": 0.3482956886291504, + "learning_rate": 1.248587212777777e-06, + "loss": 0.0937, + "num_input_tokens_seen": 25803112, + "step": 35960 + }, + { + "epoch": 74.77130977130977, + "grad_norm": 0.32645729184150696, + "learning_rate": 1.2455252264745532e-06, + "loss": 0.1065, + "num_input_tokens_seen": 25806568, + "step": 35965 + }, + { + "epoch": 74.78170478170478, + "grad_norm": 0.098745197057724, + "learning_rate": 1.2424669034098528e-06, + "loss": 0.1139, + "num_input_tokens_seen": 25810184, + "step": 35970 + }, + { + "epoch": 74.79209979209979, + "grad_norm": 0.5340112447738647, + "learning_rate": 1.2394122440553185e-06, + "loss": 0.1005, + "num_input_tokens_seen": 25813768, + "step": 35975 + }, + { + "epoch": 74.8024948024948, + "grad_norm": 0.3976612091064453, + "learning_rate": 1.2363612488820037e-06, + "loss": 0.085, + "num_input_tokens_seen": 25817288, + "step": 35980 + }, + { + "epoch": 74.81288981288981, + "grad_norm": 0.19947415590286255, + "learning_rate": 1.2333139183604208e-06, + "loss": 0.101, + "num_input_tokens_seen": 25820808, + "step": 35985 + }, + { + "epoch": 74.82328482328482, + "grad_norm": 0.452963262796402, + "learning_rate": 1.2302702529604998e-06, + "loss": 0.0721, + "num_input_tokens_seen": 25824488, + "step": 35990 + }, + { + "epoch": 74.83367983367984, + "grad_norm": 0.7871155142784119, + "learning_rate": 1.227230253151615e-06, + "loss": 0.106, + "num_input_tokens_seen": 25828136, + "step": 35995 + }, + { + "epoch": 74.84407484407484, + "grad_norm": 0.27144762873649597, + "learning_rate": 1.2241939194025748e-06, + "loss": 0.1085, + "num_input_tokens_seen": 25831720, + "step": 36000 + }, + { + "epoch": 74.84407484407484, + "eval_loss": 0.1452951580286026, + "eval_runtime": 7.746, + "eval_samples_per_second": 110.508, + "eval_steps_per_second": 27.627, + "num_input_tokens_seen": 25831720, + "step": 36000 + }, + { + "epoch": 74.85446985446985, + "grad_norm": 0.6118495464324951, + "learning_rate": 1.2211612521816156e-06, + "loss": 0.1364, + "num_input_tokens_seen": 25835240, + "step": 36005 + }, + { + "epoch": 74.86486486486487, + "grad_norm": 0.5306423306465149, + "learning_rate": 1.2181322519564137e-06, + "loss": 0.0652, + "num_input_tokens_seen": 25839016, + "step": 36010 + }, + { + "epoch": 74.87525987525987, + "grad_norm": 0.630932092666626, + "learning_rate": 1.2151069191940839e-06, + "loss": 0.0831, + "num_input_tokens_seen": 25842664, + "step": 36015 + }, + { + "epoch": 74.88565488565489, + "grad_norm": 0.45092421770095825, + "learning_rate": 1.2120852543611644e-06, + "loss": 0.148, + "num_input_tokens_seen": 25846280, + "step": 36020 + }, + { + "epoch": 74.8960498960499, + "grad_norm": 0.5061975717544556, + "learning_rate": 1.2090672579236379e-06, + "loss": 0.1243, + "num_input_tokens_seen": 25849960, + "step": 36025 + }, + { + "epoch": 74.9064449064449, + "grad_norm": 0.31057536602020264, + "learning_rate": 1.2060529303469126e-06, + "loss": 0.1163, + "num_input_tokens_seen": 25853608, + "step": 36030 + }, + { + "epoch": 74.91683991683992, + "grad_norm": 0.30489134788513184, + "learning_rate": 1.2030422720958445e-06, + "loss": 0.0904, + "num_input_tokens_seen": 25857352, + "step": 36035 + }, + { + "epoch": 74.92723492723492, + "grad_norm": 0.4541667699813843, + "learning_rate": 1.200035283634704e-06, + "loss": 0.0648, + "num_input_tokens_seen": 25860712, + "step": 36040 + }, + { + "epoch": 74.93762993762994, + "grad_norm": 0.18394000828266144, + "learning_rate": 1.1970319654272144e-06, + "loss": 0.0963, + "num_input_tokens_seen": 25864264, + "step": 36045 + }, + { + "epoch": 74.94802494802495, + "grad_norm": 0.1314752846956253, + "learning_rate": 1.1940323179365192e-06, + "loss": 0.0869, + "num_input_tokens_seen": 25867816, + "step": 36050 + }, + { + "epoch": 74.95841995841995, + "grad_norm": 0.17122294008731842, + "learning_rate": 1.1910363416252095e-06, + "loss": 0.0995, + "num_input_tokens_seen": 25871592, + "step": 36055 + }, + { + "epoch": 74.96881496881497, + "grad_norm": 0.3883211612701416, + "learning_rate": 1.1880440369552964e-06, + "loss": 0.1118, + "num_input_tokens_seen": 25875080, + "step": 36060 + }, + { + "epoch": 74.97920997920998, + "grad_norm": 0.6575104594230652, + "learning_rate": 1.1850554043882328e-06, + "loss": 0.146, + "num_input_tokens_seen": 25878888, + "step": 36065 + }, + { + "epoch": 74.98960498960498, + "grad_norm": 0.2880760133266449, + "learning_rate": 1.1820704443849028e-06, + "loss": 0.0854, + "num_input_tokens_seen": 25882632, + "step": 36070 + }, + { + "epoch": 75.0, + "grad_norm": 0.20517905056476593, + "learning_rate": 1.1790891574056219e-06, + "loss": 0.0943, + "num_input_tokens_seen": 25886168, + "step": 36075 + }, + { + "epoch": 75.01039501039502, + "grad_norm": 0.3445926606655121, + "learning_rate": 1.1761115439101523e-06, + "loss": 0.086, + "num_input_tokens_seen": 25889752, + "step": 36080 + }, + { + "epoch": 75.02079002079002, + "grad_norm": 0.6955698132514954, + "learning_rate": 1.1731376043576659e-06, + "loss": 0.1156, + "num_input_tokens_seen": 25893304, + "step": 36085 + }, + { + "epoch": 75.03118503118503, + "grad_norm": 0.8564364910125732, + "learning_rate": 1.1701673392067875e-06, + "loss": 0.1349, + "num_input_tokens_seen": 25897016, + "step": 36090 + }, + { + "epoch": 75.04158004158005, + "grad_norm": 0.12329817563295364, + "learning_rate": 1.1672007489155757e-06, + "loss": 0.1126, + "num_input_tokens_seen": 25900504, + "step": 36095 + }, + { + "epoch": 75.05197505197505, + "grad_norm": 0.6114428043365479, + "learning_rate": 1.164237833941506e-06, + "loss": 0.0813, + "num_input_tokens_seen": 25904056, + "step": 36100 + }, + { + "epoch": 75.06237006237006, + "grad_norm": 0.7426053285598755, + "learning_rate": 1.1612785947415022e-06, + "loss": 0.0986, + "num_input_tokens_seen": 25907672, + "step": 36105 + }, + { + "epoch": 75.07276507276508, + "grad_norm": 0.4602925479412079, + "learning_rate": 1.1583230317719185e-06, + "loss": 0.0937, + "num_input_tokens_seen": 25911096, + "step": 36110 + }, + { + "epoch": 75.08316008316008, + "grad_norm": 0.19090373814105988, + "learning_rate": 1.1553711454885318e-06, + "loss": 0.1024, + "num_input_tokens_seen": 25914712, + "step": 36115 + }, + { + "epoch": 75.0935550935551, + "grad_norm": 0.3642382025718689, + "learning_rate": 1.152422936346567e-06, + "loss": 0.0933, + "num_input_tokens_seen": 25918360, + "step": 36120 + }, + { + "epoch": 75.1039501039501, + "grad_norm": 0.4425069987773895, + "learning_rate": 1.1494784048006718e-06, + "loss": 0.1023, + "num_input_tokens_seen": 25922136, + "step": 36125 + }, + { + "epoch": 75.11434511434511, + "grad_norm": 0.3472929894924164, + "learning_rate": 1.1465375513049326e-06, + "loss": 0.0646, + "num_input_tokens_seen": 25925656, + "step": 36130 + }, + { + "epoch": 75.12474012474013, + "grad_norm": 1.2784852981567383, + "learning_rate": 1.1436003763128616e-06, + "loss": 0.1733, + "num_input_tokens_seen": 25929240, + "step": 36135 + }, + { + "epoch": 75.13513513513513, + "grad_norm": 0.2751370668411255, + "learning_rate": 1.1406668802774106e-06, + "loss": 0.1183, + "num_input_tokens_seen": 25932856, + "step": 36140 + }, + { + "epoch": 75.14553014553015, + "grad_norm": 0.21135534346103668, + "learning_rate": 1.137737063650965e-06, + "loss": 0.1119, + "num_input_tokens_seen": 25936600, + "step": 36145 + }, + { + "epoch": 75.15592515592516, + "grad_norm": 0.22200851142406464, + "learning_rate": 1.1348109268853323e-06, + "loss": 0.0701, + "num_input_tokens_seen": 25940408, + "step": 36150 + }, + { + "epoch": 75.16632016632016, + "grad_norm": 0.524969220161438, + "learning_rate": 1.1318884704317634e-06, + "loss": 0.072, + "num_input_tokens_seen": 25943864, + "step": 36155 + }, + { + "epoch": 75.17671517671518, + "grad_norm": 0.6148973107337952, + "learning_rate": 1.1289696947409417e-06, + "loss": 0.1112, + "num_input_tokens_seen": 25947480, + "step": 36160 + }, + { + "epoch": 75.18711018711019, + "grad_norm": 0.21006810665130615, + "learning_rate": 1.126054600262974e-06, + "loss": 0.0791, + "num_input_tokens_seen": 25951032, + "step": 36165 + }, + { + "epoch": 75.1975051975052, + "grad_norm": 0.26116347312927246, + "learning_rate": 1.1231431874474064e-06, + "loss": 0.0668, + "num_input_tokens_seen": 25954712, + "step": 36170 + }, + { + "epoch": 75.20790020790021, + "grad_norm": 0.3989645838737488, + "learning_rate": 1.12023545674321e-06, + "loss": 0.1014, + "num_input_tokens_seen": 25958328, + "step": 36175 + }, + { + "epoch": 75.21829521829522, + "grad_norm": 0.5601198673248291, + "learning_rate": 1.117331408598804e-06, + "loss": 0.1137, + "num_input_tokens_seen": 25961848, + "step": 36180 + }, + { + "epoch": 75.22869022869023, + "grad_norm": 0.5707024931907654, + "learning_rate": 1.1144310434620191e-06, + "loss": 0.0999, + "num_input_tokens_seen": 25965528, + "step": 36185 + }, + { + "epoch": 75.23908523908524, + "grad_norm": 0.21127991378307343, + "learning_rate": 1.1115343617801365e-06, + "loss": 0.092, + "num_input_tokens_seen": 25969016, + "step": 36190 + }, + { + "epoch": 75.24948024948024, + "grad_norm": 0.2888876795768738, + "learning_rate": 1.1086413639998515e-06, + "loss": 0.0805, + "num_input_tokens_seen": 25972408, + "step": 36195 + }, + { + "epoch": 75.25987525987526, + "grad_norm": 0.17325274646282196, + "learning_rate": 1.1057520505673103e-06, + "loss": 0.1001, + "num_input_tokens_seen": 25975928, + "step": 36200 + }, + { + "epoch": 75.25987525987526, + "eval_loss": 0.14527320861816406, + "eval_runtime": 7.7491, + "eval_samples_per_second": 110.464, + "eval_steps_per_second": 27.616, + "num_input_tokens_seen": 25975928, + "step": 36200 + }, + { + "epoch": 75.27027027027027, + "grad_norm": 0.3454156219959259, + "learning_rate": 1.1028664219280727e-06, + "loss": 0.077, + "num_input_tokens_seen": 25979256, + "step": 36205 + }, + { + "epoch": 75.28066528066527, + "grad_norm": 0.21174561977386475, + "learning_rate": 1.0999844785271468e-06, + "loss": 0.1001, + "num_input_tokens_seen": 25982936, + "step": 36210 + }, + { + "epoch": 75.29106029106029, + "grad_norm": 0.18447263538837433, + "learning_rate": 1.097106220808955e-06, + "loss": 0.1044, + "num_input_tokens_seen": 25986616, + "step": 36215 + }, + { + "epoch": 75.3014553014553, + "grad_norm": 0.4167442321777344, + "learning_rate": 1.0942316492173698e-06, + "loss": 0.1003, + "num_input_tokens_seen": 25990328, + "step": 36220 + }, + { + "epoch": 75.3118503118503, + "grad_norm": 0.2571474611759186, + "learning_rate": 1.0913607641956841e-06, + "loss": 0.0917, + "num_input_tokens_seen": 25994136, + "step": 36225 + }, + { + "epoch": 75.32224532224532, + "grad_norm": 0.3141537606716156, + "learning_rate": 1.0884935661866213e-06, + "loss": 0.0937, + "num_input_tokens_seen": 25997784, + "step": 36230 + }, + { + "epoch": 75.33264033264034, + "grad_norm": 0.5005452632904053, + "learning_rate": 1.0856300556323418e-06, + "loss": 0.0824, + "num_input_tokens_seen": 26001464, + "step": 36235 + }, + { + "epoch": 75.34303534303534, + "grad_norm": 0.18532998859882355, + "learning_rate": 1.0827702329744365e-06, + "loss": 0.0533, + "num_input_tokens_seen": 26004984, + "step": 36240 + }, + { + "epoch": 75.35343035343035, + "grad_norm": 0.307472825050354, + "learning_rate": 1.0799140986539197e-06, + "loss": 0.1226, + "num_input_tokens_seen": 26008728, + "step": 36245 + }, + { + "epoch": 75.36382536382537, + "grad_norm": 0.3754795789718628, + "learning_rate": 1.0770616531112526e-06, + "loss": 0.0971, + "num_input_tokens_seen": 26012440, + "step": 36250 + }, + { + "epoch": 75.37422037422037, + "grad_norm": 0.23061679303646088, + "learning_rate": 1.0742128967863085e-06, + "loss": 0.0656, + "num_input_tokens_seen": 26015896, + "step": 36255 + }, + { + "epoch": 75.38461538461539, + "grad_norm": 0.26393139362335205, + "learning_rate": 1.071367830118411e-06, + "loss": 0.0885, + "num_input_tokens_seen": 26019384, + "step": 36260 + }, + { + "epoch": 75.39501039501039, + "grad_norm": 0.2000710666179657, + "learning_rate": 1.068526453546298e-06, + "loss": 0.0668, + "num_input_tokens_seen": 26022840, + "step": 36265 + }, + { + "epoch": 75.4054054054054, + "grad_norm": 0.42732709646224976, + "learning_rate": 1.0656887675081467e-06, + "loss": 0.1128, + "num_input_tokens_seen": 26026456, + "step": 36270 + }, + { + "epoch": 75.41580041580042, + "grad_norm": 0.18312965333461761, + "learning_rate": 1.0628547724415628e-06, + "loss": 0.1149, + "num_input_tokens_seen": 26029944, + "step": 36275 + }, + { + "epoch": 75.42619542619542, + "grad_norm": 0.3277023732662201, + "learning_rate": 1.0600244687835881e-06, + "loss": 0.0849, + "num_input_tokens_seen": 26033528, + "step": 36280 + }, + { + "epoch": 75.43659043659044, + "grad_norm": 0.12599802017211914, + "learning_rate": 1.0571978569706876e-06, + "loss": 0.0553, + "num_input_tokens_seen": 26037208, + "step": 36285 + }, + { + "epoch": 75.44698544698545, + "grad_norm": 0.2055816799402237, + "learning_rate": 1.0543749374387652e-06, + "loss": 0.0999, + "num_input_tokens_seen": 26040792, + "step": 36290 + }, + { + "epoch": 75.45738045738045, + "grad_norm": 0.3711676597595215, + "learning_rate": 1.051555710623142e-06, + "loss": 0.1439, + "num_input_tokens_seen": 26044536, + "step": 36295 + }, + { + "epoch": 75.46777546777547, + "grad_norm": 0.35111844539642334, + "learning_rate": 1.0487401769585847e-06, + "loss": 0.0914, + "num_input_tokens_seen": 26048184, + "step": 36300 + }, + { + "epoch": 75.47817047817048, + "grad_norm": 0.7167134881019592, + "learning_rate": 1.0459283368792845e-06, + "loss": 0.0895, + "num_input_tokens_seen": 26051640, + "step": 36305 + }, + { + "epoch": 75.48856548856548, + "grad_norm": 1.3588111400604248, + "learning_rate": 1.043120190818858e-06, + "loss": 0.1427, + "num_input_tokens_seen": 26055256, + "step": 36310 + }, + { + "epoch": 75.4989604989605, + "grad_norm": 1.0667487382888794, + "learning_rate": 1.0403157392103596e-06, + "loss": 0.0931, + "num_input_tokens_seen": 26059096, + "step": 36315 + }, + { + "epoch": 75.50935550935552, + "grad_norm": 0.4409415125846863, + "learning_rate": 1.0375149824862735e-06, + "loss": 0.1078, + "num_input_tokens_seen": 26062680, + "step": 36320 + }, + { + "epoch": 75.51975051975052, + "grad_norm": 0.25254571437835693, + "learning_rate": 1.034717921078507e-06, + "loss": 0.1008, + "num_input_tokens_seen": 26066360, + "step": 36325 + }, + { + "epoch": 75.53014553014553, + "grad_norm": 0.40956148505210876, + "learning_rate": 1.0319245554184009e-06, + "loss": 0.0762, + "num_input_tokens_seen": 26069976, + "step": 36330 + }, + { + "epoch": 75.54054054054055, + "grad_norm": 0.22822526097297668, + "learning_rate": 1.0291348859367361e-06, + "loss": 0.0887, + "num_input_tokens_seen": 26073656, + "step": 36335 + }, + { + "epoch": 75.55093555093555, + "grad_norm": 0.2194785475730896, + "learning_rate": 1.0263489130637016e-06, + "loss": 0.085, + "num_input_tokens_seen": 26077400, + "step": 36340 + }, + { + "epoch": 75.56133056133056, + "grad_norm": 0.4031715989112854, + "learning_rate": 1.0235666372289427e-06, + "loss": 0.0679, + "num_input_tokens_seen": 26080856, + "step": 36345 + }, + { + "epoch": 75.57172557172557, + "grad_norm": 0.12251013517379761, + "learning_rate": 1.0207880588615076e-06, + "loss": 0.1374, + "num_input_tokens_seen": 26084344, + "step": 36350 + }, + { + "epoch": 75.58212058212058, + "grad_norm": 0.21675875782966614, + "learning_rate": 1.0180131783898984e-06, + "loss": 0.0811, + "num_input_tokens_seen": 26087960, + "step": 36355 + }, + { + "epoch": 75.5925155925156, + "grad_norm": 0.22809946537017822, + "learning_rate": 1.0152419962420362e-06, + "loss": 0.1142, + "num_input_tokens_seen": 26091608, + "step": 36360 + }, + { + "epoch": 75.6029106029106, + "grad_norm": 0.3606298267841339, + "learning_rate": 1.0124745128452685e-06, + "loss": 0.0883, + "num_input_tokens_seen": 26095032, + "step": 36365 + }, + { + "epoch": 75.61330561330561, + "grad_norm": 0.34534522891044617, + "learning_rate": 1.0097107286263758e-06, + "loss": 0.0865, + "num_input_tokens_seen": 26098744, + "step": 36370 + }, + { + "epoch": 75.62370062370063, + "grad_norm": 0.4623299837112427, + "learning_rate": 1.00695064401157e-06, + "loss": 0.0894, + "num_input_tokens_seen": 26102424, + "step": 36375 + }, + { + "epoch": 75.63409563409563, + "grad_norm": 0.339041143655777, + "learning_rate": 1.0041942594264886e-06, + "loss": 0.0991, + "num_input_tokens_seen": 26105880, + "step": 36380 + }, + { + "epoch": 75.64449064449065, + "grad_norm": 0.5150296688079834, + "learning_rate": 1.001441575296208e-06, + "loss": 0.117, + "num_input_tokens_seen": 26109336, + "step": 36385 + }, + { + "epoch": 75.65488565488566, + "grad_norm": 0.23321007192134857, + "learning_rate": 9.986925920452139e-07, + "loss": 0.1121, + "num_input_tokens_seen": 26112792, + "step": 36390 + }, + { + "epoch": 75.66528066528066, + "grad_norm": 0.16855256259441376, + "learning_rate": 9.959473100974475e-07, + "loss": 0.0505, + "num_input_tokens_seen": 26116216, + "step": 36395 + }, + { + "epoch": 75.67567567567568, + "grad_norm": 0.5279227495193481, + "learning_rate": 9.932057298762564e-07, + "loss": 0.0624, + "num_input_tokens_seen": 26119704, + "step": 36400 + }, + { + "epoch": 75.67567567567568, + "eval_loss": 0.14434827864170074, + "eval_runtime": 7.7545, + "eval_samples_per_second": 110.387, + "eval_steps_per_second": 27.597, + "num_input_tokens_seen": 26119704, + "step": 36400 + }, + { + "epoch": 75.68607068607069, + "grad_norm": 1.2902183532714844, + "learning_rate": 9.90467851804433e-07, + "loss": 0.1404, + "num_input_tokens_seen": 26123192, + "step": 36405 + }, + { + "epoch": 75.6964656964657, + "grad_norm": 0.9005650877952576, + "learning_rate": 9.877336763041895e-07, + "loss": 0.0848, + "num_input_tokens_seen": 26126776, + "step": 36410 + }, + { + "epoch": 75.70686070686071, + "grad_norm": 0.2945158779621124, + "learning_rate": 9.850032037971662e-07, + "loss": 0.0812, + "num_input_tokens_seen": 26130360, + "step": 36415 + }, + { + "epoch": 75.71725571725571, + "grad_norm": 0.3698696494102478, + "learning_rate": 9.822764347044406e-07, + "loss": 0.0665, + "num_input_tokens_seen": 26134008, + "step": 36420 + }, + { + "epoch": 75.72765072765073, + "grad_norm": 0.35695910453796387, + "learning_rate": 9.795533694465175e-07, + "loss": 0.1083, + "num_input_tokens_seen": 26137560, + "step": 36425 + }, + { + "epoch": 75.73804573804574, + "grad_norm": 0.452856183052063, + "learning_rate": 9.768340084433197e-07, + "loss": 0.0863, + "num_input_tokens_seen": 26141144, + "step": 36430 + }, + { + "epoch": 75.74844074844074, + "grad_norm": 0.294685423374176, + "learning_rate": 9.741183521142143e-07, + "loss": 0.0709, + "num_input_tokens_seen": 26144792, + "step": 36435 + }, + { + "epoch": 75.75883575883576, + "grad_norm": 0.7565234303474426, + "learning_rate": 9.714064008779889e-07, + "loss": 0.0871, + "num_input_tokens_seen": 26148312, + "step": 36440 + }, + { + "epoch": 75.76923076923077, + "grad_norm": 0.16188828647136688, + "learning_rate": 9.686981551528584e-07, + "loss": 0.0897, + "num_input_tokens_seen": 26151928, + "step": 36445 + }, + { + "epoch": 75.77962577962577, + "grad_norm": 0.26532167196273804, + "learning_rate": 9.65993615356467e-07, + "loss": 0.0842, + "num_input_tokens_seen": 26155544, + "step": 36450 + }, + { + "epoch": 75.79002079002079, + "grad_norm": 0.2089972048997879, + "learning_rate": 9.632927819058917e-07, + "loss": 0.1016, + "num_input_tokens_seen": 26159096, + "step": 36455 + }, + { + "epoch": 75.8004158004158, + "grad_norm": 0.4841546416282654, + "learning_rate": 9.605956552176305e-07, + "loss": 0.1425, + "num_input_tokens_seen": 26162616, + "step": 36460 + }, + { + "epoch": 75.8108108108108, + "grad_norm": 0.3288113474845886, + "learning_rate": 9.579022357076223e-07, + "loss": 0.102, + "num_input_tokens_seen": 26166136, + "step": 36465 + }, + { + "epoch": 75.82120582120582, + "grad_norm": 0.5525169968605042, + "learning_rate": 9.552125237912158e-07, + "loss": 0.1101, + "num_input_tokens_seen": 26169720, + "step": 36470 + }, + { + "epoch": 75.83160083160084, + "grad_norm": 0.5627906322479248, + "learning_rate": 9.525265198832096e-07, + "loss": 0.076, + "num_input_tokens_seen": 26173464, + "step": 36475 + }, + { + "epoch": 75.84199584199584, + "grad_norm": 0.2513766586780548, + "learning_rate": 9.498442243978112e-07, + "loss": 0.1175, + "num_input_tokens_seen": 26177112, + "step": 36480 + }, + { + "epoch": 75.85239085239085, + "grad_norm": 0.1561475694179535, + "learning_rate": 9.471656377486649e-07, + "loss": 0.1155, + "num_input_tokens_seen": 26180600, + "step": 36485 + }, + { + "epoch": 75.86278586278586, + "grad_norm": 0.4885791540145874, + "learning_rate": 9.444907603488456e-07, + "loss": 0.1714, + "num_input_tokens_seen": 26184344, + "step": 36490 + }, + { + "epoch": 75.87318087318087, + "grad_norm": 0.3480689227581024, + "learning_rate": 9.418195926108514e-07, + "loss": 0.0915, + "num_input_tokens_seen": 26187832, + "step": 36495 + }, + { + "epoch": 75.88357588357589, + "grad_norm": 0.6383851170539856, + "learning_rate": 9.391521349466053e-07, + "loss": 0.0836, + "num_input_tokens_seen": 26191352, + "step": 36500 + }, + { + "epoch": 75.89397089397089, + "grad_norm": 0.37364014983177185, + "learning_rate": 9.364883877674758e-07, + "loss": 0.0875, + "num_input_tokens_seen": 26195000, + "step": 36505 + }, + { + "epoch": 75.9043659043659, + "grad_norm": 0.41932663321495056, + "learning_rate": 9.33828351484231e-07, + "loss": 0.1187, + "num_input_tokens_seen": 26198488, + "step": 36510 + }, + { + "epoch": 75.91476091476092, + "grad_norm": 0.46941444277763367, + "learning_rate": 9.311720265070906e-07, + "loss": 0.1673, + "num_input_tokens_seen": 26202040, + "step": 36515 + }, + { + "epoch": 75.92515592515592, + "grad_norm": 0.9040143489837646, + "learning_rate": 9.285194132456931e-07, + "loss": 0.1378, + "num_input_tokens_seen": 26205560, + "step": 36520 + }, + { + "epoch": 75.93555093555094, + "grad_norm": 0.24050109088420868, + "learning_rate": 9.258705121091032e-07, + "loss": 0.0614, + "num_input_tokens_seen": 26209240, + "step": 36525 + }, + { + "epoch": 75.94594594594595, + "grad_norm": 0.6314917802810669, + "learning_rate": 9.232253235058136e-07, + "loss": 0.1326, + "num_input_tokens_seen": 26212728, + "step": 36530 + }, + { + "epoch": 75.95634095634095, + "grad_norm": 0.24157775938510895, + "learning_rate": 9.205838478437478e-07, + "loss": 0.0723, + "num_input_tokens_seen": 26216312, + "step": 36535 + }, + { + "epoch": 75.96673596673597, + "grad_norm": 0.4110836088657379, + "learning_rate": 9.179460855302524e-07, + "loss": 0.1164, + "num_input_tokens_seen": 26219864, + "step": 36540 + }, + { + "epoch": 75.97713097713098, + "grad_norm": 0.2590940296649933, + "learning_rate": 9.153120369721046e-07, + "loss": 0.0845, + "num_input_tokens_seen": 26223480, + "step": 36545 + }, + { + "epoch": 75.98752598752598, + "grad_norm": 0.23952841758728027, + "learning_rate": 9.126817025755103e-07, + "loss": 0.104, + "num_input_tokens_seen": 26227000, + "step": 36550 + }, + { + "epoch": 75.997920997921, + "grad_norm": 0.2216225117444992, + "learning_rate": 9.100550827460947e-07, + "loss": 0.052, + "num_input_tokens_seen": 26230520, + "step": 36555 + }, + { + "epoch": 76.00831600831602, + "grad_norm": 0.3731137812137604, + "learning_rate": 9.0743217788892e-07, + "loss": 0.1026, + "num_input_tokens_seen": 26234056, + "step": 36560 + }, + { + "epoch": 76.01871101871102, + "grad_norm": 0.6445068717002869, + "learning_rate": 9.048129884084683e-07, + "loss": 0.0932, + "num_input_tokens_seen": 26237576, + "step": 36565 + }, + { + "epoch": 76.02910602910603, + "grad_norm": 0.7699095010757446, + "learning_rate": 9.021975147086553e-07, + "loss": 0.1101, + "num_input_tokens_seen": 26241224, + "step": 36570 + }, + { + "epoch": 76.03950103950103, + "grad_norm": 0.2081073522567749, + "learning_rate": 8.995857571928141e-07, + "loss": 0.0841, + "num_input_tokens_seen": 26244712, + "step": 36575 + }, + { + "epoch": 76.04989604989605, + "grad_norm": 0.09844966977834702, + "learning_rate": 8.969777162637139e-07, + "loss": 0.1198, + "num_input_tokens_seen": 26248264, + "step": 36580 + }, + { + "epoch": 76.06029106029106, + "grad_norm": 0.7676056623458862, + "learning_rate": 8.943733923235525e-07, + "loss": 0.0874, + "num_input_tokens_seen": 26251784, + "step": 36585 + }, + { + "epoch": 76.07068607068607, + "grad_norm": 0.8542806506156921, + "learning_rate": 8.917727857739394e-07, + "loss": 0.1174, + "num_input_tokens_seen": 26255432, + "step": 36590 + }, + { + "epoch": 76.08108108108108, + "grad_norm": 0.2716899812221527, + "learning_rate": 8.891758970159258e-07, + "loss": 0.0841, + "num_input_tokens_seen": 26259048, + "step": 36595 + }, + { + "epoch": 76.0914760914761, + "grad_norm": 0.5424724221229553, + "learning_rate": 8.86582726449986e-07, + "loss": 0.0936, + "num_input_tokens_seen": 26262696, + "step": 36600 + }, + { + "epoch": 76.0914760914761, + "eval_loss": 0.14542172849178314, + "eval_runtime": 7.7609, + "eval_samples_per_second": 110.296, + "eval_steps_per_second": 27.574, + "num_input_tokens_seen": 26262696, + "step": 36600 + }, + { + "epoch": 76.1018711018711, + "grad_norm": 0.46283477544784546, + "learning_rate": 8.839932744760165e-07, + "loss": 0.0977, + "num_input_tokens_seen": 26266312, + "step": 36605 + }, + { + "epoch": 76.11226611226611, + "grad_norm": 0.8308045268058777, + "learning_rate": 8.814075414933482e-07, + "loss": 0.1163, + "num_input_tokens_seen": 26269864, + "step": 36610 + }, + { + "epoch": 76.12266112266113, + "grad_norm": 0.243289053440094, + "learning_rate": 8.788255279007257e-07, + "loss": 0.0799, + "num_input_tokens_seen": 26273512, + "step": 36615 + }, + { + "epoch": 76.13305613305613, + "grad_norm": 0.75149005651474, + "learning_rate": 8.762472340963362e-07, + "loss": 0.1245, + "num_input_tokens_seen": 26277128, + "step": 36620 + }, + { + "epoch": 76.14345114345114, + "grad_norm": 0.18416735529899597, + "learning_rate": 8.736726604777811e-07, + "loss": 0.1438, + "num_input_tokens_seen": 26280776, + "step": 36625 + }, + { + "epoch": 76.15384615384616, + "grad_norm": 0.2908155024051666, + "learning_rate": 8.711018074420901e-07, + "loss": 0.0515, + "num_input_tokens_seen": 26284200, + "step": 36630 + }, + { + "epoch": 76.16424116424116, + "grad_norm": 0.3639959692955017, + "learning_rate": 8.685346753857209e-07, + "loss": 0.0743, + "num_input_tokens_seen": 26287816, + "step": 36635 + }, + { + "epoch": 76.17463617463618, + "grad_norm": 0.20582076907157898, + "learning_rate": 8.659712647045654e-07, + "loss": 0.0784, + "num_input_tokens_seen": 26291368, + "step": 36640 + }, + { + "epoch": 76.18503118503118, + "grad_norm": 0.29161420464515686, + "learning_rate": 8.634115757939209e-07, + "loss": 0.0499, + "num_input_tokens_seen": 26294952, + "step": 36645 + }, + { + "epoch": 76.1954261954262, + "grad_norm": 0.5268247723579407, + "learning_rate": 8.608556090485387e-07, + "loss": 0.1114, + "num_input_tokens_seen": 26298376, + "step": 36650 + }, + { + "epoch": 76.20582120582121, + "grad_norm": 0.5934571623802185, + "learning_rate": 8.583033648625671e-07, + "loss": 0.1224, + "num_input_tokens_seen": 26301896, + "step": 36655 + }, + { + "epoch": 76.21621621621621, + "grad_norm": 0.28860965371131897, + "learning_rate": 8.557548436295998e-07, + "loss": 0.0647, + "num_input_tokens_seen": 26305480, + "step": 36660 + }, + { + "epoch": 76.22661122661123, + "grad_norm": 0.21644091606140137, + "learning_rate": 8.532100457426556e-07, + "loss": 0.0721, + "num_input_tokens_seen": 26309224, + "step": 36665 + }, + { + "epoch": 76.23700623700624, + "grad_norm": 0.3371211290359497, + "learning_rate": 8.506689715941679e-07, + "loss": 0.1181, + "num_input_tokens_seen": 26312776, + "step": 36670 + }, + { + "epoch": 76.24740124740124, + "grad_norm": 0.6046772599220276, + "learning_rate": 8.481316215760011e-07, + "loss": 0.1108, + "num_input_tokens_seen": 26316520, + "step": 36675 + }, + { + "epoch": 76.25779625779626, + "grad_norm": 0.3107910752296448, + "learning_rate": 8.455979960794558e-07, + "loss": 0.0931, + "num_input_tokens_seen": 26320040, + "step": 36680 + }, + { + "epoch": 76.26819126819127, + "grad_norm": 0.523496687412262, + "learning_rate": 8.430680954952364e-07, + "loss": 0.098, + "num_input_tokens_seen": 26323560, + "step": 36685 + }, + { + "epoch": 76.27858627858627, + "grad_norm": 0.28716325759887695, + "learning_rate": 8.405419202134974e-07, + "loss": 0.1164, + "num_input_tokens_seen": 26327080, + "step": 36690 + }, + { + "epoch": 76.28898128898129, + "grad_norm": 0.4666588604450226, + "learning_rate": 8.380194706237993e-07, + "loss": 0.1038, + "num_input_tokens_seen": 26330600, + "step": 36695 + }, + { + "epoch": 76.2993762993763, + "grad_norm": 1.0193746089935303, + "learning_rate": 8.355007471151366e-07, + "loss": 0.1043, + "num_input_tokens_seen": 26334216, + "step": 36700 + }, + { + "epoch": 76.3097713097713, + "grad_norm": 0.35868677496910095, + "learning_rate": 8.329857500759292e-07, + "loss": 0.0867, + "num_input_tokens_seen": 26337800, + "step": 36705 + }, + { + "epoch": 76.32016632016632, + "grad_norm": 0.2676093280315399, + "learning_rate": 8.304744798940194e-07, + "loss": 0.0957, + "num_input_tokens_seen": 26341288, + "step": 36710 + }, + { + "epoch": 76.33056133056132, + "grad_norm": 0.5139122605323792, + "learning_rate": 8.279669369566756e-07, + "loss": 0.0791, + "num_input_tokens_seen": 26344776, + "step": 36715 + }, + { + "epoch": 76.34095634095634, + "grad_norm": 0.5733832120895386, + "learning_rate": 8.254631216505993e-07, + "loss": 0.1473, + "num_input_tokens_seen": 26348424, + "step": 36720 + }, + { + "epoch": 76.35135135135135, + "grad_norm": 0.29081249237060547, + "learning_rate": 8.229630343619038e-07, + "loss": 0.1012, + "num_input_tokens_seen": 26351976, + "step": 36725 + }, + { + "epoch": 76.36174636174636, + "grad_norm": 0.30436915159225464, + "learning_rate": 8.204666754761392e-07, + "loss": 0.0613, + "num_input_tokens_seen": 26355592, + "step": 36730 + }, + { + "epoch": 76.37214137214137, + "grad_norm": 0.459346204996109, + "learning_rate": 8.179740453782669e-07, + "loss": 0.1193, + "num_input_tokens_seen": 26359240, + "step": 36735 + }, + { + "epoch": 76.38253638253639, + "grad_norm": 0.7975947260856628, + "learning_rate": 8.154851444526907e-07, + "loss": 0.0722, + "num_input_tokens_seen": 26362664, + "step": 36740 + }, + { + "epoch": 76.39293139293139, + "grad_norm": 0.44456058740615845, + "learning_rate": 8.129999730832283e-07, + "loss": 0.0801, + "num_input_tokens_seen": 26366184, + "step": 36745 + }, + { + "epoch": 76.4033264033264, + "grad_norm": 0.3933536410331726, + "learning_rate": 8.105185316531178e-07, + "loss": 0.106, + "num_input_tokens_seen": 26369864, + "step": 36750 + }, + { + "epoch": 76.41372141372142, + "grad_norm": 0.2244427651166916, + "learning_rate": 8.08040820545039e-07, + "loss": 0.1258, + "num_input_tokens_seen": 26373352, + "step": 36755 + }, + { + "epoch": 76.42411642411642, + "grad_norm": 1.0844827890396118, + "learning_rate": 8.055668401410782e-07, + "loss": 0.077, + "num_input_tokens_seen": 26376840, + "step": 36760 + }, + { + "epoch": 76.43451143451144, + "grad_norm": 0.4894666373729706, + "learning_rate": 8.030965908227578e-07, + "loss": 0.1, + "num_input_tokens_seen": 26380584, + "step": 36765 + }, + { + "epoch": 76.44490644490645, + "grad_norm": 0.27755558490753174, + "learning_rate": 8.006300729710203e-07, + "loss": 0.1266, + "num_input_tokens_seen": 26384328, + "step": 36770 + }, + { + "epoch": 76.45530145530145, + "grad_norm": 0.3216620981693268, + "learning_rate": 7.981672869662337e-07, + "loss": 0.0535, + "num_input_tokens_seen": 26388040, + "step": 36775 + }, + { + "epoch": 76.46569646569647, + "grad_norm": 0.27607715129852295, + "learning_rate": 7.957082331881888e-07, + "loss": 0.0794, + "num_input_tokens_seen": 26391560, + "step": 36780 + }, + { + "epoch": 76.47609147609148, + "grad_norm": 0.5702593326568604, + "learning_rate": 7.932529120161069e-07, + "loss": 0.0749, + "num_input_tokens_seen": 26395336, + "step": 36785 + }, + { + "epoch": 76.48648648648648, + "grad_norm": 0.8009644746780396, + "learning_rate": 7.908013238286243e-07, + "loss": 0.1232, + "num_input_tokens_seen": 26398824, + "step": 36790 + }, + { + "epoch": 76.4968814968815, + "grad_norm": 0.20676776766777039, + "learning_rate": 7.883534690038136e-07, + "loss": 0.0817, + "num_input_tokens_seen": 26402312, + "step": 36795 + }, + { + "epoch": 76.5072765072765, + "grad_norm": 0.17755305767059326, + "learning_rate": 7.859093479191559e-07, + "loss": 0.0826, + "num_input_tokens_seen": 26406024, + "step": 36800 + }, + { + "epoch": 76.5072765072765, + "eval_loss": 0.14422206580638885, + "eval_runtime": 7.7535, + "eval_samples_per_second": 110.401, + "eval_steps_per_second": 27.6, + "num_input_tokens_seen": 26406024, + "step": 36800 + }, + { + "epoch": 76.51767151767152, + "grad_norm": 0.36945202946662903, + "learning_rate": 7.834689609515722e-07, + "loss": 0.0911, + "num_input_tokens_seen": 26409576, + "step": 36805 + }, + { + "epoch": 76.52806652806653, + "grad_norm": 0.26325732469558716, + "learning_rate": 7.810323084774002e-07, + "loss": 0.1211, + "num_input_tokens_seen": 26413000, + "step": 36810 + }, + { + "epoch": 76.53846153846153, + "grad_norm": 0.4566749036312103, + "learning_rate": 7.785993908723976e-07, + "loss": 0.0902, + "num_input_tokens_seen": 26416520, + "step": 36815 + }, + { + "epoch": 76.54885654885655, + "grad_norm": 0.5221074223518372, + "learning_rate": 7.761702085117534e-07, + "loss": 0.0881, + "num_input_tokens_seen": 26420168, + "step": 36820 + }, + { + "epoch": 76.55925155925156, + "grad_norm": 0.18539084494113922, + "learning_rate": 7.737447617700844e-07, + "loss": 0.1228, + "num_input_tokens_seen": 26423752, + "step": 36825 + }, + { + "epoch": 76.56964656964657, + "grad_norm": 0.5107372999191284, + "learning_rate": 7.713230510214136e-07, + "loss": 0.1021, + "num_input_tokens_seen": 26427272, + "step": 36830 + }, + { + "epoch": 76.58004158004158, + "grad_norm": 0.38492998480796814, + "learning_rate": 7.689050766392092e-07, + "loss": 0.0785, + "num_input_tokens_seen": 26430664, + "step": 36835 + }, + { + "epoch": 76.5904365904366, + "grad_norm": 0.22447767853736877, + "learning_rate": 7.664908389963477e-07, + "loss": 0.097, + "num_input_tokens_seen": 26434376, + "step": 36840 + }, + { + "epoch": 76.6008316008316, + "grad_norm": 0.4259079694747925, + "learning_rate": 7.64080338465134e-07, + "loss": 0.0653, + "num_input_tokens_seen": 26437928, + "step": 36845 + }, + { + "epoch": 76.61122661122661, + "grad_norm": 0.10021257400512695, + "learning_rate": 7.616735754173043e-07, + "loss": 0.1046, + "num_input_tokens_seen": 26441480, + "step": 36850 + }, + { + "epoch": 76.62162162162163, + "grad_norm": 0.5946040749549866, + "learning_rate": 7.592705502240005e-07, + "loss": 0.0955, + "num_input_tokens_seen": 26445064, + "step": 36855 + }, + { + "epoch": 76.63201663201663, + "grad_norm": 0.9665102362632751, + "learning_rate": 7.568712632558095e-07, + "loss": 0.0983, + "num_input_tokens_seen": 26448584, + "step": 36860 + }, + { + "epoch": 76.64241164241164, + "grad_norm": 0.5753293037414551, + "learning_rate": 7.544757148827297e-07, + "loss": 0.1345, + "num_input_tokens_seen": 26451976, + "step": 36865 + }, + { + "epoch": 76.65280665280665, + "grad_norm": 0.6242565512657166, + "learning_rate": 7.520839054741797e-07, + "loss": 0.0728, + "num_input_tokens_seen": 26455528, + "step": 36870 + }, + { + "epoch": 76.66320166320166, + "grad_norm": 0.3499566614627838, + "learning_rate": 7.496958353990113e-07, + "loss": 0.0743, + "num_input_tokens_seen": 26459144, + "step": 36875 + }, + { + "epoch": 76.67359667359668, + "grad_norm": 0.1544385403394699, + "learning_rate": 7.473115050254941e-07, + "loss": 0.0685, + "num_input_tokens_seen": 26462920, + "step": 36880 + }, + { + "epoch": 76.68399168399168, + "grad_norm": 0.3321981132030487, + "learning_rate": 7.449309147213173e-07, + "loss": 0.0859, + "num_input_tokens_seen": 26466504, + "step": 36885 + }, + { + "epoch": 76.6943866943867, + "grad_norm": 0.2348235696554184, + "learning_rate": 7.425540648536067e-07, + "loss": 0.1028, + "num_input_tokens_seen": 26470056, + "step": 36890 + }, + { + "epoch": 76.70478170478171, + "grad_norm": 0.38910534977912903, + "learning_rate": 7.40180955788894e-07, + "loss": 0.0884, + "num_input_tokens_seen": 26473480, + "step": 36895 + }, + { + "epoch": 76.71517671517671, + "grad_norm": 0.23098677396774292, + "learning_rate": 7.378115878931474e-07, + "loss": 0.064, + "num_input_tokens_seen": 26477064, + "step": 36900 + }, + { + "epoch": 76.72557172557173, + "grad_norm": 0.1929367631673813, + "learning_rate": 7.354459615317527e-07, + "loss": 0.1044, + "num_input_tokens_seen": 26480808, + "step": 36905 + }, + { + "epoch": 76.73596673596674, + "grad_norm": 0.2442605197429657, + "learning_rate": 7.33084077069518e-07, + "loss": 0.1142, + "num_input_tokens_seen": 26484392, + "step": 36910 + }, + { + "epoch": 76.74636174636174, + "grad_norm": 0.5916938185691833, + "learning_rate": 7.307259348706768e-07, + "loss": 0.1351, + "num_input_tokens_seen": 26488072, + "step": 36915 + }, + { + "epoch": 76.75675675675676, + "grad_norm": 0.37238383293151855, + "learning_rate": 7.283715352988801e-07, + "loss": 0.1366, + "num_input_tokens_seen": 26491688, + "step": 36920 + }, + { + "epoch": 76.76715176715177, + "grad_norm": 0.7717053294181824, + "learning_rate": 7.260208787172068e-07, + "loss": 0.1002, + "num_input_tokens_seen": 26495080, + "step": 36925 + }, + { + "epoch": 76.77754677754677, + "grad_norm": 0.13924995064735413, + "learning_rate": 7.23673965488167e-07, + "loss": 0.1145, + "num_input_tokens_seen": 26498664, + "step": 36930 + }, + { + "epoch": 76.78794178794179, + "grad_norm": 0.7061318159103394, + "learning_rate": 7.213307959736709e-07, + "loss": 0.114, + "num_input_tokens_seen": 26502376, + "step": 36935 + }, + { + "epoch": 76.7983367983368, + "grad_norm": 0.316748708486557, + "learning_rate": 7.189913705350715e-07, + "loss": 0.0631, + "num_input_tokens_seen": 26506120, + "step": 36940 + }, + { + "epoch": 76.8087318087318, + "grad_norm": 0.7413954138755798, + "learning_rate": 7.166556895331411e-07, + "loss": 0.1008, + "num_input_tokens_seen": 26509768, + "step": 36945 + }, + { + "epoch": 76.81912681912682, + "grad_norm": 0.2940109372138977, + "learning_rate": 7.143237533280639e-07, + "loss": 0.1121, + "num_input_tokens_seen": 26513320, + "step": 36950 + }, + { + "epoch": 76.82952182952182, + "grad_norm": 0.3368678390979767, + "learning_rate": 7.119955622794578e-07, + "loss": 0.0896, + "num_input_tokens_seen": 26517160, + "step": 36955 + }, + { + "epoch": 76.83991683991684, + "grad_norm": 0.39645183086395264, + "learning_rate": 7.096711167463577e-07, + "loss": 0.1184, + "num_input_tokens_seen": 26520712, + "step": 36960 + }, + { + "epoch": 76.85031185031185, + "grad_norm": 0.15405325591564178, + "learning_rate": 7.073504170872213e-07, + "loss": 0.1013, + "num_input_tokens_seen": 26524296, + "step": 36965 + }, + { + "epoch": 76.86070686070686, + "grad_norm": 0.30992040038108826, + "learning_rate": 7.05033463659932e-07, + "loss": 0.08, + "num_input_tokens_seen": 26527944, + "step": 36970 + }, + { + "epoch": 76.87110187110187, + "grad_norm": 0.3745391368865967, + "learning_rate": 7.027202568217928e-07, + "loss": 0.1084, + "num_input_tokens_seen": 26531560, + "step": 36975 + }, + { + "epoch": 76.88149688149689, + "grad_norm": 0.3605930209159851, + "learning_rate": 7.004107969295293e-07, + "loss": 0.0752, + "num_input_tokens_seen": 26535208, + "step": 36980 + }, + { + "epoch": 76.89189189189189, + "grad_norm": 0.519972026348114, + "learning_rate": 6.9810508433929e-07, + "loss": 0.1045, + "num_input_tokens_seen": 26539048, + "step": 36985 + }, + { + "epoch": 76.9022869022869, + "grad_norm": 0.3904852867126465, + "learning_rate": 6.958031194066406e-07, + "loss": 0.1007, + "num_input_tokens_seen": 26542792, + "step": 36990 + }, + { + "epoch": 76.91268191268192, + "grad_norm": 0.3815818428993225, + "learning_rate": 6.935049024865776e-07, + "loss": 0.0971, + "num_input_tokens_seen": 26546536, + "step": 36995 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 0.32755568623542786, + "learning_rate": 6.912104339335118e-07, + "loss": 0.0844, + "num_input_tokens_seen": 26550088, + "step": 37000 + }, + { + "epoch": 76.92307692307692, + "eval_loss": 0.1468891054391861, + "eval_runtime": 7.7506, + "eval_samples_per_second": 110.443, + "eval_steps_per_second": 27.611, + "num_input_tokens_seen": 26550088, + "step": 37000 + }, + { + "epoch": 76.93347193347194, + "grad_norm": 0.1754159778356552, + "learning_rate": 6.889197141012799e-07, + "loss": 0.0849, + "num_input_tokens_seen": 26553704, + "step": 37005 + }, + { + "epoch": 76.94386694386695, + "grad_norm": 0.2451934963464737, + "learning_rate": 6.866327433431435e-07, + "loss": 0.1191, + "num_input_tokens_seen": 26557288, + "step": 37010 + }, + { + "epoch": 76.95426195426195, + "grad_norm": 0.3552836775779724, + "learning_rate": 6.843495220117735e-07, + "loss": 0.1246, + "num_input_tokens_seen": 26560968, + "step": 37015 + }, + { + "epoch": 76.96465696465697, + "grad_norm": 0.4640047252178192, + "learning_rate": 6.820700504592798e-07, + "loss": 0.0941, + "num_input_tokens_seen": 26564648, + "step": 37020 + }, + { + "epoch": 76.97505197505197, + "grad_norm": 0.17260956764221191, + "learning_rate": 6.797943290371839e-07, + "loss": 0.1226, + "num_input_tokens_seen": 26568232, + "step": 37025 + }, + { + "epoch": 76.98544698544698, + "grad_norm": 0.25129762291908264, + "learning_rate": 6.775223580964274e-07, + "loss": 0.0888, + "num_input_tokens_seen": 26571848, + "step": 37030 + }, + { + "epoch": 76.995841995842, + "grad_norm": 0.3486960530281067, + "learning_rate": 6.7525413798738e-07, + "loss": 0.0646, + "num_input_tokens_seen": 26575272, + "step": 37035 + }, + { + "epoch": 77.006237006237, + "grad_norm": 0.32018449902534485, + "learning_rate": 6.729896690598259e-07, + "loss": 0.1153, + "num_input_tokens_seen": 26578720, + "step": 37040 + }, + { + "epoch": 77.01663201663202, + "grad_norm": 0.2839074432849884, + "learning_rate": 6.707289516629772e-07, + "loss": 0.105, + "num_input_tokens_seen": 26582400, + "step": 37045 + }, + { + "epoch": 77.02702702702703, + "grad_norm": 0.20371182262897491, + "learning_rate": 6.684719861454692e-07, + "loss": 0.0802, + "num_input_tokens_seen": 26585952, + "step": 37050 + }, + { + "epoch": 77.03742203742203, + "grad_norm": 0.4019698202610016, + "learning_rate": 6.662187728553481e-07, + "loss": 0.1287, + "num_input_tokens_seen": 26589536, + "step": 37055 + }, + { + "epoch": 77.04781704781705, + "grad_norm": 0.43911030888557434, + "learning_rate": 6.639693121400892e-07, + "loss": 0.0773, + "num_input_tokens_seen": 26593280, + "step": 37060 + }, + { + "epoch": 77.05821205821206, + "grad_norm": 0.21599525213241577, + "learning_rate": 6.617236043465868e-07, + "loss": 0.1106, + "num_input_tokens_seen": 26596992, + "step": 37065 + }, + { + "epoch": 77.06860706860707, + "grad_norm": 0.6087242960929871, + "learning_rate": 6.594816498211587e-07, + "loss": 0.121, + "num_input_tokens_seen": 26600512, + "step": 37070 + }, + { + "epoch": 77.07900207900208, + "grad_norm": 0.25882449746131897, + "learning_rate": 6.572434489095447e-07, + "loss": 0.1257, + "num_input_tokens_seen": 26604384, + "step": 37075 + }, + { + "epoch": 77.0893970893971, + "grad_norm": 0.1121775209903717, + "learning_rate": 6.550090019568994e-07, + "loss": 0.086, + "num_input_tokens_seen": 26607936, + "step": 37080 + }, + { + "epoch": 77.0997920997921, + "grad_norm": 0.5307621955871582, + "learning_rate": 6.527783093078027e-07, + "loss": 0.1069, + "num_input_tokens_seen": 26611648, + "step": 37085 + }, + { + "epoch": 77.11018711018711, + "grad_norm": 0.40394294261932373, + "learning_rate": 6.5055137130626e-07, + "loss": 0.093, + "num_input_tokens_seen": 26615296, + "step": 37090 + }, + { + "epoch": 77.12058212058211, + "grad_norm": 0.26119157671928406, + "learning_rate": 6.483281882956854e-07, + "loss": 0.0844, + "num_input_tokens_seen": 26618912, + "step": 37095 + }, + { + "epoch": 77.13097713097713, + "grad_norm": 0.9388182759284973, + "learning_rate": 6.461087606189298e-07, + "loss": 0.0748, + "num_input_tokens_seen": 26622464, + "step": 37100 + }, + { + "epoch": 77.14137214137214, + "grad_norm": 0.1938660591840744, + "learning_rate": 6.438930886182554e-07, + "loss": 0.0785, + "num_input_tokens_seen": 26626048, + "step": 37105 + }, + { + "epoch": 77.15176715176715, + "grad_norm": 0.28895360231399536, + "learning_rate": 6.416811726353417e-07, + "loss": 0.1068, + "num_input_tokens_seen": 26629632, + "step": 37110 + }, + { + "epoch": 77.16216216216216, + "grad_norm": 0.38279828429222107, + "learning_rate": 6.394730130112991e-07, + "loss": 0.115, + "num_input_tokens_seen": 26633120, + "step": 37115 + }, + { + "epoch": 77.17255717255718, + "grad_norm": 0.21194994449615479, + "learning_rate": 6.372686100866471e-07, + "loss": 0.0601, + "num_input_tokens_seen": 26636768, + "step": 37120 + }, + { + "epoch": 77.18295218295218, + "grad_norm": 0.2285803109407425, + "learning_rate": 6.350679642013413e-07, + "loss": 0.0864, + "num_input_tokens_seen": 26640288, + "step": 37125 + }, + { + "epoch": 77.1933471933472, + "grad_norm": 0.39910760521888733, + "learning_rate": 6.328710756947437e-07, + "loss": 0.1312, + "num_input_tokens_seen": 26643776, + "step": 37130 + }, + { + "epoch": 77.20374220374221, + "grad_norm": 0.5602274537086487, + "learning_rate": 6.306779449056416e-07, + "loss": 0.0837, + "num_input_tokens_seen": 26647328, + "step": 37135 + }, + { + "epoch": 77.21413721413721, + "grad_norm": 0.33894479274749756, + "learning_rate": 6.284885721722422e-07, + "loss": 0.0764, + "num_input_tokens_seen": 26650688, + "step": 37140 + }, + { + "epoch": 77.22453222453223, + "grad_norm": 0.2695881426334381, + "learning_rate": 6.26302957832181e-07, + "loss": 0.1222, + "num_input_tokens_seen": 26654208, + "step": 37145 + }, + { + "epoch": 77.23492723492724, + "grad_norm": 0.1793213039636612, + "learning_rate": 6.241211022224997e-07, + "loss": 0.1, + "num_input_tokens_seen": 26657952, + "step": 37150 + }, + { + "epoch": 77.24532224532224, + "grad_norm": 0.49772486090660095, + "learning_rate": 6.219430056796732e-07, + "loss": 0.1469, + "num_input_tokens_seen": 26661600, + "step": 37155 + }, + { + "epoch": 77.25571725571726, + "grad_norm": 0.6729981303215027, + "learning_rate": 6.19768668539586e-07, + "loss": 0.1156, + "num_input_tokens_seen": 26665248, + "step": 37160 + }, + { + "epoch": 77.26611226611226, + "grad_norm": 0.5994163751602173, + "learning_rate": 6.175980911375528e-07, + "loss": 0.0821, + "num_input_tokens_seen": 26668768, + "step": 37165 + }, + { + "epoch": 77.27650727650727, + "grad_norm": 0.33078664541244507, + "learning_rate": 6.154312738083034e-07, + "loss": 0.1394, + "num_input_tokens_seen": 26672256, + "step": 37170 + }, + { + "epoch": 77.28690228690229, + "grad_norm": 0.7024956941604614, + "learning_rate": 6.132682168859843e-07, + "loss": 0.0922, + "num_input_tokens_seen": 26675776, + "step": 37175 + }, + { + "epoch": 77.29729729729729, + "grad_norm": 0.2035631537437439, + "learning_rate": 6.111089207041704e-07, + "loss": 0.0761, + "num_input_tokens_seen": 26679392, + "step": 37180 + }, + { + "epoch": 77.3076923076923, + "grad_norm": 0.37973612546920776, + "learning_rate": 6.089533855958507e-07, + "loss": 0.0848, + "num_input_tokens_seen": 26683168, + "step": 37185 + }, + { + "epoch": 77.31808731808732, + "grad_norm": 0.3301275670528412, + "learning_rate": 6.068016118934372e-07, + "loss": 0.0988, + "num_input_tokens_seen": 26686752, + "step": 37190 + }, + { + "epoch": 77.32848232848232, + "grad_norm": 0.18380212783813477, + "learning_rate": 6.04653599928759e-07, + "loss": 0.0792, + "num_input_tokens_seen": 26690336, + "step": 37195 + }, + { + "epoch": 77.33887733887734, + "grad_norm": 0.23778924345970154, + "learning_rate": 6.025093500330675e-07, + "loss": 0.0912, + "num_input_tokens_seen": 26693856, + "step": 37200 + }, + { + "epoch": 77.33887733887734, + "eval_loss": 0.14448165893554688, + "eval_runtime": 7.758, + "eval_samples_per_second": 110.338, + "eval_steps_per_second": 27.585, + "num_input_tokens_seen": 26693856, + "step": 37200 + }, + { + "epoch": 77.34927234927235, + "grad_norm": 0.6558236479759216, + "learning_rate": 6.003688625370291e-07, + "loss": 0.0855, + "num_input_tokens_seen": 26697312, + "step": 37205 + }, + { + "epoch": 77.35966735966736, + "grad_norm": 0.4746255576610565, + "learning_rate": 5.982321377707406e-07, + "loss": 0.1266, + "num_input_tokens_seen": 26700864, + "step": 37210 + }, + { + "epoch": 77.37006237006237, + "grad_norm": 0.5613168478012085, + "learning_rate": 5.96099176063708e-07, + "loss": 0.0983, + "num_input_tokens_seen": 26704320, + "step": 37215 + }, + { + "epoch": 77.38045738045739, + "grad_norm": 0.584765613079071, + "learning_rate": 5.93969977744857e-07, + "loss": 0.0518, + "num_input_tokens_seen": 26707840, + "step": 37220 + }, + { + "epoch": 77.39085239085239, + "grad_norm": 0.22577545046806335, + "learning_rate": 5.918445431425445e-07, + "loss": 0.0785, + "num_input_tokens_seen": 26711488, + "step": 37225 + }, + { + "epoch": 77.4012474012474, + "grad_norm": 0.12855951488018036, + "learning_rate": 5.897228725845333e-07, + "loss": 0.0797, + "num_input_tokens_seen": 26715232, + "step": 37230 + }, + { + "epoch": 77.41164241164242, + "grad_norm": 0.4327733516693115, + "learning_rate": 5.876049663980171e-07, + "loss": 0.1082, + "num_input_tokens_seen": 26718848, + "step": 37235 + }, + { + "epoch": 77.42203742203742, + "grad_norm": 0.6313632130622864, + "learning_rate": 5.854908249095959e-07, + "loss": 0.0944, + "num_input_tokens_seen": 26722464, + "step": 37240 + }, + { + "epoch": 77.43243243243244, + "grad_norm": 0.3396547734737396, + "learning_rate": 5.833804484453031e-07, + "loss": 0.1277, + "num_input_tokens_seen": 26725952, + "step": 37245 + }, + { + "epoch": 77.44282744282744, + "grad_norm": 0.2725011706352234, + "learning_rate": 5.81273837330587e-07, + "loss": 0.0688, + "num_input_tokens_seen": 26729472, + "step": 37250 + }, + { + "epoch": 77.45322245322245, + "grad_norm": 0.26993659138679504, + "learning_rate": 5.791709918903071e-07, + "loss": 0.1318, + "num_input_tokens_seen": 26733056, + "step": 37255 + }, + { + "epoch": 77.46361746361747, + "grad_norm": 0.4894474744796753, + "learning_rate": 5.770719124487483e-07, + "loss": 0.1166, + "num_input_tokens_seen": 26736544, + "step": 37260 + }, + { + "epoch": 77.47401247401247, + "grad_norm": 0.30133727192878723, + "learning_rate": 5.749765993296241e-07, + "loss": 0.0923, + "num_input_tokens_seen": 26740064, + "step": 37265 + }, + { + "epoch": 77.48440748440748, + "grad_norm": 0.6027052998542786, + "learning_rate": 5.728850528560509e-07, + "loss": 0.1423, + "num_input_tokens_seen": 26743808, + "step": 37270 + }, + { + "epoch": 77.4948024948025, + "grad_norm": 0.20081621408462524, + "learning_rate": 5.707972733505707e-07, + "loss": 0.0887, + "num_input_tokens_seen": 26747328, + "step": 37275 + }, + { + "epoch": 77.5051975051975, + "grad_norm": 0.26278483867645264, + "learning_rate": 5.687132611351509e-07, + "loss": 0.1334, + "num_input_tokens_seen": 26750912, + "step": 37280 + }, + { + "epoch": 77.51559251559252, + "grad_norm": 0.8157753348350525, + "learning_rate": 5.666330165311651e-07, + "loss": 0.1341, + "num_input_tokens_seen": 26754592, + "step": 37285 + }, + { + "epoch": 77.52598752598753, + "grad_norm": 0.20404189825057983, + "learning_rate": 5.645565398594204e-07, + "loss": 0.0577, + "num_input_tokens_seen": 26758112, + "step": 37290 + }, + { + "epoch": 77.53638253638253, + "grad_norm": 0.7250176072120667, + "learning_rate": 5.624838314401304e-07, + "loss": 0.1239, + "num_input_tokens_seen": 26761664, + "step": 37295 + }, + { + "epoch": 77.54677754677755, + "grad_norm": 0.2707277834415436, + "learning_rate": 5.604148915929336e-07, + "loss": 0.0823, + "num_input_tokens_seen": 26765344, + "step": 37300 + }, + { + "epoch": 77.55717255717256, + "grad_norm": 0.34675613045692444, + "learning_rate": 5.583497206368887e-07, + "loss": 0.0973, + "num_input_tokens_seen": 26768928, + "step": 37305 + }, + { + "epoch": 77.56756756756756, + "grad_norm": 0.41910502314567566, + "learning_rate": 5.562883188904688e-07, + "loss": 0.0825, + "num_input_tokens_seen": 26772480, + "step": 37310 + }, + { + "epoch": 77.57796257796258, + "grad_norm": 0.3799203038215637, + "learning_rate": 5.542306866715724e-07, + "loss": 0.0885, + "num_input_tokens_seen": 26776192, + "step": 37315 + }, + { + "epoch": 77.58835758835758, + "grad_norm": 0.2760608494281769, + "learning_rate": 5.52176824297504e-07, + "loss": 0.0725, + "num_input_tokens_seen": 26779712, + "step": 37320 + }, + { + "epoch": 77.5987525987526, + "grad_norm": 0.3573709726333618, + "learning_rate": 5.501267320850018e-07, + "loss": 0.0786, + "num_input_tokens_seen": 26783424, + "step": 37325 + }, + { + "epoch": 77.60914760914761, + "grad_norm": 0.6018442511558533, + "learning_rate": 5.480804103502157e-07, + "loss": 0.1285, + "num_input_tokens_seen": 26786880, + "step": 37330 + }, + { + "epoch": 77.61954261954261, + "grad_norm": 0.2847142517566681, + "learning_rate": 5.460378594087101e-07, + "loss": 0.0781, + "num_input_tokens_seen": 26790656, + "step": 37335 + }, + { + "epoch": 77.62993762993763, + "grad_norm": 0.2785479426383972, + "learning_rate": 5.439990795754773e-07, + "loss": 0.0816, + "num_input_tokens_seen": 26794144, + "step": 37340 + }, + { + "epoch": 77.64033264033264, + "grad_norm": 0.3039727807044983, + "learning_rate": 5.419640711649188e-07, + "loss": 0.0541, + "num_input_tokens_seen": 26797728, + "step": 37345 + }, + { + "epoch": 77.65072765072765, + "grad_norm": 0.23540937900543213, + "learning_rate": 5.399328344908583e-07, + "loss": 0.0835, + "num_input_tokens_seen": 26801280, + "step": 37350 + }, + { + "epoch": 77.66112266112266, + "grad_norm": 0.6499500870704651, + "learning_rate": 5.379053698665399e-07, + "loss": 0.0684, + "num_input_tokens_seen": 26804896, + "step": 37355 + }, + { + "epoch": 77.67151767151768, + "grad_norm": 0.2422538697719574, + "learning_rate": 5.358816776046216e-07, + "loss": 0.1245, + "num_input_tokens_seen": 26808480, + "step": 37360 + }, + { + "epoch": 77.68191268191268, + "grad_norm": 0.26367613673210144, + "learning_rate": 5.338617580171817e-07, + "loss": 0.0937, + "num_input_tokens_seen": 26811936, + "step": 37365 + }, + { + "epoch": 77.6923076923077, + "grad_norm": 0.42363202571868896, + "learning_rate": 5.318456114157239e-07, + "loss": 0.1043, + "num_input_tokens_seen": 26815392, + "step": 37370 + }, + { + "epoch": 77.70270270270271, + "grad_norm": 0.20475463569164276, + "learning_rate": 5.298332381111576e-07, + "loss": 0.0902, + "num_input_tokens_seen": 26818880, + "step": 37375 + }, + { + "epoch": 77.71309771309771, + "grad_norm": 0.7399296760559082, + "learning_rate": 5.27824638413818e-07, + "loss": 0.0808, + "num_input_tokens_seen": 26822592, + "step": 37380 + }, + { + "epoch": 77.72349272349273, + "grad_norm": 0.6833633780479431, + "learning_rate": 5.258198126334546e-07, + "loss": 0.0915, + "num_input_tokens_seen": 26826272, + "step": 37385 + }, + { + "epoch": 77.73388773388774, + "grad_norm": 0.2595307230949402, + "learning_rate": 5.238187610792367e-07, + "loss": 0.081, + "num_input_tokens_seen": 26829792, + "step": 37390 + }, + { + "epoch": 77.74428274428274, + "grad_norm": 0.22700782120227814, + "learning_rate": 5.218214840597563e-07, + "loss": 0.1393, + "num_input_tokens_seen": 26833440, + "step": 37395 + }, + { + "epoch": 77.75467775467776, + "grad_norm": 0.24833384156227112, + "learning_rate": 5.198279818830115e-07, + "loss": 0.1002, + "num_input_tokens_seen": 26837120, + "step": 37400 + }, + { + "epoch": 77.75467775467776, + "eval_loss": 0.14611418545246124, + "eval_runtime": 7.757, + "eval_samples_per_second": 110.352, + "eval_steps_per_second": 27.588, + "num_input_tokens_seen": 26837120, + "step": 37400 + }, + { + "epoch": 77.76507276507276, + "grad_norm": 0.9015591740608215, + "learning_rate": 5.178382548564287e-07, + "loss": 0.1078, + "num_input_tokens_seen": 26840800, + "step": 37405 + }, + { + "epoch": 77.77546777546777, + "grad_norm": 0.4266223609447479, + "learning_rate": 5.15852303286854e-07, + "loss": 0.1009, + "num_input_tokens_seen": 26844416, + "step": 37410 + }, + { + "epoch": 77.78586278586279, + "grad_norm": 0.258932888507843, + "learning_rate": 5.138701274805396e-07, + "loss": 0.0704, + "num_input_tokens_seen": 26848096, + "step": 37415 + }, + { + "epoch": 77.79625779625779, + "grad_norm": 0.5116473436355591, + "learning_rate": 5.118917277431606e-07, + "loss": 0.071, + "num_input_tokens_seen": 26851712, + "step": 37420 + }, + { + "epoch": 77.8066528066528, + "grad_norm": 0.21227958798408508, + "learning_rate": 5.099171043798145e-07, + "loss": 0.1068, + "num_input_tokens_seen": 26855232, + "step": 37425 + }, + { + "epoch": 77.81704781704782, + "grad_norm": 0.13576042652130127, + "learning_rate": 5.079462576950133e-07, + "loss": 0.0758, + "num_input_tokens_seen": 26858752, + "step": 37430 + }, + { + "epoch": 77.82744282744282, + "grad_norm": 0.26680704951286316, + "learning_rate": 5.059791879926862e-07, + "loss": 0.0879, + "num_input_tokens_seen": 26862336, + "step": 37435 + }, + { + "epoch": 77.83783783783784, + "grad_norm": 0.28408968448638916, + "learning_rate": 5.040158955761793e-07, + "loss": 0.1104, + "num_input_tokens_seen": 26865888, + "step": 37440 + }, + { + "epoch": 77.84823284823285, + "grad_norm": 0.2786036729812622, + "learning_rate": 5.020563807482559e-07, + "loss": 0.0943, + "num_input_tokens_seen": 26869632, + "step": 37445 + }, + { + "epoch": 77.85862785862786, + "grad_norm": 0.3690204322338104, + "learning_rate": 5.001006438110995e-07, + "loss": 0.1235, + "num_input_tokens_seen": 26873184, + "step": 37450 + }, + { + "epoch": 77.86902286902287, + "grad_norm": 0.21760354936122894, + "learning_rate": 4.981486850663075e-07, + "loss": 0.105, + "num_input_tokens_seen": 26876768, + "step": 37455 + }, + { + "epoch": 77.87941787941789, + "grad_norm": 0.1666523963212967, + "learning_rate": 4.962005048149005e-07, + "loss": 0.1022, + "num_input_tokens_seen": 26880416, + "step": 37460 + }, + { + "epoch": 77.88981288981289, + "grad_norm": 0.27986425161361694, + "learning_rate": 4.942561033573073e-07, + "loss": 0.0802, + "num_input_tokens_seen": 26883872, + "step": 37465 + }, + { + "epoch": 77.9002079002079, + "grad_norm": 0.2752316892147064, + "learning_rate": 4.923154809933827e-07, + "loss": 0.1149, + "num_input_tokens_seen": 26887360, + "step": 37470 + }, + { + "epoch": 77.9106029106029, + "grad_norm": 0.2305660992860794, + "learning_rate": 4.903786380223957e-07, + "loss": 0.0689, + "num_input_tokens_seen": 26891008, + "step": 37475 + }, + { + "epoch": 77.92099792099792, + "grad_norm": 0.19538304209709167, + "learning_rate": 4.884455747430266e-07, + "loss": 0.0824, + "num_input_tokens_seen": 26894400, + "step": 37480 + }, + { + "epoch": 77.93139293139293, + "grad_norm": 0.8081440925598145, + "learning_rate": 4.865162914533816e-07, + "loss": 0.1137, + "num_input_tokens_seen": 26897920, + "step": 37485 + }, + { + "epoch": 77.94178794178794, + "grad_norm": 0.23849549889564514, + "learning_rate": 4.845907884509809e-07, + "loss": 0.1117, + "num_input_tokens_seen": 26901568, + "step": 37490 + }, + { + "epoch": 77.95218295218295, + "grad_norm": 0.517026960849762, + "learning_rate": 4.82669066032762e-07, + "loss": 0.0692, + "num_input_tokens_seen": 26905216, + "step": 37495 + }, + { + "epoch": 77.96257796257797, + "grad_norm": 0.4599483013153076, + "learning_rate": 4.807511244950768e-07, + "loss": 0.1089, + "num_input_tokens_seen": 26908736, + "step": 37500 + }, + { + "epoch": 77.97297297297297, + "grad_norm": 0.7168390154838562, + "learning_rate": 4.788369641336943e-07, + "loss": 0.1199, + "num_input_tokens_seen": 26912288, + "step": 37505 + }, + { + "epoch": 77.98336798336798, + "grad_norm": 0.4320046901702881, + "learning_rate": 4.769265852438032e-07, + "loss": 0.1117, + "num_input_tokens_seen": 26915936, + "step": 37510 + }, + { + "epoch": 77.993762993763, + "grad_norm": 0.21693435311317444, + "learning_rate": 4.750199881200124e-07, + "loss": 0.0951, + "num_input_tokens_seen": 26919584, + "step": 37515 + }, + { + "epoch": 78.004158004158, + "grad_norm": 0.7721031904220581, + "learning_rate": 4.7311717305633664e-07, + "loss": 0.1392, + "num_input_tokens_seen": 26923192, + "step": 37520 + }, + { + "epoch": 78.01455301455302, + "grad_norm": 0.3765774369239807, + "learning_rate": 4.7121814034621623e-07, + "loss": 0.0779, + "num_input_tokens_seen": 26926808, + "step": 37525 + }, + { + "epoch": 78.02494802494803, + "grad_norm": 0.2802281379699707, + "learning_rate": 4.693228902825114e-07, + "loss": 0.077, + "num_input_tokens_seen": 26930584, + "step": 37530 + }, + { + "epoch": 78.03534303534303, + "grad_norm": 0.24251556396484375, + "learning_rate": 4.6743142315748277e-07, + "loss": 0.1391, + "num_input_tokens_seen": 26934104, + "step": 37535 + }, + { + "epoch": 78.04573804573805, + "grad_norm": 0.8450698256492615, + "learning_rate": 4.655437392628276e-07, + "loss": 0.1197, + "num_input_tokens_seen": 26937656, + "step": 37540 + }, + { + "epoch": 78.05613305613305, + "grad_norm": 0.2705097198486328, + "learning_rate": 4.636598388896463e-07, + "loss": 0.113, + "num_input_tokens_seen": 26941240, + "step": 37545 + }, + { + "epoch": 78.06652806652806, + "grad_norm": 0.20052854716777802, + "learning_rate": 4.6177972232845925e-07, + "loss": 0.0938, + "num_input_tokens_seen": 26944568, + "step": 37550 + }, + { + "epoch": 78.07692307692308, + "grad_norm": 0.27081334590911865, + "learning_rate": 4.5990338986920953e-07, + "loss": 0.1033, + "num_input_tokens_seen": 26948344, + "step": 37555 + }, + { + "epoch": 78.08731808731808, + "grad_norm": 0.31960275769233704, + "learning_rate": 4.5803084180124633e-07, + "loss": 0.0846, + "num_input_tokens_seen": 26951864, + "step": 37560 + }, + { + "epoch": 78.0977130977131, + "grad_norm": 1.18937087059021, + "learning_rate": 4.561620784133386e-07, + "loss": 0.1178, + "num_input_tokens_seen": 26955448, + "step": 37565 + }, + { + "epoch": 78.10810810810811, + "grad_norm": 0.33435311913490295, + "learning_rate": 4.5429709999367796e-07, + "loss": 0.0832, + "num_input_tokens_seen": 26958968, + "step": 37570 + }, + { + "epoch": 78.11850311850311, + "grad_norm": 0.30054548382759094, + "learning_rate": 4.5243590682986223e-07, + "loss": 0.0772, + "num_input_tokens_seen": 26962552, + "step": 37575 + }, + { + "epoch": 78.12889812889813, + "grad_norm": 0.5590063333511353, + "learning_rate": 4.5057849920891735e-07, + "loss": 0.0999, + "num_input_tokens_seen": 26966264, + "step": 37580 + }, + { + "epoch": 78.13929313929314, + "grad_norm": 0.3018188774585724, + "learning_rate": 4.487248774172698e-07, + "loss": 0.1435, + "num_input_tokens_seen": 26969880, + "step": 37585 + }, + { + "epoch": 78.14968814968815, + "grad_norm": 0.15456965565681458, + "learning_rate": 4.4687504174077965e-07, + "loss": 0.0801, + "num_input_tokens_seen": 26973304, + "step": 37590 + }, + { + "epoch": 78.16008316008316, + "grad_norm": 0.1369275599718094, + "learning_rate": 4.450289924647133e-07, + "loss": 0.0649, + "num_input_tokens_seen": 26976920, + "step": 37595 + }, + { + "epoch": 78.17047817047818, + "grad_norm": 0.15530547499656677, + "learning_rate": 4.431867298737513e-07, + "loss": 0.0781, + "num_input_tokens_seen": 26980600, + "step": 37600 + }, + { + "epoch": 78.17047817047818, + "eval_loss": 0.14511926472187042, + "eval_runtime": 7.7469, + "eval_samples_per_second": 110.496, + "eval_steps_per_second": 27.624, + "num_input_tokens_seen": 26980600, + "step": 37600 + }, + { + "epoch": 78.18087318087318, + "grad_norm": 0.11969944089651108, + "learning_rate": 4.41348254251997e-07, + "loss": 0.0903, + "num_input_tokens_seen": 26984216, + "step": 37605 + }, + { + "epoch": 78.1912681912682, + "grad_norm": 0.6022998690605164, + "learning_rate": 4.395135658829652e-07, + "loss": 0.0868, + "num_input_tokens_seen": 26987768, + "step": 37610 + }, + { + "epoch": 78.20166320166321, + "grad_norm": 0.13638654351234436, + "learning_rate": 4.376826650495852e-07, + "loss": 0.0781, + "num_input_tokens_seen": 26991416, + "step": 37615 + }, + { + "epoch": 78.21205821205821, + "grad_norm": 0.41457700729370117, + "learning_rate": 4.358555520342117e-07, + "loss": 0.0977, + "num_input_tokens_seen": 26995096, + "step": 37620 + }, + { + "epoch": 78.22245322245323, + "grad_norm": 0.14563971757888794, + "learning_rate": 4.3403222711860257e-07, + "loss": 0.0899, + "num_input_tokens_seen": 26998648, + "step": 37625 + }, + { + "epoch": 78.23284823284823, + "grad_norm": 0.6364639401435852, + "learning_rate": 4.3221269058394133e-07, + "loss": 0.1295, + "num_input_tokens_seen": 27002264, + "step": 37630 + }, + { + "epoch": 78.24324324324324, + "grad_norm": 0.427009254693985, + "learning_rate": 4.303969427108173e-07, + "loss": 0.0821, + "num_input_tokens_seen": 27005784, + "step": 37635 + }, + { + "epoch": 78.25363825363826, + "grad_norm": 0.2823529541492462, + "learning_rate": 4.2858498377924825e-07, + "loss": 0.1182, + "num_input_tokens_seen": 27009304, + "step": 37640 + }, + { + "epoch": 78.26403326403326, + "grad_norm": 0.17852865159511566, + "learning_rate": 4.267768140686579e-07, + "loss": 0.0921, + "num_input_tokens_seen": 27012760, + "step": 37645 + }, + { + "epoch": 78.27442827442827, + "grad_norm": 0.2873994708061218, + "learning_rate": 4.2497243385788975e-07, + "loss": 0.0867, + "num_input_tokens_seen": 27016344, + "step": 37650 + }, + { + "epoch": 78.28482328482329, + "grad_norm": 0.7201957702636719, + "learning_rate": 4.231718434251991e-07, + "loss": 0.1671, + "num_input_tokens_seen": 27019992, + "step": 37655 + }, + { + "epoch": 78.29521829521829, + "grad_norm": 0.8549251556396484, + "learning_rate": 4.213750430482666e-07, + "loss": 0.0961, + "num_input_tokens_seen": 27023544, + "step": 37660 + }, + { + "epoch": 78.3056133056133, + "grad_norm": 0.6141266822814941, + "learning_rate": 4.1958203300417054e-07, + "loss": 0.0918, + "num_input_tokens_seen": 27027128, + "step": 37665 + }, + { + "epoch": 78.31600831600832, + "grad_norm": 0.5349860787391663, + "learning_rate": 4.177928135694259e-07, + "loss": 0.102, + "num_input_tokens_seen": 27030680, + "step": 37670 + }, + { + "epoch": 78.32640332640332, + "grad_norm": 0.4476562440395355, + "learning_rate": 4.1600738501994807e-07, + "loss": 0.0933, + "num_input_tokens_seen": 27034360, + "step": 37675 + }, + { + "epoch": 78.33679833679834, + "grad_norm": 0.3383837342262268, + "learning_rate": 4.1422574763107237e-07, + "loss": 0.1236, + "num_input_tokens_seen": 27038136, + "step": 37680 + }, + { + "epoch": 78.34719334719335, + "grad_norm": 0.27304744720458984, + "learning_rate": 4.124479016775512e-07, + "loss": 0.0717, + "num_input_tokens_seen": 27041784, + "step": 37685 + }, + { + "epoch": 78.35758835758836, + "grad_norm": 0.24457089602947235, + "learning_rate": 4.106738474335514e-07, + "loss": 0.084, + "num_input_tokens_seen": 27045464, + "step": 37690 + }, + { + "epoch": 78.36798336798337, + "grad_norm": 0.5346478223800659, + "learning_rate": 4.089035851726486e-07, + "loss": 0.073, + "num_input_tokens_seen": 27049080, + "step": 37695 + }, + { + "epoch": 78.37837837837837, + "grad_norm": 0.2980842590332031, + "learning_rate": 4.0713711516784937e-07, + "loss": 0.1221, + "num_input_tokens_seen": 27052568, + "step": 37700 + }, + { + "epoch": 78.38877338877339, + "grad_norm": 0.5351454019546509, + "learning_rate": 4.05374437691558e-07, + "loss": 0.0948, + "num_input_tokens_seen": 27056120, + "step": 37705 + }, + { + "epoch": 78.3991683991684, + "grad_norm": 0.09405604004859924, + "learning_rate": 4.036155530156044e-07, + "loss": 0.0865, + "num_input_tokens_seen": 27059640, + "step": 37710 + }, + { + "epoch": 78.4095634095634, + "grad_norm": 0.26619675755500793, + "learning_rate": 4.018604614112298e-07, + "loss": 0.1033, + "num_input_tokens_seen": 27063352, + "step": 37715 + }, + { + "epoch": 78.41995841995842, + "grad_norm": 0.3989146947860718, + "learning_rate": 4.0010916314908996e-07, + "loss": 0.1082, + "num_input_tokens_seen": 27067064, + "step": 37720 + }, + { + "epoch": 78.43035343035343, + "grad_norm": 0.5378381609916687, + "learning_rate": 3.983616584992578e-07, + "loss": 0.0907, + "num_input_tokens_seen": 27070712, + "step": 37725 + }, + { + "epoch": 78.44074844074844, + "grad_norm": 0.2876034677028656, + "learning_rate": 3.9661794773122595e-07, + "loss": 0.0921, + "num_input_tokens_seen": 27074424, + "step": 37730 + }, + { + "epoch": 78.45114345114345, + "grad_norm": 0.2663249969482422, + "learning_rate": 3.9487803111388777e-07, + "loss": 0.0613, + "num_input_tokens_seen": 27077912, + "step": 37735 + }, + { + "epoch": 78.46153846153847, + "grad_norm": 0.27875643968582153, + "learning_rate": 3.9314190891556747e-07, + "loss": 0.0765, + "num_input_tokens_seen": 27081400, + "step": 37740 + }, + { + "epoch": 78.47193347193347, + "grad_norm": 0.4325042963027954, + "learning_rate": 3.914095814039925e-07, + "loss": 0.0677, + "num_input_tokens_seen": 27085112, + "step": 37745 + }, + { + "epoch": 78.48232848232848, + "grad_norm": 0.5259249210357666, + "learning_rate": 3.896810488463104e-07, + "loss": 0.0856, + "num_input_tokens_seen": 27088760, + "step": 37750 + }, + { + "epoch": 78.4927234927235, + "grad_norm": 0.3409915864467621, + "learning_rate": 3.8795631150908565e-07, + "loss": 0.0645, + "num_input_tokens_seen": 27092408, + "step": 37755 + }, + { + "epoch": 78.5031185031185, + "grad_norm": 0.4670361280441284, + "learning_rate": 3.862353696582888e-07, + "loss": 0.099, + "num_input_tokens_seen": 27095992, + "step": 37760 + }, + { + "epoch": 78.51351351351352, + "grad_norm": 0.40130046010017395, + "learning_rate": 3.8451822355931313e-07, + "loss": 0.0738, + "num_input_tokens_seen": 27099512, + "step": 37765 + }, + { + "epoch": 78.52390852390852, + "grad_norm": 0.3926207423210144, + "learning_rate": 3.82804873476969e-07, + "loss": 0.0573, + "num_input_tokens_seen": 27103032, + "step": 37770 + }, + { + "epoch": 78.53430353430353, + "grad_norm": 0.3649664521217346, + "learning_rate": 3.810953196754702e-07, + "loss": 0.1467, + "num_input_tokens_seen": 27106808, + "step": 37775 + }, + { + "epoch": 78.54469854469855, + "grad_norm": 0.21031031012535095, + "learning_rate": 3.793895624184529e-07, + "loss": 0.09, + "num_input_tokens_seen": 27110360, + "step": 37780 + }, + { + "epoch": 78.55509355509355, + "grad_norm": 0.14673371613025665, + "learning_rate": 3.776876019689679e-07, + "loss": 0.079, + "num_input_tokens_seen": 27114072, + "step": 37785 + }, + { + "epoch": 78.56548856548856, + "grad_norm": 0.36762329936027527, + "learning_rate": 3.7598943858947743e-07, + "loss": 0.0906, + "num_input_tokens_seen": 27117656, + "step": 37790 + }, + { + "epoch": 78.57588357588358, + "grad_norm": 0.3039165139198303, + "learning_rate": 3.742950725418637e-07, + "loss": 0.0667, + "num_input_tokens_seen": 27121304, + "step": 37795 + }, + { + "epoch": 78.58627858627858, + "grad_norm": 0.28917616605758667, + "learning_rate": 3.726045040874093e-07, + "loss": 0.0805, + "num_input_tokens_seen": 27124888, + "step": 37800 + }, + { + "epoch": 78.58627858627858, + "eval_loss": 0.1449492871761322, + "eval_runtime": 7.747, + "eval_samples_per_second": 110.494, + "eval_steps_per_second": 27.624, + "num_input_tokens_seen": 27124888, + "step": 37800 + }, + { + "epoch": 78.5966735966736, + "grad_norm": 0.24103252589702606, + "learning_rate": 3.709177334868308e-07, + "loss": 0.0713, + "num_input_tokens_seen": 27128376, + "step": 37805 + }, + { + "epoch": 78.60706860706861, + "grad_norm": 0.6518802642822266, + "learning_rate": 3.692347610002478e-07, + "loss": 0.1125, + "num_input_tokens_seen": 27131832, + "step": 37810 + }, + { + "epoch": 78.61746361746361, + "grad_norm": 0.16264818608760834, + "learning_rate": 3.675555868871916e-07, + "loss": 0.118, + "num_input_tokens_seen": 27135480, + "step": 37815 + }, + { + "epoch": 78.62785862785863, + "grad_norm": 0.12374056130647659, + "learning_rate": 3.658802114066162e-07, + "loss": 0.0429, + "num_input_tokens_seen": 27138904, + "step": 37820 + }, + { + "epoch": 78.63825363825364, + "grad_norm": 0.3989786207675934, + "learning_rate": 3.6420863481688437e-07, + "loss": 0.1062, + "num_input_tokens_seen": 27142456, + "step": 37825 + }, + { + "epoch": 78.64864864864865, + "grad_norm": 0.4998352527618408, + "learning_rate": 3.625408573757705e-07, + "loss": 0.0979, + "num_input_tokens_seen": 27145880, + "step": 37830 + }, + { + "epoch": 78.65904365904366, + "grad_norm": 0.33954450488090515, + "learning_rate": 3.608768793404743e-07, + "loss": 0.0759, + "num_input_tokens_seen": 27149400, + "step": 37835 + }, + { + "epoch": 78.66943866943868, + "grad_norm": 0.5041031837463379, + "learning_rate": 3.592167009675934e-07, + "loss": 0.0784, + "num_input_tokens_seen": 27152920, + "step": 37840 + }, + { + "epoch": 78.67983367983368, + "grad_norm": 0.5818221569061279, + "learning_rate": 3.575603225131563e-07, + "loss": 0.0797, + "num_input_tokens_seen": 27156312, + "step": 37845 + }, + { + "epoch": 78.6902286902287, + "grad_norm": 0.5746005177497864, + "learning_rate": 3.55907744232592e-07, + "loss": 0.0928, + "num_input_tokens_seen": 27159832, + "step": 37850 + }, + { + "epoch": 78.7006237006237, + "grad_norm": 0.2833557724952698, + "learning_rate": 3.5425896638075217e-07, + "loss": 0.0749, + "num_input_tokens_seen": 27163320, + "step": 37855 + }, + { + "epoch": 78.71101871101871, + "grad_norm": 0.3388914465904236, + "learning_rate": 3.5261398921189736e-07, + "loss": 0.109, + "num_input_tokens_seen": 27166744, + "step": 37860 + }, + { + "epoch": 78.72141372141373, + "grad_norm": 0.6223244667053223, + "learning_rate": 3.509728129797024e-07, + "loss": 0.105, + "num_input_tokens_seen": 27170136, + "step": 37865 + }, + { + "epoch": 78.73180873180873, + "grad_norm": 0.2916879653930664, + "learning_rate": 3.4933543793725656e-07, + "loss": 0.0801, + "num_input_tokens_seen": 27173624, + "step": 37870 + }, + { + "epoch": 78.74220374220374, + "grad_norm": 0.3403295874595642, + "learning_rate": 3.4770186433707163e-07, + "loss": 0.1061, + "num_input_tokens_seen": 27177112, + "step": 37875 + }, + { + "epoch": 78.75259875259876, + "grad_norm": 0.14828315377235413, + "learning_rate": 3.4607209243105453e-07, + "loss": 0.1001, + "num_input_tokens_seen": 27180728, + "step": 37880 + }, + { + "epoch": 78.76299376299376, + "grad_norm": 0.3151398003101349, + "learning_rate": 3.444461224705431e-07, + "loss": 0.1253, + "num_input_tokens_seen": 27184248, + "step": 37885 + }, + { + "epoch": 78.77338877338877, + "grad_norm": 0.17504224181175232, + "learning_rate": 3.4282395470628116e-07, + "loss": 0.1013, + "num_input_tokens_seen": 27187672, + "step": 37890 + }, + { + "epoch": 78.78378378378379, + "grad_norm": 0.3311765193939209, + "learning_rate": 3.4120558938842417e-07, + "loss": 0.088, + "num_input_tokens_seen": 27191096, + "step": 37895 + }, + { + "epoch": 78.79417879417879, + "grad_norm": 0.9765014052391052, + "learning_rate": 3.395910267665503e-07, + "loss": 0.1336, + "num_input_tokens_seen": 27194488, + "step": 37900 + }, + { + "epoch": 78.8045738045738, + "grad_norm": 0.43427303433418274, + "learning_rate": 3.3798026708964094e-07, + "loss": 0.078, + "num_input_tokens_seen": 27197976, + "step": 37905 + }, + { + "epoch": 78.81496881496882, + "grad_norm": 0.16772696375846863, + "learning_rate": 3.3637331060609456e-07, + "loss": 0.0956, + "num_input_tokens_seen": 27201592, + "step": 37910 + }, + { + "epoch": 78.82536382536382, + "grad_norm": 1.0628143548965454, + "learning_rate": 3.3477015756372966e-07, + "loss": 0.0969, + "num_input_tokens_seen": 27205176, + "step": 37915 + }, + { + "epoch": 78.83575883575884, + "grad_norm": 0.20686504244804382, + "learning_rate": 3.3317080820976785e-07, + "loss": 0.0614, + "num_input_tokens_seen": 27208792, + "step": 37920 + }, + { + "epoch": 78.84615384615384, + "grad_norm": 0.4276244342327118, + "learning_rate": 3.315752627908508e-07, + "loss": 0.1027, + "num_input_tokens_seen": 27212408, + "step": 37925 + }, + { + "epoch": 78.85654885654886, + "grad_norm": 0.329751193523407, + "learning_rate": 3.299835215530317e-07, + "loss": 0.0757, + "num_input_tokens_seen": 27215864, + "step": 37930 + }, + { + "epoch": 78.86694386694387, + "grad_norm": 0.6246451139450073, + "learning_rate": 3.2839558474177245e-07, + "loss": 0.1475, + "num_input_tokens_seen": 27219544, + "step": 37935 + }, + { + "epoch": 78.87733887733887, + "grad_norm": 0.32381176948547363, + "learning_rate": 3.2681145260196056e-07, + "loss": 0.141, + "num_input_tokens_seen": 27223192, + "step": 37940 + }, + { + "epoch": 78.88773388773389, + "grad_norm": 0.6646823287010193, + "learning_rate": 3.252311253778839e-07, + "loss": 0.1762, + "num_input_tokens_seen": 27226936, + "step": 37945 + }, + { + "epoch": 78.8981288981289, + "grad_norm": 0.6290807723999023, + "learning_rate": 3.2365460331325034e-07, + "loss": 0.1319, + "num_input_tokens_seen": 27230456, + "step": 37950 + }, + { + "epoch": 78.9085239085239, + "grad_norm": 0.23732821643352509, + "learning_rate": 3.2208188665117934e-07, + "loss": 0.0809, + "num_input_tokens_seen": 27234232, + "step": 37955 + }, + { + "epoch": 78.91891891891892, + "grad_norm": 0.11552031338214874, + "learning_rate": 3.205129756342018e-07, + "loss": 0.1037, + "num_input_tokens_seen": 27238104, + "step": 37960 + }, + { + "epoch": 78.92931392931393, + "grad_norm": 0.18125155568122864, + "learning_rate": 3.189478705042659e-07, + "loss": 0.0761, + "num_input_tokens_seen": 27241784, + "step": 37965 + }, + { + "epoch": 78.93970893970894, + "grad_norm": 0.5485876798629761, + "learning_rate": 3.173865715027341e-07, + "loss": 0.1032, + "num_input_tokens_seen": 27245304, + "step": 37970 + }, + { + "epoch": 78.95010395010395, + "grad_norm": 0.3556971848011017, + "learning_rate": 3.158290788703694e-07, + "loss": 0.1526, + "num_input_tokens_seen": 27248920, + "step": 37975 + }, + { + "epoch": 78.96049896049897, + "grad_norm": 0.5053374767303467, + "learning_rate": 3.1427539284736297e-07, + "loss": 0.1211, + "num_input_tokens_seen": 27252632, + "step": 37980 + }, + { + "epoch": 78.97089397089397, + "grad_norm": 0.3828715682029724, + "learning_rate": 3.127255136733093e-07, + "loss": 0.1201, + "num_input_tokens_seen": 27256248, + "step": 37985 + }, + { + "epoch": 78.98128898128898, + "grad_norm": 0.15709376335144043, + "learning_rate": 3.1117944158722544e-07, + "loss": 0.0852, + "num_input_tokens_seen": 27259768, + "step": 37990 + }, + { + "epoch": 78.99168399168398, + "grad_norm": 0.2642946243286133, + "learning_rate": 3.0963717682752635e-07, + "loss": 0.1283, + "num_input_tokens_seen": 27263320, + "step": 37995 + }, + { + "epoch": 79.002079002079, + "grad_norm": 0.4554597735404968, + "learning_rate": 3.080987196320578e-07, + "loss": 0.0633, + "num_input_tokens_seen": 27266800, + "step": 38000 + }, + { + "epoch": 79.002079002079, + "eval_loss": 0.14430324733257294, + "eval_runtime": 7.752, + "eval_samples_per_second": 110.423, + "eval_steps_per_second": 27.606, + "num_input_tokens_seen": 27266800, + "step": 38000 + }, + { + "epoch": 79.01247401247402, + "grad_norm": 0.28768572211265564, + "learning_rate": 3.065640702380607e-07, + "loss": 0.1201, + "num_input_tokens_seen": 27270448, + "step": 38005 + }, + { + "epoch": 79.02286902286902, + "grad_norm": 0.40571653842926025, + "learning_rate": 3.050332288822011e-07, + "loss": 0.0659, + "num_input_tokens_seen": 27273904, + "step": 38010 + }, + { + "epoch": 79.03326403326403, + "grad_norm": 0.7185207605361938, + "learning_rate": 3.035061958005542e-07, + "loss": 0.1185, + "num_input_tokens_seen": 27277520, + "step": 38015 + }, + { + "epoch": 79.04365904365905, + "grad_norm": 0.1972164362668991, + "learning_rate": 3.019829712286093e-07, + "loss": 0.0781, + "num_input_tokens_seen": 27281072, + "step": 38020 + }, + { + "epoch": 79.05405405405405, + "grad_norm": 0.21329733729362488, + "learning_rate": 3.004635554012647e-07, + "loss": 0.0627, + "num_input_tokens_seen": 27284688, + "step": 38025 + }, + { + "epoch": 79.06444906444906, + "grad_norm": 0.23962156474590302, + "learning_rate": 2.9894794855283017e-07, + "loss": 0.1226, + "num_input_tokens_seen": 27288336, + "step": 38030 + }, + { + "epoch": 79.07484407484408, + "grad_norm": 0.4936073422431946, + "learning_rate": 2.9743615091703816e-07, + "loss": 0.1108, + "num_input_tokens_seen": 27291920, + "step": 38035 + }, + { + "epoch": 79.08523908523908, + "grad_norm": 0.47024503350257874, + "learning_rate": 2.959281627270216e-07, + "loss": 0.1142, + "num_input_tokens_seen": 27295440, + "step": 38040 + }, + { + "epoch": 79.0956340956341, + "grad_norm": 0.3761875033378601, + "learning_rate": 2.944239842153362e-07, + "loss": 0.1125, + "num_input_tokens_seen": 27298928, + "step": 38045 + }, + { + "epoch": 79.10602910602911, + "grad_norm": 1.460629940032959, + "learning_rate": 2.929236156139381e-07, + "loss": 0.1373, + "num_input_tokens_seen": 27302576, + "step": 38050 + }, + { + "epoch": 79.11642411642411, + "grad_norm": 0.21899613738059998, + "learning_rate": 2.9142705715420883e-07, + "loss": 0.0641, + "num_input_tokens_seen": 27306032, + "step": 38055 + }, + { + "epoch": 79.12681912681913, + "grad_norm": 0.5127174258232117, + "learning_rate": 2.8993430906693595e-07, + "loss": 0.086, + "num_input_tokens_seen": 27309808, + "step": 38060 + }, + { + "epoch": 79.13721413721414, + "grad_norm": 0.43166691064834595, + "learning_rate": 2.88445371582316e-07, + "loss": 0.1278, + "num_input_tokens_seen": 27313552, + "step": 38065 + }, + { + "epoch": 79.14760914760915, + "grad_norm": 0.36935296654701233, + "learning_rate": 2.8696024492996796e-07, + "loss": 0.1422, + "num_input_tokens_seen": 27317168, + "step": 38070 + }, + { + "epoch": 79.15800415800416, + "grad_norm": 0.4576612114906311, + "learning_rate": 2.854789293389115e-07, + "loss": 0.0875, + "num_input_tokens_seen": 27320912, + "step": 38075 + }, + { + "epoch": 79.16839916839916, + "grad_norm": 0.1480746567249298, + "learning_rate": 2.8400142503758606e-07, + "loss": 0.0777, + "num_input_tokens_seen": 27324592, + "step": 38080 + }, + { + "epoch": 79.17879417879418, + "grad_norm": 0.9702324867248535, + "learning_rate": 2.8252773225384276e-07, + "loss": 0.104, + "num_input_tokens_seen": 27328208, + "step": 38085 + }, + { + "epoch": 79.1891891891892, + "grad_norm": 0.7068156003952026, + "learning_rate": 2.8105785121494143e-07, + "loss": 0.1046, + "num_input_tokens_seen": 27331728, + "step": 38090 + }, + { + "epoch": 79.1995841995842, + "grad_norm": 0.7044373750686646, + "learning_rate": 2.795917821475563e-07, + "loss": 0.0866, + "num_input_tokens_seen": 27335440, + "step": 38095 + }, + { + "epoch": 79.20997920997921, + "grad_norm": 0.2580364942550659, + "learning_rate": 2.78129525277776e-07, + "loss": 0.0728, + "num_input_tokens_seen": 27338832, + "step": 38100 + }, + { + "epoch": 79.22037422037423, + "grad_norm": 0.4308638274669647, + "learning_rate": 2.766710808310952e-07, + "loss": 0.0934, + "num_input_tokens_seen": 27342416, + "step": 38105 + }, + { + "epoch": 79.23076923076923, + "grad_norm": 0.3295705020427704, + "learning_rate": 2.7521644903242827e-07, + "loss": 0.1193, + "num_input_tokens_seen": 27346064, + "step": 38110 + }, + { + "epoch": 79.24116424116424, + "grad_norm": 0.16944892704486847, + "learning_rate": 2.7376563010609593e-07, + "loss": 0.0628, + "num_input_tokens_seen": 27349552, + "step": 38115 + }, + { + "epoch": 79.25155925155926, + "grad_norm": 0.16027136147022247, + "learning_rate": 2.72318624275833e-07, + "loss": 0.0784, + "num_input_tokens_seen": 27352944, + "step": 38120 + }, + { + "epoch": 79.26195426195426, + "grad_norm": 0.2726125717163086, + "learning_rate": 2.7087543176478324e-07, + "loss": 0.1238, + "num_input_tokens_seen": 27356496, + "step": 38125 + }, + { + "epoch": 79.27234927234927, + "grad_norm": 0.21381700038909912, + "learning_rate": 2.694360527955103e-07, + "loss": 0.0845, + "num_input_tokens_seen": 27360240, + "step": 38130 + }, + { + "epoch": 79.28274428274429, + "grad_norm": 0.7202509045600891, + "learning_rate": 2.680004875899811e-07, + "loss": 0.1466, + "num_input_tokens_seen": 27363792, + "step": 38135 + }, + { + "epoch": 79.29313929313929, + "grad_norm": 0.16357380151748657, + "learning_rate": 2.665687363695768e-07, + "loss": 0.095, + "num_input_tokens_seen": 27367408, + "step": 38140 + }, + { + "epoch": 79.3035343035343, + "grad_norm": 0.20175307989120483, + "learning_rate": 2.6514079935509584e-07, + "loss": 0.0705, + "num_input_tokens_seen": 27371056, + "step": 38145 + }, + { + "epoch": 79.31392931392931, + "grad_norm": 0.7526810169219971, + "learning_rate": 2.6371667676673983e-07, + "loss": 0.0783, + "num_input_tokens_seen": 27374832, + "step": 38150 + }, + { + "epoch": 79.32432432432432, + "grad_norm": 0.2462114542722702, + "learning_rate": 2.6229636882412755e-07, + "loss": 0.0971, + "num_input_tokens_seen": 27378480, + "step": 38155 + }, + { + "epoch": 79.33471933471934, + "grad_norm": 0.5326302647590637, + "learning_rate": 2.6087987574628935e-07, + "loss": 0.0926, + "num_input_tokens_seen": 27382096, + "step": 38160 + }, + { + "epoch": 79.34511434511434, + "grad_norm": 0.4757959842681885, + "learning_rate": 2.5946719775166437e-07, + "loss": 0.1168, + "num_input_tokens_seen": 27385776, + "step": 38165 + }, + { + "epoch": 79.35550935550935, + "grad_norm": 0.12682996690273285, + "learning_rate": 2.5805833505810616e-07, + "loss": 0.0545, + "num_input_tokens_seen": 27389424, + "step": 38170 + }, + { + "epoch": 79.36590436590437, + "grad_norm": 0.2496299296617508, + "learning_rate": 2.566532878828798e-07, + "loss": 0.0928, + "num_input_tokens_seen": 27392944, + "step": 38175 + }, + { + "epoch": 79.37629937629937, + "grad_norm": 0.166802778840065, + "learning_rate": 2.552520564426619e-07, + "loss": 0.0805, + "num_input_tokens_seen": 27396400, + "step": 38180 + }, + { + "epoch": 79.38669438669439, + "grad_norm": 1.146201252937317, + "learning_rate": 2.5385464095353803e-07, + "loss": 0.1031, + "num_input_tokens_seen": 27399984, + "step": 38185 + }, + { + "epoch": 79.3970893970894, + "grad_norm": 0.45429757237434387, + "learning_rate": 2.5246104163100804e-07, + "loss": 0.0922, + "num_input_tokens_seen": 27403568, + "step": 38190 + }, + { + "epoch": 79.4074844074844, + "grad_norm": 0.27032962441444397, + "learning_rate": 2.510712586899833e-07, + "loss": 0.097, + "num_input_tokens_seen": 27407280, + "step": 38195 + }, + { + "epoch": 79.41787941787942, + "grad_norm": 0.1820320338010788, + "learning_rate": 2.4968529234478124e-07, + "loss": 0.089, + "num_input_tokens_seen": 27410736, + "step": 38200 + }, + { + "epoch": 79.41787941787942, + "eval_loss": 0.14530903100967407, + "eval_runtime": 7.7546, + "eval_samples_per_second": 110.386, + "eval_steps_per_second": 27.596, + "num_input_tokens_seen": 27410736, + "step": 38200 + }, + { + "epoch": 79.42827442827443, + "grad_norm": 0.17399372160434723, + "learning_rate": 2.483031428091448e-07, + "loss": 0.0675, + "num_input_tokens_seen": 27414352, + "step": 38205 + }, + { + "epoch": 79.43866943866944, + "grad_norm": 0.6018508076667786, + "learning_rate": 2.469248102962091e-07, + "loss": 0.1336, + "num_input_tokens_seen": 27418032, + "step": 38210 + }, + { + "epoch": 79.44906444906445, + "grad_norm": 0.5333465337753296, + "learning_rate": 2.4555029501853455e-07, + "loss": 0.0663, + "num_input_tokens_seen": 27421712, + "step": 38215 + }, + { + "epoch": 79.45945945945945, + "grad_norm": 0.5369693040847778, + "learning_rate": 2.441795971880906e-07, + "loss": 0.0962, + "num_input_tokens_seen": 27425136, + "step": 38220 + }, + { + "epoch": 79.46985446985447, + "grad_norm": 0.6102011203765869, + "learning_rate": 2.4281271701625255e-07, + "loss": 0.0616, + "num_input_tokens_seen": 27428528, + "step": 38225 + }, + { + "epoch": 79.48024948024948, + "grad_norm": 0.2710172235965729, + "learning_rate": 2.4144965471381007e-07, + "loss": 0.0961, + "num_input_tokens_seen": 27432048, + "step": 38230 + }, + { + "epoch": 79.49064449064448, + "grad_norm": 0.19991609454154968, + "learning_rate": 2.400904104909674e-07, + "loss": 0.0656, + "num_input_tokens_seen": 27435728, + "step": 38235 + }, + { + "epoch": 79.5010395010395, + "grad_norm": 0.15657784044742584, + "learning_rate": 2.3873498455733725e-07, + "loss": 0.073, + "num_input_tokens_seen": 27439152, + "step": 38240 + }, + { + "epoch": 79.51143451143452, + "grad_norm": 0.16608832776546478, + "learning_rate": 2.3738337712194137e-07, + "loss": 0.0569, + "num_input_tokens_seen": 27442576, + "step": 38245 + }, + { + "epoch": 79.52182952182952, + "grad_norm": 0.29614293575286865, + "learning_rate": 2.3603558839321305e-07, + "loss": 0.078, + "num_input_tokens_seen": 27446192, + "step": 38250 + }, + { + "epoch": 79.53222453222453, + "grad_norm": 0.2686270773410797, + "learning_rate": 2.3469161857900267e-07, + "loss": 0.0693, + "num_input_tokens_seen": 27449840, + "step": 38255 + }, + { + "epoch": 79.54261954261955, + "grad_norm": 0.6388294696807861, + "learning_rate": 2.3335146788656393e-07, + "loss": 0.1182, + "num_input_tokens_seen": 27453360, + "step": 38260 + }, + { + "epoch": 79.55301455301455, + "grad_norm": 0.5429732799530029, + "learning_rate": 2.3201513652256757e-07, + "loss": 0.1188, + "num_input_tokens_seen": 27456944, + "step": 38265 + }, + { + "epoch": 79.56340956340956, + "grad_norm": 0.5609973669052124, + "learning_rate": 2.3068262469308766e-07, + "loss": 0.0933, + "num_input_tokens_seen": 27460560, + "step": 38270 + }, + { + "epoch": 79.57380457380458, + "grad_norm": 0.2756035029888153, + "learning_rate": 2.2935393260362093e-07, + "loss": 0.0651, + "num_input_tokens_seen": 27464048, + "step": 38275 + }, + { + "epoch": 79.58419958419958, + "grad_norm": 0.5322806239128113, + "learning_rate": 2.2802906045906458e-07, + "loss": 0.0901, + "num_input_tokens_seen": 27467504, + "step": 38280 + }, + { + "epoch": 79.5945945945946, + "grad_norm": 0.16356681287288666, + "learning_rate": 2.2670800846373018e-07, + "loss": 0.0745, + "num_input_tokens_seen": 27470928, + "step": 38285 + }, + { + "epoch": 79.60498960498961, + "grad_norm": 0.3968310058116913, + "learning_rate": 2.2539077682134367e-07, + "loss": 0.1204, + "num_input_tokens_seen": 27474672, + "step": 38290 + }, + { + "epoch": 79.61538461538461, + "grad_norm": 0.21003229916095734, + "learning_rate": 2.2407736573503423e-07, + "loss": 0.0935, + "num_input_tokens_seen": 27478192, + "step": 38295 + }, + { + "epoch": 79.62577962577963, + "grad_norm": 1.2931793928146362, + "learning_rate": 2.2276777540735093e-07, + "loss": 0.1175, + "num_input_tokens_seen": 27481744, + "step": 38300 + }, + { + "epoch": 79.63617463617463, + "grad_norm": 0.4224456548690796, + "learning_rate": 2.2146200604024613e-07, + "loss": 0.1104, + "num_input_tokens_seen": 27485392, + "step": 38305 + }, + { + "epoch": 79.64656964656965, + "grad_norm": 0.2863461375236511, + "learning_rate": 2.2016005783508375e-07, + "loss": 0.1343, + "num_input_tokens_seen": 27489040, + "step": 38310 + }, + { + "epoch": 79.65696465696466, + "grad_norm": 0.302306592464447, + "learning_rate": 2.1886193099264763e-07, + "loss": 0.135, + "num_input_tokens_seen": 27492784, + "step": 38315 + }, + { + "epoch": 79.66735966735966, + "grad_norm": 0.37440556287765503, + "learning_rate": 2.175676257131165e-07, + "loss": 0.1179, + "num_input_tokens_seen": 27496400, + "step": 38320 + }, + { + "epoch": 79.67775467775468, + "grad_norm": 0.6858709454536438, + "learning_rate": 2.162771421960974e-07, + "loss": 0.0724, + "num_input_tokens_seen": 27499984, + "step": 38325 + }, + { + "epoch": 79.6881496881497, + "grad_norm": 0.39335885643959045, + "learning_rate": 2.1499048064059224e-07, + "loss": 0.0726, + "num_input_tokens_seen": 27503472, + "step": 38330 + }, + { + "epoch": 79.6985446985447, + "grad_norm": 0.35493940114974976, + "learning_rate": 2.1370764124502285e-07, + "loss": 0.1264, + "num_input_tokens_seen": 27507024, + "step": 38335 + }, + { + "epoch": 79.70893970893971, + "grad_norm": 0.14001023769378662, + "learning_rate": 2.1242862420721988e-07, + "loss": 0.0898, + "num_input_tokens_seen": 27510416, + "step": 38340 + }, + { + "epoch": 79.71933471933473, + "grad_norm": 0.34336408972740173, + "learning_rate": 2.1115342972442276e-07, + "loss": 0.1034, + "num_input_tokens_seen": 27513904, + "step": 38345 + }, + { + "epoch": 79.72972972972973, + "grad_norm": 0.29061102867126465, + "learning_rate": 2.0988205799328252e-07, + "loss": 0.0843, + "num_input_tokens_seen": 27517488, + "step": 38350 + }, + { + "epoch": 79.74012474012474, + "grad_norm": 1.6351677179336548, + "learning_rate": 2.0861450920986182e-07, + "loss": 0.1535, + "num_input_tokens_seen": 27521008, + "step": 38355 + }, + { + "epoch": 79.75051975051976, + "grad_norm": 0.43741536140441895, + "learning_rate": 2.07350783569632e-07, + "loss": 0.104, + "num_input_tokens_seen": 27524592, + "step": 38360 + }, + { + "epoch": 79.76091476091476, + "grad_norm": 0.2928277254104614, + "learning_rate": 2.060908812674761e-07, + "loss": 0.0671, + "num_input_tokens_seen": 27528304, + "step": 38365 + }, + { + "epoch": 79.77130977130977, + "grad_norm": 0.42414888739585876, + "learning_rate": 2.0483480249768317e-07, + "loss": 0.107, + "num_input_tokens_seen": 27531824, + "step": 38370 + }, + { + "epoch": 79.78170478170478, + "grad_norm": 0.4147961735725403, + "learning_rate": 2.035825474539621e-07, + "loss": 0.1049, + "num_input_tokens_seen": 27535504, + "step": 38375 + }, + { + "epoch": 79.79209979209979, + "grad_norm": 0.27665257453918457, + "learning_rate": 2.0233411632942235e-07, + "loss": 0.0819, + "num_input_tokens_seen": 27539152, + "step": 38380 + }, + { + "epoch": 79.8024948024948, + "grad_norm": 0.8783530592918396, + "learning_rate": 2.0108950931658764e-07, + "loss": 0.0968, + "num_input_tokens_seen": 27542736, + "step": 38385 + }, + { + "epoch": 79.81288981288981, + "grad_norm": 0.5119356513023376, + "learning_rate": 1.998487266073934e-07, + "loss": 0.0781, + "num_input_tokens_seen": 27546256, + "step": 38390 + }, + { + "epoch": 79.82328482328482, + "grad_norm": 0.2722371518611908, + "learning_rate": 1.986117683931865e-07, + "loss": 0.093, + "num_input_tokens_seen": 27549808, + "step": 38395 + }, + { + "epoch": 79.83367983367984, + "grad_norm": 0.37052157521247864, + "learning_rate": 1.9737863486471442e-07, + "loss": 0.1174, + "num_input_tokens_seen": 27553360, + "step": 38400 + }, + { + "epoch": 79.83367983367984, + "eval_loss": 0.1454760581254959, + "eval_runtime": 7.7498, + "eval_samples_per_second": 110.454, + "eval_steps_per_second": 27.614, + "num_input_tokens_seen": 27553360, + "step": 38400 + }, + { + "epoch": 79.84407484407484, + "grad_norm": 0.5260717272758484, + "learning_rate": 1.9614932621215e-07, + "loss": 0.0676, + "num_input_tokens_seen": 27556912, + "step": 38405 + }, + { + "epoch": 79.85446985446985, + "grad_norm": 0.38466691970825195, + "learning_rate": 1.9492384262506102e-07, + "loss": 0.0605, + "num_input_tokens_seen": 27560432, + "step": 38410 + }, + { + "epoch": 79.86486486486487, + "grad_norm": 0.2375497668981552, + "learning_rate": 1.9370218429243524e-07, + "loss": 0.1007, + "num_input_tokens_seen": 27563920, + "step": 38415 + }, + { + "epoch": 79.87525987525987, + "grad_norm": 0.26773741841316223, + "learning_rate": 1.9248435140267197e-07, + "loss": 0.089, + "num_input_tokens_seen": 27567440, + "step": 38420 + }, + { + "epoch": 79.88565488565489, + "grad_norm": 0.4798840880393982, + "learning_rate": 1.9127034414356814e-07, + "loss": 0.1109, + "num_input_tokens_seen": 27570992, + "step": 38425 + }, + { + "epoch": 79.8960498960499, + "grad_norm": 0.4379561245441437, + "learning_rate": 1.9006016270234627e-07, + "loss": 0.106, + "num_input_tokens_seen": 27574640, + "step": 38430 + }, + { + "epoch": 79.9064449064449, + "grad_norm": 0.5678796768188477, + "learning_rate": 1.888538072656293e-07, + "loss": 0.1495, + "num_input_tokens_seen": 27578128, + "step": 38435 + }, + { + "epoch": 79.91683991683992, + "grad_norm": 0.30579674243927, + "learning_rate": 1.8765127801944893e-07, + "loss": 0.0769, + "num_input_tokens_seen": 27581648, + "step": 38440 + }, + { + "epoch": 79.92723492723492, + "grad_norm": 0.8296374678611755, + "learning_rate": 1.8645257514925406e-07, + "loss": 0.1547, + "num_input_tokens_seen": 27585328, + "step": 38445 + }, + { + "epoch": 79.93762993762994, + "grad_norm": 0.7377525568008423, + "learning_rate": 1.8525769883989685e-07, + "loss": 0.129, + "num_input_tokens_seen": 27588912, + "step": 38450 + }, + { + "epoch": 79.94802494802495, + "grad_norm": 0.2630292475223541, + "learning_rate": 1.8406664927564654e-07, + "loss": 0.0936, + "num_input_tokens_seen": 27592368, + "step": 38455 + }, + { + "epoch": 79.95841995841995, + "grad_norm": 0.3196808099746704, + "learning_rate": 1.8287942664017566e-07, + "loss": 0.1089, + "num_input_tokens_seen": 27595824, + "step": 38460 + }, + { + "epoch": 79.96881496881497, + "grad_norm": 0.3517638146877289, + "learning_rate": 1.8169603111656552e-07, + "loss": 0.1001, + "num_input_tokens_seen": 27599408, + "step": 38465 + }, + { + "epoch": 79.97920997920998, + "grad_norm": 0.7540037035942078, + "learning_rate": 1.805164628873146e-07, + "loss": 0.0978, + "num_input_tokens_seen": 27603088, + "step": 38470 + }, + { + "epoch": 79.98960498960498, + "grad_norm": 0.21193787455558777, + "learning_rate": 1.793407221343274e-07, + "loss": 0.0589, + "num_input_tokens_seen": 27606704, + "step": 38475 + }, + { + "epoch": 80.0, + "grad_norm": 0.4045364260673523, + "learning_rate": 1.781688090389172e-07, + "loss": 0.1464, + "num_input_tokens_seen": 27610432, + "step": 38480 + }, + { + "epoch": 80.01039501039502, + "grad_norm": 0.2932051718235016, + "learning_rate": 1.770007237818061e-07, + "loss": 0.1062, + "num_input_tokens_seen": 27613952, + "step": 38485 + }, + { + "epoch": 80.02079002079002, + "grad_norm": 0.3180784583091736, + "learning_rate": 1.7583646654313059e-07, + "loss": 0.1104, + "num_input_tokens_seen": 27617632, + "step": 38490 + }, + { + "epoch": 80.03118503118503, + "grad_norm": 0.28519871830940247, + "learning_rate": 1.7467603750242757e-07, + "loss": 0.0872, + "num_input_tokens_seen": 27621184, + "step": 38495 + }, + { + "epoch": 80.04158004158005, + "grad_norm": 0.2728743851184845, + "learning_rate": 1.7351943683865944e-07, + "loss": 0.0785, + "num_input_tokens_seen": 27624736, + "step": 38500 + }, + { + "epoch": 80.05197505197505, + "grad_norm": 0.4093019664287567, + "learning_rate": 1.723666647301808e-07, + "loss": 0.1069, + "num_input_tokens_seen": 27628480, + "step": 38505 + }, + { + "epoch": 80.06237006237006, + "grad_norm": 0.4278198778629303, + "learning_rate": 1.712177213547661e-07, + "loss": 0.1005, + "num_input_tokens_seen": 27632032, + "step": 38510 + }, + { + "epoch": 80.07276507276508, + "grad_norm": 0.5793370604515076, + "learning_rate": 1.7007260688959581e-07, + "loss": 0.0803, + "num_input_tokens_seen": 27635584, + "step": 38515 + }, + { + "epoch": 80.08316008316008, + "grad_norm": 0.28694358468055725, + "learning_rate": 1.68931321511262e-07, + "loss": 0.1326, + "num_input_tokens_seen": 27639296, + "step": 38520 + }, + { + "epoch": 80.0935550935551, + "grad_norm": 0.3387754261493683, + "learning_rate": 1.6779386539576835e-07, + "loss": 0.085, + "num_input_tokens_seen": 27642816, + "step": 38525 + }, + { + "epoch": 80.1039501039501, + "grad_norm": 0.32218700647354126, + "learning_rate": 1.666602387185162e-07, + "loss": 0.0776, + "num_input_tokens_seen": 27646400, + "step": 38530 + }, + { + "epoch": 80.11434511434511, + "grad_norm": 0.685635507106781, + "learning_rate": 1.655304416543352e-07, + "loss": 0.0926, + "num_input_tokens_seen": 27649952, + "step": 38535 + }, + { + "epoch": 80.12474012474013, + "grad_norm": 0.3063161075115204, + "learning_rate": 1.6440447437744698e-07, + "loss": 0.0747, + "num_input_tokens_seen": 27653536, + "step": 38540 + }, + { + "epoch": 80.13513513513513, + "grad_norm": 0.5163138508796692, + "learning_rate": 1.6328233706149332e-07, + "loss": 0.0869, + "num_input_tokens_seen": 27657088, + "step": 38545 + }, + { + "epoch": 80.14553014553015, + "grad_norm": 0.45553550124168396, + "learning_rate": 1.6216402987951906e-07, + "loss": 0.103, + "num_input_tokens_seen": 27660608, + "step": 38550 + }, + { + "epoch": 80.15592515592516, + "grad_norm": 0.44368410110473633, + "learning_rate": 1.6104955300398627e-07, + "loss": 0.1076, + "num_input_tokens_seen": 27664352, + "step": 38555 + }, + { + "epoch": 80.16632016632016, + "grad_norm": 0.8701601624488831, + "learning_rate": 1.5993890660675748e-07, + "loss": 0.0667, + "num_input_tokens_seen": 27667808, + "step": 38560 + }, + { + "epoch": 80.17671517671518, + "grad_norm": 0.3056877851486206, + "learning_rate": 1.5883209085910678e-07, + "loss": 0.0636, + "num_input_tokens_seen": 27671296, + "step": 38565 + }, + { + "epoch": 80.18711018711019, + "grad_norm": 0.1817285120487213, + "learning_rate": 1.5772910593172264e-07, + "loss": 0.0992, + "num_input_tokens_seen": 27674784, + "step": 38570 + }, + { + "epoch": 80.1975051975052, + "grad_norm": 0.2561867833137512, + "learning_rate": 1.5662995199469954e-07, + "loss": 0.0837, + "num_input_tokens_seen": 27678368, + "step": 38575 + }, + { + "epoch": 80.20790020790021, + "grad_norm": 0.17328625917434692, + "learning_rate": 1.5553462921753802e-07, + "loss": 0.0989, + "num_input_tokens_seen": 27682048, + "step": 38580 + }, + { + "epoch": 80.21829521829522, + "grad_norm": 0.1749628782272339, + "learning_rate": 1.544431377691502e-07, + "loss": 0.1119, + "num_input_tokens_seen": 27685664, + "step": 38585 + }, + { + "epoch": 80.22869022869023, + "grad_norm": 0.1536693423986435, + "learning_rate": 1.5335547781785975e-07, + "loss": 0.0792, + "num_input_tokens_seen": 27689408, + "step": 38590 + }, + { + "epoch": 80.23908523908524, + "grad_norm": 0.2943929135799408, + "learning_rate": 1.5227164953139917e-07, + "loss": 0.0614, + "num_input_tokens_seen": 27693184, + "step": 38595 + }, + { + "epoch": 80.24948024948024, + "grad_norm": 0.22482410073280334, + "learning_rate": 1.511916530769042e-07, + "loss": 0.0652, + "num_input_tokens_seen": 27696864, + "step": 38600 + }, + { + "epoch": 80.24948024948024, + "eval_loss": 0.14532195031642914, + "eval_runtime": 7.7498, + "eval_samples_per_second": 110.454, + "eval_steps_per_second": 27.614, + "num_input_tokens_seen": 27696864, + "step": 38600 + }, + { + "epoch": 80.25987525987526, + "grad_norm": 0.6399914026260376, + "learning_rate": 1.5011548862092773e-07, + "loss": 0.0936, + "num_input_tokens_seen": 27700576, + "step": 38605 + }, + { + "epoch": 80.27027027027027, + "grad_norm": 0.6702752709388733, + "learning_rate": 1.490431563294231e-07, + "loss": 0.1357, + "num_input_tokens_seen": 27704160, + "step": 38610 + }, + { + "epoch": 80.28066528066527, + "grad_norm": 0.31650206446647644, + "learning_rate": 1.4797465636776365e-07, + "loss": 0.1079, + "num_input_tokens_seen": 27707808, + "step": 38615 + }, + { + "epoch": 80.29106029106029, + "grad_norm": 0.3290337026119232, + "learning_rate": 1.4690998890072027e-07, + "loss": 0.0797, + "num_input_tokens_seen": 27711264, + "step": 38620 + }, + { + "epoch": 80.3014553014553, + "grad_norm": 0.3919074535369873, + "learning_rate": 1.4584915409248112e-07, + "loss": 0.0791, + "num_input_tokens_seen": 27714848, + "step": 38625 + }, + { + "epoch": 80.3118503118503, + "grad_norm": 0.4252709150314331, + "learning_rate": 1.4479215210663754e-07, + "loss": 0.1004, + "num_input_tokens_seen": 27718240, + "step": 38630 + }, + { + "epoch": 80.32224532224532, + "grad_norm": 0.20844414830207825, + "learning_rate": 1.4373898310619528e-07, + "loss": 0.0745, + "num_input_tokens_seen": 27721792, + "step": 38635 + }, + { + "epoch": 80.33264033264034, + "grad_norm": 0.5546298027038574, + "learning_rate": 1.4268964725356604e-07, + "loss": 0.0998, + "num_input_tokens_seen": 27725280, + "step": 38640 + }, + { + "epoch": 80.34303534303534, + "grad_norm": 0.6480358242988586, + "learning_rate": 1.4164414471056764e-07, + "loss": 0.1051, + "num_input_tokens_seen": 27728928, + "step": 38645 + }, + { + "epoch": 80.35343035343035, + "grad_norm": 0.1205972358584404, + "learning_rate": 1.4060247563843497e-07, + "loss": 0.0715, + "num_input_tokens_seen": 27732480, + "step": 38650 + }, + { + "epoch": 80.36382536382537, + "grad_norm": 0.2275390475988388, + "learning_rate": 1.3956464019780068e-07, + "loss": 0.0788, + "num_input_tokens_seen": 27736096, + "step": 38655 + }, + { + "epoch": 80.37422037422037, + "grad_norm": 0.35892388224601746, + "learning_rate": 1.385306385487145e-07, + "loss": 0.0753, + "num_input_tokens_seen": 27739520, + "step": 38660 + }, + { + "epoch": 80.38461538461539, + "grad_norm": 0.2900821566581726, + "learning_rate": 1.3750047085063222e-07, + "loss": 0.0641, + "num_input_tokens_seen": 27743072, + "step": 38665 + }, + { + "epoch": 80.39501039501039, + "grad_norm": 0.3162704110145569, + "learning_rate": 1.3647413726242119e-07, + "loss": 0.1341, + "num_input_tokens_seen": 27746624, + "step": 38670 + }, + { + "epoch": 80.4054054054054, + "grad_norm": 0.29281046986579895, + "learning_rate": 1.3545163794235205e-07, + "loss": 0.0845, + "num_input_tokens_seen": 27750208, + "step": 38675 + }, + { + "epoch": 80.41580041580042, + "grad_norm": 0.3561306595802307, + "learning_rate": 1.3443297304810698e-07, + "loss": 0.0753, + "num_input_tokens_seen": 27753760, + "step": 38680 + }, + { + "epoch": 80.42619542619542, + "grad_norm": 0.45776858925819397, + "learning_rate": 1.3341814273677977e-07, + "loss": 0.1037, + "num_input_tokens_seen": 27757184, + "step": 38685 + }, + { + "epoch": 80.43659043659044, + "grad_norm": 0.23411236703395844, + "learning_rate": 1.324071471648647e-07, + "loss": 0.0897, + "num_input_tokens_seen": 27760672, + "step": 38690 + }, + { + "epoch": 80.44698544698545, + "grad_norm": 0.41385725140571594, + "learning_rate": 1.3139998648827312e-07, + "loss": 0.09, + "num_input_tokens_seen": 27764320, + "step": 38695 + }, + { + "epoch": 80.45738045738045, + "grad_norm": 0.3251994848251343, + "learning_rate": 1.3039666086232526e-07, + "loss": 0.1258, + "num_input_tokens_seen": 27767840, + "step": 38700 + }, + { + "epoch": 80.46777546777547, + "grad_norm": 0.2716628909111023, + "learning_rate": 1.2939717044174183e-07, + "loss": 0.067, + "num_input_tokens_seen": 27771296, + "step": 38705 + }, + { + "epoch": 80.47817047817048, + "grad_norm": 0.23697912693023682, + "learning_rate": 1.284015153806578e-07, + "loss": 0.1178, + "num_input_tokens_seen": 27774880, + "step": 38710 + }, + { + "epoch": 80.48856548856548, + "grad_norm": 0.27925315499305725, + "learning_rate": 1.274096958326171e-07, + "loss": 0.0994, + "num_input_tokens_seen": 27778656, + "step": 38715 + }, + { + "epoch": 80.4989604989605, + "grad_norm": 0.23218701779842377, + "learning_rate": 1.2642171195056952e-07, + "loss": 0.1111, + "num_input_tokens_seen": 27782208, + "step": 38720 + }, + { + "epoch": 80.50935550935552, + "grad_norm": 0.26182806491851807, + "learning_rate": 1.2543756388687377e-07, + "loss": 0.1078, + "num_input_tokens_seen": 27785920, + "step": 38725 + }, + { + "epoch": 80.51975051975052, + "grad_norm": 0.6405935287475586, + "learning_rate": 1.2445725179330014e-07, + "loss": 0.118, + "num_input_tokens_seen": 27789536, + "step": 38730 + }, + { + "epoch": 80.53014553014553, + "grad_norm": 0.41700443625450134, + "learning_rate": 1.2348077582102212e-07, + "loss": 0.1151, + "num_input_tokens_seen": 27793056, + "step": 38735 + }, + { + "epoch": 80.54054054054055, + "grad_norm": 0.8061585426330566, + "learning_rate": 1.2250813612062762e-07, + "loss": 0.1179, + "num_input_tokens_seen": 27796704, + "step": 38740 + }, + { + "epoch": 80.55093555093555, + "grad_norm": 0.6084819436073303, + "learning_rate": 1.215393328421105e-07, + "loss": 0.1455, + "num_input_tokens_seen": 27800320, + "step": 38745 + }, + { + "epoch": 80.56133056133056, + "grad_norm": 0.3150646984577179, + "learning_rate": 1.2057436613486796e-07, + "loss": 0.139, + "num_input_tokens_seen": 27803936, + "step": 38750 + }, + { + "epoch": 80.57172557172557, + "grad_norm": 0.3929207921028137, + "learning_rate": 1.1961323614771424e-07, + "loss": 0.0881, + "num_input_tokens_seen": 27807584, + "step": 38755 + }, + { + "epoch": 80.58212058212058, + "grad_norm": 0.2917660176753998, + "learning_rate": 1.1865594302886418e-07, + "loss": 0.0673, + "num_input_tokens_seen": 27811104, + "step": 38760 + }, + { + "epoch": 80.5925155925156, + "grad_norm": 0.5137543082237244, + "learning_rate": 1.1770248692594687e-07, + "loss": 0.0689, + "num_input_tokens_seen": 27814752, + "step": 38765 + }, + { + "epoch": 80.6029106029106, + "grad_norm": 0.3993770182132721, + "learning_rate": 1.167528679859975e-07, + "loss": 0.0972, + "num_input_tokens_seen": 27818400, + "step": 38770 + }, + { + "epoch": 80.61330561330561, + "grad_norm": 0.5345261693000793, + "learning_rate": 1.1580708635545446e-07, + "loss": 0.1551, + "num_input_tokens_seen": 27821952, + "step": 38775 + }, + { + "epoch": 80.62370062370063, + "grad_norm": 0.4032350182533264, + "learning_rate": 1.1486514218017885e-07, + "loss": 0.0801, + "num_input_tokens_seen": 27825600, + "step": 38780 + }, + { + "epoch": 80.63409563409563, + "grad_norm": 0.4380190968513489, + "learning_rate": 1.1392703560542117e-07, + "loss": 0.0621, + "num_input_tokens_seen": 27829088, + "step": 38785 + }, + { + "epoch": 80.64449064449065, + "grad_norm": 0.3022594153881073, + "learning_rate": 1.129927667758518e-07, + "loss": 0.0959, + "num_input_tokens_seen": 27832608, + "step": 38790 + }, + { + "epoch": 80.65488565488566, + "grad_norm": 0.2758076786994934, + "learning_rate": 1.1206233583554992e-07, + "loss": 0.0891, + "num_input_tokens_seen": 27836320, + "step": 38795 + }, + { + "epoch": 80.66528066528066, + "grad_norm": 0.35322538018226624, + "learning_rate": 1.1113574292799523e-07, + "loss": 0.1045, + "num_input_tokens_seen": 27839840, + "step": 38800 + }, + { + "epoch": 80.66528066528066, + "eval_loss": 0.144770085811615, + "eval_runtime": 7.7439, + "eval_samples_per_second": 110.539, + "eval_steps_per_second": 27.635, + "num_input_tokens_seen": 27839840, + "step": 38800 + }, + { + "epoch": 80.67567567567568, + "grad_norm": 1.014211893081665, + "learning_rate": 1.1021298819608449e-07, + "loss": 0.1211, + "num_input_tokens_seen": 27843520, + "step": 38805 + }, + { + "epoch": 80.68607068607069, + "grad_norm": 0.7088318467140198, + "learning_rate": 1.0929407178211226e-07, + "loss": 0.0909, + "num_input_tokens_seen": 27846944, + "step": 38810 + }, + { + "epoch": 80.6964656964657, + "grad_norm": 0.47378313541412354, + "learning_rate": 1.0837899382779293e-07, + "loss": 0.0879, + "num_input_tokens_seen": 27850496, + "step": 38815 + }, + { + "epoch": 80.70686070686071, + "grad_norm": 0.5575592517852783, + "learning_rate": 1.0746775447423862e-07, + "loss": 0.0756, + "num_input_tokens_seen": 27854048, + "step": 38820 + }, + { + "epoch": 80.71725571725571, + "grad_norm": 0.10575973242521286, + "learning_rate": 1.0656035386197583e-07, + "loss": 0.0861, + "num_input_tokens_seen": 27857696, + "step": 38825 + }, + { + "epoch": 80.72765072765073, + "grad_norm": 0.3753626346588135, + "learning_rate": 1.0565679213093982e-07, + "loss": 0.0673, + "num_input_tokens_seen": 27861280, + "step": 38830 + }, + { + "epoch": 80.73804573804574, + "grad_norm": 0.24442444741725922, + "learning_rate": 1.0475706942046638e-07, + "loss": 0.1301, + "num_input_tokens_seen": 27865120, + "step": 38835 + }, + { + "epoch": 80.74844074844074, + "grad_norm": 0.5400694608688354, + "learning_rate": 1.0386118586930282e-07, + "loss": 0.1033, + "num_input_tokens_seen": 27868800, + "step": 38840 + }, + { + "epoch": 80.75883575883576, + "grad_norm": 0.2028653770685196, + "learning_rate": 1.0296914161561367e-07, + "loss": 0.0816, + "num_input_tokens_seen": 27872480, + "step": 38845 + }, + { + "epoch": 80.76923076923077, + "grad_norm": 0.5625078678131104, + "learning_rate": 1.0208093679695552e-07, + "loss": 0.1649, + "num_input_tokens_seen": 27876224, + "step": 38850 + }, + { + "epoch": 80.77962577962577, + "grad_norm": 0.5769095420837402, + "learning_rate": 1.0119657155030493e-07, + "loss": 0.1059, + "num_input_tokens_seen": 27879616, + "step": 38855 + }, + { + "epoch": 80.79002079002079, + "grad_norm": 0.2721710503101349, + "learning_rate": 1.003160460120417e-07, + "loss": 0.072, + "num_input_tokens_seen": 27883264, + "step": 38860 + }, + { + "epoch": 80.8004158004158, + "grad_norm": 0.2621849775314331, + "learning_rate": 9.943936031795165e-08, + "loss": 0.1369, + "num_input_tokens_seen": 27886944, + "step": 38865 + }, + { + "epoch": 80.8108108108108, + "grad_norm": 0.31240880489349365, + "learning_rate": 9.856651460323219e-08, + "loss": 0.1104, + "num_input_tokens_seen": 27890592, + "step": 38870 + }, + { + "epoch": 80.82120582120582, + "grad_norm": 0.25782689452171326, + "learning_rate": 9.769750900248953e-08, + "loss": 0.0767, + "num_input_tokens_seen": 27894304, + "step": 38875 + }, + { + "epoch": 80.83160083160084, + "grad_norm": 0.30053776502609253, + "learning_rate": 9.683234364973038e-08, + "loss": 0.1031, + "num_input_tokens_seen": 27897824, + "step": 38880 + }, + { + "epoch": 80.84199584199584, + "grad_norm": 0.2732643485069275, + "learning_rate": 9.597101867837854e-08, + "loss": 0.0589, + "num_input_tokens_seen": 27901216, + "step": 38885 + }, + { + "epoch": 80.85239085239085, + "grad_norm": 0.34527918696403503, + "learning_rate": 9.511353422125835e-08, + "loss": 0.0797, + "num_input_tokens_seen": 27904864, + "step": 38890 + }, + { + "epoch": 80.86278586278586, + "grad_norm": 0.2199438065290451, + "learning_rate": 9.42598904106029e-08, + "loss": 0.083, + "num_input_tokens_seen": 27908384, + "step": 38895 + }, + { + "epoch": 80.87318087318087, + "grad_norm": 0.19606034457683563, + "learning_rate": 9.341008737806245e-08, + "loss": 0.1269, + "num_input_tokens_seen": 27912064, + "step": 38900 + }, + { + "epoch": 80.88357588357589, + "grad_norm": 0.5174325704574585, + "learning_rate": 9.256412525467661e-08, + "loss": 0.0858, + "num_input_tokens_seen": 27915776, + "step": 38905 + }, + { + "epoch": 80.89397089397089, + "grad_norm": 0.32423901557922363, + "learning_rate": 9.172200417091326e-08, + "loss": 0.09, + "num_input_tokens_seen": 27919456, + "step": 38910 + }, + { + "epoch": 80.9043659043659, + "grad_norm": 0.33863356709480286, + "learning_rate": 9.088372425663239e-08, + "loss": 0.0931, + "num_input_tokens_seen": 27923200, + "step": 38915 + }, + { + "epoch": 80.91476091476092, + "grad_norm": 0.36202701926231384, + "learning_rate": 9.004928564110837e-08, + "loss": 0.1025, + "num_input_tokens_seen": 27926784, + "step": 38920 + }, + { + "epoch": 80.92515592515592, + "grad_norm": 0.40075600147247314, + "learning_rate": 8.92186884530244e-08, + "loss": 0.0917, + "num_input_tokens_seen": 27930272, + "step": 38925 + }, + { + "epoch": 80.93555093555094, + "grad_norm": 0.6294990181922913, + "learning_rate": 8.83919328204641e-08, + "loss": 0.1392, + "num_input_tokens_seen": 27933792, + "step": 38930 + }, + { + "epoch": 80.94594594594595, + "grad_norm": 0.5850522518157959, + "learning_rate": 8.756901887093105e-08, + "loss": 0.1404, + "num_input_tokens_seen": 27937408, + "step": 38935 + }, + { + "epoch": 80.95634095634095, + "grad_norm": 0.23628684878349304, + "learning_rate": 8.674994673132098e-08, + "loss": 0.0992, + "num_input_tokens_seen": 27941024, + "step": 38940 + }, + { + "epoch": 80.96673596673597, + "grad_norm": 0.49798715114593506, + "learning_rate": 8.593471652794949e-08, + "loss": 0.172, + "num_input_tokens_seen": 27944640, + "step": 38945 + }, + { + "epoch": 80.97713097713098, + "grad_norm": 0.6800642609596252, + "learning_rate": 8.512332838653548e-08, + "loss": 0.1048, + "num_input_tokens_seen": 27948160, + "step": 38950 + }, + { + "epoch": 80.98752598752598, + "grad_norm": 0.41848698258399963, + "learning_rate": 8.431578243220106e-08, + "loss": 0.0952, + "num_input_tokens_seen": 27951680, + "step": 38955 + }, + { + "epoch": 80.997920997921, + "grad_norm": 0.7138878107070923, + "learning_rate": 8.351207878948552e-08, + "loss": 0.0842, + "num_input_tokens_seen": 27955296, + "step": 38960 + }, + { + "epoch": 81.00831600831602, + "grad_norm": 0.4392209053039551, + "learning_rate": 8.271221758232583e-08, + "loss": 0.0646, + "num_input_tokens_seen": 27958776, + "step": 38965 + }, + { + "epoch": 81.01871101871102, + "grad_norm": 0.26417145133018494, + "learning_rate": 8.191619893407332e-08, + "loss": 0.086, + "num_input_tokens_seen": 27962424, + "step": 38970 + }, + { + "epoch": 81.02910602910603, + "grad_norm": 0.18092702329158783, + "learning_rate": 8.112402296748534e-08, + "loss": 0.0732, + "num_input_tokens_seen": 27965976, + "step": 38975 + }, + { + "epoch": 81.03950103950103, + "grad_norm": 0.2834857404232025, + "learning_rate": 8.033568980471973e-08, + "loss": 0.121, + "num_input_tokens_seen": 27969432, + "step": 38980 + }, + { + "epoch": 81.04989604989605, + "grad_norm": 0.49424171447753906, + "learning_rate": 7.955119956735146e-08, + "loss": 0.0762, + "num_input_tokens_seen": 27972984, + "step": 38985 + }, + { + "epoch": 81.06029106029106, + "grad_norm": 0.14256960153579712, + "learning_rate": 7.877055237636155e-08, + "loss": 0.0846, + "num_input_tokens_seen": 27976472, + "step": 38990 + }, + { + "epoch": 81.07068607068607, + "grad_norm": 0.20215588808059692, + "learning_rate": 7.79937483521287e-08, + "loss": 0.0895, + "num_input_tokens_seen": 27979864, + "step": 38995 + }, + { + "epoch": 81.08108108108108, + "grad_norm": 0.2757055461406708, + "learning_rate": 7.722078761444873e-08, + "loss": 0.0912, + "num_input_tokens_seen": 27983384, + "step": 39000 + }, + { + "epoch": 81.08108108108108, + "eval_loss": 0.14485956728458405, + "eval_runtime": 7.7475, + "eval_samples_per_second": 110.487, + "eval_steps_per_second": 27.622, + "num_input_tokens_seen": 27983384, + "step": 39000 + }, + { + "epoch": 81.0914760914761, + "grad_norm": 0.399519145488739, + "learning_rate": 7.645167028252631e-08, + "loss": 0.0643, + "num_input_tokens_seen": 27986936, + "step": 39005 + }, + { + "epoch": 81.1018711018711, + "grad_norm": 0.8245398998260498, + "learning_rate": 7.568639647496379e-08, + "loss": 0.1033, + "num_input_tokens_seen": 27990520, + "step": 39010 + }, + { + "epoch": 81.11226611226611, + "grad_norm": 0.23532235622406006, + "learning_rate": 7.492496630977508e-08, + "loss": 0.1081, + "num_input_tokens_seen": 27994328, + "step": 39015 + }, + { + "epoch": 81.12266112266113, + "grad_norm": 1.084386944770813, + "learning_rate": 7.416737990438571e-08, + "loss": 0.1192, + "num_input_tokens_seen": 27997816, + "step": 39020 + }, + { + "epoch": 81.13305613305613, + "grad_norm": 0.55245441198349, + "learning_rate": 7.341363737562445e-08, + "loss": 0.0916, + "num_input_tokens_seen": 28001400, + "step": 39025 + }, + { + "epoch": 81.14345114345114, + "grad_norm": 0.4694778621196747, + "learning_rate": 7.266373883972887e-08, + "loss": 0.0968, + "num_input_tokens_seen": 28004824, + "step": 39030 + }, + { + "epoch": 81.15384615384616, + "grad_norm": 0.14042603969573975, + "learning_rate": 7.191768441233981e-08, + "loss": 0.0899, + "num_input_tokens_seen": 28008568, + "step": 39035 + }, + { + "epoch": 81.16424116424116, + "grad_norm": 0.43712118268013, + "learning_rate": 7.11754742085069e-08, + "loss": 0.0718, + "num_input_tokens_seen": 28012248, + "step": 39040 + }, + { + "epoch": 81.17463617463618, + "grad_norm": 0.2978891432285309, + "learning_rate": 7.043710834269413e-08, + "loss": 0.135, + "num_input_tokens_seen": 28015800, + "step": 39045 + }, + { + "epoch": 81.18503118503118, + "grad_norm": 0.35578450560569763, + "learning_rate": 6.970258692876319e-08, + "loss": 0.0917, + "num_input_tokens_seen": 28019480, + "step": 39050 + }, + { + "epoch": 81.1954261954262, + "grad_norm": 0.625099241733551, + "learning_rate": 6.897191007998738e-08, + "loss": 0.1307, + "num_input_tokens_seen": 28023000, + "step": 39055 + }, + { + "epoch": 81.20582120582121, + "grad_norm": 0.27371424436569214, + "learning_rate": 6.824507790904599e-08, + "loss": 0.0658, + "num_input_tokens_seen": 28026584, + "step": 39060 + }, + { + "epoch": 81.21621621621621, + "grad_norm": 0.36670204997062683, + "learning_rate": 6.752209052802439e-08, + "loss": 0.1276, + "num_input_tokens_seen": 28030136, + "step": 39065 + }, + { + "epoch": 81.22661122661123, + "grad_norm": 0.167515367269516, + "learning_rate": 6.680294804841946e-08, + "loss": 0.0792, + "num_input_tokens_seen": 28033624, + "step": 39070 + }, + { + "epoch": 81.23700623700624, + "grad_norm": 0.45812520384788513, + "learning_rate": 6.608765058112865e-08, + "loss": 0.1231, + "num_input_tokens_seen": 28037144, + "step": 39075 + }, + { + "epoch": 81.24740124740124, + "grad_norm": 0.1608593463897705, + "learning_rate": 6.537619823646368e-08, + "loss": 0.1234, + "num_input_tokens_seen": 28040632, + "step": 39080 + }, + { + "epoch": 81.25779625779626, + "grad_norm": 0.3342064619064331, + "learning_rate": 6.466859112413404e-08, + "loss": 0.0959, + "num_input_tokens_seen": 28044216, + "step": 39085 + }, + { + "epoch": 81.26819126819127, + "grad_norm": 0.4921809732913971, + "learning_rate": 6.39648293532663e-08, + "loss": 0.1294, + "num_input_tokens_seen": 28047736, + "step": 39090 + }, + { + "epoch": 81.27858627858627, + "grad_norm": 0.952217698097229, + "learning_rate": 6.32649130323848e-08, + "loss": 0.1465, + "num_input_tokens_seen": 28051512, + "step": 39095 + }, + { + "epoch": 81.28898128898129, + "grad_norm": 0.2538556158542633, + "learning_rate": 6.256884226943094e-08, + "loss": 0.0703, + "num_input_tokens_seen": 28055064, + "step": 39100 + }, + { + "epoch": 81.2993762993763, + "grad_norm": 0.293078750371933, + "learning_rate": 6.187661717174386e-08, + "loss": 0.0697, + "num_input_tokens_seen": 28058840, + "step": 39105 + }, + { + "epoch": 81.3097713097713, + "grad_norm": 0.25734490156173706, + "learning_rate": 6.118823784607708e-08, + "loss": 0.089, + "num_input_tokens_seen": 28062328, + "step": 39110 + }, + { + "epoch": 81.32016632016632, + "grad_norm": 0.10847674310207367, + "learning_rate": 6.050370439858178e-08, + "loss": 0.0621, + "num_input_tokens_seen": 28065880, + "step": 39115 + }, + { + "epoch": 81.33056133056132, + "grad_norm": 0.42381739616394043, + "learning_rate": 5.98230169348235e-08, + "loss": 0.1131, + "num_input_tokens_seen": 28069368, + "step": 39120 + }, + { + "epoch": 81.34095634095634, + "grad_norm": 0.3481266498565674, + "learning_rate": 5.914617555977664e-08, + "loss": 0.0974, + "num_input_tokens_seen": 28073080, + "step": 39125 + }, + { + "epoch": 81.35135135135135, + "grad_norm": 0.3351792097091675, + "learning_rate": 5.8473180377816017e-08, + "loss": 0.0798, + "num_input_tokens_seen": 28076632, + "step": 39130 + }, + { + "epoch": 81.36174636174636, + "grad_norm": 0.3787980377674103, + "learning_rate": 5.780403149272251e-08, + "loss": 0.0611, + "num_input_tokens_seen": 28080120, + "step": 39135 + }, + { + "epoch": 81.37214137214137, + "grad_norm": 0.3042416274547577, + "learning_rate": 5.7138729007694126e-08, + "loss": 0.1176, + "num_input_tokens_seen": 28083800, + "step": 39140 + }, + { + "epoch": 81.38253638253639, + "grad_norm": 0.2645217478275299, + "learning_rate": 5.64772730253238e-08, + "loss": 0.0651, + "num_input_tokens_seen": 28087448, + "step": 39145 + }, + { + "epoch": 81.39293139293139, + "grad_norm": 0.5925956964492798, + "learning_rate": 5.5819663647618814e-08, + "loss": 0.0842, + "num_input_tokens_seen": 28091064, + "step": 39150 + }, + { + "epoch": 81.4033264033264, + "grad_norm": 0.25357651710510254, + "learning_rate": 5.5165900975989723e-08, + "loss": 0.0785, + "num_input_tokens_seen": 28094872, + "step": 39155 + }, + { + "epoch": 81.41372141372142, + "grad_norm": 0.3050072491168976, + "learning_rate": 5.451598511125311e-08, + "loss": 0.095, + "num_input_tokens_seen": 28098616, + "step": 39160 + }, + { + "epoch": 81.42411642411642, + "grad_norm": 0.298042356967926, + "learning_rate": 5.3869916153637124e-08, + "loss": 0.1198, + "num_input_tokens_seen": 28102360, + "step": 39165 + }, + { + "epoch": 81.43451143451144, + "grad_norm": 0.2274908423423767, + "learning_rate": 5.322769420277318e-08, + "loss": 0.0672, + "num_input_tokens_seen": 28105880, + "step": 39170 + }, + { + "epoch": 81.44490644490645, + "grad_norm": 0.35681644082069397, + "learning_rate": 5.258931935769873e-08, + "loss": 0.1204, + "num_input_tokens_seen": 28109400, + "step": 39175 + }, + { + "epoch": 81.45530145530145, + "grad_norm": 0.3539547026157379, + "learning_rate": 5.19547917168628e-08, + "loss": 0.0937, + "num_input_tokens_seen": 28112952, + "step": 39180 + }, + { + "epoch": 81.46569646569647, + "grad_norm": 0.2137758433818817, + "learning_rate": 5.13241113781121e-08, + "loss": 0.0726, + "num_input_tokens_seen": 28116664, + "step": 39185 + }, + { + "epoch": 81.47609147609148, + "grad_norm": 0.3049239218235016, + "learning_rate": 5.0697278438707755e-08, + "loss": 0.0868, + "num_input_tokens_seen": 28120248, + "step": 39190 + }, + { + "epoch": 81.48648648648648, + "grad_norm": 0.4901053309440613, + "learning_rate": 5.0074292995316854e-08, + "loss": 0.0987, + "num_input_tokens_seen": 28123896, + "step": 39195 + }, + { + "epoch": 81.4968814968815, + "grad_norm": 0.462415486574173, + "learning_rate": 4.945515514400978e-08, + "loss": 0.1128, + "num_input_tokens_seen": 28127512, + "step": 39200 + }, + { + "epoch": 81.4968814968815, + "eval_loss": 0.14526225626468658, + "eval_runtime": 7.7591, + "eval_samples_per_second": 110.322, + "eval_steps_per_second": 27.581, + "num_input_tokens_seen": 28127512, + "step": 39200 + }, + { + "epoch": 81.5072765072765, + "grad_norm": 0.35153728723526, + "learning_rate": 4.883986498026571e-08, + "loss": 0.0872, + "num_input_tokens_seen": 28131032, + "step": 39205 + }, + { + "epoch": 81.51767151767152, + "grad_norm": 0.7654657959938049, + "learning_rate": 4.822842259896987e-08, + "loss": 0.0973, + "num_input_tokens_seen": 28134584, + "step": 39210 + }, + { + "epoch": 81.52806652806653, + "grad_norm": 0.8538441061973572, + "learning_rate": 4.762082809441626e-08, + "loss": 0.0971, + "num_input_tokens_seen": 28138072, + "step": 39215 + }, + { + "epoch": 81.53846153846153, + "grad_norm": 0.4041183888912201, + "learning_rate": 4.7017081560302156e-08, + "loss": 0.1145, + "num_input_tokens_seen": 28141720, + "step": 39220 + }, + { + "epoch": 81.54885654885655, + "grad_norm": 0.16964422166347504, + "learning_rate": 4.6417183089730866e-08, + "loss": 0.1038, + "num_input_tokens_seen": 28145304, + "step": 39225 + }, + { + "epoch": 81.55925155925156, + "grad_norm": 0.32384926080703735, + "learning_rate": 4.5821132775217265e-08, + "loss": 0.0987, + "num_input_tokens_seen": 28148984, + "step": 39230 + }, + { + "epoch": 81.56964656964657, + "grad_norm": 0.3389059007167816, + "learning_rate": 4.5228930708679504e-08, + "loss": 0.0877, + "num_input_tokens_seen": 28152664, + "step": 39235 + }, + { + "epoch": 81.58004158004158, + "grad_norm": 0.2696133255958557, + "learning_rate": 4.464057698144175e-08, + "loss": 0.1057, + "num_input_tokens_seen": 28156248, + "step": 39240 + }, + { + "epoch": 81.5904365904366, + "grad_norm": 0.20644448697566986, + "learning_rate": 4.4056071684236974e-08, + "loss": 0.0794, + "num_input_tokens_seen": 28159704, + "step": 39245 + }, + { + "epoch": 81.6008316008316, + "grad_norm": 0.34008917212486267, + "learning_rate": 4.347541490719864e-08, + "loss": 0.1273, + "num_input_tokens_seen": 28163416, + "step": 39250 + }, + { + "epoch": 81.61122661122661, + "grad_norm": 0.2431999295949936, + "learning_rate": 4.2898606739877336e-08, + "loss": 0.0967, + "num_input_tokens_seen": 28166808, + "step": 39255 + }, + { + "epoch": 81.62162162162163, + "grad_norm": 0.575366735458374, + "learning_rate": 4.232564727122135e-08, + "loss": 0.114, + "num_input_tokens_seen": 28170488, + "step": 39260 + }, + { + "epoch": 81.63201663201663, + "grad_norm": 0.5371441841125488, + "learning_rate": 4.1756536589585004e-08, + "loss": 0.0943, + "num_input_tokens_seen": 28173976, + "step": 39265 + }, + { + "epoch": 81.64241164241164, + "grad_norm": 0.3539598286151886, + "learning_rate": 4.119127478273976e-08, + "loss": 0.116, + "num_input_tokens_seen": 28177432, + "step": 39270 + }, + { + "epoch": 81.65280665280665, + "grad_norm": 0.7179893255233765, + "learning_rate": 4.062986193784923e-08, + "loss": 0.0948, + "num_input_tokens_seen": 28180856, + "step": 39275 + }, + { + "epoch": 81.66320166320166, + "grad_norm": 0.7238876819610596, + "learning_rate": 4.007229814149416e-08, + "loss": 0.1134, + "num_input_tokens_seen": 28184440, + "step": 39280 + }, + { + "epoch": 81.67359667359668, + "grad_norm": 0.2530270516872406, + "learning_rate": 3.951858347965576e-08, + "loss": 0.088, + "num_input_tokens_seen": 28188024, + "step": 39285 + }, + { + "epoch": 81.68399168399168, + "grad_norm": 0.39319369196891785, + "learning_rate": 3.896871803772684e-08, + "loss": 0.0758, + "num_input_tokens_seen": 28191640, + "step": 39290 + }, + { + "epoch": 81.6943866943867, + "grad_norm": 0.6892164945602417, + "learning_rate": 3.842270190050068e-08, + "loss": 0.094, + "num_input_tokens_seen": 28195160, + "step": 39295 + }, + { + "epoch": 81.70478170478171, + "grad_norm": 0.9243858456611633, + "learning_rate": 3.7880535152179376e-08, + "loss": 0.0927, + "num_input_tokens_seen": 28198744, + "step": 39300 + }, + { + "epoch": 81.71517671517671, + "grad_norm": 0.44359341263771057, + "learning_rate": 3.734221787637382e-08, + "loss": 0.0983, + "num_input_tokens_seen": 28202424, + "step": 39305 + }, + { + "epoch": 81.72557172557173, + "grad_norm": 0.1998508870601654, + "learning_rate": 3.680775015609817e-08, + "loss": 0.082, + "num_input_tokens_seen": 28205944, + "step": 39310 + }, + { + "epoch": 81.73596673596674, + "grad_norm": 0.4917124807834625, + "learning_rate": 3.627713207377537e-08, + "loss": 0.1031, + "num_input_tokens_seen": 28209496, + "step": 39315 + }, + { + "epoch": 81.74636174636174, + "grad_norm": 0.26519447565078735, + "learning_rate": 3.575036371123164e-08, + "loss": 0.0706, + "num_input_tokens_seen": 28213080, + "step": 39320 + }, + { + "epoch": 81.75675675675676, + "grad_norm": 0.558172881603241, + "learning_rate": 3.5227445149704776e-08, + "loss": 0.0904, + "num_input_tokens_seen": 28216536, + "step": 39325 + }, + { + "epoch": 81.76715176715177, + "grad_norm": 0.9560224413871765, + "learning_rate": 3.470837646983027e-08, + "loss": 0.0874, + "num_input_tokens_seen": 28220376, + "step": 39330 + }, + { + "epoch": 81.77754677754677, + "grad_norm": 0.5280198454856873, + "learning_rate": 3.419315775165799e-08, + "loss": 0.0924, + "num_input_tokens_seen": 28223832, + "step": 39335 + }, + { + "epoch": 81.78794178794179, + "grad_norm": 0.465232253074646, + "learning_rate": 3.368178907464103e-08, + "loss": 0.0907, + "num_input_tokens_seen": 28227512, + "step": 39340 + }, + { + "epoch": 81.7983367983368, + "grad_norm": 0.31712549924850464, + "learning_rate": 3.317427051763855e-08, + "loss": 0.117, + "num_input_tokens_seen": 28231128, + "step": 39345 + }, + { + "epoch": 81.8087318087318, + "grad_norm": 0.568411648273468, + "learning_rate": 3.267060215891571e-08, + "loss": 0.1331, + "num_input_tokens_seen": 28234744, + "step": 39350 + }, + { + "epoch": 81.81912681912682, + "grad_norm": 0.19099247455596924, + "learning_rate": 3.217078407614649e-08, + "loss": 0.103, + "num_input_tokens_seen": 28238264, + "step": 39355 + }, + { + "epoch": 81.82952182952182, + "grad_norm": 0.29242467880249023, + "learning_rate": 3.1674816346405345e-08, + "loss": 0.102, + "num_input_tokens_seen": 28241816, + "step": 39360 + }, + { + "epoch": 81.83991683991684, + "grad_norm": 0.6746534705162048, + "learning_rate": 3.11826990461811e-08, + "loss": 0.1106, + "num_input_tokens_seen": 28245496, + "step": 39365 + }, + { + "epoch": 81.85031185031185, + "grad_norm": 0.2386143058538437, + "learning_rate": 3.069443225136304e-08, + "loss": 0.0904, + "num_input_tokens_seen": 28249144, + "step": 39370 + }, + { + "epoch": 81.86070686070686, + "grad_norm": 0.5732173323631287, + "learning_rate": 3.021001603724372e-08, + "loss": 0.1094, + "num_input_tokens_seen": 28252632, + "step": 39375 + }, + { + "epoch": 81.87110187110187, + "grad_norm": 0.3094920217990875, + "learning_rate": 2.9729450478532818e-08, + "loss": 0.0812, + "num_input_tokens_seen": 28256120, + "step": 39380 + }, + { + "epoch": 81.88149688149689, + "grad_norm": 0.4933173954486847, + "learning_rate": 2.9252735649337726e-08, + "loss": 0.0966, + "num_input_tokens_seen": 28259608, + "step": 39385 + }, + { + "epoch": 81.89189189189189, + "grad_norm": 0.38154977560043335, + "learning_rate": 2.8779871623171863e-08, + "loss": 0.1187, + "num_input_tokens_seen": 28263128, + "step": 39390 + }, + { + "epoch": 81.9022869022869, + "grad_norm": 0.2307473123073578, + "learning_rate": 2.8310858472957448e-08, + "loss": 0.1294, + "num_input_tokens_seen": 28266584, + "step": 39395 + }, + { + "epoch": 81.91268191268192, + "grad_norm": 0.21951515972614288, + "learning_rate": 2.784569627101996e-08, + "loss": 0.0817, + "num_input_tokens_seen": 28270104, + "step": 39400 + }, + { + "epoch": 81.91268191268192, + "eval_loss": 0.14517425000667572, + "eval_runtime": 7.7526, + "eval_samples_per_second": 110.414, + "eval_steps_per_second": 27.603, + "num_input_tokens_seen": 28270104, + "step": 39400 + }, + { + "epoch": 81.92307692307692, + "grad_norm": 0.35546788573265076, + "learning_rate": 2.738438508909924e-08, + "loss": 0.1307, + "num_input_tokens_seen": 28273848, + "step": 39405 + }, + { + "epoch": 81.93347193347194, + "grad_norm": 0.44870707392692566, + "learning_rate": 2.692692499833005e-08, + "loss": 0.0755, + "num_input_tokens_seen": 28277368, + "step": 39410 + }, + { + "epoch": 81.94386694386695, + "grad_norm": 0.31139904260635376, + "learning_rate": 2.647331606926151e-08, + "loss": 0.0936, + "num_input_tokens_seen": 28280952, + "step": 39415 + }, + { + "epoch": 81.95426195426195, + "grad_norm": 0.20325055718421936, + "learning_rate": 2.6023558371843225e-08, + "loss": 0.1158, + "num_input_tokens_seen": 28284696, + "step": 39420 + }, + { + "epoch": 81.96465696465697, + "grad_norm": 0.20465753972530365, + "learning_rate": 2.557765197543638e-08, + "loss": 0.0622, + "num_input_tokens_seen": 28288376, + "step": 39425 + }, + { + "epoch": 81.97505197505197, + "grad_norm": 0.5098575949668884, + "learning_rate": 2.513559694880263e-08, + "loss": 0.0967, + "num_input_tokens_seen": 28292024, + "step": 39430 + }, + { + "epoch": 81.98544698544698, + "grad_norm": 0.3240266442298889, + "learning_rate": 2.469739336011523e-08, + "loss": 0.0903, + "num_input_tokens_seen": 28295640, + "step": 39435 + }, + { + "epoch": 81.995841995842, + "grad_norm": 0.793709933757782, + "learning_rate": 2.4263041276947894e-08, + "loss": 0.0988, + "num_input_tokens_seen": 28299352, + "step": 39440 + }, + { + "epoch": 82.006237006237, + "grad_norm": 0.43625155091285706, + "learning_rate": 2.3832540766283164e-08, + "loss": 0.1119, + "num_input_tokens_seen": 28303024, + "step": 39445 + }, + { + "epoch": 82.01663201663202, + "grad_norm": 0.6251616477966309, + "learning_rate": 2.3405891894512366e-08, + "loss": 0.0912, + "num_input_tokens_seen": 28306608, + "step": 39450 + }, + { + "epoch": 82.02702702702703, + "grad_norm": 0.21581733226776123, + "learning_rate": 2.29830947274301e-08, + "loss": 0.0936, + "num_input_tokens_seen": 28310192, + "step": 39455 + }, + { + "epoch": 82.03742203742203, + "grad_norm": 0.5268053412437439, + "learning_rate": 2.2564149330231432e-08, + "loss": 0.0918, + "num_input_tokens_seen": 28313712, + "step": 39460 + }, + { + "epoch": 82.04781704781705, + "grad_norm": 0.4666979908943176, + "learning_rate": 2.2149055767528572e-08, + "loss": 0.0739, + "num_input_tokens_seen": 28317264, + "step": 39465 + }, + { + "epoch": 82.05821205821206, + "grad_norm": 0.6284265518188477, + "learning_rate": 2.1737814103334197e-08, + "loss": 0.1325, + "num_input_tokens_seen": 28320848, + "step": 39470 + }, + { + "epoch": 82.06860706860707, + "grad_norm": 0.39248794317245483, + "learning_rate": 2.1330424401064253e-08, + "loss": 0.0861, + "num_input_tokens_seen": 28324560, + "step": 39475 + }, + { + "epoch": 82.07900207900208, + "grad_norm": 0.3312236964702606, + "learning_rate": 2.092688672354348e-08, + "loss": 0.0529, + "num_input_tokens_seen": 28328080, + "step": 39480 + }, + { + "epoch": 82.0893970893971, + "grad_norm": 0.5780149698257446, + "learning_rate": 2.0527201133005435e-08, + "loss": 0.0815, + "num_input_tokens_seen": 28331568, + "step": 39485 + }, + { + "epoch": 82.0997920997921, + "grad_norm": 0.29905831813812256, + "learning_rate": 2.0131367691084148e-08, + "loss": 0.114, + "num_input_tokens_seen": 28334960, + "step": 39490 + }, + { + "epoch": 82.11018711018711, + "grad_norm": 0.37061619758605957, + "learning_rate": 1.9739386458819675e-08, + "loss": 0.1125, + "num_input_tokens_seen": 28338576, + "step": 39495 + }, + { + "epoch": 82.12058212058211, + "grad_norm": 0.30764350295066833, + "learning_rate": 1.9351257496666442e-08, + "loss": 0.1247, + "num_input_tokens_seen": 28342192, + "step": 39500 + }, + { + "epoch": 82.13097713097713, + "grad_norm": 0.13626272976398468, + "learning_rate": 1.896698086447657e-08, + "loss": 0.0948, + "num_input_tokens_seen": 28345808, + "step": 39505 + }, + { + "epoch": 82.14137214137214, + "grad_norm": 0.4735632538795471, + "learning_rate": 1.8586556621505436e-08, + "loss": 0.1039, + "num_input_tokens_seen": 28349456, + "step": 39510 + }, + { + "epoch": 82.15176715176715, + "grad_norm": 0.5455812811851501, + "learning_rate": 1.820998482642833e-08, + "loss": 0.0969, + "num_input_tokens_seen": 28353072, + "step": 39515 + }, + { + "epoch": 82.16216216216216, + "grad_norm": 0.5299173593521118, + "learning_rate": 1.7837265537309912e-08, + "loss": 0.11, + "num_input_tokens_seen": 28356560, + "step": 39520 + }, + { + "epoch": 82.17255717255718, + "grad_norm": 0.41105470061302185, + "learning_rate": 1.7468398811629206e-08, + "loss": 0.102, + "num_input_tokens_seen": 28360144, + "step": 39525 + }, + { + "epoch": 82.18295218295218, + "grad_norm": 0.3065885305404663, + "learning_rate": 1.710338470627404e-08, + "loss": 0.0917, + "num_input_tokens_seen": 28363600, + "step": 39530 + }, + { + "epoch": 82.1933471933472, + "grad_norm": 0.20974209904670715, + "learning_rate": 1.6742223277529945e-08, + "loss": 0.0865, + "num_input_tokens_seen": 28367088, + "step": 39535 + }, + { + "epoch": 82.20374220374221, + "grad_norm": 0.28440380096435547, + "learning_rate": 1.6384914581094036e-08, + "loss": 0.1022, + "num_input_tokens_seen": 28370832, + "step": 39540 + }, + { + "epoch": 82.21413721413721, + "grad_norm": 0.5581520199775696, + "learning_rate": 1.6031458672069455e-08, + "loss": 0.1158, + "num_input_tokens_seen": 28374384, + "step": 39545 + }, + { + "epoch": 82.22453222453223, + "grad_norm": 0.21762526035308838, + "learning_rate": 1.5681855604962602e-08, + "loss": 0.0527, + "num_input_tokens_seen": 28377936, + "step": 39550 + }, + { + "epoch": 82.23492723492724, + "grad_norm": 0.2823074758052826, + "learning_rate": 1.5336105433683135e-08, + "loss": 0.0969, + "num_input_tokens_seen": 28381488, + "step": 39555 + }, + { + "epoch": 82.24532224532224, + "grad_norm": 0.364352285861969, + "learning_rate": 1.499420821155506e-08, + "loss": 0.091, + "num_input_tokens_seen": 28385040, + "step": 39560 + }, + { + "epoch": 82.25571725571726, + "grad_norm": 0.37935343384742737, + "learning_rate": 1.4656163991302874e-08, + "loss": 0.0944, + "num_input_tokens_seen": 28388656, + "step": 39565 + }, + { + "epoch": 82.26611226611226, + "grad_norm": 0.5472084879875183, + "learning_rate": 1.4321972825051544e-08, + "loss": 0.0979, + "num_input_tokens_seen": 28392240, + "step": 39570 + }, + { + "epoch": 82.27650727650727, + "grad_norm": 0.36599892377853394, + "learning_rate": 1.3991634764345951e-08, + "loss": 0.1195, + "num_input_tokens_seen": 28395888, + "step": 39575 + }, + { + "epoch": 82.28690228690229, + "grad_norm": 0.3649110496044159, + "learning_rate": 1.3665149860120352e-08, + "loss": 0.088, + "num_input_tokens_seen": 28399632, + "step": 39580 + }, + { + "epoch": 82.29729729729729, + "grad_norm": 0.8018770217895508, + "learning_rate": 1.3342518162728912e-08, + "loss": 0.1025, + "num_input_tokens_seen": 28403152, + "step": 39585 + }, + { + "epoch": 82.3076923076923, + "grad_norm": 0.13521601259708405, + "learning_rate": 1.30237397219235e-08, + "loss": 0.1247, + "num_input_tokens_seen": 28406736, + "step": 39590 + }, + { + "epoch": 82.31808731808732, + "grad_norm": 0.39863309264183044, + "learning_rate": 1.2708814586862016e-08, + "loss": 0.1083, + "num_input_tokens_seen": 28410192, + "step": 39595 + }, + { + "epoch": 82.32848232848232, + "grad_norm": 0.222974956035614, + "learning_rate": 1.2397742806111168e-08, + "loss": 0.0773, + "num_input_tokens_seen": 28413680, + "step": 39600 + }, + { + "epoch": 82.32848232848232, + "eval_loss": 0.1458195447921753, + "eval_runtime": 7.7583, + "eval_samples_per_second": 110.333, + "eval_steps_per_second": 27.583, + "num_input_tokens_seen": 28413680, + "step": 39600 + }, + { + "epoch": 82.33887733887734, + "grad_norm": 0.6939029693603516, + "learning_rate": 1.209052442764369e-08, + "loss": 0.0801, + "num_input_tokens_seen": 28417360, + "step": 39605 + }, + { + "epoch": 82.34927234927235, + "grad_norm": 1.0374640226364136, + "learning_rate": 1.17871594988328e-08, + "loss": 0.0977, + "num_input_tokens_seen": 28421136, + "step": 39610 + }, + { + "epoch": 82.35966735966736, + "grad_norm": 0.18229103088378906, + "learning_rate": 1.1487648066466072e-08, + "loss": 0.0773, + "num_input_tokens_seen": 28424880, + "step": 39615 + }, + { + "epoch": 82.37006237006237, + "grad_norm": 0.2991883158683777, + "learning_rate": 1.1191990176728784e-08, + "loss": 0.0803, + "num_input_tokens_seen": 28428432, + "step": 39620 + }, + { + "epoch": 82.38045738045739, + "grad_norm": 0.3090576231479645, + "learning_rate": 1.0900185875215018e-08, + "loss": 0.0964, + "num_input_tokens_seen": 28431952, + "step": 39625 + }, + { + "epoch": 82.39085239085239, + "grad_norm": 0.3064256012439728, + "learning_rate": 1.0612235206924891e-08, + "loss": 0.1103, + "num_input_tokens_seen": 28435536, + "step": 39630 + }, + { + "epoch": 82.4012474012474, + "grad_norm": 0.23219680786132812, + "learning_rate": 1.0328138216264549e-08, + "loss": 0.0913, + "num_input_tokens_seen": 28439056, + "step": 39635 + }, + { + "epoch": 82.41164241164242, + "grad_norm": 0.44975537061691284, + "learning_rate": 1.004789494704339e-08, + "loss": 0.0871, + "num_input_tokens_seen": 28442544, + "step": 39640 + }, + { + "epoch": 82.42203742203742, + "grad_norm": 0.3298523724079132, + "learning_rate": 9.771505442482397e-09, + "loss": 0.11, + "num_input_tokens_seen": 28446032, + "step": 39645 + }, + { + "epoch": 82.43243243243244, + "grad_norm": 0.3625642955303192, + "learning_rate": 9.498969745200259e-09, + "loss": 0.08, + "num_input_tokens_seen": 28449808, + "step": 39650 + }, + { + "epoch": 82.44282744282744, + "grad_norm": 0.5257962942123413, + "learning_rate": 9.230287897230017e-09, + "loss": 0.0897, + "num_input_tokens_seen": 28453424, + "step": 39655 + }, + { + "epoch": 82.45322245322245, + "grad_norm": 0.42227786779403687, + "learning_rate": 8.965459940002419e-09, + "loss": 0.1158, + "num_input_tokens_seen": 28457136, + "step": 39660 + }, + { + "epoch": 82.46361746361747, + "grad_norm": 0.46069401502609253, + "learning_rate": 8.704485914357019e-09, + "loss": 0.0628, + "num_input_tokens_seen": 28460656, + "step": 39665 + }, + { + "epoch": 82.47401247401247, + "grad_norm": 0.40637901425361633, + "learning_rate": 8.447365860539402e-09, + "loss": 0.0886, + "num_input_tokens_seen": 28464496, + "step": 39670 + }, + { + "epoch": 82.48440748440748, + "grad_norm": 0.3616507053375244, + "learning_rate": 8.194099818201184e-09, + "loss": 0.1262, + "num_input_tokens_seen": 28468080, + "step": 39675 + }, + { + "epoch": 82.4948024948025, + "grad_norm": 0.26841336488723755, + "learning_rate": 7.944687826400011e-09, + "loss": 0.0887, + "num_input_tokens_seen": 28471600, + "step": 39680 + }, + { + "epoch": 82.5051975051975, + "grad_norm": 0.2214728146791458, + "learning_rate": 7.699129923599557e-09, + "loss": 0.0856, + "num_input_tokens_seen": 28475152, + "step": 39685 + }, + { + "epoch": 82.51559251559252, + "grad_norm": 0.2786385416984558, + "learning_rate": 7.457426147663982e-09, + "loss": 0.0908, + "num_input_tokens_seen": 28478704, + "step": 39690 + }, + { + "epoch": 82.52598752598753, + "grad_norm": 0.35104432702064514, + "learning_rate": 7.219576535871797e-09, + "loss": 0.092, + "num_input_tokens_seen": 28482096, + "step": 39695 + }, + { + "epoch": 82.53638253638253, + "grad_norm": 0.3779306411743164, + "learning_rate": 6.985581124896445e-09, + "loss": 0.1054, + "num_input_tokens_seen": 28485552, + "step": 39700 + }, + { + "epoch": 82.54677754677755, + "grad_norm": 0.33095914125442505, + "learning_rate": 6.755439950828501e-09, + "loss": 0.0774, + "num_input_tokens_seen": 28489008, + "step": 39705 + }, + { + "epoch": 82.55717255717256, + "grad_norm": 0.5743937492370605, + "learning_rate": 6.5291530491562444e-09, + "loss": 0.1818, + "num_input_tokens_seen": 28492752, + "step": 39710 + }, + { + "epoch": 82.56756756756756, + "grad_norm": 0.35554879903793335, + "learning_rate": 6.3067204547739845e-09, + "loss": 0.0943, + "num_input_tokens_seen": 28496336, + "step": 39715 + }, + { + "epoch": 82.57796257796258, + "grad_norm": 0.27125468850135803, + "learning_rate": 6.088142201987612e-09, + "loss": 0.1412, + "num_input_tokens_seen": 28500016, + "step": 39720 + }, + { + "epoch": 82.58835758835758, + "grad_norm": 0.327334463596344, + "learning_rate": 5.873418324503499e-09, + "loss": 0.0916, + "num_input_tokens_seen": 28503632, + "step": 39725 + }, + { + "epoch": 82.5987525987526, + "grad_norm": 0.331596314907074, + "learning_rate": 5.6625488554340465e-09, + "loss": 0.1547, + "num_input_tokens_seen": 28507216, + "step": 39730 + }, + { + "epoch": 82.60914760914761, + "grad_norm": 0.200379878282547, + "learning_rate": 5.455533827297688e-09, + "loss": 0.0525, + "num_input_tokens_seen": 28510672, + "step": 39735 + }, + { + "epoch": 82.61954261954261, + "grad_norm": 0.2503044307231903, + "learning_rate": 5.252373272018885e-09, + "loss": 0.1066, + "num_input_tokens_seen": 28514320, + "step": 39740 + }, + { + "epoch": 82.62993762993763, + "grad_norm": 0.8195543885231018, + "learning_rate": 5.053067220925356e-09, + "loss": 0.0858, + "num_input_tokens_seen": 28517936, + "step": 39745 + }, + { + "epoch": 82.64033264033264, + "grad_norm": 1.0703030824661255, + "learning_rate": 4.857615704759177e-09, + "loss": 0.1204, + "num_input_tokens_seen": 28521520, + "step": 39750 + }, + { + "epoch": 82.65072765072765, + "grad_norm": 0.11204949021339417, + "learning_rate": 4.666018753654577e-09, + "loss": 0.0784, + "num_input_tokens_seen": 28525136, + "step": 39755 + }, + { + "epoch": 82.66112266112266, + "grad_norm": 1.0393905639648438, + "learning_rate": 4.478276397162917e-09, + "loss": 0.1361, + "num_input_tokens_seen": 28528720, + "step": 39760 + }, + { + "epoch": 82.67151767151768, + "grad_norm": 0.3337663412094116, + "learning_rate": 4.294388664233262e-09, + "loss": 0.1106, + "num_input_tokens_seen": 28532368, + "step": 39765 + }, + { + "epoch": 82.68191268191268, + "grad_norm": 0.32348713278770447, + "learning_rate": 4.114355583223484e-09, + "loss": 0.1359, + "num_input_tokens_seen": 28535984, + "step": 39770 + }, + { + "epoch": 82.6923076923077, + "grad_norm": 0.4210728704929352, + "learning_rate": 3.9381771818974845e-09, + "loss": 0.0955, + "num_input_tokens_seen": 28539696, + "step": 39775 + }, + { + "epoch": 82.70270270270271, + "grad_norm": 0.25051766633987427, + "learning_rate": 3.765853487427973e-09, + "loss": 0.1271, + "num_input_tokens_seen": 28543344, + "step": 39780 + }, + { + "epoch": 82.71309771309771, + "grad_norm": 0.7731287479400635, + "learning_rate": 3.5973845263825857e-09, + "loss": 0.0872, + "num_input_tokens_seen": 28546800, + "step": 39785 + }, + { + "epoch": 82.72349272349273, + "grad_norm": 0.3683357238769531, + "learning_rate": 3.4327703247488684e-09, + "loss": 0.0816, + "num_input_tokens_seen": 28550288, + "step": 39790 + }, + { + "epoch": 82.73388773388774, + "grad_norm": 0.16339462995529175, + "learning_rate": 3.2720109079037443e-09, + "loss": 0.0796, + "num_input_tokens_seen": 28553808, + "step": 39795 + }, + { + "epoch": 82.74428274428274, + "grad_norm": 0.5704171061515808, + "learning_rate": 3.1151063006468193e-09, + "loss": 0.0538, + "num_input_tokens_seen": 28557552, + "step": 39800 + }, + { + "epoch": 82.74428274428274, + "eval_loss": 0.14736677706241608, + "eval_runtime": 7.7465, + "eval_samples_per_second": 110.501, + "eval_steps_per_second": 27.625, + "num_input_tokens_seen": 28557552, + "step": 39800 + }, + { + "epoch": 82.75467775467776, + "grad_norm": 0.5754801630973816, + "learning_rate": 2.962056527169854e-09, + "loss": 0.1318, + "num_input_tokens_seen": 28561168, + "step": 39805 + }, + { + "epoch": 82.76507276507276, + "grad_norm": 0.37631189823150635, + "learning_rate": 2.8128616110761898e-09, + "loss": 0.0602, + "num_input_tokens_seen": 28564624, + "step": 39810 + }, + { + "epoch": 82.77546777546777, + "grad_norm": 0.14254304766654968, + "learning_rate": 2.6675215753724223e-09, + "loss": 0.0532, + "num_input_tokens_seen": 28568112, + "step": 39815 + }, + { + "epoch": 82.78586278586279, + "grad_norm": 0.21137459576129913, + "learning_rate": 2.5260364424739557e-09, + "loss": 0.0403, + "num_input_tokens_seen": 28571696, + "step": 39820 + }, + { + "epoch": 82.79625779625779, + "grad_norm": 0.18681517243385315, + "learning_rate": 2.3884062341994475e-09, + "loss": 0.0953, + "num_input_tokens_seen": 28575248, + "step": 39825 + }, + { + "epoch": 82.8066528066528, + "grad_norm": 0.8491746187210083, + "learning_rate": 2.25463097177081e-09, + "loss": 0.1226, + "num_input_tokens_seen": 28578800, + "step": 39830 + }, + { + "epoch": 82.81704781704782, + "grad_norm": 0.34458914399147034, + "learning_rate": 2.1247106758215397e-09, + "loss": 0.1332, + "num_input_tokens_seen": 28582544, + "step": 39835 + }, + { + "epoch": 82.82744282744282, + "grad_norm": 0.6924719214439392, + "learning_rate": 1.998645366382834e-09, + "loss": 0.1234, + "num_input_tokens_seen": 28586096, + "step": 39840 + }, + { + "epoch": 82.83783783783784, + "grad_norm": 0.518453061580658, + "learning_rate": 1.876435062897475e-09, + "loss": 0.1226, + "num_input_tokens_seen": 28589776, + "step": 39845 + }, + { + "epoch": 82.84823284823285, + "grad_norm": 0.29695233702659607, + "learning_rate": 1.758079784211497e-09, + "loss": 0.0852, + "num_input_tokens_seen": 28593232, + "step": 39850 + }, + { + "epoch": 82.85862785862786, + "grad_norm": 0.6937131881713867, + "learning_rate": 1.6435795485797434e-09, + "loss": 0.081, + "num_input_tokens_seen": 28596720, + "step": 39855 + }, + { + "epoch": 82.86902286902287, + "grad_norm": 0.2442285418510437, + "learning_rate": 1.5329343736547596e-09, + "loss": 0.0881, + "num_input_tokens_seen": 28600368, + "step": 39860 + }, + { + "epoch": 82.87941787941789, + "grad_norm": 0.0993712767958641, + "learning_rate": 1.4261442765006739e-09, + "loss": 0.0947, + "num_input_tokens_seen": 28604048, + "step": 39865 + }, + { + "epoch": 82.88981288981289, + "grad_norm": 0.31292563676834106, + "learning_rate": 1.3232092735876445e-09, + "loss": 0.0671, + "num_input_tokens_seen": 28607664, + "step": 39870 + }, + { + "epoch": 82.9002079002079, + "grad_norm": 0.40495648980140686, + "learning_rate": 1.2241293807918607e-09, + "loss": 0.0868, + "num_input_tokens_seen": 28611248, + "step": 39875 + }, + { + "epoch": 82.9106029106029, + "grad_norm": 0.3018670976161957, + "learning_rate": 1.128904613387216e-09, + "loss": 0.0697, + "num_input_tokens_seen": 28614736, + "step": 39880 + }, + { + "epoch": 82.92099792099792, + "grad_norm": 0.928359866142273, + "learning_rate": 1.0375349860591853e-09, + "loss": 0.1359, + "num_input_tokens_seen": 28618320, + "step": 39885 + }, + { + "epoch": 82.93139293139293, + "grad_norm": 0.17639374732971191, + "learning_rate": 9.5002051290205e-10, + "loss": 0.1128, + "num_input_tokens_seen": 28621936, + "step": 39890 + }, + { + "epoch": 82.94178794178794, + "grad_norm": 0.7620830535888672, + "learning_rate": 8.663612074077954e-10, + "loss": 0.1062, + "num_input_tokens_seen": 28625680, + "step": 39895 + }, + { + "epoch": 82.95218295218295, + "grad_norm": 0.5546718835830688, + "learning_rate": 7.865570824799884e-10, + "loss": 0.0729, + "num_input_tokens_seen": 28629200, + "step": 39900 + }, + { + "epoch": 82.96257796257797, + "grad_norm": 0.6529449224472046, + "learning_rate": 7.106081504254514e-10, + "loss": 0.095, + "num_input_tokens_seen": 28632656, + "step": 39905 + }, + { + "epoch": 82.97297297297297, + "grad_norm": 0.2667536735534668, + "learning_rate": 6.385144229570372e-10, + "loss": 0.1008, + "num_input_tokens_seen": 28636240, + "step": 39910 + }, + { + "epoch": 82.98336798336798, + "grad_norm": 0.30372366309165955, + "learning_rate": 5.70275911190854e-10, + "loss": 0.0758, + "num_input_tokens_seen": 28639920, + "step": 39915 + }, + { + "epoch": 82.993762993763, + "grad_norm": 0.27502039074897766, + "learning_rate": 5.058926256490403e-10, + "loss": 0.1002, + "num_input_tokens_seen": 28643440, + "step": 39920 + }, + { + "epoch": 83.004158004158, + "grad_norm": 0.2009638547897339, + "learning_rate": 4.4536457626254134e-10, + "loss": 0.0701, + "num_input_tokens_seen": 28646888, + "step": 39925 + }, + { + "epoch": 83.01455301455302, + "grad_norm": 0.15188826620578766, + "learning_rate": 3.88691772365557e-10, + "loss": 0.0516, + "num_input_tokens_seen": 28650536, + "step": 39930 + }, + { + "epoch": 83.02494802494803, + "grad_norm": 0.5052883625030518, + "learning_rate": 3.358742226955425e-10, + "loss": 0.095, + "num_input_tokens_seen": 28654248, + "step": 39935 + }, + { + "epoch": 83.03534303534303, + "grad_norm": 0.22137930989265442, + "learning_rate": 2.8691193539875925e-10, + "loss": 0.0951, + "num_input_tokens_seen": 28657768, + "step": 39940 + }, + { + "epoch": 83.04573804573805, + "grad_norm": 0.5051931738853455, + "learning_rate": 2.418049180274995e-10, + "loss": 0.07, + "num_input_tokens_seen": 28661512, + "step": 39945 + }, + { + "epoch": 83.05613305613305, + "grad_norm": 0.20361784100532532, + "learning_rate": 2.005531775373104e-10, + "loss": 0.0681, + "num_input_tokens_seen": 28665032, + "step": 39950 + }, + { + "epoch": 83.06652806652806, + "grad_norm": 0.3346145749092102, + "learning_rate": 1.6315672028699435e-10, + "loss": 0.0975, + "num_input_tokens_seen": 28668456, + "step": 39955 + }, + { + "epoch": 83.07692307692308, + "grad_norm": 0.29584094882011414, + "learning_rate": 1.2961555204693555e-10, + "loss": 0.0631, + "num_input_tokens_seen": 28671880, + "step": 39960 + }, + { + "epoch": 83.08731808731808, + "grad_norm": 0.27281057834625244, + "learning_rate": 9.992967798799768e-11, + "loss": 0.0899, + "num_input_tokens_seen": 28675432, + "step": 39965 + }, + { + "epoch": 83.0977130977131, + "grad_norm": 0.1983700543642044, + "learning_rate": 7.409910268707521e-11, + "loss": 0.0988, + "num_input_tokens_seen": 28679144, + "step": 39970 + }, + { + "epoch": 83.10810810810811, + "grad_norm": 0.46110454201698303, + "learning_rate": 5.212383012986877e-11, + "loss": 0.1284, + "num_input_tokens_seen": 28682696, + "step": 39975 + }, + { + "epoch": 83.11850311850311, + "grad_norm": 0.24030987918376923, + "learning_rate": 3.400386370533415e-11, + "loss": 0.0944, + "num_input_tokens_seen": 28686440, + "step": 39980 + }, + { + "epoch": 83.12889812889813, + "grad_norm": 0.40783700346946716, + "learning_rate": 1.9739206205682258e-11, + "loss": 0.0916, + "num_input_tokens_seen": 28690088, + "step": 39985 + }, + { + "epoch": 83.13929313929314, + "grad_norm": 0.4118722677230835, + "learning_rate": 9.329859829154685e-12, + "loss": 0.1184, + "num_input_tokens_seen": 28693608, + "step": 39990 + }, + { + "epoch": 83.14968814968815, + "grad_norm": 0.5227940678596497, + "learning_rate": 2.7758261855748148e-12, + "loss": 0.095, + "num_input_tokens_seen": 28697256, + "step": 39995 + }, + { + "epoch": 83.16008316008316, + "grad_norm": 0.16091160476207733, + "learning_rate": 7.710628524559838e-14, + "loss": 0.0847, + "num_input_tokens_seen": 28700680, + "step": 40000 + }, + { + "epoch": 83.16008316008316, + "eval_loss": 0.1452455073595047, + "eval_runtime": 7.7478, + "eval_samples_per_second": 110.482, + "eval_steps_per_second": 27.621, + "num_input_tokens_seen": 28700680, + "step": 40000 + }, + { + "epoch": 83.16008316008316, + "num_input_tokens_seen": 28700680, + "step": 40000, + "total_flos": 1.225072234834821e+18, + "train_loss": 0.11859339318908751, + "train_runtime": 23487.702, + "train_samples_per_second": 27.248, + "train_steps_per_second": 1.703 + } + ], + "logging_steps": 5, + "max_steps": 40000, + "num_input_tokens_seen": 28700680, + "num_train_epochs": 84, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.225072234834821e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}