{ "best_global_step": 9800, "best_metric": 2.5283308029174805, "best_model_checkpoint": "saves/lntuning/mistral-7b-instruct/train_multirc_1745950270/checkpoint-9800", "epoch": 6.525328330206379, "eval_steps": 200, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008157272208173587, "grad_norm": 8.5, "learning_rate": 4.999999876629946e-05, "loss": 2.3824, "num_input_tokens_seen": 8784, "step": 5 }, { "epoch": 0.0016314544416347174, "grad_norm": 5.5, "learning_rate": 4.999999375439123e-05, "loss": 2.2428, "num_input_tokens_seen": 18528, "step": 10 }, { "epoch": 0.002447181662452076, "grad_norm": 9.0, "learning_rate": 4.9999984887169785e-05, "loss": 3.4286, "num_input_tokens_seen": 29168, "step": 15 }, { "epoch": 0.0032629088832694347, "grad_norm": 5.15625, "learning_rate": 4.9999972164636506e-05, "loss": 2.5567, "num_input_tokens_seen": 40624, "step": 20 }, { "epoch": 0.004078636104086793, "grad_norm": 7.90625, "learning_rate": 4.999995558679334e-05, "loss": 2.8244, "num_input_tokens_seen": 49968, "step": 25 }, { "epoch": 0.004894363324904152, "grad_norm": 7.0625, "learning_rate": 4.999993515364287e-05, "loss": 1.5947, "num_input_tokens_seen": 60672, "step": 30 }, { "epoch": 0.005710090545721511, "grad_norm": 8.625, "learning_rate": 4.999991086518822e-05, "loss": 3.8766, "num_input_tokens_seen": 71408, "step": 35 }, { "epoch": 0.0065258177665388694, "grad_norm": 3.9375, "learning_rate": 4.999988272143315e-05, "loss": 2.6206, "num_input_tokens_seen": 82752, "step": 40 }, { "epoch": 0.007341544987356228, "grad_norm": 11.875, "learning_rate": 4.999985072238199e-05, "loss": 2.6362, "num_input_tokens_seen": 93296, "step": 45 }, { "epoch": 0.008157272208173586, "grad_norm": 6.84375, "learning_rate": 4.999981486803969e-05, "loss": 2.9237, "num_input_tokens_seen": 104592, "step": 50 }, { "epoch": 0.008972999428990946, "grad_norm": 1.25, "learning_rate": 4.999977515841176e-05, "loss": 2.3438, "num_input_tokens_seen": 114976, "step": 55 }, { "epoch": 0.009788726649808304, "grad_norm": 11.8125, "learning_rate": 4.9999731593504344e-05, "loss": 3.9838, "num_input_tokens_seen": 124944, "step": 60 }, { "epoch": 0.010604453870625663, "grad_norm": 10.875, "learning_rate": 4.999968417332415e-05, "loss": 2.651, "num_input_tokens_seen": 134560, "step": 65 }, { "epoch": 0.011420181091443021, "grad_norm": 1.7734375, "learning_rate": 4.999963289787848e-05, "loss": 1.7442, "num_input_tokens_seen": 144528, "step": 70 }, { "epoch": 0.012235908312260381, "grad_norm": 8.375, "learning_rate": 4.999957776717526e-05, "loss": 2.3111, "num_input_tokens_seen": 155056, "step": 75 }, { "epoch": 0.013051635533077739, "grad_norm": 2.796875, "learning_rate": 4.9999518781222984e-05, "loss": 2.7145, "num_input_tokens_seen": 164464, "step": 80 }, { "epoch": 0.013867362753895097, "grad_norm": 10.25, "learning_rate": 4.9999455940030746e-05, "loss": 3.3305, "num_input_tokens_seen": 174608, "step": 85 }, { "epoch": 0.014683089974712456, "grad_norm": 5.3125, "learning_rate": 4.999938924360824e-05, "loss": 1.7676, "num_input_tokens_seen": 185616, "step": 90 }, { "epoch": 0.015498817195529814, "grad_norm": 8.625, "learning_rate": 4.999931869196575e-05, "loss": 2.6622, "num_input_tokens_seen": 195360, "step": 95 }, { "epoch": 0.016314544416347172, "grad_norm": 11.8125, "learning_rate": 4.999924428511416e-05, "loss": 1.9336, "num_input_tokens_seen": 207152, "step": 100 }, { "epoch": 0.017130271637164532, "grad_norm": 10.625, "learning_rate": 4.999916602306494e-05, "loss": 2.0811, "num_input_tokens_seen": 218016, "step": 105 }, { "epoch": 0.01794599885798189, "grad_norm": 1.7734375, "learning_rate": 4.999908390583016e-05, "loss": 2.7338, "num_input_tokens_seen": 229680, "step": 110 }, { "epoch": 0.01876172607879925, "grad_norm": 4.40625, "learning_rate": 4.999899793342247e-05, "loss": 2.3528, "num_input_tokens_seen": 238624, "step": 115 }, { "epoch": 0.019577453299616607, "grad_norm": 7.28125, "learning_rate": 4.999890810585516e-05, "loss": 3.1091, "num_input_tokens_seen": 248592, "step": 120 }, { "epoch": 0.020393180520433967, "grad_norm": 6.40625, "learning_rate": 4.999881442314206e-05, "loss": 1.8262, "num_input_tokens_seen": 259392, "step": 125 }, { "epoch": 0.021208907741251327, "grad_norm": 8.6875, "learning_rate": 4.9998716885297617e-05, "loss": 4.0175, "num_input_tokens_seen": 269664, "step": 130 }, { "epoch": 0.022024634962068683, "grad_norm": 7.5625, "learning_rate": 4.999861549233688e-05, "loss": 2.6776, "num_input_tokens_seen": 280800, "step": 135 }, { "epoch": 0.022840362182886043, "grad_norm": 0.263671875, "learning_rate": 4.999851024427548e-05, "loss": 1.98, "num_input_tokens_seen": 290736, "step": 140 }, { "epoch": 0.023656089403703402, "grad_norm": 6.59375, "learning_rate": 4.999840114112965e-05, "loss": 1.8577, "num_input_tokens_seen": 301152, "step": 145 }, { "epoch": 0.024471816624520762, "grad_norm": 7.0625, "learning_rate": 4.999828818291621e-05, "loss": 1.5681, "num_input_tokens_seen": 311040, "step": 150 }, { "epoch": 0.025287543845338118, "grad_norm": 11.5, "learning_rate": 4.999817136965259e-05, "loss": 2.5782, "num_input_tokens_seen": 322512, "step": 155 }, { "epoch": 0.026103271066155478, "grad_norm": 10.1875, "learning_rate": 4.9998050701356794e-05, "loss": 2.9134, "num_input_tokens_seen": 333664, "step": 160 }, { "epoch": 0.026918998286972837, "grad_norm": 8.375, "learning_rate": 4.999792617804744e-05, "loss": 2.7584, "num_input_tokens_seen": 345104, "step": 165 }, { "epoch": 0.027734725507790194, "grad_norm": 4.71875, "learning_rate": 4.9997797799743724e-05, "loss": 2.09, "num_input_tokens_seen": 356288, "step": 170 }, { "epoch": 0.028550452728607553, "grad_norm": 9.6875, "learning_rate": 4.999766556646545e-05, "loss": 2.6638, "num_input_tokens_seen": 365744, "step": 175 }, { "epoch": 0.029366179949424913, "grad_norm": 9.375, "learning_rate": 4.9997529478232996e-05, "loss": 3.0835, "num_input_tokens_seen": 376080, "step": 180 }, { "epoch": 0.030181907170242273, "grad_norm": 7.53125, "learning_rate": 4.9997389535067365e-05, "loss": 2.6222, "num_input_tokens_seen": 386784, "step": 185 }, { "epoch": 0.03099763439105963, "grad_norm": 6.96875, "learning_rate": 4.999724573699012e-05, "loss": 3.8699, "num_input_tokens_seen": 396944, "step": 190 }, { "epoch": 0.031813361611876985, "grad_norm": 13.0, "learning_rate": 4.9997098084023457e-05, "loss": 2.9014, "num_input_tokens_seen": 407952, "step": 195 }, { "epoch": 0.032629088832694345, "grad_norm": 2.421875, "learning_rate": 4.999694657619013e-05, "loss": 1.8061, "num_input_tokens_seen": 418192, "step": 200 }, { "epoch": 0.032629088832694345, "eval_loss": 2.6757116317749023, "eval_runtime": 134.4547, "eval_samples_per_second": 20.267, "eval_steps_per_second": 10.137, "num_input_tokens_seen": 418192, "step": 200 }, { "epoch": 0.033444816053511704, "grad_norm": 9.25, "learning_rate": 4.999679121351352e-05, "loss": 2.4379, "num_input_tokens_seen": 429056, "step": 205 }, { "epoch": 0.034260543274329064, "grad_norm": 2.578125, "learning_rate": 4.9996631996017565e-05, "loss": 2.0123, "num_input_tokens_seen": 440768, "step": 210 }, { "epoch": 0.035076270495146424, "grad_norm": 8.375, "learning_rate": 4.9996468923726835e-05, "loss": 3.8622, "num_input_tokens_seen": 451120, "step": 215 }, { "epoch": 0.03589199771596378, "grad_norm": 5.25, "learning_rate": 4.999630199666647e-05, "loss": 1.69, "num_input_tokens_seen": 463056, "step": 220 }, { "epoch": 0.03670772493678114, "grad_norm": 10.5, "learning_rate": 4.999613121486222e-05, "loss": 2.3376, "num_input_tokens_seen": 474048, "step": 225 }, { "epoch": 0.0375234521575985, "grad_norm": 9.8125, "learning_rate": 4.999595657834041e-05, "loss": 2.4052, "num_input_tokens_seen": 483968, "step": 230 }, { "epoch": 0.038339179378415855, "grad_norm": 3.875, "learning_rate": 4.999577808712798e-05, "loss": 1.6723, "num_input_tokens_seen": 494032, "step": 235 }, { "epoch": 0.039154906599233215, "grad_norm": 12.125, "learning_rate": 4.999559574125244e-05, "loss": 3.0367, "num_input_tokens_seen": 505168, "step": 240 }, { "epoch": 0.039970633820050575, "grad_norm": 4.28125, "learning_rate": 4.9995409540741934e-05, "loss": 1.3688, "num_input_tokens_seen": 514768, "step": 245 }, { "epoch": 0.040786361040867934, "grad_norm": 2.046875, "learning_rate": 4.999521948562516e-05, "loss": 1.7807, "num_input_tokens_seen": 525776, "step": 250 }, { "epoch": 0.041602088261685294, "grad_norm": 5.5, "learning_rate": 4.999502557593143e-05, "loss": 2.1225, "num_input_tokens_seen": 535216, "step": 255 }, { "epoch": 0.042417815482502653, "grad_norm": 2.5625, "learning_rate": 4.999482781169066e-05, "loss": 1.8369, "num_input_tokens_seen": 544864, "step": 260 }, { "epoch": 0.04323354270332001, "grad_norm": 3.71875, "learning_rate": 4.9994626192933324e-05, "loss": 1.7563, "num_input_tokens_seen": 556624, "step": 265 }, { "epoch": 0.044049269924137366, "grad_norm": 5.21875, "learning_rate": 4.999442071969054e-05, "loss": 2.7922, "num_input_tokens_seen": 566320, "step": 270 }, { "epoch": 0.044864997144954726, "grad_norm": 7.90625, "learning_rate": 4.999421139199397e-05, "loss": 1.5266, "num_input_tokens_seen": 577312, "step": 275 }, { "epoch": 0.045680724365772085, "grad_norm": 2.84375, "learning_rate": 4.999399820987592e-05, "loss": 2.6298, "num_input_tokens_seen": 587840, "step": 280 }, { "epoch": 0.046496451586589445, "grad_norm": 3.8125, "learning_rate": 4.999378117336924e-05, "loss": 1.4227, "num_input_tokens_seen": 598224, "step": 285 }, { "epoch": 0.047312178807406804, "grad_norm": 10.375, "learning_rate": 4.9993560282507415e-05, "loss": 2.2756, "num_input_tokens_seen": 607856, "step": 290 }, { "epoch": 0.048127906028224164, "grad_norm": 12.5, "learning_rate": 4.9993335537324495e-05, "loss": 2.8238, "num_input_tokens_seen": 618656, "step": 295 }, { "epoch": 0.048943633249041524, "grad_norm": 3.515625, "learning_rate": 4.999310693785516e-05, "loss": 1.4007, "num_input_tokens_seen": 629056, "step": 300 }, { "epoch": 0.049759360469858877, "grad_norm": 8.6875, "learning_rate": 4.9992874484134653e-05, "loss": 1.9266, "num_input_tokens_seen": 641280, "step": 305 }, { "epoch": 0.050575087690676236, "grad_norm": 5.375, "learning_rate": 4.999263817619882e-05, "loss": 1.5467, "num_input_tokens_seen": 650992, "step": 310 }, { "epoch": 0.051390814911493596, "grad_norm": 10.5625, "learning_rate": 4.9992398014084105e-05, "loss": 3.6772, "num_input_tokens_seen": 662112, "step": 315 }, { "epoch": 0.052206542132310955, "grad_norm": 7.6875, "learning_rate": 4.999215399782754e-05, "loss": 2.7125, "num_input_tokens_seen": 672912, "step": 320 }, { "epoch": 0.053022269353128315, "grad_norm": 7.78125, "learning_rate": 4.999190612746675e-05, "loss": 3.8661, "num_input_tokens_seen": 682864, "step": 325 }, { "epoch": 0.053837996573945675, "grad_norm": 6.4375, "learning_rate": 4.999165440303998e-05, "loss": 3.0503, "num_input_tokens_seen": 692480, "step": 330 }, { "epoch": 0.054653723794763034, "grad_norm": 5.28125, "learning_rate": 4.999139882458603e-05, "loss": 1.3731, "num_input_tokens_seen": 703312, "step": 335 }, { "epoch": 0.05546945101558039, "grad_norm": 3.703125, "learning_rate": 4.9991139392144314e-05, "loss": 1.5192, "num_input_tokens_seen": 713744, "step": 340 }, { "epoch": 0.05628517823639775, "grad_norm": 7.5, "learning_rate": 4.999087610575485e-05, "loss": 2.8708, "num_input_tokens_seen": 722880, "step": 345 }, { "epoch": 0.057100905457215106, "grad_norm": 8.125, "learning_rate": 4.999060896545824e-05, "loss": 2.995, "num_input_tokens_seen": 732912, "step": 350 }, { "epoch": 0.057916632678032466, "grad_norm": 9.0625, "learning_rate": 4.999033797129568e-05, "loss": 2.0663, "num_input_tokens_seen": 744640, "step": 355 }, { "epoch": 0.058732359898849826, "grad_norm": 7.125, "learning_rate": 4.999006312330894e-05, "loss": 1.2672, "num_input_tokens_seen": 754432, "step": 360 }, { "epoch": 0.059548087119667185, "grad_norm": 3.421875, "learning_rate": 4.998978442154043e-05, "loss": 3.1734, "num_input_tokens_seen": 766304, "step": 365 }, { "epoch": 0.060363814340484545, "grad_norm": 5.75, "learning_rate": 4.9989501866033125e-05, "loss": 1.9721, "num_input_tokens_seen": 777072, "step": 370 }, { "epoch": 0.0611795415613019, "grad_norm": 9.8125, "learning_rate": 4.998921545683059e-05, "loss": 3.5648, "num_input_tokens_seen": 787808, "step": 375 }, { "epoch": 0.06199526878211926, "grad_norm": 7.46875, "learning_rate": 4.9988925193976996e-05, "loss": 3.6657, "num_input_tokens_seen": 798016, "step": 380 }, { "epoch": 0.06281099600293662, "grad_norm": 5.1875, "learning_rate": 4.998863107751711e-05, "loss": 3.4512, "num_input_tokens_seen": 807632, "step": 385 }, { "epoch": 0.06362672322375397, "grad_norm": 8.125, "learning_rate": 4.998833310749629e-05, "loss": 2.8466, "num_input_tokens_seen": 816608, "step": 390 }, { "epoch": 0.06444245044457134, "grad_norm": 1.8359375, "learning_rate": 4.998803128396047e-05, "loss": 1.973, "num_input_tokens_seen": 826208, "step": 395 }, { "epoch": 0.06525817766538869, "grad_norm": 4.84375, "learning_rate": 4.9987725606956215e-05, "loss": 1.6352, "num_input_tokens_seen": 836224, "step": 400 }, { "epoch": 0.06525817766538869, "eval_loss": 2.597731351852417, "eval_runtime": 134.5236, "eval_samples_per_second": 20.257, "eval_steps_per_second": 10.132, "num_input_tokens_seen": 836224, "step": 400 }, { "epoch": 0.06607390488620606, "grad_norm": 4.8125, "learning_rate": 4.998741607653066e-05, "loss": 2.1235, "num_input_tokens_seen": 846160, "step": 405 }, { "epoch": 0.06688963210702341, "grad_norm": 2.71875, "learning_rate": 4.9987102692731523e-05, "loss": 1.5486, "num_input_tokens_seen": 856608, "step": 410 }, { "epoch": 0.06770535932784078, "grad_norm": 22.25, "learning_rate": 4.9986785455607157e-05, "loss": 2.1981, "num_input_tokens_seen": 867568, "step": 415 }, { "epoch": 0.06852108654865813, "grad_norm": 10.9375, "learning_rate": 4.9986464365206456e-05, "loss": 1.3882, "num_input_tokens_seen": 878496, "step": 420 }, { "epoch": 0.06933681376947548, "grad_norm": 7.3125, "learning_rate": 4.9986139421578956e-05, "loss": 2.2494, "num_input_tokens_seen": 888784, "step": 425 }, { "epoch": 0.07015254099029285, "grad_norm": 3.984375, "learning_rate": 4.998581062477477e-05, "loss": 1.0806, "num_input_tokens_seen": 899392, "step": 430 }, { "epoch": 0.0709682682111102, "grad_norm": 7.21875, "learning_rate": 4.998547797484458e-05, "loss": 2.2443, "num_input_tokens_seen": 910112, "step": 435 }, { "epoch": 0.07178399543192757, "grad_norm": 7.03125, "learning_rate": 4.9985141471839706e-05, "loss": 2.548, "num_input_tokens_seen": 920000, "step": 440 }, { "epoch": 0.07259972265274492, "grad_norm": 3.828125, "learning_rate": 4.998480111581203e-05, "loss": 4.4676, "num_input_tokens_seen": 930064, "step": 445 }, { "epoch": 0.07341544987356229, "grad_norm": 9.75, "learning_rate": 4.998445690681405e-05, "loss": 1.6247, "num_input_tokens_seen": 940512, "step": 450 }, { "epoch": 0.07423117709437964, "grad_norm": 5.0, "learning_rate": 4.9984108844898834e-05, "loss": 2.3822, "num_input_tokens_seen": 951472, "step": 455 }, { "epoch": 0.075046904315197, "grad_norm": 7.0, "learning_rate": 4.9983756930120076e-05, "loss": 2.2208, "num_input_tokens_seen": 961456, "step": 460 }, { "epoch": 0.07586263153601436, "grad_norm": 4.96875, "learning_rate": 4.9983401162532025e-05, "loss": 2.1054, "num_input_tokens_seen": 972896, "step": 465 }, { "epoch": 0.07667835875683171, "grad_norm": 12.625, "learning_rate": 4.998304154218955e-05, "loss": 1.9543, "num_input_tokens_seen": 984560, "step": 470 }, { "epoch": 0.07749408597764908, "grad_norm": 10.5625, "learning_rate": 4.998267806914812e-05, "loss": 2.9675, "num_input_tokens_seen": 994912, "step": 475 }, { "epoch": 0.07830981319846643, "grad_norm": 5.1875, "learning_rate": 4.998231074346378e-05, "loss": 2.2254, "num_input_tokens_seen": 1004656, "step": 480 }, { "epoch": 0.0791255404192838, "grad_norm": 8.0, "learning_rate": 4.998193956519317e-05, "loss": 1.6657, "num_input_tokens_seen": 1016000, "step": 485 }, { "epoch": 0.07994126764010115, "grad_norm": 2.0, "learning_rate": 4.9981564534393545e-05, "loss": 3.3288, "num_input_tokens_seen": 1027104, "step": 490 }, { "epoch": 0.08075699486091852, "grad_norm": 9.3125, "learning_rate": 4.998118565112272e-05, "loss": 2.7015, "num_input_tokens_seen": 1035696, "step": 495 }, { "epoch": 0.08157272208173587, "grad_norm": 2.046875, "learning_rate": 4.998080291543914e-05, "loss": 2.1205, "num_input_tokens_seen": 1046704, "step": 500 }, { "epoch": 0.08238844930255322, "grad_norm": 6.71875, "learning_rate": 4.9980416327401826e-05, "loss": 2.2885, "num_input_tokens_seen": 1056976, "step": 505 }, { "epoch": 0.08320417652337059, "grad_norm": 6.9375, "learning_rate": 4.998002588707038e-05, "loss": 3.1043, "num_input_tokens_seen": 1066640, "step": 510 }, { "epoch": 0.08401990374418794, "grad_norm": 3.046875, "learning_rate": 4.997963159450503e-05, "loss": 1.6898, "num_input_tokens_seen": 1077552, "step": 515 }, { "epoch": 0.08483563096500531, "grad_norm": 4.59375, "learning_rate": 4.9979233449766575e-05, "loss": 3.3036, "num_input_tokens_seen": 1087552, "step": 520 }, { "epoch": 0.08565135818582266, "grad_norm": 1.859375, "learning_rate": 4.997883145291641e-05, "loss": 1.3319, "num_input_tokens_seen": 1097904, "step": 525 }, { "epoch": 0.08646708540664003, "grad_norm": 8.0625, "learning_rate": 4.9978425604016536e-05, "loss": 1.5346, "num_input_tokens_seen": 1108416, "step": 530 }, { "epoch": 0.08728281262745738, "grad_norm": 12.5, "learning_rate": 4.9978015903129536e-05, "loss": 2.9244, "num_input_tokens_seen": 1119776, "step": 535 }, { "epoch": 0.08809853984827473, "grad_norm": 10.0, "learning_rate": 4.997760235031859e-05, "loss": 2.305, "num_input_tokens_seen": 1129920, "step": 540 }, { "epoch": 0.0889142670690921, "grad_norm": 9.9375, "learning_rate": 4.9977184945647473e-05, "loss": 2.0366, "num_input_tokens_seen": 1140944, "step": 545 }, { "epoch": 0.08972999428990945, "grad_norm": 9.0, "learning_rate": 4.997676368918055e-05, "loss": 2.2726, "num_input_tokens_seen": 1152800, "step": 550 }, { "epoch": 0.09054572151072682, "grad_norm": 10.6875, "learning_rate": 4.9976338580982794e-05, "loss": 3.5505, "num_input_tokens_seen": 1164656, "step": 555 }, { "epoch": 0.09136144873154417, "grad_norm": 6.1875, "learning_rate": 4.9975909621119755e-05, "loss": 1.8602, "num_input_tokens_seen": 1175328, "step": 560 }, { "epoch": 0.09217717595236154, "grad_norm": 5.21875, "learning_rate": 4.997547680965758e-05, "loss": 2.288, "num_input_tokens_seen": 1185264, "step": 565 }, { "epoch": 0.09299290317317889, "grad_norm": 6.6875, "learning_rate": 4.997504014666302e-05, "loss": 2.9191, "num_input_tokens_seen": 1194912, "step": 570 }, { "epoch": 0.09380863039399624, "grad_norm": 5.5, "learning_rate": 4.997459963220342e-05, "loss": 3.4976, "num_input_tokens_seen": 1205792, "step": 575 }, { "epoch": 0.09462435761481361, "grad_norm": 6.65625, "learning_rate": 4.997415526634671e-05, "loss": 2.7631, "num_input_tokens_seen": 1216352, "step": 580 }, { "epoch": 0.09544008483563096, "grad_norm": 5.65625, "learning_rate": 4.99737070491614e-05, "loss": 2.0976, "num_input_tokens_seen": 1225664, "step": 585 }, { "epoch": 0.09625581205644833, "grad_norm": 7.40625, "learning_rate": 4.997325498071663e-05, "loss": 2.7143, "num_input_tokens_seen": 1236912, "step": 590 }, { "epoch": 0.09707153927726568, "grad_norm": 5.90625, "learning_rate": 4.997279906108211e-05, "loss": 2.7869, "num_input_tokens_seen": 1248064, "step": 595 }, { "epoch": 0.09788726649808305, "grad_norm": 12.375, "learning_rate": 4.9972339290328155e-05, "loss": 3.1415, "num_input_tokens_seen": 1258320, "step": 600 }, { "epoch": 0.09788726649808305, "eval_loss": 2.5582432746887207, "eval_runtime": 134.6444, "eval_samples_per_second": 20.238, "eval_steps_per_second": 10.123, "num_input_tokens_seen": 1258320, "step": 600 }, { "epoch": 0.0987029937189004, "grad_norm": 7.0, "learning_rate": 4.9971875668525646e-05, "loss": 3.0459, "num_input_tokens_seen": 1268864, "step": 605 }, { "epoch": 0.09951872093971775, "grad_norm": 8.0625, "learning_rate": 4.997140819574609e-05, "loss": 2.2181, "num_input_tokens_seen": 1279616, "step": 610 }, { "epoch": 0.10033444816053512, "grad_norm": 6.21875, "learning_rate": 4.997093687206159e-05, "loss": 2.3303, "num_input_tokens_seen": 1290688, "step": 615 }, { "epoch": 0.10115017538135247, "grad_norm": 11.9375, "learning_rate": 4.997046169754482e-05, "loss": 2.1778, "num_input_tokens_seen": 1300656, "step": 620 }, { "epoch": 0.10196590260216984, "grad_norm": 1.359375, "learning_rate": 4.996998267226905e-05, "loss": 2.0096, "num_input_tokens_seen": 1311056, "step": 625 }, { "epoch": 0.10278162982298719, "grad_norm": 7.96875, "learning_rate": 4.996949979630817e-05, "loss": 3.1864, "num_input_tokens_seen": 1321024, "step": 630 }, { "epoch": 0.10359735704380456, "grad_norm": 7.625, "learning_rate": 4.996901306973663e-05, "loss": 3.224, "num_input_tokens_seen": 1330912, "step": 635 }, { "epoch": 0.10441308426462191, "grad_norm": 11.625, "learning_rate": 4.996852249262949e-05, "loss": 2.4455, "num_input_tokens_seen": 1339840, "step": 640 }, { "epoch": 0.10522881148543926, "grad_norm": 6.21875, "learning_rate": 4.996802806506241e-05, "loss": 1.9519, "num_input_tokens_seen": 1350208, "step": 645 }, { "epoch": 0.10604453870625663, "grad_norm": 0.490234375, "learning_rate": 4.996752978711164e-05, "loss": 1.7961, "num_input_tokens_seen": 1360720, "step": 650 }, { "epoch": 0.10686026592707398, "grad_norm": 4.15625, "learning_rate": 4.996702765885401e-05, "loss": 2.6737, "num_input_tokens_seen": 1371792, "step": 655 }, { "epoch": 0.10767599314789135, "grad_norm": 5.84375, "learning_rate": 4.9966521680366964e-05, "loss": 1.1267, "num_input_tokens_seen": 1381456, "step": 660 }, { "epoch": 0.1084917203687087, "grad_norm": 8.125, "learning_rate": 4.9966011851728524e-05, "loss": 2.1329, "num_input_tokens_seen": 1390240, "step": 665 }, { "epoch": 0.10930744758952607, "grad_norm": 6.0, "learning_rate": 4.996549817301731e-05, "loss": 2.0607, "num_input_tokens_seen": 1401488, "step": 670 }, { "epoch": 0.11012317481034342, "grad_norm": 4.625, "learning_rate": 4.9964980644312544e-05, "loss": 1.52, "num_input_tokens_seen": 1412208, "step": 675 }, { "epoch": 0.11093890203116077, "grad_norm": 6.90625, "learning_rate": 4.996445926569403e-05, "loss": 2.7516, "num_input_tokens_seen": 1423232, "step": 680 }, { "epoch": 0.11175462925197814, "grad_norm": 5.65625, "learning_rate": 4.996393403724218e-05, "loss": 2.8692, "num_input_tokens_seen": 1432896, "step": 685 }, { "epoch": 0.1125703564727955, "grad_norm": 5.28125, "learning_rate": 4.9963404959037985e-05, "loss": 2.9269, "num_input_tokens_seen": 1443216, "step": 690 }, { "epoch": 0.11338608369361286, "grad_norm": 11.125, "learning_rate": 4.996287203116303e-05, "loss": 2.5973, "num_input_tokens_seen": 1454352, "step": 695 }, { "epoch": 0.11420181091443021, "grad_norm": 7.65625, "learning_rate": 4.996233525369951e-05, "loss": 1.3081, "num_input_tokens_seen": 1464448, "step": 700 }, { "epoch": 0.11501753813524758, "grad_norm": 7.625, "learning_rate": 4.99617946267302e-05, "loss": 1.955, "num_input_tokens_seen": 1474704, "step": 705 }, { "epoch": 0.11583326535606493, "grad_norm": 7.28125, "learning_rate": 4.996125015033846e-05, "loss": 1.7467, "num_input_tokens_seen": 1486400, "step": 710 }, { "epoch": 0.11664899257688228, "grad_norm": 5.71875, "learning_rate": 4.996070182460827e-05, "loss": 1.7782, "num_input_tokens_seen": 1497488, "step": 715 }, { "epoch": 0.11746471979769965, "grad_norm": 10.875, "learning_rate": 4.996014964962418e-05, "loss": 2.5251, "num_input_tokens_seen": 1506928, "step": 720 }, { "epoch": 0.118280447018517, "grad_norm": 12.875, "learning_rate": 4.9959593625471344e-05, "loss": 3.5792, "num_input_tokens_seen": 1517280, "step": 725 }, { "epoch": 0.11909617423933437, "grad_norm": 7.28125, "learning_rate": 4.995903375223552e-05, "loss": 0.9184, "num_input_tokens_seen": 1526864, "step": 730 }, { "epoch": 0.11991190146015172, "grad_norm": 12.375, "learning_rate": 4.995847003000302e-05, "loss": 2.668, "num_input_tokens_seen": 1537712, "step": 735 }, { "epoch": 0.12072762868096909, "grad_norm": 4.4375, "learning_rate": 4.9957902458860804e-05, "loss": 1.4034, "num_input_tokens_seen": 1548752, "step": 740 }, { "epoch": 0.12154335590178644, "grad_norm": 5.9375, "learning_rate": 4.995733103889639e-05, "loss": 2.2332, "num_input_tokens_seen": 1560400, "step": 745 }, { "epoch": 0.1223590831226038, "grad_norm": 6.15625, "learning_rate": 4.99567557701979e-05, "loss": 1.6152, "num_input_tokens_seen": 1571696, "step": 750 }, { "epoch": 0.12317481034342116, "grad_norm": 3.671875, "learning_rate": 4.995617665285403e-05, "loss": 1.0118, "num_input_tokens_seen": 1582720, "step": 755 }, { "epoch": 0.12399053756423851, "grad_norm": 11.25, "learning_rate": 4.99555936869541e-05, "loss": 3.0687, "num_input_tokens_seen": 1593936, "step": 760 }, { "epoch": 0.12480626478505588, "grad_norm": 14.1875, "learning_rate": 4.995500687258803e-05, "loss": 2.5301, "num_input_tokens_seen": 1604384, "step": 765 }, { "epoch": 0.12562199200587323, "grad_norm": 4.4375, "learning_rate": 4.995441620984628e-05, "loss": 1.1851, "num_input_tokens_seen": 1614704, "step": 770 }, { "epoch": 0.1264377192266906, "grad_norm": 5.21875, "learning_rate": 4.995382169881996e-05, "loss": 3.8308, "num_input_tokens_seen": 1623680, "step": 775 }, { "epoch": 0.12725344644750794, "grad_norm": 11.375, "learning_rate": 4.9953223339600755e-05, "loss": 2.9726, "num_input_tokens_seen": 1633664, "step": 780 }, { "epoch": 0.12806917366832532, "grad_norm": 9.125, "learning_rate": 4.995262113228091e-05, "loss": 2.441, "num_input_tokens_seen": 1643552, "step": 785 }, { "epoch": 0.12888490088914267, "grad_norm": 5.6875, "learning_rate": 4.995201507695332e-05, "loss": 1.8785, "num_input_tokens_seen": 1654544, "step": 790 }, { "epoch": 0.12970062810996003, "grad_norm": 5.125, "learning_rate": 4.995140517371144e-05, "loss": 2.0845, "num_input_tokens_seen": 1663728, "step": 795 }, { "epoch": 0.13051635533077738, "grad_norm": 3.875, "learning_rate": 4.995079142264932e-05, "loss": 2.0399, "num_input_tokens_seen": 1673984, "step": 800 }, { "epoch": 0.13051635533077738, "eval_loss": 2.554769277572632, "eval_runtime": 134.7236, "eval_samples_per_second": 20.227, "eval_steps_per_second": 10.117, "num_input_tokens_seen": 1673984, "step": 800 }, { "epoch": 0.13133208255159476, "grad_norm": 3.96875, "learning_rate": 4.995017382386162e-05, "loss": 2.7465, "num_input_tokens_seen": 1684640, "step": 805 }, { "epoch": 0.1321478097724121, "grad_norm": 10.8125, "learning_rate": 4.994955237744356e-05, "loss": 2.7043, "num_input_tokens_seen": 1694576, "step": 810 }, { "epoch": 0.13296353699322946, "grad_norm": 6.46875, "learning_rate": 4.994892708349101e-05, "loss": 3.1384, "num_input_tokens_seen": 1705856, "step": 815 }, { "epoch": 0.13377926421404682, "grad_norm": 10.8125, "learning_rate": 4.994829794210035e-05, "loss": 1.7652, "num_input_tokens_seen": 1716432, "step": 820 }, { "epoch": 0.13459499143486417, "grad_norm": 8.4375, "learning_rate": 4.994766495336864e-05, "loss": 2.9388, "num_input_tokens_seen": 1725936, "step": 825 }, { "epoch": 0.13541071865568155, "grad_norm": 8.5, "learning_rate": 4.994702811739348e-05, "loss": 1.2269, "num_input_tokens_seen": 1736256, "step": 830 }, { "epoch": 0.1362264458764989, "grad_norm": 8.3125, "learning_rate": 4.994638743427308e-05, "loss": 2.9047, "num_input_tokens_seen": 1746512, "step": 835 }, { "epoch": 0.13704217309731626, "grad_norm": 11.0, "learning_rate": 4.994574290410624e-05, "loss": 2.9808, "num_input_tokens_seen": 1756448, "step": 840 }, { "epoch": 0.1378579003181336, "grad_norm": 9.9375, "learning_rate": 4.9945094526992364e-05, "loss": 3.9451, "num_input_tokens_seen": 1766528, "step": 845 }, { "epoch": 0.13867362753895096, "grad_norm": 11.5625, "learning_rate": 4.994444230303142e-05, "loss": 1.4968, "num_input_tokens_seen": 1778784, "step": 850 }, { "epoch": 0.13948935475976834, "grad_norm": 3.8125, "learning_rate": 4.994378623232402e-05, "loss": 2.9597, "num_input_tokens_seen": 1790016, "step": 855 }, { "epoch": 0.1403050819805857, "grad_norm": 5.40625, "learning_rate": 4.99431263149713e-05, "loss": 1.8258, "num_input_tokens_seen": 1800608, "step": 860 }, { "epoch": 0.14112080920140305, "grad_norm": 5.9375, "learning_rate": 4.9942462551075056e-05, "loss": 2.5323, "num_input_tokens_seen": 1810608, "step": 865 }, { "epoch": 0.1419365364222204, "grad_norm": 2.34375, "learning_rate": 4.994179494073764e-05, "loss": 2.4259, "num_input_tokens_seen": 1822896, "step": 870 }, { "epoch": 0.14275226364303778, "grad_norm": 12.5625, "learning_rate": 4.9941123484062e-05, "loss": 3.2869, "num_input_tokens_seen": 1835712, "step": 875 }, { "epoch": 0.14356799086385513, "grad_norm": 5.875, "learning_rate": 4.99404481811517e-05, "loss": 2.3777, "num_input_tokens_seen": 1845856, "step": 880 }, { "epoch": 0.14438371808467249, "grad_norm": 5.71875, "learning_rate": 4.9939769032110864e-05, "loss": 2.6378, "num_input_tokens_seen": 1855360, "step": 885 }, { "epoch": 0.14519944530548984, "grad_norm": 12.0625, "learning_rate": 4.993908603704423e-05, "loss": 3.7733, "num_input_tokens_seen": 1865264, "step": 890 }, { "epoch": 0.1460151725263072, "grad_norm": 4.875, "learning_rate": 4.9938399196057126e-05, "loss": 2.6998, "num_input_tokens_seen": 1874608, "step": 895 }, { "epoch": 0.14683089974712457, "grad_norm": 7.78125, "learning_rate": 4.993770850925547e-05, "loss": 2.7642, "num_input_tokens_seen": 1884736, "step": 900 }, { "epoch": 0.14764662696794192, "grad_norm": 3.25, "learning_rate": 4.993701397674577e-05, "loss": 1.2244, "num_input_tokens_seen": 1896112, "step": 905 }, { "epoch": 0.14846235418875928, "grad_norm": 3.4375, "learning_rate": 4.993631559863515e-05, "loss": 2.2194, "num_input_tokens_seen": 1908128, "step": 910 }, { "epoch": 0.14927808140957663, "grad_norm": 7.5625, "learning_rate": 4.9935613375031283e-05, "loss": 1.7767, "num_input_tokens_seen": 1918528, "step": 915 }, { "epoch": 0.150093808630394, "grad_norm": 8.6875, "learning_rate": 4.993490730604248e-05, "loss": 2.8698, "num_input_tokens_seen": 1928496, "step": 920 }, { "epoch": 0.15090953585121136, "grad_norm": 7.4375, "learning_rate": 4.993419739177761e-05, "loss": 3.9938, "num_input_tokens_seen": 1939504, "step": 925 }, { "epoch": 0.15172526307202872, "grad_norm": 6.90625, "learning_rate": 4.9933483632346164e-05, "loss": 0.9385, "num_input_tokens_seen": 1950464, "step": 930 }, { "epoch": 0.15254099029284607, "grad_norm": 7.125, "learning_rate": 4.993276602785821e-05, "loss": 2.1691, "num_input_tokens_seen": 1962032, "step": 935 }, { "epoch": 0.15335671751366342, "grad_norm": 2.421875, "learning_rate": 4.993204457842441e-05, "loss": 1.9701, "num_input_tokens_seen": 1972272, "step": 940 }, { "epoch": 0.1541724447344808, "grad_norm": 5.59375, "learning_rate": 4.993131928415602e-05, "loss": 1.7892, "num_input_tokens_seen": 1982240, "step": 945 }, { "epoch": 0.15498817195529815, "grad_norm": 6.125, "learning_rate": 4.993059014516489e-05, "loss": 1.6486, "num_input_tokens_seen": 1993472, "step": 950 }, { "epoch": 0.1558038991761155, "grad_norm": 7.34375, "learning_rate": 4.9929857161563464e-05, "loss": 2.4397, "num_input_tokens_seen": 2003360, "step": 955 }, { "epoch": 0.15661962639693286, "grad_norm": 7.25, "learning_rate": 4.992912033346477e-05, "loss": 2.1575, "num_input_tokens_seen": 2015504, "step": 960 }, { "epoch": 0.1574353536177502, "grad_norm": 5.40625, "learning_rate": 4.992837966098245e-05, "loss": 1.8883, "num_input_tokens_seen": 2025280, "step": 965 }, { "epoch": 0.1582510808385676, "grad_norm": 5.90625, "learning_rate": 4.992763514423071e-05, "loss": 2.3832, "num_input_tokens_seen": 2035136, "step": 970 }, { "epoch": 0.15906680805938495, "grad_norm": 5.125, "learning_rate": 4.992688678332437e-05, "loss": 2.0456, "num_input_tokens_seen": 2045664, "step": 975 }, { "epoch": 0.1598825352802023, "grad_norm": 6.8125, "learning_rate": 4.992613457837884e-05, "loss": 1.7361, "num_input_tokens_seen": 2056400, "step": 980 }, { "epoch": 0.16069826250101965, "grad_norm": 4.96875, "learning_rate": 4.992537852951011e-05, "loss": 1.2248, "num_input_tokens_seen": 2066560, "step": 985 }, { "epoch": 0.16151398972183703, "grad_norm": 2.8125, "learning_rate": 4.9924618636834785e-05, "loss": 2.0636, "num_input_tokens_seen": 2077728, "step": 990 }, { "epoch": 0.16232971694265438, "grad_norm": 9.125, "learning_rate": 4.9923854900470046e-05, "loss": 1.8059, "num_input_tokens_seen": 2087728, "step": 995 }, { "epoch": 0.16314544416347174, "grad_norm": 0.234375, "learning_rate": 4.992308732053367e-05, "loss": 1.762, "num_input_tokens_seen": 2097344, "step": 1000 }, { "epoch": 0.16314544416347174, "eval_loss": 2.5650367736816406, "eval_runtime": 134.7133, "eval_samples_per_second": 20.228, "eval_steps_per_second": 10.118, "num_input_tokens_seen": 2097344, "step": 1000 }, { "epoch": 0.1639611713842891, "grad_norm": 6.6875, "learning_rate": 4.992231589714402e-05, "loss": 1.7563, "num_input_tokens_seen": 2108880, "step": 1005 }, { "epoch": 0.16477689860510644, "grad_norm": 3.984375, "learning_rate": 4.992154063042007e-05, "loss": 1.0112, "num_input_tokens_seen": 2117952, "step": 1010 }, { "epoch": 0.16559262582592382, "grad_norm": 8.5625, "learning_rate": 4.992076152048136e-05, "loss": 1.9582, "num_input_tokens_seen": 2129728, "step": 1015 }, { "epoch": 0.16640835304674118, "grad_norm": 5.375, "learning_rate": 4.991997856744807e-05, "loss": 2.4562, "num_input_tokens_seen": 2140256, "step": 1020 }, { "epoch": 0.16722408026755853, "grad_norm": 5.34375, "learning_rate": 4.9919191771440905e-05, "loss": 2.206, "num_input_tokens_seen": 2151088, "step": 1025 }, { "epoch": 0.16803980748837588, "grad_norm": 0.08056640625, "learning_rate": 4.991840113258122e-05, "loss": 1.0637, "num_input_tokens_seen": 2161184, "step": 1030 }, { "epoch": 0.16885553470919323, "grad_norm": 3.03125, "learning_rate": 4.9917606650990933e-05, "loss": 2.1335, "num_input_tokens_seen": 2171520, "step": 1035 }, { "epoch": 0.16967126193001061, "grad_norm": 7.09375, "learning_rate": 4.9916808326792566e-05, "loss": 2.1058, "num_input_tokens_seen": 2181968, "step": 1040 }, { "epoch": 0.17048698915082797, "grad_norm": 1.5, "learning_rate": 4.9916006160109235e-05, "loss": 2.2951, "num_input_tokens_seen": 2192800, "step": 1045 }, { "epoch": 0.17130271637164532, "grad_norm": 5.3125, "learning_rate": 4.991520015106464e-05, "loss": 2.0303, "num_input_tokens_seen": 2203744, "step": 1050 }, { "epoch": 0.17211844359246267, "grad_norm": 3.140625, "learning_rate": 4.991439029978308e-05, "loss": 1.7062, "num_input_tokens_seen": 2212240, "step": 1055 }, { "epoch": 0.17293417081328005, "grad_norm": 6.125, "learning_rate": 4.9913576606389434e-05, "loss": 2.2464, "num_input_tokens_seen": 2223072, "step": 1060 }, { "epoch": 0.1737498980340974, "grad_norm": 3.859375, "learning_rate": 4.991275907100919e-05, "loss": 3.3074, "num_input_tokens_seen": 2233664, "step": 1065 }, { "epoch": 0.17456562525491476, "grad_norm": 5.5625, "learning_rate": 4.9911937693768434e-05, "loss": 3.6805, "num_input_tokens_seen": 2243664, "step": 1070 }, { "epoch": 0.1753813524757321, "grad_norm": 8.3125, "learning_rate": 4.991111247479382e-05, "loss": 1.9175, "num_input_tokens_seen": 2254288, "step": 1075 }, { "epoch": 0.17619707969654946, "grad_norm": 10.625, "learning_rate": 4.9910283414212605e-05, "loss": 2.58, "num_input_tokens_seen": 2264576, "step": 1080 }, { "epoch": 0.17701280691736684, "grad_norm": 2.234375, "learning_rate": 4.990945051215265e-05, "loss": 1.947, "num_input_tokens_seen": 2274832, "step": 1085 }, { "epoch": 0.1778285341381842, "grad_norm": 2.125, "learning_rate": 4.99086137687424e-05, "loss": 1.5949, "num_input_tokens_seen": 2285200, "step": 1090 }, { "epoch": 0.17864426135900155, "grad_norm": 4.1875, "learning_rate": 4.9907773184110874e-05, "loss": 1.5938, "num_input_tokens_seen": 2298528, "step": 1095 }, { "epoch": 0.1794599885798189, "grad_norm": 7.03125, "learning_rate": 4.9906928758387715e-05, "loss": 3.2581, "num_input_tokens_seen": 2309008, "step": 1100 }, { "epoch": 0.18027571580063625, "grad_norm": 4.46875, "learning_rate": 4.9906080491703146e-05, "loss": 2.0334, "num_input_tokens_seen": 2319232, "step": 1105 }, { "epoch": 0.18109144302145364, "grad_norm": 5.125, "learning_rate": 4.990522838418797e-05, "loss": 2.0845, "num_input_tokens_seen": 2329920, "step": 1110 }, { "epoch": 0.181907170242271, "grad_norm": 7.5, "learning_rate": 4.9904372435973604e-05, "loss": 2.0352, "num_input_tokens_seen": 2341984, "step": 1115 }, { "epoch": 0.18272289746308834, "grad_norm": 9.6875, "learning_rate": 4.990351264719203e-05, "loss": 2.1311, "num_input_tokens_seen": 2352416, "step": 1120 }, { "epoch": 0.1835386246839057, "grad_norm": 18.75, "learning_rate": 4.990264901797586e-05, "loss": 2.2372, "num_input_tokens_seen": 2362768, "step": 1125 }, { "epoch": 0.18435435190472307, "grad_norm": 5.625, "learning_rate": 4.990178154845826e-05, "loss": 1.6287, "num_input_tokens_seen": 2373312, "step": 1130 }, { "epoch": 0.18517007912554043, "grad_norm": 9.75, "learning_rate": 4.9900910238773014e-05, "loss": 2.6478, "num_input_tokens_seen": 2382832, "step": 1135 }, { "epoch": 0.18598580634635778, "grad_norm": 8.6875, "learning_rate": 4.990003508905448e-05, "loss": 1.573, "num_input_tokens_seen": 2392368, "step": 1140 }, { "epoch": 0.18680153356717513, "grad_norm": 5.15625, "learning_rate": 4.989915609943763e-05, "loss": 2.6202, "num_input_tokens_seen": 2404096, "step": 1145 }, { "epoch": 0.18761726078799248, "grad_norm": 13.0625, "learning_rate": 4.9898273270058e-05, "loss": 2.8423, "num_input_tokens_seen": 2412688, "step": 1150 }, { "epoch": 0.18843298800880987, "grad_norm": 5.46875, "learning_rate": 4.989738660105174e-05, "loss": 2.633, "num_input_tokens_seen": 2422624, "step": 1155 }, { "epoch": 0.18924871522962722, "grad_norm": 14.375, "learning_rate": 4.989649609255559e-05, "loss": 3.3181, "num_input_tokens_seen": 2432320, "step": 1160 }, { "epoch": 0.19006444245044457, "grad_norm": 6.75, "learning_rate": 4.989560174470687e-05, "loss": 2.152, "num_input_tokens_seen": 2443248, "step": 1165 }, { "epoch": 0.19088016967126192, "grad_norm": 8.625, "learning_rate": 4.989470355764351e-05, "loss": 2.6961, "num_input_tokens_seen": 2454384, "step": 1170 }, { "epoch": 0.19169589689207928, "grad_norm": 13.3125, "learning_rate": 4.9893801531504e-05, "loss": 2.5541, "num_input_tokens_seen": 2465456, "step": 1175 }, { "epoch": 0.19251162411289666, "grad_norm": 1.7109375, "learning_rate": 4.9892895666427475e-05, "loss": 2.472, "num_input_tokens_seen": 2476256, "step": 1180 }, { "epoch": 0.193327351333714, "grad_norm": 8.6875, "learning_rate": 4.9891985962553606e-05, "loss": 2.7221, "num_input_tokens_seen": 2487296, "step": 1185 }, { "epoch": 0.19414307855453136, "grad_norm": 4.75, "learning_rate": 4.989107242002269e-05, "loss": 3.0473, "num_input_tokens_seen": 2496448, "step": 1190 }, { "epoch": 0.19495880577534871, "grad_norm": 7.5625, "learning_rate": 4.989015503897561e-05, "loss": 1.4665, "num_input_tokens_seen": 2505856, "step": 1195 }, { "epoch": 0.1957745329961661, "grad_norm": 5.0, "learning_rate": 4.988923381955383e-05, "loss": 2.7923, "num_input_tokens_seen": 2515056, "step": 1200 }, { "epoch": 0.1957745329961661, "eval_loss": 2.548196315765381, "eval_runtime": 134.6665, "eval_samples_per_second": 20.235, "eval_steps_per_second": 10.121, "num_input_tokens_seen": 2515056, "step": 1200 }, { "epoch": 0.19659026021698345, "grad_norm": 9.5, "learning_rate": 4.988830876189942e-05, "loss": 3.3226, "num_input_tokens_seen": 2525712, "step": 1205 }, { "epoch": 0.1974059874378008, "grad_norm": 7.6875, "learning_rate": 4.988737986615503e-05, "loss": 1.9326, "num_input_tokens_seen": 2535872, "step": 1210 }, { "epoch": 0.19822171465861815, "grad_norm": 6.78125, "learning_rate": 4.988644713246391e-05, "loss": 2.5958, "num_input_tokens_seen": 2547248, "step": 1215 }, { "epoch": 0.1990374418794355, "grad_norm": 8.5625, "learning_rate": 4.988551056096991e-05, "loss": 2.4174, "num_input_tokens_seen": 2558224, "step": 1220 }, { "epoch": 0.1998531691002529, "grad_norm": 6.4375, "learning_rate": 4.988457015181743e-05, "loss": 3.4297, "num_input_tokens_seen": 2568192, "step": 1225 }, { "epoch": 0.20066889632107024, "grad_norm": 7.0625, "learning_rate": 4.988362590515153e-05, "loss": 1.9904, "num_input_tokens_seen": 2580112, "step": 1230 }, { "epoch": 0.2014846235418876, "grad_norm": 8.25, "learning_rate": 4.9882677821117805e-05, "loss": 3.4085, "num_input_tokens_seen": 2591808, "step": 1235 }, { "epoch": 0.20230035076270494, "grad_norm": 7.25, "learning_rate": 4.988172589986246e-05, "loss": 3.7223, "num_input_tokens_seen": 2602336, "step": 1240 }, { "epoch": 0.2031160779835223, "grad_norm": 4.40625, "learning_rate": 4.9880770141532304e-05, "loss": 2.3513, "num_input_tokens_seen": 2612096, "step": 1245 }, { "epoch": 0.20393180520433968, "grad_norm": 11.5, "learning_rate": 4.987981054627472e-05, "loss": 2.9924, "num_input_tokens_seen": 2622736, "step": 1250 }, { "epoch": 0.20474753242515703, "grad_norm": 4.78125, "learning_rate": 4.987884711423769e-05, "loss": 1.6815, "num_input_tokens_seen": 2632320, "step": 1255 }, { "epoch": 0.20556325964597438, "grad_norm": 1.859375, "learning_rate": 4.9877879845569784e-05, "loss": 1.3296, "num_input_tokens_seen": 2644160, "step": 1260 }, { "epoch": 0.20637898686679174, "grad_norm": 4.0, "learning_rate": 4.9876908740420175e-05, "loss": 1.6824, "num_input_tokens_seen": 2655024, "step": 1265 }, { "epoch": 0.20719471408760912, "grad_norm": 5.375, "learning_rate": 4.987593379893861e-05, "loss": 2.9393, "num_input_tokens_seen": 2664848, "step": 1270 }, { "epoch": 0.20801044130842647, "grad_norm": 8.5625, "learning_rate": 4.987495502127545e-05, "loss": 3.3941, "num_input_tokens_seen": 2675680, "step": 1275 }, { "epoch": 0.20882616852924382, "grad_norm": 6.6875, "learning_rate": 4.987397240758162e-05, "loss": 1.4506, "num_input_tokens_seen": 2686384, "step": 1280 }, { "epoch": 0.20964189575006117, "grad_norm": 0.384765625, "learning_rate": 4.9872985958008664e-05, "loss": 1.4825, "num_input_tokens_seen": 2696576, "step": 1285 }, { "epoch": 0.21045762297087853, "grad_norm": 4.53125, "learning_rate": 4.987199567270871e-05, "loss": 1.9586, "num_input_tokens_seen": 2707376, "step": 1290 }, { "epoch": 0.2112733501916959, "grad_norm": 10.125, "learning_rate": 4.9871001551834444e-05, "loss": 2.1887, "num_input_tokens_seen": 2718768, "step": 1295 }, { "epoch": 0.21208907741251326, "grad_norm": 13.9375, "learning_rate": 4.98700035955392e-05, "loss": 2.6108, "num_input_tokens_seen": 2727728, "step": 1300 }, { "epoch": 0.2129048046333306, "grad_norm": 10.0625, "learning_rate": 4.986900180397686e-05, "loss": 3.8105, "num_input_tokens_seen": 2739264, "step": 1305 }, { "epoch": 0.21372053185414797, "grad_norm": 3.0625, "learning_rate": 4.9867996177301926e-05, "loss": 1.6473, "num_input_tokens_seen": 2749920, "step": 1310 }, { "epoch": 0.21453625907496532, "grad_norm": 7.5625, "learning_rate": 4.9866986715669464e-05, "loss": 2.9277, "num_input_tokens_seen": 2759440, "step": 1315 }, { "epoch": 0.2153519862957827, "grad_norm": 11.25, "learning_rate": 4.9865973419235155e-05, "loss": 3.018, "num_input_tokens_seen": 2770720, "step": 1320 }, { "epoch": 0.21616771351660005, "grad_norm": 9.9375, "learning_rate": 4.986495628815526e-05, "loss": 2.8937, "num_input_tokens_seen": 2781760, "step": 1325 }, { "epoch": 0.2169834407374174, "grad_norm": 15.75, "learning_rate": 4.986393532258663e-05, "loss": 2.7528, "num_input_tokens_seen": 2791872, "step": 1330 }, { "epoch": 0.21779916795823476, "grad_norm": 5.09375, "learning_rate": 4.986291052268671e-05, "loss": 1.5995, "num_input_tokens_seen": 2803104, "step": 1335 }, { "epoch": 0.21861489517905214, "grad_norm": 11.3125, "learning_rate": 4.986188188861355e-05, "loss": 2.1062, "num_input_tokens_seen": 2813296, "step": 1340 }, { "epoch": 0.2194306223998695, "grad_norm": 5.03125, "learning_rate": 4.9860849420525766e-05, "loss": 2.8459, "num_input_tokens_seen": 2824960, "step": 1345 }, { "epoch": 0.22024634962068684, "grad_norm": 4.625, "learning_rate": 4.9859813118582575e-05, "loss": 2.1492, "num_input_tokens_seen": 2835984, "step": 1350 }, { "epoch": 0.2210620768415042, "grad_norm": 5.40625, "learning_rate": 4.98587729829438e-05, "loss": 2.705, "num_input_tokens_seen": 2845744, "step": 1355 }, { "epoch": 0.22187780406232155, "grad_norm": 10.75, "learning_rate": 4.985772901376983e-05, "loss": 2.565, "num_input_tokens_seen": 2857344, "step": 1360 }, { "epoch": 0.22269353128313893, "grad_norm": 6.0625, "learning_rate": 4.9856681211221666e-05, "loss": 1.6996, "num_input_tokens_seen": 2867776, "step": 1365 }, { "epoch": 0.22350925850395628, "grad_norm": 9.1875, "learning_rate": 4.985562957546089e-05, "loss": 2.4041, "num_input_tokens_seen": 2878960, "step": 1370 }, { "epoch": 0.22432498572477363, "grad_norm": 5.40625, "learning_rate": 4.9854574106649686e-05, "loss": 1.7318, "num_input_tokens_seen": 2891024, "step": 1375 }, { "epoch": 0.225140712945591, "grad_norm": 7.5625, "learning_rate": 4.985351480495081e-05, "loss": 3.1137, "num_input_tokens_seen": 2901456, "step": 1380 }, { "epoch": 0.22595644016640834, "grad_norm": 8.375, "learning_rate": 4.985245167052762e-05, "loss": 2.8767, "num_input_tokens_seen": 2912016, "step": 1385 }, { "epoch": 0.22677216738722572, "grad_norm": 9.0, "learning_rate": 4.9851384703544066e-05, "loss": 3.1745, "num_input_tokens_seen": 2922192, "step": 1390 }, { "epoch": 0.22758789460804307, "grad_norm": 6.25, "learning_rate": 4.985031390416469e-05, "loss": 2.8386, "num_input_tokens_seen": 2933488, "step": 1395 }, { "epoch": 0.22840362182886043, "grad_norm": 13.8125, "learning_rate": 4.984923927255461e-05, "loss": 1.7457, "num_input_tokens_seen": 2943280, "step": 1400 }, { "epoch": 0.22840362182886043, "eval_loss": 2.5521657466888428, "eval_runtime": 134.7023, "eval_samples_per_second": 20.23, "eval_steps_per_second": 10.119, "num_input_tokens_seen": 2943280, "step": 1400 }, { "epoch": 0.22921934904967778, "grad_norm": 6.34375, "learning_rate": 4.984816080887958e-05, "loss": 2.0731, "num_input_tokens_seen": 2955792, "step": 1405 }, { "epoch": 0.23003507627049516, "grad_norm": 9.0625, "learning_rate": 4.9847078513305875e-05, "loss": 2.6628, "num_input_tokens_seen": 2965968, "step": 1410 }, { "epoch": 0.2308508034913125, "grad_norm": 9.0, "learning_rate": 4.984599238600043e-05, "loss": 2.7593, "num_input_tokens_seen": 2975968, "step": 1415 }, { "epoch": 0.23166653071212986, "grad_norm": 5.15625, "learning_rate": 4.9844902427130716e-05, "loss": 1.1594, "num_input_tokens_seen": 2986016, "step": 1420 }, { "epoch": 0.23248225793294722, "grad_norm": 3.59375, "learning_rate": 4.984380863686482e-05, "loss": 3.4199, "num_input_tokens_seen": 2994768, "step": 1425 }, { "epoch": 0.23329798515376457, "grad_norm": 11.6875, "learning_rate": 4.984271101537143e-05, "loss": 1.5492, "num_input_tokens_seen": 3003264, "step": 1430 }, { "epoch": 0.23411371237458195, "grad_norm": 9.0625, "learning_rate": 4.9841609562819816e-05, "loss": 4.1386, "num_input_tokens_seen": 3013904, "step": 1435 }, { "epoch": 0.2349294395953993, "grad_norm": 3.625, "learning_rate": 4.984050427937983e-05, "loss": 1.155, "num_input_tokens_seen": 3024432, "step": 1440 }, { "epoch": 0.23574516681621666, "grad_norm": 11.0, "learning_rate": 4.983939516522191e-05, "loss": 1.7612, "num_input_tokens_seen": 3034624, "step": 1445 }, { "epoch": 0.236560894037034, "grad_norm": 3.875, "learning_rate": 4.983828222051711e-05, "loss": 1.9255, "num_input_tokens_seen": 3044288, "step": 1450 }, { "epoch": 0.23737662125785136, "grad_norm": 13.0, "learning_rate": 4.983716544543705e-05, "loss": 2.3639, "num_input_tokens_seen": 3054384, "step": 1455 }, { "epoch": 0.23819234847866874, "grad_norm": 6.59375, "learning_rate": 4.983604484015395e-05, "loss": 3.5784, "num_input_tokens_seen": 3065136, "step": 1460 }, { "epoch": 0.2390080756994861, "grad_norm": 8.75, "learning_rate": 4.983492040484064e-05, "loss": 2.7678, "num_input_tokens_seen": 3076384, "step": 1465 }, { "epoch": 0.23982380292030345, "grad_norm": 7.90625, "learning_rate": 4.98337921396705e-05, "loss": 1.7655, "num_input_tokens_seen": 3088800, "step": 1470 }, { "epoch": 0.2406395301411208, "grad_norm": 18.125, "learning_rate": 4.983266004481753e-05, "loss": 3.9605, "num_input_tokens_seen": 3099328, "step": 1475 }, { "epoch": 0.24145525736193818, "grad_norm": 12.5625, "learning_rate": 4.9831524120456316e-05, "loss": 2.3588, "num_input_tokens_seen": 3109920, "step": 1480 }, { "epoch": 0.24227098458275553, "grad_norm": 4.46875, "learning_rate": 4.9830384366762026e-05, "loss": 1.4966, "num_input_tokens_seen": 3119360, "step": 1485 }, { "epoch": 0.24308671180357289, "grad_norm": 6.6875, "learning_rate": 4.9829240783910436e-05, "loss": 2.8051, "num_input_tokens_seen": 3128896, "step": 1490 }, { "epoch": 0.24390243902439024, "grad_norm": 8.875, "learning_rate": 4.982809337207789e-05, "loss": 2.3901, "num_input_tokens_seen": 3140512, "step": 1495 }, { "epoch": 0.2447181662452076, "grad_norm": 12.5, "learning_rate": 4.9826942131441337e-05, "loss": 3.1811, "num_input_tokens_seen": 3150480, "step": 1500 }, { "epoch": 0.24553389346602497, "grad_norm": 15.4375, "learning_rate": 4.9825787062178315e-05, "loss": 3.5, "num_input_tokens_seen": 3161408, "step": 1505 }, { "epoch": 0.24634962068684232, "grad_norm": 3.09375, "learning_rate": 4.9824628164466945e-05, "loss": 1.9037, "num_input_tokens_seen": 3171472, "step": 1510 }, { "epoch": 0.24716534790765968, "grad_norm": 4.21875, "learning_rate": 4.982346543848595e-05, "loss": 2.3638, "num_input_tokens_seen": 3182176, "step": 1515 }, { "epoch": 0.24798107512847703, "grad_norm": 4.125, "learning_rate": 4.9822298884414626e-05, "loss": 2.8021, "num_input_tokens_seen": 3193104, "step": 1520 }, { "epoch": 0.24879680234929438, "grad_norm": 4.625, "learning_rate": 4.982112850243288e-05, "loss": 2.4388, "num_input_tokens_seen": 3203872, "step": 1525 }, { "epoch": 0.24961252957011176, "grad_norm": 9.5, "learning_rate": 4.98199542927212e-05, "loss": 2.3121, "num_input_tokens_seen": 3213440, "step": 1530 }, { "epoch": 0.2504282567909291, "grad_norm": 10.125, "learning_rate": 4.981877625546066e-05, "loss": 2.2994, "num_input_tokens_seen": 3224720, "step": 1535 }, { "epoch": 0.25124398401174647, "grad_norm": 9.875, "learning_rate": 4.981759439083293e-05, "loss": 2.2191, "num_input_tokens_seen": 3236320, "step": 1540 }, { "epoch": 0.2520597112325638, "grad_norm": 5.75, "learning_rate": 4.981640869902027e-05, "loss": 2.3182, "num_input_tokens_seen": 3246928, "step": 1545 }, { "epoch": 0.2528754384533812, "grad_norm": 8.25, "learning_rate": 4.9815219180205517e-05, "loss": 2.1877, "num_input_tokens_seen": 3256672, "step": 1550 }, { "epoch": 0.2536911656741985, "grad_norm": 0.142578125, "learning_rate": 4.9814025834572126e-05, "loss": 2.7323, "num_input_tokens_seen": 3267200, "step": 1555 }, { "epoch": 0.2545068928950159, "grad_norm": 6.96875, "learning_rate": 4.981282866230411e-05, "loss": 1.7918, "num_input_tokens_seen": 3277168, "step": 1560 }, { "epoch": 0.2553226201158333, "grad_norm": 9.75, "learning_rate": 4.981162766358611e-05, "loss": 2.5694, "num_input_tokens_seen": 3287488, "step": 1565 }, { "epoch": 0.25613834733665064, "grad_norm": 10.0625, "learning_rate": 4.9810422838603316e-05, "loss": 4.192, "num_input_tokens_seen": 3297952, "step": 1570 }, { "epoch": 0.256954074557468, "grad_norm": 5.25, "learning_rate": 4.9809214187541533e-05, "loss": 1.7769, "num_input_tokens_seen": 3306912, "step": 1575 }, { "epoch": 0.25776980177828535, "grad_norm": 4.90625, "learning_rate": 4.980800171058715e-05, "loss": 2.2933, "num_input_tokens_seen": 3315712, "step": 1580 }, { "epoch": 0.2585855289991027, "grad_norm": 6.0625, "learning_rate": 4.980678540792715e-05, "loss": 1.933, "num_input_tokens_seen": 3327312, "step": 1585 }, { "epoch": 0.25940125621992005, "grad_norm": 7.625, "learning_rate": 4.980556527974909e-05, "loss": 1.7199, "num_input_tokens_seen": 3338992, "step": 1590 }, { "epoch": 0.2602169834407374, "grad_norm": 3.90625, "learning_rate": 4.980434132624114e-05, "loss": 2.5684, "num_input_tokens_seen": 3350720, "step": 1595 }, { "epoch": 0.26103271066155476, "grad_norm": 5.78125, "learning_rate": 4.980311354759205e-05, "loss": 1.7453, "num_input_tokens_seen": 3360448, "step": 1600 }, { "epoch": 0.26103271066155476, "eval_loss": 2.5495588779449463, "eval_runtime": 134.6251, "eval_samples_per_second": 20.241, "eval_steps_per_second": 10.124, "num_input_tokens_seen": 3360448, "step": 1600 }, { "epoch": 0.2618484378823721, "grad_norm": 4.9375, "learning_rate": 4.980188194399116e-05, "loss": 1.7761, "num_input_tokens_seen": 3371184, "step": 1605 }, { "epoch": 0.2626641651031895, "grad_norm": 5.65625, "learning_rate": 4.9800646515628384e-05, "loss": 1.6489, "num_input_tokens_seen": 3383232, "step": 1610 }, { "epoch": 0.26347989232400687, "grad_norm": 1.2578125, "learning_rate": 4.979940726269426e-05, "loss": 1.8968, "num_input_tokens_seen": 3394384, "step": 1615 }, { "epoch": 0.2642956195448242, "grad_norm": 6.78125, "learning_rate": 4.979816418537988e-05, "loss": 1.8214, "num_input_tokens_seen": 3404288, "step": 1620 }, { "epoch": 0.2651113467656416, "grad_norm": 6.5625, "learning_rate": 4.979691728387696e-05, "loss": 2.9042, "num_input_tokens_seen": 3415440, "step": 1625 }, { "epoch": 0.26592707398645893, "grad_norm": 7.6875, "learning_rate": 4.979566655837776e-05, "loss": 2.726, "num_input_tokens_seen": 3426672, "step": 1630 }, { "epoch": 0.2667428012072763, "grad_norm": 5.0625, "learning_rate": 4.9794412009075184e-05, "loss": 2.2014, "num_input_tokens_seen": 3436304, "step": 1635 }, { "epoch": 0.26755852842809363, "grad_norm": 6.125, "learning_rate": 4.979315363616269e-05, "loss": 1.8452, "num_input_tokens_seen": 3447408, "step": 1640 }, { "epoch": 0.268374255648911, "grad_norm": 4.875, "learning_rate": 4.979189143983434e-05, "loss": 2.2648, "num_input_tokens_seen": 3458144, "step": 1645 }, { "epoch": 0.26918998286972834, "grad_norm": 4.875, "learning_rate": 4.979062542028478e-05, "loss": 2.12, "num_input_tokens_seen": 3469776, "step": 1650 }, { "epoch": 0.27000571009054575, "grad_norm": 6.15625, "learning_rate": 4.978935557770923e-05, "loss": 2.3385, "num_input_tokens_seen": 3480560, "step": 1655 }, { "epoch": 0.2708214373113631, "grad_norm": 3.734375, "learning_rate": 4.978808191230353e-05, "loss": 1.8997, "num_input_tokens_seen": 3491264, "step": 1660 }, { "epoch": 0.27163716453218045, "grad_norm": 7.46875, "learning_rate": 4.9786804424264085e-05, "loss": 2.633, "num_input_tokens_seen": 3501920, "step": 1665 }, { "epoch": 0.2724528917529978, "grad_norm": 0.10693359375, "learning_rate": 4.978552311378792e-05, "loss": 1.5724, "num_input_tokens_seen": 3511776, "step": 1670 }, { "epoch": 0.27326861897381516, "grad_norm": 9.6875, "learning_rate": 4.978423798107261e-05, "loss": 2.3548, "num_input_tokens_seen": 3522896, "step": 1675 }, { "epoch": 0.2740843461946325, "grad_norm": 6.9375, "learning_rate": 4.978294902631635e-05, "loss": 2.4052, "num_input_tokens_seen": 3533808, "step": 1680 }, { "epoch": 0.27490007341544986, "grad_norm": 7.03125, "learning_rate": 4.9781656249717914e-05, "loss": 1.9794, "num_input_tokens_seen": 3544960, "step": 1685 }, { "epoch": 0.2757158006362672, "grad_norm": 6.6875, "learning_rate": 4.9780359651476645e-05, "loss": 2.2846, "num_input_tokens_seen": 3554672, "step": 1690 }, { "epoch": 0.27653152785708457, "grad_norm": 16.5, "learning_rate": 4.977905923179251e-05, "loss": 3.55, "num_input_tokens_seen": 3565664, "step": 1695 }, { "epoch": 0.2773472550779019, "grad_norm": 4.8125, "learning_rate": 4.977775499086606e-05, "loss": 2.0515, "num_input_tokens_seen": 3575744, "step": 1700 }, { "epoch": 0.27816298229871933, "grad_norm": 8.5625, "learning_rate": 4.97764469288984e-05, "loss": 2.1667, "num_input_tokens_seen": 3585840, "step": 1705 }, { "epoch": 0.2789787095195367, "grad_norm": 6.15625, "learning_rate": 4.977513504609127e-05, "loss": 1.5532, "num_input_tokens_seen": 3596816, "step": 1710 }, { "epoch": 0.27979443674035404, "grad_norm": 7.375, "learning_rate": 4.9773819342646965e-05, "loss": 2.0152, "num_input_tokens_seen": 3607584, "step": 1715 }, { "epoch": 0.2806101639611714, "grad_norm": 7.96875, "learning_rate": 4.97724998187684e-05, "loss": 2.5906, "num_input_tokens_seen": 3617504, "step": 1720 }, { "epoch": 0.28142589118198874, "grad_norm": 5.3125, "learning_rate": 4.9771176474659045e-05, "loss": 2.6162, "num_input_tokens_seen": 3628496, "step": 1725 }, { "epoch": 0.2822416184028061, "grad_norm": 3.25, "learning_rate": 4.976984931052299e-05, "loss": 1.6676, "num_input_tokens_seen": 3638848, "step": 1730 }, { "epoch": 0.28305734562362345, "grad_norm": 8.875, "learning_rate": 4.976851832656489e-05, "loss": 2.5647, "num_input_tokens_seen": 3648784, "step": 1735 }, { "epoch": 0.2838730728444408, "grad_norm": 5.15625, "learning_rate": 4.9767183522990004e-05, "loss": 4.2479, "num_input_tokens_seen": 3657760, "step": 1740 }, { "epoch": 0.28468880006525815, "grad_norm": 1.609375, "learning_rate": 4.9765844900004176e-05, "loss": 2.0166, "num_input_tokens_seen": 3667456, "step": 1745 }, { "epoch": 0.28550452728607556, "grad_norm": 10.4375, "learning_rate": 4.9764502457813834e-05, "loss": 2.5445, "num_input_tokens_seen": 3679296, "step": 1750 }, { "epoch": 0.2863202545068929, "grad_norm": 8.625, "learning_rate": 4.9763156196626005e-05, "loss": 2.893, "num_input_tokens_seen": 3690176, "step": 1755 }, { "epoch": 0.28713598172771027, "grad_norm": 8.8125, "learning_rate": 4.97618061166483e-05, "loss": 2.7165, "num_input_tokens_seen": 3699744, "step": 1760 }, { "epoch": 0.2879517089485276, "grad_norm": 5.34375, "learning_rate": 4.9760452218088915e-05, "loss": 2.7786, "num_input_tokens_seen": 3710016, "step": 1765 }, { "epoch": 0.28876743616934497, "grad_norm": 9.8125, "learning_rate": 4.975909450115663e-05, "loss": 2.5256, "num_input_tokens_seen": 3720224, "step": 1770 }, { "epoch": 0.2895831633901623, "grad_norm": 6.53125, "learning_rate": 4.975773296606084e-05, "loss": 1.9861, "num_input_tokens_seen": 3731280, "step": 1775 }, { "epoch": 0.2903988906109797, "grad_norm": 10.6875, "learning_rate": 4.97563676130115e-05, "loss": 1.6704, "num_input_tokens_seen": 3742352, "step": 1780 }, { "epoch": 0.29121461783179703, "grad_norm": 6.9375, "learning_rate": 4.9754998442219166e-05, "loss": 2.7467, "num_input_tokens_seen": 3752144, "step": 1785 }, { "epoch": 0.2920303450526144, "grad_norm": 3.65625, "learning_rate": 4.9753625453894984e-05, "loss": 1.7765, "num_input_tokens_seen": 3761760, "step": 1790 }, { "epoch": 0.2928460722734318, "grad_norm": 5.78125, "learning_rate": 4.975224864825068e-05, "loss": 1.3404, "num_input_tokens_seen": 3771952, "step": 1795 }, { "epoch": 0.29366179949424914, "grad_norm": 8.0, "learning_rate": 4.9750868025498576e-05, "loss": 1.8503, "num_input_tokens_seen": 3782768, "step": 1800 }, { "epoch": 0.29366179949424914, "eval_loss": 2.5513477325439453, "eval_runtime": 134.7861, "eval_samples_per_second": 20.217, "eval_steps_per_second": 10.112, "num_input_tokens_seen": 3782768, "step": 1800 }, { "epoch": 0.2944775267150665, "grad_norm": 7.4375, "learning_rate": 4.974948358585158e-05, "loss": 2.1661, "num_input_tokens_seen": 3794032, "step": 1805 }, { "epoch": 0.29529325393588385, "grad_norm": 10.375, "learning_rate": 4.9748095329523205e-05, "loss": 2.6868, "num_input_tokens_seen": 3803696, "step": 1810 }, { "epoch": 0.2961089811567012, "grad_norm": 3.25, "learning_rate": 4.974670325672752e-05, "loss": 1.5509, "num_input_tokens_seen": 3815184, "step": 1815 }, { "epoch": 0.29692470837751855, "grad_norm": 10.75, "learning_rate": 4.974530736767921e-05, "loss": 2.5746, "num_input_tokens_seen": 3823280, "step": 1820 }, { "epoch": 0.2977404355983359, "grad_norm": 7.9375, "learning_rate": 4.9743907662593524e-05, "loss": 2.7132, "num_input_tokens_seen": 3834880, "step": 1825 }, { "epoch": 0.29855616281915326, "grad_norm": 5.09375, "learning_rate": 4.974250414168633e-05, "loss": 2.337, "num_input_tokens_seen": 3846688, "step": 1830 }, { "epoch": 0.2993718900399706, "grad_norm": 9.125, "learning_rate": 4.974109680517407e-05, "loss": 2.7045, "num_input_tokens_seen": 3856672, "step": 1835 }, { "epoch": 0.300187617260788, "grad_norm": 12.1875, "learning_rate": 4.973968565327376e-05, "loss": 2.9434, "num_input_tokens_seen": 3867008, "step": 1840 }, { "epoch": 0.3010033444816054, "grad_norm": 5.53125, "learning_rate": 4.973827068620303e-05, "loss": 2.5616, "num_input_tokens_seen": 3876096, "step": 1845 }, { "epoch": 0.3018190717024227, "grad_norm": 6.28125, "learning_rate": 4.973685190418008e-05, "loss": 1.01, "num_input_tokens_seen": 3886288, "step": 1850 }, { "epoch": 0.3026347989232401, "grad_norm": 8.75, "learning_rate": 4.97354293074237e-05, "loss": 1.9692, "num_input_tokens_seen": 3896096, "step": 1855 }, { "epoch": 0.30345052614405743, "grad_norm": 9.625, "learning_rate": 4.9734002896153276e-05, "loss": 1.9227, "num_input_tokens_seen": 3905632, "step": 1860 }, { "epoch": 0.3042662533648748, "grad_norm": 5.4375, "learning_rate": 4.973257267058877e-05, "loss": 2.4571, "num_input_tokens_seen": 3915856, "step": 1865 }, { "epoch": 0.30508198058569214, "grad_norm": 11.0, "learning_rate": 4.973113863095076e-05, "loss": 2.8401, "num_input_tokens_seen": 3926592, "step": 1870 }, { "epoch": 0.3058977078065095, "grad_norm": 6.25, "learning_rate": 4.9729700777460384e-05, "loss": 2.1013, "num_input_tokens_seen": 3937344, "step": 1875 }, { "epoch": 0.30671343502732684, "grad_norm": 4.65625, "learning_rate": 4.972825911033937e-05, "loss": 1.4578, "num_input_tokens_seen": 3948224, "step": 1880 }, { "epoch": 0.3075291622481442, "grad_norm": 5.25, "learning_rate": 4.9726813629810056e-05, "loss": 2.2478, "num_input_tokens_seen": 3958368, "step": 1885 }, { "epoch": 0.3083448894689616, "grad_norm": 8.875, "learning_rate": 4.9725364336095326e-05, "loss": 2.0339, "num_input_tokens_seen": 3968960, "step": 1890 }, { "epoch": 0.30916061668977896, "grad_norm": 2.46875, "learning_rate": 4.972391122941871e-05, "loss": 1.5583, "num_input_tokens_seen": 3978656, "step": 1895 }, { "epoch": 0.3099763439105963, "grad_norm": 9.0, "learning_rate": 4.972245431000428e-05, "loss": 1.1397, "num_input_tokens_seen": 3989376, "step": 1900 }, { "epoch": 0.31079207113141366, "grad_norm": 3.046875, "learning_rate": 4.972099357807671e-05, "loss": 1.6381, "num_input_tokens_seen": 4001264, "step": 1905 }, { "epoch": 0.311607798352231, "grad_norm": 6.3125, "learning_rate": 4.971952903386127e-05, "loss": 3.108, "num_input_tokens_seen": 4012576, "step": 1910 }, { "epoch": 0.31242352557304837, "grad_norm": 6.25, "learning_rate": 4.971806067758381e-05, "loss": 2.5796, "num_input_tokens_seen": 4022976, "step": 1915 }, { "epoch": 0.3132392527938657, "grad_norm": 5.4375, "learning_rate": 4.971658850947076e-05, "loss": 3.5806, "num_input_tokens_seen": 4033664, "step": 1920 }, { "epoch": 0.31405498001468307, "grad_norm": 6.25, "learning_rate": 4.9715112529749165e-05, "loss": 2.894, "num_input_tokens_seen": 4045264, "step": 1925 }, { "epoch": 0.3148707072355004, "grad_norm": 10.4375, "learning_rate": 4.9713632738646624e-05, "loss": 2.1021, "num_input_tokens_seen": 4056288, "step": 1930 }, { "epoch": 0.31568643445631783, "grad_norm": 10.9375, "learning_rate": 4.971214913639134e-05, "loss": 2.4593, "num_input_tokens_seen": 4067472, "step": 1935 }, { "epoch": 0.3165021616771352, "grad_norm": 0.23828125, "learning_rate": 4.9710661723212104e-05, "loss": 1.0891, "num_input_tokens_seen": 4078976, "step": 1940 }, { "epoch": 0.31731788889795254, "grad_norm": 3.53125, "learning_rate": 4.9709170499338295e-05, "loss": 2.8066, "num_input_tokens_seen": 4088256, "step": 1945 }, { "epoch": 0.3181336161187699, "grad_norm": 8.3125, "learning_rate": 4.9707675464999895e-05, "loss": 2.8469, "num_input_tokens_seen": 4098800, "step": 1950 }, { "epoch": 0.31894934333958724, "grad_norm": 0.1318359375, "learning_rate": 4.970617662042743e-05, "loss": 2.5577, "num_input_tokens_seen": 4110048, "step": 1955 }, { "epoch": 0.3197650705604046, "grad_norm": 9.3125, "learning_rate": 4.970467396585206e-05, "loss": 2.7401, "num_input_tokens_seen": 4120000, "step": 1960 }, { "epoch": 0.32058079778122195, "grad_norm": 7.90625, "learning_rate": 4.97031675015055e-05, "loss": 2.345, "num_input_tokens_seen": 4129840, "step": 1965 }, { "epoch": 0.3213965250020393, "grad_norm": 2.390625, "learning_rate": 4.9701657227620075e-05, "loss": 2.5168, "num_input_tokens_seen": 4140224, "step": 1970 }, { "epoch": 0.32221225222285665, "grad_norm": 7.59375, "learning_rate": 4.9700143144428685e-05, "loss": 2.7057, "num_input_tokens_seen": 4150832, "step": 1975 }, { "epoch": 0.32302797944367406, "grad_norm": 3.828125, "learning_rate": 4.969862525216482e-05, "loss": 2.1708, "num_input_tokens_seen": 4161136, "step": 1980 }, { "epoch": 0.3238437066644914, "grad_norm": 12.6875, "learning_rate": 4.9697103551062556e-05, "loss": 2.974, "num_input_tokens_seen": 4171392, "step": 1985 }, { "epoch": 0.32465943388530877, "grad_norm": 12.1875, "learning_rate": 4.9695578041356565e-05, "loss": 3.6053, "num_input_tokens_seen": 4181984, "step": 1990 }, { "epoch": 0.3254751611061261, "grad_norm": 6.21875, "learning_rate": 4.969404872328209e-05, "loss": 2.3394, "num_input_tokens_seen": 4193472, "step": 1995 }, { "epoch": 0.3262908883269435, "grad_norm": 3.890625, "learning_rate": 4.969251559707498e-05, "loss": 2.2157, "num_input_tokens_seen": 4205680, "step": 2000 }, { "epoch": 0.3262908883269435, "eval_loss": 2.535407781600952, "eval_runtime": 134.8752, "eval_samples_per_second": 20.204, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 4205680, "step": 2000 }, { "epoch": 0.3271066155477608, "grad_norm": 0.1298828125, "learning_rate": 4.9690978662971674e-05, "loss": 2.6241, "num_input_tokens_seen": 4216496, "step": 2005 }, { "epoch": 0.3279223427685782, "grad_norm": 4.375, "learning_rate": 4.968943792120916e-05, "loss": 2.4422, "num_input_tokens_seen": 4227232, "step": 2010 }, { "epoch": 0.32873806998939553, "grad_norm": 8.9375, "learning_rate": 4.9687893372025046e-05, "loss": 1.9462, "num_input_tokens_seen": 4238512, "step": 2015 }, { "epoch": 0.3295537972102129, "grad_norm": 4.75, "learning_rate": 4.9686345015657535e-05, "loss": 3.0434, "num_input_tokens_seen": 4250048, "step": 2020 }, { "epoch": 0.33036952443103024, "grad_norm": 11.5, "learning_rate": 4.968479285234538e-05, "loss": 3.0429, "num_input_tokens_seen": 4260640, "step": 2025 }, { "epoch": 0.33118525165184765, "grad_norm": 9.5625, "learning_rate": 4.9683236882327974e-05, "loss": 2.6472, "num_input_tokens_seen": 4271248, "step": 2030 }, { "epoch": 0.332000978872665, "grad_norm": 2.90625, "learning_rate": 4.968167710584526e-05, "loss": 1.9837, "num_input_tokens_seen": 4281040, "step": 2035 }, { "epoch": 0.33281670609348235, "grad_norm": 6.9375, "learning_rate": 4.968011352313775e-05, "loss": 1.9224, "num_input_tokens_seen": 4291520, "step": 2040 }, { "epoch": 0.3336324333142997, "grad_norm": 5.78125, "learning_rate": 4.967854613444659e-05, "loss": 1.0642, "num_input_tokens_seen": 4302240, "step": 2045 }, { "epoch": 0.33444816053511706, "grad_norm": 10.125, "learning_rate": 4.967697494001349e-05, "loss": 2.585, "num_input_tokens_seen": 4312128, "step": 2050 }, { "epoch": 0.3352638877559344, "grad_norm": 16.125, "learning_rate": 4.9675399940080736e-05, "loss": 3.2917, "num_input_tokens_seen": 4322720, "step": 2055 }, { "epoch": 0.33607961497675176, "grad_norm": 8.3125, "learning_rate": 4.9673821134891226e-05, "loss": 2.9071, "num_input_tokens_seen": 4332400, "step": 2060 }, { "epoch": 0.3368953421975691, "grad_norm": 3.640625, "learning_rate": 4.967223852468842e-05, "loss": 1.6234, "num_input_tokens_seen": 4342928, "step": 2065 }, { "epoch": 0.33771106941838647, "grad_norm": 5.5, "learning_rate": 4.967065210971639e-05, "loss": 3.0376, "num_input_tokens_seen": 4354400, "step": 2070 }, { "epoch": 0.3385267966392039, "grad_norm": 11.625, "learning_rate": 4.966906189021977e-05, "loss": 3.7964, "num_input_tokens_seen": 4365328, "step": 2075 }, { "epoch": 0.33934252386002123, "grad_norm": 9.75, "learning_rate": 4.966746786644379e-05, "loss": 2.2056, "num_input_tokens_seen": 4374512, "step": 2080 }, { "epoch": 0.3401582510808386, "grad_norm": 2.515625, "learning_rate": 4.966587003863429e-05, "loss": 1.5924, "num_input_tokens_seen": 4384480, "step": 2085 }, { "epoch": 0.34097397830165593, "grad_norm": 7.0625, "learning_rate": 4.966426840703765e-05, "loss": 4.1598, "num_input_tokens_seen": 4396096, "step": 2090 }, { "epoch": 0.3417897055224733, "grad_norm": 4.84375, "learning_rate": 4.9662662971900875e-05, "loss": 2.1955, "num_input_tokens_seen": 4406496, "step": 2095 }, { "epoch": 0.34260543274329064, "grad_norm": 7.125, "learning_rate": 4.9661053733471534e-05, "loss": 2.8056, "num_input_tokens_seen": 4416368, "step": 2100 }, { "epoch": 0.343421159964108, "grad_norm": 9.25, "learning_rate": 4.965944069199781e-05, "loss": 2.8787, "num_input_tokens_seen": 4427360, "step": 2105 }, { "epoch": 0.34423688718492534, "grad_norm": 5.375, "learning_rate": 4.965782384772842e-05, "loss": 1.0717, "num_input_tokens_seen": 4437280, "step": 2110 }, { "epoch": 0.3450526144057427, "grad_norm": 8.6875, "learning_rate": 4.9656203200912734e-05, "loss": 2.7308, "num_input_tokens_seen": 4447872, "step": 2115 }, { "epoch": 0.3458683416265601, "grad_norm": 6.5625, "learning_rate": 4.965457875180067e-05, "loss": 2.4515, "num_input_tokens_seen": 4458784, "step": 2120 }, { "epoch": 0.34668406884737746, "grad_norm": 6.5, "learning_rate": 4.9652950500642724e-05, "loss": 2.4324, "num_input_tokens_seen": 4469728, "step": 2125 }, { "epoch": 0.3474997960681948, "grad_norm": 3.21875, "learning_rate": 4.965131844769001e-05, "loss": 1.203, "num_input_tokens_seen": 4479872, "step": 2130 }, { "epoch": 0.34831552328901216, "grad_norm": 8.875, "learning_rate": 4.96496825931942e-05, "loss": 2.8917, "num_input_tokens_seen": 4489920, "step": 2135 }, { "epoch": 0.3491312505098295, "grad_norm": 4.53125, "learning_rate": 4.9648042937407566e-05, "loss": 2.9721, "num_input_tokens_seen": 4499232, "step": 2140 }, { "epoch": 0.34994697773064687, "grad_norm": 7.3125, "learning_rate": 4.964639948058297e-05, "loss": 2.5412, "num_input_tokens_seen": 4508528, "step": 2145 }, { "epoch": 0.3507627049514642, "grad_norm": 5.34375, "learning_rate": 4.9644752222973846e-05, "loss": 2.9385, "num_input_tokens_seen": 4518880, "step": 2150 }, { "epoch": 0.3515784321722816, "grad_norm": 5.46875, "learning_rate": 4.964310116483422e-05, "loss": 2.8934, "num_input_tokens_seen": 4530176, "step": 2155 }, { "epoch": 0.3523941593930989, "grad_norm": 5.59375, "learning_rate": 4.964144630641872e-05, "loss": 1.3564, "num_input_tokens_seen": 4540048, "step": 2160 }, { "epoch": 0.3532098866139163, "grad_norm": 5.90625, "learning_rate": 4.9639787647982525e-05, "loss": 2.1106, "num_input_tokens_seen": 4549744, "step": 2165 }, { "epoch": 0.3540256138347337, "grad_norm": 6.625, "learning_rate": 4.963812518978143e-05, "loss": 2.9727, "num_input_tokens_seen": 4561280, "step": 2170 }, { "epoch": 0.35484134105555104, "grad_norm": 12.125, "learning_rate": 4.963645893207182e-05, "loss": 2.3613, "num_input_tokens_seen": 4571232, "step": 2175 }, { "epoch": 0.3556570682763684, "grad_norm": 4.5625, "learning_rate": 4.963478887511063e-05, "loss": 1.265, "num_input_tokens_seen": 4580960, "step": 2180 }, { "epoch": 0.35647279549718575, "grad_norm": 7.25, "learning_rate": 4.963311501915542e-05, "loss": 1.8243, "num_input_tokens_seen": 4590560, "step": 2185 }, { "epoch": 0.3572885227180031, "grad_norm": 4.25, "learning_rate": 4.963143736446432e-05, "loss": 2.4404, "num_input_tokens_seen": 4600048, "step": 2190 }, { "epoch": 0.35810424993882045, "grad_norm": 9.5, "learning_rate": 4.962975591129603e-05, "loss": 2.3032, "num_input_tokens_seen": 4610688, "step": 2195 }, { "epoch": 0.3589199771596378, "grad_norm": 3.75, "learning_rate": 4.962807065990986e-05, "loss": 2.5347, "num_input_tokens_seen": 4620944, "step": 2200 }, { "epoch": 0.3589199771596378, "eval_loss": 2.5436015129089355, "eval_runtime": 134.7892, "eval_samples_per_second": 20.217, "eval_steps_per_second": 10.112, "num_input_tokens_seen": 4620944, "step": 2200 }, { "epoch": 0.35973570438045516, "grad_norm": 9.0625, "learning_rate": 4.9626381610565714e-05, "loss": 3.3176, "num_input_tokens_seen": 4630656, "step": 2205 }, { "epoch": 0.3605514316012725, "grad_norm": 6.375, "learning_rate": 4.9624688763524043e-05, "loss": 3.0733, "num_input_tokens_seen": 4642112, "step": 2210 }, { "epoch": 0.3613671588220899, "grad_norm": 12.5, "learning_rate": 4.962299211904591e-05, "loss": 3.3108, "num_input_tokens_seen": 4652688, "step": 2215 }, { "epoch": 0.36218288604290727, "grad_norm": 3.125, "learning_rate": 4.962129167739296e-05, "loss": 2.1965, "num_input_tokens_seen": 4663840, "step": 2220 }, { "epoch": 0.3629986132637246, "grad_norm": 10.25, "learning_rate": 4.961958743882742e-05, "loss": 2.0496, "num_input_tokens_seen": 4674384, "step": 2225 }, { "epoch": 0.363814340484542, "grad_norm": 8.1875, "learning_rate": 4.961787940361211e-05, "loss": 2.2178, "num_input_tokens_seen": 4685248, "step": 2230 }, { "epoch": 0.36463006770535933, "grad_norm": 6.0625, "learning_rate": 4.961616757201043e-05, "loss": 2.8622, "num_input_tokens_seen": 4696576, "step": 2235 }, { "epoch": 0.3654457949261767, "grad_norm": 3.5625, "learning_rate": 4.961445194428637e-05, "loss": 2.79, "num_input_tokens_seen": 4706848, "step": 2240 }, { "epoch": 0.36626152214699403, "grad_norm": 10.9375, "learning_rate": 4.9612732520704486e-05, "loss": 2.7198, "num_input_tokens_seen": 4718224, "step": 2245 }, { "epoch": 0.3670772493678114, "grad_norm": 6.28125, "learning_rate": 4.961100930152994e-05, "loss": 3.1937, "num_input_tokens_seen": 4728336, "step": 2250 }, { "epoch": 0.36789297658862874, "grad_norm": 5.3125, "learning_rate": 4.960928228702849e-05, "loss": 1.3801, "num_input_tokens_seen": 4739264, "step": 2255 }, { "epoch": 0.36870870380944615, "grad_norm": 4.90625, "learning_rate": 4.960755147746645e-05, "loss": 2.3433, "num_input_tokens_seen": 4746736, "step": 2260 }, { "epoch": 0.3695244310302635, "grad_norm": 4.53125, "learning_rate": 4.9605816873110736e-05, "loss": 2.3433, "num_input_tokens_seen": 4756976, "step": 2265 }, { "epoch": 0.37034015825108085, "grad_norm": 0.984375, "learning_rate": 4.960407847422883e-05, "loss": 1.6484, "num_input_tokens_seen": 4766448, "step": 2270 }, { "epoch": 0.3711558854718982, "grad_norm": 8.625, "learning_rate": 4.960233628108885e-05, "loss": 2.4146, "num_input_tokens_seen": 4777584, "step": 2275 }, { "epoch": 0.37197161269271556, "grad_norm": 2.890625, "learning_rate": 4.960059029395942e-05, "loss": 2.0387, "num_input_tokens_seen": 4788064, "step": 2280 }, { "epoch": 0.3727873399135329, "grad_norm": 4.1875, "learning_rate": 4.959884051310983e-05, "loss": 1.0596, "num_input_tokens_seen": 4799568, "step": 2285 }, { "epoch": 0.37360306713435026, "grad_norm": 7.4375, "learning_rate": 4.959708693880991e-05, "loss": 3.1311, "num_input_tokens_seen": 4809200, "step": 2290 }, { "epoch": 0.3744187943551676, "grad_norm": 3.765625, "learning_rate": 4.9595329571330074e-05, "loss": 3.1501, "num_input_tokens_seen": 4818640, "step": 2295 }, { "epoch": 0.37523452157598497, "grad_norm": 8.6875, "learning_rate": 4.9593568410941326e-05, "loss": 2.0934, "num_input_tokens_seen": 4828816, "step": 2300 }, { "epoch": 0.3760502487968023, "grad_norm": 4.5, "learning_rate": 4.959180345791528e-05, "loss": 1.6894, "num_input_tokens_seen": 4838864, "step": 2305 }, { "epoch": 0.37686597601761973, "grad_norm": 4.96875, "learning_rate": 4.9590034712524086e-05, "loss": 2.6994, "num_input_tokens_seen": 4849728, "step": 2310 }, { "epoch": 0.3776817032384371, "grad_norm": 4.1875, "learning_rate": 4.958826217504053e-05, "loss": 1.34, "num_input_tokens_seen": 4860432, "step": 2315 }, { "epoch": 0.37849743045925444, "grad_norm": 6.65625, "learning_rate": 4.958648584573795e-05, "loss": 2.4233, "num_input_tokens_seen": 4870784, "step": 2320 }, { "epoch": 0.3793131576800718, "grad_norm": 3.015625, "learning_rate": 4.958470572489028e-05, "loss": 2.6665, "num_input_tokens_seen": 4880768, "step": 2325 }, { "epoch": 0.38012888490088914, "grad_norm": 6.03125, "learning_rate": 4.958292181277203e-05, "loss": 1.5832, "num_input_tokens_seen": 4891536, "step": 2330 }, { "epoch": 0.3809446121217065, "grad_norm": 4.21875, "learning_rate": 4.958113410965832e-05, "loss": 2.6957, "num_input_tokens_seen": 4901680, "step": 2335 }, { "epoch": 0.38176033934252385, "grad_norm": 7.75, "learning_rate": 4.957934261582481e-05, "loss": 2.0524, "num_input_tokens_seen": 4913824, "step": 2340 }, { "epoch": 0.3825760665633412, "grad_norm": 7.15625, "learning_rate": 4.95775473315478e-05, "loss": 3.7545, "num_input_tokens_seen": 4924528, "step": 2345 }, { "epoch": 0.38339179378415855, "grad_norm": 13.0625, "learning_rate": 4.9575748257104124e-05, "loss": 2.2637, "num_input_tokens_seen": 4934784, "step": 2350 }, { "epoch": 0.38420752100497596, "grad_norm": 7.3125, "learning_rate": 4.9573945392771224e-05, "loss": 3.619, "num_input_tokens_seen": 4944768, "step": 2355 }, { "epoch": 0.3850232482257933, "grad_norm": 13.0, "learning_rate": 4.9572138738827134e-05, "loss": 2.9085, "num_input_tokens_seen": 4954368, "step": 2360 }, { "epoch": 0.38583897544661067, "grad_norm": 3.296875, "learning_rate": 4.957032829555046e-05, "loss": 2.8159, "num_input_tokens_seen": 4963616, "step": 2365 }, { "epoch": 0.386654702667428, "grad_norm": 7.03125, "learning_rate": 4.956851406322039e-05, "loss": 2.7959, "num_input_tokens_seen": 4973808, "step": 2370 }, { "epoch": 0.38747042988824537, "grad_norm": 7.8125, "learning_rate": 4.9566696042116704e-05, "loss": 1.8125, "num_input_tokens_seen": 4984320, "step": 2375 }, { "epoch": 0.3882861571090627, "grad_norm": 3.796875, "learning_rate": 4.9564874232519766e-05, "loss": 2.2002, "num_input_tokens_seen": 4995488, "step": 2380 }, { "epoch": 0.3891018843298801, "grad_norm": 4.78125, "learning_rate": 4.9563048634710516e-05, "loss": 2.5364, "num_input_tokens_seen": 5006080, "step": 2385 }, { "epoch": 0.38991761155069743, "grad_norm": 9.6875, "learning_rate": 4.956121924897049e-05, "loss": 3.3898, "num_input_tokens_seen": 5016464, "step": 2390 }, { "epoch": 0.3907333387715148, "grad_norm": 5.71875, "learning_rate": 4.955938607558181e-05, "loss": 1.7356, "num_input_tokens_seen": 5026944, "step": 2395 }, { "epoch": 0.3915490659923322, "grad_norm": 7.28125, "learning_rate": 4.955754911482715e-05, "loss": 2.6464, "num_input_tokens_seen": 5037232, "step": 2400 }, { "epoch": 0.3915490659923322, "eval_loss": 2.545353889465332, "eval_runtime": 134.9038, "eval_samples_per_second": 20.2, "eval_steps_per_second": 10.103, "num_input_tokens_seen": 5037232, "step": 2400 }, { "epoch": 0.39236479321314954, "grad_norm": 8.375, "learning_rate": 4.9555708366989804e-05, "loss": 1.5869, "num_input_tokens_seen": 5047536, "step": 2405 }, { "epoch": 0.3931805204339669, "grad_norm": 10.9375, "learning_rate": 4.9553863832353655e-05, "loss": 1.8219, "num_input_tokens_seen": 5057408, "step": 2410 }, { "epoch": 0.39399624765478425, "grad_norm": 8.5, "learning_rate": 4.955201551120313e-05, "loss": 1.8402, "num_input_tokens_seen": 5068416, "step": 2415 }, { "epoch": 0.3948119748756016, "grad_norm": 1.2890625, "learning_rate": 4.955016340382328e-05, "loss": 2.1584, "num_input_tokens_seen": 5079936, "step": 2420 }, { "epoch": 0.39562770209641895, "grad_norm": 8.75, "learning_rate": 4.954830751049972e-05, "loss": 1.3047, "num_input_tokens_seen": 5090080, "step": 2425 }, { "epoch": 0.3964434293172363, "grad_norm": 8.8125, "learning_rate": 4.954644783151864e-05, "loss": 1.1211, "num_input_tokens_seen": 5101120, "step": 2430 }, { "epoch": 0.39725915653805366, "grad_norm": 8.125, "learning_rate": 4.954458436716684e-05, "loss": 1.7216, "num_input_tokens_seen": 5110976, "step": 2435 }, { "epoch": 0.398074883758871, "grad_norm": 4.0625, "learning_rate": 4.954271711773168e-05, "loss": 3.0302, "num_input_tokens_seen": 5121840, "step": 2440 }, { "epoch": 0.39889061097968836, "grad_norm": 9.0, "learning_rate": 4.9540846083501115e-05, "loss": 2.4965, "num_input_tokens_seen": 5133120, "step": 2445 }, { "epoch": 0.3997063382005058, "grad_norm": 5.53125, "learning_rate": 4.953897126476369e-05, "loss": 2.4533, "num_input_tokens_seen": 5144096, "step": 2450 }, { "epoch": 0.4005220654213231, "grad_norm": 18.0, "learning_rate": 4.9537092661808514e-05, "loss": 2.6103, "num_input_tokens_seen": 5154848, "step": 2455 }, { "epoch": 0.4013377926421405, "grad_norm": 6.4375, "learning_rate": 4.9535210274925306e-05, "loss": 2.0982, "num_input_tokens_seen": 5166112, "step": 2460 }, { "epoch": 0.40215351986295783, "grad_norm": 5.125, "learning_rate": 4.953332410440435e-05, "loss": 2.1108, "num_input_tokens_seen": 5177760, "step": 2465 }, { "epoch": 0.4029692470837752, "grad_norm": 0.1328125, "learning_rate": 4.9531434150536496e-05, "loss": 2.8628, "num_input_tokens_seen": 5188016, "step": 2470 }, { "epoch": 0.40378497430459254, "grad_norm": 7.625, "learning_rate": 4.952954041361322e-05, "loss": 2.0925, "num_input_tokens_seen": 5199744, "step": 2475 }, { "epoch": 0.4046007015254099, "grad_norm": 7.875, "learning_rate": 4.952764289392655e-05, "loss": 3.7824, "num_input_tokens_seen": 5210624, "step": 2480 }, { "epoch": 0.40541642874622724, "grad_norm": 8.9375, "learning_rate": 4.952574159176912e-05, "loss": 1.9947, "num_input_tokens_seen": 5219344, "step": 2485 }, { "epoch": 0.4062321559670446, "grad_norm": 5.21875, "learning_rate": 4.952383650743413e-05, "loss": 1.0435, "num_input_tokens_seen": 5228400, "step": 2490 }, { "epoch": 0.407047883187862, "grad_norm": 6.3125, "learning_rate": 4.952192764121536e-05, "loss": 3.6553, "num_input_tokens_seen": 5239056, "step": 2495 }, { "epoch": 0.40786361040867936, "grad_norm": 6.8125, "learning_rate": 4.9520014993407185e-05, "loss": 2.4143, "num_input_tokens_seen": 5248640, "step": 2500 }, { "epoch": 0.4086793376294967, "grad_norm": 5.21875, "learning_rate": 4.951809856430456e-05, "loss": 2.0306, "num_input_tokens_seen": 5259312, "step": 2505 }, { "epoch": 0.40949506485031406, "grad_norm": 8.8125, "learning_rate": 4.951617835420303e-05, "loss": 2.5674, "num_input_tokens_seen": 5269360, "step": 2510 }, { "epoch": 0.4103107920711314, "grad_norm": 8.9375, "learning_rate": 4.951425436339869e-05, "loss": 2.208, "num_input_tokens_seen": 5278528, "step": 2515 }, { "epoch": 0.41112651929194877, "grad_norm": 7.40625, "learning_rate": 4.9512326592188274e-05, "loss": 2.838, "num_input_tokens_seen": 5288864, "step": 2520 }, { "epoch": 0.4119422465127661, "grad_norm": 7.6875, "learning_rate": 4.9510395040869054e-05, "loss": 2.4106, "num_input_tokens_seen": 5298864, "step": 2525 }, { "epoch": 0.41275797373358347, "grad_norm": 8.875, "learning_rate": 4.9508459709738905e-05, "loss": 1.7885, "num_input_tokens_seen": 5309824, "step": 2530 }, { "epoch": 0.4135737009544008, "grad_norm": 9.6875, "learning_rate": 4.950652059909627e-05, "loss": 2.6078, "num_input_tokens_seen": 5317872, "step": 2535 }, { "epoch": 0.41438942817521823, "grad_norm": 9.6875, "learning_rate": 4.95045777092402e-05, "loss": 2.7714, "num_input_tokens_seen": 5327584, "step": 2540 }, { "epoch": 0.4152051553960356, "grad_norm": 16.25, "learning_rate": 4.950263104047031e-05, "loss": 3.3783, "num_input_tokens_seen": 5337824, "step": 2545 }, { "epoch": 0.41602088261685294, "grad_norm": 11.8125, "learning_rate": 4.9500680593086775e-05, "loss": 2.3399, "num_input_tokens_seen": 5348480, "step": 2550 }, { "epoch": 0.4168366098376703, "grad_norm": 6.46875, "learning_rate": 4.94987263673904e-05, "loss": 2.0048, "num_input_tokens_seen": 5358768, "step": 2555 }, { "epoch": 0.41765233705848764, "grad_norm": 6.6875, "learning_rate": 4.949676836368256e-05, "loss": 1.6839, "num_input_tokens_seen": 5369472, "step": 2560 }, { "epoch": 0.418468064279305, "grad_norm": 7.6875, "learning_rate": 4.949480658226518e-05, "loss": 1.8178, "num_input_tokens_seen": 5381344, "step": 2565 }, { "epoch": 0.41928379150012235, "grad_norm": 4.09375, "learning_rate": 4.949284102344082e-05, "loss": 3.9615, "num_input_tokens_seen": 5390688, "step": 2570 }, { "epoch": 0.4200995187209397, "grad_norm": 8.125, "learning_rate": 4.9490871687512565e-05, "loss": 2.7066, "num_input_tokens_seen": 5401984, "step": 2575 }, { "epoch": 0.42091524594175705, "grad_norm": 4.375, "learning_rate": 4.948889857478413e-05, "loss": 0.8198, "num_input_tokens_seen": 5411792, "step": 2580 }, { "epoch": 0.4217309731625744, "grad_norm": 8.875, "learning_rate": 4.948692168555978e-05, "loss": 1.9961, "num_input_tokens_seen": 5421616, "step": 2585 }, { "epoch": 0.4225467003833918, "grad_norm": 9.0625, "learning_rate": 4.94849410201444e-05, "loss": 2.1862, "num_input_tokens_seen": 5430976, "step": 2590 }, { "epoch": 0.42336242760420917, "grad_norm": 17.875, "learning_rate": 4.948295657884341e-05, "loss": 3.3456, "num_input_tokens_seen": 5442128, "step": 2595 }, { "epoch": 0.4241781548250265, "grad_norm": 13.3125, "learning_rate": 4.9480968361962835e-05, "loss": 3.414, "num_input_tokens_seen": 5452736, "step": 2600 }, { "epoch": 0.4241781548250265, "eval_loss": 2.5460686683654785, "eval_runtime": 134.9209, "eval_samples_per_second": 20.197, "eval_steps_per_second": 10.102, "num_input_tokens_seen": 5452736, "step": 2600 }, { "epoch": 0.4249938820458439, "grad_norm": 8.6875, "learning_rate": 4.9478976369809305e-05, "loss": 1.8162, "num_input_tokens_seen": 5463808, "step": 2605 }, { "epoch": 0.4258096092666612, "grad_norm": 2.421875, "learning_rate": 4.947698060268999e-05, "loss": 0.95, "num_input_tokens_seen": 5473520, "step": 2610 }, { "epoch": 0.4266253364874786, "grad_norm": 2.140625, "learning_rate": 4.9474981060912665e-05, "loss": 1.9411, "num_input_tokens_seen": 5483216, "step": 2615 }, { "epoch": 0.42744106370829593, "grad_norm": 8.0, "learning_rate": 4.94729777447857e-05, "loss": 1.9486, "num_input_tokens_seen": 5493136, "step": 2620 }, { "epoch": 0.4282567909291133, "grad_norm": 6.90625, "learning_rate": 4.947097065461801e-05, "loss": 3.2971, "num_input_tokens_seen": 5504016, "step": 2625 }, { "epoch": 0.42907251814993064, "grad_norm": 8.875, "learning_rate": 4.9468959790719125e-05, "loss": 2.7721, "num_input_tokens_seen": 5514800, "step": 2630 }, { "epoch": 0.42988824537074805, "grad_norm": 5.90625, "learning_rate": 4.9466945153399146e-05, "loss": 3.3625, "num_input_tokens_seen": 5524208, "step": 2635 }, { "epoch": 0.4307039725915654, "grad_norm": 4.4375, "learning_rate": 4.9464926742968755e-05, "loss": 1.8359, "num_input_tokens_seen": 5535232, "step": 2640 }, { "epoch": 0.43151969981238275, "grad_norm": 7.84375, "learning_rate": 4.946290455973921e-05, "loss": 4.118, "num_input_tokens_seen": 5546480, "step": 2645 }, { "epoch": 0.4323354270332001, "grad_norm": 10.5625, "learning_rate": 4.9460878604022365e-05, "loss": 4.2019, "num_input_tokens_seen": 5556960, "step": 2650 }, { "epoch": 0.43315115425401746, "grad_norm": 3.046875, "learning_rate": 4.945884887613065e-05, "loss": 1.1823, "num_input_tokens_seen": 5566976, "step": 2655 }, { "epoch": 0.4339668814748348, "grad_norm": 6.46875, "learning_rate": 4.9456815376377055e-05, "loss": 3.3858, "num_input_tokens_seen": 5577440, "step": 2660 }, { "epoch": 0.43478260869565216, "grad_norm": 2.921875, "learning_rate": 4.9454778105075195e-05, "loss": 1.5755, "num_input_tokens_seen": 5587824, "step": 2665 }, { "epoch": 0.4355983359164695, "grad_norm": 4.90625, "learning_rate": 4.945273706253924e-05, "loss": 2.3803, "num_input_tokens_seen": 5598656, "step": 2670 }, { "epoch": 0.43641406313728687, "grad_norm": 3.3125, "learning_rate": 4.9450692249083925e-05, "loss": 2.1766, "num_input_tokens_seen": 5608656, "step": 2675 }, { "epoch": 0.4372297903581043, "grad_norm": 7.5625, "learning_rate": 4.9448643665024605e-05, "loss": 2.0596, "num_input_tokens_seen": 5618912, "step": 2680 }, { "epoch": 0.43804551757892163, "grad_norm": 5.6875, "learning_rate": 4.944659131067719e-05, "loss": 1.4934, "num_input_tokens_seen": 5630256, "step": 2685 }, { "epoch": 0.438861244799739, "grad_norm": 5.625, "learning_rate": 4.944453518635818e-05, "loss": 1.5493, "num_input_tokens_seen": 5640192, "step": 2690 }, { "epoch": 0.43967697202055633, "grad_norm": 5.65625, "learning_rate": 4.944247529238465e-05, "loss": 2.7586, "num_input_tokens_seen": 5649744, "step": 2695 }, { "epoch": 0.4404926992413737, "grad_norm": 6.9375, "learning_rate": 4.944041162907427e-05, "loss": 3.4399, "num_input_tokens_seen": 5658464, "step": 2700 }, { "epoch": 0.44130842646219104, "grad_norm": 3.921875, "learning_rate": 4.943834419674529e-05, "loss": 2.9379, "num_input_tokens_seen": 5669632, "step": 2705 }, { "epoch": 0.4421241536830084, "grad_norm": 12.625, "learning_rate": 4.9436272995716506e-05, "loss": 2.9962, "num_input_tokens_seen": 5679056, "step": 2710 }, { "epoch": 0.44293988090382574, "grad_norm": 7.03125, "learning_rate": 4.943419802630735e-05, "loss": 2.9155, "num_input_tokens_seen": 5689376, "step": 2715 }, { "epoch": 0.4437556081246431, "grad_norm": 1.6875, "learning_rate": 4.94321192888378e-05, "loss": 2.4922, "num_input_tokens_seen": 5700960, "step": 2720 }, { "epoch": 0.4445713353454605, "grad_norm": 8.25, "learning_rate": 4.943003678362842e-05, "loss": 2.7537, "num_input_tokens_seen": 5711680, "step": 2725 }, { "epoch": 0.44538706256627786, "grad_norm": 5.90625, "learning_rate": 4.942795051100036e-05, "loss": 1.5171, "num_input_tokens_seen": 5721328, "step": 2730 }, { "epoch": 0.4462027897870952, "grad_norm": 6.90625, "learning_rate": 4.942586047127536e-05, "loss": 2.1331, "num_input_tokens_seen": 5732480, "step": 2735 }, { "epoch": 0.44701851700791256, "grad_norm": 5.46875, "learning_rate": 4.942376666477571e-05, "loss": 2.4113, "num_input_tokens_seen": 5742896, "step": 2740 }, { "epoch": 0.4478342442287299, "grad_norm": 8.25, "learning_rate": 4.9421669091824304e-05, "loss": 3.3349, "num_input_tokens_seen": 5752304, "step": 2745 }, { "epoch": 0.44864997144954727, "grad_norm": 12.0625, "learning_rate": 4.9419567752744634e-05, "loss": 3.7092, "num_input_tokens_seen": 5764032, "step": 2750 }, { "epoch": 0.4494656986703646, "grad_norm": 11.375, "learning_rate": 4.941746264786074e-05, "loss": 3.2194, "num_input_tokens_seen": 5774960, "step": 2755 }, { "epoch": 0.450281425891182, "grad_norm": 3.953125, "learning_rate": 4.9415353777497254e-05, "loss": 1.6109, "num_input_tokens_seen": 5786256, "step": 2760 }, { "epoch": 0.4510971531119993, "grad_norm": 9.4375, "learning_rate": 4.9413241141979394e-05, "loss": 3.1308, "num_input_tokens_seen": 5797536, "step": 2765 }, { "epoch": 0.4519128803328167, "grad_norm": 5.34375, "learning_rate": 4.9411124741632956e-05, "loss": 1.163, "num_input_tokens_seen": 5807824, "step": 2770 }, { "epoch": 0.4527286075536341, "grad_norm": 6.0, "learning_rate": 4.940900457678431e-05, "loss": 2.4222, "num_input_tokens_seen": 5819072, "step": 2775 }, { "epoch": 0.45354433477445144, "grad_norm": 3.359375, "learning_rate": 4.9406880647760425e-05, "loss": 2.2993, "num_input_tokens_seen": 5830272, "step": 2780 }, { "epoch": 0.4543600619952688, "grad_norm": 5.34375, "learning_rate": 4.9404752954888824e-05, "loss": 2.4058, "num_input_tokens_seen": 5841280, "step": 2785 }, { "epoch": 0.45517578921608615, "grad_norm": 1.0078125, "learning_rate": 4.940262149849762e-05, "loss": 2.1057, "num_input_tokens_seen": 5850640, "step": 2790 }, { "epoch": 0.4559915164369035, "grad_norm": 9.6875, "learning_rate": 4.9400486278915526e-05, "loss": 2.2462, "num_input_tokens_seen": 5862704, "step": 2795 }, { "epoch": 0.45680724365772085, "grad_norm": 7.25, "learning_rate": 4.939834729647181e-05, "loss": 4.0924, "num_input_tokens_seen": 5872752, "step": 2800 }, { "epoch": 0.45680724365772085, "eval_loss": 2.5363454818725586, "eval_runtime": 134.7289, "eval_samples_per_second": 20.226, "eval_steps_per_second": 10.117, "num_input_tokens_seen": 5872752, "step": 2800 }, { "epoch": 0.4576229708785382, "grad_norm": 11.1875, "learning_rate": 4.9396204551496326e-05, "loss": 3.4319, "num_input_tokens_seen": 5882464, "step": 2805 }, { "epoch": 0.45843869809935556, "grad_norm": 5.53125, "learning_rate": 4.939405804431952e-05, "loss": 1.9734, "num_input_tokens_seen": 5892048, "step": 2810 }, { "epoch": 0.4592544253201729, "grad_norm": 9.125, "learning_rate": 4.9391907775272414e-05, "loss": 1.755, "num_input_tokens_seen": 5901504, "step": 2815 }, { "epoch": 0.4600701525409903, "grad_norm": 6.25, "learning_rate": 4.9389753744686604e-05, "loss": 2.7649, "num_input_tokens_seen": 5912832, "step": 2820 }, { "epoch": 0.46088587976180767, "grad_norm": 10.8125, "learning_rate": 4.938759595289426e-05, "loss": 1.9929, "num_input_tokens_seen": 5923392, "step": 2825 }, { "epoch": 0.461701606982625, "grad_norm": 11.0, "learning_rate": 4.938543440022815e-05, "loss": 4.088, "num_input_tokens_seen": 5933968, "step": 2830 }, { "epoch": 0.4625173342034424, "grad_norm": 8.125, "learning_rate": 4.938326908702161e-05, "loss": 1.915, "num_input_tokens_seen": 5943872, "step": 2835 }, { "epoch": 0.46333306142425973, "grad_norm": 7.96875, "learning_rate": 4.9381100013608554e-05, "loss": 2.8769, "num_input_tokens_seen": 5954752, "step": 2840 }, { "epoch": 0.4641487886450771, "grad_norm": 11.5625, "learning_rate": 4.9378927180323485e-05, "loss": 3.1543, "num_input_tokens_seen": 5965280, "step": 2845 }, { "epoch": 0.46496451586589443, "grad_norm": 10.1875, "learning_rate": 4.937675058750148e-05, "loss": 2.1452, "num_input_tokens_seen": 5975792, "step": 2850 }, { "epoch": 0.4657802430867118, "grad_norm": 3.59375, "learning_rate": 4.937457023547819e-05, "loss": 1.2275, "num_input_tokens_seen": 5986480, "step": 2855 }, { "epoch": 0.46659597030752914, "grad_norm": 9.5625, "learning_rate": 4.9372386124589876e-05, "loss": 3.0902, "num_input_tokens_seen": 5997264, "step": 2860 }, { "epoch": 0.46741169752834655, "grad_norm": 4.5625, "learning_rate": 4.937019825517333e-05, "loss": 1.1095, "num_input_tokens_seen": 6006976, "step": 2865 }, { "epoch": 0.4682274247491639, "grad_norm": 5.5625, "learning_rate": 4.9368006627565954e-05, "loss": 3.2964, "num_input_tokens_seen": 6016912, "step": 2870 }, { "epoch": 0.46904315196998125, "grad_norm": 6.84375, "learning_rate": 4.936581124210573e-05, "loss": 3.4153, "num_input_tokens_seen": 6027248, "step": 2875 }, { "epoch": 0.4698588791907986, "grad_norm": 11.1875, "learning_rate": 4.9363612099131216e-05, "loss": 3.9821, "num_input_tokens_seen": 6037616, "step": 2880 }, { "epoch": 0.47067460641161596, "grad_norm": 4.4375, "learning_rate": 4.936140919898155e-05, "loss": 1.3584, "num_input_tokens_seen": 6047600, "step": 2885 }, { "epoch": 0.4714903336324333, "grad_norm": 6.03125, "learning_rate": 4.9359202541996426e-05, "loss": 1.8138, "num_input_tokens_seen": 6056960, "step": 2890 }, { "epoch": 0.47230606085325066, "grad_norm": 2.203125, "learning_rate": 4.935699212851616e-05, "loss": 2.701, "num_input_tokens_seen": 6067408, "step": 2895 }, { "epoch": 0.473121788074068, "grad_norm": 4.25, "learning_rate": 4.935477795888162e-05, "loss": 2.3525, "num_input_tokens_seen": 6078240, "step": 2900 }, { "epoch": 0.47393751529488537, "grad_norm": 8.0, "learning_rate": 4.935256003343426e-05, "loss": 2.2479, "num_input_tokens_seen": 6087760, "step": 2905 }, { "epoch": 0.4747532425157027, "grad_norm": 1.9453125, "learning_rate": 4.93503383525161e-05, "loss": 2.5845, "num_input_tokens_seen": 6098512, "step": 2910 }, { "epoch": 0.47556896973652013, "grad_norm": 13.125, "learning_rate": 4.934811291646977e-05, "loss": 4.7693, "num_input_tokens_seen": 6108592, "step": 2915 }, { "epoch": 0.4763846969573375, "grad_norm": 6.25, "learning_rate": 4.934588372563845e-05, "loss": 1.5896, "num_input_tokens_seen": 6118816, "step": 2920 }, { "epoch": 0.47720042417815484, "grad_norm": 8.875, "learning_rate": 4.93436507803659e-05, "loss": 1.9661, "num_input_tokens_seen": 6129136, "step": 2925 }, { "epoch": 0.4780161513989722, "grad_norm": 10.8125, "learning_rate": 4.934141408099649e-05, "loss": 2.4596, "num_input_tokens_seen": 6138768, "step": 2930 }, { "epoch": 0.47883187861978954, "grad_norm": 9.4375, "learning_rate": 4.9339173627875135e-05, "loss": 3.1813, "num_input_tokens_seen": 6149824, "step": 2935 }, { "epoch": 0.4796476058406069, "grad_norm": 8.625, "learning_rate": 4.9336929421347335e-05, "loss": 3.1056, "num_input_tokens_seen": 6159184, "step": 2940 }, { "epoch": 0.48046333306142425, "grad_norm": 4.8125, "learning_rate": 4.933468146175918e-05, "loss": 1.3328, "num_input_tokens_seen": 6168000, "step": 2945 }, { "epoch": 0.4812790602822416, "grad_norm": 7.375, "learning_rate": 4.933242974945734e-05, "loss": 1.4682, "num_input_tokens_seen": 6178512, "step": 2950 }, { "epoch": 0.48209478750305895, "grad_norm": 4.25, "learning_rate": 4.933017428478906e-05, "loss": 1.3646, "num_input_tokens_seen": 6189360, "step": 2955 }, { "epoch": 0.48291051472387636, "grad_norm": 6.5625, "learning_rate": 4.932791506810214e-05, "loss": 2.1511, "num_input_tokens_seen": 6199632, "step": 2960 }, { "epoch": 0.4837262419446937, "grad_norm": 8.125, "learning_rate": 4.932565209974499e-05, "loss": 2.5074, "num_input_tokens_seen": 6211280, "step": 2965 }, { "epoch": 0.48454196916551107, "grad_norm": 8.4375, "learning_rate": 4.93233853800666e-05, "loss": 3.1926, "num_input_tokens_seen": 6221328, "step": 2970 }, { "epoch": 0.4853576963863284, "grad_norm": 10.1875, "learning_rate": 4.932111490941651e-05, "loss": 2.8718, "num_input_tokens_seen": 6231648, "step": 2975 }, { "epoch": 0.48617342360714577, "grad_norm": 4.875, "learning_rate": 4.9318840688144876e-05, "loss": 2.9625, "num_input_tokens_seen": 6243296, "step": 2980 }, { "epoch": 0.4869891508279631, "grad_norm": 5.5, "learning_rate": 4.9316562716602387e-05, "loss": 2.5188, "num_input_tokens_seen": 6254080, "step": 2985 }, { "epoch": 0.4878048780487805, "grad_norm": 8.0625, "learning_rate": 4.9314280995140346e-05, "loss": 2.2166, "num_input_tokens_seen": 6263680, "step": 2990 }, { "epoch": 0.48862060526959783, "grad_norm": 6.15625, "learning_rate": 4.931199552411063e-05, "loss": 2.1349, "num_input_tokens_seen": 6274560, "step": 2995 }, { "epoch": 0.4894363324904152, "grad_norm": 9.0, "learning_rate": 4.930970630386568e-05, "loss": 1.594, "num_input_tokens_seen": 6285232, "step": 3000 }, { "epoch": 0.4894363324904152, "eval_loss": 2.556004285812378, "eval_runtime": 134.74, "eval_samples_per_second": 20.224, "eval_steps_per_second": 10.116, "num_input_tokens_seen": 6285232, "step": 3000 }, { "epoch": 0.4902520597112326, "grad_norm": 4.5625, "learning_rate": 4.9307413334758524e-05, "loss": 3.6524, "num_input_tokens_seen": 6296272, "step": 3005 }, { "epoch": 0.49106778693204994, "grad_norm": 7.0625, "learning_rate": 4.930511661714276e-05, "loss": 2.1055, "num_input_tokens_seen": 6306720, "step": 3010 }, { "epoch": 0.4918835141528673, "grad_norm": 4.5, "learning_rate": 4.9302816151372576e-05, "loss": 2.7039, "num_input_tokens_seen": 6316480, "step": 3015 }, { "epoch": 0.49269924137368465, "grad_norm": 5.28125, "learning_rate": 4.930051193780274e-05, "loss": 2.569, "num_input_tokens_seen": 6326048, "step": 3020 }, { "epoch": 0.493514968594502, "grad_norm": 9.5, "learning_rate": 4.929820397678858e-05, "loss": 3.0794, "num_input_tokens_seen": 6335760, "step": 3025 }, { "epoch": 0.49433069581531935, "grad_norm": 10.0, "learning_rate": 4.9295892268686015e-05, "loss": 2.4776, "num_input_tokens_seen": 6345984, "step": 3030 }, { "epoch": 0.4951464230361367, "grad_norm": 0.546875, "learning_rate": 4.9293576813851536e-05, "loss": 1.4657, "num_input_tokens_seen": 6356448, "step": 3035 }, { "epoch": 0.49596215025695406, "grad_norm": 7.8125, "learning_rate": 4.929125761264223e-05, "loss": 3.0922, "num_input_tokens_seen": 6367440, "step": 3040 }, { "epoch": 0.4967778774777714, "grad_norm": 11.3125, "learning_rate": 4.928893466541573e-05, "loss": 3.0937, "num_input_tokens_seen": 6378768, "step": 3045 }, { "epoch": 0.49759360469858877, "grad_norm": 15.5625, "learning_rate": 4.928660797253027e-05, "loss": 2.7171, "num_input_tokens_seen": 6390032, "step": 3050 }, { "epoch": 0.4984093319194062, "grad_norm": 17.375, "learning_rate": 4.928427753434467e-05, "loss": 2.3117, "num_input_tokens_seen": 6400784, "step": 3055 }, { "epoch": 0.4992250591402235, "grad_norm": 10.75, "learning_rate": 4.9281943351218286e-05, "loss": 2.8343, "num_input_tokens_seen": 6410320, "step": 3060 }, { "epoch": 0.5000407863610409, "grad_norm": 7.59375, "learning_rate": 4.9279605423511095e-05, "loss": 3.4688, "num_input_tokens_seen": 6420368, "step": 3065 }, { "epoch": 0.5008565135818582, "grad_norm": 8.8125, "learning_rate": 4.927726375158363e-05, "loss": 3.1149, "num_input_tokens_seen": 6430912, "step": 3070 }, { "epoch": 0.5016722408026756, "grad_norm": 10.25, "learning_rate": 4.9274918335797004e-05, "loss": 2.3083, "num_input_tokens_seen": 6440720, "step": 3075 }, { "epoch": 0.5024879680234929, "grad_norm": 11.75, "learning_rate": 4.927256917651292e-05, "loss": 1.1752, "num_input_tokens_seen": 6450800, "step": 3080 }, { "epoch": 0.5033036952443103, "grad_norm": 8.4375, "learning_rate": 4.927021627409364e-05, "loss": 2.1302, "num_input_tokens_seen": 6460752, "step": 3085 }, { "epoch": 0.5041194224651276, "grad_norm": 3.671875, "learning_rate": 4.9267859628902005e-05, "loss": 2.0868, "num_input_tokens_seen": 6470208, "step": 3090 }, { "epoch": 0.504935149685945, "grad_norm": 8.875, "learning_rate": 4.9265499241301454e-05, "loss": 3.0766, "num_input_tokens_seen": 6480768, "step": 3095 }, { "epoch": 0.5057508769067623, "grad_norm": 6.40625, "learning_rate": 4.926313511165598e-05, "loss": 1.9637, "num_input_tokens_seen": 6490928, "step": 3100 }, { "epoch": 0.5065666041275797, "grad_norm": 9.5, "learning_rate": 4.926076724033016e-05, "loss": 2.5437, "num_input_tokens_seen": 6501856, "step": 3105 }, { "epoch": 0.507382331348397, "grad_norm": 3.5625, "learning_rate": 4.9258395627689146e-05, "loss": 2.0218, "num_input_tokens_seen": 6513232, "step": 3110 }, { "epoch": 0.5081980585692144, "grad_norm": 5.53125, "learning_rate": 4.925602027409868e-05, "loss": 2.3945, "num_input_tokens_seen": 6523632, "step": 3115 }, { "epoch": 0.5090137857900318, "grad_norm": 3.921875, "learning_rate": 4.925364117992507e-05, "loss": 2.8361, "num_input_tokens_seen": 6534176, "step": 3120 }, { "epoch": 0.5098295130108492, "grad_norm": 9.5, "learning_rate": 4.92512583455352e-05, "loss": 2.9379, "num_input_tokens_seen": 6544912, "step": 3125 }, { "epoch": 0.5106452402316666, "grad_norm": 10.375, "learning_rate": 4.9248871771296536e-05, "loss": 3.0284, "num_input_tokens_seen": 6554656, "step": 3130 }, { "epoch": 0.5114609674524839, "grad_norm": 9.5, "learning_rate": 4.924648145757711e-05, "loss": 3.8549, "num_input_tokens_seen": 6564032, "step": 3135 }, { "epoch": 0.5122766946733013, "grad_norm": 12.1875, "learning_rate": 4.924408740474554e-05, "loss": 2.1427, "num_input_tokens_seen": 6574544, "step": 3140 }, { "epoch": 0.5130924218941186, "grad_norm": 6.15625, "learning_rate": 4.924168961317103e-05, "loss": 2.6508, "num_input_tokens_seen": 6584448, "step": 3145 }, { "epoch": 0.513908149114936, "grad_norm": 6.65625, "learning_rate": 4.9239288083223334e-05, "loss": 2.5556, "num_input_tokens_seen": 6593552, "step": 3150 }, { "epoch": 0.5147238763357533, "grad_norm": 5.09375, "learning_rate": 4.9236882815272803e-05, "loss": 1.9787, "num_input_tokens_seen": 6602464, "step": 3155 }, { "epoch": 0.5155396035565707, "grad_norm": 10.375, "learning_rate": 4.9234473809690365e-05, "loss": 2.7816, "num_input_tokens_seen": 6612000, "step": 3160 }, { "epoch": 0.516355330777388, "grad_norm": 3.515625, "learning_rate": 4.923206106684752e-05, "loss": 2.1682, "num_input_tokens_seen": 6623184, "step": 3165 }, { "epoch": 0.5171710579982054, "grad_norm": 9.75, "learning_rate": 4.922964458711634e-05, "loss": 2.6035, "num_input_tokens_seen": 6634384, "step": 3170 }, { "epoch": 0.5179867852190227, "grad_norm": 3.109375, "learning_rate": 4.9227224370869474e-05, "loss": 2.0977, "num_input_tokens_seen": 6644624, "step": 3175 }, { "epoch": 0.5188025124398401, "grad_norm": 3.421875, "learning_rate": 4.9224800418480155e-05, "loss": 1.5099, "num_input_tokens_seen": 6654848, "step": 3180 }, { "epoch": 0.5196182396606575, "grad_norm": 7.6875, "learning_rate": 4.9222372730322176e-05, "loss": 3.129, "num_input_tokens_seen": 6665904, "step": 3185 }, { "epoch": 0.5204339668814748, "grad_norm": 6.0, "learning_rate": 4.921994130676993e-05, "loss": 2.2162, "num_input_tokens_seen": 6677872, "step": 3190 }, { "epoch": 0.5212496941022922, "grad_norm": 7.90625, "learning_rate": 4.9217506148198366e-05, "loss": 1.5603, "num_input_tokens_seen": 6688640, "step": 3195 }, { "epoch": 0.5220654213231095, "grad_norm": 9.25, "learning_rate": 4.921506725498302e-05, "loss": 2.1049, "num_input_tokens_seen": 6699264, "step": 3200 }, { "epoch": 0.5220654213231095, "eval_loss": 2.5613045692443848, "eval_runtime": 134.9035, "eval_samples_per_second": 20.2, "eval_steps_per_second": 10.104, "num_input_tokens_seen": 6699264, "step": 3200 }, { "epoch": 0.5228811485439269, "grad_norm": 6.625, "learning_rate": 4.9212624627499994e-05, "loss": 2.3308, "num_input_tokens_seen": 6710016, "step": 3205 }, { "epoch": 0.5236968757647442, "grad_norm": 6.75, "learning_rate": 4.921017826612597e-05, "loss": 1.6122, "num_input_tokens_seen": 6720304, "step": 3210 }, { "epoch": 0.5245126029855617, "grad_norm": 7.21875, "learning_rate": 4.9207728171238223e-05, "loss": 2.8545, "num_input_tokens_seen": 6730784, "step": 3215 }, { "epoch": 0.525328330206379, "grad_norm": 5.46875, "learning_rate": 4.920527434321458e-05, "loss": 1.2659, "num_input_tokens_seen": 6740208, "step": 3220 }, { "epoch": 0.5261440574271964, "grad_norm": 8.5, "learning_rate": 4.920281678243345e-05, "loss": 2.263, "num_input_tokens_seen": 6750864, "step": 3225 }, { "epoch": 0.5269597846480137, "grad_norm": 10.125, "learning_rate": 4.920035548927381e-05, "loss": 2.1463, "num_input_tokens_seen": 6761072, "step": 3230 }, { "epoch": 0.5277755118688311, "grad_norm": 9.1875, "learning_rate": 4.919789046411525e-05, "loss": 2.7964, "num_input_tokens_seen": 6770928, "step": 3235 }, { "epoch": 0.5285912390896484, "grad_norm": 3.71875, "learning_rate": 4.919542170733787e-05, "loss": 2.9093, "num_input_tokens_seen": 6781936, "step": 3240 }, { "epoch": 0.5294069663104658, "grad_norm": 8.6875, "learning_rate": 4.919294921932242e-05, "loss": 2.3789, "num_input_tokens_seen": 6791632, "step": 3245 }, { "epoch": 0.5302226935312832, "grad_norm": 10.9375, "learning_rate": 4.919047300045016e-05, "loss": 4.3352, "num_input_tokens_seen": 6802000, "step": 3250 }, { "epoch": 0.5310384207521005, "grad_norm": 10.5, "learning_rate": 4.918799305110299e-05, "loss": 1.2275, "num_input_tokens_seen": 6812976, "step": 3255 }, { "epoch": 0.5318541479729179, "grad_norm": 4.28125, "learning_rate": 4.918550937166331e-05, "loss": 2.0981, "num_input_tokens_seen": 6822992, "step": 3260 }, { "epoch": 0.5326698751937352, "grad_norm": 4.625, "learning_rate": 4.918302196251415e-05, "loss": 1.9569, "num_input_tokens_seen": 6833680, "step": 3265 }, { "epoch": 0.5334856024145526, "grad_norm": 4.53125, "learning_rate": 4.91805308240391e-05, "loss": 2.5389, "num_input_tokens_seen": 6843968, "step": 3270 }, { "epoch": 0.5343013296353699, "grad_norm": 8.375, "learning_rate": 4.9178035956622326e-05, "loss": 2.4846, "num_input_tokens_seen": 6855040, "step": 3275 }, { "epoch": 0.5351170568561873, "grad_norm": 8.625, "learning_rate": 4.917553736064857e-05, "loss": 3.2353, "num_input_tokens_seen": 6866688, "step": 3280 }, { "epoch": 0.5359327840770046, "grad_norm": 6.4375, "learning_rate": 4.917303503650314e-05, "loss": 2.669, "num_input_tokens_seen": 6877808, "step": 3285 }, { "epoch": 0.536748511297822, "grad_norm": 7.78125, "learning_rate": 4.917052898457194e-05, "loss": 1.5405, "num_input_tokens_seen": 6887824, "step": 3290 }, { "epoch": 0.5375642385186393, "grad_norm": 6.96875, "learning_rate": 4.916801920524141e-05, "loss": 2.6606, "num_input_tokens_seen": 6897936, "step": 3295 }, { "epoch": 0.5383799657394567, "grad_norm": 5.09375, "learning_rate": 4.916550569889862e-05, "loss": 1.7135, "num_input_tokens_seen": 6908448, "step": 3300 }, { "epoch": 0.539195692960274, "grad_norm": 7.09375, "learning_rate": 4.916298846593116e-05, "loss": 1.8162, "num_input_tokens_seen": 6919680, "step": 3305 }, { "epoch": 0.5400114201810915, "grad_norm": 6.9375, "learning_rate": 4.916046750672722e-05, "loss": 3.4111, "num_input_tokens_seen": 6930528, "step": 3310 }, { "epoch": 0.5408271474019088, "grad_norm": 10.625, "learning_rate": 4.915794282167559e-05, "loss": 4.0599, "num_input_tokens_seen": 6941360, "step": 3315 }, { "epoch": 0.5416428746227262, "grad_norm": 5.8125, "learning_rate": 4.915541441116558e-05, "loss": 3.2968, "num_input_tokens_seen": 6951440, "step": 3320 }, { "epoch": 0.5424586018435436, "grad_norm": 3.890625, "learning_rate": 4.915288227558711e-05, "loss": 3.3474, "num_input_tokens_seen": 6961856, "step": 3325 }, { "epoch": 0.5432743290643609, "grad_norm": 8.75, "learning_rate": 4.915034641533066e-05, "loss": 3.1139, "num_input_tokens_seen": 6971808, "step": 3330 }, { "epoch": 0.5440900562851783, "grad_norm": 6.625, "learning_rate": 4.914780683078731e-05, "loss": 1.8606, "num_input_tokens_seen": 6981968, "step": 3335 }, { "epoch": 0.5449057835059956, "grad_norm": 5.71875, "learning_rate": 4.9145263522348695e-05, "loss": 2.3299, "num_input_tokens_seen": 6992224, "step": 3340 }, { "epoch": 0.545721510726813, "grad_norm": 9.25, "learning_rate": 4.9142716490407e-05, "loss": 1.5681, "num_input_tokens_seen": 7002768, "step": 3345 }, { "epoch": 0.5465372379476303, "grad_norm": 9.8125, "learning_rate": 4.914016573535504e-05, "loss": 2.1197, "num_input_tokens_seen": 7012656, "step": 3350 }, { "epoch": 0.5473529651684477, "grad_norm": 6.59375, "learning_rate": 4.9137611257586154e-05, "loss": 3.2067, "num_input_tokens_seen": 7022368, "step": 3355 }, { "epoch": 0.548168692389265, "grad_norm": 8.75, "learning_rate": 4.9135053057494274e-05, "loss": 1.8967, "num_input_tokens_seen": 7033024, "step": 3360 }, { "epoch": 0.5489844196100824, "grad_norm": 7.28125, "learning_rate": 4.913249113547392e-05, "loss": 2.9176, "num_input_tokens_seen": 7043536, "step": 3365 }, { "epoch": 0.5498001468308997, "grad_norm": 6.875, "learning_rate": 4.912992549192016e-05, "loss": 2.0323, "num_input_tokens_seen": 7053824, "step": 3370 }, { "epoch": 0.5506158740517171, "grad_norm": 9.125, "learning_rate": 4.9127356127228665e-05, "loss": 2.8353, "num_input_tokens_seen": 7064464, "step": 3375 }, { "epoch": 0.5514316012725344, "grad_norm": 1.578125, "learning_rate": 4.912478304179564e-05, "loss": 1.5215, "num_input_tokens_seen": 7075408, "step": 3380 }, { "epoch": 0.5522473284933518, "grad_norm": 4.75, "learning_rate": 4.9122206236017896e-05, "loss": 3.1836, "num_input_tokens_seen": 7087296, "step": 3385 }, { "epoch": 0.5530630557141691, "grad_norm": 8.125, "learning_rate": 4.911962571029282e-05, "loss": 1.6045, "num_input_tokens_seen": 7098240, "step": 3390 }, { "epoch": 0.5538787829349865, "grad_norm": 10.125, "learning_rate": 4.9117041465018353e-05, "loss": 3.3567, "num_input_tokens_seen": 7108736, "step": 3395 }, { "epoch": 0.5546945101558038, "grad_norm": 4.5625, "learning_rate": 4.911445350059302e-05, "loss": 2.4132, "num_input_tokens_seen": 7118336, "step": 3400 }, { "epoch": 0.5546945101558038, "eval_loss": 2.552403450012207, "eval_runtime": 135.0245, "eval_samples_per_second": 20.182, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 7118336, "step": 3400 }, { "epoch": 0.5555102373766213, "grad_norm": 8.375, "learning_rate": 4.9111861817415905e-05, "loss": 1.8922, "num_input_tokens_seen": 7128304, "step": 3405 }, { "epoch": 0.5563259645974387, "grad_norm": 4.9375, "learning_rate": 4.91092664158867e-05, "loss": 2.5185, "num_input_tokens_seen": 7137456, "step": 3410 }, { "epoch": 0.557141691818256, "grad_norm": 4.9375, "learning_rate": 4.910666729640563e-05, "loss": 2.2434, "num_input_tokens_seen": 7147712, "step": 3415 }, { "epoch": 0.5579574190390734, "grad_norm": 1.828125, "learning_rate": 4.910406445937353e-05, "loss": 2.4215, "num_input_tokens_seen": 7158608, "step": 3420 }, { "epoch": 0.5587731462598907, "grad_norm": 3.0625, "learning_rate": 4.9101457905191774e-05, "loss": 1.5247, "num_input_tokens_seen": 7168880, "step": 3425 }, { "epoch": 0.5595888734807081, "grad_norm": 7.09375, "learning_rate": 4.909884763426233e-05, "loss": 2.3167, "num_input_tokens_seen": 7178576, "step": 3430 }, { "epoch": 0.5604046007015254, "grad_norm": 5.40625, "learning_rate": 4.9096233646987736e-05, "loss": 2.1785, "num_input_tokens_seen": 7189488, "step": 3435 }, { "epoch": 0.5612203279223428, "grad_norm": 2.25, "learning_rate": 4.9093615943771104e-05, "loss": 1.6636, "num_input_tokens_seen": 7199584, "step": 3440 }, { "epoch": 0.5620360551431601, "grad_norm": 5.5625, "learning_rate": 4.909099452501611e-05, "loss": 2.2311, "num_input_tokens_seen": 7208400, "step": 3445 }, { "epoch": 0.5628517823639775, "grad_norm": 9.75, "learning_rate": 4.908836939112702e-05, "loss": 2.5834, "num_input_tokens_seen": 7218704, "step": 3450 }, { "epoch": 0.5636675095847948, "grad_norm": 0.15625, "learning_rate": 4.908574054250865e-05, "loss": 2.2339, "num_input_tokens_seen": 7230304, "step": 3455 }, { "epoch": 0.5644832368056122, "grad_norm": 7.84375, "learning_rate": 4.9083107979566414e-05, "loss": 1.4289, "num_input_tokens_seen": 7240912, "step": 3460 }, { "epoch": 0.5652989640264295, "grad_norm": 0.205078125, "learning_rate": 4.908047170270628e-05, "loss": 0.7788, "num_input_tokens_seen": 7251872, "step": 3465 }, { "epoch": 0.5661146912472469, "grad_norm": 7.84375, "learning_rate": 4.9077831712334784e-05, "loss": 3.3725, "num_input_tokens_seen": 7262336, "step": 3470 }, { "epoch": 0.5669304184680642, "grad_norm": 7.0, "learning_rate": 4.907518800885907e-05, "loss": 2.1348, "num_input_tokens_seen": 7273344, "step": 3475 }, { "epoch": 0.5677461456888816, "grad_norm": 6.25, "learning_rate": 4.907254059268681e-05, "loss": 2.9166, "num_input_tokens_seen": 7282848, "step": 3480 }, { "epoch": 0.568561872909699, "grad_norm": 9.375, "learning_rate": 4.906988946422628e-05, "loss": 2.8431, "num_input_tokens_seen": 7293424, "step": 3485 }, { "epoch": 0.5693776001305163, "grad_norm": 7.71875, "learning_rate": 4.9067234623886315e-05, "loss": 1.2115, "num_input_tokens_seen": 7304192, "step": 3490 }, { "epoch": 0.5701933273513338, "grad_norm": 6.6875, "learning_rate": 4.9064576072076316e-05, "loss": 3.5245, "num_input_tokens_seen": 7315488, "step": 3495 }, { "epoch": 0.5710090545721511, "grad_norm": 3.46875, "learning_rate": 4.906191380920628e-05, "loss": 1.942, "num_input_tokens_seen": 7325840, "step": 3500 }, { "epoch": 0.5718247817929685, "grad_norm": 11.4375, "learning_rate": 4.905924783568675e-05, "loss": 2.1367, "num_input_tokens_seen": 7334864, "step": 3505 }, { "epoch": 0.5726405090137858, "grad_norm": 7.78125, "learning_rate": 4.905657815192886e-05, "loss": 4.048, "num_input_tokens_seen": 7345472, "step": 3510 }, { "epoch": 0.5734562362346032, "grad_norm": 0.1796875, "learning_rate": 4.90539047583443e-05, "loss": 1.5848, "num_input_tokens_seen": 7356752, "step": 3515 }, { "epoch": 0.5742719634554205, "grad_norm": 11.3125, "learning_rate": 4.905122765534534e-05, "loss": 2.2789, "num_input_tokens_seen": 7366736, "step": 3520 }, { "epoch": 0.5750876906762379, "grad_norm": 9.75, "learning_rate": 4.9048546843344846e-05, "loss": 2.2609, "num_input_tokens_seen": 7376416, "step": 3525 }, { "epoch": 0.5759034178970552, "grad_norm": 7.4375, "learning_rate": 4.9045862322756206e-05, "loss": 2.2595, "num_input_tokens_seen": 7387456, "step": 3530 }, { "epoch": 0.5767191451178726, "grad_norm": 6.4375, "learning_rate": 4.904317409399342e-05, "loss": 2.6043, "num_input_tokens_seen": 7398704, "step": 3535 }, { "epoch": 0.5775348723386899, "grad_norm": 9.4375, "learning_rate": 4.904048215747104e-05, "loss": 2.5046, "num_input_tokens_seen": 7408832, "step": 3540 }, { "epoch": 0.5783505995595073, "grad_norm": 9.125, "learning_rate": 4.90377865136042e-05, "loss": 3.227, "num_input_tokens_seen": 7419120, "step": 3545 }, { "epoch": 0.5791663267803246, "grad_norm": 12.0, "learning_rate": 4.90350871628086e-05, "loss": 2.0458, "num_input_tokens_seen": 7429712, "step": 3550 }, { "epoch": 0.579982054001142, "grad_norm": 7.4375, "learning_rate": 4.903238410550052e-05, "loss": 2.4662, "num_input_tokens_seen": 7440480, "step": 3555 }, { "epoch": 0.5807977812219594, "grad_norm": 14.1875, "learning_rate": 4.90296773420968e-05, "loss": 3.1046, "num_input_tokens_seen": 7451840, "step": 3560 }, { "epoch": 0.5816135084427767, "grad_norm": 9.5, "learning_rate": 4.902696687301486e-05, "loss": 1.7273, "num_input_tokens_seen": 7461696, "step": 3565 }, { "epoch": 0.5824292356635941, "grad_norm": 10.4375, "learning_rate": 4.902425269867268e-05, "loss": 2.9585, "num_input_tokens_seen": 7471568, "step": 3570 }, { "epoch": 0.5832449628844114, "grad_norm": 6.0625, "learning_rate": 4.902153481948883e-05, "loss": 2.2035, "num_input_tokens_seen": 7482512, "step": 3575 }, { "epoch": 0.5840606901052288, "grad_norm": 3.296875, "learning_rate": 4.901881323588244e-05, "loss": 1.3552, "num_input_tokens_seen": 7492016, "step": 3580 }, { "epoch": 0.5848764173260461, "grad_norm": 1.078125, "learning_rate": 4.90160879482732e-05, "loss": 0.9002, "num_input_tokens_seen": 7501328, "step": 3585 }, { "epoch": 0.5856921445468636, "grad_norm": 4.90625, "learning_rate": 4.9013358957081405e-05, "loss": 1.5126, "num_input_tokens_seen": 7511952, "step": 3590 }, { "epoch": 0.5865078717676809, "grad_norm": 15.3125, "learning_rate": 4.901062626272789e-05, "loss": 2.6654, "num_input_tokens_seen": 7523520, "step": 3595 }, { "epoch": 0.5873235989884983, "grad_norm": 7.84375, "learning_rate": 4.900788986563406e-05, "loss": 2.0952, "num_input_tokens_seen": 7533408, "step": 3600 }, { "epoch": 0.5873235989884983, "eval_loss": 2.552873373031616, "eval_runtime": 134.8449, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 7533408, "step": 3600 }, { "epoch": 0.5881393262093156, "grad_norm": 10.75, "learning_rate": 4.9005149766221915e-05, "loss": 1.7189, "num_input_tokens_seen": 7543760, "step": 3605 }, { "epoch": 0.588955053430133, "grad_norm": 6.4375, "learning_rate": 4.9002405964914e-05, "loss": 2.3758, "num_input_tokens_seen": 7554320, "step": 3610 }, { "epoch": 0.5897707806509503, "grad_norm": 9.0625, "learning_rate": 4.899965846213346e-05, "loss": 3.6809, "num_input_tokens_seen": 7564672, "step": 3615 }, { "epoch": 0.5905865078717677, "grad_norm": 9.4375, "learning_rate": 4.899690725830399e-05, "loss": 2.1018, "num_input_tokens_seen": 7574608, "step": 3620 }, { "epoch": 0.591402235092585, "grad_norm": 5.5, "learning_rate": 4.899415235384985e-05, "loss": 1.428, "num_input_tokens_seen": 7586704, "step": 3625 }, { "epoch": 0.5922179623134024, "grad_norm": 9.375, "learning_rate": 4.899139374919589e-05, "loss": 2.3859, "num_input_tokens_seen": 7596992, "step": 3630 }, { "epoch": 0.5930336895342198, "grad_norm": 10.6875, "learning_rate": 4.898863144476752e-05, "loss": 4.1084, "num_input_tokens_seen": 7607376, "step": 3635 }, { "epoch": 0.5938494167550371, "grad_norm": 10.0625, "learning_rate": 4.898586544099072e-05, "loss": 2.4244, "num_input_tokens_seen": 7616608, "step": 3640 }, { "epoch": 0.5946651439758545, "grad_norm": 3.609375, "learning_rate": 4.898309573829204e-05, "loss": 1.9677, "num_input_tokens_seen": 7626416, "step": 3645 }, { "epoch": 0.5954808711966718, "grad_norm": 5.40625, "learning_rate": 4.898032233709862e-05, "loss": 3.2209, "num_input_tokens_seen": 7637104, "step": 3650 }, { "epoch": 0.5962965984174892, "grad_norm": 4.625, "learning_rate": 4.8977545237838123e-05, "loss": 2.4833, "num_input_tokens_seen": 7648816, "step": 3655 }, { "epoch": 0.5971123256383065, "grad_norm": 7.6875, "learning_rate": 4.8974764440938836e-05, "loss": 2.2217, "num_input_tokens_seen": 7660336, "step": 3660 }, { "epoch": 0.5979280528591239, "grad_norm": 10.0625, "learning_rate": 4.897197994682959e-05, "loss": 1.9646, "num_input_tokens_seen": 7670336, "step": 3665 }, { "epoch": 0.5987437800799412, "grad_norm": 12.0, "learning_rate": 4.8969191755939786e-05, "loss": 2.4625, "num_input_tokens_seen": 7680000, "step": 3670 }, { "epoch": 0.5995595073007586, "grad_norm": 10.125, "learning_rate": 4.8966399868699396e-05, "loss": 2.3734, "num_input_tokens_seen": 7690912, "step": 3675 }, { "epoch": 0.600375234521576, "grad_norm": 4.71875, "learning_rate": 4.8963604285538965e-05, "loss": 1.7672, "num_input_tokens_seen": 7703616, "step": 3680 }, { "epoch": 0.6011909617423934, "grad_norm": 1.9140625, "learning_rate": 4.8960805006889604e-05, "loss": 2.2219, "num_input_tokens_seen": 7711952, "step": 3685 }, { "epoch": 0.6020066889632107, "grad_norm": 9.4375, "learning_rate": 4.8958002033183004e-05, "loss": 2.6358, "num_input_tokens_seen": 7723216, "step": 3690 }, { "epoch": 0.6028224161840281, "grad_norm": 7.9375, "learning_rate": 4.8955195364851414e-05, "loss": 3.7151, "num_input_tokens_seen": 7732192, "step": 3695 }, { "epoch": 0.6036381434048455, "grad_norm": 1.7421875, "learning_rate": 4.895238500232766e-05, "loss": 2.1522, "num_input_tokens_seen": 7742144, "step": 3700 }, { "epoch": 0.6044538706256628, "grad_norm": 5.375, "learning_rate": 4.8949570946045143e-05, "loss": 2.3937, "num_input_tokens_seen": 7753600, "step": 3705 }, { "epoch": 0.6052695978464802, "grad_norm": 3.921875, "learning_rate": 4.89467531964378e-05, "loss": 1.367, "num_input_tokens_seen": 7763248, "step": 3710 }, { "epoch": 0.6060853250672975, "grad_norm": 6.4375, "learning_rate": 4.894393175394019e-05, "loss": 3.0416, "num_input_tokens_seen": 7773424, "step": 3715 }, { "epoch": 0.6069010522881149, "grad_norm": 3.578125, "learning_rate": 4.8941106618987406e-05, "loss": 1.6572, "num_input_tokens_seen": 7783248, "step": 3720 }, { "epoch": 0.6077167795089322, "grad_norm": 7.09375, "learning_rate": 4.893827779201512e-05, "loss": 2.6385, "num_input_tokens_seen": 7794048, "step": 3725 }, { "epoch": 0.6085325067297496, "grad_norm": 6.875, "learning_rate": 4.893544527345957e-05, "loss": 3.5935, "num_input_tokens_seen": 7803920, "step": 3730 }, { "epoch": 0.6093482339505669, "grad_norm": 7.40625, "learning_rate": 4.8932609063757563e-05, "loss": 2.344, "num_input_tokens_seen": 7816048, "step": 3735 }, { "epoch": 0.6101639611713843, "grad_norm": 7.84375, "learning_rate": 4.8929769163346484e-05, "loss": 2.4508, "num_input_tokens_seen": 7827056, "step": 3740 }, { "epoch": 0.6109796883922016, "grad_norm": 6.9375, "learning_rate": 4.892692557266429e-05, "loss": 1.0042, "num_input_tokens_seen": 7837392, "step": 3745 }, { "epoch": 0.611795415613019, "grad_norm": 7.53125, "learning_rate": 4.8924078292149464e-05, "loss": 2.4875, "num_input_tokens_seen": 7847888, "step": 3750 }, { "epoch": 0.6126111428338363, "grad_norm": 17.75, "learning_rate": 4.892122732224114e-05, "loss": 3.0358, "num_input_tokens_seen": 7858064, "step": 3755 }, { "epoch": 0.6134268700546537, "grad_norm": 5.28125, "learning_rate": 4.8918372663378944e-05, "loss": 2.3247, "num_input_tokens_seen": 7867648, "step": 3760 }, { "epoch": 0.614242597275471, "grad_norm": 4.875, "learning_rate": 4.89155143160031e-05, "loss": 2.3061, "num_input_tokens_seen": 7878496, "step": 3765 }, { "epoch": 0.6150583244962884, "grad_norm": 8.875, "learning_rate": 4.891265228055441e-05, "loss": 3.2971, "num_input_tokens_seen": 7890016, "step": 3770 }, { "epoch": 0.6158740517171059, "grad_norm": 2.578125, "learning_rate": 4.890978655747424e-05, "loss": 1.7368, "num_input_tokens_seen": 7901056, "step": 3775 }, { "epoch": 0.6166897789379232, "grad_norm": 8.625, "learning_rate": 4.89069171472045e-05, "loss": 2.0773, "num_input_tokens_seen": 7911296, "step": 3780 }, { "epoch": 0.6175055061587406, "grad_norm": 5.625, "learning_rate": 4.890404405018772e-05, "loss": 2.245, "num_input_tokens_seen": 7920896, "step": 3785 }, { "epoch": 0.6183212333795579, "grad_norm": 9.4375, "learning_rate": 4.8901167266866934e-05, "loss": 2.376, "num_input_tokens_seen": 7931168, "step": 3790 }, { "epoch": 0.6191369606003753, "grad_norm": 11.8125, "learning_rate": 4.88982867976858e-05, "loss": 2.0185, "num_input_tokens_seen": 7941328, "step": 3795 }, { "epoch": 0.6199526878211926, "grad_norm": 6.65625, "learning_rate": 4.889540264308852e-05, "loss": 2.8358, "num_input_tokens_seen": 7950560, "step": 3800 }, { "epoch": 0.6199526878211926, "eval_loss": 2.542593240737915, "eval_runtime": 134.8495, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 7950560, "step": 3800 }, { "epoch": 0.62076841504201, "grad_norm": 6.1875, "learning_rate": 4.889251480351986e-05, "loss": 2.726, "num_input_tokens_seen": 7960672, "step": 3805 }, { "epoch": 0.6215841422628273, "grad_norm": 12.875, "learning_rate": 4.888962327942517e-05, "loss": 2.3652, "num_input_tokens_seen": 7970848, "step": 3810 }, { "epoch": 0.6223998694836447, "grad_norm": 9.0625, "learning_rate": 4.8886728071250356e-05, "loss": 3.3738, "num_input_tokens_seen": 7983216, "step": 3815 }, { "epoch": 0.623215596704462, "grad_norm": 7.5625, "learning_rate": 4.8883829179441884e-05, "loss": 2.6637, "num_input_tokens_seen": 7994064, "step": 3820 }, { "epoch": 0.6240313239252794, "grad_norm": 7.125, "learning_rate": 4.888092660444682e-05, "loss": 2.0591, "num_input_tokens_seen": 8004656, "step": 3825 }, { "epoch": 0.6248470511460967, "grad_norm": 7.09375, "learning_rate": 4.887802034671276e-05, "loss": 2.8221, "num_input_tokens_seen": 8015360, "step": 3830 }, { "epoch": 0.6256627783669141, "grad_norm": 5.4375, "learning_rate": 4.88751104066879e-05, "loss": 1.7193, "num_input_tokens_seen": 8025040, "step": 3835 }, { "epoch": 0.6264785055877314, "grad_norm": 2.328125, "learning_rate": 4.887219678482098e-05, "loss": 2.5356, "num_input_tokens_seen": 8036576, "step": 3840 }, { "epoch": 0.6272942328085488, "grad_norm": 7.46875, "learning_rate": 4.8869279481561316e-05, "loss": 2.0837, "num_input_tokens_seen": 8045728, "step": 3845 }, { "epoch": 0.6281099600293661, "grad_norm": 5.75, "learning_rate": 4.88663584973588e-05, "loss": 2.1651, "num_input_tokens_seen": 8056992, "step": 3850 }, { "epoch": 0.6289256872501835, "grad_norm": 0.31640625, "learning_rate": 4.8863433832663874e-05, "loss": 2.1636, "num_input_tokens_seen": 8065504, "step": 3855 }, { "epoch": 0.6297414144710008, "grad_norm": 5.53125, "learning_rate": 4.886050548792757e-05, "loss": 1.9006, "num_input_tokens_seen": 8076384, "step": 3860 }, { "epoch": 0.6305571416918182, "grad_norm": 11.8125, "learning_rate": 4.8857573463601465e-05, "loss": 2.5184, "num_input_tokens_seen": 8087296, "step": 3865 }, { "epoch": 0.6313728689126357, "grad_norm": 6.4375, "learning_rate": 4.885463776013772e-05, "loss": 2.4237, "num_input_tokens_seen": 8098368, "step": 3870 }, { "epoch": 0.632188596133453, "grad_norm": 12.25, "learning_rate": 4.8851698377989056e-05, "loss": 2.2896, "num_input_tokens_seen": 8109232, "step": 3875 }, { "epoch": 0.6330043233542704, "grad_norm": 5.78125, "learning_rate": 4.884875531760876e-05, "loss": 2.04, "num_input_tokens_seen": 8119808, "step": 3880 }, { "epoch": 0.6338200505750877, "grad_norm": 9.0625, "learning_rate": 4.88458085794507e-05, "loss": 2.3342, "num_input_tokens_seen": 8129376, "step": 3885 }, { "epoch": 0.6346357777959051, "grad_norm": 10.3125, "learning_rate": 4.884285816396929e-05, "loss": 3.3626, "num_input_tokens_seen": 8139168, "step": 3890 }, { "epoch": 0.6354515050167224, "grad_norm": 6.46875, "learning_rate": 4.8839904071619526e-05, "loss": 1.9, "num_input_tokens_seen": 8149264, "step": 3895 }, { "epoch": 0.6362672322375398, "grad_norm": 3.328125, "learning_rate": 4.8836946302856955e-05, "loss": 1.9218, "num_input_tokens_seen": 8161248, "step": 3900 }, { "epoch": 0.6370829594583571, "grad_norm": 10.6875, "learning_rate": 4.8833984858137715e-05, "loss": 2.6557, "num_input_tokens_seen": 8171248, "step": 3905 }, { "epoch": 0.6378986866791745, "grad_norm": 3.671875, "learning_rate": 4.8831019737918494e-05, "loss": 1.7655, "num_input_tokens_seen": 8182976, "step": 3910 }, { "epoch": 0.6387144138999918, "grad_norm": 2.8125, "learning_rate": 4.882805094265655e-05, "loss": 1.8309, "num_input_tokens_seen": 8192832, "step": 3915 }, { "epoch": 0.6395301411208092, "grad_norm": 4.75, "learning_rate": 4.8825078472809706e-05, "loss": 2.1732, "num_input_tokens_seen": 8202992, "step": 3920 }, { "epoch": 0.6403458683416265, "grad_norm": 9.25, "learning_rate": 4.882210232883635e-05, "loss": 2.4585, "num_input_tokens_seen": 8214176, "step": 3925 }, { "epoch": 0.6411615955624439, "grad_norm": 6.6875, "learning_rate": 4.881912251119546e-05, "loss": 2.413, "num_input_tokens_seen": 8225216, "step": 3930 }, { "epoch": 0.6419773227832613, "grad_norm": 9.375, "learning_rate": 4.881613902034654e-05, "loss": 2.6427, "num_input_tokens_seen": 8235680, "step": 3935 }, { "epoch": 0.6427930500040786, "grad_norm": 6.90625, "learning_rate": 4.88131518567497e-05, "loss": 1.1857, "num_input_tokens_seen": 8246480, "step": 3940 }, { "epoch": 0.643608777224896, "grad_norm": 4.9375, "learning_rate": 4.881016102086558e-05, "loss": 1.9682, "num_input_tokens_seen": 8256928, "step": 3945 }, { "epoch": 0.6444245044457133, "grad_norm": 4.90625, "learning_rate": 4.8807166513155425e-05, "loss": 3.1164, "num_input_tokens_seen": 8268096, "step": 3950 }, { "epoch": 0.6452402316665307, "grad_norm": 8.5625, "learning_rate": 4.8804168334081004e-05, "loss": 3.4649, "num_input_tokens_seen": 8279440, "step": 3955 }, { "epoch": 0.6460559588873481, "grad_norm": 5.09375, "learning_rate": 4.880116648410468e-05, "loss": 3.0097, "num_input_tokens_seen": 8290144, "step": 3960 }, { "epoch": 0.6468716861081655, "grad_norm": 0.376953125, "learning_rate": 4.879816096368939e-05, "loss": 1.2124, "num_input_tokens_seen": 8300240, "step": 3965 }, { "epoch": 0.6476874133289828, "grad_norm": 7.78125, "learning_rate": 4.879515177329861e-05, "loss": 2.0724, "num_input_tokens_seen": 8309632, "step": 3970 }, { "epoch": 0.6485031405498002, "grad_norm": 3.15625, "learning_rate": 4.8792138913396394e-05, "loss": 2.0452, "num_input_tokens_seen": 8320752, "step": 3975 }, { "epoch": 0.6493188677706175, "grad_norm": 10.9375, "learning_rate": 4.8789122384447374e-05, "loss": 2.1118, "num_input_tokens_seen": 8331136, "step": 3980 }, { "epoch": 0.6501345949914349, "grad_norm": 7.6875, "learning_rate": 4.878610218691673e-05, "loss": 2.3542, "num_input_tokens_seen": 8341552, "step": 3985 }, { "epoch": 0.6509503222122522, "grad_norm": 6.5625, "learning_rate": 4.87830783212702e-05, "loss": 2.6979, "num_input_tokens_seen": 8352176, "step": 3990 }, { "epoch": 0.6517660494330696, "grad_norm": 9.6875, "learning_rate": 4.878005078797413e-05, "loss": 3.1937, "num_input_tokens_seen": 8361952, "step": 3995 }, { "epoch": 0.652581776653887, "grad_norm": 10.75, "learning_rate": 4.877701958749539e-05, "loss": 2.2672, "num_input_tokens_seen": 8372672, "step": 4000 }, { "epoch": 0.652581776653887, "eval_loss": 2.5473592281341553, "eval_runtime": 135.1225, "eval_samples_per_second": 20.167, "eval_steps_per_second": 10.087, "num_input_tokens_seen": 8372672, "step": 4000 }, { "epoch": 0.6533975038747043, "grad_norm": 7.65625, "learning_rate": 4.877398472030142e-05, "loss": 1.8732, "num_input_tokens_seen": 8382752, "step": 4005 }, { "epoch": 0.6542132310955217, "grad_norm": 7.0, "learning_rate": 4.877094618686024e-05, "loss": 1.8587, "num_input_tokens_seen": 8394112, "step": 4010 }, { "epoch": 0.655028958316339, "grad_norm": 12.25, "learning_rate": 4.876790398764045e-05, "loss": 2.7755, "num_input_tokens_seen": 8404576, "step": 4015 }, { "epoch": 0.6558446855371564, "grad_norm": 7.625, "learning_rate": 4.8764858123111167e-05, "loss": 2.8243, "num_input_tokens_seen": 8414592, "step": 4020 }, { "epoch": 0.6566604127579737, "grad_norm": 10.125, "learning_rate": 4.876180859374212e-05, "loss": 3.097, "num_input_tokens_seen": 8425168, "step": 4025 }, { "epoch": 0.6574761399787911, "grad_norm": 8.5, "learning_rate": 4.875875540000357e-05, "loss": 2.681, "num_input_tokens_seen": 8436768, "step": 4030 }, { "epoch": 0.6582918671996084, "grad_norm": 17.5, "learning_rate": 4.8755698542366376e-05, "loss": 2.1566, "num_input_tokens_seen": 8448176, "step": 4035 }, { "epoch": 0.6591075944204258, "grad_norm": 6.1875, "learning_rate": 4.875263802130193e-05, "loss": 2.7086, "num_input_tokens_seen": 8460480, "step": 4040 }, { "epoch": 0.6599233216412431, "grad_norm": 2.015625, "learning_rate": 4.8749573837282207e-05, "loss": 2.5483, "num_input_tokens_seen": 8470400, "step": 4045 }, { "epoch": 0.6607390488620605, "grad_norm": 12.625, "learning_rate": 4.874650599077974e-05, "loss": 2.7352, "num_input_tokens_seen": 8480352, "step": 4050 }, { "epoch": 0.6615547760828779, "grad_norm": 6.09375, "learning_rate": 4.874343448226764e-05, "loss": 2.3886, "num_input_tokens_seen": 8491344, "step": 4055 }, { "epoch": 0.6623705033036953, "grad_norm": 10.125, "learning_rate": 4.874035931221955e-05, "loss": 2.279, "num_input_tokens_seen": 8502480, "step": 4060 }, { "epoch": 0.6631862305245126, "grad_norm": 11.4375, "learning_rate": 4.8737280481109724e-05, "loss": 1.8528, "num_input_tokens_seen": 8512672, "step": 4065 }, { "epoch": 0.66400195774533, "grad_norm": 8.1875, "learning_rate": 4.873419798941294e-05, "loss": 2.2189, "num_input_tokens_seen": 8523856, "step": 4070 }, { "epoch": 0.6648176849661473, "grad_norm": 3.859375, "learning_rate": 4.873111183760458e-05, "loss": 1.986, "num_input_tokens_seen": 8536192, "step": 4075 }, { "epoch": 0.6656334121869647, "grad_norm": 3.765625, "learning_rate": 4.8728022026160537e-05, "loss": 2.0461, "num_input_tokens_seen": 8545136, "step": 4080 }, { "epoch": 0.666449139407782, "grad_norm": 8.25, "learning_rate": 4.872492855555732e-05, "loss": 2.3533, "num_input_tokens_seen": 8555456, "step": 4085 }, { "epoch": 0.6672648666285994, "grad_norm": 3.25, "learning_rate": 4.8721831426271956e-05, "loss": 1.2958, "num_input_tokens_seen": 8564304, "step": 4090 }, { "epoch": 0.6680805938494168, "grad_norm": 0.068359375, "learning_rate": 4.87187306387821e-05, "loss": 2.3116, "num_input_tokens_seen": 8575536, "step": 4095 }, { "epoch": 0.6688963210702341, "grad_norm": 1.578125, "learning_rate": 4.87156261935659e-05, "loss": 2.1941, "num_input_tokens_seen": 8587056, "step": 4100 }, { "epoch": 0.6697120482910515, "grad_norm": 7.8125, "learning_rate": 4.871251809110211e-05, "loss": 2.1207, "num_input_tokens_seen": 8597808, "step": 4105 }, { "epoch": 0.6705277755118688, "grad_norm": 6.03125, "learning_rate": 4.8709406331870044e-05, "loss": 2.5123, "num_input_tokens_seen": 8608592, "step": 4110 }, { "epoch": 0.6713435027326862, "grad_norm": 9.5, "learning_rate": 4.8706290916349574e-05, "loss": 2.4135, "num_input_tokens_seen": 8617824, "step": 4115 }, { "epoch": 0.6721592299535035, "grad_norm": 9.25, "learning_rate": 4.8703171845021134e-05, "loss": 1.7784, "num_input_tokens_seen": 8628464, "step": 4120 }, { "epoch": 0.6729749571743209, "grad_norm": 9.1875, "learning_rate": 4.870004911836572e-05, "loss": 2.8992, "num_input_tokens_seen": 8639168, "step": 4125 }, { "epoch": 0.6737906843951382, "grad_norm": 4.1875, "learning_rate": 4.869692273686489e-05, "loss": 2.2268, "num_input_tokens_seen": 8649680, "step": 4130 }, { "epoch": 0.6746064116159556, "grad_norm": 9.625, "learning_rate": 4.869379270100079e-05, "loss": 3.371, "num_input_tokens_seen": 8660400, "step": 4135 }, { "epoch": 0.6754221388367729, "grad_norm": 8.625, "learning_rate": 4.86906590112561e-05, "loss": 3.0593, "num_input_tokens_seen": 8670080, "step": 4140 }, { "epoch": 0.6762378660575903, "grad_norm": 11.375, "learning_rate": 4.8687521668114064e-05, "loss": 3.0317, "num_input_tokens_seen": 8679952, "step": 4145 }, { "epoch": 0.6770535932784078, "grad_norm": 8.3125, "learning_rate": 4.868438067205853e-05, "loss": 2.8708, "num_input_tokens_seen": 8689920, "step": 4150 }, { "epoch": 0.6778693204992251, "grad_norm": 12.25, "learning_rate": 4.8681236023573844e-05, "loss": 3.0706, "num_input_tokens_seen": 8700432, "step": 4155 }, { "epoch": 0.6786850477200425, "grad_norm": 7.875, "learning_rate": 4.867808772314497e-05, "loss": 3.2197, "num_input_tokens_seen": 8711104, "step": 4160 }, { "epoch": 0.6795007749408598, "grad_norm": 7.21875, "learning_rate": 4.867493577125741e-05, "loss": 2.6816, "num_input_tokens_seen": 8721264, "step": 4165 }, { "epoch": 0.6803165021616772, "grad_norm": 2.046875, "learning_rate": 4.867178016839725e-05, "loss": 2.1749, "num_input_tokens_seen": 8732112, "step": 4170 }, { "epoch": 0.6811322293824945, "grad_norm": 9.0, "learning_rate": 4.8668620915051094e-05, "loss": 1.5776, "num_input_tokens_seen": 8742464, "step": 4175 }, { "epoch": 0.6819479566033119, "grad_norm": 7.25, "learning_rate": 4.866545801170616e-05, "loss": 2.6211, "num_input_tokens_seen": 8754128, "step": 4180 }, { "epoch": 0.6827636838241292, "grad_norm": 5.46875, "learning_rate": 4.86622914588502e-05, "loss": 1.5238, "num_input_tokens_seen": 8765280, "step": 4185 }, { "epoch": 0.6835794110449466, "grad_norm": 9.9375, "learning_rate": 4.865912125697154e-05, "loss": 2.4654, "num_input_tokens_seen": 8775712, "step": 4190 }, { "epoch": 0.6843951382657639, "grad_norm": 0.07177734375, "learning_rate": 4.865594740655907e-05, "loss": 1.6411, "num_input_tokens_seen": 8785376, "step": 4195 }, { "epoch": 0.6852108654865813, "grad_norm": 8.4375, "learning_rate": 4.865276990810222e-05, "loss": 1.7307, "num_input_tokens_seen": 8796048, "step": 4200 }, { "epoch": 0.6852108654865813, "eval_loss": 2.558070421218872, "eval_runtime": 134.8436, "eval_samples_per_second": 20.209, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 8796048, "step": 4200 }, { "epoch": 0.6860265927073986, "grad_norm": 9.3125, "learning_rate": 4.8649588762091016e-05, "loss": 1.0379, "num_input_tokens_seen": 8807040, "step": 4205 }, { "epoch": 0.686842319928216, "grad_norm": 8.1875, "learning_rate": 4.8646403969016016e-05, "loss": 1.9028, "num_input_tokens_seen": 8817664, "step": 4210 }, { "epoch": 0.6876580471490333, "grad_norm": 10.5625, "learning_rate": 4.864321552936838e-05, "loss": 2.1559, "num_input_tokens_seen": 8828880, "step": 4215 }, { "epoch": 0.6884737743698507, "grad_norm": 1.5078125, "learning_rate": 4.864002344363978e-05, "loss": 1.9426, "num_input_tokens_seen": 8839840, "step": 4220 }, { "epoch": 0.689289501590668, "grad_norm": 10.1875, "learning_rate": 4.863682771232248e-05, "loss": 1.3108, "num_input_tokens_seen": 8850224, "step": 4225 }, { "epoch": 0.6901052288114854, "grad_norm": 2.046875, "learning_rate": 4.8633628335909324e-05, "loss": 1.9218, "num_input_tokens_seen": 8862592, "step": 4230 }, { "epoch": 0.6909209560323027, "grad_norm": 10.6875, "learning_rate": 4.8630425314893676e-05, "loss": 1.8785, "num_input_tokens_seen": 8872656, "step": 4235 }, { "epoch": 0.6917366832531202, "grad_norm": 6.625, "learning_rate": 4.862721864976948e-05, "loss": 1.9657, "num_input_tokens_seen": 8882016, "step": 4240 }, { "epoch": 0.6925524104739376, "grad_norm": 3.890625, "learning_rate": 4.862400834103125e-05, "loss": 1.9744, "num_input_tokens_seen": 8891712, "step": 4245 }, { "epoch": 0.6933681376947549, "grad_norm": 7.84375, "learning_rate": 4.862079438917406e-05, "loss": 2.1444, "num_input_tokens_seen": 8902000, "step": 4250 }, { "epoch": 0.6941838649155723, "grad_norm": 2.203125, "learning_rate": 4.8617576794693536e-05, "loss": 2.6894, "num_input_tokens_seen": 8911040, "step": 4255 }, { "epoch": 0.6949995921363896, "grad_norm": 3.84375, "learning_rate": 4.8614355558085875e-05, "loss": 2.3691, "num_input_tokens_seen": 8920912, "step": 4260 }, { "epoch": 0.695815319357207, "grad_norm": 7.6875, "learning_rate": 4.861113067984783e-05, "loss": 2.8364, "num_input_tokens_seen": 8931728, "step": 4265 }, { "epoch": 0.6966310465780243, "grad_norm": 3.875, "learning_rate": 4.860790216047671e-05, "loss": 2.4062, "num_input_tokens_seen": 8941616, "step": 4270 }, { "epoch": 0.6974467737988417, "grad_norm": 10.4375, "learning_rate": 4.860467000047041e-05, "loss": 1.8364, "num_input_tokens_seen": 8951008, "step": 4275 }, { "epoch": 0.698262501019659, "grad_norm": 10.875, "learning_rate": 4.860143420032737e-05, "loss": 1.9861, "num_input_tokens_seen": 8961856, "step": 4280 }, { "epoch": 0.6990782282404764, "grad_norm": 8.4375, "learning_rate": 4.859819476054657e-05, "loss": 2.5696, "num_input_tokens_seen": 8971792, "step": 4285 }, { "epoch": 0.6998939554612937, "grad_norm": 7.65625, "learning_rate": 4.859495168162758e-05, "loss": 2.2095, "num_input_tokens_seen": 8982000, "step": 4290 }, { "epoch": 0.7007096826821111, "grad_norm": 11.1875, "learning_rate": 4.859170496407054e-05, "loss": 4.1751, "num_input_tokens_seen": 8990832, "step": 4295 }, { "epoch": 0.7015254099029284, "grad_norm": 3.625, "learning_rate": 4.8588454608376114e-05, "loss": 2.2711, "num_input_tokens_seen": 8999904, "step": 4300 }, { "epoch": 0.7023411371237458, "grad_norm": 9.0625, "learning_rate": 4.8585200615045555e-05, "loss": 2.43, "num_input_tokens_seen": 9009888, "step": 4305 }, { "epoch": 0.7031568643445631, "grad_norm": 7.625, "learning_rate": 4.8581942984580674e-05, "loss": 1.9469, "num_input_tokens_seen": 9020224, "step": 4310 }, { "epoch": 0.7039725915653805, "grad_norm": 5.78125, "learning_rate": 4.857868171748384e-05, "loss": 2.8645, "num_input_tokens_seen": 9030176, "step": 4315 }, { "epoch": 0.7047883187861979, "grad_norm": 6.25, "learning_rate": 4.8575416814257976e-05, "loss": 1.8936, "num_input_tokens_seen": 9041312, "step": 4320 }, { "epoch": 0.7056040460070152, "grad_norm": 12.1875, "learning_rate": 4.857214827540657e-05, "loss": 2.8018, "num_input_tokens_seen": 9052368, "step": 4325 }, { "epoch": 0.7064197732278326, "grad_norm": 6.5625, "learning_rate": 4.856887610143367e-05, "loss": 2.8542, "num_input_tokens_seen": 9064096, "step": 4330 }, { "epoch": 0.70723550044865, "grad_norm": 14.3125, "learning_rate": 4.8565600292843896e-05, "loss": 3.1432, "num_input_tokens_seen": 9074064, "step": 4335 }, { "epoch": 0.7080512276694674, "grad_norm": 9.0, "learning_rate": 4.856232085014241e-05, "loss": 2.4022, "num_input_tokens_seen": 9085728, "step": 4340 }, { "epoch": 0.7088669548902847, "grad_norm": 9.0, "learning_rate": 4.855903777383495e-05, "loss": 2.5207, "num_input_tokens_seen": 9097440, "step": 4345 }, { "epoch": 0.7096826821111021, "grad_norm": 5.90625, "learning_rate": 4.85557510644278e-05, "loss": 2.5286, "num_input_tokens_seen": 9108320, "step": 4350 }, { "epoch": 0.7104984093319194, "grad_norm": 4.5, "learning_rate": 4.855246072242782e-05, "loss": 1.6211, "num_input_tokens_seen": 9119232, "step": 4355 }, { "epoch": 0.7113141365527368, "grad_norm": 7.28125, "learning_rate": 4.8549166748342414e-05, "loss": 2.2292, "num_input_tokens_seen": 9128816, "step": 4360 }, { "epoch": 0.7121298637735541, "grad_norm": 7.34375, "learning_rate": 4.8545869142679556e-05, "loss": 3.4955, "num_input_tokens_seen": 9137840, "step": 4365 }, { "epoch": 0.7129455909943715, "grad_norm": 2.859375, "learning_rate": 4.8542567905947776e-05, "loss": 2.2395, "num_input_tokens_seen": 9148672, "step": 4370 }, { "epoch": 0.7137613182151888, "grad_norm": 7.21875, "learning_rate": 4.853926303865618e-05, "loss": 1.7518, "num_input_tokens_seen": 9158720, "step": 4375 }, { "epoch": 0.7145770454360062, "grad_norm": 7.0, "learning_rate": 4.853595454131441e-05, "loss": 2.1249, "num_input_tokens_seen": 9168240, "step": 4380 }, { "epoch": 0.7153927726568236, "grad_norm": 5.21875, "learning_rate": 4.8532642414432674e-05, "loss": 2.0307, "num_input_tokens_seen": 9178704, "step": 4385 }, { "epoch": 0.7162084998776409, "grad_norm": 5.0, "learning_rate": 4.8529326658521754e-05, "loss": 2.4567, "num_input_tokens_seen": 9189648, "step": 4390 }, { "epoch": 0.7170242270984583, "grad_norm": 13.0625, "learning_rate": 4.8526007274092965e-05, "loss": 3.8078, "num_input_tokens_seen": 9199696, "step": 4395 }, { "epoch": 0.7178399543192756, "grad_norm": 14.75, "learning_rate": 4.852268426165822e-05, "loss": 2.0188, "num_input_tokens_seen": 9210416, "step": 4400 }, { "epoch": 0.7178399543192756, "eval_loss": 2.547496795654297, "eval_runtime": 135.0287, "eval_samples_per_second": 20.181, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 9210416, "step": 4400 }, { "epoch": 0.718655681540093, "grad_norm": 6.0, "learning_rate": 4.851935762172995e-05, "loss": 1.4136, "num_input_tokens_seen": 9220016, "step": 4405 }, { "epoch": 0.7194714087609103, "grad_norm": 9.0, "learning_rate": 4.8516027354821175e-05, "loss": 3.8296, "num_input_tokens_seen": 9230944, "step": 4410 }, { "epoch": 0.7202871359817277, "grad_norm": 8.1875, "learning_rate": 4.851269346144546e-05, "loss": 3.7847, "num_input_tokens_seen": 9240528, "step": 4415 }, { "epoch": 0.721102863202545, "grad_norm": 10.5, "learning_rate": 4.850935594211693e-05, "loss": 2.8351, "num_input_tokens_seen": 9250496, "step": 4420 }, { "epoch": 0.7219185904233625, "grad_norm": 2.5, "learning_rate": 4.850601479735029e-05, "loss": 1.8854, "num_input_tokens_seen": 9260832, "step": 4425 }, { "epoch": 0.7227343176441798, "grad_norm": 4.125, "learning_rate": 4.850267002766076e-05, "loss": 1.7331, "num_input_tokens_seen": 9270352, "step": 4430 }, { "epoch": 0.7235500448649972, "grad_norm": 8.875, "learning_rate": 4.849932163356417e-05, "loss": 2.5217, "num_input_tokens_seen": 9281120, "step": 4435 }, { "epoch": 0.7243657720858145, "grad_norm": 6.6875, "learning_rate": 4.8495969615576864e-05, "loss": 2.0177, "num_input_tokens_seen": 9292000, "step": 4440 }, { "epoch": 0.7251814993066319, "grad_norm": 4.4375, "learning_rate": 4.849261397421577e-05, "loss": 2.2123, "num_input_tokens_seen": 9302480, "step": 4445 }, { "epoch": 0.7259972265274492, "grad_norm": 4.28125, "learning_rate": 4.848925470999839e-05, "loss": 2.4211, "num_input_tokens_seen": 9312464, "step": 4450 }, { "epoch": 0.7268129537482666, "grad_norm": 6.9375, "learning_rate": 4.848589182344273e-05, "loss": 2.667, "num_input_tokens_seen": 9322224, "step": 4455 }, { "epoch": 0.727628680969084, "grad_norm": 11.3125, "learning_rate": 4.848252531506742e-05, "loss": 4.6915, "num_input_tokens_seen": 9332176, "step": 4460 }, { "epoch": 0.7284444081899013, "grad_norm": 6.40625, "learning_rate": 4.847915518539161e-05, "loss": 3.0387, "num_input_tokens_seen": 9344176, "step": 4465 }, { "epoch": 0.7292601354107187, "grad_norm": 3.5, "learning_rate": 4.847578143493501e-05, "loss": 1.1769, "num_input_tokens_seen": 9355904, "step": 4470 }, { "epoch": 0.730075862631536, "grad_norm": 12.6875, "learning_rate": 4.847240406421789e-05, "loss": 3.601, "num_input_tokens_seen": 9366320, "step": 4475 }, { "epoch": 0.7308915898523534, "grad_norm": 9.0, "learning_rate": 4.84690230737611e-05, "loss": 1.9841, "num_input_tokens_seen": 9377504, "step": 4480 }, { "epoch": 0.7317073170731707, "grad_norm": 3.375, "learning_rate": 4.846563846408602e-05, "loss": 1.5232, "num_input_tokens_seen": 9387936, "step": 4485 }, { "epoch": 0.7325230442939881, "grad_norm": 8.3125, "learning_rate": 4.84622502357146e-05, "loss": 4.1495, "num_input_tokens_seen": 9398496, "step": 4490 }, { "epoch": 0.7333387715148054, "grad_norm": 15.5625, "learning_rate": 4.8458858389169345e-05, "loss": 2.6902, "num_input_tokens_seen": 9408848, "step": 4495 }, { "epoch": 0.7341544987356228, "grad_norm": 4.96875, "learning_rate": 4.8455462924973334e-05, "loss": 2.1822, "num_input_tokens_seen": 9418224, "step": 4500 }, { "epoch": 0.7349702259564401, "grad_norm": 1.2421875, "learning_rate": 4.845206384365018e-05, "loss": 2.3119, "num_input_tokens_seen": 9428464, "step": 4505 }, { "epoch": 0.7357859531772575, "grad_norm": 6.875, "learning_rate": 4.844866114572405e-05, "loss": 2.7446, "num_input_tokens_seen": 9437792, "step": 4510 }, { "epoch": 0.7366016803980748, "grad_norm": 7.5625, "learning_rate": 4.8445254831719706e-05, "loss": 1.9596, "num_input_tokens_seen": 9448144, "step": 4515 }, { "epoch": 0.7374174076188923, "grad_norm": 2.265625, "learning_rate": 4.8441844902162434e-05, "loss": 1.5452, "num_input_tokens_seen": 9457312, "step": 4520 }, { "epoch": 0.7382331348397096, "grad_norm": 3.453125, "learning_rate": 4.843843135757809e-05, "loss": 1.126, "num_input_tokens_seen": 9469024, "step": 4525 }, { "epoch": 0.739048862060527, "grad_norm": 8.0, "learning_rate": 4.843501419849308e-05, "loss": 3.9736, "num_input_tokens_seen": 9480000, "step": 4530 }, { "epoch": 0.7398645892813444, "grad_norm": 6.0625, "learning_rate": 4.8431593425434386e-05, "loss": 2.5313, "num_input_tokens_seen": 9491696, "step": 4535 }, { "epoch": 0.7406803165021617, "grad_norm": 4.8125, "learning_rate": 4.8428169038929526e-05, "loss": 2.5195, "num_input_tokens_seen": 9502528, "step": 4540 }, { "epoch": 0.7414960437229791, "grad_norm": 10.4375, "learning_rate": 4.8424741039506575e-05, "loss": 2.3166, "num_input_tokens_seen": 9512112, "step": 4545 }, { "epoch": 0.7423117709437964, "grad_norm": 10.5625, "learning_rate": 4.842130942769419e-05, "loss": 1.2655, "num_input_tokens_seen": 9523568, "step": 4550 }, { "epoch": 0.7431274981646138, "grad_norm": 4.0625, "learning_rate": 4.841787420402156e-05, "loss": 2.5286, "num_input_tokens_seen": 9533392, "step": 4555 }, { "epoch": 0.7439432253854311, "grad_norm": 5.15625, "learning_rate": 4.841443536901844e-05, "loss": 3.0285, "num_input_tokens_seen": 9544560, "step": 4560 }, { "epoch": 0.7447589526062485, "grad_norm": 8.3125, "learning_rate": 4.841099292321514e-05, "loss": 2.3562, "num_input_tokens_seen": 9553984, "step": 4565 }, { "epoch": 0.7455746798270658, "grad_norm": 12.5, "learning_rate": 4.8407546867142525e-05, "loss": 3.4789, "num_input_tokens_seen": 9563664, "step": 4570 }, { "epoch": 0.7463904070478832, "grad_norm": 11.0625, "learning_rate": 4.840409720133203e-05, "loss": 1.856, "num_input_tokens_seen": 9574256, "step": 4575 }, { "epoch": 0.7472061342687005, "grad_norm": 3.703125, "learning_rate": 4.8400643926315634e-05, "loss": 1.7858, "num_input_tokens_seen": 9584800, "step": 4580 }, { "epoch": 0.7480218614895179, "grad_norm": 6.3125, "learning_rate": 4.839718704262587e-05, "loss": 3.5959, "num_input_tokens_seen": 9595120, "step": 4585 }, { "epoch": 0.7488375887103352, "grad_norm": 4.375, "learning_rate": 4.839372655079585e-05, "loss": 1.0393, "num_input_tokens_seen": 9605024, "step": 4590 }, { "epoch": 0.7496533159311526, "grad_norm": 1.921875, "learning_rate": 4.83902624513592e-05, "loss": 2.8227, "num_input_tokens_seen": 9617520, "step": 4595 }, { "epoch": 0.7504690431519699, "grad_norm": 5.59375, "learning_rate": 4.838679474485014e-05, "loss": 2.0703, "num_input_tokens_seen": 9628832, "step": 4600 }, { "epoch": 0.7504690431519699, "eval_loss": 2.543306827545166, "eval_runtime": 135.0617, "eval_samples_per_second": 20.176, "eval_steps_per_second": 10.092, "num_input_tokens_seen": 9628832, "step": 4600 }, { "epoch": 0.7512847703727873, "grad_norm": 10.125, "learning_rate": 4.838332343180343e-05, "loss": 2.2992, "num_input_tokens_seen": 9639408, "step": 4605 }, { "epoch": 0.7521004975936046, "grad_norm": 7.4375, "learning_rate": 4.83798485127544e-05, "loss": 1.8661, "num_input_tokens_seen": 9650752, "step": 4610 }, { "epoch": 0.7529162248144221, "grad_norm": 10.9375, "learning_rate": 4.837636998823892e-05, "loss": 2.3467, "num_input_tokens_seen": 9660672, "step": 4615 }, { "epoch": 0.7537319520352395, "grad_norm": 10.125, "learning_rate": 4.8372887858793414e-05, "loss": 2.0318, "num_input_tokens_seen": 9670256, "step": 4620 }, { "epoch": 0.7545476792560568, "grad_norm": 8.3125, "learning_rate": 4.836940212495489e-05, "loss": 4.4494, "num_input_tokens_seen": 9680592, "step": 4625 }, { "epoch": 0.7553634064768742, "grad_norm": 9.375, "learning_rate": 4.836591278726087e-05, "loss": 2.0766, "num_input_tokens_seen": 9689904, "step": 4630 }, { "epoch": 0.7561791336976915, "grad_norm": 8.0625, "learning_rate": 4.836241984624947e-05, "loss": 2.3191, "num_input_tokens_seen": 9699888, "step": 4635 }, { "epoch": 0.7569948609185089, "grad_norm": 3.21875, "learning_rate": 4.8358923302459336e-05, "loss": 2.5817, "num_input_tokens_seen": 9710592, "step": 4640 }, { "epoch": 0.7578105881393262, "grad_norm": 7.15625, "learning_rate": 4.835542315642968e-05, "loss": 1.6087, "num_input_tokens_seen": 9721184, "step": 4645 }, { "epoch": 0.7586263153601436, "grad_norm": 2.71875, "learning_rate": 4.8351919408700274e-05, "loss": 2.1471, "num_input_tokens_seen": 9732384, "step": 4650 }, { "epoch": 0.7594420425809609, "grad_norm": 7.65625, "learning_rate": 4.834841205981144e-05, "loss": 2.3985, "num_input_tokens_seen": 9742544, "step": 4655 }, { "epoch": 0.7602577698017783, "grad_norm": 3.6875, "learning_rate": 4.8344901110304054e-05, "loss": 3.0241, "num_input_tokens_seen": 9752736, "step": 4660 }, { "epoch": 0.7610734970225956, "grad_norm": 5.375, "learning_rate": 4.8341386560719534e-05, "loss": 3.3465, "num_input_tokens_seen": 9763744, "step": 4665 }, { "epoch": 0.761889224243413, "grad_norm": 0.34375, "learning_rate": 4.833786841159989e-05, "loss": 2.0751, "num_input_tokens_seen": 9775040, "step": 4670 }, { "epoch": 0.7627049514642303, "grad_norm": 15.1875, "learning_rate": 4.833434666348765e-05, "loss": 3.6634, "num_input_tokens_seen": 9785328, "step": 4675 }, { "epoch": 0.7635206786850477, "grad_norm": 7.3125, "learning_rate": 4.833082131692592e-05, "loss": 2.2743, "num_input_tokens_seen": 9795328, "step": 4680 }, { "epoch": 0.764336405905865, "grad_norm": 4.34375, "learning_rate": 4.832729237245835e-05, "loss": 3.2662, "num_input_tokens_seen": 9805328, "step": 4685 }, { "epoch": 0.7651521331266824, "grad_norm": 6.53125, "learning_rate": 4.8323759830629145e-05, "loss": 3.0139, "num_input_tokens_seen": 9814688, "step": 4690 }, { "epoch": 0.7659678603474998, "grad_norm": 7.65625, "learning_rate": 4.8320223691983066e-05, "loss": 2.7113, "num_input_tokens_seen": 9825664, "step": 4695 }, { "epoch": 0.7667835875683171, "grad_norm": 6.71875, "learning_rate": 4.831668395706544e-05, "loss": 2.4601, "num_input_tokens_seen": 9837760, "step": 4700 }, { "epoch": 0.7675993147891346, "grad_norm": 7.34375, "learning_rate": 4.8313140626422125e-05, "loss": 1.8229, "num_input_tokens_seen": 9848512, "step": 4705 }, { "epoch": 0.7684150420099519, "grad_norm": 6.1875, "learning_rate": 4.830959370059956e-05, "loss": 2.0388, "num_input_tokens_seen": 9858912, "step": 4710 }, { "epoch": 0.7692307692307693, "grad_norm": 6.0625, "learning_rate": 4.830604318014472e-05, "loss": 1.932, "num_input_tokens_seen": 9869024, "step": 4715 }, { "epoch": 0.7700464964515866, "grad_norm": 6.5625, "learning_rate": 4.830248906560514e-05, "loss": 2.619, "num_input_tokens_seen": 9878464, "step": 4720 }, { "epoch": 0.770862223672404, "grad_norm": 9.1875, "learning_rate": 4.829893135752891e-05, "loss": 2.4192, "num_input_tokens_seen": 9888864, "step": 4725 }, { "epoch": 0.7716779508932213, "grad_norm": 2.296875, "learning_rate": 4.829537005646466e-05, "loss": 2.1911, "num_input_tokens_seen": 9899696, "step": 4730 }, { "epoch": 0.7724936781140387, "grad_norm": 10.5, "learning_rate": 4.8291805162961615e-05, "loss": 2.548, "num_input_tokens_seen": 9911392, "step": 4735 }, { "epoch": 0.773309405334856, "grad_norm": 3.609375, "learning_rate": 4.82882366775695e-05, "loss": 2.0496, "num_input_tokens_seen": 9920304, "step": 4740 }, { "epoch": 0.7741251325556734, "grad_norm": 2.71875, "learning_rate": 4.828466460083864e-05, "loss": 2.6142, "num_input_tokens_seen": 9930816, "step": 4745 }, { "epoch": 0.7749408597764907, "grad_norm": 1.828125, "learning_rate": 4.8281088933319877e-05, "loss": 1.6442, "num_input_tokens_seen": 9940848, "step": 4750 }, { "epoch": 0.7757565869973081, "grad_norm": 6.28125, "learning_rate": 4.827750967556464e-05, "loss": 3.6581, "num_input_tokens_seen": 9951920, "step": 4755 }, { "epoch": 0.7765723142181254, "grad_norm": 11.3125, "learning_rate": 4.827392682812488e-05, "loss": 2.5322, "num_input_tokens_seen": 9962176, "step": 4760 }, { "epoch": 0.7773880414389428, "grad_norm": 6.125, "learning_rate": 4.827034039155312e-05, "loss": 2.0869, "num_input_tokens_seen": 9973600, "step": 4765 }, { "epoch": 0.7782037686597602, "grad_norm": 6.21875, "learning_rate": 4.8266750366402445e-05, "loss": 2.6236, "num_input_tokens_seen": 9984528, "step": 4770 }, { "epoch": 0.7790194958805775, "grad_norm": 4.3125, "learning_rate": 4.8263156753226476e-05, "loss": 2.0488, "num_input_tokens_seen": 9993632, "step": 4775 }, { "epoch": 0.7798352231013949, "grad_norm": 7.71875, "learning_rate": 4.8259559552579394e-05, "loss": 1.7693, "num_input_tokens_seen": 10004448, "step": 4780 }, { "epoch": 0.7806509503222122, "grad_norm": 6.03125, "learning_rate": 4.825595876501593e-05, "loss": 2.7809, "num_input_tokens_seen": 10015664, "step": 4785 }, { "epoch": 0.7814666775430296, "grad_norm": 8.75, "learning_rate": 4.825235439109137e-05, "loss": 1.8827, "num_input_tokens_seen": 10026576, "step": 4790 }, { "epoch": 0.7822824047638469, "grad_norm": 9.1875, "learning_rate": 4.824874643136156e-05, "loss": 2.939, "num_input_tokens_seen": 10037168, "step": 4795 }, { "epoch": 0.7830981319846644, "grad_norm": 6.5, "learning_rate": 4.824513488638288e-05, "loss": 3.0207, "num_input_tokens_seen": 10048144, "step": 4800 }, { "epoch": 0.7830981319846644, "eval_loss": 2.536677598953247, "eval_runtime": 134.8407, "eval_samples_per_second": 20.209, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 10048144, "step": 4800 }, { "epoch": 0.7839138592054817, "grad_norm": 10.1875, "learning_rate": 4.8241519756712293e-05, "loss": 2.8481, "num_input_tokens_seen": 10057472, "step": 4805 }, { "epoch": 0.7847295864262991, "grad_norm": 13.25, "learning_rate": 4.8237901042907285e-05, "loss": 4.1903, "num_input_tokens_seen": 10068720, "step": 4810 }, { "epoch": 0.7855453136471164, "grad_norm": 10.0625, "learning_rate": 4.823427874552591e-05, "loss": 2.2892, "num_input_tokens_seen": 10078336, "step": 4815 }, { "epoch": 0.7863610408679338, "grad_norm": 8.0, "learning_rate": 4.823065286512677e-05, "loss": 1.5361, "num_input_tokens_seen": 10088064, "step": 4820 }, { "epoch": 0.7871767680887511, "grad_norm": 8.0, "learning_rate": 4.8227023402269025e-05, "loss": 2.8675, "num_input_tokens_seen": 10098928, "step": 4825 }, { "epoch": 0.7879924953095685, "grad_norm": 5.875, "learning_rate": 4.822339035751239e-05, "loss": 2.0661, "num_input_tokens_seen": 10108608, "step": 4830 }, { "epoch": 0.7888082225303858, "grad_norm": 7.75, "learning_rate": 4.8219753731417104e-05, "loss": 3.0142, "num_input_tokens_seen": 10119168, "step": 4835 }, { "epoch": 0.7896239497512032, "grad_norm": 6.59375, "learning_rate": 4.821611352454401e-05, "loss": 2.4362, "num_input_tokens_seen": 10129472, "step": 4840 }, { "epoch": 0.7904396769720206, "grad_norm": 9.375, "learning_rate": 4.8212469737454444e-05, "loss": 2.5736, "num_input_tokens_seen": 10138944, "step": 4845 }, { "epoch": 0.7912554041928379, "grad_norm": 0.2314453125, "learning_rate": 4.820882237071035e-05, "loss": 2.6321, "num_input_tokens_seen": 10150096, "step": 4850 }, { "epoch": 0.7920711314136553, "grad_norm": 13.375, "learning_rate": 4.820517142487417e-05, "loss": 2.1525, "num_input_tokens_seen": 10158512, "step": 4855 }, { "epoch": 0.7928868586344726, "grad_norm": 8.375, "learning_rate": 4.8201516900508956e-05, "loss": 2.8096, "num_input_tokens_seen": 10167536, "step": 4860 }, { "epoch": 0.79370258585529, "grad_norm": 11.375, "learning_rate": 4.819785879817827e-05, "loss": 2.8131, "num_input_tokens_seen": 10178400, "step": 4865 }, { "epoch": 0.7945183130761073, "grad_norm": 5.4375, "learning_rate": 4.8194197118446226e-05, "loss": 1.5677, "num_input_tokens_seen": 10188240, "step": 4870 }, { "epoch": 0.7953340402969247, "grad_norm": 6.1875, "learning_rate": 4.819053186187752e-05, "loss": 3.1932, "num_input_tokens_seen": 10198672, "step": 4875 }, { "epoch": 0.796149767517742, "grad_norm": 8.125, "learning_rate": 4.818686302903736e-05, "loss": 3.0827, "num_input_tokens_seen": 10208576, "step": 4880 }, { "epoch": 0.7969654947385594, "grad_norm": 3.25, "learning_rate": 4.818319062049154e-05, "loss": 1.9723, "num_input_tokens_seen": 10219472, "step": 4885 }, { "epoch": 0.7977812219593767, "grad_norm": 3.890625, "learning_rate": 4.817951463680639e-05, "loss": 2.5122, "num_input_tokens_seen": 10230112, "step": 4890 }, { "epoch": 0.7985969491801942, "grad_norm": 5.71875, "learning_rate": 4.817583507854879e-05, "loss": 2.9596, "num_input_tokens_seen": 10240992, "step": 4895 }, { "epoch": 0.7994126764010115, "grad_norm": 11.25, "learning_rate": 4.817215194628617e-05, "loss": 1.3114, "num_input_tokens_seen": 10250336, "step": 4900 }, { "epoch": 0.8002284036218289, "grad_norm": 7.1875, "learning_rate": 4.816846524058653e-05, "loss": 1.5633, "num_input_tokens_seen": 10261600, "step": 4905 }, { "epoch": 0.8010441308426463, "grad_norm": 7.46875, "learning_rate": 4.816477496201839e-05, "loss": 1.7865, "num_input_tokens_seen": 10271040, "step": 4910 }, { "epoch": 0.8018598580634636, "grad_norm": 4.21875, "learning_rate": 4.8161081111150845e-05, "loss": 2.0804, "num_input_tokens_seen": 10281568, "step": 4915 }, { "epoch": 0.802675585284281, "grad_norm": 7.65625, "learning_rate": 4.815738368855354e-05, "loss": 1.8795, "num_input_tokens_seen": 10289952, "step": 4920 }, { "epoch": 0.8034913125050983, "grad_norm": 4.46875, "learning_rate": 4.815368269479664e-05, "loss": 2.6724, "num_input_tokens_seen": 10299536, "step": 4925 }, { "epoch": 0.8043070397259157, "grad_norm": 4.0625, "learning_rate": 4.814997813045092e-05, "loss": 3.4789, "num_input_tokens_seen": 10309936, "step": 4930 }, { "epoch": 0.805122766946733, "grad_norm": 9.4375, "learning_rate": 4.814626999608764e-05, "loss": 2.5626, "num_input_tokens_seen": 10319776, "step": 4935 }, { "epoch": 0.8059384941675504, "grad_norm": 3.078125, "learning_rate": 4.814255829227865e-05, "loss": 1.4127, "num_input_tokens_seen": 10330432, "step": 4940 }, { "epoch": 0.8067542213883677, "grad_norm": 4.3125, "learning_rate": 4.813884301959635e-05, "loss": 2.7938, "num_input_tokens_seen": 10341936, "step": 4945 }, { "epoch": 0.8075699486091851, "grad_norm": 4.1875, "learning_rate": 4.813512417861368e-05, "loss": 1.6468, "num_input_tokens_seen": 10353488, "step": 4950 }, { "epoch": 0.8083856758300024, "grad_norm": 5.09375, "learning_rate": 4.813140176990411e-05, "loss": 1.4526, "num_input_tokens_seen": 10364688, "step": 4955 }, { "epoch": 0.8092014030508198, "grad_norm": 7.3125, "learning_rate": 4.8127675794041714e-05, "loss": 1.6773, "num_input_tokens_seen": 10373104, "step": 4960 }, { "epoch": 0.8100171302716371, "grad_norm": 10.9375, "learning_rate": 4.812394625160107e-05, "loss": 2.9198, "num_input_tokens_seen": 10383776, "step": 4965 }, { "epoch": 0.8108328574924545, "grad_norm": 2.25, "learning_rate": 4.812021314315732e-05, "loss": 1.4405, "num_input_tokens_seen": 10395072, "step": 4970 }, { "epoch": 0.8116485847132718, "grad_norm": 4.78125, "learning_rate": 4.811647646928616e-05, "loss": 1.8837, "num_input_tokens_seen": 10407264, "step": 4975 }, { "epoch": 0.8124643119340892, "grad_norm": 9.5, "learning_rate": 4.8112736230563814e-05, "loss": 4.0979, "num_input_tokens_seen": 10417776, "step": 4980 }, { "epoch": 0.8132800391549067, "grad_norm": 7.15625, "learning_rate": 4.81089924275671e-05, "loss": 1.8326, "num_input_tokens_seen": 10428560, "step": 4985 }, { "epoch": 0.814095766375724, "grad_norm": 7.15625, "learning_rate": 4.810524506087335e-05, "loss": 1.9079, "num_input_tokens_seen": 10439904, "step": 4990 }, { "epoch": 0.8149114935965414, "grad_norm": 6.53125, "learning_rate": 4.810149413106044e-05, "loss": 2.1585, "num_input_tokens_seen": 10449136, "step": 4995 }, { "epoch": 0.8157272208173587, "grad_norm": 7.96875, "learning_rate": 4.809773963870684e-05, "loss": 2.1577, "num_input_tokens_seen": 10460144, "step": 5000 }, { "epoch": 0.8157272208173587, "eval_loss": 2.5384411811828613, "eval_runtime": 135.0685, "eval_samples_per_second": 20.175, "eval_steps_per_second": 10.091, "num_input_tokens_seen": 10460144, "step": 5000 }, { "epoch": 0.8165429480381761, "grad_norm": 0.85546875, "learning_rate": 4.809398158439151e-05, "loss": 1.9975, "num_input_tokens_seen": 10470816, "step": 5005 }, { "epoch": 0.8173586752589934, "grad_norm": 4.59375, "learning_rate": 4.8090219968694005e-05, "loss": 2.0518, "num_input_tokens_seen": 10482064, "step": 5010 }, { "epoch": 0.8181744024798108, "grad_norm": 7.6875, "learning_rate": 4.808645479219442e-05, "loss": 2.1023, "num_input_tokens_seen": 10491408, "step": 5015 }, { "epoch": 0.8189901297006281, "grad_norm": 10.3125, "learning_rate": 4.8082686055473375e-05, "loss": 2.3896, "num_input_tokens_seen": 10502896, "step": 5020 }, { "epoch": 0.8198058569214455, "grad_norm": 10.25, "learning_rate": 4.8078913759112066e-05, "loss": 2.1198, "num_input_tokens_seen": 10513040, "step": 5025 }, { "epoch": 0.8206215841422628, "grad_norm": 3.578125, "learning_rate": 4.807513790369223e-05, "loss": 2.6971, "num_input_tokens_seen": 10523392, "step": 5030 }, { "epoch": 0.8214373113630802, "grad_norm": 8.25, "learning_rate": 4.8071358489796145e-05, "loss": 2.8387, "num_input_tokens_seen": 10534592, "step": 5035 }, { "epoch": 0.8222530385838975, "grad_norm": 14.75, "learning_rate": 4.806757551800665e-05, "loss": 2.8668, "num_input_tokens_seen": 10544656, "step": 5040 }, { "epoch": 0.8230687658047149, "grad_norm": 10.9375, "learning_rate": 4.806378898890713e-05, "loss": 1.828, "num_input_tokens_seen": 10554448, "step": 5045 }, { "epoch": 0.8238844930255322, "grad_norm": 8.875, "learning_rate": 4.80599989030815e-05, "loss": 3.1283, "num_input_tokens_seen": 10564864, "step": 5050 }, { "epoch": 0.8247002202463496, "grad_norm": 6.21875, "learning_rate": 4.805620526111426e-05, "loss": 3.3142, "num_input_tokens_seen": 10575712, "step": 5055 }, { "epoch": 0.8255159474671669, "grad_norm": 9.375, "learning_rate": 4.805240806359042e-05, "loss": 2.4858, "num_input_tokens_seen": 10585792, "step": 5060 }, { "epoch": 0.8263316746879843, "grad_norm": 12.6875, "learning_rate": 4.804860731109557e-05, "loss": 3.2062, "num_input_tokens_seen": 10594688, "step": 5065 }, { "epoch": 0.8271474019088016, "grad_norm": 5.9375, "learning_rate": 4.804480300421581e-05, "loss": 2.8882, "num_input_tokens_seen": 10603968, "step": 5070 }, { "epoch": 0.827963129129619, "grad_norm": 10.375, "learning_rate": 4.804099514353784e-05, "loss": 2.8833, "num_input_tokens_seen": 10614432, "step": 5075 }, { "epoch": 0.8287788563504365, "grad_norm": 9.0625, "learning_rate": 4.8037183729648867e-05, "loss": 2.7341, "num_input_tokens_seen": 10624176, "step": 5080 }, { "epoch": 0.8295945835712538, "grad_norm": 8.375, "learning_rate": 4.803336876313666e-05, "loss": 2.8572, "num_input_tokens_seen": 10635168, "step": 5085 }, { "epoch": 0.8304103107920712, "grad_norm": 4.5, "learning_rate": 4.802955024458953e-05, "loss": 2.6549, "num_input_tokens_seen": 10644336, "step": 5090 }, { "epoch": 0.8312260380128885, "grad_norm": 1.9765625, "learning_rate": 4.802572817459634e-05, "loss": 1.8507, "num_input_tokens_seen": 10654656, "step": 5095 }, { "epoch": 0.8320417652337059, "grad_norm": 2.671875, "learning_rate": 4.802190255374651e-05, "loss": 2.2612, "num_input_tokens_seen": 10665008, "step": 5100 }, { "epoch": 0.8328574924545232, "grad_norm": 8.5, "learning_rate": 4.801807338263e-05, "loss": 2.7435, "num_input_tokens_seen": 10675616, "step": 5105 }, { "epoch": 0.8336732196753406, "grad_norm": 9.875, "learning_rate": 4.8014240661837306e-05, "loss": 2.9749, "num_input_tokens_seen": 10686416, "step": 5110 }, { "epoch": 0.8344889468961579, "grad_norm": 5.03125, "learning_rate": 4.80104043919595e-05, "loss": 2.6904, "num_input_tokens_seen": 10697056, "step": 5115 }, { "epoch": 0.8353046741169753, "grad_norm": 7.40625, "learning_rate": 4.800656457358815e-05, "loss": 2.4095, "num_input_tokens_seen": 10706544, "step": 5120 }, { "epoch": 0.8361204013377926, "grad_norm": 0.2431640625, "learning_rate": 4.800272120731544e-05, "loss": 1.9017, "num_input_tokens_seen": 10716752, "step": 5125 }, { "epoch": 0.83693612855861, "grad_norm": 7.65625, "learning_rate": 4.799887429373404e-05, "loss": 2.4891, "num_input_tokens_seen": 10727536, "step": 5130 }, { "epoch": 0.8377518557794273, "grad_norm": 6.59375, "learning_rate": 4.79950238334372e-05, "loss": 2.1901, "num_input_tokens_seen": 10738000, "step": 5135 }, { "epoch": 0.8385675830002447, "grad_norm": 5.6875, "learning_rate": 4.799116982701872e-05, "loss": 0.7661, "num_input_tokens_seen": 10748320, "step": 5140 }, { "epoch": 0.839383310221062, "grad_norm": 8.875, "learning_rate": 4.7987312275072926e-05, "loss": 1.8018, "num_input_tokens_seen": 10757120, "step": 5145 }, { "epoch": 0.8401990374418794, "grad_norm": 3.375, "learning_rate": 4.79834511781947e-05, "loss": 2.0503, "num_input_tokens_seen": 10766192, "step": 5150 }, { "epoch": 0.8410147646626968, "grad_norm": 1.3359375, "learning_rate": 4.797958653697947e-05, "loss": 1.063, "num_input_tokens_seen": 10775088, "step": 5155 }, { "epoch": 0.8418304918835141, "grad_norm": 7.71875, "learning_rate": 4.7975718352023225e-05, "loss": 2.594, "num_input_tokens_seen": 10786624, "step": 5160 }, { "epoch": 0.8426462191043315, "grad_norm": 8.0625, "learning_rate": 4.7971846623922476e-05, "loss": 3.0066, "num_input_tokens_seen": 10797376, "step": 5165 }, { "epoch": 0.8434619463251488, "grad_norm": 6.90625, "learning_rate": 4.7967971353274294e-05, "loss": 2.6395, "num_input_tokens_seen": 10806832, "step": 5170 }, { "epoch": 0.8442776735459663, "grad_norm": 3.6875, "learning_rate": 4.79640925406763e-05, "loss": 1.8819, "num_input_tokens_seen": 10817888, "step": 5175 }, { "epoch": 0.8450934007667836, "grad_norm": 12.3125, "learning_rate": 4.796021018672664e-05, "loss": 2.6378, "num_input_tokens_seen": 10829120, "step": 5180 }, { "epoch": 0.845909127987601, "grad_norm": 9.6875, "learning_rate": 4.795632429202405e-05, "loss": 2.3422, "num_input_tokens_seen": 10839248, "step": 5185 }, { "epoch": 0.8467248552084183, "grad_norm": 7.3125, "learning_rate": 4.795243485716775e-05, "loss": 3.4465, "num_input_tokens_seen": 10850624, "step": 5190 }, { "epoch": 0.8475405824292357, "grad_norm": 9.75, "learning_rate": 4.794854188275757e-05, "loss": 2.1124, "num_input_tokens_seen": 10861744, "step": 5195 }, { "epoch": 0.848356309650053, "grad_norm": 8.75, "learning_rate": 4.794464536939384e-05, "loss": 1.4574, "num_input_tokens_seen": 10871296, "step": 5200 }, { "epoch": 0.848356309650053, "eval_loss": 2.5493946075439453, "eval_runtime": 135.017, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 10871296, "step": 5200 }, { "epoch": 0.8491720368708704, "grad_norm": 7.9375, "learning_rate": 4.794074531767745e-05, "loss": 3.1758, "num_input_tokens_seen": 10881776, "step": 5205 }, { "epoch": 0.8499877640916877, "grad_norm": 1.7578125, "learning_rate": 4.7936841728209834e-05, "loss": 2.5807, "num_input_tokens_seen": 10890720, "step": 5210 }, { "epoch": 0.8508034913125051, "grad_norm": 5.03125, "learning_rate": 4.7932934601593e-05, "loss": 2.1814, "num_input_tokens_seen": 10900528, "step": 5215 }, { "epoch": 0.8516192185333225, "grad_norm": 11.875, "learning_rate": 4.792902393842943e-05, "loss": 2.2278, "num_input_tokens_seen": 10912048, "step": 5220 }, { "epoch": 0.8524349457541398, "grad_norm": 6.65625, "learning_rate": 4.792510973932225e-05, "loss": 2.5504, "num_input_tokens_seen": 10921584, "step": 5225 }, { "epoch": 0.8532506729749572, "grad_norm": 6.03125, "learning_rate": 4.7921192004875036e-05, "loss": 3.1665, "num_input_tokens_seen": 10932208, "step": 5230 }, { "epoch": 0.8540664001957745, "grad_norm": 5.84375, "learning_rate": 4.791727073569198e-05, "loss": 2.3347, "num_input_tokens_seen": 10941984, "step": 5235 }, { "epoch": 0.8548821274165919, "grad_norm": 8.125, "learning_rate": 4.7913345932377775e-05, "loss": 1.7396, "num_input_tokens_seen": 10951584, "step": 5240 }, { "epoch": 0.8556978546374092, "grad_norm": 5.96875, "learning_rate": 4.790941759553769e-05, "loss": 2.4622, "num_input_tokens_seen": 10961280, "step": 5245 }, { "epoch": 0.8565135818582266, "grad_norm": 6.375, "learning_rate": 4.79054857257775e-05, "loss": 1.7231, "num_input_tokens_seen": 10972720, "step": 5250 }, { "epoch": 0.8573293090790439, "grad_norm": 4.21875, "learning_rate": 4.790155032370357e-05, "loss": 2.423, "num_input_tokens_seen": 10981936, "step": 5255 }, { "epoch": 0.8581450362998613, "grad_norm": 6.5625, "learning_rate": 4.789761138992278e-05, "loss": 3.4917, "num_input_tokens_seen": 10992080, "step": 5260 }, { "epoch": 0.8589607635206787, "grad_norm": 4.46875, "learning_rate": 4.7893668925042565e-05, "loss": 2.1076, "num_input_tokens_seen": 11002832, "step": 5265 }, { "epoch": 0.8597764907414961, "grad_norm": 5.71875, "learning_rate": 4.78897229296709e-05, "loss": 1.7452, "num_input_tokens_seen": 11013344, "step": 5270 }, { "epoch": 0.8605922179623134, "grad_norm": 10.5625, "learning_rate": 4.7885773404416315e-05, "loss": 3.7345, "num_input_tokens_seen": 11022304, "step": 5275 }, { "epoch": 0.8614079451831308, "grad_norm": 6.375, "learning_rate": 4.788182034988786e-05, "loss": 2.2345, "num_input_tokens_seen": 11033424, "step": 5280 }, { "epoch": 0.8622236724039481, "grad_norm": 7.125, "learning_rate": 4.787786376669516e-05, "loss": 2.1459, "num_input_tokens_seen": 11044112, "step": 5285 }, { "epoch": 0.8630393996247655, "grad_norm": 7.90625, "learning_rate": 4.787390365544837e-05, "loss": 2.3779, "num_input_tokens_seen": 11055280, "step": 5290 }, { "epoch": 0.8638551268455829, "grad_norm": 4.03125, "learning_rate": 4.786994001675818e-05, "loss": 2.051, "num_input_tokens_seen": 11066416, "step": 5295 }, { "epoch": 0.8646708540664002, "grad_norm": 2.6875, "learning_rate": 4.786597285123584e-05, "loss": 2.3115, "num_input_tokens_seen": 11075216, "step": 5300 }, { "epoch": 0.8654865812872176, "grad_norm": 6.03125, "learning_rate": 4.7862002159493135e-05, "loss": 2.6198, "num_input_tokens_seen": 11085264, "step": 5305 }, { "epoch": 0.8663023085080349, "grad_norm": 11.25, "learning_rate": 4.785802794214239e-05, "loss": 2.4652, "num_input_tokens_seen": 11095536, "step": 5310 }, { "epoch": 0.8671180357288523, "grad_norm": 4.28125, "learning_rate": 4.7854050199796495e-05, "loss": 1.6442, "num_input_tokens_seen": 11107360, "step": 5315 }, { "epoch": 0.8679337629496696, "grad_norm": 1.796875, "learning_rate": 4.7850068933068845e-05, "loss": 2.2302, "num_input_tokens_seen": 11118336, "step": 5320 }, { "epoch": 0.868749490170487, "grad_norm": 8.8125, "learning_rate": 4.7846084142573425e-05, "loss": 2.4921, "num_input_tokens_seen": 11130064, "step": 5325 }, { "epoch": 0.8695652173913043, "grad_norm": 0.1162109375, "learning_rate": 4.7842095828924725e-05, "loss": 2.4931, "num_input_tokens_seen": 11139520, "step": 5330 }, { "epoch": 0.8703809446121217, "grad_norm": 6.5, "learning_rate": 4.783810399273779e-05, "loss": 2.0956, "num_input_tokens_seen": 11150224, "step": 5335 }, { "epoch": 0.871196671832939, "grad_norm": 4.6875, "learning_rate": 4.7834108634628226e-05, "loss": 1.9645, "num_input_tokens_seen": 11160848, "step": 5340 }, { "epoch": 0.8720123990537564, "grad_norm": 5.15625, "learning_rate": 4.783010975521216e-05, "loss": 2.7703, "num_input_tokens_seen": 11171824, "step": 5345 }, { "epoch": 0.8728281262745737, "grad_norm": 10.0, "learning_rate": 4.782610735510626e-05, "loss": 1.8452, "num_input_tokens_seen": 11182448, "step": 5350 }, { "epoch": 0.8736438534953911, "grad_norm": 7.84375, "learning_rate": 4.782210143492776e-05, "loss": 2.7839, "num_input_tokens_seen": 11192736, "step": 5355 }, { "epoch": 0.8744595807162086, "grad_norm": 5.0625, "learning_rate": 4.781809199529442e-05, "loss": 2.148, "num_input_tokens_seen": 11204752, "step": 5360 }, { "epoch": 0.8752753079370259, "grad_norm": 4.78125, "learning_rate": 4.781407903682454e-05, "loss": 1.7689, "num_input_tokens_seen": 11216160, "step": 5365 }, { "epoch": 0.8760910351578433, "grad_norm": 1.2734375, "learning_rate": 4.781006256013698e-05, "loss": 1.1763, "num_input_tokens_seen": 11227664, "step": 5370 }, { "epoch": 0.8769067623786606, "grad_norm": 9.0, "learning_rate": 4.7806042565851115e-05, "loss": 3.1638, "num_input_tokens_seen": 11238160, "step": 5375 }, { "epoch": 0.877722489599478, "grad_norm": 11.25, "learning_rate": 4.7802019054586895e-05, "loss": 3.6178, "num_input_tokens_seen": 11248240, "step": 5380 }, { "epoch": 0.8785382168202953, "grad_norm": 1.890625, "learning_rate": 4.779799202696479e-05, "loss": 1.9397, "num_input_tokens_seen": 11257760, "step": 5385 }, { "epoch": 0.8793539440411127, "grad_norm": 6.4375, "learning_rate": 4.779396148360581e-05, "loss": 2.6823, "num_input_tokens_seen": 11268608, "step": 5390 }, { "epoch": 0.88016967126193, "grad_norm": 10.6875, "learning_rate": 4.7789927425131517e-05, "loss": 2.8279, "num_input_tokens_seen": 11278176, "step": 5395 }, { "epoch": 0.8809853984827474, "grad_norm": 7.75, "learning_rate": 4.778588985216403e-05, "loss": 3.7552, "num_input_tokens_seen": 11287600, "step": 5400 }, { "epoch": 0.8809853984827474, "eval_loss": 2.5324652194976807, "eval_runtime": 135.0952, "eval_samples_per_second": 20.171, "eval_steps_per_second": 10.089, "num_input_tokens_seen": 11287600, "step": 5400 }, { "epoch": 0.8818011257035647, "grad_norm": 3.828125, "learning_rate": 4.778184876532598e-05, "loss": 2.5186, "num_input_tokens_seen": 11298624, "step": 5405 }, { "epoch": 0.8826168529243821, "grad_norm": 6.0625, "learning_rate": 4.7777804165240556e-05, "loss": 1.6157, "num_input_tokens_seen": 11307440, "step": 5410 }, { "epoch": 0.8834325801451994, "grad_norm": 9.8125, "learning_rate": 4.7773756052531485e-05, "loss": 4.2717, "num_input_tokens_seen": 11318288, "step": 5415 }, { "epoch": 0.8842483073660168, "grad_norm": 4.71875, "learning_rate": 4.7769704427823035e-05, "loss": 2.1809, "num_input_tokens_seen": 11328048, "step": 5420 }, { "epoch": 0.8850640345868341, "grad_norm": 12.6875, "learning_rate": 4.776564929174003e-05, "loss": 2.2513, "num_input_tokens_seen": 11338048, "step": 5425 }, { "epoch": 0.8858797618076515, "grad_norm": 7.84375, "learning_rate": 4.7761590644907806e-05, "loss": 1.882, "num_input_tokens_seen": 11349600, "step": 5430 }, { "epoch": 0.8866954890284688, "grad_norm": 6.8125, "learning_rate": 4.7757528487952263e-05, "loss": 1.8687, "num_input_tokens_seen": 11360000, "step": 5435 }, { "epoch": 0.8875112162492862, "grad_norm": 3.359375, "learning_rate": 4.7753462821499836e-05, "loss": 2.5302, "num_input_tokens_seen": 11370624, "step": 5440 }, { "epoch": 0.8883269434701035, "grad_norm": 5.0625, "learning_rate": 4.774939364617751e-05, "loss": 1.869, "num_input_tokens_seen": 11381104, "step": 5445 }, { "epoch": 0.889142670690921, "grad_norm": 4.09375, "learning_rate": 4.7745320962612795e-05, "loss": 2.55, "num_input_tokens_seen": 11392144, "step": 5450 }, { "epoch": 0.8899583979117384, "grad_norm": 4.28125, "learning_rate": 4.7741244771433756e-05, "loss": 2.9084, "num_input_tokens_seen": 11401760, "step": 5455 }, { "epoch": 0.8907741251325557, "grad_norm": 5.75, "learning_rate": 4.7737165073268985e-05, "loss": 1.9265, "num_input_tokens_seen": 11412352, "step": 5460 }, { "epoch": 0.8915898523533731, "grad_norm": 7.84375, "learning_rate": 4.7733081868747626e-05, "loss": 1.7198, "num_input_tokens_seen": 11421424, "step": 5465 }, { "epoch": 0.8924055795741904, "grad_norm": 6.84375, "learning_rate": 4.772899515849936e-05, "loss": 2.6791, "num_input_tokens_seen": 11431680, "step": 5470 }, { "epoch": 0.8932213067950078, "grad_norm": 11.5, "learning_rate": 4.7724904943154414e-05, "loss": 2.3797, "num_input_tokens_seen": 11442672, "step": 5475 }, { "epoch": 0.8940370340158251, "grad_norm": 9.375, "learning_rate": 4.772081122334354e-05, "loss": 2.5438, "num_input_tokens_seen": 11453712, "step": 5480 }, { "epoch": 0.8948527612366425, "grad_norm": 9.6875, "learning_rate": 4.771671399969806e-05, "loss": 3.078, "num_input_tokens_seen": 11464128, "step": 5485 }, { "epoch": 0.8956684884574598, "grad_norm": 5.09375, "learning_rate": 4.7712613272849794e-05, "loss": 3.5604, "num_input_tokens_seen": 11474208, "step": 5490 }, { "epoch": 0.8964842156782772, "grad_norm": 7.375, "learning_rate": 4.770850904343114e-05, "loss": 2.2904, "num_input_tokens_seen": 11485968, "step": 5495 }, { "epoch": 0.8972999428990945, "grad_norm": 8.25, "learning_rate": 4.770440131207502e-05, "loss": 2.5698, "num_input_tokens_seen": 11497104, "step": 5500 }, { "epoch": 0.8981156701199119, "grad_norm": 7.65625, "learning_rate": 4.7700290079414896e-05, "loss": 2.9943, "num_input_tokens_seen": 11507440, "step": 5505 }, { "epoch": 0.8989313973407292, "grad_norm": 16.5, "learning_rate": 4.769617534608477e-05, "loss": 3.4936, "num_input_tokens_seen": 11518000, "step": 5510 }, { "epoch": 0.8997471245615466, "grad_norm": 8.125, "learning_rate": 4.7692057112719193e-05, "loss": 2.5138, "num_input_tokens_seen": 11528592, "step": 5515 }, { "epoch": 0.900562851782364, "grad_norm": 7.125, "learning_rate": 4.7687935379953234e-05, "loss": 2.3693, "num_input_tokens_seen": 11538432, "step": 5520 }, { "epoch": 0.9013785790031813, "grad_norm": 8.25, "learning_rate": 4.7683810148422534e-05, "loss": 1.582, "num_input_tokens_seen": 11548352, "step": 5525 }, { "epoch": 0.9021943062239987, "grad_norm": 0.515625, "learning_rate": 4.767968141876324e-05, "loss": 3.2872, "num_input_tokens_seen": 11558896, "step": 5530 }, { "epoch": 0.903010033444816, "grad_norm": 5.21875, "learning_rate": 4.767554919161207e-05, "loss": 1.6834, "num_input_tokens_seen": 11568304, "step": 5535 }, { "epoch": 0.9038257606656334, "grad_norm": 7.03125, "learning_rate": 4.767141346760624e-05, "loss": 1.4889, "num_input_tokens_seen": 11579392, "step": 5540 }, { "epoch": 0.9046414878864508, "grad_norm": 6.0625, "learning_rate": 4.766727424738356e-05, "loss": 4.0811, "num_input_tokens_seen": 11590048, "step": 5545 }, { "epoch": 0.9054572151072682, "grad_norm": 5.0625, "learning_rate": 4.7663131531582325e-05, "loss": 1.5634, "num_input_tokens_seen": 11601216, "step": 5550 }, { "epoch": 0.9062729423280855, "grad_norm": 3.890625, "learning_rate": 4.765898532084142e-05, "loss": 3.1074, "num_input_tokens_seen": 11611808, "step": 5555 }, { "epoch": 0.9070886695489029, "grad_norm": 6.5, "learning_rate": 4.765483561580022e-05, "loss": 2.5083, "num_input_tokens_seen": 11623408, "step": 5560 }, { "epoch": 0.9079043967697202, "grad_norm": 6.3125, "learning_rate": 4.7650682417098666e-05, "loss": 2.079, "num_input_tokens_seen": 11632416, "step": 5565 }, { "epoch": 0.9087201239905376, "grad_norm": 9.8125, "learning_rate": 4.7646525725377244e-05, "loss": 2.4267, "num_input_tokens_seen": 11643248, "step": 5570 }, { "epoch": 0.9095358512113549, "grad_norm": 4.53125, "learning_rate": 4.764236554127696e-05, "loss": 3.2853, "num_input_tokens_seen": 11654832, "step": 5575 }, { "epoch": 0.9103515784321723, "grad_norm": 3.875, "learning_rate": 4.7638201865439356e-05, "loss": 2.9913, "num_input_tokens_seen": 11665696, "step": 5580 }, { "epoch": 0.9111673056529896, "grad_norm": 14.75, "learning_rate": 4.7634034698506545e-05, "loss": 2.9404, "num_input_tokens_seen": 11675472, "step": 5585 }, { "epoch": 0.911983032873807, "grad_norm": 8.0625, "learning_rate": 4.762986404112115e-05, "loss": 2.857, "num_input_tokens_seen": 11685872, "step": 5590 }, { "epoch": 0.9127987600946244, "grad_norm": 7.8125, "learning_rate": 4.762568989392633e-05, "loss": 2.1901, "num_input_tokens_seen": 11696304, "step": 5595 }, { "epoch": 0.9136144873154417, "grad_norm": 5.75, "learning_rate": 4.76215122575658e-05, "loss": 1.8036, "num_input_tokens_seen": 11707328, "step": 5600 }, { "epoch": 0.9136144873154417, "eval_loss": 2.537649393081665, "eval_runtime": 134.8687, "eval_samples_per_second": 20.205, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 11707328, "step": 5600 }, { "epoch": 0.9144302145362591, "grad_norm": 3.78125, "learning_rate": 4.7617331132683795e-05, "loss": 3.3887, "num_input_tokens_seen": 11717936, "step": 5605 }, { "epoch": 0.9152459417570764, "grad_norm": 16.375, "learning_rate": 4.7613146519925105e-05, "loss": 2.5385, "num_input_tokens_seen": 11728688, "step": 5610 }, { "epoch": 0.9160616689778938, "grad_norm": 2.21875, "learning_rate": 4.7608958419935045e-05, "loss": 2.6724, "num_input_tokens_seen": 11738016, "step": 5615 }, { "epoch": 0.9168773961987111, "grad_norm": 6.96875, "learning_rate": 4.760476683335948e-05, "loss": 2.8947, "num_input_tokens_seen": 11747520, "step": 5620 }, { "epoch": 0.9176931234195285, "grad_norm": 9.25, "learning_rate": 4.760057176084479e-05, "loss": 2.9242, "num_input_tokens_seen": 11757344, "step": 5625 }, { "epoch": 0.9185088506403458, "grad_norm": 0.2431640625, "learning_rate": 4.759637320303793e-05, "loss": 3.6776, "num_input_tokens_seen": 11768352, "step": 5630 }, { "epoch": 0.9193245778611632, "grad_norm": 15.625, "learning_rate": 4.759217116058635e-05, "loss": 2.0023, "num_input_tokens_seen": 11777824, "step": 5635 }, { "epoch": 0.9201403050819806, "grad_norm": 3.046875, "learning_rate": 4.758796563413807e-05, "loss": 2.2695, "num_input_tokens_seen": 11788432, "step": 5640 }, { "epoch": 0.920956032302798, "grad_norm": 2.015625, "learning_rate": 4.758375662434163e-05, "loss": 2.4233, "num_input_tokens_seen": 11798352, "step": 5645 }, { "epoch": 0.9217717595236153, "grad_norm": 4.125, "learning_rate": 4.7579544131846114e-05, "loss": 2.1305, "num_input_tokens_seen": 11807984, "step": 5650 }, { "epoch": 0.9225874867444327, "grad_norm": 7.84375, "learning_rate": 4.757532815730114e-05, "loss": 3.3345, "num_input_tokens_seen": 11818448, "step": 5655 }, { "epoch": 0.92340321396525, "grad_norm": 8.1875, "learning_rate": 4.7571108701356865e-05, "loss": 2.4339, "num_input_tokens_seen": 11828128, "step": 5660 }, { "epoch": 0.9242189411860674, "grad_norm": 2.765625, "learning_rate": 4.756688576466398e-05, "loss": 1.7845, "num_input_tokens_seen": 11839344, "step": 5665 }, { "epoch": 0.9250346684068848, "grad_norm": 3.609375, "learning_rate": 4.756265934787372e-05, "loss": 2.8798, "num_input_tokens_seen": 11849280, "step": 5670 }, { "epoch": 0.9258503956277021, "grad_norm": 8.375, "learning_rate": 4.755842945163785e-05, "loss": 2.237, "num_input_tokens_seen": 11859296, "step": 5675 }, { "epoch": 0.9266661228485195, "grad_norm": 13.375, "learning_rate": 4.755419607660867e-05, "loss": 2.4382, "num_input_tokens_seen": 11870304, "step": 5680 }, { "epoch": 0.9274818500693368, "grad_norm": 10.5, "learning_rate": 4.7549959223439016e-05, "loss": 0.7031, "num_input_tokens_seen": 11880736, "step": 5685 }, { "epoch": 0.9282975772901542, "grad_norm": 4.0, "learning_rate": 4.754571889278228e-05, "loss": 1.8747, "num_input_tokens_seen": 11891648, "step": 5690 }, { "epoch": 0.9291133045109715, "grad_norm": 7.65625, "learning_rate": 4.754147508529235e-05, "loss": 2.5913, "num_input_tokens_seen": 11902480, "step": 5695 }, { "epoch": 0.9299290317317889, "grad_norm": 12.5, "learning_rate": 4.75372278016237e-05, "loss": 3.1828, "num_input_tokens_seen": 11912592, "step": 5700 }, { "epoch": 0.9307447589526062, "grad_norm": 11.8125, "learning_rate": 4.753297704243129e-05, "loss": 2.8244, "num_input_tokens_seen": 11923024, "step": 5705 }, { "epoch": 0.9315604861734236, "grad_norm": 8.375, "learning_rate": 4.752872280837066e-05, "loss": 3.1457, "num_input_tokens_seen": 11933984, "step": 5710 }, { "epoch": 0.9323762133942409, "grad_norm": 8.875, "learning_rate": 4.752446510009786e-05, "loss": 2.2456, "num_input_tokens_seen": 11945744, "step": 5715 }, { "epoch": 0.9331919406150583, "grad_norm": 3.3125, "learning_rate": 4.7520203918269476e-05, "loss": 3.4833, "num_input_tokens_seen": 11955456, "step": 5720 }, { "epoch": 0.9340076678358756, "grad_norm": 5.40625, "learning_rate": 4.751593926354265e-05, "loss": 3.3501, "num_input_tokens_seen": 11965920, "step": 5725 }, { "epoch": 0.9348233950566931, "grad_norm": 10.3125, "learning_rate": 4.751167113657503e-05, "loss": 2.6175, "num_input_tokens_seen": 11975232, "step": 5730 }, { "epoch": 0.9356391222775104, "grad_norm": 10.5625, "learning_rate": 4.7507399538024834e-05, "loss": 2.5082, "num_input_tokens_seen": 11985664, "step": 5735 }, { "epoch": 0.9364548494983278, "grad_norm": 9.4375, "learning_rate": 4.750312446855077e-05, "loss": 3.6315, "num_input_tokens_seen": 11996208, "step": 5740 }, { "epoch": 0.9372705767191452, "grad_norm": 9.375, "learning_rate": 4.749884592881212e-05, "loss": 3.213, "num_input_tokens_seen": 12006992, "step": 5745 }, { "epoch": 0.9380863039399625, "grad_norm": 5.125, "learning_rate": 4.74945639194687e-05, "loss": 1.9797, "num_input_tokens_seen": 12018064, "step": 5750 }, { "epoch": 0.9389020311607799, "grad_norm": 6.25, "learning_rate": 4.749027844118083e-05, "loss": 2.7685, "num_input_tokens_seen": 12028128, "step": 5755 }, { "epoch": 0.9397177583815972, "grad_norm": 7.21875, "learning_rate": 4.7485989494609395e-05, "loss": 2.4822, "num_input_tokens_seen": 12040016, "step": 5760 }, { "epoch": 0.9405334856024146, "grad_norm": 10.3125, "learning_rate": 4.748169708041581e-05, "loss": 3.1446, "num_input_tokens_seen": 12050608, "step": 5765 }, { "epoch": 0.9413492128232319, "grad_norm": 2.09375, "learning_rate": 4.7477401199262004e-05, "loss": 1.9529, "num_input_tokens_seen": 12061296, "step": 5770 }, { "epoch": 0.9421649400440493, "grad_norm": 9.6875, "learning_rate": 4.747310185181048e-05, "loss": 2.2417, "num_input_tokens_seen": 12072000, "step": 5775 }, { "epoch": 0.9429806672648666, "grad_norm": 11.75, "learning_rate": 4.746879903872422e-05, "loss": 1.6819, "num_input_tokens_seen": 12080992, "step": 5780 }, { "epoch": 0.943796394485684, "grad_norm": 1.7265625, "learning_rate": 4.746449276066679e-05, "loss": 1.9088, "num_input_tokens_seen": 12090272, "step": 5785 }, { "epoch": 0.9446121217065013, "grad_norm": 4.8125, "learning_rate": 4.746018301830227e-05, "loss": 2.6754, "num_input_tokens_seen": 12100528, "step": 5790 }, { "epoch": 0.9454278489273187, "grad_norm": 11.1875, "learning_rate": 4.7455869812295275e-05, "loss": 4.3559, "num_input_tokens_seen": 12110352, "step": 5795 }, { "epoch": 0.946243576148136, "grad_norm": 3.5625, "learning_rate": 4.7451553143310964e-05, "loss": 2.9821, "num_input_tokens_seen": 12120624, "step": 5800 }, { "epoch": 0.946243576148136, "eval_loss": 2.555554151535034, "eval_runtime": 135.0115, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 12120624, "step": 5800 }, { "epoch": 0.9470593033689534, "grad_norm": 5.78125, "learning_rate": 4.744723301201501e-05, "loss": 1.9478, "num_input_tokens_seen": 12131440, "step": 5805 }, { "epoch": 0.9478750305897707, "grad_norm": 8.625, "learning_rate": 4.744290941907364e-05, "loss": 2.1477, "num_input_tokens_seen": 12141856, "step": 5810 }, { "epoch": 0.9486907578105881, "grad_norm": 4.875, "learning_rate": 4.7438582365153594e-05, "loss": 2.3477, "num_input_tokens_seen": 12152512, "step": 5815 }, { "epoch": 0.9495064850314054, "grad_norm": 10.375, "learning_rate": 4.743425185092217e-05, "loss": 2.9303, "num_input_tokens_seen": 12163872, "step": 5820 }, { "epoch": 0.9503222122522229, "grad_norm": 8.125, "learning_rate": 4.742991787704719e-05, "loss": 3.2727, "num_input_tokens_seen": 12175536, "step": 5825 }, { "epoch": 0.9511379394730403, "grad_norm": 8.375, "learning_rate": 4.7425580444196994e-05, "loss": 2.2856, "num_input_tokens_seen": 12186624, "step": 5830 }, { "epoch": 0.9519536666938576, "grad_norm": 5.875, "learning_rate": 4.742123955304048e-05, "loss": 2.6641, "num_input_tokens_seen": 12196976, "step": 5835 }, { "epoch": 0.952769393914675, "grad_norm": 6.3125, "learning_rate": 4.741689520424706e-05, "loss": 2.0776, "num_input_tokens_seen": 12206784, "step": 5840 }, { "epoch": 0.9535851211354923, "grad_norm": 5.65625, "learning_rate": 4.741254739848669e-05, "loss": 1.2001, "num_input_tokens_seen": 12217504, "step": 5845 }, { "epoch": 0.9544008483563097, "grad_norm": 8.75, "learning_rate": 4.740819613642987e-05, "loss": 3.2334, "num_input_tokens_seen": 12227360, "step": 5850 }, { "epoch": 0.955216575577127, "grad_norm": 4.53125, "learning_rate": 4.74038414187476e-05, "loss": 2.1254, "num_input_tokens_seen": 12237344, "step": 5855 }, { "epoch": 0.9560323027979444, "grad_norm": 15.0625, "learning_rate": 4.739948324611144e-05, "loss": 2.0956, "num_input_tokens_seen": 12248400, "step": 5860 }, { "epoch": 0.9568480300187617, "grad_norm": 0.73828125, "learning_rate": 4.7395121619193465e-05, "loss": 2.8317, "num_input_tokens_seen": 12258224, "step": 5865 }, { "epoch": 0.9576637572395791, "grad_norm": 8.25, "learning_rate": 4.7390756538666313e-05, "loss": 1.5871, "num_input_tokens_seen": 12268736, "step": 5870 }, { "epoch": 0.9584794844603964, "grad_norm": 7.40625, "learning_rate": 4.738638800520311e-05, "loss": 2.5152, "num_input_tokens_seen": 12278496, "step": 5875 }, { "epoch": 0.9592952116812138, "grad_norm": 7.59375, "learning_rate": 4.738201601947757e-05, "loss": 2.0659, "num_input_tokens_seen": 12289088, "step": 5880 }, { "epoch": 0.9601109389020311, "grad_norm": 4.78125, "learning_rate": 4.7377640582163876e-05, "loss": 3.9642, "num_input_tokens_seen": 12299728, "step": 5885 }, { "epoch": 0.9609266661228485, "grad_norm": 12.3125, "learning_rate": 4.7373261693936786e-05, "loss": 3.7051, "num_input_tokens_seen": 12309520, "step": 5890 }, { "epoch": 0.9617423933436658, "grad_norm": 0.12060546875, "learning_rate": 4.7368879355471595e-05, "loss": 3.1656, "num_input_tokens_seen": 12319776, "step": 5895 }, { "epoch": 0.9625581205644832, "grad_norm": 6.59375, "learning_rate": 4.736449356744409e-05, "loss": 2.453, "num_input_tokens_seen": 12329168, "step": 5900 }, { "epoch": 0.9633738477853006, "grad_norm": 6.9375, "learning_rate": 4.736010433053064e-05, "loss": 1.4093, "num_input_tokens_seen": 12339696, "step": 5905 }, { "epoch": 0.9641895750061179, "grad_norm": 12.0, "learning_rate": 4.73557116454081e-05, "loss": 3.2938, "num_input_tokens_seen": 12349840, "step": 5910 }, { "epoch": 0.9650053022269353, "grad_norm": 4.875, "learning_rate": 4.735131551275389e-05, "loss": 3.278, "num_input_tokens_seen": 12361088, "step": 5915 }, { "epoch": 0.9658210294477527, "grad_norm": 8.375, "learning_rate": 4.734691593324594e-05, "loss": 2.0114, "num_input_tokens_seen": 12372080, "step": 5920 }, { "epoch": 0.9666367566685701, "grad_norm": 6.90625, "learning_rate": 4.734251290756272e-05, "loss": 1.6471, "num_input_tokens_seen": 12382752, "step": 5925 }, { "epoch": 0.9674524838893874, "grad_norm": 6.65625, "learning_rate": 4.7338106436383246e-05, "loss": 3.7974, "num_input_tokens_seen": 12392704, "step": 5930 }, { "epoch": 0.9682682111102048, "grad_norm": 7.125, "learning_rate": 4.733369652038703e-05, "loss": 1.705, "num_input_tokens_seen": 12404128, "step": 5935 }, { "epoch": 0.9690839383310221, "grad_norm": 6.625, "learning_rate": 4.7329283160254156e-05, "loss": 2.2585, "num_input_tokens_seen": 12415936, "step": 5940 }, { "epoch": 0.9698996655518395, "grad_norm": 5.28125, "learning_rate": 4.732486635666521e-05, "loss": 1.4724, "num_input_tokens_seen": 12427104, "step": 5945 }, { "epoch": 0.9707153927726568, "grad_norm": 1.3515625, "learning_rate": 4.732044611030132e-05, "loss": 1.7289, "num_input_tokens_seen": 12437728, "step": 5950 }, { "epoch": 0.9715311199934742, "grad_norm": 3.9375, "learning_rate": 4.731602242184414e-05, "loss": 2.0069, "num_input_tokens_seen": 12447968, "step": 5955 }, { "epoch": 0.9723468472142915, "grad_norm": 16.625, "learning_rate": 4.7311595291975864e-05, "loss": 1.7803, "num_input_tokens_seen": 12459072, "step": 5960 }, { "epoch": 0.9731625744351089, "grad_norm": 3.46875, "learning_rate": 4.7307164721379216e-05, "loss": 1.665, "num_input_tokens_seen": 12470592, "step": 5965 }, { "epoch": 0.9739783016559262, "grad_norm": 6.5625, "learning_rate": 4.730273071073743e-05, "loss": 2.1494, "num_input_tokens_seen": 12480864, "step": 5970 }, { "epoch": 0.9747940288767436, "grad_norm": 11.0625, "learning_rate": 4.729829326073429e-05, "loss": 2.8779, "num_input_tokens_seen": 12490672, "step": 5975 }, { "epoch": 0.975609756097561, "grad_norm": 14.125, "learning_rate": 4.7293852372054126e-05, "loss": 3.0805, "num_input_tokens_seen": 12501840, "step": 5980 }, { "epoch": 0.9764254833183783, "grad_norm": 5.59375, "learning_rate": 4.728940804538176e-05, "loss": 2.6568, "num_input_tokens_seen": 12511216, "step": 5985 }, { "epoch": 0.9772412105391957, "grad_norm": 5.1875, "learning_rate": 4.7284960281402556e-05, "loss": 1.5319, "num_input_tokens_seen": 12521920, "step": 5990 }, { "epoch": 0.978056937760013, "grad_norm": 10.625, "learning_rate": 4.728050908080244e-05, "loss": 3.5632, "num_input_tokens_seen": 12533200, "step": 5995 }, { "epoch": 0.9788726649808304, "grad_norm": 14.3125, "learning_rate": 4.727605444426782e-05, "loss": 1.8289, "num_input_tokens_seen": 12542416, "step": 6000 }, { "epoch": 0.9788726649808304, "eval_loss": 2.549168348312378, "eval_runtime": 134.811, "eval_samples_per_second": 20.213, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 12542416, "step": 6000 }, { "epoch": 0.9796883922016477, "grad_norm": 3.5, "learning_rate": 4.727159637248567e-05, "loss": 1.621, "num_input_tokens_seen": 12553152, "step": 6005 }, { "epoch": 0.9805041194224652, "grad_norm": 6.40625, "learning_rate": 4.7267134866143474e-05, "loss": 3.041, "num_input_tokens_seen": 12562960, "step": 6010 }, { "epoch": 0.9813198466432825, "grad_norm": 6.4375, "learning_rate": 4.726266992592926e-05, "loss": 4.6699, "num_input_tokens_seen": 12572144, "step": 6015 }, { "epoch": 0.9821355738640999, "grad_norm": 5.75, "learning_rate": 4.725820155253157e-05, "loss": 0.8568, "num_input_tokens_seen": 12583120, "step": 6020 }, { "epoch": 0.9829513010849172, "grad_norm": 10.25, "learning_rate": 4.725372974663948e-05, "loss": 3.5086, "num_input_tokens_seen": 12594208, "step": 6025 }, { "epoch": 0.9837670283057346, "grad_norm": 5.375, "learning_rate": 4.724925450894262e-05, "loss": 1.9006, "num_input_tokens_seen": 12605456, "step": 6030 }, { "epoch": 0.984582755526552, "grad_norm": 8.4375, "learning_rate": 4.72447758401311e-05, "loss": 3.3245, "num_input_tokens_seen": 12615872, "step": 6035 }, { "epoch": 0.9853984827473693, "grad_norm": 14.375, "learning_rate": 4.7240293740895616e-05, "loss": 2.5671, "num_input_tokens_seen": 12626144, "step": 6040 }, { "epoch": 0.9862142099681867, "grad_norm": 7.5, "learning_rate": 4.723580821192733e-05, "loss": 2.8449, "num_input_tokens_seen": 12636112, "step": 6045 }, { "epoch": 0.987029937189004, "grad_norm": 8.75, "learning_rate": 4.7231319253917996e-05, "loss": 2.2511, "num_input_tokens_seen": 12645168, "step": 6050 }, { "epoch": 0.9878456644098214, "grad_norm": 3.234375, "learning_rate": 4.722682686755986e-05, "loss": 2.184, "num_input_tokens_seen": 12656736, "step": 6055 }, { "epoch": 0.9886613916306387, "grad_norm": 4.6875, "learning_rate": 4.722233105354569e-05, "loss": 2.0117, "num_input_tokens_seen": 12667472, "step": 6060 }, { "epoch": 0.9894771188514561, "grad_norm": 12.4375, "learning_rate": 4.7217831812568815e-05, "loss": 2.4944, "num_input_tokens_seen": 12678384, "step": 6065 }, { "epoch": 0.9902928460722734, "grad_norm": 3.609375, "learning_rate": 4.721332914532307e-05, "loss": 1.823, "num_input_tokens_seen": 12689200, "step": 6070 }, { "epoch": 0.9911085732930908, "grad_norm": 10.375, "learning_rate": 4.720882305250281e-05, "loss": 2.7774, "num_input_tokens_seen": 12699568, "step": 6075 }, { "epoch": 0.9919243005139081, "grad_norm": 7.78125, "learning_rate": 4.720431353480295e-05, "loss": 2.2263, "num_input_tokens_seen": 12710336, "step": 6080 }, { "epoch": 0.9927400277347255, "grad_norm": 8.4375, "learning_rate": 4.719980059291891e-05, "loss": 3.1793, "num_input_tokens_seen": 12719392, "step": 6085 }, { "epoch": 0.9935557549555428, "grad_norm": 0.06201171875, "learning_rate": 4.7195284227546634e-05, "loss": 1.8151, "num_input_tokens_seen": 12729424, "step": 6090 }, { "epoch": 0.9943714821763602, "grad_norm": 10.125, "learning_rate": 4.7190764439382604e-05, "loss": 2.6556, "num_input_tokens_seen": 12741248, "step": 6095 }, { "epoch": 0.9951872093971775, "grad_norm": 9.125, "learning_rate": 4.7186241229123826e-05, "loss": 2.8441, "num_input_tokens_seen": 12751184, "step": 6100 }, { "epoch": 0.996002936617995, "grad_norm": 11.875, "learning_rate": 4.718171459746785e-05, "loss": 1.7279, "num_input_tokens_seen": 12760464, "step": 6105 }, { "epoch": 0.9968186638388123, "grad_norm": 14.0625, "learning_rate": 4.717718454511273e-05, "loss": 1.9571, "num_input_tokens_seen": 12770672, "step": 6110 }, { "epoch": 0.9976343910596297, "grad_norm": 4.46875, "learning_rate": 4.7172651072757056e-05, "loss": 1.6713, "num_input_tokens_seen": 12781456, "step": 6115 }, { "epoch": 0.998450118280447, "grad_norm": 5.5625, "learning_rate": 4.7168114181099945e-05, "loss": 1.3262, "num_input_tokens_seen": 12792592, "step": 6120 }, { "epoch": 0.9992658455012644, "grad_norm": 6.28125, "learning_rate": 4.716357387084105e-05, "loss": 2.7664, "num_input_tokens_seen": 12803840, "step": 6125 }, { "epoch": 1.0, "grad_norm": 0.302734375, "learning_rate": 4.715903014268054e-05, "loss": 2.6198, "num_input_tokens_seen": 12813920, "step": 6130 }, { "epoch": 1.0008157272208174, "grad_norm": 11.9375, "learning_rate": 4.715448299731911e-05, "loss": 2.2941, "num_input_tokens_seen": 12824960, "step": 6135 }, { "epoch": 1.0016314544416347, "grad_norm": 6.5625, "learning_rate": 4.7149932435457986e-05, "loss": 1.086, "num_input_tokens_seen": 12836112, "step": 6140 }, { "epoch": 1.002447181662452, "grad_norm": 4.8125, "learning_rate": 4.714537845779894e-05, "loss": 1.5446, "num_input_tokens_seen": 12846560, "step": 6145 }, { "epoch": 1.0032629088832694, "grad_norm": 2.71875, "learning_rate": 4.714082106504423e-05, "loss": 2.6856, "num_input_tokens_seen": 12858208, "step": 6150 }, { "epoch": 1.0040786361040868, "grad_norm": 8.75, "learning_rate": 4.713626025789667e-05, "loss": 1.6021, "num_input_tokens_seen": 12868608, "step": 6155 }, { "epoch": 1.0048943633249041, "grad_norm": 8.8125, "learning_rate": 4.7131696037059606e-05, "loss": 0.9059, "num_input_tokens_seen": 12878320, "step": 6160 }, { "epoch": 1.0057100905457215, "grad_norm": 2.3125, "learning_rate": 4.712712840323689e-05, "loss": 1.9529, "num_input_tokens_seen": 12889248, "step": 6165 }, { "epoch": 1.0065258177665388, "grad_norm": 4.59375, "learning_rate": 4.71225573571329e-05, "loss": 1.3517, "num_input_tokens_seen": 12900432, "step": 6170 }, { "epoch": 1.0073415449873562, "grad_norm": 2.921875, "learning_rate": 4.711798289945256e-05, "loss": 2.3083, "num_input_tokens_seen": 12910800, "step": 6175 }, { "epoch": 1.0081572722081735, "grad_norm": 6.4375, "learning_rate": 4.71134050309013e-05, "loss": 2.4419, "num_input_tokens_seen": 12921872, "step": 6180 }, { "epoch": 1.0089729994289909, "grad_norm": 7.75, "learning_rate": 4.710882375218509e-05, "loss": 2.385, "num_input_tokens_seen": 12933024, "step": 6185 }, { "epoch": 1.0097887266498082, "grad_norm": 0.515625, "learning_rate": 4.7104239064010424e-05, "loss": 2.1937, "num_input_tokens_seen": 12943536, "step": 6190 }, { "epoch": 1.0106044538706256, "grad_norm": 10.125, "learning_rate": 4.709965096708432e-05, "loss": 2.363, "num_input_tokens_seen": 12952080, "step": 6195 }, { "epoch": 1.011420181091443, "grad_norm": 5.875, "learning_rate": 4.709505946211431e-05, "loss": 1.8497, "num_input_tokens_seen": 12963008, "step": 6200 }, { "epoch": 1.011420181091443, "eval_loss": 2.5441558361053467, "eval_runtime": 134.972, "eval_samples_per_second": 20.189, "eval_steps_per_second": 10.098, "num_input_tokens_seen": 12963008, "step": 6200 }, { "epoch": 1.0122359083122603, "grad_norm": 2.453125, "learning_rate": 4.709046454980846e-05, "loss": 1.761, "num_input_tokens_seen": 12974864, "step": 6205 }, { "epoch": 1.0130516355330776, "grad_norm": 5.21875, "learning_rate": 4.708586623087538e-05, "loss": 1.9405, "num_input_tokens_seen": 12985216, "step": 6210 }, { "epoch": 1.013867362753895, "grad_norm": 4.625, "learning_rate": 4.708126450602418e-05, "loss": 3.8876, "num_input_tokens_seen": 12996976, "step": 6215 }, { "epoch": 1.0146830899747123, "grad_norm": 6.03125, "learning_rate": 4.7076659375964495e-05, "loss": 2.7066, "num_input_tokens_seen": 13007952, "step": 6220 }, { "epoch": 1.01549881719553, "grad_norm": 7.09375, "learning_rate": 4.707205084140651e-05, "loss": 2.507, "num_input_tokens_seen": 13016672, "step": 6225 }, { "epoch": 1.0163145444163473, "grad_norm": 11.6875, "learning_rate": 4.7067438903060904e-05, "loss": 2.4329, "num_input_tokens_seen": 13027520, "step": 6230 }, { "epoch": 1.0171302716371646, "grad_norm": 10.5, "learning_rate": 4.70628235616389e-05, "loss": 2.6443, "num_input_tokens_seen": 13037552, "step": 6235 }, { "epoch": 1.017945998857982, "grad_norm": 8.625, "learning_rate": 4.7058204817852256e-05, "loss": 2.5943, "num_input_tokens_seen": 13047728, "step": 6240 }, { "epoch": 1.0187617260787993, "grad_norm": 8.4375, "learning_rate": 4.705358267241322e-05, "loss": 2.1596, "num_input_tokens_seen": 13058880, "step": 6245 }, { "epoch": 1.0195774532996167, "grad_norm": 11.125, "learning_rate": 4.704895712603459e-05, "loss": 2.7262, "num_input_tokens_seen": 13069856, "step": 6250 }, { "epoch": 1.020393180520434, "grad_norm": 7.5, "learning_rate": 4.704432817942969e-05, "loss": 3.3593, "num_input_tokens_seen": 13080160, "step": 6255 }, { "epoch": 1.0212089077412514, "grad_norm": 4.59375, "learning_rate": 4.703969583331236e-05, "loss": 1.729, "num_input_tokens_seen": 13090832, "step": 6260 }, { "epoch": 1.0220246349620687, "grad_norm": 4.0, "learning_rate": 4.7035060088396965e-05, "loss": 1.9941, "num_input_tokens_seen": 13100976, "step": 6265 }, { "epoch": 1.022840362182886, "grad_norm": 11.9375, "learning_rate": 4.703042094539839e-05, "loss": 3.7058, "num_input_tokens_seen": 13109904, "step": 6270 }, { "epoch": 1.0236560894037035, "grad_norm": 2.859375, "learning_rate": 4.702577840503206e-05, "loss": 1.8261, "num_input_tokens_seen": 13120032, "step": 6275 }, { "epoch": 1.0244718166245208, "grad_norm": 17.5, "learning_rate": 4.70211324680139e-05, "loss": 2.3398, "num_input_tokens_seen": 13130448, "step": 6280 }, { "epoch": 1.0252875438453382, "grad_norm": 1.71875, "learning_rate": 4.7016483135060386e-05, "loss": 1.472, "num_input_tokens_seen": 13142448, "step": 6285 }, { "epoch": 1.0261032710661555, "grad_norm": 2.171875, "learning_rate": 4.701183040688849e-05, "loss": 1.0879, "num_input_tokens_seen": 13152592, "step": 6290 }, { "epoch": 1.0269189982869729, "grad_norm": 5.09375, "learning_rate": 4.700717428421573e-05, "loss": 1.8312, "num_input_tokens_seen": 13163904, "step": 6295 }, { "epoch": 1.0277347255077902, "grad_norm": 3.71875, "learning_rate": 4.700251476776014e-05, "loss": 2.3739, "num_input_tokens_seen": 13174064, "step": 6300 }, { "epoch": 1.0285504527286076, "grad_norm": 9.0, "learning_rate": 4.699785185824026e-05, "loss": 2.4082, "num_input_tokens_seen": 13184528, "step": 6305 }, { "epoch": 1.029366179949425, "grad_norm": 4.4375, "learning_rate": 4.699318555637519e-05, "loss": 2.3398, "num_input_tokens_seen": 13194464, "step": 6310 }, { "epoch": 1.0301819071702423, "grad_norm": 10.1875, "learning_rate": 4.6988515862884525e-05, "loss": 1.8384, "num_input_tokens_seen": 13206272, "step": 6315 }, { "epoch": 1.0309976343910596, "grad_norm": 5.3125, "learning_rate": 4.698384277848838e-05, "loss": 3.2116, "num_input_tokens_seen": 13216944, "step": 6320 }, { "epoch": 1.031813361611877, "grad_norm": 4.46875, "learning_rate": 4.6979166303907425e-05, "loss": 2.2847, "num_input_tokens_seen": 13227664, "step": 6325 }, { "epoch": 1.0326290888326943, "grad_norm": 7.96875, "learning_rate": 4.697448643986281e-05, "loss": 2.4183, "num_input_tokens_seen": 13238832, "step": 6330 }, { "epoch": 1.0334448160535117, "grad_norm": 1.2109375, "learning_rate": 4.696980318707624e-05, "loss": 1.7859, "num_input_tokens_seen": 13248880, "step": 6335 }, { "epoch": 1.034260543274329, "grad_norm": 6.625, "learning_rate": 4.6965116546269924e-05, "loss": 1.8692, "num_input_tokens_seen": 13259408, "step": 6340 }, { "epoch": 1.0350762704951464, "grad_norm": 6.9375, "learning_rate": 4.6960426518166615e-05, "loss": 3.024, "num_input_tokens_seen": 13270432, "step": 6345 }, { "epoch": 1.0358919977159637, "grad_norm": 4.09375, "learning_rate": 4.6955733103489556e-05, "loss": 1.4392, "num_input_tokens_seen": 13282112, "step": 6350 }, { "epoch": 1.036707724936781, "grad_norm": 6.0625, "learning_rate": 4.695103630296255e-05, "loss": 2.0951, "num_input_tokens_seen": 13293248, "step": 6355 }, { "epoch": 1.0375234521575984, "grad_norm": 8.0625, "learning_rate": 4.694633611730988e-05, "loss": 2.2979, "num_input_tokens_seen": 13303488, "step": 6360 }, { "epoch": 1.0383391793784158, "grad_norm": 9.5, "learning_rate": 4.694163254725639e-05, "loss": 3.425, "num_input_tokens_seen": 13314848, "step": 6365 }, { "epoch": 1.0391549065992332, "grad_norm": 7.03125, "learning_rate": 4.693692559352743e-05, "loss": 4.1539, "num_input_tokens_seen": 13325712, "step": 6370 }, { "epoch": 1.0399706338200505, "grad_norm": 7.21875, "learning_rate": 4.693221525684886e-05, "loss": 2.5029, "num_input_tokens_seen": 13335664, "step": 6375 }, { "epoch": 1.0407863610408679, "grad_norm": 8.25, "learning_rate": 4.6927501537947084e-05, "loss": 1.6925, "num_input_tokens_seen": 13345760, "step": 6380 }, { "epoch": 1.0416020882616852, "grad_norm": 4.375, "learning_rate": 4.692278443754901e-05, "loss": 1.8803, "num_input_tokens_seen": 13356144, "step": 6385 }, { "epoch": 1.0424178154825026, "grad_norm": 3.0625, "learning_rate": 4.691806395638208e-05, "loss": 2.0043, "num_input_tokens_seen": 13366432, "step": 6390 }, { "epoch": 1.04323354270332, "grad_norm": 8.875, "learning_rate": 4.6913340095174255e-05, "loss": 1.9346, "num_input_tokens_seen": 13378800, "step": 6395 }, { "epoch": 1.0440492699241373, "grad_norm": 4.15625, "learning_rate": 4.690861285465399e-05, "loss": 1.2982, "num_input_tokens_seen": 13388336, "step": 6400 }, { "epoch": 1.0440492699241373, "eval_loss": 2.5316240787506104, "eval_runtime": 134.8601, "eval_samples_per_second": 20.206, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 13388336, "step": 6400 }, { "epoch": 1.0448649971449546, "grad_norm": 5.75, "learning_rate": 4.690388223555031e-05, "loss": 1.554, "num_input_tokens_seen": 13399088, "step": 6405 }, { "epoch": 1.0456807243657722, "grad_norm": 5.96875, "learning_rate": 4.689914823859273e-05, "loss": 1.4387, "num_input_tokens_seen": 13410384, "step": 6410 }, { "epoch": 1.0464964515865895, "grad_norm": 8.6875, "learning_rate": 4.689441086451129e-05, "loss": 2.3521, "num_input_tokens_seen": 13419216, "step": 6415 }, { "epoch": 1.047312178807407, "grad_norm": 9.875, "learning_rate": 4.688967011403655e-05, "loss": 4.2419, "num_input_tokens_seen": 13430224, "step": 6420 }, { "epoch": 1.0481279060282243, "grad_norm": 5.5, "learning_rate": 4.68849259878996e-05, "loss": 1.707, "num_input_tokens_seen": 13441424, "step": 6425 }, { "epoch": 1.0489436332490416, "grad_norm": 7.53125, "learning_rate": 4.6880178486832036e-05, "loss": 2.474, "num_input_tokens_seen": 13451760, "step": 6430 }, { "epoch": 1.049759360469859, "grad_norm": 4.78125, "learning_rate": 4.687542761156598e-05, "loss": 2.1315, "num_input_tokens_seen": 13462464, "step": 6435 }, { "epoch": 1.0505750876906763, "grad_norm": 8.3125, "learning_rate": 4.6870673362834096e-05, "loss": 3.3358, "num_input_tokens_seen": 13471968, "step": 6440 }, { "epoch": 1.0513908149114937, "grad_norm": 5.25, "learning_rate": 4.6865915741369526e-05, "loss": 3.8159, "num_input_tokens_seen": 13483856, "step": 6445 }, { "epoch": 1.052206542132311, "grad_norm": 8.3125, "learning_rate": 4.686115474790597e-05, "loss": 1.8372, "num_input_tokens_seen": 13494464, "step": 6450 }, { "epoch": 1.0530222693531284, "grad_norm": 12.25, "learning_rate": 4.685639038317762e-05, "loss": 3.3756, "num_input_tokens_seen": 13503664, "step": 6455 }, { "epoch": 1.0538379965739457, "grad_norm": 6.96875, "learning_rate": 4.685162264791921e-05, "loss": 2.6894, "num_input_tokens_seen": 13515824, "step": 6460 }, { "epoch": 1.054653723794763, "grad_norm": 4.34375, "learning_rate": 4.684685154286599e-05, "loss": 1.2832, "num_input_tokens_seen": 13525712, "step": 6465 }, { "epoch": 1.0554694510155804, "grad_norm": 8.125, "learning_rate": 4.684207706875371e-05, "loss": 2.6077, "num_input_tokens_seen": 13536176, "step": 6470 }, { "epoch": 1.0562851782363978, "grad_norm": 10.8125, "learning_rate": 4.683729922631866e-05, "loss": 4.0819, "num_input_tokens_seen": 13546704, "step": 6475 }, { "epoch": 1.0571009054572151, "grad_norm": 9.75, "learning_rate": 4.683251801629765e-05, "loss": 2.8638, "num_input_tokens_seen": 13557520, "step": 6480 }, { "epoch": 1.0579166326780325, "grad_norm": 8.3125, "learning_rate": 4.6827733439428e-05, "loss": 2.2073, "num_input_tokens_seen": 13568640, "step": 6485 }, { "epoch": 1.0587323598988498, "grad_norm": 11.875, "learning_rate": 4.682294549644754e-05, "loss": 2.9975, "num_input_tokens_seen": 13580016, "step": 6490 }, { "epoch": 1.0595480871196672, "grad_norm": 6.59375, "learning_rate": 4.681815418809464e-05, "loss": 1.5164, "num_input_tokens_seen": 13589744, "step": 6495 }, { "epoch": 1.0603638143404845, "grad_norm": 5.875, "learning_rate": 4.681335951510819e-05, "loss": 1.446, "num_input_tokens_seen": 13598896, "step": 6500 }, { "epoch": 1.061179541561302, "grad_norm": 11.1875, "learning_rate": 4.6808561478227576e-05, "loss": 3.2373, "num_input_tokens_seen": 13610352, "step": 6505 }, { "epoch": 1.0619952687821193, "grad_norm": 2.65625, "learning_rate": 4.680376007819271e-05, "loss": 1.783, "num_input_tokens_seen": 13621536, "step": 6510 }, { "epoch": 1.0628109960029366, "grad_norm": 10.375, "learning_rate": 4.679895531574405e-05, "loss": 1.3744, "num_input_tokens_seen": 13632704, "step": 6515 }, { "epoch": 1.063626723223754, "grad_norm": 8.4375, "learning_rate": 4.679414719162253e-05, "loss": 3.1847, "num_input_tokens_seen": 13642656, "step": 6520 }, { "epoch": 1.0644424504445713, "grad_norm": 4.90625, "learning_rate": 4.6789335706569635e-05, "loss": 2.4636, "num_input_tokens_seen": 13652800, "step": 6525 }, { "epoch": 1.0652581776653887, "grad_norm": 4.15625, "learning_rate": 4.678452086132734e-05, "loss": 2.388, "num_input_tokens_seen": 13664160, "step": 6530 }, { "epoch": 1.066073904886206, "grad_norm": 5.875, "learning_rate": 4.677970265663818e-05, "loss": 3.9124, "num_input_tokens_seen": 13675824, "step": 6535 }, { "epoch": 1.0668896321070234, "grad_norm": 4.4375, "learning_rate": 4.677488109324517e-05, "loss": 1.9941, "num_input_tokens_seen": 13684960, "step": 6540 }, { "epoch": 1.0677053593278407, "grad_norm": 18.875, "learning_rate": 4.6770056171891846e-05, "loss": 2.2253, "num_input_tokens_seen": 13696688, "step": 6545 }, { "epoch": 1.068521086548658, "grad_norm": 5.25, "learning_rate": 4.6765227893322286e-05, "loss": 3.2079, "num_input_tokens_seen": 13707440, "step": 6550 }, { "epoch": 1.0693368137694754, "grad_norm": 9.1875, "learning_rate": 4.676039625828107e-05, "loss": 2.862, "num_input_tokens_seen": 13718368, "step": 6555 }, { "epoch": 1.0701525409902928, "grad_norm": 4.59375, "learning_rate": 4.675556126751328e-05, "loss": 2.6244, "num_input_tokens_seen": 13729152, "step": 6560 }, { "epoch": 1.0709682682111101, "grad_norm": 7.75, "learning_rate": 4.6750722921764556e-05, "loss": 2.242, "num_input_tokens_seen": 13740672, "step": 6565 }, { "epoch": 1.0717839954319275, "grad_norm": 7.65625, "learning_rate": 4.674588122178102e-05, "loss": 2.2849, "num_input_tokens_seen": 13751360, "step": 6570 }, { "epoch": 1.0725997226527448, "grad_norm": 2.390625, "learning_rate": 4.674103616830931e-05, "loss": 1.8315, "num_input_tokens_seen": 13762896, "step": 6575 }, { "epoch": 1.0734154498735622, "grad_norm": 8.125, "learning_rate": 4.673618776209663e-05, "loss": 2.4437, "num_input_tokens_seen": 13774192, "step": 6580 }, { "epoch": 1.0742311770943795, "grad_norm": 7.75, "learning_rate": 4.673133600389063e-05, "loss": 3.4912, "num_input_tokens_seen": 13784304, "step": 6585 }, { "epoch": 1.075046904315197, "grad_norm": 7.5625, "learning_rate": 4.672648089443953e-05, "loss": 2.4513, "num_input_tokens_seen": 13794432, "step": 6590 }, { "epoch": 1.0758626315360145, "grad_norm": 8.5625, "learning_rate": 4.672162243449204e-05, "loss": 1.3882, "num_input_tokens_seen": 13805872, "step": 6595 }, { "epoch": 1.0766783587568316, "grad_norm": 15.75, "learning_rate": 4.67167606247974e-05, "loss": 2.5333, "num_input_tokens_seen": 13816224, "step": 6600 }, { "epoch": 1.0766783587568316, "eval_loss": 2.5479660034179688, "eval_runtime": 134.926, "eval_samples_per_second": 20.196, "eval_steps_per_second": 10.102, "num_input_tokens_seen": 13816224, "step": 6600 }, { "epoch": 1.0774940859776492, "grad_norm": 6.84375, "learning_rate": 4.671189546610536e-05, "loss": 1.9081, "num_input_tokens_seen": 13827264, "step": 6605 }, { "epoch": 1.0783098131984665, "grad_norm": 7.03125, "learning_rate": 4.67070269591662e-05, "loss": 1.6404, "num_input_tokens_seen": 13837344, "step": 6610 }, { "epoch": 1.0791255404192839, "grad_norm": 9.0, "learning_rate": 4.670215510473068e-05, "loss": 1.2935, "num_input_tokens_seen": 13848368, "step": 6615 }, { "epoch": 1.0799412676401012, "grad_norm": 11.0625, "learning_rate": 4.669727990355013e-05, "loss": 3.1689, "num_input_tokens_seen": 13858688, "step": 6620 }, { "epoch": 1.0807569948609186, "grad_norm": 3.75, "learning_rate": 4.669240135637635e-05, "loss": 2.5636, "num_input_tokens_seen": 13867728, "step": 6625 }, { "epoch": 1.081572722081736, "grad_norm": 8.6875, "learning_rate": 4.6687519463961675e-05, "loss": 3.2371, "num_input_tokens_seen": 13879216, "step": 6630 }, { "epoch": 1.0823884493025533, "grad_norm": 2.421875, "learning_rate": 4.668263422705896e-05, "loss": 2.1644, "num_input_tokens_seen": 13888976, "step": 6635 }, { "epoch": 1.0832041765233706, "grad_norm": 10.9375, "learning_rate": 4.667774564642156e-05, "loss": 3.4243, "num_input_tokens_seen": 13898064, "step": 6640 }, { "epoch": 1.084019903744188, "grad_norm": 7.21875, "learning_rate": 4.6672853722803365e-05, "loss": 1.8537, "num_input_tokens_seen": 13909136, "step": 6645 }, { "epoch": 1.0848356309650053, "grad_norm": 2.6875, "learning_rate": 4.666795845695877e-05, "loss": 1.807, "num_input_tokens_seen": 13919632, "step": 6650 }, { "epoch": 1.0856513581858227, "grad_norm": 6.96875, "learning_rate": 4.666305984964269e-05, "loss": 3.7924, "num_input_tokens_seen": 13929552, "step": 6655 }, { "epoch": 1.08646708540664, "grad_norm": 3.984375, "learning_rate": 4.6658157901610535e-05, "loss": 2.2932, "num_input_tokens_seen": 13941232, "step": 6660 }, { "epoch": 1.0872828126274574, "grad_norm": 12.75, "learning_rate": 4.665325261361826e-05, "loss": 2.8005, "num_input_tokens_seen": 13953264, "step": 6665 }, { "epoch": 1.0880985398482748, "grad_norm": 11.3125, "learning_rate": 4.664834398642232e-05, "loss": 2.9229, "num_input_tokens_seen": 13963248, "step": 6670 }, { "epoch": 1.0889142670690921, "grad_norm": 6.1875, "learning_rate": 4.6643432020779686e-05, "loss": 2.7717, "num_input_tokens_seen": 13973680, "step": 6675 }, { "epoch": 1.0897299942899095, "grad_norm": 9.5625, "learning_rate": 4.663851671744786e-05, "loss": 2.5093, "num_input_tokens_seen": 13984528, "step": 6680 }, { "epoch": 1.0905457215107268, "grad_norm": 6.90625, "learning_rate": 4.6633598077184815e-05, "loss": 0.7262, "num_input_tokens_seen": 13994032, "step": 6685 }, { "epoch": 1.0913614487315442, "grad_norm": 10.8125, "learning_rate": 4.662867610074908e-05, "loss": 2.7007, "num_input_tokens_seen": 14004976, "step": 6690 }, { "epoch": 1.0921771759523615, "grad_norm": 5.53125, "learning_rate": 4.6623750788899696e-05, "loss": 0.8619, "num_input_tokens_seen": 14014240, "step": 6695 }, { "epoch": 1.0929929031731789, "grad_norm": 5.65625, "learning_rate": 4.6618822142396195e-05, "loss": 2.3617, "num_input_tokens_seen": 14025152, "step": 6700 }, { "epoch": 1.0938086303939962, "grad_norm": 7.90625, "learning_rate": 4.661389016199864e-05, "loss": 1.7394, "num_input_tokens_seen": 14036528, "step": 6705 }, { "epoch": 1.0946243576148136, "grad_norm": 5.1875, "learning_rate": 4.660895484846761e-05, "loss": 3.3268, "num_input_tokens_seen": 14047568, "step": 6710 }, { "epoch": 1.095440084835631, "grad_norm": 11.4375, "learning_rate": 4.660401620256418e-05, "loss": 3.3054, "num_input_tokens_seen": 14057760, "step": 6715 }, { "epoch": 1.0962558120564483, "grad_norm": 4.3125, "learning_rate": 4.659907422504997e-05, "loss": 2.9211, "num_input_tokens_seen": 14067936, "step": 6720 }, { "epoch": 1.0970715392772656, "grad_norm": 6.5625, "learning_rate": 4.6594128916687074e-05, "loss": 2.6511, "num_input_tokens_seen": 14077616, "step": 6725 }, { "epoch": 1.097887266498083, "grad_norm": 10.8125, "learning_rate": 4.658918027823813e-05, "loss": 2.1828, "num_input_tokens_seen": 14087440, "step": 6730 }, { "epoch": 1.0987029937189003, "grad_norm": 9.4375, "learning_rate": 4.658422831046628e-05, "loss": 1.69, "num_input_tokens_seen": 14096880, "step": 6735 }, { "epoch": 1.0995187209397177, "grad_norm": 5.84375, "learning_rate": 4.657927301413518e-05, "loss": 2.5988, "num_input_tokens_seen": 14108320, "step": 6740 }, { "epoch": 1.100334448160535, "grad_norm": 3.265625, "learning_rate": 4.657431439000901e-05, "loss": 2.5977, "num_input_tokens_seen": 14116368, "step": 6745 }, { "epoch": 1.1011501753813524, "grad_norm": 1.75, "learning_rate": 4.656935243885243e-05, "loss": 1.6304, "num_input_tokens_seen": 14126352, "step": 6750 }, { "epoch": 1.1019659026021698, "grad_norm": 1.6875, "learning_rate": 4.656438716143066e-05, "loss": 1.296, "num_input_tokens_seen": 14137920, "step": 6755 }, { "epoch": 1.102781629822987, "grad_norm": 8.1875, "learning_rate": 4.6559418558509384e-05, "loss": 2.2682, "num_input_tokens_seen": 14148448, "step": 6760 }, { "epoch": 1.1035973570438045, "grad_norm": 3.828125, "learning_rate": 4.6554446630854833e-05, "loss": 1.7127, "num_input_tokens_seen": 14157248, "step": 6765 }, { "epoch": 1.1044130842646218, "grad_norm": 8.6875, "learning_rate": 4.654947137923374e-05, "loss": 2.3281, "num_input_tokens_seen": 14167216, "step": 6770 }, { "epoch": 1.1052288114854392, "grad_norm": 7.78125, "learning_rate": 4.654449280441335e-05, "loss": 2.1477, "num_input_tokens_seen": 14176352, "step": 6775 }, { "epoch": 1.1060445387062567, "grad_norm": 6.53125, "learning_rate": 4.653951090716143e-05, "loss": 2.6341, "num_input_tokens_seen": 14185840, "step": 6780 }, { "epoch": 1.1068602659270739, "grad_norm": 4.09375, "learning_rate": 4.653452568824625e-05, "loss": 2.1709, "num_input_tokens_seen": 14196064, "step": 6785 }, { "epoch": 1.1076759931478914, "grad_norm": 8.125, "learning_rate": 4.6529537148436585e-05, "loss": 1.4094, "num_input_tokens_seen": 14207152, "step": 6790 }, { "epoch": 1.1084917203687088, "grad_norm": 7.3125, "learning_rate": 4.6524545288501734e-05, "loss": 3.4547, "num_input_tokens_seen": 14217392, "step": 6795 }, { "epoch": 1.1093074475895262, "grad_norm": 5.625, "learning_rate": 4.6519550109211506e-05, "loss": 2.6627, "num_input_tokens_seen": 14228240, "step": 6800 }, { "epoch": 1.1093074475895262, "eval_loss": 2.5380496978759766, "eval_runtime": 134.8409, "eval_samples_per_second": 20.209, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 14228240, "step": 6800 }, { "epoch": 1.1101231748103435, "grad_norm": 6.3125, "learning_rate": 4.651455161133622e-05, "loss": 2.7996, "num_input_tokens_seen": 14237536, "step": 6805 }, { "epoch": 1.1109389020311609, "grad_norm": 5.625, "learning_rate": 4.6509549795646704e-05, "loss": 3.0776, "num_input_tokens_seen": 14249440, "step": 6810 }, { "epoch": 1.1117546292519782, "grad_norm": 5.78125, "learning_rate": 4.6504544662914306e-05, "loss": 3.4046, "num_input_tokens_seen": 14259952, "step": 6815 }, { "epoch": 1.1125703564727956, "grad_norm": 11.8125, "learning_rate": 4.6499536213910876e-05, "loss": 3.9139, "num_input_tokens_seen": 14270032, "step": 6820 }, { "epoch": 1.113386083693613, "grad_norm": 7.4375, "learning_rate": 4.6494524449408786e-05, "loss": 3.3805, "num_input_tokens_seen": 14279776, "step": 6825 }, { "epoch": 1.1142018109144303, "grad_norm": 7.5, "learning_rate": 4.6489509370180903e-05, "loss": 1.6634, "num_input_tokens_seen": 14289776, "step": 6830 }, { "epoch": 1.1150175381352476, "grad_norm": 6.0625, "learning_rate": 4.648449097700063e-05, "loss": 2.532, "num_input_tokens_seen": 14298544, "step": 6835 }, { "epoch": 1.115833265356065, "grad_norm": 6.8125, "learning_rate": 4.647946927064185e-05, "loss": 1.6581, "num_input_tokens_seen": 14308624, "step": 6840 }, { "epoch": 1.1166489925768823, "grad_norm": 9.125, "learning_rate": 4.647444425187898e-05, "loss": 2.2007, "num_input_tokens_seen": 14318512, "step": 6845 }, { "epoch": 1.1174647197976997, "grad_norm": 2.46875, "learning_rate": 4.646941592148695e-05, "loss": 2.7297, "num_input_tokens_seen": 14329520, "step": 6850 }, { "epoch": 1.118280447018517, "grad_norm": 12.5625, "learning_rate": 4.646438428024117e-05, "loss": 3.3047, "num_input_tokens_seen": 14339536, "step": 6855 }, { "epoch": 1.1190961742393344, "grad_norm": 4.46875, "learning_rate": 4.64593493289176e-05, "loss": 2.8425, "num_input_tokens_seen": 14350128, "step": 6860 }, { "epoch": 1.1199119014601517, "grad_norm": 7.71875, "learning_rate": 4.64543110682927e-05, "loss": 1.9209, "num_input_tokens_seen": 14360832, "step": 6865 }, { "epoch": 1.120727628680969, "grad_norm": 6.59375, "learning_rate": 4.644926949914341e-05, "loss": 2.7893, "num_input_tokens_seen": 14372192, "step": 6870 }, { "epoch": 1.1215433559017864, "grad_norm": 11.5, "learning_rate": 4.644422462224722e-05, "loss": 3.2243, "num_input_tokens_seen": 14382064, "step": 6875 }, { "epoch": 1.1223590831226038, "grad_norm": 4.65625, "learning_rate": 4.643917643838211e-05, "loss": 1.9737, "num_input_tokens_seen": 14391312, "step": 6880 }, { "epoch": 1.1231748103434211, "grad_norm": 7.59375, "learning_rate": 4.6434124948326564e-05, "loss": 3.3349, "num_input_tokens_seen": 14401424, "step": 6885 }, { "epoch": 1.1239905375642385, "grad_norm": 7.90625, "learning_rate": 4.6429070152859594e-05, "loss": 2.5061, "num_input_tokens_seen": 14410832, "step": 6890 }, { "epoch": 1.1248062647850559, "grad_norm": 4.21875, "learning_rate": 4.6424012052760714e-05, "loss": 2.3073, "num_input_tokens_seen": 14421888, "step": 6895 }, { "epoch": 1.1256219920058732, "grad_norm": 9.0625, "learning_rate": 4.6418950648809945e-05, "loss": 2.4475, "num_input_tokens_seen": 14432848, "step": 6900 }, { "epoch": 1.1264377192266906, "grad_norm": 2.84375, "learning_rate": 4.641388594178782e-05, "loss": 2.1236, "num_input_tokens_seen": 14443728, "step": 6905 }, { "epoch": 1.127253446447508, "grad_norm": 3.1875, "learning_rate": 4.640881793247538e-05, "loss": 2.3005, "num_input_tokens_seen": 14455296, "step": 6910 }, { "epoch": 1.1280691736683253, "grad_norm": 5.09375, "learning_rate": 4.6403746621654173e-05, "loss": 1.8599, "num_input_tokens_seen": 14466016, "step": 6915 }, { "epoch": 1.1288849008891426, "grad_norm": 4.34375, "learning_rate": 4.639867201010626e-05, "loss": 3.3185, "num_input_tokens_seen": 14475296, "step": 6920 }, { "epoch": 1.12970062810996, "grad_norm": 6.59375, "learning_rate": 4.6393594098614204e-05, "loss": 2.6306, "num_input_tokens_seen": 14485280, "step": 6925 }, { "epoch": 1.1305163553307773, "grad_norm": 8.0625, "learning_rate": 4.63885128879611e-05, "loss": 1.8298, "num_input_tokens_seen": 14494960, "step": 6930 }, { "epoch": 1.1313320825515947, "grad_norm": 11.625, "learning_rate": 4.638342837893052e-05, "loss": 3.1982, "num_input_tokens_seen": 14506048, "step": 6935 }, { "epoch": 1.132147809772412, "grad_norm": 7.125, "learning_rate": 4.6378340572306565e-05, "loss": 3.4628, "num_input_tokens_seen": 14515264, "step": 6940 }, { "epoch": 1.1329635369932294, "grad_norm": 9.8125, "learning_rate": 4.6373249468873833e-05, "loss": 2.4782, "num_input_tokens_seen": 14524304, "step": 6945 }, { "epoch": 1.1337792642140467, "grad_norm": 7.5625, "learning_rate": 4.636815506941744e-05, "loss": 2.3501, "num_input_tokens_seen": 14535152, "step": 6950 }, { "epoch": 1.134594991434864, "grad_norm": 6.09375, "learning_rate": 4.6363057374723004e-05, "loss": 2.3922, "num_input_tokens_seen": 14545488, "step": 6955 }, { "epoch": 1.1354107186556814, "grad_norm": 4.3125, "learning_rate": 4.635795638557666e-05, "loss": 3.8938, "num_input_tokens_seen": 14556304, "step": 6960 }, { "epoch": 1.136226445876499, "grad_norm": 7.40625, "learning_rate": 4.635285210276504e-05, "loss": 2.2939, "num_input_tokens_seen": 14567424, "step": 6965 }, { "epoch": 1.1370421730973161, "grad_norm": 5.125, "learning_rate": 4.6347744527075295e-05, "loss": 2.8028, "num_input_tokens_seen": 14576928, "step": 6970 }, { "epoch": 1.1378579003181337, "grad_norm": 7.09375, "learning_rate": 4.634263365929506e-05, "loss": 2.8515, "num_input_tokens_seen": 14585392, "step": 6975 }, { "epoch": 1.1386736275389508, "grad_norm": 11.25, "learning_rate": 4.6337519500212515e-05, "loss": 2.3309, "num_input_tokens_seen": 14595792, "step": 6980 }, { "epoch": 1.1394893547597684, "grad_norm": 8.25, "learning_rate": 4.633240205061632e-05, "loss": 2.8317, "num_input_tokens_seen": 14606464, "step": 6985 }, { "epoch": 1.1403050819805858, "grad_norm": 3.359375, "learning_rate": 4.632728131129565e-05, "loss": 2.4585, "num_input_tokens_seen": 14616576, "step": 6990 }, { "epoch": 1.1411208092014031, "grad_norm": 5.3125, "learning_rate": 4.632215728304018e-05, "loss": 1.8745, "num_input_tokens_seen": 14626944, "step": 6995 }, { "epoch": 1.1419365364222205, "grad_norm": 8.1875, "learning_rate": 4.63170299666401e-05, "loss": 3.0394, "num_input_tokens_seen": 14637984, "step": 7000 }, { "epoch": 1.1419365364222205, "eval_loss": 2.5363268852233887, "eval_runtime": 135.0809, "eval_samples_per_second": 20.173, "eval_steps_per_second": 10.09, "num_input_tokens_seen": 14637984, "step": 7000 }, { "epoch": 1.1427522636430378, "grad_norm": 1.8828125, "learning_rate": 4.631189936288612e-05, "loss": 2.1908, "num_input_tokens_seen": 14648064, "step": 7005 }, { "epoch": 1.1435679908638552, "grad_norm": 0.83203125, "learning_rate": 4.630676547256944e-05, "loss": 1.7766, "num_input_tokens_seen": 14658048, "step": 7010 }, { "epoch": 1.1443837180846725, "grad_norm": 4.25, "learning_rate": 4.630162829648176e-05, "loss": 1.0917, "num_input_tokens_seen": 14668256, "step": 7015 }, { "epoch": 1.14519944530549, "grad_norm": 5.6875, "learning_rate": 4.629648783541531e-05, "loss": 1.7112, "num_input_tokens_seen": 14679280, "step": 7020 }, { "epoch": 1.1460151725263072, "grad_norm": 4.5, "learning_rate": 4.6291344090162804e-05, "loss": 2.1924, "num_input_tokens_seen": 14688992, "step": 7025 }, { "epoch": 1.1468308997471246, "grad_norm": 2.640625, "learning_rate": 4.628619706151748e-05, "loss": 2.4936, "num_input_tokens_seen": 14699328, "step": 7030 }, { "epoch": 1.147646626967942, "grad_norm": 1.3046875, "learning_rate": 4.628104675027306e-05, "loss": 2.275, "num_input_tokens_seen": 14711264, "step": 7035 }, { "epoch": 1.1484623541887593, "grad_norm": 8.5625, "learning_rate": 4.6275893157223805e-05, "loss": 2.5831, "num_input_tokens_seen": 14721504, "step": 7040 }, { "epoch": 1.1492780814095767, "grad_norm": 8.75, "learning_rate": 4.627073628316445e-05, "loss": 1.6562, "num_input_tokens_seen": 14730896, "step": 7045 }, { "epoch": 1.150093808630394, "grad_norm": 7.0, "learning_rate": 4.626557612889026e-05, "loss": 2.058, "num_input_tokens_seen": 14740448, "step": 7050 }, { "epoch": 1.1509095358512114, "grad_norm": 4.875, "learning_rate": 4.626041269519699e-05, "loss": 2.8781, "num_input_tokens_seen": 14751216, "step": 7055 }, { "epoch": 1.1517252630720287, "grad_norm": 1.984375, "learning_rate": 4.6255245982880905e-05, "loss": 2.3806, "num_input_tokens_seen": 14760720, "step": 7060 }, { "epoch": 1.152540990292846, "grad_norm": 11.1875, "learning_rate": 4.625007599273879e-05, "loss": 1.8402, "num_input_tokens_seen": 14772848, "step": 7065 }, { "epoch": 1.1533567175136634, "grad_norm": 4.375, "learning_rate": 4.6244902725567895e-05, "loss": 1.56, "num_input_tokens_seen": 14782528, "step": 7070 }, { "epoch": 1.1541724447344808, "grad_norm": 3.5, "learning_rate": 4.6239726182166024e-05, "loss": 1.9506, "num_input_tokens_seen": 14791968, "step": 7075 }, { "epoch": 1.1549881719552981, "grad_norm": 11.9375, "learning_rate": 4.623454636333147e-05, "loss": 1.659, "num_input_tokens_seen": 14801616, "step": 7080 }, { "epoch": 1.1558038991761155, "grad_norm": 5.625, "learning_rate": 4.622936326986301e-05, "loss": 2.0274, "num_input_tokens_seen": 14812272, "step": 7085 }, { "epoch": 1.1566196263969328, "grad_norm": 3.046875, "learning_rate": 4.6224176902559946e-05, "loss": 2.1539, "num_input_tokens_seen": 14823552, "step": 7090 }, { "epoch": 1.1574353536177502, "grad_norm": 9.5625, "learning_rate": 4.621898726222209e-05, "loss": 1.9231, "num_input_tokens_seen": 14833920, "step": 7095 }, { "epoch": 1.1582510808385675, "grad_norm": 4.6875, "learning_rate": 4.6213794349649744e-05, "loss": 1.0234, "num_input_tokens_seen": 14842816, "step": 7100 }, { "epoch": 1.159066808059385, "grad_norm": 3.53125, "learning_rate": 4.6208598165643715e-05, "loss": 2.2755, "num_input_tokens_seen": 14853104, "step": 7105 }, { "epoch": 1.1598825352802022, "grad_norm": 10.5625, "learning_rate": 4.620339871100533e-05, "loss": 1.922, "num_input_tokens_seen": 14862176, "step": 7110 }, { "epoch": 1.1606982625010196, "grad_norm": 7.625, "learning_rate": 4.6198195986536394e-05, "loss": 1.8391, "num_input_tokens_seen": 14872160, "step": 7115 }, { "epoch": 1.161513989721837, "grad_norm": 7.34375, "learning_rate": 4.619298999303926e-05, "loss": 1.513, "num_input_tokens_seen": 14882576, "step": 7120 }, { "epoch": 1.1623297169426543, "grad_norm": 4.0625, "learning_rate": 4.618778073131673e-05, "loss": 3.0759, "num_input_tokens_seen": 14893040, "step": 7125 }, { "epoch": 1.1631454441634717, "grad_norm": 6.5, "learning_rate": 4.618256820217215e-05, "loss": 2.3181, "num_input_tokens_seen": 14904336, "step": 7130 }, { "epoch": 1.163961171384289, "grad_norm": 6.9375, "learning_rate": 4.617735240640936e-05, "loss": 2.516, "num_input_tokens_seen": 14915840, "step": 7135 }, { "epoch": 1.1647768986051064, "grad_norm": 8.875, "learning_rate": 4.6172133344832705e-05, "loss": 2.8026, "num_input_tokens_seen": 14925776, "step": 7140 }, { "epoch": 1.1655926258259237, "grad_norm": 7.4375, "learning_rate": 4.6166911018247004e-05, "loss": 2.2109, "num_input_tokens_seen": 14936208, "step": 7145 }, { "epoch": 1.1664083530467413, "grad_norm": 4.53125, "learning_rate": 4.616168542745764e-05, "loss": 2.641, "num_input_tokens_seen": 14947456, "step": 7150 }, { "epoch": 1.1672240802675584, "grad_norm": 6.15625, "learning_rate": 4.6156456573270446e-05, "loss": 2.2243, "num_input_tokens_seen": 14956112, "step": 7155 }, { "epoch": 1.168039807488376, "grad_norm": 5.5, "learning_rate": 4.615122445649177e-05, "loss": 1.6195, "num_input_tokens_seen": 14965344, "step": 7160 }, { "epoch": 1.1688555347091931, "grad_norm": 7.84375, "learning_rate": 4.6145989077928486e-05, "loss": 1.9419, "num_input_tokens_seen": 14975888, "step": 7165 }, { "epoch": 1.1696712619300107, "grad_norm": 8.0625, "learning_rate": 4.6140750438387953e-05, "loss": 1.5907, "num_input_tokens_seen": 14986160, "step": 7170 }, { "epoch": 1.170486989150828, "grad_norm": 11.375, "learning_rate": 4.613550853867803e-05, "loss": 2.863, "num_input_tokens_seen": 14996544, "step": 7175 }, { "epoch": 1.1713027163716454, "grad_norm": 5.625, "learning_rate": 4.613026337960708e-05, "loss": 2.1381, "num_input_tokens_seen": 15007712, "step": 7180 }, { "epoch": 1.1721184435924628, "grad_norm": 7.5625, "learning_rate": 4.612501496198398e-05, "loss": 3.8136, "num_input_tokens_seen": 15017056, "step": 7185 }, { "epoch": 1.17293417081328, "grad_norm": 13.625, "learning_rate": 4.61197632866181e-05, "loss": 4.3741, "num_input_tokens_seen": 15028048, "step": 7190 }, { "epoch": 1.1737498980340975, "grad_norm": 12.75, "learning_rate": 4.611450835431931e-05, "loss": 1.6193, "num_input_tokens_seen": 15038096, "step": 7195 }, { "epoch": 1.1745656252549148, "grad_norm": 19.5, "learning_rate": 4.6109250165898e-05, "loss": 2.7862, "num_input_tokens_seen": 15049216, "step": 7200 }, { "epoch": 1.1745656252549148, "eval_loss": 2.54175066947937, "eval_runtime": 134.8401, "eval_samples_per_second": 20.209, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 15049216, "step": 7200 }, { "epoch": 1.1753813524757322, "grad_norm": 6.09375, "learning_rate": 4.610398872216503e-05, "loss": 2.2944, "num_input_tokens_seen": 15060080, "step": 7205 }, { "epoch": 1.1761970796965495, "grad_norm": 9.0625, "learning_rate": 4.6098724023931796e-05, "loss": 2.0494, "num_input_tokens_seen": 15071408, "step": 7210 }, { "epoch": 1.1770128069173669, "grad_norm": 9.625, "learning_rate": 4.609345607201017e-05, "loss": 2.5433, "num_input_tokens_seen": 15080496, "step": 7215 }, { "epoch": 1.1778285341381842, "grad_norm": 0.142578125, "learning_rate": 4.608818486721254e-05, "loss": 1.5975, "num_input_tokens_seen": 15090592, "step": 7220 }, { "epoch": 1.1786442613590016, "grad_norm": 4.34375, "learning_rate": 4.608291041035179e-05, "loss": 1.0553, "num_input_tokens_seen": 15100832, "step": 7225 }, { "epoch": 1.179459988579819, "grad_norm": 7.53125, "learning_rate": 4.607763270224132e-05, "loss": 2.1474, "num_input_tokens_seen": 15111104, "step": 7230 }, { "epoch": 1.1802757158006363, "grad_norm": 6.46875, "learning_rate": 4.6072351743695e-05, "loss": 2.0628, "num_input_tokens_seen": 15121152, "step": 7235 }, { "epoch": 1.1810914430214536, "grad_norm": 11.25, "learning_rate": 4.606706753552723e-05, "loss": 3.0488, "num_input_tokens_seen": 15132800, "step": 7240 }, { "epoch": 1.181907170242271, "grad_norm": 7.375, "learning_rate": 4.6061780078552906e-05, "loss": 2.26, "num_input_tokens_seen": 15143632, "step": 7245 }, { "epoch": 1.1827228974630883, "grad_norm": 4.53125, "learning_rate": 4.605648937358742e-05, "loss": 1.4248, "num_input_tokens_seen": 15154688, "step": 7250 }, { "epoch": 1.1835386246839057, "grad_norm": 10.8125, "learning_rate": 4.605119542144665e-05, "loss": 3.4472, "num_input_tokens_seen": 15165136, "step": 7255 }, { "epoch": 1.184354351904723, "grad_norm": 7.0625, "learning_rate": 4.604589822294701e-05, "loss": 1.8409, "num_input_tokens_seen": 15173968, "step": 7260 }, { "epoch": 1.1851700791255404, "grad_norm": 11.125, "learning_rate": 4.604059777890537e-05, "loss": 2.4529, "num_input_tokens_seen": 15184720, "step": 7265 }, { "epoch": 1.1859858063463578, "grad_norm": 9.0625, "learning_rate": 4.6035294090139145e-05, "loss": 2.0046, "num_input_tokens_seen": 15195024, "step": 7270 }, { "epoch": 1.186801533567175, "grad_norm": 8.875, "learning_rate": 4.6029987157466226e-05, "loss": 3.3057, "num_input_tokens_seen": 15206256, "step": 7275 }, { "epoch": 1.1876172607879925, "grad_norm": 7.125, "learning_rate": 4.602467698170502e-05, "loss": 2.1189, "num_input_tokens_seen": 15218832, "step": 7280 }, { "epoch": 1.1884329880088098, "grad_norm": 7.8125, "learning_rate": 4.601936356367439e-05, "loss": 2.8011, "num_input_tokens_seen": 15229536, "step": 7285 }, { "epoch": 1.1892487152296272, "grad_norm": 8.0, "learning_rate": 4.601404690419377e-05, "loss": 2.4708, "num_input_tokens_seen": 15240512, "step": 7290 }, { "epoch": 1.1900644424504445, "grad_norm": 6.5625, "learning_rate": 4.600872700408303e-05, "loss": 2.636, "num_input_tokens_seen": 15250080, "step": 7295 }, { "epoch": 1.1908801696712619, "grad_norm": 6.9375, "learning_rate": 4.600340386416258e-05, "loss": 3.0933, "num_input_tokens_seen": 15260704, "step": 7300 }, { "epoch": 1.1916958968920792, "grad_norm": 7.15625, "learning_rate": 4.5998077485253296e-05, "loss": 2.0272, "num_input_tokens_seen": 15269504, "step": 7305 }, { "epoch": 1.1925116241128966, "grad_norm": 4.78125, "learning_rate": 4.59927478681766e-05, "loss": 2.2788, "num_input_tokens_seen": 15280064, "step": 7310 }, { "epoch": 1.193327351333714, "grad_norm": 8.25, "learning_rate": 4.5987415013754366e-05, "loss": 2.0007, "num_input_tokens_seen": 15291456, "step": 7315 }, { "epoch": 1.1941430785545313, "grad_norm": 4.125, "learning_rate": 4.598207892280899e-05, "loss": 2.2253, "num_input_tokens_seen": 15303968, "step": 7320 }, { "epoch": 1.1949588057753486, "grad_norm": 4.65625, "learning_rate": 4.597673959616337e-05, "loss": 1.6033, "num_input_tokens_seen": 15313760, "step": 7325 }, { "epoch": 1.195774532996166, "grad_norm": 4.15625, "learning_rate": 4.597139703464089e-05, "loss": 1.1099, "num_input_tokens_seen": 15324192, "step": 7330 }, { "epoch": 1.1965902602169836, "grad_norm": 3.515625, "learning_rate": 4.596605123906545e-05, "loss": 1.6314, "num_input_tokens_seen": 15335456, "step": 7335 }, { "epoch": 1.1974059874378007, "grad_norm": 3.46875, "learning_rate": 4.596070221026143e-05, "loss": 1.4956, "num_input_tokens_seen": 15345600, "step": 7340 }, { "epoch": 1.1982217146586183, "grad_norm": 7.03125, "learning_rate": 4.595534994905372e-05, "loss": 2.2547, "num_input_tokens_seen": 15356544, "step": 7345 }, { "epoch": 1.1990374418794354, "grad_norm": 8.4375, "learning_rate": 4.594999445626771e-05, "loss": 1.4879, "num_input_tokens_seen": 15367232, "step": 7350 }, { "epoch": 1.199853169100253, "grad_norm": 6.5625, "learning_rate": 4.5944635732729276e-05, "loss": 3.7935, "num_input_tokens_seen": 15377168, "step": 7355 }, { "epoch": 1.2006688963210703, "grad_norm": 12.5625, "learning_rate": 4.5939273779264804e-05, "loss": 2.3969, "num_input_tokens_seen": 15387088, "step": 7360 }, { "epoch": 1.2014846235418877, "grad_norm": 7.25, "learning_rate": 4.593390859670118e-05, "loss": 2.2434, "num_input_tokens_seen": 15397584, "step": 7365 }, { "epoch": 1.202300350762705, "grad_norm": 14.9375, "learning_rate": 4.5928540185865776e-05, "loss": 2.43, "num_input_tokens_seen": 15407264, "step": 7370 }, { "epoch": 1.2031160779835224, "grad_norm": 10.625, "learning_rate": 4.592316854758648e-05, "loss": 3.6633, "num_input_tokens_seen": 15418080, "step": 7375 }, { "epoch": 1.2039318052043397, "grad_norm": 6.15625, "learning_rate": 4.5917793682691646e-05, "loss": 3.0852, "num_input_tokens_seen": 15428464, "step": 7380 }, { "epoch": 1.204747532425157, "grad_norm": 3.0625, "learning_rate": 4.5912415592010164e-05, "loss": 2.6455, "num_input_tokens_seen": 15438320, "step": 7385 }, { "epoch": 1.2055632596459744, "grad_norm": 16.25, "learning_rate": 4.5907034276371386e-05, "loss": 2.7999, "num_input_tokens_seen": 15449360, "step": 7390 }, { "epoch": 1.2063789868667918, "grad_norm": 7.1875, "learning_rate": 4.5901649736605196e-05, "loss": 1.8303, "num_input_tokens_seen": 15460496, "step": 7395 }, { "epoch": 1.2071947140876091, "grad_norm": 8.3125, "learning_rate": 4.589626197354195e-05, "loss": 2.3596, "num_input_tokens_seen": 15471984, "step": 7400 }, { "epoch": 1.2071947140876091, "eval_loss": 2.558281660079956, "eval_runtime": 135.0649, "eval_samples_per_second": 20.175, "eval_steps_per_second": 10.091, "num_input_tokens_seen": 15471984, "step": 7400 }, { "epoch": 1.2080104413084265, "grad_norm": 1.59375, "learning_rate": 4.5890870988012504e-05, "loss": 2.091, "num_input_tokens_seen": 15481472, "step": 7405 }, { "epoch": 1.2088261685292438, "grad_norm": 6.25, "learning_rate": 4.5885476780848226e-05, "loss": 2.1034, "num_input_tokens_seen": 15492544, "step": 7410 }, { "epoch": 1.2096418957500612, "grad_norm": 11.9375, "learning_rate": 4.5880079352880964e-05, "loss": 1.8024, "num_input_tokens_seen": 15503408, "step": 7415 }, { "epoch": 1.2104576229708786, "grad_norm": 4.15625, "learning_rate": 4.5874678704943065e-05, "loss": 2.0026, "num_input_tokens_seen": 15514256, "step": 7420 }, { "epoch": 1.211273350191696, "grad_norm": 9.5625, "learning_rate": 4.5869274837867394e-05, "loss": 2.8967, "num_input_tokens_seen": 15525456, "step": 7425 }, { "epoch": 1.2120890774125133, "grad_norm": 8.3125, "learning_rate": 4.5863867752487275e-05, "loss": 3.706, "num_input_tokens_seen": 15535600, "step": 7430 }, { "epoch": 1.2129048046333306, "grad_norm": 5.625, "learning_rate": 4.5858457449636554e-05, "loss": 2.11, "num_input_tokens_seen": 15546976, "step": 7435 }, { "epoch": 1.213720531854148, "grad_norm": 12.625, "learning_rate": 4.5853043930149574e-05, "loss": 2.3336, "num_input_tokens_seen": 15556928, "step": 7440 }, { "epoch": 1.2145362590749653, "grad_norm": 6.78125, "learning_rate": 4.584762719486117e-05, "loss": 1.7933, "num_input_tokens_seen": 15568192, "step": 7445 }, { "epoch": 1.2153519862957827, "grad_norm": 5.09375, "learning_rate": 4.584220724460665e-05, "loss": 2.4251, "num_input_tokens_seen": 15579360, "step": 7450 }, { "epoch": 1.2161677135166, "grad_norm": 7.9375, "learning_rate": 4.5836784080221865e-05, "loss": 3.3301, "num_input_tokens_seen": 15589680, "step": 7455 }, { "epoch": 1.2169834407374174, "grad_norm": 6.03125, "learning_rate": 4.583135770254312e-05, "loss": 2.9671, "num_input_tokens_seen": 15600704, "step": 7460 }, { "epoch": 1.2177991679582347, "grad_norm": 5.125, "learning_rate": 4.5825928112407236e-05, "loss": 2.569, "num_input_tokens_seen": 15611024, "step": 7465 }, { "epoch": 1.218614895179052, "grad_norm": 2.90625, "learning_rate": 4.582049531065152e-05, "loss": 2.7915, "num_input_tokens_seen": 15621232, "step": 7470 }, { "epoch": 1.2194306223998694, "grad_norm": 2.71875, "learning_rate": 4.5815059298113783e-05, "loss": 1.4746, "num_input_tokens_seen": 15630976, "step": 7475 }, { "epoch": 1.2202463496206868, "grad_norm": 6.875, "learning_rate": 4.580962007563232e-05, "loss": 2.3339, "num_input_tokens_seen": 15640880, "step": 7480 }, { "epoch": 1.2210620768415041, "grad_norm": 2.265625, "learning_rate": 4.5804177644045935e-05, "loss": 2.695, "num_input_tokens_seen": 15651040, "step": 7485 }, { "epoch": 1.2218778040623215, "grad_norm": 4.21875, "learning_rate": 4.579873200419391e-05, "loss": 2.2661, "num_input_tokens_seen": 15661248, "step": 7490 }, { "epoch": 1.2226935312831388, "grad_norm": 4.90625, "learning_rate": 4.5793283156916046e-05, "loss": 3.0956, "num_input_tokens_seen": 15670560, "step": 7495 }, { "epoch": 1.2235092585039562, "grad_norm": 6.9375, "learning_rate": 4.578783110305261e-05, "loss": 2.3361, "num_input_tokens_seen": 15681296, "step": 7500 }, { "epoch": 1.2243249857247736, "grad_norm": 8.25, "learning_rate": 4.578237584344438e-05, "loss": 3.1864, "num_input_tokens_seen": 15692880, "step": 7505 }, { "epoch": 1.225140712945591, "grad_norm": 2.328125, "learning_rate": 4.577691737893263e-05, "loss": 2.6673, "num_input_tokens_seen": 15703168, "step": 7510 }, { "epoch": 1.2259564401664083, "grad_norm": 8.0, "learning_rate": 4.577145571035912e-05, "loss": 2.9582, "num_input_tokens_seen": 15713568, "step": 7515 }, { "epoch": 1.2267721673872258, "grad_norm": 5.25, "learning_rate": 4.576599083856611e-05, "loss": 2.0738, "num_input_tokens_seen": 15723872, "step": 7520 }, { "epoch": 1.227587894608043, "grad_norm": 7.03125, "learning_rate": 4.576052276439635e-05, "loss": 2.1149, "num_input_tokens_seen": 15733472, "step": 7525 }, { "epoch": 1.2284036218288605, "grad_norm": 2.828125, "learning_rate": 4.575505148869308e-05, "loss": 3.2454, "num_input_tokens_seen": 15743920, "step": 7530 }, { "epoch": 1.2292193490496777, "grad_norm": 9.625, "learning_rate": 4.574957701230006e-05, "loss": 3.307, "num_input_tokens_seen": 15753648, "step": 7535 }, { "epoch": 1.2300350762704952, "grad_norm": 4.59375, "learning_rate": 4.57440993360615e-05, "loss": 1.7898, "num_input_tokens_seen": 15764160, "step": 7540 }, { "epoch": 1.2308508034913126, "grad_norm": 11.5625, "learning_rate": 4.5738618460822134e-05, "loss": 2.3753, "num_input_tokens_seen": 15774512, "step": 7545 }, { "epoch": 1.23166653071213, "grad_norm": 9.0625, "learning_rate": 4.573313438742719e-05, "loss": 2.4045, "num_input_tokens_seen": 15784944, "step": 7550 }, { "epoch": 1.2324822579329473, "grad_norm": 6.1875, "learning_rate": 4.5727647116722374e-05, "loss": 2.7328, "num_input_tokens_seen": 15795776, "step": 7555 }, { "epoch": 1.2332979851537647, "grad_norm": 8.25, "learning_rate": 4.5722156649553884e-05, "loss": 2.9951, "num_input_tokens_seen": 15806304, "step": 7560 }, { "epoch": 1.234113712374582, "grad_norm": 4.9375, "learning_rate": 4.571666298676843e-05, "loss": 2.2625, "num_input_tokens_seen": 15817376, "step": 7565 }, { "epoch": 1.2349294395953994, "grad_norm": 13.25, "learning_rate": 4.571116612921321e-05, "loss": 2.7347, "num_input_tokens_seen": 15827072, "step": 7570 }, { "epoch": 1.2357451668162167, "grad_norm": 2.3125, "learning_rate": 4.57056660777359e-05, "loss": 1.1264, "num_input_tokens_seen": 15839040, "step": 7575 }, { "epoch": 1.236560894037034, "grad_norm": 8.125, "learning_rate": 4.5700162833184666e-05, "loss": 2.3227, "num_input_tokens_seen": 15849296, "step": 7580 }, { "epoch": 1.2373766212578514, "grad_norm": 3.953125, "learning_rate": 4.5694656396408195e-05, "loss": 1.3727, "num_input_tokens_seen": 15859424, "step": 7585 }, { "epoch": 1.2381923484786688, "grad_norm": 7.5, "learning_rate": 4.5689146768255646e-05, "loss": 2.17, "num_input_tokens_seen": 15870864, "step": 7590 }, { "epoch": 1.2390080756994861, "grad_norm": 8.75, "learning_rate": 4.568363394957667e-05, "loss": 2.8341, "num_input_tokens_seen": 15881056, "step": 7595 }, { "epoch": 1.2398238029203035, "grad_norm": 9.0625, "learning_rate": 4.567811794122141e-05, "loss": 2.6311, "num_input_tokens_seen": 15891152, "step": 7600 }, { "epoch": 1.2398238029203035, "eval_loss": 2.5384037494659424, "eval_runtime": 134.8844, "eval_samples_per_second": 20.202, "eval_steps_per_second": 10.105, "num_input_tokens_seen": 15891152, "step": 7600 }, { "epoch": 1.2406395301411208, "grad_norm": 10.4375, "learning_rate": 4.56725987440405e-05, "loss": 2.9682, "num_input_tokens_seen": 15901296, "step": 7605 }, { "epoch": 1.2414552573619382, "grad_norm": 5.625, "learning_rate": 4.566707635888508e-05, "loss": 1.8171, "num_input_tokens_seen": 15911280, "step": 7610 }, { "epoch": 1.2422709845827555, "grad_norm": 6.53125, "learning_rate": 4.566155078660677e-05, "loss": 2.4598, "num_input_tokens_seen": 15920128, "step": 7615 }, { "epoch": 1.2430867118035729, "grad_norm": 11.375, "learning_rate": 4.565602202805768e-05, "loss": 2.0089, "num_input_tokens_seen": 15930768, "step": 7620 }, { "epoch": 1.2439024390243902, "grad_norm": 6.34375, "learning_rate": 4.56504900840904e-05, "loss": 1.2776, "num_input_tokens_seen": 15940592, "step": 7625 }, { "epoch": 1.2447181662452076, "grad_norm": 4.3125, "learning_rate": 4.564495495555805e-05, "loss": 1.3889, "num_input_tokens_seen": 15950160, "step": 7630 }, { "epoch": 1.245533893466025, "grad_norm": 4.75, "learning_rate": 4.5639416643314204e-05, "loss": 1.2862, "num_input_tokens_seen": 15960880, "step": 7635 }, { "epoch": 1.2463496206868423, "grad_norm": 7.3125, "learning_rate": 4.5633875148212946e-05, "loss": 1.912, "num_input_tokens_seen": 15972064, "step": 7640 }, { "epoch": 1.2471653479076596, "grad_norm": 5.6875, "learning_rate": 4.562833047110883e-05, "loss": 2.5649, "num_input_tokens_seen": 15983184, "step": 7645 }, { "epoch": 1.247981075128477, "grad_norm": 5.59375, "learning_rate": 4.5622782612856923e-05, "loss": 2.2973, "num_input_tokens_seen": 15994624, "step": 7650 }, { "epoch": 1.2487968023492944, "grad_norm": 1.7890625, "learning_rate": 4.561723157431278e-05, "loss": 1.5844, "num_input_tokens_seen": 16004336, "step": 7655 }, { "epoch": 1.2496125295701117, "grad_norm": 8.25, "learning_rate": 4.5611677356332435e-05, "loss": 2.2688, "num_input_tokens_seen": 16014400, "step": 7660 }, { "epoch": 1.250428256790929, "grad_norm": 10.4375, "learning_rate": 4.560611995977242e-05, "loss": 2.6278, "num_input_tokens_seen": 16024720, "step": 7665 }, { "epoch": 1.2512439840117464, "grad_norm": 8.5, "learning_rate": 4.560055938548975e-05, "loss": 2.6172, "num_input_tokens_seen": 16036624, "step": 7670 }, { "epoch": 1.2520597112325638, "grad_norm": 11.625, "learning_rate": 4.5594995634341944e-05, "loss": 2.5714, "num_input_tokens_seen": 16046240, "step": 7675 }, { "epoch": 1.2528754384533811, "grad_norm": 9.5, "learning_rate": 4.5589428707187e-05, "loss": 2.9079, "num_input_tokens_seen": 16057232, "step": 7680 }, { "epoch": 1.2536911656741985, "grad_norm": 10.375, "learning_rate": 4.55838586048834e-05, "loss": 1.9369, "num_input_tokens_seen": 16067296, "step": 7685 }, { "epoch": 1.2545068928950158, "grad_norm": 16.875, "learning_rate": 4.557828532829013e-05, "loss": 3.1759, "num_input_tokens_seen": 16079312, "step": 7690 }, { "epoch": 1.2553226201158334, "grad_norm": 0.265625, "learning_rate": 4.557270887826667e-05, "loss": 2.2967, "num_input_tokens_seen": 16089776, "step": 7695 }, { "epoch": 1.2561383473366505, "grad_norm": 11.375, "learning_rate": 4.556712925567296e-05, "loss": 2.3469, "num_input_tokens_seen": 16100544, "step": 7700 }, { "epoch": 1.256954074557468, "grad_norm": 5.21875, "learning_rate": 4.5561546461369454e-05, "loss": 2.4295, "num_input_tokens_seen": 16111424, "step": 7705 }, { "epoch": 1.2577698017782852, "grad_norm": 10.5625, "learning_rate": 4.55559604962171e-05, "loss": 3.0378, "num_input_tokens_seen": 16121056, "step": 7710 }, { "epoch": 1.2585855289991028, "grad_norm": 1.9609375, "learning_rate": 4.55503713610773e-05, "loss": 2.651, "num_input_tokens_seen": 16130304, "step": 7715 }, { "epoch": 1.25940125621992, "grad_norm": 17.25, "learning_rate": 4.5544779056812e-05, "loss": 2.5304, "num_input_tokens_seen": 16141936, "step": 7720 }, { "epoch": 1.2602169834407375, "grad_norm": 10.875, "learning_rate": 4.553918358428358e-05, "loss": 2.5715, "num_input_tokens_seen": 16151664, "step": 7725 }, { "epoch": 1.2610327106615546, "grad_norm": 7.0625, "learning_rate": 4.553358494435494e-05, "loss": 2.8047, "num_input_tokens_seen": 16161936, "step": 7730 }, { "epoch": 1.2618484378823722, "grad_norm": 8.9375, "learning_rate": 4.5527983137889464e-05, "loss": 2.9944, "num_input_tokens_seen": 16172944, "step": 7735 }, { "epoch": 1.2626641651031896, "grad_norm": 2.5625, "learning_rate": 4.5522378165751015e-05, "loss": 1.7051, "num_input_tokens_seen": 16183136, "step": 7740 }, { "epoch": 1.263479892324007, "grad_norm": 5.03125, "learning_rate": 4.5516770028803954e-05, "loss": 2.0574, "num_input_tokens_seen": 16194592, "step": 7745 }, { "epoch": 1.2642956195448243, "grad_norm": 5.0, "learning_rate": 4.5511158727913116e-05, "loss": 3.056, "num_input_tokens_seen": 16203296, "step": 7750 }, { "epoch": 1.2651113467656416, "grad_norm": 10.3125, "learning_rate": 4.5505544263943856e-05, "loss": 2.6448, "num_input_tokens_seen": 16214096, "step": 7755 }, { "epoch": 1.265927073986459, "grad_norm": 3.5625, "learning_rate": 4.549992663776197e-05, "loss": 1.4155, "num_input_tokens_seen": 16224400, "step": 7760 }, { "epoch": 1.2667428012072763, "grad_norm": 7.9375, "learning_rate": 4.5494305850233786e-05, "loss": 2.6791, "num_input_tokens_seen": 16235152, "step": 7765 }, { "epoch": 1.2675585284280937, "grad_norm": 5.75, "learning_rate": 4.5488681902226094e-05, "loss": 2.8537, "num_input_tokens_seen": 16245552, "step": 7770 }, { "epoch": 1.268374255648911, "grad_norm": 6.5625, "learning_rate": 4.5483054794606174e-05, "loss": 2.5883, "num_input_tokens_seen": 16258272, "step": 7775 }, { "epoch": 1.2691899828697284, "grad_norm": 7.21875, "learning_rate": 4.547742452824179e-05, "loss": 1.6048, "num_input_tokens_seen": 16268096, "step": 7780 }, { "epoch": 1.2700057100905457, "grad_norm": 1.796875, "learning_rate": 4.5471791104001215e-05, "loss": 2.0426, "num_input_tokens_seen": 16278752, "step": 7785 }, { "epoch": 1.270821437311363, "grad_norm": 11.1875, "learning_rate": 4.546615452275319e-05, "loss": 2.8123, "num_input_tokens_seen": 16289328, "step": 7790 }, { "epoch": 1.2716371645321805, "grad_norm": 6.5625, "learning_rate": 4.5460514785366944e-05, "loss": 2.1361, "num_input_tokens_seen": 16299248, "step": 7795 }, { "epoch": 1.2724528917529978, "grad_norm": 6.375, "learning_rate": 4.545487189271219e-05, "loss": 1.9245, "num_input_tokens_seen": 16309376, "step": 7800 }, { "epoch": 1.2724528917529978, "eval_loss": 2.5406174659729004, "eval_runtime": 134.9886, "eval_samples_per_second": 20.187, "eval_steps_per_second": 10.097, "num_input_tokens_seen": 16309376, "step": 7800 }, { "epoch": 1.2732686189738152, "grad_norm": 5.625, "learning_rate": 4.544922584565914e-05, "loss": 2.609, "num_input_tokens_seen": 16319664, "step": 7805 }, { "epoch": 1.2740843461946325, "grad_norm": 8.5, "learning_rate": 4.544357664507848e-05, "loss": 1.9248, "num_input_tokens_seen": 16329776, "step": 7810 }, { "epoch": 1.2749000734154499, "grad_norm": 7.03125, "learning_rate": 4.54379242918414e-05, "loss": 3.1428, "num_input_tokens_seen": 16339520, "step": 7815 }, { "epoch": 1.2757158006362672, "grad_norm": 5.125, "learning_rate": 4.543226878681955e-05, "loss": 2.4886, "num_input_tokens_seen": 16351040, "step": 7820 }, { "epoch": 1.2765315278570846, "grad_norm": 1.8671875, "learning_rate": 4.5426610130885087e-05, "loss": 1.0436, "num_input_tokens_seen": 16360848, "step": 7825 }, { "epoch": 1.277347255077902, "grad_norm": 6.28125, "learning_rate": 4.542094832491064e-05, "loss": 3.4212, "num_input_tokens_seen": 16372256, "step": 7830 }, { "epoch": 1.2781629822987193, "grad_norm": 7.875, "learning_rate": 4.541528336976934e-05, "loss": 2.5651, "num_input_tokens_seen": 16384016, "step": 7835 }, { "epoch": 1.2789787095195366, "grad_norm": 6.0625, "learning_rate": 4.540961526633479e-05, "loss": 2.6668, "num_input_tokens_seen": 16395104, "step": 7840 }, { "epoch": 1.279794436740354, "grad_norm": 5.5625, "learning_rate": 4.540394401548108e-05, "loss": 2.9675, "num_input_tokens_seen": 16405280, "step": 7845 }, { "epoch": 1.2806101639611713, "grad_norm": 10.0625, "learning_rate": 4.539826961808279e-05, "loss": 1.8778, "num_input_tokens_seen": 16415264, "step": 7850 }, { "epoch": 1.2814258911819887, "grad_norm": 3.15625, "learning_rate": 4.5392592075014994e-05, "loss": 1.9129, "num_input_tokens_seen": 16424928, "step": 7855 }, { "epoch": 1.282241618402806, "grad_norm": 2.25, "learning_rate": 4.538691138715322e-05, "loss": 1.4572, "num_input_tokens_seen": 16434864, "step": 7860 }, { "epoch": 1.2830573456236234, "grad_norm": 10.5625, "learning_rate": 4.5381227555373516e-05, "loss": 2.9183, "num_input_tokens_seen": 16445584, "step": 7865 }, { "epoch": 1.2838730728444407, "grad_norm": 2.984375, "learning_rate": 4.537554058055239e-05, "loss": 1.5132, "num_input_tokens_seen": 16455440, "step": 7870 }, { "epoch": 1.284688800065258, "grad_norm": 8.1875, "learning_rate": 4.5369850463566865e-05, "loss": 2.5803, "num_input_tokens_seen": 16465024, "step": 7875 }, { "epoch": 1.2855045272860757, "grad_norm": 8.375, "learning_rate": 4.5364157205294404e-05, "loss": 1.9098, "num_input_tokens_seen": 16476128, "step": 7880 }, { "epoch": 1.2863202545068928, "grad_norm": 3.625, "learning_rate": 4.5358460806612996e-05, "loss": 2.3376, "num_input_tokens_seen": 16486704, "step": 7885 }, { "epoch": 1.2871359817277104, "grad_norm": 3.109375, "learning_rate": 4.535276126840109e-05, "loss": 2.509, "num_input_tokens_seen": 16497984, "step": 7890 }, { "epoch": 1.2879517089485275, "grad_norm": 2.953125, "learning_rate": 4.5347058591537626e-05, "loss": 2.7248, "num_input_tokens_seen": 16508848, "step": 7895 }, { "epoch": 1.288767436169345, "grad_norm": 7.34375, "learning_rate": 4.534135277690203e-05, "loss": 2.1849, "num_input_tokens_seen": 16519008, "step": 7900 }, { "epoch": 1.2895831633901622, "grad_norm": 6.53125, "learning_rate": 4.533564382537421e-05, "loss": 2.814, "num_input_tokens_seen": 16529120, "step": 7905 }, { "epoch": 1.2903988906109798, "grad_norm": 6.6875, "learning_rate": 4.532993173783456e-05, "loss": 2.3283, "num_input_tokens_seen": 16540032, "step": 7910 }, { "epoch": 1.291214617831797, "grad_norm": 4.75, "learning_rate": 4.5324216515163954e-05, "loss": 2.3421, "num_input_tokens_seen": 16551664, "step": 7915 }, { "epoch": 1.2920303450526145, "grad_norm": 6.34375, "learning_rate": 4.531849815824375e-05, "loss": 2.5789, "num_input_tokens_seen": 16561856, "step": 7920 }, { "epoch": 1.2928460722734318, "grad_norm": 6.28125, "learning_rate": 4.5312776667955795e-05, "loss": 2.5213, "num_input_tokens_seen": 16572688, "step": 7925 }, { "epoch": 1.2936617994942492, "grad_norm": 2.96875, "learning_rate": 4.5307052045182405e-05, "loss": 2.3967, "num_input_tokens_seen": 16583024, "step": 7930 }, { "epoch": 1.2944775267150666, "grad_norm": 11.0625, "learning_rate": 4.53013242908064e-05, "loss": 2.7529, "num_input_tokens_seen": 16593936, "step": 7935 }, { "epoch": 1.295293253935884, "grad_norm": 8.1875, "learning_rate": 4.529559340571107e-05, "loss": 1.5142, "num_input_tokens_seen": 16604016, "step": 7940 }, { "epoch": 1.2961089811567013, "grad_norm": 6.1875, "learning_rate": 4.528985939078018e-05, "loss": 1.5558, "num_input_tokens_seen": 16613744, "step": 7945 }, { "epoch": 1.2969247083775186, "grad_norm": 8.125, "learning_rate": 4.5284122246898e-05, "loss": 3.0316, "num_input_tokens_seen": 16625168, "step": 7950 }, { "epoch": 1.297740435598336, "grad_norm": 7.625, "learning_rate": 4.527838197494926e-05, "loss": 2.9055, "num_input_tokens_seen": 16635664, "step": 7955 }, { "epoch": 1.2985561628191533, "grad_norm": 8.625, "learning_rate": 4.527263857581918e-05, "loss": 1.9755, "num_input_tokens_seen": 16646736, "step": 7960 }, { "epoch": 1.2993718900399707, "grad_norm": 4.9375, "learning_rate": 4.526689205039347e-05, "loss": 3.1641, "num_input_tokens_seen": 16658432, "step": 7965 }, { "epoch": 1.300187617260788, "grad_norm": 4.96875, "learning_rate": 4.5261142399558324e-05, "loss": 1.9703, "num_input_tokens_seen": 16668608, "step": 7970 }, { "epoch": 1.3010033444816054, "grad_norm": 10.1875, "learning_rate": 4.525538962420041e-05, "loss": 2.7511, "num_input_tokens_seen": 16678224, "step": 7975 }, { "epoch": 1.3018190717024227, "grad_norm": 8.4375, "learning_rate": 4.524963372520685e-05, "loss": 2.2828, "num_input_tokens_seen": 16688960, "step": 7980 }, { "epoch": 1.30263479892324, "grad_norm": 3.84375, "learning_rate": 4.524387470346531e-05, "loss": 1.4005, "num_input_tokens_seen": 16699552, "step": 7985 }, { "epoch": 1.3034505261440574, "grad_norm": 3.421875, "learning_rate": 4.5238112559863885e-05, "loss": 2.1994, "num_input_tokens_seen": 16710720, "step": 7990 }, { "epoch": 1.3042662533648748, "grad_norm": 8.25, "learning_rate": 4.5232347295291175e-05, "loss": 1.922, "num_input_tokens_seen": 16719968, "step": 7995 }, { "epoch": 1.3050819805856921, "grad_norm": 6.5, "learning_rate": 4.522657891063626e-05, "loss": 1.3522, "num_input_tokens_seen": 16729632, "step": 8000 }, { "epoch": 1.3050819805856921, "eval_loss": 2.5443472862243652, "eval_runtime": 134.6824, "eval_samples_per_second": 20.233, "eval_steps_per_second": 10.12, "num_input_tokens_seen": 16729632, "step": 8000 }, { "epoch": 1.3058977078065095, "grad_norm": 8.75, "learning_rate": 4.52208074067887e-05, "loss": 2.08, "num_input_tokens_seen": 16740000, "step": 8005 }, { "epoch": 1.3067134350273268, "grad_norm": 11.9375, "learning_rate": 4.5215032784638516e-05, "loss": 4.4784, "num_input_tokens_seen": 16748928, "step": 8010 }, { "epoch": 1.3075291622481442, "grad_norm": 9.3125, "learning_rate": 4.5209255045076245e-05, "loss": 2.2037, "num_input_tokens_seen": 16759728, "step": 8015 }, { "epoch": 1.3083448894689615, "grad_norm": 7.125, "learning_rate": 4.5203474188992875e-05, "loss": 1.6067, "num_input_tokens_seen": 16770672, "step": 8020 }, { "epoch": 1.309160616689779, "grad_norm": 8.4375, "learning_rate": 4.51976902172799e-05, "loss": 2.3676, "num_input_tokens_seen": 16781008, "step": 8025 }, { "epoch": 1.3099763439105963, "grad_norm": 5.59375, "learning_rate": 4.519190313082927e-05, "loss": 1.862, "num_input_tokens_seen": 16791008, "step": 8030 }, { "epoch": 1.3107920711314136, "grad_norm": 5.90625, "learning_rate": 4.518611293053343e-05, "loss": 1.9986, "num_input_tokens_seen": 16800896, "step": 8035 }, { "epoch": 1.311607798352231, "grad_norm": 2.171875, "learning_rate": 4.51803196172853e-05, "loss": 3.0175, "num_input_tokens_seen": 16809072, "step": 8040 }, { "epoch": 1.3124235255730483, "grad_norm": 2.828125, "learning_rate": 4.517452319197828e-05, "loss": 3.0101, "num_input_tokens_seen": 16818976, "step": 8045 }, { "epoch": 1.3132392527938657, "grad_norm": 5.90625, "learning_rate": 4.5168723655506265e-05, "loss": 2.6953, "num_input_tokens_seen": 16828176, "step": 8050 }, { "epoch": 1.314054980014683, "grad_norm": 6.09375, "learning_rate": 4.51629210087636e-05, "loss": 2.3099, "num_input_tokens_seen": 16837680, "step": 8055 }, { "epoch": 1.3148707072355004, "grad_norm": 8.375, "learning_rate": 4.515711525264513e-05, "loss": 3.1837, "num_input_tokens_seen": 16847344, "step": 8060 }, { "epoch": 1.315686434456318, "grad_norm": 4.75, "learning_rate": 4.5151306388046175e-05, "loss": 1.6865, "num_input_tokens_seen": 16858832, "step": 8065 }, { "epoch": 1.316502161677135, "grad_norm": 6.15625, "learning_rate": 4.514549441586255e-05, "loss": 2.0347, "num_input_tokens_seen": 16868672, "step": 8070 }, { "epoch": 1.3173178888979526, "grad_norm": 12.0625, "learning_rate": 4.513967933699051e-05, "loss": 3.2597, "num_input_tokens_seen": 16879392, "step": 8075 }, { "epoch": 1.3181336161187698, "grad_norm": 1.96875, "learning_rate": 4.513386115232684e-05, "loss": 1.7723, "num_input_tokens_seen": 16889568, "step": 8080 }, { "epoch": 1.3189493433395874, "grad_norm": 15.0625, "learning_rate": 4.5128039862768745e-05, "loss": 3.5898, "num_input_tokens_seen": 16899392, "step": 8085 }, { "epoch": 1.3197650705604045, "grad_norm": 3.3125, "learning_rate": 4.512221546921397e-05, "loss": 2.6839, "num_input_tokens_seen": 16910448, "step": 8090 }, { "epoch": 1.320580797781222, "grad_norm": 3.046875, "learning_rate": 4.5116387972560694e-05, "loss": 1.4084, "num_input_tokens_seen": 16919808, "step": 8095 }, { "epoch": 1.3213965250020392, "grad_norm": 3.53125, "learning_rate": 4.511055737370759e-05, "loss": 1.3778, "num_input_tokens_seen": 16930288, "step": 8100 }, { "epoch": 1.3222122522228568, "grad_norm": 3.390625, "learning_rate": 4.510472367355383e-05, "loss": 1.8917, "num_input_tokens_seen": 16941200, "step": 8105 }, { "epoch": 1.3230279794436741, "grad_norm": 14.75, "learning_rate": 4.509888687299901e-05, "loss": 2.8396, "num_input_tokens_seen": 16950624, "step": 8110 }, { "epoch": 1.3238437066644915, "grad_norm": 16.25, "learning_rate": 4.5093046972943266e-05, "loss": 3.5705, "num_input_tokens_seen": 16960960, "step": 8115 }, { "epoch": 1.3246594338853088, "grad_norm": 0.85546875, "learning_rate": 4.508720397428717e-05, "loss": 2.7155, "num_input_tokens_seen": 16970224, "step": 8120 }, { "epoch": 1.3254751611061262, "grad_norm": 3.640625, "learning_rate": 4.508135787793178e-05, "loss": 1.5689, "num_input_tokens_seen": 16981600, "step": 8125 }, { "epoch": 1.3262908883269435, "grad_norm": 4.5, "learning_rate": 4.5075508684778664e-05, "loss": 2.3816, "num_input_tokens_seen": 16991264, "step": 8130 }, { "epoch": 1.3271066155477609, "grad_norm": 12.3125, "learning_rate": 4.506965639572982e-05, "loss": 2.723, "num_input_tokens_seen": 17003248, "step": 8135 }, { "epoch": 1.3279223427685782, "grad_norm": 10.5, "learning_rate": 4.506380101168774e-05, "loss": 2.6515, "num_input_tokens_seen": 17014016, "step": 8140 }, { "epoch": 1.3287380699893956, "grad_norm": 10.5, "learning_rate": 4.505794253355542e-05, "loss": 2.1346, "num_input_tokens_seen": 17023840, "step": 8145 }, { "epoch": 1.329553797210213, "grad_norm": 8.5, "learning_rate": 4.5052080962236286e-05, "loss": 2.7023, "num_input_tokens_seen": 17034176, "step": 8150 }, { "epoch": 1.3303695244310303, "grad_norm": 8.9375, "learning_rate": 4.504621629863428e-05, "loss": 2.8469, "num_input_tokens_seen": 17045424, "step": 8155 }, { "epoch": 1.3311852516518476, "grad_norm": 3.171875, "learning_rate": 4.504034854365381e-05, "loss": 3.282, "num_input_tokens_seen": 17056080, "step": 8160 }, { "epoch": 1.332000978872665, "grad_norm": 8.25, "learning_rate": 4.503447769819974e-05, "loss": 3.1554, "num_input_tokens_seen": 17065776, "step": 8165 }, { "epoch": 1.3328167060934824, "grad_norm": 7.90625, "learning_rate": 4.502860376317745e-05, "loss": 2.1478, "num_input_tokens_seen": 17076320, "step": 8170 }, { "epoch": 1.3336324333142997, "grad_norm": 7.78125, "learning_rate": 4.502272673949276e-05, "loss": 2.6394, "num_input_tokens_seen": 17086304, "step": 8175 }, { "epoch": 1.334448160535117, "grad_norm": 8.0625, "learning_rate": 4.501684662805199e-05, "loss": 1.7187, "num_input_tokens_seen": 17096976, "step": 8180 }, { "epoch": 1.3352638877559344, "grad_norm": 7.09375, "learning_rate": 4.5010963429761924e-05, "loss": 1.8292, "num_input_tokens_seen": 17108064, "step": 8185 }, { "epoch": 1.3360796149767518, "grad_norm": 9.875, "learning_rate": 4.500507714552982e-05, "loss": 2.5231, "num_input_tokens_seen": 17118912, "step": 8190 }, { "epoch": 1.3368953421975691, "grad_norm": 8.25, "learning_rate": 4.499918777626342e-05, "loss": 2.7606, "num_input_tokens_seen": 17129168, "step": 8195 }, { "epoch": 1.3377110694183865, "grad_norm": 8.3125, "learning_rate": 4.499329532287093e-05, "loss": 2.9042, "num_input_tokens_seen": 17139952, "step": 8200 }, { "epoch": 1.3377110694183865, "eval_loss": 2.5448949337005615, "eval_runtime": 134.8864, "eval_samples_per_second": 20.202, "eval_steps_per_second": 10.105, "num_input_tokens_seen": 17139952, "step": 8200 }, { "epoch": 1.3385267966392038, "grad_norm": 12.75, "learning_rate": 4.4987399786261064e-05, "loss": 1.9021, "num_input_tokens_seen": 17149664, "step": 8205 }, { "epoch": 1.3393425238600212, "grad_norm": 9.4375, "learning_rate": 4.498150116734297e-05, "loss": 3.6334, "num_input_tokens_seen": 17159040, "step": 8210 }, { "epoch": 1.3401582510808385, "grad_norm": 7.59375, "learning_rate": 4.4975599467026294e-05, "loss": 2.2201, "num_input_tokens_seen": 17169360, "step": 8215 }, { "epoch": 1.3409739783016559, "grad_norm": 10.0, "learning_rate": 4.496969468622114e-05, "loss": 3.4071, "num_input_tokens_seen": 17181920, "step": 8220 }, { "epoch": 1.3417897055224732, "grad_norm": 7.28125, "learning_rate": 4.496378682583813e-05, "loss": 1.0302, "num_input_tokens_seen": 17191600, "step": 8225 }, { "epoch": 1.3426054327432906, "grad_norm": 10.4375, "learning_rate": 4.495787588678829e-05, "loss": 3.1304, "num_input_tokens_seen": 17201776, "step": 8230 }, { "epoch": 1.343421159964108, "grad_norm": 9.9375, "learning_rate": 4.4951961869983196e-05, "loss": 2.6814, "num_input_tokens_seen": 17212864, "step": 8235 }, { "epoch": 1.3442368871849253, "grad_norm": 9.25, "learning_rate": 4.494604477633485e-05, "loss": 1.916, "num_input_tokens_seen": 17223888, "step": 8240 }, { "epoch": 1.3450526144057426, "grad_norm": 15.1875, "learning_rate": 4.4940124606755734e-05, "loss": 1.2973, "num_input_tokens_seen": 17234064, "step": 8245 }, { "epoch": 1.3458683416265602, "grad_norm": 4.4375, "learning_rate": 4.493420136215882e-05, "loss": 1.6788, "num_input_tokens_seen": 17243296, "step": 8250 }, { "epoch": 1.3466840688473773, "grad_norm": 11.375, "learning_rate": 4.492827504345756e-05, "loss": 1.9523, "num_input_tokens_seen": 17253520, "step": 8255 }, { "epoch": 1.347499796068195, "grad_norm": 4.40625, "learning_rate": 4.492234565156584e-05, "loss": 2.1785, "num_input_tokens_seen": 17264208, "step": 8260 }, { "epoch": 1.348315523289012, "grad_norm": 8.3125, "learning_rate": 4.491641318739807e-05, "loss": 2.0787, "num_input_tokens_seen": 17273296, "step": 8265 }, { "epoch": 1.3491312505098296, "grad_norm": 8.75, "learning_rate": 4.4910477651869096e-05, "loss": 2.9158, "num_input_tokens_seen": 17284848, "step": 8270 }, { "epoch": 1.3499469777306468, "grad_norm": 6.15625, "learning_rate": 4.4904539045894254e-05, "loss": 1.9896, "num_input_tokens_seen": 17296592, "step": 8275 }, { "epoch": 1.3507627049514643, "grad_norm": 7.3125, "learning_rate": 4.4898597370389364e-05, "loss": 2.6489, "num_input_tokens_seen": 17307264, "step": 8280 }, { "epoch": 1.3515784321722815, "grad_norm": 10.1875, "learning_rate": 4.489265262627069e-05, "loss": 2.1398, "num_input_tokens_seen": 17317040, "step": 8285 }, { "epoch": 1.352394159393099, "grad_norm": 1.375, "learning_rate": 4.488670481445499e-05, "loss": 1.9753, "num_input_tokens_seen": 17327072, "step": 8290 }, { "epoch": 1.3532098866139162, "grad_norm": 2.203125, "learning_rate": 4.488075393585951e-05, "loss": 1.4988, "num_input_tokens_seen": 17337360, "step": 8295 }, { "epoch": 1.3540256138347337, "grad_norm": 13.375, "learning_rate": 4.487479999140193e-05, "loss": 2.4538, "num_input_tokens_seen": 17347232, "step": 8300 }, { "epoch": 1.354841341055551, "grad_norm": 8.0625, "learning_rate": 4.4868842982000425e-05, "loss": 2.9812, "num_input_tokens_seen": 17357424, "step": 8305 }, { "epoch": 1.3556570682763684, "grad_norm": 7.46875, "learning_rate": 4.486288290857365e-05, "loss": 3.7324, "num_input_tokens_seen": 17367408, "step": 8310 }, { "epoch": 1.3564727954971858, "grad_norm": 4.84375, "learning_rate": 4.4856919772040715e-05, "loss": 2.0072, "num_input_tokens_seen": 17378480, "step": 8315 }, { "epoch": 1.3572885227180032, "grad_norm": 4.09375, "learning_rate": 4.485095357332122e-05, "loss": 1.7019, "num_input_tokens_seen": 17388784, "step": 8320 }, { "epoch": 1.3581042499388205, "grad_norm": 9.125, "learning_rate": 4.484498431333521e-05, "loss": 2.2157, "num_input_tokens_seen": 17399552, "step": 8325 }, { "epoch": 1.3589199771596379, "grad_norm": 2.28125, "learning_rate": 4.4839011993003245e-05, "loss": 2.6732, "num_input_tokens_seen": 17411168, "step": 8330 }, { "epoch": 1.3597357043804552, "grad_norm": 8.4375, "learning_rate": 4.4833036613246305e-05, "loss": 2.7798, "num_input_tokens_seen": 17422224, "step": 8335 }, { "epoch": 1.3605514316012726, "grad_norm": 9.5, "learning_rate": 4.482705817498589e-05, "loss": 2.9483, "num_input_tokens_seen": 17433792, "step": 8340 }, { "epoch": 1.36136715882209, "grad_norm": 11.5, "learning_rate": 4.4821076679143934e-05, "loss": 4.2509, "num_input_tokens_seen": 17444688, "step": 8345 }, { "epoch": 1.3621828860429073, "grad_norm": 2.828125, "learning_rate": 4.481509212664288e-05, "loss": 3.326, "num_input_tokens_seen": 17454720, "step": 8350 }, { "epoch": 1.3629986132637246, "grad_norm": 12.9375, "learning_rate": 4.480910451840559e-05, "loss": 2.1183, "num_input_tokens_seen": 17463424, "step": 8355 }, { "epoch": 1.363814340484542, "grad_norm": 5.78125, "learning_rate": 4.480311385535546e-05, "loss": 2.4097, "num_input_tokens_seen": 17474416, "step": 8360 }, { "epoch": 1.3646300677053593, "grad_norm": 5.875, "learning_rate": 4.47971201384163e-05, "loss": 2.4651, "num_input_tokens_seen": 17484496, "step": 8365 }, { "epoch": 1.3654457949261767, "grad_norm": 6.75, "learning_rate": 4.4791123368512446e-05, "loss": 2.473, "num_input_tokens_seen": 17495136, "step": 8370 }, { "epoch": 1.366261522146994, "grad_norm": 6.40625, "learning_rate": 4.478512354656864e-05, "loss": 2.3176, "num_input_tokens_seen": 17504848, "step": 8375 }, { "epoch": 1.3670772493678114, "grad_norm": 7.3125, "learning_rate": 4.477912067351016e-05, "loss": 1.6594, "num_input_tokens_seen": 17515072, "step": 8380 }, { "epoch": 1.3678929765886287, "grad_norm": 3.828125, "learning_rate": 4.477311475026271e-05, "loss": 3.1007, "num_input_tokens_seen": 17524672, "step": 8385 }, { "epoch": 1.368708703809446, "grad_norm": 10.375, "learning_rate": 4.476710577775248e-05, "loss": 2.9775, "num_input_tokens_seen": 17534976, "step": 8390 }, { "epoch": 1.3695244310302634, "grad_norm": 4.09375, "learning_rate": 4.476109375690612e-05, "loss": 1.6781, "num_input_tokens_seen": 17545248, "step": 8395 }, { "epoch": 1.3703401582510808, "grad_norm": 6.4375, "learning_rate": 4.4755078688650784e-05, "loss": 2.6583, "num_input_tokens_seen": 17557136, "step": 8400 }, { "epoch": 1.3703401582510808, "eval_loss": 2.540050983428955, "eval_runtime": 135.0136, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 17557136, "step": 8400 }, { "epoch": 1.3711558854718982, "grad_norm": 13.0625, "learning_rate": 4.474906057391406e-05, "loss": 3.358, "num_input_tokens_seen": 17567696, "step": 8405 }, { "epoch": 1.3719716126927155, "grad_norm": 10.25, "learning_rate": 4.4743039413624e-05, "loss": 2.8831, "num_input_tokens_seen": 17577008, "step": 8410 }, { "epoch": 1.3727873399135329, "grad_norm": 2.96875, "learning_rate": 4.473701520870916e-05, "loss": 2.505, "num_input_tokens_seen": 17587344, "step": 8415 }, { "epoch": 1.3736030671343502, "grad_norm": 4.90625, "learning_rate": 4.4730987960098544e-05, "loss": 2.9895, "num_input_tokens_seen": 17598096, "step": 8420 }, { "epoch": 1.3744187943551676, "grad_norm": 5.40625, "learning_rate": 4.4724957668721635e-05, "loss": 2.0436, "num_input_tokens_seen": 17609488, "step": 8425 }, { "epoch": 1.375234521575985, "grad_norm": 2.90625, "learning_rate": 4.471892433550836e-05, "loss": 1.4558, "num_input_tokens_seen": 17619920, "step": 8430 }, { "epoch": 1.3760502487968023, "grad_norm": 5.09375, "learning_rate": 4.471288796138916e-05, "loss": 3.1056, "num_input_tokens_seen": 17629600, "step": 8435 }, { "epoch": 1.3768659760176196, "grad_norm": 8.5625, "learning_rate": 4.470684854729491e-05, "loss": 2.015, "num_input_tokens_seen": 17640752, "step": 8440 }, { "epoch": 1.3776817032384372, "grad_norm": 9.1875, "learning_rate": 4.4700806094156955e-05, "loss": 2.4412, "num_input_tokens_seen": 17650432, "step": 8445 }, { "epoch": 1.3784974304592543, "grad_norm": 18.0, "learning_rate": 4.469476060290713e-05, "loss": 2.296, "num_input_tokens_seen": 17661600, "step": 8450 }, { "epoch": 1.379313157680072, "grad_norm": 2.078125, "learning_rate": 4.468871207447772e-05, "loss": 1.805, "num_input_tokens_seen": 17671792, "step": 8455 }, { "epoch": 1.380128884900889, "grad_norm": 8.1875, "learning_rate": 4.4682660509801486e-05, "loss": 2.446, "num_input_tokens_seen": 17682128, "step": 8460 }, { "epoch": 1.3809446121217066, "grad_norm": 6.65625, "learning_rate": 4.467660590981165e-05, "loss": 3.3865, "num_input_tokens_seen": 17694128, "step": 8465 }, { "epoch": 1.3817603393425237, "grad_norm": 13.0, "learning_rate": 4.467054827544191e-05, "loss": 2.6766, "num_input_tokens_seen": 17704128, "step": 8470 }, { "epoch": 1.3825760665633413, "grad_norm": 1.9296875, "learning_rate": 4.4664487607626434e-05, "loss": 1.8885, "num_input_tokens_seen": 17712960, "step": 8475 }, { "epoch": 1.3833917937841584, "grad_norm": 14.625, "learning_rate": 4.4658423907299845e-05, "loss": 2.7156, "num_input_tokens_seen": 17724096, "step": 8480 }, { "epoch": 1.384207521004976, "grad_norm": 7.84375, "learning_rate": 4.465235717539725e-05, "loss": 2.1476, "num_input_tokens_seen": 17735728, "step": 8485 }, { "epoch": 1.3850232482257934, "grad_norm": 4.34375, "learning_rate": 4.464628741285421e-05, "loss": 1.8076, "num_input_tokens_seen": 17745968, "step": 8490 }, { "epoch": 1.3858389754466107, "grad_norm": 2.984375, "learning_rate": 4.4640214620606754e-05, "loss": 2.437, "num_input_tokens_seen": 17757744, "step": 8495 }, { "epoch": 1.386654702667428, "grad_norm": 7.75, "learning_rate": 4.46341387995914e-05, "loss": 2.4694, "num_input_tokens_seen": 17767824, "step": 8500 }, { "epoch": 1.3874704298882454, "grad_norm": 6.75, "learning_rate": 4.4628059950745106e-05, "loss": 2.5645, "num_input_tokens_seen": 17778128, "step": 8505 }, { "epoch": 1.3882861571090628, "grad_norm": 9.5625, "learning_rate": 4.4621978075005297e-05, "loss": 2.0868, "num_input_tokens_seen": 17787888, "step": 8510 }, { "epoch": 1.3891018843298801, "grad_norm": 9.1875, "learning_rate": 4.461589317330989e-05, "loss": 2.4031, "num_input_tokens_seen": 17799664, "step": 8515 }, { "epoch": 1.3899176115506975, "grad_norm": 8.5, "learning_rate": 4.460980524659724e-05, "loss": 1.7137, "num_input_tokens_seen": 17809360, "step": 8520 }, { "epoch": 1.3907333387715148, "grad_norm": 10.0, "learning_rate": 4.46037142958062e-05, "loss": 2.7602, "num_input_tokens_seen": 17819184, "step": 8525 }, { "epoch": 1.3915490659923322, "grad_norm": 10.4375, "learning_rate": 4.4597620321876046e-05, "loss": 2.6755, "num_input_tokens_seen": 17830320, "step": 8530 }, { "epoch": 1.3923647932131495, "grad_norm": 3.59375, "learning_rate": 4.459152332574656e-05, "loss": 2.0528, "num_input_tokens_seen": 17838880, "step": 8535 }, { "epoch": 1.393180520433967, "grad_norm": 7.03125, "learning_rate": 4.4585423308357985e-05, "loss": 1.4005, "num_input_tokens_seen": 17848848, "step": 8540 }, { "epoch": 1.3939962476547842, "grad_norm": 7.53125, "learning_rate": 4.457932027065102e-05, "loss": 2.3557, "num_input_tokens_seen": 17859792, "step": 8545 }, { "epoch": 1.3948119748756016, "grad_norm": 5.71875, "learning_rate": 4.45732142135668e-05, "loss": 1.867, "num_input_tokens_seen": 17869872, "step": 8550 }, { "epoch": 1.395627702096419, "grad_norm": 5.0, "learning_rate": 4.4567105138046986e-05, "loss": 3.0756, "num_input_tokens_seen": 17881392, "step": 8555 }, { "epoch": 1.3964434293172363, "grad_norm": 5.15625, "learning_rate": 4.456099304503365e-05, "loss": 2.0689, "num_input_tokens_seen": 17892064, "step": 8560 }, { "epoch": 1.3972591565380537, "grad_norm": 7.0, "learning_rate": 4.455487793546939e-05, "loss": 2.1211, "num_input_tokens_seen": 17903280, "step": 8565 }, { "epoch": 1.398074883758871, "grad_norm": 9.8125, "learning_rate": 4.454875981029719e-05, "loss": 2.5391, "num_input_tokens_seen": 17913584, "step": 8570 }, { "epoch": 1.3988906109796884, "grad_norm": 4.0, "learning_rate": 4.454263867046057e-05, "loss": 0.876, "num_input_tokens_seen": 17923328, "step": 8575 }, { "epoch": 1.3997063382005057, "grad_norm": 4.34375, "learning_rate": 4.4536514516903484e-05, "loss": 1.562, "num_input_tokens_seen": 17933728, "step": 8580 }, { "epoch": 1.400522065421323, "grad_norm": 12.4375, "learning_rate": 4.453038735057034e-05, "loss": 3.1081, "num_input_tokens_seen": 17943376, "step": 8585 }, { "epoch": 1.4013377926421404, "grad_norm": 6.65625, "learning_rate": 4.4524257172406034e-05, "loss": 1.9284, "num_input_tokens_seen": 17954160, "step": 8590 }, { "epoch": 1.4021535198629578, "grad_norm": 8.0, "learning_rate": 4.451812398335592e-05, "loss": 3.2021, "num_input_tokens_seen": 17964384, "step": 8595 }, { "epoch": 1.4029692470837751, "grad_norm": 13.5, "learning_rate": 4.4511987784365805e-05, "loss": 2.7435, "num_input_tokens_seen": 17974864, "step": 8600 }, { "epoch": 1.4029692470837751, "eval_loss": 2.5591657161712646, "eval_runtime": 135.0229, "eval_samples_per_second": 20.182, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 17974864, "step": 8600 }, { "epoch": 1.4037849743045925, "grad_norm": 8.5, "learning_rate": 4.450584857638197e-05, "loss": 3.0168, "num_input_tokens_seen": 17983792, "step": 8605 }, { "epoch": 1.4046007015254098, "grad_norm": 3.984375, "learning_rate": 4.449970636035116e-05, "loss": 1.9743, "num_input_tokens_seen": 17994688, "step": 8610 }, { "epoch": 1.4054164287462272, "grad_norm": 6.28125, "learning_rate": 4.4493561137220574e-05, "loss": 2.567, "num_input_tokens_seen": 18005024, "step": 8615 }, { "epoch": 1.4062321559670445, "grad_norm": 5.09375, "learning_rate": 4.44874129079379e-05, "loss": 1.2527, "num_input_tokens_seen": 18015104, "step": 8620 }, { "epoch": 1.407047883187862, "grad_norm": 6.28125, "learning_rate": 4.4481261673451255e-05, "loss": 3.0014, "num_input_tokens_seen": 18024976, "step": 8625 }, { "epoch": 1.4078636104086795, "grad_norm": 9.3125, "learning_rate": 4.4475107434709245e-05, "loss": 2.8707, "num_input_tokens_seen": 18035520, "step": 8630 }, { "epoch": 1.4086793376294966, "grad_norm": 4.96875, "learning_rate": 4.446895019266093e-05, "loss": 0.6995, "num_input_tokens_seen": 18045312, "step": 8635 }, { "epoch": 1.4094950648503142, "grad_norm": 11.625, "learning_rate": 4.446278994825583e-05, "loss": 2.6908, "num_input_tokens_seen": 18055872, "step": 8640 }, { "epoch": 1.4103107920711313, "grad_norm": 6.59375, "learning_rate": 4.445662670244394e-05, "loss": 1.8127, "num_input_tokens_seen": 18066208, "step": 8645 }, { "epoch": 1.4111265192919489, "grad_norm": 5.625, "learning_rate": 4.44504604561757e-05, "loss": 2.8751, "num_input_tokens_seen": 18075968, "step": 8650 }, { "epoch": 1.411942246512766, "grad_norm": 11.4375, "learning_rate": 4.4444291210402035e-05, "loss": 4.1479, "num_input_tokens_seen": 18085728, "step": 8655 }, { "epoch": 1.4127579737335836, "grad_norm": 9.9375, "learning_rate": 4.443811896607431e-05, "loss": 2.9089, "num_input_tokens_seen": 18096144, "step": 8660 }, { "epoch": 1.4135737009544007, "grad_norm": 11.375, "learning_rate": 4.443194372414436e-05, "loss": 2.2833, "num_input_tokens_seen": 18107840, "step": 8665 }, { "epoch": 1.4143894281752183, "grad_norm": 7.59375, "learning_rate": 4.442576548556449e-05, "loss": 4.3797, "num_input_tokens_seen": 18118064, "step": 8670 }, { "epoch": 1.4152051553960356, "grad_norm": 7.90625, "learning_rate": 4.441958425128747e-05, "loss": 1.4752, "num_input_tokens_seen": 18128656, "step": 8675 }, { "epoch": 1.416020882616853, "grad_norm": 8.5, "learning_rate": 4.4413400022266515e-05, "loss": 2.6552, "num_input_tokens_seen": 18140560, "step": 8680 }, { "epoch": 1.4168366098376703, "grad_norm": 3.6875, "learning_rate": 4.4407212799455313e-05, "loss": 1.3233, "num_input_tokens_seen": 18152048, "step": 8685 }, { "epoch": 1.4176523370584877, "grad_norm": 3.046875, "learning_rate": 4.4401022583808003e-05, "loss": 1.2866, "num_input_tokens_seen": 18161808, "step": 8690 }, { "epoch": 1.418468064279305, "grad_norm": 3.734375, "learning_rate": 4.439482937627921e-05, "loss": 1.6412, "num_input_tokens_seen": 18173488, "step": 8695 }, { "epoch": 1.4192837915001224, "grad_norm": 9.875, "learning_rate": 4.4388633177824004e-05, "loss": 3.2706, "num_input_tokens_seen": 18182448, "step": 8700 }, { "epoch": 1.4200995187209398, "grad_norm": 5.375, "learning_rate": 4.4382433989397895e-05, "loss": 3.5, "num_input_tokens_seen": 18191296, "step": 8705 }, { "epoch": 1.420915245941757, "grad_norm": 5.90625, "learning_rate": 4.4376231811956895e-05, "loss": 3.1701, "num_input_tokens_seen": 18202128, "step": 8710 }, { "epoch": 1.4217309731625745, "grad_norm": 4.21875, "learning_rate": 4.437002664645745e-05, "loss": 1.6259, "num_input_tokens_seen": 18213280, "step": 8715 }, { "epoch": 1.4225467003833918, "grad_norm": 5.5, "learning_rate": 4.436381849385649e-05, "loss": 1.9465, "num_input_tokens_seen": 18223936, "step": 8720 }, { "epoch": 1.4233624276042092, "grad_norm": 7.71875, "learning_rate": 4.435760735511136e-05, "loss": 3.6822, "num_input_tokens_seen": 18234800, "step": 8725 }, { "epoch": 1.4241781548250265, "grad_norm": 2.890625, "learning_rate": 4.435139323117992e-05, "loss": 1.7686, "num_input_tokens_seen": 18244240, "step": 8730 }, { "epoch": 1.4249938820458439, "grad_norm": 2.109375, "learning_rate": 4.434517612302046e-05, "loss": 2.1102, "num_input_tokens_seen": 18256240, "step": 8735 }, { "epoch": 1.4258096092666612, "grad_norm": 8.8125, "learning_rate": 4.433895603159174e-05, "loss": 2.2475, "num_input_tokens_seen": 18268608, "step": 8740 }, { "epoch": 1.4266253364874786, "grad_norm": 3.84375, "learning_rate": 4.433273295785296e-05, "loss": 2.1408, "num_input_tokens_seen": 18279936, "step": 8745 }, { "epoch": 1.427441063708296, "grad_norm": 11.0625, "learning_rate": 4.432650690276382e-05, "loss": 2.4811, "num_input_tokens_seen": 18289552, "step": 8750 }, { "epoch": 1.4282567909291133, "grad_norm": 6.09375, "learning_rate": 4.4320277867284435e-05, "loss": 2.2092, "num_input_tokens_seen": 18301600, "step": 8755 }, { "epoch": 1.4290725181499306, "grad_norm": 6.65625, "learning_rate": 4.431404585237541e-05, "loss": 1.6207, "num_input_tokens_seen": 18311328, "step": 8760 }, { "epoch": 1.429888245370748, "grad_norm": 4.5625, "learning_rate": 4.43078108589978e-05, "loss": 1.8095, "num_input_tokens_seen": 18321408, "step": 8765 }, { "epoch": 1.4307039725915653, "grad_norm": 8.1875, "learning_rate": 4.4301572888113116e-05, "loss": 2.3585, "num_input_tokens_seen": 18332720, "step": 8770 }, { "epoch": 1.4315196998123827, "grad_norm": 6.34375, "learning_rate": 4.4295331940683337e-05, "loss": 2.205, "num_input_tokens_seen": 18342576, "step": 8775 }, { "epoch": 1.4323354270332, "grad_norm": 6.75, "learning_rate": 4.428908801767089e-05, "loss": 1.7936, "num_input_tokens_seen": 18353360, "step": 8780 }, { "epoch": 1.4331511542540174, "grad_norm": 5.8125, "learning_rate": 4.428284112003868e-05, "loss": 3.0113, "num_input_tokens_seen": 18363200, "step": 8785 }, { "epoch": 1.4339668814748348, "grad_norm": 0.259765625, "learning_rate": 4.4276591248750033e-05, "loss": 2.5336, "num_input_tokens_seen": 18373456, "step": 8790 }, { "epoch": 1.434782608695652, "grad_norm": 3.546875, "learning_rate": 4.4270338404768774e-05, "loss": 2.7362, "num_input_tokens_seen": 18384432, "step": 8795 }, { "epoch": 1.4355983359164695, "grad_norm": 3.953125, "learning_rate": 4.426408258905917e-05, "loss": 2.7448, "num_input_tokens_seen": 18394320, "step": 8800 }, { "epoch": 1.4355983359164695, "eval_loss": 2.5488646030426025, "eval_runtime": 134.9944, "eval_samples_per_second": 20.186, "eval_steps_per_second": 10.097, "num_input_tokens_seen": 18394320, "step": 8800 }, { "epoch": 1.4364140631372868, "grad_norm": 12.125, "learning_rate": 4.425782380258594e-05, "loss": 2.5851, "num_input_tokens_seen": 18404992, "step": 8805 }, { "epoch": 1.4372297903581042, "grad_norm": 4.09375, "learning_rate": 4.425156204631427e-05, "loss": 2.4947, "num_input_tokens_seen": 18416096, "step": 8810 }, { "epoch": 1.4380455175789217, "grad_norm": 10.1875, "learning_rate": 4.424529732120981e-05, "loss": 2.6177, "num_input_tokens_seen": 18426832, "step": 8815 }, { "epoch": 1.4388612447997389, "grad_norm": 3.59375, "learning_rate": 4.423902962823864e-05, "loss": 1.1538, "num_input_tokens_seen": 18437584, "step": 8820 }, { "epoch": 1.4396769720205564, "grad_norm": 13.25, "learning_rate": 4.423275896836733e-05, "loss": 3.9667, "num_input_tokens_seen": 18447024, "step": 8825 }, { "epoch": 1.4404926992413736, "grad_norm": 10.1875, "learning_rate": 4.42264853425629e-05, "loss": 3.4546, "num_input_tokens_seen": 18458512, "step": 8830 }, { "epoch": 1.4413084264621912, "grad_norm": 5.0625, "learning_rate": 4.4220208751792816e-05, "loss": 3.0126, "num_input_tokens_seen": 18469824, "step": 8835 }, { "epoch": 1.4421241536830083, "grad_norm": 5.625, "learning_rate": 4.421392919702499e-05, "loss": 2.1316, "num_input_tokens_seen": 18479936, "step": 8840 }, { "epoch": 1.4429398809038259, "grad_norm": 2.46875, "learning_rate": 4.4207646679227846e-05, "loss": 2.1269, "num_input_tokens_seen": 18489072, "step": 8845 }, { "epoch": 1.443755608124643, "grad_norm": 2.8125, "learning_rate": 4.42013611993702e-05, "loss": 2.1949, "num_input_tokens_seen": 18499200, "step": 8850 }, { "epoch": 1.4445713353454606, "grad_norm": 4.40625, "learning_rate": 4.419507275842135e-05, "loss": 2.6193, "num_input_tokens_seen": 18510080, "step": 8855 }, { "epoch": 1.445387062566278, "grad_norm": 5.53125, "learning_rate": 4.418878135735106e-05, "loss": 1.9718, "num_input_tokens_seen": 18517264, "step": 8860 }, { "epoch": 1.4462027897870953, "grad_norm": 8.5625, "learning_rate": 4.418248699712955e-05, "loss": 1.5938, "num_input_tokens_seen": 18528544, "step": 8865 }, { "epoch": 1.4470185170079126, "grad_norm": 6.0625, "learning_rate": 4.417618967872748e-05, "loss": 2.6605, "num_input_tokens_seen": 18540048, "step": 8870 }, { "epoch": 1.44783424422873, "grad_norm": 7.4375, "learning_rate": 4.4169889403115985e-05, "loss": 2.1911, "num_input_tokens_seen": 18550688, "step": 8875 }, { "epoch": 1.4486499714495473, "grad_norm": 2.671875, "learning_rate": 4.4163586171266627e-05, "loss": 2.6802, "num_input_tokens_seen": 18560960, "step": 8880 }, { "epoch": 1.4494656986703647, "grad_norm": 12.0, "learning_rate": 4.415727998415147e-05, "loss": 3.3054, "num_input_tokens_seen": 18572224, "step": 8885 }, { "epoch": 1.450281425891182, "grad_norm": 6.4375, "learning_rate": 4.4150970842742985e-05, "loss": 2.3937, "num_input_tokens_seen": 18581872, "step": 8890 }, { "epoch": 1.4510971531119994, "grad_norm": 5.09375, "learning_rate": 4.4144658748014134e-05, "loss": 1.1845, "num_input_tokens_seen": 18592736, "step": 8895 }, { "epoch": 1.4519128803328167, "grad_norm": 5.21875, "learning_rate": 4.413834370093831e-05, "loss": 1.9351, "num_input_tokens_seen": 18603232, "step": 8900 }, { "epoch": 1.452728607553634, "grad_norm": 8.8125, "learning_rate": 4.413202570248939e-05, "loss": 2.4058, "num_input_tokens_seen": 18614256, "step": 8905 }, { "epoch": 1.4535443347744514, "grad_norm": 2.1875, "learning_rate": 4.412570475364167e-05, "loss": 1.8693, "num_input_tokens_seen": 18625968, "step": 8910 }, { "epoch": 1.4543600619952688, "grad_norm": 10.125, "learning_rate": 4.411938085536994e-05, "loss": 1.8954, "num_input_tokens_seen": 18636048, "step": 8915 }, { "epoch": 1.4551757892160861, "grad_norm": 5.3125, "learning_rate": 4.41130540086494e-05, "loss": 1.2701, "num_input_tokens_seen": 18646144, "step": 8920 }, { "epoch": 1.4559915164369035, "grad_norm": 3.265625, "learning_rate": 4.4106724214455754e-05, "loss": 2.0906, "num_input_tokens_seen": 18657312, "step": 8925 }, { "epoch": 1.4568072436577209, "grad_norm": 6.625, "learning_rate": 4.4100391473765115e-05, "loss": 2.4887, "num_input_tokens_seen": 18668976, "step": 8930 }, { "epoch": 1.4576229708785382, "grad_norm": 12.9375, "learning_rate": 4.409405578755408e-05, "loss": 3.8476, "num_input_tokens_seen": 18679728, "step": 8935 }, { "epoch": 1.4584386980993556, "grad_norm": 4.125, "learning_rate": 4.4087717156799705e-05, "loss": 1.5907, "num_input_tokens_seen": 18691296, "step": 8940 }, { "epoch": 1.459254425320173, "grad_norm": 5.84375, "learning_rate": 4.408137558247946e-05, "loss": 1.7532, "num_input_tokens_seen": 18702080, "step": 8945 }, { "epoch": 1.4600701525409903, "grad_norm": 16.375, "learning_rate": 4.4075031065571306e-05, "loss": 2.6004, "num_input_tokens_seen": 18713120, "step": 8950 }, { "epoch": 1.4608858797618076, "grad_norm": 7.25, "learning_rate": 4.406868360705366e-05, "loss": 2.9629, "num_input_tokens_seen": 18723488, "step": 8955 }, { "epoch": 1.461701606982625, "grad_norm": 9.625, "learning_rate": 4.406233320790536e-05, "loss": 2.617, "num_input_tokens_seen": 18733968, "step": 8960 }, { "epoch": 1.4625173342034423, "grad_norm": 3.625, "learning_rate": 4.4055979869105734e-05, "loss": 1.9915, "num_input_tokens_seen": 18745664, "step": 8965 }, { "epoch": 1.4633330614242597, "grad_norm": 4.53125, "learning_rate": 4.404962359163454e-05, "loss": 2.7435, "num_input_tokens_seen": 18755680, "step": 8970 }, { "epoch": 1.464148788645077, "grad_norm": 9.125, "learning_rate": 4.404326437647199e-05, "loss": 3.6306, "num_input_tokens_seen": 18765920, "step": 8975 }, { "epoch": 1.4649645158658944, "grad_norm": 16.75, "learning_rate": 4.403690222459877e-05, "loss": 1.5588, "num_input_tokens_seen": 18776480, "step": 8980 }, { "epoch": 1.4657802430867117, "grad_norm": 8.6875, "learning_rate": 4.4030537136995984e-05, "loss": 3.4508, "num_input_tokens_seen": 18787632, "step": 8985 }, { "epoch": 1.466595970307529, "grad_norm": 11.0, "learning_rate": 4.402416911464523e-05, "loss": 2.5324, "num_input_tokens_seen": 18797488, "step": 8990 }, { "epoch": 1.4674116975283464, "grad_norm": 12.5, "learning_rate": 4.4017798158528516e-05, "loss": 2.3699, "num_input_tokens_seen": 18808880, "step": 8995 }, { "epoch": 1.468227424749164, "grad_norm": 8.875, "learning_rate": 4.401142426962834e-05, "loss": 3.6386, "num_input_tokens_seen": 18820208, "step": 9000 }, { "epoch": 1.468227424749164, "eval_loss": 2.5527656078338623, "eval_runtime": 134.8717, "eval_samples_per_second": 20.204, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 18820208, "step": 9000 }, { "epoch": 1.4690431519699811, "grad_norm": 7.6875, "learning_rate": 4.400504744892763e-05, "loss": 2.4906, "num_input_tokens_seen": 18830160, "step": 9005 }, { "epoch": 1.4698588791907987, "grad_norm": 2.484375, "learning_rate": 4.399866769740975e-05, "loss": 3.1367, "num_input_tokens_seen": 18839408, "step": 9010 }, { "epoch": 1.4706746064116158, "grad_norm": 15.0625, "learning_rate": 4.399228501605859e-05, "loss": 2.3759, "num_input_tokens_seen": 18847520, "step": 9015 }, { "epoch": 1.4714903336324334, "grad_norm": 8.625, "learning_rate": 4.398589940585839e-05, "loss": 2.9692, "num_input_tokens_seen": 18858608, "step": 9020 }, { "epoch": 1.4723060608532506, "grad_norm": 8.8125, "learning_rate": 4.3979510867793917e-05, "loss": 2.3692, "num_input_tokens_seen": 18868416, "step": 9025 }, { "epoch": 1.4731217880740681, "grad_norm": 2.671875, "learning_rate": 4.3973119402850346e-05, "loss": 2.5802, "num_input_tokens_seen": 18877408, "step": 9030 }, { "epoch": 1.4739375152948853, "grad_norm": 3.40625, "learning_rate": 4.396672501201334e-05, "loss": 1.6646, "num_input_tokens_seen": 18887552, "step": 9035 }, { "epoch": 1.4747532425157028, "grad_norm": 1.5078125, "learning_rate": 4.396032769626899e-05, "loss": 1.3979, "num_input_tokens_seen": 18896832, "step": 9040 }, { "epoch": 1.4755689697365202, "grad_norm": 7.9375, "learning_rate": 4.395392745660384e-05, "loss": 2.4439, "num_input_tokens_seen": 18907856, "step": 9045 }, { "epoch": 1.4763846969573375, "grad_norm": 11.0625, "learning_rate": 4.394752429400488e-05, "loss": 3.273, "num_input_tokens_seen": 18919008, "step": 9050 }, { "epoch": 1.477200424178155, "grad_norm": 1.6015625, "learning_rate": 4.394111820945957e-05, "loss": 2.0073, "num_input_tokens_seen": 18931344, "step": 9055 }, { "epoch": 1.4780161513989722, "grad_norm": 5.6875, "learning_rate": 4.393470920395579e-05, "loss": 2.8412, "num_input_tokens_seen": 18940528, "step": 9060 }, { "epoch": 1.4788318786197896, "grad_norm": 10.0625, "learning_rate": 4.392829727848192e-05, "loss": 2.9761, "num_input_tokens_seen": 18950784, "step": 9065 }, { "epoch": 1.479647605840607, "grad_norm": 5.96875, "learning_rate": 4.392188243402673e-05, "loss": 2.8897, "num_input_tokens_seen": 18961616, "step": 9070 }, { "epoch": 1.4804633330614243, "grad_norm": 4.5625, "learning_rate": 4.391546467157949e-05, "loss": 2.3154, "num_input_tokens_seen": 18972880, "step": 9075 }, { "epoch": 1.4812790602822417, "grad_norm": 6.9375, "learning_rate": 4.390904399212988e-05, "loss": 1.7477, "num_input_tokens_seen": 18984704, "step": 9080 }, { "epoch": 1.482094787503059, "grad_norm": 3.296875, "learning_rate": 4.390262039666807e-05, "loss": 2.0176, "num_input_tokens_seen": 18996560, "step": 9085 }, { "epoch": 1.4829105147238764, "grad_norm": 2.28125, "learning_rate": 4.389619388618464e-05, "loss": 2.1006, "num_input_tokens_seen": 19006176, "step": 9090 }, { "epoch": 1.4837262419446937, "grad_norm": 0.71484375, "learning_rate": 4.3889764461670655e-05, "loss": 1.9793, "num_input_tokens_seen": 19017120, "step": 9095 }, { "epoch": 1.484541969165511, "grad_norm": 6.125, "learning_rate": 4.38833321241176e-05, "loss": 2.067, "num_input_tokens_seen": 19028512, "step": 9100 }, { "epoch": 1.4853576963863284, "grad_norm": 4.28125, "learning_rate": 4.3876896874517434e-05, "loss": 1.9383, "num_input_tokens_seen": 19038992, "step": 9105 }, { "epoch": 1.4861734236071458, "grad_norm": 11.375, "learning_rate": 4.3870458713862554e-05, "loss": 2.7034, "num_input_tokens_seen": 19049728, "step": 9110 }, { "epoch": 1.4869891508279631, "grad_norm": 15.375, "learning_rate": 4.386401764314579e-05, "loss": 2.3843, "num_input_tokens_seen": 19059520, "step": 9115 }, { "epoch": 1.4878048780487805, "grad_norm": 4.6875, "learning_rate": 4.385757366336045e-05, "loss": 2.4358, "num_input_tokens_seen": 19070944, "step": 9120 }, { "epoch": 1.4886206052695978, "grad_norm": 8.75, "learning_rate": 4.385112677550027e-05, "loss": 3.5552, "num_input_tokens_seen": 19082672, "step": 9125 }, { "epoch": 1.4894363324904152, "grad_norm": 7.09375, "learning_rate": 4.384467698055945e-05, "loss": 3.0621, "num_input_tokens_seen": 19093536, "step": 9130 }, { "epoch": 1.4902520597112325, "grad_norm": 8.5625, "learning_rate": 4.383822427953261e-05, "loss": 2.1582, "num_input_tokens_seen": 19104848, "step": 9135 }, { "epoch": 1.4910677869320499, "grad_norm": 10.125, "learning_rate": 4.3831768673414864e-05, "loss": 2.286, "num_input_tokens_seen": 19116704, "step": 9140 }, { "epoch": 1.4918835141528672, "grad_norm": 8.5625, "learning_rate": 4.382531016320173e-05, "loss": 1.3295, "num_input_tokens_seen": 19128688, "step": 9145 }, { "epoch": 1.4926992413736846, "grad_norm": 1.28125, "learning_rate": 4.3818848749889184e-05, "loss": 1.5414, "num_input_tokens_seen": 19141664, "step": 9150 }, { "epoch": 1.493514968594502, "grad_norm": 5.1875, "learning_rate": 4.381238443447368e-05, "loss": 2.55, "num_input_tokens_seen": 19151408, "step": 9155 }, { "epoch": 1.4943306958153193, "grad_norm": 5.5625, "learning_rate": 4.380591721795208e-05, "loss": 2.8566, "num_input_tokens_seen": 19162016, "step": 9160 }, { "epoch": 1.4951464230361367, "grad_norm": 6.09375, "learning_rate": 4.3799447101321723e-05, "loss": 3.038, "num_input_tokens_seen": 19172064, "step": 9165 }, { "epoch": 1.495962150256954, "grad_norm": 7.53125, "learning_rate": 4.379297408558036e-05, "loss": 3.4502, "num_input_tokens_seen": 19181456, "step": 9170 }, { "epoch": 1.4967778774777714, "grad_norm": 15.9375, "learning_rate": 4.378649817172624e-05, "loss": 3.5764, "num_input_tokens_seen": 19191440, "step": 9175 }, { "epoch": 1.4975936046985887, "grad_norm": 5.0625, "learning_rate": 4.378001936075801e-05, "loss": 2.8938, "num_input_tokens_seen": 19203504, "step": 9180 }, { "epoch": 1.4984093319194063, "grad_norm": 7.40625, "learning_rate": 4.377353765367479e-05, "loss": 4.5858, "num_input_tokens_seen": 19214064, "step": 9185 }, { "epoch": 1.4992250591402234, "grad_norm": 8.3125, "learning_rate": 4.376705305147614e-05, "loss": 3.2526, "num_input_tokens_seen": 19223792, "step": 9190 }, { "epoch": 1.500040786361041, "grad_norm": 5.84375, "learning_rate": 4.376056555516206e-05, "loss": 2.5012, "num_input_tokens_seen": 19233312, "step": 9195 }, { "epoch": 1.5008565135818581, "grad_norm": 13.375, "learning_rate": 4.375407516573302e-05, "loss": 1.1522, "num_input_tokens_seen": 19244192, "step": 9200 }, { "epoch": 1.5008565135818581, "eval_loss": 2.5400924682617188, "eval_runtime": 135.0531, "eval_samples_per_second": 20.177, "eval_steps_per_second": 10.092, "num_input_tokens_seen": 19244192, "step": 9200 }, { "epoch": 1.5016722408026757, "grad_norm": 7.84375, "learning_rate": 4.3747581884189913e-05, "loss": 3.1154, "num_input_tokens_seen": 19254640, "step": 9205 }, { "epoch": 1.5024879680234928, "grad_norm": 8.1875, "learning_rate": 4.374108571153408e-05, "loss": 2.1562, "num_input_tokens_seen": 19265568, "step": 9210 }, { "epoch": 1.5033036952443104, "grad_norm": 12.8125, "learning_rate": 4.3734586648767316e-05, "loss": 2.4074, "num_input_tokens_seen": 19275360, "step": 9215 }, { "epoch": 1.5041194224651275, "grad_norm": 6.625, "learning_rate": 4.372808469689186e-05, "loss": 1.8542, "num_input_tokens_seen": 19286800, "step": 9220 }, { "epoch": 1.504935149685945, "grad_norm": 3.25, "learning_rate": 4.372157985691039e-05, "loss": 0.8788, "num_input_tokens_seen": 19296192, "step": 9225 }, { "epoch": 1.5057508769067622, "grad_norm": 5.5625, "learning_rate": 4.371507212982603e-05, "loss": 1.4798, "num_input_tokens_seen": 19306864, "step": 9230 }, { "epoch": 1.5065666041275798, "grad_norm": 11.125, "learning_rate": 4.370856151664236e-05, "loss": 2.9215, "num_input_tokens_seen": 19318288, "step": 9235 }, { "epoch": 1.507382331348397, "grad_norm": 4.53125, "learning_rate": 4.3702048018363404e-05, "loss": 1.2685, "num_input_tokens_seen": 19328112, "step": 9240 }, { "epoch": 1.5081980585692145, "grad_norm": 8.125, "learning_rate": 4.369553163599362e-05, "loss": 2.4029, "num_input_tokens_seen": 19339872, "step": 9245 }, { "epoch": 1.5090137857900316, "grad_norm": 10.5, "learning_rate": 4.3689012370537904e-05, "loss": 3.1361, "num_input_tokens_seen": 19349424, "step": 9250 }, { "epoch": 1.5098295130108492, "grad_norm": 6.6875, "learning_rate": 4.368249022300164e-05, "loss": 2.5712, "num_input_tokens_seen": 19360560, "step": 9255 }, { "epoch": 1.5106452402316666, "grad_norm": 6.25, "learning_rate": 4.367596519439059e-05, "loss": 1.9279, "num_input_tokens_seen": 19369968, "step": 9260 }, { "epoch": 1.511460967452484, "grad_norm": 2.875, "learning_rate": 4.366943728571101e-05, "loss": 3.1247, "num_input_tokens_seen": 19378624, "step": 9265 }, { "epoch": 1.5122766946733013, "grad_norm": 2.453125, "learning_rate": 4.366290649796959e-05, "loss": 1.5744, "num_input_tokens_seen": 19388224, "step": 9270 }, { "epoch": 1.5130924218941186, "grad_norm": 8.6875, "learning_rate": 4.3656372832173456e-05, "loss": 2.543, "num_input_tokens_seen": 19399120, "step": 9275 }, { "epoch": 1.513908149114936, "grad_norm": 12.0625, "learning_rate": 4.364983628933017e-05, "loss": 4.1608, "num_input_tokens_seen": 19409584, "step": 9280 }, { "epoch": 1.5147238763357533, "grad_norm": 14.6875, "learning_rate": 4.364329687044777e-05, "loss": 2.9229, "num_input_tokens_seen": 19419360, "step": 9285 }, { "epoch": 1.5155396035565707, "grad_norm": 5.0625, "learning_rate": 4.36367545765347e-05, "loss": 2.0604, "num_input_tokens_seen": 19428368, "step": 9290 }, { "epoch": 1.516355330777388, "grad_norm": 9.625, "learning_rate": 4.363020940859988e-05, "loss": 3.5126, "num_input_tokens_seen": 19438240, "step": 9295 }, { "epoch": 1.5171710579982054, "grad_norm": 5.875, "learning_rate": 4.362366136765263e-05, "loss": 2.4625, "num_input_tokens_seen": 19448784, "step": 9300 }, { "epoch": 1.5179867852190227, "grad_norm": 8.75, "learning_rate": 4.361711045470278e-05, "loss": 2.9133, "num_input_tokens_seen": 19458752, "step": 9305 }, { "epoch": 1.51880251243984, "grad_norm": 3.921875, "learning_rate": 4.3610556670760524e-05, "loss": 1.2429, "num_input_tokens_seen": 19469888, "step": 9310 }, { "epoch": 1.5196182396606575, "grad_norm": 5.78125, "learning_rate": 4.360400001683657e-05, "loss": 1.3817, "num_input_tokens_seen": 19480848, "step": 9315 }, { "epoch": 1.5204339668814748, "grad_norm": 12.1875, "learning_rate": 4.3597440493942e-05, "loss": 2.3602, "num_input_tokens_seen": 19491280, "step": 9320 }, { "epoch": 1.5212496941022922, "grad_norm": 5.8125, "learning_rate": 4.3590878103088405e-05, "loss": 1.841, "num_input_tokens_seen": 19501904, "step": 9325 }, { "epoch": 1.5220654213231095, "grad_norm": 7.53125, "learning_rate": 4.358431284528779e-05, "loss": 2.2008, "num_input_tokens_seen": 19511984, "step": 9330 }, { "epoch": 1.5228811485439269, "grad_norm": 10.75, "learning_rate": 4.357774472155257e-05, "loss": 3.1643, "num_input_tokens_seen": 19521952, "step": 9335 }, { "epoch": 1.5236968757647442, "grad_norm": 3.515625, "learning_rate": 4.3571173732895664e-05, "loss": 1.7009, "num_input_tokens_seen": 19531648, "step": 9340 }, { "epoch": 1.5245126029855616, "grad_norm": 8.6875, "learning_rate": 4.356459988033039e-05, "loss": 2.5876, "num_input_tokens_seen": 19540672, "step": 9345 }, { "epoch": 1.5253283302063791, "grad_norm": 10.625, "learning_rate": 4.355802316487051e-05, "loss": 3.7905, "num_input_tokens_seen": 19549328, "step": 9350 }, { "epoch": 1.5261440574271963, "grad_norm": 7.375, "learning_rate": 4.355144358753025e-05, "loss": 2.6895, "num_input_tokens_seen": 19559872, "step": 9355 }, { "epoch": 1.5269597846480139, "grad_norm": 8.5625, "learning_rate": 4.354486114932425e-05, "loss": 2.4303, "num_input_tokens_seen": 19570880, "step": 9360 }, { "epoch": 1.527775511868831, "grad_norm": 4.03125, "learning_rate": 4.353827585126762e-05, "loss": 1.2927, "num_input_tokens_seen": 19581536, "step": 9365 }, { "epoch": 1.5285912390896486, "grad_norm": 10.1875, "learning_rate": 4.353168769437588e-05, "loss": 3.0233, "num_input_tokens_seen": 19591344, "step": 9370 }, { "epoch": 1.5294069663104657, "grad_norm": 8.5625, "learning_rate": 4.3525096679665014e-05, "loss": 2.2789, "num_input_tokens_seen": 19601664, "step": 9375 }, { "epoch": 1.5302226935312833, "grad_norm": 0.123046875, "learning_rate": 4.351850280815144e-05, "loss": 1.5994, "num_input_tokens_seen": 19612192, "step": 9380 }, { "epoch": 1.5310384207521004, "grad_norm": 6.84375, "learning_rate": 4.3511906080852014e-05, "loss": 1.8388, "num_input_tokens_seen": 19622256, "step": 9385 }, { "epoch": 1.531854147972918, "grad_norm": 6.46875, "learning_rate": 4.350530649878404e-05, "loss": 1.4125, "num_input_tokens_seen": 19633424, "step": 9390 }, { "epoch": 1.532669875193735, "grad_norm": 5.8125, "learning_rate": 4.3498704062965246e-05, "loss": 2.4417, "num_input_tokens_seen": 19643584, "step": 9395 }, { "epoch": 1.5334856024145527, "grad_norm": 8.0625, "learning_rate": 4.3492098774413815e-05, "loss": 2.7058, "num_input_tokens_seen": 19654192, "step": 9400 }, { "epoch": 1.5334856024145527, "eval_loss": 2.5398964881896973, "eval_runtime": 134.9947, "eval_samples_per_second": 20.186, "eval_steps_per_second": 10.097, "num_input_tokens_seen": 19654192, "step": 9400 }, { "epoch": 1.5343013296353698, "grad_norm": 7.53125, "learning_rate": 4.3485490634148375e-05, "loss": 2.8161, "num_input_tokens_seen": 19665248, "step": 9405 }, { "epoch": 1.5351170568561874, "grad_norm": 6.90625, "learning_rate": 4.347887964318797e-05, "loss": 2.0126, "num_input_tokens_seen": 19674784, "step": 9410 }, { "epoch": 1.5359327840770045, "grad_norm": 1.7265625, "learning_rate": 4.34722658025521e-05, "loss": 1.6498, "num_input_tokens_seen": 19685632, "step": 9415 }, { "epoch": 1.536748511297822, "grad_norm": 4.8125, "learning_rate": 4.346564911326071e-05, "loss": 3.838, "num_input_tokens_seen": 19697184, "step": 9420 }, { "epoch": 1.5375642385186392, "grad_norm": 4.40625, "learning_rate": 4.345902957633418e-05, "loss": 1.0547, "num_input_tokens_seen": 19708064, "step": 9425 }, { "epoch": 1.5383799657394568, "grad_norm": 4.125, "learning_rate": 4.345240719279331e-05, "loss": 2.7169, "num_input_tokens_seen": 19717664, "step": 9430 }, { "epoch": 1.539195692960274, "grad_norm": 4.46875, "learning_rate": 4.3445781963659374e-05, "loss": 1.9193, "num_input_tokens_seen": 19728736, "step": 9435 }, { "epoch": 1.5400114201810915, "grad_norm": 3.96875, "learning_rate": 4.3439153889954045e-05, "loss": 3.4239, "num_input_tokens_seen": 19740080, "step": 9440 }, { "epoch": 1.5408271474019088, "grad_norm": 12.1875, "learning_rate": 4.343252297269946e-05, "loss": 2.7399, "num_input_tokens_seen": 19750688, "step": 9445 }, { "epoch": 1.5416428746227262, "grad_norm": 2.046875, "learning_rate": 4.342588921291821e-05, "loss": 1.8026, "num_input_tokens_seen": 19762176, "step": 9450 }, { "epoch": 1.5424586018435436, "grad_norm": 7.71875, "learning_rate": 4.341925261163328e-05, "loss": 3.1393, "num_input_tokens_seen": 19771920, "step": 9455 }, { "epoch": 1.543274329064361, "grad_norm": 7.78125, "learning_rate": 4.341261316986813e-05, "loss": 1.768, "num_input_tokens_seen": 19782784, "step": 9460 }, { "epoch": 1.5440900562851783, "grad_norm": 5.15625, "learning_rate": 4.340597088864664e-05, "loss": 2.0342, "num_input_tokens_seen": 19794256, "step": 9465 }, { "epoch": 1.5449057835059956, "grad_norm": 7.0625, "learning_rate": 4.339932576899313e-05, "loss": 2.4258, "num_input_tokens_seen": 19805168, "step": 9470 }, { "epoch": 1.545721510726813, "grad_norm": 5.84375, "learning_rate": 4.3392677811932375e-05, "loss": 2.2318, "num_input_tokens_seen": 19816288, "step": 9475 }, { "epoch": 1.5465372379476303, "grad_norm": 2.5625, "learning_rate": 4.338602701848956e-05, "loss": 1.4446, "num_input_tokens_seen": 19826144, "step": 9480 }, { "epoch": 1.5473529651684477, "grad_norm": 4.90625, "learning_rate": 4.337937338969033e-05, "loss": 2.8775, "num_input_tokens_seen": 19835840, "step": 9485 }, { "epoch": 1.548168692389265, "grad_norm": 13.3125, "learning_rate": 4.337271692656075e-05, "loss": 3.0846, "num_input_tokens_seen": 19848000, "step": 9490 }, { "epoch": 1.5489844196100824, "grad_norm": 6.15625, "learning_rate": 4.336605763012733e-05, "loss": 2.3266, "num_input_tokens_seen": 19858496, "step": 9495 }, { "epoch": 1.5498001468308997, "grad_norm": 6.65625, "learning_rate": 4.3359395501417026e-05, "loss": 1.5661, "num_input_tokens_seen": 19870304, "step": 9500 }, { "epoch": 1.550615874051717, "grad_norm": 4.28125, "learning_rate": 4.335273054145722e-05, "loss": 1.3735, "num_input_tokens_seen": 19881904, "step": 9505 }, { "epoch": 1.5514316012725344, "grad_norm": 5.4375, "learning_rate": 4.334606275127572e-05, "loss": 2.4925, "num_input_tokens_seen": 19892720, "step": 9510 }, { "epoch": 1.5522473284933518, "grad_norm": 5.21875, "learning_rate": 4.33393921319008e-05, "loss": 2.0904, "num_input_tokens_seen": 19902800, "step": 9515 }, { "epoch": 1.5530630557141691, "grad_norm": 8.5, "learning_rate": 4.3332718684361146e-05, "loss": 2.5532, "num_input_tokens_seen": 19913424, "step": 9520 }, { "epoch": 1.5538787829349865, "grad_norm": 8.0625, "learning_rate": 4.332604240968588e-05, "loss": 1.8244, "num_input_tokens_seen": 19922976, "step": 9525 }, { "epoch": 1.5546945101558038, "grad_norm": 5.90625, "learning_rate": 4.331936330890459e-05, "loss": 2.9644, "num_input_tokens_seen": 19932992, "step": 9530 }, { "epoch": 1.5555102373766214, "grad_norm": 18.125, "learning_rate": 4.331268138304725e-05, "loss": 1.7415, "num_input_tokens_seen": 19944160, "step": 9535 }, { "epoch": 1.5563259645974385, "grad_norm": 11.4375, "learning_rate": 4.330599663314431e-05, "loss": 1.007, "num_input_tokens_seen": 19955216, "step": 9540 }, { "epoch": 1.5571416918182561, "grad_norm": 5.84375, "learning_rate": 4.329930906022665e-05, "loss": 3.0016, "num_input_tokens_seen": 19965440, "step": 9545 }, { "epoch": 1.5579574190390733, "grad_norm": 6.5625, "learning_rate": 4.3292618665325564e-05, "loss": 1.9259, "num_input_tokens_seen": 19973936, "step": 9550 }, { "epoch": 1.5587731462598908, "grad_norm": 14.1875, "learning_rate": 4.3285925449472796e-05, "loss": 3.0655, "num_input_tokens_seen": 19984752, "step": 9555 }, { "epoch": 1.559588873480708, "grad_norm": 4.375, "learning_rate": 4.327922941370054e-05, "loss": 1.6832, "num_input_tokens_seen": 19995056, "step": 9560 }, { "epoch": 1.5604046007015255, "grad_norm": 6.875, "learning_rate": 4.3272530559041384e-05, "loss": 1.3606, "num_input_tokens_seen": 20005888, "step": 9565 }, { "epoch": 1.5612203279223427, "grad_norm": 2.75, "learning_rate": 4.32658288865284e-05, "loss": 2.2324, "num_input_tokens_seen": 20016688, "step": 9570 }, { "epoch": 1.5620360551431602, "grad_norm": 16.125, "learning_rate": 4.325912439719505e-05, "loss": 2.4209, "num_input_tokens_seen": 20027808, "step": 9575 }, { "epoch": 1.5628517823639774, "grad_norm": 2.6875, "learning_rate": 4.3252417092075266e-05, "loss": 1.0929, "num_input_tokens_seen": 20038144, "step": 9580 }, { "epoch": 1.563667509584795, "grad_norm": 3.765625, "learning_rate": 4.3245706972203385e-05, "loss": 3.2779, "num_input_tokens_seen": 20049552, "step": 9585 }, { "epoch": 1.564483236805612, "grad_norm": 7.59375, "learning_rate": 4.323899403861421e-05, "loss": 2.8241, "num_input_tokens_seen": 20057744, "step": 9590 }, { "epoch": 1.5652989640264297, "grad_norm": 2.640625, "learning_rate": 4.3232278292342935e-05, "loss": 1.6569, "num_input_tokens_seen": 20067536, "step": 9595 }, { "epoch": 1.5661146912472468, "grad_norm": 6.46875, "learning_rate": 4.322555973442524e-05, "loss": 1.7284, "num_input_tokens_seen": 20077520, "step": 9600 }, { "epoch": 1.5661146912472468, "eval_loss": 2.555157423019409, "eval_runtime": 134.7593, "eval_samples_per_second": 20.221, "eval_steps_per_second": 10.114, "num_input_tokens_seen": 20077520, "step": 9600 }, { "epoch": 1.5669304184680644, "grad_norm": 3.75, "learning_rate": 4.3218838365897184e-05, "loss": 1.7063, "num_input_tokens_seen": 20086640, "step": 9605 }, { "epoch": 1.5677461456888815, "grad_norm": 7.125, "learning_rate": 4.3212114187795306e-05, "loss": 2.1828, "num_input_tokens_seen": 20097472, "step": 9610 }, { "epoch": 1.568561872909699, "grad_norm": 12.4375, "learning_rate": 4.320538720115656e-05, "loss": 1.602, "num_input_tokens_seen": 20107104, "step": 9615 }, { "epoch": 1.5693776001305162, "grad_norm": 9.6875, "learning_rate": 4.319865740701831e-05, "loss": 1.4984, "num_input_tokens_seen": 20117280, "step": 9620 }, { "epoch": 1.5701933273513338, "grad_norm": 12.4375, "learning_rate": 4.3191924806418396e-05, "loss": 2.1557, "num_input_tokens_seen": 20129120, "step": 9625 }, { "epoch": 1.5710090545721511, "grad_norm": 5.65625, "learning_rate": 4.318518940039507e-05, "loss": 1.5604, "num_input_tokens_seen": 20138672, "step": 9630 }, { "epoch": 1.5718247817929685, "grad_norm": 16.125, "learning_rate": 4.3178451189987e-05, "loss": 2.5694, "num_input_tokens_seen": 20149888, "step": 9635 }, { "epoch": 1.5726405090137858, "grad_norm": 4.625, "learning_rate": 4.3171710176233315e-05, "loss": 1.3876, "num_input_tokens_seen": 20160192, "step": 9640 }, { "epoch": 1.5734562362346032, "grad_norm": 10.3125, "learning_rate": 4.316496636017355e-05, "loss": 3.0902, "num_input_tokens_seen": 20170608, "step": 9645 }, { "epoch": 1.5742719634554205, "grad_norm": 5.59375, "learning_rate": 4.315821974284771e-05, "loss": 2.1185, "num_input_tokens_seen": 20181936, "step": 9650 }, { "epoch": 1.5750876906762379, "grad_norm": 5.90625, "learning_rate": 4.315147032529619e-05, "loss": 2.2076, "num_input_tokens_seen": 20191008, "step": 9655 }, { "epoch": 1.5759034178970552, "grad_norm": 15.3125, "learning_rate": 4.3144718108559845e-05, "loss": 2.3942, "num_input_tokens_seen": 20199840, "step": 9660 }, { "epoch": 1.5767191451178726, "grad_norm": 7.875, "learning_rate": 4.3137963093679945e-05, "loss": 2.6206, "num_input_tokens_seen": 20211152, "step": 9665 }, { "epoch": 1.57753487233869, "grad_norm": 4.46875, "learning_rate": 4.31312052816982e-05, "loss": 1.6106, "num_input_tokens_seen": 20220304, "step": 9670 }, { "epoch": 1.5783505995595073, "grad_norm": 6.21875, "learning_rate": 4.312444467365675e-05, "loss": 1.7438, "num_input_tokens_seen": 20230432, "step": 9675 }, { "epoch": 1.5791663267803246, "grad_norm": 6.03125, "learning_rate": 4.311768127059816e-05, "loss": 2.0414, "num_input_tokens_seen": 20240896, "step": 9680 }, { "epoch": 1.579982054001142, "grad_norm": 4.875, "learning_rate": 4.3110915073565444e-05, "loss": 2.1504, "num_input_tokens_seen": 20252448, "step": 9685 }, { "epoch": 1.5807977812219594, "grad_norm": 1.75, "learning_rate": 4.310414608360203e-05, "loss": 2.5433, "num_input_tokens_seen": 20262768, "step": 9690 }, { "epoch": 1.5816135084427767, "grad_norm": 5.3125, "learning_rate": 4.309737430175177e-05, "loss": 2.1019, "num_input_tokens_seen": 20273584, "step": 9695 }, { "epoch": 1.582429235663594, "grad_norm": 8.4375, "learning_rate": 4.309059972905897e-05, "loss": 2.0451, "num_input_tokens_seen": 20285056, "step": 9700 }, { "epoch": 1.5832449628844114, "grad_norm": 9.25, "learning_rate": 4.308382236656836e-05, "loss": 1.8828, "num_input_tokens_seen": 20296000, "step": 9705 }, { "epoch": 1.5840606901052288, "grad_norm": 4.5, "learning_rate": 4.307704221532507e-05, "loss": 2.2228, "num_input_tokens_seen": 20306464, "step": 9710 }, { "epoch": 1.5848764173260461, "grad_norm": 9.9375, "learning_rate": 4.307025927637471e-05, "loss": 2.4342, "num_input_tokens_seen": 20316320, "step": 9715 }, { "epoch": 1.5856921445468637, "grad_norm": 8.875, "learning_rate": 4.306347355076328e-05, "loss": 2.5944, "num_input_tokens_seen": 20327488, "step": 9720 }, { "epoch": 1.5865078717676808, "grad_norm": 4.34375, "learning_rate": 4.305668503953724e-05, "loss": 4.4652, "num_input_tokens_seen": 20336832, "step": 9725 }, { "epoch": 1.5873235989884984, "grad_norm": 5.0, "learning_rate": 4.3049893743743436e-05, "loss": 2.4719, "num_input_tokens_seen": 20346288, "step": 9730 }, { "epoch": 1.5881393262093155, "grad_norm": 9.0625, "learning_rate": 4.304309966442919e-05, "loss": 2.1083, "num_input_tokens_seen": 20356016, "step": 9735 }, { "epoch": 1.588955053430133, "grad_norm": 12.9375, "learning_rate": 4.303630280264224e-05, "loss": 3.3368, "num_input_tokens_seen": 20368208, "step": 9740 }, { "epoch": 1.5897707806509502, "grad_norm": 3.515625, "learning_rate": 4.302950315943074e-05, "loss": 2.7414, "num_input_tokens_seen": 20379408, "step": 9745 }, { "epoch": 1.5905865078717678, "grad_norm": 7.0, "learning_rate": 4.3022700735843275e-05, "loss": 2.9612, "num_input_tokens_seen": 20388864, "step": 9750 }, { "epoch": 1.591402235092585, "grad_norm": 2.8125, "learning_rate": 4.301589553292887e-05, "loss": 2.1642, "num_input_tokens_seen": 20399120, "step": 9755 }, { "epoch": 1.5922179623134025, "grad_norm": 7.21875, "learning_rate": 4.300908755173697e-05, "loss": 1.8425, "num_input_tokens_seen": 20410464, "step": 9760 }, { "epoch": 1.5930336895342196, "grad_norm": 1.4921875, "learning_rate": 4.300227679331745e-05, "loss": 2.1656, "num_input_tokens_seen": 20420400, "step": 9765 }, { "epoch": 1.5938494167550372, "grad_norm": 10.625, "learning_rate": 4.299546325872063e-05, "loss": 2.9693, "num_input_tokens_seen": 20430512, "step": 9770 }, { "epoch": 1.5946651439758543, "grad_norm": 13.0625, "learning_rate": 4.2988646948997225e-05, "loss": 2.0745, "num_input_tokens_seen": 20440848, "step": 9775 }, { "epoch": 1.595480871196672, "grad_norm": 6.625, "learning_rate": 4.29818278651984e-05, "loss": 2.458, "num_input_tokens_seen": 20451520, "step": 9780 }, { "epoch": 1.596296598417489, "grad_norm": 7.34375, "learning_rate": 4.297500600837574e-05, "loss": 3.2988, "num_input_tokens_seen": 20461856, "step": 9785 }, { "epoch": 1.5971123256383066, "grad_norm": 8.25, "learning_rate": 4.2968181379581276e-05, "loss": 3.4894, "num_input_tokens_seen": 20473648, "step": 9790 }, { "epoch": 1.5979280528591238, "grad_norm": 9.1875, "learning_rate": 4.296135397986743e-05, "loss": 2.5406, "num_input_tokens_seen": 20482752, "step": 9795 }, { "epoch": 1.5987437800799413, "grad_norm": 6.1875, "learning_rate": 4.295452381028709e-05, "loss": 2.7139, "num_input_tokens_seen": 20493344, "step": 9800 }, { "epoch": 1.5987437800799413, "eval_loss": 2.5283308029174805, "eval_runtime": 134.8676, "eval_samples_per_second": 20.205, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 20493344, "step": 9800 }, { "epoch": 1.5995595073007585, "grad_norm": 16.0, "learning_rate": 4.294769087189354e-05, "loss": 3.8364, "num_input_tokens_seen": 20505264, "step": 9805 }, { "epoch": 1.600375234521576, "grad_norm": 6.9375, "learning_rate": 4.294085516574052e-05, "loss": 1.7072, "num_input_tokens_seen": 20515632, "step": 9810 }, { "epoch": 1.6011909617423934, "grad_norm": 5.09375, "learning_rate": 4.2934016692882176e-05, "loss": 1.2228, "num_input_tokens_seen": 20526848, "step": 9815 }, { "epoch": 1.6020066889632107, "grad_norm": 7.875, "learning_rate": 4.292717545437308e-05, "loss": 2.6823, "num_input_tokens_seen": 20536416, "step": 9820 }, { "epoch": 1.602822416184028, "grad_norm": 7.75, "learning_rate": 4.292033145126825e-05, "loss": 2.4265, "num_input_tokens_seen": 20547008, "step": 9825 }, { "epoch": 1.6036381434048455, "grad_norm": 9.3125, "learning_rate": 4.29134846846231e-05, "loss": 1.145, "num_input_tokens_seen": 20558064, "step": 9830 }, { "epoch": 1.6044538706256628, "grad_norm": 4.25, "learning_rate": 4.29066351554935e-05, "loss": 1.7239, "num_input_tokens_seen": 20568512, "step": 9835 }, { "epoch": 1.6052695978464802, "grad_norm": 7.59375, "learning_rate": 4.289978286493574e-05, "loss": 2.7111, "num_input_tokens_seen": 20578256, "step": 9840 }, { "epoch": 1.6060853250672975, "grad_norm": 12.6875, "learning_rate": 4.28929278140065e-05, "loss": 3.2203, "num_input_tokens_seen": 20589344, "step": 9845 }, { "epoch": 1.6069010522881149, "grad_norm": 4.3125, "learning_rate": 4.288607000376295e-05, "loss": 2.4628, "num_input_tokens_seen": 20600960, "step": 9850 }, { "epoch": 1.6077167795089322, "grad_norm": 7.5625, "learning_rate": 4.2879209435262624e-05, "loss": 2.3288, "num_input_tokens_seen": 20611424, "step": 9855 }, { "epoch": 1.6085325067297496, "grad_norm": 6.84375, "learning_rate": 4.287234610956353e-05, "loss": 3.4665, "num_input_tokens_seen": 20621824, "step": 9860 }, { "epoch": 1.609348233950567, "grad_norm": 12.5625, "learning_rate": 4.2865480027724056e-05, "loss": 2.9214, "num_input_tokens_seen": 20631264, "step": 9865 }, { "epoch": 1.6101639611713843, "grad_norm": 8.5625, "learning_rate": 4.285861119080306e-05, "loss": 2.6535, "num_input_tokens_seen": 20642096, "step": 9870 }, { "epoch": 1.6109796883922016, "grad_norm": 9.875, "learning_rate": 4.2851739599859784e-05, "loss": 2.9815, "num_input_tokens_seen": 20652976, "step": 9875 }, { "epoch": 1.611795415613019, "grad_norm": 10.3125, "learning_rate": 4.2844865255953934e-05, "loss": 3.4443, "num_input_tokens_seen": 20661920, "step": 9880 }, { "epoch": 1.6126111428338363, "grad_norm": 11.4375, "learning_rate": 4.2837988160145605e-05, "loss": 2.2692, "num_input_tokens_seen": 20672336, "step": 9885 }, { "epoch": 1.6134268700546537, "grad_norm": 4.0, "learning_rate": 4.2831108313495336e-05, "loss": 1.8791, "num_input_tokens_seen": 20684448, "step": 9890 }, { "epoch": 1.614242597275471, "grad_norm": 3.421875, "learning_rate": 4.282422571706408e-05, "loss": 2.0034, "num_input_tokens_seen": 20695248, "step": 9895 }, { "epoch": 1.6150583244962884, "grad_norm": 2.28125, "learning_rate": 4.281734037191323e-05, "loss": 2.2174, "num_input_tokens_seen": 20705488, "step": 9900 }, { "epoch": 1.615874051717106, "grad_norm": 5.5, "learning_rate": 4.281045227910459e-05, "loss": 1.578, "num_input_tokens_seen": 20715120, "step": 9905 }, { "epoch": 1.616689778937923, "grad_norm": 3.703125, "learning_rate": 4.280356143970038e-05, "loss": 1.6327, "num_input_tokens_seen": 20724624, "step": 9910 }, { "epoch": 1.6175055061587407, "grad_norm": 11.3125, "learning_rate": 4.279666785476327e-05, "loss": 2.7766, "num_input_tokens_seen": 20735664, "step": 9915 }, { "epoch": 1.6183212333795578, "grad_norm": 8.5, "learning_rate": 4.2789771525356325e-05, "loss": 2.478, "num_input_tokens_seen": 20745104, "step": 9920 }, { "epoch": 1.6191369606003754, "grad_norm": 8.5, "learning_rate": 4.2782872452543056e-05, "loss": 2.5877, "num_input_tokens_seen": 20755632, "step": 9925 }, { "epoch": 1.6199526878211925, "grad_norm": 5.8125, "learning_rate": 4.2775970637387376e-05, "loss": 1.6479, "num_input_tokens_seen": 20766304, "step": 9930 }, { "epoch": 1.62076841504201, "grad_norm": 5.875, "learning_rate": 4.276906608095363e-05, "loss": 3.4129, "num_input_tokens_seen": 20777712, "step": 9935 }, { "epoch": 1.6215841422628272, "grad_norm": 5.59375, "learning_rate": 4.276215878430661e-05, "loss": 2.6432, "num_input_tokens_seen": 20788000, "step": 9940 }, { "epoch": 1.6223998694836448, "grad_norm": 4.96875, "learning_rate": 4.275524874851149e-05, "loss": 1.8947, "num_input_tokens_seen": 20798720, "step": 9945 }, { "epoch": 1.623215596704462, "grad_norm": 6.6875, "learning_rate": 4.274833597463388e-05, "loss": 1.7858, "num_input_tokens_seen": 20808704, "step": 9950 }, { "epoch": 1.6240313239252795, "grad_norm": 10.875, "learning_rate": 4.2741420463739824e-05, "loss": 2.6123, "num_input_tokens_seen": 20818848, "step": 9955 }, { "epoch": 1.6248470511460966, "grad_norm": 9.0625, "learning_rate": 4.273450221689578e-05, "loss": 3.578, "num_input_tokens_seen": 20828304, "step": 9960 }, { "epoch": 1.6256627783669142, "grad_norm": 8.4375, "learning_rate": 4.272758123516863e-05, "loss": 2.9122, "num_input_tokens_seen": 20837840, "step": 9965 }, { "epoch": 1.6264785055877313, "grad_norm": 6.78125, "learning_rate": 4.272065751962567e-05, "loss": 1.7519, "num_input_tokens_seen": 20848608, "step": 9970 }, { "epoch": 1.627294232808549, "grad_norm": 11.5, "learning_rate": 4.271373107133464e-05, "loss": 2.5725, "num_input_tokens_seen": 20859616, "step": 9975 }, { "epoch": 1.628109960029366, "grad_norm": 20.25, "learning_rate": 4.270680189136366e-05, "loss": 3.8603, "num_input_tokens_seen": 20869984, "step": 9980 }, { "epoch": 1.6289256872501836, "grad_norm": 12.125, "learning_rate": 4.269986998078132e-05, "loss": 3.1739, "num_input_tokens_seen": 20880448, "step": 9985 }, { "epoch": 1.6297414144710007, "grad_norm": 12.0625, "learning_rate": 4.2692935340656595e-05, "loss": 2.2646, "num_input_tokens_seen": 20890480, "step": 9990 }, { "epoch": 1.6305571416918183, "grad_norm": 5.4375, "learning_rate": 4.26859979720589e-05, "loss": 2.554, "num_input_tokens_seen": 20902416, "step": 9995 }, { "epoch": 1.6313728689126357, "grad_norm": 5.75, "learning_rate": 4.267905787605806e-05, "loss": 2.3272, "num_input_tokens_seen": 20912896, "step": 10000 }, { "epoch": 1.6313728689126357, "eval_loss": 2.542886972427368, "eval_runtime": 135.0431, "eval_samples_per_second": 20.179, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 20912896, "step": 10000 }, { "epoch": 1.632188596133453, "grad_norm": 8.9375, "learning_rate": 4.267211505372433e-05, "loss": 1.6226, "num_input_tokens_seen": 20923056, "step": 10005 }, { "epoch": 1.6330043233542704, "grad_norm": 7.96875, "learning_rate": 4.266516950612837e-05, "loss": 2.0991, "num_input_tokens_seen": 20934336, "step": 10010 }, { "epoch": 1.6338200505750877, "grad_norm": 4.25, "learning_rate": 4.265822123434128e-05, "loss": 2.9489, "num_input_tokens_seen": 20945440, "step": 10015 }, { "epoch": 1.634635777795905, "grad_norm": 3.640625, "learning_rate": 4.265127023943457e-05, "loss": 1.4088, "num_input_tokens_seen": 20956544, "step": 10020 }, { "epoch": 1.6354515050167224, "grad_norm": 5.6875, "learning_rate": 4.2644316522480176e-05, "loss": 3.0212, "num_input_tokens_seen": 20966560, "step": 10025 }, { "epoch": 1.6362672322375398, "grad_norm": 8.4375, "learning_rate": 4.263736008455044e-05, "loss": 1.8325, "num_input_tokens_seen": 20976784, "step": 10030 }, { "epoch": 1.6370829594583571, "grad_norm": 4.59375, "learning_rate": 4.2630400926718125e-05, "loss": 2.3791, "num_input_tokens_seen": 20988256, "step": 10035 }, { "epoch": 1.6378986866791745, "grad_norm": 6.96875, "learning_rate": 4.262343905005644e-05, "loss": 1.1471, "num_input_tokens_seen": 20997648, "step": 10040 }, { "epoch": 1.6387144138999918, "grad_norm": 1.5625, "learning_rate": 4.261647445563897e-05, "loss": 2.0964, "num_input_tokens_seen": 21008480, "step": 10045 }, { "epoch": 1.6395301411208092, "grad_norm": 6.96875, "learning_rate": 4.260950714453976e-05, "loss": 3.2704, "num_input_tokens_seen": 21018384, "step": 10050 }, { "epoch": 1.6403458683416265, "grad_norm": 7.375, "learning_rate": 4.2602537117833266e-05, "loss": 1.6717, "num_input_tokens_seen": 21029424, "step": 10055 }, { "epoch": 1.641161595562444, "grad_norm": 13.4375, "learning_rate": 4.259556437659433e-05, "loss": 1.8708, "num_input_tokens_seen": 21039936, "step": 10060 }, { "epoch": 1.6419773227832613, "grad_norm": 4.9375, "learning_rate": 4.258858892189825e-05, "loss": 1.7369, "num_input_tokens_seen": 21049904, "step": 10065 }, { "epoch": 1.6427930500040786, "grad_norm": 6.5, "learning_rate": 4.2581610754820725e-05, "loss": 1.122, "num_input_tokens_seen": 21059312, "step": 10070 }, { "epoch": 1.643608777224896, "grad_norm": 6.0, "learning_rate": 4.2574629876437876e-05, "loss": 0.9097, "num_input_tokens_seen": 21068816, "step": 10075 }, { "epoch": 1.6444245044457133, "grad_norm": 14.5625, "learning_rate": 4.256764628782625e-05, "loss": 2.4086, "num_input_tokens_seen": 21080128, "step": 10080 }, { "epoch": 1.6452402316665307, "grad_norm": 12.125, "learning_rate": 4.256065999006279e-05, "loss": 2.7302, "num_input_tokens_seen": 21090800, "step": 10085 }, { "epoch": 1.6460559588873482, "grad_norm": 5.0625, "learning_rate": 4.2553670984224885e-05, "loss": 2.3471, "num_input_tokens_seen": 21101744, "step": 10090 }, { "epoch": 1.6468716861081654, "grad_norm": 6.75, "learning_rate": 4.254667927139032e-05, "loss": 3.115, "num_input_tokens_seen": 21112320, "step": 10095 }, { "epoch": 1.647687413328983, "grad_norm": 8.0, "learning_rate": 4.2539684852637295e-05, "loss": 2.7829, "num_input_tokens_seen": 21123376, "step": 10100 }, { "epoch": 1.6485031405498, "grad_norm": 5.59375, "learning_rate": 4.253268772904446e-05, "loss": 2.4823, "num_input_tokens_seen": 21135104, "step": 10105 }, { "epoch": 1.6493188677706176, "grad_norm": 4.03125, "learning_rate": 4.252568790169085e-05, "loss": 3.1325, "num_input_tokens_seen": 21145744, "step": 10110 }, { "epoch": 1.6501345949914348, "grad_norm": 5.5625, "learning_rate": 4.251868537165592e-05, "loss": 1.9709, "num_input_tokens_seen": 21157488, "step": 10115 }, { "epoch": 1.6509503222122524, "grad_norm": 5.15625, "learning_rate": 4.251168014001955e-05, "loss": 1.5757, "num_input_tokens_seen": 21167120, "step": 10120 }, { "epoch": 1.6517660494330695, "grad_norm": 14.0625, "learning_rate": 4.250467220786204e-05, "loss": 2.7046, "num_input_tokens_seen": 21178704, "step": 10125 }, { "epoch": 1.652581776653887, "grad_norm": 7.46875, "learning_rate": 4.249766157626409e-05, "loss": 2.9901, "num_input_tokens_seen": 21188640, "step": 10130 }, { "epoch": 1.6533975038747042, "grad_norm": 11.5, "learning_rate": 4.249064824630684e-05, "loss": 3.1327, "num_input_tokens_seen": 21199312, "step": 10135 }, { "epoch": 1.6542132310955218, "grad_norm": 7.21875, "learning_rate": 4.248363221907183e-05, "loss": 2.0713, "num_input_tokens_seen": 21209824, "step": 10140 }, { "epoch": 1.655028958316339, "grad_norm": 6.59375, "learning_rate": 4.2476613495641026e-05, "loss": 2.7075, "num_input_tokens_seen": 21220832, "step": 10145 }, { "epoch": 1.6558446855371565, "grad_norm": 8.125, "learning_rate": 4.246959207709679e-05, "loss": 3.6813, "num_input_tokens_seen": 21230928, "step": 10150 }, { "epoch": 1.6566604127579736, "grad_norm": 7.0625, "learning_rate": 4.246256796452192e-05, "loss": 1.5339, "num_input_tokens_seen": 21240928, "step": 10155 }, { "epoch": 1.6574761399787912, "grad_norm": 5.15625, "learning_rate": 4.245554115899962e-05, "loss": 2.1236, "num_input_tokens_seen": 21251280, "step": 10160 }, { "epoch": 1.6582918671996083, "grad_norm": 9.8125, "learning_rate": 4.2448511661613514e-05, "loss": 2.2097, "num_input_tokens_seen": 21261072, "step": 10165 }, { "epoch": 1.6591075944204259, "grad_norm": 4.65625, "learning_rate": 4.2441479473447635e-05, "loss": 1.7586, "num_input_tokens_seen": 21270416, "step": 10170 }, { "epoch": 1.659923321641243, "grad_norm": 4.4375, "learning_rate": 4.243444459558644e-05, "loss": 2.6622, "num_input_tokens_seen": 21280976, "step": 10175 }, { "epoch": 1.6607390488620606, "grad_norm": 13.375, "learning_rate": 4.24274070291148e-05, "loss": 2.9632, "num_input_tokens_seen": 21290304, "step": 10180 }, { "epoch": 1.661554776082878, "grad_norm": 2.515625, "learning_rate": 4.242036677511798e-05, "loss": 1.6508, "num_input_tokens_seen": 21299504, "step": 10185 }, { "epoch": 1.6623705033036953, "grad_norm": 11.3125, "learning_rate": 4.241332383468169e-05, "loss": 2.7934, "num_input_tokens_seen": 21308288, "step": 10190 }, { "epoch": 1.6631862305245126, "grad_norm": 10.9375, "learning_rate": 4.2406278208892034e-05, "loss": 3.0532, "num_input_tokens_seen": 21318704, "step": 10195 }, { "epoch": 1.66400195774533, "grad_norm": 4.90625, "learning_rate": 4.2399229898835536e-05, "loss": 2.7251, "num_input_tokens_seen": 21328976, "step": 10200 }, { "epoch": 1.66400195774533, "eval_loss": 2.5554885864257812, "eval_runtime": 135.1264, "eval_samples_per_second": 20.166, "eval_steps_per_second": 10.087, "num_input_tokens_seen": 21328976, "step": 10200 }, { "epoch": 1.6648176849661473, "grad_norm": 9.25, "learning_rate": 4.239217890559914e-05, "loss": 2.7855, "num_input_tokens_seen": 21339840, "step": 10205 }, { "epoch": 1.6656334121869647, "grad_norm": 0.2060546875, "learning_rate": 4.238512523027019e-05, "loss": 0.9088, "num_input_tokens_seen": 21350368, "step": 10210 }, { "epoch": 1.666449139407782, "grad_norm": 7.1875, "learning_rate": 4.237806887393645e-05, "loss": 2.8211, "num_input_tokens_seen": 21360736, "step": 10215 }, { "epoch": 1.6672648666285994, "grad_norm": 9.25, "learning_rate": 4.237100983768611e-05, "loss": 1.4705, "num_input_tokens_seen": 21372096, "step": 10220 }, { "epoch": 1.6680805938494168, "grad_norm": 2.09375, "learning_rate": 4.2363948122607756e-05, "loss": 1.5441, "num_input_tokens_seen": 21382240, "step": 10225 }, { "epoch": 1.6688963210702341, "grad_norm": 10.375, "learning_rate": 4.235688372979039e-05, "loss": 2.7682, "num_input_tokens_seen": 21392864, "step": 10230 }, { "epoch": 1.6697120482910515, "grad_norm": 7.40625, "learning_rate": 4.234981666032343e-05, "loss": 3.0931, "num_input_tokens_seen": 21403008, "step": 10235 }, { "epoch": 1.6705277755118688, "grad_norm": 9.625, "learning_rate": 4.2342746915296704e-05, "loss": 2.3487, "num_input_tokens_seen": 21414048, "step": 10240 }, { "epoch": 1.6713435027326862, "grad_norm": 7.25, "learning_rate": 4.233567449580047e-05, "loss": 2.3831, "num_input_tokens_seen": 21425792, "step": 10245 }, { "epoch": 1.6721592299535035, "grad_norm": 8.0625, "learning_rate": 4.232859940292537e-05, "loss": 3.5676, "num_input_tokens_seen": 21434736, "step": 10250 }, { "epoch": 1.6729749571743209, "grad_norm": 1.4375, "learning_rate": 4.232152163776248e-05, "loss": 2.0049, "num_input_tokens_seen": 21445984, "step": 10255 }, { "epoch": 1.6737906843951382, "grad_norm": 0.53125, "learning_rate": 4.231444120140328e-05, "loss": 1.6876, "num_input_tokens_seen": 21457440, "step": 10260 }, { "epoch": 1.6746064116159556, "grad_norm": 7.03125, "learning_rate": 4.230735809493967e-05, "loss": 2.5528, "num_input_tokens_seen": 21468112, "step": 10265 }, { "epoch": 1.675422138836773, "grad_norm": 9.5, "learning_rate": 4.2300272319463926e-05, "loss": 3.9087, "num_input_tokens_seen": 21478912, "step": 10270 }, { "epoch": 1.6762378660575903, "grad_norm": 2.765625, "learning_rate": 4.2293183876068786e-05, "loss": 1.9837, "num_input_tokens_seen": 21489264, "step": 10275 }, { "epoch": 1.6770535932784076, "grad_norm": 9.25, "learning_rate": 4.228609276584737e-05, "loss": 2.3761, "num_input_tokens_seen": 21500944, "step": 10280 }, { "epoch": 1.6778693204992252, "grad_norm": 7.5, "learning_rate": 4.227899898989323e-05, "loss": 2.1788, "num_input_tokens_seen": 21510848, "step": 10285 }, { "epoch": 1.6786850477200423, "grad_norm": 5.53125, "learning_rate": 4.2271902549300293e-05, "loss": 2.2827, "num_input_tokens_seen": 21520384, "step": 10290 }, { "epoch": 1.67950077494086, "grad_norm": 5.03125, "learning_rate": 4.226480344516294e-05, "loss": 1.9437, "num_input_tokens_seen": 21531600, "step": 10295 }, { "epoch": 1.680316502161677, "grad_norm": 4.5, "learning_rate": 4.2257701678575925e-05, "loss": 2.1368, "num_input_tokens_seen": 21541760, "step": 10300 }, { "epoch": 1.6811322293824946, "grad_norm": 7.0625, "learning_rate": 4.225059725063444e-05, "loss": 1.9154, "num_input_tokens_seen": 21552000, "step": 10305 }, { "epoch": 1.6819479566033118, "grad_norm": 5.4375, "learning_rate": 4.2243490162434074e-05, "loss": 2.0984, "num_input_tokens_seen": 21562416, "step": 10310 }, { "epoch": 1.6827636838241293, "grad_norm": 8.25, "learning_rate": 4.223638041507083e-05, "loss": 2.6947, "num_input_tokens_seen": 21572656, "step": 10315 }, { "epoch": 1.6835794110449465, "grad_norm": 3.078125, "learning_rate": 4.2229268009641124e-05, "loss": 2.6856, "num_input_tokens_seen": 21583456, "step": 10320 }, { "epoch": 1.684395138265764, "grad_norm": 3.203125, "learning_rate": 4.222215294724177e-05, "loss": 2.5418, "num_input_tokens_seen": 21593696, "step": 10325 }, { "epoch": 1.6852108654865812, "grad_norm": 10.5625, "learning_rate": 4.2215035228970005e-05, "loss": 1.5621, "num_input_tokens_seen": 21602832, "step": 10330 }, { "epoch": 1.6860265927073987, "grad_norm": 10.125, "learning_rate": 4.2207914855923464e-05, "loss": 1.9968, "num_input_tokens_seen": 21613760, "step": 10335 }, { "epoch": 1.6868423199282159, "grad_norm": 3.1875, "learning_rate": 4.220079182920021e-05, "loss": 1.9081, "num_input_tokens_seen": 21622576, "step": 10340 }, { "epoch": 1.6876580471490334, "grad_norm": 7.90625, "learning_rate": 4.2193666149898705e-05, "loss": 2.6379, "num_input_tokens_seen": 21634032, "step": 10345 }, { "epoch": 1.6884737743698506, "grad_norm": 4.96875, "learning_rate": 4.21865378191178e-05, "loss": 2.3657, "num_input_tokens_seen": 21644592, "step": 10350 }, { "epoch": 1.6892895015906682, "grad_norm": 8.25, "learning_rate": 4.217940683795678e-05, "loss": 2.3895, "num_input_tokens_seen": 21655552, "step": 10355 }, { "epoch": 1.6901052288114853, "grad_norm": 10.375, "learning_rate": 4.217227320751534e-05, "loss": 2.3325, "num_input_tokens_seen": 21666960, "step": 10360 }, { "epoch": 1.6909209560323029, "grad_norm": 3.828125, "learning_rate": 4.216513692889358e-05, "loss": 2.191, "num_input_tokens_seen": 21676608, "step": 10365 }, { "epoch": 1.6917366832531202, "grad_norm": 7.8125, "learning_rate": 4.215799800319199e-05, "loss": 2.014, "num_input_tokens_seen": 21686576, "step": 10370 }, { "epoch": 1.6925524104739376, "grad_norm": 2.40625, "learning_rate": 4.2150856431511485e-05, "loss": 2.4317, "num_input_tokens_seen": 21697104, "step": 10375 }, { "epoch": 1.693368137694755, "grad_norm": 8.75, "learning_rate": 4.214371221495339e-05, "loss": 3.8414, "num_input_tokens_seen": 21707376, "step": 10380 }, { "epoch": 1.6941838649155723, "grad_norm": 4.21875, "learning_rate": 4.213656535461942e-05, "loss": 2.6699, "num_input_tokens_seen": 21718848, "step": 10385 }, { "epoch": 1.6949995921363896, "grad_norm": 6.375, "learning_rate": 4.2129415851611734e-05, "loss": 2.3662, "num_input_tokens_seen": 21729968, "step": 10390 }, { "epoch": 1.695815319357207, "grad_norm": 6.0, "learning_rate": 4.2122263707032855e-05, "loss": 1.4607, "num_input_tokens_seen": 21740832, "step": 10395 }, { "epoch": 1.6966310465780243, "grad_norm": 8.25, "learning_rate": 4.211510892198574e-05, "loss": 3.511, "num_input_tokens_seen": 21752192, "step": 10400 }, { "epoch": 1.6966310465780243, "eval_loss": 2.530885696411133, "eval_runtime": 134.8787, "eval_samples_per_second": 20.203, "eval_steps_per_second": 10.105, "num_input_tokens_seen": 21752192, "step": 10400 }, { "epoch": 1.6974467737988417, "grad_norm": 1.1640625, "learning_rate": 4.210795149757375e-05, "loss": 1.8275, "num_input_tokens_seen": 21761344, "step": 10405 }, { "epoch": 1.698262501019659, "grad_norm": 7.3125, "learning_rate": 4.210079143490065e-05, "loss": 2.6952, "num_input_tokens_seen": 21770720, "step": 10410 }, { "epoch": 1.6990782282404764, "grad_norm": 6.875, "learning_rate": 4.2093628735070604e-05, "loss": 1.688, "num_input_tokens_seen": 21780208, "step": 10415 }, { "epoch": 1.6998939554612937, "grad_norm": 8.8125, "learning_rate": 4.208646339918819e-05, "loss": 2.0777, "num_input_tokens_seen": 21790624, "step": 10420 }, { "epoch": 1.700709682682111, "grad_norm": 7.8125, "learning_rate": 4.2079295428358414e-05, "loss": 2.3737, "num_input_tokens_seen": 21801008, "step": 10425 }, { "epoch": 1.7015254099029284, "grad_norm": 8.1875, "learning_rate": 4.207212482368664e-05, "loss": 2.0515, "num_input_tokens_seen": 21812656, "step": 10430 }, { "epoch": 1.7023411371237458, "grad_norm": 3.90625, "learning_rate": 4.206495158627867e-05, "loss": 3.2055, "num_input_tokens_seen": 21824000, "step": 10435 }, { "epoch": 1.7031568643445631, "grad_norm": 9.9375, "learning_rate": 4.205777571724073e-05, "loss": 2.3469, "num_input_tokens_seen": 21834976, "step": 10440 }, { "epoch": 1.7039725915653805, "grad_norm": 15.5625, "learning_rate": 4.20505972176794e-05, "loss": 1.9091, "num_input_tokens_seen": 21845344, "step": 10445 }, { "epoch": 1.7047883187861979, "grad_norm": 4.3125, "learning_rate": 4.204341608870171e-05, "loss": 2.3963, "num_input_tokens_seen": 21856944, "step": 10450 }, { "epoch": 1.7056040460070152, "grad_norm": 5.03125, "learning_rate": 4.203623233141508e-05, "loss": 1.7123, "num_input_tokens_seen": 21867856, "step": 10455 }, { "epoch": 1.7064197732278326, "grad_norm": 9.875, "learning_rate": 4.2029045946927334e-05, "loss": 2.5352, "num_input_tokens_seen": 21877504, "step": 10460 }, { "epoch": 1.70723550044865, "grad_norm": 0.061767578125, "learning_rate": 4.20218569363467e-05, "loss": 1.4753, "num_input_tokens_seen": 21886352, "step": 10465 }, { "epoch": 1.7080512276694675, "grad_norm": 4.15625, "learning_rate": 4.2014665300781834e-05, "loss": 1.2279, "num_input_tokens_seen": 21896976, "step": 10470 }, { "epoch": 1.7088669548902846, "grad_norm": 2.125, "learning_rate": 4.200747104134174e-05, "loss": 1.6798, "num_input_tokens_seen": 21906160, "step": 10475 }, { "epoch": 1.7096826821111022, "grad_norm": 6.59375, "learning_rate": 4.200027415913588e-05, "loss": 1.8652, "num_input_tokens_seen": 21915792, "step": 10480 }, { "epoch": 1.7104984093319193, "grad_norm": 6.96875, "learning_rate": 4.1993074655274126e-05, "loss": 4.3753, "num_input_tokens_seen": 21926080, "step": 10485 }, { "epoch": 1.711314136552737, "grad_norm": 11.5625, "learning_rate": 4.198587253086669e-05, "loss": 2.8847, "num_input_tokens_seen": 21936656, "step": 10490 }, { "epoch": 1.712129863773554, "grad_norm": 14.3125, "learning_rate": 4.197866778702426e-05, "loss": 2.3488, "num_input_tokens_seen": 21947488, "step": 10495 }, { "epoch": 1.7129455909943716, "grad_norm": 10.125, "learning_rate": 4.197146042485789e-05, "loss": 2.4031, "num_input_tokens_seen": 21957760, "step": 10500 }, { "epoch": 1.7137613182151887, "grad_norm": 5.1875, "learning_rate": 4.1964250445479046e-05, "loss": 1.9205, "num_input_tokens_seen": 21967472, "step": 10505 }, { "epoch": 1.7145770454360063, "grad_norm": 10.25, "learning_rate": 4.19570378499996e-05, "loss": 3.2976, "num_input_tokens_seen": 21977568, "step": 10510 }, { "epoch": 1.7153927726568234, "grad_norm": 15.6875, "learning_rate": 4.194982263953182e-05, "loss": 4.1467, "num_input_tokens_seen": 21988224, "step": 10515 }, { "epoch": 1.716208499877641, "grad_norm": 2.9375, "learning_rate": 4.194260481518838e-05, "loss": 2.7116, "num_input_tokens_seen": 21998032, "step": 10520 }, { "epoch": 1.7170242270984581, "grad_norm": 14.25, "learning_rate": 4.1935384378082366e-05, "loss": 3.4925, "num_input_tokens_seen": 22008800, "step": 10525 }, { "epoch": 1.7178399543192757, "grad_norm": 10.625, "learning_rate": 4.1928161329327267e-05, "loss": 4.0193, "num_input_tokens_seen": 22020000, "step": 10530 }, { "epoch": 1.7186556815400928, "grad_norm": 6.28125, "learning_rate": 4.1920935670036945e-05, "loss": 2.767, "num_input_tokens_seen": 22031712, "step": 10535 }, { "epoch": 1.7194714087609104, "grad_norm": 7.90625, "learning_rate": 4.1913707401325705e-05, "loss": 1.5344, "num_input_tokens_seen": 22042352, "step": 10540 }, { "epoch": 1.7202871359817276, "grad_norm": 8.6875, "learning_rate": 4.1906476524308235e-05, "loss": 2.9611, "num_input_tokens_seen": 22054064, "step": 10545 }, { "epoch": 1.7211028632025451, "grad_norm": 2.78125, "learning_rate": 4.189924304009962e-05, "loss": 1.8644, "num_input_tokens_seen": 22064368, "step": 10550 }, { "epoch": 1.7219185904233625, "grad_norm": 9.1875, "learning_rate": 4.189200694981537e-05, "loss": 2.8982, "num_input_tokens_seen": 22074064, "step": 10555 }, { "epoch": 1.7227343176441798, "grad_norm": 14.375, "learning_rate": 4.188476825457136e-05, "loss": 2.3784, "num_input_tokens_seen": 22084864, "step": 10560 }, { "epoch": 1.7235500448649972, "grad_norm": 2.65625, "learning_rate": 4.18775269554839e-05, "loss": 1.3378, "num_input_tokens_seen": 22094544, "step": 10565 }, { "epoch": 1.7243657720858145, "grad_norm": 6.84375, "learning_rate": 4.187028305366969e-05, "loss": 2.3798, "num_input_tokens_seen": 22102656, "step": 10570 }, { "epoch": 1.725181499306632, "grad_norm": 7.59375, "learning_rate": 4.1863036550245824e-05, "loss": 1.953, "num_input_tokens_seen": 22112976, "step": 10575 }, { "epoch": 1.7259972265274492, "grad_norm": 11.5625, "learning_rate": 4.1855787446329806e-05, "loss": 3.6301, "num_input_tokens_seen": 22123936, "step": 10580 }, { "epoch": 1.7268129537482666, "grad_norm": 7.9375, "learning_rate": 4.184853574303955e-05, "loss": 2.9563, "num_input_tokens_seen": 22134736, "step": 10585 }, { "epoch": 1.727628680969084, "grad_norm": 1.6875, "learning_rate": 4.184128144149334e-05, "loss": 2.2153, "num_input_tokens_seen": 22144736, "step": 10590 }, { "epoch": 1.7284444081899013, "grad_norm": 12.5, "learning_rate": 4.1834024542809896e-05, "loss": 3.638, "num_input_tokens_seen": 22154656, "step": 10595 }, { "epoch": 1.7292601354107187, "grad_norm": 6.03125, "learning_rate": 4.1826765048108315e-05, "loss": 2.7897, "num_input_tokens_seen": 22164912, "step": 10600 }, { "epoch": 1.7292601354107187, "eval_loss": 2.539090394973755, "eval_runtime": 134.9487, "eval_samples_per_second": 20.193, "eval_steps_per_second": 10.1, "num_input_tokens_seen": 22164912, "step": 10600 }, { "epoch": 1.730075862631536, "grad_norm": 7.4375, "learning_rate": 4.181950295850811e-05, "loss": 2.4834, "num_input_tokens_seen": 22174560, "step": 10605 }, { "epoch": 1.7308915898523534, "grad_norm": 3.21875, "learning_rate": 4.181223827512918e-05, "loss": 3.7932, "num_input_tokens_seen": 22185552, "step": 10610 }, { "epoch": 1.7317073170731707, "grad_norm": 14.4375, "learning_rate": 4.180497099909183e-05, "loss": 1.6313, "num_input_tokens_seen": 22194352, "step": 10615 }, { "epoch": 1.732523044293988, "grad_norm": 7.8125, "learning_rate": 4.179770113151677e-05, "loss": 1.7586, "num_input_tokens_seen": 22203600, "step": 10620 }, { "epoch": 1.7333387715148054, "grad_norm": 0.265625, "learning_rate": 4.179042867352511e-05, "loss": 2.8869, "num_input_tokens_seen": 22214720, "step": 10625 }, { "epoch": 1.7341544987356228, "grad_norm": 6.71875, "learning_rate": 4.1783153626238334e-05, "loss": 2.9075, "num_input_tokens_seen": 22227568, "step": 10630 }, { "epoch": 1.7349702259564401, "grad_norm": 5.6875, "learning_rate": 4.177587599077836e-05, "loss": 3.3877, "num_input_tokens_seen": 22239392, "step": 10635 }, { "epoch": 1.7357859531772575, "grad_norm": 2.59375, "learning_rate": 4.1768595768267494e-05, "loss": 1.8827, "num_input_tokens_seen": 22249808, "step": 10640 }, { "epoch": 1.7366016803980748, "grad_norm": 7.375, "learning_rate": 4.176131295982843e-05, "loss": 2.1767, "num_input_tokens_seen": 22260512, "step": 10645 }, { "epoch": 1.7374174076188922, "grad_norm": 6.96875, "learning_rate": 4.1754027566584276e-05, "loss": 2.4503, "num_input_tokens_seen": 22270144, "step": 10650 }, { "epoch": 1.7382331348397098, "grad_norm": 10.625, "learning_rate": 4.174673958965852e-05, "loss": 3.9162, "num_input_tokens_seen": 22280688, "step": 10655 }, { "epoch": 1.739048862060527, "grad_norm": 6.03125, "learning_rate": 4.173944903017507e-05, "loss": 3.0249, "num_input_tokens_seen": 22291568, "step": 10660 }, { "epoch": 1.7398645892813445, "grad_norm": 10.5, "learning_rate": 4.173215588925822e-05, "loss": 2.3746, "num_input_tokens_seen": 22300912, "step": 10665 }, { "epoch": 1.7406803165021616, "grad_norm": 11.5625, "learning_rate": 4.172486016803266e-05, "loss": 2.6148, "num_input_tokens_seen": 22310688, "step": 10670 }, { "epoch": 1.7414960437229792, "grad_norm": 5.4375, "learning_rate": 4.171756186762349e-05, "loss": 2.9161, "num_input_tokens_seen": 22320720, "step": 10675 }, { "epoch": 1.7423117709437963, "grad_norm": 9.5, "learning_rate": 4.171026098915619e-05, "loss": 1.9717, "num_input_tokens_seen": 22331072, "step": 10680 }, { "epoch": 1.7431274981646139, "grad_norm": 10.625, "learning_rate": 4.170295753375665e-05, "loss": 4.0071, "num_input_tokens_seen": 22341072, "step": 10685 }, { "epoch": 1.743943225385431, "grad_norm": 11.75, "learning_rate": 4.169565150255117e-05, "loss": 2.3715, "num_input_tokens_seen": 22350624, "step": 10690 }, { "epoch": 1.7447589526062486, "grad_norm": 5.46875, "learning_rate": 4.16883428966664e-05, "loss": 1.7984, "num_input_tokens_seen": 22361200, "step": 10695 }, { "epoch": 1.7455746798270657, "grad_norm": 5.46875, "learning_rate": 4.168103171722944e-05, "loss": 1.6843, "num_input_tokens_seen": 22372464, "step": 10700 }, { "epoch": 1.7463904070478833, "grad_norm": 9.4375, "learning_rate": 4.167371796536777e-05, "loss": 2.9907, "num_input_tokens_seen": 22383936, "step": 10705 }, { "epoch": 1.7472061342687004, "grad_norm": 4.4375, "learning_rate": 4.166640164220924e-05, "loss": 2.9674, "num_input_tokens_seen": 22394144, "step": 10710 }, { "epoch": 1.748021861489518, "grad_norm": 9.625, "learning_rate": 4.1659082748882144e-05, "loss": 1.4832, "num_input_tokens_seen": 22404000, "step": 10715 }, { "epoch": 1.7488375887103351, "grad_norm": 4.28125, "learning_rate": 4.1651761286515135e-05, "loss": 2.2355, "num_input_tokens_seen": 22415232, "step": 10720 }, { "epoch": 1.7496533159311527, "grad_norm": 4.25, "learning_rate": 4.164443725623728e-05, "loss": 1.6445, "num_input_tokens_seen": 22424672, "step": 10725 }, { "epoch": 1.7504690431519698, "grad_norm": 5.0, "learning_rate": 4.163711065917802e-05, "loss": 1.9583, "num_input_tokens_seen": 22434256, "step": 10730 }, { "epoch": 1.7512847703727874, "grad_norm": 2.3125, "learning_rate": 4.1629781496467234e-05, "loss": 2.0795, "num_input_tokens_seen": 22444288, "step": 10735 }, { "epoch": 1.7521004975936045, "grad_norm": 6.46875, "learning_rate": 4.1622449769235164e-05, "loss": 3.4827, "num_input_tokens_seen": 22454880, "step": 10740 }, { "epoch": 1.752916224814422, "grad_norm": 7.09375, "learning_rate": 4.161511547861243e-05, "loss": 3.4251, "num_input_tokens_seen": 22466000, "step": 10745 }, { "epoch": 1.7537319520352395, "grad_norm": 4.40625, "learning_rate": 4.1607778625730104e-05, "loss": 2.075, "num_input_tokens_seen": 22476240, "step": 10750 }, { "epoch": 1.7545476792560568, "grad_norm": 8.9375, "learning_rate": 4.160043921171961e-05, "loss": 2.5045, "num_input_tokens_seen": 22485968, "step": 10755 }, { "epoch": 1.7553634064768742, "grad_norm": 8.8125, "learning_rate": 4.159309723771276e-05, "loss": 1.776, "num_input_tokens_seen": 22497824, "step": 10760 }, { "epoch": 1.7561791336976915, "grad_norm": 9.0625, "learning_rate": 4.158575270484181e-05, "loss": 2.9436, "num_input_tokens_seen": 22509344, "step": 10765 }, { "epoch": 1.7569948609185089, "grad_norm": 3.71875, "learning_rate": 4.157840561423936e-05, "loss": 2.382, "num_input_tokens_seen": 22521760, "step": 10770 }, { "epoch": 1.7578105881393262, "grad_norm": 12.25, "learning_rate": 4.1571055967038416e-05, "loss": 1.3928, "num_input_tokens_seen": 22532000, "step": 10775 }, { "epoch": 1.7586263153601436, "grad_norm": 10.375, "learning_rate": 4.156370376437241e-05, "loss": 3.461, "num_input_tokens_seen": 22542112, "step": 10780 }, { "epoch": 1.759442042580961, "grad_norm": 6.25, "learning_rate": 4.155634900737513e-05, "loss": 2.9181, "num_input_tokens_seen": 22553504, "step": 10785 }, { "epoch": 1.7602577698017783, "grad_norm": 7.90625, "learning_rate": 4.1548991697180764e-05, "loss": 1.8568, "num_input_tokens_seen": 22563872, "step": 10790 }, { "epoch": 1.7610734970225956, "grad_norm": 7.71875, "learning_rate": 4.1541631834923914e-05, "loss": 1.5312, "num_input_tokens_seen": 22575024, "step": 10795 }, { "epoch": 1.761889224243413, "grad_norm": 6.6875, "learning_rate": 4.153426942173956e-05, "loss": 2.4284, "num_input_tokens_seen": 22585216, "step": 10800 }, { "epoch": 1.761889224243413, "eval_loss": 2.5454680919647217, "eval_runtime": 135.0717, "eval_samples_per_second": 20.174, "eval_steps_per_second": 10.091, "num_input_tokens_seen": 22585216, "step": 10800 }, { "epoch": 1.7627049514642303, "grad_norm": 14.6875, "learning_rate": 4.152690445876308e-05, "loss": 2.6657, "num_input_tokens_seen": 22595088, "step": 10805 }, { "epoch": 1.7635206786850477, "grad_norm": 3.1875, "learning_rate": 4.1519536947130245e-05, "loss": 2.1949, "num_input_tokens_seen": 22604784, "step": 10810 }, { "epoch": 1.764336405905865, "grad_norm": 6.03125, "learning_rate": 4.151216688797722e-05, "loss": 2.6813, "num_input_tokens_seen": 22615040, "step": 10815 }, { "epoch": 1.7651521331266824, "grad_norm": 4.46875, "learning_rate": 4.150479428244054e-05, "loss": 2.1004, "num_input_tokens_seen": 22626928, "step": 10820 }, { "epoch": 1.7659678603474998, "grad_norm": 9.0, "learning_rate": 4.1497419131657176e-05, "loss": 1.895, "num_input_tokens_seen": 22636896, "step": 10825 }, { "epoch": 1.766783587568317, "grad_norm": 3.90625, "learning_rate": 4.149004143676447e-05, "loss": 1.5149, "num_input_tokens_seen": 22649296, "step": 10830 }, { "epoch": 1.7675993147891345, "grad_norm": 3.25, "learning_rate": 4.148266119890015e-05, "loss": 1.8836, "num_input_tokens_seen": 22658160, "step": 10835 }, { "epoch": 1.768415042009952, "grad_norm": 12.125, "learning_rate": 4.1475278419202324e-05, "loss": 2.2586, "num_input_tokens_seen": 22666944, "step": 10840 }, { "epoch": 1.7692307692307692, "grad_norm": 0.859375, "learning_rate": 4.146789309880953e-05, "loss": 3.2418, "num_input_tokens_seen": 22678032, "step": 10845 }, { "epoch": 1.7700464964515867, "grad_norm": 4.96875, "learning_rate": 4.146050523886068e-05, "loss": 1.3837, "num_input_tokens_seen": 22687536, "step": 10850 }, { "epoch": 1.7708622236724039, "grad_norm": 9.9375, "learning_rate": 4.1453114840495055e-05, "loss": 3.2864, "num_input_tokens_seen": 22698784, "step": 10855 }, { "epoch": 1.7716779508932214, "grad_norm": 7.3125, "learning_rate": 4.1445721904852364e-05, "loss": 1.8127, "num_input_tokens_seen": 22709536, "step": 10860 }, { "epoch": 1.7724936781140386, "grad_norm": 1.75, "learning_rate": 4.143832643307269e-05, "loss": 1.3252, "num_input_tokens_seen": 22718960, "step": 10865 }, { "epoch": 1.7733094053348561, "grad_norm": 10.625, "learning_rate": 4.1430928426296503e-05, "loss": 3.6312, "num_input_tokens_seen": 22729696, "step": 10870 }, { "epoch": 1.7741251325556733, "grad_norm": 2.515625, "learning_rate": 4.142352788566466e-05, "loss": 1.9633, "num_input_tokens_seen": 22739808, "step": 10875 }, { "epoch": 1.7749408597764909, "grad_norm": 11.0, "learning_rate": 4.1416124812318424e-05, "loss": 2.199, "num_input_tokens_seen": 22752112, "step": 10880 }, { "epoch": 1.775756586997308, "grad_norm": 10.6875, "learning_rate": 4.1408719207399453e-05, "loss": 2.5672, "num_input_tokens_seen": 22763392, "step": 10885 }, { "epoch": 1.7765723142181256, "grad_norm": 6.53125, "learning_rate": 4.140131107204978e-05, "loss": 2.0084, "num_input_tokens_seen": 22773408, "step": 10890 }, { "epoch": 1.7773880414389427, "grad_norm": 5.46875, "learning_rate": 4.139390040741182e-05, "loss": 3.0286, "num_input_tokens_seen": 22783936, "step": 10895 }, { "epoch": 1.7782037686597603, "grad_norm": 7.96875, "learning_rate": 4.1386487214628396e-05, "loss": 2.7703, "num_input_tokens_seen": 22795152, "step": 10900 }, { "epoch": 1.7790194958805774, "grad_norm": 7.0625, "learning_rate": 4.137907149484272e-05, "loss": 2.2327, "num_input_tokens_seen": 22805104, "step": 10905 }, { "epoch": 1.779835223101395, "grad_norm": 9.125, "learning_rate": 4.137165324919839e-05, "loss": 3.2997, "num_input_tokens_seen": 22815520, "step": 10910 }, { "epoch": 1.780650950322212, "grad_norm": 10.5, "learning_rate": 4.136423247883939e-05, "loss": 1.4504, "num_input_tokens_seen": 22825424, "step": 10915 }, { "epoch": 1.7814666775430297, "grad_norm": 8.375, "learning_rate": 4.135680918491009e-05, "loss": 2.0813, "num_input_tokens_seen": 22835968, "step": 10920 }, { "epoch": 1.7822824047638468, "grad_norm": 7.34375, "learning_rate": 4.1349383368555265e-05, "loss": 2.9522, "num_input_tokens_seen": 22846432, "step": 10925 }, { "epoch": 1.7830981319846644, "grad_norm": 5.4375, "learning_rate": 4.1341955030920065e-05, "loss": 1.9342, "num_input_tokens_seen": 22856160, "step": 10930 }, { "epoch": 1.7839138592054817, "grad_norm": 3.6875, "learning_rate": 4.1334524173150036e-05, "loss": 2.4893, "num_input_tokens_seen": 22866512, "step": 10935 }, { "epoch": 1.784729586426299, "grad_norm": 6.78125, "learning_rate": 4.13270907963911e-05, "loss": 3.3057, "num_input_tokens_seen": 22876272, "step": 10940 }, { "epoch": 1.7855453136471164, "grad_norm": 3.734375, "learning_rate": 4.131965490178959e-05, "loss": 1.7925, "num_input_tokens_seen": 22886592, "step": 10945 }, { "epoch": 1.7863610408679338, "grad_norm": 5.90625, "learning_rate": 4.131221649049222e-05, "loss": 2.0263, "num_input_tokens_seen": 22897968, "step": 10950 }, { "epoch": 1.7871767680887511, "grad_norm": 4.5625, "learning_rate": 4.130477556364606e-05, "loss": 2.4973, "num_input_tokens_seen": 22907792, "step": 10955 }, { "epoch": 1.7879924953095685, "grad_norm": 8.875, "learning_rate": 4.129733212239861e-05, "loss": 2.3772, "num_input_tokens_seen": 22917264, "step": 10960 }, { "epoch": 1.7888082225303858, "grad_norm": 8.4375, "learning_rate": 4.128988616789774e-05, "loss": 1.9849, "num_input_tokens_seen": 22928608, "step": 10965 }, { "epoch": 1.7896239497512032, "grad_norm": 9.0, "learning_rate": 4.1282437701291724e-05, "loss": 2.6225, "num_input_tokens_seen": 22939888, "step": 10970 }, { "epoch": 1.7904396769720206, "grad_norm": 0.1923828125, "learning_rate": 4.1274986723729184e-05, "loss": 2.9913, "num_input_tokens_seen": 22950864, "step": 10975 }, { "epoch": 1.791255404192838, "grad_norm": 6.9375, "learning_rate": 4.126753323635917e-05, "loss": 2.0562, "num_input_tokens_seen": 22962656, "step": 10980 }, { "epoch": 1.7920711314136553, "grad_norm": 2.96875, "learning_rate": 4.12600772403311e-05, "loss": 3.0361, "num_input_tokens_seen": 22974112, "step": 10985 }, { "epoch": 1.7928868586344726, "grad_norm": 8.6875, "learning_rate": 4.125261873679479e-05, "loss": 1.8447, "num_input_tokens_seen": 22985232, "step": 10990 }, { "epoch": 1.79370258585529, "grad_norm": 12.75, "learning_rate": 4.124515772690042e-05, "loss": 2.978, "num_input_tokens_seen": 22995312, "step": 10995 }, { "epoch": 1.7945183130761073, "grad_norm": 0.1826171875, "learning_rate": 4.123769421179858e-05, "loss": 2.028, "num_input_tokens_seen": 23005600, "step": 11000 }, { "epoch": 1.7945183130761073, "eval_loss": 2.5415890216827393, "eval_runtime": 135.1545, "eval_samples_per_second": 20.162, "eval_steps_per_second": 10.085, "num_input_tokens_seen": 23005600, "step": 11000 }, { "epoch": 1.7953340402969247, "grad_norm": 13.375, "learning_rate": 4.1230228192640236e-05, "loss": 2.9247, "num_input_tokens_seen": 23014720, "step": 11005 }, { "epoch": 1.796149767517742, "grad_norm": 5.1875, "learning_rate": 4.122275967057675e-05, "loss": 2.1675, "num_input_tokens_seen": 23024944, "step": 11010 }, { "epoch": 1.7969654947385594, "grad_norm": 4.1875, "learning_rate": 4.1215288646759846e-05, "loss": 1.4726, "num_input_tokens_seen": 23035856, "step": 11015 }, { "epoch": 1.7977812219593767, "grad_norm": 4.53125, "learning_rate": 4.120781512234166e-05, "loss": 2.8311, "num_input_tokens_seen": 23045360, "step": 11020 }, { "epoch": 1.7985969491801943, "grad_norm": 10.4375, "learning_rate": 4.120033909847471e-05, "loss": 2.0828, "num_input_tokens_seen": 23056368, "step": 11025 }, { "epoch": 1.7994126764010114, "grad_norm": 3.171875, "learning_rate": 4.119286057631187e-05, "loss": 2.2712, "num_input_tokens_seen": 23066736, "step": 11030 }, { "epoch": 1.800228403621829, "grad_norm": 9.0, "learning_rate": 4.118537955700646e-05, "loss": 2.0837, "num_input_tokens_seen": 23078992, "step": 11035 }, { "epoch": 1.8010441308426461, "grad_norm": 7.25, "learning_rate": 4.11778960417121e-05, "loss": 1.7473, "num_input_tokens_seen": 23089984, "step": 11040 }, { "epoch": 1.8018598580634637, "grad_norm": 13.5625, "learning_rate": 4.117041003158288e-05, "loss": 2.8248, "num_input_tokens_seen": 23099904, "step": 11045 }, { "epoch": 1.8026755852842808, "grad_norm": 5.03125, "learning_rate": 4.1162921527773215e-05, "loss": 2.6457, "num_input_tokens_seen": 23110208, "step": 11050 }, { "epoch": 1.8034913125050984, "grad_norm": 10.1875, "learning_rate": 4.115543053143794e-05, "loss": 3.4696, "num_input_tokens_seen": 23120544, "step": 11055 }, { "epoch": 1.8043070397259156, "grad_norm": 7.6875, "learning_rate": 4.114793704373226e-05, "loss": 2.8976, "num_input_tokens_seen": 23131472, "step": 11060 }, { "epoch": 1.8051227669467331, "grad_norm": 6.25, "learning_rate": 4.114044106581175e-05, "loss": 1.779, "num_input_tokens_seen": 23140640, "step": 11065 }, { "epoch": 1.8059384941675503, "grad_norm": 7.78125, "learning_rate": 4.11329425988324e-05, "loss": 3.3835, "num_input_tokens_seen": 23150800, "step": 11070 }, { "epoch": 1.8067542213883678, "grad_norm": 6.71875, "learning_rate": 4.112544164395056e-05, "loss": 2.7285, "num_input_tokens_seen": 23161360, "step": 11075 }, { "epoch": 1.807569948609185, "grad_norm": 4.90625, "learning_rate": 4.111793820232297e-05, "loss": 1.0915, "num_input_tokens_seen": 23171776, "step": 11080 }, { "epoch": 1.8083856758300025, "grad_norm": 8.5625, "learning_rate": 4.1110432275106767e-05, "loss": 3.2355, "num_input_tokens_seen": 23181344, "step": 11085 }, { "epoch": 1.8092014030508197, "grad_norm": 8.3125, "learning_rate": 4.110292386345944e-05, "loss": 2.6463, "num_input_tokens_seen": 23191808, "step": 11090 }, { "epoch": 1.8100171302716372, "grad_norm": 3.9375, "learning_rate": 4.109541296853891e-05, "loss": 1.9661, "num_input_tokens_seen": 23201312, "step": 11095 }, { "epoch": 1.8108328574924544, "grad_norm": 10.8125, "learning_rate": 4.108789959150341e-05, "loss": 1.6202, "num_input_tokens_seen": 23212416, "step": 11100 }, { "epoch": 1.811648584713272, "grad_norm": 14.1875, "learning_rate": 4.108038373351163e-05, "loss": 2.6634, "num_input_tokens_seen": 23221024, "step": 11105 }, { "epoch": 1.812464311934089, "grad_norm": 0.142578125, "learning_rate": 4.10728653957226e-05, "loss": 1.2823, "num_input_tokens_seen": 23230416, "step": 11110 }, { "epoch": 1.8132800391549067, "grad_norm": 8.25, "learning_rate": 4.106534457929575e-05, "loss": 3.5251, "num_input_tokens_seen": 23240192, "step": 11115 }, { "epoch": 1.814095766375724, "grad_norm": 5.875, "learning_rate": 4.105782128539086e-05, "loss": 2.2713, "num_input_tokens_seen": 23248992, "step": 11120 }, { "epoch": 1.8149114935965414, "grad_norm": 1.59375, "learning_rate": 4.1050295515168144e-05, "loss": 1.1684, "num_input_tokens_seen": 23258688, "step": 11125 }, { "epoch": 1.8157272208173587, "grad_norm": 7.84375, "learning_rate": 4.1042767269788155e-05, "loss": 1.9391, "num_input_tokens_seen": 23268448, "step": 11130 }, { "epoch": 1.816542948038176, "grad_norm": 9.3125, "learning_rate": 4.103523655041185e-05, "loss": 3.8295, "num_input_tokens_seen": 23277616, "step": 11135 }, { "epoch": 1.8173586752589934, "grad_norm": 3.6875, "learning_rate": 4.102770335820055e-05, "loss": 3.4805, "num_input_tokens_seen": 23287136, "step": 11140 }, { "epoch": 1.8181744024798108, "grad_norm": 6.9375, "learning_rate": 4.1020167694315984e-05, "loss": 1.8822, "num_input_tokens_seen": 23296880, "step": 11145 }, { "epoch": 1.8189901297006281, "grad_norm": 8.1875, "learning_rate": 4.101262955992023e-05, "loss": 2.8875, "num_input_tokens_seen": 23305856, "step": 11150 }, { "epoch": 1.8198058569214455, "grad_norm": 5.5, "learning_rate": 4.100508895617578e-05, "loss": 2.3623, "num_input_tokens_seen": 23315936, "step": 11155 }, { "epoch": 1.8206215841422628, "grad_norm": 4.90625, "learning_rate": 4.099754588424547e-05, "loss": 2.2908, "num_input_tokens_seen": 23326752, "step": 11160 }, { "epoch": 1.8214373113630802, "grad_norm": 5.40625, "learning_rate": 4.0990000345292546e-05, "loss": 3.5846, "num_input_tokens_seen": 23338544, "step": 11165 }, { "epoch": 1.8222530385838975, "grad_norm": 9.5625, "learning_rate": 4.098245234048064e-05, "loss": 2.2862, "num_input_tokens_seen": 23348800, "step": 11170 }, { "epoch": 1.8230687658047149, "grad_norm": 5.875, "learning_rate": 4.0974901870973726e-05, "loss": 2.0122, "num_input_tokens_seen": 23359824, "step": 11175 }, { "epoch": 1.8238844930255322, "grad_norm": 6.28125, "learning_rate": 4.096734893793619e-05, "loss": 2.8398, "num_input_tokens_seen": 23369920, "step": 11180 }, { "epoch": 1.8247002202463496, "grad_norm": 6.15625, "learning_rate": 4.095979354253279e-05, "loss": 1.9091, "num_input_tokens_seen": 23380864, "step": 11185 }, { "epoch": 1.825515947467167, "grad_norm": 3.59375, "learning_rate": 4.0952235685928656e-05, "loss": 2.5836, "num_input_tokens_seen": 23391888, "step": 11190 }, { "epoch": 1.8263316746879843, "grad_norm": 6.15625, "learning_rate": 4.094467536928932e-05, "loss": 1.8714, "num_input_tokens_seen": 23403312, "step": 11195 }, { "epoch": 1.8271474019088016, "grad_norm": 5.1875, "learning_rate": 4.093711259378067e-05, "loss": 2.9135, "num_input_tokens_seen": 23413712, "step": 11200 }, { "epoch": 1.8271474019088016, "eval_loss": 2.548494338989258, "eval_runtime": 135.0508, "eval_samples_per_second": 20.178, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 23413712, "step": 11200 }, { "epoch": 1.827963129129619, "grad_norm": 6.46875, "learning_rate": 4.092954736056897e-05, "loss": 2.3802, "num_input_tokens_seen": 23424544, "step": 11205 }, { "epoch": 1.8287788563504366, "grad_norm": 1.7734375, "learning_rate": 4.09219796708209e-05, "loss": 2.1915, "num_input_tokens_seen": 23435200, "step": 11210 }, { "epoch": 1.8295945835712537, "grad_norm": 8.0, "learning_rate": 4.0914409525703464e-05, "loss": 3.4955, "num_input_tokens_seen": 23444192, "step": 11215 }, { "epoch": 1.8304103107920713, "grad_norm": 5.40625, "learning_rate": 4.090683692638408e-05, "loss": 3.2005, "num_input_tokens_seen": 23454144, "step": 11220 }, { "epoch": 1.8312260380128884, "grad_norm": 0.60546875, "learning_rate": 4.089926187403056e-05, "loss": 2.6453, "num_input_tokens_seen": 23464656, "step": 11225 }, { "epoch": 1.832041765233706, "grad_norm": 7.28125, "learning_rate": 4.0891684369811044e-05, "loss": 3.4868, "num_input_tokens_seen": 23474112, "step": 11230 }, { "epoch": 1.8328574924545231, "grad_norm": 7.34375, "learning_rate": 4.0884104414894107e-05, "loss": 2.1139, "num_input_tokens_seen": 23485456, "step": 11235 }, { "epoch": 1.8336732196753407, "grad_norm": 5.5, "learning_rate": 4.087652201044864e-05, "loss": 2.7731, "num_input_tokens_seen": 23496576, "step": 11240 }, { "epoch": 1.8344889468961578, "grad_norm": 5.46875, "learning_rate": 4.086893715764397e-05, "loss": 3.1156, "num_input_tokens_seen": 23506976, "step": 11245 }, { "epoch": 1.8353046741169754, "grad_norm": 8.1875, "learning_rate": 4.086134985764977e-05, "loss": 2.0548, "num_input_tokens_seen": 23517776, "step": 11250 }, { "epoch": 1.8361204013377925, "grad_norm": 1.609375, "learning_rate": 4.0853760111636085e-05, "loss": 2.7464, "num_input_tokens_seen": 23528096, "step": 11255 }, { "epoch": 1.83693612855861, "grad_norm": 8.875, "learning_rate": 4.084616792077337e-05, "loss": 2.9937, "num_input_tokens_seen": 23538816, "step": 11260 }, { "epoch": 1.8377518557794272, "grad_norm": 2.640625, "learning_rate": 4.083857328623243e-05, "loss": 1.7838, "num_input_tokens_seen": 23548688, "step": 11265 }, { "epoch": 1.8385675830002448, "grad_norm": 8.625, "learning_rate": 4.083097620918444e-05, "loss": 3.5507, "num_input_tokens_seen": 23559440, "step": 11270 }, { "epoch": 1.839383310221062, "grad_norm": 8.25, "learning_rate": 4.082337669080097e-05, "loss": 0.9322, "num_input_tokens_seen": 23569488, "step": 11275 }, { "epoch": 1.8401990374418795, "grad_norm": 4.78125, "learning_rate": 4.081577473225398e-05, "loss": 1.7583, "num_input_tokens_seen": 23581088, "step": 11280 }, { "epoch": 1.8410147646626966, "grad_norm": 17.625, "learning_rate": 4.080817033471577e-05, "loss": 2.5401, "num_input_tokens_seen": 23590048, "step": 11285 }, { "epoch": 1.8418304918835142, "grad_norm": 9.0625, "learning_rate": 4.080056349935903e-05, "loss": 2.7579, "num_input_tokens_seen": 23600544, "step": 11290 }, { "epoch": 1.8426462191043314, "grad_norm": 9.125, "learning_rate": 4.079295422735684e-05, "loss": 2.7838, "num_input_tokens_seen": 23609984, "step": 11295 }, { "epoch": 1.843461946325149, "grad_norm": 14.375, "learning_rate": 4.078534251988264e-05, "loss": 2.6622, "num_input_tokens_seen": 23620400, "step": 11300 }, { "epoch": 1.8442776735459663, "grad_norm": 8.0625, "learning_rate": 4.077772837811025e-05, "loss": 2.0743, "num_input_tokens_seen": 23629840, "step": 11305 }, { "epoch": 1.8450934007667836, "grad_norm": 7.90625, "learning_rate": 4.0770111803213874e-05, "loss": 3.0851, "num_input_tokens_seen": 23641696, "step": 11310 }, { "epoch": 1.845909127987601, "grad_norm": 3.515625, "learning_rate": 4.076249279636807e-05, "loss": 1.4606, "num_input_tokens_seen": 23651680, "step": 11315 }, { "epoch": 1.8467248552084183, "grad_norm": 9.6875, "learning_rate": 4.075487135874781e-05, "loss": 2.8207, "num_input_tokens_seen": 23662528, "step": 11320 }, { "epoch": 1.8475405824292357, "grad_norm": 10.5625, "learning_rate": 4.074724749152837e-05, "loss": 2.2815, "num_input_tokens_seen": 23673328, "step": 11325 }, { "epoch": 1.848356309650053, "grad_norm": 6.28125, "learning_rate": 4.07396211958855e-05, "loss": 2.1915, "num_input_tokens_seen": 23683872, "step": 11330 }, { "epoch": 1.8491720368708704, "grad_norm": 4.21875, "learning_rate": 4.073199247299523e-05, "loss": 2.6668, "num_input_tokens_seen": 23693360, "step": 11335 }, { "epoch": 1.8499877640916877, "grad_norm": 11.0, "learning_rate": 4.072436132403403e-05, "loss": 2.3273, "num_input_tokens_seen": 23704112, "step": 11340 }, { "epoch": 1.850803491312505, "grad_norm": 1.4140625, "learning_rate": 4.0716727750178704e-05, "loss": 2.0404, "num_input_tokens_seen": 23713984, "step": 11345 }, { "epoch": 1.8516192185333225, "grad_norm": 7.59375, "learning_rate": 4.0709091752606455e-05, "loss": 3.2624, "num_input_tokens_seen": 23724608, "step": 11350 }, { "epoch": 1.8524349457541398, "grad_norm": 12.9375, "learning_rate": 4.070145333249484e-05, "loss": 2.4581, "num_input_tokens_seen": 23735472, "step": 11355 }, { "epoch": 1.8532506729749572, "grad_norm": 11.125, "learning_rate": 4.069381249102181e-05, "loss": 2.4519, "num_input_tokens_seen": 23746592, "step": 11360 }, { "epoch": 1.8540664001957745, "grad_norm": 8.3125, "learning_rate": 4.0686169229365665e-05, "loss": 2.8099, "num_input_tokens_seen": 23757536, "step": 11365 }, { "epoch": 1.8548821274165919, "grad_norm": 11.5, "learning_rate": 4.067852354870511e-05, "loss": 3.5504, "num_input_tokens_seen": 23767584, "step": 11370 }, { "epoch": 1.8556978546374092, "grad_norm": 5.46875, "learning_rate": 4.067087545021919e-05, "loss": 2.2058, "num_input_tokens_seen": 23777136, "step": 11375 }, { "epoch": 1.8565135818582266, "grad_norm": 8.6875, "learning_rate": 4.066322493508734e-05, "loss": 3.6205, "num_input_tokens_seen": 23787872, "step": 11380 }, { "epoch": 1.857329309079044, "grad_norm": 7.375, "learning_rate": 4.065557200448937e-05, "loss": 3.8011, "num_input_tokens_seen": 23796464, "step": 11385 }, { "epoch": 1.8581450362998613, "grad_norm": 6.28125, "learning_rate": 4.064791665960546e-05, "loss": 3.8066, "num_input_tokens_seen": 23807360, "step": 11390 }, { "epoch": 1.8589607635206788, "grad_norm": 4.6875, "learning_rate": 4.064025890161615e-05, "loss": 1.6115, "num_input_tokens_seen": 23816768, "step": 11395 }, { "epoch": 1.859776490741496, "grad_norm": 7.5625, "learning_rate": 4.0632598731702373e-05, "loss": 1.8818, "num_input_tokens_seen": 23827536, "step": 11400 }, { "epoch": 1.859776490741496, "eval_loss": 2.5369606018066406, "eval_runtime": 134.9998, "eval_samples_per_second": 20.185, "eval_steps_per_second": 10.096, "num_input_tokens_seen": 23827536, "step": 11400 }, { "epoch": 1.8605922179623136, "grad_norm": 7.8125, "learning_rate": 4.0624936151045426e-05, "loss": 2.2367, "num_input_tokens_seen": 23837824, "step": 11405 }, { "epoch": 1.8614079451831307, "grad_norm": 8.6875, "learning_rate": 4.061727116082696e-05, "loss": 3.2004, "num_input_tokens_seen": 23847904, "step": 11410 }, { "epoch": 1.8622236724039483, "grad_norm": 6.96875, "learning_rate": 4.060960376222903e-05, "loss": 2.4678, "num_input_tokens_seen": 23856816, "step": 11415 }, { "epoch": 1.8630393996247654, "grad_norm": 4.21875, "learning_rate": 4.0601933956434034e-05, "loss": 2.1978, "num_input_tokens_seen": 23865392, "step": 11420 }, { "epoch": 1.863855126845583, "grad_norm": 2.625, "learning_rate": 4.059426174462476e-05, "loss": 2.297, "num_input_tokens_seen": 23876384, "step": 11425 }, { "epoch": 1.8646708540664, "grad_norm": 4.8125, "learning_rate": 4.058658712798435e-05, "loss": 1.8264, "num_input_tokens_seen": 23887792, "step": 11430 }, { "epoch": 1.8654865812872177, "grad_norm": 6.75, "learning_rate": 4.0578910107696336e-05, "loss": 2.5089, "num_input_tokens_seen": 23897968, "step": 11435 }, { "epoch": 1.8663023085080348, "grad_norm": 8.5625, "learning_rate": 4.05712306849446e-05, "loss": 2.3993, "num_input_tokens_seen": 23907632, "step": 11440 }, { "epoch": 1.8671180357288524, "grad_norm": 12.875, "learning_rate": 4.0563548860913415e-05, "loss": 1.976, "num_input_tokens_seen": 23917440, "step": 11445 }, { "epoch": 1.8679337629496695, "grad_norm": 8.6875, "learning_rate": 4.0555864636787414e-05, "loss": 2.2438, "num_input_tokens_seen": 23928144, "step": 11450 }, { "epoch": 1.868749490170487, "grad_norm": 6.90625, "learning_rate": 4.054817801375159e-05, "loss": 2.6947, "num_input_tokens_seen": 23938480, "step": 11455 }, { "epoch": 1.8695652173913042, "grad_norm": 4.4375, "learning_rate": 4.054048899299134e-05, "loss": 2.8081, "num_input_tokens_seen": 23948192, "step": 11460 }, { "epoch": 1.8703809446121218, "grad_norm": 5.71875, "learning_rate": 4.0532797575692385e-05, "loss": 3.7468, "num_input_tokens_seen": 23958784, "step": 11465 }, { "epoch": 1.871196671832939, "grad_norm": 5.96875, "learning_rate": 4.052510376304085e-05, "loss": 2.9304, "num_input_tokens_seen": 23969008, "step": 11470 }, { "epoch": 1.8720123990537565, "grad_norm": 9.125, "learning_rate": 4.051740755622321e-05, "loss": 2.3511, "num_input_tokens_seen": 23978688, "step": 11475 }, { "epoch": 1.8728281262745736, "grad_norm": 14.625, "learning_rate": 4.050970895642632e-05, "loss": 4.4255, "num_input_tokens_seen": 23987408, "step": 11480 }, { "epoch": 1.8736438534953912, "grad_norm": 5.75, "learning_rate": 4.050200796483741e-05, "loss": 1.6992, "num_input_tokens_seen": 23997328, "step": 11485 }, { "epoch": 1.8744595807162086, "grad_norm": 6.375, "learning_rate": 4.049430458264405e-05, "loss": 3.2928, "num_input_tokens_seen": 24006304, "step": 11490 }, { "epoch": 1.875275307937026, "grad_norm": 6.5, "learning_rate": 4.048659881103422e-05, "loss": 2.3, "num_input_tokens_seen": 24017856, "step": 11495 }, { "epoch": 1.8760910351578433, "grad_norm": 6.96875, "learning_rate": 4.0478890651196235e-05, "loss": 2.3548, "num_input_tokens_seen": 24029760, "step": 11500 }, { "epoch": 1.8769067623786606, "grad_norm": 3.640625, "learning_rate": 4.047118010431879e-05, "loss": 3.428, "num_input_tokens_seen": 24040336, "step": 11505 }, { "epoch": 1.877722489599478, "grad_norm": 4.3125, "learning_rate": 4.046346717159094e-05, "loss": 2.5567, "num_input_tokens_seen": 24051472, "step": 11510 }, { "epoch": 1.8785382168202953, "grad_norm": 10.875, "learning_rate": 4.045575185420214e-05, "loss": 2.718, "num_input_tokens_seen": 24061920, "step": 11515 }, { "epoch": 1.8793539440411127, "grad_norm": 3.515625, "learning_rate": 4.0448034153342165e-05, "loss": 1.6905, "num_input_tokens_seen": 24071424, "step": 11520 }, { "epoch": 1.88016967126193, "grad_norm": 9.1875, "learning_rate": 4.0440314070201194e-05, "loss": 3.4315, "num_input_tokens_seen": 24081712, "step": 11525 }, { "epoch": 1.8809853984827474, "grad_norm": 9.9375, "learning_rate": 4.043259160596976e-05, "loss": 2.6037, "num_input_tokens_seen": 24091248, "step": 11530 }, { "epoch": 1.8818011257035647, "grad_norm": 4.9375, "learning_rate": 4.0424866761838767e-05, "loss": 2.7409, "num_input_tokens_seen": 24103200, "step": 11535 }, { "epoch": 1.882616852924382, "grad_norm": 5.8125, "learning_rate": 4.041713953899948e-05, "loss": 2.4161, "num_input_tokens_seen": 24113200, "step": 11540 }, { "epoch": 1.8834325801451994, "grad_norm": 12.125, "learning_rate": 4.0409409938643515e-05, "loss": 3.3316, "num_input_tokens_seen": 24121952, "step": 11545 }, { "epoch": 1.8842483073660168, "grad_norm": 6.15625, "learning_rate": 4.0401677961962904e-05, "loss": 1.7687, "num_input_tokens_seen": 24131840, "step": 11550 }, { "epoch": 1.8850640345868341, "grad_norm": 7.96875, "learning_rate": 4.039394361015001e-05, "loss": 2.2257, "num_input_tokens_seen": 24143936, "step": 11555 }, { "epoch": 1.8858797618076515, "grad_norm": 5.53125, "learning_rate": 4.038620688439755e-05, "loss": 1.8754, "num_input_tokens_seen": 24153168, "step": 11560 }, { "epoch": 1.8866954890284688, "grad_norm": 1.515625, "learning_rate": 4.037846778589862e-05, "loss": 1.0507, "num_input_tokens_seen": 24164576, "step": 11565 }, { "epoch": 1.8875112162492862, "grad_norm": 10.125, "learning_rate": 4.0370726315846715e-05, "loss": 2.1941, "num_input_tokens_seen": 24176608, "step": 11570 }, { "epoch": 1.8883269434701035, "grad_norm": 7.46875, "learning_rate": 4.036298247543565e-05, "loss": 2.4753, "num_input_tokens_seen": 24187696, "step": 11575 }, { "epoch": 1.8891426706909211, "grad_norm": 10.4375, "learning_rate": 4.035523626585962e-05, "loss": 1.8263, "num_input_tokens_seen": 24199280, "step": 11580 }, { "epoch": 1.8899583979117383, "grad_norm": 16.875, "learning_rate": 4.0347487688313194e-05, "loss": 3.9414, "num_input_tokens_seen": 24209504, "step": 11585 }, { "epoch": 1.8907741251325558, "grad_norm": 8.125, "learning_rate": 4.0339736743991296e-05, "loss": 2.0571, "num_input_tokens_seen": 24220576, "step": 11590 }, { "epoch": 1.891589852353373, "grad_norm": 5.75, "learning_rate": 4.0331983434089227e-05, "loss": 3.0148, "num_input_tokens_seen": 24230752, "step": 11595 }, { "epoch": 1.8924055795741905, "grad_norm": 3.859375, "learning_rate": 4.032422775980264e-05, "loss": 1.3497, "num_input_tokens_seen": 24242256, "step": 11600 }, { "epoch": 1.8924055795741905, "eval_loss": 2.549673080444336, "eval_runtime": 135.0674, "eval_samples_per_second": 20.175, "eval_steps_per_second": 10.091, "num_input_tokens_seen": 24242256, "step": 11600 }, { "epoch": 1.8932213067950077, "grad_norm": 3.640625, "learning_rate": 4.031646972232754e-05, "loss": 1.3969, "num_input_tokens_seen": 24252608, "step": 11605 }, { "epoch": 1.8940370340158252, "grad_norm": 7.0625, "learning_rate": 4.0308709322860344e-05, "loss": 1.9765, "num_input_tokens_seen": 24262416, "step": 11610 }, { "epoch": 1.8948527612366424, "grad_norm": 11.1875, "learning_rate": 4.0300946562597784e-05, "loss": 2.9223, "num_input_tokens_seen": 24273952, "step": 11615 }, { "epoch": 1.89566848845746, "grad_norm": 6.4375, "learning_rate": 4.029318144273698e-05, "loss": 2.3679, "num_input_tokens_seen": 24282352, "step": 11620 }, { "epoch": 1.896484215678277, "grad_norm": 7.90625, "learning_rate": 4.0285413964475415e-05, "loss": 2.5045, "num_input_tokens_seen": 24293440, "step": 11625 }, { "epoch": 1.8972999428990946, "grad_norm": 7.46875, "learning_rate": 4.0277644129010927e-05, "loss": 1.8866, "num_input_tokens_seen": 24304640, "step": 11630 }, { "epoch": 1.8981156701199118, "grad_norm": 1.203125, "learning_rate": 4.0269871937541724e-05, "loss": 2.1028, "num_input_tokens_seen": 24314496, "step": 11635 }, { "epoch": 1.8989313973407294, "grad_norm": 7.9375, "learning_rate": 4.026209739126637e-05, "loss": 2.0925, "num_input_tokens_seen": 24324096, "step": 11640 }, { "epoch": 1.8997471245615465, "grad_norm": 7.59375, "learning_rate": 4.025432049138381e-05, "loss": 3.0681, "num_input_tokens_seen": 24334672, "step": 11645 }, { "epoch": 1.900562851782364, "grad_norm": 4.59375, "learning_rate": 4.0246541239093325e-05, "loss": 2.6697, "num_input_tokens_seen": 24344512, "step": 11650 }, { "epoch": 1.9013785790031812, "grad_norm": 11.5625, "learning_rate": 4.023875963559459e-05, "loss": 2.952, "num_input_tokens_seen": 24354208, "step": 11655 }, { "epoch": 1.9021943062239988, "grad_norm": 8.5625, "learning_rate": 4.023097568208761e-05, "loss": 2.4238, "num_input_tokens_seen": 24364928, "step": 11660 }, { "epoch": 1.903010033444816, "grad_norm": 6.96875, "learning_rate": 4.022318937977277e-05, "loss": 2.7698, "num_input_tokens_seen": 24376576, "step": 11665 }, { "epoch": 1.9038257606656335, "grad_norm": 1.4375, "learning_rate": 4.021540072985084e-05, "loss": 1.5888, "num_input_tokens_seen": 24387504, "step": 11670 }, { "epoch": 1.9046414878864508, "grad_norm": 2.53125, "learning_rate": 4.020760973352289e-05, "loss": 3.0971, "num_input_tokens_seen": 24398400, "step": 11675 }, { "epoch": 1.9054572151072682, "grad_norm": 8.8125, "learning_rate": 4.019981639199042e-05, "loss": 2.9334, "num_input_tokens_seen": 24408896, "step": 11680 }, { "epoch": 1.9062729423280855, "grad_norm": 7.75, "learning_rate": 4.0192020706455245e-05, "loss": 1.9956, "num_input_tokens_seen": 24418928, "step": 11685 }, { "epoch": 1.9070886695489029, "grad_norm": 8.125, "learning_rate": 4.018422267811956e-05, "loss": 2.9171, "num_input_tokens_seen": 24429792, "step": 11690 }, { "epoch": 1.9079043967697202, "grad_norm": 7.09375, "learning_rate": 4.017642230818592e-05, "loss": 2.2483, "num_input_tokens_seen": 24439808, "step": 11695 }, { "epoch": 1.9087201239905376, "grad_norm": 14.125, "learning_rate": 4.0168619597857246e-05, "loss": 2.5742, "num_input_tokens_seen": 24449200, "step": 11700 }, { "epoch": 1.909535851211355, "grad_norm": 11.5625, "learning_rate": 4.016081454833681e-05, "loss": 3.0056, "num_input_tokens_seen": 24459552, "step": 11705 }, { "epoch": 1.9103515784321723, "grad_norm": 4.59375, "learning_rate": 4.0153007160828245e-05, "loss": 2.7527, "num_input_tokens_seen": 24469008, "step": 11710 }, { "epoch": 1.9111673056529896, "grad_norm": 6.625, "learning_rate": 4.0145197436535555e-05, "loss": 2.0328, "num_input_tokens_seen": 24480576, "step": 11715 }, { "epoch": 1.911983032873807, "grad_norm": 5.96875, "learning_rate": 4.0137385376663095e-05, "loss": 2.4346, "num_input_tokens_seen": 24489856, "step": 11720 }, { "epoch": 1.9127987600946244, "grad_norm": 8.3125, "learning_rate": 4.012957098241558e-05, "loss": 2.8905, "num_input_tokens_seen": 24500080, "step": 11725 }, { "epoch": 1.9136144873154417, "grad_norm": 3.21875, "learning_rate": 4.0121754254998076e-05, "loss": 1.3164, "num_input_tokens_seen": 24510480, "step": 11730 }, { "epoch": 1.914430214536259, "grad_norm": 5.90625, "learning_rate": 4.011393519561606e-05, "loss": 1.4809, "num_input_tokens_seen": 24519168, "step": 11735 }, { "epoch": 1.9152459417570764, "grad_norm": 5.46875, "learning_rate": 4.010611380547529e-05, "loss": 1.9885, "num_input_tokens_seen": 24528720, "step": 11740 }, { "epoch": 1.9160616689778938, "grad_norm": 3.4375, "learning_rate": 4.009829008578192e-05, "loss": 3.1988, "num_input_tokens_seen": 24539408, "step": 11745 }, { "epoch": 1.9168773961987111, "grad_norm": 6.40625, "learning_rate": 4.00904640377425e-05, "loss": 1.5636, "num_input_tokens_seen": 24549280, "step": 11750 }, { "epoch": 1.9176931234195285, "grad_norm": 5.40625, "learning_rate": 4.0082635662563886e-05, "loss": 2.5178, "num_input_tokens_seen": 24559536, "step": 11755 }, { "epoch": 1.9185088506403458, "grad_norm": 7.78125, "learning_rate": 4.007480496145331e-05, "loss": 3.2025, "num_input_tokens_seen": 24571120, "step": 11760 }, { "epoch": 1.9193245778611632, "grad_norm": 8.0625, "learning_rate": 4.006697193561837e-05, "loss": 1.9617, "num_input_tokens_seen": 24582864, "step": 11765 }, { "epoch": 1.9201403050819805, "grad_norm": 5.6875, "learning_rate": 4.005913658626701e-05, "loss": 2.34, "num_input_tokens_seen": 24592016, "step": 11770 }, { "epoch": 1.920956032302798, "grad_norm": 6.40625, "learning_rate": 4.005129891460754e-05, "loss": 2.3168, "num_input_tokens_seen": 24603728, "step": 11775 }, { "epoch": 1.9217717595236152, "grad_norm": 3.140625, "learning_rate": 4.004345892184864e-05, "loss": 1.9279, "num_input_tokens_seen": 24612416, "step": 11780 }, { "epoch": 1.9225874867444328, "grad_norm": 7.0625, "learning_rate": 4.003561660919932e-05, "loss": 1.7097, "num_input_tokens_seen": 24624080, "step": 11785 }, { "epoch": 1.92340321396525, "grad_norm": 5.78125, "learning_rate": 4.002777197786897e-05, "loss": 1.1837, "num_input_tokens_seen": 24633392, "step": 11790 }, { "epoch": 1.9242189411860675, "grad_norm": 11.375, "learning_rate": 4.0019925029067326e-05, "loss": 1.5724, "num_input_tokens_seen": 24643712, "step": 11795 }, { "epoch": 1.9250346684068846, "grad_norm": 6.40625, "learning_rate": 4.0012075764004495e-05, "loss": 2.077, "num_input_tokens_seen": 24655408, "step": 11800 }, { "epoch": 1.9250346684068846, "eval_loss": 2.532294511795044, "eval_runtime": 134.8739, "eval_samples_per_second": 20.204, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 24655408, "step": 11800 }, { "epoch": 1.9258503956277022, "grad_norm": 18.25, "learning_rate": 4.000422418389094e-05, "loss": 2.2146, "num_input_tokens_seen": 24666144, "step": 11805 }, { "epoch": 1.9266661228485193, "grad_norm": 6.46875, "learning_rate": 3.999637028993744e-05, "loss": 3.2053, "num_input_tokens_seen": 24677264, "step": 11810 }, { "epoch": 1.927481850069337, "grad_norm": 7.6875, "learning_rate": 3.99885140833552e-05, "loss": 2.0437, "num_input_tokens_seen": 24688864, "step": 11815 }, { "epoch": 1.928297577290154, "grad_norm": 12.25, "learning_rate": 3.998065556535572e-05, "loss": 2.5752, "num_input_tokens_seen": 24697600, "step": 11820 }, { "epoch": 1.9291133045109716, "grad_norm": 7.90625, "learning_rate": 3.9972794737150895e-05, "loss": 2.7432, "num_input_tokens_seen": 24708608, "step": 11825 }, { "epoch": 1.9299290317317888, "grad_norm": 6.9375, "learning_rate": 3.996493159995297e-05, "loss": 3.82, "num_input_tokens_seen": 24718944, "step": 11830 }, { "epoch": 1.9307447589526063, "grad_norm": 5.125, "learning_rate": 3.995706615497453e-05, "loss": 2.5464, "num_input_tokens_seen": 24729152, "step": 11835 }, { "epoch": 1.9315604861734235, "grad_norm": 10.1875, "learning_rate": 3.994919840342852e-05, "loss": 2.9565, "num_input_tokens_seen": 24739152, "step": 11840 }, { "epoch": 1.932376213394241, "grad_norm": 2.71875, "learning_rate": 3.994132834652825e-05, "loss": 2.8477, "num_input_tokens_seen": 24748384, "step": 11845 }, { "epoch": 1.9331919406150582, "grad_norm": 6.5625, "learning_rate": 3.99334559854874e-05, "loss": 2.235, "num_input_tokens_seen": 24760544, "step": 11850 }, { "epoch": 1.9340076678358757, "grad_norm": 5.9375, "learning_rate": 3.9925581321519955e-05, "loss": 2.7372, "num_input_tokens_seen": 24769376, "step": 11855 }, { "epoch": 1.934823395056693, "grad_norm": 8.8125, "learning_rate": 3.991770435584031e-05, "loss": 3.5214, "num_input_tokens_seen": 24778816, "step": 11860 }, { "epoch": 1.9356391222775104, "grad_norm": 3.734375, "learning_rate": 3.990982508966319e-05, "loss": 1.3603, "num_input_tokens_seen": 24789696, "step": 11865 }, { "epoch": 1.9364548494983278, "grad_norm": 10.875, "learning_rate": 3.990194352420367e-05, "loss": 2.0686, "num_input_tokens_seen": 24799536, "step": 11870 }, { "epoch": 1.9372705767191452, "grad_norm": 6.3125, "learning_rate": 3.9894059660677184e-05, "loss": 2.3669, "num_input_tokens_seen": 24809232, "step": 11875 }, { "epoch": 1.9380863039399625, "grad_norm": 11.25, "learning_rate": 3.9886173500299526e-05, "loss": 3.0545, "num_input_tokens_seen": 24819872, "step": 11880 }, { "epoch": 1.9389020311607799, "grad_norm": 10.1875, "learning_rate": 3.987828504428685e-05, "loss": 2.841, "num_input_tokens_seen": 24831072, "step": 11885 }, { "epoch": 1.9397177583815972, "grad_norm": 6.71875, "learning_rate": 3.987039429385565e-05, "loss": 2.1879, "num_input_tokens_seen": 24842384, "step": 11890 }, { "epoch": 1.9405334856024146, "grad_norm": 4.71875, "learning_rate": 3.986250125022277e-05, "loss": 2.2361, "num_input_tokens_seen": 24852016, "step": 11895 }, { "epoch": 1.941349212823232, "grad_norm": 5.875, "learning_rate": 3.985460591460544e-05, "loss": 2.6951, "num_input_tokens_seen": 24862448, "step": 11900 }, { "epoch": 1.9421649400440493, "grad_norm": 8.8125, "learning_rate": 3.984670828822118e-05, "loss": 2.2953, "num_input_tokens_seen": 24871696, "step": 11905 }, { "epoch": 1.9429806672648666, "grad_norm": 4.21875, "learning_rate": 3.983880837228794e-05, "loss": 2.6165, "num_input_tokens_seen": 24882912, "step": 11910 }, { "epoch": 1.943796394485684, "grad_norm": 2.578125, "learning_rate": 3.983090616802396e-05, "loss": 2.0177, "num_input_tokens_seen": 24892256, "step": 11915 }, { "epoch": 1.9446121217065013, "grad_norm": 5.3125, "learning_rate": 3.982300167664788e-05, "loss": 2.2415, "num_input_tokens_seen": 24902048, "step": 11920 }, { "epoch": 1.9454278489273187, "grad_norm": 8.8125, "learning_rate": 3.981509489937868e-05, "loss": 2.2327, "num_input_tokens_seen": 24912096, "step": 11925 }, { "epoch": 1.946243576148136, "grad_norm": 9.8125, "learning_rate": 3.9807185837435643e-05, "loss": 2.5083, "num_input_tokens_seen": 24923632, "step": 11930 }, { "epoch": 1.9470593033689534, "grad_norm": 8.125, "learning_rate": 3.9799274492038484e-05, "loss": 1.4533, "num_input_tokens_seen": 24933408, "step": 11935 }, { "epoch": 1.9478750305897707, "grad_norm": 5.21875, "learning_rate": 3.979136086440722e-05, "loss": 2.7647, "num_input_tokens_seen": 24944400, "step": 11940 }, { "epoch": 1.948690757810588, "grad_norm": 2.890625, "learning_rate": 3.9783444955762226e-05, "loss": 1.7154, "num_input_tokens_seen": 24955440, "step": 11945 }, { "epoch": 1.9495064850314054, "grad_norm": 11.5, "learning_rate": 3.977552676732424e-05, "loss": 1.8168, "num_input_tokens_seen": 24966176, "step": 11950 }, { "epoch": 1.9503222122522228, "grad_norm": 3.78125, "learning_rate": 3.976760630031435e-05, "loss": 1.6403, "num_input_tokens_seen": 24976736, "step": 11955 }, { "epoch": 1.9511379394730404, "grad_norm": 6.875, "learning_rate": 3.975968355595398e-05, "loss": 2.311, "num_input_tokens_seen": 24985568, "step": 11960 }, { "epoch": 1.9519536666938575, "grad_norm": 6.53125, "learning_rate": 3.9751758535464935e-05, "loss": 1.1853, "num_input_tokens_seen": 24996992, "step": 11965 }, { "epoch": 1.952769393914675, "grad_norm": 6.3125, "learning_rate": 3.9743831240069326e-05, "loss": 2.3758, "num_input_tokens_seen": 25008896, "step": 11970 }, { "epoch": 1.9535851211354922, "grad_norm": 10.4375, "learning_rate": 3.9735901670989675e-05, "loss": 2.9113, "num_input_tokens_seen": 25020304, "step": 11975 }, { "epoch": 1.9544008483563098, "grad_norm": 8.25, "learning_rate": 3.97279698294488e-05, "loss": 2.9867, "num_input_tokens_seen": 25030384, "step": 11980 }, { "epoch": 1.955216575577127, "grad_norm": 6.34375, "learning_rate": 3.9720035716669876e-05, "loss": 1.8336, "num_input_tokens_seen": 25040608, "step": 11985 }, { "epoch": 1.9560323027979445, "grad_norm": 7.375, "learning_rate": 3.9712099333876474e-05, "loss": 1.9539, "num_input_tokens_seen": 25051680, "step": 11990 }, { "epoch": 1.9568480300187616, "grad_norm": 11.875, "learning_rate": 3.9704160682292475e-05, "loss": 2.7914, "num_input_tokens_seen": 25062624, "step": 11995 }, { "epoch": 1.9576637572395792, "grad_norm": 5.59375, "learning_rate": 3.9696219763142106e-05, "loss": 2.8321, "num_input_tokens_seen": 25074096, "step": 12000 }, { "epoch": 1.9576637572395792, "eval_loss": 2.5416414737701416, "eval_runtime": 134.7761, "eval_samples_per_second": 20.219, "eval_steps_per_second": 10.113, "num_input_tokens_seen": 25074096, "step": 12000 }, { "epoch": 1.9584794844603963, "grad_norm": 10.125, "learning_rate": 3.968827657764997e-05, "loss": 3.2021, "num_input_tokens_seen": 25084000, "step": 12005 }, { "epoch": 1.959295211681214, "grad_norm": 11.4375, "learning_rate": 3.9680331127041e-05, "loss": 2.1219, "num_input_tokens_seen": 25093232, "step": 12010 }, { "epoch": 1.960110938902031, "grad_norm": 1.8828125, "learning_rate": 3.9672383412540495e-05, "loss": 2.0452, "num_input_tokens_seen": 25103312, "step": 12015 }, { "epoch": 1.9609266661228486, "grad_norm": 6.125, "learning_rate": 3.966443343537407e-05, "loss": 2.8641, "num_input_tokens_seen": 25113136, "step": 12020 }, { "epoch": 1.9617423933436657, "grad_norm": 8.1875, "learning_rate": 3.965648119676772e-05, "loss": 2.1686, "num_input_tokens_seen": 25124128, "step": 12025 }, { "epoch": 1.9625581205644833, "grad_norm": 6.5625, "learning_rate": 3.96485266979478e-05, "loss": 1.6585, "num_input_tokens_seen": 25134960, "step": 12030 }, { "epoch": 1.9633738477853004, "grad_norm": 10.375, "learning_rate": 3.9640569940140974e-05, "loss": 2.6192, "num_input_tokens_seen": 25146896, "step": 12035 }, { "epoch": 1.964189575006118, "grad_norm": 4.65625, "learning_rate": 3.963261092457428e-05, "loss": 1.6255, "num_input_tokens_seen": 25157648, "step": 12040 }, { "epoch": 1.9650053022269351, "grad_norm": 4.625, "learning_rate": 3.962464965247509e-05, "loss": 2.2282, "num_input_tokens_seen": 25169040, "step": 12045 }, { "epoch": 1.9658210294477527, "grad_norm": 10.8125, "learning_rate": 3.9616686125071135e-05, "loss": 3.2049, "num_input_tokens_seen": 25178944, "step": 12050 }, { "epoch": 1.96663675666857, "grad_norm": 9.5625, "learning_rate": 3.9608720343590506e-05, "loss": 2.4419, "num_input_tokens_seen": 25187888, "step": 12055 }, { "epoch": 1.9674524838893874, "grad_norm": 11.5625, "learning_rate": 3.960075230926161e-05, "loss": 3.2146, "num_input_tokens_seen": 25199440, "step": 12060 }, { "epoch": 1.9682682111102048, "grad_norm": 4.125, "learning_rate": 3.959278202331322e-05, "loss": 2.3154, "num_input_tokens_seen": 25209616, "step": 12065 }, { "epoch": 1.9690839383310221, "grad_norm": 8.5625, "learning_rate": 3.958480948697446e-05, "loss": 3.2495, "num_input_tokens_seen": 25220544, "step": 12070 }, { "epoch": 1.9698996655518395, "grad_norm": 7.28125, "learning_rate": 3.95768347014748e-05, "loss": 1.944, "num_input_tokens_seen": 25231424, "step": 12075 }, { "epoch": 1.9707153927726568, "grad_norm": 6.1875, "learning_rate": 3.956885766804404e-05, "loss": 2.4192, "num_input_tokens_seen": 25241712, "step": 12080 }, { "epoch": 1.9715311199934742, "grad_norm": 6.6875, "learning_rate": 3.956087838791235e-05, "loss": 2.8942, "num_input_tokens_seen": 25252192, "step": 12085 }, { "epoch": 1.9723468472142915, "grad_norm": 11.6875, "learning_rate": 3.955289686231022e-05, "loss": 2.9327, "num_input_tokens_seen": 25262416, "step": 12090 }, { "epoch": 1.973162574435109, "grad_norm": 6.1875, "learning_rate": 3.9544913092468504e-05, "loss": 3.3473, "num_input_tokens_seen": 25272464, "step": 12095 }, { "epoch": 1.9739783016559262, "grad_norm": 6.5625, "learning_rate": 3.9536927079618425e-05, "loss": 1.9518, "num_input_tokens_seen": 25283248, "step": 12100 }, { "epoch": 1.9747940288767436, "grad_norm": 0.16015625, "learning_rate": 3.9528938824991494e-05, "loss": 2.5185, "num_input_tokens_seen": 25292512, "step": 12105 }, { "epoch": 1.975609756097561, "grad_norm": 10.1875, "learning_rate": 3.952094832981962e-05, "loss": 2.4884, "num_input_tokens_seen": 25303888, "step": 12110 }, { "epoch": 1.9764254833183783, "grad_norm": 3.734375, "learning_rate": 3.951295559533503e-05, "loss": 3.4112, "num_input_tokens_seen": 25314464, "step": 12115 }, { "epoch": 1.9772412105391957, "grad_norm": 3.46875, "learning_rate": 3.95049606227703e-05, "loss": 3.3551, "num_input_tokens_seen": 25324080, "step": 12120 }, { "epoch": 1.978056937760013, "grad_norm": 7.875, "learning_rate": 3.949696341335838e-05, "loss": 2.7883, "num_input_tokens_seen": 25334816, "step": 12125 }, { "epoch": 1.9788726649808304, "grad_norm": 7.90625, "learning_rate": 3.9488963968332503e-05, "loss": 3.4786, "num_input_tokens_seen": 25345280, "step": 12130 }, { "epoch": 1.9796883922016477, "grad_norm": 5.90625, "learning_rate": 3.948096228892631e-05, "loss": 2.2978, "num_input_tokens_seen": 25356624, "step": 12135 }, { "epoch": 1.980504119422465, "grad_norm": 10.125, "learning_rate": 3.947295837637375e-05, "loss": 3.4422, "num_input_tokens_seen": 25367376, "step": 12140 }, { "epoch": 1.9813198466432826, "grad_norm": 5.90625, "learning_rate": 3.9464952231909135e-05, "loss": 2.3039, "num_input_tokens_seen": 25377584, "step": 12145 }, { "epoch": 1.9821355738640998, "grad_norm": 7.375, "learning_rate": 3.945694385676711e-05, "loss": 1.616, "num_input_tokens_seen": 25386672, "step": 12150 }, { "epoch": 1.9829513010849174, "grad_norm": 16.375, "learning_rate": 3.944893325218265e-05, "loss": 2.4857, "num_input_tokens_seen": 25396320, "step": 12155 }, { "epoch": 1.9837670283057345, "grad_norm": 5.53125, "learning_rate": 3.944092041939112e-05, "loss": 1.5507, "num_input_tokens_seen": 25406208, "step": 12160 }, { "epoch": 1.984582755526552, "grad_norm": 8.0, "learning_rate": 3.943290535962818e-05, "loss": 2.3132, "num_input_tokens_seen": 25417344, "step": 12165 }, { "epoch": 1.9853984827473692, "grad_norm": 4.21875, "learning_rate": 3.942488807412985e-05, "loss": 2.2424, "num_input_tokens_seen": 25427200, "step": 12170 }, { "epoch": 1.9862142099681868, "grad_norm": 10.5625, "learning_rate": 3.941686856413251e-05, "loss": 3.1834, "num_input_tokens_seen": 25437872, "step": 12175 }, { "epoch": 1.987029937189004, "grad_norm": 6.90625, "learning_rate": 3.9408846830872874e-05, "loss": 1.4161, "num_input_tokens_seen": 25447520, "step": 12180 }, { "epoch": 1.9878456644098215, "grad_norm": 3.765625, "learning_rate": 3.940082287558798e-05, "loss": 2.4365, "num_input_tokens_seen": 25458304, "step": 12185 }, { "epoch": 1.9886613916306386, "grad_norm": 7.125, "learning_rate": 3.939279669951522e-05, "loss": 2.1, "num_input_tokens_seen": 25467952, "step": 12190 }, { "epoch": 1.9894771188514562, "grad_norm": 17.5, "learning_rate": 3.938476830389234e-05, "loss": 1.7848, "num_input_tokens_seen": 25478480, "step": 12195 }, { "epoch": 1.9902928460722733, "grad_norm": 6.40625, "learning_rate": 3.937673768995742e-05, "loss": 1.0373, "num_input_tokens_seen": 25489056, "step": 12200 }, { "epoch": 1.9902928460722733, "eval_loss": 2.535943031311035, "eval_runtime": 134.7977, "eval_samples_per_second": 20.215, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 25489056, "step": 12200 }, { "epoch": 1.9911085732930909, "grad_norm": 8.1875, "learning_rate": 3.936870485894888e-05, "loss": 2.4247, "num_input_tokens_seen": 25498816, "step": 12205 }, { "epoch": 1.991924300513908, "grad_norm": 3.328125, "learning_rate": 3.9360669812105475e-05, "loss": 1.4053, "num_input_tokens_seen": 25509696, "step": 12210 }, { "epoch": 1.9927400277347256, "grad_norm": 5.40625, "learning_rate": 3.9352632550666325e-05, "loss": 1.7624, "num_input_tokens_seen": 25519360, "step": 12215 }, { "epoch": 1.9935557549555427, "grad_norm": 5.4375, "learning_rate": 3.9344593075870866e-05, "loss": 1.7833, "num_input_tokens_seen": 25530288, "step": 12220 }, { "epoch": 1.9943714821763603, "grad_norm": 4.9375, "learning_rate": 3.933655138895889e-05, "loss": 1.2549, "num_input_tokens_seen": 25539632, "step": 12225 }, { "epoch": 1.9951872093971774, "grad_norm": 5.34375, "learning_rate": 3.932850749117053e-05, "loss": 3.268, "num_input_tokens_seen": 25549744, "step": 12230 }, { "epoch": 1.996002936617995, "grad_norm": 9.9375, "learning_rate": 3.932046138374624e-05, "loss": 1.9572, "num_input_tokens_seen": 25558864, "step": 12235 }, { "epoch": 1.9968186638388123, "grad_norm": 9.9375, "learning_rate": 3.9312413067926854e-05, "loss": 2.6345, "num_input_tokens_seen": 25568784, "step": 12240 }, { "epoch": 1.9976343910596297, "grad_norm": 7.875, "learning_rate": 3.9304362544953506e-05, "loss": 3.4014, "num_input_tokens_seen": 25578816, "step": 12245 }, { "epoch": 1.998450118280447, "grad_norm": 2.328125, "learning_rate": 3.929630981606769e-05, "loss": 2.5976, "num_input_tokens_seen": 25589696, "step": 12250 }, { "epoch": 1.9992658455012644, "grad_norm": 9.875, "learning_rate": 3.928825488251124e-05, "loss": 2.1884, "num_input_tokens_seen": 25601184, "step": 12255 }, { "epoch": 2.0, "grad_norm": 0.06689453125, "learning_rate": 3.9280197745526344e-05, "loss": 1.8509, "num_input_tokens_seen": 25610688, "step": 12260 }, { "epoch": 2.0008157272208176, "grad_norm": 5.53125, "learning_rate": 3.9272138406355495e-05, "loss": 2.2133, "num_input_tokens_seen": 25620368, "step": 12265 }, { "epoch": 2.0016314544416347, "grad_norm": 4.5625, "learning_rate": 3.926407686624154e-05, "loss": 3.1083, "num_input_tokens_seen": 25629664, "step": 12270 }, { "epoch": 2.0024471816624523, "grad_norm": 2.875, "learning_rate": 3.9256013126427684e-05, "loss": 1.4889, "num_input_tokens_seen": 25640880, "step": 12275 }, { "epoch": 2.0032629088832694, "grad_norm": 5.03125, "learning_rate": 3.9247947188157455e-05, "loss": 2.3403, "num_input_tokens_seen": 25650480, "step": 12280 }, { "epoch": 2.004078636104087, "grad_norm": 5.65625, "learning_rate": 3.9239879052674715e-05, "loss": 1.2824, "num_input_tokens_seen": 25659424, "step": 12285 }, { "epoch": 2.004894363324904, "grad_norm": 11.4375, "learning_rate": 3.9231808721223673e-05, "loss": 2.865, "num_input_tokens_seen": 25669408, "step": 12290 }, { "epoch": 2.0057100905457217, "grad_norm": 1.5625, "learning_rate": 3.9223736195048886e-05, "loss": 2.9898, "num_input_tokens_seen": 25678880, "step": 12295 }, { "epoch": 2.006525817766539, "grad_norm": 7.53125, "learning_rate": 3.921566147539523e-05, "loss": 1.8816, "num_input_tokens_seen": 25689760, "step": 12300 }, { "epoch": 2.0073415449873564, "grad_norm": 7.625, "learning_rate": 3.920758456350792e-05, "loss": 2.2706, "num_input_tokens_seen": 25700656, "step": 12305 }, { "epoch": 2.0081572722081735, "grad_norm": 4.90625, "learning_rate": 3.919950546063253e-05, "loss": 1.9157, "num_input_tokens_seen": 25711920, "step": 12310 }, { "epoch": 2.008972999428991, "grad_norm": 5.28125, "learning_rate": 3.919142416801496e-05, "loss": 2.9542, "num_input_tokens_seen": 25721008, "step": 12315 }, { "epoch": 2.0097887266498082, "grad_norm": 6.8125, "learning_rate": 3.918334068690144e-05, "loss": 2.6724, "num_input_tokens_seen": 25732000, "step": 12320 }, { "epoch": 2.010604453870626, "grad_norm": 4.53125, "learning_rate": 3.917525501853855e-05, "loss": 1.734, "num_input_tokens_seen": 25742368, "step": 12325 }, { "epoch": 2.011420181091443, "grad_norm": 5.53125, "learning_rate": 3.916716716417319e-05, "loss": 1.4206, "num_input_tokens_seen": 25752880, "step": 12330 }, { "epoch": 2.0122359083122605, "grad_norm": 6.8125, "learning_rate": 3.915907712505263e-05, "loss": 2.1621, "num_input_tokens_seen": 25761968, "step": 12335 }, { "epoch": 2.0130516355330776, "grad_norm": 7.78125, "learning_rate": 3.915098490242444e-05, "loss": 2.5838, "num_input_tokens_seen": 25772688, "step": 12340 }, { "epoch": 2.013867362753895, "grad_norm": 7.53125, "learning_rate": 3.914289049753654e-05, "loss": 2.812, "num_input_tokens_seen": 25783792, "step": 12345 }, { "epoch": 2.0146830899747123, "grad_norm": 10.125, "learning_rate": 3.913479391163719e-05, "loss": 3.0933, "num_input_tokens_seen": 25794128, "step": 12350 }, { "epoch": 2.01549881719553, "grad_norm": 3.4375, "learning_rate": 3.9126695145975e-05, "loss": 0.9704, "num_input_tokens_seen": 25805056, "step": 12355 }, { "epoch": 2.016314544416347, "grad_norm": 2.8125, "learning_rate": 3.911859420179889e-05, "loss": 2.623, "num_input_tokens_seen": 25814320, "step": 12360 }, { "epoch": 2.0171302716371646, "grad_norm": 4.0625, "learning_rate": 3.911049108035813e-05, "loss": 2.7762, "num_input_tokens_seen": 25825520, "step": 12365 }, { "epoch": 2.0179459988579818, "grad_norm": 2.609375, "learning_rate": 3.910238578290232e-05, "loss": 1.5594, "num_input_tokens_seen": 25836208, "step": 12370 }, { "epoch": 2.0187617260787993, "grad_norm": 6.21875, "learning_rate": 3.90942783106814e-05, "loss": 2.3027, "num_input_tokens_seen": 25844720, "step": 12375 }, { "epoch": 2.0195774532996165, "grad_norm": 2.46875, "learning_rate": 3.908616866494564e-05, "loss": 1.9183, "num_input_tokens_seen": 25854864, "step": 12380 }, { "epoch": 2.020393180520434, "grad_norm": 5.375, "learning_rate": 3.907805684694566e-05, "loss": 2.7487, "num_input_tokens_seen": 25865968, "step": 12385 }, { "epoch": 2.021208907741251, "grad_norm": 3.265625, "learning_rate": 3.90699428579324e-05, "loss": 0.5792, "num_input_tokens_seen": 25876128, "step": 12390 }, { "epoch": 2.0220246349620687, "grad_norm": 9.375, "learning_rate": 3.906182669915713e-05, "loss": 3.8842, "num_input_tokens_seen": 25886736, "step": 12395 }, { "epoch": 2.022840362182886, "grad_norm": 3.140625, "learning_rate": 3.9053708371871476e-05, "loss": 1.368, "num_input_tokens_seen": 25898496, "step": 12400 }, { "epoch": 2.022840362182886, "eval_loss": 2.5435256958007812, "eval_runtime": 135.048, "eval_samples_per_second": 20.178, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 25898496, "step": 12400 }, { "epoch": 2.0236560894037035, "grad_norm": 9.25, "learning_rate": 3.904558787732738e-05, "loss": 2.179, "num_input_tokens_seen": 25908560, "step": 12405 }, { "epoch": 2.0244718166245206, "grad_norm": 3.203125, "learning_rate": 3.9037465216777135e-05, "loss": 1.9731, "num_input_tokens_seen": 25919824, "step": 12410 }, { "epoch": 2.025287543845338, "grad_norm": 9.5, "learning_rate": 3.902934039147334e-05, "loss": 2.8509, "num_input_tokens_seen": 25929968, "step": 12415 }, { "epoch": 2.0261032710661553, "grad_norm": 5.125, "learning_rate": 3.902121340266894e-05, "loss": 2.4208, "num_input_tokens_seen": 25941216, "step": 12420 }, { "epoch": 2.026918998286973, "grad_norm": 9.8125, "learning_rate": 3.9013084251617246e-05, "loss": 3.5501, "num_input_tokens_seen": 25950816, "step": 12425 }, { "epoch": 2.02773472550779, "grad_norm": 8.4375, "learning_rate": 3.9004952939571865e-05, "loss": 1.962, "num_input_tokens_seen": 25962336, "step": 12430 }, { "epoch": 2.0285504527286076, "grad_norm": 10.1875, "learning_rate": 3.899681946778673e-05, "loss": 3.5948, "num_input_tokens_seen": 25971072, "step": 12435 }, { "epoch": 2.0293661799494247, "grad_norm": 5.15625, "learning_rate": 3.898868383751615e-05, "loss": 2.3573, "num_input_tokens_seen": 25982448, "step": 12440 }, { "epoch": 2.0301819071702423, "grad_norm": 10.4375, "learning_rate": 3.8980546050014724e-05, "loss": 2.226, "num_input_tokens_seen": 25993344, "step": 12445 }, { "epoch": 2.03099763439106, "grad_norm": 4.90625, "learning_rate": 3.897240610653741e-05, "loss": 1.801, "num_input_tokens_seen": 26004288, "step": 12450 }, { "epoch": 2.031813361611877, "grad_norm": 13.5625, "learning_rate": 3.896426400833948e-05, "loss": 2.6098, "num_input_tokens_seen": 26014928, "step": 12455 }, { "epoch": 2.0326290888326946, "grad_norm": 3.4375, "learning_rate": 3.895611975667656e-05, "loss": 3.965, "num_input_tokens_seen": 26025088, "step": 12460 }, { "epoch": 2.0334448160535117, "grad_norm": 4.71875, "learning_rate": 3.8947973352804584e-05, "loss": 2.8017, "num_input_tokens_seen": 26036816, "step": 12465 }, { "epoch": 2.0342605432743293, "grad_norm": 10.125, "learning_rate": 3.893982479797984e-05, "loss": 4.9591, "num_input_tokens_seen": 26047488, "step": 12470 }, { "epoch": 2.0350762704951464, "grad_norm": 8.375, "learning_rate": 3.8931674093458926e-05, "loss": 3.0106, "num_input_tokens_seen": 26058448, "step": 12475 }, { "epoch": 2.035891997715964, "grad_norm": 6.46875, "learning_rate": 3.89235212404988e-05, "loss": 2.3259, "num_input_tokens_seen": 26069760, "step": 12480 }, { "epoch": 2.036707724936781, "grad_norm": 4.5, "learning_rate": 3.891536624035672e-05, "loss": 1.1691, "num_input_tokens_seen": 26079680, "step": 12485 }, { "epoch": 2.0375234521575987, "grad_norm": 4.28125, "learning_rate": 3.8907209094290295e-05, "loss": 2.4451, "num_input_tokens_seen": 26090800, "step": 12490 }, { "epoch": 2.038339179378416, "grad_norm": 1.8828125, "learning_rate": 3.8899049803557466e-05, "loss": 1.0834, "num_input_tokens_seen": 26100384, "step": 12495 }, { "epoch": 2.0391549065992334, "grad_norm": 6.15625, "learning_rate": 3.889088836941648e-05, "loss": 3.3338, "num_input_tokens_seen": 26109712, "step": 12500 }, { "epoch": 2.0399706338200505, "grad_norm": 8.125, "learning_rate": 3.8882724793125946e-05, "loss": 2.2804, "num_input_tokens_seen": 26121536, "step": 12505 }, { "epoch": 2.040786361040868, "grad_norm": 12.9375, "learning_rate": 3.8874559075944794e-05, "loss": 1.2882, "num_input_tokens_seen": 26133024, "step": 12510 }, { "epoch": 2.041602088261685, "grad_norm": 9.125, "learning_rate": 3.886639121913227e-05, "loss": 1.6742, "num_input_tokens_seen": 26143376, "step": 12515 }, { "epoch": 2.042417815482503, "grad_norm": 10.375, "learning_rate": 3.885822122394797e-05, "loss": 2.8512, "num_input_tokens_seen": 26154608, "step": 12520 }, { "epoch": 2.04323354270332, "grad_norm": 10.25, "learning_rate": 3.8850049091651794e-05, "loss": 2.3516, "num_input_tokens_seen": 26165248, "step": 12525 }, { "epoch": 2.0440492699241375, "grad_norm": 4.625, "learning_rate": 3.8841874823504e-05, "loss": 1.7105, "num_input_tokens_seen": 26175952, "step": 12530 }, { "epoch": 2.0448649971449546, "grad_norm": 9.375, "learning_rate": 3.8833698420765157e-05, "loss": 2.4398, "num_input_tokens_seen": 26185984, "step": 12535 }, { "epoch": 2.045680724365772, "grad_norm": 8.0625, "learning_rate": 3.882551988469618e-05, "loss": 1.8864, "num_input_tokens_seen": 26196656, "step": 12540 }, { "epoch": 2.0464964515865893, "grad_norm": 7.59375, "learning_rate": 3.881733921655829e-05, "loss": 2.3541, "num_input_tokens_seen": 26206848, "step": 12545 }, { "epoch": 2.047312178807407, "grad_norm": 9.125, "learning_rate": 3.8809156417613054e-05, "loss": 2.796, "num_input_tokens_seen": 26216768, "step": 12550 }, { "epoch": 2.048127906028224, "grad_norm": 3.515625, "learning_rate": 3.8800971489122364e-05, "loss": 1.947, "num_input_tokens_seen": 26226320, "step": 12555 }, { "epoch": 2.0489436332490416, "grad_norm": 10.3125, "learning_rate": 3.8792784432348434e-05, "loss": 2.4148, "num_input_tokens_seen": 26236448, "step": 12560 }, { "epoch": 2.0497593604698587, "grad_norm": 7.9375, "learning_rate": 3.878459524855381e-05, "loss": 2.6365, "num_input_tokens_seen": 26247392, "step": 12565 }, { "epoch": 2.0505750876906763, "grad_norm": 6.34375, "learning_rate": 3.8776403939001384e-05, "loss": 2.5982, "num_input_tokens_seen": 26257872, "step": 12570 }, { "epoch": 2.0513908149114934, "grad_norm": 0.06640625, "learning_rate": 3.876821050495433e-05, "loss": 2.456, "num_input_tokens_seen": 26267872, "step": 12575 }, { "epoch": 2.052206542132311, "grad_norm": 12.25, "learning_rate": 3.87600149476762e-05, "loss": 4.0875, "num_input_tokens_seen": 26279088, "step": 12580 }, { "epoch": 2.053022269353128, "grad_norm": 4.53125, "learning_rate": 3.8751817268430843e-05, "loss": 1.6644, "num_input_tokens_seen": 26289664, "step": 12585 }, { "epoch": 2.0538379965739457, "grad_norm": 0.0966796875, "learning_rate": 3.8743617468482464e-05, "loss": 1.612, "num_input_tokens_seen": 26298768, "step": 12590 }, { "epoch": 2.054653723794763, "grad_norm": 7.9375, "learning_rate": 3.8735415549095535e-05, "loss": 2.4845, "num_input_tokens_seen": 26309216, "step": 12595 }, { "epoch": 2.0554694510155804, "grad_norm": 4.4375, "learning_rate": 3.8727211511534934e-05, "loss": 2.2438, "num_input_tokens_seen": 26319696, "step": 12600 }, { "epoch": 2.0554694510155804, "eval_loss": 2.54951810836792, "eval_runtime": 134.8545, "eval_samples_per_second": 20.207, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 26319696, "step": 12600 }, { "epoch": 2.0562851782363976, "grad_norm": 8.6875, "learning_rate": 3.8719005357065804e-05, "loss": 2.1388, "num_input_tokens_seen": 26329824, "step": 12605 }, { "epoch": 2.057100905457215, "grad_norm": 7.78125, "learning_rate": 3.8710797086953645e-05, "loss": 2.6026, "num_input_tokens_seen": 26340432, "step": 12610 }, { "epoch": 2.0579166326780323, "grad_norm": 11.0, "learning_rate": 3.870258670246427e-05, "loss": 3.012, "num_input_tokens_seen": 26351760, "step": 12615 }, { "epoch": 2.05873235989885, "grad_norm": 5.46875, "learning_rate": 3.869437420486384e-05, "loss": 1.6749, "num_input_tokens_seen": 26361696, "step": 12620 }, { "epoch": 2.059548087119667, "grad_norm": 10.0625, "learning_rate": 3.8686159595418805e-05, "loss": 2.9085, "num_input_tokens_seen": 26370592, "step": 12625 }, { "epoch": 2.0603638143404845, "grad_norm": 3.5625, "learning_rate": 3.867794287539597e-05, "loss": 2.6304, "num_input_tokens_seen": 26382208, "step": 12630 }, { "epoch": 2.0611795415613017, "grad_norm": 18.125, "learning_rate": 3.866972404606245e-05, "loss": 1.9655, "num_input_tokens_seen": 26392192, "step": 12635 }, { "epoch": 2.0619952687821193, "grad_norm": 9.1875, "learning_rate": 3.866150310868571e-05, "loss": 2.4638, "num_input_tokens_seen": 26403872, "step": 12640 }, { "epoch": 2.062810996002937, "grad_norm": 8.1875, "learning_rate": 3.8653280064533506e-05, "loss": 1.692, "num_input_tokens_seen": 26415312, "step": 12645 }, { "epoch": 2.063626723223754, "grad_norm": 7.21875, "learning_rate": 3.864505491487394e-05, "loss": 3.9869, "num_input_tokens_seen": 26426144, "step": 12650 }, { "epoch": 2.0644424504445715, "grad_norm": 2.40625, "learning_rate": 3.8636827660975414e-05, "loss": 2.282, "num_input_tokens_seen": 26436112, "step": 12655 }, { "epoch": 2.0652581776653887, "grad_norm": 15.0625, "learning_rate": 3.862859830410671e-05, "loss": 2.1649, "num_input_tokens_seen": 26447408, "step": 12660 }, { "epoch": 2.0660739048862062, "grad_norm": 5.5625, "learning_rate": 3.862036684553688e-05, "loss": 2.5149, "num_input_tokens_seen": 26459504, "step": 12665 }, { "epoch": 2.0668896321070234, "grad_norm": 8.0, "learning_rate": 3.8612133286535314e-05, "loss": 2.5804, "num_input_tokens_seen": 26469904, "step": 12670 }, { "epoch": 2.067705359327841, "grad_norm": 4.375, "learning_rate": 3.860389762837173e-05, "loss": 1.7207, "num_input_tokens_seen": 26479040, "step": 12675 }, { "epoch": 2.068521086548658, "grad_norm": 3.28125, "learning_rate": 3.859565987231618e-05, "loss": 1.9384, "num_input_tokens_seen": 26488256, "step": 12680 }, { "epoch": 2.0693368137694756, "grad_norm": 5.53125, "learning_rate": 3.858742001963902e-05, "loss": 2.1978, "num_input_tokens_seen": 26500016, "step": 12685 }, { "epoch": 2.0701525409902928, "grad_norm": 7.3125, "learning_rate": 3.857917807161094e-05, "loss": 3.1785, "num_input_tokens_seen": 26510464, "step": 12690 }, { "epoch": 2.0709682682111104, "grad_norm": 8.3125, "learning_rate": 3.857093402950296e-05, "loss": 1.6891, "num_input_tokens_seen": 26521344, "step": 12695 }, { "epoch": 2.0717839954319275, "grad_norm": 14.0, "learning_rate": 3.8562687894586414e-05, "loss": 2.6856, "num_input_tokens_seen": 26532416, "step": 12700 }, { "epoch": 2.072599722652745, "grad_norm": 8.4375, "learning_rate": 3.8554439668132946e-05, "loss": 2.0034, "num_input_tokens_seen": 26543520, "step": 12705 }, { "epoch": 2.073415449873562, "grad_norm": 7.0, "learning_rate": 3.854618935141455e-05, "loss": 2.4925, "num_input_tokens_seen": 26554352, "step": 12710 }, { "epoch": 2.0742311770943798, "grad_norm": 4.96875, "learning_rate": 3.8537936945703525e-05, "loss": 2.1544, "num_input_tokens_seen": 26565488, "step": 12715 }, { "epoch": 2.075046904315197, "grad_norm": 11.25, "learning_rate": 3.852968245227249e-05, "loss": 3.0383, "num_input_tokens_seen": 26576080, "step": 12720 }, { "epoch": 2.0758626315360145, "grad_norm": 8.75, "learning_rate": 3.85214258723944e-05, "loss": 2.0196, "num_input_tokens_seen": 26585968, "step": 12725 }, { "epoch": 2.0766783587568316, "grad_norm": 8.4375, "learning_rate": 3.8513167207342524e-05, "loss": 2.1878, "num_input_tokens_seen": 26597760, "step": 12730 }, { "epoch": 2.077494085977649, "grad_norm": 5.1875, "learning_rate": 3.850490645839044e-05, "loss": 2.8964, "num_input_tokens_seen": 26607568, "step": 12735 }, { "epoch": 2.0783098131984663, "grad_norm": 11.3125, "learning_rate": 3.849664362681207e-05, "loss": 2.9705, "num_input_tokens_seen": 26616912, "step": 12740 }, { "epoch": 2.079125540419284, "grad_norm": 10.0625, "learning_rate": 3.848837871388165e-05, "loss": 2.3568, "num_input_tokens_seen": 26628064, "step": 12745 }, { "epoch": 2.079941267640101, "grad_norm": 7.75, "learning_rate": 3.848011172087371e-05, "loss": 2.1284, "num_input_tokens_seen": 26640128, "step": 12750 }, { "epoch": 2.0807569948609186, "grad_norm": 3.46875, "learning_rate": 3.847184264906315e-05, "loss": 1.8572, "num_input_tokens_seen": 26650512, "step": 12755 }, { "epoch": 2.0815727220817357, "grad_norm": 5.21875, "learning_rate": 3.846357149972516e-05, "loss": 2.0123, "num_input_tokens_seen": 26661488, "step": 12760 }, { "epoch": 2.0823884493025533, "grad_norm": 2.703125, "learning_rate": 3.8455298274135246e-05, "loss": 2.0347, "num_input_tokens_seen": 26671744, "step": 12765 }, { "epoch": 2.0832041765233704, "grad_norm": 4.125, "learning_rate": 3.8447022973569254e-05, "loss": 1.7539, "num_input_tokens_seen": 26681824, "step": 12770 }, { "epoch": 2.084019903744188, "grad_norm": 5.34375, "learning_rate": 3.843874559930332e-05, "loss": 2.0657, "num_input_tokens_seen": 26692400, "step": 12775 }, { "epoch": 2.084835630965005, "grad_norm": 6.34375, "learning_rate": 3.843046615261394e-05, "loss": 1.2089, "num_input_tokens_seen": 26703200, "step": 12780 }, { "epoch": 2.0856513581858227, "grad_norm": 5.375, "learning_rate": 3.842218463477791e-05, "loss": 1.5922, "num_input_tokens_seen": 26713824, "step": 12785 }, { "epoch": 2.08646708540664, "grad_norm": 13.4375, "learning_rate": 3.841390104707233e-05, "loss": 2.7453, "num_input_tokens_seen": 26724832, "step": 12790 }, { "epoch": 2.0872828126274574, "grad_norm": 4.9375, "learning_rate": 3.8405615390774643e-05, "loss": 2.648, "num_input_tokens_seen": 26735552, "step": 12795 }, { "epoch": 2.0880985398482745, "grad_norm": 5.40625, "learning_rate": 3.839732766716259e-05, "loss": 2.1512, "num_input_tokens_seen": 26744896, "step": 12800 }, { "epoch": 2.0880985398482745, "eval_loss": 2.5462207794189453, "eval_runtime": 135.1489, "eval_samples_per_second": 20.163, "eval_steps_per_second": 10.085, "num_input_tokens_seen": 26744896, "step": 12800 }, { "epoch": 2.088914267069092, "grad_norm": 10.0, "learning_rate": 3.838903787751425e-05, "loss": 2.6636, "num_input_tokens_seen": 26754400, "step": 12805 }, { "epoch": 2.0897299942899092, "grad_norm": 5.84375, "learning_rate": 3.838074602310802e-05, "loss": 1.5369, "num_input_tokens_seen": 26765168, "step": 12810 }, { "epoch": 2.090545721510727, "grad_norm": 4.03125, "learning_rate": 3.837245210522258e-05, "loss": 1.29, "num_input_tokens_seen": 26775520, "step": 12815 }, { "epoch": 2.0913614487315444, "grad_norm": 5.84375, "learning_rate": 3.8364156125136996e-05, "loss": 2.7491, "num_input_tokens_seen": 26784816, "step": 12820 }, { "epoch": 2.0921771759523615, "grad_norm": 13.625, "learning_rate": 3.835585808413059e-05, "loss": 2.2804, "num_input_tokens_seen": 26795056, "step": 12825 }, { "epoch": 2.092992903173179, "grad_norm": 5.15625, "learning_rate": 3.8347557983483024e-05, "loss": 1.6828, "num_input_tokens_seen": 26806992, "step": 12830 }, { "epoch": 2.0938086303939962, "grad_norm": 7.4375, "learning_rate": 3.833925582447428e-05, "loss": 3.3858, "num_input_tokens_seen": 26817088, "step": 12835 }, { "epoch": 2.094624357614814, "grad_norm": 7.09375, "learning_rate": 3.8330951608384656e-05, "loss": 2.9799, "num_input_tokens_seen": 26827392, "step": 12840 }, { "epoch": 2.095440084835631, "grad_norm": 5.03125, "learning_rate": 3.832264533649477e-05, "loss": 2.9696, "num_input_tokens_seen": 26836096, "step": 12845 }, { "epoch": 2.0962558120564485, "grad_norm": 1.78125, "learning_rate": 3.8314337010085555e-05, "loss": 1.6584, "num_input_tokens_seen": 26846768, "step": 12850 }, { "epoch": 2.0970715392772656, "grad_norm": 5.8125, "learning_rate": 3.830602663043824e-05, "loss": 2.2998, "num_input_tokens_seen": 26857536, "step": 12855 }, { "epoch": 2.097887266498083, "grad_norm": 8.5625, "learning_rate": 3.8297714198834414e-05, "loss": 2.8623, "num_input_tokens_seen": 26868368, "step": 12860 }, { "epoch": 2.0987029937189003, "grad_norm": 2.734375, "learning_rate": 3.828939971655595e-05, "loss": 2.1364, "num_input_tokens_seen": 26877584, "step": 12865 }, { "epoch": 2.099518720939718, "grad_norm": 3.765625, "learning_rate": 3.828108318488505e-05, "loss": 1.6517, "num_input_tokens_seen": 26889136, "step": 12870 }, { "epoch": 2.100334448160535, "grad_norm": 8.1875, "learning_rate": 3.8272764605104216e-05, "loss": 2.905, "num_input_tokens_seen": 26899952, "step": 12875 }, { "epoch": 2.1011501753813526, "grad_norm": 8.0625, "learning_rate": 3.826444397849628e-05, "loss": 2.2225, "num_input_tokens_seen": 26910688, "step": 12880 }, { "epoch": 2.1019659026021698, "grad_norm": 9.3125, "learning_rate": 3.825612130634439e-05, "loss": 2.1163, "num_input_tokens_seen": 26922000, "step": 12885 }, { "epoch": 2.1027816298229873, "grad_norm": 3.703125, "learning_rate": 3.824779658993202e-05, "loss": 1.7419, "num_input_tokens_seen": 26932608, "step": 12890 }, { "epoch": 2.1035973570438045, "grad_norm": 8.375, "learning_rate": 3.823946983054292e-05, "loss": 2.8928, "num_input_tokens_seen": 26943520, "step": 12895 }, { "epoch": 2.104413084264622, "grad_norm": 15.8125, "learning_rate": 3.82311410294612e-05, "loss": 3.0741, "num_input_tokens_seen": 26954048, "step": 12900 }, { "epoch": 2.105228811485439, "grad_norm": 7.8125, "learning_rate": 3.822281018797127e-05, "loss": 1.3691, "num_input_tokens_seen": 26965152, "step": 12905 }, { "epoch": 2.1060445387062567, "grad_norm": 13.4375, "learning_rate": 3.821447730735783e-05, "loss": 3.287, "num_input_tokens_seen": 26975120, "step": 12910 }, { "epoch": 2.106860265927074, "grad_norm": 13.375, "learning_rate": 3.820614238890592e-05, "loss": 3.7302, "num_input_tokens_seen": 26986816, "step": 12915 }, { "epoch": 2.1076759931478914, "grad_norm": 5.375, "learning_rate": 3.819780543390091e-05, "loss": 0.7774, "num_input_tokens_seen": 26998400, "step": 12920 }, { "epoch": 2.1084917203687086, "grad_norm": 3.609375, "learning_rate": 3.818946644362844e-05, "loss": 1.7324, "num_input_tokens_seen": 27007616, "step": 12925 }, { "epoch": 2.109307447589526, "grad_norm": 10.25, "learning_rate": 3.81811254193745e-05, "loss": 2.3223, "num_input_tokens_seen": 27019168, "step": 12930 }, { "epoch": 2.1101231748103433, "grad_norm": 4.0, "learning_rate": 3.8172782362425366e-05, "loss": 2.6146, "num_input_tokens_seen": 27030048, "step": 12935 }, { "epoch": 2.110938902031161, "grad_norm": 5.96875, "learning_rate": 3.816443727406765e-05, "loss": 1.7589, "num_input_tokens_seen": 27040080, "step": 12940 }, { "epoch": 2.111754629251978, "grad_norm": 7.375, "learning_rate": 3.815609015558829e-05, "loss": 1.9522, "num_input_tokens_seen": 27050496, "step": 12945 }, { "epoch": 2.1125703564727956, "grad_norm": 9.6875, "learning_rate": 3.814774100827448e-05, "loss": 3.4615, "num_input_tokens_seen": 27060784, "step": 12950 }, { "epoch": 2.1133860836936127, "grad_norm": 14.5625, "learning_rate": 3.813938983341379e-05, "loss": 2.9793, "num_input_tokens_seen": 27071056, "step": 12955 }, { "epoch": 2.1142018109144303, "grad_norm": 8.0625, "learning_rate": 3.813103663229407e-05, "loss": 2.3659, "num_input_tokens_seen": 27080096, "step": 12960 }, { "epoch": 2.1150175381352474, "grad_norm": 9.25, "learning_rate": 3.812268140620349e-05, "loss": 2.5049, "num_input_tokens_seen": 27091264, "step": 12965 }, { "epoch": 2.115833265356065, "grad_norm": 6.09375, "learning_rate": 3.811432415643051e-05, "loss": 2.0872, "num_input_tokens_seen": 27101040, "step": 12970 }, { "epoch": 2.116648992576882, "grad_norm": 4.84375, "learning_rate": 3.8105964884263954e-05, "loss": 2.8819, "num_input_tokens_seen": 27113424, "step": 12975 }, { "epoch": 2.1174647197976997, "grad_norm": 4.03125, "learning_rate": 3.809760359099291e-05, "loss": 2.963, "num_input_tokens_seen": 27123488, "step": 12980 }, { "epoch": 2.118280447018517, "grad_norm": 4.65625, "learning_rate": 3.8089240277906804e-05, "loss": 2.4296, "num_input_tokens_seen": 27134432, "step": 12985 }, { "epoch": 2.1190961742393344, "grad_norm": 5.59375, "learning_rate": 3.808087494629535e-05, "loss": 2.4118, "num_input_tokens_seen": 27144976, "step": 12990 }, { "epoch": 2.1199119014601515, "grad_norm": 7.46875, "learning_rate": 3.8072507597448595e-05, "loss": 2.7485, "num_input_tokens_seen": 27155152, "step": 12995 }, { "epoch": 2.120727628680969, "grad_norm": 10.3125, "learning_rate": 3.806413823265689e-05, "loss": 1.7513, "num_input_tokens_seen": 27166064, "step": 13000 }, { "epoch": 2.120727628680969, "eval_loss": 2.5287065505981445, "eval_runtime": 134.8393, "eval_samples_per_second": 20.209, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 27166064, "step": 13000 }, { "epoch": 2.121543355901786, "grad_norm": 3.203125, "learning_rate": 3.805576685321089e-05, "loss": 2.6723, "num_input_tokens_seen": 27177456, "step": 13005 }, { "epoch": 2.122359083122604, "grad_norm": 3.96875, "learning_rate": 3.804739346040158e-05, "loss": 1.2507, "num_input_tokens_seen": 27186464, "step": 13010 }, { "epoch": 2.1231748103434214, "grad_norm": 4.9375, "learning_rate": 3.8039018055520234e-05, "loss": 2.8962, "num_input_tokens_seen": 27196320, "step": 13015 }, { "epoch": 2.1239905375642385, "grad_norm": 7.90625, "learning_rate": 3.803064063985844e-05, "loss": 2.2396, "num_input_tokens_seen": 27206992, "step": 13020 }, { "epoch": 2.124806264785056, "grad_norm": 2.234375, "learning_rate": 3.802226121470811e-05, "loss": 1.8954, "num_input_tokens_seen": 27218304, "step": 13025 }, { "epoch": 2.125621992005873, "grad_norm": 3.578125, "learning_rate": 3.801387978136145e-05, "loss": 3.4184, "num_input_tokens_seen": 27229040, "step": 13030 }, { "epoch": 2.126437719226691, "grad_norm": 7.625, "learning_rate": 3.800549634111099e-05, "loss": 2.4736, "num_input_tokens_seen": 27240064, "step": 13035 }, { "epoch": 2.127253446447508, "grad_norm": 9.9375, "learning_rate": 3.799711089524955e-05, "loss": 3.6355, "num_input_tokens_seen": 27249600, "step": 13040 }, { "epoch": 2.1280691736683255, "grad_norm": 2.265625, "learning_rate": 3.7988723445070285e-05, "loss": 1.1616, "num_input_tokens_seen": 27259264, "step": 13045 }, { "epoch": 2.1288849008891426, "grad_norm": 7.0625, "learning_rate": 3.798033399186663e-05, "loss": 1.3842, "num_input_tokens_seen": 27270784, "step": 13050 }, { "epoch": 2.12970062810996, "grad_norm": 4.34375, "learning_rate": 3.797194253693237e-05, "loss": 1.4918, "num_input_tokens_seen": 27281168, "step": 13055 }, { "epoch": 2.1305163553307773, "grad_norm": 5.46875, "learning_rate": 3.796354908156153e-05, "loss": 2.7857, "num_input_tokens_seen": 27290976, "step": 13060 }, { "epoch": 2.131332082551595, "grad_norm": 7.21875, "learning_rate": 3.795515362704853e-05, "loss": 3.5724, "num_input_tokens_seen": 27301088, "step": 13065 }, { "epoch": 2.132147809772412, "grad_norm": 8.4375, "learning_rate": 3.794675617468803e-05, "loss": 1.7854, "num_input_tokens_seen": 27311856, "step": 13070 }, { "epoch": 2.1329635369932296, "grad_norm": 8.5, "learning_rate": 3.793835672577503e-05, "loss": 2.7045, "num_input_tokens_seen": 27323280, "step": 13075 }, { "epoch": 2.1337792642140467, "grad_norm": 5.125, "learning_rate": 3.7929955281604826e-05, "loss": 2.5457, "num_input_tokens_seen": 27332192, "step": 13080 }, { "epoch": 2.1345949914348643, "grad_norm": 10.25, "learning_rate": 3.7921551843473036e-05, "loss": 1.4884, "num_input_tokens_seen": 27341712, "step": 13085 }, { "epoch": 2.1354107186556814, "grad_norm": 6.125, "learning_rate": 3.791314641267557e-05, "loss": 2.5575, "num_input_tokens_seen": 27352880, "step": 13090 }, { "epoch": 2.136226445876499, "grad_norm": 6.0, "learning_rate": 3.790473899050864e-05, "loss": 2.2465, "num_input_tokens_seen": 27362432, "step": 13095 }, { "epoch": 2.137042173097316, "grad_norm": 12.0625, "learning_rate": 3.7896329578268794e-05, "loss": 2.9499, "num_input_tokens_seen": 27373728, "step": 13100 }, { "epoch": 2.1378579003181337, "grad_norm": 6.8125, "learning_rate": 3.7887918177252855e-05, "loss": 1.6174, "num_input_tokens_seen": 27383936, "step": 13105 }, { "epoch": 2.138673627538951, "grad_norm": 6.28125, "learning_rate": 3.787950478875798e-05, "loss": 2.5031, "num_input_tokens_seen": 27396240, "step": 13110 }, { "epoch": 2.1394893547597684, "grad_norm": 9.5, "learning_rate": 3.787108941408162e-05, "loss": 2.6774, "num_input_tokens_seen": 27406352, "step": 13115 }, { "epoch": 2.1403050819805856, "grad_norm": 5.78125, "learning_rate": 3.786267205452151e-05, "loss": 1.6694, "num_input_tokens_seen": 27415696, "step": 13120 }, { "epoch": 2.141120809201403, "grad_norm": 3.53125, "learning_rate": 3.785425271137573e-05, "loss": 2.0385, "num_input_tokens_seen": 27425504, "step": 13125 }, { "epoch": 2.1419365364222203, "grad_norm": 8.75, "learning_rate": 3.7845831385942655e-05, "loss": 3.7738, "num_input_tokens_seen": 27437184, "step": 13130 }, { "epoch": 2.142752263643038, "grad_norm": 0.1962890625, "learning_rate": 3.7837408079520944e-05, "loss": 2.089, "num_input_tokens_seen": 27446608, "step": 13135 }, { "epoch": 2.143567990863855, "grad_norm": 4.65625, "learning_rate": 3.782898279340957e-05, "loss": 1.5335, "num_input_tokens_seen": 27456128, "step": 13140 }, { "epoch": 2.1443837180846725, "grad_norm": 13.625, "learning_rate": 3.782055552890784e-05, "loss": 2.4087, "num_input_tokens_seen": 27466640, "step": 13145 }, { "epoch": 2.1451994453054897, "grad_norm": 15.3125, "learning_rate": 3.781212628731534e-05, "loss": 2.5935, "num_input_tokens_seen": 27476384, "step": 13150 }, { "epoch": 2.1460151725263072, "grad_norm": 15.0, "learning_rate": 3.7803695069931946e-05, "loss": 2.7839, "num_input_tokens_seen": 27487184, "step": 13155 }, { "epoch": 2.1468308997471244, "grad_norm": 6.0625, "learning_rate": 3.779526187805789e-05, "loss": 3.7321, "num_input_tokens_seen": 27497152, "step": 13160 }, { "epoch": 2.147646626967942, "grad_norm": 6.625, "learning_rate": 3.778682671299364e-05, "loss": 2.1731, "num_input_tokens_seen": 27507504, "step": 13165 }, { "epoch": 2.148462354188759, "grad_norm": 9.9375, "learning_rate": 3.777838957604003e-05, "loss": 2.1296, "num_input_tokens_seen": 27517008, "step": 13170 }, { "epoch": 2.1492780814095767, "grad_norm": 13.4375, "learning_rate": 3.776995046849816e-05, "loss": 2.4311, "num_input_tokens_seen": 27527184, "step": 13175 }, { "epoch": 2.150093808630394, "grad_norm": 12.375, "learning_rate": 3.776150939166945e-05, "loss": 1.663, "num_input_tokens_seen": 27537632, "step": 13180 }, { "epoch": 2.1509095358512114, "grad_norm": 7.40625, "learning_rate": 3.775306634685562e-05, "loss": 3.7676, "num_input_tokens_seen": 27550032, "step": 13185 }, { "epoch": 2.151725263072029, "grad_norm": 10.6875, "learning_rate": 3.7744621335358696e-05, "loss": 2.4983, "num_input_tokens_seen": 27560336, "step": 13190 }, { "epoch": 2.152540990292846, "grad_norm": 6.96875, "learning_rate": 3.7736174358481e-05, "loss": 2.2404, "num_input_tokens_seen": 27571200, "step": 13195 }, { "epoch": 2.153356717513663, "grad_norm": 3.09375, "learning_rate": 3.7727725417525175e-05, "loss": 0.9243, "num_input_tokens_seen": 27581264, "step": 13200 }, { "epoch": 2.153356717513663, "eval_loss": 2.5427470207214355, "eval_runtime": 134.7467, "eval_samples_per_second": 20.223, "eval_steps_per_second": 10.115, "num_input_tokens_seen": 27581264, "step": 13200 }, { "epoch": 2.1541724447344808, "grad_norm": 6.6875, "learning_rate": 3.771927451379414e-05, "loss": 2.3377, "num_input_tokens_seen": 27593168, "step": 13205 }, { "epoch": 2.1549881719552983, "grad_norm": 5.875, "learning_rate": 3.7710821648591135e-05, "loss": 2.5799, "num_input_tokens_seen": 27602352, "step": 13210 }, { "epoch": 2.1558038991761155, "grad_norm": 3.953125, "learning_rate": 3.7702366823219694e-05, "loss": 1.9942, "num_input_tokens_seen": 27612032, "step": 13215 }, { "epoch": 2.156619626396933, "grad_norm": 8.4375, "learning_rate": 3.769391003898366e-05, "loss": 1.3627, "num_input_tokens_seen": 27621792, "step": 13220 }, { "epoch": 2.15743535361775, "grad_norm": 6.6875, "learning_rate": 3.768545129718718e-05, "loss": 2.4639, "num_input_tokens_seen": 27633856, "step": 13225 }, { "epoch": 2.1582510808385678, "grad_norm": 6.6875, "learning_rate": 3.7676990599134686e-05, "loss": 2.2643, "num_input_tokens_seen": 27644656, "step": 13230 }, { "epoch": 2.159066808059385, "grad_norm": 9.4375, "learning_rate": 3.766852794613095e-05, "loss": 3.2836, "num_input_tokens_seen": 27654736, "step": 13235 }, { "epoch": 2.1598825352802025, "grad_norm": 4.09375, "learning_rate": 3.766006333948099e-05, "loss": 2.2872, "num_input_tokens_seen": 27664336, "step": 13240 }, { "epoch": 2.1606982625010196, "grad_norm": 10.875, "learning_rate": 3.765159678049017e-05, "loss": 2.0381, "num_input_tokens_seen": 27675728, "step": 13245 }, { "epoch": 2.161513989721837, "grad_norm": 9.125, "learning_rate": 3.7643128270464134e-05, "loss": 3.165, "num_input_tokens_seen": 27687088, "step": 13250 }, { "epoch": 2.1623297169426543, "grad_norm": 5.9375, "learning_rate": 3.763465781070884e-05, "loss": 1.7316, "num_input_tokens_seen": 27697088, "step": 13255 }, { "epoch": 2.163145444163472, "grad_norm": 9.25, "learning_rate": 3.762618540253052e-05, "loss": 2.5775, "num_input_tokens_seen": 27707600, "step": 13260 }, { "epoch": 2.163961171384289, "grad_norm": 7.21875, "learning_rate": 3.761771104723576e-05, "loss": 1.5108, "num_input_tokens_seen": 27717360, "step": 13265 }, { "epoch": 2.1647768986051066, "grad_norm": 3.46875, "learning_rate": 3.7609234746131386e-05, "loss": 2.9984, "num_input_tokens_seen": 27727952, "step": 13270 }, { "epoch": 2.1655926258259237, "grad_norm": 6.8125, "learning_rate": 3.7600756500524556e-05, "loss": 2.8128, "num_input_tokens_seen": 27737792, "step": 13275 }, { "epoch": 2.1664083530467413, "grad_norm": 8.1875, "learning_rate": 3.759227631172271e-05, "loss": 2.0977, "num_input_tokens_seen": 27747280, "step": 13280 }, { "epoch": 2.1672240802675584, "grad_norm": 7.28125, "learning_rate": 3.758379418103363e-05, "loss": 3.0331, "num_input_tokens_seen": 27757504, "step": 13285 }, { "epoch": 2.168039807488376, "grad_norm": 16.0, "learning_rate": 3.757531010976534e-05, "loss": 3.1703, "num_input_tokens_seen": 27767552, "step": 13290 }, { "epoch": 2.168855534709193, "grad_norm": 11.9375, "learning_rate": 3.75668240992262e-05, "loss": 3.727, "num_input_tokens_seen": 27778560, "step": 13295 }, { "epoch": 2.1696712619300107, "grad_norm": 3.453125, "learning_rate": 3.7558336150724865e-05, "loss": 1.8895, "num_input_tokens_seen": 27789344, "step": 13300 }, { "epoch": 2.170486989150828, "grad_norm": 9.5625, "learning_rate": 3.754984626557028e-05, "loss": 3.3177, "num_input_tokens_seen": 27799824, "step": 13305 }, { "epoch": 2.1713027163716454, "grad_norm": 2.703125, "learning_rate": 3.754135444507168e-05, "loss": 1.1676, "num_input_tokens_seen": 27810992, "step": 13310 }, { "epoch": 2.1721184435924625, "grad_norm": 7.3125, "learning_rate": 3.753286069053863e-05, "loss": 1.347, "num_input_tokens_seen": 27821392, "step": 13315 }, { "epoch": 2.17293417081328, "grad_norm": 6.5, "learning_rate": 3.7524365003280945e-05, "loss": 2.4593, "num_input_tokens_seen": 27831264, "step": 13320 }, { "epoch": 2.1737498980340972, "grad_norm": 10.0, "learning_rate": 3.75158673846088e-05, "loss": 2.4446, "num_input_tokens_seen": 27839360, "step": 13325 }, { "epoch": 2.174565625254915, "grad_norm": 3.453125, "learning_rate": 3.750736783583262e-05, "loss": 1.6798, "num_input_tokens_seen": 27848448, "step": 13330 }, { "epoch": 2.175381352475732, "grad_norm": 7.5, "learning_rate": 3.7498866358263144e-05, "loss": 2.3483, "num_input_tokens_seen": 27859440, "step": 13335 }, { "epoch": 2.1761970796965495, "grad_norm": 10.125, "learning_rate": 3.74903629532114e-05, "loss": 2.3944, "num_input_tokens_seen": 27869952, "step": 13340 }, { "epoch": 2.1770128069173666, "grad_norm": 4.0, "learning_rate": 3.748185762198873e-05, "loss": 1.3601, "num_input_tokens_seen": 27879376, "step": 13345 }, { "epoch": 2.1778285341381842, "grad_norm": 10.875, "learning_rate": 3.747335036590676e-05, "loss": 2.4343, "num_input_tokens_seen": 27888912, "step": 13350 }, { "epoch": 2.1786442613590014, "grad_norm": 6.71875, "learning_rate": 3.7464841186277405e-05, "loss": 1.9506, "num_input_tokens_seen": 27898512, "step": 13355 }, { "epoch": 2.179459988579819, "grad_norm": 9.625, "learning_rate": 3.7456330084412896e-05, "loss": 3.0834, "num_input_tokens_seen": 27908352, "step": 13360 }, { "epoch": 2.180275715800636, "grad_norm": 4.25, "learning_rate": 3.744781706162576e-05, "loss": 3.2379, "num_input_tokens_seen": 27917456, "step": 13365 }, { "epoch": 2.1810914430214536, "grad_norm": 9.375, "learning_rate": 3.743930211922879e-05, "loss": 2.627, "num_input_tokens_seen": 27929024, "step": 13370 }, { "epoch": 2.1819071702422708, "grad_norm": 8.75, "learning_rate": 3.743078525853513e-05, "loss": 2.2167, "num_input_tokens_seen": 27940192, "step": 13375 }, { "epoch": 2.1827228974630883, "grad_norm": 4.90625, "learning_rate": 3.7422266480858154e-05, "loss": 2.6997, "num_input_tokens_seen": 27949648, "step": 13380 }, { "epoch": 2.183538624683906, "grad_norm": 6.59375, "learning_rate": 3.741374578751158e-05, "loss": 1.8599, "num_input_tokens_seen": 27959968, "step": 13385 }, { "epoch": 2.184354351904723, "grad_norm": 4.40625, "learning_rate": 3.740522317980941e-05, "loss": 1.7879, "num_input_tokens_seen": 27968144, "step": 13390 }, { "epoch": 2.1851700791255406, "grad_norm": 12.4375, "learning_rate": 3.739669865906593e-05, "loss": 3.2361, "num_input_tokens_seen": 27978992, "step": 13395 }, { "epoch": 2.1859858063463578, "grad_norm": 8.25, "learning_rate": 3.738817222659573e-05, "loss": 1.8392, "num_input_tokens_seen": 27988880, "step": 13400 }, { "epoch": 2.1859858063463578, "eval_loss": 2.5632946491241455, "eval_runtime": 134.7942, "eval_samples_per_second": 20.216, "eval_steps_per_second": 10.112, "num_input_tokens_seen": 27988880, "step": 13400 }, { "epoch": 2.1868015335671753, "grad_norm": 11.5, "learning_rate": 3.73796438837137e-05, "loss": 2.3795, "num_input_tokens_seen": 27999520, "step": 13405 }, { "epoch": 2.1876172607879925, "grad_norm": 2.390625, "learning_rate": 3.7371113631735e-05, "loss": 1.8136, "num_input_tokens_seen": 28010640, "step": 13410 }, { "epoch": 2.18843298800881, "grad_norm": 4.65625, "learning_rate": 3.736258147197512e-05, "loss": 2.3365, "num_input_tokens_seen": 28021504, "step": 13415 }, { "epoch": 2.189248715229627, "grad_norm": 2.109375, "learning_rate": 3.735404740574981e-05, "loss": 2.0843, "num_input_tokens_seen": 28030432, "step": 13420 }, { "epoch": 2.1900644424504447, "grad_norm": 4.0, "learning_rate": 3.7345511434375145e-05, "loss": 1.6312, "num_input_tokens_seen": 28040752, "step": 13425 }, { "epoch": 2.190880169671262, "grad_norm": 7.53125, "learning_rate": 3.733697355916748e-05, "loss": 2.3655, "num_input_tokens_seen": 28050464, "step": 13430 }, { "epoch": 2.1916958968920794, "grad_norm": 10.625, "learning_rate": 3.732843378144345e-05, "loss": 2.2273, "num_input_tokens_seen": 28060048, "step": 13435 }, { "epoch": 2.1925116241128966, "grad_norm": 6.8125, "learning_rate": 3.7319892102519995e-05, "loss": 3.9828, "num_input_tokens_seen": 28069696, "step": 13440 }, { "epoch": 2.193327351333714, "grad_norm": 4.75, "learning_rate": 3.731134852371436e-05, "loss": 2.2042, "num_input_tokens_seen": 28080464, "step": 13445 }, { "epoch": 2.1941430785545313, "grad_norm": 11.8125, "learning_rate": 3.730280304634408e-05, "loss": 2.9954, "num_input_tokens_seen": 28091296, "step": 13450 }, { "epoch": 2.194958805775349, "grad_norm": 5.8125, "learning_rate": 3.729425567172696e-05, "loss": 2.8598, "num_input_tokens_seen": 28103232, "step": 13455 }, { "epoch": 2.195774532996166, "grad_norm": 9.0, "learning_rate": 3.728570640118111e-05, "loss": 2.5514, "num_input_tokens_seen": 28114000, "step": 13460 }, { "epoch": 2.1965902602169836, "grad_norm": 6.0, "learning_rate": 3.727715523602494e-05, "loss": 1.6747, "num_input_tokens_seen": 28123856, "step": 13465 }, { "epoch": 2.1974059874378007, "grad_norm": 5.9375, "learning_rate": 3.726860217757715e-05, "loss": 1.7441, "num_input_tokens_seen": 28135040, "step": 13470 }, { "epoch": 2.1982217146586183, "grad_norm": 6.84375, "learning_rate": 3.726004722715673e-05, "loss": 3.6209, "num_input_tokens_seen": 28143808, "step": 13475 }, { "epoch": 2.1990374418794354, "grad_norm": 9.125, "learning_rate": 3.725149038608296e-05, "loss": 2.721, "num_input_tokens_seen": 28154528, "step": 13480 }, { "epoch": 2.199853169100253, "grad_norm": 11.0, "learning_rate": 3.7242931655675404e-05, "loss": 3.1101, "num_input_tokens_seen": 28164800, "step": 13485 }, { "epoch": 2.20066889632107, "grad_norm": 7.6875, "learning_rate": 3.7234371037253937e-05, "loss": 2.8425, "num_input_tokens_seen": 28174208, "step": 13490 }, { "epoch": 2.2014846235418877, "grad_norm": 17.375, "learning_rate": 3.7225808532138705e-05, "loss": 3.5908, "num_input_tokens_seen": 28183648, "step": 13495 }, { "epoch": 2.202300350762705, "grad_norm": 6.5625, "learning_rate": 3.721724414165016e-05, "loss": 2.6048, "num_input_tokens_seen": 28194656, "step": 13500 }, { "epoch": 2.2031160779835224, "grad_norm": 9.25, "learning_rate": 3.720867786710904e-05, "loss": 2.39, "num_input_tokens_seen": 28204624, "step": 13505 }, { "epoch": 2.2039318052043395, "grad_norm": 4.46875, "learning_rate": 3.7200109709836366e-05, "loss": 2.1693, "num_input_tokens_seen": 28214304, "step": 13510 }, { "epoch": 2.204747532425157, "grad_norm": 2.09375, "learning_rate": 3.7191539671153465e-05, "loss": 2.0736, "num_input_tokens_seen": 28225728, "step": 13515 }, { "epoch": 2.205563259645974, "grad_norm": 9.625, "learning_rate": 3.718296775238193e-05, "loss": 3.4614, "num_input_tokens_seen": 28235520, "step": 13520 }, { "epoch": 2.206378986866792, "grad_norm": 7.0625, "learning_rate": 3.7174393954843675e-05, "loss": 2.2725, "num_input_tokens_seen": 28245232, "step": 13525 }, { "epoch": 2.207194714087609, "grad_norm": 7.78125, "learning_rate": 3.716581827986087e-05, "loss": 1.7475, "num_input_tokens_seen": 28255552, "step": 13530 }, { "epoch": 2.2080104413084265, "grad_norm": 2.5625, "learning_rate": 3.7157240728756004e-05, "loss": 2.3929, "num_input_tokens_seen": 28265104, "step": 13535 }, { "epoch": 2.2088261685292436, "grad_norm": 10.6875, "learning_rate": 3.714866130285184e-05, "loss": 3.1104, "num_input_tokens_seen": 28275552, "step": 13540 }, { "epoch": 2.209641895750061, "grad_norm": 2.046875, "learning_rate": 3.714008000347143e-05, "loss": 1.5486, "num_input_tokens_seen": 28285296, "step": 13545 }, { "epoch": 2.2104576229708783, "grad_norm": 10.0625, "learning_rate": 3.7131496831938126e-05, "loss": 3.3411, "num_input_tokens_seen": 28295872, "step": 13550 }, { "epoch": 2.211273350191696, "grad_norm": 2.984375, "learning_rate": 3.7122911789575565e-05, "loss": 1.4784, "num_input_tokens_seen": 28304816, "step": 13555 }, { "epoch": 2.2120890774125135, "grad_norm": 4.4375, "learning_rate": 3.711432487770765e-05, "loss": 2.0286, "num_input_tokens_seen": 28315664, "step": 13560 }, { "epoch": 2.2129048046333306, "grad_norm": 6.34375, "learning_rate": 3.710573609765861e-05, "loss": 2.3015, "num_input_tokens_seen": 28326160, "step": 13565 }, { "epoch": 2.2137205318541477, "grad_norm": 7.15625, "learning_rate": 3.709714545075292e-05, "loss": 3.069, "num_input_tokens_seen": 28335872, "step": 13570 }, { "epoch": 2.2145362590749653, "grad_norm": 8.375, "learning_rate": 3.708855293831538e-05, "loss": 1.668, "num_input_tokens_seen": 28346192, "step": 13575 }, { "epoch": 2.215351986295783, "grad_norm": 2.59375, "learning_rate": 3.707995856167107e-05, "loss": 2.2027, "num_input_tokens_seen": 28355680, "step": 13580 }, { "epoch": 2.2161677135166, "grad_norm": 6.21875, "learning_rate": 3.707136232214534e-05, "loss": 2.1956, "num_input_tokens_seen": 28366080, "step": 13585 }, { "epoch": 2.2169834407374176, "grad_norm": 6.6875, "learning_rate": 3.7062764221063844e-05, "loss": 1.955, "num_input_tokens_seen": 28376176, "step": 13590 }, { "epoch": 2.2177991679582347, "grad_norm": 4.5625, "learning_rate": 3.705416425975252e-05, "loss": 2.4168, "num_input_tokens_seen": 28386832, "step": 13595 }, { "epoch": 2.2186148951790523, "grad_norm": 2.078125, "learning_rate": 3.704556243953758e-05, "loss": 2.2886, "num_input_tokens_seen": 28397472, "step": 13600 }, { "epoch": 2.2186148951790523, "eval_loss": 2.5419933795928955, "eval_runtime": 134.9612, "eval_samples_per_second": 20.191, "eval_steps_per_second": 10.099, "num_input_tokens_seen": 28397472, "step": 13600 }, { "epoch": 2.2194306223998694, "grad_norm": 6.625, "learning_rate": 3.7036958761745535e-05, "loss": 1.9117, "num_input_tokens_seen": 28406608, "step": 13605 }, { "epoch": 2.220246349620687, "grad_norm": 3.46875, "learning_rate": 3.702835322770318e-05, "loss": 1.927, "num_input_tokens_seen": 28416480, "step": 13610 }, { "epoch": 2.221062076841504, "grad_norm": 11.5625, "learning_rate": 3.701974583873761e-05, "loss": 3.7652, "num_input_tokens_seen": 28426144, "step": 13615 }, { "epoch": 2.2218778040623217, "grad_norm": 10.5625, "learning_rate": 3.701113659617618e-05, "loss": 2.1548, "num_input_tokens_seen": 28438080, "step": 13620 }, { "epoch": 2.222693531283139, "grad_norm": 11.6875, "learning_rate": 3.7002525501346535e-05, "loss": 2.6727, "num_input_tokens_seen": 28447968, "step": 13625 }, { "epoch": 2.2235092585039564, "grad_norm": 5.9375, "learning_rate": 3.699391255557664e-05, "loss": 2.3659, "num_input_tokens_seen": 28458752, "step": 13630 }, { "epoch": 2.2243249857247736, "grad_norm": 3.484375, "learning_rate": 3.69852977601947e-05, "loss": 1.9306, "num_input_tokens_seen": 28468912, "step": 13635 }, { "epoch": 2.225140712945591, "grad_norm": 3.4375, "learning_rate": 3.697668111652922e-05, "loss": 2.6419, "num_input_tokens_seen": 28478864, "step": 13640 }, { "epoch": 2.2259564401664083, "grad_norm": 8.375, "learning_rate": 3.6968062625909005e-05, "loss": 1.6479, "num_input_tokens_seen": 28489504, "step": 13645 }, { "epoch": 2.226772167387226, "grad_norm": 5.96875, "learning_rate": 3.6959442289663135e-05, "loss": 2.1652, "num_input_tokens_seen": 28500112, "step": 13650 }, { "epoch": 2.227587894608043, "grad_norm": 3.65625, "learning_rate": 3.695082010912098e-05, "loss": 1.927, "num_input_tokens_seen": 28511216, "step": 13655 }, { "epoch": 2.2284036218288605, "grad_norm": 8.5, "learning_rate": 3.694219608561217e-05, "loss": 3.0202, "num_input_tokens_seen": 28521776, "step": 13660 }, { "epoch": 2.2292193490496777, "grad_norm": 5.25, "learning_rate": 3.693357022046665e-05, "loss": 2.5021, "num_input_tokens_seen": 28531920, "step": 13665 }, { "epoch": 2.2300350762704952, "grad_norm": 11.625, "learning_rate": 3.6924942515014644e-05, "loss": 3.6784, "num_input_tokens_seen": 28543856, "step": 13670 }, { "epoch": 2.2308508034913124, "grad_norm": 5.09375, "learning_rate": 3.691631297058664e-05, "loss": 2.7551, "num_input_tokens_seen": 28554288, "step": 13675 }, { "epoch": 2.23166653071213, "grad_norm": 8.375, "learning_rate": 3.6907681588513424e-05, "loss": 2.895, "num_input_tokens_seen": 28564352, "step": 13680 }, { "epoch": 2.232482257932947, "grad_norm": 6.78125, "learning_rate": 3.689904837012606e-05, "loss": 2.7676, "num_input_tokens_seen": 28575712, "step": 13685 }, { "epoch": 2.2332979851537647, "grad_norm": 8.625, "learning_rate": 3.689041331675591e-05, "loss": 2.0033, "num_input_tokens_seen": 28587104, "step": 13690 }, { "epoch": 2.234113712374582, "grad_norm": 7.4375, "learning_rate": 3.688177642973461e-05, "loss": 2.4829, "num_input_tokens_seen": 28597840, "step": 13695 }, { "epoch": 2.2349294395953994, "grad_norm": 8.875, "learning_rate": 3.687313771039406e-05, "loss": 2.3574, "num_input_tokens_seen": 28608656, "step": 13700 }, { "epoch": 2.2357451668162165, "grad_norm": 7.28125, "learning_rate": 3.686449716006647e-05, "loss": 2.884, "num_input_tokens_seen": 28618576, "step": 13705 }, { "epoch": 2.236560894037034, "grad_norm": 7.875, "learning_rate": 3.685585478008432e-05, "loss": 2.5909, "num_input_tokens_seen": 28629008, "step": 13710 }, { "epoch": 2.237376621257851, "grad_norm": 17.375, "learning_rate": 3.6847210571780364e-05, "loss": 2.6273, "num_input_tokens_seen": 28639072, "step": 13715 }, { "epoch": 2.2381923484786688, "grad_norm": 6.9375, "learning_rate": 3.683856453648767e-05, "loss": 2.9438, "num_input_tokens_seen": 28648304, "step": 13720 }, { "epoch": 2.239008075699486, "grad_norm": 3.796875, "learning_rate": 3.682991667553954e-05, "loss": 2.1269, "num_input_tokens_seen": 28659168, "step": 13725 }, { "epoch": 2.2398238029203035, "grad_norm": 9.8125, "learning_rate": 3.6821266990269606e-05, "loss": 3.2685, "num_input_tokens_seen": 28668560, "step": 13730 }, { "epoch": 2.2406395301411206, "grad_norm": 10.3125, "learning_rate": 3.681261548201174e-05, "loss": 3.126, "num_input_tokens_seen": 28679888, "step": 13735 }, { "epoch": 2.241455257361938, "grad_norm": 10.0625, "learning_rate": 3.6803962152100125e-05, "loss": 2.3718, "num_input_tokens_seen": 28688992, "step": 13740 }, { "epoch": 2.2422709845827553, "grad_norm": 9.375, "learning_rate": 3.67953070018692e-05, "loss": 2.9371, "num_input_tokens_seen": 28700704, "step": 13745 }, { "epoch": 2.243086711803573, "grad_norm": 7.46875, "learning_rate": 3.678665003265371e-05, "loss": 1.72, "num_input_tokens_seen": 28711392, "step": 13750 }, { "epoch": 2.2439024390243905, "grad_norm": 10.6875, "learning_rate": 3.677799124578867e-05, "loss": 2.8431, "num_input_tokens_seen": 28721920, "step": 13755 }, { "epoch": 2.2447181662452076, "grad_norm": 10.25, "learning_rate": 3.676933064260937e-05, "loss": 1.663, "num_input_tokens_seen": 28733392, "step": 13760 }, { "epoch": 2.245533893466025, "grad_norm": 0.21484375, "learning_rate": 3.6760668224451365e-05, "loss": 1.6227, "num_input_tokens_seen": 28743088, "step": 13765 }, { "epoch": 2.2463496206868423, "grad_norm": 10.5, "learning_rate": 3.675200399265054e-05, "loss": 2.1069, "num_input_tokens_seen": 28752656, "step": 13770 }, { "epoch": 2.24716534790766, "grad_norm": 9.0, "learning_rate": 3.6743337948543014e-05, "loss": 2.1198, "num_input_tokens_seen": 28763440, "step": 13775 }, { "epoch": 2.247981075128477, "grad_norm": 2.328125, "learning_rate": 3.6734670093465204e-05, "loss": 1.2295, "num_input_tokens_seen": 28774128, "step": 13780 }, { "epoch": 2.2487968023492946, "grad_norm": 5.5, "learning_rate": 3.672600042875379e-05, "loss": 2.0569, "num_input_tokens_seen": 28782768, "step": 13785 }, { "epoch": 2.2496125295701117, "grad_norm": 6.84375, "learning_rate": 3.671732895574575e-05, "loss": 2.03, "num_input_tokens_seen": 28792224, "step": 13790 }, { "epoch": 2.2504282567909293, "grad_norm": 7.03125, "learning_rate": 3.670865567577834e-05, "loss": 2.5284, "num_input_tokens_seen": 28801664, "step": 13795 }, { "epoch": 2.2512439840117464, "grad_norm": 5.21875, "learning_rate": 3.669998059018909e-05, "loss": 3.2633, "num_input_tokens_seen": 28812800, "step": 13800 }, { "epoch": 2.2512439840117464, "eval_loss": 2.545050859451294, "eval_runtime": 134.9252, "eval_samples_per_second": 20.196, "eval_steps_per_second": 10.102, "num_input_tokens_seen": 28812800, "step": 13800 }, { "epoch": 2.252059711232564, "grad_norm": 1.6953125, "learning_rate": 3.6691303700315796e-05, "loss": 1.9925, "num_input_tokens_seen": 28822224, "step": 13805 }, { "epoch": 2.252875438453381, "grad_norm": 0.11962890625, "learning_rate": 3.668262500749655e-05, "loss": 2.4546, "num_input_tokens_seen": 28831536, "step": 13810 }, { "epoch": 2.2536911656741987, "grad_norm": 0.640625, "learning_rate": 3.667394451306971e-05, "loss": 2.2364, "num_input_tokens_seen": 28841728, "step": 13815 }, { "epoch": 2.254506892895016, "grad_norm": 11.375, "learning_rate": 3.666526221837393e-05, "loss": 3.1414, "num_input_tokens_seen": 28852208, "step": 13820 }, { "epoch": 2.2553226201158334, "grad_norm": 11.4375, "learning_rate": 3.665657812474812e-05, "loss": 2.4102, "num_input_tokens_seen": 28862912, "step": 13825 }, { "epoch": 2.2561383473366505, "grad_norm": 8.6875, "learning_rate": 3.664789223353147e-05, "loss": 4.0906, "num_input_tokens_seen": 28872560, "step": 13830 }, { "epoch": 2.256954074557468, "grad_norm": 7.6875, "learning_rate": 3.663920454606347e-05, "loss": 4.0156, "num_input_tokens_seen": 28884400, "step": 13835 }, { "epoch": 2.2577698017782852, "grad_norm": 4.1875, "learning_rate": 3.6630515063683856e-05, "loss": 3.4722, "num_input_tokens_seen": 28892976, "step": 13840 }, { "epoch": 2.258585528999103, "grad_norm": 4.59375, "learning_rate": 3.662182378773267e-05, "loss": 2.3658, "num_input_tokens_seen": 28904016, "step": 13845 }, { "epoch": 2.25940125621992, "grad_norm": 0.2158203125, "learning_rate": 3.66131307195502e-05, "loss": 2.4775, "num_input_tokens_seen": 28914288, "step": 13850 }, { "epoch": 2.2602169834407375, "grad_norm": 4.8125, "learning_rate": 3.6604435860477034e-05, "loss": 1.985, "num_input_tokens_seen": 28923968, "step": 13855 }, { "epoch": 2.2610327106615546, "grad_norm": 4.75, "learning_rate": 3.6595739211854025e-05, "loss": 2.2976, "num_input_tokens_seen": 28936352, "step": 13860 }, { "epoch": 2.261848437882372, "grad_norm": 6.5, "learning_rate": 3.658704077502231e-05, "loss": 1.6449, "num_input_tokens_seen": 28945232, "step": 13865 }, { "epoch": 2.2626641651031894, "grad_norm": 7.09375, "learning_rate": 3.65783405513233e-05, "loss": 1.9578, "num_input_tokens_seen": 28954992, "step": 13870 }, { "epoch": 2.263479892324007, "grad_norm": 10.625, "learning_rate": 3.656963854209867e-05, "loss": 3.1532, "num_input_tokens_seen": 28965136, "step": 13875 }, { "epoch": 2.264295619544824, "grad_norm": 7.3125, "learning_rate": 3.656093474869038e-05, "loss": 2.488, "num_input_tokens_seen": 28975520, "step": 13880 }, { "epoch": 2.2651113467656416, "grad_norm": 1.8359375, "learning_rate": 3.655222917244068e-05, "loss": 1.8568, "num_input_tokens_seen": 28985424, "step": 13885 }, { "epoch": 2.2659270739864588, "grad_norm": 7.0625, "learning_rate": 3.6543521814692054e-05, "loss": 1.8749, "num_input_tokens_seen": 28995040, "step": 13890 }, { "epoch": 2.2667428012072763, "grad_norm": 4.0625, "learning_rate": 3.653481267678731e-05, "loss": 1.5914, "num_input_tokens_seen": 29005008, "step": 13895 }, { "epoch": 2.2675585284280935, "grad_norm": 7.3125, "learning_rate": 3.652610176006949e-05, "loss": 1.7404, "num_input_tokens_seen": 29014944, "step": 13900 }, { "epoch": 2.268374255648911, "grad_norm": 7.90625, "learning_rate": 3.6517389065881925e-05, "loss": 2.9863, "num_input_tokens_seen": 29024880, "step": 13905 }, { "epoch": 2.269189982869728, "grad_norm": 8.4375, "learning_rate": 3.650867459556824e-05, "loss": 1.5652, "num_input_tokens_seen": 29034784, "step": 13910 }, { "epoch": 2.2700057100905457, "grad_norm": 5.0, "learning_rate": 3.64999583504723e-05, "loss": 2.2973, "num_input_tokens_seen": 29043744, "step": 13915 }, { "epoch": 2.270821437311363, "grad_norm": 16.125, "learning_rate": 3.649124033193827e-05, "loss": 1.6257, "num_input_tokens_seen": 29054128, "step": 13920 }, { "epoch": 2.2716371645321805, "grad_norm": 0.095703125, "learning_rate": 3.648252054131057e-05, "loss": 2.1929, "num_input_tokens_seen": 29063200, "step": 13925 }, { "epoch": 2.272452891752998, "grad_norm": 4.34375, "learning_rate": 3.647379897993391e-05, "loss": 0.9812, "num_input_tokens_seen": 29073488, "step": 13930 }, { "epoch": 2.273268618973815, "grad_norm": 2.09375, "learning_rate": 3.646507564915325e-05, "loss": 1.989, "num_input_tokens_seen": 29082896, "step": 13935 }, { "epoch": 2.2740843461946323, "grad_norm": 6.96875, "learning_rate": 3.645635055031385e-05, "loss": 2.7019, "num_input_tokens_seen": 29093968, "step": 13940 }, { "epoch": 2.27490007341545, "grad_norm": 10.1875, "learning_rate": 3.6447623684761224e-05, "loss": 3.7933, "num_input_tokens_seen": 29103984, "step": 13945 }, { "epoch": 2.2757158006362674, "grad_norm": 8.125, "learning_rate": 3.643889505384117e-05, "loss": 2.7107, "num_input_tokens_seen": 29115280, "step": 13950 }, { "epoch": 2.2765315278570846, "grad_norm": 7.875, "learning_rate": 3.6430164658899744e-05, "loss": 2.807, "num_input_tokens_seen": 29126464, "step": 13955 }, { "epoch": 2.2773472550779017, "grad_norm": 7.5, "learning_rate": 3.642143250128329e-05, "loss": 2.0492, "num_input_tokens_seen": 29137424, "step": 13960 }, { "epoch": 2.2781629822987193, "grad_norm": 2.28125, "learning_rate": 3.641269858233841e-05, "loss": 1.1254, "num_input_tokens_seen": 29148448, "step": 13965 }, { "epoch": 2.278978709519537, "grad_norm": 9.5, "learning_rate": 3.640396290341199e-05, "loss": 2.2028, "num_input_tokens_seen": 29159104, "step": 13970 }, { "epoch": 2.279794436740354, "grad_norm": 9.3125, "learning_rate": 3.639522546585118e-05, "loss": 2.7009, "num_input_tokens_seen": 29169232, "step": 13975 }, { "epoch": 2.2806101639611716, "grad_norm": 9.5, "learning_rate": 3.6386486271003404e-05, "loss": 2.7676, "num_input_tokens_seen": 29179232, "step": 13980 }, { "epoch": 2.2814258911819887, "grad_norm": 11.0, "learning_rate": 3.6377745320216346e-05, "loss": 3.3726, "num_input_tokens_seen": 29189408, "step": 13985 }, { "epoch": 2.2822416184028063, "grad_norm": 7.75, "learning_rate": 3.636900261483798e-05, "loss": 1.8711, "num_input_tokens_seen": 29200160, "step": 13990 }, { "epoch": 2.2830573456236234, "grad_norm": 7.5, "learning_rate": 3.636025815621654e-05, "loss": 2.4261, "num_input_tokens_seen": 29211872, "step": 13995 }, { "epoch": 2.283873072844441, "grad_norm": 8.9375, "learning_rate": 3.635151194570054e-05, "loss": 2.5096, "num_input_tokens_seen": 29222656, "step": 14000 }, { "epoch": 2.283873072844441, "eval_loss": 2.546114921569824, "eval_runtime": 135.0346, "eval_samples_per_second": 20.18, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 29222656, "step": 14000 }, { "epoch": 2.284688800065258, "grad_norm": 8.625, "learning_rate": 3.634276398463873e-05, "loss": 2.4827, "num_input_tokens_seen": 29233840, "step": 14005 }, { "epoch": 2.2855045272860757, "grad_norm": 6.21875, "learning_rate": 3.633401427438018e-05, "loss": 2.0139, "num_input_tokens_seen": 29244288, "step": 14010 }, { "epoch": 2.286320254506893, "grad_norm": 7.40625, "learning_rate": 3.63252628162742e-05, "loss": 2.8611, "num_input_tokens_seen": 29253968, "step": 14015 }, { "epoch": 2.2871359817277104, "grad_norm": 3.265625, "learning_rate": 3.6316509611670364e-05, "loss": 2.3223, "num_input_tokens_seen": 29265168, "step": 14020 }, { "epoch": 2.2879517089485275, "grad_norm": 5.96875, "learning_rate": 3.630775466191854e-05, "loss": 3.226, "num_input_tokens_seen": 29274496, "step": 14025 }, { "epoch": 2.288767436169345, "grad_norm": 8.0625, "learning_rate": 3.629899796836884e-05, "loss": 3.2147, "num_input_tokens_seen": 29284496, "step": 14030 }, { "epoch": 2.289583163390162, "grad_norm": 7.96875, "learning_rate": 3.6290239532371666e-05, "loss": 2.1345, "num_input_tokens_seen": 29295264, "step": 14035 }, { "epoch": 2.29039889061098, "grad_norm": 4.5625, "learning_rate": 3.628147935527767e-05, "loss": 1.7201, "num_input_tokens_seen": 29305280, "step": 14040 }, { "epoch": 2.291214617831797, "grad_norm": 7.375, "learning_rate": 3.627271743843779e-05, "loss": 1.9968, "num_input_tokens_seen": 29314640, "step": 14045 }, { "epoch": 2.2920303450526145, "grad_norm": 13.0625, "learning_rate": 3.626395378320321e-05, "loss": 3.2169, "num_input_tokens_seen": 29325504, "step": 14050 }, { "epoch": 2.2928460722734316, "grad_norm": 3.671875, "learning_rate": 3.625518839092541e-05, "loss": 1.8564, "num_input_tokens_seen": 29335632, "step": 14055 }, { "epoch": 2.293661799494249, "grad_norm": 7.6875, "learning_rate": 3.624642126295612e-05, "loss": 3.6122, "num_input_tokens_seen": 29346896, "step": 14060 }, { "epoch": 2.2944775267150663, "grad_norm": 3.609375, "learning_rate": 3.6237652400647345e-05, "loss": 1.6163, "num_input_tokens_seen": 29357008, "step": 14065 }, { "epoch": 2.295293253935884, "grad_norm": 8.1875, "learning_rate": 3.622888180535134e-05, "loss": 1.5268, "num_input_tokens_seen": 29366960, "step": 14070 }, { "epoch": 2.296108981156701, "grad_norm": 5.8125, "learning_rate": 3.6220109478420655e-05, "loss": 2.4028, "num_input_tokens_seen": 29377664, "step": 14075 }, { "epoch": 2.2969247083775186, "grad_norm": 4.0625, "learning_rate": 3.6211335421208084e-05, "loss": 2.1865, "num_input_tokens_seen": 29388528, "step": 14080 }, { "epoch": 2.2977404355983357, "grad_norm": 5.5, "learning_rate": 3.62025596350667e-05, "loss": 1.6842, "num_input_tokens_seen": 29398960, "step": 14085 }, { "epoch": 2.2985561628191533, "grad_norm": 10.375, "learning_rate": 3.619378212134984e-05, "loss": 3.0079, "num_input_tokens_seen": 29409936, "step": 14090 }, { "epoch": 2.2993718900399704, "grad_norm": 16.375, "learning_rate": 3.618500288141111e-05, "loss": 2.3269, "num_input_tokens_seen": 29421136, "step": 14095 }, { "epoch": 2.300187617260788, "grad_norm": 7.15625, "learning_rate": 3.617622191660438e-05, "loss": 3.0474, "num_input_tokens_seen": 29431168, "step": 14100 }, { "epoch": 2.3010033444816056, "grad_norm": 5.625, "learning_rate": 3.616743922828377e-05, "loss": 2.4032, "num_input_tokens_seen": 29441936, "step": 14105 }, { "epoch": 2.3018190717024227, "grad_norm": 4.125, "learning_rate": 3.615865481780371e-05, "loss": 2.403, "num_input_tokens_seen": 29451808, "step": 14110 }, { "epoch": 2.30263479892324, "grad_norm": 9.1875, "learning_rate": 3.614986868651883e-05, "loss": 2.0912, "num_input_tokens_seen": 29462464, "step": 14115 }, { "epoch": 2.3034505261440574, "grad_norm": 7.125, "learning_rate": 3.614108083578409e-05, "loss": 1.6459, "num_input_tokens_seen": 29473552, "step": 14120 }, { "epoch": 2.304266253364875, "grad_norm": 2.21875, "learning_rate": 3.613229126695467e-05, "loss": 2.6505, "num_input_tokens_seen": 29482896, "step": 14125 }, { "epoch": 2.305081980585692, "grad_norm": 3.734375, "learning_rate": 3.612349998138605e-05, "loss": 2.4459, "num_input_tokens_seen": 29492912, "step": 14130 }, { "epoch": 2.3058977078065093, "grad_norm": 7.15625, "learning_rate": 3.6114706980433946e-05, "loss": 3.7244, "num_input_tokens_seen": 29503536, "step": 14135 }, { "epoch": 2.306713435027327, "grad_norm": 4.40625, "learning_rate": 3.610591226545435e-05, "loss": 2.1316, "num_input_tokens_seen": 29513328, "step": 14140 }, { "epoch": 2.3075291622481444, "grad_norm": 6.125, "learning_rate": 3.6097115837803505e-05, "loss": 1.7471, "num_input_tokens_seen": 29523856, "step": 14145 }, { "epoch": 2.3083448894689615, "grad_norm": 12.5625, "learning_rate": 3.608831769883795e-05, "loss": 3.4881, "num_input_tokens_seen": 29535504, "step": 14150 }, { "epoch": 2.309160616689779, "grad_norm": 5.15625, "learning_rate": 3.607951784991446e-05, "loss": 2.2863, "num_input_tokens_seen": 29545376, "step": 14155 }, { "epoch": 2.3099763439105963, "grad_norm": 6.84375, "learning_rate": 3.6070716292390085e-05, "loss": 1.7744, "num_input_tokens_seen": 29554848, "step": 14160 }, { "epoch": 2.310792071131414, "grad_norm": 4.875, "learning_rate": 3.606191302762213e-05, "loss": 1.903, "num_input_tokens_seen": 29566080, "step": 14165 }, { "epoch": 2.311607798352231, "grad_norm": 4.84375, "learning_rate": 3.605310805696818e-05, "loss": 2.5293, "num_input_tokens_seen": 29575968, "step": 14170 }, { "epoch": 2.3124235255730485, "grad_norm": 12.375, "learning_rate": 3.6044301381786067e-05, "loss": 2.1236, "num_input_tokens_seen": 29586512, "step": 14175 }, { "epoch": 2.3132392527938657, "grad_norm": 6.8125, "learning_rate": 3.6035493003433883e-05, "loss": 3.2138, "num_input_tokens_seen": 29598816, "step": 14180 }, { "epoch": 2.3140549800146832, "grad_norm": 14.25, "learning_rate": 3.6026682923269994e-05, "loss": 3.3737, "num_input_tokens_seen": 29608768, "step": 14185 }, { "epoch": 2.3148707072355004, "grad_norm": 9.6875, "learning_rate": 3.6017871142653034e-05, "loss": 2.3755, "num_input_tokens_seen": 29620032, "step": 14190 }, { "epoch": 2.315686434456318, "grad_norm": 5.34375, "learning_rate": 3.600905766294189e-05, "loss": 1.867, "num_input_tokens_seen": 29631728, "step": 14195 }, { "epoch": 2.316502161677135, "grad_norm": 4.25, "learning_rate": 3.60002424854957e-05, "loss": 2.9558, "num_input_tokens_seen": 29642224, "step": 14200 }, { "epoch": 2.316502161677135, "eval_loss": 2.5656850337982178, "eval_runtime": 135.0129, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 29642224, "step": 14200 }, { "epoch": 2.3173178888979526, "grad_norm": 3.8125, "learning_rate": 3.5991425611673876e-05, "loss": 2.5722, "num_input_tokens_seen": 29653872, "step": 14205 }, { "epoch": 2.31813361611877, "grad_norm": 13.75, "learning_rate": 3.5982607042836105e-05, "loss": 3.2859, "num_input_tokens_seen": 29664560, "step": 14210 }, { "epoch": 2.3189493433395874, "grad_norm": 7.0, "learning_rate": 3.597378678034231e-05, "loss": 1.4872, "num_input_tokens_seen": 29676496, "step": 14215 }, { "epoch": 2.3197650705604045, "grad_norm": 10.4375, "learning_rate": 3.596496482555269e-05, "loss": 2.7543, "num_input_tokens_seen": 29687824, "step": 14220 }, { "epoch": 2.320580797781222, "grad_norm": 3.28125, "learning_rate": 3.595614117982769e-05, "loss": 1.1508, "num_input_tokens_seen": 29698720, "step": 14225 }, { "epoch": 2.321396525002039, "grad_norm": 9.3125, "learning_rate": 3.594731584452805e-05, "loss": 2.5484, "num_input_tokens_seen": 29708240, "step": 14230 }, { "epoch": 2.3222122522228568, "grad_norm": 6.15625, "learning_rate": 3.593848882101472e-05, "loss": 1.8081, "num_input_tokens_seen": 29718976, "step": 14235 }, { "epoch": 2.323027979443674, "grad_norm": 6.46875, "learning_rate": 3.592966011064896e-05, "loss": 2.3083, "num_input_tokens_seen": 29728784, "step": 14240 }, { "epoch": 2.3238437066644915, "grad_norm": 13.25, "learning_rate": 3.592082971479226e-05, "loss": 2.5457, "num_input_tokens_seen": 29738880, "step": 14245 }, { "epoch": 2.3246594338853086, "grad_norm": 5.0625, "learning_rate": 3.5911997634806385e-05, "loss": 4.0108, "num_input_tokens_seen": 29750480, "step": 14250 }, { "epoch": 2.325475161106126, "grad_norm": 11.625, "learning_rate": 3.5903163872053336e-05, "loss": 2.8508, "num_input_tokens_seen": 29761520, "step": 14255 }, { "epoch": 2.3262908883269433, "grad_norm": 1.90625, "learning_rate": 3.58943284278954e-05, "loss": 2.3385, "num_input_tokens_seen": 29771872, "step": 14260 }, { "epoch": 2.327106615547761, "grad_norm": 2.75, "learning_rate": 3.588549130369512e-05, "loss": 0.9379, "num_input_tokens_seen": 29779600, "step": 14265 }, { "epoch": 2.327922342768578, "grad_norm": 5.21875, "learning_rate": 3.5876652500815274e-05, "loss": 1.7726, "num_input_tokens_seen": 29790640, "step": 14270 }, { "epoch": 2.3287380699893956, "grad_norm": 4.5, "learning_rate": 3.586781202061894e-05, "loss": 3.997, "num_input_tokens_seen": 29800640, "step": 14275 }, { "epoch": 2.3295537972102127, "grad_norm": 5.53125, "learning_rate": 3.585896986446942e-05, "loss": 2.3434, "num_input_tokens_seen": 29811728, "step": 14280 }, { "epoch": 2.3303695244310303, "grad_norm": 7.46875, "learning_rate": 3.585012603373028e-05, "loss": 2.1559, "num_input_tokens_seen": 29822288, "step": 14285 }, { "epoch": 2.3311852516518474, "grad_norm": 8.0625, "learning_rate": 3.584128052976535e-05, "loss": 2.9035, "num_input_tokens_seen": 29833840, "step": 14290 }, { "epoch": 2.332000978872665, "grad_norm": 10.625, "learning_rate": 3.5832433353938724e-05, "loss": 1.7853, "num_input_tokens_seen": 29845760, "step": 14295 }, { "epoch": 2.3328167060934826, "grad_norm": 6.21875, "learning_rate": 3.5823584507614746e-05, "loss": 1.1862, "num_input_tokens_seen": 29854816, "step": 14300 }, { "epoch": 2.3336324333142997, "grad_norm": 6.65625, "learning_rate": 3.581473399215802e-05, "loss": 1.4872, "num_input_tokens_seen": 29864512, "step": 14305 }, { "epoch": 2.334448160535117, "grad_norm": 7.0, "learning_rate": 3.580588180893341e-05, "loss": 2.6081, "num_input_tokens_seen": 29874544, "step": 14310 }, { "epoch": 2.3352638877559344, "grad_norm": 9.5625, "learning_rate": 3.579702795930602e-05, "loss": 2.7013, "num_input_tokens_seen": 29883760, "step": 14315 }, { "epoch": 2.336079614976752, "grad_norm": 5.0625, "learning_rate": 3.578817244464125e-05, "loss": 2.9244, "num_input_tokens_seen": 29894480, "step": 14320 }, { "epoch": 2.336895342197569, "grad_norm": 2.171875, "learning_rate": 3.577931526630471e-05, "loss": 1.306, "num_input_tokens_seen": 29905280, "step": 14325 }, { "epoch": 2.3377110694183862, "grad_norm": 1.71875, "learning_rate": 3.577045642566229e-05, "loss": 1.1952, "num_input_tokens_seen": 29914832, "step": 14330 }, { "epoch": 2.338526796639204, "grad_norm": 4.0, "learning_rate": 3.576159592408014e-05, "loss": 2.855, "num_input_tokens_seen": 29925888, "step": 14335 }, { "epoch": 2.3393425238600214, "grad_norm": 0.2060546875, "learning_rate": 3.575273376292466e-05, "loss": 2.5254, "num_input_tokens_seen": 29936368, "step": 14340 }, { "epoch": 2.3401582510808385, "grad_norm": 14.0, "learning_rate": 3.574386994356251e-05, "loss": 3.0436, "num_input_tokens_seen": 29946672, "step": 14345 }, { "epoch": 2.340973978301656, "grad_norm": 9.25, "learning_rate": 3.573500446736059e-05, "loss": 1.878, "num_input_tokens_seen": 29956544, "step": 14350 }, { "epoch": 2.3417897055224732, "grad_norm": 9.0, "learning_rate": 3.5726137335686094e-05, "loss": 2.3938, "num_input_tokens_seen": 29966224, "step": 14355 }, { "epoch": 2.342605432743291, "grad_norm": 9.5625, "learning_rate": 3.571726854990642e-05, "loss": 2.4237, "num_input_tokens_seen": 29977936, "step": 14360 }, { "epoch": 2.343421159964108, "grad_norm": 4.5, "learning_rate": 3.570839811138925e-05, "loss": 1.9157, "num_input_tokens_seen": 29990224, "step": 14365 }, { "epoch": 2.3442368871849255, "grad_norm": 7.4375, "learning_rate": 3.569952602150252e-05, "loss": 1.1063, "num_input_tokens_seen": 30000672, "step": 14370 }, { "epoch": 2.3450526144057426, "grad_norm": 3.1875, "learning_rate": 3.569065228161442e-05, "loss": 1.9741, "num_input_tokens_seen": 30011664, "step": 14375 }, { "epoch": 2.34586834162656, "grad_norm": 2.40625, "learning_rate": 3.5681776893093395e-05, "loss": 2.5175, "num_input_tokens_seen": 30023536, "step": 14380 }, { "epoch": 2.3466840688473773, "grad_norm": 8.625, "learning_rate": 3.5672899857308134e-05, "loss": 2.5688, "num_input_tokens_seen": 30034016, "step": 14385 }, { "epoch": 2.347499796068195, "grad_norm": 6.5, "learning_rate": 3.566402117562759e-05, "loss": 3.0788, "num_input_tokens_seen": 30045520, "step": 14390 }, { "epoch": 2.348315523289012, "grad_norm": 8.6875, "learning_rate": 3.565514084942097e-05, "loss": 1.8852, "num_input_tokens_seen": 30055440, "step": 14395 }, { "epoch": 2.3491312505098296, "grad_norm": 8.25, "learning_rate": 3.564625888005773e-05, "loss": 3.4931, "num_input_tokens_seen": 30064704, "step": 14400 }, { "epoch": 2.3491312505098296, "eval_loss": 2.555907726287842, "eval_runtime": 134.9015, "eval_samples_per_second": 20.2, "eval_steps_per_second": 10.104, "num_input_tokens_seen": 30064704, "step": 14400 }, { "epoch": 2.3499469777306468, "grad_norm": 3.609375, "learning_rate": 3.563737526890759e-05, "loss": 1.3909, "num_input_tokens_seen": 30074832, "step": 14405 }, { "epoch": 2.3507627049514643, "grad_norm": 1.8125, "learning_rate": 3.562849001734049e-05, "loss": 1.7463, "num_input_tokens_seen": 30084592, "step": 14410 }, { "epoch": 2.3515784321722815, "grad_norm": 9.9375, "learning_rate": 3.561960312672667e-05, "loss": 1.4989, "num_input_tokens_seen": 30095216, "step": 14415 }, { "epoch": 2.352394159393099, "grad_norm": 6.84375, "learning_rate": 3.5610714598436596e-05, "loss": 1.7822, "num_input_tokens_seen": 30106320, "step": 14420 }, { "epoch": 2.353209886613916, "grad_norm": 7.84375, "learning_rate": 3.5601824433840986e-05, "loss": 3.1477, "num_input_tokens_seen": 30116320, "step": 14425 }, { "epoch": 2.3540256138347337, "grad_norm": 9.4375, "learning_rate": 3.559293263431082e-05, "loss": 2.4141, "num_input_tokens_seen": 30127712, "step": 14430 }, { "epoch": 2.354841341055551, "grad_norm": 7.28125, "learning_rate": 3.558403920121732e-05, "loss": 1.5888, "num_input_tokens_seen": 30137312, "step": 14435 }, { "epoch": 2.3556570682763684, "grad_norm": 6.46875, "learning_rate": 3.557514413593197e-05, "loss": 2.4824, "num_input_tokens_seen": 30148848, "step": 14440 }, { "epoch": 2.3564727954971856, "grad_norm": 3.234375, "learning_rate": 3.55662474398265e-05, "loss": 1.4398, "num_input_tokens_seen": 30158944, "step": 14445 }, { "epoch": 2.357288522718003, "grad_norm": 8.0, "learning_rate": 3.555734911427288e-05, "loss": 2.9362, "num_input_tokens_seen": 30167920, "step": 14450 }, { "epoch": 2.3581042499388203, "grad_norm": 6.78125, "learning_rate": 3.5548449160643363e-05, "loss": 1.6401, "num_input_tokens_seen": 30179664, "step": 14455 }, { "epoch": 2.358919977159638, "grad_norm": 3.515625, "learning_rate": 3.553954758031043e-05, "loss": 2.514, "num_input_tokens_seen": 30191488, "step": 14460 }, { "epoch": 2.359735704380455, "grad_norm": 6.625, "learning_rate": 3.5530644374646815e-05, "loss": 2.1916, "num_input_tokens_seen": 30202240, "step": 14465 }, { "epoch": 2.3605514316012726, "grad_norm": 14.25, "learning_rate": 3.552173954502549e-05, "loss": 2.2845, "num_input_tokens_seen": 30212496, "step": 14470 }, { "epoch": 2.36136715882209, "grad_norm": 5.15625, "learning_rate": 3.55128330928197e-05, "loss": 1.33, "num_input_tokens_seen": 30221872, "step": 14475 }, { "epoch": 2.3621828860429073, "grad_norm": 8.375, "learning_rate": 3.550392501940294e-05, "loss": 3.4351, "num_input_tokens_seen": 30232960, "step": 14480 }, { "epoch": 2.3629986132637244, "grad_norm": 3.046875, "learning_rate": 3.5495015326148945e-05, "loss": 2.874, "num_input_tokens_seen": 30244256, "step": 14485 }, { "epoch": 2.363814340484542, "grad_norm": 6.0625, "learning_rate": 3.548610401443169e-05, "loss": 2.5433, "num_input_tokens_seen": 30253104, "step": 14490 }, { "epoch": 2.3646300677053596, "grad_norm": 4.1875, "learning_rate": 3.547719108562543e-05, "loss": 2.1104, "num_input_tokens_seen": 30264096, "step": 14495 }, { "epoch": 2.3654457949261767, "grad_norm": 4.34375, "learning_rate": 3.546827654110464e-05, "loss": 1.8633, "num_input_tokens_seen": 30273744, "step": 14500 }, { "epoch": 2.366261522146994, "grad_norm": 12.5, "learning_rate": 3.545936038224405e-05, "loss": 2.1202, "num_input_tokens_seen": 30283232, "step": 14505 }, { "epoch": 2.3670772493678114, "grad_norm": 6.09375, "learning_rate": 3.545044261041864e-05, "loss": 1.4335, "num_input_tokens_seen": 30293040, "step": 14510 }, { "epoch": 2.367892976588629, "grad_norm": 6.78125, "learning_rate": 3.5441523227003657e-05, "loss": 2.5401, "num_input_tokens_seen": 30303968, "step": 14515 }, { "epoch": 2.368708703809446, "grad_norm": 9.25, "learning_rate": 3.543260223337459e-05, "loss": 2.9432, "num_input_tokens_seen": 30314576, "step": 14520 }, { "epoch": 2.3695244310302637, "grad_norm": 3.90625, "learning_rate": 3.542367963090714e-05, "loss": 3.546, "num_input_tokens_seen": 30324208, "step": 14525 }, { "epoch": 2.370340158251081, "grad_norm": 5.09375, "learning_rate": 3.5414755420977295e-05, "loss": 1.7623, "num_input_tokens_seen": 30333680, "step": 14530 }, { "epoch": 2.3711558854718984, "grad_norm": 11.8125, "learning_rate": 3.54058296049613e-05, "loss": 2.7064, "num_input_tokens_seen": 30342384, "step": 14535 }, { "epoch": 2.3719716126927155, "grad_norm": 7.09375, "learning_rate": 3.53969021842356e-05, "loss": 3.1097, "num_input_tokens_seen": 30352784, "step": 14540 }, { "epoch": 2.372787339913533, "grad_norm": 10.625, "learning_rate": 3.5387973160176926e-05, "loss": 2.4309, "num_input_tokens_seen": 30362976, "step": 14545 }, { "epoch": 2.37360306713435, "grad_norm": 5.4375, "learning_rate": 3.537904253416224e-05, "loss": 2.2821, "num_input_tokens_seen": 30373696, "step": 14550 }, { "epoch": 2.374418794355168, "grad_norm": 8.1875, "learning_rate": 3.537011030756878e-05, "loss": 2.0931, "num_input_tokens_seen": 30384000, "step": 14555 }, { "epoch": 2.375234521575985, "grad_norm": 5.21875, "learning_rate": 3.536117648177399e-05, "loss": 2.6989, "num_input_tokens_seen": 30394352, "step": 14560 }, { "epoch": 2.3760502487968025, "grad_norm": 1.5859375, "learning_rate": 3.535224105815558e-05, "loss": 1.5693, "num_input_tokens_seen": 30405232, "step": 14565 }, { "epoch": 2.3768659760176196, "grad_norm": 2.75, "learning_rate": 3.5343304038091494e-05, "loss": 1.9615, "num_input_tokens_seen": 30415088, "step": 14570 }, { "epoch": 2.377681703238437, "grad_norm": 7.21875, "learning_rate": 3.5334365422959955e-05, "loss": 3.2998, "num_input_tokens_seen": 30426272, "step": 14575 }, { "epoch": 2.3784974304592543, "grad_norm": 14.4375, "learning_rate": 3.5325425214139396e-05, "loss": 2.9735, "num_input_tokens_seen": 30436848, "step": 14580 }, { "epoch": 2.379313157680072, "grad_norm": 1.921875, "learning_rate": 3.531648341300851e-05, "loss": 2.2361, "num_input_tokens_seen": 30448144, "step": 14585 }, { "epoch": 2.380128884900889, "grad_norm": 3.6875, "learning_rate": 3.530754002094623e-05, "loss": 2.6064, "num_input_tokens_seen": 30459360, "step": 14590 }, { "epoch": 2.3809446121217066, "grad_norm": 3.171875, "learning_rate": 3.529859503933175e-05, "loss": 0.8127, "num_input_tokens_seen": 30470560, "step": 14595 }, { "epoch": 2.3817603393425237, "grad_norm": 4.0625, "learning_rate": 3.52896484695445e-05, "loss": 1.6007, "num_input_tokens_seen": 30481488, "step": 14600 }, { "epoch": 2.3817603393425237, "eval_loss": 2.5469157695770264, "eval_runtime": 134.8148, "eval_samples_per_second": 20.213, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 30481488, "step": 14600 }, { "epoch": 2.3825760665633413, "grad_norm": 9.875, "learning_rate": 3.528070031296414e-05, "loss": 2.3966, "num_input_tokens_seen": 30491696, "step": 14605 }, { "epoch": 2.3833917937841584, "grad_norm": 2.5, "learning_rate": 3.5271750570970605e-05, "loss": 2.1099, "num_input_tokens_seen": 30501200, "step": 14610 }, { "epoch": 2.384207521004976, "grad_norm": 11.0, "learning_rate": 3.526279924494405e-05, "loss": 1.9646, "num_input_tokens_seen": 30510432, "step": 14615 }, { "epoch": 2.385023248225793, "grad_norm": 8.5, "learning_rate": 3.5253846336264874e-05, "loss": 1.996, "num_input_tokens_seen": 30520592, "step": 14620 }, { "epoch": 2.3858389754466107, "grad_norm": 3.796875, "learning_rate": 3.5244891846313736e-05, "loss": 2.7506, "num_input_tokens_seen": 30531296, "step": 14625 }, { "epoch": 2.386654702667428, "grad_norm": 6.5625, "learning_rate": 3.5235935776471527e-05, "loss": 2.2318, "num_input_tokens_seen": 30543184, "step": 14630 }, { "epoch": 2.3874704298882454, "grad_norm": 6.375, "learning_rate": 3.522697812811939e-05, "loss": 2.2106, "num_input_tokens_seen": 30553072, "step": 14635 }, { "epoch": 2.3882861571090626, "grad_norm": 7.75, "learning_rate": 3.521801890263871e-05, "loss": 2.7049, "num_input_tokens_seen": 30563600, "step": 14640 }, { "epoch": 2.38910188432988, "grad_norm": 7.53125, "learning_rate": 3.5209058101411114e-05, "loss": 2.2348, "num_input_tokens_seen": 30574320, "step": 14645 }, { "epoch": 2.3899176115506973, "grad_norm": 5.09375, "learning_rate": 3.520009572581845e-05, "loss": 2.4331, "num_input_tokens_seen": 30585232, "step": 14650 }, { "epoch": 2.390733338771515, "grad_norm": 7.09375, "learning_rate": 3.519113177724285e-05, "loss": 2.1535, "num_input_tokens_seen": 30596096, "step": 14655 }, { "epoch": 2.391549065992332, "grad_norm": 7.0, "learning_rate": 3.5182166257066656e-05, "loss": 2.0447, "num_input_tokens_seen": 30607264, "step": 14660 }, { "epoch": 2.3923647932131495, "grad_norm": 6.65625, "learning_rate": 3.517319916667247e-05, "loss": 1.7979, "num_input_tokens_seen": 30617344, "step": 14665 }, { "epoch": 2.393180520433967, "grad_norm": 4.375, "learning_rate": 3.516423050744313e-05, "loss": 1.6519, "num_input_tokens_seen": 30629600, "step": 14670 }, { "epoch": 2.3939962476547842, "grad_norm": 4.8125, "learning_rate": 3.5155260280761704e-05, "loss": 3.1939, "num_input_tokens_seen": 30640864, "step": 14675 }, { "epoch": 2.3948119748756014, "grad_norm": 5.28125, "learning_rate": 3.514628848801154e-05, "loss": 1.392, "num_input_tokens_seen": 30650272, "step": 14680 }, { "epoch": 2.395627702096419, "grad_norm": 7.28125, "learning_rate": 3.5137315130576174e-05, "loss": 3.4961, "num_input_tokens_seen": 30662080, "step": 14685 }, { "epoch": 2.3964434293172365, "grad_norm": 7.84375, "learning_rate": 3.512834020983942e-05, "loss": 2.0489, "num_input_tokens_seen": 30672416, "step": 14690 }, { "epoch": 2.3972591565380537, "grad_norm": 4.0625, "learning_rate": 3.5119363727185334e-05, "loss": 3.0647, "num_input_tokens_seen": 30683440, "step": 14695 }, { "epoch": 2.398074883758871, "grad_norm": 10.0625, "learning_rate": 3.511038568399819e-05, "loss": 2.162, "num_input_tokens_seen": 30693136, "step": 14700 }, { "epoch": 2.3988906109796884, "grad_norm": 4.59375, "learning_rate": 3.510140608166251e-05, "loss": 1.4769, "num_input_tokens_seen": 30702672, "step": 14705 }, { "epoch": 2.399706338200506, "grad_norm": 7.46875, "learning_rate": 3.509242492156308e-05, "loss": 2.4022, "num_input_tokens_seen": 30713680, "step": 14710 }, { "epoch": 2.400522065421323, "grad_norm": 6.125, "learning_rate": 3.5083442205084896e-05, "loss": 2.8953, "num_input_tokens_seen": 30724016, "step": 14715 }, { "epoch": 2.4013377926421406, "grad_norm": 8.9375, "learning_rate": 3.507445793361321e-05, "loss": 2.3662, "num_input_tokens_seen": 30734096, "step": 14720 }, { "epoch": 2.4021535198629578, "grad_norm": 7.5, "learning_rate": 3.5065472108533505e-05, "loss": 1.5555, "num_input_tokens_seen": 30745136, "step": 14725 }, { "epoch": 2.4029692470837754, "grad_norm": 7.0, "learning_rate": 3.5056484731231504e-05, "loss": 1.8827, "num_input_tokens_seen": 30756032, "step": 14730 }, { "epoch": 2.4037849743045925, "grad_norm": 14.5, "learning_rate": 3.504749580309319e-05, "loss": 2.306, "num_input_tokens_seen": 30766208, "step": 14735 }, { "epoch": 2.40460070152541, "grad_norm": 9.1875, "learning_rate": 3.5038505325504753e-05, "loss": 1.7345, "num_input_tokens_seen": 30776864, "step": 14740 }, { "epoch": 2.405416428746227, "grad_norm": 3.234375, "learning_rate": 3.502951329985264e-05, "loss": 2.6317, "num_input_tokens_seen": 30786800, "step": 14745 }, { "epoch": 2.4062321559670448, "grad_norm": 3.984375, "learning_rate": 3.502051972752354e-05, "loss": 2.0613, "num_input_tokens_seen": 30797072, "step": 14750 }, { "epoch": 2.407047883187862, "grad_norm": 3.484375, "learning_rate": 3.5011524609904374e-05, "loss": 2.445, "num_input_tokens_seen": 30808688, "step": 14755 }, { "epoch": 2.4078636104086795, "grad_norm": 8.125, "learning_rate": 3.50025279483823e-05, "loss": 4.2698, "num_input_tokens_seen": 30818016, "step": 14760 }, { "epoch": 2.4086793376294966, "grad_norm": 8.3125, "learning_rate": 3.499352974434472e-05, "loss": 3.152, "num_input_tokens_seen": 30828160, "step": 14765 }, { "epoch": 2.409495064850314, "grad_norm": 7.875, "learning_rate": 3.498452999917926e-05, "loss": 1.0195, "num_input_tokens_seen": 30837824, "step": 14770 }, { "epoch": 2.4103107920711313, "grad_norm": 4.1875, "learning_rate": 3.4975528714273795e-05, "loss": 2.1821, "num_input_tokens_seen": 30848544, "step": 14775 }, { "epoch": 2.411126519291949, "grad_norm": 5.3125, "learning_rate": 3.4966525891016454e-05, "loss": 1.8296, "num_input_tokens_seen": 30860304, "step": 14780 }, { "epoch": 2.411942246512766, "grad_norm": 7.34375, "learning_rate": 3.495752153079557e-05, "loss": 3.2565, "num_input_tokens_seen": 30871360, "step": 14785 }, { "epoch": 2.4127579737335836, "grad_norm": 4.03125, "learning_rate": 3.494851563499974e-05, "loss": 1.044, "num_input_tokens_seen": 30880800, "step": 14790 }, { "epoch": 2.4135737009544007, "grad_norm": 19.625, "learning_rate": 3.493950820501777e-05, "loss": 2.8518, "num_input_tokens_seen": 30890192, "step": 14795 }, { "epoch": 2.4143894281752183, "grad_norm": 3.375, "learning_rate": 3.493049924223872e-05, "loss": 2.3649, "num_input_tokens_seen": 30900976, "step": 14800 }, { "epoch": 2.4143894281752183, "eval_loss": 2.544508457183838, "eval_runtime": 135.118, "eval_samples_per_second": 20.168, "eval_steps_per_second": 10.087, "num_input_tokens_seen": 30900976, "step": 14800 }, { "epoch": 2.4152051553960354, "grad_norm": 8.0, "learning_rate": 3.49214887480519e-05, "loss": 3.0263, "num_input_tokens_seen": 30911120, "step": 14805 }, { "epoch": 2.416020882616853, "grad_norm": 0.68359375, "learning_rate": 3.4912476723846834e-05, "loss": 1.6809, "num_input_tokens_seen": 30921008, "step": 14810 }, { "epoch": 2.41683660983767, "grad_norm": 3.65625, "learning_rate": 3.490346317101328e-05, "loss": 2.7736, "num_input_tokens_seen": 30931664, "step": 14815 }, { "epoch": 2.4176523370584877, "grad_norm": 4.5, "learning_rate": 3.4894448090941266e-05, "loss": 1.6923, "num_input_tokens_seen": 30941824, "step": 14820 }, { "epoch": 2.418468064279305, "grad_norm": 5.375, "learning_rate": 3.488543148502101e-05, "loss": 1.6397, "num_input_tokens_seen": 30952608, "step": 14825 }, { "epoch": 2.4192837915001224, "grad_norm": 14.75, "learning_rate": 3.487641335464299e-05, "loss": 2.9483, "num_input_tokens_seen": 30960944, "step": 14830 }, { "epoch": 2.4200995187209395, "grad_norm": 5.25, "learning_rate": 3.4867393701197914e-05, "loss": 2.0904, "num_input_tokens_seen": 30972512, "step": 14835 }, { "epoch": 2.420915245941757, "grad_norm": 7.5, "learning_rate": 3.485837252607673e-05, "loss": 3.2152, "num_input_tokens_seen": 30983520, "step": 14840 }, { "epoch": 2.4217309731625742, "grad_norm": 6.15625, "learning_rate": 3.4849349830670615e-05, "loss": 2.866, "num_input_tokens_seen": 30993792, "step": 14845 }, { "epoch": 2.422546700383392, "grad_norm": 3.828125, "learning_rate": 3.4840325616370976e-05, "loss": 2.7403, "num_input_tokens_seen": 31003552, "step": 14850 }, { "epoch": 2.423362427604209, "grad_norm": 9.625, "learning_rate": 3.483129988456947e-05, "loss": 2.6481, "num_input_tokens_seen": 31013232, "step": 14855 }, { "epoch": 2.4241781548250265, "grad_norm": 12.75, "learning_rate": 3.482227263665797e-05, "loss": 2.1846, "num_input_tokens_seen": 31022432, "step": 14860 }, { "epoch": 2.424993882045844, "grad_norm": 12.8125, "learning_rate": 3.48132438740286e-05, "loss": 2.2602, "num_input_tokens_seen": 31034512, "step": 14865 }, { "epoch": 2.4258096092666612, "grad_norm": 10.125, "learning_rate": 3.48042135980737e-05, "loss": 2.8863, "num_input_tokens_seen": 31044704, "step": 14870 }, { "epoch": 2.4266253364874784, "grad_norm": 7.03125, "learning_rate": 3.479518181018586e-05, "loss": 2.6415, "num_input_tokens_seen": 31055280, "step": 14875 }, { "epoch": 2.427441063708296, "grad_norm": 7.65625, "learning_rate": 3.4786148511757886e-05, "loss": 1.4341, "num_input_tokens_seen": 31066656, "step": 14880 }, { "epoch": 2.4282567909291135, "grad_norm": 5.1875, "learning_rate": 3.477711370418284e-05, "loss": 2.6218, "num_input_tokens_seen": 31079120, "step": 14885 }, { "epoch": 2.4290725181499306, "grad_norm": 8.6875, "learning_rate": 3.476807738885399e-05, "loss": 1.5148, "num_input_tokens_seen": 31089872, "step": 14890 }, { "epoch": 2.429888245370748, "grad_norm": 7.5625, "learning_rate": 3.475903956716485e-05, "loss": 2.7455, "num_input_tokens_seen": 31099744, "step": 14895 }, { "epoch": 2.4307039725915653, "grad_norm": 11.5, "learning_rate": 3.475000024050917e-05, "loss": 2.1032, "num_input_tokens_seen": 31109760, "step": 14900 }, { "epoch": 2.431519699812383, "grad_norm": 8.9375, "learning_rate": 3.4740959410280926e-05, "loss": 3.1514, "num_input_tokens_seen": 31119664, "step": 14905 }, { "epoch": 2.4323354270332, "grad_norm": 4.25, "learning_rate": 3.4731917077874324e-05, "loss": 2.6253, "num_input_tokens_seen": 31132128, "step": 14910 }, { "epoch": 2.4331511542540176, "grad_norm": 12.5, "learning_rate": 3.4722873244683816e-05, "loss": 3.7684, "num_input_tokens_seen": 31143344, "step": 14915 }, { "epoch": 2.4339668814748348, "grad_norm": 5.28125, "learning_rate": 3.4713827912104065e-05, "loss": 1.3152, "num_input_tokens_seen": 31153952, "step": 14920 }, { "epoch": 2.4347826086956523, "grad_norm": 8.5, "learning_rate": 3.470478108152998e-05, "loss": 1.5731, "num_input_tokens_seen": 31163680, "step": 14925 }, { "epoch": 2.4355983359164695, "grad_norm": 5.75, "learning_rate": 3.4695732754356695e-05, "loss": 2.6869, "num_input_tokens_seen": 31174816, "step": 14930 }, { "epoch": 2.436414063137287, "grad_norm": 7.59375, "learning_rate": 3.4686682931979576e-05, "loss": 3.86, "num_input_tokens_seen": 31186400, "step": 14935 }, { "epoch": 2.437229790358104, "grad_norm": 16.625, "learning_rate": 3.467763161579422e-05, "loss": 2.1023, "num_input_tokens_seen": 31195712, "step": 14940 }, { "epoch": 2.4380455175789217, "grad_norm": 4.90625, "learning_rate": 3.466857880719645e-05, "loss": 2.6019, "num_input_tokens_seen": 31205184, "step": 14945 }, { "epoch": 2.438861244799739, "grad_norm": 3.4375, "learning_rate": 3.465952450758233e-05, "loss": 1.8624, "num_input_tokens_seen": 31215504, "step": 14950 }, { "epoch": 2.4396769720205564, "grad_norm": 9.25, "learning_rate": 3.4650468718348126e-05, "loss": 3.2255, "num_input_tokens_seen": 31226688, "step": 14955 }, { "epoch": 2.4404926992413736, "grad_norm": 7.4375, "learning_rate": 3.464141144089038e-05, "loss": 2.5472, "num_input_tokens_seen": 31236048, "step": 14960 }, { "epoch": 2.441308426462191, "grad_norm": 8.3125, "learning_rate": 3.463235267660583e-05, "loss": 1.7156, "num_input_tokens_seen": 31246864, "step": 14965 }, { "epoch": 2.4421241536830083, "grad_norm": 9.375, "learning_rate": 3.462329242689145e-05, "loss": 2.487, "num_input_tokens_seen": 31257808, "step": 14970 }, { "epoch": 2.442939880903826, "grad_norm": 8.6875, "learning_rate": 3.461423069314444e-05, "loss": 2.8036, "num_input_tokens_seen": 31268976, "step": 14975 }, { "epoch": 2.443755608124643, "grad_norm": 6.625, "learning_rate": 3.460516747676224e-05, "loss": 2.4585, "num_input_tokens_seen": 31279488, "step": 14980 }, { "epoch": 2.4445713353454606, "grad_norm": 4.5625, "learning_rate": 3.459610277914251e-05, "loss": 2.3534, "num_input_tokens_seen": 31290400, "step": 14985 }, { "epoch": 2.4453870625662777, "grad_norm": 5.0625, "learning_rate": 3.458703660168314e-05, "loss": 2.583, "num_input_tokens_seen": 31301008, "step": 14990 }, { "epoch": 2.4462027897870953, "grad_norm": 9.375, "learning_rate": 3.457796894578224e-05, "loss": 2.8282, "num_input_tokens_seen": 31311360, "step": 14995 }, { "epoch": 2.4470185170079124, "grad_norm": 6.25, "learning_rate": 3.456889981283817e-05, "loss": 2.3624, "num_input_tokens_seen": 31321184, "step": 15000 }, { "epoch": 2.4470185170079124, "eval_loss": 2.5304582118988037, "eval_runtime": 135.1172, "eval_samples_per_second": 20.168, "eval_steps_per_second": 10.088, "num_input_tokens_seen": 31321184, "step": 15000 }, { "epoch": 2.44783424422873, "grad_norm": 11.6875, "learning_rate": 3.45598292042495e-05, "loss": 4.5703, "num_input_tokens_seen": 31330304, "step": 15005 }, { "epoch": 2.448649971449547, "grad_norm": 7.71875, "learning_rate": 3.4550757121415035e-05, "loss": 3.3948, "num_input_tokens_seen": 31339040, "step": 15010 }, { "epoch": 2.4494656986703647, "grad_norm": 4.8125, "learning_rate": 3.454168356573378e-05, "loss": 1.4794, "num_input_tokens_seen": 31349296, "step": 15015 }, { "epoch": 2.450281425891182, "grad_norm": 4.03125, "learning_rate": 3.453260853860503e-05, "loss": 1.7381, "num_input_tokens_seen": 31358976, "step": 15020 }, { "epoch": 2.4510971531119994, "grad_norm": 4.71875, "learning_rate": 3.452353204142824e-05, "loss": 2.2608, "num_input_tokens_seen": 31368864, "step": 15025 }, { "epoch": 2.4519128803328165, "grad_norm": 6.09375, "learning_rate": 3.4514454075603136e-05, "loss": 2.0529, "num_input_tokens_seen": 31378800, "step": 15030 }, { "epoch": 2.452728607553634, "grad_norm": 2.203125, "learning_rate": 3.450537464252964e-05, "loss": 2.0673, "num_input_tokens_seen": 31389072, "step": 15035 }, { "epoch": 2.4535443347744517, "grad_norm": 6.53125, "learning_rate": 3.4496293743607925e-05, "loss": 2.5837, "num_input_tokens_seen": 31399312, "step": 15040 }, { "epoch": 2.454360061995269, "grad_norm": 7.75, "learning_rate": 3.448721138023838e-05, "loss": 1.8439, "num_input_tokens_seen": 31408864, "step": 15045 }, { "epoch": 2.455175789216086, "grad_norm": 11.0625, "learning_rate": 3.447812755382162e-05, "loss": 2.3647, "num_input_tokens_seen": 31420176, "step": 15050 }, { "epoch": 2.4559915164369035, "grad_norm": 2.640625, "learning_rate": 3.446904226575847e-05, "loss": 2.6357, "num_input_tokens_seen": 31431296, "step": 15055 }, { "epoch": 2.456807243657721, "grad_norm": 1.21875, "learning_rate": 3.445995551745002e-05, "loss": 2.0267, "num_input_tokens_seen": 31441184, "step": 15060 }, { "epoch": 2.457622970878538, "grad_norm": 11.4375, "learning_rate": 3.445086731029753e-05, "loss": 2.3031, "num_input_tokens_seen": 31451168, "step": 15065 }, { "epoch": 2.4584386980993553, "grad_norm": 2.8125, "learning_rate": 3.444177764570255e-05, "loss": 2.7129, "num_input_tokens_seen": 31462624, "step": 15070 }, { "epoch": 2.459254425320173, "grad_norm": 9.3125, "learning_rate": 3.44326865250668e-05, "loss": 2.7599, "num_input_tokens_seen": 31473744, "step": 15075 }, { "epoch": 2.4600701525409905, "grad_norm": 5.4375, "learning_rate": 3.442359394979225e-05, "loss": 2.737, "num_input_tokens_seen": 31484640, "step": 15080 }, { "epoch": 2.4608858797618076, "grad_norm": 1.6875, "learning_rate": 3.441449992128108e-05, "loss": 1.5051, "num_input_tokens_seen": 31494944, "step": 15085 }, { "epoch": 2.461701606982625, "grad_norm": 6.1875, "learning_rate": 3.440540444093573e-05, "loss": 1.5543, "num_input_tokens_seen": 31504032, "step": 15090 }, { "epoch": 2.4625173342034423, "grad_norm": 3.828125, "learning_rate": 3.43963075101588e-05, "loss": 1.5137, "num_input_tokens_seen": 31514128, "step": 15095 }, { "epoch": 2.46333306142426, "grad_norm": 8.4375, "learning_rate": 3.438720913035318e-05, "loss": 3.3419, "num_input_tokens_seen": 31525008, "step": 15100 }, { "epoch": 2.464148788645077, "grad_norm": 2.90625, "learning_rate": 3.437810930292195e-05, "loss": 1.7543, "num_input_tokens_seen": 31535568, "step": 15105 }, { "epoch": 2.4649645158658946, "grad_norm": 7.03125, "learning_rate": 3.43690080292684e-05, "loss": 2.6299, "num_input_tokens_seen": 31544336, "step": 15110 }, { "epoch": 2.4657802430867117, "grad_norm": 3.015625, "learning_rate": 3.435990531079608e-05, "loss": 1.2907, "num_input_tokens_seen": 31553456, "step": 15115 }, { "epoch": 2.4665959703075293, "grad_norm": 3.0625, "learning_rate": 3.435080114890874e-05, "loss": 1.7295, "num_input_tokens_seen": 31564816, "step": 15120 }, { "epoch": 2.4674116975283464, "grad_norm": 0.173828125, "learning_rate": 3.434169554501035e-05, "loss": 2.0831, "num_input_tokens_seen": 31574768, "step": 15125 }, { "epoch": 2.468227424749164, "grad_norm": 9.375, "learning_rate": 3.433258850050511e-05, "loss": 1.5758, "num_input_tokens_seen": 31585808, "step": 15130 }, { "epoch": 2.469043151969981, "grad_norm": 9.5625, "learning_rate": 3.4323480016797446e-05, "loss": 2.5295, "num_input_tokens_seen": 31595216, "step": 15135 }, { "epoch": 2.4698588791907987, "grad_norm": 8.25, "learning_rate": 3.4314370095291995e-05, "loss": 2.0662, "num_input_tokens_seen": 31606384, "step": 15140 }, { "epoch": 2.470674606411616, "grad_norm": 9.875, "learning_rate": 3.430525873739363e-05, "loss": 1.9718, "num_input_tokens_seen": 31616320, "step": 15145 }, { "epoch": 2.4714903336324334, "grad_norm": 6.59375, "learning_rate": 3.429614594450743e-05, "loss": 1.8538, "num_input_tokens_seen": 31625984, "step": 15150 }, { "epoch": 2.4723060608532506, "grad_norm": 7.65625, "learning_rate": 3.428703171803869e-05, "loss": 2.9443, "num_input_tokens_seen": 31634848, "step": 15155 }, { "epoch": 2.473121788074068, "grad_norm": 5.78125, "learning_rate": 3.4277916059392964e-05, "loss": 1.8884, "num_input_tokens_seen": 31646256, "step": 15160 }, { "epoch": 2.4739375152948853, "grad_norm": 10.8125, "learning_rate": 3.426879896997598e-05, "loss": 3.4463, "num_input_tokens_seen": 31656448, "step": 15165 }, { "epoch": 2.474753242515703, "grad_norm": 3.390625, "learning_rate": 3.425968045119372e-05, "loss": 3.6269, "num_input_tokens_seen": 31666800, "step": 15170 }, { "epoch": 2.47556896973652, "grad_norm": 4.1875, "learning_rate": 3.425056050445237e-05, "loss": 1.1527, "num_input_tokens_seen": 31677056, "step": 15175 }, { "epoch": 2.4763846969573375, "grad_norm": 9.625, "learning_rate": 3.4241439131158336e-05, "loss": 1.7436, "num_input_tokens_seen": 31688336, "step": 15180 }, { "epoch": 2.4772004241781547, "grad_norm": 8.625, "learning_rate": 3.423231633271825e-05, "loss": 3.1376, "num_input_tokens_seen": 31699296, "step": 15185 }, { "epoch": 2.4780161513989722, "grad_norm": 8.375, "learning_rate": 3.4223192110538985e-05, "loss": 2.9628, "num_input_tokens_seen": 31710784, "step": 15190 }, { "epoch": 2.4788318786197894, "grad_norm": 9.5625, "learning_rate": 3.4214066466027575e-05, "loss": 2.9427, "num_input_tokens_seen": 31721616, "step": 15195 }, { "epoch": 2.479647605840607, "grad_norm": 4.28125, "learning_rate": 3.4204939400591325e-05, "loss": 1.8445, "num_input_tokens_seen": 31730928, "step": 15200 }, { "epoch": 2.479647605840607, "eval_loss": 2.553328037261963, "eval_runtime": 134.795, "eval_samples_per_second": 20.216, "eval_steps_per_second": 10.112, "num_input_tokens_seen": 31730928, "step": 15200 }, { "epoch": 2.480463333061424, "grad_norm": 7.9375, "learning_rate": 3.419581091563775e-05, "loss": 3.1038, "num_input_tokens_seen": 31741152, "step": 15205 }, { "epoch": 2.4812790602822417, "grad_norm": 8.125, "learning_rate": 3.418668101257456e-05, "loss": 0.9535, "num_input_tokens_seen": 31750960, "step": 15210 }, { "epoch": 2.482094787503059, "grad_norm": 2.65625, "learning_rate": 3.417754969280971e-05, "loss": 2.2999, "num_input_tokens_seen": 31761584, "step": 15215 }, { "epoch": 2.4829105147238764, "grad_norm": 6.25, "learning_rate": 3.416841695775137e-05, "loss": 2.7431, "num_input_tokens_seen": 31772016, "step": 15220 }, { "epoch": 2.4837262419446935, "grad_norm": 1.53125, "learning_rate": 3.415928280880792e-05, "loss": 1.7103, "num_input_tokens_seen": 31782560, "step": 15225 }, { "epoch": 2.484541969165511, "grad_norm": 5.71875, "learning_rate": 3.4150147247387965e-05, "loss": 1.5234, "num_input_tokens_seen": 31792752, "step": 15230 }, { "epoch": 2.4853576963863286, "grad_norm": 4.40625, "learning_rate": 3.4141010274900306e-05, "loss": 2.962, "num_input_tokens_seen": 31801280, "step": 15235 }, { "epoch": 2.4861734236071458, "grad_norm": 5.8125, "learning_rate": 3.413187189275399e-05, "loss": 1.8907, "num_input_tokens_seen": 31812144, "step": 15240 }, { "epoch": 2.486989150827963, "grad_norm": 9.3125, "learning_rate": 3.4122732102358265e-05, "loss": 3.0662, "num_input_tokens_seen": 31822048, "step": 15245 }, { "epoch": 2.4878048780487805, "grad_norm": 8.25, "learning_rate": 3.411359090512261e-05, "loss": 4.1113, "num_input_tokens_seen": 31832688, "step": 15250 }, { "epoch": 2.488620605269598, "grad_norm": 6.1875, "learning_rate": 3.410444830245672e-05, "loss": 1.5104, "num_input_tokens_seen": 31842896, "step": 15255 }, { "epoch": 2.489436332490415, "grad_norm": 7.5, "learning_rate": 3.409530429577048e-05, "loss": 2.677, "num_input_tokens_seen": 31852768, "step": 15260 }, { "epoch": 2.4902520597112328, "grad_norm": 6.1875, "learning_rate": 3.408615888647402e-05, "loss": 3.061, "num_input_tokens_seen": 31862848, "step": 15265 }, { "epoch": 2.49106778693205, "grad_norm": 6.25, "learning_rate": 3.4077012075977675e-05, "loss": 3.2132, "num_input_tokens_seen": 31872800, "step": 15270 }, { "epoch": 2.4918835141528675, "grad_norm": 12.5625, "learning_rate": 3.4067863865692e-05, "loss": 2.2271, "num_input_tokens_seen": 31883360, "step": 15275 }, { "epoch": 2.4926992413736846, "grad_norm": 3.328125, "learning_rate": 3.4058714257027755e-05, "loss": 3.0222, "num_input_tokens_seen": 31893792, "step": 15280 }, { "epoch": 2.493514968594502, "grad_norm": 10.4375, "learning_rate": 3.404956325139594e-05, "loss": 2.8273, "num_input_tokens_seen": 31905584, "step": 15285 }, { "epoch": 2.4943306958153193, "grad_norm": 6.21875, "learning_rate": 3.404041085020775e-05, "loss": 2.6197, "num_input_tokens_seen": 31916208, "step": 15290 }, { "epoch": 2.495146423036137, "grad_norm": 13.5625, "learning_rate": 3.403125705487459e-05, "loss": 2.3621, "num_input_tokens_seen": 31926080, "step": 15295 }, { "epoch": 2.495962150256954, "grad_norm": 3.484375, "learning_rate": 3.402210186680811e-05, "loss": 2.1563, "num_input_tokens_seen": 31936576, "step": 15300 }, { "epoch": 2.4967778774777716, "grad_norm": 8.5, "learning_rate": 3.4012945287420137e-05, "loss": 1.9736, "num_input_tokens_seen": 31946544, "step": 15305 }, { "epoch": 2.4975936046985887, "grad_norm": 13.3125, "learning_rate": 3.400378731812274e-05, "loss": 3.9374, "num_input_tokens_seen": 31956288, "step": 15310 }, { "epoch": 2.4984093319194063, "grad_norm": 12.4375, "learning_rate": 3.399462796032817e-05, "loss": 2.2517, "num_input_tokens_seen": 31967632, "step": 15315 }, { "epoch": 2.4992250591402234, "grad_norm": 6.0, "learning_rate": 3.3985467215448954e-05, "loss": 1.6535, "num_input_tokens_seen": 31977712, "step": 15320 }, { "epoch": 2.500040786361041, "grad_norm": 14.25, "learning_rate": 3.3976305084897776e-05, "loss": 3.4787, "num_input_tokens_seen": 31988000, "step": 15325 }, { "epoch": 2.500856513581858, "grad_norm": 6.0625, "learning_rate": 3.3967141570087544e-05, "loss": 2.4876, "num_input_tokens_seen": 31998304, "step": 15330 }, { "epoch": 2.5016722408026757, "grad_norm": 8.4375, "learning_rate": 3.39579766724314e-05, "loss": 2.6039, "num_input_tokens_seen": 32009984, "step": 15335 }, { "epoch": 2.502487968023493, "grad_norm": 11.8125, "learning_rate": 3.3948810393342677e-05, "loss": 3.0548, "num_input_tokens_seen": 32020720, "step": 15340 }, { "epoch": 2.5033036952443104, "grad_norm": 10.0, "learning_rate": 3.3939642734234936e-05, "loss": 1.6656, "num_input_tokens_seen": 32031824, "step": 15345 }, { "epoch": 2.5041194224651275, "grad_norm": 7.46875, "learning_rate": 3.393047369652194e-05, "loss": 1.1684, "num_input_tokens_seen": 32042816, "step": 15350 }, { "epoch": 2.504935149685945, "grad_norm": 5.3125, "learning_rate": 3.3921303281617664e-05, "loss": 1.8827, "num_input_tokens_seen": 32052992, "step": 15355 }, { "epoch": 2.5057508769067622, "grad_norm": 4.1875, "learning_rate": 3.391213149093632e-05, "loss": 1.221, "num_input_tokens_seen": 32063216, "step": 15360 }, { "epoch": 2.50656660412758, "grad_norm": 11.1875, "learning_rate": 3.3902958325892303e-05, "loss": 3.292, "num_input_tokens_seen": 32075360, "step": 15365 }, { "epoch": 2.507382331348397, "grad_norm": 10.1875, "learning_rate": 3.389378378790023e-05, "loss": 2.2244, "num_input_tokens_seen": 32085072, "step": 15370 }, { "epoch": 2.5081980585692145, "grad_norm": 7.25, "learning_rate": 3.388460787837493e-05, "loss": 2.5915, "num_input_tokens_seen": 32095792, "step": 15375 }, { "epoch": 2.5090137857900316, "grad_norm": 5.15625, "learning_rate": 3.387543059873145e-05, "loss": 2.0859, "num_input_tokens_seen": 32106880, "step": 15380 }, { "epoch": 2.5098295130108492, "grad_norm": 4.90625, "learning_rate": 3.386625195038503e-05, "loss": 2.4389, "num_input_tokens_seen": 32117200, "step": 15385 }, { "epoch": 2.510645240231667, "grad_norm": 5.96875, "learning_rate": 3.3857071934751136e-05, "loss": 3.0424, "num_input_tokens_seen": 32127072, "step": 15390 }, { "epoch": 2.511460967452484, "grad_norm": 4.53125, "learning_rate": 3.384789055324544e-05, "loss": 1.921, "num_input_tokens_seen": 32136368, "step": 15395 }, { "epoch": 2.512276694673301, "grad_norm": 4.375, "learning_rate": 3.3838707807283843e-05, "loss": 2.4441, "num_input_tokens_seen": 32146304, "step": 15400 }, { "epoch": 2.512276694673301, "eval_loss": 2.5513195991516113, "eval_runtime": 134.7621, "eval_samples_per_second": 20.221, "eval_steps_per_second": 10.114, "num_input_tokens_seen": 32146304, "step": 15400 }, { "epoch": 2.5130924218941186, "grad_norm": 9.75, "learning_rate": 3.382952369828243e-05, "loss": 2.8509, "num_input_tokens_seen": 32154896, "step": 15405 }, { "epoch": 2.513908149114936, "grad_norm": 4.96875, "learning_rate": 3.38203382276575e-05, "loss": 1.9193, "num_input_tokens_seen": 32166304, "step": 15410 }, { "epoch": 2.5147238763357533, "grad_norm": 4.65625, "learning_rate": 3.381115139682557e-05, "loss": 2.4256, "num_input_tokens_seen": 32176656, "step": 15415 }, { "epoch": 2.5155396035565705, "grad_norm": 4.28125, "learning_rate": 3.3801963207203366e-05, "loss": 1.2782, "num_input_tokens_seen": 32188800, "step": 15420 }, { "epoch": 2.516355330777388, "grad_norm": 6.28125, "learning_rate": 3.379277366020782e-05, "loss": 2.4299, "num_input_tokens_seen": 32200256, "step": 15425 }, { "epoch": 2.5171710579982056, "grad_norm": 9.6875, "learning_rate": 3.3783582757256085e-05, "loss": 2.9336, "num_input_tokens_seen": 32211104, "step": 15430 }, { "epoch": 2.5179867852190227, "grad_norm": 7.1875, "learning_rate": 3.3774390499765504e-05, "loss": 2.0857, "num_input_tokens_seen": 32221104, "step": 15435 }, { "epoch": 2.51880251243984, "grad_norm": 11.375, "learning_rate": 3.376519688915364e-05, "loss": 3.0225, "num_input_tokens_seen": 32231184, "step": 15440 }, { "epoch": 2.5196182396606575, "grad_norm": 11.375, "learning_rate": 3.3756001926838273e-05, "loss": 2.5835, "num_input_tokens_seen": 32241536, "step": 15445 }, { "epoch": 2.520433966881475, "grad_norm": 1.4375, "learning_rate": 3.374680561423737e-05, "loss": 2.6224, "num_input_tokens_seen": 32252896, "step": 15450 }, { "epoch": 2.521249694102292, "grad_norm": 6.1875, "learning_rate": 3.373760795276912e-05, "loss": 3.0368, "num_input_tokens_seen": 32263472, "step": 15455 }, { "epoch": 2.5220654213231093, "grad_norm": 4.125, "learning_rate": 3.372840894385192e-05, "loss": 2.6807, "num_input_tokens_seen": 32273920, "step": 15460 }, { "epoch": 2.522881148543927, "grad_norm": 6.96875, "learning_rate": 3.3719208588904375e-05, "loss": 1.9394, "num_input_tokens_seen": 32282784, "step": 15465 }, { "epoch": 2.5236968757647444, "grad_norm": 6.71875, "learning_rate": 3.371000688934529e-05, "loss": 2.7863, "num_input_tokens_seen": 32293136, "step": 15470 }, { "epoch": 2.5245126029855616, "grad_norm": 7.15625, "learning_rate": 3.370080384659369e-05, "loss": 2.8565, "num_input_tokens_seen": 32303824, "step": 15475 }, { "epoch": 2.525328330206379, "grad_norm": 12.3125, "learning_rate": 3.36915994620688e-05, "loss": 3.9078, "num_input_tokens_seen": 32314352, "step": 15480 }, { "epoch": 2.5261440574271963, "grad_norm": 6.46875, "learning_rate": 3.3682393737190035e-05, "loss": 1.7043, "num_input_tokens_seen": 32324912, "step": 15485 }, { "epoch": 2.526959784648014, "grad_norm": 6.3125, "learning_rate": 3.3673186673377054e-05, "loss": 2.1298, "num_input_tokens_seen": 32335200, "step": 15490 }, { "epoch": 2.527775511868831, "grad_norm": 10.0, "learning_rate": 3.366397827204969e-05, "loss": 2.9514, "num_input_tokens_seen": 32346208, "step": 15495 }, { "epoch": 2.5285912390896486, "grad_norm": 3.875, "learning_rate": 3.3654768534628e-05, "loss": 0.9498, "num_input_tokens_seen": 32356368, "step": 15500 }, { "epoch": 2.5294069663104657, "grad_norm": 18.875, "learning_rate": 3.3645557462532245e-05, "loss": 1.9841, "num_input_tokens_seen": 32367376, "step": 15505 }, { "epoch": 2.5302226935312833, "grad_norm": 7.21875, "learning_rate": 3.363634505718288e-05, "loss": 2.0988, "num_input_tokens_seen": 32378048, "step": 15510 }, { "epoch": 2.5310384207521004, "grad_norm": 2.21875, "learning_rate": 3.362713132000057e-05, "loss": 2.065, "num_input_tokens_seen": 32388208, "step": 15515 }, { "epoch": 2.531854147972918, "grad_norm": 1.7109375, "learning_rate": 3.36179162524062e-05, "loss": 1.6412, "num_input_tokens_seen": 32398720, "step": 15520 }, { "epoch": 2.532669875193735, "grad_norm": 9.75, "learning_rate": 3.3608699855820846e-05, "loss": 2.5209, "num_input_tokens_seen": 32409328, "step": 15525 }, { "epoch": 2.5334856024145527, "grad_norm": 12.1875, "learning_rate": 3.359948213166578e-05, "loss": 2.6615, "num_input_tokens_seen": 32420800, "step": 15530 }, { "epoch": 2.53430132963537, "grad_norm": 0.1474609375, "learning_rate": 3.359026308136252e-05, "loss": 2.5416, "num_input_tokens_seen": 32430784, "step": 15535 }, { "epoch": 2.5351170568561874, "grad_norm": 10.0, "learning_rate": 3.358104270633272e-05, "loss": 3.4055, "num_input_tokens_seen": 32441296, "step": 15540 }, { "epoch": 2.5359327840770045, "grad_norm": 12.125, "learning_rate": 3.357182100799831e-05, "loss": 1.6179, "num_input_tokens_seen": 32451552, "step": 15545 }, { "epoch": 2.536748511297822, "grad_norm": 6.125, "learning_rate": 3.3562597987781384e-05, "loss": 2.2592, "num_input_tokens_seen": 32463136, "step": 15550 }, { "epoch": 2.537564238518639, "grad_norm": 4.03125, "learning_rate": 3.355337364710424e-05, "loss": 2.8074, "num_input_tokens_seen": 32472416, "step": 15555 }, { "epoch": 2.538379965739457, "grad_norm": 9.0625, "learning_rate": 3.354414798738939e-05, "loss": 2.3342, "num_input_tokens_seen": 32481344, "step": 15560 }, { "epoch": 2.539195692960274, "grad_norm": 4.03125, "learning_rate": 3.353492101005955e-05, "loss": 1.4057, "num_input_tokens_seen": 32491648, "step": 15565 }, { "epoch": 2.5400114201810915, "grad_norm": 3.265625, "learning_rate": 3.352569271653763e-05, "loss": 2.158, "num_input_tokens_seen": 32503072, "step": 15570 }, { "epoch": 2.5408271474019086, "grad_norm": 5.84375, "learning_rate": 3.351646310824675e-05, "loss": 2.0664, "num_input_tokens_seen": 32513504, "step": 15575 }, { "epoch": 2.541642874622726, "grad_norm": 5.75, "learning_rate": 3.350723218661023e-05, "loss": 2.6191, "num_input_tokens_seen": 32525040, "step": 15580 }, { "epoch": 2.5424586018435438, "grad_norm": 6.125, "learning_rate": 3.349799995305162e-05, "loss": 2.4312, "num_input_tokens_seen": 32536208, "step": 15585 }, { "epoch": 2.543274329064361, "grad_norm": 7.6875, "learning_rate": 3.348876640899461e-05, "loss": 1.7887, "num_input_tokens_seen": 32546080, "step": 15590 }, { "epoch": 2.544090056285178, "grad_norm": 7.28125, "learning_rate": 3.3479531555863144e-05, "loss": 3.3634, "num_input_tokens_seen": 32555104, "step": 15595 }, { "epoch": 2.5449057835059956, "grad_norm": 5.875, "learning_rate": 3.3470295395081344e-05, "loss": 1.2673, "num_input_tokens_seen": 32566096, "step": 15600 }, { "epoch": 2.5449057835059956, "eval_loss": 2.535526990890503, "eval_runtime": 135.1285, "eval_samples_per_second": 20.166, "eval_steps_per_second": 10.087, "num_input_tokens_seen": 32566096, "step": 15600 }, { "epoch": 2.545721510726813, "grad_norm": 3.1875, "learning_rate": 3.3461057928073556e-05, "loss": 3.0945, "num_input_tokens_seen": 32575488, "step": 15605 }, { "epoch": 2.5465372379476303, "grad_norm": 15.1875, "learning_rate": 3.345181915626431e-05, "loss": 2.1694, "num_input_tokens_seen": 32586816, "step": 15610 }, { "epoch": 2.5473529651684474, "grad_norm": 4.25, "learning_rate": 3.344257908107834e-05, "loss": 2.726, "num_input_tokens_seen": 32597904, "step": 15615 }, { "epoch": 2.548168692389265, "grad_norm": 7.71875, "learning_rate": 3.343333770394058e-05, "loss": 1.7838, "num_input_tokens_seen": 32607696, "step": 15620 }, { "epoch": 2.5489844196100826, "grad_norm": 2.0, "learning_rate": 3.342409502627616e-05, "loss": 2.8575, "num_input_tokens_seen": 32617664, "step": 15625 }, { "epoch": 2.5498001468308997, "grad_norm": 5.71875, "learning_rate": 3.341485104951043e-05, "loss": 1.4401, "num_input_tokens_seen": 32626624, "step": 15630 }, { "epoch": 2.550615874051717, "grad_norm": 13.375, "learning_rate": 3.340560577506892e-05, "loss": 3.2228, "num_input_tokens_seen": 32635344, "step": 15635 }, { "epoch": 2.5514316012725344, "grad_norm": 4.8125, "learning_rate": 3.339635920437735e-05, "loss": 1.6663, "num_input_tokens_seen": 32646112, "step": 15640 }, { "epoch": 2.552247328493352, "grad_norm": 7.84375, "learning_rate": 3.338711133886169e-05, "loss": 0.8318, "num_input_tokens_seen": 32656528, "step": 15645 }, { "epoch": 2.553063055714169, "grad_norm": 2.15625, "learning_rate": 3.3377862179948064e-05, "loss": 2.9447, "num_input_tokens_seen": 32666688, "step": 15650 }, { "epoch": 2.5538787829349863, "grad_norm": 6.15625, "learning_rate": 3.336861172906281e-05, "loss": 1.6852, "num_input_tokens_seen": 32677632, "step": 15655 }, { "epoch": 2.554694510155804, "grad_norm": 5.8125, "learning_rate": 3.335935998763245e-05, "loss": 2.5543, "num_input_tokens_seen": 32688016, "step": 15660 }, { "epoch": 2.5555102373766214, "grad_norm": 4.25, "learning_rate": 3.3350106957083744e-05, "loss": 2.012, "num_input_tokens_seen": 32699632, "step": 15665 }, { "epoch": 2.5563259645974385, "grad_norm": 5.40625, "learning_rate": 3.33408526388436e-05, "loss": 2.4287, "num_input_tokens_seen": 32709728, "step": 15670 }, { "epoch": 2.557141691818256, "grad_norm": 6.40625, "learning_rate": 3.3331597034339166e-05, "loss": 2.2699, "num_input_tokens_seen": 32719408, "step": 15675 }, { "epoch": 2.5579574190390733, "grad_norm": 6.34375, "learning_rate": 3.3322340144997764e-05, "loss": 4.4912, "num_input_tokens_seen": 32729776, "step": 15680 }, { "epoch": 2.558773146259891, "grad_norm": 5.96875, "learning_rate": 3.331308197224693e-05, "loss": 1.2452, "num_input_tokens_seen": 32739136, "step": 15685 }, { "epoch": 2.559588873480708, "grad_norm": 5.625, "learning_rate": 3.330382251751438e-05, "loss": 1.7019, "num_input_tokens_seen": 32750304, "step": 15690 }, { "epoch": 2.5604046007015255, "grad_norm": 10.8125, "learning_rate": 3.3294561782228054e-05, "loss": 2.8795, "num_input_tokens_seen": 32760592, "step": 15695 }, { "epoch": 2.5612203279223427, "grad_norm": 12.1875, "learning_rate": 3.328529976781607e-05, "loss": 3.287, "num_input_tokens_seen": 32770800, "step": 15700 }, { "epoch": 2.5620360551431602, "grad_norm": 8.1875, "learning_rate": 3.327603647570673e-05, "loss": 3.8624, "num_input_tokens_seen": 32781408, "step": 15705 }, { "epoch": 2.5628517823639774, "grad_norm": 10.5, "learning_rate": 3.326677190732857e-05, "loss": 1.5547, "num_input_tokens_seen": 32793712, "step": 15710 }, { "epoch": 2.563667509584795, "grad_norm": 8.3125, "learning_rate": 3.325750606411029e-05, "loss": 3.686, "num_input_tokens_seen": 32804608, "step": 15715 }, { "epoch": 2.564483236805612, "grad_norm": 3.484375, "learning_rate": 3.3248238947480804e-05, "loss": 2.6634, "num_input_tokens_seen": 32814416, "step": 15720 }, { "epoch": 2.5652989640264297, "grad_norm": 1.4296875, "learning_rate": 3.323897055886922e-05, "loss": 1.2281, "num_input_tokens_seen": 32824672, "step": 15725 }, { "epoch": 2.566114691247247, "grad_norm": 3.53125, "learning_rate": 3.322970089970484e-05, "loss": 2.3854, "num_input_tokens_seen": 32834528, "step": 15730 }, { "epoch": 2.5669304184680644, "grad_norm": 8.6875, "learning_rate": 3.3220429971417165e-05, "loss": 1.2117, "num_input_tokens_seen": 32844496, "step": 15735 }, { "epoch": 2.5677461456888815, "grad_norm": 8.5625, "learning_rate": 3.321115777543588e-05, "loss": 3.9821, "num_input_tokens_seen": 32852992, "step": 15740 }, { "epoch": 2.568561872909699, "grad_norm": 4.5625, "learning_rate": 3.320188431319088e-05, "loss": 2.4435, "num_input_tokens_seen": 32862832, "step": 15745 }, { "epoch": 2.569377600130516, "grad_norm": 7.28125, "learning_rate": 3.319260958611224e-05, "loss": 2.8928, "num_input_tokens_seen": 32873168, "step": 15750 }, { "epoch": 2.5701933273513338, "grad_norm": 15.25, "learning_rate": 3.3183333595630256e-05, "loss": 2.9744, "num_input_tokens_seen": 32882624, "step": 15755 }, { "epoch": 2.5710090545721513, "grad_norm": 9.5625, "learning_rate": 3.317405634317538e-05, "loss": 3.4959, "num_input_tokens_seen": 32893328, "step": 15760 }, { "epoch": 2.5718247817929685, "grad_norm": 12.0, "learning_rate": 3.3164777830178315e-05, "loss": 2.9176, "num_input_tokens_seen": 32904288, "step": 15765 }, { "epoch": 2.5726405090137856, "grad_norm": 6.875, "learning_rate": 3.315549805806989e-05, "loss": 2.0804, "num_input_tokens_seen": 32914384, "step": 15770 }, { "epoch": 2.573456236234603, "grad_norm": 7.1875, "learning_rate": 3.314621702828118e-05, "loss": 2.2735, "num_input_tokens_seen": 32924400, "step": 15775 }, { "epoch": 2.5742719634554208, "grad_norm": 4.71875, "learning_rate": 3.313693474224342e-05, "loss": 2.2886, "num_input_tokens_seen": 32935856, "step": 15780 }, { "epoch": 2.575087690676238, "grad_norm": 4.15625, "learning_rate": 3.312765120138809e-05, "loss": 1.0562, "num_input_tokens_seen": 32948160, "step": 15785 }, { "epoch": 2.575903417897055, "grad_norm": 5.125, "learning_rate": 3.311836640714679e-05, "loss": 1.2024, "num_input_tokens_seen": 32959840, "step": 15790 }, { "epoch": 2.5767191451178726, "grad_norm": 9.375, "learning_rate": 3.310908036095137e-05, "loss": 2.3837, "num_input_tokens_seen": 32969856, "step": 15795 }, { "epoch": 2.57753487233869, "grad_norm": 9.8125, "learning_rate": 3.309979306423386e-05, "loss": 2.9281, "num_input_tokens_seen": 32981664, "step": 15800 }, { "epoch": 2.57753487233869, "eval_loss": 2.5533969402313232, "eval_runtime": 134.8638, "eval_samples_per_second": 20.206, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 32981664, "step": 15800 }, { "epoch": 2.5783505995595073, "grad_norm": 2.078125, "learning_rate": 3.309050451842647e-05, "loss": 3.3276, "num_input_tokens_seen": 32991168, "step": 15805 }, { "epoch": 2.5791663267803244, "grad_norm": 10.0625, "learning_rate": 3.3081214724961604e-05, "loss": 2.1978, "num_input_tokens_seen": 33001728, "step": 15810 }, { "epoch": 2.579982054001142, "grad_norm": 9.125, "learning_rate": 3.307192368527188e-05, "loss": 2.7747, "num_input_tokens_seen": 33011744, "step": 15815 }, { "epoch": 2.5807977812219596, "grad_norm": 3.609375, "learning_rate": 3.306263140079008e-05, "loss": 3.1538, "num_input_tokens_seen": 33022016, "step": 15820 }, { "epoch": 2.5816135084427767, "grad_norm": 19.75, "learning_rate": 3.30533378729492e-05, "loss": 1.7303, "num_input_tokens_seen": 33032480, "step": 15825 }, { "epoch": 2.582429235663594, "grad_norm": 3.625, "learning_rate": 3.304404310318242e-05, "loss": 1.2063, "num_input_tokens_seen": 33042336, "step": 15830 }, { "epoch": 2.5832449628844114, "grad_norm": 7.28125, "learning_rate": 3.3034747092923105e-05, "loss": 2.3761, "num_input_tokens_seen": 33053264, "step": 15835 }, { "epoch": 2.584060690105229, "grad_norm": 11.5, "learning_rate": 3.3025449843604806e-05, "loss": 2.8879, "num_input_tokens_seen": 33064032, "step": 15840 }, { "epoch": 2.584876417326046, "grad_norm": 7.25, "learning_rate": 3.30161513566613e-05, "loss": 1.9474, "num_input_tokens_seen": 33074544, "step": 15845 }, { "epoch": 2.5856921445468637, "grad_norm": 6.5625, "learning_rate": 3.3006851633526506e-05, "loss": 2.4332, "num_input_tokens_seen": 33085008, "step": 15850 }, { "epoch": 2.586507871767681, "grad_norm": 8.25, "learning_rate": 3.2997550675634584e-05, "loss": 1.8788, "num_input_tokens_seen": 33095200, "step": 15855 }, { "epoch": 2.5873235989884984, "grad_norm": 8.875, "learning_rate": 3.2988248484419825e-05, "loss": 3.2416, "num_input_tokens_seen": 33105120, "step": 15860 }, { "epoch": 2.5881393262093155, "grad_norm": 6.625, "learning_rate": 3.2978945061316776e-05, "loss": 1.7235, "num_input_tokens_seen": 33115440, "step": 15865 }, { "epoch": 2.588955053430133, "grad_norm": 8.25, "learning_rate": 3.296964040776013e-05, "loss": 2.5833, "num_input_tokens_seen": 33125728, "step": 15870 }, { "epoch": 2.5897707806509502, "grad_norm": 8.375, "learning_rate": 3.296033452518478e-05, "loss": 2.4218, "num_input_tokens_seen": 33136224, "step": 15875 }, { "epoch": 2.590586507871768, "grad_norm": 2.515625, "learning_rate": 3.2951027415025806e-05, "loss": 1.9913, "num_input_tokens_seen": 33147344, "step": 15880 }, { "epoch": 2.591402235092585, "grad_norm": 5.125, "learning_rate": 3.294171907871849e-05, "loss": 3.3177, "num_input_tokens_seen": 33156096, "step": 15885 }, { "epoch": 2.5922179623134025, "grad_norm": 4.34375, "learning_rate": 3.293240951769828e-05, "loss": 2.6984, "num_input_tokens_seen": 33165744, "step": 15890 }, { "epoch": 2.5930336895342196, "grad_norm": 7.09375, "learning_rate": 3.2923098733400846e-05, "loss": 1.8384, "num_input_tokens_seen": 33176192, "step": 15895 }, { "epoch": 2.593849416755037, "grad_norm": 7.4375, "learning_rate": 3.291378672726202e-05, "loss": 2.2854, "num_input_tokens_seen": 33186432, "step": 15900 }, { "epoch": 2.5946651439758543, "grad_norm": 10.0625, "learning_rate": 3.2904473500717824e-05, "loss": 2.9531, "num_input_tokens_seen": 33197696, "step": 15905 }, { "epoch": 2.595480871196672, "grad_norm": 10.0, "learning_rate": 3.289515905520449e-05, "loss": 2.0581, "num_input_tokens_seen": 33206864, "step": 15910 }, { "epoch": 2.596296598417489, "grad_norm": 10.0625, "learning_rate": 3.288584339215841e-05, "loss": 2.2463, "num_input_tokens_seen": 33218080, "step": 15915 }, { "epoch": 2.5971123256383066, "grad_norm": 6.6875, "learning_rate": 3.287652651301617e-05, "loss": 2.7555, "num_input_tokens_seen": 33228848, "step": 15920 }, { "epoch": 2.5979280528591238, "grad_norm": 9.1875, "learning_rate": 3.286720841921457e-05, "loss": 3.1506, "num_input_tokens_seen": 33239872, "step": 15925 }, { "epoch": 2.5987437800799413, "grad_norm": 5.46875, "learning_rate": 3.285788911219056e-05, "loss": 2.1683, "num_input_tokens_seen": 33250592, "step": 15930 }, { "epoch": 2.5995595073007585, "grad_norm": 12.0625, "learning_rate": 3.284856859338131e-05, "loss": 2.8576, "num_input_tokens_seen": 33261488, "step": 15935 }, { "epoch": 2.600375234521576, "grad_norm": 6.90625, "learning_rate": 3.283924686422414e-05, "loss": 2.319, "num_input_tokens_seen": 33273872, "step": 15940 }, { "epoch": 2.601190961742393, "grad_norm": 9.1875, "learning_rate": 3.282992392615659e-05, "loss": 3.1293, "num_input_tokens_seen": 33284208, "step": 15945 }, { "epoch": 2.6020066889632107, "grad_norm": 11.25, "learning_rate": 3.282059978061638e-05, "loss": 2.5991, "num_input_tokens_seen": 33295584, "step": 15950 }, { "epoch": 2.6028224161840283, "grad_norm": 5.5625, "learning_rate": 3.28112744290414e-05, "loss": 2.2707, "num_input_tokens_seen": 33305872, "step": 15955 }, { "epoch": 2.6036381434048455, "grad_norm": 8.4375, "learning_rate": 3.280194787286974e-05, "loss": 3.0486, "num_input_tokens_seen": 33317712, "step": 15960 }, { "epoch": 2.6044538706256626, "grad_norm": 4.59375, "learning_rate": 3.2792620113539674e-05, "loss": 1.3371, "num_input_tokens_seen": 33328944, "step": 15965 }, { "epoch": 2.60526959784648, "grad_norm": 2.9375, "learning_rate": 3.278329115248966e-05, "loss": 2.552, "num_input_tokens_seen": 33340816, "step": 15970 }, { "epoch": 2.6060853250672977, "grad_norm": 3.65625, "learning_rate": 3.277396099115834e-05, "loss": 2.1152, "num_input_tokens_seen": 33351152, "step": 15975 }, { "epoch": 2.606901052288115, "grad_norm": 3.21875, "learning_rate": 3.276462963098454e-05, "loss": 2.3813, "num_input_tokens_seen": 33361680, "step": 15980 }, { "epoch": 2.607716779508932, "grad_norm": 4.34375, "learning_rate": 3.275529707340728e-05, "loss": 1.8114, "num_input_tokens_seen": 33372992, "step": 15985 }, { "epoch": 2.6085325067297496, "grad_norm": 10.1875, "learning_rate": 3.274596331986574e-05, "loss": 2.5709, "num_input_tokens_seen": 33382656, "step": 15990 }, { "epoch": 2.609348233950567, "grad_norm": 8.8125, "learning_rate": 3.273662837179932e-05, "loss": 2.1711, "num_input_tokens_seen": 33393824, "step": 15995 }, { "epoch": 2.6101639611713843, "grad_norm": 15.6875, "learning_rate": 3.272729223064758e-05, "loss": 3.8721, "num_input_tokens_seen": 33403328, "step": 16000 }, { "epoch": 2.6101639611713843, "eval_loss": 2.542147636413574, "eval_runtime": 134.8552, "eval_samples_per_second": 20.207, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 33403328, "step": 16000 }, { "epoch": 2.6109796883922014, "grad_norm": 8.0, "learning_rate": 3.2717954897850264e-05, "loss": 2.5919, "num_input_tokens_seen": 33413568, "step": 16005 }, { "epoch": 2.611795415613019, "grad_norm": 3.125, "learning_rate": 3.270861637484733e-05, "loss": 1.279, "num_input_tokens_seen": 33424032, "step": 16010 }, { "epoch": 2.6126111428338366, "grad_norm": 1.3671875, "learning_rate": 3.2699276663078867e-05, "loss": 1.663, "num_input_tokens_seen": 33435056, "step": 16015 }, { "epoch": 2.6134268700546537, "grad_norm": 4.96875, "learning_rate": 3.268993576398519e-05, "loss": 1.964, "num_input_tokens_seen": 33447312, "step": 16020 }, { "epoch": 2.614242597275471, "grad_norm": 6.96875, "learning_rate": 3.268059367900678e-05, "loss": 2.8165, "num_input_tokens_seen": 33458480, "step": 16025 }, { "epoch": 2.6150583244962884, "grad_norm": 5.75, "learning_rate": 3.26712504095843e-05, "loss": 1.5581, "num_input_tokens_seen": 33468512, "step": 16030 }, { "epoch": 2.615874051717106, "grad_norm": 4.53125, "learning_rate": 3.2661905957158615e-05, "loss": 1.5841, "num_input_tokens_seen": 33479328, "step": 16035 }, { "epoch": 2.616689778937923, "grad_norm": 9.6875, "learning_rate": 3.2652560323170734e-05, "loss": 2.9167, "num_input_tokens_seen": 33490416, "step": 16040 }, { "epoch": 2.6175055061587407, "grad_norm": 7.25, "learning_rate": 3.264321350906189e-05, "loss": 2.464, "num_input_tokens_seen": 33499504, "step": 16045 }, { "epoch": 2.618321233379558, "grad_norm": 10.625, "learning_rate": 3.263386551627346e-05, "loss": 3.0481, "num_input_tokens_seen": 33509056, "step": 16050 }, { "epoch": 2.6191369606003754, "grad_norm": 5.46875, "learning_rate": 3.2624516346247055e-05, "loss": 2.2434, "num_input_tokens_seen": 33519584, "step": 16055 }, { "epoch": 2.6199526878211925, "grad_norm": 7.09375, "learning_rate": 3.2615166000424404e-05, "loss": 1.7911, "num_input_tokens_seen": 33528752, "step": 16060 }, { "epoch": 2.62076841504201, "grad_norm": 6.125, "learning_rate": 3.260581448024745e-05, "loss": 1.6875, "num_input_tokens_seen": 33538528, "step": 16065 }, { "epoch": 2.621584142262827, "grad_norm": 9.3125, "learning_rate": 3.2596461787158335e-05, "loss": 3.255, "num_input_tokens_seen": 33549104, "step": 16070 }, { "epoch": 2.622399869483645, "grad_norm": 4.96875, "learning_rate": 3.258710792259934e-05, "loss": 3.9284, "num_input_tokens_seen": 33559680, "step": 16075 }, { "epoch": 2.623215596704462, "grad_norm": 3.90625, "learning_rate": 3.257775288801296e-05, "loss": 2.4367, "num_input_tokens_seen": 33570480, "step": 16080 }, { "epoch": 2.6240313239252795, "grad_norm": 10.625, "learning_rate": 3.256839668484186e-05, "loss": 1.7586, "num_input_tokens_seen": 33580912, "step": 16085 }, { "epoch": 2.6248470511460966, "grad_norm": 6.28125, "learning_rate": 3.255903931452888e-05, "loss": 1.8664, "num_input_tokens_seen": 33592112, "step": 16090 }, { "epoch": 2.625662778366914, "grad_norm": 6.5625, "learning_rate": 3.2549680778517045e-05, "loss": 2.1405, "num_input_tokens_seen": 33602320, "step": 16095 }, { "epoch": 2.6264785055877313, "grad_norm": 15.375, "learning_rate": 3.2540321078249556e-05, "loss": 3.7148, "num_input_tokens_seen": 33613648, "step": 16100 }, { "epoch": 2.627294232808549, "grad_norm": 6.1875, "learning_rate": 3.2530960215169795e-05, "loss": 2.2121, "num_input_tokens_seen": 33623328, "step": 16105 }, { "epoch": 2.628109960029366, "grad_norm": 2.046875, "learning_rate": 3.2521598190721345e-05, "loss": 1.9363, "num_input_tokens_seen": 33632848, "step": 16110 }, { "epoch": 2.6289256872501836, "grad_norm": 7.03125, "learning_rate": 3.251223500634792e-05, "loss": 1.5073, "num_input_tokens_seen": 33641904, "step": 16115 }, { "epoch": 2.6297414144710007, "grad_norm": 7.8125, "learning_rate": 3.2502870663493445e-05, "loss": 1.6046, "num_input_tokens_seen": 33653152, "step": 16120 }, { "epoch": 2.6305571416918183, "grad_norm": 3.84375, "learning_rate": 3.249350516360203e-05, "loss": 1.76, "num_input_tokens_seen": 33663984, "step": 16125 }, { "epoch": 2.631372868912636, "grad_norm": 5.8125, "learning_rate": 3.248413850811797e-05, "loss": 1.8946, "num_input_tokens_seen": 33675664, "step": 16130 }, { "epoch": 2.632188596133453, "grad_norm": 4.40625, "learning_rate": 3.2474770698485677e-05, "loss": 1.9752, "num_input_tokens_seen": 33686080, "step": 16135 }, { "epoch": 2.63300432335427, "grad_norm": 8.0625, "learning_rate": 3.246540173614983e-05, "loss": 2.8633, "num_input_tokens_seen": 33695568, "step": 16140 }, { "epoch": 2.6338200505750877, "grad_norm": 6.1875, "learning_rate": 3.2456031622555197e-05, "loss": 1.7717, "num_input_tokens_seen": 33708096, "step": 16145 }, { "epoch": 2.6346357777959053, "grad_norm": 7.875, "learning_rate": 3.2446660359146794e-05, "loss": 2.1496, "num_input_tokens_seen": 33718976, "step": 16150 }, { "epoch": 2.6354515050167224, "grad_norm": 5.0625, "learning_rate": 3.2437287947369786e-05, "loss": 3.0623, "num_input_tokens_seen": 33730048, "step": 16155 }, { "epoch": 2.6362672322375396, "grad_norm": 6.9375, "learning_rate": 3.2427914388669525e-05, "loss": 3.4004, "num_input_tokens_seen": 33740688, "step": 16160 }, { "epoch": 2.637082959458357, "grad_norm": 11.0, "learning_rate": 3.241853968449151e-05, "loss": 3.1018, "num_input_tokens_seen": 33750912, "step": 16165 }, { "epoch": 2.6378986866791747, "grad_norm": 11.3125, "learning_rate": 3.240916383628144e-05, "loss": 2.6743, "num_input_tokens_seen": 33761520, "step": 16170 }, { "epoch": 2.638714413899992, "grad_norm": 9.75, "learning_rate": 3.239978684548521e-05, "loss": 3.6831, "num_input_tokens_seen": 33772096, "step": 16175 }, { "epoch": 2.639530141120809, "grad_norm": 9.5, "learning_rate": 3.239040871354885e-05, "loss": 3.7123, "num_input_tokens_seen": 33783264, "step": 16180 }, { "epoch": 2.6403458683416265, "grad_norm": 3.4375, "learning_rate": 3.2381029441918596e-05, "loss": 1.7228, "num_input_tokens_seen": 33793840, "step": 16185 }, { "epoch": 2.641161595562444, "grad_norm": 5.8125, "learning_rate": 3.2371649032040845e-05, "loss": 2.4894, "num_input_tokens_seen": 33804768, "step": 16190 }, { "epoch": 2.6419773227832613, "grad_norm": 9.5625, "learning_rate": 3.2362267485362174e-05, "loss": 2.3563, "num_input_tokens_seen": 33815648, "step": 16195 }, { "epoch": 2.6427930500040784, "grad_norm": 6.375, "learning_rate": 3.235288480332934e-05, "loss": 1.8551, "num_input_tokens_seen": 33827808, "step": 16200 }, { "epoch": 2.6427930500040784, "eval_loss": 2.546783208847046, "eval_runtime": 134.8424, "eval_samples_per_second": 20.209, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 33827808, "step": 16200 }, { "epoch": 2.643608777224896, "grad_norm": 9.4375, "learning_rate": 3.234350098738927e-05, "loss": 2.7817, "num_input_tokens_seen": 33838544, "step": 16205 }, { "epoch": 2.6444245044457135, "grad_norm": 10.375, "learning_rate": 3.233411603898906e-05, "loss": 2.1702, "num_input_tokens_seen": 33849280, "step": 16210 }, { "epoch": 2.6452402316665307, "grad_norm": 7.96875, "learning_rate": 3.232472995957599e-05, "loss": 1.6365, "num_input_tokens_seen": 33859808, "step": 16215 }, { "epoch": 2.6460559588873482, "grad_norm": 10.0, "learning_rate": 3.231534275059751e-05, "loss": 3.3369, "num_input_tokens_seen": 33870112, "step": 16220 }, { "epoch": 2.6468716861081654, "grad_norm": 2.234375, "learning_rate": 3.230595441350125e-05, "loss": 2.2198, "num_input_tokens_seen": 33880832, "step": 16225 }, { "epoch": 2.647687413328983, "grad_norm": 8.25, "learning_rate": 3.2296564949735e-05, "loss": 4.5615, "num_input_tokens_seen": 33890960, "step": 16230 }, { "epoch": 2.6485031405498, "grad_norm": 8.3125, "learning_rate": 3.228717436074675e-05, "loss": 2.0312, "num_input_tokens_seen": 33900816, "step": 16235 }, { "epoch": 2.6493188677706176, "grad_norm": 9.0, "learning_rate": 3.227778264798463e-05, "loss": 2.5483, "num_input_tokens_seen": 33910464, "step": 16240 }, { "epoch": 2.6501345949914348, "grad_norm": 8.4375, "learning_rate": 3.226838981289698e-05, "loss": 2.0637, "num_input_tokens_seen": 33921520, "step": 16245 }, { "epoch": 2.6509503222122524, "grad_norm": 0.166015625, "learning_rate": 3.225899585693227e-05, "loss": 0.6785, "num_input_tokens_seen": 33931552, "step": 16250 }, { "epoch": 2.6517660494330695, "grad_norm": 8.375, "learning_rate": 3.224960078153918e-05, "loss": 2.1385, "num_input_tokens_seen": 33943072, "step": 16255 }, { "epoch": 2.652581776653887, "grad_norm": 7.0625, "learning_rate": 3.224020458816655e-05, "loss": 3.2142, "num_input_tokens_seen": 33952448, "step": 16260 }, { "epoch": 2.653397503874704, "grad_norm": 5.4375, "learning_rate": 3.223080727826337e-05, "loss": 3.8898, "num_input_tokens_seen": 33963264, "step": 16265 }, { "epoch": 2.6542132310955218, "grad_norm": 9.125, "learning_rate": 3.222140885327885e-05, "loss": 3.6363, "num_input_tokens_seen": 33974368, "step": 16270 }, { "epoch": 2.655028958316339, "grad_norm": 14.25, "learning_rate": 3.221200931466234e-05, "loss": 2.9565, "num_input_tokens_seen": 33985248, "step": 16275 }, { "epoch": 2.6558446855371565, "grad_norm": 3.703125, "learning_rate": 3.220260866386336e-05, "loss": 2.5862, "num_input_tokens_seen": 33995632, "step": 16280 }, { "epoch": 2.6566604127579736, "grad_norm": 9.625, "learning_rate": 3.21932069023316e-05, "loss": 3.0562, "num_input_tokens_seen": 34006160, "step": 16285 }, { "epoch": 2.657476139978791, "grad_norm": 5.875, "learning_rate": 3.218380403151695e-05, "loss": 2.0022, "num_input_tokens_seen": 34016576, "step": 16290 }, { "epoch": 2.6582918671996083, "grad_norm": 5.28125, "learning_rate": 3.217440005286943e-05, "loss": 1.3833, "num_input_tokens_seen": 34027776, "step": 16295 }, { "epoch": 2.659107594420426, "grad_norm": 4.78125, "learning_rate": 3.216499496783928e-05, "loss": 2.7437, "num_input_tokens_seen": 34038256, "step": 16300 }, { "epoch": 2.659923321641243, "grad_norm": 12.3125, "learning_rate": 3.2155588777876856e-05, "loss": 2.2299, "num_input_tokens_seen": 34046896, "step": 16305 }, { "epoch": 2.6607390488620606, "grad_norm": 8.0625, "learning_rate": 3.214618148443273e-05, "loss": 2.6666, "num_input_tokens_seen": 34056896, "step": 16310 }, { "epoch": 2.6615547760828777, "grad_norm": 4.875, "learning_rate": 3.2136773088957595e-05, "loss": 2.8164, "num_input_tokens_seen": 34066672, "step": 16315 }, { "epoch": 2.6623705033036953, "grad_norm": 7.9375, "learning_rate": 3.2127363592902374e-05, "loss": 3.217, "num_input_tokens_seen": 34078384, "step": 16320 }, { "epoch": 2.663186230524513, "grad_norm": 7.3125, "learning_rate": 3.211795299771812e-05, "loss": 3.2504, "num_input_tokens_seen": 34087568, "step": 16325 }, { "epoch": 2.66400195774533, "grad_norm": 5.21875, "learning_rate": 3.210854130485605e-05, "loss": 2.2324, "num_input_tokens_seen": 34098896, "step": 16330 }, { "epoch": 2.664817684966147, "grad_norm": 10.875, "learning_rate": 3.209912851576759e-05, "loss": 2.5861, "num_input_tokens_seen": 34108032, "step": 16335 }, { "epoch": 2.6656334121869647, "grad_norm": 2.484375, "learning_rate": 3.208971463190431e-05, "loss": 2.3717, "num_input_tokens_seen": 34118448, "step": 16340 }, { "epoch": 2.6664491394077823, "grad_norm": 10.3125, "learning_rate": 3.208029965471793e-05, "loss": 2.9142, "num_input_tokens_seen": 34129936, "step": 16345 }, { "epoch": 2.6672648666285994, "grad_norm": 4.84375, "learning_rate": 3.2070883585660364e-05, "loss": 1.8916, "num_input_tokens_seen": 34141632, "step": 16350 }, { "epoch": 2.6680805938494165, "grad_norm": 4.65625, "learning_rate": 3.20614664261837e-05, "loss": 2.1912, "num_input_tokens_seen": 34150912, "step": 16355 }, { "epoch": 2.668896321070234, "grad_norm": 2.390625, "learning_rate": 3.205204817774016e-05, "loss": 1.105, "num_input_tokens_seen": 34161312, "step": 16360 }, { "epoch": 2.6697120482910517, "grad_norm": 11.875, "learning_rate": 3.204262884178218e-05, "loss": 2.2465, "num_input_tokens_seen": 34173104, "step": 16365 }, { "epoch": 2.670527775511869, "grad_norm": 6.15625, "learning_rate": 3.2033208419762314e-05, "loss": 1.3373, "num_input_tokens_seen": 34183376, "step": 16370 }, { "epoch": 2.671343502732686, "grad_norm": 4.1875, "learning_rate": 3.2023786913133344e-05, "loss": 1.5983, "num_input_tokens_seen": 34193168, "step": 16375 }, { "epoch": 2.6721592299535035, "grad_norm": 6.28125, "learning_rate": 3.201436432334816e-05, "loss": 2.2728, "num_input_tokens_seen": 34203664, "step": 16380 }, { "epoch": 2.672974957174321, "grad_norm": 7.125, "learning_rate": 3.2004940651859844e-05, "loss": 3.2168, "num_input_tokens_seen": 34214544, "step": 16385 }, { "epoch": 2.6737906843951382, "grad_norm": 5.625, "learning_rate": 3.1995515900121655e-05, "loss": 2.6577, "num_input_tokens_seen": 34225424, "step": 16390 }, { "epoch": 2.6746064116159554, "grad_norm": 7.21875, "learning_rate": 3.1986090069587e-05, "loss": 2.5505, "num_input_tokens_seen": 34236496, "step": 16395 }, { "epoch": 2.675422138836773, "grad_norm": 3.8125, "learning_rate": 3.1976663161709466e-05, "loss": 2.6711, "num_input_tokens_seen": 34245456, "step": 16400 }, { "epoch": 2.675422138836773, "eval_loss": 2.5521464347839355, "eval_runtime": 135.1348, "eval_samples_per_second": 20.165, "eval_steps_per_second": 10.086, "num_input_tokens_seen": 34245456, "step": 16400 }, { "epoch": 2.6762378660575905, "grad_norm": 6.3125, "learning_rate": 3.196723517794279e-05, "loss": 1.8623, "num_input_tokens_seen": 34256464, "step": 16405 }, { "epoch": 2.6770535932784076, "grad_norm": 6.375, "learning_rate": 3.19578061197409e-05, "loss": 3.2433, "num_input_tokens_seen": 34267728, "step": 16410 }, { "epoch": 2.677869320499225, "grad_norm": 4.125, "learning_rate": 3.194837598855787e-05, "loss": 2.6012, "num_input_tokens_seen": 34278192, "step": 16415 }, { "epoch": 2.6786850477200423, "grad_norm": 8.25, "learning_rate": 3.193894478584794e-05, "loss": 3.3096, "num_input_tokens_seen": 34288624, "step": 16420 }, { "epoch": 2.67950077494086, "grad_norm": 11.3125, "learning_rate": 3.192951251306553e-05, "loss": 3.0019, "num_input_tokens_seen": 34299536, "step": 16425 }, { "epoch": 2.680316502161677, "grad_norm": 8.1875, "learning_rate": 3.192007917166521e-05, "loss": 1.7351, "num_input_tokens_seen": 34310640, "step": 16430 }, { "epoch": 2.6811322293824946, "grad_norm": 7.375, "learning_rate": 3.191064476310171e-05, "loss": 3.6402, "num_input_tokens_seen": 34321392, "step": 16435 }, { "epoch": 2.6819479566033118, "grad_norm": 6.9375, "learning_rate": 3.1901209288829944e-05, "loss": 4.4265, "num_input_tokens_seen": 34332544, "step": 16440 }, { "epoch": 2.6827636838241293, "grad_norm": 6.40625, "learning_rate": 3.1891772750304985e-05, "loss": 3.1058, "num_input_tokens_seen": 34342800, "step": 16445 }, { "epoch": 2.6835794110449465, "grad_norm": 8.75, "learning_rate": 3.188233514898206e-05, "loss": 2.7022, "num_input_tokens_seen": 34353200, "step": 16450 }, { "epoch": 2.684395138265764, "grad_norm": 7.84375, "learning_rate": 3.187289648631657e-05, "loss": 1.7889, "num_input_tokens_seen": 34364784, "step": 16455 }, { "epoch": 2.685210865486581, "grad_norm": 8.1875, "learning_rate": 3.186345676376406e-05, "loss": 2.9518, "num_input_tokens_seen": 34375824, "step": 16460 }, { "epoch": 2.6860265927073987, "grad_norm": 3.171875, "learning_rate": 3.1854015982780275e-05, "loss": 2.4943, "num_input_tokens_seen": 34387168, "step": 16465 }, { "epoch": 2.686842319928216, "grad_norm": 13.5, "learning_rate": 3.1844574144821084e-05, "loss": 2.1364, "num_input_tokens_seen": 34398128, "step": 16470 }, { "epoch": 2.6876580471490334, "grad_norm": 4.625, "learning_rate": 3.1835131251342554e-05, "loss": 2.8713, "num_input_tokens_seen": 34409296, "step": 16475 }, { "epoch": 2.6884737743698506, "grad_norm": 3.828125, "learning_rate": 3.182568730380089e-05, "loss": 1.813, "num_input_tokens_seen": 34418224, "step": 16480 }, { "epoch": 2.689289501590668, "grad_norm": 11.1875, "learning_rate": 3.181624230365245e-05, "loss": 3.4145, "num_input_tokens_seen": 34428720, "step": 16485 }, { "epoch": 2.6901052288114853, "grad_norm": 10.6875, "learning_rate": 3.180679625235381e-05, "loss": 2.5642, "num_input_tokens_seen": 34438752, "step": 16490 }, { "epoch": 2.690920956032303, "grad_norm": 9.8125, "learning_rate": 3.1797349151361646e-05, "loss": 2.0636, "num_input_tokens_seen": 34449840, "step": 16495 }, { "epoch": 2.6917366832531204, "grad_norm": 6.59375, "learning_rate": 3.178790100213281e-05, "loss": 2.8, "num_input_tokens_seen": 34460352, "step": 16500 }, { "epoch": 2.6925524104739376, "grad_norm": 10.625, "learning_rate": 3.1778451806124346e-05, "loss": 2.9267, "num_input_tokens_seen": 34472368, "step": 16505 }, { "epoch": 2.6933681376947547, "grad_norm": 12.0625, "learning_rate": 3.176900156479342e-05, "loss": 3.2725, "num_input_tokens_seen": 34484160, "step": 16510 }, { "epoch": 2.6941838649155723, "grad_norm": 7.78125, "learning_rate": 3.17595502795974e-05, "loss": 3.5583, "num_input_tokens_seen": 34494000, "step": 16515 }, { "epoch": 2.69499959213639, "grad_norm": 8.4375, "learning_rate": 3.175009795199377e-05, "loss": 2.0964, "num_input_tokens_seen": 34503968, "step": 16520 }, { "epoch": 2.695815319357207, "grad_norm": 12.5625, "learning_rate": 3.1740644583440224e-05, "loss": 3.9194, "num_input_tokens_seen": 34514832, "step": 16525 }, { "epoch": 2.696631046578024, "grad_norm": 13.25, "learning_rate": 3.173119017539457e-05, "loss": 3.6509, "num_input_tokens_seen": 34525824, "step": 16530 }, { "epoch": 2.6974467737988417, "grad_norm": 4.625, "learning_rate": 3.172173472931479e-05, "loss": 3.5547, "num_input_tokens_seen": 34536832, "step": 16535 }, { "epoch": 2.6982625010196593, "grad_norm": 7.59375, "learning_rate": 3.1712278246659055e-05, "loss": 3.1259, "num_input_tokens_seen": 34546720, "step": 16540 }, { "epoch": 2.6990782282404764, "grad_norm": 5.28125, "learning_rate": 3.170282072888566e-05, "loss": 2.0688, "num_input_tokens_seen": 34555904, "step": 16545 }, { "epoch": 2.6998939554612935, "grad_norm": 2.484375, "learning_rate": 3.169336217745307e-05, "loss": 2.6751, "num_input_tokens_seen": 34566944, "step": 16550 }, { "epoch": 2.700709682682111, "grad_norm": 8.75, "learning_rate": 3.1683902593819924e-05, "loss": 2.8128, "num_input_tokens_seen": 34578752, "step": 16555 }, { "epoch": 2.7015254099029287, "grad_norm": 4.6875, "learning_rate": 3.1674441979445e-05, "loss": 2.1039, "num_input_tokens_seen": 34589856, "step": 16560 }, { "epoch": 2.702341137123746, "grad_norm": 11.125, "learning_rate": 3.166498033578725e-05, "loss": 3.8616, "num_input_tokens_seen": 34600896, "step": 16565 }, { "epoch": 2.703156864344563, "grad_norm": 6.8125, "learning_rate": 3.165551766430578e-05, "loss": 2.9954, "num_input_tokens_seen": 34611376, "step": 16570 }, { "epoch": 2.7039725915653805, "grad_norm": 11.0, "learning_rate": 3.164605396645984e-05, "loss": 0.9768, "num_input_tokens_seen": 34622896, "step": 16575 }, { "epoch": 2.704788318786198, "grad_norm": 2.8125, "learning_rate": 3.163658924370886e-05, "loss": 3.186, "num_input_tokens_seen": 34634352, "step": 16580 }, { "epoch": 2.705604046007015, "grad_norm": 6.25, "learning_rate": 3.1627123497512415e-05, "loss": 2.7933, "num_input_tokens_seen": 34645040, "step": 16585 }, { "epoch": 2.7064197732278323, "grad_norm": 6.75, "learning_rate": 3.1617656729330245e-05, "loss": 2.1957, "num_input_tokens_seen": 34654080, "step": 16590 }, { "epoch": 2.70723550044865, "grad_norm": 8.125, "learning_rate": 3.1608188940622255e-05, "loss": 2.1157, "num_input_tokens_seen": 34663200, "step": 16595 }, { "epoch": 2.7080512276694675, "grad_norm": 1.9296875, "learning_rate": 3.159872013284847e-05, "loss": 1.1239, "num_input_tokens_seen": 34673616, "step": 16600 }, { "epoch": 2.7080512276694675, "eval_loss": 2.545654535293579, "eval_runtime": 134.8327, "eval_samples_per_second": 20.21, "eval_steps_per_second": 10.109, "num_input_tokens_seen": 34673616, "step": 16600 }, { "epoch": 2.7088669548902846, "grad_norm": 13.625, "learning_rate": 3.1589250307469134e-05, "loss": 2.0076, "num_input_tokens_seen": 34684144, "step": 16605 }, { "epoch": 2.709682682111102, "grad_norm": 5.15625, "learning_rate": 3.1579779465944586e-05, "loss": 1.5802, "num_input_tokens_seen": 34695328, "step": 16610 }, { "epoch": 2.7104984093319193, "grad_norm": 4.625, "learning_rate": 3.1570307609735363e-05, "loss": 2.5241, "num_input_tokens_seen": 34706288, "step": 16615 }, { "epoch": 2.711314136552737, "grad_norm": 10.5, "learning_rate": 3.156083474030213e-05, "loss": 2.6136, "num_input_tokens_seen": 34717520, "step": 16620 }, { "epoch": 2.712129863773554, "grad_norm": 11.125, "learning_rate": 3.155136085910573e-05, "loss": 3.1711, "num_input_tokens_seen": 34728480, "step": 16625 }, { "epoch": 2.7129455909943716, "grad_norm": 10.375, "learning_rate": 3.154188596760717e-05, "loss": 3.0228, "num_input_tokens_seen": 34738880, "step": 16630 }, { "epoch": 2.7137613182151887, "grad_norm": 5.125, "learning_rate": 3.153241006726757e-05, "loss": 1.6732, "num_input_tokens_seen": 34749200, "step": 16635 }, { "epoch": 2.7145770454360063, "grad_norm": 3.515625, "learning_rate": 3.152293315954825e-05, "loss": 2.7277, "num_input_tokens_seen": 34759968, "step": 16640 }, { "epoch": 2.7153927726568234, "grad_norm": 9.375, "learning_rate": 3.1513455245910666e-05, "loss": 2.0066, "num_input_tokens_seen": 34769664, "step": 16645 }, { "epoch": 2.716208499877641, "grad_norm": 8.0, "learning_rate": 3.150397632781643e-05, "loss": 2.9875, "num_input_tokens_seen": 34780544, "step": 16650 }, { "epoch": 2.717024227098458, "grad_norm": 8.0, "learning_rate": 3.149449640672731e-05, "loss": 0.8494, "num_input_tokens_seen": 34790144, "step": 16655 }, { "epoch": 2.7178399543192757, "grad_norm": 5.375, "learning_rate": 3.148501548410523e-05, "loss": 2.4132, "num_input_tokens_seen": 34800832, "step": 16660 }, { "epoch": 2.718655681540093, "grad_norm": 4.65625, "learning_rate": 3.1475533561412256e-05, "loss": 2.1299, "num_input_tokens_seen": 34810576, "step": 16665 }, { "epoch": 2.7194714087609104, "grad_norm": 8.125, "learning_rate": 3.146605064011065e-05, "loss": 2.204, "num_input_tokens_seen": 34820976, "step": 16670 }, { "epoch": 2.7202871359817276, "grad_norm": 7.0, "learning_rate": 3.145656672166277e-05, "loss": 2.6839, "num_input_tokens_seen": 34830256, "step": 16675 }, { "epoch": 2.721102863202545, "grad_norm": 2.578125, "learning_rate": 3.144708180753116e-05, "loss": 1.743, "num_input_tokens_seen": 34840848, "step": 16680 }, { "epoch": 2.7219185904233623, "grad_norm": 11.1875, "learning_rate": 3.143759589917851e-05, "loss": 1.5557, "num_input_tokens_seen": 34851184, "step": 16685 }, { "epoch": 2.72273431764418, "grad_norm": 6.625, "learning_rate": 3.142810899806768e-05, "loss": 1.928, "num_input_tokens_seen": 34861232, "step": 16690 }, { "epoch": 2.7235500448649974, "grad_norm": 5.125, "learning_rate": 3.141862110566166e-05, "loss": 2.1565, "num_input_tokens_seen": 34869968, "step": 16695 }, { "epoch": 2.7243657720858145, "grad_norm": 1.859375, "learning_rate": 3.1409132223423606e-05, "loss": 1.488, "num_input_tokens_seen": 34881120, "step": 16700 }, { "epoch": 2.7251814993066317, "grad_norm": 2.40625, "learning_rate": 3.139964235281682e-05, "loss": 2.4342, "num_input_tokens_seen": 34891696, "step": 16705 }, { "epoch": 2.7259972265274492, "grad_norm": 0.1298828125, "learning_rate": 3.139015149530476e-05, "loss": 2.2172, "num_input_tokens_seen": 34901808, "step": 16710 }, { "epoch": 2.726812953748267, "grad_norm": 6.6875, "learning_rate": 3.1380659652351034e-05, "loss": 1.335, "num_input_tokens_seen": 34911216, "step": 16715 }, { "epoch": 2.727628680969084, "grad_norm": 10.0, "learning_rate": 3.137116682541941e-05, "loss": 2.4858, "num_input_tokens_seen": 34922304, "step": 16720 }, { "epoch": 2.728444408189901, "grad_norm": 3.46875, "learning_rate": 3.136167301597379e-05, "loss": 1.628, "num_input_tokens_seen": 34932336, "step": 16725 }, { "epoch": 2.7292601354107187, "grad_norm": 5.71875, "learning_rate": 3.1352178225478254e-05, "loss": 1.7993, "num_input_tokens_seen": 34942880, "step": 16730 }, { "epoch": 2.7300758626315362, "grad_norm": 8.25, "learning_rate": 3.1342682455396996e-05, "loss": 1.4979, "num_input_tokens_seen": 34952592, "step": 16735 }, { "epoch": 2.7308915898523534, "grad_norm": 12.125, "learning_rate": 3.133318570719441e-05, "loss": 2.1121, "num_input_tokens_seen": 34964592, "step": 16740 }, { "epoch": 2.7317073170731705, "grad_norm": 7.0, "learning_rate": 3.132368798233499e-05, "loss": 1.6761, "num_input_tokens_seen": 34975328, "step": 16745 }, { "epoch": 2.732523044293988, "grad_norm": 9.8125, "learning_rate": 3.131418928228342e-05, "loss": 2.7917, "num_input_tokens_seen": 34985088, "step": 16750 }, { "epoch": 2.7333387715148056, "grad_norm": 5.15625, "learning_rate": 3.1304689608504514e-05, "loss": 3.6144, "num_input_tokens_seen": 34994960, "step": 16755 }, { "epoch": 2.7341544987356228, "grad_norm": 8.0, "learning_rate": 3.129518896246324e-05, "loss": 2.2451, "num_input_tokens_seen": 35005184, "step": 16760 }, { "epoch": 2.73497022595644, "grad_norm": 2.5, "learning_rate": 3.128568734562472e-05, "loss": 2.0785, "num_input_tokens_seen": 35018112, "step": 16765 }, { "epoch": 2.7357859531772575, "grad_norm": 8.1875, "learning_rate": 3.127618475945421e-05, "loss": 2.3159, "num_input_tokens_seen": 35029856, "step": 16770 }, { "epoch": 2.736601680398075, "grad_norm": 6.40625, "learning_rate": 3.126668120541715e-05, "loss": 3.1607, "num_input_tokens_seen": 35040416, "step": 16775 }, { "epoch": 2.737417407618892, "grad_norm": 15.0625, "learning_rate": 3.1257176684979096e-05, "loss": 3.0275, "num_input_tokens_seen": 35051920, "step": 16780 }, { "epoch": 2.7382331348397098, "grad_norm": 4.75, "learning_rate": 3.124767119960576e-05, "loss": 3.6674, "num_input_tokens_seen": 35060656, "step": 16785 }, { "epoch": 2.739048862060527, "grad_norm": 3.8125, "learning_rate": 3.123816475076301e-05, "loss": 1.5737, "num_input_tokens_seen": 35072064, "step": 16790 }, { "epoch": 2.7398645892813445, "grad_norm": 6.4375, "learning_rate": 3.122865733991687e-05, "loss": 1.9718, "num_input_tokens_seen": 35081360, "step": 16795 }, { "epoch": 2.7406803165021616, "grad_norm": 7.625, "learning_rate": 3.1219148968533486e-05, "loss": 1.4874, "num_input_tokens_seen": 35089872, "step": 16800 }, { "epoch": 2.7406803165021616, "eval_loss": 2.5342445373535156, "eval_runtime": 134.8029, "eval_samples_per_second": 20.215, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 35089872, "step": 16800 }, { "epoch": 2.741496043722979, "grad_norm": 4.84375, "learning_rate": 3.120963963807918e-05, "loss": 1.1901, "num_input_tokens_seen": 35100608, "step": 16805 }, { "epoch": 2.7423117709437963, "grad_norm": 12.5625, "learning_rate": 3.12001293500204e-05, "loss": 2.1622, "num_input_tokens_seen": 35111216, "step": 16810 }, { "epoch": 2.743127498164614, "grad_norm": 6.84375, "learning_rate": 3.1190618105823765e-05, "loss": 3.1671, "num_input_tokens_seen": 35120848, "step": 16815 }, { "epoch": 2.743943225385431, "grad_norm": 12.875, "learning_rate": 3.118110590695603e-05, "loss": 2.3103, "num_input_tokens_seen": 35130416, "step": 16820 }, { "epoch": 2.7447589526062486, "grad_norm": 1.7578125, "learning_rate": 3.117159275488407e-05, "loss": 1.6903, "num_input_tokens_seen": 35141152, "step": 16825 }, { "epoch": 2.7455746798270657, "grad_norm": 11.8125, "learning_rate": 3.1162078651074956e-05, "loss": 2.0126, "num_input_tokens_seen": 35151424, "step": 16830 }, { "epoch": 2.7463904070478833, "grad_norm": 6.3125, "learning_rate": 3.1152563596995885e-05, "loss": 2.7796, "num_input_tokens_seen": 35162128, "step": 16835 }, { "epoch": 2.7472061342687004, "grad_norm": 5.71875, "learning_rate": 3.1143047594114186e-05, "loss": 2.2402, "num_input_tokens_seen": 35172688, "step": 16840 }, { "epoch": 2.748021861489518, "grad_norm": 6.4375, "learning_rate": 3.113353064389734e-05, "loss": 2.0471, "num_input_tokens_seen": 35182672, "step": 16845 }, { "epoch": 2.748837588710335, "grad_norm": 5.8125, "learning_rate": 3.1124012747812993e-05, "loss": 1.8655, "num_input_tokens_seen": 35193920, "step": 16850 }, { "epoch": 2.7496533159311527, "grad_norm": 1.8828125, "learning_rate": 3.1114493907328936e-05, "loss": 2.5842, "num_input_tokens_seen": 35204576, "step": 16855 }, { "epoch": 2.75046904315197, "grad_norm": 3.828125, "learning_rate": 3.110497412391306e-05, "loss": 2.4617, "num_input_tokens_seen": 35215056, "step": 16860 }, { "epoch": 2.7512847703727874, "grad_norm": 9.4375, "learning_rate": 3.1095453399033466e-05, "loss": 2.5978, "num_input_tokens_seen": 35223456, "step": 16865 }, { "epoch": 2.7521004975936045, "grad_norm": 5.78125, "learning_rate": 3.108593173415835e-05, "loss": 1.5533, "num_input_tokens_seen": 35235344, "step": 16870 }, { "epoch": 2.752916224814422, "grad_norm": 5.1875, "learning_rate": 3.107640913075609e-05, "loss": 2.0874, "num_input_tokens_seen": 35245824, "step": 16875 }, { "epoch": 2.7537319520352392, "grad_norm": 7.40625, "learning_rate": 3.106688559029517e-05, "loss": 2.311, "num_input_tokens_seen": 35255808, "step": 16880 }, { "epoch": 2.754547679256057, "grad_norm": 6.65625, "learning_rate": 3.105736111424425e-05, "loss": 2.7496, "num_input_tokens_seen": 35265232, "step": 16885 }, { "epoch": 2.7553634064768744, "grad_norm": 8.6875, "learning_rate": 3.1047835704072136e-05, "loss": 4.0668, "num_input_tokens_seen": 35274480, "step": 16890 }, { "epoch": 2.7561791336976915, "grad_norm": 8.3125, "learning_rate": 3.103830936124775e-05, "loss": 4.0149, "num_input_tokens_seen": 35284816, "step": 16895 }, { "epoch": 2.7569948609185086, "grad_norm": 12.3125, "learning_rate": 3.102878208724018e-05, "loss": 3.4668, "num_input_tokens_seen": 35296592, "step": 16900 }, { "epoch": 2.7578105881393262, "grad_norm": 1.5, "learning_rate": 3.101925388351865e-05, "loss": 2.1166, "num_input_tokens_seen": 35307152, "step": 16905 }, { "epoch": 2.758626315360144, "grad_norm": 8.0, "learning_rate": 3.1009724751552515e-05, "loss": 2.6486, "num_input_tokens_seen": 35317264, "step": 16910 }, { "epoch": 2.759442042580961, "grad_norm": 6.40625, "learning_rate": 3.100019469281131e-05, "loss": 1.6806, "num_input_tokens_seen": 35328528, "step": 16915 }, { "epoch": 2.760257769801778, "grad_norm": 8.8125, "learning_rate": 3.0990663708764685e-05, "loss": 1.6484, "num_input_tokens_seen": 35339936, "step": 16920 }, { "epoch": 2.7610734970225956, "grad_norm": 7.03125, "learning_rate": 3.098113180088243e-05, "loss": 2.4061, "num_input_tokens_seen": 35350704, "step": 16925 }, { "epoch": 2.761889224243413, "grad_norm": 4.875, "learning_rate": 3.097159897063448e-05, "loss": 1.5002, "num_input_tokens_seen": 35361584, "step": 16930 }, { "epoch": 2.7627049514642303, "grad_norm": 5.5, "learning_rate": 3.096206521949094e-05, "loss": 2.7435, "num_input_tokens_seen": 35371440, "step": 16935 }, { "epoch": 2.7635206786850475, "grad_norm": 7.71875, "learning_rate": 3.0952530548922006e-05, "loss": 1.5029, "num_input_tokens_seen": 35382048, "step": 16940 }, { "epoch": 2.764336405905865, "grad_norm": 11.875, "learning_rate": 3.0942994960398064e-05, "loss": 2.207, "num_input_tokens_seen": 35392880, "step": 16945 }, { "epoch": 2.7651521331266826, "grad_norm": 5.75, "learning_rate": 3.093345845538961e-05, "loss": 3.9481, "num_input_tokens_seen": 35402944, "step": 16950 }, { "epoch": 2.7659678603474998, "grad_norm": 5.40625, "learning_rate": 3.09239210353673e-05, "loss": 1.6424, "num_input_tokens_seen": 35413168, "step": 16955 }, { "epoch": 2.766783587568317, "grad_norm": 3.375, "learning_rate": 3.0914382701801926e-05, "loss": 1.8041, "num_input_tokens_seen": 35423888, "step": 16960 }, { "epoch": 2.7675993147891345, "grad_norm": 9.3125, "learning_rate": 3.090484345616441e-05, "loss": 3.4789, "num_input_tokens_seen": 35434368, "step": 16965 }, { "epoch": 2.768415042009952, "grad_norm": 11.6875, "learning_rate": 3.0895303299925825e-05, "loss": 3.8258, "num_input_tokens_seen": 35444096, "step": 16970 }, { "epoch": 2.769230769230769, "grad_norm": 3.359375, "learning_rate": 3.0885762234557393e-05, "loss": 2.3049, "num_input_tokens_seen": 35455280, "step": 16975 }, { "epoch": 2.7700464964515867, "grad_norm": 6.96875, "learning_rate": 3.087622026153045e-05, "loss": 1.9618, "num_input_tokens_seen": 35466160, "step": 16980 }, { "epoch": 2.770862223672404, "grad_norm": 9.5625, "learning_rate": 3.086667738231651e-05, "loss": 3.5833, "num_input_tokens_seen": 35475824, "step": 16985 }, { "epoch": 2.7716779508932214, "grad_norm": 19.25, "learning_rate": 3.085713359838718e-05, "loss": 3.2067, "num_input_tokens_seen": 35486272, "step": 16990 }, { "epoch": 2.7724936781140386, "grad_norm": 9.9375, "learning_rate": 3.084758891121425e-05, "loss": 2.129, "num_input_tokens_seen": 35497424, "step": 16995 }, { "epoch": 2.773309405334856, "grad_norm": 3.953125, "learning_rate": 3.083804332226963e-05, "loss": 1.152, "num_input_tokens_seen": 35508944, "step": 17000 }, { "epoch": 2.773309405334856, "eval_loss": 2.5512051582336426, "eval_runtime": 135.0119, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 35508944, "step": 17000 }, { "epoch": 2.7741251325556733, "grad_norm": 5.15625, "learning_rate": 3.082849683302536e-05, "loss": 3.032, "num_input_tokens_seen": 35519872, "step": 17005 }, { "epoch": 2.774940859776491, "grad_norm": 8.125, "learning_rate": 3.081894944495363e-05, "loss": 1.8765, "num_input_tokens_seen": 35530032, "step": 17010 }, { "epoch": 2.775756586997308, "grad_norm": 10.5625, "learning_rate": 3.080940115952677e-05, "loss": 4.8067, "num_input_tokens_seen": 35538944, "step": 17015 }, { "epoch": 2.7765723142181256, "grad_norm": 3.40625, "learning_rate": 3.0799851978217245e-05, "loss": 1.8712, "num_input_tokens_seen": 35549136, "step": 17020 }, { "epoch": 2.7773880414389427, "grad_norm": 3.96875, "learning_rate": 3.0790301902497666e-05, "loss": 3.0485, "num_input_tokens_seen": 35559376, "step": 17025 }, { "epoch": 2.7782037686597603, "grad_norm": 1.921875, "learning_rate": 3.078075093384076e-05, "loss": 1.2675, "num_input_tokens_seen": 35570352, "step": 17030 }, { "epoch": 2.7790194958805774, "grad_norm": 5.65625, "learning_rate": 3.077119907371942e-05, "loss": 2.1657, "num_input_tokens_seen": 35582144, "step": 17035 }, { "epoch": 2.779835223101395, "grad_norm": 5.3125, "learning_rate": 3.076164632360666e-05, "loss": 1.9921, "num_input_tokens_seen": 35592304, "step": 17040 }, { "epoch": 2.780650950322212, "grad_norm": 9.6875, "learning_rate": 3.075209268497563e-05, "loss": 2.7261, "num_input_tokens_seen": 35603264, "step": 17045 }, { "epoch": 2.7814666775430297, "grad_norm": 11.1875, "learning_rate": 3.074253815929961e-05, "loss": 1.976, "num_input_tokens_seen": 35613904, "step": 17050 }, { "epoch": 2.782282404763847, "grad_norm": 7.65625, "learning_rate": 3.0732982748052054e-05, "loss": 4.0768, "num_input_tokens_seen": 35625424, "step": 17055 }, { "epoch": 2.7830981319846644, "grad_norm": 17.5, "learning_rate": 3.072342645270651e-05, "loss": 2.4043, "num_input_tokens_seen": 35636720, "step": 17060 }, { "epoch": 2.783913859205482, "grad_norm": 4.0625, "learning_rate": 3.071386927473668e-05, "loss": 2.4074, "num_input_tokens_seen": 35645360, "step": 17065 }, { "epoch": 2.784729586426299, "grad_norm": 10.6875, "learning_rate": 3.0704311215616404e-05, "loss": 3.319, "num_input_tokens_seen": 35655600, "step": 17070 }, { "epoch": 2.785545313647116, "grad_norm": 0.1318359375, "learning_rate": 3.0694752276819656e-05, "loss": 1.2539, "num_input_tokens_seen": 35664736, "step": 17075 }, { "epoch": 2.786361040867934, "grad_norm": 2.421875, "learning_rate": 3.068519245982054e-05, "loss": 2.0095, "num_input_tokens_seen": 35675040, "step": 17080 }, { "epoch": 2.7871767680887514, "grad_norm": 10.9375, "learning_rate": 3.0675631766093304e-05, "loss": 2.9543, "num_input_tokens_seen": 35684384, "step": 17085 }, { "epoch": 2.7879924953095685, "grad_norm": 5.21875, "learning_rate": 3.066607019711232e-05, "loss": 1.7485, "num_input_tokens_seen": 35695216, "step": 17090 }, { "epoch": 2.7888082225303856, "grad_norm": 6.90625, "learning_rate": 3.065650775435211e-05, "loss": 2.0713, "num_input_tokens_seen": 35705920, "step": 17095 }, { "epoch": 2.789623949751203, "grad_norm": 8.1875, "learning_rate": 3.0646944439287326e-05, "loss": 2.7004, "num_input_tokens_seen": 35715104, "step": 17100 }, { "epoch": 2.7904396769720208, "grad_norm": 9.75, "learning_rate": 3.0637380253392736e-05, "loss": 2.2188, "num_input_tokens_seen": 35724960, "step": 17105 }, { "epoch": 2.791255404192838, "grad_norm": 3.5, "learning_rate": 3.062781519814327e-05, "loss": 2.2592, "num_input_tokens_seen": 35734432, "step": 17110 }, { "epoch": 2.792071131413655, "grad_norm": 5.4375, "learning_rate": 3.0618249275013985e-05, "loss": 1.9567, "num_input_tokens_seen": 35745392, "step": 17115 }, { "epoch": 2.7928868586344726, "grad_norm": 3.109375, "learning_rate": 3.060868248548005e-05, "loss": 1.3854, "num_input_tokens_seen": 35756096, "step": 17120 }, { "epoch": 2.79370258585529, "grad_norm": 13.8125, "learning_rate": 3.0599114831016796e-05, "loss": 2.242, "num_input_tokens_seen": 35767056, "step": 17125 }, { "epoch": 2.7945183130761073, "grad_norm": 8.75, "learning_rate": 3.0589546313099666e-05, "loss": 1.7485, "num_input_tokens_seen": 35775424, "step": 17130 }, { "epoch": 2.7953340402969244, "grad_norm": 2.953125, "learning_rate": 3.0579976933204255e-05, "loss": 1.3898, "num_input_tokens_seen": 35785376, "step": 17135 }, { "epoch": 2.796149767517742, "grad_norm": 4.71875, "learning_rate": 3.0570406692806284e-05, "loss": 2.9007, "num_input_tokens_seen": 35796928, "step": 17140 }, { "epoch": 2.7969654947385596, "grad_norm": 8.6875, "learning_rate": 3.05608355933816e-05, "loss": 2.9269, "num_input_tokens_seen": 35807216, "step": 17145 }, { "epoch": 2.7977812219593767, "grad_norm": 4.96875, "learning_rate": 3.055126363640618e-05, "loss": 2.1375, "num_input_tokens_seen": 35817088, "step": 17150 }, { "epoch": 2.7985969491801943, "grad_norm": 9.875, "learning_rate": 3.0541690823356146e-05, "loss": 2.3497, "num_input_tokens_seen": 35828160, "step": 17155 }, { "epoch": 2.7994126764010114, "grad_norm": 4.5625, "learning_rate": 3.053211715570775e-05, "loss": 2.8608, "num_input_tokens_seen": 35839072, "step": 17160 }, { "epoch": 2.800228403621829, "grad_norm": 2.546875, "learning_rate": 3.052254263493736e-05, "loss": 1.4995, "num_input_tokens_seen": 35849472, "step": 17165 }, { "epoch": 2.801044130842646, "grad_norm": 9.9375, "learning_rate": 3.0512967262521498e-05, "loss": 3.3004, "num_input_tokens_seen": 35858000, "step": 17170 }, { "epoch": 2.8018598580634637, "grad_norm": 10.375, "learning_rate": 3.0503391039936803e-05, "loss": 1.9098, "num_input_tokens_seen": 35867488, "step": 17175 }, { "epoch": 2.802675585284281, "grad_norm": 10.875, "learning_rate": 3.0493813968660056e-05, "loss": 2.473, "num_input_tokens_seen": 35878608, "step": 17180 }, { "epoch": 2.8034913125050984, "grad_norm": 8.9375, "learning_rate": 3.0484236050168153e-05, "loss": 2.3895, "num_input_tokens_seen": 35889840, "step": 17185 }, { "epoch": 2.8043070397259156, "grad_norm": 7.34375, "learning_rate": 3.0474657285938123e-05, "loss": 1.8708, "num_input_tokens_seen": 35901072, "step": 17190 }, { "epoch": 2.805122766946733, "grad_norm": 11.5, "learning_rate": 3.046507767744715e-05, "loss": 3.7846, "num_input_tokens_seen": 35911616, "step": 17195 }, { "epoch": 2.8059384941675503, "grad_norm": 2.171875, "learning_rate": 3.045549722617252e-05, "loss": 2.1899, "num_input_tokens_seen": 35922144, "step": 17200 }, { "epoch": 2.8059384941675503, "eval_loss": 2.556351900100708, "eval_runtime": 135.0275, "eval_samples_per_second": 20.181, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 35922144, "step": 17200 }, { "epoch": 2.806754221388368, "grad_norm": 4.875, "learning_rate": 3.0445915933591658e-05, "loss": 2.6447, "num_input_tokens_seen": 35933024, "step": 17205 }, { "epoch": 2.807569948609185, "grad_norm": 6.40625, "learning_rate": 3.0436333801182114e-05, "loss": 2.0107, "num_input_tokens_seen": 35944192, "step": 17210 }, { "epoch": 2.8083856758300025, "grad_norm": 5.15625, "learning_rate": 3.0426750830421596e-05, "loss": 2.6096, "num_input_tokens_seen": 35954208, "step": 17215 }, { "epoch": 2.8092014030508197, "grad_norm": 3.703125, "learning_rate": 3.0417167022787897e-05, "loss": 2.6197, "num_input_tokens_seen": 35963440, "step": 17220 }, { "epoch": 2.8100171302716372, "grad_norm": 9.5, "learning_rate": 3.0407582379758966e-05, "loss": 2.4956, "num_input_tokens_seen": 35974224, "step": 17225 }, { "epoch": 2.8108328574924544, "grad_norm": 1.1484375, "learning_rate": 3.039799690281287e-05, "loss": 2.0583, "num_input_tokens_seen": 35982752, "step": 17230 }, { "epoch": 2.811648584713272, "grad_norm": 5.0, "learning_rate": 3.0388410593427823e-05, "loss": 2.7041, "num_input_tokens_seen": 35991984, "step": 17235 }, { "epoch": 2.812464311934089, "grad_norm": 10.125, "learning_rate": 3.0378823453082146e-05, "loss": 2.176, "num_input_tokens_seen": 36002720, "step": 17240 }, { "epoch": 2.8132800391549067, "grad_norm": 7.625, "learning_rate": 3.03692354832543e-05, "loss": 3.2001, "num_input_tokens_seen": 36014592, "step": 17245 }, { "epoch": 2.814095766375724, "grad_norm": 3.265625, "learning_rate": 3.0359646685422865e-05, "loss": 3.0239, "num_input_tokens_seen": 36026720, "step": 17250 }, { "epoch": 2.8149114935965414, "grad_norm": 7.5625, "learning_rate": 3.035005706106656e-05, "loss": 2.1807, "num_input_tokens_seen": 36037200, "step": 17255 }, { "epoch": 2.815727220817359, "grad_norm": 4.15625, "learning_rate": 3.034046661166422e-05, "loss": 2.3683, "num_input_tokens_seen": 36048144, "step": 17260 }, { "epoch": 2.816542948038176, "grad_norm": 7.25, "learning_rate": 3.033087533869482e-05, "loss": 2.7169, "num_input_tokens_seen": 36059312, "step": 17265 }, { "epoch": 2.817358675258993, "grad_norm": 7.375, "learning_rate": 3.0321283243637444e-05, "loss": 3.0105, "num_input_tokens_seen": 36070640, "step": 17270 }, { "epoch": 2.8181744024798108, "grad_norm": 11.875, "learning_rate": 3.0311690327971326e-05, "loss": 3.4156, "num_input_tokens_seen": 36081104, "step": 17275 }, { "epoch": 2.8189901297006283, "grad_norm": 8.125, "learning_rate": 3.030209659317581e-05, "loss": 2.627, "num_input_tokens_seen": 36092160, "step": 17280 }, { "epoch": 2.8198058569214455, "grad_norm": 0.85546875, "learning_rate": 3.0292502040730362e-05, "loss": 3.1023, "num_input_tokens_seen": 36103200, "step": 17285 }, { "epoch": 2.8206215841422626, "grad_norm": 7.1875, "learning_rate": 3.0282906672114597e-05, "loss": 1.2399, "num_input_tokens_seen": 36114512, "step": 17290 }, { "epoch": 2.82143731136308, "grad_norm": 1.609375, "learning_rate": 3.027331048880823e-05, "loss": 1.3071, "num_input_tokens_seen": 36125808, "step": 17295 }, { "epoch": 2.8222530385838978, "grad_norm": 7.59375, "learning_rate": 3.0263713492291123e-05, "loss": 2.8007, "num_input_tokens_seen": 36136112, "step": 17300 }, { "epoch": 2.823068765804715, "grad_norm": 7.5, "learning_rate": 3.0254115684043242e-05, "loss": 2.1973, "num_input_tokens_seen": 36147872, "step": 17305 }, { "epoch": 2.823884493025532, "grad_norm": 4.0625, "learning_rate": 3.024451706554469e-05, "loss": 3.9647, "num_input_tokens_seen": 36157840, "step": 17310 }, { "epoch": 2.8247002202463496, "grad_norm": 7.0625, "learning_rate": 3.0234917638275705e-05, "loss": 3.0103, "num_input_tokens_seen": 36167696, "step": 17315 }, { "epoch": 2.825515947467167, "grad_norm": 6.75, "learning_rate": 3.0225317403716635e-05, "loss": 2.9, "num_input_tokens_seen": 36179024, "step": 17320 }, { "epoch": 2.8263316746879843, "grad_norm": 10.0, "learning_rate": 3.0215716363347956e-05, "loss": 2.3599, "num_input_tokens_seen": 36189584, "step": 17325 }, { "epoch": 2.8271474019088014, "grad_norm": 1.6953125, "learning_rate": 3.0206114518650275e-05, "loss": 1.9153, "num_input_tokens_seen": 36201184, "step": 17330 }, { "epoch": 2.827963129129619, "grad_norm": 5.5625, "learning_rate": 3.0196511871104304e-05, "loss": 3.1447, "num_input_tokens_seen": 36212912, "step": 17335 }, { "epoch": 2.8287788563504366, "grad_norm": 9.375, "learning_rate": 3.01869084221909e-05, "loss": 2.2019, "num_input_tokens_seen": 36222608, "step": 17340 }, { "epoch": 2.8295945835712537, "grad_norm": 10.3125, "learning_rate": 3.0177304173391037e-05, "loss": 3.1339, "num_input_tokens_seen": 36232848, "step": 17345 }, { "epoch": 2.8304103107920713, "grad_norm": 10.3125, "learning_rate": 3.01676991261858e-05, "loss": 2.4359, "num_input_tokens_seen": 36243776, "step": 17350 }, { "epoch": 2.8312260380128884, "grad_norm": 4.59375, "learning_rate": 3.015809328205642e-05, "loss": 3.0128, "num_input_tokens_seen": 36252960, "step": 17355 }, { "epoch": 2.832041765233706, "grad_norm": 11.9375, "learning_rate": 3.0148486642484248e-05, "loss": 3.9857, "num_input_tokens_seen": 36262832, "step": 17360 }, { "epoch": 2.832857492454523, "grad_norm": 9.4375, "learning_rate": 3.0138879208950722e-05, "loss": 2.0324, "num_input_tokens_seen": 36272544, "step": 17365 }, { "epoch": 2.8336732196753407, "grad_norm": 6.59375, "learning_rate": 3.012927098293744e-05, "loss": 2.5986, "num_input_tokens_seen": 36283392, "step": 17370 }, { "epoch": 2.834488946896158, "grad_norm": 8.125, "learning_rate": 3.0119661965926123e-05, "loss": 2.4543, "num_input_tokens_seen": 36292272, "step": 17375 }, { "epoch": 2.8353046741169754, "grad_norm": 10.8125, "learning_rate": 3.0110052159398587e-05, "loss": 3.0589, "num_input_tokens_seen": 36302400, "step": 17380 }, { "epoch": 2.8361204013377925, "grad_norm": 8.6875, "learning_rate": 3.0100441564836802e-05, "loss": 2.7999, "num_input_tokens_seen": 36313072, "step": 17385 }, { "epoch": 2.83693612855861, "grad_norm": 7.6875, "learning_rate": 3.0090830183722817e-05, "loss": 2.2582, "num_input_tokens_seen": 36324048, "step": 17390 }, { "epoch": 2.8377518557794272, "grad_norm": 8.8125, "learning_rate": 3.0081218017538852e-05, "loss": 2.415, "num_input_tokens_seen": 36335360, "step": 17395 }, { "epoch": 2.838567583000245, "grad_norm": 2.890625, "learning_rate": 3.0071605067767212e-05, "loss": 3.1566, "num_input_tokens_seen": 36345856, "step": 17400 }, { "epoch": 2.838567583000245, "eval_loss": 2.540825843811035, "eval_runtime": 135.0255, "eval_samples_per_second": 20.181, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 36345856, "step": 17400 }, { "epoch": 2.839383310221062, "grad_norm": 4.5, "learning_rate": 3.006199133589034e-05, "loss": 2.7889, "num_input_tokens_seen": 36357392, "step": 17405 }, { "epoch": 2.8401990374418795, "grad_norm": 2.203125, "learning_rate": 3.005237682339079e-05, "loss": 2.587, "num_input_tokens_seen": 36367664, "step": 17410 }, { "epoch": 2.8410147646626966, "grad_norm": 4.46875, "learning_rate": 3.0042761531751228e-05, "loss": 2.8813, "num_input_tokens_seen": 36377856, "step": 17415 }, { "epoch": 2.841830491883514, "grad_norm": 5.71875, "learning_rate": 3.0033145462454482e-05, "loss": 1.8902, "num_input_tokens_seen": 36387200, "step": 17420 }, { "epoch": 2.8426462191043314, "grad_norm": 10.8125, "learning_rate": 3.002352861698345e-05, "loss": 1.8144, "num_input_tokens_seen": 36397888, "step": 17425 }, { "epoch": 2.843461946325149, "grad_norm": 6.84375, "learning_rate": 3.0013910996821178e-05, "loss": 2.1199, "num_input_tokens_seen": 36408752, "step": 17430 }, { "epoch": 2.8442776735459665, "grad_norm": 10.3125, "learning_rate": 3.0004292603450817e-05, "loss": 1.1044, "num_input_tokens_seen": 36420272, "step": 17435 }, { "epoch": 2.8450934007667836, "grad_norm": 5.625, "learning_rate": 2.9994673438355653e-05, "loss": 1.8123, "num_input_tokens_seen": 36431040, "step": 17440 }, { "epoch": 2.8459091279876008, "grad_norm": 3.75, "learning_rate": 2.9985053503019078e-05, "loss": 2.1149, "num_input_tokens_seen": 36441520, "step": 17445 }, { "epoch": 2.8467248552084183, "grad_norm": 5.625, "learning_rate": 2.99754327989246e-05, "loss": 2.4335, "num_input_tokens_seen": 36452272, "step": 17450 }, { "epoch": 2.847540582429236, "grad_norm": 8.375, "learning_rate": 2.9965811327555864e-05, "loss": 1.3532, "num_input_tokens_seen": 36462832, "step": 17455 }, { "epoch": 2.848356309650053, "grad_norm": 0.16796875, "learning_rate": 2.995618909039662e-05, "loss": 2.0719, "num_input_tokens_seen": 36473952, "step": 17460 }, { "epoch": 2.84917203687087, "grad_norm": 7.15625, "learning_rate": 2.9946566088930727e-05, "loss": 2.202, "num_input_tokens_seen": 36484384, "step": 17465 }, { "epoch": 2.8499877640916877, "grad_norm": 6.65625, "learning_rate": 2.9936942324642192e-05, "loss": 1.8019, "num_input_tokens_seen": 36493968, "step": 17470 }, { "epoch": 2.8508034913125053, "grad_norm": 8.0625, "learning_rate": 2.9927317799015097e-05, "loss": 2.2342, "num_input_tokens_seen": 36504144, "step": 17475 }, { "epoch": 2.8516192185333225, "grad_norm": 5.34375, "learning_rate": 2.9917692513533685e-05, "loss": 2.025, "num_input_tokens_seen": 36514512, "step": 17480 }, { "epoch": 2.8524349457541396, "grad_norm": 7.84375, "learning_rate": 2.990806646968229e-05, "loss": 1.4646, "num_input_tokens_seen": 36525648, "step": 17485 }, { "epoch": 2.853250672974957, "grad_norm": 7.90625, "learning_rate": 2.989843966894536e-05, "loss": 4.3998, "num_input_tokens_seen": 36536144, "step": 17490 }, { "epoch": 2.8540664001957747, "grad_norm": 6.25, "learning_rate": 2.9888812112807472e-05, "loss": 3.5881, "num_input_tokens_seen": 36546000, "step": 17495 }, { "epoch": 2.854882127416592, "grad_norm": 7.8125, "learning_rate": 2.987918380275333e-05, "loss": 2.1008, "num_input_tokens_seen": 36556688, "step": 17500 }, { "epoch": 2.855697854637409, "grad_norm": 9.375, "learning_rate": 2.9869554740267724e-05, "loss": 3.3469, "num_input_tokens_seen": 36567232, "step": 17505 }, { "epoch": 2.8565135818582266, "grad_norm": 6.34375, "learning_rate": 2.9859924926835585e-05, "loss": 2.5967, "num_input_tokens_seen": 36576832, "step": 17510 }, { "epoch": 2.857329309079044, "grad_norm": 8.9375, "learning_rate": 2.9850294363941944e-05, "loss": 3.5461, "num_input_tokens_seen": 36586576, "step": 17515 }, { "epoch": 2.8581450362998613, "grad_norm": 4.125, "learning_rate": 2.9840663053071967e-05, "loss": 1.3531, "num_input_tokens_seen": 36598400, "step": 17520 }, { "epoch": 2.858960763520679, "grad_norm": 7.65625, "learning_rate": 2.983103099571091e-05, "loss": 2.4773, "num_input_tokens_seen": 36609696, "step": 17525 }, { "epoch": 2.859776490741496, "grad_norm": 11.375, "learning_rate": 2.9821398193344164e-05, "loss": 3.664, "num_input_tokens_seen": 36620944, "step": 17530 }, { "epoch": 2.8605922179623136, "grad_norm": 3.3125, "learning_rate": 2.9811764647457226e-05, "loss": 3.1181, "num_input_tokens_seen": 36630976, "step": 17535 }, { "epoch": 2.8614079451831307, "grad_norm": 9.5625, "learning_rate": 2.9802130359535714e-05, "loss": 2.042, "num_input_tokens_seen": 36642704, "step": 17540 }, { "epoch": 2.8622236724039483, "grad_norm": 9.0, "learning_rate": 2.979249533106535e-05, "loss": 3.0682, "num_input_tokens_seen": 36652464, "step": 17545 }, { "epoch": 2.8630393996247654, "grad_norm": 7.28125, "learning_rate": 2.9782859563531986e-05, "loss": 3.3983, "num_input_tokens_seen": 36663120, "step": 17550 }, { "epoch": 2.863855126845583, "grad_norm": 7.4375, "learning_rate": 2.977322305842156e-05, "loss": 2.2612, "num_input_tokens_seen": 36673872, "step": 17555 }, { "epoch": 2.8646708540664, "grad_norm": 9.0625, "learning_rate": 2.9763585817220162e-05, "loss": 1.81, "num_input_tokens_seen": 36683760, "step": 17560 }, { "epoch": 2.8654865812872177, "grad_norm": 3.828125, "learning_rate": 2.975394784141397e-05, "loss": 2.5872, "num_input_tokens_seen": 36694112, "step": 17565 }, { "epoch": 2.866302308508035, "grad_norm": 5.25, "learning_rate": 2.974430913248928e-05, "loss": 3.3368, "num_input_tokens_seen": 36705472, "step": 17570 }, { "epoch": 2.8671180357288524, "grad_norm": 10.1875, "learning_rate": 2.9734669691932497e-05, "loss": 2.6282, "num_input_tokens_seen": 36716288, "step": 17575 }, { "epoch": 2.8679337629496695, "grad_norm": 6.125, "learning_rate": 2.9725029521230147e-05, "loss": 2.7525, "num_input_tokens_seen": 36727216, "step": 17580 }, { "epoch": 2.868749490170487, "grad_norm": 9.4375, "learning_rate": 2.9715388621868873e-05, "loss": 1.658, "num_input_tokens_seen": 36737648, "step": 17585 }, { "epoch": 2.869565217391304, "grad_norm": 3.9375, "learning_rate": 2.970574699533541e-05, "loss": 3.0057, "num_input_tokens_seen": 36748784, "step": 17590 }, { "epoch": 2.870380944612122, "grad_norm": 3.0, "learning_rate": 2.969610464311662e-05, "loss": 1.2724, "num_input_tokens_seen": 36758976, "step": 17595 }, { "epoch": 2.871196671832939, "grad_norm": 8.125, "learning_rate": 2.9686461566699487e-05, "loss": 1.2781, "num_input_tokens_seen": 36770688, "step": 17600 }, { "epoch": 2.871196671832939, "eval_loss": 2.5483222007751465, "eval_runtime": 134.9733, "eval_samples_per_second": 20.189, "eval_steps_per_second": 10.098, "num_input_tokens_seen": 36770688, "step": 17600 }, { "epoch": 2.8720123990537565, "grad_norm": 9.3125, "learning_rate": 2.9676817767571086e-05, "loss": 2.8454, "num_input_tokens_seen": 36782448, "step": 17605 }, { "epoch": 2.8728281262745736, "grad_norm": 7.875, "learning_rate": 2.966717324721861e-05, "loss": 1.6154, "num_input_tokens_seen": 36793232, "step": 17610 }, { "epoch": 2.873643853495391, "grad_norm": 3.234375, "learning_rate": 2.9657528007129366e-05, "loss": 3.3533, "num_input_tokens_seen": 36803664, "step": 17615 }, { "epoch": 2.8744595807162083, "grad_norm": 8.625, "learning_rate": 2.9647882048790777e-05, "loss": 2.1044, "num_input_tokens_seen": 36815504, "step": 17620 }, { "epoch": 2.875275307937026, "grad_norm": 16.125, "learning_rate": 2.963823537369037e-05, "loss": 2.6641, "num_input_tokens_seen": 36826224, "step": 17625 }, { "epoch": 2.8760910351578435, "grad_norm": 7.375, "learning_rate": 2.9628587983315775e-05, "loss": 2.3371, "num_input_tokens_seen": 36835872, "step": 17630 }, { "epoch": 2.8769067623786606, "grad_norm": 5.125, "learning_rate": 2.9618939879154746e-05, "loss": 2.5781, "num_input_tokens_seen": 36846880, "step": 17635 }, { "epoch": 2.8777224895994777, "grad_norm": 9.6875, "learning_rate": 2.9609291062695143e-05, "loss": 2.0684, "num_input_tokens_seen": 36856528, "step": 17640 }, { "epoch": 2.8785382168202953, "grad_norm": 10.6875, "learning_rate": 2.9599641535424938e-05, "loss": 1.6072, "num_input_tokens_seen": 36866592, "step": 17645 }, { "epoch": 2.879353944041113, "grad_norm": 5.59375, "learning_rate": 2.9589991298832202e-05, "loss": 2.0815, "num_input_tokens_seen": 36875232, "step": 17650 }, { "epoch": 2.88016967126193, "grad_norm": 8.75, "learning_rate": 2.958034035440513e-05, "loss": 1.5411, "num_input_tokens_seen": 36885696, "step": 17655 }, { "epoch": 2.880985398482747, "grad_norm": 7.34375, "learning_rate": 2.957068870363201e-05, "loss": 1.6153, "num_input_tokens_seen": 36895456, "step": 17660 }, { "epoch": 2.8818011257035647, "grad_norm": 4.75, "learning_rate": 2.956103634800126e-05, "loss": 3.3167, "num_input_tokens_seen": 36905728, "step": 17665 }, { "epoch": 2.8826168529243823, "grad_norm": 15.75, "learning_rate": 2.9551383289001384e-05, "loss": 3.4195, "num_input_tokens_seen": 36916784, "step": 17670 }, { "epoch": 2.8834325801451994, "grad_norm": 8.875, "learning_rate": 2.9541729528121005e-05, "loss": 3.0218, "num_input_tokens_seen": 36927488, "step": 17675 }, { "epoch": 2.8842483073660166, "grad_norm": 6.15625, "learning_rate": 2.9532075066848856e-05, "loss": 1.8761, "num_input_tokens_seen": 36938704, "step": 17680 }, { "epoch": 2.885064034586834, "grad_norm": 13.5625, "learning_rate": 2.9522419906673786e-05, "loss": 2.72, "num_input_tokens_seen": 36947904, "step": 17685 }, { "epoch": 2.8858797618076517, "grad_norm": 6.84375, "learning_rate": 2.951276404908474e-05, "loss": 2.8413, "num_input_tokens_seen": 36958672, "step": 17690 }, { "epoch": 2.886695489028469, "grad_norm": 6.625, "learning_rate": 2.9503107495570752e-05, "loss": 3.1267, "num_input_tokens_seen": 36970336, "step": 17695 }, { "epoch": 2.887511216249286, "grad_norm": 5.40625, "learning_rate": 2.9493450247621003e-05, "loss": 0.9952, "num_input_tokens_seen": 36979680, "step": 17700 }, { "epoch": 2.8883269434701035, "grad_norm": 4.65625, "learning_rate": 2.948379230672476e-05, "loss": 1.0621, "num_input_tokens_seen": 36990912, "step": 17705 }, { "epoch": 2.889142670690921, "grad_norm": 6.5, "learning_rate": 2.9474133674371396e-05, "loss": 3.3345, "num_input_tokens_seen": 37001408, "step": 17710 }, { "epoch": 2.8899583979117383, "grad_norm": 14.1875, "learning_rate": 2.9464474352050387e-05, "loss": 2.4321, "num_input_tokens_seen": 37011600, "step": 17715 }, { "epoch": 2.890774125132556, "grad_norm": 7.28125, "learning_rate": 2.9454814341251336e-05, "loss": 1.6818, "num_input_tokens_seen": 37022160, "step": 17720 }, { "epoch": 2.891589852353373, "grad_norm": 6.40625, "learning_rate": 2.9445153643463942e-05, "loss": 1.7129, "num_input_tokens_seen": 37033504, "step": 17725 }, { "epoch": 2.8924055795741905, "grad_norm": 6.78125, "learning_rate": 2.943549226017798e-05, "loss": 1.9233, "num_input_tokens_seen": 37045696, "step": 17730 }, { "epoch": 2.8932213067950077, "grad_norm": 4.34375, "learning_rate": 2.942583019288337e-05, "loss": 2.4831, "num_input_tokens_seen": 37055840, "step": 17735 }, { "epoch": 2.8940370340158252, "grad_norm": 12.5, "learning_rate": 2.9416167443070132e-05, "loss": 2.6092, "num_input_tokens_seen": 37066144, "step": 17740 }, { "epoch": 2.8948527612366424, "grad_norm": 5.40625, "learning_rate": 2.9406504012228375e-05, "loss": 2.9348, "num_input_tokens_seen": 37076144, "step": 17745 }, { "epoch": 2.89566848845746, "grad_norm": 12.0625, "learning_rate": 2.939683990184832e-05, "loss": 1.5366, "num_input_tokens_seen": 37087440, "step": 17750 }, { "epoch": 2.896484215678277, "grad_norm": 6.21875, "learning_rate": 2.93871751134203e-05, "loss": 2.1692, "num_input_tokens_seen": 37097520, "step": 17755 }, { "epoch": 2.8972999428990946, "grad_norm": 6.1875, "learning_rate": 2.9377509648434752e-05, "loss": 1.9937, "num_input_tokens_seen": 37109024, "step": 17760 }, { "epoch": 2.898115670119912, "grad_norm": 4.1875, "learning_rate": 2.9367843508382203e-05, "loss": 3.1007, "num_input_tokens_seen": 37118336, "step": 17765 }, { "epoch": 2.8989313973407294, "grad_norm": 7.9375, "learning_rate": 2.9358176694753293e-05, "loss": 1.047, "num_input_tokens_seen": 37128624, "step": 17770 }, { "epoch": 2.8997471245615465, "grad_norm": 5.53125, "learning_rate": 2.9348509209038766e-05, "loss": 1.3671, "num_input_tokens_seen": 37139824, "step": 17775 }, { "epoch": 2.900562851782364, "grad_norm": 7.8125, "learning_rate": 2.933884105272947e-05, "loss": 2.157, "num_input_tokens_seen": 37150288, "step": 17780 }, { "epoch": 2.901378579003181, "grad_norm": 5.375, "learning_rate": 2.9329172227316366e-05, "loss": 0.8971, "num_input_tokens_seen": 37162576, "step": 17785 }, { "epoch": 2.9021943062239988, "grad_norm": 5.625, "learning_rate": 2.93195027342905e-05, "loss": 1.5663, "num_input_tokens_seen": 37173104, "step": 17790 }, { "epoch": 2.903010033444816, "grad_norm": 5.84375, "learning_rate": 2.9309832575143024e-05, "loss": 2.5671, "num_input_tokens_seen": 37184544, "step": 17795 }, { "epoch": 2.9038257606656335, "grad_norm": 5.40625, "learning_rate": 2.930016175136521e-05, "loss": 2.3983, "num_input_tokens_seen": 37194864, "step": 17800 }, { "epoch": 2.9038257606656335, "eval_loss": 2.533569574356079, "eval_runtime": 134.8985, "eval_samples_per_second": 20.2, "eval_steps_per_second": 10.104, "num_input_tokens_seen": 37194864, "step": 17800 }, { "epoch": 2.904641487886451, "grad_norm": 4.0, "learning_rate": 2.9290490264448412e-05, "loss": 2.2953, "num_input_tokens_seen": 37204528, "step": 17805 }, { "epoch": 2.905457215107268, "grad_norm": 8.8125, "learning_rate": 2.9280818115884094e-05, "loss": 2.2755, "num_input_tokens_seen": 37214704, "step": 17810 }, { "epoch": 2.9062729423280853, "grad_norm": 10.375, "learning_rate": 2.9271145307163828e-05, "loss": 2.8866, "num_input_tokens_seen": 37225456, "step": 17815 }, { "epoch": 2.907088669548903, "grad_norm": 10.875, "learning_rate": 2.9261471839779287e-05, "loss": 3.1576, "num_input_tokens_seen": 37235872, "step": 17820 }, { "epoch": 2.9079043967697205, "grad_norm": 4.59375, "learning_rate": 2.925179771522223e-05, "loss": 1.6748, "num_input_tokens_seen": 37246032, "step": 17825 }, { "epoch": 2.9087201239905376, "grad_norm": 6.65625, "learning_rate": 2.9242122934984535e-05, "loss": 2.7183, "num_input_tokens_seen": 37256912, "step": 17830 }, { "epoch": 2.9095358512113547, "grad_norm": 7.21875, "learning_rate": 2.9232447500558176e-05, "loss": 2.2345, "num_input_tokens_seen": 37268896, "step": 17835 }, { "epoch": 2.9103515784321723, "grad_norm": 3.953125, "learning_rate": 2.9222771413435225e-05, "loss": 2.6866, "num_input_tokens_seen": 37278672, "step": 17840 }, { "epoch": 2.91116730565299, "grad_norm": 0.93359375, "learning_rate": 2.9213094675107848e-05, "loss": 2.5689, "num_input_tokens_seen": 37288208, "step": 17845 }, { "epoch": 2.911983032873807, "grad_norm": 8.1875, "learning_rate": 2.9203417287068335e-05, "loss": 2.7114, "num_input_tokens_seen": 37298112, "step": 17850 }, { "epoch": 2.912798760094624, "grad_norm": 11.375, "learning_rate": 2.9193739250809042e-05, "loss": 2.8746, "num_input_tokens_seen": 37308032, "step": 17855 }, { "epoch": 2.9136144873154417, "grad_norm": 13.1875, "learning_rate": 2.9184060567822463e-05, "loss": 4.1829, "num_input_tokens_seen": 37320720, "step": 17860 }, { "epoch": 2.9144302145362593, "grad_norm": 4.875, "learning_rate": 2.9174381239601166e-05, "loss": 2.7385, "num_input_tokens_seen": 37329840, "step": 17865 }, { "epoch": 2.9152459417570764, "grad_norm": 6.34375, "learning_rate": 2.916470126763783e-05, "loss": 2.5973, "num_input_tokens_seen": 37339488, "step": 17870 }, { "epoch": 2.9160616689778935, "grad_norm": 8.8125, "learning_rate": 2.9155020653425203e-05, "loss": 3.386, "num_input_tokens_seen": 37348608, "step": 17875 }, { "epoch": 2.916877396198711, "grad_norm": 6.09375, "learning_rate": 2.9145339398456184e-05, "loss": 2.7162, "num_input_tokens_seen": 37358672, "step": 17880 }, { "epoch": 2.9176931234195287, "grad_norm": 12.375, "learning_rate": 2.913565750422374e-05, "loss": 4.2184, "num_input_tokens_seen": 37368752, "step": 17885 }, { "epoch": 2.918508850640346, "grad_norm": 9.375, "learning_rate": 2.9125974972220938e-05, "loss": 1.9727, "num_input_tokens_seen": 37379168, "step": 17890 }, { "epoch": 2.919324577861163, "grad_norm": 21.0, "learning_rate": 2.9116291803940932e-05, "loss": 2.4604, "num_input_tokens_seen": 37388944, "step": 17895 }, { "epoch": 2.9201403050819805, "grad_norm": 8.9375, "learning_rate": 2.910660800087701e-05, "loss": 2.3198, "num_input_tokens_seen": 37399216, "step": 17900 }, { "epoch": 2.920956032302798, "grad_norm": 11.875, "learning_rate": 2.909692356452254e-05, "loss": 4.4498, "num_input_tokens_seen": 37410144, "step": 17905 }, { "epoch": 2.9217717595236152, "grad_norm": 8.4375, "learning_rate": 2.9087238496370962e-05, "loss": 2.4111, "num_input_tokens_seen": 37421264, "step": 17910 }, { "epoch": 2.922587486744433, "grad_norm": 2.125, "learning_rate": 2.907755279791583e-05, "loss": 1.0407, "num_input_tokens_seen": 37431408, "step": 17915 }, { "epoch": 2.92340321396525, "grad_norm": 1.9296875, "learning_rate": 2.906786647065083e-05, "loss": 3.126, "num_input_tokens_seen": 37442976, "step": 17920 }, { "epoch": 2.9242189411860675, "grad_norm": 3.28125, "learning_rate": 2.9058179516069695e-05, "loss": 2.0501, "num_input_tokens_seen": 37452608, "step": 17925 }, { "epoch": 2.9250346684068846, "grad_norm": 4.8125, "learning_rate": 2.9048491935666282e-05, "loss": 3.5146, "num_input_tokens_seen": 37462960, "step": 17930 }, { "epoch": 2.925850395627702, "grad_norm": 5.0, "learning_rate": 2.9038803730934534e-05, "loss": 2.8024, "num_input_tokens_seen": 37473776, "step": 17935 }, { "epoch": 2.9266661228485193, "grad_norm": 5.6875, "learning_rate": 2.9029114903368503e-05, "loss": 1.4991, "num_input_tokens_seen": 37484896, "step": 17940 }, { "epoch": 2.927481850069337, "grad_norm": 6.53125, "learning_rate": 2.9019425454462318e-05, "loss": 2.23, "num_input_tokens_seen": 37496272, "step": 17945 }, { "epoch": 2.928297577290154, "grad_norm": 12.0, "learning_rate": 2.9009735385710212e-05, "loss": 2.9989, "num_input_tokens_seen": 37505872, "step": 17950 }, { "epoch": 2.9291133045109716, "grad_norm": 6.96875, "learning_rate": 2.900004469860652e-05, "loss": 3.0661, "num_input_tokens_seen": 37516864, "step": 17955 }, { "epoch": 2.9299290317317888, "grad_norm": 8.125, "learning_rate": 2.8990353394645668e-05, "loss": 1.5856, "num_input_tokens_seen": 37528080, "step": 17960 }, { "epoch": 2.9307447589526063, "grad_norm": 6.09375, "learning_rate": 2.8980661475322186e-05, "loss": 2.3062, "num_input_tokens_seen": 37539600, "step": 17965 }, { "epoch": 2.9315604861734235, "grad_norm": 5.90625, "learning_rate": 2.897096894213067e-05, "loss": 1.5724, "num_input_tokens_seen": 37550176, "step": 17970 }, { "epoch": 2.932376213394241, "grad_norm": 7.625, "learning_rate": 2.8961275796565845e-05, "loss": 3.0958, "num_input_tokens_seen": 37560304, "step": 17975 }, { "epoch": 2.933191940615058, "grad_norm": 6.15625, "learning_rate": 2.8951582040122517e-05, "loss": 2.1767, "num_input_tokens_seen": 37571488, "step": 17980 }, { "epoch": 2.9340076678358757, "grad_norm": 6.21875, "learning_rate": 2.894188767429557e-05, "loss": 1.8617, "num_input_tokens_seen": 37581136, "step": 17985 }, { "epoch": 2.934823395056693, "grad_norm": 8.0625, "learning_rate": 2.8932192700580014e-05, "loss": 3.1924, "num_input_tokens_seen": 37591968, "step": 17990 }, { "epoch": 2.9356391222775104, "grad_norm": 11.625, "learning_rate": 2.8922497120470916e-05, "loss": 3.033, "num_input_tokens_seen": 37603072, "step": 17995 }, { "epoch": 2.936454849498328, "grad_norm": 6.53125, "learning_rate": 2.891280093546348e-05, "loss": 1.9318, "num_input_tokens_seen": 37615344, "step": 18000 }, { "epoch": 2.936454849498328, "eval_loss": 2.53791880607605, "eval_runtime": 134.9454, "eval_samples_per_second": 20.193, "eval_steps_per_second": 10.1, "num_input_tokens_seen": 37615344, "step": 18000 }, { "epoch": 2.937270576719145, "grad_norm": 8.0625, "learning_rate": 2.890310414705297e-05, "loss": 1.4782, "num_input_tokens_seen": 37626608, "step": 18005 }, { "epoch": 2.9380863039399623, "grad_norm": 6.3125, "learning_rate": 2.8893406756734742e-05, "loss": 2.6036, "num_input_tokens_seen": 37637136, "step": 18010 }, { "epoch": 2.93890203116078, "grad_norm": 5.34375, "learning_rate": 2.888370876600427e-05, "loss": 1.9082, "num_input_tokens_seen": 37647664, "step": 18015 }, { "epoch": 2.9397177583815974, "grad_norm": 9.25, "learning_rate": 2.8874010176357104e-05, "loss": 1.8654, "num_input_tokens_seen": 37658352, "step": 18020 }, { "epoch": 2.9405334856024146, "grad_norm": 6.96875, "learning_rate": 2.886431098928888e-05, "loss": 3.079, "num_input_tokens_seen": 37669504, "step": 18025 }, { "epoch": 2.9413492128232317, "grad_norm": 9.375, "learning_rate": 2.885461120629534e-05, "loss": 2.9154, "num_input_tokens_seen": 37679520, "step": 18030 }, { "epoch": 2.9421649400440493, "grad_norm": 6.90625, "learning_rate": 2.8844910828872317e-05, "loss": 2.5521, "num_input_tokens_seen": 37688176, "step": 18035 }, { "epoch": 2.942980667264867, "grad_norm": 11.125, "learning_rate": 2.8835209858515715e-05, "loss": 2.7315, "num_input_tokens_seen": 37698912, "step": 18040 }, { "epoch": 2.943796394485684, "grad_norm": 8.1875, "learning_rate": 2.8825508296721566e-05, "loss": 2.1624, "num_input_tokens_seen": 37710480, "step": 18045 }, { "epoch": 2.944612121706501, "grad_norm": 5.5, "learning_rate": 2.881580614498596e-05, "loss": 2.2693, "num_input_tokens_seen": 37721216, "step": 18050 }, { "epoch": 2.9454278489273187, "grad_norm": 5.78125, "learning_rate": 2.8806103404805103e-05, "loss": 2.5076, "num_input_tokens_seen": 37732480, "step": 18055 }, { "epoch": 2.9462435761481363, "grad_norm": 10.375, "learning_rate": 2.8796400077675257e-05, "loss": 2.3798, "num_input_tokens_seen": 37743744, "step": 18060 }, { "epoch": 2.9470593033689534, "grad_norm": 12.4375, "learning_rate": 2.8786696165092812e-05, "loss": 2.1377, "num_input_tokens_seen": 37754848, "step": 18065 }, { "epoch": 2.9478750305897705, "grad_norm": 4.125, "learning_rate": 2.8776991668554236e-05, "loss": 1.4124, "num_input_tokens_seen": 37764960, "step": 18070 }, { "epoch": 2.948690757810588, "grad_norm": 3.546875, "learning_rate": 2.876728658955608e-05, "loss": 1.1722, "num_input_tokens_seen": 37775856, "step": 18075 }, { "epoch": 2.9495064850314057, "grad_norm": 4.625, "learning_rate": 2.8757580929594986e-05, "loss": 2.5072, "num_input_tokens_seen": 37786416, "step": 18080 }, { "epoch": 2.950322212252223, "grad_norm": 0.478515625, "learning_rate": 2.87478746901677e-05, "loss": 1.9796, "num_input_tokens_seen": 37795104, "step": 18085 }, { "epoch": 2.9511379394730404, "grad_norm": 7.28125, "learning_rate": 2.873816787277103e-05, "loss": 1.3089, "num_input_tokens_seen": 37805536, "step": 18090 }, { "epoch": 2.9519536666938575, "grad_norm": 6.5625, "learning_rate": 2.8728460478901903e-05, "loss": 3.0849, "num_input_tokens_seen": 37813488, "step": 18095 }, { "epoch": 2.952769393914675, "grad_norm": 8.0625, "learning_rate": 2.8718752510057307e-05, "loss": 2.0511, "num_input_tokens_seen": 37823648, "step": 18100 }, { "epoch": 2.953585121135492, "grad_norm": 12.125, "learning_rate": 2.870904396773435e-05, "loss": 2.1328, "num_input_tokens_seen": 37833744, "step": 18105 }, { "epoch": 2.95440084835631, "grad_norm": 5.0625, "learning_rate": 2.86993348534302e-05, "loss": 1.3966, "num_input_tokens_seen": 37844752, "step": 18110 }, { "epoch": 2.955216575577127, "grad_norm": 1.1328125, "learning_rate": 2.868962516864212e-05, "loss": 2.2523, "num_input_tokens_seen": 37854704, "step": 18115 }, { "epoch": 2.9560323027979445, "grad_norm": 12.0, "learning_rate": 2.8679914914867477e-05, "loss": 2.3021, "num_input_tokens_seen": 37864976, "step": 18120 }, { "epoch": 2.9568480300187616, "grad_norm": 13.0, "learning_rate": 2.8670204093603713e-05, "loss": 2.3064, "num_input_tokens_seen": 37876352, "step": 18125 }, { "epoch": 2.957663757239579, "grad_norm": 8.8125, "learning_rate": 2.8660492706348357e-05, "loss": 1.9308, "num_input_tokens_seen": 37886000, "step": 18130 }, { "epoch": 2.9584794844603963, "grad_norm": 9.3125, "learning_rate": 2.8650780754599022e-05, "loss": 2.4612, "num_input_tokens_seen": 37896960, "step": 18135 }, { "epoch": 2.959295211681214, "grad_norm": 6.875, "learning_rate": 2.8641068239853407e-05, "loss": 1.7649, "num_input_tokens_seen": 37906224, "step": 18140 }, { "epoch": 2.960110938902031, "grad_norm": 5.75, "learning_rate": 2.863135516360932e-05, "loss": 2.873, "num_input_tokens_seen": 37917632, "step": 18145 }, { "epoch": 2.9609266661228486, "grad_norm": 10.125, "learning_rate": 2.8621641527364633e-05, "loss": 2.4413, "num_input_tokens_seen": 37929440, "step": 18150 }, { "epoch": 2.9617423933436657, "grad_norm": 5.15625, "learning_rate": 2.8611927332617313e-05, "loss": 2.2025, "num_input_tokens_seen": 37939968, "step": 18155 }, { "epoch": 2.9625581205644833, "grad_norm": 14.8125, "learning_rate": 2.8602212580865405e-05, "loss": 2.6721, "num_input_tokens_seen": 37949728, "step": 18160 }, { "epoch": 2.9633738477853004, "grad_norm": 8.3125, "learning_rate": 2.859249727360705e-05, "loss": 2.6591, "num_input_tokens_seen": 37960832, "step": 18165 }, { "epoch": 2.964189575006118, "grad_norm": 10.0, "learning_rate": 2.8582781412340465e-05, "loss": 2.8765, "num_input_tokens_seen": 37972704, "step": 18170 }, { "epoch": 2.965005302226935, "grad_norm": 12.4375, "learning_rate": 2.857306499856397e-05, "loss": 2.853, "num_input_tokens_seen": 37983120, "step": 18175 }, { "epoch": 2.9658210294477527, "grad_norm": 10.5625, "learning_rate": 2.856334803377594e-05, "loss": 2.9062, "num_input_tokens_seen": 37992672, "step": 18180 }, { "epoch": 2.96663675666857, "grad_norm": 11.75, "learning_rate": 2.8553630519474867e-05, "loss": 2.4334, "num_input_tokens_seen": 38002944, "step": 18185 }, { "epoch": 2.9674524838893874, "grad_norm": 8.125, "learning_rate": 2.8543912457159317e-05, "loss": 2.8965, "num_input_tokens_seen": 38012096, "step": 18190 }, { "epoch": 2.968268211110205, "grad_norm": 7.3125, "learning_rate": 2.853419384832792e-05, "loss": 3.5969, "num_input_tokens_seen": 38020576, "step": 18195 }, { "epoch": 2.969083938331022, "grad_norm": 2.328125, "learning_rate": 2.8524474694479423e-05, "loss": 1.6621, "num_input_tokens_seen": 38030400, "step": 18200 }, { "epoch": 2.969083938331022, "eval_loss": 2.5444138050079346, "eval_runtime": 135.0548, "eval_samples_per_second": 20.177, "eval_steps_per_second": 10.092, "num_input_tokens_seen": 38030400, "step": 18200 }, { "epoch": 2.9698996655518393, "grad_norm": 9.9375, "learning_rate": 2.851475499711264e-05, "loss": 3.4106, "num_input_tokens_seen": 38039856, "step": 18205 }, { "epoch": 2.970715392772657, "grad_norm": 6.3125, "learning_rate": 2.8505034757726468e-05, "loss": 1.4542, "num_input_tokens_seen": 38049600, "step": 18210 }, { "epoch": 2.9715311199934744, "grad_norm": 8.5625, "learning_rate": 2.8495313977819886e-05, "loss": 3.4219, "num_input_tokens_seen": 38060512, "step": 18215 }, { "epoch": 2.9723468472142915, "grad_norm": 11.125, "learning_rate": 2.8485592658891956e-05, "loss": 3.0502, "num_input_tokens_seen": 38069968, "step": 18220 }, { "epoch": 2.9731625744351087, "grad_norm": 6.0, "learning_rate": 2.8475870802441844e-05, "loss": 3.5948, "num_input_tokens_seen": 38080560, "step": 18225 }, { "epoch": 2.9739783016559262, "grad_norm": 9.5, "learning_rate": 2.8466148409968774e-05, "loss": 2.6381, "num_input_tokens_seen": 38089328, "step": 18230 }, { "epoch": 2.974794028876744, "grad_norm": 4.53125, "learning_rate": 2.8456425482972067e-05, "loss": 1.267, "num_input_tokens_seen": 38098864, "step": 18235 }, { "epoch": 2.975609756097561, "grad_norm": 5.625, "learning_rate": 2.84467020229511e-05, "loss": 2.028, "num_input_tokens_seen": 38108464, "step": 18240 }, { "epoch": 2.976425483318378, "grad_norm": 9.25, "learning_rate": 2.8436978031405375e-05, "loss": 2.8224, "num_input_tokens_seen": 38118272, "step": 18245 }, { "epoch": 2.9772412105391957, "grad_norm": 8.625, "learning_rate": 2.842725350983445e-05, "loss": 4.1128, "num_input_tokens_seen": 38128688, "step": 18250 }, { "epoch": 2.9780569377600132, "grad_norm": 2.984375, "learning_rate": 2.8417528459737957e-05, "loss": 1.9387, "num_input_tokens_seen": 38137520, "step": 18255 }, { "epoch": 2.9788726649808304, "grad_norm": 16.625, "learning_rate": 2.8407802882615624e-05, "loss": 1.9687, "num_input_tokens_seen": 38148480, "step": 18260 }, { "epoch": 2.9796883922016475, "grad_norm": 11.1875, "learning_rate": 2.8398076779967277e-05, "loss": 2.9298, "num_input_tokens_seen": 38158160, "step": 18265 }, { "epoch": 2.980504119422465, "grad_norm": 2.59375, "learning_rate": 2.8388350153292774e-05, "loss": 1.3153, "num_input_tokens_seen": 38168544, "step": 18270 }, { "epoch": 2.9813198466432826, "grad_norm": 4.03125, "learning_rate": 2.8378623004092103e-05, "loss": 1.5028, "num_input_tokens_seen": 38178848, "step": 18275 }, { "epoch": 2.9821355738640998, "grad_norm": 5.25, "learning_rate": 2.8368895333865302e-05, "loss": 2.29, "num_input_tokens_seen": 38188288, "step": 18280 }, { "epoch": 2.9829513010849174, "grad_norm": 8.5, "learning_rate": 2.835916714411251e-05, "loss": 2.9891, "num_input_tokens_seen": 38197904, "step": 18285 }, { "epoch": 2.9837670283057345, "grad_norm": 10.5, "learning_rate": 2.8349438436333926e-05, "loss": 3.2737, "num_input_tokens_seen": 38208640, "step": 18290 }, { "epoch": 2.984582755526552, "grad_norm": 2.1875, "learning_rate": 2.833970921202984e-05, "loss": 1.5725, "num_input_tokens_seen": 38221136, "step": 18295 }, { "epoch": 2.985398482747369, "grad_norm": 7.75, "learning_rate": 2.8329979472700628e-05, "loss": 2.4457, "num_input_tokens_seen": 38230800, "step": 18300 }, { "epoch": 2.9862142099681868, "grad_norm": 6.6875, "learning_rate": 2.832024921984674e-05, "loss": 2.9475, "num_input_tokens_seen": 38240128, "step": 18305 }, { "epoch": 2.987029937189004, "grad_norm": 1.609375, "learning_rate": 2.8310518454968693e-05, "loss": 1.8237, "num_input_tokens_seen": 38251888, "step": 18310 }, { "epoch": 2.9878456644098215, "grad_norm": 7.84375, "learning_rate": 2.8300787179567095e-05, "loss": 1.9924, "num_input_tokens_seen": 38262384, "step": 18315 }, { "epoch": 2.9886613916306386, "grad_norm": 4.59375, "learning_rate": 2.8291055395142636e-05, "loss": 2.4782, "num_input_tokens_seen": 38270528, "step": 18320 }, { "epoch": 2.989477118851456, "grad_norm": 5.875, "learning_rate": 2.8281323103196073e-05, "loss": 1.7856, "num_input_tokens_seen": 38282320, "step": 18325 }, { "epoch": 2.9902928460722733, "grad_norm": 7.21875, "learning_rate": 2.8271590305228256e-05, "loss": 2.7209, "num_input_tokens_seen": 38293984, "step": 18330 }, { "epoch": 2.991108573293091, "grad_norm": 1.9765625, "learning_rate": 2.82618570027401e-05, "loss": 2.3179, "num_input_tokens_seen": 38304048, "step": 18335 }, { "epoch": 2.991924300513908, "grad_norm": 2.78125, "learning_rate": 2.8252123197232604e-05, "loss": 2.617, "num_input_tokens_seen": 38314816, "step": 18340 }, { "epoch": 2.9927400277347256, "grad_norm": 6.71875, "learning_rate": 2.8242388890206843e-05, "loss": 2.7697, "num_input_tokens_seen": 38325424, "step": 18345 }, { "epoch": 2.9935557549555427, "grad_norm": 8.75, "learning_rate": 2.8232654083163967e-05, "loss": 1.7537, "num_input_tokens_seen": 38335776, "step": 18350 }, { "epoch": 2.9943714821763603, "grad_norm": 6.96875, "learning_rate": 2.822291877760521e-05, "loss": 2.5797, "num_input_tokens_seen": 38346608, "step": 18355 }, { "epoch": 2.9951872093971774, "grad_norm": 4.375, "learning_rate": 2.8213182975031864e-05, "loss": 1.0546, "num_input_tokens_seen": 38357200, "step": 18360 }, { "epoch": 2.996002936617995, "grad_norm": 8.6875, "learning_rate": 2.8203446676945337e-05, "loss": 2.0592, "num_input_tokens_seen": 38367104, "step": 18365 }, { "epoch": 2.9968186638388126, "grad_norm": 1.1953125, "learning_rate": 2.8193709884847075e-05, "loss": 2.3507, "num_input_tokens_seen": 38377984, "step": 18370 }, { "epoch": 2.9976343910596297, "grad_norm": 0.365234375, "learning_rate": 2.8183972600238605e-05, "loss": 1.4517, "num_input_tokens_seen": 38387936, "step": 18375 }, { "epoch": 2.998450118280447, "grad_norm": 9.8125, "learning_rate": 2.817423482462156e-05, "loss": 4.0664, "num_input_tokens_seen": 38397024, "step": 18380 }, { "epoch": 2.9992658455012644, "grad_norm": 7.75, "learning_rate": 2.8164496559497605e-05, "loss": 1.9576, "num_input_tokens_seen": 38405936, "step": 18385 }, { "epoch": 3.0, "grad_norm": 5.0, "learning_rate": 2.815475780636852e-05, "loss": 2.1702, "num_input_tokens_seen": 38413760, "step": 18390 }, { "epoch": 3.0008157272208176, "grad_norm": 0.76953125, "learning_rate": 2.814501856673613e-05, "loss": 1.7922, "num_input_tokens_seen": 38423248, "step": 18395 }, { "epoch": 3.0016314544416347, "grad_norm": 8.4375, "learning_rate": 2.8135278842102353e-05, "loss": 2.3376, "num_input_tokens_seen": 38435312, "step": 18400 }, { "epoch": 3.0016314544416347, "eval_loss": 2.535127878189087, "eval_runtime": 134.6644, "eval_samples_per_second": 20.235, "eval_steps_per_second": 10.121, "num_input_tokens_seen": 38435312, "step": 18400 }, { "epoch": 3.0024471816624523, "grad_norm": 2.578125, "learning_rate": 2.8125538633969183e-05, "loss": 2.2135, "num_input_tokens_seen": 38446352, "step": 18405 }, { "epoch": 3.0032629088832694, "grad_norm": 5.59375, "learning_rate": 2.8115797943838677e-05, "loss": 2.1805, "num_input_tokens_seen": 38455520, "step": 18410 }, { "epoch": 3.004078636104087, "grad_norm": 8.75, "learning_rate": 2.810605677321298e-05, "loss": 2.539, "num_input_tokens_seen": 38467056, "step": 18415 }, { "epoch": 3.004894363324904, "grad_norm": 3.359375, "learning_rate": 2.809631512359428e-05, "loss": 0.9996, "num_input_tokens_seen": 38476688, "step": 18420 }, { "epoch": 3.0057100905457217, "grad_norm": 8.75, "learning_rate": 2.8086572996484884e-05, "loss": 4.1607, "num_input_tokens_seen": 38487936, "step": 18425 }, { "epoch": 3.006525817766539, "grad_norm": 1.765625, "learning_rate": 2.8076830393387143e-05, "loss": 2.9166, "num_input_tokens_seen": 38497984, "step": 18430 }, { "epoch": 3.0073415449873564, "grad_norm": 3.390625, "learning_rate": 2.8067087315803497e-05, "loss": 1.1785, "num_input_tokens_seen": 38508944, "step": 18435 }, { "epoch": 3.0081572722081735, "grad_norm": 9.875, "learning_rate": 2.8057343765236433e-05, "loss": 3.0023, "num_input_tokens_seen": 38519808, "step": 18440 }, { "epoch": 3.008972999428991, "grad_norm": 12.4375, "learning_rate": 2.804759974318854e-05, "loss": 2.3482, "num_input_tokens_seen": 38530928, "step": 18445 }, { "epoch": 3.0097887266498082, "grad_norm": 3.875, "learning_rate": 2.8037855251162482e-05, "loss": 3.0993, "num_input_tokens_seen": 38542160, "step": 18450 }, { "epoch": 3.010604453870626, "grad_norm": 2.609375, "learning_rate": 2.802811029066096e-05, "loss": 3.5326, "num_input_tokens_seen": 38554224, "step": 18455 }, { "epoch": 3.011420181091443, "grad_norm": 7.3125, "learning_rate": 2.8018364863186764e-05, "loss": 2.1594, "num_input_tokens_seen": 38565952, "step": 18460 }, { "epoch": 3.0122359083122605, "grad_norm": 7.59375, "learning_rate": 2.800861897024279e-05, "loss": 2.3561, "num_input_tokens_seen": 38576800, "step": 18465 }, { "epoch": 3.0130516355330776, "grad_norm": 13.0, "learning_rate": 2.799887261333196e-05, "loss": 2.2839, "num_input_tokens_seen": 38587584, "step": 18470 }, { "epoch": 3.013867362753895, "grad_norm": 6.84375, "learning_rate": 2.798912579395728e-05, "loss": 1.8395, "num_input_tokens_seen": 38597520, "step": 18475 }, { "epoch": 3.0146830899747123, "grad_norm": 5.28125, "learning_rate": 2.797937851362185e-05, "loss": 1.2295, "num_input_tokens_seen": 38608288, "step": 18480 }, { "epoch": 3.01549881719553, "grad_norm": 7.5, "learning_rate": 2.7969630773828802e-05, "loss": 2.3853, "num_input_tokens_seen": 38617328, "step": 18485 }, { "epoch": 3.016314544416347, "grad_norm": 7.09375, "learning_rate": 2.7959882576081382e-05, "loss": 2.8429, "num_input_tokens_seen": 38627536, "step": 18490 }, { "epoch": 3.0171302716371646, "grad_norm": 7.125, "learning_rate": 2.795013392188286e-05, "loss": 2.3691, "num_input_tokens_seen": 38638480, "step": 18495 }, { "epoch": 3.0179459988579818, "grad_norm": 11.625, "learning_rate": 2.7940384812736614e-05, "loss": 1.9965, "num_input_tokens_seen": 38649872, "step": 18500 }, { "epoch": 3.0187617260787993, "grad_norm": 5.6875, "learning_rate": 2.7930635250146087e-05, "loss": 2.4363, "num_input_tokens_seen": 38660688, "step": 18505 }, { "epoch": 3.0195774532996165, "grad_norm": 4.75, "learning_rate": 2.792088523561477e-05, "loss": 1.6412, "num_input_tokens_seen": 38672800, "step": 18510 }, { "epoch": 3.020393180520434, "grad_norm": 2.9375, "learning_rate": 2.7911134770646246e-05, "loss": 2.1075, "num_input_tokens_seen": 38684128, "step": 18515 }, { "epoch": 3.021208907741251, "grad_norm": 6.34375, "learning_rate": 2.7901383856744157e-05, "loss": 2.8286, "num_input_tokens_seen": 38694736, "step": 18520 }, { "epoch": 3.0220246349620687, "grad_norm": 6.96875, "learning_rate": 2.7891632495412217e-05, "loss": 1.8855, "num_input_tokens_seen": 38706064, "step": 18525 }, { "epoch": 3.022840362182886, "grad_norm": 6.375, "learning_rate": 2.7881880688154205e-05, "loss": 2.2283, "num_input_tokens_seen": 38717632, "step": 18530 }, { "epoch": 3.0236560894037035, "grad_norm": 4.375, "learning_rate": 2.7872128436473977e-05, "loss": 1.8838, "num_input_tokens_seen": 38728528, "step": 18535 }, { "epoch": 3.0244718166245206, "grad_norm": 9.8125, "learning_rate": 2.7862375741875448e-05, "loss": 4.1259, "num_input_tokens_seen": 38739536, "step": 18540 }, { "epoch": 3.025287543845338, "grad_norm": 0.60546875, "learning_rate": 2.785262260586261e-05, "loss": 2.8911, "num_input_tokens_seen": 38750880, "step": 18545 }, { "epoch": 3.0261032710661553, "grad_norm": 7.5, "learning_rate": 2.7842869029939517e-05, "loss": 2.1592, "num_input_tokens_seen": 38760576, "step": 18550 }, { "epoch": 3.026918998286973, "grad_norm": 6.0, "learning_rate": 2.7833115015610296e-05, "loss": 2.9689, "num_input_tokens_seen": 38771152, "step": 18555 }, { "epoch": 3.02773472550779, "grad_norm": 7.8125, "learning_rate": 2.7823360564379136e-05, "loss": 1.89, "num_input_tokens_seen": 38780672, "step": 18560 }, { "epoch": 3.0285504527286076, "grad_norm": 3.859375, "learning_rate": 2.7813605677750297e-05, "loss": 1.6542, "num_input_tokens_seen": 38789696, "step": 18565 }, { "epoch": 3.0293661799494247, "grad_norm": 5.375, "learning_rate": 2.7803850357228102e-05, "loss": 3.9287, "num_input_tokens_seen": 38801872, "step": 18570 }, { "epoch": 3.0301819071702423, "grad_norm": 7.25, "learning_rate": 2.779409460431695e-05, "loss": 2.5079, "num_input_tokens_seen": 38813200, "step": 18575 }, { "epoch": 3.03099763439106, "grad_norm": 6.28125, "learning_rate": 2.778433842052129e-05, "loss": 2.2778, "num_input_tokens_seen": 38824160, "step": 18580 }, { "epoch": 3.031813361611877, "grad_norm": 4.78125, "learning_rate": 2.7774581807345664e-05, "loss": 1.9163, "num_input_tokens_seen": 38835856, "step": 18585 }, { "epoch": 3.0326290888326946, "grad_norm": 4.09375, "learning_rate": 2.776482476629465e-05, "loss": 1.7937, "num_input_tokens_seen": 38846672, "step": 18590 }, { "epoch": 3.0334448160535117, "grad_norm": 17.25, "learning_rate": 2.7755067298872924e-05, "loss": 2.5019, "num_input_tokens_seen": 38856576, "step": 18595 }, { "epoch": 3.0342605432743293, "grad_norm": 4.84375, "learning_rate": 2.774530940658518e-05, "loss": 3.3812, "num_input_tokens_seen": 38869040, "step": 18600 }, { "epoch": 3.0342605432743293, "eval_loss": 2.5402324199676514, "eval_runtime": 134.6647, "eval_samples_per_second": 20.235, "eval_steps_per_second": 10.121, "num_input_tokens_seen": 38869040, "step": 18600 }, { "epoch": 3.0350762704951464, "grad_norm": 6.0, "learning_rate": 2.7735551090936236e-05, "loss": 2.1983, "num_input_tokens_seen": 38879216, "step": 18605 }, { "epoch": 3.035891997715964, "grad_norm": 9.375, "learning_rate": 2.7725792353430934e-05, "loss": 2.0366, "num_input_tokens_seen": 38890208, "step": 18610 }, { "epoch": 3.036707724936781, "grad_norm": 5.84375, "learning_rate": 2.77160331955742e-05, "loss": 2.2987, "num_input_tokens_seen": 38900800, "step": 18615 }, { "epoch": 3.0375234521575987, "grad_norm": 6.03125, "learning_rate": 2.7706273618871008e-05, "loss": 1.7196, "num_input_tokens_seen": 38912096, "step": 18620 }, { "epoch": 3.038339179378416, "grad_norm": 4.5625, "learning_rate": 2.769651362482642e-05, "loss": 3.3059, "num_input_tokens_seen": 38922192, "step": 18625 }, { "epoch": 3.0391549065992334, "grad_norm": 5.71875, "learning_rate": 2.768675321494555e-05, "loss": 2.3266, "num_input_tokens_seen": 38932528, "step": 18630 }, { "epoch": 3.0399706338200505, "grad_norm": 5.875, "learning_rate": 2.7676992390733565e-05, "loss": 1.4866, "num_input_tokens_seen": 38943424, "step": 18635 }, { "epoch": 3.040786361040868, "grad_norm": 1.203125, "learning_rate": 2.766723115369571e-05, "loss": 1.9037, "num_input_tokens_seen": 38953312, "step": 18640 }, { "epoch": 3.041602088261685, "grad_norm": 11.3125, "learning_rate": 2.765746950533729e-05, "loss": 3.0202, "num_input_tokens_seen": 38963648, "step": 18645 }, { "epoch": 3.042417815482503, "grad_norm": 9.3125, "learning_rate": 2.7647707447163684e-05, "loss": 1.8829, "num_input_tokens_seen": 38975440, "step": 18650 }, { "epoch": 3.04323354270332, "grad_norm": 9.875, "learning_rate": 2.7637944980680315e-05, "loss": 1.6415, "num_input_tokens_seen": 38985712, "step": 18655 }, { "epoch": 3.0440492699241375, "grad_norm": 7.9375, "learning_rate": 2.762818210739268e-05, "loss": 3.2137, "num_input_tokens_seen": 38996432, "step": 18660 }, { "epoch": 3.0448649971449546, "grad_norm": 5.375, "learning_rate": 2.7618418828806332e-05, "loss": 2.4147, "num_input_tokens_seen": 39006176, "step": 18665 }, { "epoch": 3.045680724365772, "grad_norm": 6.625, "learning_rate": 2.76086551464269e-05, "loss": 2.1611, "num_input_tokens_seen": 39016032, "step": 18670 }, { "epoch": 3.0464964515865893, "grad_norm": 10.5625, "learning_rate": 2.759889106176006e-05, "loss": 2.0296, "num_input_tokens_seen": 39028112, "step": 18675 }, { "epoch": 3.047312178807407, "grad_norm": 7.34375, "learning_rate": 2.758912657631156e-05, "loss": 2.7356, "num_input_tokens_seen": 39039104, "step": 18680 }, { "epoch": 3.048127906028224, "grad_norm": 6.4375, "learning_rate": 2.7579361691587198e-05, "loss": 1.9054, "num_input_tokens_seen": 39050944, "step": 18685 }, { "epoch": 3.0489436332490416, "grad_norm": 4.78125, "learning_rate": 2.756959640909285e-05, "loss": 2.6923, "num_input_tokens_seen": 39062608, "step": 18690 }, { "epoch": 3.0497593604698587, "grad_norm": 5.0625, "learning_rate": 2.7559830730334452e-05, "loss": 1.8744, "num_input_tokens_seen": 39073088, "step": 18695 }, { "epoch": 3.0505750876906763, "grad_norm": 6.21875, "learning_rate": 2.7550064656817988e-05, "loss": 1.3693, "num_input_tokens_seen": 39082720, "step": 18700 }, { "epoch": 3.0513908149114934, "grad_norm": 18.625, "learning_rate": 2.7540298190049503e-05, "loss": 1.7674, "num_input_tokens_seen": 39094272, "step": 18705 }, { "epoch": 3.052206542132311, "grad_norm": 4.0, "learning_rate": 2.7530531331535107e-05, "loss": 2.6544, "num_input_tokens_seen": 39104816, "step": 18710 }, { "epoch": 3.053022269353128, "grad_norm": 7.0, "learning_rate": 2.752076408278099e-05, "loss": 2.6126, "num_input_tokens_seen": 39114880, "step": 18715 }, { "epoch": 3.0538379965739457, "grad_norm": 1.2578125, "learning_rate": 2.751099644529337e-05, "loss": 1.7647, "num_input_tokens_seen": 39126880, "step": 18720 }, { "epoch": 3.054653723794763, "grad_norm": 4.09375, "learning_rate": 2.7501228420578533e-05, "loss": 1.33, "num_input_tokens_seen": 39136016, "step": 18725 }, { "epoch": 3.0554694510155804, "grad_norm": 8.1875, "learning_rate": 2.7491460010142857e-05, "loss": 2.4768, "num_input_tokens_seen": 39146688, "step": 18730 }, { "epoch": 3.0562851782363976, "grad_norm": 1.8203125, "learning_rate": 2.7481691215492727e-05, "loss": 2.3832, "num_input_tokens_seen": 39156928, "step": 18735 }, { "epoch": 3.057100905457215, "grad_norm": 8.5625, "learning_rate": 2.747192203813463e-05, "loss": 2.1745, "num_input_tokens_seen": 39168096, "step": 18740 }, { "epoch": 3.0579166326780323, "grad_norm": 18.25, "learning_rate": 2.7462152479575087e-05, "loss": 3.1783, "num_input_tokens_seen": 39178272, "step": 18745 }, { "epoch": 3.05873235989885, "grad_norm": 9.0, "learning_rate": 2.7452382541320697e-05, "loss": 2.4667, "num_input_tokens_seen": 39188880, "step": 18750 }, { "epoch": 3.059548087119667, "grad_norm": 7.28125, "learning_rate": 2.7442612224878096e-05, "loss": 2.8692, "num_input_tokens_seen": 39199712, "step": 18755 }, { "epoch": 3.0603638143404845, "grad_norm": 8.375, "learning_rate": 2.7432841531753994e-05, "loss": 3.7592, "num_input_tokens_seen": 39210928, "step": 18760 }, { "epoch": 3.0611795415613017, "grad_norm": 7.0, "learning_rate": 2.7423070463455147e-05, "loss": 2.7395, "num_input_tokens_seen": 39221392, "step": 18765 }, { "epoch": 3.0619952687821193, "grad_norm": 8.4375, "learning_rate": 2.7413299021488397e-05, "loss": 3.383, "num_input_tokens_seen": 39230624, "step": 18770 }, { "epoch": 3.062810996002937, "grad_norm": 9.6875, "learning_rate": 2.7403527207360615e-05, "loss": 2.5962, "num_input_tokens_seen": 39241776, "step": 18775 }, { "epoch": 3.063626723223754, "grad_norm": 0.216796875, "learning_rate": 2.7393755022578722e-05, "loss": 1.927, "num_input_tokens_seen": 39251312, "step": 18780 }, { "epoch": 3.0644424504445715, "grad_norm": 6.5, "learning_rate": 2.7383982468649714e-05, "loss": 2.0895, "num_input_tokens_seen": 39262512, "step": 18785 }, { "epoch": 3.0652581776653887, "grad_norm": 4.375, "learning_rate": 2.7374209547080665e-05, "loss": 2.3992, "num_input_tokens_seen": 39272320, "step": 18790 }, { "epoch": 3.0660739048862062, "grad_norm": 8.1875, "learning_rate": 2.7364436259378663e-05, "loss": 1.4882, "num_input_tokens_seen": 39283888, "step": 18795 }, { "epoch": 3.0668896321070234, "grad_norm": 6.90625, "learning_rate": 2.735466260705088e-05, "loss": 2.2142, "num_input_tokens_seen": 39294832, "step": 18800 }, { "epoch": 3.0668896321070234, "eval_loss": 2.548177480697632, "eval_runtime": 135.1277, "eval_samples_per_second": 20.166, "eval_steps_per_second": 10.087, "num_input_tokens_seen": 39294832, "step": 18800 }, { "epoch": 3.067705359327841, "grad_norm": 10.4375, "learning_rate": 2.7344888591604524e-05, "loss": 1.2988, "num_input_tokens_seen": 39305520, "step": 18805 }, { "epoch": 3.068521086548658, "grad_norm": 3.796875, "learning_rate": 2.7335114214546893e-05, "loss": 2.2595, "num_input_tokens_seen": 39314640, "step": 18810 }, { "epoch": 3.0693368137694756, "grad_norm": 7.78125, "learning_rate": 2.7325339477385293e-05, "loss": 2.6693, "num_input_tokens_seen": 39324720, "step": 18815 }, { "epoch": 3.0701525409902928, "grad_norm": 7.0625, "learning_rate": 2.7315564381627128e-05, "loss": 2.535, "num_input_tokens_seen": 39335504, "step": 18820 }, { "epoch": 3.0709682682111104, "grad_norm": 3.65625, "learning_rate": 2.7305788928779835e-05, "loss": 2.8521, "num_input_tokens_seen": 39346160, "step": 18825 }, { "epoch": 3.0717839954319275, "grad_norm": 5.53125, "learning_rate": 2.729601312035091e-05, "loss": 1.8358, "num_input_tokens_seen": 39354576, "step": 18830 }, { "epoch": 3.072599722652745, "grad_norm": 10.0, "learning_rate": 2.7286236957847915e-05, "loss": 2.5139, "num_input_tokens_seen": 39363920, "step": 18835 }, { "epoch": 3.073415449873562, "grad_norm": 10.3125, "learning_rate": 2.7276460442778446e-05, "loss": 1.8985, "num_input_tokens_seen": 39374368, "step": 18840 }, { "epoch": 3.0742311770943798, "grad_norm": 7.25, "learning_rate": 2.726668357665017e-05, "loss": 2.1724, "num_input_tokens_seen": 39386048, "step": 18845 }, { "epoch": 3.075046904315197, "grad_norm": 7.84375, "learning_rate": 2.7256906360970808e-05, "loss": 1.0069, "num_input_tokens_seen": 39397056, "step": 18850 }, { "epoch": 3.0758626315360145, "grad_norm": 5.75, "learning_rate": 2.7247128797248117e-05, "loss": 3.2035, "num_input_tokens_seen": 39406592, "step": 18855 }, { "epoch": 3.0766783587568316, "grad_norm": 1.9296875, "learning_rate": 2.7237350886989925e-05, "loss": 2.4079, "num_input_tokens_seen": 39416016, "step": 18860 }, { "epoch": 3.077494085977649, "grad_norm": 12.5625, "learning_rate": 2.7227572631704107e-05, "loss": 2.8224, "num_input_tokens_seen": 39426144, "step": 18865 }, { "epoch": 3.0783098131984663, "grad_norm": 4.6875, "learning_rate": 2.7217794032898596e-05, "loss": 2.8631, "num_input_tokens_seen": 39436880, "step": 18870 }, { "epoch": 3.079125540419284, "grad_norm": 6.34375, "learning_rate": 2.7208015092081384e-05, "loss": 2.592, "num_input_tokens_seen": 39447856, "step": 18875 }, { "epoch": 3.079941267640101, "grad_norm": 9.8125, "learning_rate": 2.719823581076049e-05, "loss": 3.2701, "num_input_tokens_seen": 39456720, "step": 18880 }, { "epoch": 3.0807569948609186, "grad_norm": 7.6875, "learning_rate": 2.718845619044401e-05, "loss": 2.3436, "num_input_tokens_seen": 39466992, "step": 18885 }, { "epoch": 3.0815727220817357, "grad_norm": 11.3125, "learning_rate": 2.7178676232640088e-05, "loss": 2.0766, "num_input_tokens_seen": 39477488, "step": 18890 }, { "epoch": 3.0823884493025533, "grad_norm": 4.96875, "learning_rate": 2.716889593885691e-05, "loss": 2.3528, "num_input_tokens_seen": 39486432, "step": 18895 }, { "epoch": 3.0832041765233704, "grad_norm": 5.34375, "learning_rate": 2.7159115310602716e-05, "loss": 1.709, "num_input_tokens_seen": 39498208, "step": 18900 }, { "epoch": 3.084019903744188, "grad_norm": 5.03125, "learning_rate": 2.7149334349385814e-05, "loss": 1.2992, "num_input_tokens_seen": 39508832, "step": 18905 }, { "epoch": 3.084835630965005, "grad_norm": 5.8125, "learning_rate": 2.713955305671454e-05, "loss": 1.6824, "num_input_tokens_seen": 39519248, "step": 18910 }, { "epoch": 3.0856513581858227, "grad_norm": 11.25, "learning_rate": 2.71297714340973e-05, "loss": 2.8482, "num_input_tokens_seen": 39529936, "step": 18915 }, { "epoch": 3.08646708540664, "grad_norm": 6.0625, "learning_rate": 2.7119989483042545e-05, "loss": 1.9419, "num_input_tokens_seen": 39540752, "step": 18920 }, { "epoch": 3.0872828126274574, "grad_norm": 9.3125, "learning_rate": 2.7110207205058768e-05, "loss": 2.4596, "num_input_tokens_seen": 39550272, "step": 18925 }, { "epoch": 3.0880985398482745, "grad_norm": 3.015625, "learning_rate": 2.7100424601654517e-05, "loss": 2.2172, "num_input_tokens_seen": 39559920, "step": 18930 }, { "epoch": 3.088914267069092, "grad_norm": 1.6953125, "learning_rate": 2.7090641674338403e-05, "loss": 1.0733, "num_input_tokens_seen": 39569632, "step": 18935 }, { "epoch": 3.0897299942899092, "grad_norm": 11.625, "learning_rate": 2.7080858424619072e-05, "loss": 2.7145, "num_input_tokens_seen": 39580704, "step": 18940 }, { "epoch": 3.090545721510727, "grad_norm": 8.6875, "learning_rate": 2.707107485400521e-05, "loss": 2.8611, "num_input_tokens_seen": 39590160, "step": 18945 }, { "epoch": 3.0913614487315444, "grad_norm": 3.34375, "learning_rate": 2.7061290964005586e-05, "loss": 1.4562, "num_input_tokens_seen": 39600768, "step": 18950 }, { "epoch": 3.0921771759523615, "grad_norm": 8.5, "learning_rate": 2.7051506756129e-05, "loss": 2.1127, "num_input_tokens_seen": 39612256, "step": 18955 }, { "epoch": 3.092992903173179, "grad_norm": 10.9375, "learning_rate": 2.704172223188428e-05, "loss": 3.5645, "num_input_tokens_seen": 39621536, "step": 18960 }, { "epoch": 3.0938086303939962, "grad_norm": 3.953125, "learning_rate": 2.7031937392780334e-05, "loss": 3.572, "num_input_tokens_seen": 39631904, "step": 18965 }, { "epoch": 3.094624357614814, "grad_norm": 6.6875, "learning_rate": 2.702215224032611e-05, "loss": 2.5679, "num_input_tokens_seen": 39641872, "step": 18970 }, { "epoch": 3.095440084835631, "grad_norm": 5.9375, "learning_rate": 2.70123667760306e-05, "loss": 2.0799, "num_input_tokens_seen": 39652656, "step": 18975 }, { "epoch": 3.0962558120564485, "grad_norm": 7.78125, "learning_rate": 2.7002581001402845e-05, "loss": 1.7741, "num_input_tokens_seen": 39664400, "step": 18980 }, { "epoch": 3.0970715392772656, "grad_norm": 11.125, "learning_rate": 2.6992794917951923e-05, "loss": 2.7907, "num_input_tokens_seen": 39674944, "step": 18985 }, { "epoch": 3.097887266498083, "grad_norm": 4.5625, "learning_rate": 2.6983008527187e-05, "loss": 1.2365, "num_input_tokens_seen": 39685440, "step": 18990 }, { "epoch": 3.0987029937189003, "grad_norm": 9.1875, "learning_rate": 2.697322183061723e-05, "loss": 1.362, "num_input_tokens_seen": 39696608, "step": 18995 }, { "epoch": 3.099518720939718, "grad_norm": 5.96875, "learning_rate": 2.696343482975186e-05, "loss": 2.0647, "num_input_tokens_seen": 39706928, "step": 19000 }, { "epoch": 3.099518720939718, "eval_loss": 2.538421630859375, "eval_runtime": 134.7507, "eval_samples_per_second": 20.223, "eval_steps_per_second": 10.115, "num_input_tokens_seen": 39706928, "step": 19000 }, { "epoch": 3.100334448160535, "grad_norm": 4.875, "learning_rate": 2.695364752610016e-05, "loss": 1.3137, "num_input_tokens_seen": 39717904, "step": 19005 }, { "epoch": 3.1011501753813526, "grad_norm": 8.5625, "learning_rate": 2.6943859921171467e-05, "loss": 2.3466, "num_input_tokens_seen": 39728368, "step": 19010 }, { "epoch": 3.1019659026021698, "grad_norm": 5.59375, "learning_rate": 2.6934072016475143e-05, "loss": 2.2962, "num_input_tokens_seen": 39738544, "step": 19015 }, { "epoch": 3.1027816298229873, "grad_norm": 7.65625, "learning_rate": 2.6924283813520606e-05, "loss": 1.6056, "num_input_tokens_seen": 39748848, "step": 19020 }, { "epoch": 3.1035973570438045, "grad_norm": 9.4375, "learning_rate": 2.691449531381733e-05, "loss": 2.5877, "num_input_tokens_seen": 39759024, "step": 19025 }, { "epoch": 3.104413084264622, "grad_norm": 2.078125, "learning_rate": 2.6904706518874816e-05, "loss": 2.9983, "num_input_tokens_seen": 39770576, "step": 19030 }, { "epoch": 3.105228811485439, "grad_norm": 10.6875, "learning_rate": 2.6894917430202615e-05, "loss": 2.3436, "num_input_tokens_seen": 39780272, "step": 19035 }, { "epoch": 3.1060445387062567, "grad_norm": 3.9375, "learning_rate": 2.6885128049310343e-05, "loss": 2.1036, "num_input_tokens_seen": 39793072, "step": 19040 }, { "epoch": 3.106860265927074, "grad_norm": 11.4375, "learning_rate": 2.687533837770762e-05, "loss": 1.4843, "num_input_tokens_seen": 39803264, "step": 19045 }, { "epoch": 3.1076759931478914, "grad_norm": 6.53125, "learning_rate": 2.6865548416904162e-05, "loss": 0.7208, "num_input_tokens_seen": 39813824, "step": 19050 }, { "epoch": 3.1084917203687086, "grad_norm": 0.44921875, "learning_rate": 2.68557581684097e-05, "loss": 2.4978, "num_input_tokens_seen": 39822560, "step": 19055 }, { "epoch": 3.109307447589526, "grad_norm": 9.6875, "learning_rate": 2.6845967633733998e-05, "loss": 2.3114, "num_input_tokens_seen": 39832432, "step": 19060 }, { "epoch": 3.1101231748103433, "grad_norm": 9.125, "learning_rate": 2.683617681438689e-05, "loss": 4.1923, "num_input_tokens_seen": 39843424, "step": 19065 }, { "epoch": 3.110938902031161, "grad_norm": 8.75, "learning_rate": 2.682638571187825e-05, "loss": 2.5996, "num_input_tokens_seen": 39852128, "step": 19070 }, { "epoch": 3.111754629251978, "grad_norm": 7.0, "learning_rate": 2.6816594327717976e-05, "loss": 2.7419, "num_input_tokens_seen": 39863744, "step": 19075 }, { "epoch": 3.1125703564727956, "grad_norm": 5.09375, "learning_rate": 2.680680266341603e-05, "loss": 1.6439, "num_input_tokens_seen": 39873584, "step": 19080 }, { "epoch": 3.1133860836936127, "grad_norm": 6.96875, "learning_rate": 2.67970107204824e-05, "loss": 1.4829, "num_input_tokens_seen": 39882704, "step": 19085 }, { "epoch": 3.1142018109144303, "grad_norm": 9.0625, "learning_rate": 2.6787218500427142e-05, "loss": 3.2501, "num_input_tokens_seen": 39893568, "step": 19090 }, { "epoch": 3.1150175381352474, "grad_norm": 7.65625, "learning_rate": 2.6777426004760332e-05, "loss": 3.3429, "num_input_tokens_seen": 39905648, "step": 19095 }, { "epoch": 3.115833265356065, "grad_norm": 7.6875, "learning_rate": 2.6767633234992094e-05, "loss": 2.7122, "num_input_tokens_seen": 39914608, "step": 19100 }, { "epoch": 3.116648992576882, "grad_norm": 8.6875, "learning_rate": 2.6757840192632598e-05, "loss": 4.4158, "num_input_tokens_seen": 39925312, "step": 19105 }, { "epoch": 3.1174647197976997, "grad_norm": 6.5625, "learning_rate": 2.6748046879192052e-05, "loss": 1.5997, "num_input_tokens_seen": 39934672, "step": 19110 }, { "epoch": 3.118280447018517, "grad_norm": 13.1875, "learning_rate": 2.673825329618071e-05, "loss": 2.7824, "num_input_tokens_seen": 39945040, "step": 19115 }, { "epoch": 3.1190961742393344, "grad_norm": 6.625, "learning_rate": 2.6728459445108866e-05, "loss": 3.0114, "num_input_tokens_seen": 39955840, "step": 19120 }, { "epoch": 3.1199119014601515, "grad_norm": 6.5, "learning_rate": 2.6718665327486854e-05, "loss": 2.5034, "num_input_tokens_seen": 39966688, "step": 19125 }, { "epoch": 3.120727628680969, "grad_norm": 11.375, "learning_rate": 2.6708870944825048e-05, "loss": 3.1015, "num_input_tokens_seen": 39977392, "step": 19130 }, { "epoch": 3.121543355901786, "grad_norm": 3.3125, "learning_rate": 2.6699076298633874e-05, "loss": 2.2812, "num_input_tokens_seen": 39987968, "step": 19135 }, { "epoch": 3.122359083122604, "grad_norm": 6.15625, "learning_rate": 2.6689281390423788e-05, "loss": 2.9054, "num_input_tokens_seen": 39998272, "step": 19140 }, { "epoch": 3.1231748103434214, "grad_norm": 11.375, "learning_rate": 2.667948622170527e-05, "loss": 2.3876, "num_input_tokens_seen": 40009344, "step": 19145 }, { "epoch": 3.1239905375642385, "grad_norm": 2.546875, "learning_rate": 2.6669690793988873e-05, "loss": 2.3684, "num_input_tokens_seen": 40019200, "step": 19150 }, { "epoch": 3.124806264785056, "grad_norm": 6.40625, "learning_rate": 2.665989510878518e-05, "loss": 2.8145, "num_input_tokens_seen": 40028880, "step": 19155 }, { "epoch": 3.125621992005873, "grad_norm": 7.5, "learning_rate": 2.6650099167604793e-05, "loss": 2.9376, "num_input_tokens_seen": 40039824, "step": 19160 }, { "epoch": 3.126437719226691, "grad_norm": 2.1875, "learning_rate": 2.6640302971958376e-05, "loss": 2.1377, "num_input_tokens_seen": 40048944, "step": 19165 }, { "epoch": 3.127253446447508, "grad_norm": 7.71875, "learning_rate": 2.6630506523356635e-05, "loss": 3.9531, "num_input_tokens_seen": 40059312, "step": 19170 }, { "epoch": 3.1280691736683255, "grad_norm": 12.5, "learning_rate": 2.6620709823310297e-05, "loss": 3.2793, "num_input_tokens_seen": 40070928, "step": 19175 }, { "epoch": 3.1288849008891426, "grad_norm": 12.0, "learning_rate": 2.661091287333014e-05, "loss": 2.8367, "num_input_tokens_seen": 40080368, "step": 19180 }, { "epoch": 3.12970062810996, "grad_norm": 7.8125, "learning_rate": 2.660111567492696e-05, "loss": 2.9917, "num_input_tokens_seen": 40091376, "step": 19185 }, { "epoch": 3.1305163553307773, "grad_norm": 8.9375, "learning_rate": 2.6591318229611635e-05, "loss": 2.7834, "num_input_tokens_seen": 40101408, "step": 19190 }, { "epoch": 3.131332082551595, "grad_norm": 5.90625, "learning_rate": 2.6581520538895037e-05, "loss": 2.5444, "num_input_tokens_seen": 40112880, "step": 19195 }, { "epoch": 3.132147809772412, "grad_norm": 6.3125, "learning_rate": 2.6571722604288102e-05, "loss": 2.2582, "num_input_tokens_seen": 40121472, "step": 19200 }, { "epoch": 3.132147809772412, "eval_loss": 2.5614473819732666, "eval_runtime": 135.0803, "eval_samples_per_second": 20.173, "eval_steps_per_second": 10.09, "num_input_tokens_seen": 40121472, "step": 19200 }, { "epoch": 3.1329635369932296, "grad_norm": 11.4375, "learning_rate": 2.656192442730179e-05, "loss": 1.7784, "num_input_tokens_seen": 40131728, "step": 19205 }, { "epoch": 3.1337792642140467, "grad_norm": 10.5625, "learning_rate": 2.6552126009447098e-05, "loss": 2.3919, "num_input_tokens_seen": 40141408, "step": 19210 }, { "epoch": 3.1345949914348643, "grad_norm": 7.65625, "learning_rate": 2.654232735223507e-05, "loss": 2.6903, "num_input_tokens_seen": 40150880, "step": 19215 }, { "epoch": 3.1354107186556814, "grad_norm": 8.375, "learning_rate": 2.6532528457176787e-05, "loss": 2.7103, "num_input_tokens_seen": 40160608, "step": 19220 }, { "epoch": 3.136226445876499, "grad_norm": 6.25, "learning_rate": 2.6522729325783348e-05, "loss": 2.6182, "num_input_tokens_seen": 40170480, "step": 19225 }, { "epoch": 3.137042173097316, "grad_norm": 2.921875, "learning_rate": 2.6512929959565914e-05, "loss": 2.015, "num_input_tokens_seen": 40181536, "step": 19230 }, { "epoch": 3.1378579003181337, "grad_norm": 8.6875, "learning_rate": 2.6503130360035673e-05, "loss": 2.1905, "num_input_tokens_seen": 40191952, "step": 19235 }, { "epoch": 3.138673627538951, "grad_norm": 8.5625, "learning_rate": 2.6493330528703835e-05, "loss": 2.9658, "num_input_tokens_seen": 40201808, "step": 19240 }, { "epoch": 3.1394893547597684, "grad_norm": 9.25, "learning_rate": 2.648353046708167e-05, "loss": 0.716, "num_input_tokens_seen": 40214720, "step": 19245 }, { "epoch": 3.1403050819805856, "grad_norm": 8.75, "learning_rate": 2.647373017668046e-05, "loss": 3.0051, "num_input_tokens_seen": 40225920, "step": 19250 }, { "epoch": 3.141120809201403, "grad_norm": 11.75, "learning_rate": 2.6463929659011537e-05, "loss": 3.3569, "num_input_tokens_seen": 40236416, "step": 19255 }, { "epoch": 3.1419365364222203, "grad_norm": 10.0, "learning_rate": 2.6454128915586262e-05, "loss": 3.1072, "num_input_tokens_seen": 40245776, "step": 19260 }, { "epoch": 3.142752263643038, "grad_norm": 12.625, "learning_rate": 2.6444327947916036e-05, "loss": 2.3197, "num_input_tokens_seen": 40256160, "step": 19265 }, { "epoch": 3.143567990863855, "grad_norm": 10.75, "learning_rate": 2.6434526757512292e-05, "loss": 2.4474, "num_input_tokens_seen": 40267136, "step": 19270 }, { "epoch": 3.1443837180846725, "grad_norm": 0.12158203125, "learning_rate": 2.6424725345886486e-05, "loss": 2.1871, "num_input_tokens_seen": 40277360, "step": 19275 }, { "epoch": 3.1451994453054897, "grad_norm": 8.125, "learning_rate": 2.641492371455014e-05, "loss": 2.6652, "num_input_tokens_seen": 40287152, "step": 19280 }, { "epoch": 3.1460151725263072, "grad_norm": 7.4375, "learning_rate": 2.640512186501477e-05, "loss": 2.3262, "num_input_tokens_seen": 40297744, "step": 19285 }, { "epoch": 3.1468308997471244, "grad_norm": 6.53125, "learning_rate": 2.639531979879195e-05, "loss": 2.8425, "num_input_tokens_seen": 40307808, "step": 19290 }, { "epoch": 3.147646626967942, "grad_norm": 4.09375, "learning_rate": 2.638551751739328e-05, "loss": 2.9993, "num_input_tokens_seen": 40319152, "step": 19295 }, { "epoch": 3.148462354188759, "grad_norm": 8.0, "learning_rate": 2.6375715022330404e-05, "loss": 2.4484, "num_input_tokens_seen": 40330512, "step": 19300 }, { "epoch": 3.1492780814095767, "grad_norm": 5.875, "learning_rate": 2.6365912315114976e-05, "loss": 2.0357, "num_input_tokens_seen": 40341744, "step": 19305 }, { "epoch": 3.150093808630394, "grad_norm": 3.5625, "learning_rate": 2.6356109397258704e-05, "loss": 1.5845, "num_input_tokens_seen": 40353744, "step": 19310 }, { "epoch": 3.1509095358512114, "grad_norm": 4.40625, "learning_rate": 2.6346306270273325e-05, "loss": 2.025, "num_input_tokens_seen": 40364288, "step": 19315 }, { "epoch": 3.151725263072029, "grad_norm": 7.25, "learning_rate": 2.6336502935670608e-05, "loss": 2.7205, "num_input_tokens_seen": 40375152, "step": 19320 }, { "epoch": 3.152540990292846, "grad_norm": 8.25, "learning_rate": 2.6326699394962333e-05, "loss": 1.8616, "num_input_tokens_seen": 40384496, "step": 19325 }, { "epoch": 3.153356717513663, "grad_norm": 5.96875, "learning_rate": 2.6316895649660334e-05, "loss": 2.255, "num_input_tokens_seen": 40394080, "step": 19330 }, { "epoch": 3.1541724447344808, "grad_norm": 5.78125, "learning_rate": 2.6307091701276486e-05, "loss": 2.293, "num_input_tokens_seen": 40404240, "step": 19335 }, { "epoch": 3.1549881719552983, "grad_norm": 4.6875, "learning_rate": 2.629728755132267e-05, "loss": 1.6456, "num_input_tokens_seen": 40415696, "step": 19340 }, { "epoch": 3.1558038991761155, "grad_norm": 7.96875, "learning_rate": 2.628748320131081e-05, "loss": 2.3573, "num_input_tokens_seen": 40425376, "step": 19345 }, { "epoch": 3.156619626396933, "grad_norm": 4.0, "learning_rate": 2.6277678652752856e-05, "loss": 4.3231, "num_input_tokens_seen": 40435040, "step": 19350 }, { "epoch": 3.15743535361775, "grad_norm": 3.203125, "learning_rate": 2.6267873907160807e-05, "loss": 1.2957, "num_input_tokens_seen": 40445680, "step": 19355 }, { "epoch": 3.1582510808385678, "grad_norm": 8.6875, "learning_rate": 2.6258068966046668e-05, "loss": 2.794, "num_input_tokens_seen": 40457040, "step": 19360 }, { "epoch": 3.159066808059385, "grad_norm": 6.6875, "learning_rate": 2.6248263830922475e-05, "loss": 2.3179, "num_input_tokens_seen": 40467776, "step": 19365 }, { "epoch": 3.1598825352802025, "grad_norm": 7.0, "learning_rate": 2.6238458503300318e-05, "loss": 2.1722, "num_input_tokens_seen": 40478464, "step": 19370 }, { "epoch": 3.1606982625010196, "grad_norm": 9.125, "learning_rate": 2.6228652984692292e-05, "loss": 3.241, "num_input_tokens_seen": 40487760, "step": 19375 }, { "epoch": 3.161513989721837, "grad_norm": 9.875, "learning_rate": 2.621884727661054e-05, "loss": 1.3619, "num_input_tokens_seen": 40497584, "step": 19380 }, { "epoch": 3.1623297169426543, "grad_norm": 1.84375, "learning_rate": 2.6209041380567222e-05, "loss": 1.8896, "num_input_tokens_seen": 40507808, "step": 19385 }, { "epoch": 3.163145444163472, "grad_norm": 4.625, "learning_rate": 2.6199235298074527e-05, "loss": 2.5161, "num_input_tokens_seen": 40518256, "step": 19390 }, { "epoch": 3.163961171384289, "grad_norm": 0.126953125, "learning_rate": 2.618942903064468e-05, "loss": 2.0876, "num_input_tokens_seen": 40528144, "step": 19395 }, { "epoch": 3.1647768986051066, "grad_norm": 2.546875, "learning_rate": 2.6179622579789932e-05, "loss": 2.1523, "num_input_tokens_seen": 40537792, "step": 19400 }, { "epoch": 3.1647768986051066, "eval_loss": 2.54274582862854, "eval_runtime": 135.1021, "eval_samples_per_second": 20.17, "eval_steps_per_second": 10.089, "num_input_tokens_seen": 40537792, "step": 19400 }, { "epoch": 3.1655926258259237, "grad_norm": 18.5, "learning_rate": 2.6169815947022553e-05, "loss": 4.3618, "num_input_tokens_seen": 40548320, "step": 19405 }, { "epoch": 3.1664083530467413, "grad_norm": 3.546875, "learning_rate": 2.6160009133854853e-05, "loss": 1.5637, "num_input_tokens_seen": 40559536, "step": 19410 }, { "epoch": 3.1672240802675584, "grad_norm": 7.28125, "learning_rate": 2.6150202141799168e-05, "loss": 1.3557, "num_input_tokens_seen": 40571136, "step": 19415 }, { "epoch": 3.168039807488376, "grad_norm": 4.5, "learning_rate": 2.614039497236786e-05, "loss": 1.9108, "num_input_tokens_seen": 40581776, "step": 19420 }, { "epoch": 3.168855534709193, "grad_norm": 8.5, "learning_rate": 2.6130587627073315e-05, "loss": 2.6794, "num_input_tokens_seen": 40592656, "step": 19425 }, { "epoch": 3.1696712619300107, "grad_norm": 10.8125, "learning_rate": 2.6120780107427956e-05, "loss": 2.988, "num_input_tokens_seen": 40601984, "step": 19430 }, { "epoch": 3.170486989150828, "grad_norm": 8.25, "learning_rate": 2.6110972414944214e-05, "loss": 3.5976, "num_input_tokens_seen": 40612368, "step": 19435 }, { "epoch": 3.1713027163716454, "grad_norm": 9.75, "learning_rate": 2.6101164551134565e-05, "loss": 3.5858, "num_input_tokens_seen": 40623376, "step": 19440 }, { "epoch": 3.1721184435924625, "grad_norm": 4.0625, "learning_rate": 2.6091356517511505e-05, "loss": 1.5686, "num_input_tokens_seen": 40633616, "step": 19445 }, { "epoch": 3.17293417081328, "grad_norm": 9.75, "learning_rate": 2.608154831558755e-05, "loss": 2.89, "num_input_tokens_seen": 40644192, "step": 19450 }, { "epoch": 3.1737498980340972, "grad_norm": 4.71875, "learning_rate": 2.607173994687526e-05, "loss": 2.9278, "num_input_tokens_seen": 40655584, "step": 19455 }, { "epoch": 3.174565625254915, "grad_norm": 8.9375, "learning_rate": 2.6061931412887196e-05, "loss": 2.379, "num_input_tokens_seen": 40664544, "step": 19460 }, { "epoch": 3.175381352475732, "grad_norm": 5.21875, "learning_rate": 2.6052122715135973e-05, "loss": 2.7792, "num_input_tokens_seen": 40675968, "step": 19465 }, { "epoch": 3.1761970796965495, "grad_norm": 3.03125, "learning_rate": 2.60423138551342e-05, "loss": 1.3348, "num_input_tokens_seen": 40686736, "step": 19470 }, { "epoch": 3.1770128069173666, "grad_norm": 8.9375, "learning_rate": 2.6032504834394527e-05, "loss": 3.083, "num_input_tokens_seen": 40696656, "step": 19475 }, { "epoch": 3.1778285341381842, "grad_norm": 11.0, "learning_rate": 2.602269565442964e-05, "loss": 1.9149, "num_input_tokens_seen": 40707856, "step": 19480 }, { "epoch": 3.1786442613590014, "grad_norm": 10.5, "learning_rate": 2.6012886316752227e-05, "loss": 2.7468, "num_input_tokens_seen": 40718752, "step": 19485 }, { "epoch": 3.179459988579819, "grad_norm": 6.65625, "learning_rate": 2.6003076822875018e-05, "loss": 1.717, "num_input_tokens_seen": 40730240, "step": 19490 }, { "epoch": 3.180275715800636, "grad_norm": 5.625, "learning_rate": 2.5993267174310755e-05, "loss": 0.9695, "num_input_tokens_seen": 40740416, "step": 19495 }, { "epoch": 3.1810914430214536, "grad_norm": 4.96875, "learning_rate": 2.5983457372572218e-05, "loss": 2.1982, "num_input_tokens_seen": 40749968, "step": 19500 }, { "epoch": 3.1819071702422708, "grad_norm": 3.734375, "learning_rate": 2.597364741917219e-05, "loss": 3.3268, "num_input_tokens_seen": 40759088, "step": 19505 }, { "epoch": 3.1827228974630883, "grad_norm": 2.234375, "learning_rate": 2.5963837315623492e-05, "loss": 1.4316, "num_input_tokens_seen": 40769504, "step": 19510 }, { "epoch": 3.183538624683906, "grad_norm": 8.75, "learning_rate": 2.595402706343897e-05, "loss": 2.6635, "num_input_tokens_seen": 40780048, "step": 19515 }, { "epoch": 3.184354351904723, "grad_norm": 6.375, "learning_rate": 2.594421666413148e-05, "loss": 1.2298, "num_input_tokens_seen": 40790256, "step": 19520 }, { "epoch": 3.1851700791255406, "grad_norm": 5.6875, "learning_rate": 2.5934406119213928e-05, "loss": 1.7631, "num_input_tokens_seen": 40802192, "step": 19525 }, { "epoch": 3.1859858063463578, "grad_norm": 10.5, "learning_rate": 2.5924595430199193e-05, "loss": 3.1036, "num_input_tokens_seen": 40812880, "step": 19530 }, { "epoch": 3.1868015335671753, "grad_norm": 8.0625, "learning_rate": 2.5914784598600238e-05, "loss": 3.3569, "num_input_tokens_seen": 40824320, "step": 19535 }, { "epoch": 3.1876172607879925, "grad_norm": 8.375, "learning_rate": 2.5904973625930002e-05, "loss": 2.2934, "num_input_tokens_seen": 40835888, "step": 19540 }, { "epoch": 3.18843298800881, "grad_norm": 5.03125, "learning_rate": 2.5895162513701456e-05, "loss": 1.9237, "num_input_tokens_seen": 40847680, "step": 19545 }, { "epoch": 3.189248715229627, "grad_norm": 3.25, "learning_rate": 2.5885351263427593e-05, "loss": 1.9952, "num_input_tokens_seen": 40857424, "step": 19550 }, { "epoch": 3.1900644424504447, "grad_norm": 7.09375, "learning_rate": 2.5875539876621448e-05, "loss": 3.0426, "num_input_tokens_seen": 40868864, "step": 19555 }, { "epoch": 3.190880169671262, "grad_norm": 6.03125, "learning_rate": 2.586572835479605e-05, "loss": 2.5765, "num_input_tokens_seen": 40878672, "step": 19560 }, { "epoch": 3.1916958968920794, "grad_norm": 7.4375, "learning_rate": 2.585591669946446e-05, "loss": 3.1153, "num_input_tokens_seen": 40888256, "step": 19565 }, { "epoch": 3.1925116241128966, "grad_norm": 11.5, "learning_rate": 2.5846104912139756e-05, "loss": 2.7339, "num_input_tokens_seen": 40899104, "step": 19570 }, { "epoch": 3.193327351333714, "grad_norm": 10.25, "learning_rate": 2.583629299433505e-05, "loss": 1.8835, "num_input_tokens_seen": 40909856, "step": 19575 }, { "epoch": 3.1941430785545313, "grad_norm": 4.1875, "learning_rate": 2.582648094756345e-05, "loss": 2.3613, "num_input_tokens_seen": 40920000, "step": 19580 }, { "epoch": 3.194958805775349, "grad_norm": 4.71875, "learning_rate": 2.5816668773338098e-05, "loss": 1.7474, "num_input_tokens_seen": 40929168, "step": 19585 }, { "epoch": 3.195774532996166, "grad_norm": 6.9375, "learning_rate": 2.580685647317216e-05, "loss": 2.2862, "num_input_tokens_seen": 40939328, "step": 19590 }, { "epoch": 3.1965902602169836, "grad_norm": 9.0, "learning_rate": 2.5797044048578818e-05, "loss": 4.9899, "num_input_tokens_seen": 40949664, "step": 19595 }, { "epoch": 3.1974059874378007, "grad_norm": 8.5625, "learning_rate": 2.5787231501071262e-05, "loss": 3.2834, "num_input_tokens_seen": 40958928, "step": 19600 }, { "epoch": 3.1974059874378007, "eval_loss": 2.5320518016815186, "eval_runtime": 134.8504, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 40958928, "step": 19600 }, { "epoch": 3.1982217146586183, "grad_norm": 5.375, "learning_rate": 2.577741883216272e-05, "loss": 2.2181, "num_input_tokens_seen": 40969952, "step": 19605 }, { "epoch": 3.1990374418794354, "grad_norm": 4.03125, "learning_rate": 2.576760604336642e-05, "loss": 3.4751, "num_input_tokens_seen": 40978208, "step": 19610 }, { "epoch": 3.199853169100253, "grad_norm": 14.5, "learning_rate": 2.575779313619563e-05, "loss": 3.2397, "num_input_tokens_seen": 40990016, "step": 19615 }, { "epoch": 3.20066889632107, "grad_norm": 7.59375, "learning_rate": 2.5747980112163605e-05, "loss": 1.2285, "num_input_tokens_seen": 41000672, "step": 19620 }, { "epoch": 3.2014846235418877, "grad_norm": 7.375, "learning_rate": 2.5738166972783656e-05, "loss": 2.1506, "num_input_tokens_seen": 41011200, "step": 19625 }, { "epoch": 3.202300350762705, "grad_norm": 6.15625, "learning_rate": 2.5728353719569075e-05, "loss": 1.664, "num_input_tokens_seen": 41022320, "step": 19630 }, { "epoch": 3.2031160779835224, "grad_norm": 6.90625, "learning_rate": 2.57185403540332e-05, "loss": 1.8824, "num_input_tokens_seen": 41031760, "step": 19635 }, { "epoch": 3.2039318052043395, "grad_norm": 8.625, "learning_rate": 2.5708726877689375e-05, "loss": 2.1367, "num_input_tokens_seen": 41042448, "step": 19640 }, { "epoch": 3.204747532425157, "grad_norm": 4.0, "learning_rate": 2.5698913292050964e-05, "loss": 2.8153, "num_input_tokens_seen": 41051344, "step": 19645 }, { "epoch": 3.205563259645974, "grad_norm": 2.578125, "learning_rate": 2.568909959863133e-05, "loss": 1.9878, "num_input_tokens_seen": 41062000, "step": 19650 }, { "epoch": 3.206378986866792, "grad_norm": 6.53125, "learning_rate": 2.5679285798943887e-05, "loss": 2.8459, "num_input_tokens_seen": 41070912, "step": 19655 }, { "epoch": 3.207194714087609, "grad_norm": 7.0, "learning_rate": 2.5669471894502035e-05, "loss": 2.5157, "num_input_tokens_seen": 41082080, "step": 19660 }, { "epoch": 3.2080104413084265, "grad_norm": 3.3125, "learning_rate": 2.56596578868192e-05, "loss": 1.2955, "num_input_tokens_seen": 41092096, "step": 19665 }, { "epoch": 3.2088261685292436, "grad_norm": 3.734375, "learning_rate": 2.564984377740883e-05, "loss": 1.6861, "num_input_tokens_seen": 41102816, "step": 19670 }, { "epoch": 3.209641895750061, "grad_norm": 7.125, "learning_rate": 2.564002956778438e-05, "loss": 0.7973, "num_input_tokens_seen": 41112032, "step": 19675 }, { "epoch": 3.2104576229708783, "grad_norm": 7.53125, "learning_rate": 2.563021525945934e-05, "loss": 1.7472, "num_input_tokens_seen": 41123952, "step": 19680 }, { "epoch": 3.211273350191696, "grad_norm": 7.53125, "learning_rate": 2.562040085394718e-05, "loss": 2.7136, "num_input_tokens_seen": 41133648, "step": 19685 }, { "epoch": 3.2120890774125135, "grad_norm": 12.375, "learning_rate": 2.56105863527614e-05, "loss": 3.2519, "num_input_tokens_seen": 41146128, "step": 19690 }, { "epoch": 3.2129048046333306, "grad_norm": 9.125, "learning_rate": 2.5600771757415548e-05, "loss": 2.1781, "num_input_tokens_seen": 41156144, "step": 19695 }, { "epoch": 3.2137205318541477, "grad_norm": 7.25, "learning_rate": 2.5590957069423134e-05, "loss": 2.0801, "num_input_tokens_seen": 41165552, "step": 19700 }, { "epoch": 3.2145362590749653, "grad_norm": 9.5, "learning_rate": 2.5581142290297716e-05, "loss": 1.9684, "num_input_tokens_seen": 41175968, "step": 19705 }, { "epoch": 3.215351986295783, "grad_norm": 6.59375, "learning_rate": 2.557132742155285e-05, "loss": 1.7726, "num_input_tokens_seen": 41187264, "step": 19710 }, { "epoch": 3.2161677135166, "grad_norm": 11.4375, "learning_rate": 2.556151246470212e-05, "loss": 2.1975, "num_input_tokens_seen": 41197616, "step": 19715 }, { "epoch": 3.2169834407374176, "grad_norm": 11.375, "learning_rate": 2.5551697421259114e-05, "loss": 2.1005, "num_input_tokens_seen": 41208896, "step": 19720 }, { "epoch": 3.2177991679582347, "grad_norm": 0.458984375, "learning_rate": 2.554188229273743e-05, "loss": 2.2241, "num_input_tokens_seen": 41219552, "step": 19725 }, { "epoch": 3.2186148951790523, "grad_norm": 10.1875, "learning_rate": 2.5532067080650678e-05, "loss": 2.104, "num_input_tokens_seen": 41230448, "step": 19730 }, { "epoch": 3.2194306223998694, "grad_norm": 7.0625, "learning_rate": 2.55222517865125e-05, "loss": 1.8107, "num_input_tokens_seen": 41241104, "step": 19735 }, { "epoch": 3.220246349620687, "grad_norm": 2.84375, "learning_rate": 2.5512436411836538e-05, "loss": 3.1106, "num_input_tokens_seen": 41253232, "step": 19740 }, { "epoch": 3.221062076841504, "grad_norm": 3.5, "learning_rate": 2.5502620958136443e-05, "loss": 1.2471, "num_input_tokens_seen": 41264640, "step": 19745 }, { "epoch": 3.2218778040623217, "grad_norm": 11.125, "learning_rate": 2.5492805426925874e-05, "loss": 2.4015, "num_input_tokens_seen": 41275200, "step": 19750 }, { "epoch": 3.222693531283139, "grad_norm": 5.4375, "learning_rate": 2.5482989819718523e-05, "loss": 2.1809, "num_input_tokens_seen": 41285568, "step": 19755 }, { "epoch": 3.2235092585039564, "grad_norm": 1.6328125, "learning_rate": 2.5473174138028065e-05, "loss": 2.5299, "num_input_tokens_seen": 41295600, "step": 19760 }, { "epoch": 3.2243249857247736, "grad_norm": 5.03125, "learning_rate": 2.5463358383368212e-05, "loss": 1.6698, "num_input_tokens_seen": 41304880, "step": 19765 }, { "epoch": 3.225140712945591, "grad_norm": 2.5, "learning_rate": 2.545354255725267e-05, "loss": 2.4432, "num_input_tokens_seen": 41313616, "step": 19770 }, { "epoch": 3.2259564401664083, "grad_norm": 10.5625, "learning_rate": 2.5443726661195165e-05, "loss": 3.9232, "num_input_tokens_seen": 41324176, "step": 19775 }, { "epoch": 3.226772167387226, "grad_norm": 6.40625, "learning_rate": 2.543391069670944e-05, "loss": 4.0166, "num_input_tokens_seen": 41335824, "step": 19780 }, { "epoch": 3.227587894608043, "grad_norm": 4.75, "learning_rate": 2.5424094665309228e-05, "loss": 1.6907, "num_input_tokens_seen": 41346272, "step": 19785 }, { "epoch": 3.2284036218288605, "grad_norm": 3.78125, "learning_rate": 2.5414278568508292e-05, "loss": 2.3165, "num_input_tokens_seen": 41356880, "step": 19790 }, { "epoch": 3.2292193490496777, "grad_norm": 7.65625, "learning_rate": 2.540446240782039e-05, "loss": 1.7776, "num_input_tokens_seen": 41366608, "step": 19795 }, { "epoch": 3.2300350762704952, "grad_norm": 9.5, "learning_rate": 2.5394646184759307e-05, "loss": 2.2189, "num_input_tokens_seen": 41378128, "step": 19800 }, { "epoch": 3.2300350762704952, "eval_loss": 2.543567419052124, "eval_runtime": 134.8524, "eval_samples_per_second": 20.207, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 41378128, "step": 19800 }, { "epoch": 3.2308508034913124, "grad_norm": 6.75, "learning_rate": 2.538482990083882e-05, "loss": 1.7004, "num_input_tokens_seen": 41387392, "step": 19805 }, { "epoch": 3.23166653071213, "grad_norm": 7.4375, "learning_rate": 2.5375013557572725e-05, "loss": 2.3691, "num_input_tokens_seen": 41397776, "step": 19810 }, { "epoch": 3.232482257932947, "grad_norm": 5.09375, "learning_rate": 2.536519715647483e-05, "loss": 2.3283, "num_input_tokens_seen": 41408080, "step": 19815 }, { "epoch": 3.2332979851537647, "grad_norm": 6.15625, "learning_rate": 2.535538069905894e-05, "loss": 2.4024, "num_input_tokens_seen": 41418032, "step": 19820 }, { "epoch": 3.234113712374582, "grad_norm": 7.09375, "learning_rate": 2.534556418683888e-05, "loss": 1.686, "num_input_tokens_seen": 41428048, "step": 19825 }, { "epoch": 3.2349294395953994, "grad_norm": 4.6875, "learning_rate": 2.5335747621328486e-05, "loss": 1.852, "num_input_tokens_seen": 41438704, "step": 19830 }, { "epoch": 3.2357451668162165, "grad_norm": 7.09375, "learning_rate": 2.5325931004041586e-05, "loss": 2.7015, "num_input_tokens_seen": 41449008, "step": 19835 }, { "epoch": 3.236560894037034, "grad_norm": 9.3125, "learning_rate": 2.5316114336492032e-05, "loss": 2.7264, "num_input_tokens_seen": 41460176, "step": 19840 }, { "epoch": 3.237376621257851, "grad_norm": 6.5625, "learning_rate": 2.530629762019367e-05, "loss": 2.3789, "num_input_tokens_seen": 41471168, "step": 19845 }, { "epoch": 3.2381923484786688, "grad_norm": 7.4375, "learning_rate": 2.5296480856660364e-05, "loss": 2.2269, "num_input_tokens_seen": 41482192, "step": 19850 }, { "epoch": 3.239008075699486, "grad_norm": 6.40625, "learning_rate": 2.528666404740599e-05, "loss": 3.8866, "num_input_tokens_seen": 41492384, "step": 19855 }, { "epoch": 3.2398238029203035, "grad_norm": 0.197265625, "learning_rate": 2.527684719394442e-05, "loss": 1.754, "num_input_tokens_seen": 41500896, "step": 19860 }, { "epoch": 3.2406395301411206, "grad_norm": 7.21875, "learning_rate": 2.526703029778953e-05, "loss": 1.5633, "num_input_tokens_seen": 41511920, "step": 19865 }, { "epoch": 3.241455257361938, "grad_norm": 0.07470703125, "learning_rate": 2.5257213360455208e-05, "loss": 2.5769, "num_input_tokens_seen": 41522720, "step": 19870 }, { "epoch": 3.2422709845827553, "grad_norm": 13.8125, "learning_rate": 2.5247396383455353e-05, "loss": 4.4329, "num_input_tokens_seen": 41533840, "step": 19875 }, { "epoch": 3.243086711803573, "grad_norm": 6.34375, "learning_rate": 2.523757936830387e-05, "loss": 2.8715, "num_input_tokens_seen": 41544368, "step": 19880 }, { "epoch": 3.2439024390243905, "grad_norm": 13.5, "learning_rate": 2.5227762316514662e-05, "loss": 4.3807, "num_input_tokens_seen": 41555952, "step": 19885 }, { "epoch": 3.2447181662452076, "grad_norm": 9.6875, "learning_rate": 2.5217945229601648e-05, "loss": 2.6528, "num_input_tokens_seen": 41566096, "step": 19890 }, { "epoch": 3.245533893466025, "grad_norm": 7.3125, "learning_rate": 2.5208128109078738e-05, "loss": 2.2239, "num_input_tokens_seen": 41576928, "step": 19895 }, { "epoch": 3.2463496206868423, "grad_norm": 7.09375, "learning_rate": 2.5198310956459853e-05, "loss": 4.3729, "num_input_tokens_seen": 41588112, "step": 19900 }, { "epoch": 3.24716534790766, "grad_norm": 6.53125, "learning_rate": 2.518849377325893e-05, "loss": 2.3434, "num_input_tokens_seen": 41597744, "step": 19905 }, { "epoch": 3.247981075128477, "grad_norm": 10.625, "learning_rate": 2.51786765609899e-05, "loss": 1.9911, "num_input_tokens_seen": 41609232, "step": 19910 }, { "epoch": 3.2487968023492946, "grad_norm": 8.5, "learning_rate": 2.5168859321166694e-05, "loss": 3.9234, "num_input_tokens_seen": 41619408, "step": 19915 }, { "epoch": 3.2496125295701117, "grad_norm": 5.40625, "learning_rate": 2.515904205530326e-05, "loss": 3.744, "num_input_tokens_seen": 41630112, "step": 19920 }, { "epoch": 3.2504282567909293, "grad_norm": 3.484375, "learning_rate": 2.514922476491355e-05, "loss": 2.2678, "num_input_tokens_seen": 41639920, "step": 19925 }, { "epoch": 3.2512439840117464, "grad_norm": 6.25, "learning_rate": 2.51394074515115e-05, "loss": 2.1572, "num_input_tokens_seen": 41650608, "step": 19930 }, { "epoch": 3.252059711232564, "grad_norm": 5.3125, "learning_rate": 2.5129590116611067e-05, "loss": 3.117, "num_input_tokens_seen": 41661808, "step": 19935 }, { "epoch": 3.252875438453381, "grad_norm": 6.09375, "learning_rate": 2.5119772761726212e-05, "loss": 1.6687, "num_input_tokens_seen": 41672320, "step": 19940 }, { "epoch": 3.2536911656741987, "grad_norm": 6.65625, "learning_rate": 2.5109955388370893e-05, "loss": 3.0645, "num_input_tokens_seen": 41683392, "step": 19945 }, { "epoch": 3.254506892895016, "grad_norm": 1.9453125, "learning_rate": 2.510013799805907e-05, "loss": 3.329, "num_input_tokens_seen": 41692528, "step": 19950 }, { "epoch": 3.2553226201158334, "grad_norm": 8.1875, "learning_rate": 2.5090320592304706e-05, "loss": 1.9377, "num_input_tokens_seen": 41701984, "step": 19955 }, { "epoch": 3.2561383473366505, "grad_norm": 5.3125, "learning_rate": 2.5080503172621777e-05, "loss": 1.776, "num_input_tokens_seen": 41711760, "step": 19960 }, { "epoch": 3.256954074557468, "grad_norm": 8.5, "learning_rate": 2.5070685740524246e-05, "loss": 3.2439, "num_input_tokens_seen": 41723376, "step": 19965 }, { "epoch": 3.2577698017782852, "grad_norm": 15.0, "learning_rate": 2.5060868297526084e-05, "loss": 1.657, "num_input_tokens_seen": 41733392, "step": 19970 }, { "epoch": 3.258585528999103, "grad_norm": 10.4375, "learning_rate": 2.5051050845141267e-05, "loss": 3.1965, "num_input_tokens_seen": 41744704, "step": 19975 }, { "epoch": 3.25940125621992, "grad_norm": 8.5625, "learning_rate": 2.5041233384883765e-05, "loss": 2.1927, "num_input_tokens_seen": 41754176, "step": 19980 }, { "epoch": 3.2602169834407375, "grad_norm": 11.0625, "learning_rate": 2.5031415918267564e-05, "loss": 1.7513, "num_input_tokens_seen": 41763776, "step": 19985 }, { "epoch": 3.2610327106615546, "grad_norm": 7.90625, "learning_rate": 2.5021598446806626e-05, "loss": 2.6045, "num_input_tokens_seen": 41775120, "step": 19990 }, { "epoch": 3.261848437882372, "grad_norm": 15.1875, "learning_rate": 2.5011780972014937e-05, "loss": 1.1522, "num_input_tokens_seen": 41783792, "step": 19995 }, { "epoch": 3.2626641651031894, "grad_norm": 4.5, "learning_rate": 2.5001963495406478e-05, "loss": 3.4322, "num_input_tokens_seen": 41794480, "step": 20000 }, { "epoch": 3.2626641651031894, "eval_loss": 2.537410259246826, "eval_runtime": 135.0333, "eval_samples_per_second": 20.18, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 41794480, "step": 20000 }, { "epoch": 3.263479892324007, "grad_norm": 6.84375, "learning_rate": 2.499214601849522e-05, "loss": 2.1072, "num_input_tokens_seen": 41804528, "step": 20005 }, { "epoch": 3.264295619544824, "grad_norm": 6.34375, "learning_rate": 2.4982328542795148e-05, "loss": 3.2755, "num_input_tokens_seen": 41814624, "step": 20010 }, { "epoch": 3.2651113467656416, "grad_norm": 19.25, "learning_rate": 2.497251106982024e-05, "loss": 2.9295, "num_input_tokens_seen": 41825856, "step": 20015 }, { "epoch": 3.2659270739864588, "grad_norm": 9.3125, "learning_rate": 2.4962693601084458e-05, "loss": 2.649, "num_input_tokens_seen": 41834864, "step": 20020 }, { "epoch": 3.2667428012072763, "grad_norm": 5.8125, "learning_rate": 2.4952876138101794e-05, "loss": 3.0694, "num_input_tokens_seen": 41845344, "step": 20025 }, { "epoch": 3.2675585284280935, "grad_norm": 0.267578125, "learning_rate": 2.4943058682386233e-05, "loss": 1.7387, "num_input_tokens_seen": 41855808, "step": 20030 }, { "epoch": 3.268374255648911, "grad_norm": 3.5, "learning_rate": 2.493324123545173e-05, "loss": 1.1589, "num_input_tokens_seen": 41866176, "step": 20035 }, { "epoch": 3.269189982869728, "grad_norm": 11.125, "learning_rate": 2.4923423798812272e-05, "loss": 2.8565, "num_input_tokens_seen": 41878032, "step": 20040 }, { "epoch": 3.2700057100905457, "grad_norm": 6.84375, "learning_rate": 2.4913606373981825e-05, "loss": 2.7191, "num_input_tokens_seen": 41889280, "step": 20045 }, { "epoch": 3.270821437311363, "grad_norm": 4.21875, "learning_rate": 2.4903788962474357e-05, "loss": 2.2296, "num_input_tokens_seen": 41898304, "step": 20050 }, { "epoch": 3.2716371645321805, "grad_norm": 8.0, "learning_rate": 2.489397156580385e-05, "loss": 2.8063, "num_input_tokens_seen": 41907136, "step": 20055 }, { "epoch": 3.272452891752998, "grad_norm": 3.078125, "learning_rate": 2.4884154185484246e-05, "loss": 2.6455, "num_input_tokens_seen": 41917536, "step": 20060 }, { "epoch": 3.273268618973815, "grad_norm": 4.4375, "learning_rate": 2.4874336823029526e-05, "loss": 2.4361, "num_input_tokens_seen": 41927856, "step": 20065 }, { "epoch": 3.2740843461946323, "grad_norm": 6.90625, "learning_rate": 2.4864519479953656e-05, "loss": 2.8108, "num_input_tokens_seen": 41937584, "step": 20070 }, { "epoch": 3.27490007341545, "grad_norm": 5.8125, "learning_rate": 2.485470215777058e-05, "loss": 2.1643, "num_input_tokens_seen": 41946448, "step": 20075 }, { "epoch": 3.2757158006362674, "grad_norm": 3.90625, "learning_rate": 2.4844884857994258e-05, "loss": 3.3462, "num_input_tokens_seen": 41957904, "step": 20080 }, { "epoch": 3.2765315278570846, "grad_norm": 4.5, "learning_rate": 2.4835067582138638e-05, "loss": 1.7505, "num_input_tokens_seen": 41968816, "step": 20085 }, { "epoch": 3.2773472550779017, "grad_norm": 7.53125, "learning_rate": 2.4825250331717666e-05, "loss": 2.5283, "num_input_tokens_seen": 41979344, "step": 20090 }, { "epoch": 3.2781629822987193, "grad_norm": 10.1875, "learning_rate": 2.4815433108245298e-05, "loss": 2.3034, "num_input_tokens_seen": 41990896, "step": 20095 }, { "epoch": 3.278978709519537, "grad_norm": 6.03125, "learning_rate": 2.4805615913235456e-05, "loss": 2.6634, "num_input_tokens_seen": 42000880, "step": 20100 }, { "epoch": 3.279794436740354, "grad_norm": 1.890625, "learning_rate": 2.479579874820208e-05, "loss": 1.5421, "num_input_tokens_seen": 42012592, "step": 20105 }, { "epoch": 3.2806101639611716, "grad_norm": 9.9375, "learning_rate": 2.4785981614659115e-05, "loss": 1.7741, "num_input_tokens_seen": 42022608, "step": 20110 }, { "epoch": 3.2814258911819887, "grad_norm": 9.1875, "learning_rate": 2.477616451412047e-05, "loss": 3.941, "num_input_tokens_seen": 42034208, "step": 20115 }, { "epoch": 3.2822416184028063, "grad_norm": 7.0625, "learning_rate": 2.476634744810007e-05, "loss": 4.598, "num_input_tokens_seen": 42043744, "step": 20120 }, { "epoch": 3.2830573456236234, "grad_norm": 5.25, "learning_rate": 2.475653041811183e-05, "loss": 2.1986, "num_input_tokens_seen": 42054272, "step": 20125 }, { "epoch": 3.283873072844441, "grad_norm": 10.6875, "learning_rate": 2.4746713425669652e-05, "loss": 3.1769, "num_input_tokens_seen": 42065472, "step": 20130 }, { "epoch": 3.284688800065258, "grad_norm": 3.75, "learning_rate": 2.4736896472287458e-05, "loss": 3.018, "num_input_tokens_seen": 42075008, "step": 20135 }, { "epoch": 3.2855045272860757, "grad_norm": 11.5625, "learning_rate": 2.4727079559479124e-05, "loss": 3.1353, "num_input_tokens_seen": 42084736, "step": 20140 }, { "epoch": 3.286320254506893, "grad_norm": 8.9375, "learning_rate": 2.4717262688758557e-05, "loss": 3.8477, "num_input_tokens_seen": 42095104, "step": 20145 }, { "epoch": 3.2871359817277104, "grad_norm": 5.09375, "learning_rate": 2.4707445861639637e-05, "loss": 3.0957, "num_input_tokens_seen": 42104928, "step": 20150 }, { "epoch": 3.2879517089485275, "grad_norm": 1.7109375, "learning_rate": 2.4697629079636244e-05, "loss": 2.6156, "num_input_tokens_seen": 42114704, "step": 20155 }, { "epoch": 3.288767436169345, "grad_norm": 9.125, "learning_rate": 2.4687812344262244e-05, "loss": 4.717, "num_input_tokens_seen": 42124720, "step": 20160 }, { "epoch": 3.289583163390162, "grad_norm": 3.296875, "learning_rate": 2.46779956570315e-05, "loss": 2.7845, "num_input_tokens_seen": 42134256, "step": 20165 }, { "epoch": 3.29039889061098, "grad_norm": 11.0625, "learning_rate": 2.466817901945787e-05, "loss": 2.5101, "num_input_tokens_seen": 42144736, "step": 20170 }, { "epoch": 3.291214617831797, "grad_norm": 5.125, "learning_rate": 2.4658362433055217e-05, "loss": 2.7502, "num_input_tokens_seen": 42155008, "step": 20175 }, { "epoch": 3.2920303450526145, "grad_norm": 10.6875, "learning_rate": 2.4648545899337356e-05, "loss": 3.3808, "num_input_tokens_seen": 42164896, "step": 20180 }, { "epoch": 3.2928460722734316, "grad_norm": 5.125, "learning_rate": 2.4638729419818143e-05, "loss": 2.2589, "num_input_tokens_seen": 42175216, "step": 20185 }, { "epoch": 3.293661799494249, "grad_norm": 3.5, "learning_rate": 2.46289129960114e-05, "loss": 1.9207, "num_input_tokens_seen": 42186368, "step": 20190 }, { "epoch": 3.2944775267150663, "grad_norm": 4.4375, "learning_rate": 2.4619096629430924e-05, "loss": 1.8192, "num_input_tokens_seen": 42198400, "step": 20195 }, { "epoch": 3.295293253935884, "grad_norm": 3.828125, "learning_rate": 2.4609280321590543e-05, "loss": 3.3739, "num_input_tokens_seen": 42208144, "step": 20200 }, { "epoch": 3.295293253935884, "eval_loss": 2.5433285236358643, "eval_runtime": 134.8846, "eval_samples_per_second": 20.202, "eval_steps_per_second": 10.105, "num_input_tokens_seen": 42208144, "step": 20200 }, { "epoch": 3.296108981156701, "grad_norm": 1.6953125, "learning_rate": 2.4599464074004037e-05, "loss": 1.9925, "num_input_tokens_seen": 42219360, "step": 20205 }, { "epoch": 3.2969247083775186, "grad_norm": 3.859375, "learning_rate": 2.4589647888185204e-05, "loss": 1.2414, "num_input_tokens_seen": 42229456, "step": 20210 }, { "epoch": 3.2977404355983357, "grad_norm": 10.75, "learning_rate": 2.4579831765647836e-05, "loss": 2.4535, "num_input_tokens_seen": 42240688, "step": 20215 }, { "epoch": 3.2985561628191533, "grad_norm": 7.25, "learning_rate": 2.4570015707905676e-05, "loss": 1.4684, "num_input_tokens_seen": 42249680, "step": 20220 }, { "epoch": 3.2993718900399704, "grad_norm": 9.8125, "learning_rate": 2.4560199716472508e-05, "loss": 2.7598, "num_input_tokens_seen": 42259936, "step": 20225 }, { "epoch": 3.300187617260788, "grad_norm": 8.75, "learning_rate": 2.455038379286207e-05, "loss": 1.9112, "num_input_tokens_seen": 42271776, "step": 20230 }, { "epoch": 3.3010033444816056, "grad_norm": 11.5, "learning_rate": 2.4540567938588095e-05, "loss": 1.1607, "num_input_tokens_seen": 42283536, "step": 20235 }, { "epoch": 3.3018190717024227, "grad_norm": 6.0, "learning_rate": 2.4530752155164328e-05, "loss": 2.773, "num_input_tokens_seen": 42294464, "step": 20240 }, { "epoch": 3.30263479892324, "grad_norm": 9.375, "learning_rate": 2.4520936444104463e-05, "loss": 2.0663, "num_input_tokens_seen": 42304464, "step": 20245 }, { "epoch": 3.3034505261440574, "grad_norm": 18.375, "learning_rate": 2.4511120806922218e-05, "loss": 3.7125, "num_input_tokens_seen": 42316048, "step": 20250 }, { "epoch": 3.304266253364875, "grad_norm": 9.0625, "learning_rate": 2.45013052451313e-05, "loss": 3.4228, "num_input_tokens_seen": 42327616, "step": 20255 }, { "epoch": 3.305081980585692, "grad_norm": 6.5, "learning_rate": 2.4491489760245376e-05, "loss": 1.983, "num_input_tokens_seen": 42336704, "step": 20260 }, { "epoch": 3.3058977078065093, "grad_norm": 6.96875, "learning_rate": 2.4481674353778115e-05, "loss": 2.5131, "num_input_tokens_seen": 42346672, "step": 20265 }, { "epoch": 3.306713435027327, "grad_norm": 8.375, "learning_rate": 2.447185902724319e-05, "loss": 3.5484, "num_input_tokens_seen": 42356848, "step": 20270 }, { "epoch": 3.3075291622481444, "grad_norm": 0.87109375, "learning_rate": 2.4462043782154233e-05, "loss": 1.6148, "num_input_tokens_seen": 42367392, "step": 20275 }, { "epoch": 3.3083448894689615, "grad_norm": 4.78125, "learning_rate": 2.4452228620024895e-05, "loss": 2.4417, "num_input_tokens_seen": 42377856, "step": 20280 }, { "epoch": 3.309160616689779, "grad_norm": 5.8125, "learning_rate": 2.4442413542368776e-05, "loss": 2.4401, "num_input_tokens_seen": 42387808, "step": 20285 }, { "epoch": 3.3099763439105963, "grad_norm": 5.09375, "learning_rate": 2.4432598550699502e-05, "loss": 1.7985, "num_input_tokens_seen": 42399008, "step": 20290 }, { "epoch": 3.310792071131414, "grad_norm": 8.1875, "learning_rate": 2.4422783646530663e-05, "loss": 2.7012, "num_input_tokens_seen": 42409408, "step": 20295 }, { "epoch": 3.311607798352231, "grad_norm": 12.1875, "learning_rate": 2.441296883137584e-05, "loss": 4.0959, "num_input_tokens_seen": 42420736, "step": 20300 }, { "epoch": 3.3124235255730485, "grad_norm": 2.03125, "learning_rate": 2.4403154106748592e-05, "loss": 1.7947, "num_input_tokens_seen": 42430352, "step": 20305 }, { "epoch": 3.3132392527938657, "grad_norm": 6.15625, "learning_rate": 2.4393339474162494e-05, "loss": 1.0776, "num_input_tokens_seen": 42440464, "step": 20310 }, { "epoch": 3.3140549800146832, "grad_norm": 9.0, "learning_rate": 2.4383524935131062e-05, "loss": 2.5704, "num_input_tokens_seen": 42450896, "step": 20315 }, { "epoch": 3.3148707072355004, "grad_norm": 2.25, "learning_rate": 2.437371049116784e-05, "loss": 1.1739, "num_input_tokens_seen": 42461856, "step": 20320 }, { "epoch": 3.315686434456318, "grad_norm": 6.03125, "learning_rate": 2.436389614378632e-05, "loss": 1.6025, "num_input_tokens_seen": 42472768, "step": 20325 }, { "epoch": 3.316502161677135, "grad_norm": 3.359375, "learning_rate": 2.435408189450002e-05, "loss": 2.0135, "num_input_tokens_seen": 42482928, "step": 20330 }, { "epoch": 3.3173178888979526, "grad_norm": 1.8359375, "learning_rate": 2.4344267744822406e-05, "loss": 1.7818, "num_input_tokens_seen": 42493696, "step": 20335 }, { "epoch": 3.31813361611877, "grad_norm": 8.0, "learning_rate": 2.4334453696266944e-05, "loss": 2.3737, "num_input_tokens_seen": 42503920, "step": 20340 }, { "epoch": 3.3189493433395874, "grad_norm": 4.46875, "learning_rate": 2.432463975034708e-05, "loss": 1.9545, "num_input_tokens_seen": 42514560, "step": 20345 }, { "epoch": 3.3197650705604045, "grad_norm": 9.3125, "learning_rate": 2.4314825908576265e-05, "loss": 2.9223, "num_input_tokens_seen": 42524272, "step": 20350 }, { "epoch": 3.320580797781222, "grad_norm": 6.53125, "learning_rate": 2.4305012172467897e-05, "loss": 2.3135, "num_input_tokens_seen": 42534704, "step": 20355 }, { "epoch": 3.321396525002039, "grad_norm": 6.125, "learning_rate": 2.4295198543535393e-05, "loss": 1.9922, "num_input_tokens_seen": 42545616, "step": 20360 }, { "epoch": 3.3222122522228568, "grad_norm": 6.09375, "learning_rate": 2.4285385023292124e-05, "loss": 2.089, "num_input_tokens_seen": 42555136, "step": 20365 }, { "epoch": 3.323027979443674, "grad_norm": 4.125, "learning_rate": 2.427557161325147e-05, "loss": 2.8918, "num_input_tokens_seen": 42565456, "step": 20370 }, { "epoch": 3.3238437066644915, "grad_norm": 4.0625, "learning_rate": 2.4265758314926778e-05, "loss": 1.4327, "num_input_tokens_seen": 42577072, "step": 20375 }, { "epoch": 3.3246594338853086, "grad_norm": 6.40625, "learning_rate": 2.4255945129831373e-05, "loss": 1.5703, "num_input_tokens_seen": 42587424, "step": 20380 }, { "epoch": 3.325475161106126, "grad_norm": 6.9375, "learning_rate": 2.4246132059478578e-05, "loss": 2.4988, "num_input_tokens_seen": 42597888, "step": 20385 }, { "epoch": 3.3262908883269433, "grad_norm": 6.03125, "learning_rate": 2.4236319105381706e-05, "loss": 1.9234, "num_input_tokens_seen": 42606192, "step": 20390 }, { "epoch": 3.327106615547761, "grad_norm": 1.796875, "learning_rate": 2.422650626905401e-05, "loss": 2.1471, "num_input_tokens_seen": 42616096, "step": 20395 }, { "epoch": 3.327922342768578, "grad_norm": 5.59375, "learning_rate": 2.4216693552008785e-05, "loss": 2.6455, "num_input_tokens_seen": 42625520, "step": 20400 }, { "epoch": 3.327922342768578, "eval_loss": 2.5354511737823486, "eval_runtime": 135.0546, "eval_samples_per_second": 20.177, "eval_steps_per_second": 10.092, "num_input_tokens_seen": 42625520, "step": 20400 }, { "epoch": 3.3287380699893956, "grad_norm": 3.984375, "learning_rate": 2.4206880955759247e-05, "loss": 2.0619, "num_input_tokens_seen": 42636448, "step": 20405 }, { "epoch": 3.3295537972102127, "grad_norm": 13.25, "learning_rate": 2.419706848181863e-05, "loss": 2.969, "num_input_tokens_seen": 42647328, "step": 20410 }, { "epoch": 3.3303695244310303, "grad_norm": 9.3125, "learning_rate": 2.4187256131700153e-05, "loss": 3.5411, "num_input_tokens_seen": 42657552, "step": 20415 }, { "epoch": 3.3311852516518474, "grad_norm": 5.96875, "learning_rate": 2.4177443906916985e-05, "loss": 2.1037, "num_input_tokens_seen": 42668800, "step": 20420 }, { "epoch": 3.332000978872665, "grad_norm": 10.5625, "learning_rate": 2.4167631808982303e-05, "loss": 2.661, "num_input_tokens_seen": 42679600, "step": 20425 }, { "epoch": 3.3328167060934826, "grad_norm": 11.625, "learning_rate": 2.4157819839409264e-05, "loss": 2.026, "num_input_tokens_seen": 42690832, "step": 20430 }, { "epoch": 3.3336324333142997, "grad_norm": 5.75, "learning_rate": 2.414800799971098e-05, "loss": 2.5018, "num_input_tokens_seen": 42700912, "step": 20435 }, { "epoch": 3.334448160535117, "grad_norm": 10.875, "learning_rate": 2.4138196291400582e-05, "loss": 1.784, "num_input_tokens_seen": 42712016, "step": 20440 }, { "epoch": 3.3352638877559344, "grad_norm": 13.1875, "learning_rate": 2.412838471599114e-05, "loss": 3.6619, "num_input_tokens_seen": 42722336, "step": 20445 }, { "epoch": 3.336079614976752, "grad_norm": 12.3125, "learning_rate": 2.411857327499572e-05, "loss": 2.7701, "num_input_tokens_seen": 42731808, "step": 20450 }, { "epoch": 3.336895342197569, "grad_norm": 6.4375, "learning_rate": 2.410876196992739e-05, "loss": 2.526, "num_input_tokens_seen": 42742512, "step": 20455 }, { "epoch": 3.3377110694183862, "grad_norm": 2.953125, "learning_rate": 2.4098950802299156e-05, "loss": 1.7085, "num_input_tokens_seen": 42753680, "step": 20460 }, { "epoch": 3.338526796639204, "grad_norm": 3.09375, "learning_rate": 2.4089139773624027e-05, "loss": 1.8869, "num_input_tokens_seen": 42765376, "step": 20465 }, { "epoch": 3.3393425238600214, "grad_norm": 7.0625, "learning_rate": 2.4079328885415007e-05, "loss": 2.6333, "num_input_tokens_seen": 42775072, "step": 20470 }, { "epoch": 3.3401582510808385, "grad_norm": 2.359375, "learning_rate": 2.4069518139185036e-05, "loss": 3.2955, "num_input_tokens_seen": 42785872, "step": 20475 }, { "epoch": 3.340973978301656, "grad_norm": 4.96875, "learning_rate": 2.405970753644706e-05, "loss": 3.4296, "num_input_tokens_seen": 42796800, "step": 20480 }, { "epoch": 3.3417897055224732, "grad_norm": 6.125, "learning_rate": 2.4049897078714e-05, "loss": 2.823, "num_input_tokens_seen": 42807312, "step": 20485 }, { "epoch": 3.342605432743291, "grad_norm": 1.859375, "learning_rate": 2.404008676749874e-05, "loss": 3.3115, "num_input_tokens_seen": 42818704, "step": 20490 }, { "epoch": 3.343421159964108, "grad_norm": 7.0625, "learning_rate": 2.403027660431418e-05, "loss": 1.3676, "num_input_tokens_seen": 42829088, "step": 20495 }, { "epoch": 3.3442368871849255, "grad_norm": 4.96875, "learning_rate": 2.402046659067314e-05, "loss": 2.2335, "num_input_tokens_seen": 42840000, "step": 20500 }, { "epoch": 3.3450526144057426, "grad_norm": 2.25, "learning_rate": 2.401065672808847e-05, "loss": 1.8677, "num_input_tokens_seen": 42850240, "step": 20505 }, { "epoch": 3.34586834162656, "grad_norm": 8.3125, "learning_rate": 2.400084701807296e-05, "loss": 1.9798, "num_input_tokens_seen": 42862704, "step": 20510 }, { "epoch": 3.3466840688473773, "grad_norm": 12.625, "learning_rate": 2.39910374621394e-05, "loss": 3.1174, "num_input_tokens_seen": 42873392, "step": 20515 }, { "epoch": 3.347499796068195, "grad_norm": 7.25, "learning_rate": 2.3981228061800544e-05, "loss": 2.9435, "num_input_tokens_seen": 42883440, "step": 20520 }, { "epoch": 3.348315523289012, "grad_norm": 7.09375, "learning_rate": 2.3971418818569115e-05, "loss": 2.9212, "num_input_tokens_seen": 42893728, "step": 20525 }, { "epoch": 3.3491312505098296, "grad_norm": 7.40625, "learning_rate": 2.3961609733957832e-05, "loss": 3.6041, "num_input_tokens_seen": 42903968, "step": 20530 }, { "epoch": 3.3499469777306468, "grad_norm": 4.96875, "learning_rate": 2.395180080947939e-05, "loss": 2.7136, "num_input_tokens_seen": 42914464, "step": 20535 }, { "epoch": 3.3507627049514643, "grad_norm": 11.25, "learning_rate": 2.394199204664642e-05, "loss": 3.2381, "num_input_tokens_seen": 42925872, "step": 20540 }, { "epoch": 3.3515784321722815, "grad_norm": 13.5625, "learning_rate": 2.3932183446971583e-05, "loss": 2.6707, "num_input_tokens_seen": 42936208, "step": 20545 }, { "epoch": 3.352394159393099, "grad_norm": 3.328125, "learning_rate": 2.3922375011967473e-05, "loss": 3.7772, "num_input_tokens_seen": 42947456, "step": 20550 }, { "epoch": 3.353209886613916, "grad_norm": 4.3125, "learning_rate": 2.3912566743146676e-05, "loss": 2.3992, "num_input_tokens_seen": 42959088, "step": 20555 }, { "epoch": 3.3540256138347337, "grad_norm": 8.125, "learning_rate": 2.390275864202176e-05, "loss": 1.8851, "num_input_tokens_seen": 42969424, "step": 20560 }, { "epoch": 3.354841341055551, "grad_norm": 5.46875, "learning_rate": 2.3892950710105243e-05, "loss": 2.056, "num_input_tokens_seen": 42980192, "step": 20565 }, { "epoch": 3.3556570682763684, "grad_norm": 8.5625, "learning_rate": 2.3883142948909635e-05, "loss": 1.5327, "num_input_tokens_seen": 42991168, "step": 20570 }, { "epoch": 3.3564727954971856, "grad_norm": 7.375, "learning_rate": 2.3873335359947433e-05, "loss": 2.3814, "num_input_tokens_seen": 43000928, "step": 20575 }, { "epoch": 3.357288522718003, "grad_norm": 8.6875, "learning_rate": 2.3863527944731066e-05, "loss": 1.9113, "num_input_tokens_seen": 43011936, "step": 20580 }, { "epoch": 3.3581042499388203, "grad_norm": 6.65625, "learning_rate": 2.385372070477298e-05, "loss": 2.3879, "num_input_tokens_seen": 43023840, "step": 20585 }, { "epoch": 3.358919977159638, "grad_norm": 15.9375, "learning_rate": 2.384391364158556e-05, "loss": 2.972, "num_input_tokens_seen": 43034288, "step": 20590 }, { "epoch": 3.359735704380455, "grad_norm": 6.0, "learning_rate": 2.3834106756681185e-05, "loss": 2.527, "num_input_tokens_seen": 43044768, "step": 20595 }, { "epoch": 3.3605514316012726, "grad_norm": 11.875, "learning_rate": 2.3824300051572206e-05, "loss": 2.4638, "num_input_tokens_seen": 43054848, "step": 20600 }, { "epoch": 3.3605514316012726, "eval_loss": 2.5471649169921875, "eval_runtime": 135.0498, "eval_samples_per_second": 20.178, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 43054848, "step": 20600 }, { "epoch": 3.36136715882209, "grad_norm": 7.25, "learning_rate": 2.3814493527770923e-05, "loss": 2.5281, "num_input_tokens_seen": 43064880, "step": 20605 }, { "epoch": 3.3621828860429073, "grad_norm": 7.8125, "learning_rate": 2.3804687186789637e-05, "loss": 1.9726, "num_input_tokens_seen": 43076000, "step": 20610 }, { "epoch": 3.3629986132637244, "grad_norm": 9.9375, "learning_rate": 2.379488103014062e-05, "loss": 2.747, "num_input_tokens_seen": 43085296, "step": 20615 }, { "epoch": 3.363814340484542, "grad_norm": 8.1875, "learning_rate": 2.3785075059336086e-05, "loss": 2.0082, "num_input_tokens_seen": 43096432, "step": 20620 }, { "epoch": 3.3646300677053596, "grad_norm": 12.625, "learning_rate": 2.3775269275888248e-05, "loss": 3.2529, "num_input_tokens_seen": 43108512, "step": 20625 }, { "epoch": 3.3654457949261767, "grad_norm": 9.5, "learning_rate": 2.3765463681309274e-05, "loss": 2.348, "num_input_tokens_seen": 43118416, "step": 20630 }, { "epoch": 3.366261522146994, "grad_norm": 10.9375, "learning_rate": 2.3755658277111313e-05, "loss": 2.4894, "num_input_tokens_seen": 43129488, "step": 20635 }, { "epoch": 3.3670772493678114, "grad_norm": 11.3125, "learning_rate": 2.374585306480649e-05, "loss": 4.1974, "num_input_tokens_seen": 43140688, "step": 20640 }, { "epoch": 3.367892976588629, "grad_norm": 3.90625, "learning_rate": 2.3736048045906877e-05, "loss": 1.6057, "num_input_tokens_seen": 43149344, "step": 20645 }, { "epoch": 3.368708703809446, "grad_norm": 8.125, "learning_rate": 2.372624322192454e-05, "loss": 2.8391, "num_input_tokens_seen": 43160080, "step": 20650 }, { "epoch": 3.3695244310302637, "grad_norm": 1.6953125, "learning_rate": 2.3716438594371516e-05, "loss": 2.0155, "num_input_tokens_seen": 43171104, "step": 20655 }, { "epoch": 3.370340158251081, "grad_norm": 6.8125, "learning_rate": 2.3706634164759784e-05, "loss": 2.3163, "num_input_tokens_seen": 43181728, "step": 20660 }, { "epoch": 3.3711558854718984, "grad_norm": 5.8125, "learning_rate": 2.3696829934601323e-05, "loss": 2.3781, "num_input_tokens_seen": 43191984, "step": 20665 }, { "epoch": 3.3719716126927155, "grad_norm": 5.0, "learning_rate": 2.3687025905408053e-05, "loss": 2.1347, "num_input_tokens_seen": 43202240, "step": 20670 }, { "epoch": 3.372787339913533, "grad_norm": 8.125, "learning_rate": 2.3677222078691886e-05, "loss": 2.2269, "num_input_tokens_seen": 43213408, "step": 20675 }, { "epoch": 3.37360306713435, "grad_norm": 8.75, "learning_rate": 2.366741845596471e-05, "loss": 2.1443, "num_input_tokens_seen": 43223568, "step": 20680 }, { "epoch": 3.374418794355168, "grad_norm": 9.5625, "learning_rate": 2.3657615038738343e-05, "loss": 3.006, "num_input_tokens_seen": 43235024, "step": 20685 }, { "epoch": 3.375234521575985, "grad_norm": 4.8125, "learning_rate": 2.3647811828524614e-05, "loss": 2.3502, "num_input_tokens_seen": 43246640, "step": 20690 }, { "epoch": 3.3760502487968025, "grad_norm": 9.0, "learning_rate": 2.363800882683529e-05, "loss": 1.9453, "num_input_tokens_seen": 43258448, "step": 20695 }, { "epoch": 3.3768659760176196, "grad_norm": 10.8125, "learning_rate": 2.3628206035182125e-05, "loss": 1.7205, "num_input_tokens_seen": 43268800, "step": 20700 }, { "epoch": 3.377681703238437, "grad_norm": 9.4375, "learning_rate": 2.361840345507683e-05, "loss": 2.2791, "num_input_tokens_seen": 43279584, "step": 20705 }, { "epoch": 3.3784974304592543, "grad_norm": 10.25, "learning_rate": 2.3608601088031073e-05, "loss": 3.7969, "num_input_tokens_seen": 43290352, "step": 20710 }, { "epoch": 3.379313157680072, "grad_norm": 9.375, "learning_rate": 2.3598798935556516e-05, "loss": 2.0891, "num_input_tokens_seen": 43300704, "step": 20715 }, { "epoch": 3.380128884900889, "grad_norm": 3.078125, "learning_rate": 2.3588996999164784e-05, "loss": 1.9274, "num_input_tokens_seen": 43309936, "step": 20720 }, { "epoch": 3.3809446121217066, "grad_norm": 8.9375, "learning_rate": 2.3579195280367434e-05, "loss": 3.228, "num_input_tokens_seen": 43320720, "step": 20725 }, { "epoch": 3.3817603393425237, "grad_norm": 14.9375, "learning_rate": 2.356939378067603e-05, "loss": 2.214, "num_input_tokens_seen": 43332080, "step": 20730 }, { "epoch": 3.3825760665633413, "grad_norm": 1.046875, "learning_rate": 2.3559592501602092e-05, "loss": 2.0855, "num_input_tokens_seen": 43342944, "step": 20735 }, { "epoch": 3.3833917937841584, "grad_norm": 13.8125, "learning_rate": 2.3549791444657076e-05, "loss": 3.7724, "num_input_tokens_seen": 43352864, "step": 20740 }, { "epoch": 3.384207521004976, "grad_norm": 8.4375, "learning_rate": 2.353999061135246e-05, "loss": 2.5566, "num_input_tokens_seen": 43362480, "step": 20745 }, { "epoch": 3.385023248225793, "grad_norm": 6.15625, "learning_rate": 2.3530190003199626e-05, "loss": 2.5185, "num_input_tokens_seen": 43371840, "step": 20750 }, { "epoch": 3.3858389754466107, "grad_norm": 9.0625, "learning_rate": 2.3520389621709965e-05, "loss": 3.7439, "num_input_tokens_seen": 43383200, "step": 20755 }, { "epoch": 3.386654702667428, "grad_norm": 3.515625, "learning_rate": 2.351058946839483e-05, "loss": 2.2205, "num_input_tokens_seen": 43393792, "step": 20760 }, { "epoch": 3.3874704298882454, "grad_norm": 6.09375, "learning_rate": 2.350078954476551e-05, "loss": 1.9372, "num_input_tokens_seen": 43404704, "step": 20765 }, { "epoch": 3.3882861571090626, "grad_norm": 8.5, "learning_rate": 2.3490989852333272e-05, "loss": 2.715, "num_input_tokens_seen": 43413344, "step": 20770 }, { "epoch": 3.38910188432988, "grad_norm": 4.71875, "learning_rate": 2.3481190392609377e-05, "loss": 2.1407, "num_input_tokens_seen": 43423200, "step": 20775 }, { "epoch": 3.3899176115506973, "grad_norm": 6.0625, "learning_rate": 2.3471391167105e-05, "loss": 2.9771, "num_input_tokens_seen": 43432320, "step": 20780 }, { "epoch": 3.390733338771515, "grad_norm": 9.9375, "learning_rate": 2.3461592177331325e-05, "loss": 3.4726, "num_input_tokens_seen": 43441616, "step": 20785 }, { "epoch": 3.391549065992332, "grad_norm": 13.4375, "learning_rate": 2.345179342479946e-05, "loss": 2.6923, "num_input_tokens_seen": 43452752, "step": 20790 }, { "epoch": 3.3923647932131495, "grad_norm": 5.96875, "learning_rate": 2.3441994911020503e-05, "loss": 2.2664, "num_input_tokens_seen": 43463200, "step": 20795 }, { "epoch": 3.393180520433967, "grad_norm": 1.375, "learning_rate": 2.3432196637505522e-05, "loss": 0.7799, "num_input_tokens_seen": 43472928, "step": 20800 }, { "epoch": 3.393180520433967, "eval_loss": 2.5441832542419434, "eval_runtime": 135.0363, "eval_samples_per_second": 20.18, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 43472928, "step": 20800 }, { "epoch": 3.3939962476547842, "grad_norm": 4.96875, "learning_rate": 2.3422398605765515e-05, "loss": 2.1707, "num_input_tokens_seen": 43482976, "step": 20805 }, { "epoch": 3.3948119748756014, "grad_norm": 6.3125, "learning_rate": 2.3412600817311462e-05, "loss": 1.6711, "num_input_tokens_seen": 43493936, "step": 20810 }, { "epoch": 3.395627702096419, "grad_norm": 4.78125, "learning_rate": 2.3402803273654326e-05, "loss": 1.9102, "num_input_tokens_seen": 43503936, "step": 20815 }, { "epoch": 3.3964434293172365, "grad_norm": 10.6875, "learning_rate": 2.3393005976304983e-05, "loss": 3.3553, "num_input_tokens_seen": 43514416, "step": 20820 }, { "epoch": 3.3972591565380537, "grad_norm": 5.0, "learning_rate": 2.338320892677432e-05, "loss": 1.8994, "num_input_tokens_seen": 43525536, "step": 20825 }, { "epoch": 3.398074883758871, "grad_norm": 7.0625, "learning_rate": 2.3373412126573155e-05, "loss": 1.9248, "num_input_tokens_seen": 43537152, "step": 20830 }, { "epoch": 3.3988906109796884, "grad_norm": 4.03125, "learning_rate": 2.3363615577212285e-05, "loss": 2.3896, "num_input_tokens_seen": 43546944, "step": 20835 }, { "epoch": 3.399706338200506, "grad_norm": 3.984375, "learning_rate": 2.3353819280202455e-05, "loss": 1.9429, "num_input_tokens_seen": 43556096, "step": 20840 }, { "epoch": 3.400522065421323, "grad_norm": 8.625, "learning_rate": 2.334402323705438e-05, "loss": 2.7314, "num_input_tokens_seen": 43566256, "step": 20845 }, { "epoch": 3.4013377926421406, "grad_norm": 7.40625, "learning_rate": 2.3334227449278725e-05, "loss": 2.7681, "num_input_tokens_seen": 43576896, "step": 20850 }, { "epoch": 3.4021535198629578, "grad_norm": 9.3125, "learning_rate": 2.3324431918386143e-05, "loss": 2.326, "num_input_tokens_seen": 43587392, "step": 20855 }, { "epoch": 3.4029692470837754, "grad_norm": 9.0625, "learning_rate": 2.3314636645887207e-05, "loss": 3.8835, "num_input_tokens_seen": 43596608, "step": 20860 }, { "epoch": 3.4037849743045925, "grad_norm": 5.96875, "learning_rate": 2.3304841633292487e-05, "loss": 2.3571, "num_input_tokens_seen": 43607840, "step": 20865 }, { "epoch": 3.40460070152541, "grad_norm": 6.90625, "learning_rate": 2.329504688211248e-05, "loss": 1.7162, "num_input_tokens_seen": 43616752, "step": 20870 }, { "epoch": 3.405416428746227, "grad_norm": 7.5, "learning_rate": 2.3285252393857677e-05, "loss": 2.2454, "num_input_tokens_seen": 43628080, "step": 20875 }, { "epoch": 3.4062321559670448, "grad_norm": 7.0625, "learning_rate": 2.327545817003851e-05, "loss": 1.7572, "num_input_tokens_seen": 43638208, "step": 20880 }, { "epoch": 3.407047883187862, "grad_norm": 1.734375, "learning_rate": 2.326566421216535e-05, "loss": 2.4163, "num_input_tokens_seen": 43648688, "step": 20885 }, { "epoch": 3.4078636104086795, "grad_norm": 9.125, "learning_rate": 2.3255870521748565e-05, "loss": 3.6, "num_input_tokens_seen": 43659616, "step": 20890 }, { "epoch": 3.4086793376294966, "grad_norm": 8.625, "learning_rate": 2.3246077100298474e-05, "loss": 1.899, "num_input_tokens_seen": 43669744, "step": 20895 }, { "epoch": 3.409495064850314, "grad_norm": 7.09375, "learning_rate": 2.3236283949325328e-05, "loss": 1.8893, "num_input_tokens_seen": 43682080, "step": 20900 }, { "epoch": 3.4103107920711313, "grad_norm": 6.375, "learning_rate": 2.3226491070339368e-05, "loss": 2.3363, "num_input_tokens_seen": 43692640, "step": 20905 }, { "epoch": 3.411126519291949, "grad_norm": 8.1875, "learning_rate": 2.3216698464850762e-05, "loss": 1.8996, "num_input_tokens_seen": 43702496, "step": 20910 }, { "epoch": 3.411942246512766, "grad_norm": 5.25, "learning_rate": 2.320690613436967e-05, "loss": 2.5471, "num_input_tokens_seen": 43713552, "step": 20915 }, { "epoch": 3.4127579737335836, "grad_norm": 11.25, "learning_rate": 2.3197114080406192e-05, "loss": 2.8929, "num_input_tokens_seen": 43725264, "step": 20920 }, { "epoch": 3.4135737009544007, "grad_norm": 7.15625, "learning_rate": 2.3187322304470365e-05, "loss": 2.3528, "num_input_tokens_seen": 43734864, "step": 20925 }, { "epoch": 3.4143894281752183, "grad_norm": 13.125, "learning_rate": 2.3177530808072222e-05, "loss": 3.1845, "num_input_tokens_seen": 43746240, "step": 20930 }, { "epoch": 3.4152051553960354, "grad_norm": 6.0, "learning_rate": 2.316773959272174e-05, "loss": 1.975, "num_input_tokens_seen": 43757072, "step": 20935 }, { "epoch": 3.416020882616853, "grad_norm": 4.96875, "learning_rate": 2.3157948659928823e-05, "loss": 2.5901, "num_input_tokens_seen": 43766576, "step": 20940 }, { "epoch": 3.41683660983767, "grad_norm": 3.4375, "learning_rate": 2.3148158011203388e-05, "loss": 2.8722, "num_input_tokens_seen": 43777056, "step": 20945 }, { "epoch": 3.4176523370584877, "grad_norm": 7.8125, "learning_rate": 2.3138367648055253e-05, "loss": 2.8573, "num_input_tokens_seen": 43786704, "step": 20950 }, { "epoch": 3.418468064279305, "grad_norm": 8.8125, "learning_rate": 2.312857757199422e-05, "loss": 3.0529, "num_input_tokens_seen": 43797760, "step": 20955 }, { "epoch": 3.4192837915001224, "grad_norm": 11.875, "learning_rate": 2.3118787784530048e-05, "loss": 2.3851, "num_input_tokens_seen": 43807680, "step": 20960 }, { "epoch": 3.4200995187209395, "grad_norm": 0.1513671875, "learning_rate": 2.310899828717243e-05, "loss": 1.5343, "num_input_tokens_seen": 43818064, "step": 20965 }, { "epoch": 3.420915245941757, "grad_norm": 7.28125, "learning_rate": 2.309920908143104e-05, "loss": 2.3401, "num_input_tokens_seen": 43829376, "step": 20970 }, { "epoch": 3.4217309731625742, "grad_norm": 5.21875, "learning_rate": 2.308942016881551e-05, "loss": 2.6402, "num_input_tokens_seen": 43839792, "step": 20975 }, { "epoch": 3.422546700383392, "grad_norm": 12.4375, "learning_rate": 2.307963155083539e-05, "loss": 1.7793, "num_input_tokens_seen": 43850624, "step": 20980 }, { "epoch": 3.423362427604209, "grad_norm": 5.6875, "learning_rate": 2.306984322900022e-05, "loss": 3.2126, "num_input_tokens_seen": 43862416, "step": 20985 }, { "epoch": 3.4241781548250265, "grad_norm": 5.625, "learning_rate": 2.3060055204819482e-05, "loss": 1.8333, "num_input_tokens_seen": 43872432, "step": 20990 }, { "epoch": 3.424993882045844, "grad_norm": 9.375, "learning_rate": 2.3050267479802604e-05, "loss": 2.4744, "num_input_tokens_seen": 43882976, "step": 20995 }, { "epoch": 3.4258096092666612, "grad_norm": 7.0625, "learning_rate": 2.304048005545899e-05, "loss": 1.9196, "num_input_tokens_seen": 43892704, "step": 21000 }, { "epoch": 3.4258096092666612, "eval_loss": 2.540254831314087, "eval_runtime": 135.074, "eval_samples_per_second": 20.174, "eval_steps_per_second": 10.091, "num_input_tokens_seen": 43892704, "step": 21000 }, { "epoch": 3.4266253364874784, "grad_norm": 6.1875, "learning_rate": 2.3030692933297972e-05, "loss": 2.0137, "num_input_tokens_seen": 43903520, "step": 21005 }, { "epoch": 3.427441063708296, "grad_norm": 5.71875, "learning_rate": 2.3020906114828843e-05, "loss": 1.0638, "num_input_tokens_seen": 43912960, "step": 21010 }, { "epoch": 3.4282567909291135, "grad_norm": 3.59375, "learning_rate": 2.301111960156088e-05, "loss": 3.6893, "num_input_tokens_seen": 43924288, "step": 21015 }, { "epoch": 3.4290725181499306, "grad_norm": 8.5, "learning_rate": 2.300133339500326e-05, "loss": 2.2816, "num_input_tokens_seen": 43934240, "step": 21020 }, { "epoch": 3.429888245370748, "grad_norm": 3.765625, "learning_rate": 2.2991547496665148e-05, "loss": 2.6181, "num_input_tokens_seen": 43943504, "step": 21025 }, { "epoch": 3.4307039725915653, "grad_norm": 13.75, "learning_rate": 2.298176190805565e-05, "loss": 2.8553, "num_input_tokens_seen": 43953264, "step": 21030 }, { "epoch": 3.431519699812383, "grad_norm": 3.453125, "learning_rate": 2.2971976630683826e-05, "loss": 1.7463, "num_input_tokens_seen": 43964608, "step": 21035 }, { "epoch": 3.4323354270332, "grad_norm": 6.71875, "learning_rate": 2.29621916660587e-05, "loss": 2.36, "num_input_tokens_seen": 43973664, "step": 21040 }, { "epoch": 3.4331511542540176, "grad_norm": 8.375, "learning_rate": 2.295240701568922e-05, "loss": 3.1605, "num_input_tokens_seen": 43984960, "step": 21045 }, { "epoch": 3.4339668814748348, "grad_norm": 8.5625, "learning_rate": 2.2942622681084312e-05, "loss": 1.5289, "num_input_tokens_seen": 43995424, "step": 21050 }, { "epoch": 3.4347826086956523, "grad_norm": 16.5, "learning_rate": 2.293283866375284e-05, "loss": 3.0836, "num_input_tokens_seen": 44006160, "step": 21055 }, { "epoch": 3.4355983359164695, "grad_norm": 12.8125, "learning_rate": 2.2923054965203627e-05, "loss": 2.526, "num_input_tokens_seen": 44016960, "step": 21060 }, { "epoch": 3.436414063137287, "grad_norm": 4.3125, "learning_rate": 2.2913271586945443e-05, "loss": 3.2256, "num_input_tokens_seen": 44027328, "step": 21065 }, { "epoch": 3.437229790358104, "grad_norm": 4.03125, "learning_rate": 2.290348853048699e-05, "loss": 2.2815, "num_input_tokens_seen": 44037824, "step": 21070 }, { "epoch": 3.4380455175789217, "grad_norm": 4.75, "learning_rate": 2.2893705797336956e-05, "loss": 1.7393, "num_input_tokens_seen": 44046448, "step": 21075 }, { "epoch": 3.438861244799739, "grad_norm": 4.71875, "learning_rate": 2.288392338900397e-05, "loss": 1.8878, "num_input_tokens_seen": 44056896, "step": 21080 }, { "epoch": 3.4396769720205564, "grad_norm": 6.625, "learning_rate": 2.2874141306996576e-05, "loss": 2.6685, "num_input_tokens_seen": 44066624, "step": 21085 }, { "epoch": 3.4404926992413736, "grad_norm": 9.625, "learning_rate": 2.2864359552823312e-05, "loss": 2.1314, "num_input_tokens_seen": 44076384, "step": 21090 }, { "epoch": 3.441308426462191, "grad_norm": 6.375, "learning_rate": 2.2854578127992648e-05, "loss": 3.8402, "num_input_tokens_seen": 44086704, "step": 21095 }, { "epoch": 3.4421241536830083, "grad_norm": 13.75, "learning_rate": 2.2844797034012988e-05, "loss": 2.6663, "num_input_tokens_seen": 44097616, "step": 21100 }, { "epoch": 3.442939880903826, "grad_norm": 1.3828125, "learning_rate": 2.2835016272392722e-05, "loss": 1.6125, "num_input_tokens_seen": 44108928, "step": 21105 }, { "epoch": 3.443755608124643, "grad_norm": 1.890625, "learning_rate": 2.2825235844640142e-05, "loss": 2.2246, "num_input_tokens_seen": 44119888, "step": 21110 }, { "epoch": 3.4445713353454606, "grad_norm": 10.5625, "learning_rate": 2.2815455752263522e-05, "loss": 1.4623, "num_input_tokens_seen": 44130656, "step": 21115 }, { "epoch": 3.4453870625662777, "grad_norm": 3.6875, "learning_rate": 2.2805675996771092e-05, "loss": 2.2528, "num_input_tokens_seen": 44143360, "step": 21120 }, { "epoch": 3.4462027897870953, "grad_norm": 3.0625, "learning_rate": 2.2795896579670987e-05, "loss": 2.188, "num_input_tokens_seen": 44154752, "step": 21125 }, { "epoch": 3.4470185170079124, "grad_norm": 5.96875, "learning_rate": 2.2786117502471337e-05, "loss": 0.6839, "num_input_tokens_seen": 44163728, "step": 21130 }, { "epoch": 3.44783424422873, "grad_norm": 9.8125, "learning_rate": 2.2776338766680185e-05, "loss": 3.2804, "num_input_tokens_seen": 44174928, "step": 21135 }, { "epoch": 3.448649971449547, "grad_norm": 8.125, "learning_rate": 2.2766560373805533e-05, "loss": 2.6387, "num_input_tokens_seen": 44185776, "step": 21140 }, { "epoch": 3.4494656986703647, "grad_norm": 10.0, "learning_rate": 2.2756782325355353e-05, "loss": 3.6729, "num_input_tokens_seen": 44195920, "step": 21145 }, { "epoch": 3.450281425891182, "grad_norm": 10.875, "learning_rate": 2.2747004622837514e-05, "loss": 2.3434, "num_input_tokens_seen": 44206000, "step": 21150 }, { "epoch": 3.4510971531119994, "grad_norm": 3.703125, "learning_rate": 2.2737227267759878e-05, "loss": 3.1034, "num_input_tokens_seen": 44216032, "step": 21155 }, { "epoch": 3.4519128803328165, "grad_norm": 4.4375, "learning_rate": 2.272745026163024e-05, "loss": 2.3247, "num_input_tokens_seen": 44226368, "step": 21160 }, { "epoch": 3.452728607553634, "grad_norm": 10.1875, "learning_rate": 2.271767360595633e-05, "loss": 3.0496, "num_input_tokens_seen": 44237008, "step": 21165 }, { "epoch": 3.4535443347744517, "grad_norm": 5.46875, "learning_rate": 2.270789730224583e-05, "loss": 1.736, "num_input_tokens_seen": 44248144, "step": 21170 }, { "epoch": 3.454360061995269, "grad_norm": 10.6875, "learning_rate": 2.2698121352006367e-05, "loss": 2.5785, "num_input_tokens_seen": 44259040, "step": 21175 }, { "epoch": 3.455175789216086, "grad_norm": 5.6875, "learning_rate": 2.2688345756745517e-05, "loss": 1.6497, "num_input_tokens_seen": 44269184, "step": 21180 }, { "epoch": 3.4559915164369035, "grad_norm": 8.1875, "learning_rate": 2.267857051797081e-05, "loss": 2.6884, "num_input_tokens_seen": 44279808, "step": 21185 }, { "epoch": 3.456807243657721, "grad_norm": 10.1875, "learning_rate": 2.2668795637189695e-05, "loss": 3.2279, "num_input_tokens_seen": 44289072, "step": 21190 }, { "epoch": 3.457622970878538, "grad_norm": 1.1953125, "learning_rate": 2.2659021115909586e-05, "loss": 1.9791, "num_input_tokens_seen": 44298752, "step": 21195 }, { "epoch": 3.4584386980993553, "grad_norm": 3.203125, "learning_rate": 2.2649246955637847e-05, "loss": 2.9388, "num_input_tokens_seen": 44309408, "step": 21200 }, { "epoch": 3.4584386980993553, "eval_loss": 2.5528268814086914, "eval_runtime": 135.0319, "eval_samples_per_second": 20.18, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 44309408, "step": 21200 }, { "epoch": 3.459254425320173, "grad_norm": 9.375, "learning_rate": 2.2639473157881766e-05, "loss": 2.1529, "num_input_tokens_seen": 44320160, "step": 21205 }, { "epoch": 3.4600701525409905, "grad_norm": 5.40625, "learning_rate": 2.2629699724148594e-05, "loss": 3.0936, "num_input_tokens_seen": 44330528, "step": 21210 }, { "epoch": 3.4608858797618076, "grad_norm": 12.25, "learning_rate": 2.26199266559455e-05, "loss": 3.2488, "num_input_tokens_seen": 44340304, "step": 21215 }, { "epoch": 3.461701606982625, "grad_norm": 5.5, "learning_rate": 2.2610153954779625e-05, "loss": 2.3216, "num_input_tokens_seen": 44351648, "step": 21220 }, { "epoch": 3.4625173342034423, "grad_norm": 4.96875, "learning_rate": 2.2600381622158056e-05, "loss": 0.8426, "num_input_tokens_seen": 44361584, "step": 21225 }, { "epoch": 3.46333306142426, "grad_norm": 8.375, "learning_rate": 2.2590609659587783e-05, "loss": 3.1564, "num_input_tokens_seen": 44371200, "step": 21230 }, { "epoch": 3.464148788645077, "grad_norm": 8.8125, "learning_rate": 2.2580838068575787e-05, "loss": 1.7587, "num_input_tokens_seen": 44380544, "step": 21235 }, { "epoch": 3.4649645158658946, "grad_norm": 9.875, "learning_rate": 2.257106685062896e-05, "loss": 3.409, "num_input_tokens_seen": 44389472, "step": 21240 }, { "epoch": 3.4657802430867117, "grad_norm": 9.375, "learning_rate": 2.256129600725415e-05, "loss": 3.0092, "num_input_tokens_seen": 44400704, "step": 21245 }, { "epoch": 3.4665959703075293, "grad_norm": 3.578125, "learning_rate": 2.2551525539958145e-05, "loss": 3.1499, "num_input_tokens_seen": 44411120, "step": 21250 }, { "epoch": 3.4674116975283464, "grad_norm": 10.3125, "learning_rate": 2.2541755450247663e-05, "loss": 3.0787, "num_input_tokens_seen": 44421040, "step": 21255 }, { "epoch": 3.468227424749164, "grad_norm": 10.75, "learning_rate": 2.2531985739629382e-05, "loss": 3.6333, "num_input_tokens_seen": 44431920, "step": 21260 }, { "epoch": 3.469043151969981, "grad_norm": 7.1875, "learning_rate": 2.2522216409609924e-05, "loss": 2.173, "num_input_tokens_seen": 44441328, "step": 21265 }, { "epoch": 3.4698588791907987, "grad_norm": 3.28125, "learning_rate": 2.2512447461695826e-05, "loss": 2.8693, "num_input_tokens_seen": 44452080, "step": 21270 }, { "epoch": 3.470674606411616, "grad_norm": 6.3125, "learning_rate": 2.2502678897393593e-05, "loss": 2.6314, "num_input_tokens_seen": 44463232, "step": 21275 }, { "epoch": 3.4714903336324334, "grad_norm": 10.9375, "learning_rate": 2.2492910718209665e-05, "loss": 2.1098, "num_input_tokens_seen": 44473232, "step": 21280 }, { "epoch": 3.4723060608532506, "grad_norm": 2.4375, "learning_rate": 2.2483142925650398e-05, "loss": 2.548, "num_input_tokens_seen": 44485232, "step": 21285 }, { "epoch": 3.473121788074068, "grad_norm": 9.0625, "learning_rate": 2.247337552122213e-05, "loss": 3.9409, "num_input_tokens_seen": 44493840, "step": 21290 }, { "epoch": 3.4739375152948853, "grad_norm": 5.84375, "learning_rate": 2.24636085064311e-05, "loss": 2.1471, "num_input_tokens_seen": 44504560, "step": 21295 }, { "epoch": 3.474753242515703, "grad_norm": 4.90625, "learning_rate": 2.245384188278351e-05, "loss": 2.7483, "num_input_tokens_seen": 44515136, "step": 21300 }, { "epoch": 3.47556896973652, "grad_norm": 3.0625, "learning_rate": 2.2444075651785513e-05, "loss": 2.0221, "num_input_tokens_seen": 44526256, "step": 21305 }, { "epoch": 3.4763846969573375, "grad_norm": 7.875, "learning_rate": 2.243430981494316e-05, "loss": 2.5828, "num_input_tokens_seen": 44535824, "step": 21310 }, { "epoch": 3.4772004241781547, "grad_norm": 10.625, "learning_rate": 2.2424544373762475e-05, "loss": 2.4823, "num_input_tokens_seen": 44547472, "step": 21315 }, { "epoch": 3.4780161513989722, "grad_norm": 11.875, "learning_rate": 2.2414779329749418e-05, "loss": 1.8334, "num_input_tokens_seen": 44557328, "step": 21320 }, { "epoch": 3.4788318786197894, "grad_norm": 5.28125, "learning_rate": 2.2405014684409873e-05, "loss": 2.3925, "num_input_tokens_seen": 44567168, "step": 21325 }, { "epoch": 3.479647605840607, "grad_norm": 7.375, "learning_rate": 2.239525043924968e-05, "loss": 2.0841, "num_input_tokens_seen": 44577984, "step": 21330 }, { "epoch": 3.480463333061424, "grad_norm": 8.375, "learning_rate": 2.2385486595774592e-05, "loss": 2.8518, "num_input_tokens_seen": 44589184, "step": 21335 }, { "epoch": 3.4812790602822417, "grad_norm": 9.25, "learning_rate": 2.237572315549033e-05, "loss": 3.5003, "num_input_tokens_seen": 44598624, "step": 21340 }, { "epoch": 3.482094787503059, "grad_norm": 4.71875, "learning_rate": 2.2365960119902545e-05, "loss": 1.7744, "num_input_tokens_seen": 44609584, "step": 21345 }, { "epoch": 3.4829105147238764, "grad_norm": 3.0, "learning_rate": 2.2356197490516806e-05, "loss": 2.1558, "num_input_tokens_seen": 44621760, "step": 21350 }, { "epoch": 3.4837262419446935, "grad_norm": 11.0, "learning_rate": 2.234643526883863e-05, "loss": 3.8071, "num_input_tokens_seen": 44632832, "step": 21355 }, { "epoch": 3.484541969165511, "grad_norm": 9.8125, "learning_rate": 2.2336673456373497e-05, "loss": 2.6449, "num_input_tokens_seen": 44642464, "step": 21360 }, { "epoch": 3.4853576963863286, "grad_norm": 10.9375, "learning_rate": 2.2326912054626772e-05, "loss": 2.911, "num_input_tokens_seen": 44652192, "step": 21365 }, { "epoch": 3.4861734236071458, "grad_norm": 3.953125, "learning_rate": 2.2317151065103813e-05, "loss": 1.2649, "num_input_tokens_seen": 44663072, "step": 21370 }, { "epoch": 3.486989150827963, "grad_norm": 6.8125, "learning_rate": 2.2307390489309865e-05, "loss": 1.854, "num_input_tokens_seen": 44674384, "step": 21375 }, { "epoch": 3.4878048780487805, "grad_norm": 12.625, "learning_rate": 2.2297630328750146e-05, "loss": 2.1862, "num_input_tokens_seen": 44684848, "step": 21380 }, { "epoch": 3.488620605269598, "grad_norm": 8.5625, "learning_rate": 2.228787058492979e-05, "loss": 2.4327, "num_input_tokens_seen": 44695120, "step": 21385 }, { "epoch": 3.489436332490415, "grad_norm": 10.25, "learning_rate": 2.2278111259353875e-05, "loss": 3.6651, "num_input_tokens_seen": 44705456, "step": 21390 }, { "epoch": 3.4902520597112328, "grad_norm": 10.0, "learning_rate": 2.2268352353527395e-05, "loss": 3.7307, "num_input_tokens_seen": 44715360, "step": 21395 }, { "epoch": 3.49106778693205, "grad_norm": 3.203125, "learning_rate": 2.225859386895533e-05, "loss": 1.287, "num_input_tokens_seen": 44724144, "step": 21400 }, { "epoch": 3.49106778693205, "eval_loss": 2.545358419418335, "eval_runtime": 134.8349, "eval_samples_per_second": 20.21, "eval_steps_per_second": 10.109, "num_input_tokens_seen": 44724144, "step": 21400 }, { "epoch": 3.4918835141528675, "grad_norm": 3.3125, "learning_rate": 2.2248835807142525e-05, "loss": 1.6725, "num_input_tokens_seen": 44734976, "step": 21405 }, { "epoch": 3.4926992413736846, "grad_norm": 8.125, "learning_rate": 2.2239078169593826e-05, "loss": 1.8006, "num_input_tokens_seen": 44744560, "step": 21410 }, { "epoch": 3.493514968594502, "grad_norm": 9.25, "learning_rate": 2.222932095781396e-05, "loss": 1.9292, "num_input_tokens_seen": 44753984, "step": 21415 }, { "epoch": 3.4943306958153193, "grad_norm": 11.375, "learning_rate": 2.221956417330762e-05, "loss": 2.6013, "num_input_tokens_seen": 44763552, "step": 21420 }, { "epoch": 3.495146423036137, "grad_norm": 2.21875, "learning_rate": 2.2209807817579438e-05, "loss": 1.8122, "num_input_tokens_seen": 44774832, "step": 21425 }, { "epoch": 3.495962150256954, "grad_norm": 9.4375, "learning_rate": 2.220005189213394e-05, "loss": 1.8778, "num_input_tokens_seen": 44784448, "step": 21430 }, { "epoch": 3.4967778774777716, "grad_norm": 6.21875, "learning_rate": 2.2190296398475624e-05, "loss": 2.6066, "num_input_tokens_seen": 44795360, "step": 21435 }, { "epoch": 3.4975936046985887, "grad_norm": 4.96875, "learning_rate": 2.2180541338108926e-05, "loss": 2.3608, "num_input_tokens_seen": 44806224, "step": 21440 }, { "epoch": 3.4984093319194063, "grad_norm": 4.65625, "learning_rate": 2.2170786712538176e-05, "loss": 1.852, "num_input_tokens_seen": 44816992, "step": 21445 }, { "epoch": 3.4992250591402234, "grad_norm": 10.0625, "learning_rate": 2.216103252326768e-05, "loss": 2.8916, "num_input_tokens_seen": 44827984, "step": 21450 }, { "epoch": 3.500040786361041, "grad_norm": 8.875, "learning_rate": 2.2151278771801635e-05, "loss": 2.7281, "num_input_tokens_seen": 44838480, "step": 21455 }, { "epoch": 3.500856513581858, "grad_norm": 8.875, "learning_rate": 2.21415254596442e-05, "loss": 2.6169, "num_input_tokens_seen": 44847680, "step": 21460 }, { "epoch": 3.5016722408026757, "grad_norm": 7.03125, "learning_rate": 2.213177258829947e-05, "loss": 1.8145, "num_input_tokens_seen": 44857872, "step": 21465 }, { "epoch": 3.502487968023493, "grad_norm": 5.625, "learning_rate": 2.2122020159271445e-05, "loss": 2.8653, "num_input_tokens_seen": 44868096, "step": 21470 }, { "epoch": 3.5033036952443104, "grad_norm": 8.5, "learning_rate": 2.2112268174064075e-05, "loss": 1.9789, "num_input_tokens_seen": 44878592, "step": 21475 }, { "epoch": 3.5041194224651275, "grad_norm": 5.34375, "learning_rate": 2.2102516634181253e-05, "loss": 2.1145, "num_input_tokens_seen": 44888304, "step": 21480 }, { "epoch": 3.504935149685945, "grad_norm": 6.96875, "learning_rate": 2.209276554112677e-05, "loss": 2.4559, "num_input_tokens_seen": 44897872, "step": 21485 }, { "epoch": 3.5057508769067622, "grad_norm": 12.25, "learning_rate": 2.2083014896404384e-05, "loss": 2.3114, "num_input_tokens_seen": 44908912, "step": 21490 }, { "epoch": 3.50656660412758, "grad_norm": 9.5625, "learning_rate": 2.207326470151775e-05, "loss": 1.2112, "num_input_tokens_seen": 44919760, "step": 21495 }, { "epoch": 3.507382331348397, "grad_norm": 2.484375, "learning_rate": 2.2063514957970477e-05, "loss": 3.0373, "num_input_tokens_seen": 44930912, "step": 21500 }, { "epoch": 3.5081980585692145, "grad_norm": 10.875, "learning_rate": 2.205376566726611e-05, "loss": 3.1224, "num_input_tokens_seen": 44942144, "step": 21505 }, { "epoch": 3.5090137857900316, "grad_norm": 9.75, "learning_rate": 2.204401683090809e-05, "loss": 2.7421, "num_input_tokens_seen": 44953280, "step": 21510 }, { "epoch": 3.5098295130108492, "grad_norm": 3.65625, "learning_rate": 2.203426845039982e-05, "loss": 1.4317, "num_input_tokens_seen": 44963136, "step": 21515 }, { "epoch": 3.510645240231667, "grad_norm": 2.75, "learning_rate": 2.202452052724464e-05, "loss": 4.4474, "num_input_tokens_seen": 44973664, "step": 21520 }, { "epoch": 3.511460967452484, "grad_norm": 9.0, "learning_rate": 2.2014773062945777e-05, "loss": 2.5751, "num_input_tokens_seen": 44983936, "step": 21525 }, { "epoch": 3.512276694673301, "grad_norm": 7.25, "learning_rate": 2.2005026059006427e-05, "loss": 2.6147, "num_input_tokens_seen": 44994272, "step": 21530 }, { "epoch": 3.5130924218941186, "grad_norm": 8.5625, "learning_rate": 2.1995279516929695e-05, "loss": 2.6797, "num_input_tokens_seen": 45004848, "step": 21535 }, { "epoch": 3.513908149114936, "grad_norm": 0.181640625, "learning_rate": 2.1985533438218613e-05, "loss": 1.8192, "num_input_tokens_seen": 45015872, "step": 21540 }, { "epoch": 3.5147238763357533, "grad_norm": 6.75, "learning_rate": 2.197578782437617e-05, "loss": 1.5126, "num_input_tokens_seen": 45027344, "step": 21545 }, { "epoch": 3.5155396035565705, "grad_norm": 3.34375, "learning_rate": 2.196604267690524e-05, "loss": 1.0659, "num_input_tokens_seen": 45037744, "step": 21550 }, { "epoch": 3.516355330777388, "grad_norm": 4.15625, "learning_rate": 2.195629799730865e-05, "loss": 2.9049, "num_input_tokens_seen": 45048704, "step": 21555 }, { "epoch": 3.5171710579982056, "grad_norm": 7.65625, "learning_rate": 2.1946553787089173e-05, "loss": 2.7401, "num_input_tokens_seen": 45058864, "step": 21560 }, { "epoch": 3.5179867852190227, "grad_norm": 13.75, "learning_rate": 2.193681004774947e-05, "loss": 3.1275, "num_input_tokens_seen": 45069760, "step": 21565 }, { "epoch": 3.51880251243984, "grad_norm": 8.75, "learning_rate": 2.1927066780792154e-05, "loss": 2.7207, "num_input_tokens_seen": 45078912, "step": 21570 }, { "epoch": 3.5196182396606575, "grad_norm": 3.890625, "learning_rate": 2.191732398771975e-05, "loss": 1.6301, "num_input_tokens_seen": 45090800, "step": 21575 }, { "epoch": 3.520433966881475, "grad_norm": 5.03125, "learning_rate": 2.1907581670034725e-05, "loss": 1.3946, "num_input_tokens_seen": 45101264, "step": 21580 }, { "epoch": 3.521249694102292, "grad_norm": 2.953125, "learning_rate": 2.189783982923948e-05, "loss": 3.0403, "num_input_tokens_seen": 45110288, "step": 21585 }, { "epoch": 3.5220654213231093, "grad_norm": 5.5625, "learning_rate": 2.1888098466836303e-05, "loss": 1.342, "num_input_tokens_seen": 45121680, "step": 21590 }, { "epoch": 3.522881148543927, "grad_norm": 6.75, "learning_rate": 2.1878357584327457e-05, "loss": 2.1924, "num_input_tokens_seen": 45132944, "step": 21595 }, { "epoch": 3.5236968757647444, "grad_norm": 6.53125, "learning_rate": 2.1868617183215103e-05, "loss": 1.2195, "num_input_tokens_seen": 45143632, "step": 21600 }, { "epoch": 3.5236968757647444, "eval_loss": 2.533942699432373, "eval_runtime": 134.9233, "eval_samples_per_second": 20.197, "eval_steps_per_second": 10.102, "num_input_tokens_seen": 45143632, "step": 21600 }, { "epoch": 3.5245126029855616, "grad_norm": 4.8125, "learning_rate": 2.1858877265001327e-05, "loss": 1.9932, "num_input_tokens_seen": 45153424, "step": 21605 }, { "epoch": 3.525328330206379, "grad_norm": 13.75, "learning_rate": 2.184913783118816e-05, "loss": 2.5312, "num_input_tokens_seen": 45162688, "step": 21610 }, { "epoch": 3.5261440574271963, "grad_norm": 8.1875, "learning_rate": 2.1839398883277522e-05, "loss": 2.6139, "num_input_tokens_seen": 45173392, "step": 21615 }, { "epoch": 3.526959784648014, "grad_norm": 10.625, "learning_rate": 2.182966042277129e-05, "loss": 1.7992, "num_input_tokens_seen": 45182640, "step": 21620 }, { "epoch": 3.527775511868831, "grad_norm": 7.3125, "learning_rate": 2.181992245117128e-05, "loss": 2.4849, "num_input_tokens_seen": 45192848, "step": 21625 }, { "epoch": 3.5285912390896486, "grad_norm": 4.59375, "learning_rate": 2.181018496997918e-05, "loss": 2.4438, "num_input_tokens_seen": 45203920, "step": 21630 }, { "epoch": 3.5294069663104657, "grad_norm": 3.59375, "learning_rate": 2.1800447980696648e-05, "loss": 2.9273, "num_input_tokens_seen": 45213280, "step": 21635 }, { "epoch": 3.5302226935312833, "grad_norm": 10.5, "learning_rate": 2.1790711484825248e-05, "loss": 3.201, "num_input_tokens_seen": 45223712, "step": 21640 }, { "epoch": 3.5310384207521004, "grad_norm": 5.875, "learning_rate": 2.178097548386646e-05, "loss": 2.8379, "num_input_tokens_seen": 45234368, "step": 21645 }, { "epoch": 3.531854147972918, "grad_norm": 9.6875, "learning_rate": 2.1771239979321712e-05, "loss": 2.3264, "num_input_tokens_seen": 45244656, "step": 21650 }, { "epoch": 3.532669875193735, "grad_norm": 8.875, "learning_rate": 2.1761504972692327e-05, "loss": 2.1105, "num_input_tokens_seen": 45255104, "step": 21655 }, { "epoch": 3.5334856024145527, "grad_norm": 12.9375, "learning_rate": 2.1751770465479572e-05, "loss": 2.3326, "num_input_tokens_seen": 45266464, "step": 21660 }, { "epoch": 3.53430132963537, "grad_norm": 5.4375, "learning_rate": 2.174203645918464e-05, "loss": 1.3326, "num_input_tokens_seen": 45276544, "step": 21665 }, { "epoch": 3.5351170568561874, "grad_norm": 4.4375, "learning_rate": 2.1732302955308624e-05, "loss": 0.8551, "num_input_tokens_seen": 45287312, "step": 21670 }, { "epoch": 3.5359327840770045, "grad_norm": 4.03125, "learning_rate": 2.172256995535255e-05, "loss": 2.3107, "num_input_tokens_seen": 45298960, "step": 21675 }, { "epoch": 3.536748511297822, "grad_norm": 9.6875, "learning_rate": 2.171283746081739e-05, "loss": 2.5911, "num_input_tokens_seen": 45310352, "step": 21680 }, { "epoch": 3.537564238518639, "grad_norm": 8.75, "learning_rate": 2.1703105473203988e-05, "loss": 2.758, "num_input_tokens_seen": 45321376, "step": 21685 }, { "epoch": 3.538379965739457, "grad_norm": 6.625, "learning_rate": 2.1693373994013168e-05, "loss": 3.3842, "num_input_tokens_seen": 45332192, "step": 21690 }, { "epoch": 3.539195692960274, "grad_norm": 8.625, "learning_rate": 2.168364302474562e-05, "loss": 2.7232, "num_input_tokens_seen": 45342800, "step": 21695 }, { "epoch": 3.5400114201810915, "grad_norm": 16.75, "learning_rate": 2.167391256690199e-05, "loss": 1.7412, "num_input_tokens_seen": 45354384, "step": 21700 }, { "epoch": 3.5408271474019086, "grad_norm": 10.0, "learning_rate": 2.1664182621982855e-05, "loss": 1.9621, "num_input_tokens_seen": 45364752, "step": 21705 }, { "epoch": 3.541642874622726, "grad_norm": 5.71875, "learning_rate": 2.1654453191488673e-05, "loss": 2.2376, "num_input_tokens_seen": 45374640, "step": 21710 }, { "epoch": 3.5424586018435438, "grad_norm": 9.8125, "learning_rate": 2.1644724276919846e-05, "loss": 3.0, "num_input_tokens_seen": 45385920, "step": 21715 }, { "epoch": 3.543274329064361, "grad_norm": 7.875, "learning_rate": 2.1634995879776715e-05, "loss": 1.405, "num_input_tokens_seen": 45396848, "step": 21720 }, { "epoch": 3.544090056285178, "grad_norm": 10.0625, "learning_rate": 2.162526800155949e-05, "loss": 1.943, "num_input_tokens_seen": 45405856, "step": 21725 }, { "epoch": 3.5449057835059956, "grad_norm": 5.75, "learning_rate": 2.1615540643768363e-05, "loss": 2.9437, "num_input_tokens_seen": 45417600, "step": 21730 }, { "epoch": 3.545721510726813, "grad_norm": 11.625, "learning_rate": 2.160581380790339e-05, "loss": 3.1273, "num_input_tokens_seen": 45427856, "step": 21735 }, { "epoch": 3.5465372379476303, "grad_norm": 2.953125, "learning_rate": 2.1596087495464586e-05, "loss": 2.0303, "num_input_tokens_seen": 45437600, "step": 21740 }, { "epoch": 3.5473529651684474, "grad_norm": 4.96875, "learning_rate": 2.1586361707951866e-05, "loss": 2.2672, "num_input_tokens_seen": 45449312, "step": 21745 }, { "epoch": 3.548168692389265, "grad_norm": 7.75, "learning_rate": 2.157663644686507e-05, "loss": 2.6136, "num_input_tokens_seen": 45459520, "step": 21750 }, { "epoch": 3.5489844196100826, "grad_norm": 10.6875, "learning_rate": 2.156691171370396e-05, "loss": 2.3074, "num_input_tokens_seen": 45470320, "step": 21755 }, { "epoch": 3.5498001468308997, "grad_norm": 4.25, "learning_rate": 2.1557187509968195e-05, "loss": 3.6416, "num_input_tokens_seen": 45480576, "step": 21760 }, { "epoch": 3.550615874051717, "grad_norm": 6.78125, "learning_rate": 2.1547463837157382e-05, "loss": 2.7181, "num_input_tokens_seen": 45489344, "step": 21765 }, { "epoch": 3.5514316012725344, "grad_norm": 7.8125, "learning_rate": 2.1537740696771045e-05, "loss": 2.2638, "num_input_tokens_seen": 45500592, "step": 21770 }, { "epoch": 3.552247328493352, "grad_norm": 3.4375, "learning_rate": 2.1528018090308587e-05, "loss": 1.5722, "num_input_tokens_seen": 45511568, "step": 21775 }, { "epoch": 3.553063055714169, "grad_norm": 6.53125, "learning_rate": 2.151829601926938e-05, "loss": 1.9293, "num_input_tokens_seen": 45521840, "step": 21780 }, { "epoch": 3.5538787829349863, "grad_norm": 5.34375, "learning_rate": 2.1508574485152684e-05, "loss": 2.0968, "num_input_tokens_seen": 45533632, "step": 21785 }, { "epoch": 3.554694510155804, "grad_norm": 7.78125, "learning_rate": 2.1498853489457667e-05, "loss": 1.8898, "num_input_tokens_seen": 45544752, "step": 21790 }, { "epoch": 3.5555102373766214, "grad_norm": 8.9375, "learning_rate": 2.1489133033683455e-05, "loss": 1.9762, "num_input_tokens_seen": 45556336, "step": 21795 }, { "epoch": 3.5563259645974385, "grad_norm": 8.5625, "learning_rate": 2.1479413119329038e-05, "loss": 1.8093, "num_input_tokens_seen": 45567152, "step": 21800 }, { "epoch": 3.5563259645974385, "eval_loss": 2.5370492935180664, "eval_runtime": 134.8999, "eval_samples_per_second": 20.2, "eval_steps_per_second": 10.104, "num_input_tokens_seen": 45567152, "step": 21800 }, { "epoch": 3.557141691818256, "grad_norm": 5.1875, "learning_rate": 2.1469693747893355e-05, "loss": 2.0237, "num_input_tokens_seen": 45578368, "step": 21805 }, { "epoch": 3.5579574190390733, "grad_norm": 3.390625, "learning_rate": 2.1459974920875274e-05, "loss": 0.9706, "num_input_tokens_seen": 45588992, "step": 21810 }, { "epoch": 3.558773146259891, "grad_norm": 14.4375, "learning_rate": 2.145025663977354e-05, "loss": 2.9433, "num_input_tokens_seen": 45599840, "step": 21815 }, { "epoch": 3.559588873480708, "grad_norm": 5.6875, "learning_rate": 2.1440538906086844e-05, "loss": 1.8411, "num_input_tokens_seen": 45609632, "step": 21820 }, { "epoch": 3.5604046007015255, "grad_norm": 9.9375, "learning_rate": 2.1430821721313782e-05, "loss": 1.7808, "num_input_tokens_seen": 45620832, "step": 21825 }, { "epoch": 3.5612203279223427, "grad_norm": 8.8125, "learning_rate": 2.142110508695286e-05, "loss": 2.4799, "num_input_tokens_seen": 45631616, "step": 21830 }, { "epoch": 3.5620360551431602, "grad_norm": 5.75, "learning_rate": 2.1411389004502515e-05, "loss": 1.2807, "num_input_tokens_seen": 45642048, "step": 21835 }, { "epoch": 3.5628517823639774, "grad_norm": 6.84375, "learning_rate": 2.140167347546107e-05, "loss": 2.209, "num_input_tokens_seen": 45653040, "step": 21840 }, { "epoch": 3.563667509584795, "grad_norm": 4.21875, "learning_rate": 2.1391958501326793e-05, "loss": 3.0142, "num_input_tokens_seen": 45663344, "step": 21845 }, { "epoch": 3.564483236805612, "grad_norm": 6.0, "learning_rate": 2.1382244083597873e-05, "loss": 1.9283, "num_input_tokens_seen": 45672896, "step": 21850 }, { "epoch": 3.5652989640264297, "grad_norm": 4.34375, "learning_rate": 2.137253022377237e-05, "loss": 3.0355, "num_input_tokens_seen": 45684048, "step": 21855 }, { "epoch": 3.566114691247247, "grad_norm": 4.84375, "learning_rate": 2.136281692334829e-05, "loss": 0.8018, "num_input_tokens_seen": 45695088, "step": 21860 }, { "epoch": 3.5669304184680644, "grad_norm": 8.125, "learning_rate": 2.135310418382356e-05, "loss": 1.307, "num_input_tokens_seen": 45705232, "step": 21865 }, { "epoch": 3.5677461456888815, "grad_norm": 6.1875, "learning_rate": 2.134339200669598e-05, "loss": 4.3258, "num_input_tokens_seen": 45715856, "step": 21870 }, { "epoch": 3.568561872909699, "grad_norm": 5.90625, "learning_rate": 2.133368039346331e-05, "loss": 2.3396, "num_input_tokens_seen": 45725552, "step": 21875 }, { "epoch": 3.569377600130516, "grad_norm": 6.375, "learning_rate": 2.1323969345623195e-05, "loss": 2.3826, "num_input_tokens_seen": 45737472, "step": 21880 }, { "epoch": 3.5701933273513338, "grad_norm": 3.34375, "learning_rate": 2.1314258864673207e-05, "loss": 1.5054, "num_input_tokens_seen": 45748224, "step": 21885 }, { "epoch": 3.5710090545721513, "grad_norm": 5.75, "learning_rate": 2.130454895211082e-05, "loss": 2.9304, "num_input_tokens_seen": 45757696, "step": 21890 }, { "epoch": 3.5718247817929685, "grad_norm": 5.84375, "learning_rate": 2.129483960943342e-05, "loss": 2.1773, "num_input_tokens_seen": 45769200, "step": 21895 }, { "epoch": 3.5726405090137856, "grad_norm": 4.4375, "learning_rate": 2.128513083813831e-05, "loss": 1.7327, "num_input_tokens_seen": 45779520, "step": 21900 }, { "epoch": 3.573456236234603, "grad_norm": 9.625, "learning_rate": 2.1275422639722724e-05, "loss": 1.6593, "num_input_tokens_seen": 45790048, "step": 21905 }, { "epoch": 3.5742719634554208, "grad_norm": 4.3125, "learning_rate": 2.126571501568376e-05, "loss": 1.8003, "num_input_tokens_seen": 45800144, "step": 21910 }, { "epoch": 3.575087690676238, "grad_norm": 8.25, "learning_rate": 2.1256007967518478e-05, "loss": 1.1462, "num_input_tokens_seen": 45809344, "step": 21915 }, { "epoch": 3.575903417897055, "grad_norm": 7.21875, "learning_rate": 2.124630149672381e-05, "loss": 3.0097, "num_input_tokens_seen": 45820096, "step": 21920 }, { "epoch": 3.5767191451178726, "grad_norm": 12.6875, "learning_rate": 2.1236595604796624e-05, "loss": 1.9164, "num_input_tokens_seen": 45830448, "step": 21925 }, { "epoch": 3.57753487233869, "grad_norm": 5.90625, "learning_rate": 2.1226890293233693e-05, "loss": 1.6277, "num_input_tokens_seen": 45840032, "step": 21930 }, { "epoch": 3.5783505995595073, "grad_norm": 10.5625, "learning_rate": 2.1217185563531694e-05, "loss": 3.0302, "num_input_tokens_seen": 45850704, "step": 21935 }, { "epoch": 3.5791663267803244, "grad_norm": 7.1875, "learning_rate": 2.120748141718721e-05, "loss": 3.5045, "num_input_tokens_seen": 45861296, "step": 21940 }, { "epoch": 3.579982054001142, "grad_norm": 2.5, "learning_rate": 2.1197777855696765e-05, "loss": 2.5429, "num_input_tokens_seen": 45869984, "step": 21945 }, { "epoch": 3.5807977812219596, "grad_norm": 2.75, "learning_rate": 2.1188074880556746e-05, "loss": 1.6309, "num_input_tokens_seen": 45879968, "step": 21950 }, { "epoch": 3.5816135084427767, "grad_norm": 9.75, "learning_rate": 2.1178372493263495e-05, "loss": 3.3112, "num_input_tokens_seen": 45890608, "step": 21955 }, { "epoch": 3.582429235663594, "grad_norm": 3.78125, "learning_rate": 2.116867069531322e-05, "loss": 1.9357, "num_input_tokens_seen": 45901504, "step": 21960 }, { "epoch": 3.5832449628844114, "grad_norm": 4.78125, "learning_rate": 2.1158969488202073e-05, "loss": 2.1044, "num_input_tokens_seen": 45911232, "step": 21965 }, { "epoch": 3.584060690105229, "grad_norm": 3.375, "learning_rate": 2.114926887342611e-05, "loss": 1.7192, "num_input_tokens_seen": 45922368, "step": 21970 }, { "epoch": 3.584876417326046, "grad_norm": 8.1875, "learning_rate": 2.113956885248127e-05, "loss": 2.5827, "num_input_tokens_seen": 45932448, "step": 21975 }, { "epoch": 3.5856921445468637, "grad_norm": 7.5, "learning_rate": 2.112986942686342e-05, "loss": 4.5316, "num_input_tokens_seen": 45942560, "step": 21980 }, { "epoch": 3.586507871767681, "grad_norm": 6.25, "learning_rate": 2.112017059806835e-05, "loss": 2.7886, "num_input_tokens_seen": 45951824, "step": 21985 }, { "epoch": 3.5873235989884984, "grad_norm": 6.75, "learning_rate": 2.1110472367591724e-05, "loss": 2.4559, "num_input_tokens_seen": 45962640, "step": 21990 }, { "epoch": 3.5881393262093155, "grad_norm": 8.125, "learning_rate": 2.1100774736929145e-05, "loss": 3.4588, "num_input_tokens_seen": 45972624, "step": 21995 }, { "epoch": 3.588955053430133, "grad_norm": 2.84375, "learning_rate": 2.10910777075761e-05, "loss": 2.1424, "num_input_tokens_seen": 45983168, "step": 22000 }, { "epoch": 3.588955053430133, "eval_loss": 2.551025152206421, "eval_runtime": 134.8504, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 45983168, "step": 22000 }, { "epoch": 3.5897707806509502, "grad_norm": 8.75, "learning_rate": 2.108138128102799e-05, "loss": 2.6027, "num_input_tokens_seen": 45994816, "step": 22005 }, { "epoch": 3.590586507871768, "grad_norm": 9.625, "learning_rate": 2.107168545878014e-05, "loss": 2.3617, "num_input_tokens_seen": 46004384, "step": 22010 }, { "epoch": 3.591402235092585, "grad_norm": 5.5625, "learning_rate": 2.106199024232775e-05, "loss": 1.235, "num_input_tokens_seen": 46015552, "step": 22015 }, { "epoch": 3.5922179623134025, "grad_norm": 2.125, "learning_rate": 2.105229563316595e-05, "loss": 1.2198, "num_input_tokens_seen": 46025024, "step": 22020 }, { "epoch": 3.5930336895342196, "grad_norm": 7.78125, "learning_rate": 2.1042601632789784e-05, "loss": 3.5261, "num_input_tokens_seen": 46035584, "step": 22025 }, { "epoch": 3.593849416755037, "grad_norm": 8.8125, "learning_rate": 2.103290824269417e-05, "loss": 2.107, "num_input_tokens_seen": 46046112, "step": 22030 }, { "epoch": 3.5946651439758543, "grad_norm": 10.75, "learning_rate": 2.1023215464373965e-05, "loss": 3.3555, "num_input_tokens_seen": 46057376, "step": 22035 }, { "epoch": 3.595480871196672, "grad_norm": 13.1875, "learning_rate": 2.1013523299323908e-05, "loss": 2.4826, "num_input_tokens_seen": 46068944, "step": 22040 }, { "epoch": 3.596296598417489, "grad_norm": 5.71875, "learning_rate": 2.1003831749038654e-05, "loss": 2.3215, "num_input_tokens_seen": 46080352, "step": 22045 }, { "epoch": 3.5971123256383066, "grad_norm": 13.25, "learning_rate": 2.099414081501277e-05, "loss": 2.1673, "num_input_tokens_seen": 46089872, "step": 22050 }, { "epoch": 3.5979280528591238, "grad_norm": 10.8125, "learning_rate": 2.09844504987407e-05, "loss": 2.5795, "num_input_tokens_seen": 46098160, "step": 22055 }, { "epoch": 3.5987437800799413, "grad_norm": 9.875, "learning_rate": 2.097476080171683e-05, "loss": 2.2625, "num_input_tokens_seen": 46108704, "step": 22060 }, { "epoch": 3.5995595073007585, "grad_norm": 8.4375, "learning_rate": 2.0965071725435436e-05, "loss": 2.4275, "num_input_tokens_seen": 46119664, "step": 22065 }, { "epoch": 3.600375234521576, "grad_norm": 11.0625, "learning_rate": 2.0955383271390684e-05, "loss": 2.9259, "num_input_tokens_seen": 46128512, "step": 22070 }, { "epoch": 3.601190961742393, "grad_norm": 5.5, "learning_rate": 2.094569544107666e-05, "loss": 2.2409, "num_input_tokens_seen": 46137680, "step": 22075 }, { "epoch": 3.6020066889632107, "grad_norm": 2.40625, "learning_rate": 2.093600823598735e-05, "loss": 3.4845, "num_input_tokens_seen": 46148512, "step": 22080 }, { "epoch": 3.6028224161840283, "grad_norm": 13.125, "learning_rate": 2.092632165761663e-05, "loss": 2.5582, "num_input_tokens_seen": 46160000, "step": 22085 }, { "epoch": 3.6036381434048455, "grad_norm": 6.28125, "learning_rate": 2.091663570745832e-05, "loss": 4.3453, "num_input_tokens_seen": 46170320, "step": 22090 }, { "epoch": 3.6044538706256626, "grad_norm": 13.125, "learning_rate": 2.0906950387006086e-05, "loss": 2.9761, "num_input_tokens_seen": 46181552, "step": 22095 }, { "epoch": 3.60526959784648, "grad_norm": 4.28125, "learning_rate": 2.0897265697753543e-05, "loss": 1.2138, "num_input_tokens_seen": 46191984, "step": 22100 }, { "epoch": 3.6060853250672977, "grad_norm": 1.84375, "learning_rate": 2.088758164119419e-05, "loss": 1.7952, "num_input_tokens_seen": 46201616, "step": 22105 }, { "epoch": 3.606901052288115, "grad_norm": 8.625, "learning_rate": 2.0877898218821428e-05, "loss": 2.1018, "num_input_tokens_seen": 46211504, "step": 22110 }, { "epoch": 3.607716779508932, "grad_norm": 9.5625, "learning_rate": 2.0868215432128565e-05, "loss": 1.783, "num_input_tokens_seen": 46222768, "step": 22115 }, { "epoch": 3.6085325067297496, "grad_norm": 6.84375, "learning_rate": 2.0858533282608796e-05, "loss": 2.9174, "num_input_tokens_seen": 46233168, "step": 22120 }, { "epoch": 3.609348233950567, "grad_norm": 5.96875, "learning_rate": 2.084885177175524e-05, "loss": 2.4737, "num_input_tokens_seen": 46243104, "step": 22125 }, { "epoch": 3.6101639611713843, "grad_norm": 6.34375, "learning_rate": 2.0839170901060917e-05, "loss": 3.0332, "num_input_tokens_seen": 46254512, "step": 22130 }, { "epoch": 3.6109796883922014, "grad_norm": 7.9375, "learning_rate": 2.082949067201872e-05, "loss": 1.6607, "num_input_tokens_seen": 46264352, "step": 22135 }, { "epoch": 3.611795415613019, "grad_norm": 9.6875, "learning_rate": 2.0819811086121475e-05, "loss": 2.3821, "num_input_tokens_seen": 46274736, "step": 22140 }, { "epoch": 3.6126111428338366, "grad_norm": 0.1318359375, "learning_rate": 2.08101321448619e-05, "loss": 0.5144, "num_input_tokens_seen": 46285680, "step": 22145 }, { "epoch": 3.6134268700546537, "grad_norm": 7.5, "learning_rate": 2.080045384973259e-05, "loss": 2.5745, "num_input_tokens_seen": 46296784, "step": 22150 }, { "epoch": 3.614242597275471, "grad_norm": 9.1875, "learning_rate": 2.0790776202226082e-05, "loss": 2.8913, "num_input_tokens_seen": 46307888, "step": 22155 }, { "epoch": 3.6150583244962884, "grad_norm": 6.65625, "learning_rate": 2.078109920383477e-05, "loss": 3.1692, "num_input_tokens_seen": 46319136, "step": 22160 }, { "epoch": 3.615874051717106, "grad_norm": 9.1875, "learning_rate": 2.0771422856050978e-05, "loss": 2.8868, "num_input_tokens_seen": 46329120, "step": 22165 }, { "epoch": 3.616689778937923, "grad_norm": 10.5, "learning_rate": 2.076174716036693e-05, "loss": 1.8302, "num_input_tokens_seen": 46339168, "step": 22170 }, { "epoch": 3.6175055061587407, "grad_norm": 8.0, "learning_rate": 2.075207211827472e-05, "loss": 1.7734, "num_input_tokens_seen": 46350352, "step": 22175 }, { "epoch": 3.618321233379558, "grad_norm": 7.125, "learning_rate": 2.074239773126638e-05, "loss": 1.8889, "num_input_tokens_seen": 46361728, "step": 22180 }, { "epoch": 3.6191369606003754, "grad_norm": 7.21875, "learning_rate": 2.073272400083382e-05, "loss": 3.2668, "num_input_tokens_seen": 46372224, "step": 22185 }, { "epoch": 3.6199526878211925, "grad_norm": 10.5, "learning_rate": 2.072305092846883e-05, "loss": 3.6411, "num_input_tokens_seen": 46382016, "step": 22190 }, { "epoch": 3.62076841504201, "grad_norm": 8.25, "learning_rate": 2.0713378515663152e-05, "loss": 2.2546, "num_input_tokens_seen": 46391904, "step": 22195 }, { "epoch": 3.621584142262827, "grad_norm": 9.375, "learning_rate": 2.070370676390836e-05, "loss": 3.3381, "num_input_tokens_seen": 46401184, "step": 22200 }, { "epoch": 3.621584142262827, "eval_loss": 2.5417721271514893, "eval_runtime": 135.0376, "eval_samples_per_second": 20.18, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 46401184, "step": 22200 }, { "epoch": 3.622399869483645, "grad_norm": 5.59375, "learning_rate": 2.0694035674695974e-05, "loss": 1.9254, "num_input_tokens_seen": 46411840, "step": 22205 }, { "epoch": 3.623215596704462, "grad_norm": 4.78125, "learning_rate": 2.0684365249517416e-05, "loss": 0.9247, "num_input_tokens_seen": 46423152, "step": 22210 }, { "epoch": 3.6240313239252795, "grad_norm": 0.890625, "learning_rate": 2.067469548986396e-05, "loss": 1.7019, "num_input_tokens_seen": 46432656, "step": 22215 }, { "epoch": 3.6248470511460966, "grad_norm": 7.25, "learning_rate": 2.066502639722681e-05, "loss": 3.3078, "num_input_tokens_seen": 46443392, "step": 22220 }, { "epoch": 3.625662778366914, "grad_norm": 15.5, "learning_rate": 2.065535797309708e-05, "loss": 2.2293, "num_input_tokens_seen": 46453296, "step": 22225 }, { "epoch": 3.6264785055877313, "grad_norm": 9.3125, "learning_rate": 2.0645690218965736e-05, "loss": 2.746, "num_input_tokens_seen": 46464000, "step": 22230 }, { "epoch": 3.627294232808549, "grad_norm": 7.71875, "learning_rate": 2.063602313632369e-05, "loss": 2.4252, "num_input_tokens_seen": 46475120, "step": 22235 }, { "epoch": 3.628109960029366, "grad_norm": 7.34375, "learning_rate": 2.0626356726661704e-05, "loss": 2.3792, "num_input_tokens_seen": 46486000, "step": 22240 }, { "epoch": 3.6289256872501836, "grad_norm": 7.96875, "learning_rate": 2.0616690991470477e-05, "loss": 2.2951, "num_input_tokens_seen": 46496288, "step": 22245 }, { "epoch": 3.6297414144710007, "grad_norm": 5.21875, "learning_rate": 2.0607025932240595e-05, "loss": 2.6267, "num_input_tokens_seen": 46504000, "step": 22250 }, { "epoch": 3.6305571416918183, "grad_norm": 9.375, "learning_rate": 2.059736155046251e-05, "loss": 2.8624, "num_input_tokens_seen": 46513056, "step": 22255 }, { "epoch": 3.631372868912636, "grad_norm": 9.8125, "learning_rate": 2.0587697847626603e-05, "loss": 3.029, "num_input_tokens_seen": 46523488, "step": 22260 }, { "epoch": 3.632188596133453, "grad_norm": 5.96875, "learning_rate": 2.057803482522314e-05, "loss": 0.8649, "num_input_tokens_seen": 46533808, "step": 22265 }, { "epoch": 3.63300432335427, "grad_norm": 13.0, "learning_rate": 2.056837248474227e-05, "loss": 2.7777, "num_input_tokens_seen": 46544672, "step": 22270 }, { "epoch": 3.6338200505750877, "grad_norm": 2.390625, "learning_rate": 2.0558710827674064e-05, "loss": 3.1123, "num_input_tokens_seen": 46554160, "step": 22275 }, { "epoch": 3.6346357777959053, "grad_norm": 4.46875, "learning_rate": 2.054904985550845e-05, "loss": 1.976, "num_input_tokens_seen": 46564672, "step": 22280 }, { "epoch": 3.6354515050167224, "grad_norm": 5.5, "learning_rate": 2.0539389569735287e-05, "loss": 1.4411, "num_input_tokens_seen": 46574400, "step": 22285 }, { "epoch": 3.6362672322375396, "grad_norm": 5.03125, "learning_rate": 2.052972997184431e-05, "loss": 2.1281, "num_input_tokens_seen": 46583776, "step": 22290 }, { "epoch": 3.637082959458357, "grad_norm": 9.8125, "learning_rate": 2.0520071063325146e-05, "loss": 2.5547, "num_input_tokens_seen": 46593552, "step": 22295 }, { "epoch": 3.6378986866791747, "grad_norm": 3.171875, "learning_rate": 2.051041284566732e-05, "loss": 2.0177, "num_input_tokens_seen": 46603408, "step": 22300 }, { "epoch": 3.638714413899992, "grad_norm": 7.15625, "learning_rate": 2.050075532036026e-05, "loss": 2.819, "num_input_tokens_seen": 46614000, "step": 22305 }, { "epoch": 3.639530141120809, "grad_norm": 5.8125, "learning_rate": 2.0491098488893264e-05, "loss": 1.6924, "num_input_tokens_seen": 46624192, "step": 22310 }, { "epoch": 3.6403458683416265, "grad_norm": 14.1875, "learning_rate": 2.0481442352755546e-05, "loss": 3.2055, "num_input_tokens_seen": 46635600, "step": 22315 }, { "epoch": 3.641161595562444, "grad_norm": 14.5, "learning_rate": 2.0471786913436198e-05, "loss": 3.002, "num_input_tokens_seen": 46646656, "step": 22320 }, { "epoch": 3.6419773227832613, "grad_norm": 5.15625, "learning_rate": 2.0462132172424218e-05, "loss": 3.1638, "num_input_tokens_seen": 46657600, "step": 22325 }, { "epoch": 3.6427930500040784, "grad_norm": 3.09375, "learning_rate": 2.0452478131208484e-05, "loss": 3.7448, "num_input_tokens_seen": 46669040, "step": 22330 }, { "epoch": 3.643608777224896, "grad_norm": 1.7890625, "learning_rate": 2.0442824791277765e-05, "loss": 1.4292, "num_input_tokens_seen": 46679792, "step": 22335 }, { "epoch": 3.6444245044457135, "grad_norm": 6.625, "learning_rate": 2.0433172154120727e-05, "loss": 1.38, "num_input_tokens_seen": 46689280, "step": 22340 }, { "epoch": 3.6452402316665307, "grad_norm": 7.3125, "learning_rate": 2.0423520221225947e-05, "loss": 2.2728, "num_input_tokens_seen": 46697968, "step": 22345 }, { "epoch": 3.6460559588873482, "grad_norm": 5.875, "learning_rate": 2.0413868994081848e-05, "loss": 2.5299, "num_input_tokens_seen": 46708416, "step": 22350 }, { "epoch": 3.6468716861081654, "grad_norm": 5.46875, "learning_rate": 2.0404218474176795e-05, "loss": 2.1664, "num_input_tokens_seen": 46718432, "step": 22355 }, { "epoch": 3.647687413328983, "grad_norm": 7.71875, "learning_rate": 2.0394568662999002e-05, "loss": 2.2535, "num_input_tokens_seen": 46729008, "step": 22360 }, { "epoch": 3.6485031405498, "grad_norm": 9.625, "learning_rate": 2.0384919562036593e-05, "loss": 2.0641, "num_input_tokens_seen": 46740064, "step": 22365 }, { "epoch": 3.6493188677706176, "grad_norm": 6.40625, "learning_rate": 2.0375271172777593e-05, "loss": 2.6467, "num_input_tokens_seen": 46749568, "step": 22370 }, { "epoch": 3.6501345949914348, "grad_norm": 9.6875, "learning_rate": 2.0365623496709885e-05, "loss": 2.2089, "num_input_tokens_seen": 46760304, "step": 22375 }, { "epoch": 3.6509503222122524, "grad_norm": 9.8125, "learning_rate": 2.0355976535321283e-05, "loss": 1.6465, "num_input_tokens_seen": 46771072, "step": 22380 }, { "epoch": 3.6517660494330695, "grad_norm": 14.3125, "learning_rate": 2.034633029009945e-05, "loss": 1.4858, "num_input_tokens_seen": 46781856, "step": 22385 }, { "epoch": 3.652581776653887, "grad_norm": 13.5, "learning_rate": 2.0336684762531972e-05, "loss": 1.6981, "num_input_tokens_seen": 46792112, "step": 22390 }, { "epoch": 3.653397503874704, "grad_norm": 9.75, "learning_rate": 2.032703995410631e-05, "loss": 1.5857, "num_input_tokens_seen": 46801664, "step": 22395 }, { "epoch": 3.6542132310955218, "grad_norm": 5.0625, "learning_rate": 2.031739586630981e-05, "loss": 1.9925, "num_input_tokens_seen": 46813008, "step": 22400 }, { "epoch": 3.6542132310955218, "eval_loss": 2.5449345111846924, "eval_runtime": 134.8181, "eval_samples_per_second": 20.212, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 46813008, "step": 22400 }, { "epoch": 3.655028958316339, "grad_norm": 11.875, "learning_rate": 2.0307752500629707e-05, "loss": 2.8708, "num_input_tokens_seen": 46824208, "step": 22405 }, { "epoch": 3.6558446855371565, "grad_norm": 2.25, "learning_rate": 2.0298109858553144e-05, "loss": 2.2309, "num_input_tokens_seen": 46833312, "step": 22410 }, { "epoch": 3.6566604127579736, "grad_norm": 6.5625, "learning_rate": 2.028846794156712e-05, "loss": 2.2348, "num_input_tokens_seen": 46843360, "step": 22415 }, { "epoch": 3.657476139978791, "grad_norm": 8.0625, "learning_rate": 2.027882675115856e-05, "loss": 2.5278, "num_input_tokens_seen": 46854240, "step": 22420 }, { "epoch": 3.6582918671996083, "grad_norm": 7.34375, "learning_rate": 2.026918628881423e-05, "loss": 1.7401, "num_input_tokens_seen": 46864784, "step": 22425 }, { "epoch": 3.659107594420426, "grad_norm": 7.59375, "learning_rate": 2.0259546556020833e-05, "loss": 1.9327, "num_input_tokens_seen": 46876352, "step": 22430 }, { "epoch": 3.659923321641243, "grad_norm": 7.71875, "learning_rate": 2.024990755426493e-05, "loss": 3.1102, "num_input_tokens_seen": 46886896, "step": 22435 }, { "epoch": 3.6607390488620606, "grad_norm": 9.375, "learning_rate": 2.0240269285032975e-05, "loss": 1.4497, "num_input_tokens_seen": 46898576, "step": 22440 }, { "epoch": 3.6615547760828777, "grad_norm": 7.78125, "learning_rate": 2.0230631749811306e-05, "loss": 1.9124, "num_input_tokens_seen": 46908752, "step": 22445 }, { "epoch": 3.6623705033036953, "grad_norm": 4.8125, "learning_rate": 2.0220994950086162e-05, "loss": 2.8595, "num_input_tokens_seen": 46918848, "step": 22450 }, { "epoch": 3.663186230524513, "grad_norm": 2.59375, "learning_rate": 2.021135888734365e-05, "loss": 1.4355, "num_input_tokens_seen": 46929664, "step": 22455 }, { "epoch": 3.66400195774533, "grad_norm": 3.921875, "learning_rate": 2.0201723563069783e-05, "loss": 1.5092, "num_input_tokens_seen": 46939664, "step": 22460 }, { "epoch": 3.664817684966147, "grad_norm": 7.15625, "learning_rate": 2.0192088978750433e-05, "loss": 2.7935, "num_input_tokens_seen": 46950912, "step": 22465 }, { "epoch": 3.6656334121869647, "grad_norm": 5.6875, "learning_rate": 2.0182455135871385e-05, "loss": 2.3507, "num_input_tokens_seen": 46960272, "step": 22470 }, { "epoch": 3.6664491394077823, "grad_norm": 6.84375, "learning_rate": 2.0172822035918305e-05, "loss": 2.2197, "num_input_tokens_seen": 46970144, "step": 22475 }, { "epoch": 3.6672648666285994, "grad_norm": 10.875, "learning_rate": 2.016318968037671e-05, "loss": 1.6904, "num_input_tokens_seen": 46981040, "step": 22480 }, { "epoch": 3.6680805938494165, "grad_norm": 9.6875, "learning_rate": 2.015355807073206e-05, "loss": 1.5422, "num_input_tokens_seen": 46989744, "step": 22485 }, { "epoch": 3.668896321070234, "grad_norm": 7.59375, "learning_rate": 2.0143927208469664e-05, "loss": 2.2099, "num_input_tokens_seen": 47001376, "step": 22490 }, { "epoch": 3.6697120482910517, "grad_norm": 20.375, "learning_rate": 2.0134297095074708e-05, "loss": 1.9581, "num_input_tokens_seen": 47012816, "step": 22495 }, { "epoch": 3.670527775511869, "grad_norm": 7.34375, "learning_rate": 2.0124667732032297e-05, "loss": 0.9262, "num_input_tokens_seen": 47023488, "step": 22500 }, { "epoch": 3.671343502732686, "grad_norm": 4.46875, "learning_rate": 2.011503912082738e-05, "loss": 1.3541, "num_input_tokens_seen": 47033200, "step": 22505 }, { "epoch": 3.6721592299535035, "grad_norm": 5.875, "learning_rate": 2.0105411262944823e-05, "loss": 2.5225, "num_input_tokens_seen": 47043936, "step": 22510 }, { "epoch": 3.672974957174321, "grad_norm": 1.9375, "learning_rate": 2.0095784159869366e-05, "loss": 3.2647, "num_input_tokens_seen": 47053712, "step": 22515 }, { "epoch": 3.6737906843951382, "grad_norm": 7.5, "learning_rate": 2.0086157813085608e-05, "loss": 1.8715, "num_input_tokens_seen": 47063168, "step": 22520 }, { "epoch": 3.6746064116159554, "grad_norm": 6.96875, "learning_rate": 2.0076532224078068e-05, "loss": 1.3041, "num_input_tokens_seen": 47073968, "step": 22525 }, { "epoch": 3.675422138836773, "grad_norm": 8.8125, "learning_rate": 2.0066907394331142e-05, "loss": 1.3821, "num_input_tokens_seen": 47084944, "step": 22530 }, { "epoch": 3.6762378660575905, "grad_norm": 2.34375, "learning_rate": 2.0057283325329077e-05, "loss": 2.9788, "num_input_tokens_seen": 47095888, "step": 22535 }, { "epoch": 3.6770535932784076, "grad_norm": 12.875, "learning_rate": 2.0047660018556047e-05, "loss": 1.3853, "num_input_tokens_seen": 47106384, "step": 22540 }, { "epoch": 3.677869320499225, "grad_norm": 20.375, "learning_rate": 2.0038037475496075e-05, "loss": 2.719, "num_input_tokens_seen": 47117376, "step": 22545 }, { "epoch": 3.6786850477200423, "grad_norm": 8.1875, "learning_rate": 2.0028415697633073e-05, "loss": 2.7743, "num_input_tokens_seen": 47127712, "step": 22550 }, { "epoch": 3.67950077494086, "grad_norm": 7.71875, "learning_rate": 2.0018794686450858e-05, "loss": 2.6223, "num_input_tokens_seen": 47136112, "step": 22555 }, { "epoch": 3.680316502161677, "grad_norm": 6.75, "learning_rate": 2.0009174443433088e-05, "loss": 2.0775, "num_input_tokens_seen": 47147904, "step": 22560 }, { "epoch": 3.6811322293824946, "grad_norm": 7.4375, "learning_rate": 1.999955497006334e-05, "loss": 2.8767, "num_input_tokens_seen": 47158112, "step": 22565 }, { "epoch": 3.6819479566033118, "grad_norm": 17.5, "learning_rate": 1.9989936267825067e-05, "loss": 1.9076, "num_input_tokens_seen": 47168512, "step": 22570 }, { "epoch": 3.6827636838241293, "grad_norm": 4.09375, "learning_rate": 1.9980318338201572e-05, "loss": 1.7447, "num_input_tokens_seen": 47178176, "step": 22575 }, { "epoch": 3.6835794110449465, "grad_norm": 6.4375, "learning_rate": 1.997070118267607e-05, "loss": 2.6484, "num_input_tokens_seen": 47190304, "step": 22580 }, { "epoch": 3.684395138265764, "grad_norm": 6.96875, "learning_rate": 1.9961084802731654e-05, "loss": 2.9103, "num_input_tokens_seen": 47201680, "step": 22585 }, { "epoch": 3.685210865486581, "grad_norm": 22.625, "learning_rate": 1.9951469199851273e-05, "loss": 3.0654, "num_input_tokens_seen": 47213200, "step": 22590 }, { "epoch": 3.6860265927073987, "grad_norm": 10.125, "learning_rate": 1.99418543755178e-05, "loss": 2.4724, "num_input_tokens_seen": 47224736, "step": 22595 }, { "epoch": 3.686842319928216, "grad_norm": 6.375, "learning_rate": 1.9932240331213936e-05, "loss": 1.7949, "num_input_tokens_seen": 47233968, "step": 22600 }, { "epoch": 3.686842319928216, "eval_loss": 2.540861129760742, "eval_runtime": 134.858, "eval_samples_per_second": 20.206, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 47233968, "step": 22600 }, { "epoch": 3.6876580471490334, "grad_norm": 9.25, "learning_rate": 1.9922627068422297e-05, "loss": 2.1231, "num_input_tokens_seen": 47242496, "step": 22605 }, { "epoch": 3.6884737743698506, "grad_norm": 2.9375, "learning_rate": 1.991301458862538e-05, "loss": 1.5806, "num_input_tokens_seen": 47254336, "step": 22610 }, { "epoch": 3.689289501590668, "grad_norm": 2.03125, "learning_rate": 1.9903402893305536e-05, "loss": 1.5829, "num_input_tokens_seen": 47264384, "step": 22615 }, { "epoch": 3.6901052288114853, "grad_norm": 2.40625, "learning_rate": 1.9893791983945016e-05, "loss": 1.3518, "num_input_tokens_seen": 47276400, "step": 22620 }, { "epoch": 3.690920956032303, "grad_norm": 4.8125, "learning_rate": 1.988418186202594e-05, "loss": 2.9133, "num_input_tokens_seen": 47287040, "step": 22625 }, { "epoch": 3.6917366832531204, "grad_norm": 7.46875, "learning_rate": 1.98745725290303e-05, "loss": 3.1148, "num_input_tokens_seen": 47298864, "step": 22630 }, { "epoch": 3.6925524104739376, "grad_norm": 7.40625, "learning_rate": 1.986496398644e-05, "loss": 2.4159, "num_input_tokens_seen": 47307920, "step": 22635 }, { "epoch": 3.6933681376947547, "grad_norm": 5.46875, "learning_rate": 1.9855356235736777e-05, "loss": 1.9059, "num_input_tokens_seen": 47319520, "step": 22640 }, { "epoch": 3.6941838649155723, "grad_norm": 6.875, "learning_rate": 1.9845749278402277e-05, "loss": 3.8034, "num_input_tokens_seen": 47330448, "step": 22645 }, { "epoch": 3.69499959213639, "grad_norm": 10.5625, "learning_rate": 1.9836143115918006e-05, "loss": 2.7537, "num_input_tokens_seen": 47339728, "step": 22650 }, { "epoch": 3.695815319357207, "grad_norm": 3.59375, "learning_rate": 1.9826537749765367e-05, "loss": 2.2537, "num_input_tokens_seen": 47350000, "step": 22655 }, { "epoch": 3.696631046578024, "grad_norm": 7.09375, "learning_rate": 1.9816933181425625e-05, "loss": 3.7077, "num_input_tokens_seen": 47360432, "step": 22660 }, { "epoch": 3.6974467737988417, "grad_norm": 17.125, "learning_rate": 1.9807329412379903e-05, "loss": 2.2047, "num_input_tokens_seen": 47371264, "step": 22665 }, { "epoch": 3.6982625010196593, "grad_norm": 4.8125, "learning_rate": 1.9797726444109247e-05, "loss": 1.4649, "num_input_tokens_seen": 47382144, "step": 22670 }, { "epoch": 3.6990782282404764, "grad_norm": 0.5703125, "learning_rate": 1.9788124278094557e-05, "loss": 2.3226, "num_input_tokens_seen": 47391712, "step": 22675 }, { "epoch": 3.6998939554612935, "grad_norm": 8.0625, "learning_rate": 1.9778522915816594e-05, "loss": 2.0422, "num_input_tokens_seen": 47402080, "step": 22680 }, { "epoch": 3.700709682682111, "grad_norm": 4.59375, "learning_rate": 1.9768922358756014e-05, "loss": 2.8576, "num_input_tokens_seen": 47412720, "step": 22685 }, { "epoch": 3.7015254099029287, "grad_norm": 4.15625, "learning_rate": 1.9759322608393353e-05, "loss": 2.4574, "num_input_tokens_seen": 47422368, "step": 22690 }, { "epoch": 3.702341137123746, "grad_norm": 4.625, "learning_rate": 1.9749723666208992e-05, "loss": 2.4857, "num_input_tokens_seen": 47433600, "step": 22695 }, { "epoch": 3.703156864344563, "grad_norm": 8.0625, "learning_rate": 1.9740125533683235e-05, "loss": 2.4607, "num_input_tokens_seen": 47444624, "step": 22700 }, { "epoch": 3.7039725915653805, "grad_norm": 8.5625, "learning_rate": 1.9730528212296208e-05, "loss": 3.2363, "num_input_tokens_seen": 47456112, "step": 22705 }, { "epoch": 3.704788318786198, "grad_norm": 8.5625, "learning_rate": 1.9720931703527945e-05, "loss": 2.4641, "num_input_tokens_seen": 47466080, "step": 22710 }, { "epoch": 3.705604046007015, "grad_norm": 9.375, "learning_rate": 1.9711336008858373e-05, "loss": 4.4062, "num_input_tokens_seen": 47476688, "step": 22715 }, { "epoch": 3.7064197732278323, "grad_norm": 12.125, "learning_rate": 1.9701741129767233e-05, "loss": 3.5217, "num_input_tokens_seen": 47487744, "step": 22720 }, { "epoch": 3.70723550044865, "grad_norm": 7.0625, "learning_rate": 1.9692147067734202e-05, "loss": 2.8436, "num_input_tokens_seen": 47497392, "step": 22725 }, { "epoch": 3.7080512276694675, "grad_norm": 3.890625, "learning_rate": 1.96825538242388e-05, "loss": 1.948, "num_input_tokens_seen": 47508064, "step": 22730 }, { "epoch": 3.7088669548902846, "grad_norm": 8.3125, "learning_rate": 1.967296140076041e-05, "loss": 2.656, "num_input_tokens_seen": 47518080, "step": 22735 }, { "epoch": 3.709682682111102, "grad_norm": 6.96875, "learning_rate": 1.966336979877833e-05, "loss": 2.6002, "num_input_tokens_seen": 47529040, "step": 22740 }, { "epoch": 3.7104984093319193, "grad_norm": 5.90625, "learning_rate": 1.9653779019771678e-05, "loss": 2.2671, "num_input_tokens_seen": 47537760, "step": 22745 }, { "epoch": 3.711314136552737, "grad_norm": 10.1875, "learning_rate": 1.9644189065219488e-05, "loss": 1.8158, "num_input_tokens_seen": 47548544, "step": 22750 }, { "epoch": 3.712129863773554, "grad_norm": 7.03125, "learning_rate": 1.9634599936600655e-05, "loss": 2.112, "num_input_tokens_seen": 47558368, "step": 22755 }, { "epoch": 3.7129455909943716, "grad_norm": 5.1875, "learning_rate": 1.9625011635393935e-05, "loss": 1.1854, "num_input_tokens_seen": 47567744, "step": 22760 }, { "epoch": 3.7137613182151887, "grad_norm": 7.5, "learning_rate": 1.9615424163077963e-05, "loss": 3.1212, "num_input_tokens_seen": 47577488, "step": 22765 }, { "epoch": 3.7145770454360063, "grad_norm": 2.78125, "learning_rate": 1.9605837521131263e-05, "loss": 3.4194, "num_input_tokens_seen": 47587760, "step": 22770 }, { "epoch": 3.7153927726568234, "grad_norm": 1.546875, "learning_rate": 1.9596251711032192e-05, "loss": 1.0511, "num_input_tokens_seen": 47598960, "step": 22775 }, { "epoch": 3.716208499877641, "grad_norm": 3.28125, "learning_rate": 1.958666673425903e-05, "loss": 1.9501, "num_input_tokens_seen": 47608400, "step": 22780 }, { "epoch": 3.717024227098458, "grad_norm": 2.640625, "learning_rate": 1.957708259228987e-05, "loss": 2.2746, "num_input_tokens_seen": 47620192, "step": 22785 }, { "epoch": 3.7178399543192757, "grad_norm": 6.375, "learning_rate": 1.956749928660273e-05, "loss": 3.0651, "num_input_tokens_seen": 47629360, "step": 22790 }, { "epoch": 3.718655681540093, "grad_norm": 12.75, "learning_rate": 1.955791681867547e-05, "loss": 2.5457, "num_input_tokens_seen": 47639456, "step": 22795 }, { "epoch": 3.7194714087609104, "grad_norm": 4.46875, "learning_rate": 1.9548335189985824e-05, "loss": 2.2354, "num_input_tokens_seen": 47650016, "step": 22800 }, { "epoch": 3.7194714087609104, "eval_loss": 2.5383472442626953, "eval_runtime": 134.9191, "eval_samples_per_second": 20.197, "eval_steps_per_second": 10.102, "num_input_tokens_seen": 47650016, "step": 22800 }, { "epoch": 3.7202871359817276, "grad_norm": 2.46875, "learning_rate": 1.9538754402011396e-05, "loss": 3.3677, "num_input_tokens_seen": 47661056, "step": 22805 }, { "epoch": 3.721102863202545, "grad_norm": 8.25, "learning_rate": 1.952917445622968e-05, "loss": 2.9967, "num_input_tokens_seen": 47671200, "step": 22810 }, { "epoch": 3.7219185904233623, "grad_norm": 10.5625, "learning_rate": 1.9519595354118005e-05, "loss": 3.0766, "num_input_tokens_seen": 47681408, "step": 22815 }, { "epoch": 3.72273431764418, "grad_norm": 10.125, "learning_rate": 1.951001709715361e-05, "loss": 2.6799, "num_input_tokens_seen": 47691872, "step": 22820 }, { "epoch": 3.7235500448649974, "grad_norm": 3.59375, "learning_rate": 1.9500439686813556e-05, "loss": 2.3022, "num_input_tokens_seen": 47703248, "step": 22825 }, { "epoch": 3.7243657720858145, "grad_norm": 10.125, "learning_rate": 1.949086312457482e-05, "loss": 2.3113, "num_input_tokens_seen": 47715152, "step": 22830 }, { "epoch": 3.7251814993066317, "grad_norm": 12.0, "learning_rate": 1.9481287411914223e-05, "loss": 1.6907, "num_input_tokens_seen": 47725680, "step": 22835 }, { "epoch": 3.7259972265274492, "grad_norm": 5.1875, "learning_rate": 1.9471712550308457e-05, "loss": 2.8205, "num_input_tokens_seen": 47737184, "step": 22840 }, { "epoch": 3.726812953748267, "grad_norm": 7.40625, "learning_rate": 1.946213854123409e-05, "loss": 3.477, "num_input_tokens_seen": 47747424, "step": 22845 }, { "epoch": 3.727628680969084, "grad_norm": 13.875, "learning_rate": 1.9452565386167554e-05, "loss": 2.4781, "num_input_tokens_seen": 47757664, "step": 22850 }, { "epoch": 3.728444408189901, "grad_norm": 0.234375, "learning_rate": 1.9442993086585142e-05, "loss": 2.8343, "num_input_tokens_seen": 47767664, "step": 22855 }, { "epoch": 3.7292601354107187, "grad_norm": 3.328125, "learning_rate": 1.9433421643963043e-05, "loss": 1.6406, "num_input_tokens_seen": 47778208, "step": 22860 }, { "epoch": 3.7300758626315362, "grad_norm": 1.7734375, "learning_rate": 1.942385105977727e-05, "loss": 1.7675, "num_input_tokens_seen": 47789456, "step": 22865 }, { "epoch": 3.7308915898523534, "grad_norm": 6.71875, "learning_rate": 1.9414281335503743e-05, "loss": 2.7943, "num_input_tokens_seen": 47800480, "step": 22870 }, { "epoch": 3.7317073170731705, "grad_norm": 6.625, "learning_rate": 1.9404712472618232e-05, "loss": 2.1083, "num_input_tokens_seen": 47810048, "step": 22875 }, { "epoch": 3.732523044293988, "grad_norm": 8.125, "learning_rate": 1.939514447259636e-05, "loss": 2.8386, "num_input_tokens_seen": 47820320, "step": 22880 }, { "epoch": 3.7333387715148056, "grad_norm": 10.5, "learning_rate": 1.938557733691365e-05, "loss": 2.1758, "num_input_tokens_seen": 47830800, "step": 22885 }, { "epoch": 3.7341544987356228, "grad_norm": 11.125, "learning_rate": 1.9376011067045476e-05, "loss": 2.5858, "num_input_tokens_seen": 47840464, "step": 22890 }, { "epoch": 3.73497022595644, "grad_norm": 5.5625, "learning_rate": 1.9366445664467065e-05, "loss": 2.0333, "num_input_tokens_seen": 47849600, "step": 22895 }, { "epoch": 3.7357859531772575, "grad_norm": 7.625, "learning_rate": 1.9356881130653533e-05, "loss": 2.3987, "num_input_tokens_seen": 47859616, "step": 22900 }, { "epoch": 3.736601680398075, "grad_norm": 10.625, "learning_rate": 1.9347317467079846e-05, "loss": 2.7427, "num_input_tokens_seen": 47869712, "step": 22905 }, { "epoch": 3.737417407618892, "grad_norm": 5.78125, "learning_rate": 1.9337754675220836e-05, "loss": 2.4678, "num_input_tokens_seen": 47879472, "step": 22910 }, { "epoch": 3.7382331348397098, "grad_norm": 13.75, "learning_rate": 1.9328192756551218e-05, "loss": 2.4255, "num_input_tokens_seen": 47889008, "step": 22915 }, { "epoch": 3.739048862060527, "grad_norm": 6.28125, "learning_rate": 1.931863171254555e-05, "loss": 2.0429, "num_input_tokens_seen": 47899936, "step": 22920 }, { "epoch": 3.7398645892813445, "grad_norm": 7.625, "learning_rate": 1.930907154467826e-05, "loss": 2.0553, "num_input_tokens_seen": 47910496, "step": 22925 }, { "epoch": 3.7406803165021616, "grad_norm": 7.0625, "learning_rate": 1.9299512254423673e-05, "loss": 2.8983, "num_input_tokens_seen": 47918832, "step": 22930 }, { "epoch": 3.741496043722979, "grad_norm": 6.46875, "learning_rate": 1.9289953843255914e-05, "loss": 2.1715, "num_input_tokens_seen": 47929264, "step": 22935 }, { "epoch": 3.7423117709437963, "grad_norm": 7.75, "learning_rate": 1.9280396312649048e-05, "loss": 4.5285, "num_input_tokens_seen": 47940032, "step": 22940 }, { "epoch": 3.743127498164614, "grad_norm": 10.5625, "learning_rate": 1.9270839664076936e-05, "loss": 2.8065, "num_input_tokens_seen": 47949600, "step": 22945 }, { "epoch": 3.743943225385431, "grad_norm": 11.375, "learning_rate": 1.9261283899013345e-05, "loss": 2.2834, "num_input_tokens_seen": 47959792, "step": 22950 }, { "epoch": 3.7447589526062486, "grad_norm": 5.5, "learning_rate": 1.92517290189319e-05, "loss": 1.95, "num_input_tokens_seen": 47969680, "step": 22955 }, { "epoch": 3.7455746798270657, "grad_norm": 6.65625, "learning_rate": 1.924217502530607e-05, "loss": 2.2995, "num_input_tokens_seen": 47980032, "step": 22960 }, { "epoch": 3.7463904070478833, "grad_norm": 3.515625, "learning_rate": 1.9232621919609207e-05, "loss": 2.1178, "num_input_tokens_seen": 47991488, "step": 22965 }, { "epoch": 3.7472061342687004, "grad_norm": 3.796875, "learning_rate": 1.9223069703314534e-05, "loss": 3.0665, "num_input_tokens_seen": 48001632, "step": 22970 }, { "epoch": 3.748021861489518, "grad_norm": 1.421875, "learning_rate": 1.92135183778951e-05, "loss": 1.6698, "num_input_tokens_seen": 48012864, "step": 22975 }, { "epoch": 3.748837588710335, "grad_norm": 13.4375, "learning_rate": 1.9203967944823857e-05, "loss": 3.7176, "num_input_tokens_seen": 48023408, "step": 22980 }, { "epoch": 3.7496533159311527, "grad_norm": 2.21875, "learning_rate": 1.9194418405573588e-05, "loss": 2.5234, "num_input_tokens_seen": 48033072, "step": 22985 }, { "epoch": 3.75046904315197, "grad_norm": 3.6875, "learning_rate": 1.9184869761616954e-05, "loss": 2.1939, "num_input_tokens_seen": 48043264, "step": 22990 }, { "epoch": 3.7512847703727874, "grad_norm": 11.25, "learning_rate": 1.9175322014426495e-05, "loss": 3.7622, "num_input_tokens_seen": 48053616, "step": 22995 }, { "epoch": 3.7521004975936045, "grad_norm": 4.78125, "learning_rate": 1.9165775165474565e-05, "loss": 2.1881, "num_input_tokens_seen": 48064160, "step": 23000 }, { "epoch": 3.7521004975936045, "eval_loss": 2.535382032394409, "eval_runtime": 134.8347, "eval_samples_per_second": 20.21, "eval_steps_per_second": 10.109, "num_input_tokens_seen": 48064160, "step": 23000 }, { "epoch": 3.752916224814422, "grad_norm": 2.421875, "learning_rate": 1.9156229216233434e-05, "loss": 2.4889, "num_input_tokens_seen": 48074288, "step": 23005 }, { "epoch": 3.7537319520352392, "grad_norm": 8.0, "learning_rate": 1.9146684168175184e-05, "loss": 2.9693, "num_input_tokens_seen": 48085456, "step": 23010 }, { "epoch": 3.754547679256057, "grad_norm": 5.15625, "learning_rate": 1.9137140022771796e-05, "loss": 1.8511, "num_input_tokens_seen": 48096896, "step": 23015 }, { "epoch": 3.7553634064768744, "grad_norm": 3.921875, "learning_rate": 1.9127596781495103e-05, "loss": 2.1199, "num_input_tokens_seen": 48108224, "step": 23020 }, { "epoch": 3.7561791336976915, "grad_norm": 3.34375, "learning_rate": 1.9118054445816767e-05, "loss": 0.5095, "num_input_tokens_seen": 48118192, "step": 23025 }, { "epoch": 3.7569948609185086, "grad_norm": 13.0, "learning_rate": 1.9108513017208356e-05, "loss": 1.7675, "num_input_tokens_seen": 48128848, "step": 23030 }, { "epoch": 3.7578105881393262, "grad_norm": 6.71875, "learning_rate": 1.9098972497141287e-05, "loss": 2.165, "num_input_tokens_seen": 48139056, "step": 23035 }, { "epoch": 3.758626315360144, "grad_norm": 5.1875, "learning_rate": 1.9089432887086806e-05, "loss": 2.6717, "num_input_tokens_seen": 48148928, "step": 23040 }, { "epoch": 3.759442042580961, "grad_norm": 10.375, "learning_rate": 1.9079894188516056e-05, "loss": 2.7976, "num_input_tokens_seen": 48159760, "step": 23045 }, { "epoch": 3.760257769801778, "grad_norm": 10.875, "learning_rate": 1.907035640290002e-05, "loss": 2.4682, "num_input_tokens_seen": 48169120, "step": 23050 }, { "epoch": 3.7610734970225956, "grad_norm": 12.125, "learning_rate": 1.9060819531709534e-05, "loss": 2.1562, "num_input_tokens_seen": 48180832, "step": 23055 }, { "epoch": 3.761889224243413, "grad_norm": 1.859375, "learning_rate": 1.9051283576415325e-05, "loss": 2.6737, "num_input_tokens_seen": 48189840, "step": 23060 }, { "epoch": 3.7627049514642303, "grad_norm": 15.4375, "learning_rate": 1.904174853848793e-05, "loss": 3.4441, "num_input_tokens_seen": 48200272, "step": 23065 }, { "epoch": 3.7635206786850475, "grad_norm": 5.53125, "learning_rate": 1.903221441939779e-05, "loss": 2.2975, "num_input_tokens_seen": 48211040, "step": 23070 }, { "epoch": 3.764336405905865, "grad_norm": 6.96875, "learning_rate": 1.9022681220615194e-05, "loss": 3.1918, "num_input_tokens_seen": 48222480, "step": 23075 }, { "epoch": 3.7651521331266826, "grad_norm": 13.0, "learning_rate": 1.9013148943610255e-05, "loss": 2.9126, "num_input_tokens_seen": 48233664, "step": 23080 }, { "epoch": 3.7659678603474998, "grad_norm": 7.65625, "learning_rate": 1.9003617589852998e-05, "loss": 2.6854, "num_input_tokens_seen": 48243680, "step": 23085 }, { "epoch": 3.766783587568317, "grad_norm": 8.75, "learning_rate": 1.899408716081326e-05, "loss": 2.093, "num_input_tokens_seen": 48254128, "step": 23090 }, { "epoch": 3.7675993147891345, "grad_norm": 7.1875, "learning_rate": 1.898455765796075e-05, "loss": 2.514, "num_input_tokens_seen": 48263856, "step": 23095 }, { "epoch": 3.768415042009952, "grad_norm": 8.9375, "learning_rate": 1.8975029082765053e-05, "loss": 3.3207, "num_input_tokens_seen": 48276144, "step": 23100 }, { "epoch": 3.769230769230769, "grad_norm": 8.5625, "learning_rate": 1.8965501436695577e-05, "loss": 3.4451, "num_input_tokens_seen": 48286768, "step": 23105 }, { "epoch": 3.7700464964515867, "grad_norm": 7.03125, "learning_rate": 1.895597472122161e-05, "loss": 3.3405, "num_input_tokens_seen": 48296896, "step": 23110 }, { "epoch": 3.770862223672404, "grad_norm": 9.4375, "learning_rate": 1.894644893781231e-05, "loss": 1.7976, "num_input_tokens_seen": 48306576, "step": 23115 }, { "epoch": 3.7716779508932214, "grad_norm": 3.15625, "learning_rate": 1.893692408793665e-05, "loss": 2.3311, "num_input_tokens_seen": 48317200, "step": 23120 }, { "epoch": 3.7724936781140386, "grad_norm": 4.625, "learning_rate": 1.8927400173063493e-05, "loss": 2.2113, "num_input_tokens_seen": 48328080, "step": 23125 }, { "epoch": 3.773309405334856, "grad_norm": 8.4375, "learning_rate": 1.891787719466154e-05, "loss": 3.2427, "num_input_tokens_seen": 48338848, "step": 23130 }, { "epoch": 3.7741251325556733, "grad_norm": 2.78125, "learning_rate": 1.8908355154199346e-05, "loss": 2.6816, "num_input_tokens_seen": 48347888, "step": 23135 }, { "epoch": 3.774940859776491, "grad_norm": 18.75, "learning_rate": 1.8898834053145357e-05, "loss": 1.594, "num_input_tokens_seen": 48357728, "step": 23140 }, { "epoch": 3.775756586997308, "grad_norm": 3.375, "learning_rate": 1.8889313892967813e-05, "loss": 1.8555, "num_input_tokens_seen": 48368928, "step": 23145 }, { "epoch": 3.7765723142181256, "grad_norm": 8.5625, "learning_rate": 1.8879794675134863e-05, "loss": 3.3091, "num_input_tokens_seen": 48378848, "step": 23150 }, { "epoch": 3.7773880414389427, "grad_norm": 10.875, "learning_rate": 1.8870276401114494e-05, "loss": 1.8459, "num_input_tokens_seen": 48389664, "step": 23155 }, { "epoch": 3.7782037686597603, "grad_norm": 2.015625, "learning_rate": 1.886075907237453e-05, "loss": 2.8076, "num_input_tokens_seen": 48400928, "step": 23160 }, { "epoch": 3.7790194958805774, "grad_norm": 1.6484375, "learning_rate": 1.8851242690382672e-05, "loss": 1.3478, "num_input_tokens_seen": 48412032, "step": 23165 }, { "epoch": 3.779835223101395, "grad_norm": 9.25, "learning_rate": 1.884172725660645e-05, "loss": 1.2957, "num_input_tokens_seen": 48422704, "step": 23170 }, { "epoch": 3.780650950322212, "grad_norm": 4.03125, "learning_rate": 1.8832212772513277e-05, "loss": 2.1649, "num_input_tokens_seen": 48433104, "step": 23175 }, { "epoch": 3.7814666775430297, "grad_norm": 6.1875, "learning_rate": 1.8822699239570414e-05, "loss": 3.6983, "num_input_tokens_seen": 48443632, "step": 23180 }, { "epoch": 3.782282404763847, "grad_norm": 7.375, "learning_rate": 1.8813186659244943e-05, "loss": 4.2684, "num_input_tokens_seen": 48453504, "step": 23185 }, { "epoch": 3.7830981319846644, "grad_norm": 7.5, "learning_rate": 1.880367503300385e-05, "loss": 2.2703, "num_input_tokens_seen": 48463680, "step": 23190 }, { "epoch": 3.783913859205482, "grad_norm": 8.5625, "learning_rate": 1.8794164362313927e-05, "loss": 2.9701, "num_input_tokens_seen": 48473712, "step": 23195 }, { "epoch": 3.784729586426299, "grad_norm": 9.0, "learning_rate": 1.878465464864185e-05, "loss": 3.1696, "num_input_tokens_seen": 48484384, "step": 23200 }, { "epoch": 3.784729586426299, "eval_loss": 2.534999370574951, "eval_runtime": 134.968, "eval_samples_per_second": 20.19, "eval_steps_per_second": 10.099, "num_input_tokens_seen": 48484384, "step": 23200 }, { "epoch": 3.785545313647116, "grad_norm": 1.453125, "learning_rate": 1.877514589345414e-05, "loss": 1.6768, "num_input_tokens_seen": 48495936, "step": 23205 }, { "epoch": 3.786361040867934, "grad_norm": 9.4375, "learning_rate": 1.876563809821715e-05, "loss": 3.1496, "num_input_tokens_seen": 48506656, "step": 23210 }, { "epoch": 3.7871767680887514, "grad_norm": 4.375, "learning_rate": 1.8756131264397106e-05, "loss": 1.8999, "num_input_tokens_seen": 48516624, "step": 23215 }, { "epoch": 3.7879924953095685, "grad_norm": 0.0732421875, "learning_rate": 1.87466253934601e-05, "loss": 2.2128, "num_input_tokens_seen": 48526752, "step": 23220 }, { "epoch": 3.7888082225303856, "grad_norm": 10.125, "learning_rate": 1.8737120486872033e-05, "loss": 2.6603, "num_input_tokens_seen": 48537664, "step": 23225 }, { "epoch": 3.789623949751203, "grad_norm": 8.5, "learning_rate": 1.8727616546098696e-05, "loss": 1.1937, "num_input_tokens_seen": 48548656, "step": 23230 }, { "epoch": 3.7904396769720208, "grad_norm": 3.828125, "learning_rate": 1.8718113572605716e-05, "loss": 2.1051, "num_input_tokens_seen": 48559888, "step": 23235 }, { "epoch": 3.791255404192838, "grad_norm": 9.1875, "learning_rate": 1.8708611567858554e-05, "loss": 2.7362, "num_input_tokens_seen": 48570768, "step": 23240 }, { "epoch": 3.792071131413655, "grad_norm": 12.5625, "learning_rate": 1.8699110533322565e-05, "loss": 3.1446, "num_input_tokens_seen": 48581280, "step": 23245 }, { "epoch": 3.7928868586344726, "grad_norm": 8.1875, "learning_rate": 1.8689610470462897e-05, "loss": 1.6523, "num_input_tokens_seen": 48592032, "step": 23250 }, { "epoch": 3.79370258585529, "grad_norm": 4.59375, "learning_rate": 1.8680111380744604e-05, "loss": 2.6382, "num_input_tokens_seen": 48601072, "step": 23255 }, { "epoch": 3.7945183130761073, "grad_norm": 7.09375, "learning_rate": 1.8670613265632564e-05, "loss": 3.4684, "num_input_tokens_seen": 48611392, "step": 23260 }, { "epoch": 3.7953340402969244, "grad_norm": 6.6875, "learning_rate": 1.866111612659149e-05, "loss": 2.1921, "num_input_tokens_seen": 48621984, "step": 23265 }, { "epoch": 3.796149767517742, "grad_norm": 8.9375, "learning_rate": 1.8651619965085967e-05, "loss": 2.4243, "num_input_tokens_seen": 48632864, "step": 23270 }, { "epoch": 3.7969654947385596, "grad_norm": 7.84375, "learning_rate": 1.8642124782580433e-05, "loss": 2.5384, "num_input_tokens_seen": 48643520, "step": 23275 }, { "epoch": 3.7977812219593767, "grad_norm": 3.1875, "learning_rate": 1.8632630580539144e-05, "loss": 3.2707, "num_input_tokens_seen": 48653104, "step": 23280 }, { "epoch": 3.7985969491801943, "grad_norm": 6.8125, "learning_rate": 1.862313736042625e-05, "loss": 2.6351, "num_input_tokens_seen": 48663536, "step": 23285 }, { "epoch": 3.7994126764010114, "grad_norm": 3.734375, "learning_rate": 1.8613645123705703e-05, "loss": 1.2241, "num_input_tokens_seen": 48674272, "step": 23290 }, { "epoch": 3.800228403621829, "grad_norm": 2.21875, "learning_rate": 1.8604153871841328e-05, "loss": 1.3883, "num_input_tokens_seen": 48684848, "step": 23295 }, { "epoch": 3.801044130842646, "grad_norm": 8.5625, "learning_rate": 1.859466360629682e-05, "loss": 3.0263, "num_input_tokens_seen": 48694048, "step": 23300 }, { "epoch": 3.8018598580634637, "grad_norm": 4.125, "learning_rate": 1.8585174328535666e-05, "loss": 1.9799, "num_input_tokens_seen": 48704640, "step": 23305 }, { "epoch": 3.802675585284281, "grad_norm": 6.65625, "learning_rate": 1.857568604002124e-05, "loss": 3.1137, "num_input_tokens_seen": 48716880, "step": 23310 }, { "epoch": 3.8034913125050984, "grad_norm": 8.25, "learning_rate": 1.8566198742216774e-05, "loss": 2.8122, "num_input_tokens_seen": 48726336, "step": 23315 }, { "epoch": 3.8043070397259156, "grad_norm": 5.15625, "learning_rate": 1.85567124365853e-05, "loss": 2.8075, "num_input_tokens_seen": 48736192, "step": 23320 }, { "epoch": 3.805122766946733, "grad_norm": 6.53125, "learning_rate": 1.854722712458975e-05, "loss": 2.9333, "num_input_tokens_seen": 48746864, "step": 23325 }, { "epoch": 3.8059384941675503, "grad_norm": 15.5, "learning_rate": 1.853774280769286e-05, "loss": 2.2157, "num_input_tokens_seen": 48756432, "step": 23330 }, { "epoch": 3.806754221388368, "grad_norm": 4.75, "learning_rate": 1.852825948735724e-05, "loss": 1.9392, "num_input_tokens_seen": 48766672, "step": 23335 }, { "epoch": 3.807569948609185, "grad_norm": 3.765625, "learning_rate": 1.851877716504534e-05, "loss": 1.3286, "num_input_tokens_seen": 48776800, "step": 23340 }, { "epoch": 3.8083856758300025, "grad_norm": 5.40625, "learning_rate": 1.8509295842219448e-05, "loss": 1.7811, "num_input_tokens_seen": 48786752, "step": 23345 }, { "epoch": 3.8092014030508197, "grad_norm": 3.4375, "learning_rate": 1.8499815520341697e-05, "loss": 1.5269, "num_input_tokens_seen": 48795408, "step": 23350 }, { "epoch": 3.8100171302716372, "grad_norm": 7.75, "learning_rate": 1.8490336200874094e-05, "loss": 3.3678, "num_input_tokens_seen": 48806016, "step": 23355 }, { "epoch": 3.8108328574924544, "grad_norm": 9.9375, "learning_rate": 1.848085788527844e-05, "loss": 1.9982, "num_input_tokens_seen": 48817088, "step": 23360 }, { "epoch": 3.811648584713272, "grad_norm": 5.15625, "learning_rate": 1.847138057501644e-05, "loss": 3.7848, "num_input_tokens_seen": 48828416, "step": 23365 }, { "epoch": 3.812464311934089, "grad_norm": 9.5, "learning_rate": 1.8461904271549582e-05, "loss": 2.5334, "num_input_tokens_seen": 48838320, "step": 23370 }, { "epoch": 3.8132800391549067, "grad_norm": 10.125, "learning_rate": 1.845242897633926e-05, "loss": 1.2705, "num_input_tokens_seen": 48848288, "step": 23375 }, { "epoch": 3.814095766375724, "grad_norm": 6.375, "learning_rate": 1.844295469084667e-05, "loss": 2.2726, "num_input_tokens_seen": 48858672, "step": 23380 }, { "epoch": 3.8149114935965414, "grad_norm": 1.7734375, "learning_rate": 1.843348141653286e-05, "loss": 1.3909, "num_input_tokens_seen": 48868720, "step": 23385 }, { "epoch": 3.815727220817359, "grad_norm": 11.3125, "learning_rate": 1.842400915485874e-05, "loss": 2.9399, "num_input_tokens_seen": 48878528, "step": 23390 }, { "epoch": 3.816542948038176, "grad_norm": 2.875, "learning_rate": 1.8414537907285053e-05, "loss": 1.2243, "num_input_tokens_seen": 48888032, "step": 23395 }, { "epoch": 3.817358675258993, "grad_norm": 10.25, "learning_rate": 1.840506767527237e-05, "loss": 2.9062, "num_input_tokens_seen": 48897744, "step": 23400 }, { "epoch": 3.817358675258993, "eval_loss": 2.557155132293701, "eval_runtime": 135.0065, "eval_samples_per_second": 20.184, "eval_steps_per_second": 10.096, "num_input_tokens_seen": 48897744, "step": 23400 }, { "epoch": 3.8181744024798108, "grad_norm": 5.03125, "learning_rate": 1.8395598460281137e-05, "loss": 3.3745, "num_input_tokens_seen": 48907904, "step": 23405 }, { "epoch": 3.8189901297006283, "grad_norm": 7.34375, "learning_rate": 1.838613026377161e-05, "loss": 3.5262, "num_input_tokens_seen": 48917888, "step": 23410 }, { "epoch": 3.8198058569214455, "grad_norm": 2.390625, "learning_rate": 1.8376663087203917e-05, "loss": 1.3472, "num_input_tokens_seen": 48926976, "step": 23415 }, { "epoch": 3.8206215841422626, "grad_norm": 3.921875, "learning_rate": 1.8367196932038014e-05, "loss": 2.2316, "num_input_tokens_seen": 48935440, "step": 23420 }, { "epoch": 3.82143731136308, "grad_norm": 6.1875, "learning_rate": 1.8357731799733686e-05, "loss": 2.7267, "num_input_tokens_seen": 48945712, "step": 23425 }, { "epoch": 3.8222530385838978, "grad_norm": 10.6875, "learning_rate": 1.8348267691750586e-05, "loss": 3.2482, "num_input_tokens_seen": 48956768, "step": 23430 }, { "epoch": 3.823068765804715, "grad_norm": 3.15625, "learning_rate": 1.833880460954821e-05, "loss": 2.3624, "num_input_tokens_seen": 48968144, "step": 23435 }, { "epoch": 3.823884493025532, "grad_norm": 8.4375, "learning_rate": 1.8329342554585866e-05, "loss": 1.9989, "num_input_tokens_seen": 48980144, "step": 23440 }, { "epoch": 3.8247002202463496, "grad_norm": 7.8125, "learning_rate": 1.8319881528322735e-05, "loss": 2.293, "num_input_tokens_seen": 48991328, "step": 23445 }, { "epoch": 3.825515947467167, "grad_norm": 7.65625, "learning_rate": 1.8310421532217815e-05, "loss": 1.8562, "num_input_tokens_seen": 49001984, "step": 23450 }, { "epoch": 3.8263316746879843, "grad_norm": 4.40625, "learning_rate": 1.8300962567729958e-05, "loss": 1.9575, "num_input_tokens_seen": 49011888, "step": 23455 }, { "epoch": 3.8271474019088014, "grad_norm": 1.5859375, "learning_rate": 1.8291504636317866e-05, "loss": 1.9032, "num_input_tokens_seen": 49021584, "step": 23460 }, { "epoch": 3.827963129129619, "grad_norm": 0.8828125, "learning_rate": 1.8282047739440055e-05, "loss": 2.286, "num_input_tokens_seen": 49033184, "step": 23465 }, { "epoch": 3.8287788563504366, "grad_norm": 7.28125, "learning_rate": 1.8272591878554903e-05, "loss": 1.8907, "num_input_tokens_seen": 49044064, "step": 23470 }, { "epoch": 3.8295945835712537, "grad_norm": 7.34375, "learning_rate": 1.8263137055120638e-05, "loss": 2.3856, "num_input_tokens_seen": 49054320, "step": 23475 }, { "epoch": 3.8304103107920713, "grad_norm": 12.5, "learning_rate": 1.8253683270595295e-05, "loss": 4.0116, "num_input_tokens_seen": 49063520, "step": 23480 }, { "epoch": 3.8312260380128884, "grad_norm": 5.71875, "learning_rate": 1.824423052643677e-05, "loss": 2.7427, "num_input_tokens_seen": 49074080, "step": 23485 }, { "epoch": 3.832041765233706, "grad_norm": 4.65625, "learning_rate": 1.82347788241028e-05, "loss": 2.8989, "num_input_tokens_seen": 49084432, "step": 23490 }, { "epoch": 3.832857492454523, "grad_norm": 8.6875, "learning_rate": 1.8225328165050942e-05, "loss": 2.7773, "num_input_tokens_seen": 49094560, "step": 23495 }, { "epoch": 3.8336732196753407, "grad_norm": 5.03125, "learning_rate": 1.821587855073863e-05, "loss": 2.1054, "num_input_tokens_seen": 49105216, "step": 23500 }, { "epoch": 3.834488946896158, "grad_norm": 5.25, "learning_rate": 1.8206429982623086e-05, "loss": 1.8277, "num_input_tokens_seen": 49115440, "step": 23505 }, { "epoch": 3.8353046741169754, "grad_norm": 9.625, "learning_rate": 1.8196982462161416e-05, "loss": 1.6404, "num_input_tokens_seen": 49125200, "step": 23510 }, { "epoch": 3.8361204013377925, "grad_norm": 7.4375, "learning_rate": 1.818753599081055e-05, "loss": 2.7055, "num_input_tokens_seen": 49135328, "step": 23515 }, { "epoch": 3.83693612855861, "grad_norm": 1.7734375, "learning_rate": 1.817809057002724e-05, "loss": 1.3624, "num_input_tokens_seen": 49146016, "step": 23520 }, { "epoch": 3.8377518557794272, "grad_norm": 10.5, "learning_rate": 1.8168646201268096e-05, "loss": 2.601, "num_input_tokens_seen": 49154416, "step": 23525 }, { "epoch": 3.838567583000245, "grad_norm": 5.90625, "learning_rate": 1.8159202885989557e-05, "loss": 1.4826, "num_input_tokens_seen": 49163728, "step": 23530 }, { "epoch": 3.839383310221062, "grad_norm": 4.625, "learning_rate": 1.814976062564789e-05, "loss": 0.8096, "num_input_tokens_seen": 49174960, "step": 23535 }, { "epoch": 3.8401990374418795, "grad_norm": 6.8125, "learning_rate": 1.8140319421699234e-05, "loss": 2.5002, "num_input_tokens_seen": 49185648, "step": 23540 }, { "epoch": 3.8410147646626966, "grad_norm": 4.0625, "learning_rate": 1.8130879275599515e-05, "loss": 1.1655, "num_input_tokens_seen": 49196896, "step": 23545 }, { "epoch": 3.841830491883514, "grad_norm": 1.28125, "learning_rate": 1.8121440188804544e-05, "loss": 1.7487, "num_input_tokens_seen": 49205728, "step": 23550 }, { "epoch": 3.8426462191043314, "grad_norm": 8.75, "learning_rate": 1.811200216276993e-05, "loss": 2.7157, "num_input_tokens_seen": 49214816, "step": 23555 }, { "epoch": 3.843461946325149, "grad_norm": 8.4375, "learning_rate": 1.810256519895115e-05, "loss": 3.6901, "num_input_tokens_seen": 49224288, "step": 23560 }, { "epoch": 3.8442776735459665, "grad_norm": 7.75, "learning_rate": 1.8093129298803494e-05, "loss": 2.641, "num_input_tokens_seen": 49234000, "step": 23565 }, { "epoch": 3.8450934007667836, "grad_norm": 11.4375, "learning_rate": 1.808369446378209e-05, "loss": 2.5648, "num_input_tokens_seen": 49244144, "step": 23570 }, { "epoch": 3.8459091279876008, "grad_norm": 3.140625, "learning_rate": 1.8074260695341914e-05, "loss": 1.4215, "num_input_tokens_seen": 49252848, "step": 23575 }, { "epoch": 3.8467248552084183, "grad_norm": 10.375, "learning_rate": 1.8064827994937782e-05, "loss": 2.5475, "num_input_tokens_seen": 49263920, "step": 23580 }, { "epoch": 3.847540582429236, "grad_norm": 4.71875, "learning_rate": 1.8055396364024317e-05, "loss": 3.06, "num_input_tokens_seen": 49274608, "step": 23585 }, { "epoch": 3.848356309650053, "grad_norm": 16.5, "learning_rate": 1.804596580405601e-05, "loss": 2.7333, "num_input_tokens_seen": 49284992, "step": 23590 }, { "epoch": 3.84917203687087, "grad_norm": 8.5625, "learning_rate": 1.8036536316487174e-05, "loss": 2.7093, "num_input_tokens_seen": 49297072, "step": 23595 }, { "epoch": 3.8499877640916877, "grad_norm": 3.71875, "learning_rate": 1.802710790277193e-05, "loss": 1.5842, "num_input_tokens_seen": 49308304, "step": 23600 }, { "epoch": 3.8499877640916877, "eval_loss": 2.5380730628967285, "eval_runtime": 135.0144, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 49308304, "step": 23600 }, { "epoch": 3.8508034913125053, "grad_norm": 4.78125, "learning_rate": 1.801768056436429e-05, "loss": 2.4033, "num_input_tokens_seen": 49318272, "step": 23605 }, { "epoch": 3.8516192185333225, "grad_norm": 7.5625, "learning_rate": 1.8008254302718035e-05, "loss": 2.6844, "num_input_tokens_seen": 49329904, "step": 23610 }, { "epoch": 3.8524349457541396, "grad_norm": 2.75, "learning_rate": 1.7998829119286837e-05, "loss": 1.3121, "num_input_tokens_seen": 49339984, "step": 23615 }, { "epoch": 3.853250672974957, "grad_norm": 4.84375, "learning_rate": 1.798940501552418e-05, "loss": 1.8081, "num_input_tokens_seen": 49351680, "step": 23620 }, { "epoch": 3.8540664001957747, "grad_norm": 7.0625, "learning_rate": 1.797998199288336e-05, "loss": 1.8598, "num_input_tokens_seen": 49360576, "step": 23625 }, { "epoch": 3.854882127416592, "grad_norm": 9.875, "learning_rate": 1.7970560052817543e-05, "loss": 2.517, "num_input_tokens_seen": 49371840, "step": 23630 }, { "epoch": 3.855697854637409, "grad_norm": 6.96875, "learning_rate": 1.7961139196779702e-05, "loss": 2.618, "num_input_tokens_seen": 49381968, "step": 23635 }, { "epoch": 3.8565135818582266, "grad_norm": 7.21875, "learning_rate": 1.7951719426222647e-05, "loss": 3.6684, "num_input_tokens_seen": 49392896, "step": 23640 }, { "epoch": 3.857329309079044, "grad_norm": 6.59375, "learning_rate": 1.794230074259904e-05, "loss": 1.9203, "num_input_tokens_seen": 49402352, "step": 23645 }, { "epoch": 3.8581450362998613, "grad_norm": 4.90625, "learning_rate": 1.7932883147361336e-05, "loss": 3.1964, "num_input_tokens_seen": 49413504, "step": 23650 }, { "epoch": 3.858960763520679, "grad_norm": 7.78125, "learning_rate": 1.7923466641961865e-05, "loss": 3.1565, "num_input_tokens_seen": 49423952, "step": 23655 }, { "epoch": 3.859776490741496, "grad_norm": 2.34375, "learning_rate": 1.791405122785278e-05, "loss": 1.7276, "num_input_tokens_seen": 49433808, "step": 23660 }, { "epoch": 3.8605922179623136, "grad_norm": 4.4375, "learning_rate": 1.7904636906486037e-05, "loss": 2.0053, "num_input_tokens_seen": 49445152, "step": 23665 }, { "epoch": 3.8614079451831307, "grad_norm": 17.25, "learning_rate": 1.7895223679313448e-05, "loss": 3.456, "num_input_tokens_seen": 49456064, "step": 23670 }, { "epoch": 3.8622236724039483, "grad_norm": 12.6875, "learning_rate": 1.7885811547786653e-05, "loss": 2.1831, "num_input_tokens_seen": 49467360, "step": 23675 }, { "epoch": 3.8630393996247654, "grad_norm": 3.65625, "learning_rate": 1.7876400513357115e-05, "loss": 2.5175, "num_input_tokens_seen": 49478224, "step": 23680 }, { "epoch": 3.863855126845583, "grad_norm": 5.65625, "learning_rate": 1.7866990577476146e-05, "loss": 1.7914, "num_input_tokens_seen": 49488960, "step": 23685 }, { "epoch": 3.8646708540664, "grad_norm": 1.453125, "learning_rate": 1.7857581741594863e-05, "loss": 1.8239, "num_input_tokens_seen": 49499552, "step": 23690 }, { "epoch": 3.8654865812872177, "grad_norm": 0.76953125, "learning_rate": 1.7848174007164237e-05, "loss": 2.4778, "num_input_tokens_seen": 49509696, "step": 23695 }, { "epoch": 3.866302308508035, "grad_norm": 8.1875, "learning_rate": 1.7838767375635052e-05, "loss": 2.1175, "num_input_tokens_seen": 49520336, "step": 23700 }, { "epoch": 3.8671180357288524, "grad_norm": 4.625, "learning_rate": 1.782936184845793e-05, "loss": 1.6817, "num_input_tokens_seen": 49530320, "step": 23705 }, { "epoch": 3.8679337629496695, "grad_norm": 9.5, "learning_rate": 1.7819957427083334e-05, "loss": 3.01, "num_input_tokens_seen": 49540944, "step": 23710 }, { "epoch": 3.868749490170487, "grad_norm": 6.25, "learning_rate": 1.7810554112961516e-05, "loss": 2.4251, "num_input_tokens_seen": 49551168, "step": 23715 }, { "epoch": 3.869565217391304, "grad_norm": 3.765625, "learning_rate": 1.7801151907542607e-05, "loss": 1.7166, "num_input_tokens_seen": 49561136, "step": 23720 }, { "epoch": 3.870380944612122, "grad_norm": 4.75, "learning_rate": 1.7791750812276547e-05, "loss": 1.8846, "num_input_tokens_seen": 49570864, "step": 23725 }, { "epoch": 3.871196671832939, "grad_norm": 4.84375, "learning_rate": 1.778235082861309e-05, "loss": 1.6086, "num_input_tokens_seen": 49581824, "step": 23730 }, { "epoch": 3.8720123990537565, "grad_norm": 8.6875, "learning_rate": 1.777295195800184e-05, "loss": 3.4678, "num_input_tokens_seen": 49591712, "step": 23735 }, { "epoch": 3.8728281262745736, "grad_norm": 4.1875, "learning_rate": 1.7763554201892215e-05, "loss": 2.5205, "num_input_tokens_seen": 49602192, "step": 23740 }, { "epoch": 3.873643853495391, "grad_norm": 8.5, "learning_rate": 1.7754157561733476e-05, "loss": 1.9765, "num_input_tokens_seen": 49612704, "step": 23745 }, { "epoch": 3.8744595807162083, "grad_norm": 3.9375, "learning_rate": 1.7744762038974702e-05, "loss": 2.2369, "num_input_tokens_seen": 49623184, "step": 23750 }, { "epoch": 3.875275307937026, "grad_norm": 4.90625, "learning_rate": 1.7735367635064788e-05, "loss": 1.5965, "num_input_tokens_seen": 49634720, "step": 23755 }, { "epoch": 3.8760910351578435, "grad_norm": 5.28125, "learning_rate": 1.7725974351452474e-05, "loss": 1.8073, "num_input_tokens_seen": 49644640, "step": 23760 }, { "epoch": 3.8769067623786606, "grad_norm": 7.84375, "learning_rate": 1.771658218958634e-05, "loss": 1.3121, "num_input_tokens_seen": 49654160, "step": 23765 }, { "epoch": 3.8777224895994777, "grad_norm": 1.109375, "learning_rate": 1.770719115091475e-05, "loss": 1.6511, "num_input_tokens_seen": 49664816, "step": 23770 }, { "epoch": 3.8785382168202953, "grad_norm": 4.5625, "learning_rate": 1.7697801236885935e-05, "loss": 2.1329, "num_input_tokens_seen": 49675968, "step": 23775 }, { "epoch": 3.879353944041113, "grad_norm": 16.125, "learning_rate": 1.7688412448947944e-05, "loss": 2.0832, "num_input_tokens_seen": 49686992, "step": 23780 }, { "epoch": 3.88016967126193, "grad_norm": 8.75, "learning_rate": 1.767902478854862e-05, "loss": 2.5624, "num_input_tokens_seen": 49697392, "step": 23785 }, { "epoch": 3.880985398482747, "grad_norm": 9.6875, "learning_rate": 1.766963825713569e-05, "loss": 1.3421, "num_input_tokens_seen": 49707184, "step": 23790 }, { "epoch": 3.8818011257035647, "grad_norm": 3.421875, "learning_rate": 1.766025285615665e-05, "loss": 1.4656, "num_input_tokens_seen": 49717424, "step": 23795 }, { "epoch": 3.8826168529243823, "grad_norm": 7.0, "learning_rate": 1.7650868587058854e-05, "loss": 3.0507, "num_input_tokens_seen": 49728368, "step": 23800 }, { "epoch": 3.8826168529243823, "eval_loss": 2.532986640930176, "eval_runtime": 135.0473, "eval_samples_per_second": 20.178, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 49728368, "step": 23800 }, { "epoch": 3.8834325801451994, "grad_norm": 9.5, "learning_rate": 1.7641485451289484e-05, "loss": 3.1382, "num_input_tokens_seen": 49738752, "step": 23805 }, { "epoch": 3.8842483073660166, "grad_norm": 8.1875, "learning_rate": 1.7632103450295534e-05, "loss": 3.5653, "num_input_tokens_seen": 49746480, "step": 23810 }, { "epoch": 3.885064034586834, "grad_norm": 5.625, "learning_rate": 1.762272258552381e-05, "loss": 2.8683, "num_input_tokens_seen": 49758128, "step": 23815 }, { "epoch": 3.8858797618076517, "grad_norm": 11.0, "learning_rate": 1.7613342858420988e-05, "loss": 2.9552, "num_input_tokens_seen": 49769072, "step": 23820 }, { "epoch": 3.886695489028469, "grad_norm": 3.359375, "learning_rate": 1.760396427043351e-05, "loss": 1.876, "num_input_tokens_seen": 49779104, "step": 23825 }, { "epoch": 3.887511216249286, "grad_norm": 2.4375, "learning_rate": 1.7594586823007696e-05, "loss": 1.8226, "num_input_tokens_seen": 49790000, "step": 23830 }, { "epoch": 3.8883269434701035, "grad_norm": 2.125, "learning_rate": 1.7585210517589646e-05, "loss": 1.902, "num_input_tokens_seen": 49800256, "step": 23835 }, { "epoch": 3.889142670690921, "grad_norm": 5.875, "learning_rate": 1.7575835355625314e-05, "loss": 2.4173, "num_input_tokens_seen": 49810848, "step": 23840 }, { "epoch": 3.8899583979117383, "grad_norm": 4.75, "learning_rate": 1.756646133856048e-05, "loss": 3.121, "num_input_tokens_seen": 49821600, "step": 23845 }, { "epoch": 3.890774125132556, "grad_norm": 6.75, "learning_rate": 1.7557088467840714e-05, "loss": 2.3789, "num_input_tokens_seen": 49829904, "step": 23850 }, { "epoch": 3.891589852353373, "grad_norm": 4.28125, "learning_rate": 1.7547716744911438e-05, "loss": 2.889, "num_input_tokens_seen": 49839024, "step": 23855 }, { "epoch": 3.8924055795741905, "grad_norm": 5.4375, "learning_rate": 1.7538346171217902e-05, "loss": 2.8393, "num_input_tokens_seen": 49850304, "step": 23860 }, { "epoch": 3.8932213067950077, "grad_norm": 7.59375, "learning_rate": 1.7528976748205146e-05, "loss": 2.1528, "num_input_tokens_seen": 49860608, "step": 23865 }, { "epoch": 3.8940370340158252, "grad_norm": 12.75, "learning_rate": 1.751960847731807e-05, "loss": 2.2034, "num_input_tokens_seen": 49870848, "step": 23870 }, { "epoch": 3.8948527612366424, "grad_norm": 8.0625, "learning_rate": 1.7510241360001362e-05, "loss": 2.1133, "num_input_tokens_seen": 49882112, "step": 23875 }, { "epoch": 3.89566848845746, "grad_norm": 10.75, "learning_rate": 1.7500875397699562e-05, "loss": 2.1655, "num_input_tokens_seen": 49890944, "step": 23880 }, { "epoch": 3.896484215678277, "grad_norm": 5.65625, "learning_rate": 1.7491510591857015e-05, "loss": 2.9392, "num_input_tokens_seen": 49901520, "step": 23885 }, { "epoch": 3.8972999428990946, "grad_norm": 8.5625, "learning_rate": 1.7482146943917896e-05, "loss": 2.4056, "num_input_tokens_seen": 49911712, "step": 23890 }, { "epoch": 3.898115670119912, "grad_norm": 6.75, "learning_rate": 1.7472784455326185e-05, "loss": 2.0738, "num_input_tokens_seen": 49923344, "step": 23895 }, { "epoch": 3.8989313973407294, "grad_norm": 9.75, "learning_rate": 1.746342312752572e-05, "loss": 2.7893, "num_input_tokens_seen": 49933520, "step": 23900 }, { "epoch": 3.8997471245615465, "grad_norm": 6.6875, "learning_rate": 1.74540629619601e-05, "loss": 2.4992, "num_input_tokens_seen": 49943152, "step": 23905 }, { "epoch": 3.900562851782364, "grad_norm": 9.25, "learning_rate": 1.7444703960072815e-05, "loss": 2.8972, "num_input_tokens_seen": 49951840, "step": 23910 }, { "epoch": 3.901378579003181, "grad_norm": 4.84375, "learning_rate": 1.7435346123307118e-05, "loss": 1.2922, "num_input_tokens_seen": 49962128, "step": 23915 }, { "epoch": 3.9021943062239988, "grad_norm": 6.25, "learning_rate": 1.742598945310611e-05, "loss": 1.6437, "num_input_tokens_seen": 49972528, "step": 23920 }, { "epoch": 3.903010033444816, "grad_norm": 8.125, "learning_rate": 1.741663395091272e-05, "loss": 2.9469, "num_input_tokens_seen": 49982096, "step": 23925 }, { "epoch": 3.9038257606656335, "grad_norm": 3.5, "learning_rate": 1.7407279618169657e-05, "loss": 2.8083, "num_input_tokens_seen": 49992864, "step": 23930 }, { "epoch": 3.904641487886451, "grad_norm": 5.65625, "learning_rate": 1.73979264563195e-05, "loss": 3.0942, "num_input_tokens_seen": 50002864, "step": 23935 }, { "epoch": 3.905457215107268, "grad_norm": 0.1884765625, "learning_rate": 1.7388574466804625e-05, "loss": 1.7955, "num_input_tokens_seen": 50014064, "step": 23940 }, { "epoch": 3.9062729423280853, "grad_norm": 9.0, "learning_rate": 1.7379223651067207e-05, "loss": 2.8995, "num_input_tokens_seen": 50025088, "step": 23945 }, { "epoch": 3.907088669548903, "grad_norm": 10.8125, "learning_rate": 1.736987401054928e-05, "loss": 2.4534, "num_input_tokens_seen": 50034912, "step": 23950 }, { "epoch": 3.9079043967697205, "grad_norm": 8.9375, "learning_rate": 1.736052554669266e-05, "loss": 3.3202, "num_input_tokens_seen": 50044912, "step": 23955 }, { "epoch": 3.9087201239905376, "grad_norm": 12.25, "learning_rate": 1.7351178260939007e-05, "loss": 2.3299, "num_input_tokens_seen": 50055728, "step": 23960 }, { "epoch": 3.9095358512113547, "grad_norm": 3.578125, "learning_rate": 1.7341832154729794e-05, "loss": 2.7093, "num_input_tokens_seen": 50065168, "step": 23965 }, { "epoch": 3.9103515784321723, "grad_norm": 5.625, "learning_rate": 1.7332487229506286e-05, "loss": 2.3807, "num_input_tokens_seen": 50076352, "step": 23970 }, { "epoch": 3.91116730565299, "grad_norm": 4.65625, "learning_rate": 1.732314348670961e-05, "loss": 2.8095, "num_input_tokens_seen": 50088432, "step": 23975 }, { "epoch": 3.911983032873807, "grad_norm": 3.25, "learning_rate": 1.7313800927780686e-05, "loss": 2.4315, "num_input_tokens_seen": 50100112, "step": 23980 }, { "epoch": 3.912798760094624, "grad_norm": 8.9375, "learning_rate": 1.7304459554160245e-05, "loss": 2.5146, "num_input_tokens_seen": 50110320, "step": 23985 }, { "epoch": 3.9136144873154417, "grad_norm": 9.5625, "learning_rate": 1.7295119367288853e-05, "loss": 2.723, "num_input_tokens_seen": 50120464, "step": 23990 }, { "epoch": 3.9144302145362593, "grad_norm": 4.5, "learning_rate": 1.728578036860688e-05, "loss": 2.7446, "num_input_tokens_seen": 50129680, "step": 23995 }, { "epoch": 3.9152459417570764, "grad_norm": 11.375, "learning_rate": 1.7276442559554513e-05, "loss": 3.2435, "num_input_tokens_seen": 50140272, "step": 24000 }, { "epoch": 3.9152459417570764, "eval_loss": 2.5345358848571777, "eval_runtime": 135.0261, "eval_samples_per_second": 20.181, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 50140272, "step": 24000 }, { "epoch": 3.9160616689778935, "grad_norm": 17.0, "learning_rate": 1.726710594157177e-05, "loss": 1.9235, "num_input_tokens_seen": 50149872, "step": 24005 }, { "epoch": 3.916877396198711, "grad_norm": 2.53125, "learning_rate": 1.725777051609846e-05, "loss": 1.5808, "num_input_tokens_seen": 50161136, "step": 24010 }, { "epoch": 3.9176931234195287, "grad_norm": 4.90625, "learning_rate": 1.7248436284574228e-05, "loss": 2.2378, "num_input_tokens_seen": 50169104, "step": 24015 }, { "epoch": 3.918508850640346, "grad_norm": 8.6875, "learning_rate": 1.723910324843855e-05, "loss": 1.5061, "num_input_tokens_seen": 50179104, "step": 24020 }, { "epoch": 3.919324577861163, "grad_norm": 4.625, "learning_rate": 1.722977140913067e-05, "loss": 2.3459, "num_input_tokens_seen": 50189920, "step": 24025 }, { "epoch": 3.9201403050819805, "grad_norm": 6.5, "learning_rate": 1.7220440768089688e-05, "loss": 3.9588, "num_input_tokens_seen": 50200608, "step": 24030 }, { "epoch": 3.920956032302798, "grad_norm": 4.375, "learning_rate": 1.7211111326754505e-05, "loss": 2.3685, "num_input_tokens_seen": 50210560, "step": 24035 }, { "epoch": 3.9217717595236152, "grad_norm": 9.0625, "learning_rate": 1.720178308656383e-05, "loss": 2.5645, "num_input_tokens_seen": 50221568, "step": 24040 }, { "epoch": 3.922587486744433, "grad_norm": 6.96875, "learning_rate": 1.719245604895621e-05, "loss": 3.063, "num_input_tokens_seen": 50230848, "step": 24045 }, { "epoch": 3.92340321396525, "grad_norm": 3.375, "learning_rate": 1.7183130215369972e-05, "loss": 2.5264, "num_input_tokens_seen": 50241840, "step": 24050 }, { "epoch": 3.9242189411860675, "grad_norm": 5.65625, "learning_rate": 1.7173805587243292e-05, "loss": 1.5753, "num_input_tokens_seen": 50252432, "step": 24055 }, { "epoch": 3.9250346684068846, "grad_norm": 7.71875, "learning_rate": 1.7164482166014147e-05, "loss": 3.072, "num_input_tokens_seen": 50262704, "step": 24060 }, { "epoch": 3.925850395627702, "grad_norm": 12.625, "learning_rate": 1.7155159953120313e-05, "loss": 2.0696, "num_input_tokens_seen": 50273184, "step": 24065 }, { "epoch": 3.9266661228485193, "grad_norm": 7.9375, "learning_rate": 1.714583894999941e-05, "loss": 2.933, "num_input_tokens_seen": 50283296, "step": 24070 }, { "epoch": 3.927481850069337, "grad_norm": 8.4375, "learning_rate": 1.7136519158088826e-05, "loss": 1.298, "num_input_tokens_seen": 50293536, "step": 24075 }, { "epoch": 3.928297577290154, "grad_norm": 7.03125, "learning_rate": 1.712720057882581e-05, "loss": 3.1418, "num_input_tokens_seen": 50305104, "step": 24080 }, { "epoch": 3.9291133045109716, "grad_norm": 10.75, "learning_rate": 1.7117883213647413e-05, "loss": 3.1658, "num_input_tokens_seen": 50315648, "step": 24085 }, { "epoch": 3.9299290317317888, "grad_norm": 6.15625, "learning_rate": 1.710856706399046e-05, "loss": 2.8169, "num_input_tokens_seen": 50326352, "step": 24090 }, { "epoch": 3.9307447589526063, "grad_norm": 4.65625, "learning_rate": 1.7099252131291648e-05, "loss": 1.7872, "num_input_tokens_seen": 50337520, "step": 24095 }, { "epoch": 3.9315604861734235, "grad_norm": 6.5625, "learning_rate": 1.708993841698744e-05, "loss": 1.7968, "num_input_tokens_seen": 50348880, "step": 24100 }, { "epoch": 3.932376213394241, "grad_norm": 4.1875, "learning_rate": 1.7080625922514132e-05, "loss": 2.7252, "num_input_tokens_seen": 50359840, "step": 24105 }, { "epoch": 3.933191940615058, "grad_norm": 4.625, "learning_rate": 1.7071314649307836e-05, "loss": 3.2273, "num_input_tokens_seen": 50371248, "step": 24110 }, { "epoch": 3.9340076678358757, "grad_norm": 8.625, "learning_rate": 1.7062004598804448e-05, "loss": 2.5219, "num_input_tokens_seen": 50382672, "step": 24115 }, { "epoch": 3.934823395056693, "grad_norm": 6.40625, "learning_rate": 1.7052695772439702e-05, "loss": 1.8529, "num_input_tokens_seen": 50393584, "step": 24120 }, { "epoch": 3.9356391222775104, "grad_norm": 17.75, "learning_rate": 1.7043388171649154e-05, "loss": 2.5868, "num_input_tokens_seen": 50404016, "step": 24125 }, { "epoch": 3.936454849498328, "grad_norm": 10.0, "learning_rate": 1.7034081797868127e-05, "loss": 2.3666, "num_input_tokens_seen": 50415488, "step": 24130 }, { "epoch": 3.937270576719145, "grad_norm": 9.4375, "learning_rate": 1.70247766525318e-05, "loss": 2.8201, "num_input_tokens_seen": 50426464, "step": 24135 }, { "epoch": 3.9380863039399623, "grad_norm": 6.4375, "learning_rate": 1.701547273707514e-05, "loss": 1.9513, "num_input_tokens_seen": 50435312, "step": 24140 }, { "epoch": 3.93890203116078, "grad_norm": 7.8125, "learning_rate": 1.7006170052932916e-05, "loss": 2.5555, "num_input_tokens_seen": 50445792, "step": 24145 }, { "epoch": 3.9397177583815974, "grad_norm": 9.0, "learning_rate": 1.6996868601539735e-05, "loss": 3.1563, "num_input_tokens_seen": 50455808, "step": 24150 }, { "epoch": 3.9405334856024146, "grad_norm": 4.5625, "learning_rate": 1.6987568384329977e-05, "loss": 2.6695, "num_input_tokens_seen": 50464960, "step": 24155 }, { "epoch": 3.9413492128232317, "grad_norm": 8.3125, "learning_rate": 1.6978269402737866e-05, "loss": 1.6504, "num_input_tokens_seen": 50475760, "step": 24160 }, { "epoch": 3.9421649400440493, "grad_norm": 4.90625, "learning_rate": 1.696897165819743e-05, "loss": 1.9029, "num_input_tokens_seen": 50487600, "step": 24165 }, { "epoch": 3.942980667264867, "grad_norm": 4.90625, "learning_rate": 1.6959675152142487e-05, "loss": 1.84, "num_input_tokens_seen": 50497200, "step": 24170 }, { "epoch": 3.943796394485684, "grad_norm": 6.3125, "learning_rate": 1.6950379886006667e-05, "loss": 2.2768, "num_input_tokens_seen": 50508416, "step": 24175 }, { "epoch": 3.944612121706501, "grad_norm": 10.875, "learning_rate": 1.6941085861223438e-05, "loss": 2.633, "num_input_tokens_seen": 50518992, "step": 24180 }, { "epoch": 3.9454278489273187, "grad_norm": 6.65625, "learning_rate": 1.6931793079226034e-05, "loss": 2.492, "num_input_tokens_seen": 50529664, "step": 24185 }, { "epoch": 3.9462435761481363, "grad_norm": 2.84375, "learning_rate": 1.692250154144754e-05, "loss": 1.5206, "num_input_tokens_seen": 50540192, "step": 24190 }, { "epoch": 3.9470593033689534, "grad_norm": 8.4375, "learning_rate": 1.6913211249320807e-05, "loss": 3.3063, "num_input_tokens_seen": 50549568, "step": 24195 }, { "epoch": 3.9478750305897705, "grad_norm": 9.0, "learning_rate": 1.6903922204278522e-05, "loss": 2.597, "num_input_tokens_seen": 50557680, "step": 24200 }, { "epoch": 3.9478750305897705, "eval_loss": 2.538020133972168, "eval_runtime": 134.8482, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 50557680, "step": 24200 }, { "epoch": 3.948690757810588, "grad_norm": 8.25, "learning_rate": 1.6894634407753186e-05, "loss": 1.9356, "num_input_tokens_seen": 50569904, "step": 24205 }, { "epoch": 3.9495064850314057, "grad_norm": 8.75, "learning_rate": 1.6885347861177077e-05, "loss": 2.0969, "num_input_tokens_seen": 50580304, "step": 24210 }, { "epoch": 3.950322212252223, "grad_norm": 9.1875, "learning_rate": 1.6876062565982298e-05, "loss": 2.8843, "num_input_tokens_seen": 50591760, "step": 24215 }, { "epoch": 3.9511379394730404, "grad_norm": 8.8125, "learning_rate": 1.6866778523600774e-05, "loss": 1.3718, "num_input_tokens_seen": 50602928, "step": 24220 }, { "epoch": 3.9519536666938575, "grad_norm": 5.25, "learning_rate": 1.6857495735464195e-05, "loss": 2.2752, "num_input_tokens_seen": 50613200, "step": 24225 }, { "epoch": 3.952769393914675, "grad_norm": 10.0625, "learning_rate": 1.6848214203004115e-05, "loss": 2.8655, "num_input_tokens_seen": 50623984, "step": 24230 }, { "epoch": 3.953585121135492, "grad_norm": 6.0625, "learning_rate": 1.6838933927651835e-05, "loss": 1.7066, "num_input_tokens_seen": 50634160, "step": 24235 }, { "epoch": 3.95440084835631, "grad_norm": 6.84375, "learning_rate": 1.6829654910838506e-05, "loss": 1.5895, "num_input_tokens_seen": 50644512, "step": 24240 }, { "epoch": 3.955216575577127, "grad_norm": 4.9375, "learning_rate": 1.6820377153995065e-05, "loss": 2.3008, "num_input_tokens_seen": 50655456, "step": 24245 }, { "epoch": 3.9560323027979445, "grad_norm": 2.515625, "learning_rate": 1.681110065855226e-05, "loss": 2.5212, "num_input_tokens_seen": 50665584, "step": 24250 }, { "epoch": 3.9568480300187616, "grad_norm": 8.125, "learning_rate": 1.6801825425940642e-05, "loss": 2.7638, "num_input_tokens_seen": 50675328, "step": 24255 }, { "epoch": 3.957663757239579, "grad_norm": 6.0, "learning_rate": 1.679255145759056e-05, "loss": 2.979, "num_input_tokens_seen": 50685232, "step": 24260 }, { "epoch": 3.9584794844603963, "grad_norm": 8.4375, "learning_rate": 1.6783278754932187e-05, "loss": 2.7366, "num_input_tokens_seen": 50696144, "step": 24265 }, { "epoch": 3.959295211681214, "grad_norm": 8.375, "learning_rate": 1.6774007319395496e-05, "loss": 1.5716, "num_input_tokens_seen": 50706224, "step": 24270 }, { "epoch": 3.960110938902031, "grad_norm": 1.71875, "learning_rate": 1.6764737152410243e-05, "loss": 0.8679, "num_input_tokens_seen": 50716144, "step": 24275 }, { "epoch": 3.9609266661228486, "grad_norm": 7.09375, "learning_rate": 1.6755468255406016e-05, "loss": 3.1037, "num_input_tokens_seen": 50726448, "step": 24280 }, { "epoch": 3.9617423933436657, "grad_norm": 9.0625, "learning_rate": 1.674620062981219e-05, "loss": 3.5542, "num_input_tokens_seen": 50736144, "step": 24285 }, { "epoch": 3.9625581205644833, "grad_norm": 8.9375, "learning_rate": 1.6736934277057947e-05, "loss": 2.3121, "num_input_tokens_seen": 50746080, "step": 24290 }, { "epoch": 3.9633738477853004, "grad_norm": 4.875, "learning_rate": 1.6727669198572286e-05, "loss": 2.1518, "num_input_tokens_seen": 50756960, "step": 24295 }, { "epoch": 3.964189575006118, "grad_norm": 8.0, "learning_rate": 1.6718405395783984e-05, "loss": 2.8379, "num_input_tokens_seen": 50767632, "step": 24300 }, { "epoch": 3.965005302226935, "grad_norm": 5.5, "learning_rate": 1.6709142870121643e-05, "loss": 3.4078, "num_input_tokens_seen": 50777072, "step": 24305 }, { "epoch": 3.9658210294477527, "grad_norm": 4.46875, "learning_rate": 1.669988162301367e-05, "loss": 2.0472, "num_input_tokens_seen": 50788672, "step": 24310 }, { "epoch": 3.96663675666857, "grad_norm": 4.84375, "learning_rate": 1.6690621655888243e-05, "loss": 1.5713, "num_input_tokens_seen": 50799888, "step": 24315 }, { "epoch": 3.9674524838893874, "grad_norm": 4.09375, "learning_rate": 1.6681362970173386e-05, "loss": 1.7874, "num_input_tokens_seen": 50809600, "step": 24320 }, { "epoch": 3.968268211110205, "grad_norm": 8.125, "learning_rate": 1.6672105567296904e-05, "loss": 4.0698, "num_input_tokens_seen": 50817744, "step": 24325 }, { "epoch": 3.969083938331022, "grad_norm": 6.28125, "learning_rate": 1.666284944868639e-05, "loss": 1.6144, "num_input_tokens_seen": 50830240, "step": 24330 }, { "epoch": 3.9698996655518393, "grad_norm": 2.125, "learning_rate": 1.665359461576927e-05, "loss": 1.6948, "num_input_tokens_seen": 50840800, "step": 24335 }, { "epoch": 3.970715392772657, "grad_norm": 7.78125, "learning_rate": 1.6644341069972736e-05, "loss": 2.3517, "num_input_tokens_seen": 50851440, "step": 24340 }, { "epoch": 3.9715311199934744, "grad_norm": 6.125, "learning_rate": 1.6635088812723813e-05, "loss": 2.0406, "num_input_tokens_seen": 50861856, "step": 24345 }, { "epoch": 3.9723468472142915, "grad_norm": 6.625, "learning_rate": 1.6625837845449328e-05, "loss": 1.773, "num_input_tokens_seen": 50872528, "step": 24350 }, { "epoch": 3.9731625744351087, "grad_norm": 5.4375, "learning_rate": 1.6616588169575874e-05, "loss": 0.7588, "num_input_tokens_seen": 50882928, "step": 24355 }, { "epoch": 3.9739783016559262, "grad_norm": 5.90625, "learning_rate": 1.6607339786529878e-05, "loss": 2.7793, "num_input_tokens_seen": 50894336, "step": 24360 }, { "epoch": 3.974794028876744, "grad_norm": 9.375, "learning_rate": 1.659809269773756e-05, "loss": 1.2777, "num_input_tokens_seen": 50904112, "step": 24365 }, { "epoch": 3.975609756097561, "grad_norm": 6.0, "learning_rate": 1.658884690462493e-05, "loss": 2.5333, "num_input_tokens_seen": 50914752, "step": 24370 }, { "epoch": 3.976425483318378, "grad_norm": 4.59375, "learning_rate": 1.6579602408617813e-05, "loss": 3.1664, "num_input_tokens_seen": 50926304, "step": 24375 }, { "epoch": 3.9772412105391957, "grad_norm": 7.09375, "learning_rate": 1.657035921114181e-05, "loss": 2.6584, "num_input_tokens_seen": 50936400, "step": 24380 }, { "epoch": 3.9780569377600132, "grad_norm": 10.375, "learning_rate": 1.656111731362236e-05, "loss": 2.8105, "num_input_tokens_seen": 50946176, "step": 24385 }, { "epoch": 3.9788726649808304, "grad_norm": 7.375, "learning_rate": 1.6551876717484666e-05, "loss": 1.6455, "num_input_tokens_seen": 50957424, "step": 24390 }, { "epoch": 3.9796883922016475, "grad_norm": 6.65625, "learning_rate": 1.6542637424153752e-05, "loss": 2.5444, "num_input_tokens_seen": 50968656, "step": 24395 }, { "epoch": 3.980504119422465, "grad_norm": 6.40625, "learning_rate": 1.6533399435054418e-05, "loss": 2.3862, "num_input_tokens_seen": 50978512, "step": 24400 }, { "epoch": 3.980504119422465, "eval_loss": 2.557262420654297, "eval_runtime": 134.9833, "eval_samples_per_second": 20.188, "eval_steps_per_second": 10.098, "num_input_tokens_seen": 50978512, "step": 24400 }, { "epoch": 3.9813198466432826, "grad_norm": 2.5, "learning_rate": 1.6524162751611304e-05, "loss": 2.895, "num_input_tokens_seen": 50990544, "step": 24405 }, { "epoch": 3.9821355738640998, "grad_norm": 7.25, "learning_rate": 1.6514927375248796e-05, "loss": 1.7926, "num_input_tokens_seen": 50999664, "step": 24410 }, { "epoch": 3.9829513010849174, "grad_norm": 9.0, "learning_rate": 1.6505693307391127e-05, "loss": 1.9168, "num_input_tokens_seen": 51009920, "step": 24415 }, { "epoch": 3.9837670283057345, "grad_norm": 8.1875, "learning_rate": 1.6496460549462288e-05, "loss": 1.8072, "num_input_tokens_seen": 51020544, "step": 24420 }, { "epoch": 3.984582755526552, "grad_norm": 15.6875, "learning_rate": 1.6487229102886097e-05, "loss": 1.8038, "num_input_tokens_seen": 51030672, "step": 24425 }, { "epoch": 3.985398482747369, "grad_norm": 4.28125, "learning_rate": 1.6477998969086155e-05, "loss": 2.4318, "num_input_tokens_seen": 51040544, "step": 24430 }, { "epoch": 3.9862142099681868, "grad_norm": 10.25, "learning_rate": 1.646877014948587e-05, "loss": 1.2334, "num_input_tokens_seen": 51050672, "step": 24435 }, { "epoch": 3.987029937189004, "grad_norm": 10.0625, "learning_rate": 1.6459542645508433e-05, "loss": 2.9823, "num_input_tokens_seen": 51061808, "step": 24440 }, { "epoch": 3.9878456644098215, "grad_norm": 4.09375, "learning_rate": 1.6450316458576852e-05, "loss": 2.9226, "num_input_tokens_seen": 51071872, "step": 24445 }, { "epoch": 3.9886613916306386, "grad_norm": 9.6875, "learning_rate": 1.6441091590113912e-05, "loss": 2.1689, "num_input_tokens_seen": 51081600, "step": 24450 }, { "epoch": 3.989477118851456, "grad_norm": 5.34375, "learning_rate": 1.6431868041542213e-05, "loss": 1.9945, "num_input_tokens_seen": 51091584, "step": 24455 }, { "epoch": 3.9902928460722733, "grad_norm": 6.375, "learning_rate": 1.6422645814284123e-05, "loss": 1.2654, "num_input_tokens_seen": 51101568, "step": 24460 }, { "epoch": 3.991108573293091, "grad_norm": 2.953125, "learning_rate": 1.6413424909761846e-05, "loss": 2.4927, "num_input_tokens_seen": 51112080, "step": 24465 }, { "epoch": 3.991924300513908, "grad_norm": 9.75, "learning_rate": 1.640420532939736e-05, "loss": 2.1908, "num_input_tokens_seen": 51122960, "step": 24470 }, { "epoch": 3.9927400277347256, "grad_norm": 14.5, "learning_rate": 1.639498707461242e-05, "loss": 2.2578, "num_input_tokens_seen": 51133232, "step": 24475 }, { "epoch": 3.9935557549555427, "grad_norm": 9.8125, "learning_rate": 1.6385770146828614e-05, "loss": 2.4918, "num_input_tokens_seen": 51143872, "step": 24480 }, { "epoch": 3.9943714821763603, "grad_norm": 0.2255859375, "learning_rate": 1.637655454746731e-05, "loss": 1.2688, "num_input_tokens_seen": 51153024, "step": 24485 }, { "epoch": 3.9951872093971774, "grad_norm": 9.0, "learning_rate": 1.6367340277949658e-05, "loss": 2.4167, "num_input_tokens_seen": 51163504, "step": 24490 }, { "epoch": 3.996002936617995, "grad_norm": 13.125, "learning_rate": 1.635812733969663e-05, "loss": 3.6797, "num_input_tokens_seen": 51174416, "step": 24495 }, { "epoch": 3.9968186638388126, "grad_norm": 6.78125, "learning_rate": 1.634891573412896e-05, "loss": 2.5743, "num_input_tokens_seen": 51185584, "step": 24500 }, { "epoch": 3.9976343910596297, "grad_norm": 2.859375, "learning_rate": 1.6339705462667196e-05, "loss": 2.6267, "num_input_tokens_seen": 51195856, "step": 24505 }, { "epoch": 3.998450118280447, "grad_norm": 1.609375, "learning_rate": 1.633049652673169e-05, "loss": 1.6336, "num_input_tokens_seen": 51205760, "step": 24510 }, { "epoch": 3.9992658455012644, "grad_norm": 9.875, "learning_rate": 1.632128892774256e-05, "loss": 2.6896, "num_input_tokens_seen": 51216512, "step": 24515 }, { "epoch": 4.0, "grad_norm": 8.375, "learning_rate": 1.6312082667119737e-05, "loss": 1.7332, "num_input_tokens_seen": 51225984, "step": 24520 }, { "epoch": 4.000815727220817, "grad_norm": 10.0625, "learning_rate": 1.630287774628296e-05, "loss": 2.7793, "num_input_tokens_seen": 51237728, "step": 24525 }, { "epoch": 4.001631454441635, "grad_norm": 5.0, "learning_rate": 1.6293674166651718e-05, "loss": 2.4229, "num_input_tokens_seen": 51249056, "step": 24530 }, { "epoch": 4.002447181662452, "grad_norm": 3.09375, "learning_rate": 1.6284471929645338e-05, "loss": 2.0425, "num_input_tokens_seen": 51257376, "step": 24535 }, { "epoch": 4.003262908883269, "grad_norm": 5.0625, "learning_rate": 1.627527103668291e-05, "loss": 1.8085, "num_input_tokens_seen": 51265952, "step": 24540 }, { "epoch": 4.0040786361040865, "grad_norm": 10.6875, "learning_rate": 1.6266071489183327e-05, "loss": 3.2492, "num_input_tokens_seen": 51277120, "step": 24545 }, { "epoch": 4.004894363324905, "grad_norm": 11.25, "learning_rate": 1.6256873288565283e-05, "loss": 2.8756, "num_input_tokens_seen": 51287936, "step": 24550 }, { "epoch": 4.005710090545722, "grad_norm": 2.0, "learning_rate": 1.6247676436247245e-05, "loss": 1.9261, "num_input_tokens_seen": 51298240, "step": 24555 }, { "epoch": 4.006525817766539, "grad_norm": 11.1875, "learning_rate": 1.6238480933647486e-05, "loss": 3.6083, "num_input_tokens_seen": 51307616, "step": 24560 }, { "epoch": 4.007341544987356, "grad_norm": 5.25, "learning_rate": 1.6229286782184083e-05, "loss": 1.1911, "num_input_tokens_seen": 51318096, "step": 24565 }, { "epoch": 4.008157272208174, "grad_norm": 14.0625, "learning_rate": 1.622009398327487e-05, "loss": 2.1849, "num_input_tokens_seen": 51329808, "step": 24570 }, { "epoch": 4.008972999428991, "grad_norm": 6.8125, "learning_rate": 1.6210902538337502e-05, "loss": 2.5064, "num_input_tokens_seen": 51341056, "step": 24575 }, { "epoch": 4.009788726649808, "grad_norm": 5.53125, "learning_rate": 1.6201712448789413e-05, "loss": 2.088, "num_input_tokens_seen": 51351568, "step": 24580 }, { "epoch": 4.010604453870625, "grad_norm": 4.375, "learning_rate": 1.6192523716047827e-05, "loss": 2.0302, "num_input_tokens_seen": 51362640, "step": 24585 }, { "epoch": 4.011420181091443, "grad_norm": 6.40625, "learning_rate": 1.6183336341529776e-05, "loss": 1.9117, "num_input_tokens_seen": 51372368, "step": 24590 }, { "epoch": 4.0122359083122605, "grad_norm": 10.0625, "learning_rate": 1.6174150326652047e-05, "loss": 2.4326, "num_input_tokens_seen": 51382528, "step": 24595 }, { "epoch": 4.013051635533078, "grad_norm": 3.109375, "learning_rate": 1.6164965672831256e-05, "loss": 2.1603, "num_input_tokens_seen": 51394160, "step": 24600 }, { "epoch": 4.013051635533078, "eval_loss": 2.540343999862671, "eval_runtime": 135.0117, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 51394160, "step": 24600 }, { "epoch": 4.013867362753895, "grad_norm": 4.6875, "learning_rate": 1.6155782381483784e-05, "loss": 3.2689, "num_input_tokens_seen": 51405728, "step": 24605 }, { "epoch": 4.014683089974713, "grad_norm": 4.625, "learning_rate": 1.6146600454025813e-05, "loss": 2.0811, "num_input_tokens_seen": 51415616, "step": 24610 }, { "epoch": 4.01549881719553, "grad_norm": 8.9375, "learning_rate": 1.6137419891873317e-05, "loss": 3.4216, "num_input_tokens_seen": 51426240, "step": 24615 }, { "epoch": 4.016314544416347, "grad_norm": 10.5, "learning_rate": 1.6128240696442038e-05, "loss": 2.3852, "num_input_tokens_seen": 51436048, "step": 24620 }, { "epoch": 4.017130271637164, "grad_norm": 4.40625, "learning_rate": 1.611906286914753e-05, "loss": 1.864, "num_input_tokens_seen": 51446320, "step": 24625 }, { "epoch": 4.017945998857982, "grad_norm": 10.1875, "learning_rate": 1.6109886411405144e-05, "loss": 3.3988, "num_input_tokens_seen": 51457104, "step": 24630 }, { "epoch": 4.018761726078799, "grad_norm": 6.625, "learning_rate": 1.6100711324629985e-05, "loss": 3.9376, "num_input_tokens_seen": 51469088, "step": 24635 }, { "epoch": 4.0195774532996165, "grad_norm": 7.0, "learning_rate": 1.609153761023698e-05, "loss": 2.0672, "num_input_tokens_seen": 51480320, "step": 24640 }, { "epoch": 4.020393180520434, "grad_norm": 10.5, "learning_rate": 1.608236526964083e-05, "loss": 4.404, "num_input_tokens_seen": 51490976, "step": 24645 }, { "epoch": 4.021208907741252, "grad_norm": 8.4375, "learning_rate": 1.607319430425601e-05, "loss": 2.3763, "num_input_tokens_seen": 51500784, "step": 24650 }, { "epoch": 4.022024634962069, "grad_norm": 5.1875, "learning_rate": 1.606402471549682e-05, "loss": 1.6848, "num_input_tokens_seen": 51510432, "step": 24655 }, { "epoch": 4.022840362182886, "grad_norm": 12.4375, "learning_rate": 1.6054856504777312e-05, "loss": 2.2264, "num_input_tokens_seen": 51520896, "step": 24660 }, { "epoch": 4.023656089403703, "grad_norm": 4.90625, "learning_rate": 1.6045689673511334e-05, "loss": 1.9036, "num_input_tokens_seen": 51531872, "step": 24665 }, { "epoch": 4.024471816624521, "grad_norm": 10.375, "learning_rate": 1.6036524223112548e-05, "loss": 2.106, "num_input_tokens_seen": 51542816, "step": 24670 }, { "epoch": 4.025287543845338, "grad_norm": 17.0, "learning_rate": 1.602736015499436e-05, "loss": 2.0568, "num_input_tokens_seen": 51553568, "step": 24675 }, { "epoch": 4.026103271066155, "grad_norm": 6.6875, "learning_rate": 1.601819747057e-05, "loss": 1.9333, "num_input_tokens_seen": 51563920, "step": 24680 }, { "epoch": 4.026918998286972, "grad_norm": 4.28125, "learning_rate": 1.6009036171252465e-05, "loss": 1.7162, "num_input_tokens_seen": 51575584, "step": 24685 }, { "epoch": 4.02773472550779, "grad_norm": 5.15625, "learning_rate": 1.599987625845453e-05, "loss": 2.5217, "num_input_tokens_seen": 51587680, "step": 24690 }, { "epoch": 4.028550452728608, "grad_norm": 4.40625, "learning_rate": 1.599071773358879e-05, "loss": 2.7604, "num_input_tokens_seen": 51598160, "step": 24695 }, { "epoch": 4.029366179949425, "grad_norm": 6.15625, "learning_rate": 1.598156059806758e-05, "loss": 2.2855, "num_input_tokens_seen": 51609072, "step": 24700 }, { "epoch": 4.030181907170242, "grad_norm": 3.46875, "learning_rate": 1.5972404853303062e-05, "loss": 3.2393, "num_input_tokens_seen": 51619664, "step": 24705 }, { "epoch": 4.03099763439106, "grad_norm": 6.6875, "learning_rate": 1.5963250500707172e-05, "loss": 3.5823, "num_input_tokens_seen": 51630016, "step": 24710 }, { "epoch": 4.031813361611877, "grad_norm": 6.71875, "learning_rate": 1.5954097541691612e-05, "loss": 3.8073, "num_input_tokens_seen": 51639664, "step": 24715 }, { "epoch": 4.032629088832694, "grad_norm": 5.96875, "learning_rate": 1.5944945977667884e-05, "loss": 2.0225, "num_input_tokens_seen": 51650096, "step": 24720 }, { "epoch": 4.033444816053512, "grad_norm": 7.5, "learning_rate": 1.593579581004729e-05, "loss": 3.432, "num_input_tokens_seen": 51659952, "step": 24725 }, { "epoch": 4.034260543274329, "grad_norm": 13.5, "learning_rate": 1.592664704024088e-05, "loss": 3.7906, "num_input_tokens_seen": 51669488, "step": 24730 }, { "epoch": 4.035076270495146, "grad_norm": 5.4375, "learning_rate": 1.591749966965953e-05, "loss": 2.3926, "num_input_tokens_seen": 51681232, "step": 24735 }, { "epoch": 4.0358919977159635, "grad_norm": 7.8125, "learning_rate": 1.5908353699713856e-05, "loss": 1.8183, "num_input_tokens_seen": 51692112, "step": 24740 }, { "epoch": 4.0367077249367815, "grad_norm": 12.1875, "learning_rate": 1.5899209131814298e-05, "loss": 2.2185, "num_input_tokens_seen": 51702096, "step": 24745 }, { "epoch": 4.037523452157599, "grad_norm": 10.9375, "learning_rate": 1.5890065967371067e-05, "loss": 3.4912, "num_input_tokens_seen": 51713936, "step": 24750 }, { "epoch": 4.038339179378416, "grad_norm": 10.0, "learning_rate": 1.5880924207794144e-05, "loss": 3.7162, "num_input_tokens_seen": 51722512, "step": 24755 }, { "epoch": 4.039154906599233, "grad_norm": 4.53125, "learning_rate": 1.5871783854493298e-05, "loss": 1.8996, "num_input_tokens_seen": 51734720, "step": 24760 }, { "epoch": 4.039970633820051, "grad_norm": 9.4375, "learning_rate": 1.5862644908878106e-05, "loss": 2.7278, "num_input_tokens_seen": 51745600, "step": 24765 }, { "epoch": 4.040786361040868, "grad_norm": 9.75, "learning_rate": 1.5853507372357885e-05, "loss": 1.9228, "num_input_tokens_seen": 51757568, "step": 24770 }, { "epoch": 4.041602088261685, "grad_norm": 7.09375, "learning_rate": 1.5844371246341776e-05, "loss": 2.1256, "num_input_tokens_seen": 51767328, "step": 24775 }, { "epoch": 4.042417815482502, "grad_norm": 6.78125, "learning_rate": 1.5835236532238674e-05, "loss": 2.9969, "num_input_tokens_seen": 51777536, "step": 24780 }, { "epoch": 4.04323354270332, "grad_norm": 5.03125, "learning_rate": 1.582610323145727e-05, "loss": 3.3975, "num_input_tokens_seen": 51788096, "step": 24785 }, { "epoch": 4.0440492699241375, "grad_norm": 8.8125, "learning_rate": 1.5816971345406035e-05, "loss": 2.3444, "num_input_tokens_seen": 51799056, "step": 24790 }, { "epoch": 4.044864997144955, "grad_norm": 7.03125, "learning_rate": 1.5807840875493225e-05, "loss": 2.5925, "num_input_tokens_seen": 51810320, "step": 24795 }, { "epoch": 4.045680724365772, "grad_norm": 6.4375, "learning_rate": 1.5798711823126854e-05, "loss": 2.1706, "num_input_tokens_seen": 51821712, "step": 24800 }, { "epoch": 4.045680724365772, "eval_loss": 2.545102119445801, "eval_runtime": 134.848, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 51821712, "step": 24800 }, { "epoch": 4.04649645158659, "grad_norm": 4.75, "learning_rate": 1.578958418971477e-05, "loss": 2.2454, "num_input_tokens_seen": 51831136, "step": 24805 }, { "epoch": 4.047312178807407, "grad_norm": 7.46875, "learning_rate": 1.578045797666453e-05, "loss": 1.7889, "num_input_tokens_seen": 51841488, "step": 24810 }, { "epoch": 4.048127906028224, "grad_norm": 10.0, "learning_rate": 1.5771333185383548e-05, "loss": 2.5769, "num_input_tokens_seen": 51852528, "step": 24815 }, { "epoch": 4.048943633249041, "grad_norm": 1.9375, "learning_rate": 1.576220981727895e-05, "loss": 1.8891, "num_input_tokens_seen": 51864240, "step": 24820 }, { "epoch": 4.049759360469859, "grad_norm": 15.625, "learning_rate": 1.575308787375769e-05, "loss": 2.218, "num_input_tokens_seen": 51873168, "step": 24825 }, { "epoch": 4.050575087690676, "grad_norm": 8.9375, "learning_rate": 1.5743967356226492e-05, "loss": 2.3698, "num_input_tokens_seen": 51884496, "step": 24830 }, { "epoch": 4.051390814911493, "grad_norm": 0.177734375, "learning_rate": 1.5734848266091835e-05, "loss": 2.6955, "num_input_tokens_seen": 51896304, "step": 24835 }, { "epoch": 4.052206542132311, "grad_norm": 12.0, "learning_rate": 1.572573060476001e-05, "loss": 1.8319, "num_input_tokens_seen": 51906608, "step": 24840 }, { "epoch": 4.053022269353129, "grad_norm": 5.46875, "learning_rate": 1.5716614373637085e-05, "loss": 3.6039, "num_input_tokens_seen": 51916880, "step": 24845 }, { "epoch": 4.053837996573946, "grad_norm": 7.9375, "learning_rate": 1.570749957412887e-05, "loss": 2.1091, "num_input_tokens_seen": 51926672, "step": 24850 }, { "epoch": 4.054653723794763, "grad_norm": 1.1953125, "learning_rate": 1.5698386207641013e-05, "loss": 2.4642, "num_input_tokens_seen": 51937280, "step": 24855 }, { "epoch": 4.05546945101558, "grad_norm": 8.5625, "learning_rate": 1.5689274275578884e-05, "loss": 3.1211, "num_input_tokens_seen": 51948752, "step": 24860 }, { "epoch": 4.056285178236398, "grad_norm": 10.625, "learning_rate": 1.5680163779347667e-05, "loss": 3.1941, "num_input_tokens_seen": 51959760, "step": 24865 }, { "epoch": 4.057100905457215, "grad_norm": 5.25, "learning_rate": 1.5671054720352327e-05, "loss": 1.3567, "num_input_tokens_seen": 51970000, "step": 24870 }, { "epoch": 4.057916632678032, "grad_norm": 5.59375, "learning_rate": 1.566194709999757e-05, "loss": 3.4178, "num_input_tokens_seen": 51981456, "step": 24875 }, { "epoch": 4.058732359898849, "grad_norm": 9.125, "learning_rate": 1.5652840919687933e-05, "loss": 3.3472, "num_input_tokens_seen": 51991584, "step": 24880 }, { "epoch": 4.059548087119667, "grad_norm": 4.03125, "learning_rate": 1.5643736180827676e-05, "loss": 1.9951, "num_input_tokens_seen": 52001152, "step": 24885 }, { "epoch": 4.0603638143404845, "grad_norm": 6.34375, "learning_rate": 1.5634632884820878e-05, "loss": 1.7925, "num_input_tokens_seen": 52011600, "step": 24890 }, { "epoch": 4.061179541561302, "grad_norm": 2.296875, "learning_rate": 1.5625531033071395e-05, "loss": 1.8111, "num_input_tokens_seen": 52022496, "step": 24895 }, { "epoch": 4.06199526878212, "grad_norm": 4.375, "learning_rate": 1.5616430626982828e-05, "loss": 2.8478, "num_input_tokens_seen": 52033472, "step": 24900 }, { "epoch": 4.062810996002937, "grad_norm": 6.53125, "learning_rate": 1.5607331667958575e-05, "loss": 2.8529, "num_input_tokens_seen": 52043280, "step": 24905 }, { "epoch": 4.063626723223754, "grad_norm": 2.8125, "learning_rate": 1.5598234157401824e-05, "loss": 1.6373, "num_input_tokens_seen": 52053680, "step": 24910 }, { "epoch": 4.064442450444571, "grad_norm": 10.0, "learning_rate": 1.5589138096715503e-05, "loss": 3.0509, "num_input_tokens_seen": 52063584, "step": 24915 }, { "epoch": 4.065258177665389, "grad_norm": 6.84375, "learning_rate": 1.5580043487302365e-05, "loss": 1.8038, "num_input_tokens_seen": 52073488, "step": 24920 }, { "epoch": 4.066073904886206, "grad_norm": 11.25, "learning_rate": 1.5570950330564888e-05, "loss": 1.5999, "num_input_tokens_seen": 52082688, "step": 24925 }, { "epoch": 4.066889632107023, "grad_norm": 4.71875, "learning_rate": 1.5561858627905367e-05, "loss": 2.4747, "num_input_tokens_seen": 52094176, "step": 24930 }, { "epoch": 4.0677053593278405, "grad_norm": 8.8125, "learning_rate": 1.5552768380725857e-05, "loss": 2.0008, "num_input_tokens_seen": 52105200, "step": 24935 }, { "epoch": 4.0685210865486585, "grad_norm": 4.46875, "learning_rate": 1.5543679590428183e-05, "loss": 2.2917, "num_input_tokens_seen": 52115024, "step": 24940 }, { "epoch": 4.069336813769476, "grad_norm": 7.25, "learning_rate": 1.5534592258413943e-05, "loss": 2.7056, "num_input_tokens_seen": 52125360, "step": 24945 }, { "epoch": 4.070152540990293, "grad_norm": 10.5, "learning_rate": 1.5525506386084538e-05, "loss": 3.5798, "num_input_tokens_seen": 52135376, "step": 24950 }, { "epoch": 4.07096826821111, "grad_norm": 11.0, "learning_rate": 1.55164219748411e-05, "loss": 1.877, "num_input_tokens_seen": 52147040, "step": 24955 }, { "epoch": 4.071783995431928, "grad_norm": 2.390625, "learning_rate": 1.550733902608459e-05, "loss": 1.7353, "num_input_tokens_seen": 52157680, "step": 24960 }, { "epoch": 4.072599722652745, "grad_norm": 9.0625, "learning_rate": 1.549825754121568e-05, "loss": 2.5551, "num_input_tokens_seen": 52167888, "step": 24965 }, { "epoch": 4.073415449873562, "grad_norm": 7.90625, "learning_rate": 1.5489177521634864e-05, "loss": 2.9202, "num_input_tokens_seen": 52178752, "step": 24970 }, { "epoch": 4.074231177094379, "grad_norm": 6.0625, "learning_rate": 1.5480098968742402e-05, "loss": 1.5285, "num_input_tokens_seen": 52189200, "step": 24975 }, { "epoch": 4.075046904315197, "grad_norm": 9.1875, "learning_rate": 1.5471021883938304e-05, "loss": 2.4032, "num_input_tokens_seen": 52199984, "step": 24980 }, { "epoch": 4.0758626315360145, "grad_norm": 11.375, "learning_rate": 1.546194626862238e-05, "loss": 1.8813, "num_input_tokens_seen": 52211584, "step": 24985 }, { "epoch": 4.076678358756832, "grad_norm": 2.203125, "learning_rate": 1.5452872124194216e-05, "loss": 1.5998, "num_input_tokens_seen": 52222816, "step": 24990 }, { "epoch": 4.077494085977649, "grad_norm": 5.5625, "learning_rate": 1.5443799452053136e-05, "loss": 2.8754, "num_input_tokens_seen": 52234016, "step": 24995 }, { "epoch": 4.078309813198467, "grad_norm": 8.375, "learning_rate": 1.543472825359828e-05, "loss": 1.6054, "num_input_tokens_seen": 52244608, "step": 25000 }, { "epoch": 4.078309813198467, "eval_loss": 2.5288426876068115, "eval_runtime": 134.8652, "eval_samples_per_second": 20.205, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 52244608, "step": 25000 }, { "epoch": 4.079125540419284, "grad_norm": 7.09375, "learning_rate": 1.5425658530228522e-05, "loss": 3.1642, "num_input_tokens_seen": 52254960, "step": 25005 }, { "epoch": 4.079941267640101, "grad_norm": 5.34375, "learning_rate": 1.5416590283342546e-05, "loss": 2.4792, "num_input_tokens_seen": 52265136, "step": 25010 }, { "epoch": 4.080756994860918, "grad_norm": 5.75, "learning_rate": 1.5407523514338783e-05, "loss": 0.9519, "num_input_tokens_seen": 52276048, "step": 25015 }, { "epoch": 4.081572722081736, "grad_norm": 9.5625, "learning_rate": 1.539845822461543e-05, "loss": 2.0118, "num_input_tokens_seen": 52286256, "step": 25020 }, { "epoch": 4.082388449302553, "grad_norm": 5.375, "learning_rate": 1.538939441557048e-05, "loss": 2.3859, "num_input_tokens_seen": 52295872, "step": 25025 }, { "epoch": 4.08320417652337, "grad_norm": 6.4375, "learning_rate": 1.5380332088601696e-05, "loss": 4.7387, "num_input_tokens_seen": 52305616, "step": 25030 }, { "epoch": 4.0840199037441876, "grad_norm": 7.1875, "learning_rate": 1.537127124510658e-05, "loss": 2.3978, "num_input_tokens_seen": 52317376, "step": 25035 }, { "epoch": 4.084835630965006, "grad_norm": 6.71875, "learning_rate": 1.5362211886482457e-05, "loss": 3.6264, "num_input_tokens_seen": 52327792, "step": 25040 }, { "epoch": 4.085651358185823, "grad_norm": 3.5, "learning_rate": 1.5353154014126363e-05, "loss": 2.7063, "num_input_tokens_seen": 52338112, "step": 25045 }, { "epoch": 4.08646708540664, "grad_norm": 6.59375, "learning_rate": 1.534409762943515e-05, "loss": 3.0695, "num_input_tokens_seen": 52347840, "step": 25050 }, { "epoch": 4.087282812627457, "grad_norm": 5.90625, "learning_rate": 1.5335042733805438e-05, "loss": 1.9829, "num_input_tokens_seen": 52358000, "step": 25055 }, { "epoch": 4.088098539848275, "grad_norm": 3.890625, "learning_rate": 1.532598932863358e-05, "loss": 2.0714, "num_input_tokens_seen": 52368512, "step": 25060 }, { "epoch": 4.088914267069092, "grad_norm": 4.6875, "learning_rate": 1.531693741531574e-05, "loss": 2.4597, "num_input_tokens_seen": 52379568, "step": 25065 }, { "epoch": 4.089729994289909, "grad_norm": 17.0, "learning_rate": 1.5307886995247844e-05, "loss": 3.4794, "num_input_tokens_seen": 52388832, "step": 25070 }, { "epoch": 4.090545721510727, "grad_norm": 9.125, "learning_rate": 1.529883806982557e-05, "loss": 3.1409, "num_input_tokens_seen": 52399056, "step": 25075 }, { "epoch": 4.091361448731544, "grad_norm": 2.8125, "learning_rate": 1.5289790640444376e-05, "loss": 2.4856, "num_input_tokens_seen": 52411456, "step": 25080 }, { "epoch": 4.0921771759523615, "grad_norm": 12.625, "learning_rate": 1.5280744708499494e-05, "loss": 2.5573, "num_input_tokens_seen": 52422176, "step": 25085 }, { "epoch": 4.092992903173179, "grad_norm": 2.46875, "learning_rate": 1.527170027538591e-05, "loss": 1.6164, "num_input_tokens_seen": 52432880, "step": 25090 }, { "epoch": 4.093808630393997, "grad_norm": 6.96875, "learning_rate": 1.5262657342498407e-05, "loss": 1.7433, "num_input_tokens_seen": 52444256, "step": 25095 }, { "epoch": 4.094624357614814, "grad_norm": 6.09375, "learning_rate": 1.52536159112315e-05, "loss": 1.8, "num_input_tokens_seen": 52453856, "step": 25100 }, { "epoch": 4.095440084835631, "grad_norm": 11.6875, "learning_rate": 1.5244575982979497e-05, "loss": 1.95, "num_input_tokens_seen": 52463568, "step": 25105 }, { "epoch": 4.096255812056448, "grad_norm": 9.6875, "learning_rate": 1.5235537559136487e-05, "loss": 2.9884, "num_input_tokens_seen": 52473232, "step": 25110 }, { "epoch": 4.097071539277266, "grad_norm": 8.3125, "learning_rate": 1.5226500641096286e-05, "loss": 1.6695, "num_input_tokens_seen": 52482672, "step": 25115 }, { "epoch": 4.097887266498083, "grad_norm": 0.177734375, "learning_rate": 1.5217465230252509e-05, "loss": 1.0523, "num_input_tokens_seen": 52494656, "step": 25120 }, { "epoch": 4.0987029937189, "grad_norm": 6.9375, "learning_rate": 1.5208431327998523e-05, "loss": 1.3795, "num_input_tokens_seen": 52504064, "step": 25125 }, { "epoch": 4.0995187209397175, "grad_norm": 10.5, "learning_rate": 1.5199398935727477e-05, "loss": 2.6356, "num_input_tokens_seen": 52515760, "step": 25130 }, { "epoch": 4.1003344481605355, "grad_norm": 5.75, "learning_rate": 1.5190368054832282e-05, "loss": 2.2369, "num_input_tokens_seen": 52526400, "step": 25135 }, { "epoch": 4.101150175381353, "grad_norm": 3.5625, "learning_rate": 1.5181338686705601e-05, "loss": 1.7902, "num_input_tokens_seen": 52537456, "step": 25140 }, { "epoch": 4.10196590260217, "grad_norm": 5.5, "learning_rate": 1.5172310832739889e-05, "loss": 1.669, "num_input_tokens_seen": 52547792, "step": 25145 }, { "epoch": 4.102781629822987, "grad_norm": 5.3125, "learning_rate": 1.5163284494327346e-05, "loss": 2.0257, "num_input_tokens_seen": 52558288, "step": 25150 }, { "epoch": 4.103597357043805, "grad_norm": 13.6875, "learning_rate": 1.5154259672859952e-05, "loss": 3.0248, "num_input_tokens_seen": 52568000, "step": 25155 }, { "epoch": 4.104413084264622, "grad_norm": 8.8125, "learning_rate": 1.5145236369729452e-05, "loss": 2.9275, "num_input_tokens_seen": 52577920, "step": 25160 }, { "epoch": 4.105228811485439, "grad_norm": 9.0625, "learning_rate": 1.5136214586327335e-05, "loss": 1.569, "num_input_tokens_seen": 52587536, "step": 25165 }, { "epoch": 4.106044538706256, "grad_norm": 7.65625, "learning_rate": 1.5127194324044885e-05, "loss": 3.0632, "num_input_tokens_seen": 52597008, "step": 25170 }, { "epoch": 4.106860265927074, "grad_norm": 9.375, "learning_rate": 1.5118175584273148e-05, "loss": 2.833, "num_input_tokens_seen": 52606624, "step": 25175 }, { "epoch": 4.1076759931478914, "grad_norm": 7.625, "learning_rate": 1.5109158368402909e-05, "loss": 2.0158, "num_input_tokens_seen": 52617376, "step": 25180 }, { "epoch": 4.108491720368709, "grad_norm": 12.1875, "learning_rate": 1.5100142677824753e-05, "loss": 2.7997, "num_input_tokens_seen": 52628400, "step": 25185 }, { "epoch": 4.109307447589526, "grad_norm": 5.5, "learning_rate": 1.509112851392901e-05, "loss": 2.3182, "num_input_tokens_seen": 52638192, "step": 25190 }, { "epoch": 4.110123174810344, "grad_norm": 8.5, "learning_rate": 1.5082115878105763e-05, "loss": 4.5375, "num_input_tokens_seen": 52649008, "step": 25195 }, { "epoch": 4.110938902031161, "grad_norm": 6.34375, "learning_rate": 1.5073104771744892e-05, "loss": 2.0526, "num_input_tokens_seen": 52659888, "step": 25200 }, { "epoch": 4.110938902031161, "eval_loss": 2.551703691482544, "eval_runtime": 134.8052, "eval_samples_per_second": 20.214, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 52659888, "step": 25200 }, { "epoch": 4.111754629251978, "grad_norm": 1.4375, "learning_rate": 1.5064095196236006e-05, "loss": 2.6566, "num_input_tokens_seen": 52668528, "step": 25205 }, { "epoch": 4.112570356472795, "grad_norm": 5.03125, "learning_rate": 1.50550871529685e-05, "loss": 2.7985, "num_input_tokens_seen": 52679824, "step": 25210 }, { "epoch": 4.113386083693613, "grad_norm": 7.15625, "learning_rate": 1.5046080643331546e-05, "loss": 1.5852, "num_input_tokens_seen": 52688496, "step": 25215 }, { "epoch": 4.11420181091443, "grad_norm": 10.9375, "learning_rate": 1.5037075668714028e-05, "loss": 2.7034, "num_input_tokens_seen": 52700544, "step": 25220 }, { "epoch": 4.115017538135247, "grad_norm": 0.1689453125, "learning_rate": 1.5028072230504656e-05, "loss": 1.9227, "num_input_tokens_seen": 52710304, "step": 25225 }, { "epoch": 4.1158332653560645, "grad_norm": 8.375, "learning_rate": 1.5019070330091861e-05, "loss": 2.2664, "num_input_tokens_seen": 52720528, "step": 25230 }, { "epoch": 4.1166489925768825, "grad_norm": 8.5625, "learning_rate": 1.5010069968863843e-05, "loss": 1.9659, "num_input_tokens_seen": 52731280, "step": 25235 }, { "epoch": 4.1174647197977, "grad_norm": 6.21875, "learning_rate": 1.5001071148208584e-05, "loss": 2.0341, "num_input_tokens_seen": 52742448, "step": 25240 }, { "epoch": 4.118280447018517, "grad_norm": 9.5, "learning_rate": 1.49920738695138e-05, "loss": 2.2142, "num_input_tokens_seen": 52753328, "step": 25245 }, { "epoch": 4.119096174239334, "grad_norm": 6.6875, "learning_rate": 1.4983078134166995e-05, "loss": 2.4535, "num_input_tokens_seen": 52764416, "step": 25250 }, { "epoch": 4.119911901460152, "grad_norm": 3.828125, "learning_rate": 1.4974083943555428e-05, "loss": 2.6979, "num_input_tokens_seen": 52775616, "step": 25255 }, { "epoch": 4.120727628680969, "grad_norm": 5.125, "learning_rate": 1.496509129906611e-05, "loss": 3.6935, "num_input_tokens_seen": 52787056, "step": 25260 }, { "epoch": 4.121543355901786, "grad_norm": 11.0, "learning_rate": 1.4956100202085809e-05, "loss": 2.3477, "num_input_tokens_seen": 52796016, "step": 25265 }, { "epoch": 4.122359083122603, "grad_norm": 6.75, "learning_rate": 1.4947110654001093e-05, "loss": 3.1076, "num_input_tokens_seen": 52807248, "step": 25270 }, { "epoch": 4.123174810343421, "grad_norm": 8.5625, "learning_rate": 1.4938122656198234e-05, "loss": 2.2347, "num_input_tokens_seen": 52816848, "step": 25275 }, { "epoch": 4.1239905375642385, "grad_norm": 4.34375, "learning_rate": 1.4929136210063316e-05, "loss": 2.4091, "num_input_tokens_seen": 52826816, "step": 25280 }, { "epoch": 4.124806264785056, "grad_norm": 4.3125, "learning_rate": 1.4920151316982146e-05, "loss": 1.2216, "num_input_tokens_seen": 52836528, "step": 25285 }, { "epoch": 4.125621992005874, "grad_norm": 5.40625, "learning_rate": 1.4911167978340312e-05, "loss": 2.4897, "num_input_tokens_seen": 52846576, "step": 25290 }, { "epoch": 4.126437719226691, "grad_norm": 9.375, "learning_rate": 1.4902186195523166e-05, "loss": 2.5142, "num_input_tokens_seen": 52857712, "step": 25295 }, { "epoch": 4.127253446447508, "grad_norm": 8.625, "learning_rate": 1.4893205969915805e-05, "loss": 1.9978, "num_input_tokens_seen": 52868576, "step": 25300 }, { "epoch": 4.128069173668325, "grad_norm": 8.25, "learning_rate": 1.4884227302903086e-05, "loss": 2.9423, "num_input_tokens_seen": 52879552, "step": 25305 }, { "epoch": 4.128884900889143, "grad_norm": 5.5625, "learning_rate": 1.4875250195869653e-05, "loss": 2.0911, "num_input_tokens_seen": 52888144, "step": 25310 }, { "epoch": 4.12970062810996, "grad_norm": 6.8125, "learning_rate": 1.4866274650199862e-05, "loss": 2.4658, "num_input_tokens_seen": 52898208, "step": 25315 }, { "epoch": 4.130516355330777, "grad_norm": 12.4375, "learning_rate": 1.485730066727788e-05, "loss": 2.7093, "num_input_tokens_seen": 52909136, "step": 25320 }, { "epoch": 4.1313320825515945, "grad_norm": 9.25, "learning_rate": 1.4848328248487586e-05, "loss": 2.6395, "num_input_tokens_seen": 52918720, "step": 25325 }, { "epoch": 4.1321478097724125, "grad_norm": 4.09375, "learning_rate": 1.4839357395212656e-05, "loss": 2.8061, "num_input_tokens_seen": 52928464, "step": 25330 }, { "epoch": 4.13296353699323, "grad_norm": 9.1875, "learning_rate": 1.4830388108836502e-05, "loss": 2.9722, "num_input_tokens_seen": 52939008, "step": 25335 }, { "epoch": 4.133779264214047, "grad_norm": 8.9375, "learning_rate": 1.4821420390742299e-05, "loss": 2.0187, "num_input_tokens_seen": 52949664, "step": 25340 }, { "epoch": 4.134594991434864, "grad_norm": 15.125, "learning_rate": 1.4812454242312979e-05, "loss": 3.8748, "num_input_tokens_seen": 52959760, "step": 25345 }, { "epoch": 4.135410718655682, "grad_norm": 4.9375, "learning_rate": 1.4803489664931253e-05, "loss": 1.4902, "num_input_tokens_seen": 52969664, "step": 25350 }, { "epoch": 4.136226445876499, "grad_norm": 5.5625, "learning_rate": 1.4794526659979544e-05, "loss": 2.2104, "num_input_tokens_seen": 52979200, "step": 25355 }, { "epoch": 4.137042173097316, "grad_norm": 2.140625, "learning_rate": 1.4785565228840086e-05, "loss": 1.1141, "num_input_tokens_seen": 52988976, "step": 25360 }, { "epoch": 4.137857900318133, "grad_norm": 6.78125, "learning_rate": 1.4776605372894819e-05, "loss": 2.4724, "num_input_tokens_seen": 52999936, "step": 25365 }, { "epoch": 4.138673627538951, "grad_norm": 9.9375, "learning_rate": 1.4767647093525488e-05, "loss": 1.0567, "num_input_tokens_seen": 53010512, "step": 25370 }, { "epoch": 4.139489354759768, "grad_norm": 9.0, "learning_rate": 1.4758690392113566e-05, "loss": 1.6404, "num_input_tokens_seen": 53020880, "step": 25375 }, { "epoch": 4.1403050819805856, "grad_norm": 8.9375, "learning_rate": 1.4749735270040276e-05, "loss": 3.3843, "num_input_tokens_seen": 53031280, "step": 25380 }, { "epoch": 4.141120809201403, "grad_norm": 2.203125, "learning_rate": 1.4740781728686623e-05, "loss": 2.0223, "num_input_tokens_seen": 53042176, "step": 25385 }, { "epoch": 4.141936536422221, "grad_norm": 10.0, "learning_rate": 1.4731829769433358e-05, "loss": 1.7687, "num_input_tokens_seen": 53051424, "step": 25390 }, { "epoch": 4.142752263643038, "grad_norm": 5.5625, "learning_rate": 1.4722879393660976e-05, "loss": 1.5099, "num_input_tokens_seen": 53062576, "step": 25395 }, { "epoch": 4.143567990863855, "grad_norm": 6.625, "learning_rate": 1.4713930602749748e-05, "loss": 1.658, "num_input_tokens_seen": 53073648, "step": 25400 }, { "epoch": 4.143567990863855, "eval_loss": 2.54581880569458, "eval_runtime": 135.0407, "eval_samples_per_second": 20.179, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 53073648, "step": 25400 }, { "epoch": 4.144383718084672, "grad_norm": 11.625, "learning_rate": 1.470498339807968e-05, "loss": 3.5884, "num_input_tokens_seen": 53083344, "step": 25405 }, { "epoch": 4.14519944530549, "grad_norm": 4.78125, "learning_rate": 1.4696037781030542e-05, "loss": 2.7814, "num_input_tokens_seen": 53093280, "step": 25410 }, { "epoch": 4.146015172526307, "grad_norm": 3.453125, "learning_rate": 1.4687093752981876e-05, "loss": 1.6041, "num_input_tokens_seen": 53103952, "step": 25415 }, { "epoch": 4.146830899747124, "grad_norm": 3.703125, "learning_rate": 1.4678151315312943e-05, "loss": 2.2638, "num_input_tokens_seen": 53114256, "step": 25420 }, { "epoch": 4.1476466269679415, "grad_norm": 5.21875, "learning_rate": 1.4669210469402789e-05, "loss": 2.6153, "num_input_tokens_seen": 53125136, "step": 25425 }, { "epoch": 4.1484623541887595, "grad_norm": 7.5625, "learning_rate": 1.4660271216630218e-05, "loss": 3.0091, "num_input_tokens_seen": 53136720, "step": 25430 }, { "epoch": 4.149278081409577, "grad_norm": 7.6875, "learning_rate": 1.4651333558373748e-05, "loss": 1.6617, "num_input_tokens_seen": 53147456, "step": 25435 }, { "epoch": 4.150093808630394, "grad_norm": 7.1875, "learning_rate": 1.4642397496011707e-05, "loss": 1.922, "num_input_tokens_seen": 53156704, "step": 25440 }, { "epoch": 4.150909535851211, "grad_norm": 7.96875, "learning_rate": 1.4633463030922129e-05, "loss": 2.7945, "num_input_tokens_seen": 53166528, "step": 25445 }, { "epoch": 4.151725263072029, "grad_norm": 5.75, "learning_rate": 1.462453016448282e-05, "loss": 1.8874, "num_input_tokens_seen": 53176992, "step": 25450 }, { "epoch": 4.152540990292846, "grad_norm": 0.10791015625, "learning_rate": 1.4615598898071354e-05, "loss": 1.6731, "num_input_tokens_seen": 53188688, "step": 25455 }, { "epoch": 4.153356717513663, "grad_norm": 7.59375, "learning_rate": 1.4606669233065026e-05, "loss": 3.3342, "num_input_tokens_seen": 53198672, "step": 25460 }, { "epoch": 4.154172444734481, "grad_norm": 5.1875, "learning_rate": 1.4597741170840914e-05, "loss": 1.5395, "num_input_tokens_seen": 53208640, "step": 25465 }, { "epoch": 4.154988171955298, "grad_norm": 3.328125, "learning_rate": 1.4588814712775853e-05, "loss": 2.2881, "num_input_tokens_seen": 53219648, "step": 25470 }, { "epoch": 4.1558038991761155, "grad_norm": 5.34375, "learning_rate": 1.4579889860246382e-05, "loss": 2.3771, "num_input_tokens_seen": 53231152, "step": 25475 }, { "epoch": 4.156619626396933, "grad_norm": 12.625, "learning_rate": 1.457096661462885e-05, "loss": 1.9645, "num_input_tokens_seen": 53242784, "step": 25480 }, { "epoch": 4.157435353617751, "grad_norm": 14.25, "learning_rate": 1.4562044977299322e-05, "loss": 3.9255, "num_input_tokens_seen": 53252704, "step": 25485 }, { "epoch": 4.158251080838568, "grad_norm": 6.5625, "learning_rate": 1.4553124949633623e-05, "loss": 2.4001, "num_input_tokens_seen": 53263424, "step": 25490 }, { "epoch": 4.159066808059385, "grad_norm": 5.84375, "learning_rate": 1.4544206533007354e-05, "loss": 2.8383, "num_input_tokens_seen": 53272240, "step": 25495 }, { "epoch": 4.159882535280202, "grad_norm": 6.375, "learning_rate": 1.4535289728795821e-05, "loss": 3.5275, "num_input_tokens_seen": 53283120, "step": 25500 }, { "epoch": 4.16069826250102, "grad_norm": 2.234375, "learning_rate": 1.4526374538374132e-05, "loss": 1.6568, "num_input_tokens_seen": 53294464, "step": 25505 }, { "epoch": 4.161513989721837, "grad_norm": 6.0625, "learning_rate": 1.4517460963117097e-05, "loss": 2.4369, "num_input_tokens_seen": 53304464, "step": 25510 }, { "epoch": 4.162329716942654, "grad_norm": 7.8125, "learning_rate": 1.4508549004399314e-05, "loss": 2.7863, "num_input_tokens_seen": 53314688, "step": 25515 }, { "epoch": 4.163145444163471, "grad_norm": 8.6875, "learning_rate": 1.449963866359513e-05, "loss": 1.7282, "num_input_tokens_seen": 53325152, "step": 25520 }, { "epoch": 4.1639611713842895, "grad_norm": 6.15625, "learning_rate": 1.4490729942078607e-05, "loss": 1.2701, "num_input_tokens_seen": 53336240, "step": 25525 }, { "epoch": 4.164776898605107, "grad_norm": 10.125, "learning_rate": 1.4481822841223608e-05, "loss": 3.1916, "num_input_tokens_seen": 53346800, "step": 25530 }, { "epoch": 4.165592625825924, "grad_norm": 4.53125, "learning_rate": 1.4472917362403704e-05, "loss": 2.2709, "num_input_tokens_seen": 53357168, "step": 25535 }, { "epoch": 4.166408353046741, "grad_norm": 8.875, "learning_rate": 1.4464013506992224e-05, "loss": 1.7058, "num_input_tokens_seen": 53367120, "step": 25540 }, { "epoch": 4.167224080267559, "grad_norm": 2.625, "learning_rate": 1.4455111276362277e-05, "loss": 3.6354, "num_input_tokens_seen": 53376864, "step": 25545 }, { "epoch": 4.168039807488376, "grad_norm": 5.84375, "learning_rate": 1.4446210671886676e-05, "loss": 2.0646, "num_input_tokens_seen": 53386192, "step": 25550 }, { "epoch": 4.168855534709193, "grad_norm": 3.625, "learning_rate": 1.4437311694938015e-05, "loss": 1.8337, "num_input_tokens_seen": 53397664, "step": 25555 }, { "epoch": 4.16967126193001, "grad_norm": 2.25, "learning_rate": 1.442841434688864e-05, "loss": 2.0724, "num_input_tokens_seen": 53408752, "step": 25560 }, { "epoch": 4.170486989150828, "grad_norm": 7.96875, "learning_rate": 1.4419518629110615e-05, "loss": 2.7139, "num_input_tokens_seen": 53420560, "step": 25565 }, { "epoch": 4.171302716371645, "grad_norm": 5.03125, "learning_rate": 1.4410624542975778e-05, "loss": 1.7932, "num_input_tokens_seen": 53429136, "step": 25570 }, { "epoch": 4.1721184435924625, "grad_norm": 15.25, "learning_rate": 1.4401732089855724e-05, "loss": 2.9515, "num_input_tokens_seen": 53440016, "step": 25575 }, { "epoch": 4.17293417081328, "grad_norm": 4.84375, "learning_rate": 1.4392841271121754e-05, "loss": 1.5418, "num_input_tokens_seen": 53450816, "step": 25580 }, { "epoch": 4.173749898034098, "grad_norm": 6.90625, "learning_rate": 1.438395208814497e-05, "loss": 2.2517, "num_input_tokens_seen": 53461024, "step": 25585 }, { "epoch": 4.174565625254915, "grad_norm": 7.28125, "learning_rate": 1.4375064542296174e-05, "loss": 2.0148, "num_input_tokens_seen": 53470912, "step": 25590 }, { "epoch": 4.175381352475732, "grad_norm": 6.75, "learning_rate": 1.4366178634945946e-05, "loss": 2.6814, "num_input_tokens_seen": 53482736, "step": 25595 }, { "epoch": 4.176197079696549, "grad_norm": 8.5625, "learning_rate": 1.4357294367464616e-05, "loss": 1.2321, "num_input_tokens_seen": 53493696, "step": 25600 }, { "epoch": 4.176197079696549, "eval_loss": 2.5427770614624023, "eval_runtime": 135.0388, "eval_samples_per_second": 20.179, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 53493696, "step": 25600 }, { "epoch": 4.177012806917367, "grad_norm": 11.4375, "learning_rate": 1.434841174122224e-05, "loss": 2.7373, "num_input_tokens_seen": 53503328, "step": 25605 }, { "epoch": 4.177828534138184, "grad_norm": 12.25, "learning_rate": 1.4339530757588615e-05, "loss": 2.5927, "num_input_tokens_seen": 53514032, "step": 25610 }, { "epoch": 4.178644261359001, "grad_norm": 3.234375, "learning_rate": 1.433065141793333e-05, "loss": 2.3291, "num_input_tokens_seen": 53524368, "step": 25615 }, { "epoch": 4.1794599885798185, "grad_norm": 4.65625, "learning_rate": 1.4321773723625665e-05, "loss": 3.2613, "num_input_tokens_seen": 53534976, "step": 25620 }, { "epoch": 4.1802757158006365, "grad_norm": 1.9140625, "learning_rate": 1.4312897676034693e-05, "loss": 2.1765, "num_input_tokens_seen": 53545472, "step": 25625 }, { "epoch": 4.181091443021454, "grad_norm": 8.1875, "learning_rate": 1.4304023276529188e-05, "loss": 2.1716, "num_input_tokens_seen": 53557344, "step": 25630 }, { "epoch": 4.181907170242271, "grad_norm": 7.96875, "learning_rate": 1.4295150526477712e-05, "loss": 2.7278, "num_input_tokens_seen": 53567248, "step": 25635 }, { "epoch": 4.182722897463089, "grad_norm": 5.5, "learning_rate": 1.4286279427248562e-05, "loss": 2.4444, "num_input_tokens_seen": 53576720, "step": 25640 }, { "epoch": 4.183538624683906, "grad_norm": 5.1875, "learning_rate": 1.4277409980209747e-05, "loss": 0.9651, "num_input_tokens_seen": 53586848, "step": 25645 }, { "epoch": 4.184354351904723, "grad_norm": 7.09375, "learning_rate": 1.4268542186729061e-05, "loss": 3.0174, "num_input_tokens_seen": 53596352, "step": 25650 }, { "epoch": 4.18517007912554, "grad_norm": 4.59375, "learning_rate": 1.4259676048174043e-05, "loss": 1.6685, "num_input_tokens_seen": 53606768, "step": 25655 }, { "epoch": 4.185985806346358, "grad_norm": 5.65625, "learning_rate": 1.4250811565911937e-05, "loss": 2.0507, "num_input_tokens_seen": 53616768, "step": 25660 }, { "epoch": 4.186801533567175, "grad_norm": 2.953125, "learning_rate": 1.4241948741309782e-05, "loss": 2.2424, "num_input_tokens_seen": 53626944, "step": 25665 }, { "epoch": 4.1876172607879925, "grad_norm": 9.9375, "learning_rate": 1.4233087575734317e-05, "loss": 3.4302, "num_input_tokens_seen": 53636016, "step": 25670 }, { "epoch": 4.18843298800881, "grad_norm": 10.875, "learning_rate": 1.422422807055206e-05, "loss": 2.0502, "num_input_tokens_seen": 53648112, "step": 25675 }, { "epoch": 4.189248715229628, "grad_norm": 7.28125, "learning_rate": 1.4215370227129243e-05, "loss": 1.939, "num_input_tokens_seen": 53658112, "step": 25680 }, { "epoch": 4.190064442450445, "grad_norm": 12.9375, "learning_rate": 1.4206514046831876e-05, "loss": 3.2173, "num_input_tokens_seen": 53669696, "step": 25685 }, { "epoch": 4.190880169671262, "grad_norm": 6.34375, "learning_rate": 1.419765953102567e-05, "loss": 2.4448, "num_input_tokens_seen": 53679856, "step": 25690 }, { "epoch": 4.191695896892079, "grad_norm": 7.21875, "learning_rate": 1.4188806681076125e-05, "loss": 2.607, "num_input_tokens_seen": 53690176, "step": 25695 }, { "epoch": 4.192511624112897, "grad_norm": 3.140625, "learning_rate": 1.4179955498348443e-05, "loss": 4.4754, "num_input_tokens_seen": 53699744, "step": 25700 }, { "epoch": 4.193327351333714, "grad_norm": 4.6875, "learning_rate": 1.4171105984207605e-05, "loss": 2.3131, "num_input_tokens_seen": 53710144, "step": 25705 }, { "epoch": 4.194143078554531, "grad_norm": 4.90625, "learning_rate": 1.4162258140018304e-05, "loss": 2.7778, "num_input_tokens_seen": 53721808, "step": 25710 }, { "epoch": 4.194958805775348, "grad_norm": 7.0, "learning_rate": 1.4153411967144986e-05, "loss": 3.0982, "num_input_tokens_seen": 53732848, "step": 25715 }, { "epoch": 4.195774532996166, "grad_norm": 8.4375, "learning_rate": 1.4144567466951864e-05, "loss": 2.6465, "num_input_tokens_seen": 53742944, "step": 25720 }, { "epoch": 4.196590260216984, "grad_norm": 5.125, "learning_rate": 1.4135724640802844e-05, "loss": 2.036, "num_input_tokens_seen": 53753152, "step": 25725 }, { "epoch": 4.197405987437801, "grad_norm": 9.5, "learning_rate": 1.4126883490061615e-05, "loss": 3.3645, "num_input_tokens_seen": 53764384, "step": 25730 }, { "epoch": 4.198221714658618, "grad_norm": 4.28125, "learning_rate": 1.4118044016091603e-05, "loss": 1.4475, "num_input_tokens_seen": 53775360, "step": 25735 }, { "epoch": 4.199037441879436, "grad_norm": 6.46875, "learning_rate": 1.410920622025594e-05, "loss": 2.3715, "num_input_tokens_seen": 53784400, "step": 25740 }, { "epoch": 4.199853169100253, "grad_norm": 6.28125, "learning_rate": 1.4100370103917554e-05, "loss": 2.3947, "num_input_tokens_seen": 53794048, "step": 25745 }, { "epoch": 4.20066889632107, "grad_norm": 2.84375, "learning_rate": 1.409153566843907e-05, "loss": 1.6669, "num_input_tokens_seen": 53804560, "step": 25750 }, { "epoch": 4.201484623541887, "grad_norm": 0.94140625, "learning_rate": 1.408270291518286e-05, "loss": 1.1062, "num_input_tokens_seen": 53815536, "step": 25755 }, { "epoch": 4.202300350762705, "grad_norm": 4.46875, "learning_rate": 1.407387184551107e-05, "loss": 2.0715, "num_input_tokens_seen": 53826400, "step": 25760 }, { "epoch": 4.203116077983522, "grad_norm": 6.1875, "learning_rate": 1.4065042460785532e-05, "loss": 2.4419, "num_input_tokens_seen": 53835808, "step": 25765 }, { "epoch": 4.2039318052043395, "grad_norm": 5.375, "learning_rate": 1.405621476236787e-05, "loss": 1.1933, "num_input_tokens_seen": 53844880, "step": 25770 }, { "epoch": 4.204747532425157, "grad_norm": 12.1875, "learning_rate": 1.4047388751619423e-05, "loss": 3.1226, "num_input_tokens_seen": 53856352, "step": 25775 }, { "epoch": 4.205563259645975, "grad_norm": 5.25, "learning_rate": 1.4038564429901264e-05, "loss": 1.8335, "num_input_tokens_seen": 53867232, "step": 25780 }, { "epoch": 4.206378986866792, "grad_norm": 11.6875, "learning_rate": 1.4029741798574227e-05, "loss": 2.8169, "num_input_tokens_seen": 53877248, "step": 25785 }, { "epoch": 4.207194714087609, "grad_norm": 7.25, "learning_rate": 1.402092085899886e-05, "loss": 3.2848, "num_input_tokens_seen": 53887328, "step": 25790 }, { "epoch": 4.208010441308426, "grad_norm": 8.875, "learning_rate": 1.4012101612535464e-05, "loss": 2.9211, "num_input_tokens_seen": 53897504, "step": 25795 }, { "epoch": 4.208826168529244, "grad_norm": 6.1875, "learning_rate": 1.4003284060544092e-05, "loss": 2.4438, "num_input_tokens_seen": 53907648, "step": 25800 }, { "epoch": 4.208826168529244, "eval_loss": 2.5512468814849854, "eval_runtime": 134.8314, "eval_samples_per_second": 20.21, "eval_steps_per_second": 10.109, "num_input_tokens_seen": 53907648, "step": 25800 }, { "epoch": 4.209641895750061, "grad_norm": 0.2294921875, "learning_rate": 1.3994468204384504e-05, "loss": 2.0562, "num_input_tokens_seen": 53917760, "step": 25805 }, { "epoch": 4.210457622970878, "grad_norm": 10.8125, "learning_rate": 1.398565404541622e-05, "loss": 2.6092, "num_input_tokens_seen": 53928512, "step": 25810 }, { "epoch": 4.211273350191696, "grad_norm": 5.96875, "learning_rate": 1.3976841584998513e-05, "loss": 1.9569, "num_input_tokens_seen": 53939424, "step": 25815 }, { "epoch": 4.2120890774125135, "grad_norm": 6.4375, "learning_rate": 1.3968030824490352e-05, "loss": 1.6437, "num_input_tokens_seen": 53949408, "step": 25820 }, { "epoch": 4.212904804633331, "grad_norm": 6.0625, "learning_rate": 1.3959221765250469e-05, "loss": 2.976, "num_input_tokens_seen": 53960928, "step": 25825 }, { "epoch": 4.213720531854148, "grad_norm": 2.375, "learning_rate": 1.3950414408637343e-05, "loss": 3.0501, "num_input_tokens_seen": 53970144, "step": 25830 }, { "epoch": 4.214536259074965, "grad_norm": 8.3125, "learning_rate": 1.3941608756009166e-05, "loss": 2.853, "num_input_tokens_seen": 53979808, "step": 25835 }, { "epoch": 4.215351986295783, "grad_norm": 8.9375, "learning_rate": 1.3932804808723898e-05, "loss": 2.5677, "num_input_tokens_seen": 53990992, "step": 25840 }, { "epoch": 4.2161677135166, "grad_norm": 3.375, "learning_rate": 1.3924002568139194e-05, "loss": 2.016, "num_input_tokens_seen": 54001200, "step": 25845 }, { "epoch": 4.216983440737417, "grad_norm": 4.53125, "learning_rate": 1.3915202035612485e-05, "loss": 1.4556, "num_input_tokens_seen": 54011120, "step": 25850 }, { "epoch": 4.217799167958235, "grad_norm": 6.9375, "learning_rate": 1.3906403212500935e-05, "loss": 1.6477, "num_input_tokens_seen": 54021888, "step": 25855 }, { "epoch": 4.218614895179052, "grad_norm": 8.0625, "learning_rate": 1.3897606100161409e-05, "loss": 2.5998, "num_input_tokens_seen": 54032800, "step": 25860 }, { "epoch": 4.219430622399869, "grad_norm": 4.46875, "learning_rate": 1.388881069995055e-05, "loss": 2.1501, "num_input_tokens_seen": 54044784, "step": 25865 }, { "epoch": 4.220246349620687, "grad_norm": 10.25, "learning_rate": 1.3880017013224708e-05, "loss": 3.1494, "num_input_tokens_seen": 54055664, "step": 25870 }, { "epoch": 4.221062076841505, "grad_norm": 11.5625, "learning_rate": 1.3871225041339984e-05, "loss": 1.8781, "num_input_tokens_seen": 54066416, "step": 25875 }, { "epoch": 4.221877804062322, "grad_norm": 9.125, "learning_rate": 1.386243478565222e-05, "loss": 3.6046, "num_input_tokens_seen": 54077792, "step": 25880 }, { "epoch": 4.222693531283139, "grad_norm": 7.65625, "learning_rate": 1.3853646247516966e-05, "loss": 1.0474, "num_input_tokens_seen": 54088192, "step": 25885 }, { "epoch": 4.223509258503956, "grad_norm": 5.1875, "learning_rate": 1.3844859428289545e-05, "loss": 2.4658, "num_input_tokens_seen": 54098160, "step": 25890 }, { "epoch": 4.224324985724774, "grad_norm": 11.875, "learning_rate": 1.3836074329324984e-05, "loss": 4.0572, "num_input_tokens_seen": 54108704, "step": 25895 }, { "epoch": 4.225140712945591, "grad_norm": 5.78125, "learning_rate": 1.3827290951978044e-05, "loss": 2.9406, "num_input_tokens_seen": 54120736, "step": 25900 }, { "epoch": 4.225956440166408, "grad_norm": 7.8125, "learning_rate": 1.381850929760326e-05, "loss": 2.3327, "num_input_tokens_seen": 54131696, "step": 25905 }, { "epoch": 4.226772167387225, "grad_norm": 9.0625, "learning_rate": 1.3809729367554842e-05, "loss": 2.7324, "num_input_tokens_seen": 54141552, "step": 25910 }, { "epoch": 4.227587894608043, "grad_norm": 5.375, "learning_rate": 1.3800951163186784e-05, "loss": 2.4908, "num_input_tokens_seen": 54152112, "step": 25915 }, { "epoch": 4.2284036218288605, "grad_norm": 9.3125, "learning_rate": 1.3792174685852801e-05, "loss": 1.7665, "num_input_tokens_seen": 54162400, "step": 25920 }, { "epoch": 4.229219349049678, "grad_norm": 8.1875, "learning_rate": 1.378339993690632e-05, "loss": 1.7352, "num_input_tokens_seen": 54173472, "step": 25925 }, { "epoch": 4.230035076270495, "grad_norm": 8.9375, "learning_rate": 1.3774626917700523e-05, "loss": 2.8969, "num_input_tokens_seen": 54185312, "step": 25930 }, { "epoch": 4.230850803491313, "grad_norm": 7.40625, "learning_rate": 1.3765855629588334e-05, "loss": 2.0314, "num_input_tokens_seen": 54196304, "step": 25935 }, { "epoch": 4.23166653071213, "grad_norm": 6.875, "learning_rate": 1.3757086073922374e-05, "loss": 2.1068, "num_input_tokens_seen": 54206400, "step": 25940 }, { "epoch": 4.232482257932947, "grad_norm": 8.6875, "learning_rate": 1.3748318252055038e-05, "loss": 1.3149, "num_input_tokens_seen": 54215760, "step": 25945 }, { "epoch": 4.233297985153764, "grad_norm": 7.90625, "learning_rate": 1.3739552165338416e-05, "loss": 2.1171, "num_input_tokens_seen": 54226768, "step": 25950 }, { "epoch": 4.234113712374582, "grad_norm": 5.40625, "learning_rate": 1.3730787815124354e-05, "loss": 2.33, "num_input_tokens_seen": 54237120, "step": 25955 }, { "epoch": 4.234929439595399, "grad_norm": 2.34375, "learning_rate": 1.3722025202764443e-05, "loss": 3.0722, "num_input_tokens_seen": 54247664, "step": 25960 }, { "epoch": 4.2357451668162165, "grad_norm": 9.5, "learning_rate": 1.371326432960997e-05, "loss": 1.6266, "num_input_tokens_seen": 54257248, "step": 25965 }, { "epoch": 4.236560894037034, "grad_norm": 10.0625, "learning_rate": 1.3704505197011969e-05, "loss": 2.0549, "num_input_tokens_seen": 54265808, "step": 25970 }, { "epoch": 4.237376621257852, "grad_norm": 2.296875, "learning_rate": 1.3695747806321224e-05, "loss": 2.7867, "num_input_tokens_seen": 54277552, "step": 25975 }, { "epoch": 4.238192348478669, "grad_norm": 3.484375, "learning_rate": 1.3686992158888212e-05, "loss": 1.8499, "num_input_tokens_seen": 54288288, "step": 25980 }, { "epoch": 4.239008075699486, "grad_norm": 9.8125, "learning_rate": 1.367823825606319e-05, "loss": 3.647, "num_input_tokens_seen": 54297600, "step": 25985 }, { "epoch": 4.239823802920303, "grad_norm": 7.75, "learning_rate": 1.36694860991961e-05, "loss": 1.6617, "num_input_tokens_seen": 54307440, "step": 25990 }, { "epoch": 4.240639530141121, "grad_norm": 2.96875, "learning_rate": 1.3660735689636636e-05, "loss": 2.4633, "num_input_tokens_seen": 54317024, "step": 25995 }, { "epoch": 4.241455257361938, "grad_norm": 9.4375, "learning_rate": 1.365198702873424e-05, "loss": 1.8597, "num_input_tokens_seen": 54327568, "step": 26000 }, { "epoch": 4.241455257361938, "eval_loss": 2.5442895889282227, "eval_runtime": 134.7854, "eval_samples_per_second": 20.217, "eval_steps_per_second": 10.112, "num_input_tokens_seen": 54327568, "step": 26000 }, { "epoch": 4.242270984582755, "grad_norm": 8.0625, "learning_rate": 1.364324011783804e-05, "loss": 2.2468, "num_input_tokens_seen": 54337136, "step": 26005 }, { "epoch": 4.243086711803572, "grad_norm": 9.3125, "learning_rate": 1.3634494958296934e-05, "loss": 1.6152, "num_input_tokens_seen": 54348752, "step": 26010 }, { "epoch": 4.2439024390243905, "grad_norm": 13.0625, "learning_rate": 1.3625751551459542e-05, "loss": 2.6483, "num_input_tokens_seen": 54359840, "step": 26015 }, { "epoch": 4.244718166245208, "grad_norm": 7.28125, "learning_rate": 1.3617009898674188e-05, "loss": 2.9703, "num_input_tokens_seen": 54370656, "step": 26020 }, { "epoch": 4.245533893466025, "grad_norm": 7.71875, "learning_rate": 1.3608270001288967e-05, "loss": 2.9144, "num_input_tokens_seen": 54380768, "step": 26025 }, { "epoch": 4.246349620686843, "grad_norm": 6.25, "learning_rate": 1.359953186065166e-05, "loss": 3.5743, "num_input_tokens_seen": 54390688, "step": 26030 }, { "epoch": 4.24716534790766, "grad_norm": 1.84375, "learning_rate": 1.3590795478109814e-05, "loss": 1.4763, "num_input_tokens_seen": 54400368, "step": 26035 }, { "epoch": 4.247981075128477, "grad_norm": 5.375, "learning_rate": 1.3582060855010675e-05, "loss": 2.2997, "num_input_tokens_seen": 54411168, "step": 26040 }, { "epoch": 4.248796802349294, "grad_norm": 2.0625, "learning_rate": 1.3573327992701245e-05, "loss": 2.3867, "num_input_tokens_seen": 54421344, "step": 26045 }, { "epoch": 4.249612529570112, "grad_norm": 10.125, "learning_rate": 1.356459689252823e-05, "loss": 2.6096, "num_input_tokens_seen": 54433344, "step": 26050 }, { "epoch": 4.250428256790929, "grad_norm": 6.25, "learning_rate": 1.3555867555838087e-05, "loss": 1.7033, "num_input_tokens_seen": 54444256, "step": 26055 }, { "epoch": 4.251243984011746, "grad_norm": 9.375, "learning_rate": 1.3547139983976975e-05, "loss": 1.2953, "num_input_tokens_seen": 54454384, "step": 26060 }, { "epoch": 4.2520597112325635, "grad_norm": 0.298828125, "learning_rate": 1.3538414178290815e-05, "loss": 1.5873, "num_input_tokens_seen": 54466032, "step": 26065 }, { "epoch": 4.252875438453382, "grad_norm": 4.6875, "learning_rate": 1.3529690140125209e-05, "loss": 1.652, "num_input_tokens_seen": 54476704, "step": 26070 }, { "epoch": 4.253691165674199, "grad_norm": 6.5625, "learning_rate": 1.352096787082553e-05, "loss": 4.1459, "num_input_tokens_seen": 54488320, "step": 26075 }, { "epoch": 4.254506892895016, "grad_norm": 6.71875, "learning_rate": 1.3512247371736871e-05, "loss": 2.4114, "num_input_tokens_seen": 54497456, "step": 26080 }, { "epoch": 4.255322620115833, "grad_norm": 6.625, "learning_rate": 1.3503528644204022e-05, "loss": 2.1498, "num_input_tokens_seen": 54508944, "step": 26085 }, { "epoch": 4.256138347336651, "grad_norm": 10.375, "learning_rate": 1.349481168957153e-05, "loss": 2.4941, "num_input_tokens_seen": 54518960, "step": 26090 }, { "epoch": 4.256954074557468, "grad_norm": 7.1875, "learning_rate": 1.3486096509183665e-05, "loss": 1.5968, "num_input_tokens_seen": 54527920, "step": 26095 }, { "epoch": 4.257769801778285, "grad_norm": 7.4375, "learning_rate": 1.3477383104384406e-05, "loss": 2.293, "num_input_tokens_seen": 54539904, "step": 26100 }, { "epoch": 4.258585528999102, "grad_norm": 4.09375, "learning_rate": 1.3468671476517481e-05, "loss": 1.5148, "num_input_tokens_seen": 54550416, "step": 26105 }, { "epoch": 4.25940125621992, "grad_norm": 9.9375, "learning_rate": 1.3459961626926326e-05, "loss": 2.6581, "num_input_tokens_seen": 54560640, "step": 26110 }, { "epoch": 4.2602169834407375, "grad_norm": 15.25, "learning_rate": 1.3451253556954101e-05, "loss": 3.6614, "num_input_tokens_seen": 54570752, "step": 26115 }, { "epoch": 4.261032710661555, "grad_norm": 9.125, "learning_rate": 1.3442547267943717e-05, "loss": 1.876, "num_input_tokens_seen": 54580224, "step": 26120 }, { "epoch": 4.261848437882372, "grad_norm": 7.125, "learning_rate": 1.3433842761237774e-05, "loss": 3.3499, "num_input_tokens_seen": 54589488, "step": 26125 }, { "epoch": 4.26266416510319, "grad_norm": 10.0625, "learning_rate": 1.3425140038178639e-05, "loss": 2.2879, "num_input_tokens_seen": 54599072, "step": 26130 }, { "epoch": 4.263479892324007, "grad_norm": 6.90625, "learning_rate": 1.3416439100108358e-05, "loss": 3.4869, "num_input_tokens_seen": 54609312, "step": 26135 }, { "epoch": 4.264295619544824, "grad_norm": 10.6875, "learning_rate": 1.3407739948368734e-05, "loss": 3.3091, "num_input_tokens_seen": 54619472, "step": 26140 }, { "epoch": 4.265111346765641, "grad_norm": 5.96875, "learning_rate": 1.3399042584301298e-05, "loss": 2.0133, "num_input_tokens_seen": 54627840, "step": 26145 }, { "epoch": 4.265927073986459, "grad_norm": 3.203125, "learning_rate": 1.3390347009247272e-05, "loss": 2.734, "num_input_tokens_seen": 54638848, "step": 26150 }, { "epoch": 4.266742801207276, "grad_norm": 2.203125, "learning_rate": 1.3381653224547635e-05, "loss": 1.3911, "num_input_tokens_seen": 54649680, "step": 26155 }, { "epoch": 4.2675585284280935, "grad_norm": 8.1875, "learning_rate": 1.3372961231543086e-05, "loss": 2.3937, "num_input_tokens_seen": 54658608, "step": 26160 }, { "epoch": 4.268374255648911, "grad_norm": 6.0, "learning_rate": 1.3364271031574016e-05, "loss": 2.2395, "num_input_tokens_seen": 54670272, "step": 26165 }, { "epoch": 4.269189982869729, "grad_norm": 8.6875, "learning_rate": 1.335558262598059e-05, "loss": 1.8139, "num_input_tokens_seen": 54682048, "step": 26170 }, { "epoch": 4.270005710090546, "grad_norm": 7.6875, "learning_rate": 1.3346896016102645e-05, "loss": 1.4124, "num_input_tokens_seen": 54691856, "step": 26175 }, { "epoch": 4.270821437311363, "grad_norm": 3.875, "learning_rate": 1.3338211203279788e-05, "loss": 2.2178, "num_input_tokens_seen": 54701552, "step": 26180 }, { "epoch": 4.27163716453218, "grad_norm": 5.84375, "learning_rate": 1.3329528188851303e-05, "loss": 3.1556, "num_input_tokens_seen": 54712528, "step": 26185 }, { "epoch": 4.272452891752998, "grad_norm": 5.125, "learning_rate": 1.3320846974156242e-05, "loss": 2.9252, "num_input_tokens_seen": 54723360, "step": 26190 }, { "epoch": 4.273268618973815, "grad_norm": 5.84375, "learning_rate": 1.3312167560533337e-05, "loss": 1.7974, "num_input_tokens_seen": 54733856, "step": 26195 }, { "epoch": 4.274084346194632, "grad_norm": 5.59375, "learning_rate": 1.3303489949321082e-05, "loss": 2.9831, "num_input_tokens_seen": 54743840, "step": 26200 }, { "epoch": 4.274084346194632, "eval_loss": 2.538890838623047, "eval_runtime": 134.9681, "eval_samples_per_second": 20.19, "eval_steps_per_second": 10.099, "num_input_tokens_seen": 54743840, "step": 26200 }, { "epoch": 4.27490007341545, "grad_norm": 4.03125, "learning_rate": 1.3294814141857653e-05, "loss": 2.2755, "num_input_tokens_seen": 54753920, "step": 26205 }, { "epoch": 4.275715800636267, "grad_norm": 5.4375, "learning_rate": 1.3286140139480992e-05, "loss": 2.7358, "num_input_tokens_seen": 54763408, "step": 26210 }, { "epoch": 4.276531527857085, "grad_norm": 5.28125, "learning_rate": 1.3277467943528719e-05, "loss": 2.5934, "num_input_tokens_seen": 54774096, "step": 26215 }, { "epoch": 4.277347255077902, "grad_norm": 8.125, "learning_rate": 1.3268797555338203e-05, "loss": 1.948, "num_input_tokens_seen": 54784560, "step": 26220 }, { "epoch": 4.27816298229872, "grad_norm": 11.9375, "learning_rate": 1.3260128976246533e-05, "loss": 2.1775, "num_input_tokens_seen": 54795056, "step": 26225 }, { "epoch": 4.278978709519537, "grad_norm": 9.5, "learning_rate": 1.32514622075905e-05, "loss": 3.1502, "num_input_tokens_seen": 54804816, "step": 26230 }, { "epoch": 4.279794436740354, "grad_norm": 9.0625, "learning_rate": 1.3242797250706638e-05, "loss": 3.2743, "num_input_tokens_seen": 54815184, "step": 26235 }, { "epoch": 4.280610163961171, "grad_norm": 8.8125, "learning_rate": 1.3234134106931195e-05, "loss": 1.6456, "num_input_tokens_seen": 54826192, "step": 26240 }, { "epoch": 4.281425891181989, "grad_norm": 11.0625, "learning_rate": 1.322547277760013e-05, "loss": 2.8276, "num_input_tokens_seen": 54837392, "step": 26245 }, { "epoch": 4.282241618402806, "grad_norm": 6.5, "learning_rate": 1.3216813264049132e-05, "loss": 1.6048, "num_input_tokens_seen": 54847600, "step": 26250 }, { "epoch": 4.283057345623623, "grad_norm": 3.78125, "learning_rate": 1.32081555676136e-05, "loss": 1.2416, "num_input_tokens_seen": 54858128, "step": 26255 }, { "epoch": 4.2838730728444405, "grad_norm": 10.1875, "learning_rate": 1.3199499689628674e-05, "loss": 3.753, "num_input_tokens_seen": 54868896, "step": 26260 }, { "epoch": 4.2846888000652585, "grad_norm": 8.3125, "learning_rate": 1.3190845631429192e-05, "loss": 2.3611, "num_input_tokens_seen": 54879280, "step": 26265 }, { "epoch": 4.285504527286076, "grad_norm": 7.5, "learning_rate": 1.3182193394349704e-05, "loss": 2.7388, "num_input_tokens_seen": 54890368, "step": 26270 }, { "epoch": 4.286320254506893, "grad_norm": 5.1875, "learning_rate": 1.3173542979724507e-05, "loss": 3.2111, "num_input_tokens_seen": 54900016, "step": 26275 }, { "epoch": 4.28713598172771, "grad_norm": 6.3125, "learning_rate": 1.3164894388887617e-05, "loss": 2.3468, "num_input_tokens_seen": 54910112, "step": 26280 }, { "epoch": 4.287951708948528, "grad_norm": 2.65625, "learning_rate": 1.3156247623172727e-05, "loss": 1.3178, "num_input_tokens_seen": 54921200, "step": 26285 }, { "epoch": 4.288767436169345, "grad_norm": 5.53125, "learning_rate": 1.3147602683913302e-05, "loss": 2.2775, "num_input_tokens_seen": 54932144, "step": 26290 }, { "epoch": 4.289583163390162, "grad_norm": 8.5, "learning_rate": 1.3138959572442481e-05, "loss": 3.49, "num_input_tokens_seen": 54941744, "step": 26295 }, { "epoch": 4.290398890610979, "grad_norm": 4.3125, "learning_rate": 1.3130318290093146e-05, "loss": 2.0101, "num_input_tokens_seen": 54950800, "step": 26300 }, { "epoch": 4.291214617831797, "grad_norm": 7.59375, "learning_rate": 1.3121678838197909e-05, "loss": 2.4744, "num_input_tokens_seen": 54961936, "step": 26305 }, { "epoch": 4.2920303450526145, "grad_norm": 4.09375, "learning_rate": 1.3113041218089056e-05, "loss": 1.1062, "num_input_tokens_seen": 54972512, "step": 26310 }, { "epoch": 4.292846072273432, "grad_norm": 4.21875, "learning_rate": 1.3104405431098626e-05, "loss": 2.0107, "num_input_tokens_seen": 54983040, "step": 26315 }, { "epoch": 4.293661799494249, "grad_norm": 4.09375, "learning_rate": 1.3095771478558377e-05, "loss": 2.337, "num_input_tokens_seen": 54993328, "step": 26320 }, { "epoch": 4.294477526715067, "grad_norm": 0.169921875, "learning_rate": 1.3087139361799766e-05, "loss": 2.1072, "num_input_tokens_seen": 55003680, "step": 26325 }, { "epoch": 4.295293253935884, "grad_norm": 6.46875, "learning_rate": 1.3078509082153964e-05, "loss": 1.4382, "num_input_tokens_seen": 55014032, "step": 26330 }, { "epoch": 4.296108981156701, "grad_norm": 7.0625, "learning_rate": 1.3069880640951885e-05, "loss": 1.3562, "num_input_tokens_seen": 55023472, "step": 26335 }, { "epoch": 4.296924708377518, "grad_norm": 2.359375, "learning_rate": 1.3061254039524123e-05, "loss": 2.9152, "num_input_tokens_seen": 55033664, "step": 26340 }, { "epoch": 4.297740435598336, "grad_norm": 5.15625, "learning_rate": 1.3052629279201028e-05, "loss": 1.9892, "num_input_tokens_seen": 55044480, "step": 26345 }, { "epoch": 4.298556162819153, "grad_norm": 3.984375, "learning_rate": 1.3044006361312633e-05, "loss": 2.2614, "num_input_tokens_seen": 55055472, "step": 26350 }, { "epoch": 4.2993718900399704, "grad_norm": 7.46875, "learning_rate": 1.30353852871887e-05, "loss": 3.2531, "num_input_tokens_seen": 55064512, "step": 26355 }, { "epoch": 4.300187617260788, "grad_norm": 8.625, "learning_rate": 1.302676605815873e-05, "loss": 2.5758, "num_input_tokens_seen": 55075408, "step": 26360 }, { "epoch": 4.301003344481606, "grad_norm": 4.3125, "learning_rate": 1.3018148675551884e-05, "loss": 3.4504, "num_input_tokens_seen": 55085760, "step": 26365 }, { "epoch": 4.301819071702423, "grad_norm": 8.1875, "learning_rate": 1.3009533140697094e-05, "loss": 2.8685, "num_input_tokens_seen": 55095936, "step": 26370 }, { "epoch": 4.30263479892324, "grad_norm": 6.5, "learning_rate": 1.3000919454922966e-05, "loss": 2.498, "num_input_tokens_seen": 55106064, "step": 26375 }, { "epoch": 4.303450526144058, "grad_norm": 8.0, "learning_rate": 1.299230761955785e-05, "loss": 2.0844, "num_input_tokens_seen": 55118080, "step": 26380 }, { "epoch": 4.304266253364875, "grad_norm": 4.5, "learning_rate": 1.2983697635929807e-05, "loss": 2.586, "num_input_tokens_seen": 55128464, "step": 26385 }, { "epoch": 4.305081980585692, "grad_norm": 14.4375, "learning_rate": 1.2975089505366584e-05, "loss": 3.7998, "num_input_tokens_seen": 55137632, "step": 26390 }, { "epoch": 4.305897707806509, "grad_norm": 2.109375, "learning_rate": 1.2966483229195683e-05, "loss": 2.7398, "num_input_tokens_seen": 55148192, "step": 26395 }, { "epoch": 4.306713435027326, "grad_norm": 6.34375, "learning_rate": 1.2957878808744283e-05, "loss": 2.0809, "num_input_tokens_seen": 55158912, "step": 26400 }, { "epoch": 4.306713435027326, "eval_loss": 2.5430989265441895, "eval_runtime": 134.9563, "eval_samples_per_second": 20.192, "eval_steps_per_second": 10.1, "num_input_tokens_seen": 55158912, "step": 26400 }, { "epoch": 4.307529162248144, "grad_norm": 8.8125, "learning_rate": 1.294927624533931e-05, "loss": 2.1283, "num_input_tokens_seen": 55169312, "step": 26405 }, { "epoch": 4.3083448894689615, "grad_norm": 7.53125, "learning_rate": 1.2940675540307378e-05, "loss": 2.6489, "num_input_tokens_seen": 55180208, "step": 26410 }, { "epoch": 4.309160616689779, "grad_norm": 8.375, "learning_rate": 1.2932076694974814e-05, "loss": 3.074, "num_input_tokens_seen": 55190016, "step": 26415 }, { "epoch": 4.309976343910597, "grad_norm": 9.9375, "learning_rate": 1.2923479710667682e-05, "loss": 2.5658, "num_input_tokens_seen": 55199984, "step": 26420 }, { "epoch": 4.310792071131414, "grad_norm": 4.25, "learning_rate": 1.2914884588711751e-05, "loss": 2.3926, "num_input_tokens_seen": 55209968, "step": 26425 }, { "epoch": 4.311607798352231, "grad_norm": 7.1875, "learning_rate": 1.2906291330432475e-05, "loss": 2.2025, "num_input_tokens_seen": 55220064, "step": 26430 }, { "epoch": 4.312423525573048, "grad_norm": 5.0625, "learning_rate": 1.2897699937155055e-05, "loss": 2.3129, "num_input_tokens_seen": 55231184, "step": 26435 }, { "epoch": 4.313239252793866, "grad_norm": 4.65625, "learning_rate": 1.2889110410204403e-05, "loss": 2.3949, "num_input_tokens_seen": 55241264, "step": 26440 }, { "epoch": 4.314054980014683, "grad_norm": 7.28125, "learning_rate": 1.2880522750905111e-05, "loss": 2.4862, "num_input_tokens_seen": 55252336, "step": 26445 }, { "epoch": 4.3148707072355, "grad_norm": 2.53125, "learning_rate": 1.2871936960581523e-05, "loss": 2.3529, "num_input_tokens_seen": 55262736, "step": 26450 }, { "epoch": 4.3156864344563175, "grad_norm": 7.5625, "learning_rate": 1.2863353040557658e-05, "loss": 3.0663, "num_input_tokens_seen": 55272512, "step": 26455 }, { "epoch": 4.3165021616771355, "grad_norm": 7.875, "learning_rate": 1.2854770992157273e-05, "loss": 2.7455, "num_input_tokens_seen": 55283200, "step": 26460 }, { "epoch": 4.317317888897953, "grad_norm": 7.875, "learning_rate": 1.2846190816703835e-05, "loss": 3.5423, "num_input_tokens_seen": 55293536, "step": 26465 }, { "epoch": 4.31813361611877, "grad_norm": 3.625, "learning_rate": 1.2837612515520498e-05, "loss": 1.0637, "num_input_tokens_seen": 55303952, "step": 26470 }, { "epoch": 4.318949343339587, "grad_norm": 3.984375, "learning_rate": 1.2829036089930163e-05, "loss": 1.6944, "num_input_tokens_seen": 55315136, "step": 26475 }, { "epoch": 4.319765070560405, "grad_norm": 5.8125, "learning_rate": 1.2820461541255412e-05, "loss": 2.3862, "num_input_tokens_seen": 55325264, "step": 26480 }, { "epoch": 4.320580797781222, "grad_norm": 3.671875, "learning_rate": 1.2811888870818543e-05, "loss": 1.4603, "num_input_tokens_seen": 55335040, "step": 26485 }, { "epoch": 4.321396525002039, "grad_norm": 1.71875, "learning_rate": 1.2803318079941581e-05, "loss": 1.3448, "num_input_tokens_seen": 55345648, "step": 26490 }, { "epoch": 4.322212252222856, "grad_norm": 7.34375, "learning_rate": 1.2794749169946235e-05, "loss": 2.214, "num_input_tokens_seen": 55356592, "step": 26495 }, { "epoch": 4.323027979443674, "grad_norm": 12.1875, "learning_rate": 1.2786182142153952e-05, "loss": 3.4936, "num_input_tokens_seen": 55368272, "step": 26500 }, { "epoch": 4.3238437066644915, "grad_norm": 10.9375, "learning_rate": 1.2777616997885878e-05, "loss": 2.3098, "num_input_tokens_seen": 55379008, "step": 26505 }, { "epoch": 4.324659433885309, "grad_norm": 7.4375, "learning_rate": 1.2769053738462847e-05, "loss": 1.1381, "num_input_tokens_seen": 55389216, "step": 26510 }, { "epoch": 4.325475161106126, "grad_norm": 5.65625, "learning_rate": 1.2760492365205434e-05, "loss": 1.8611, "num_input_tokens_seen": 55401024, "step": 26515 }, { "epoch": 4.326290888326944, "grad_norm": 8.875, "learning_rate": 1.2751932879433919e-05, "loss": 2.5534, "num_input_tokens_seen": 55410432, "step": 26520 }, { "epoch": 4.327106615547761, "grad_norm": 8.75, "learning_rate": 1.2743375282468267e-05, "loss": 3.6348, "num_input_tokens_seen": 55420480, "step": 26525 }, { "epoch": 4.327922342768578, "grad_norm": 7.5625, "learning_rate": 1.2734819575628182e-05, "loss": 2.3713, "num_input_tokens_seen": 55429792, "step": 26530 }, { "epoch": 4.328738069989395, "grad_norm": 11.125, "learning_rate": 1.2726265760233039e-05, "loss": 2.7636, "num_input_tokens_seen": 55439600, "step": 26535 }, { "epoch": 4.329553797210213, "grad_norm": 9.1875, "learning_rate": 1.271771383760197e-05, "loss": 1.8167, "num_input_tokens_seen": 55450320, "step": 26540 }, { "epoch": 4.33036952443103, "grad_norm": 10.0, "learning_rate": 1.2709163809053764e-05, "loss": 1.9796, "num_input_tokens_seen": 55462160, "step": 26545 }, { "epoch": 4.331185251651847, "grad_norm": 3.6875, "learning_rate": 1.2700615675906963e-05, "loss": 1.4681, "num_input_tokens_seen": 55472288, "step": 26550 }, { "epoch": 4.332000978872665, "grad_norm": 9.9375, "learning_rate": 1.269206943947978e-05, "loss": 2.764, "num_input_tokens_seen": 55482848, "step": 26555 }, { "epoch": 4.332816706093483, "grad_norm": 7.4375, "learning_rate": 1.2683525101090177e-05, "loss": 2.2095, "num_input_tokens_seen": 55492480, "step": 26560 }, { "epoch": 4.3336324333143, "grad_norm": 9.375, "learning_rate": 1.2674982662055765e-05, "loss": 1.9035, "num_input_tokens_seen": 55503328, "step": 26565 }, { "epoch": 4.334448160535117, "grad_norm": 7.875, "learning_rate": 1.2666442123693922e-05, "loss": 1.9483, "num_input_tokens_seen": 55513392, "step": 26570 }, { "epoch": 4.335263887755934, "grad_norm": 4.0, "learning_rate": 1.265790348732169e-05, "loss": 3.4337, "num_input_tokens_seen": 55523808, "step": 26575 }, { "epoch": 4.336079614976752, "grad_norm": 10.125, "learning_rate": 1.264936675425584e-05, "loss": 3.2104, "num_input_tokens_seen": 55534256, "step": 26580 }, { "epoch": 4.336895342197569, "grad_norm": 11.75, "learning_rate": 1.2640831925812852e-05, "loss": 2.6843, "num_input_tokens_seen": 55545360, "step": 26585 }, { "epoch": 4.337711069418386, "grad_norm": 6.46875, "learning_rate": 1.263229900330889e-05, "loss": 0.9418, "num_input_tokens_seen": 55554896, "step": 26590 }, { "epoch": 4.338526796639204, "grad_norm": 6.53125, "learning_rate": 1.2623767988059843e-05, "loss": 1.2915, "num_input_tokens_seen": 55565904, "step": 26595 }, { "epoch": 4.339342523860021, "grad_norm": 8.5, "learning_rate": 1.2615238881381309e-05, "loss": 2.8693, "num_input_tokens_seen": 55575232, "step": 26600 }, { "epoch": 4.339342523860021, "eval_loss": 2.5383238792419434, "eval_runtime": 134.8546, "eval_samples_per_second": 20.207, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 55575232, "step": 26600 }, { "epoch": 4.3401582510808385, "grad_norm": 6.15625, "learning_rate": 1.2606711684588568e-05, "loss": 2.6183, "num_input_tokens_seen": 55586864, "step": 26605 }, { "epoch": 4.340973978301656, "grad_norm": 7.1875, "learning_rate": 1.2598186398996636e-05, "loss": 1.1416, "num_input_tokens_seen": 55597040, "step": 26610 }, { "epoch": 4.341789705522474, "grad_norm": 6.4375, "learning_rate": 1.2589663025920207e-05, "loss": 2.8979, "num_input_tokens_seen": 55607040, "step": 26615 }, { "epoch": 4.342605432743291, "grad_norm": 3.03125, "learning_rate": 1.2581141566673705e-05, "loss": 2.6637, "num_input_tokens_seen": 55617872, "step": 26620 }, { "epoch": 4.343421159964108, "grad_norm": 7.59375, "learning_rate": 1.257262202257124e-05, "loss": 2.0756, "num_input_tokens_seen": 55629680, "step": 26625 }, { "epoch": 4.344236887184925, "grad_norm": 11.4375, "learning_rate": 1.2564104394926618e-05, "loss": 2.9739, "num_input_tokens_seen": 55641296, "step": 26630 }, { "epoch": 4.345052614405743, "grad_norm": 8.125, "learning_rate": 1.2555588685053383e-05, "loss": 1.4902, "num_input_tokens_seen": 55652576, "step": 26635 }, { "epoch": 4.34586834162656, "grad_norm": 7.84375, "learning_rate": 1.2547074894264762e-05, "loss": 1.9406, "num_input_tokens_seen": 55662992, "step": 26640 }, { "epoch": 4.346684068847377, "grad_norm": 9.5, "learning_rate": 1.2538563023873679e-05, "loss": 1.5873, "num_input_tokens_seen": 55673120, "step": 26645 }, { "epoch": 4.3474997960681945, "grad_norm": 10.0, "learning_rate": 1.2530053075192789e-05, "loss": 2.4964, "num_input_tokens_seen": 55683552, "step": 26650 }, { "epoch": 4.3483155232890125, "grad_norm": 12.0625, "learning_rate": 1.252154504953441e-05, "loss": 1.6233, "num_input_tokens_seen": 55695008, "step": 26655 }, { "epoch": 4.34913125050983, "grad_norm": 2.984375, "learning_rate": 1.25130389482106e-05, "loss": 2.1137, "num_input_tokens_seen": 55705360, "step": 26660 }, { "epoch": 4.349946977730647, "grad_norm": 7.65625, "learning_rate": 1.2504534772533116e-05, "loss": 3.5857, "num_input_tokens_seen": 55715680, "step": 26665 }, { "epoch": 4.350762704951464, "grad_norm": 2.203125, "learning_rate": 1.2496032523813387e-05, "loss": 3.2373, "num_input_tokens_seen": 55726368, "step": 26670 }, { "epoch": 4.351578432172282, "grad_norm": 6.90625, "learning_rate": 1.2487532203362576e-05, "loss": 2.1156, "num_input_tokens_seen": 55736416, "step": 26675 }, { "epoch": 4.352394159393099, "grad_norm": 10.9375, "learning_rate": 1.247903381249155e-05, "loss": 3.6187, "num_input_tokens_seen": 55747552, "step": 26680 }, { "epoch": 4.353209886613916, "grad_norm": 7.5625, "learning_rate": 1.2470537352510853e-05, "loss": 2.6886, "num_input_tokens_seen": 55758752, "step": 26685 }, { "epoch": 4.354025613834733, "grad_norm": 5.34375, "learning_rate": 1.2462042824730758e-05, "loss": 2.4493, "num_input_tokens_seen": 55769520, "step": 26690 }, { "epoch": 4.354841341055551, "grad_norm": 6.625, "learning_rate": 1.245355023046122e-05, "loss": 1.459, "num_input_tokens_seen": 55779296, "step": 26695 }, { "epoch": 4.3556570682763684, "grad_norm": 1.1875, "learning_rate": 1.2445059571011896e-05, "loss": 1.3119, "num_input_tokens_seen": 55790112, "step": 26700 }, { "epoch": 4.356472795497186, "grad_norm": 4.6875, "learning_rate": 1.2436570847692173e-05, "loss": 3.4496, "num_input_tokens_seen": 55800848, "step": 26705 }, { "epoch": 4.357288522718003, "grad_norm": 0.142578125, "learning_rate": 1.2428084061811096e-05, "loss": 1.8793, "num_input_tokens_seen": 55810736, "step": 26710 }, { "epoch": 4.358104249938821, "grad_norm": 7.0, "learning_rate": 1.2419599214677447e-05, "loss": 2.6779, "num_input_tokens_seen": 55821168, "step": 26715 }, { "epoch": 4.358919977159638, "grad_norm": 3.0625, "learning_rate": 1.2411116307599702e-05, "loss": 2.681, "num_input_tokens_seen": 55831744, "step": 26720 }, { "epoch": 4.359735704380455, "grad_norm": 14.0, "learning_rate": 1.2402635341886016e-05, "loss": 1.896, "num_input_tokens_seen": 55840800, "step": 26725 }, { "epoch": 4.360551431601272, "grad_norm": 12.5, "learning_rate": 1.2394156318844278e-05, "loss": 2.4084, "num_input_tokens_seen": 55851472, "step": 26730 }, { "epoch": 4.36136715882209, "grad_norm": 5.46875, "learning_rate": 1.2385679239782039e-05, "loss": 2.0209, "num_input_tokens_seen": 55861216, "step": 26735 }, { "epoch": 4.362182886042907, "grad_norm": 4.15625, "learning_rate": 1.2377204106006585e-05, "loss": 1.6956, "num_input_tokens_seen": 55872448, "step": 26740 }, { "epoch": 4.362998613263724, "grad_norm": 4.46875, "learning_rate": 1.2368730918824891e-05, "loss": 1.7769, "num_input_tokens_seen": 55883360, "step": 26745 }, { "epoch": 4.3638143404845415, "grad_norm": 5.875, "learning_rate": 1.236025967954362e-05, "loss": 1.3866, "num_input_tokens_seen": 55892480, "step": 26750 }, { "epoch": 4.3646300677053596, "grad_norm": 3.796875, "learning_rate": 1.2351790389469153e-05, "loss": 1.7371, "num_input_tokens_seen": 55903360, "step": 26755 }, { "epoch": 4.365445794926177, "grad_norm": 7.46875, "learning_rate": 1.234332304990755e-05, "loss": 2.3808, "num_input_tokens_seen": 55912720, "step": 26760 }, { "epoch": 4.366261522146994, "grad_norm": 11.9375, "learning_rate": 1.2334857662164593e-05, "loss": 3.2799, "num_input_tokens_seen": 55922912, "step": 26765 }, { "epoch": 4.367077249367812, "grad_norm": 7.625, "learning_rate": 1.2326394227545743e-05, "loss": 3.0729, "num_input_tokens_seen": 55933952, "step": 26770 }, { "epoch": 4.367892976588629, "grad_norm": 8.75, "learning_rate": 1.2317932747356162e-05, "loss": 3.0676, "num_input_tokens_seen": 55943728, "step": 26775 }, { "epoch": 4.368708703809446, "grad_norm": 8.375, "learning_rate": 1.2309473222900726e-05, "loss": 2.0989, "num_input_tokens_seen": 55955536, "step": 26780 }, { "epoch": 4.369524431030263, "grad_norm": 4.15625, "learning_rate": 1.2301015655484006e-05, "loss": 1.9864, "num_input_tokens_seen": 55966016, "step": 26785 }, { "epoch": 4.370340158251081, "grad_norm": 7.4375, "learning_rate": 1.2292560046410245e-05, "loss": 2.984, "num_input_tokens_seen": 55975376, "step": 26790 }, { "epoch": 4.371155885471898, "grad_norm": 5.90625, "learning_rate": 1.228410639698343e-05, "loss": 2.8631, "num_input_tokens_seen": 55984848, "step": 26795 }, { "epoch": 4.3719716126927155, "grad_norm": 6.59375, "learning_rate": 1.2275654708507195e-05, "loss": 1.9899, "num_input_tokens_seen": 55994160, "step": 26800 }, { "epoch": 4.3719716126927155, "eval_loss": 2.5557234287261963, "eval_runtime": 134.8218, "eval_samples_per_second": 20.212, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 55994160, "step": 26800 }, { "epoch": 4.372787339913533, "grad_norm": 4.59375, "learning_rate": 1.2267204982284908e-05, "loss": 1.653, "num_input_tokens_seen": 56004288, "step": 26805 }, { "epoch": 4.373603067134351, "grad_norm": 7.90625, "learning_rate": 1.2258757219619635e-05, "loss": 1.7344, "num_input_tokens_seen": 56015504, "step": 26810 }, { "epoch": 4.374418794355168, "grad_norm": 14.0, "learning_rate": 1.2250311421814104e-05, "loss": 2.3234, "num_input_tokens_seen": 56025184, "step": 26815 }, { "epoch": 4.375234521575985, "grad_norm": 8.125, "learning_rate": 1.2241867590170772e-05, "loss": 3.2723, "num_input_tokens_seen": 56036560, "step": 26820 }, { "epoch": 4.376050248796802, "grad_norm": 5.96875, "learning_rate": 1.2233425725991799e-05, "loss": 2.1071, "num_input_tokens_seen": 56046208, "step": 26825 }, { "epoch": 4.37686597601762, "grad_norm": 7.8125, "learning_rate": 1.2224985830579003e-05, "loss": 2.6846, "num_input_tokens_seen": 56056560, "step": 26830 }, { "epoch": 4.377681703238437, "grad_norm": 2.234375, "learning_rate": 1.2216547905233944e-05, "loss": 2.0788, "num_input_tokens_seen": 56067680, "step": 26835 }, { "epoch": 4.378497430459254, "grad_norm": 0.0830078125, "learning_rate": 1.2208111951257842e-05, "loss": 2.6445, "num_input_tokens_seen": 56077904, "step": 26840 }, { "epoch": 4.3793131576800715, "grad_norm": 4.03125, "learning_rate": 1.2199677969951622e-05, "loss": 2.3529, "num_input_tokens_seen": 56089232, "step": 26845 }, { "epoch": 4.3801288849008895, "grad_norm": 11.0625, "learning_rate": 1.2191245962615927e-05, "loss": 3.2528, "num_input_tokens_seen": 56100208, "step": 26850 }, { "epoch": 4.380944612121707, "grad_norm": 4.0625, "learning_rate": 1.218281593055106e-05, "loss": 1.6481, "num_input_tokens_seen": 56109968, "step": 26855 }, { "epoch": 4.381760339342524, "grad_norm": 4.46875, "learning_rate": 1.217438787505705e-05, "loss": 1.4445, "num_input_tokens_seen": 56120176, "step": 26860 }, { "epoch": 4.382576066563341, "grad_norm": 6.9375, "learning_rate": 1.2165961797433615e-05, "loss": 3.0079, "num_input_tokens_seen": 56131232, "step": 26865 }, { "epoch": 4.383391793784159, "grad_norm": 10.4375, "learning_rate": 1.215753769898014e-05, "loss": 1.6381, "num_input_tokens_seen": 56141776, "step": 26870 }, { "epoch": 4.384207521004976, "grad_norm": 8.0, "learning_rate": 1.2149115580995755e-05, "loss": 2.7466, "num_input_tokens_seen": 56151232, "step": 26875 }, { "epoch": 4.385023248225793, "grad_norm": 6.53125, "learning_rate": 1.2140695444779227e-05, "loss": 1.6192, "num_input_tokens_seen": 56161152, "step": 26880 }, { "epoch": 4.38583897544661, "grad_norm": 9.9375, "learning_rate": 1.2132277291629066e-05, "loss": 2.1711, "num_input_tokens_seen": 56172384, "step": 26885 }, { "epoch": 4.386654702667428, "grad_norm": 4.375, "learning_rate": 1.2123861122843458e-05, "loss": 1.457, "num_input_tokens_seen": 56182960, "step": 26890 }, { "epoch": 4.387470429888245, "grad_norm": 9.9375, "learning_rate": 1.2115446939720271e-05, "loss": 2.6034, "num_input_tokens_seen": 56194512, "step": 26895 }, { "epoch": 4.388286157109063, "grad_norm": 9.375, "learning_rate": 1.210703474355708e-05, "loss": 2.9705, "num_input_tokens_seen": 56204816, "step": 26900 }, { "epoch": 4.38910188432988, "grad_norm": 9.9375, "learning_rate": 1.2098624535651164e-05, "loss": 2.42, "num_input_tokens_seen": 56216240, "step": 26905 }, { "epoch": 4.389917611550698, "grad_norm": 5.0, "learning_rate": 1.2090216317299477e-05, "loss": 3.1761, "num_input_tokens_seen": 56226304, "step": 26910 }, { "epoch": 4.390733338771515, "grad_norm": 4.46875, "learning_rate": 1.2081810089798668e-05, "loss": 1.9562, "num_input_tokens_seen": 56237264, "step": 26915 }, { "epoch": 4.391549065992332, "grad_norm": 5.96875, "learning_rate": 1.2073405854445072e-05, "loss": 2.3134, "num_input_tokens_seen": 56246848, "step": 26920 }, { "epoch": 4.392364793213149, "grad_norm": 6.59375, "learning_rate": 1.206500361253474e-05, "loss": 1.8387, "num_input_tokens_seen": 56256800, "step": 26925 }, { "epoch": 4.393180520433967, "grad_norm": 6.0, "learning_rate": 1.2056603365363409e-05, "loss": 3.0284, "num_input_tokens_seen": 56266320, "step": 26930 }, { "epoch": 4.393996247654784, "grad_norm": 9.625, "learning_rate": 1.2048205114226487e-05, "loss": 3.3402, "num_input_tokens_seen": 56276896, "step": 26935 }, { "epoch": 4.394811974875601, "grad_norm": 13.625, "learning_rate": 1.2039808860419102e-05, "loss": 2.9582, "num_input_tokens_seen": 56287424, "step": 26940 }, { "epoch": 4.395627702096419, "grad_norm": 8.1875, "learning_rate": 1.2031414605236066e-05, "loss": 1.9413, "num_input_tokens_seen": 56298624, "step": 26945 }, { "epoch": 4.3964434293172365, "grad_norm": 2.84375, "learning_rate": 1.2023022349971862e-05, "loss": 2.5914, "num_input_tokens_seen": 56307536, "step": 26950 }, { "epoch": 4.397259156538054, "grad_norm": 6.125, "learning_rate": 1.20146320959207e-05, "loss": 2.9134, "num_input_tokens_seen": 56317504, "step": 26955 }, { "epoch": 4.398074883758871, "grad_norm": 7.90625, "learning_rate": 1.2006243844376445e-05, "loss": 1.5769, "num_input_tokens_seen": 56328816, "step": 26960 }, { "epoch": 4.398890610979688, "grad_norm": 9.25, "learning_rate": 1.1997857596632678e-05, "loss": 1.6905, "num_input_tokens_seen": 56339744, "step": 26965 }, { "epoch": 4.399706338200506, "grad_norm": 9.875, "learning_rate": 1.1989473353982672e-05, "loss": 3.4085, "num_input_tokens_seen": 56348320, "step": 26970 }, { "epoch": 4.400522065421323, "grad_norm": 7.75, "learning_rate": 1.198109111771937e-05, "loss": 3.682, "num_input_tokens_seen": 56358512, "step": 26975 }, { "epoch": 4.40133779264214, "grad_norm": 10.875, "learning_rate": 1.197271088913543e-05, "loss": 1.9749, "num_input_tokens_seen": 56369104, "step": 26980 }, { "epoch": 4.402153519862958, "grad_norm": 3.625, "learning_rate": 1.1964332669523182e-05, "loss": 3.1665, "num_input_tokens_seen": 56379824, "step": 26985 }, { "epoch": 4.402969247083775, "grad_norm": 3.1875, "learning_rate": 1.1955956460174645e-05, "loss": 2.5987, "num_input_tokens_seen": 56389936, "step": 26990 }, { "epoch": 4.4037849743045925, "grad_norm": 2.703125, "learning_rate": 1.1947582262381552e-05, "loss": 3.062, "num_input_tokens_seen": 56400336, "step": 26995 }, { "epoch": 4.40460070152541, "grad_norm": 11.375, "learning_rate": 1.1939210077435293e-05, "loss": 2.8289, "num_input_tokens_seen": 56410736, "step": 27000 }, { "epoch": 4.40460070152541, "eval_loss": 2.542567253112793, "eval_runtime": 135.0559, "eval_samples_per_second": 20.177, "eval_steps_per_second": 10.092, "num_input_tokens_seen": 56410736, "step": 27000 }, { "epoch": 4.405416428746228, "grad_norm": 4.8125, "learning_rate": 1.193083990662697e-05, "loss": 2.6392, "num_input_tokens_seen": 56421248, "step": 27005 }, { "epoch": 4.406232155967045, "grad_norm": 4.0, "learning_rate": 1.192247175124738e-05, "loss": 2.2897, "num_input_tokens_seen": 56432560, "step": 27010 }, { "epoch": 4.407047883187862, "grad_norm": 8.6875, "learning_rate": 1.191410561258698e-05, "loss": 2.3512, "num_input_tokens_seen": 56444432, "step": 27015 }, { "epoch": 4.407863610408679, "grad_norm": 10.25, "learning_rate": 1.1905741491935944e-05, "loss": 2.0979, "num_input_tokens_seen": 56455584, "step": 27020 }, { "epoch": 4.408679337629497, "grad_norm": 6.78125, "learning_rate": 1.1897379390584129e-05, "loss": 3.0597, "num_input_tokens_seen": 56465632, "step": 27025 }, { "epoch": 4.409495064850314, "grad_norm": 9.0625, "learning_rate": 1.1889019309821062e-05, "loss": 2.1277, "num_input_tokens_seen": 56475120, "step": 27030 }, { "epoch": 4.410310792071131, "grad_norm": 8.625, "learning_rate": 1.188066125093599e-05, "loss": 2.3598, "num_input_tokens_seen": 56486160, "step": 27035 }, { "epoch": 4.411126519291948, "grad_norm": 7.125, "learning_rate": 1.1872305215217811e-05, "loss": 2.3317, "num_input_tokens_seen": 56495968, "step": 27040 }, { "epoch": 4.4119422465127665, "grad_norm": 8.8125, "learning_rate": 1.186395120395514e-05, "loss": 3.8431, "num_input_tokens_seen": 56506464, "step": 27045 }, { "epoch": 4.412757973733584, "grad_norm": 11.6875, "learning_rate": 1.1855599218436283e-05, "loss": 1.8081, "num_input_tokens_seen": 56515680, "step": 27050 }, { "epoch": 4.413573700954401, "grad_norm": 4.59375, "learning_rate": 1.1847249259949209e-05, "loss": 2.9268, "num_input_tokens_seen": 56526528, "step": 27055 }, { "epoch": 4.414389428175218, "grad_norm": 15.4375, "learning_rate": 1.1838901329781574e-05, "loss": 2.4452, "num_input_tokens_seen": 56536720, "step": 27060 }, { "epoch": 4.415205155396036, "grad_norm": 3.359375, "learning_rate": 1.1830555429220758e-05, "loss": 1.697, "num_input_tokens_seen": 56546432, "step": 27065 }, { "epoch": 4.416020882616853, "grad_norm": 0.265625, "learning_rate": 1.1822211559553784e-05, "loss": 1.8248, "num_input_tokens_seen": 56557248, "step": 27070 }, { "epoch": 4.41683660983767, "grad_norm": 15.0, "learning_rate": 1.18138697220674e-05, "loss": 3.2158, "num_input_tokens_seen": 56568784, "step": 27075 }, { "epoch": 4.417652337058487, "grad_norm": 4.25, "learning_rate": 1.1805529918048e-05, "loss": 3.1081, "num_input_tokens_seen": 56580256, "step": 27080 }, { "epoch": 4.418468064279305, "grad_norm": 10.625, "learning_rate": 1.1797192148781702e-05, "loss": 2.3387, "num_input_tokens_seen": 56590144, "step": 27085 }, { "epoch": 4.419283791500122, "grad_norm": 12.125, "learning_rate": 1.1788856415554297e-05, "loss": 4.6517, "num_input_tokens_seen": 56601408, "step": 27090 }, { "epoch": 4.4200995187209395, "grad_norm": 1.859375, "learning_rate": 1.1780522719651249e-05, "loss": 2.5073, "num_input_tokens_seen": 56611200, "step": 27095 }, { "epoch": 4.420915245941757, "grad_norm": 2.9375, "learning_rate": 1.1772191062357721e-05, "loss": 1.3269, "num_input_tokens_seen": 56622416, "step": 27100 }, { "epoch": 4.421730973162575, "grad_norm": 13.75, "learning_rate": 1.1763861444958573e-05, "loss": 2.1386, "num_input_tokens_seen": 56633504, "step": 27105 }, { "epoch": 4.422546700383392, "grad_norm": 9.8125, "learning_rate": 1.1755533868738317e-05, "loss": 2.663, "num_input_tokens_seen": 56645328, "step": 27110 }, { "epoch": 4.423362427604209, "grad_norm": 4.875, "learning_rate": 1.1747208334981185e-05, "loss": 2.8413, "num_input_tokens_seen": 56655616, "step": 27115 }, { "epoch": 4.424178154825027, "grad_norm": 7.46875, "learning_rate": 1.1738884844971067e-05, "loss": 3.8827, "num_input_tokens_seen": 56666256, "step": 27120 }, { "epoch": 4.424993882045844, "grad_norm": 4.40625, "learning_rate": 1.1730563399991563e-05, "loss": 1.9422, "num_input_tokens_seen": 56676848, "step": 27125 }, { "epoch": 4.425809609266661, "grad_norm": 6.0625, "learning_rate": 1.1722244001325938e-05, "loss": 2.3056, "num_input_tokens_seen": 56687472, "step": 27130 }, { "epoch": 4.426625336487478, "grad_norm": 7.3125, "learning_rate": 1.1713926650257137e-05, "loss": 1.5109, "num_input_tokens_seen": 56697760, "step": 27135 }, { "epoch": 4.4274410637082955, "grad_norm": 7.375, "learning_rate": 1.170561134806781e-05, "loss": 1.982, "num_input_tokens_seen": 56708736, "step": 27140 }, { "epoch": 4.4282567909291135, "grad_norm": 9.125, "learning_rate": 1.1697298096040287e-05, "loss": 2.8317, "num_input_tokens_seen": 56719504, "step": 27145 }, { "epoch": 4.429072518149931, "grad_norm": 6.5625, "learning_rate": 1.1688986895456567e-05, "loss": 2.0861, "num_input_tokens_seen": 56728608, "step": 27150 }, { "epoch": 4.429888245370748, "grad_norm": 13.4375, "learning_rate": 1.1680677747598349e-05, "loss": 2.3026, "num_input_tokens_seen": 56738656, "step": 27155 }, { "epoch": 4.430703972591566, "grad_norm": 6.90625, "learning_rate": 1.1672370653746995e-05, "loss": 1.1011, "num_input_tokens_seen": 56749344, "step": 27160 }, { "epoch": 4.431519699812383, "grad_norm": 3.171875, "learning_rate": 1.166406561518357e-05, "loss": 2.1379, "num_input_tokens_seen": 56759008, "step": 27165 }, { "epoch": 4.4323354270332, "grad_norm": 4.59375, "learning_rate": 1.1655762633188826e-05, "loss": 1.803, "num_input_tokens_seen": 56769616, "step": 27170 }, { "epoch": 4.433151154254017, "grad_norm": 8.0, "learning_rate": 1.1647461709043172e-05, "loss": 2.0809, "num_input_tokens_seen": 56781232, "step": 27175 }, { "epoch": 4.433966881474835, "grad_norm": 8.25, "learning_rate": 1.1639162844026722e-05, "loss": 1.4889, "num_input_tokens_seen": 56793568, "step": 27180 }, { "epoch": 4.434782608695652, "grad_norm": 3.90625, "learning_rate": 1.163086603941927e-05, "loss": 1.2822, "num_input_tokens_seen": 56805280, "step": 27185 }, { "epoch": 4.4355983359164695, "grad_norm": 7.03125, "learning_rate": 1.1622571296500273e-05, "loss": 2.8293, "num_input_tokens_seen": 56816720, "step": 27190 }, { "epoch": 4.436414063137287, "grad_norm": 11.5, "learning_rate": 1.1614278616548904e-05, "loss": 2.4947, "num_input_tokens_seen": 56828112, "step": 27195 }, { "epoch": 4.437229790358105, "grad_norm": 6.21875, "learning_rate": 1.1605988000843986e-05, "loss": 2.7897, "num_input_tokens_seen": 56838864, "step": 27200 }, { "epoch": 4.437229790358105, "eval_loss": 2.5423049926757812, "eval_runtime": 134.8218, "eval_samples_per_second": 20.212, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 56838864, "step": 27200 }, { "epoch": 4.438045517578922, "grad_norm": 8.8125, "learning_rate": 1.1597699450664028e-05, "loss": 3.2075, "num_input_tokens_seen": 56848512, "step": 27205 }, { "epoch": 4.438861244799739, "grad_norm": 5.46875, "learning_rate": 1.1589412967287252e-05, "loss": 3.2601, "num_input_tokens_seen": 56857600, "step": 27210 }, { "epoch": 4.439676972020556, "grad_norm": 6.03125, "learning_rate": 1.1581128551991514e-05, "loss": 1.0898, "num_input_tokens_seen": 56867824, "step": 27215 }, { "epoch": 4.440492699241374, "grad_norm": 8.25, "learning_rate": 1.1572846206054383e-05, "loss": 2.5979, "num_input_tokens_seen": 56877632, "step": 27220 }, { "epoch": 4.441308426462191, "grad_norm": 4.65625, "learning_rate": 1.1564565930753113e-05, "loss": 2.7843, "num_input_tokens_seen": 56889136, "step": 27225 }, { "epoch": 4.442124153683008, "grad_norm": 7.65625, "learning_rate": 1.1556287727364606e-05, "loss": 2.6444, "num_input_tokens_seen": 56900160, "step": 27230 }, { "epoch": 4.442939880903825, "grad_norm": 3.9375, "learning_rate": 1.1548011597165489e-05, "loss": 1.407, "num_input_tokens_seen": 56910752, "step": 27235 }, { "epoch": 4.443755608124643, "grad_norm": 5.96875, "learning_rate": 1.1539737541432019e-05, "loss": 1.0073, "num_input_tokens_seen": 56920096, "step": 27240 }, { "epoch": 4.444571335345461, "grad_norm": 2.671875, "learning_rate": 1.1531465561440174e-05, "loss": 1.4787, "num_input_tokens_seen": 56929808, "step": 27245 }, { "epoch": 4.445387062566278, "grad_norm": 7.9375, "learning_rate": 1.1523195658465605e-05, "loss": 2.346, "num_input_tokens_seen": 56940016, "step": 27250 }, { "epoch": 4.446202789787095, "grad_norm": 2.15625, "learning_rate": 1.1514927833783618e-05, "loss": 1.5121, "num_input_tokens_seen": 56950176, "step": 27255 }, { "epoch": 4.447018517007913, "grad_norm": 9.0625, "learning_rate": 1.150666208866922e-05, "loss": 3.3724, "num_input_tokens_seen": 56960656, "step": 27260 }, { "epoch": 4.44783424422873, "grad_norm": 10.625, "learning_rate": 1.1498398424397106e-05, "loss": 3.8869, "num_input_tokens_seen": 56970800, "step": 27265 }, { "epoch": 4.448649971449547, "grad_norm": 5.84375, "learning_rate": 1.1490136842241628e-05, "loss": 2.2372, "num_input_tokens_seen": 56981984, "step": 27270 }, { "epoch": 4.449465698670364, "grad_norm": 10.1875, "learning_rate": 1.1481877343476813e-05, "loss": 2.0308, "num_input_tokens_seen": 56991312, "step": 27275 }, { "epoch": 4.450281425891182, "grad_norm": 11.25, "learning_rate": 1.14736199293764e-05, "loss": 2.7058, "num_input_tokens_seen": 57002032, "step": 27280 }, { "epoch": 4.451097153111999, "grad_norm": 8.9375, "learning_rate": 1.1465364601213771e-05, "loss": 3.0221, "num_input_tokens_seen": 57013296, "step": 27285 }, { "epoch": 4.4519128803328165, "grad_norm": 8.0, "learning_rate": 1.1457111360262012e-05, "loss": 2.1033, "num_input_tokens_seen": 57023104, "step": 27290 }, { "epoch": 4.4527286075536345, "grad_norm": 5.75, "learning_rate": 1.1448860207793869e-05, "loss": 1.8154, "num_input_tokens_seen": 57033696, "step": 27295 }, { "epoch": 4.453544334774452, "grad_norm": 5.09375, "learning_rate": 1.144061114508177e-05, "loss": 2.3669, "num_input_tokens_seen": 57044320, "step": 27300 }, { "epoch": 4.454360061995269, "grad_norm": 16.625, "learning_rate": 1.1432364173397842e-05, "loss": 2.4075, "num_input_tokens_seen": 57053712, "step": 27305 }, { "epoch": 4.455175789216086, "grad_norm": 4.0625, "learning_rate": 1.1424119294013852e-05, "loss": 1.3778, "num_input_tokens_seen": 57063376, "step": 27310 }, { "epoch": 4.455991516436903, "grad_norm": 3.046875, "learning_rate": 1.1415876508201279e-05, "loss": 1.5237, "num_input_tokens_seen": 57074512, "step": 27315 }, { "epoch": 4.456807243657721, "grad_norm": 5.6875, "learning_rate": 1.140763581723125e-05, "loss": 2.4006, "num_input_tokens_seen": 57083952, "step": 27320 }, { "epoch": 4.457622970878538, "grad_norm": 13.5625, "learning_rate": 1.1399397222374588e-05, "loss": 2.9348, "num_input_tokens_seen": 57094256, "step": 27325 }, { "epoch": 4.458438698099355, "grad_norm": 2.875, "learning_rate": 1.1391160724901804e-05, "loss": 1.8388, "num_input_tokens_seen": 57106000, "step": 27330 }, { "epoch": 4.459254425320173, "grad_norm": 7.90625, "learning_rate": 1.138292632608304e-05, "loss": 2.464, "num_input_tokens_seen": 57115504, "step": 27335 }, { "epoch": 4.4600701525409905, "grad_norm": 8.0, "learning_rate": 1.1374694027188174e-05, "loss": 3.3646, "num_input_tokens_seen": 57125808, "step": 27340 }, { "epoch": 4.460885879761808, "grad_norm": 8.0, "learning_rate": 1.1366463829486711e-05, "loss": 1.9847, "num_input_tokens_seen": 57134688, "step": 27345 }, { "epoch": 4.461701606982625, "grad_norm": 5.71875, "learning_rate": 1.1358235734247849e-05, "loss": 1.9088, "num_input_tokens_seen": 57144032, "step": 27350 }, { "epoch": 4.462517334203443, "grad_norm": 5.46875, "learning_rate": 1.1350009742740478e-05, "loss": 1.8997, "num_input_tokens_seen": 57153712, "step": 27355 }, { "epoch": 4.46333306142426, "grad_norm": 9.0, "learning_rate": 1.134178585623313e-05, "loss": 2.6563, "num_input_tokens_seen": 57164560, "step": 27360 }, { "epoch": 4.464148788645077, "grad_norm": 4.8125, "learning_rate": 1.1333564075994047e-05, "loss": 2.5535, "num_input_tokens_seen": 57174048, "step": 27365 }, { "epoch": 4.464964515865894, "grad_norm": 5.90625, "learning_rate": 1.1325344403291133e-05, "loss": 3.1231, "num_input_tokens_seen": 57184304, "step": 27370 }, { "epoch": 4.465780243086712, "grad_norm": 17.875, "learning_rate": 1.1317126839391951e-05, "loss": 1.8863, "num_input_tokens_seen": 57194240, "step": 27375 }, { "epoch": 4.466595970307529, "grad_norm": 0.08642578125, "learning_rate": 1.1308911385563766e-05, "loss": 2.3274, "num_input_tokens_seen": 57202928, "step": 27380 }, { "epoch": 4.467411697528346, "grad_norm": 11.125, "learning_rate": 1.1300698043073494e-05, "loss": 3.1338, "num_input_tokens_seen": 57212272, "step": 27385 }, { "epoch": 4.468227424749164, "grad_norm": 7.8125, "learning_rate": 1.1292486813187736e-05, "loss": 1.3611, "num_input_tokens_seen": 57223424, "step": 27390 }, { "epoch": 4.469043151969982, "grad_norm": 10.375, "learning_rate": 1.1284277697172782e-05, "loss": 3.5807, "num_input_tokens_seen": 57234608, "step": 27395 }, { "epoch": 4.469858879190799, "grad_norm": 7.25, "learning_rate": 1.127607069629456e-05, "loss": 1.8468, "num_input_tokens_seen": 57245776, "step": 27400 }, { "epoch": 4.469858879190799, "eval_loss": 2.5330114364624023, "eval_runtime": 134.9853, "eval_samples_per_second": 20.187, "eval_steps_per_second": 10.097, "num_input_tokens_seen": 57245776, "step": 27400 }, { "epoch": 4.470674606411616, "grad_norm": 13.4375, "learning_rate": 1.1267865811818701e-05, "loss": 1.8755, "num_input_tokens_seen": 57256528, "step": 27405 }, { "epoch": 4.471490333632433, "grad_norm": 9.1875, "learning_rate": 1.1259663045010513e-05, "loss": 2.423, "num_input_tokens_seen": 57266272, "step": 27410 }, { "epoch": 4.472306060853251, "grad_norm": 15.3125, "learning_rate": 1.1251462397134957e-05, "loss": 2.6156, "num_input_tokens_seen": 57277008, "step": 27415 }, { "epoch": 4.473121788074068, "grad_norm": 0.13671875, "learning_rate": 1.1243263869456664e-05, "loss": 0.857, "num_input_tokens_seen": 57287632, "step": 27420 }, { "epoch": 4.473937515294885, "grad_norm": 6.53125, "learning_rate": 1.1235067463239967e-05, "loss": 2.1285, "num_input_tokens_seen": 57297104, "step": 27425 }, { "epoch": 4.474753242515702, "grad_norm": 10.0, "learning_rate": 1.122687317974884e-05, "loss": 3.6802, "num_input_tokens_seen": 57305968, "step": 27430 }, { "epoch": 4.47556896973652, "grad_norm": 3.9375, "learning_rate": 1.1218681020246963e-05, "loss": 1.8872, "num_input_tokens_seen": 57316400, "step": 27435 }, { "epoch": 4.4763846969573375, "grad_norm": 3.65625, "learning_rate": 1.1210490985997652e-05, "loss": 2.7732, "num_input_tokens_seen": 57327376, "step": 27440 }, { "epoch": 4.477200424178155, "grad_norm": 5.75, "learning_rate": 1.1202303078263917e-05, "loss": 1.554, "num_input_tokens_seen": 57336704, "step": 27445 }, { "epoch": 4.478016151398972, "grad_norm": 6.96875, "learning_rate": 1.1194117298308451e-05, "loss": 2.4769, "num_input_tokens_seen": 57347056, "step": 27450 }, { "epoch": 4.47883187861979, "grad_norm": 4.90625, "learning_rate": 1.1185933647393585e-05, "loss": 3.1823, "num_input_tokens_seen": 57357792, "step": 27455 }, { "epoch": 4.479647605840607, "grad_norm": 3.90625, "learning_rate": 1.1177752126781354e-05, "loss": 2.6147, "num_input_tokens_seen": 57368576, "step": 27460 }, { "epoch": 4.480463333061424, "grad_norm": 4.15625, "learning_rate": 1.1169572737733441e-05, "loss": 2.5051, "num_input_tokens_seen": 57379616, "step": 27465 }, { "epoch": 4.481279060282241, "grad_norm": 7.0625, "learning_rate": 1.1161395481511216e-05, "loss": 1.526, "num_input_tokens_seen": 57389840, "step": 27470 }, { "epoch": 4.482094787503059, "grad_norm": 1.390625, "learning_rate": 1.1153220359375722e-05, "loss": 2.0087, "num_input_tokens_seen": 57399840, "step": 27475 }, { "epoch": 4.482910514723876, "grad_norm": 7.125, "learning_rate": 1.114504737258765e-05, "loss": 3.0372, "num_input_tokens_seen": 57409568, "step": 27480 }, { "epoch": 4.4837262419446935, "grad_norm": 3.109375, "learning_rate": 1.1136876522407393e-05, "loss": 1.6737, "num_input_tokens_seen": 57418560, "step": 27485 }, { "epoch": 4.484541969165511, "grad_norm": 10.0625, "learning_rate": 1.1128707810094985e-05, "loss": 3.1618, "num_input_tokens_seen": 57427904, "step": 27490 }, { "epoch": 4.485357696386329, "grad_norm": 6.84375, "learning_rate": 1.1120541236910157e-05, "loss": 2.0874, "num_input_tokens_seen": 57437408, "step": 27495 }, { "epoch": 4.486173423607146, "grad_norm": 3.609375, "learning_rate": 1.111237680411229e-05, "loss": 2.8262, "num_input_tokens_seen": 57447664, "step": 27500 }, { "epoch": 4.486989150827963, "grad_norm": 5.875, "learning_rate": 1.1104214512960433e-05, "loss": 1.4062, "num_input_tokens_seen": 57457424, "step": 27505 }, { "epoch": 4.487804878048781, "grad_norm": 4.96875, "learning_rate": 1.1096054364713327e-05, "loss": 2.5104, "num_input_tokens_seen": 57466784, "step": 27510 }, { "epoch": 4.488620605269598, "grad_norm": 4.59375, "learning_rate": 1.1087896360629371e-05, "loss": 1.7033, "num_input_tokens_seen": 57475296, "step": 27515 }, { "epoch": 4.489436332490415, "grad_norm": 10.9375, "learning_rate": 1.107974050196662e-05, "loss": 2.4584, "num_input_tokens_seen": 57486400, "step": 27520 }, { "epoch": 4.490252059711232, "grad_norm": 6.4375, "learning_rate": 1.1071586789982816e-05, "loss": 2.2201, "num_input_tokens_seen": 57498304, "step": 27525 }, { "epoch": 4.49106778693205, "grad_norm": 11.375, "learning_rate": 1.1063435225935373e-05, "loss": 2.7589, "num_input_tokens_seen": 57507664, "step": 27530 }, { "epoch": 4.4918835141528675, "grad_norm": 5.65625, "learning_rate": 1.1055285811081348e-05, "loss": 2.1304, "num_input_tokens_seen": 57518720, "step": 27535 }, { "epoch": 4.492699241373685, "grad_norm": 7.59375, "learning_rate": 1.1047138546677499e-05, "loss": 2.0693, "num_input_tokens_seen": 57528496, "step": 27540 }, { "epoch": 4.493514968594502, "grad_norm": 7.75, "learning_rate": 1.1038993433980219e-05, "loss": 2.4205, "num_input_tokens_seen": 57538416, "step": 27545 }, { "epoch": 4.49433069581532, "grad_norm": 7.59375, "learning_rate": 1.1030850474245597e-05, "loss": 4.3318, "num_input_tokens_seen": 57548560, "step": 27550 }, { "epoch": 4.495146423036137, "grad_norm": 13.25, "learning_rate": 1.102270966872939e-05, "loss": 2.3987, "num_input_tokens_seen": 57558400, "step": 27555 }, { "epoch": 4.495962150256954, "grad_norm": 6.1875, "learning_rate": 1.1014571018687e-05, "loss": 2.2123, "num_input_tokens_seen": 57568496, "step": 27560 }, { "epoch": 4.496777877477771, "grad_norm": 8.75, "learning_rate": 1.1006434525373502e-05, "loss": 2.4697, "num_input_tokens_seen": 57578704, "step": 27565 }, { "epoch": 4.497593604698589, "grad_norm": 9.375, "learning_rate": 1.0998300190043664e-05, "loss": 1.7434, "num_input_tokens_seen": 57590128, "step": 27570 }, { "epoch": 4.498409331919406, "grad_norm": 1.6796875, "learning_rate": 1.0990168013951882e-05, "loss": 2.6784, "num_input_tokens_seen": 57601472, "step": 27575 }, { "epoch": 4.499225059140223, "grad_norm": 7.59375, "learning_rate": 1.0982037998352263e-05, "loss": 1.3403, "num_input_tokens_seen": 57611648, "step": 27580 }, { "epoch": 4.5000407863610405, "grad_norm": 9.125, "learning_rate": 1.0973910144498534e-05, "loss": 3.3324, "num_input_tokens_seen": 57622560, "step": 27585 }, { "epoch": 4.500856513581859, "grad_norm": 6.96875, "learning_rate": 1.0965784453644123e-05, "loss": 1.8981, "num_input_tokens_seen": 57632656, "step": 27590 }, { "epoch": 4.501672240802676, "grad_norm": 9.8125, "learning_rate": 1.0957660927042127e-05, "loss": 3.1912, "num_input_tokens_seen": 57642704, "step": 27595 }, { "epoch": 4.502487968023493, "grad_norm": 5.125, "learning_rate": 1.094953956594527e-05, "loss": 2.2285, "num_input_tokens_seen": 57651824, "step": 27600 }, { "epoch": 4.502487968023493, "eval_loss": 2.5300850868225098, "eval_runtime": 134.9579, "eval_samples_per_second": 20.191, "eval_steps_per_second": 10.099, "num_input_tokens_seen": 57651824, "step": 27600 }, { "epoch": 4.50330369524431, "grad_norm": 4.875, "learning_rate": 1.0941420371605981e-05, "loss": 2.9662, "num_input_tokens_seen": 57662976, "step": 27605 }, { "epoch": 4.504119422465128, "grad_norm": 7.46875, "learning_rate": 1.0933303345276354e-05, "loss": 2.599, "num_input_tokens_seen": 57673952, "step": 27610 }, { "epoch": 4.504935149685945, "grad_norm": 8.0625, "learning_rate": 1.0925188488208112e-05, "loss": 2.3898, "num_input_tokens_seen": 57684704, "step": 27615 }, { "epoch": 4.505750876906762, "grad_norm": 13.3125, "learning_rate": 1.0917075801652694e-05, "loss": 2.9725, "num_input_tokens_seen": 57695008, "step": 27620 }, { "epoch": 4.506566604127579, "grad_norm": 4.84375, "learning_rate": 1.0908965286861151e-05, "loss": 1.3661, "num_input_tokens_seen": 57704624, "step": 27625 }, { "epoch": 4.507382331348397, "grad_norm": 9.3125, "learning_rate": 1.090085694508425e-05, "loss": 3.6842, "num_input_tokens_seen": 57714224, "step": 27630 }, { "epoch": 4.5081980585692145, "grad_norm": 8.625, "learning_rate": 1.089275077757238e-05, "loss": 3.1895, "num_input_tokens_seen": 57723680, "step": 27635 }, { "epoch": 4.509013785790032, "grad_norm": 10.1875, "learning_rate": 1.0884646785575633e-05, "loss": 1.9219, "num_input_tokens_seen": 57732576, "step": 27640 }, { "epoch": 4.50982951301085, "grad_norm": 7.40625, "learning_rate": 1.0876544970343728e-05, "loss": 2.692, "num_input_tokens_seen": 57741872, "step": 27645 }, { "epoch": 4.510645240231667, "grad_norm": 1.8671875, "learning_rate": 1.0868445333126082e-05, "loss": 2.8436, "num_input_tokens_seen": 57752272, "step": 27650 }, { "epoch": 4.511460967452484, "grad_norm": 5.1875, "learning_rate": 1.0860347875171745e-05, "loss": 1.2589, "num_input_tokens_seen": 57761488, "step": 27655 }, { "epoch": 4.512276694673301, "grad_norm": 3.578125, "learning_rate": 1.0852252597729465e-05, "loss": 1.6207, "num_input_tokens_seen": 57770704, "step": 27660 }, { "epoch": 4.513092421894118, "grad_norm": 9.4375, "learning_rate": 1.0844159502047615e-05, "loss": 2.4946, "num_input_tokens_seen": 57781632, "step": 27665 }, { "epoch": 4.513908149114936, "grad_norm": 4.09375, "learning_rate": 1.0836068589374265e-05, "loss": 1.6343, "num_input_tokens_seen": 57791264, "step": 27670 }, { "epoch": 4.514723876335753, "grad_norm": 12.6875, "learning_rate": 1.0827979860957144e-05, "loss": 4.1241, "num_input_tokens_seen": 57803632, "step": 27675 }, { "epoch": 4.5155396035565705, "grad_norm": 8.0625, "learning_rate": 1.0819893318043615e-05, "loss": 2.1605, "num_input_tokens_seen": 57813504, "step": 27680 }, { "epoch": 4.5163553307773885, "grad_norm": 5.25, "learning_rate": 1.0811808961880734e-05, "loss": 1.9107, "num_input_tokens_seen": 57824816, "step": 27685 }, { "epoch": 4.517171057998206, "grad_norm": 7.65625, "learning_rate": 1.080372679371522e-05, "loss": 2.653, "num_input_tokens_seen": 57833760, "step": 27690 }, { "epoch": 4.517986785219023, "grad_norm": 10.0, "learning_rate": 1.0795646814793428e-05, "loss": 2.8833, "num_input_tokens_seen": 57843328, "step": 27695 }, { "epoch": 4.51880251243984, "grad_norm": 3.203125, "learning_rate": 1.078756902636141e-05, "loss": 2.2048, "num_input_tokens_seen": 57853504, "step": 27700 }, { "epoch": 4.519618239660657, "grad_norm": 9.625, "learning_rate": 1.077949342966485e-05, "loss": 2.8611, "num_input_tokens_seen": 57863632, "step": 27705 }, { "epoch": 4.520433966881475, "grad_norm": 5.84375, "learning_rate": 1.0771420025949103e-05, "loss": 2.8812, "num_input_tokens_seen": 57872704, "step": 27710 }, { "epoch": 4.521249694102292, "grad_norm": 4.875, "learning_rate": 1.0763348816459204e-05, "loss": 1.882, "num_input_tokens_seen": 57882272, "step": 27715 }, { "epoch": 4.522065421323109, "grad_norm": 4.65625, "learning_rate": 1.0755279802439816e-05, "loss": 2.4799, "num_input_tokens_seen": 57893712, "step": 27720 }, { "epoch": 4.522881148543927, "grad_norm": 10.9375, "learning_rate": 1.0747212985135293e-05, "loss": 2.1825, "num_input_tokens_seen": 57903520, "step": 27725 }, { "epoch": 4.523696875764744, "grad_norm": 8.875, "learning_rate": 1.073914836578965e-05, "loss": 2.1964, "num_input_tokens_seen": 57913808, "step": 27730 }, { "epoch": 4.524512602985562, "grad_norm": 5.65625, "learning_rate": 1.0731085945646529e-05, "loss": 1.7204, "num_input_tokens_seen": 57924064, "step": 27735 }, { "epoch": 4.525328330206379, "grad_norm": 2.078125, "learning_rate": 1.0723025725949285e-05, "loss": 0.9123, "num_input_tokens_seen": 57935104, "step": 27740 }, { "epoch": 4.526144057427197, "grad_norm": 0.056640625, "learning_rate": 1.0714967707940875e-05, "loss": 2.1329, "num_input_tokens_seen": 57946800, "step": 27745 }, { "epoch": 4.526959784648014, "grad_norm": 6.25, "learning_rate": 1.0706911892863963e-05, "loss": 1.6029, "num_input_tokens_seen": 57956608, "step": 27750 }, { "epoch": 4.527775511868831, "grad_norm": 12.5625, "learning_rate": 1.0698858281960866e-05, "loss": 2.3899, "num_input_tokens_seen": 57966464, "step": 27755 }, { "epoch": 4.528591239089648, "grad_norm": 10.875, "learning_rate": 1.069080687647353e-05, "loss": 1.6803, "num_input_tokens_seen": 57977344, "step": 27760 }, { "epoch": 4.529406966310466, "grad_norm": 5.46875, "learning_rate": 1.0682757677643596e-05, "loss": 2.0569, "num_input_tokens_seen": 57986752, "step": 27765 }, { "epoch": 4.530222693531283, "grad_norm": 6.09375, "learning_rate": 1.0674710686712359e-05, "loss": 2.6496, "num_input_tokens_seen": 57997104, "step": 27770 }, { "epoch": 4.5310384207521, "grad_norm": 5.9375, "learning_rate": 1.0666665904920756e-05, "loss": 2.3924, "num_input_tokens_seen": 58008464, "step": 27775 }, { "epoch": 4.5318541479729175, "grad_norm": 4.53125, "learning_rate": 1.0658623333509385e-05, "loss": 2.9048, "num_input_tokens_seen": 58019488, "step": 27780 }, { "epoch": 4.5326698751937355, "grad_norm": 5.84375, "learning_rate": 1.0650582973718532e-05, "loss": 2.031, "num_input_tokens_seen": 58029504, "step": 27785 }, { "epoch": 4.533485602414553, "grad_norm": 5.28125, "learning_rate": 1.0642544826788098e-05, "loss": 2.1859, "num_input_tokens_seen": 58039744, "step": 27790 }, { "epoch": 4.53430132963537, "grad_norm": 0.31640625, "learning_rate": 1.063450889395769e-05, "loss": 1.9924, "num_input_tokens_seen": 58050432, "step": 27795 }, { "epoch": 4.535117056856187, "grad_norm": 7.59375, "learning_rate": 1.062647517646653e-05, "loss": 2.8184, "num_input_tokens_seen": 58060672, "step": 27800 }, { "epoch": 4.535117056856187, "eval_loss": 2.5348377227783203, "eval_runtime": 134.6924, "eval_samples_per_second": 20.231, "eval_steps_per_second": 10.119, "num_input_tokens_seen": 58060672, "step": 27800 }, { "epoch": 4.535932784077005, "grad_norm": 7.4375, "learning_rate": 1.0618443675553527e-05, "loss": 2.6065, "num_input_tokens_seen": 58070560, "step": 27805 }, { "epoch": 4.536748511297822, "grad_norm": 9.5, "learning_rate": 1.0610414392457247e-05, "loss": 2.2254, "num_input_tokens_seen": 58079824, "step": 27810 }, { "epoch": 4.537564238518639, "grad_norm": 6.09375, "learning_rate": 1.0602387328415888e-05, "loss": 1.8626, "num_input_tokens_seen": 58092144, "step": 27815 }, { "epoch": 4.538379965739456, "grad_norm": 11.625, "learning_rate": 1.0594362484667347e-05, "loss": 2.4524, "num_input_tokens_seen": 58103568, "step": 27820 }, { "epoch": 4.539195692960274, "grad_norm": 5.71875, "learning_rate": 1.0586339862449132e-05, "loss": 2.6355, "num_input_tokens_seen": 58113744, "step": 27825 }, { "epoch": 4.5400114201810915, "grad_norm": 6.65625, "learning_rate": 1.0578319462998445e-05, "loss": 1.3788, "num_input_tokens_seen": 58123520, "step": 27830 }, { "epoch": 4.540827147401909, "grad_norm": 8.75, "learning_rate": 1.057030128755214e-05, "loss": 2.1747, "num_input_tokens_seen": 58134720, "step": 27835 }, { "epoch": 4.541642874622726, "grad_norm": 4.21875, "learning_rate": 1.0562285337346703e-05, "loss": 2.6409, "num_input_tokens_seen": 58143776, "step": 27840 }, { "epoch": 4.542458601843544, "grad_norm": 8.25, "learning_rate": 1.0554271613618308e-05, "loss": 1.1453, "num_input_tokens_seen": 58153520, "step": 27845 }, { "epoch": 4.543274329064361, "grad_norm": 8.8125, "learning_rate": 1.054626011760276e-05, "loss": 2.139, "num_input_tokens_seen": 58163696, "step": 27850 }, { "epoch": 4.544090056285178, "grad_norm": 11.6875, "learning_rate": 1.0538250850535549e-05, "loss": 2.1314, "num_input_tokens_seen": 58174080, "step": 27855 }, { "epoch": 4.544905783505996, "grad_norm": 4.34375, "learning_rate": 1.0530243813651794e-05, "loss": 0.8476, "num_input_tokens_seen": 58185104, "step": 27860 }, { "epoch": 4.545721510726813, "grad_norm": 7.15625, "learning_rate": 1.0522239008186271e-05, "loss": 1.7967, "num_input_tokens_seen": 58195200, "step": 27865 }, { "epoch": 4.54653723794763, "grad_norm": 9.5625, "learning_rate": 1.0514236435373434e-05, "loss": 3.1794, "num_input_tokens_seen": 58205632, "step": 27870 }, { "epoch": 4.5473529651684474, "grad_norm": 4.59375, "learning_rate": 1.0506236096447386e-05, "loss": 1.8804, "num_input_tokens_seen": 58218112, "step": 27875 }, { "epoch": 4.548168692389265, "grad_norm": 8.75, "learning_rate": 1.049823799264186e-05, "loss": 1.4054, "num_input_tokens_seen": 58226848, "step": 27880 }, { "epoch": 4.548984419610083, "grad_norm": 6.3125, "learning_rate": 1.049024212519028e-05, "loss": 1.9463, "num_input_tokens_seen": 58238080, "step": 27885 }, { "epoch": 4.5498001468309, "grad_norm": 11.25, "learning_rate": 1.0482248495325713e-05, "loss": 2.6347, "num_input_tokens_seen": 58248400, "step": 27890 }, { "epoch": 4.550615874051717, "grad_norm": 7.25, "learning_rate": 1.047425710428086e-05, "loss": 2.4034, "num_input_tokens_seen": 58258560, "step": 27895 }, { "epoch": 4.551431601272535, "grad_norm": 12.0625, "learning_rate": 1.0466267953288114e-05, "loss": 2.6312, "num_input_tokens_seen": 58268800, "step": 27900 }, { "epoch": 4.552247328493352, "grad_norm": 6.625, "learning_rate": 1.0458281043579482e-05, "loss": 2.5601, "num_input_tokens_seen": 58278592, "step": 27905 }, { "epoch": 4.553063055714169, "grad_norm": 2.734375, "learning_rate": 1.0450296376386657e-05, "loss": 2.6781, "num_input_tokens_seen": 58289936, "step": 27910 }, { "epoch": 4.553878782934986, "grad_norm": 5.5, "learning_rate": 1.044231395294098e-05, "loss": 2.2968, "num_input_tokens_seen": 58300576, "step": 27915 }, { "epoch": 4.554694510155803, "grad_norm": 8.0625, "learning_rate": 1.0434333774473435e-05, "loss": 1.814, "num_input_tokens_seen": 58310160, "step": 27920 }, { "epoch": 4.555510237376621, "grad_norm": 6.09375, "learning_rate": 1.0426355842214657e-05, "loss": 1.8297, "num_input_tokens_seen": 58319808, "step": 27925 }, { "epoch": 4.5563259645974385, "grad_norm": 11.875, "learning_rate": 1.0418380157394963e-05, "loss": 1.9359, "num_input_tokens_seen": 58331360, "step": 27930 }, { "epoch": 4.557141691818256, "grad_norm": 3.125, "learning_rate": 1.0410406721244281e-05, "loss": 2.0197, "num_input_tokens_seen": 58341152, "step": 27935 }, { "epoch": 4.557957419039074, "grad_norm": 6.28125, "learning_rate": 1.0402435534992238e-05, "loss": 2.9594, "num_input_tokens_seen": 58351472, "step": 27940 }, { "epoch": 4.558773146259891, "grad_norm": 6.46875, "learning_rate": 1.0394466599868071e-05, "loss": 2.102, "num_input_tokens_seen": 58359664, "step": 27945 }, { "epoch": 4.559588873480708, "grad_norm": 1.796875, "learning_rate": 1.0386499917100697e-05, "loss": 2.8509, "num_input_tokens_seen": 58370112, "step": 27950 }, { "epoch": 4.560404600701525, "grad_norm": 6.4375, "learning_rate": 1.0378535487918692e-05, "loss": 2.457, "num_input_tokens_seen": 58380560, "step": 27955 }, { "epoch": 4.561220327922343, "grad_norm": 7.4375, "learning_rate": 1.037057331355025e-05, "loss": 5.1613, "num_input_tokens_seen": 58391968, "step": 27960 }, { "epoch": 4.56203605514316, "grad_norm": 4.9375, "learning_rate": 1.0362613395223247e-05, "loss": 2.9612, "num_input_tokens_seen": 58401984, "step": 27965 }, { "epoch": 4.562851782363977, "grad_norm": 4.9375, "learning_rate": 1.0354655734165212e-05, "loss": 1.8174, "num_input_tokens_seen": 58412048, "step": 27970 }, { "epoch": 4.5636675095847945, "grad_norm": 13.625, "learning_rate": 1.03467003316033e-05, "loss": 2.6338, "num_input_tokens_seen": 58422864, "step": 27975 }, { "epoch": 4.5644832368056125, "grad_norm": 4.46875, "learning_rate": 1.033874718876435e-05, "loss": 2.5256, "num_input_tokens_seen": 58433808, "step": 27980 }, { "epoch": 4.56529896402643, "grad_norm": 7.625, "learning_rate": 1.0330796306874818e-05, "loss": 2.8845, "num_input_tokens_seen": 58443376, "step": 27985 }, { "epoch": 4.566114691247247, "grad_norm": 4.96875, "learning_rate": 1.032284768716085e-05, "loss": 2.2115, "num_input_tokens_seen": 58453936, "step": 27990 }, { "epoch": 4.566930418468064, "grad_norm": 9.0, "learning_rate": 1.0314901330848206e-05, "loss": 1.7832, "num_input_tokens_seen": 58464416, "step": 27995 }, { "epoch": 4.567746145688882, "grad_norm": 12.125, "learning_rate": 1.030695723916233e-05, "loss": 3.3468, "num_input_tokens_seen": 58475488, "step": 28000 }, { "epoch": 4.567746145688882, "eval_loss": 2.545409679412842, "eval_runtime": 134.6944, "eval_samples_per_second": 20.231, "eval_steps_per_second": 10.119, "num_input_tokens_seen": 58475488, "step": 28000 }, { "epoch": 4.568561872909699, "grad_norm": 4.28125, "learning_rate": 1.0299015413328289e-05, "loss": 1.2479, "num_input_tokens_seen": 58484720, "step": 28005 }, { "epoch": 4.569377600130516, "grad_norm": 3.390625, "learning_rate": 1.0291075854570809e-05, "loss": 4.0472, "num_input_tokens_seen": 58495184, "step": 28010 }, { "epoch": 4.570193327351333, "grad_norm": 2.90625, "learning_rate": 1.0283138564114275e-05, "loss": 4.1318, "num_input_tokens_seen": 58506336, "step": 28015 }, { "epoch": 4.571009054572151, "grad_norm": 13.125, "learning_rate": 1.027520354318273e-05, "loss": 2.3654, "num_input_tokens_seen": 58516768, "step": 28020 }, { "epoch": 4.5718247817929685, "grad_norm": 6.09375, "learning_rate": 1.0267270792999828e-05, "loss": 2.7872, "num_input_tokens_seen": 58528272, "step": 28025 }, { "epoch": 4.572640509013786, "grad_norm": 12.4375, "learning_rate": 1.0259340314788919e-05, "loss": 2.4599, "num_input_tokens_seen": 58539232, "step": 28030 }, { "epoch": 4.573456236234604, "grad_norm": 2.328125, "learning_rate": 1.0251412109772979e-05, "loss": 1.3142, "num_input_tokens_seen": 58550400, "step": 28035 }, { "epoch": 4.574271963455421, "grad_norm": 8.8125, "learning_rate": 1.0243486179174627e-05, "loss": 1.3613, "num_input_tokens_seen": 58560288, "step": 28040 }, { "epoch": 4.575087690676238, "grad_norm": 10.25, "learning_rate": 1.0235562524216158e-05, "loss": 2.5354, "num_input_tokens_seen": 58570048, "step": 28045 }, { "epoch": 4.575903417897055, "grad_norm": 1.5390625, "learning_rate": 1.022764114611948e-05, "loss": 1.8604, "num_input_tokens_seen": 58580240, "step": 28050 }, { "epoch": 4.576719145117872, "grad_norm": 10.25, "learning_rate": 1.0219722046106178e-05, "loss": 2.3342, "num_input_tokens_seen": 58589200, "step": 28055 }, { "epoch": 4.57753487233869, "grad_norm": 14.5625, "learning_rate": 1.0211805225397486e-05, "loss": 3.5172, "num_input_tokens_seen": 58600768, "step": 28060 }, { "epoch": 4.578350599559507, "grad_norm": 5.59375, "learning_rate": 1.020389068521426e-05, "loss": 3.196, "num_input_tokens_seen": 58611376, "step": 28065 }, { "epoch": 4.579166326780324, "grad_norm": 7.3125, "learning_rate": 1.0195978426777039e-05, "loss": 1.9882, "num_input_tokens_seen": 58620496, "step": 28070 }, { "epoch": 4.5799820540011424, "grad_norm": 4.625, "learning_rate": 1.0188068451305982e-05, "loss": 2.1651, "num_input_tokens_seen": 58631264, "step": 28075 }, { "epoch": 4.58079778122196, "grad_norm": 3.40625, "learning_rate": 1.0180160760020902e-05, "loss": 2.1447, "num_input_tokens_seen": 58640272, "step": 28080 }, { "epoch": 4.581613508442777, "grad_norm": 2.625, "learning_rate": 1.0172255354141278e-05, "loss": 2.6813, "num_input_tokens_seen": 58649280, "step": 28085 }, { "epoch": 4.582429235663594, "grad_norm": 8.0, "learning_rate": 1.0164352234886205e-05, "loss": 2.1149, "num_input_tokens_seen": 58658960, "step": 28090 }, { "epoch": 4.583244962884411, "grad_norm": 2.546875, "learning_rate": 1.0156451403474454e-05, "loss": 1.4688, "num_input_tokens_seen": 58671856, "step": 28095 }, { "epoch": 4.584060690105229, "grad_norm": 11.25, "learning_rate": 1.0148552861124443e-05, "loss": 2.6753, "num_input_tokens_seen": 58681456, "step": 28100 }, { "epoch": 4.584876417326046, "grad_norm": 3.140625, "learning_rate": 1.0140656609054205e-05, "loss": 2.3744, "num_input_tokens_seen": 58692080, "step": 28105 }, { "epoch": 4.585692144546863, "grad_norm": 4.25, "learning_rate": 1.0132762648481455e-05, "loss": 2.0643, "num_input_tokens_seen": 58703024, "step": 28110 }, { "epoch": 4.586507871767681, "grad_norm": 3.984375, "learning_rate": 1.0124870980623543e-05, "loss": 2.5376, "num_input_tokens_seen": 58714960, "step": 28115 }, { "epoch": 4.587323598988498, "grad_norm": 6.84375, "learning_rate": 1.0116981606697453e-05, "loss": 1.5032, "num_input_tokens_seen": 58725920, "step": 28120 }, { "epoch": 4.5881393262093155, "grad_norm": 6.09375, "learning_rate": 1.0109094527919838e-05, "loss": 2.4988, "num_input_tokens_seen": 58737584, "step": 28125 }, { "epoch": 4.588955053430133, "grad_norm": 0.059814453125, "learning_rate": 1.010120974550697e-05, "loss": 2.5805, "num_input_tokens_seen": 58748016, "step": 28130 }, { "epoch": 4.589770780650951, "grad_norm": 7.34375, "learning_rate": 1.0093327260674795e-05, "loss": 3.4765, "num_input_tokens_seen": 58758800, "step": 28135 }, { "epoch": 4.590586507871768, "grad_norm": 11.9375, "learning_rate": 1.0085447074638878e-05, "loss": 2.0011, "num_input_tokens_seen": 58769904, "step": 28140 }, { "epoch": 4.591402235092585, "grad_norm": 15.25, "learning_rate": 1.0077569188614461e-05, "loss": 2.9371, "num_input_tokens_seen": 58779696, "step": 28145 }, { "epoch": 4.592217962313402, "grad_norm": 5.84375, "learning_rate": 1.0069693603816393e-05, "loss": 2.0271, "num_input_tokens_seen": 58790816, "step": 28150 }, { "epoch": 4.59303368953422, "grad_norm": 4.5, "learning_rate": 1.0061820321459204e-05, "loss": 2.6568, "num_input_tokens_seen": 58802208, "step": 28155 }, { "epoch": 4.593849416755037, "grad_norm": 5.96875, "learning_rate": 1.0053949342757038e-05, "loss": 2.2766, "num_input_tokens_seen": 58813168, "step": 28160 }, { "epoch": 4.594665143975854, "grad_norm": 4.96875, "learning_rate": 1.0046080668923717e-05, "loss": 4.1358, "num_input_tokens_seen": 58824880, "step": 28165 }, { "epoch": 4.5954808711966715, "grad_norm": 5.75, "learning_rate": 1.003821430117267e-05, "loss": 2.2151, "num_input_tokens_seen": 58836016, "step": 28170 }, { "epoch": 4.5962965984174895, "grad_norm": 8.625, "learning_rate": 1.0030350240716999e-05, "loss": 2.6984, "num_input_tokens_seen": 58846768, "step": 28175 }, { "epoch": 4.597112325638307, "grad_norm": 7.65625, "learning_rate": 1.0022488488769449e-05, "loss": 2.6146, "num_input_tokens_seen": 58856816, "step": 28180 }, { "epoch": 4.597928052859124, "grad_norm": 8.6875, "learning_rate": 1.0014629046542387e-05, "loss": 2.7373, "num_input_tokens_seen": 58867648, "step": 28185 }, { "epoch": 4.598743780079941, "grad_norm": 10.4375, "learning_rate": 1.0006771915247842e-05, "loss": 2.5584, "num_input_tokens_seen": 58877792, "step": 28190 }, { "epoch": 4.599559507300759, "grad_norm": 6.375, "learning_rate": 9.998917096097495e-06, "loss": 2.6661, "num_input_tokens_seen": 58888176, "step": 28195 }, { "epoch": 4.600375234521576, "grad_norm": 3.65625, "learning_rate": 9.991064590302638e-06, "loss": 3.1295, "num_input_tokens_seen": 58898896, "step": 28200 }, { "epoch": 4.600375234521576, "eval_loss": 2.5404255390167236, "eval_runtime": 134.8721, "eval_samples_per_second": 20.204, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 58898896, "step": 28200 }, { "epoch": 4.601190961742393, "grad_norm": 7.21875, "learning_rate": 9.983214399074241e-06, "loss": 1.968, "num_input_tokens_seen": 58909616, "step": 28205 }, { "epoch": 4.602006688963211, "grad_norm": 10.3125, "learning_rate": 9.975366523622893e-06, "loss": 2.455, "num_input_tokens_seen": 58919776, "step": 28210 }, { "epoch": 4.602822416184028, "grad_norm": 6.875, "learning_rate": 9.967520965158841e-06, "loss": 2.1679, "num_input_tokens_seen": 58930704, "step": 28215 }, { "epoch": 4.6036381434048455, "grad_norm": 8.5, "learning_rate": 9.95967772489197e-06, "loss": 2.1011, "num_input_tokens_seen": 58940608, "step": 28220 }, { "epoch": 4.604453870625663, "grad_norm": 2.203125, "learning_rate": 9.951836804031794e-06, "loss": 1.7826, "num_input_tokens_seen": 58952352, "step": 28225 }, { "epoch": 4.60526959784648, "grad_norm": 10.125, "learning_rate": 9.943998203787489e-06, "loss": 3.2396, "num_input_tokens_seen": 58962528, "step": 28230 }, { "epoch": 4.606085325067298, "grad_norm": 12.0, "learning_rate": 9.936161925367874e-06, "loss": 3.3115, "num_input_tokens_seen": 58971472, "step": 28235 }, { "epoch": 4.606901052288115, "grad_norm": 7.5, "learning_rate": 9.928327969981386e-06, "loss": 3.3945, "num_input_tokens_seen": 58982672, "step": 28240 }, { "epoch": 4.607716779508932, "grad_norm": 1.8125, "learning_rate": 9.920496338836135e-06, "loss": 2.9928, "num_input_tokens_seen": 58993904, "step": 28245 }, { "epoch": 4.60853250672975, "grad_norm": 2.890625, "learning_rate": 9.912667033139844e-06, "loss": 1.4599, "num_input_tokens_seen": 59004352, "step": 28250 }, { "epoch": 4.609348233950567, "grad_norm": 6.1875, "learning_rate": 9.904840054099893e-06, "loss": 2.5722, "num_input_tokens_seen": 59015344, "step": 28255 }, { "epoch": 4.610163961171384, "grad_norm": 4.15625, "learning_rate": 9.897015402923312e-06, "loss": 2.591, "num_input_tokens_seen": 59024640, "step": 28260 }, { "epoch": 4.610979688392201, "grad_norm": 3.734375, "learning_rate": 9.889193080816744e-06, "loss": 1.5616, "num_input_tokens_seen": 59034672, "step": 28265 }, { "epoch": 4.6117954156130185, "grad_norm": 10.0625, "learning_rate": 9.881373088986498e-06, "loss": 1.8154, "num_input_tokens_seen": 59044112, "step": 28270 }, { "epoch": 4.6126111428338366, "grad_norm": 6.53125, "learning_rate": 9.873555428638523e-06, "loss": 2.0809, "num_input_tokens_seen": 59053808, "step": 28275 }, { "epoch": 4.613426870054654, "grad_norm": 8.1875, "learning_rate": 9.865740100978383e-06, "loss": 3.9938, "num_input_tokens_seen": 59064640, "step": 28280 }, { "epoch": 4.614242597275471, "grad_norm": 10.5625, "learning_rate": 9.857927107211315e-06, "loss": 3.3046, "num_input_tokens_seen": 59076512, "step": 28285 }, { "epoch": 4.615058324496289, "grad_norm": 4.90625, "learning_rate": 9.850116448542177e-06, "loss": 2.0611, "num_input_tokens_seen": 59086432, "step": 28290 }, { "epoch": 4.615874051717106, "grad_norm": 10.1875, "learning_rate": 9.842308126175457e-06, "loss": 1.7429, "num_input_tokens_seen": 59096240, "step": 28295 }, { "epoch": 4.616689778937923, "grad_norm": 9.375, "learning_rate": 9.834502141315315e-06, "loss": 2.4445, "num_input_tokens_seen": 59106080, "step": 28300 }, { "epoch": 4.61750550615874, "grad_norm": 5.5625, "learning_rate": 9.82669849516552e-06, "loss": 2.6314, "num_input_tokens_seen": 59116816, "step": 28305 }, { "epoch": 4.618321233379558, "grad_norm": 4.46875, "learning_rate": 9.818897188929493e-06, "loss": 1.6128, "num_input_tokens_seen": 59128208, "step": 28310 }, { "epoch": 4.619136960600375, "grad_norm": 1.46875, "learning_rate": 9.811098223810309e-06, "loss": 1.9283, "num_input_tokens_seen": 59138208, "step": 28315 }, { "epoch": 4.6199526878211925, "grad_norm": 5.9375, "learning_rate": 9.803301601010641e-06, "loss": 2.3968, "num_input_tokens_seen": 59149728, "step": 28320 }, { "epoch": 4.62076841504201, "grad_norm": 4.75, "learning_rate": 9.795507321732853e-06, "loss": 1.5971, "num_input_tokens_seen": 59160272, "step": 28325 }, { "epoch": 4.621584142262828, "grad_norm": 10.1875, "learning_rate": 9.787715387178898e-06, "loss": 2.1268, "num_input_tokens_seen": 59172320, "step": 28330 }, { "epoch": 4.622399869483645, "grad_norm": 12.375, "learning_rate": 9.779925798550399e-06, "loss": 2.7148, "num_input_tokens_seen": 59182816, "step": 28335 }, { "epoch": 4.623215596704462, "grad_norm": 10.6875, "learning_rate": 9.772138557048619e-06, "loss": 2.5023, "num_input_tokens_seen": 59193328, "step": 28340 }, { "epoch": 4.624031323925279, "grad_norm": 4.1875, "learning_rate": 9.764353663874426e-06, "loss": 1.3306, "num_input_tokens_seen": 59203392, "step": 28345 }, { "epoch": 4.624847051146097, "grad_norm": 6.625, "learning_rate": 9.756571120228375e-06, "loss": 3.0225, "num_input_tokens_seen": 59212800, "step": 28350 }, { "epoch": 4.625662778366914, "grad_norm": 14.9375, "learning_rate": 9.748790927310605e-06, "loss": 3.9625, "num_input_tokens_seen": 59222464, "step": 28355 }, { "epoch": 4.626478505587731, "grad_norm": 12.8125, "learning_rate": 9.741013086320946e-06, "loss": 2.2866, "num_input_tokens_seen": 59233248, "step": 28360 }, { "epoch": 4.6272942328085485, "grad_norm": 4.75, "learning_rate": 9.733237598458821e-06, "loss": 2.2495, "num_input_tokens_seen": 59243392, "step": 28365 }, { "epoch": 4.6281099600293665, "grad_norm": 6.1875, "learning_rate": 9.725464464923308e-06, "loss": 3.1369, "num_input_tokens_seen": 59254432, "step": 28370 }, { "epoch": 4.628925687250184, "grad_norm": 9.9375, "learning_rate": 9.717693686913123e-06, "loss": 1.7599, "num_input_tokens_seen": 59265408, "step": 28375 }, { "epoch": 4.629741414471001, "grad_norm": 6.34375, "learning_rate": 9.709925265626632e-06, "loss": 2.4338, "num_input_tokens_seen": 59276224, "step": 28380 }, { "epoch": 4.630557141691818, "grad_norm": 6.0, "learning_rate": 9.702159202261801e-06, "loss": 1.6574, "num_input_tokens_seen": 59286560, "step": 28385 }, { "epoch": 4.631372868912636, "grad_norm": 7.34375, "learning_rate": 9.694395498016268e-06, "loss": 1.4097, "num_input_tokens_seen": 59297920, "step": 28390 }, { "epoch": 4.632188596133453, "grad_norm": 4.46875, "learning_rate": 9.686634154087298e-06, "loss": 1.3523, "num_input_tokens_seen": 59308704, "step": 28395 }, { "epoch": 4.63300432335427, "grad_norm": 2.09375, "learning_rate": 9.678875171671776e-06, "loss": 1.6093, "num_input_tokens_seen": 59318976, "step": 28400 }, { "epoch": 4.63300432335427, "eval_loss": 2.539818048477173, "eval_runtime": 134.8784, "eval_samples_per_second": 20.203, "eval_steps_per_second": 10.105, "num_input_tokens_seen": 59318976, "step": 28400 }, { "epoch": 4.633820050575087, "grad_norm": 8.1875, "learning_rate": 9.671118551966246e-06, "loss": 2.7529, "num_input_tokens_seen": 59328672, "step": 28405 }, { "epoch": 4.634635777795905, "grad_norm": 5.09375, "learning_rate": 9.66336429616686e-06, "loss": 2.3953, "num_input_tokens_seen": 59338848, "step": 28410 }, { "epoch": 4.635451505016722, "grad_norm": 1.8984375, "learning_rate": 9.655612405469436e-06, "loss": 3.4484, "num_input_tokens_seen": 59348848, "step": 28415 }, { "epoch": 4.63626723223754, "grad_norm": 3.0, "learning_rate": 9.647862881069413e-06, "loss": 1.2669, "num_input_tokens_seen": 59359568, "step": 28420 }, { "epoch": 4.637082959458358, "grad_norm": 7.75, "learning_rate": 9.640115724161855e-06, "loss": 2.1765, "num_input_tokens_seen": 59371504, "step": 28425 }, { "epoch": 4.637898686679175, "grad_norm": 3.21875, "learning_rate": 9.632370935941483e-06, "loss": 1.8536, "num_input_tokens_seen": 59381904, "step": 28430 }, { "epoch": 4.638714413899992, "grad_norm": 6.0625, "learning_rate": 9.624628517602634e-06, "loss": 3.1756, "num_input_tokens_seen": 59392896, "step": 28435 }, { "epoch": 4.639530141120809, "grad_norm": 10.3125, "learning_rate": 9.61688847033928e-06, "loss": 2.7083, "num_input_tokens_seen": 59403712, "step": 28440 }, { "epoch": 4.640345868341626, "grad_norm": 7.3125, "learning_rate": 9.609150795345051e-06, "loss": 1.4618, "num_input_tokens_seen": 59413664, "step": 28445 }, { "epoch": 4.641161595562444, "grad_norm": 7.21875, "learning_rate": 9.601415493813171e-06, "loss": 2.306, "num_input_tokens_seen": 59423408, "step": 28450 }, { "epoch": 4.641977322783261, "grad_norm": 10.1875, "learning_rate": 9.593682566936533e-06, "loss": 1.4951, "num_input_tokens_seen": 59433808, "step": 28455 }, { "epoch": 4.642793050004078, "grad_norm": 9.125, "learning_rate": 9.58595201590766e-06, "loss": 3.7171, "num_input_tokens_seen": 59442560, "step": 28460 }, { "epoch": 4.643608777224896, "grad_norm": 9.75, "learning_rate": 9.578223841918681e-06, "loss": 2.8257, "num_input_tokens_seen": 59453696, "step": 28465 }, { "epoch": 4.6444245044457135, "grad_norm": 3.84375, "learning_rate": 9.570498046161389e-06, "loss": 1.9643, "num_input_tokens_seen": 59464592, "step": 28470 }, { "epoch": 4.645240231666531, "grad_norm": 6.6875, "learning_rate": 9.562774629827206e-06, "loss": 1.8316, "num_input_tokens_seen": 59475568, "step": 28475 }, { "epoch": 4.646055958887348, "grad_norm": 9.5625, "learning_rate": 9.555053594107163e-06, "loss": 4.0345, "num_input_tokens_seen": 59486560, "step": 28480 }, { "epoch": 4.646871686108166, "grad_norm": 7.0625, "learning_rate": 9.547334940191957e-06, "loss": 2.0991, "num_input_tokens_seen": 59495312, "step": 28485 }, { "epoch": 4.647687413328983, "grad_norm": 14.5625, "learning_rate": 9.539618669271886e-06, "loss": 2.1454, "num_input_tokens_seen": 59505408, "step": 28490 }, { "epoch": 4.6485031405498, "grad_norm": 6.03125, "learning_rate": 9.531904782536904e-06, "loss": 1.3281, "num_input_tokens_seen": 59516096, "step": 28495 }, { "epoch": 4.649318867770617, "grad_norm": 4.71875, "learning_rate": 9.524193281176597e-06, "loss": 1.9429, "num_input_tokens_seen": 59527264, "step": 28500 }, { "epoch": 4.650134594991435, "grad_norm": 5.34375, "learning_rate": 9.516484166380165e-06, "loss": 1.2131, "num_input_tokens_seen": 59537808, "step": 28505 }, { "epoch": 4.650950322212252, "grad_norm": 8.375, "learning_rate": 9.508777439336447e-06, "loss": 2.1242, "num_input_tokens_seen": 59549408, "step": 28510 }, { "epoch": 4.6517660494330695, "grad_norm": 4.71875, "learning_rate": 9.50107310123393e-06, "loss": 2.0554, "num_input_tokens_seen": 59560496, "step": 28515 }, { "epoch": 4.652581776653887, "grad_norm": 9.8125, "learning_rate": 9.493371153260702e-06, "loss": 2.671, "num_input_tokens_seen": 59570624, "step": 28520 }, { "epoch": 4.653397503874705, "grad_norm": 0.07666015625, "learning_rate": 9.485671596604523e-06, "loss": 2.5152, "num_input_tokens_seen": 59580192, "step": 28525 }, { "epoch": 4.654213231095522, "grad_norm": 6.90625, "learning_rate": 9.477974432452738e-06, "loss": 3.4834, "num_input_tokens_seen": 59590752, "step": 28530 }, { "epoch": 4.655028958316339, "grad_norm": 4.1875, "learning_rate": 9.470279661992356e-06, "loss": 1.4426, "num_input_tokens_seen": 59601440, "step": 28535 }, { "epoch": 4.655844685537156, "grad_norm": 3.9375, "learning_rate": 9.462587286410021e-06, "loss": 1.2188, "num_input_tokens_seen": 59611824, "step": 28540 }, { "epoch": 4.656660412757974, "grad_norm": 5.65625, "learning_rate": 9.454897306891972e-06, "loss": 2.1165, "num_input_tokens_seen": 59622880, "step": 28545 }, { "epoch": 4.657476139978791, "grad_norm": 10.75, "learning_rate": 9.44720972462411e-06, "loss": 3.6317, "num_input_tokens_seen": 59633904, "step": 28550 }, { "epoch": 4.658291867199608, "grad_norm": 14.0, "learning_rate": 9.439524540791964e-06, "loss": 3.1657, "num_input_tokens_seen": 59644992, "step": 28555 }, { "epoch": 4.659107594420425, "grad_norm": 11.0, "learning_rate": 9.431841756580673e-06, "loss": 3.0301, "num_input_tokens_seen": 59655392, "step": 28560 }, { "epoch": 4.6599233216412435, "grad_norm": 12.125, "learning_rate": 9.42416137317503e-06, "loss": 2.9359, "num_input_tokens_seen": 59665728, "step": 28565 }, { "epoch": 4.660739048862061, "grad_norm": 5.03125, "learning_rate": 9.416483391759437e-06, "loss": 2.0589, "num_input_tokens_seen": 59676656, "step": 28570 }, { "epoch": 4.661554776082878, "grad_norm": 7.65625, "learning_rate": 9.408807813517945e-06, "loss": 3.3369, "num_input_tokens_seen": 59686800, "step": 28575 }, { "epoch": 4.662370503303695, "grad_norm": 4.75, "learning_rate": 9.401134639634221e-06, "loss": 1.7582, "num_input_tokens_seen": 59696768, "step": 28580 }, { "epoch": 4.663186230524513, "grad_norm": 9.0, "learning_rate": 9.393463871291555e-06, "loss": 2.3603, "num_input_tokens_seen": 59707760, "step": 28585 }, { "epoch": 4.66400195774533, "grad_norm": 7.25, "learning_rate": 9.385795509672881e-06, "loss": 2.195, "num_input_tokens_seen": 59717952, "step": 28590 }, { "epoch": 4.664817684966147, "grad_norm": 0.29296875, "learning_rate": 9.378129555960771e-06, "loss": 2.0939, "num_input_tokens_seen": 59728832, "step": 28595 }, { "epoch": 4.665633412186965, "grad_norm": 13.375, "learning_rate": 9.370466011337392e-06, "loss": 1.9266, "num_input_tokens_seen": 59739680, "step": 28600 }, { "epoch": 4.665633412186965, "eval_loss": 2.5382657051086426, "eval_runtime": 135.0974, "eval_samples_per_second": 20.171, "eval_steps_per_second": 10.089, "num_input_tokens_seen": 59739680, "step": 28600 }, { "epoch": 4.666449139407782, "grad_norm": 9.25, "learning_rate": 9.362804876984573e-06, "loss": 2.6995, "num_input_tokens_seen": 59751520, "step": 28605 }, { "epoch": 4.667264866628599, "grad_norm": 3.71875, "learning_rate": 9.355146154083747e-06, "loss": 3.2112, "num_input_tokens_seen": 59762512, "step": 28610 }, { "epoch": 4.6680805938494165, "grad_norm": 7.59375, "learning_rate": 9.347489843815987e-06, "loss": 1.9752, "num_input_tokens_seen": 59773120, "step": 28615 }, { "epoch": 4.668896321070234, "grad_norm": 6.65625, "learning_rate": 9.339835947362002e-06, "loss": 2.55, "num_input_tokens_seen": 59783056, "step": 28620 }, { "epoch": 4.669712048291052, "grad_norm": 6.15625, "learning_rate": 9.332184465902105e-06, "loss": 2.4322, "num_input_tokens_seen": 59793744, "step": 28625 }, { "epoch": 4.670527775511869, "grad_norm": 4.34375, "learning_rate": 9.324535400616266e-06, "loss": 1.9307, "num_input_tokens_seen": 59804752, "step": 28630 }, { "epoch": 4.671343502732686, "grad_norm": 6.21875, "learning_rate": 9.31688875268405e-06, "loss": 4.8734, "num_input_tokens_seen": 59815392, "step": 28635 }, { "epoch": 4.672159229953504, "grad_norm": 6.8125, "learning_rate": 9.309244523284674e-06, "loss": 2.2389, "num_input_tokens_seen": 59826784, "step": 28640 }, { "epoch": 4.672974957174321, "grad_norm": 6.3125, "learning_rate": 9.301602713596982e-06, "loss": 2.1255, "num_input_tokens_seen": 59836560, "step": 28645 }, { "epoch": 4.673790684395138, "grad_norm": 4.15625, "learning_rate": 9.293963324799432e-06, "loss": 2.0453, "num_input_tokens_seen": 59846848, "step": 28650 }, { "epoch": 4.674606411615955, "grad_norm": 9.5, "learning_rate": 9.286326358070104e-06, "loss": 2.6656, "num_input_tokens_seen": 59857152, "step": 28655 }, { "epoch": 4.6754221388367725, "grad_norm": 7.375, "learning_rate": 9.278691814586729e-06, "loss": 3.6563, "num_input_tokens_seen": 59867328, "step": 28660 }, { "epoch": 4.6762378660575905, "grad_norm": 11.625, "learning_rate": 9.271059695526635e-06, "loss": 5.2712, "num_input_tokens_seen": 59877776, "step": 28665 }, { "epoch": 4.677053593278408, "grad_norm": 7.0625, "learning_rate": 9.263430002066805e-06, "loss": 1.384, "num_input_tokens_seen": 59887696, "step": 28670 }, { "epoch": 4.677869320499225, "grad_norm": 11.8125, "learning_rate": 9.25580273538382e-06, "loss": 3.4245, "num_input_tokens_seen": 59897952, "step": 28675 }, { "epoch": 4.678685047720043, "grad_norm": 7.46875, "learning_rate": 9.248177896653907e-06, "loss": 2.0009, "num_input_tokens_seen": 59908192, "step": 28680 }, { "epoch": 4.67950077494086, "grad_norm": 6.03125, "learning_rate": 9.240555487052918e-06, "loss": 2.277, "num_input_tokens_seen": 59918096, "step": 28685 }, { "epoch": 4.680316502161677, "grad_norm": 4.5625, "learning_rate": 9.232935507756313e-06, "loss": 2.2817, "num_input_tokens_seen": 59928672, "step": 28690 }, { "epoch": 4.681132229382494, "grad_norm": 2.671875, "learning_rate": 9.225317959939193e-06, "loss": 1.9533, "num_input_tokens_seen": 59938976, "step": 28695 }, { "epoch": 4.681947956603312, "grad_norm": 4.9375, "learning_rate": 9.217702844776287e-06, "loss": 1.4298, "num_input_tokens_seen": 59948224, "step": 28700 }, { "epoch": 4.682763683824129, "grad_norm": 6.46875, "learning_rate": 9.210090163441929e-06, "loss": 2.7252, "num_input_tokens_seen": 59956928, "step": 28705 }, { "epoch": 4.6835794110449465, "grad_norm": 7.9375, "learning_rate": 9.202479917110105e-06, "loss": 2.6277, "num_input_tokens_seen": 59967552, "step": 28710 }, { "epoch": 4.684395138265764, "grad_norm": 6.09375, "learning_rate": 9.194872106954392e-06, "loss": 2.2262, "num_input_tokens_seen": 59979072, "step": 28715 }, { "epoch": 4.685210865486582, "grad_norm": 4.8125, "learning_rate": 9.187266734148029e-06, "loss": 1.8501, "num_input_tokens_seen": 59989504, "step": 28720 }, { "epoch": 4.686026592707399, "grad_norm": 8.875, "learning_rate": 9.179663799863849e-06, "loss": 2.8195, "num_input_tokens_seen": 59999856, "step": 28725 }, { "epoch": 4.686842319928216, "grad_norm": 4.8125, "learning_rate": 9.172063305274317e-06, "loss": 2.5312, "num_input_tokens_seen": 60011200, "step": 28730 }, { "epoch": 4.687658047149033, "grad_norm": 4.25, "learning_rate": 9.164465251551527e-06, "loss": 4.282, "num_input_tokens_seen": 60021824, "step": 28735 }, { "epoch": 4.688473774369851, "grad_norm": 8.125, "learning_rate": 9.156869639867205e-06, "loss": 1.9375, "num_input_tokens_seen": 60031136, "step": 28740 }, { "epoch": 4.689289501590668, "grad_norm": 3.5, "learning_rate": 9.149276471392677e-06, "loss": 1.5486, "num_input_tokens_seen": 60041344, "step": 28745 }, { "epoch": 4.690105228811485, "grad_norm": 6.8125, "learning_rate": 9.141685747298914e-06, "loss": 2.6044, "num_input_tokens_seen": 60052272, "step": 28750 }, { "epoch": 4.690920956032302, "grad_norm": 7.03125, "learning_rate": 9.13409746875649e-06, "loss": 2.8864, "num_input_tokens_seen": 60063376, "step": 28755 }, { "epoch": 4.69173668325312, "grad_norm": 1.953125, "learning_rate": 9.12651163693562e-06, "loss": 2.2091, "num_input_tokens_seen": 60073776, "step": 28760 }, { "epoch": 4.692552410473938, "grad_norm": 14.6875, "learning_rate": 9.11892825300614e-06, "loss": 3.2949, "num_input_tokens_seen": 60084032, "step": 28765 }, { "epoch": 4.693368137694755, "grad_norm": 3.28125, "learning_rate": 9.111347318137491e-06, "loss": 2.9996, "num_input_tokens_seen": 60094944, "step": 28770 }, { "epoch": 4.694183864915573, "grad_norm": 2.15625, "learning_rate": 9.103768833498755e-06, "loss": 2.9596, "num_input_tokens_seen": 60106592, "step": 28775 }, { "epoch": 4.69499959213639, "grad_norm": 2.375, "learning_rate": 9.096192800258639e-06, "loss": 2.5903, "num_input_tokens_seen": 60116976, "step": 28780 }, { "epoch": 4.695815319357207, "grad_norm": 7.28125, "learning_rate": 9.088619219585443e-06, "loss": 1.9037, "num_input_tokens_seen": 60128064, "step": 28785 }, { "epoch": 4.696631046578024, "grad_norm": 10.9375, "learning_rate": 9.081048092647127e-06, "loss": 2.7493, "num_input_tokens_seen": 60139104, "step": 28790 }, { "epoch": 4.697446773798841, "grad_norm": 9.0, "learning_rate": 9.073479420611245e-06, "loss": 1.703, "num_input_tokens_seen": 60148912, "step": 28795 }, { "epoch": 4.698262501019659, "grad_norm": 14.6875, "learning_rate": 9.065913204644974e-06, "loss": 2.7749, "num_input_tokens_seen": 60159984, "step": 28800 }, { "epoch": 4.698262501019659, "eval_loss": 2.548642873764038, "eval_runtime": 134.8455, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 60159984, "step": 28800 }, { "epoch": 4.699078228240476, "grad_norm": 5.90625, "learning_rate": 9.058349445915135e-06, "loss": 1.5363, "num_input_tokens_seen": 60169904, "step": 28805 }, { "epoch": 4.6998939554612935, "grad_norm": 9.5, "learning_rate": 9.050788145588138e-06, "loss": 2.4724, "num_input_tokens_seen": 60181232, "step": 28810 }, { "epoch": 4.7007096826821115, "grad_norm": 4.71875, "learning_rate": 9.043229304830039e-06, "loss": 2.4214, "num_input_tokens_seen": 60192144, "step": 28815 }, { "epoch": 4.701525409902929, "grad_norm": 15.625, "learning_rate": 9.035672924806515e-06, "loss": 3.5206, "num_input_tokens_seen": 60202992, "step": 28820 }, { "epoch": 4.702341137123746, "grad_norm": 0.04150390625, "learning_rate": 9.028119006682839e-06, "loss": 3.0826, "num_input_tokens_seen": 60212768, "step": 28825 }, { "epoch": 4.703156864344563, "grad_norm": 9.8125, "learning_rate": 9.020567551623935e-06, "loss": 2.9875, "num_input_tokens_seen": 60224016, "step": 28830 }, { "epoch": 4.70397259156538, "grad_norm": 5.4375, "learning_rate": 9.013018560794318e-06, "loss": 2.3459, "num_input_tokens_seen": 60234128, "step": 28835 }, { "epoch": 4.704788318786198, "grad_norm": 10.8125, "learning_rate": 9.005472035358139e-06, "loss": 2.9559, "num_input_tokens_seen": 60244272, "step": 28840 }, { "epoch": 4.705604046007015, "grad_norm": 5.34375, "learning_rate": 8.997927976479185e-06, "loss": 2.1347, "num_input_tokens_seen": 60255376, "step": 28845 }, { "epoch": 4.706419773227832, "grad_norm": 9.6875, "learning_rate": 8.99038638532082e-06, "loss": 1.9097, "num_input_tokens_seen": 60265616, "step": 28850 }, { "epoch": 4.70723550044865, "grad_norm": 4.96875, "learning_rate": 8.982847263046065e-06, "loss": 1.4363, "num_input_tokens_seen": 60276656, "step": 28855 }, { "epoch": 4.7080512276694675, "grad_norm": 12.25, "learning_rate": 8.975310610817555e-06, "loss": 2.7443, "num_input_tokens_seen": 60286624, "step": 28860 }, { "epoch": 4.708866954890285, "grad_norm": 8.0625, "learning_rate": 8.967776429797528e-06, "loss": 1.5421, "num_input_tokens_seen": 60297120, "step": 28865 }, { "epoch": 4.709682682111102, "grad_norm": 8.0, "learning_rate": 8.960244721147842e-06, "loss": 1.9063, "num_input_tokens_seen": 60309456, "step": 28870 }, { "epoch": 4.71049840933192, "grad_norm": 5.84375, "learning_rate": 8.952715486029995e-06, "loss": 2.4174, "num_input_tokens_seen": 60319680, "step": 28875 }, { "epoch": 4.711314136552737, "grad_norm": 8.0625, "learning_rate": 8.945188725605075e-06, "loss": 2.0819, "num_input_tokens_seen": 60331120, "step": 28880 }, { "epoch": 4.712129863773554, "grad_norm": 10.5625, "learning_rate": 8.937664441033817e-06, "loss": 2.2217, "num_input_tokens_seen": 60340560, "step": 28885 }, { "epoch": 4.712945590994371, "grad_norm": 11.125, "learning_rate": 8.930142633476549e-06, "loss": 2.3769, "num_input_tokens_seen": 60349776, "step": 28890 }, { "epoch": 4.713761318215189, "grad_norm": 4.71875, "learning_rate": 8.92262330409323e-06, "loss": 2.496, "num_input_tokens_seen": 60359824, "step": 28895 }, { "epoch": 4.714577045436006, "grad_norm": 4.65625, "learning_rate": 8.915106454043448e-06, "loss": 1.6167, "num_input_tokens_seen": 60369952, "step": 28900 }, { "epoch": 4.715392772656823, "grad_norm": 6.1875, "learning_rate": 8.90759208448638e-06, "loss": 2.4359, "num_input_tokens_seen": 60380368, "step": 28905 }, { "epoch": 4.716208499877641, "grad_norm": 9.3125, "learning_rate": 8.900080196580848e-06, "loss": 3.6153, "num_input_tokens_seen": 60390400, "step": 28910 }, { "epoch": 4.717024227098459, "grad_norm": 8.75, "learning_rate": 8.892570791485267e-06, "loss": 1.9987, "num_input_tokens_seen": 60402256, "step": 28915 }, { "epoch": 4.717839954319276, "grad_norm": 6.1875, "learning_rate": 8.885063870357688e-06, "loss": 2.3064, "num_input_tokens_seen": 60412128, "step": 28920 }, { "epoch": 4.718655681540093, "grad_norm": 6.03125, "learning_rate": 8.87755943435578e-06, "loss": 2.5007, "num_input_tokens_seen": 60421648, "step": 28925 }, { "epoch": 4.71947140876091, "grad_norm": 8.5, "learning_rate": 8.87005748463681e-06, "loss": 1.904, "num_input_tokens_seen": 60432384, "step": 28930 }, { "epoch": 4.720287135981728, "grad_norm": 8.9375, "learning_rate": 8.862558022357681e-06, "loss": 2.3383, "num_input_tokens_seen": 60441088, "step": 28935 }, { "epoch": 4.721102863202545, "grad_norm": 8.625, "learning_rate": 8.855061048674903e-06, "loss": 3.0606, "num_input_tokens_seen": 60451392, "step": 28940 }, { "epoch": 4.721918590423362, "grad_norm": 7.375, "learning_rate": 8.847566564744595e-06, "loss": 3.7979, "num_input_tokens_seen": 60461024, "step": 28945 }, { "epoch": 4.72273431764418, "grad_norm": 2.90625, "learning_rate": 8.840074571722512e-06, "loss": 1.8077, "num_input_tokens_seen": 60471680, "step": 28950 }, { "epoch": 4.723550044864997, "grad_norm": 7.4375, "learning_rate": 8.832585070764002e-06, "loss": 3.1042, "num_input_tokens_seen": 60483488, "step": 28955 }, { "epoch": 4.7243657720858145, "grad_norm": 5.53125, "learning_rate": 8.825098063024045e-06, "loss": 1.3728, "num_input_tokens_seen": 60494080, "step": 28960 }, { "epoch": 4.725181499306632, "grad_norm": 6.96875, "learning_rate": 8.817613549657244e-06, "loss": 3.4277, "num_input_tokens_seen": 60504592, "step": 28965 }, { "epoch": 4.725997226527449, "grad_norm": 6.6875, "learning_rate": 8.810131531817783e-06, "loss": 2.8987, "num_input_tokens_seen": 60515984, "step": 28970 }, { "epoch": 4.726812953748267, "grad_norm": 5.125, "learning_rate": 8.802652010659496e-06, "loss": 2.2613, "num_input_tokens_seen": 60526352, "step": 28975 }, { "epoch": 4.727628680969084, "grad_norm": 9.875, "learning_rate": 8.795174987335827e-06, "loss": 2.1431, "num_input_tokens_seen": 60535920, "step": 28980 }, { "epoch": 4.728444408189901, "grad_norm": 6.125, "learning_rate": 8.787700462999807e-06, "loss": 1.9851, "num_input_tokens_seen": 60546976, "step": 28985 }, { "epoch": 4.729260135410719, "grad_norm": 5.625, "learning_rate": 8.780228438804122e-06, "loss": 1.8261, "num_input_tokens_seen": 60557520, "step": 28990 }, { "epoch": 4.730075862631536, "grad_norm": 5.15625, "learning_rate": 8.772758915901032e-06, "loss": 4.3183, "num_input_tokens_seen": 60568208, "step": 28995 }, { "epoch": 4.730891589852353, "grad_norm": 5.34375, "learning_rate": 8.765291895442443e-06, "loss": 2.3452, "num_input_tokens_seen": 60579344, "step": 29000 }, { "epoch": 4.730891589852353, "eval_loss": 2.53888201713562, "eval_runtime": 134.8359, "eval_samples_per_second": 20.21, "eval_steps_per_second": 10.109, "num_input_tokens_seen": 60579344, "step": 29000 }, { "epoch": 4.7317073170731705, "grad_norm": 5.6875, "learning_rate": 8.75782737857987e-06, "loss": 1.9156, "num_input_tokens_seen": 60590496, "step": 29005 }, { "epoch": 4.732523044293988, "grad_norm": 7.78125, "learning_rate": 8.750365366464425e-06, "loss": 3.7307, "num_input_tokens_seen": 60600928, "step": 29010 }, { "epoch": 4.733338771514806, "grad_norm": 10.0, "learning_rate": 8.742905860246838e-06, "loss": 2.5871, "num_input_tokens_seen": 60609712, "step": 29015 }, { "epoch": 4.734154498735623, "grad_norm": 5.0625, "learning_rate": 8.735448861077478e-06, "loss": 2.4364, "num_input_tokens_seen": 60620160, "step": 29020 }, { "epoch": 4.73497022595644, "grad_norm": 6.09375, "learning_rate": 8.727994370106288e-06, "loss": 3.3468, "num_input_tokens_seen": 60630288, "step": 29025 }, { "epoch": 4.735785953177258, "grad_norm": 1.5859375, "learning_rate": 8.720542388482861e-06, "loss": 1.0243, "num_input_tokens_seen": 60640128, "step": 29030 }, { "epoch": 4.736601680398075, "grad_norm": 3.125, "learning_rate": 8.71309291735637e-06, "loss": 2.5953, "num_input_tokens_seen": 60649488, "step": 29035 }, { "epoch": 4.737417407618892, "grad_norm": 0.40234375, "learning_rate": 8.705645957875621e-06, "loss": 1.3528, "num_input_tokens_seen": 60660448, "step": 29040 }, { "epoch": 4.738233134839709, "grad_norm": 8.0625, "learning_rate": 8.698201511189048e-06, "loss": 2.8239, "num_input_tokens_seen": 60671168, "step": 29045 }, { "epoch": 4.739048862060527, "grad_norm": 5.0, "learning_rate": 8.690759578444649e-06, "loss": 2.2809, "num_input_tokens_seen": 60679776, "step": 29050 }, { "epoch": 4.7398645892813445, "grad_norm": 11.9375, "learning_rate": 8.68332016079008e-06, "loss": 2.9079, "num_input_tokens_seen": 60689792, "step": 29055 }, { "epoch": 4.740680316502162, "grad_norm": 7.3125, "learning_rate": 8.6758832593726e-06, "loss": 2.9563, "num_input_tokens_seen": 60701568, "step": 29060 }, { "epoch": 4.741496043722979, "grad_norm": 1.0703125, "learning_rate": 8.668448875339053e-06, "loss": 2.2458, "num_input_tokens_seen": 60711200, "step": 29065 }, { "epoch": 4.742311770943797, "grad_norm": 7.28125, "learning_rate": 8.661017009835933e-06, "loss": 2.1967, "num_input_tokens_seen": 60721456, "step": 29070 }, { "epoch": 4.743127498164614, "grad_norm": 6.8125, "learning_rate": 8.653587664009311e-06, "loss": 2.2493, "num_input_tokens_seen": 60730432, "step": 29075 }, { "epoch": 4.743943225385431, "grad_norm": 15.9375, "learning_rate": 8.646160839004902e-06, "loss": 3.0611, "num_input_tokens_seen": 60740768, "step": 29080 }, { "epoch": 4.744758952606248, "grad_norm": 7.375, "learning_rate": 8.638736535967998e-06, "loss": 1.5774, "num_input_tokens_seen": 60751680, "step": 29085 }, { "epoch": 4.745574679827066, "grad_norm": 12.1875, "learning_rate": 8.631314756043535e-06, "loss": 2.3536, "num_input_tokens_seen": 60762928, "step": 29090 }, { "epoch": 4.746390407047883, "grad_norm": 5.78125, "learning_rate": 8.62389550037603e-06, "loss": 1.4272, "num_input_tokens_seen": 60772320, "step": 29095 }, { "epoch": 4.7472061342687, "grad_norm": 5.59375, "learning_rate": 8.616478770109646e-06, "loss": 2.0383, "num_input_tokens_seen": 60782304, "step": 29100 }, { "epoch": 4.7480218614895175, "grad_norm": 6.71875, "learning_rate": 8.609064566388111e-06, "loss": 1.3792, "num_input_tokens_seen": 60793824, "step": 29105 }, { "epoch": 4.748837588710336, "grad_norm": 7.34375, "learning_rate": 8.601652890354815e-06, "loss": 2.6113, "num_input_tokens_seen": 60803936, "step": 29110 }, { "epoch": 4.749653315931153, "grad_norm": 4.34375, "learning_rate": 8.594243743152705e-06, "loss": 1.39, "num_input_tokens_seen": 60814064, "step": 29115 }, { "epoch": 4.75046904315197, "grad_norm": 8.4375, "learning_rate": 8.58683712592438e-06, "loss": 3.6088, "num_input_tokens_seen": 60825728, "step": 29120 }, { "epoch": 4.751284770372787, "grad_norm": 5.78125, "learning_rate": 8.579433039812037e-06, "loss": 3.157, "num_input_tokens_seen": 60836592, "step": 29125 }, { "epoch": 4.752100497593605, "grad_norm": 5.0625, "learning_rate": 8.572031485957466e-06, "loss": 2.4529, "num_input_tokens_seen": 60846224, "step": 29130 }, { "epoch": 4.752916224814422, "grad_norm": 5.40625, "learning_rate": 8.564632465502084e-06, "loss": 2.7171, "num_input_tokens_seen": 60856576, "step": 29135 }, { "epoch": 4.753731952035239, "grad_norm": 2.578125, "learning_rate": 8.557235979586928e-06, "loss": 1.907, "num_input_tokens_seen": 60866896, "step": 29140 }, { "epoch": 4.754547679256056, "grad_norm": 6.59375, "learning_rate": 8.549842029352606e-06, "loss": 2.772, "num_input_tokens_seen": 60878880, "step": 29145 }, { "epoch": 4.755363406476874, "grad_norm": 2.859375, "learning_rate": 8.542450615939376e-06, "loss": 1.6168, "num_input_tokens_seen": 60890768, "step": 29150 }, { "epoch": 4.7561791336976915, "grad_norm": 13.5, "learning_rate": 8.535061740487082e-06, "loss": 2.4291, "num_input_tokens_seen": 60900944, "step": 29155 }, { "epoch": 4.756994860918509, "grad_norm": 9.8125, "learning_rate": 8.527675404135168e-06, "loss": 1.6582, "num_input_tokens_seen": 60911200, "step": 29160 }, { "epoch": 4.757810588139327, "grad_norm": 3.65625, "learning_rate": 8.520291608022724e-06, "loss": 1.7639, "num_input_tokens_seen": 60919824, "step": 29165 }, { "epoch": 4.758626315360144, "grad_norm": 5.15625, "learning_rate": 8.512910353288398e-06, "loss": 1.6367, "num_input_tokens_seen": 60930640, "step": 29170 }, { "epoch": 4.759442042580961, "grad_norm": 3.921875, "learning_rate": 8.505531641070486e-06, "loss": 2.7253, "num_input_tokens_seen": 60941984, "step": 29175 }, { "epoch": 4.760257769801778, "grad_norm": 5.75, "learning_rate": 8.498155472506885e-06, "loss": 1.9723, "num_input_tokens_seen": 60952032, "step": 29180 }, { "epoch": 4.761073497022595, "grad_norm": 8.0625, "learning_rate": 8.49078184873508e-06, "loss": 2.5172, "num_input_tokens_seen": 60962800, "step": 29185 }, { "epoch": 4.761889224243413, "grad_norm": 5.0, "learning_rate": 8.483410770892188e-06, "loss": 3.5589, "num_input_tokens_seen": 60972320, "step": 29190 }, { "epoch": 4.76270495146423, "grad_norm": 3.234375, "learning_rate": 8.476042240114909e-06, "loss": 2.6208, "num_input_tokens_seen": 60983072, "step": 29195 }, { "epoch": 4.7635206786850475, "grad_norm": 9.6875, "learning_rate": 8.468676257539568e-06, "loss": 3.2174, "num_input_tokens_seen": 60992736, "step": 29200 }, { "epoch": 4.7635206786850475, "eval_loss": 2.533663034439087, "eval_runtime": 135.6807, "eval_samples_per_second": 20.084, "eval_steps_per_second": 10.046, "num_input_tokens_seen": 60992736, "step": 29200 }, { "epoch": 4.7643364059058655, "grad_norm": 5.90625, "learning_rate": 8.4613128243021e-06, "loss": 1.4508, "num_input_tokens_seen": 61003536, "step": 29205 }, { "epoch": 4.765152133126683, "grad_norm": 3.96875, "learning_rate": 8.453951941538028e-06, "loss": 2.5904, "num_input_tokens_seen": 61013056, "step": 29210 }, { "epoch": 4.7659678603475, "grad_norm": 6.21875, "learning_rate": 8.446593610382495e-06, "loss": 3.1013, "num_input_tokens_seen": 61022816, "step": 29215 }, { "epoch": 4.766783587568317, "grad_norm": 10.75, "learning_rate": 8.439237831970259e-06, "loss": 1.8063, "num_input_tokens_seen": 61033536, "step": 29220 }, { "epoch": 4.767599314789135, "grad_norm": 10.8125, "learning_rate": 8.431884607435667e-06, "loss": 3.6055, "num_input_tokens_seen": 61043920, "step": 29225 }, { "epoch": 4.768415042009952, "grad_norm": 15.1875, "learning_rate": 8.424533937912665e-06, "loss": 1.6303, "num_input_tokens_seen": 61053872, "step": 29230 }, { "epoch": 4.769230769230769, "grad_norm": 8.6875, "learning_rate": 8.41718582453484e-06, "loss": 3.2607, "num_input_tokens_seen": 61065536, "step": 29235 }, { "epoch": 4.770046496451586, "grad_norm": 6.21875, "learning_rate": 8.409840268435346e-06, "loss": 2.2722, "num_input_tokens_seen": 61075280, "step": 29240 }, { "epoch": 4.770862223672404, "grad_norm": 11.6875, "learning_rate": 8.402497270746976e-06, "loss": 3.0279, "num_input_tokens_seen": 61085776, "step": 29245 }, { "epoch": 4.771677950893221, "grad_norm": 11.625, "learning_rate": 8.395156832602095e-06, "loss": 2.3957, "num_input_tokens_seen": 61095328, "step": 29250 }, { "epoch": 4.772493678114039, "grad_norm": 8.375, "learning_rate": 8.387818955132707e-06, "loss": 2.5582, "num_input_tokens_seen": 61105744, "step": 29255 }, { "epoch": 4.773309405334856, "grad_norm": 7.09375, "learning_rate": 8.38048363947039e-06, "loss": 2.2532, "num_input_tokens_seen": 61115200, "step": 29260 }, { "epoch": 4.774125132555674, "grad_norm": 1.0, "learning_rate": 8.373150886746351e-06, "loss": 1.7752, "num_input_tokens_seen": 61125024, "step": 29265 }, { "epoch": 4.774940859776491, "grad_norm": 0.69140625, "learning_rate": 8.365820698091397e-06, "loss": 1.7667, "num_input_tokens_seen": 61136960, "step": 29270 }, { "epoch": 4.775756586997308, "grad_norm": 10.9375, "learning_rate": 8.358493074635922e-06, "loss": 2.0961, "num_input_tokens_seen": 61147888, "step": 29275 }, { "epoch": 4.776572314218125, "grad_norm": 13.25, "learning_rate": 8.351168017509948e-06, "loss": 1.7394, "num_input_tokens_seen": 61159184, "step": 29280 }, { "epoch": 4.777388041438943, "grad_norm": 7.65625, "learning_rate": 8.343845527843094e-06, "loss": 2.5913, "num_input_tokens_seen": 61169680, "step": 29285 }, { "epoch": 4.77820376865976, "grad_norm": 6.3125, "learning_rate": 8.336525606764566e-06, "loss": 2.5112, "num_input_tokens_seen": 61179376, "step": 29290 }, { "epoch": 4.779019495880577, "grad_norm": 6.0, "learning_rate": 8.329208255403204e-06, "loss": 1.542, "num_input_tokens_seen": 61189840, "step": 29295 }, { "epoch": 4.7798352231013945, "grad_norm": 9.0, "learning_rate": 8.321893474887426e-06, "loss": 2.6158, "num_input_tokens_seen": 61199856, "step": 29300 }, { "epoch": 4.7806509503222125, "grad_norm": 9.375, "learning_rate": 8.31458126634526e-06, "loss": 3.2177, "num_input_tokens_seen": 61209936, "step": 29305 }, { "epoch": 4.78146667754303, "grad_norm": 8.75, "learning_rate": 8.30727163090435e-06, "loss": 3.0107, "num_input_tokens_seen": 61220880, "step": 29310 }, { "epoch": 4.782282404763847, "grad_norm": 9.0, "learning_rate": 8.29996456969192e-06, "loss": 2.0529, "num_input_tokens_seen": 61232064, "step": 29315 }, { "epoch": 4.783098131984664, "grad_norm": 19.5, "learning_rate": 8.292660083834818e-06, "loss": 2.5228, "num_input_tokens_seen": 61241216, "step": 29320 }, { "epoch": 4.783913859205482, "grad_norm": 0.171875, "learning_rate": 8.2853581744595e-06, "loss": 1.9976, "num_input_tokens_seen": 61250272, "step": 29325 }, { "epoch": 4.784729586426299, "grad_norm": 8.25, "learning_rate": 8.278058842691991e-06, "loss": 1.9719, "num_input_tokens_seen": 61261872, "step": 29330 }, { "epoch": 4.785545313647116, "grad_norm": 6.375, "learning_rate": 8.27076208965796e-06, "loss": 1.9284, "num_input_tokens_seen": 61272016, "step": 29335 }, { "epoch": 4.786361040867934, "grad_norm": 8.8125, "learning_rate": 8.263467916482637e-06, "loss": 2.9884, "num_input_tokens_seen": 61283808, "step": 29340 }, { "epoch": 4.787176768088751, "grad_norm": 11.625, "learning_rate": 8.256176324290885e-06, "loss": 2.9141, "num_input_tokens_seen": 61295744, "step": 29345 }, { "epoch": 4.7879924953095685, "grad_norm": 4.0625, "learning_rate": 8.248887314207168e-06, "loss": 2.5051, "num_input_tokens_seen": 61307520, "step": 29350 }, { "epoch": 4.788808222530386, "grad_norm": 7.53125, "learning_rate": 8.24160088735553e-06, "loss": 3.0049, "num_input_tokens_seen": 61318064, "step": 29355 }, { "epoch": 4.789623949751203, "grad_norm": 5.125, "learning_rate": 8.234317044859629e-06, "loss": 1.2895, "num_input_tokens_seen": 61329200, "step": 29360 }, { "epoch": 4.790439676972021, "grad_norm": 4.75, "learning_rate": 8.227035787842744e-06, "loss": 2.4549, "num_input_tokens_seen": 61338976, "step": 29365 }, { "epoch": 4.791255404192838, "grad_norm": 6.84375, "learning_rate": 8.219757117427721e-06, "loss": 2.316, "num_input_tokens_seen": 61350368, "step": 29370 }, { "epoch": 4.792071131413655, "grad_norm": 7.15625, "learning_rate": 8.212481034737014e-06, "loss": 3.7944, "num_input_tokens_seen": 61360192, "step": 29375 }, { "epoch": 4.792886858634473, "grad_norm": 4.8125, "learning_rate": 8.205207540892707e-06, "loss": 1.5103, "num_input_tokens_seen": 61370912, "step": 29380 }, { "epoch": 4.79370258585529, "grad_norm": 1.2890625, "learning_rate": 8.197936637016442e-06, "loss": 0.4701, "num_input_tokens_seen": 61381712, "step": 29385 }, { "epoch": 4.794518313076107, "grad_norm": 5.96875, "learning_rate": 8.190668324229508e-06, "loss": 2.2722, "num_input_tokens_seen": 61392752, "step": 29390 }, { "epoch": 4.7953340402969244, "grad_norm": 7.53125, "learning_rate": 8.183402603652749e-06, "loss": 2.5569, "num_input_tokens_seen": 61405296, "step": 29395 }, { "epoch": 4.796149767517742, "grad_norm": 8.0, "learning_rate": 8.176139476406635e-06, "loss": 2.856, "num_input_tokens_seen": 61414080, "step": 29400 }, { "epoch": 4.796149767517742, "eval_loss": 2.5530948638916016, "eval_runtime": 134.8735, "eval_samples_per_second": 20.204, "eval_steps_per_second": 10.106, "num_input_tokens_seen": 61414080, "step": 29400 }, { "epoch": 4.79696549473856, "grad_norm": 10.1875, "learning_rate": 8.16887894361125e-06, "loss": 2.3682, "num_input_tokens_seen": 61423616, "step": 29405 }, { "epoch": 4.797781221959377, "grad_norm": 6.71875, "learning_rate": 8.161621006386233e-06, "loss": 2.9715, "num_input_tokens_seen": 61434048, "step": 29410 }, { "epoch": 4.798596949180194, "grad_norm": 8.5625, "learning_rate": 8.154365665850869e-06, "loss": 1.4095, "num_input_tokens_seen": 61442848, "step": 29415 }, { "epoch": 4.799412676401012, "grad_norm": 13.4375, "learning_rate": 8.147112923124005e-06, "loss": 3.8017, "num_input_tokens_seen": 61453664, "step": 29420 }, { "epoch": 4.800228403621829, "grad_norm": 4.625, "learning_rate": 8.13986277932412e-06, "loss": 2.6763, "num_input_tokens_seen": 61464880, "step": 29425 }, { "epoch": 4.801044130842646, "grad_norm": 6.25, "learning_rate": 8.132615235569277e-06, "loss": 1.9304, "num_input_tokens_seen": 61474464, "step": 29430 }, { "epoch": 4.801859858063463, "grad_norm": 13.4375, "learning_rate": 8.125370292977124e-06, "loss": 3.1859, "num_input_tokens_seen": 61483520, "step": 29435 }, { "epoch": 4.802675585284281, "grad_norm": 4.8125, "learning_rate": 8.118127952664944e-06, "loss": 2.3823, "num_input_tokens_seen": 61494000, "step": 29440 }, { "epoch": 4.803491312505098, "grad_norm": 7.28125, "learning_rate": 8.110888215749574e-06, "loss": 2.3261, "num_input_tokens_seen": 61503728, "step": 29445 }, { "epoch": 4.8043070397259156, "grad_norm": 5.46875, "learning_rate": 8.10365108334749e-06, "loss": 3.4818, "num_input_tokens_seen": 61514912, "step": 29450 }, { "epoch": 4.805122766946733, "grad_norm": 3.0625, "learning_rate": 8.096416556574743e-06, "loss": 2.3805, "num_input_tokens_seen": 61526080, "step": 29455 }, { "epoch": 4.805938494167551, "grad_norm": 6.4375, "learning_rate": 8.08918463654698e-06, "loss": 2.6585, "num_input_tokens_seen": 61535920, "step": 29460 }, { "epoch": 4.806754221388368, "grad_norm": 6.78125, "learning_rate": 8.081955324379458e-06, "loss": 2.1325, "num_input_tokens_seen": 61547280, "step": 29465 }, { "epoch": 4.807569948609185, "grad_norm": 10.75, "learning_rate": 8.074728621187039e-06, "loss": 3.5367, "num_input_tokens_seen": 61559104, "step": 29470 }, { "epoch": 4.808385675830002, "grad_norm": 11.3125, "learning_rate": 8.067504528084158e-06, "loss": 3.5439, "num_input_tokens_seen": 61569760, "step": 29475 }, { "epoch": 4.80920140305082, "grad_norm": 8.1875, "learning_rate": 8.060283046184861e-06, "loss": 2.936, "num_input_tokens_seen": 61580144, "step": 29480 }, { "epoch": 4.810017130271637, "grad_norm": 9.875, "learning_rate": 8.053064176602806e-06, "loss": 2.8643, "num_input_tokens_seen": 61591488, "step": 29485 }, { "epoch": 4.810832857492454, "grad_norm": 2.625, "learning_rate": 8.045847920451216e-06, "loss": 2.1768, "num_input_tokens_seen": 61601280, "step": 29490 }, { "epoch": 4.8116485847132715, "grad_norm": 3.40625, "learning_rate": 8.038634278842944e-06, "loss": 2.0438, "num_input_tokens_seen": 61612080, "step": 29495 }, { "epoch": 4.8124643119340895, "grad_norm": 7.25, "learning_rate": 8.031423252890408e-06, "loss": 2.5265, "num_input_tokens_seen": 61622896, "step": 29500 }, { "epoch": 4.813280039154907, "grad_norm": 6.1875, "learning_rate": 8.024214843705646e-06, "loss": 1.7685, "num_input_tokens_seen": 61633136, "step": 29505 }, { "epoch": 4.814095766375724, "grad_norm": 2.609375, "learning_rate": 8.017009052400295e-06, "loss": 1.5523, "num_input_tokens_seen": 61643440, "step": 29510 }, { "epoch": 4.814911493596542, "grad_norm": 8.8125, "learning_rate": 8.00980588008557e-06, "loss": 3.2456, "num_input_tokens_seen": 61654592, "step": 29515 }, { "epoch": 4.815727220817359, "grad_norm": 3.9375, "learning_rate": 8.002605327872282e-06, "loss": 2.5328, "num_input_tokens_seen": 61664112, "step": 29520 }, { "epoch": 4.816542948038176, "grad_norm": 14.3125, "learning_rate": 7.995407396870862e-06, "loss": 3.0823, "num_input_tokens_seen": 61674432, "step": 29525 }, { "epoch": 4.817358675258993, "grad_norm": 18.0, "learning_rate": 7.988212088191307e-06, "loss": 2.518, "num_input_tokens_seen": 61684576, "step": 29530 }, { "epoch": 4.81817440247981, "grad_norm": 9.125, "learning_rate": 7.98101940294324e-06, "loss": 2.5328, "num_input_tokens_seen": 61696240, "step": 29535 }, { "epoch": 4.818990129700628, "grad_norm": 7.78125, "learning_rate": 7.973829342235847e-06, "loss": 2.5787, "num_input_tokens_seen": 61706752, "step": 29540 }, { "epoch": 4.8198058569214455, "grad_norm": 12.125, "learning_rate": 7.966641907177936e-06, "loss": 3.3015, "num_input_tokens_seen": 61716576, "step": 29545 }, { "epoch": 4.820621584142263, "grad_norm": 14.25, "learning_rate": 7.959457098877901e-06, "loss": 2.4609, "num_input_tokens_seen": 61726368, "step": 29550 }, { "epoch": 4.821437311363081, "grad_norm": 8.875, "learning_rate": 7.952274918443719e-06, "loss": 3.0672, "num_input_tokens_seen": 61736704, "step": 29555 }, { "epoch": 4.822253038583898, "grad_norm": 9.875, "learning_rate": 7.945095366982983e-06, "loss": 2.2774, "num_input_tokens_seen": 61747296, "step": 29560 }, { "epoch": 4.823068765804715, "grad_norm": 6.84375, "learning_rate": 7.937918445602871e-06, "loss": 2.2685, "num_input_tokens_seen": 61757616, "step": 29565 }, { "epoch": 4.823884493025532, "grad_norm": 6.21875, "learning_rate": 7.930744155410145e-06, "loss": 2.6905, "num_input_tokens_seen": 61767120, "step": 29570 }, { "epoch": 4.824700220246349, "grad_norm": 12.25, "learning_rate": 7.923572497511181e-06, "loss": 2.5817, "num_input_tokens_seen": 61777584, "step": 29575 }, { "epoch": 4.825515947467167, "grad_norm": 3.203125, "learning_rate": 7.916403473011927e-06, "loss": 1.6481, "num_input_tokens_seen": 61787792, "step": 29580 }, { "epoch": 4.826331674687984, "grad_norm": 7.25, "learning_rate": 7.909237083017953e-06, "loss": 1.7452, "num_input_tokens_seen": 61797760, "step": 29585 }, { "epoch": 4.827147401908801, "grad_norm": 12.0, "learning_rate": 7.902073328634389e-06, "loss": 5.3937, "num_input_tokens_seen": 61808720, "step": 29590 }, { "epoch": 4.8279631291296194, "grad_norm": 9.5625, "learning_rate": 7.894912210965987e-06, "loss": 2.2002, "num_input_tokens_seen": 61818624, "step": 29595 }, { "epoch": 4.828778856350437, "grad_norm": 5.71875, "learning_rate": 7.887753731117075e-06, "loss": 3.9432, "num_input_tokens_seen": 61829776, "step": 29600 }, { "epoch": 4.828778856350437, "eval_loss": 2.546984910964966, "eval_runtime": 135.0121, "eval_samples_per_second": 20.183, "eval_steps_per_second": 10.095, "num_input_tokens_seen": 61829776, "step": 29600 }, { "epoch": 4.829594583571254, "grad_norm": 15.125, "learning_rate": 7.880597890191587e-06, "loss": 2.3684, "num_input_tokens_seen": 61839296, "step": 29605 }, { "epoch": 4.830410310792071, "grad_norm": 5.46875, "learning_rate": 7.873444689293036e-06, "loss": 2.4135, "num_input_tokens_seen": 61850144, "step": 29610 }, { "epoch": 4.831226038012889, "grad_norm": 5.5, "learning_rate": 7.866294129524548e-06, "loss": 3.3656, "num_input_tokens_seen": 61859952, "step": 29615 }, { "epoch": 4.832041765233706, "grad_norm": 3.734375, "learning_rate": 7.859146211988811e-06, "loss": 2.0489, "num_input_tokens_seen": 61870560, "step": 29620 }, { "epoch": 4.832857492454523, "grad_norm": 5.96875, "learning_rate": 7.852000937788134e-06, "loss": 2.7118, "num_input_tokens_seen": 61882064, "step": 29625 }, { "epoch": 4.83367321967534, "grad_norm": 6.21875, "learning_rate": 7.844858308024416e-06, "loss": 2.4155, "num_input_tokens_seen": 61892912, "step": 29630 }, { "epoch": 4.834488946896158, "grad_norm": 6.96875, "learning_rate": 7.837718323799122e-06, "loss": 2.2361, "num_input_tokens_seen": 61902944, "step": 29635 }, { "epoch": 4.835304674116975, "grad_norm": 6.8125, "learning_rate": 7.83058098621334e-06, "loss": 2.5438, "num_input_tokens_seen": 61912416, "step": 29640 }, { "epoch": 4.8361204013377925, "grad_norm": 9.8125, "learning_rate": 7.823446296367739e-06, "loss": 2.3656, "num_input_tokens_seen": 61923968, "step": 29645 }, { "epoch": 4.83693612855861, "grad_norm": 5.21875, "learning_rate": 7.81631425536257e-06, "loss": 1.8065, "num_input_tokens_seen": 61935376, "step": 29650 }, { "epoch": 4.837751855779428, "grad_norm": 2.265625, "learning_rate": 7.809184864297689e-06, "loss": 3.0252, "num_input_tokens_seen": 61946224, "step": 29655 }, { "epoch": 4.838567583000245, "grad_norm": 4.78125, "learning_rate": 7.802058124272532e-06, "loss": 1.5386, "num_input_tokens_seen": 61955136, "step": 29660 }, { "epoch": 4.839383310221062, "grad_norm": 6.09375, "learning_rate": 7.79493403638614e-06, "loss": 1.2349, "num_input_tokens_seen": 61966608, "step": 29665 }, { "epoch": 4.840199037441879, "grad_norm": 5.9375, "learning_rate": 7.787812601737132e-06, "loss": 2.4944, "num_input_tokens_seen": 61977280, "step": 29670 }, { "epoch": 4.841014764662697, "grad_norm": 9.25, "learning_rate": 7.780693821423715e-06, "loss": 2.0226, "num_input_tokens_seen": 61988224, "step": 29675 }, { "epoch": 4.841830491883514, "grad_norm": 10.0625, "learning_rate": 7.773577696543705e-06, "loss": 3.0113, "num_input_tokens_seen": 62000544, "step": 29680 }, { "epoch": 4.842646219104331, "grad_norm": 7.6875, "learning_rate": 7.7664642281945e-06, "loss": 2.6332, "num_input_tokens_seen": 62011632, "step": 29685 }, { "epoch": 4.8434619463251485, "grad_norm": 4.40625, "learning_rate": 7.759353417473072e-06, "loss": 1.5736, "num_input_tokens_seen": 62022848, "step": 29690 }, { "epoch": 4.8442776735459665, "grad_norm": 8.0, "learning_rate": 7.752245265476016e-06, "loss": 2.2513, "num_input_tokens_seen": 62034448, "step": 29695 }, { "epoch": 4.845093400766784, "grad_norm": 5.71875, "learning_rate": 7.745139773299481e-06, "loss": 1.4916, "num_input_tokens_seen": 62045136, "step": 29700 }, { "epoch": 4.845909127987601, "grad_norm": 4.21875, "learning_rate": 7.738036942039232e-06, "loss": 2.4376, "num_input_tokens_seen": 62055600, "step": 29705 }, { "epoch": 4.846724855208418, "grad_norm": 2.234375, "learning_rate": 7.73093677279062e-06, "loss": 3.9861, "num_input_tokens_seen": 62065824, "step": 29710 }, { "epoch": 4.847540582429236, "grad_norm": 6.8125, "learning_rate": 7.72383926664857e-06, "loss": 1.534, "num_input_tokens_seen": 62076368, "step": 29715 }, { "epoch": 4.848356309650053, "grad_norm": 5.5, "learning_rate": 7.716744424707606e-06, "loss": 1.4386, "num_input_tokens_seen": 62086064, "step": 29720 }, { "epoch": 4.84917203687087, "grad_norm": 10.3125, "learning_rate": 7.709652248061858e-06, "loss": 2.4453, "num_input_tokens_seen": 62095968, "step": 29725 }, { "epoch": 4.849987764091688, "grad_norm": 8.125, "learning_rate": 7.702562737805017e-06, "loss": 1.915, "num_input_tokens_seen": 62107440, "step": 29730 }, { "epoch": 4.850803491312505, "grad_norm": 1.9765625, "learning_rate": 7.695475895030365e-06, "loss": 2.3427, "num_input_tokens_seen": 62116608, "step": 29735 }, { "epoch": 4.8516192185333225, "grad_norm": 1.453125, "learning_rate": 7.6883917208308e-06, "loss": 1.6399, "num_input_tokens_seen": 62126384, "step": 29740 }, { "epoch": 4.85243494575414, "grad_norm": 0.173828125, "learning_rate": 7.681310216298778e-06, "loss": 1.3435, "num_input_tokens_seen": 62137760, "step": 29745 }, { "epoch": 4.853250672974957, "grad_norm": 6.4375, "learning_rate": 7.674231382526367e-06, "loss": 2.5647, "num_input_tokens_seen": 62148448, "step": 29750 }, { "epoch": 4.854066400195775, "grad_norm": 9.0625, "learning_rate": 7.667155220605198e-06, "loss": 2.3171, "num_input_tokens_seen": 62158992, "step": 29755 }, { "epoch": 4.854882127416592, "grad_norm": 11.125, "learning_rate": 7.660081731626515e-06, "loss": 2.8463, "num_input_tokens_seen": 62170272, "step": 29760 }, { "epoch": 4.855697854637409, "grad_norm": 6.1875, "learning_rate": 7.653010916681141e-06, "loss": 2.5586, "num_input_tokens_seen": 62180288, "step": 29765 }, { "epoch": 4.856513581858227, "grad_norm": 11.6875, "learning_rate": 7.645942776859472e-06, "loss": 2.8107, "num_input_tokens_seen": 62190176, "step": 29770 }, { "epoch": 4.857329309079044, "grad_norm": 12.625, "learning_rate": 7.63887731325152e-06, "loss": 2.0999, "num_input_tokens_seen": 62199616, "step": 29775 }, { "epoch": 4.858145036299861, "grad_norm": 9.25, "learning_rate": 7.63181452694685e-06, "loss": 2.6412, "num_input_tokens_seen": 62209456, "step": 29780 }, { "epoch": 4.858960763520678, "grad_norm": 5.65625, "learning_rate": 7.624754419034644e-06, "loss": 1.8011, "num_input_tokens_seen": 62219408, "step": 29785 }, { "epoch": 4.859776490741496, "grad_norm": 9.4375, "learning_rate": 7.6176969906036645e-06, "loss": 3.2227, "num_input_tokens_seen": 62229968, "step": 29790 }, { "epoch": 4.8605922179623136, "grad_norm": 8.25, "learning_rate": 7.610642242742242e-06, "loss": 3.4279, "num_input_tokens_seen": 62239616, "step": 29795 }, { "epoch": 4.861407945183131, "grad_norm": 3.375, "learning_rate": 7.603590176538322e-06, "loss": 2.0286, "num_input_tokens_seen": 62250704, "step": 29800 }, { "epoch": 4.861407945183131, "eval_loss": 2.546522855758667, "eval_runtime": 134.8504, "eval_samples_per_second": 20.208, "eval_steps_per_second": 10.107, "num_input_tokens_seen": 62250704, "step": 29800 }, { "epoch": 4.862223672403948, "grad_norm": 7.78125, "learning_rate": 7.596540793079404e-06, "loss": 3.2345, "num_input_tokens_seen": 62262048, "step": 29805 }, { "epoch": 4.863039399624766, "grad_norm": 10.8125, "learning_rate": 7.5894940934526125e-06, "loss": 2.3583, "num_input_tokens_seen": 62271216, "step": 29810 }, { "epoch": 4.863855126845583, "grad_norm": 8.125, "learning_rate": 7.582450078744621e-06, "loss": 2.3915, "num_input_tokens_seen": 62282272, "step": 29815 }, { "epoch": 4.8646708540664, "grad_norm": 7.15625, "learning_rate": 7.575408750041707e-06, "loss": 1.2879, "num_input_tokens_seen": 62293152, "step": 29820 }, { "epoch": 4.865486581287217, "grad_norm": 2.59375, "learning_rate": 7.568370108429732e-06, "loss": 2.0493, "num_input_tokens_seen": 62303536, "step": 29825 }, { "epoch": 4.866302308508035, "grad_norm": 3.640625, "learning_rate": 7.561334154994154e-06, "loss": 2.7441, "num_input_tokens_seen": 62315040, "step": 29830 }, { "epoch": 4.867118035728852, "grad_norm": 4.8125, "learning_rate": 7.55430089081999e-06, "loss": 2.2024, "num_input_tokens_seen": 62325584, "step": 29835 }, { "epoch": 4.8679337629496695, "grad_norm": 6.34375, "learning_rate": 7.547270316991864e-06, "loss": 3.1147, "num_input_tokens_seen": 62335856, "step": 29840 }, { "epoch": 4.868749490170487, "grad_norm": 6.65625, "learning_rate": 7.5402424345939884e-06, "loss": 2.425, "num_input_tokens_seen": 62346928, "step": 29845 }, { "epoch": 4.869565217391305, "grad_norm": 3.1875, "learning_rate": 7.533217244710133e-06, "loss": 2.006, "num_input_tokens_seen": 62356368, "step": 29850 }, { "epoch": 4.870380944612122, "grad_norm": 6.1875, "learning_rate": 7.52619474842369e-06, "loss": 1.2246, "num_input_tokens_seen": 62366816, "step": 29855 }, { "epoch": 4.871196671832939, "grad_norm": 12.3125, "learning_rate": 7.519174946817597e-06, "loss": 2.5904, "num_input_tokens_seen": 62377952, "step": 29860 }, { "epoch": 4.872012399053756, "grad_norm": 8.0625, "learning_rate": 7.512157840974407e-06, "loss": 2.5627, "num_input_tokens_seen": 62388736, "step": 29865 }, { "epoch": 4.872828126274574, "grad_norm": 8.8125, "learning_rate": 7.5051434319762496e-06, "loss": 2.8589, "num_input_tokens_seen": 62397968, "step": 29870 }, { "epoch": 4.873643853495391, "grad_norm": 9.4375, "learning_rate": 7.498131720904822e-06, "loss": 2.708, "num_input_tokens_seen": 62408064, "step": 29875 }, { "epoch": 4.874459580716208, "grad_norm": 9.9375, "learning_rate": 7.491122708841433e-06, "loss": 2.5858, "num_input_tokens_seen": 62418992, "step": 29880 }, { "epoch": 4.8752753079370255, "grad_norm": 0.049560546875, "learning_rate": 7.4841163968669524e-06, "loss": 2.2461, "num_input_tokens_seen": 62429392, "step": 29885 }, { "epoch": 4.8760910351578435, "grad_norm": 6.21875, "learning_rate": 7.4771127860618355e-06, "loss": 2.5393, "num_input_tokens_seen": 62438288, "step": 29890 }, { "epoch": 4.876906762378661, "grad_norm": 16.875, "learning_rate": 7.470111877506139e-06, "loss": 2.4143, "num_input_tokens_seen": 62447328, "step": 29895 }, { "epoch": 4.877722489599478, "grad_norm": 7.75, "learning_rate": 7.463113672279479e-06, "loss": 2.3594, "num_input_tokens_seen": 62457248, "step": 29900 }, { "epoch": 4.878538216820296, "grad_norm": 12.75, "learning_rate": 7.456118171461071e-06, "loss": 2.324, "num_input_tokens_seen": 62467056, "step": 29905 }, { "epoch": 4.879353944041113, "grad_norm": 10.0625, "learning_rate": 7.449125376129721e-06, "loss": 2.4611, "num_input_tokens_seen": 62477072, "step": 29910 }, { "epoch": 4.88016967126193, "grad_norm": 6.25, "learning_rate": 7.442135287363788e-06, "loss": 1.9883, "num_input_tokens_seen": 62487728, "step": 29915 }, { "epoch": 4.880985398482747, "grad_norm": 5.03125, "learning_rate": 7.435147906241247e-06, "loss": 2.6579, "num_input_tokens_seen": 62498880, "step": 29920 }, { "epoch": 4.881801125703564, "grad_norm": 4.125, "learning_rate": 7.428163233839624e-06, "loss": 3.1278, "num_input_tokens_seen": 62508048, "step": 29925 }, { "epoch": 4.882616852924382, "grad_norm": 8.9375, "learning_rate": 7.4211812712360525e-06, "loss": 2.5224, "num_input_tokens_seen": 62518464, "step": 29930 }, { "epoch": 4.883432580145199, "grad_norm": 8.3125, "learning_rate": 7.4142020195072464e-06, "loss": 2.5991, "num_input_tokens_seen": 62528144, "step": 29935 }, { "epoch": 4.884248307366017, "grad_norm": 8.8125, "learning_rate": 7.407225479729479e-06, "loss": 2.6162, "num_input_tokens_seen": 62538560, "step": 29940 }, { "epoch": 4.885064034586835, "grad_norm": 6.0625, "learning_rate": 7.400251652978632e-06, "loss": 1.4784, "num_input_tokens_seen": 62549648, "step": 29945 }, { "epoch": 4.885879761807652, "grad_norm": 8.625, "learning_rate": 7.393280540330147e-06, "loss": 3.1086, "num_input_tokens_seen": 62560480, "step": 29950 }, { "epoch": 4.886695489028469, "grad_norm": 4.96875, "learning_rate": 7.386312142859069e-06, "loss": 2.4506, "num_input_tokens_seen": 62569552, "step": 29955 }, { "epoch": 4.887511216249286, "grad_norm": 8.625, "learning_rate": 7.379346461640008e-06, "loss": 2.3781, "num_input_tokens_seen": 62580816, "step": 29960 }, { "epoch": 4.888326943470103, "grad_norm": 10.3125, "learning_rate": 7.372383497747149e-06, "loss": 2.6042, "num_input_tokens_seen": 62590096, "step": 29965 }, { "epoch": 4.889142670690921, "grad_norm": 9.6875, "learning_rate": 7.3654232522542775e-06, "loss": 2.6264, "num_input_tokens_seen": 62600256, "step": 29970 }, { "epoch": 4.889958397911738, "grad_norm": 3.875, "learning_rate": 7.358465726234756e-06, "loss": 2.0949, "num_input_tokens_seen": 62609792, "step": 29975 }, { "epoch": 4.890774125132555, "grad_norm": 3.984375, "learning_rate": 7.351510920761512e-06, "loss": 2.3601, "num_input_tokens_seen": 62620736, "step": 29980 }, { "epoch": 4.891589852353373, "grad_norm": 14.0625, "learning_rate": 7.344558836907067e-06, "loss": 2.8682, "num_input_tokens_seen": 62631904, "step": 29985 }, { "epoch": 4.8924055795741905, "grad_norm": 3.421875, "learning_rate": 7.3376094757435285e-06, "loss": 2.6288, "num_input_tokens_seen": 62642752, "step": 29990 }, { "epoch": 4.893221306795008, "grad_norm": 14.5, "learning_rate": 7.330662838342561e-06, "loss": 2.2831, "num_input_tokens_seen": 62653248, "step": 29995 }, { "epoch": 4.894037034015825, "grad_norm": 2.25, "learning_rate": 7.323718925775438e-06, "loss": 1.1607, "num_input_tokens_seen": 62662656, "step": 30000 }, { "epoch": 4.894037034015825, "eval_loss": 2.540228843688965, "eval_runtime": 135.0277, "eval_samples_per_second": 20.181, "eval_steps_per_second": 10.094, "num_input_tokens_seen": 62662656, "step": 30000 }, { "epoch": 4.894852761236643, "grad_norm": 6.53125, "learning_rate": 7.316777739112985e-06, "loss": 2.4085, "num_input_tokens_seen": 62671264, "step": 30005 }, { "epoch": 4.89566848845746, "grad_norm": 17.125, "learning_rate": 7.309839279425626e-06, "loss": 3.0376, "num_input_tokens_seen": 62682720, "step": 30010 }, { "epoch": 4.896484215678277, "grad_norm": 6.9375, "learning_rate": 7.302903547783366e-06, "loss": 1.9098, "num_input_tokens_seen": 62694144, "step": 30015 }, { "epoch": 4.897299942899094, "grad_norm": 3.9375, "learning_rate": 7.2959705452557644e-06, "loss": 2.577, "num_input_tokens_seen": 62706368, "step": 30020 }, { "epoch": 4.898115670119912, "grad_norm": 12.75, "learning_rate": 7.289040272911996e-06, "loss": 1.1663, "num_input_tokens_seen": 62717952, "step": 30025 }, { "epoch": 4.898931397340729, "grad_norm": 8.5625, "learning_rate": 7.282112731820789e-06, "loss": 2.4607, "num_input_tokens_seen": 62729552, "step": 30030 }, { "epoch": 4.8997471245615465, "grad_norm": 7.03125, "learning_rate": 7.275187923050447e-06, "loss": 1.9197, "num_input_tokens_seen": 62740064, "step": 30035 }, { "epoch": 4.900562851782364, "grad_norm": 7.0625, "learning_rate": 7.268265847668879e-06, "loss": 2.5057, "num_input_tokens_seen": 62751072, "step": 30040 }, { "epoch": 4.901378579003182, "grad_norm": 7.65625, "learning_rate": 7.261346506743538e-06, "loss": 4.4008, "num_input_tokens_seen": 62762304, "step": 30045 }, { "epoch": 4.902194306223999, "grad_norm": 5.625, "learning_rate": 7.254429901341486e-06, "loss": 2.3132, "num_input_tokens_seen": 62772432, "step": 30050 }, { "epoch": 4.903010033444816, "grad_norm": 9.3125, "learning_rate": 7.247516032529356e-06, "loss": 4.2587, "num_input_tokens_seen": 62782064, "step": 30055 }, { "epoch": 4.903825760665633, "grad_norm": 9.8125, "learning_rate": 7.240604901373338e-06, "loss": 2.4058, "num_input_tokens_seen": 62791744, "step": 30060 }, { "epoch": 4.904641487886451, "grad_norm": 5.9375, "learning_rate": 7.233696508939223e-06, "loss": 2.3155, "num_input_tokens_seen": 62802240, "step": 30065 }, { "epoch": 4.905457215107268, "grad_norm": 7.34375, "learning_rate": 7.226790856292376e-06, "loss": 2.2223, "num_input_tokens_seen": 62813120, "step": 30070 }, { "epoch": 4.906272942328085, "grad_norm": 3.515625, "learning_rate": 7.219887944497727e-06, "loss": 2.52, "num_input_tokens_seen": 62823792, "step": 30075 }, { "epoch": 4.907088669548903, "grad_norm": 5.625, "learning_rate": 7.2129877746198e-06, "loss": 3.1014, "num_input_tokens_seen": 62833440, "step": 30080 }, { "epoch": 4.9079043967697205, "grad_norm": 6.0, "learning_rate": 7.20609034772268e-06, "loss": 2.46, "num_input_tokens_seen": 62844544, "step": 30085 }, { "epoch": 4.908720123990538, "grad_norm": 7.46875, "learning_rate": 7.19919566487004e-06, "loss": 2.5033, "num_input_tokens_seen": 62854832, "step": 30090 }, { "epoch": 4.909535851211355, "grad_norm": 5.8125, "learning_rate": 7.192303727125132e-06, "loss": 1.0406, "num_input_tokens_seen": 62865248, "step": 30095 }, { "epoch": 4.910351578432172, "grad_norm": 5.75, "learning_rate": 7.185414535550777e-06, "loss": 1.8641, "num_input_tokens_seen": 62876384, "step": 30100 }, { "epoch": 4.91116730565299, "grad_norm": 3.828125, "learning_rate": 7.178528091209363e-06, "loss": 2.0712, "num_input_tokens_seen": 62888000, "step": 30105 }, { "epoch": 4.911983032873807, "grad_norm": 5.1875, "learning_rate": 7.171644395162888e-06, "loss": 1.693, "num_input_tokens_seen": 62899568, "step": 30110 }, { "epoch": 4.912798760094624, "grad_norm": 9.4375, "learning_rate": 7.164763448472881e-06, "loss": 1.6014, "num_input_tokens_seen": 62910944, "step": 30115 }, { "epoch": 4.913614487315442, "grad_norm": 2.140625, "learning_rate": 7.157885252200491e-06, "loss": 1.1534, "num_input_tokens_seen": 62921744, "step": 30120 }, { "epoch": 4.914430214536259, "grad_norm": 2.09375, "learning_rate": 7.151009807406403e-06, "loss": 2.7158, "num_input_tokens_seen": 62931952, "step": 30125 }, { "epoch": 4.915245941757076, "grad_norm": 5.78125, "learning_rate": 7.144137115150909e-06, "loss": 1.3309, "num_input_tokens_seen": 62943808, "step": 30130 }, { "epoch": 4.9160616689778935, "grad_norm": 10.625, "learning_rate": 7.1372671764938725e-06, "loss": 1.5028, "num_input_tokens_seen": 62955200, "step": 30135 }, { "epoch": 4.916877396198711, "grad_norm": 6.28125, "learning_rate": 7.130399992494705e-06, "loss": 1.5694, "num_input_tokens_seen": 62964256, "step": 30140 }, { "epoch": 4.917693123419529, "grad_norm": 3.59375, "learning_rate": 7.123535564212419e-06, "loss": 2.4667, "num_input_tokens_seen": 62975280, "step": 30145 }, { "epoch": 4.918508850640346, "grad_norm": 8.3125, "learning_rate": 7.116673892705611e-06, "loss": 1.4133, "num_input_tokens_seen": 62986288, "step": 30150 }, { "epoch": 4.919324577861163, "grad_norm": 4.28125, "learning_rate": 7.109814979032415e-06, "loss": 2.5597, "num_input_tokens_seen": 62997504, "step": 30155 }, { "epoch": 4.920140305081981, "grad_norm": 3.59375, "learning_rate": 7.102958824250577e-06, "loss": 2.0136, "num_input_tokens_seen": 63007344, "step": 30160 }, { "epoch": 4.920956032302798, "grad_norm": 8.625, "learning_rate": 7.096105429417393e-06, "loss": 2.6571, "num_input_tokens_seen": 63017920, "step": 30165 }, { "epoch": 4.921771759523615, "grad_norm": 5.40625, "learning_rate": 7.0892547955897506e-06, "loss": 1.8355, "num_input_tokens_seen": 63027088, "step": 30170 }, { "epoch": 4.922587486744432, "grad_norm": 4.71875, "learning_rate": 7.0824069238241e-06, "loss": 1.1425, "num_input_tokens_seen": 63037840, "step": 30175 }, { "epoch": 4.92340321396525, "grad_norm": 4.34375, "learning_rate": 7.075561815176462e-06, "loss": 2.4012, "num_input_tokens_seen": 63047552, "step": 30180 }, { "epoch": 4.9242189411860675, "grad_norm": 2.5, "learning_rate": 7.068719470702445e-06, "loss": 3.8797, "num_input_tokens_seen": 63057456, "step": 30185 }, { "epoch": 4.925034668406885, "grad_norm": 0.185546875, "learning_rate": 7.061879891457229e-06, "loss": 2.5669, "num_input_tokens_seen": 63068464, "step": 30190 }, { "epoch": 4.925850395627702, "grad_norm": 10.75, "learning_rate": 7.0550430784955515e-06, "loss": 2.2032, "num_input_tokens_seen": 63078112, "step": 30195 }, { "epoch": 4.92666612284852, "grad_norm": 5.6875, "learning_rate": 7.048209032871752e-06, "loss": 1.9432, "num_input_tokens_seen": 63088352, "step": 30200 }, { "epoch": 4.92666612284852, "eval_loss": 2.5601696968078613, "eval_runtime": 134.8372, "eval_samples_per_second": 20.21, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 63088352, "step": 30200 }, { "epoch": 4.927481850069337, "grad_norm": 6.34375, "learning_rate": 7.0413777556397055e-06, "loss": 1.7634, "num_input_tokens_seen": 63098240, "step": 30205 }, { "epoch": 4.928297577290154, "grad_norm": 5.84375, "learning_rate": 7.0345492478528925e-06, "loss": 3.1316, "num_input_tokens_seen": 63108928, "step": 30210 }, { "epoch": 4.929113304510971, "grad_norm": 7.71875, "learning_rate": 7.02772351056436e-06, "loss": 2.1743, "num_input_tokens_seen": 63119520, "step": 30215 }, { "epoch": 4.929929031731789, "grad_norm": 4.5625, "learning_rate": 7.020900544826709e-06, "loss": 1.6402, "num_input_tokens_seen": 63130384, "step": 30220 }, { "epoch": 4.930744758952606, "grad_norm": 8.0, "learning_rate": 7.014080351692134e-06, "loss": 3.2698, "num_input_tokens_seen": 63141024, "step": 30225 }, { "epoch": 4.9315604861734235, "grad_norm": 9.125, "learning_rate": 7.0072629322124024e-06, "loss": 2.3034, "num_input_tokens_seen": 63151424, "step": 30230 }, { "epoch": 4.932376213394241, "grad_norm": 8.625, "learning_rate": 7.000448287438827e-06, "loss": 2.7584, "num_input_tokens_seen": 63161952, "step": 30235 }, { "epoch": 4.933191940615059, "grad_norm": 5.03125, "learning_rate": 6.993636418422331e-06, "loss": 2.8559, "num_input_tokens_seen": 63172496, "step": 30240 }, { "epoch": 4.934007667835876, "grad_norm": 1.609375, "learning_rate": 6.986827326213383e-06, "loss": 1.6351, "num_input_tokens_seen": 63181344, "step": 30245 }, { "epoch": 4.934823395056693, "grad_norm": 8.4375, "learning_rate": 6.9800210118620205e-06, "loss": 1.6842, "num_input_tokens_seen": 63190512, "step": 30250 }, { "epoch": 4.935639122277511, "grad_norm": 7.34375, "learning_rate": 6.973217476417876e-06, "loss": 2.6684, "num_input_tokens_seen": 63201712, "step": 30255 }, { "epoch": 4.936454849498328, "grad_norm": 6.5, "learning_rate": 6.96641672093013e-06, "loss": 3.6638, "num_input_tokens_seen": 63212976, "step": 30260 }, { "epoch": 4.937270576719145, "grad_norm": 0.71484375, "learning_rate": 6.95961874644755e-06, "loss": 2.0648, "num_input_tokens_seen": 63224096, "step": 30265 }, { "epoch": 4.938086303939962, "grad_norm": 5.9375, "learning_rate": 6.952823554018476e-06, "loss": 1.7147, "num_input_tokens_seen": 63234976, "step": 30270 }, { "epoch": 4.938902031160779, "grad_norm": 6.8125, "learning_rate": 6.946031144690798e-06, "loss": 2.462, "num_input_tokens_seen": 63244480, "step": 30275 }, { "epoch": 4.939717758381597, "grad_norm": 4.875, "learning_rate": 6.939241519512005e-06, "loss": 1.6392, "num_input_tokens_seen": 63255152, "step": 30280 }, { "epoch": 4.940533485602415, "grad_norm": 4.0625, "learning_rate": 6.932454679529129e-06, "loss": 2.6507, "num_input_tokens_seen": 63266544, "step": 30285 }, { "epoch": 4.941349212823232, "grad_norm": 4.6875, "learning_rate": 6.925670625788791e-06, "loss": 2.1968, "num_input_tokens_seen": 63277792, "step": 30290 }, { "epoch": 4.94216494004405, "grad_norm": 8.125, "learning_rate": 6.918889359337186e-06, "loss": 3.3984, "num_input_tokens_seen": 63288912, "step": 30295 }, { "epoch": 4.942980667264867, "grad_norm": 5.375, "learning_rate": 6.912110881220058e-06, "loss": 2.0162, "num_input_tokens_seen": 63298928, "step": 30300 }, { "epoch": 4.943796394485684, "grad_norm": 13.3125, "learning_rate": 6.905335192482735e-06, "loss": 4.4795, "num_input_tokens_seen": 63309328, "step": 30305 }, { "epoch": 4.944612121706501, "grad_norm": 5.59375, "learning_rate": 6.8985622941701275e-06, "loss": 2.1001, "num_input_tokens_seen": 63320864, "step": 30310 }, { "epoch": 4.945427848927318, "grad_norm": 4.5, "learning_rate": 6.89179218732669e-06, "loss": 2.0733, "num_input_tokens_seen": 63330432, "step": 30315 }, { "epoch": 4.946243576148136, "grad_norm": 2.4375, "learning_rate": 6.8850248729964595e-06, "loss": 1.5302, "num_input_tokens_seen": 63340848, "step": 30320 }, { "epoch": 4.947059303368953, "grad_norm": 7.8125, "learning_rate": 6.8782603522230314e-06, "loss": 3.5655, "num_input_tokens_seen": 63350048, "step": 30325 }, { "epoch": 4.9478750305897705, "grad_norm": 8.9375, "learning_rate": 6.871498626049591e-06, "loss": 3.1055, "num_input_tokens_seen": 63360608, "step": 30330 }, { "epoch": 4.9486907578105885, "grad_norm": 6.03125, "learning_rate": 6.8647396955188875e-06, "loss": 1.945, "num_input_tokens_seen": 63371632, "step": 30335 }, { "epoch": 4.949506485031406, "grad_norm": 7.96875, "learning_rate": 6.857983561673218e-06, "loss": 1.5745, "num_input_tokens_seen": 63380816, "step": 30340 }, { "epoch": 4.950322212252223, "grad_norm": 6.25, "learning_rate": 6.851230225554467e-06, "loss": 1.7087, "num_input_tokens_seen": 63390720, "step": 30345 }, { "epoch": 4.95113793947304, "grad_norm": 3.359375, "learning_rate": 6.8444796882040946e-06, "loss": 1.7388, "num_input_tokens_seen": 63400592, "step": 30350 }, { "epoch": 4.951953666693858, "grad_norm": 1.2578125, "learning_rate": 6.837731950663106e-06, "loss": 2.0631, "num_input_tokens_seen": 63409728, "step": 30355 }, { "epoch": 4.952769393914675, "grad_norm": 6.125, "learning_rate": 6.830987013972098e-06, "loss": 1.7889, "num_input_tokens_seen": 63420544, "step": 30360 }, { "epoch": 4.953585121135492, "grad_norm": 7.90625, "learning_rate": 6.82424487917121e-06, "loss": 1.7462, "num_input_tokens_seen": 63429984, "step": 30365 }, { "epoch": 4.954400848356309, "grad_norm": 6.625, "learning_rate": 6.8175055473001735e-06, "loss": 1.6469, "num_input_tokens_seen": 63440512, "step": 30370 }, { "epoch": 4.955216575577127, "grad_norm": 7.0, "learning_rate": 6.8107690193982855e-06, "loss": 1.9943, "num_input_tokens_seen": 63451424, "step": 30375 }, { "epoch": 4.9560323027979445, "grad_norm": 6.90625, "learning_rate": 6.804035296504385e-06, "loss": 2.5724, "num_input_tokens_seen": 63462256, "step": 30380 }, { "epoch": 4.956848030018762, "grad_norm": 1.25, "learning_rate": 6.797304379656916e-06, "loss": 1.2344, "num_input_tokens_seen": 63473568, "step": 30385 }, { "epoch": 4.957663757239579, "grad_norm": 6.71875, "learning_rate": 6.790576269893861e-06, "loss": 2.0058, "num_input_tokens_seen": 63485152, "step": 30390 }, { "epoch": 4.958479484460397, "grad_norm": 11.5625, "learning_rate": 6.783850968252772e-06, "loss": 2.8123, "num_input_tokens_seen": 63495200, "step": 30395 }, { "epoch": 4.959295211681214, "grad_norm": 9.4375, "learning_rate": 6.777128475770789e-06, "loss": 2.651, "num_input_tokens_seen": 63504960, "step": 30400 }, { "epoch": 4.959295211681214, "eval_loss": 2.5388436317443848, "eval_runtime": 135.0431, "eval_samples_per_second": 20.179, "eval_steps_per_second": 10.093, "num_input_tokens_seen": 63504960, "step": 30400 }, { "epoch": 4.960110938902031, "grad_norm": 10.4375, "learning_rate": 6.77040879348459e-06, "loss": 2.8872, "num_input_tokens_seen": 63516224, "step": 30405 }, { "epoch": 4.960926666122848, "grad_norm": 6.0, "learning_rate": 6.763691922430443e-06, "loss": 2.5476, "num_input_tokens_seen": 63527232, "step": 30410 }, { "epoch": 4.961742393343666, "grad_norm": 10.625, "learning_rate": 6.756977863644178e-06, "loss": 3.3086, "num_input_tokens_seen": 63537936, "step": 30415 }, { "epoch": 4.962558120564483, "grad_norm": 9.8125, "learning_rate": 6.7502666181611804e-06, "loss": 2.9306, "num_input_tokens_seen": 63547776, "step": 30420 }, { "epoch": 4.9633738477853, "grad_norm": 10.8125, "learning_rate": 6.743558187016405e-06, "loss": 3.1777, "num_input_tokens_seen": 63558144, "step": 30425 }, { "epoch": 4.964189575006118, "grad_norm": 7.375, "learning_rate": 6.7368525712443925e-06, "loss": 2.2969, "num_input_tokens_seen": 63568336, "step": 30430 }, { "epoch": 4.965005302226936, "grad_norm": 7.09375, "learning_rate": 6.7301497718792155e-06, "loss": 2.2702, "num_input_tokens_seen": 63578944, "step": 30435 }, { "epoch": 4.965821029447753, "grad_norm": 9.25, "learning_rate": 6.723449789954544e-06, "loss": 3.1901, "num_input_tokens_seen": 63589280, "step": 30440 }, { "epoch": 4.96663675666857, "grad_norm": 8.9375, "learning_rate": 6.716752626503586e-06, "loss": 3.409, "num_input_tokens_seen": 63598912, "step": 30445 }, { "epoch": 4.967452483889387, "grad_norm": 6.0, "learning_rate": 6.710058282559131e-06, "loss": 3.0741, "num_input_tokens_seen": 63611200, "step": 30450 }, { "epoch": 4.968268211110205, "grad_norm": 6.6875, "learning_rate": 6.703366759153545e-06, "loss": 2.9854, "num_input_tokens_seen": 63621056, "step": 30455 }, { "epoch": 4.969083938331022, "grad_norm": 2.375, "learning_rate": 6.6966780573187335e-06, "loss": 1.4058, "num_input_tokens_seen": 63631312, "step": 30460 }, { "epoch": 4.969899665551839, "grad_norm": 9.6875, "learning_rate": 6.689992178086174e-06, "loss": 2.234, "num_input_tokens_seen": 63642624, "step": 30465 }, { "epoch": 4.970715392772657, "grad_norm": 1.859375, "learning_rate": 6.683309122486925e-06, "loss": 1.7637, "num_input_tokens_seen": 63652912, "step": 30470 }, { "epoch": 4.971531119993474, "grad_norm": 5.59375, "learning_rate": 6.676628891551584e-06, "loss": 2.5849, "num_input_tokens_seen": 63663584, "step": 30475 }, { "epoch": 4.9723468472142915, "grad_norm": 9.25, "learning_rate": 6.6699514863103385e-06, "loss": 3.2895, "num_input_tokens_seen": 63675504, "step": 30480 }, { "epoch": 4.973162574435109, "grad_norm": 6.65625, "learning_rate": 6.663276907792921e-06, "loss": 3.3322, "num_input_tokens_seen": 63685776, "step": 30485 }, { "epoch": 4.973978301655926, "grad_norm": 5.09375, "learning_rate": 6.656605157028634e-06, "loss": 1.9437, "num_input_tokens_seen": 63695376, "step": 30490 }, { "epoch": 4.974794028876744, "grad_norm": 3.96875, "learning_rate": 6.649936235046358e-06, "loss": 2.0423, "num_input_tokens_seen": 63705200, "step": 30495 }, { "epoch": 4.975609756097561, "grad_norm": 10.75, "learning_rate": 6.643270142874508e-06, "loss": 2.1188, "num_input_tokens_seen": 63715568, "step": 30500 }, { "epoch": 4.976425483318378, "grad_norm": 10.625, "learning_rate": 6.636606881541094e-06, "loss": 2.596, "num_input_tokens_seen": 63724816, "step": 30505 }, { "epoch": 4.977241210539196, "grad_norm": 11.4375, "learning_rate": 6.629946452073662e-06, "loss": 2.1754, "num_input_tokens_seen": 63735072, "step": 30510 }, { "epoch": 4.978056937760013, "grad_norm": 1.796875, "learning_rate": 6.6232888554993375e-06, "loss": 1.9222, "num_input_tokens_seen": 63744944, "step": 30515 }, { "epoch": 4.97887266498083, "grad_norm": 2.90625, "learning_rate": 6.616634092844817e-06, "loss": 1.8107, "num_input_tokens_seen": 63755520, "step": 30520 }, { "epoch": 4.9796883922016475, "grad_norm": 5.0, "learning_rate": 6.609982165136331e-06, "loss": 2.3244, "num_input_tokens_seen": 63765152, "step": 30525 }, { "epoch": 4.9805041194224655, "grad_norm": 7.46875, "learning_rate": 6.603333073399706e-06, "loss": 1.8395, "num_input_tokens_seen": 63775360, "step": 30530 }, { "epoch": 4.981319846643283, "grad_norm": 4.0, "learning_rate": 6.596686818660308e-06, "loss": 2.5253, "num_input_tokens_seen": 63786800, "step": 30535 }, { "epoch": 4.9821355738641, "grad_norm": 6.6875, "learning_rate": 6.590043401943066e-06, "loss": 3.167, "num_input_tokens_seen": 63797600, "step": 30540 }, { "epoch": 4.982951301084917, "grad_norm": 6.6875, "learning_rate": 6.583402824272494e-06, "loss": 1.858, "num_input_tokens_seen": 63808816, "step": 30545 }, { "epoch": 4.983767028305735, "grad_norm": 7.4375, "learning_rate": 6.576765086672634e-06, "loss": 1.8794, "num_input_tokens_seen": 63819040, "step": 30550 }, { "epoch": 4.984582755526552, "grad_norm": 7.6875, "learning_rate": 6.57013019016712e-06, "loss": 2.7738, "num_input_tokens_seen": 63829824, "step": 30555 }, { "epoch": 4.985398482747369, "grad_norm": 5.125, "learning_rate": 6.563498135779142e-06, "loss": 2.3265, "num_input_tokens_seen": 63839456, "step": 30560 }, { "epoch": 4.986214209968186, "grad_norm": 7.1875, "learning_rate": 6.556868924531431e-06, "loss": 2.0422, "num_input_tokens_seen": 63849904, "step": 30565 }, { "epoch": 4.987029937189004, "grad_norm": 3.375, "learning_rate": 6.550242557446304e-06, "loss": 1.8995, "num_input_tokens_seen": 63861792, "step": 30570 }, { "epoch": 4.9878456644098215, "grad_norm": 3.90625, "learning_rate": 6.543619035545634e-06, "loss": 3.1391, "num_input_tokens_seen": 63872528, "step": 30575 }, { "epoch": 4.988661391630639, "grad_norm": 5.84375, "learning_rate": 6.53699835985084e-06, "loss": 2.8571, "num_input_tokens_seen": 63883104, "step": 30580 }, { "epoch": 4.989477118851456, "grad_norm": 8.0625, "learning_rate": 6.530380531382927e-06, "loss": 3.0143, "num_input_tokens_seen": 63893264, "step": 30585 }, { "epoch": 4.990292846072274, "grad_norm": 5.875, "learning_rate": 6.523765551162433e-06, "loss": 1.7848, "num_input_tokens_seen": 63904400, "step": 30590 }, { "epoch": 4.991108573293091, "grad_norm": 5.5625, "learning_rate": 6.517153420209476e-06, "loss": 2.6538, "num_input_tokens_seen": 63915328, "step": 30595 }, { "epoch": 4.991924300513908, "grad_norm": 7.15625, "learning_rate": 6.510544139543739e-06, "loss": 4.0587, "num_input_tokens_seen": 63926432, "step": 30600 }, { "epoch": 4.991924300513908, "eval_loss": 2.5485336780548096, "eval_runtime": 134.8054, "eval_samples_per_second": 20.214, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 63926432, "step": 30600 }, { "epoch": 4.992740027734725, "grad_norm": 4.46875, "learning_rate": 6.503937710184452e-06, "loss": 3.1299, "num_input_tokens_seen": 63936720, "step": 30605 }, { "epoch": 4.993555754955543, "grad_norm": 13.6875, "learning_rate": 6.4973341331503954e-06, "loss": 3.4473, "num_input_tokens_seen": 63946576, "step": 30610 }, { "epoch": 4.99437148217636, "grad_norm": 14.1875, "learning_rate": 6.490733409459942e-06, "loss": 3.0186, "num_input_tokens_seen": 63957312, "step": 30615 }, { "epoch": 4.995187209397177, "grad_norm": 3.6875, "learning_rate": 6.484135540130995e-06, "loss": 2.3585, "num_input_tokens_seen": 63967360, "step": 30620 }, { "epoch": 4.9960029366179945, "grad_norm": 6.0625, "learning_rate": 6.4775405261810364e-06, "loss": 1.3133, "num_input_tokens_seen": 63976832, "step": 30625 }, { "epoch": 4.996818663838813, "grad_norm": 6.46875, "learning_rate": 6.470948368627092e-06, "loss": 3.6898, "num_input_tokens_seen": 63988352, "step": 30630 }, { "epoch": 4.99763439105963, "grad_norm": 7.1875, "learning_rate": 6.464359068485756e-06, "loss": 2.9102, "num_input_tokens_seen": 63998512, "step": 30635 }, { "epoch": 4.998450118280447, "grad_norm": 1.71875, "learning_rate": 6.457772626773195e-06, "loss": 1.3832, "num_input_tokens_seen": 64007648, "step": 30640 }, { "epoch": 4.999265845501265, "grad_norm": 6.125, "learning_rate": 6.451189044505104e-06, "loss": 2.035, "num_input_tokens_seen": 64019360, "step": 30645 }, { "epoch": 5.0, "grad_norm": 9.1875, "learning_rate": 6.44460832269676e-06, "loss": 2.0651, "num_input_tokens_seen": 64029024, "step": 30650 }, { "epoch": 5.000815727220817, "grad_norm": 0.267578125, "learning_rate": 6.438030462363001e-06, "loss": 1.5691, "num_input_tokens_seen": 64040032, "step": 30655 }, { "epoch": 5.001631454441635, "grad_norm": 12.6875, "learning_rate": 6.431455464518205e-06, "loss": 2.0846, "num_input_tokens_seen": 64050192, "step": 30660 }, { "epoch": 5.002447181662452, "grad_norm": 11.5, "learning_rate": 6.424883330176326e-06, "loss": 2.6294, "num_input_tokens_seen": 64061088, "step": 30665 }, { "epoch": 5.003262908883269, "grad_norm": 17.25, "learning_rate": 6.418314060350864e-06, "loss": 3.112, "num_input_tokens_seen": 64072320, "step": 30670 }, { "epoch": 5.0040786361040865, "grad_norm": 4.21875, "learning_rate": 6.4117476560548895e-06, "loss": 1.3646, "num_input_tokens_seen": 64083360, "step": 30675 }, { "epoch": 5.004894363324905, "grad_norm": 4.0625, "learning_rate": 6.405184118301016e-06, "loss": 1.4828, "num_input_tokens_seen": 64093664, "step": 30680 }, { "epoch": 5.005710090545722, "grad_norm": 9.4375, "learning_rate": 6.398623448101434e-06, "loss": 1.49, "num_input_tokens_seen": 64104320, "step": 30685 }, { "epoch": 5.006525817766539, "grad_norm": 4.40625, "learning_rate": 6.392065646467871e-06, "loss": 3.3381, "num_input_tokens_seen": 64114320, "step": 30690 }, { "epoch": 5.007341544987356, "grad_norm": 7.96875, "learning_rate": 6.385510714411632e-06, "loss": 2.523, "num_input_tokens_seen": 64126544, "step": 30695 }, { "epoch": 5.008157272208174, "grad_norm": 11.75, "learning_rate": 6.378958652943559e-06, "loss": 2.8549, "num_input_tokens_seen": 64136672, "step": 30700 }, { "epoch": 5.008972999428991, "grad_norm": 6.59375, "learning_rate": 6.3724094630740776e-06, "loss": 1.9989, "num_input_tokens_seen": 64147600, "step": 30705 }, { "epoch": 5.009788726649808, "grad_norm": 8.3125, "learning_rate": 6.365863145813136e-06, "loss": 3.0857, "num_input_tokens_seen": 64157408, "step": 30710 }, { "epoch": 5.010604453870625, "grad_norm": 7.625, "learning_rate": 6.359319702170269e-06, "loss": 2.1562, "num_input_tokens_seen": 64166864, "step": 30715 }, { "epoch": 5.011420181091443, "grad_norm": 7.875, "learning_rate": 6.352779133154566e-06, "loss": 2.4016, "num_input_tokens_seen": 64176080, "step": 30720 }, { "epoch": 5.0122359083122605, "grad_norm": 4.15625, "learning_rate": 6.346241439774648e-06, "loss": 1.6434, "num_input_tokens_seen": 64187056, "step": 30725 }, { "epoch": 5.013051635533078, "grad_norm": 4.84375, "learning_rate": 6.339706623038716e-06, "loss": 2.8118, "num_input_tokens_seen": 64197456, "step": 30730 }, { "epoch": 5.013867362753895, "grad_norm": 14.875, "learning_rate": 6.333174683954532e-06, "loss": 2.6078, "num_input_tokens_seen": 64208176, "step": 30735 }, { "epoch": 5.014683089974713, "grad_norm": 8.9375, "learning_rate": 6.326645623529387e-06, "loss": 2.9301, "num_input_tokens_seen": 64218672, "step": 30740 }, { "epoch": 5.01549881719553, "grad_norm": 4.1875, "learning_rate": 6.320119442770156e-06, "loss": 1.8898, "num_input_tokens_seen": 64228672, "step": 30745 }, { "epoch": 5.016314544416347, "grad_norm": 9.1875, "learning_rate": 6.313596142683254e-06, "loss": 1.4753, "num_input_tokens_seen": 64239488, "step": 30750 }, { "epoch": 5.017130271637164, "grad_norm": 4.34375, "learning_rate": 6.307075724274647e-06, "loss": 3.4443, "num_input_tokens_seen": 64250384, "step": 30755 }, { "epoch": 5.017945998857982, "grad_norm": 2.171875, "learning_rate": 6.300558188549882e-06, "loss": 1.6053, "num_input_tokens_seen": 64261456, "step": 30760 }, { "epoch": 5.018761726078799, "grad_norm": 8.4375, "learning_rate": 6.29404353651403e-06, "loss": 2.849, "num_input_tokens_seen": 64273424, "step": 30765 }, { "epoch": 5.0195774532996165, "grad_norm": 9.0, "learning_rate": 6.287531769171737e-06, "loss": 1.2974, "num_input_tokens_seen": 64283088, "step": 30770 }, { "epoch": 5.020393180520434, "grad_norm": 8.1875, "learning_rate": 6.2810228875272045e-06, "loss": 3.3826, "num_input_tokens_seen": 64294176, "step": 30775 }, { "epoch": 5.021208907741252, "grad_norm": 10.3125, "learning_rate": 6.274516892584179e-06, "loss": 2.2277, "num_input_tokens_seen": 64303936, "step": 30780 }, { "epoch": 5.022024634962069, "grad_norm": 7.34375, "learning_rate": 6.268013785345969e-06, "loss": 4.192, "num_input_tokens_seen": 64314736, "step": 30785 }, { "epoch": 5.022840362182886, "grad_norm": 6.34375, "learning_rate": 6.26151356681543e-06, "loss": 2.7223, "num_input_tokens_seen": 64324976, "step": 30790 }, { "epoch": 5.023656089403703, "grad_norm": 6.0, "learning_rate": 6.255016237994981e-06, "loss": 1.7371, "num_input_tokens_seen": 64337424, "step": 30795 }, { "epoch": 5.024471816624521, "grad_norm": 3.28125, "learning_rate": 6.248521799886603e-06, "loss": 1.5048, "num_input_tokens_seen": 64346032, "step": 30800 }, { "epoch": 5.024471816624521, "eval_loss": 2.5376741886138916, "eval_runtime": 134.9772, "eval_samples_per_second": 20.189, "eval_steps_per_second": 10.098, "num_input_tokens_seen": 64346032, "step": 30800 }, { "epoch": 5.025287543845338, "grad_norm": 5.34375, "learning_rate": 6.242030253491798e-06, "loss": 2.0636, "num_input_tokens_seen": 64357072, "step": 30805 }, { "epoch": 5.026103271066155, "grad_norm": 5.3125, "learning_rate": 6.235541599811656e-06, "loss": 2.9576, "num_input_tokens_seen": 64366352, "step": 30810 }, { "epoch": 5.026918998286972, "grad_norm": 6.03125, "learning_rate": 6.229055839846814e-06, "loss": 1.9813, "num_input_tokens_seen": 64377504, "step": 30815 }, { "epoch": 5.02773472550779, "grad_norm": 9.5, "learning_rate": 6.222572974597455e-06, "loss": 1.6147, "num_input_tokens_seen": 64386512, "step": 30820 }, { "epoch": 5.028550452728608, "grad_norm": 1.515625, "learning_rate": 6.216093005063306e-06, "loss": 2.4642, "num_input_tokens_seen": 64396400, "step": 30825 }, { "epoch": 5.029366179949425, "grad_norm": 7.125, "learning_rate": 6.209615932243678e-06, "loss": 2.3089, "num_input_tokens_seen": 64406864, "step": 30830 }, { "epoch": 5.030181907170242, "grad_norm": 9.1875, "learning_rate": 6.203141757137399e-06, "loss": 3.3419, "num_input_tokens_seen": 64415088, "step": 30835 }, { "epoch": 5.03099763439106, "grad_norm": 4.03125, "learning_rate": 6.196670480742886e-06, "loss": 2.4497, "num_input_tokens_seen": 64425744, "step": 30840 }, { "epoch": 5.031813361611877, "grad_norm": 4.5, "learning_rate": 6.190202104058074e-06, "loss": 1.3324, "num_input_tokens_seen": 64437008, "step": 30845 }, { "epoch": 5.032629088832694, "grad_norm": 4.90625, "learning_rate": 6.183736628080475e-06, "loss": 4.1345, "num_input_tokens_seen": 64446976, "step": 30850 }, { "epoch": 5.033444816053512, "grad_norm": 2.75, "learning_rate": 6.177274053807155e-06, "loss": 2.1393, "num_input_tokens_seen": 64457040, "step": 30855 }, { "epoch": 5.034260543274329, "grad_norm": 4.15625, "learning_rate": 6.170814382234713e-06, "loss": 1.7993, "num_input_tokens_seen": 64467440, "step": 30860 }, { "epoch": 5.035076270495146, "grad_norm": 9.875, "learning_rate": 6.16435761435932e-06, "loss": 2.9245, "num_input_tokens_seen": 64478704, "step": 30865 }, { "epoch": 5.0358919977159635, "grad_norm": 11.3125, "learning_rate": 6.157903751176681e-06, "loss": 2.226, "num_input_tokens_seen": 64488992, "step": 30870 }, { "epoch": 5.0367077249367815, "grad_norm": 10.75, "learning_rate": 6.151452793682066e-06, "loss": 3.8606, "num_input_tokens_seen": 64500496, "step": 30875 }, { "epoch": 5.037523452157599, "grad_norm": 7.65625, "learning_rate": 6.145004742870305e-06, "loss": 2.6339, "num_input_tokens_seen": 64510416, "step": 30880 }, { "epoch": 5.038339179378416, "grad_norm": 5.6875, "learning_rate": 6.138559599735752e-06, "loss": 1.9447, "num_input_tokens_seen": 64521760, "step": 30885 }, { "epoch": 5.039154906599233, "grad_norm": 3.109375, "learning_rate": 6.132117365272344e-06, "loss": 2.1504, "num_input_tokens_seen": 64531664, "step": 30890 }, { "epoch": 5.039970633820051, "grad_norm": 7.375, "learning_rate": 6.125678040473545e-06, "loss": 2.2273, "num_input_tokens_seen": 64541952, "step": 30895 }, { "epoch": 5.040786361040868, "grad_norm": 7.34375, "learning_rate": 6.1192416263323755e-06, "loss": 2.7108, "num_input_tokens_seen": 64551488, "step": 30900 }, { "epoch": 5.041602088261685, "grad_norm": 8.875, "learning_rate": 6.112808123841424e-06, "loss": 2.8598, "num_input_tokens_seen": 64561616, "step": 30905 }, { "epoch": 5.042417815482502, "grad_norm": 1.78125, "learning_rate": 6.106377533992805e-06, "loss": 1.9716, "num_input_tokens_seen": 64571952, "step": 30910 }, { "epoch": 5.04323354270332, "grad_norm": 2.84375, "learning_rate": 6.099949857778204e-06, "loss": 3.6317, "num_input_tokens_seen": 64582112, "step": 30915 }, { "epoch": 5.0440492699241375, "grad_norm": 7.625, "learning_rate": 6.093525096188852e-06, "loss": 1.9227, "num_input_tokens_seen": 64593200, "step": 30920 }, { "epoch": 5.044864997144955, "grad_norm": 7.3125, "learning_rate": 6.087103250215518e-06, "loss": 2.8798, "num_input_tokens_seen": 64603936, "step": 30925 }, { "epoch": 5.045680724365772, "grad_norm": 5.0, "learning_rate": 6.080684320848537e-06, "loss": 2.1224, "num_input_tokens_seen": 64614096, "step": 30930 }, { "epoch": 5.04649645158659, "grad_norm": 3.34375, "learning_rate": 6.074268309077794e-06, "loss": 2.3411, "num_input_tokens_seen": 64624928, "step": 30935 }, { "epoch": 5.047312178807407, "grad_norm": 14.4375, "learning_rate": 6.067855215892709e-06, "loss": 3.0373, "num_input_tokens_seen": 64635184, "step": 30940 }, { "epoch": 5.048127906028224, "grad_norm": 7.46875, "learning_rate": 6.061445042282271e-06, "loss": 1.0678, "num_input_tokens_seen": 64647344, "step": 30945 }, { "epoch": 5.048943633249041, "grad_norm": 5.28125, "learning_rate": 6.055037789234999e-06, "loss": 2.0144, "num_input_tokens_seen": 64657680, "step": 30950 }, { "epoch": 5.049759360469859, "grad_norm": 7.28125, "learning_rate": 6.048633457738975e-06, "loss": 1.5497, "num_input_tokens_seen": 64667856, "step": 30955 }, { "epoch": 5.050575087690676, "grad_norm": 4.6875, "learning_rate": 6.042232048781837e-06, "loss": 1.5605, "num_input_tokens_seen": 64677952, "step": 30960 }, { "epoch": 5.051390814911493, "grad_norm": 8.125, "learning_rate": 6.035833563350757e-06, "loss": 2.2136, "num_input_tokens_seen": 64690304, "step": 30965 }, { "epoch": 5.052206542132311, "grad_norm": 7.0625, "learning_rate": 6.0294380024324525e-06, "loss": 1.2953, "num_input_tokens_seen": 64700528, "step": 30970 }, { "epoch": 5.053022269353129, "grad_norm": 4.28125, "learning_rate": 6.023045367013213e-06, "loss": 3.1442, "num_input_tokens_seen": 64712144, "step": 30975 }, { "epoch": 5.053837996573946, "grad_norm": 4.3125, "learning_rate": 6.016655658078851e-06, "loss": 2.5257, "num_input_tokens_seen": 64722720, "step": 30980 }, { "epoch": 5.054653723794763, "grad_norm": 4.875, "learning_rate": 6.010268876614753e-06, "loss": 1.323, "num_input_tokens_seen": 64732896, "step": 30985 }, { "epoch": 5.05546945101558, "grad_norm": 9.6875, "learning_rate": 6.0038850236058266e-06, "loss": 1.5968, "num_input_tokens_seen": 64743184, "step": 30990 }, { "epoch": 5.056285178236398, "grad_norm": 3.140625, "learning_rate": 5.997504100036549e-06, "loss": 2.158, "num_input_tokens_seen": 64754608, "step": 30995 }, { "epoch": 5.057100905457215, "grad_norm": 10.3125, "learning_rate": 5.991126106890949e-06, "loss": 2.8357, "num_input_tokens_seen": 64764608, "step": 31000 }, { "epoch": 5.057100905457215, "eval_loss": 2.5535449981689453, "eval_runtime": 134.8211, "eval_samples_per_second": 20.212, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 64764608, "step": 31000 }, { "epoch": 5.057916632678032, "grad_norm": 3.34375, "learning_rate": 5.984751045152576e-06, "loss": 1.6575, "num_input_tokens_seen": 64774208, "step": 31005 }, { "epoch": 5.058732359898849, "grad_norm": 2.953125, "learning_rate": 5.978378915804553e-06, "loss": 2.3217, "num_input_tokens_seen": 64785216, "step": 31010 }, { "epoch": 5.059548087119667, "grad_norm": 8.5625, "learning_rate": 5.972009719829547e-06, "loss": 2.5175, "num_input_tokens_seen": 64796016, "step": 31015 }, { "epoch": 5.0603638143404845, "grad_norm": 5.71875, "learning_rate": 5.965643458209755e-06, "loss": 2.5677, "num_input_tokens_seen": 64806640, "step": 31020 }, { "epoch": 5.061179541561302, "grad_norm": 6.5625, "learning_rate": 5.95928013192695e-06, "loss": 2.6477, "num_input_tokens_seen": 64815616, "step": 31025 }, { "epoch": 5.06199526878212, "grad_norm": 7.40625, "learning_rate": 5.952919741962423e-06, "loss": 2.0405, "num_input_tokens_seen": 64825360, "step": 31030 }, { "epoch": 5.062810996002937, "grad_norm": 9.125, "learning_rate": 5.946562289297042e-06, "loss": 2.5021, "num_input_tokens_seen": 64836768, "step": 31035 }, { "epoch": 5.063626723223754, "grad_norm": 5.5625, "learning_rate": 5.9402077749111855e-06, "loss": 2.6514, "num_input_tokens_seen": 64845504, "step": 31040 }, { "epoch": 5.064442450444571, "grad_norm": 12.875, "learning_rate": 5.933856199784821e-06, "loss": 4.383, "num_input_tokens_seen": 64856016, "step": 31045 }, { "epoch": 5.065258177665389, "grad_norm": 11.3125, "learning_rate": 5.927507564897419e-06, "loss": 2.7777, "num_input_tokens_seen": 64866528, "step": 31050 }, { "epoch": 5.066073904886206, "grad_norm": 9.375, "learning_rate": 5.9211618712280395e-06, "loss": 2.2448, "num_input_tokens_seen": 64877344, "step": 31055 }, { "epoch": 5.066889632107023, "grad_norm": 6.03125, "learning_rate": 5.914819119755255e-06, "loss": 2.4817, "num_input_tokens_seen": 64887504, "step": 31060 }, { "epoch": 5.0677053593278405, "grad_norm": 4.9375, "learning_rate": 5.908479311457205e-06, "loss": 1.874, "num_input_tokens_seen": 64898288, "step": 31065 }, { "epoch": 5.0685210865486585, "grad_norm": 11.25, "learning_rate": 5.902142447311559e-06, "loss": 2.5983, "num_input_tokens_seen": 64908656, "step": 31070 }, { "epoch": 5.069336813769476, "grad_norm": 9.5, "learning_rate": 5.895808528295546e-06, "loss": 2.952, "num_input_tokens_seen": 64919536, "step": 31075 }, { "epoch": 5.070152540990293, "grad_norm": 7.8125, "learning_rate": 5.889477555385941e-06, "loss": 3.6249, "num_input_tokens_seen": 64930032, "step": 31080 }, { "epoch": 5.07096826821111, "grad_norm": 3.484375, "learning_rate": 5.883149529559051e-06, "loss": 2.1797, "num_input_tokens_seen": 64940064, "step": 31085 }, { "epoch": 5.071783995431928, "grad_norm": 7.6875, "learning_rate": 5.876824451790738e-06, "loss": 1.6177, "num_input_tokens_seen": 64950464, "step": 31090 }, { "epoch": 5.072599722652745, "grad_norm": 0.8671875, "learning_rate": 5.87050232305642e-06, "loss": 1.5847, "num_input_tokens_seen": 64961280, "step": 31095 }, { "epoch": 5.073415449873562, "grad_norm": 2.09375, "learning_rate": 5.864183144331034e-06, "loss": 2.5036, "num_input_tokens_seen": 64972640, "step": 31100 }, { "epoch": 5.074231177094379, "grad_norm": 9.0625, "learning_rate": 5.857866916589089e-06, "loss": 2.5875, "num_input_tokens_seen": 64983888, "step": 31105 }, { "epoch": 5.075046904315197, "grad_norm": 6.09375, "learning_rate": 5.8515536408046216e-06, "loss": 2.4128, "num_input_tokens_seen": 64994560, "step": 31110 }, { "epoch": 5.0758626315360145, "grad_norm": 8.125, "learning_rate": 5.845243317951208e-06, "loss": 1.7371, "num_input_tokens_seen": 65004768, "step": 31115 }, { "epoch": 5.076678358756832, "grad_norm": 7.46875, "learning_rate": 5.838935949001997e-06, "loss": 2.2875, "num_input_tokens_seen": 65014816, "step": 31120 }, { "epoch": 5.077494085977649, "grad_norm": 6.125, "learning_rate": 5.8326315349296476e-06, "loss": 2.5717, "num_input_tokens_seen": 65024832, "step": 31125 }, { "epoch": 5.078309813198467, "grad_norm": 7.96875, "learning_rate": 5.826330076706396e-06, "loss": 1.6217, "num_input_tokens_seen": 65034272, "step": 31130 }, { "epoch": 5.079125540419284, "grad_norm": 5.90625, "learning_rate": 5.820031575303988e-06, "loss": 2.0515, "num_input_tokens_seen": 65044480, "step": 31135 }, { "epoch": 5.079941267640101, "grad_norm": 8.375, "learning_rate": 5.813736031693745e-06, "loss": 2.6278, "num_input_tokens_seen": 65054144, "step": 31140 }, { "epoch": 5.080756994860918, "grad_norm": 11.625, "learning_rate": 5.807443446846522e-06, "loss": 2.3139, "num_input_tokens_seen": 65065792, "step": 31145 }, { "epoch": 5.081572722081736, "grad_norm": 11.3125, "learning_rate": 5.801153821732699e-06, "loss": 2.9428, "num_input_tokens_seen": 65075840, "step": 31150 }, { "epoch": 5.082388449302553, "grad_norm": 10.0, "learning_rate": 5.794867157322229e-06, "loss": 2.9868, "num_input_tokens_seen": 65086080, "step": 31155 }, { "epoch": 5.08320417652337, "grad_norm": 5.6875, "learning_rate": 5.788583454584593e-06, "loss": 2.35, "num_input_tokens_seen": 65096384, "step": 31160 }, { "epoch": 5.0840199037441876, "grad_norm": 5.59375, "learning_rate": 5.7823027144888075e-06, "loss": 2.5418, "num_input_tokens_seen": 65105280, "step": 31165 }, { "epoch": 5.084835630965006, "grad_norm": 11.1875, "learning_rate": 5.776024938003455e-06, "loss": 3.0747, "num_input_tokens_seen": 65115184, "step": 31170 }, { "epoch": 5.085651358185823, "grad_norm": 6.71875, "learning_rate": 5.7697501260966345e-06, "loss": 2.5843, "num_input_tokens_seen": 65127168, "step": 31175 }, { "epoch": 5.08646708540664, "grad_norm": 11.3125, "learning_rate": 5.7634782797360145e-06, "loss": 2.3403, "num_input_tokens_seen": 65138080, "step": 31180 }, { "epoch": 5.087282812627457, "grad_norm": 4.34375, "learning_rate": 5.757209399888777e-06, "loss": 1.2868, "num_input_tokens_seen": 65147600, "step": 31185 }, { "epoch": 5.088098539848275, "grad_norm": 7.6875, "learning_rate": 5.750943487521679e-06, "loss": 2.8511, "num_input_tokens_seen": 65158544, "step": 31190 }, { "epoch": 5.088914267069092, "grad_norm": 4.15625, "learning_rate": 5.744680543600986e-06, "loss": 2.4018, "num_input_tokens_seen": 65169168, "step": 31195 }, { "epoch": 5.089729994289909, "grad_norm": 6.0625, "learning_rate": 5.738420569092537e-06, "loss": 1.7275, "num_input_tokens_seen": 65180560, "step": 31200 }, { "epoch": 5.089729994289909, "eval_loss": 2.5577526092529297, "eval_runtime": 134.8369, "eval_samples_per_second": 20.21, "eval_steps_per_second": 10.109, "num_input_tokens_seen": 65180560, "step": 31200 }, { "epoch": 5.090545721510727, "grad_norm": 6.8125, "learning_rate": 5.732163564961684e-06, "loss": 2.4664, "num_input_tokens_seen": 65191952, "step": 31205 }, { "epoch": 5.091361448731544, "grad_norm": 8.6875, "learning_rate": 5.725909532173354e-06, "loss": 4.4519, "num_input_tokens_seen": 65202112, "step": 31210 }, { "epoch": 5.0921771759523615, "grad_norm": 5.78125, "learning_rate": 5.719658471691977e-06, "loss": 2.8088, "num_input_tokens_seen": 65213296, "step": 31215 }, { "epoch": 5.092992903173179, "grad_norm": 7.34375, "learning_rate": 5.71341038448156e-06, "loss": 1.8959, "num_input_tokens_seen": 65224384, "step": 31220 }, { "epoch": 5.093808630393997, "grad_norm": 3.484375, "learning_rate": 5.707165271505635e-06, "loss": 1.9655, "num_input_tokens_seen": 65235296, "step": 31225 }, { "epoch": 5.094624357614814, "grad_norm": 4.40625, "learning_rate": 5.700923133727271e-06, "loss": 1.7492, "num_input_tokens_seen": 65246192, "step": 31230 }, { "epoch": 5.095440084835631, "grad_norm": 3.984375, "learning_rate": 5.694683972109083e-06, "loss": 2.1711, "num_input_tokens_seen": 65256608, "step": 31235 }, { "epoch": 5.096255812056448, "grad_norm": 7.4375, "learning_rate": 5.688447787613241e-06, "loss": 1.4588, "num_input_tokens_seen": 65266624, "step": 31240 }, { "epoch": 5.097071539277266, "grad_norm": 5.46875, "learning_rate": 5.6822145812014285e-06, "loss": 2.5992, "num_input_tokens_seen": 65277120, "step": 31245 }, { "epoch": 5.097887266498083, "grad_norm": 5.4375, "learning_rate": 5.675984353834896e-06, "loss": 3.2496, "num_input_tokens_seen": 65287280, "step": 31250 }, { "epoch": 5.0987029937189, "grad_norm": 7.4375, "learning_rate": 5.66975710647441e-06, "loss": 1.9577, "num_input_tokens_seen": 65298608, "step": 31255 }, { "epoch": 5.0995187209397175, "grad_norm": 4.90625, "learning_rate": 5.663532840080304e-06, "loss": 2.0327, "num_input_tokens_seen": 65309600, "step": 31260 }, { "epoch": 5.1003344481605355, "grad_norm": 12.4375, "learning_rate": 5.6573115556124325e-06, "loss": 1.492, "num_input_tokens_seen": 65319648, "step": 31265 }, { "epoch": 5.101150175381353, "grad_norm": 3.84375, "learning_rate": 5.651093254030185e-06, "loss": 1.5161, "num_input_tokens_seen": 65329232, "step": 31270 }, { "epoch": 5.10196590260217, "grad_norm": 11.5625, "learning_rate": 5.644877936292514e-06, "loss": 4.6343, "num_input_tokens_seen": 65339040, "step": 31275 }, { "epoch": 5.102781629822987, "grad_norm": 9.0, "learning_rate": 5.638665603357901e-06, "loss": 1.9148, "num_input_tokens_seen": 65349104, "step": 31280 }, { "epoch": 5.103597357043805, "grad_norm": 1.140625, "learning_rate": 5.632456256184357e-06, "loss": 2.6505, "num_input_tokens_seen": 65359648, "step": 31285 }, { "epoch": 5.104413084264622, "grad_norm": 9.375, "learning_rate": 5.626249895729452e-06, "loss": 3.299, "num_input_tokens_seen": 65368528, "step": 31290 }, { "epoch": 5.105228811485439, "grad_norm": 12.4375, "learning_rate": 5.620046522950273e-06, "loss": 3.1876, "num_input_tokens_seen": 65378976, "step": 31295 }, { "epoch": 5.106044538706256, "grad_norm": 11.6875, "learning_rate": 5.613846138803464e-06, "loss": 2.3639, "num_input_tokens_seen": 65389056, "step": 31300 }, { "epoch": 5.106860265927074, "grad_norm": 3.765625, "learning_rate": 5.607648744245206e-06, "loss": 1.5838, "num_input_tokens_seen": 65399936, "step": 31305 }, { "epoch": 5.1076759931478914, "grad_norm": 4.625, "learning_rate": 5.601454340231207e-06, "loss": 3.0206, "num_input_tokens_seen": 65410416, "step": 31310 }, { "epoch": 5.108491720368709, "grad_norm": 6.78125, "learning_rate": 5.595262927716724e-06, "loss": 2.3451, "num_input_tokens_seen": 65420656, "step": 31315 }, { "epoch": 5.109307447589526, "grad_norm": 2.734375, "learning_rate": 5.589074507656561e-06, "loss": 3.1689, "num_input_tokens_seen": 65432000, "step": 31320 }, { "epoch": 5.110123174810344, "grad_norm": 4.9375, "learning_rate": 5.582889081005044e-06, "loss": 2.8485, "num_input_tokens_seen": 65443632, "step": 31325 }, { "epoch": 5.110938902031161, "grad_norm": 7.09375, "learning_rate": 5.5767066487160316e-06, "loss": 3.7526, "num_input_tokens_seen": 65453312, "step": 31330 }, { "epoch": 5.111754629251978, "grad_norm": 11.6875, "learning_rate": 5.570527211742949e-06, "loss": 2.1587, "num_input_tokens_seen": 65464128, "step": 31335 }, { "epoch": 5.112570356472795, "grad_norm": 5.90625, "learning_rate": 5.564350771038731e-06, "loss": 1.6937, "num_input_tokens_seen": 65474128, "step": 31340 }, { "epoch": 5.113386083693613, "grad_norm": 0.82421875, "learning_rate": 5.558177327555875e-06, "loss": 2.5633, "num_input_tokens_seen": 65484512, "step": 31345 }, { "epoch": 5.11420181091443, "grad_norm": 15.375, "learning_rate": 5.552006882246388e-06, "loss": 3.2128, "num_input_tokens_seen": 65495776, "step": 31350 }, { "epoch": 5.115017538135247, "grad_norm": 6.5625, "learning_rate": 5.545839436061839e-06, "loss": 1.8067, "num_input_tokens_seen": 65506848, "step": 31355 }, { "epoch": 5.1158332653560645, "grad_norm": 13.9375, "learning_rate": 5.539674989953331e-06, "loss": 2.3077, "num_input_tokens_seen": 65518288, "step": 31360 }, { "epoch": 5.1166489925768825, "grad_norm": 10.4375, "learning_rate": 5.533513544871488e-06, "loss": 2.3604, "num_input_tokens_seen": 65529024, "step": 31365 }, { "epoch": 5.1174647197977, "grad_norm": 8.25, "learning_rate": 5.527355101766493e-06, "loss": 4.0398, "num_input_tokens_seen": 65538688, "step": 31370 }, { "epoch": 5.118280447018517, "grad_norm": 4.59375, "learning_rate": 5.521199661588044e-06, "loss": 3.0619, "num_input_tokens_seen": 65547984, "step": 31375 }, { "epoch": 5.119096174239334, "grad_norm": 8.5, "learning_rate": 5.5150472252853944e-06, "loss": 1.9902, "num_input_tokens_seen": 65558400, "step": 31380 }, { "epoch": 5.119911901460152, "grad_norm": 5.59375, "learning_rate": 5.50889779380733e-06, "loss": 1.9461, "num_input_tokens_seen": 65568816, "step": 31385 }, { "epoch": 5.120727628680969, "grad_norm": 4.9375, "learning_rate": 5.5027513681021605e-06, "loss": 3.6197, "num_input_tokens_seen": 65579536, "step": 31390 }, { "epoch": 5.121543355901786, "grad_norm": 7.25, "learning_rate": 5.4966079491177545e-06, "loss": 1.7787, "num_input_tokens_seen": 65589984, "step": 31395 }, { "epoch": 5.122359083122603, "grad_norm": 1.75, "learning_rate": 5.490467537801491e-06, "loss": 0.6242, "num_input_tokens_seen": 65600032, "step": 31400 }, { "epoch": 5.122359083122603, "eval_loss": 2.5458450317382812, "eval_runtime": 134.5275, "eval_samples_per_second": 20.256, "eval_steps_per_second": 10.132, "num_input_tokens_seen": 65600032, "step": 31400 }, { "epoch": 5.123174810343421, "grad_norm": 3.3125, "learning_rate": 5.484330135100313e-06, "loss": 1.4811, "num_input_tokens_seen": 65609696, "step": 31405 }, { "epoch": 5.1239905375642385, "grad_norm": 16.375, "learning_rate": 5.4781957419606785e-06, "loss": 2.3935, "num_input_tokens_seen": 65620784, "step": 31410 }, { "epoch": 5.124806264785056, "grad_norm": 1.2578125, "learning_rate": 5.472064359328577e-06, "loss": 0.8026, "num_input_tokens_seen": 65631760, "step": 31415 }, { "epoch": 5.125621992005874, "grad_norm": 4.09375, "learning_rate": 5.4659359881495565e-06, "loss": 1.9882, "num_input_tokens_seen": 65642784, "step": 31420 }, { "epoch": 5.126437719226691, "grad_norm": 4.46875, "learning_rate": 5.4598106293686916e-06, "loss": 1.474, "num_input_tokens_seen": 65653392, "step": 31425 }, { "epoch": 5.127253446447508, "grad_norm": 13.9375, "learning_rate": 5.45368828393058e-06, "loss": 3.1723, "num_input_tokens_seen": 65662032, "step": 31430 }, { "epoch": 5.128069173668325, "grad_norm": 2.171875, "learning_rate": 5.44756895277937e-06, "loss": 3.1463, "num_input_tokens_seen": 65672000, "step": 31435 }, { "epoch": 5.128884900889143, "grad_norm": 7.8125, "learning_rate": 5.441452636858746e-06, "loss": 2.1267, "num_input_tokens_seen": 65681360, "step": 31440 }, { "epoch": 5.12970062810996, "grad_norm": 13.0625, "learning_rate": 5.435339337111905e-06, "loss": 1.2637, "num_input_tokens_seen": 65691824, "step": 31445 }, { "epoch": 5.130516355330777, "grad_norm": 8.625, "learning_rate": 5.42922905448161e-06, "loss": 2.6203, "num_input_tokens_seen": 65701424, "step": 31450 }, { "epoch": 5.1313320825515945, "grad_norm": 6.75, "learning_rate": 5.423121789910129e-06, "loss": 2.3989, "num_input_tokens_seen": 65710192, "step": 31455 }, { "epoch": 5.1321478097724125, "grad_norm": 7.65625, "learning_rate": 5.417017544339287e-06, "loss": 0.9086, "num_input_tokens_seen": 65718128, "step": 31460 }, { "epoch": 5.13296353699323, "grad_norm": 5.15625, "learning_rate": 5.410916318710443e-06, "loss": 2.3129, "num_input_tokens_seen": 65729200, "step": 31465 }, { "epoch": 5.133779264214047, "grad_norm": 3.53125, "learning_rate": 5.404818113964466e-06, "loss": 3.1282, "num_input_tokens_seen": 65739136, "step": 31470 }, { "epoch": 5.134594991434864, "grad_norm": 6.5625, "learning_rate": 5.398722931041792e-06, "loss": 2.3737, "num_input_tokens_seen": 65750576, "step": 31475 }, { "epoch": 5.135410718655682, "grad_norm": 8.6875, "learning_rate": 5.392630770882367e-06, "loss": 2.7045, "num_input_tokens_seen": 65762304, "step": 31480 }, { "epoch": 5.136226445876499, "grad_norm": 4.53125, "learning_rate": 5.3865416344256705e-06, "loss": 2.1857, "num_input_tokens_seen": 65772672, "step": 31485 }, { "epoch": 5.137042173097316, "grad_norm": 2.171875, "learning_rate": 5.380455522610742e-06, "loss": 2.0472, "num_input_tokens_seen": 65781568, "step": 31490 }, { "epoch": 5.137857900318133, "grad_norm": 9.25, "learning_rate": 5.374372436376116e-06, "loss": 2.1372, "num_input_tokens_seen": 65791568, "step": 31495 }, { "epoch": 5.138673627538951, "grad_norm": 6.5, "learning_rate": 5.368292376659895e-06, "loss": 3.6817, "num_input_tokens_seen": 65801920, "step": 31500 }, { "epoch": 5.139489354759768, "grad_norm": 6.59375, "learning_rate": 5.362215344399701e-06, "loss": 2.7558, "num_input_tokens_seen": 65811280, "step": 31505 }, { "epoch": 5.1403050819805856, "grad_norm": 4.65625, "learning_rate": 5.356141340532678e-06, "loss": 1.0185, "num_input_tokens_seen": 65821712, "step": 31510 }, { "epoch": 5.141120809201403, "grad_norm": 3.484375, "learning_rate": 5.350070365995522e-06, "loss": 1.1248, "num_input_tokens_seen": 65832400, "step": 31515 }, { "epoch": 5.141936536422221, "grad_norm": 10.6875, "learning_rate": 5.344002421724459e-06, "loss": 2.8597, "num_input_tokens_seen": 65843712, "step": 31520 }, { "epoch": 5.142752263643038, "grad_norm": 9.375, "learning_rate": 5.337937508655228e-06, "loss": 3.1524, "num_input_tokens_seen": 65853920, "step": 31525 }, { "epoch": 5.143567990863855, "grad_norm": 8.875, "learning_rate": 5.331875627723126e-06, "loss": 3.1163, "num_input_tokens_seen": 65863536, "step": 31530 }, { "epoch": 5.144383718084672, "grad_norm": 7.75, "learning_rate": 5.325816779862963e-06, "loss": 1.6013, "num_input_tokens_seen": 65874544, "step": 31535 }, { "epoch": 5.14519944530549, "grad_norm": 3.953125, "learning_rate": 5.319760966009102e-06, "loss": 2.2666, "num_input_tokens_seen": 65884400, "step": 31540 }, { "epoch": 5.146015172526307, "grad_norm": 7.09375, "learning_rate": 5.3137081870954096e-06, "loss": 2.6424, "num_input_tokens_seen": 65895632, "step": 31545 }, { "epoch": 5.146830899747124, "grad_norm": 3.90625, "learning_rate": 5.307658444055313e-06, "loss": 4.0871, "num_input_tokens_seen": 65906416, "step": 31550 }, { "epoch": 5.1476466269679415, "grad_norm": 6.65625, "learning_rate": 5.301611737821749e-06, "loss": 1.9586, "num_input_tokens_seen": 65917248, "step": 31555 }, { "epoch": 5.1484623541887595, "grad_norm": 8.1875, "learning_rate": 5.295568069327206e-06, "loss": 2.1103, "num_input_tokens_seen": 65926992, "step": 31560 }, { "epoch": 5.149278081409577, "grad_norm": 6.0625, "learning_rate": 5.289527439503683e-06, "loss": 2.357, "num_input_tokens_seen": 65937008, "step": 31565 }, { "epoch": 5.150093808630394, "grad_norm": 7.75, "learning_rate": 5.28348984928273e-06, "loss": 1.8295, "num_input_tokens_seen": 65946864, "step": 31570 }, { "epoch": 5.150909535851211, "grad_norm": 6.59375, "learning_rate": 5.27745529959541e-06, "loss": 2.5535, "num_input_tokens_seen": 65958320, "step": 31575 }, { "epoch": 5.151725263072029, "grad_norm": 7.125, "learning_rate": 5.271423791372335e-06, "loss": 2.7574, "num_input_tokens_seen": 65966784, "step": 31580 }, { "epoch": 5.152540990292846, "grad_norm": 9.625, "learning_rate": 5.26539532554364e-06, "loss": 1.6575, "num_input_tokens_seen": 65977376, "step": 31585 }, { "epoch": 5.153356717513663, "grad_norm": 14.125, "learning_rate": 5.25936990303898e-06, "loss": 1.8816, "num_input_tokens_seen": 65987472, "step": 31590 }, { "epoch": 5.154172444734481, "grad_norm": 4.125, "learning_rate": 5.253347524787555e-06, "loss": 2.6775, "num_input_tokens_seen": 65997968, "step": 31595 }, { "epoch": 5.154988171955298, "grad_norm": 9.125, "learning_rate": 5.2473281917181035e-06, "loss": 2.2156, "num_input_tokens_seen": 66007440, "step": 31600 }, { "epoch": 5.154988171955298, "eval_loss": 2.548614740371704, "eval_runtime": 134.8377, "eval_samples_per_second": 20.209, "eval_steps_per_second": 10.108, "num_input_tokens_seen": 66007440, "step": 31600 }, { "epoch": 5.1558038991761155, "grad_norm": 4.53125, "learning_rate": 5.241311904758864e-06, "loss": 2.1731, "num_input_tokens_seen": 66018320, "step": 31605 }, { "epoch": 5.156619626396933, "grad_norm": 4.78125, "learning_rate": 5.23529866483764e-06, "loss": 3.0881, "num_input_tokens_seen": 66027536, "step": 31610 }, { "epoch": 5.157435353617751, "grad_norm": 8.875, "learning_rate": 5.229288472881732e-06, "loss": 2.8026, "num_input_tokens_seen": 66036640, "step": 31615 }, { "epoch": 5.158251080838568, "grad_norm": 2.46875, "learning_rate": 5.2232813298180025e-06, "loss": 2.0484, "num_input_tokens_seen": 66046416, "step": 31620 }, { "epoch": 5.159066808059385, "grad_norm": 4.625, "learning_rate": 5.217277236572824e-06, "loss": 2.4164, "num_input_tokens_seen": 66056624, "step": 31625 }, { "epoch": 5.159882535280202, "grad_norm": 6.59375, "learning_rate": 5.211276194072093e-06, "loss": 1.9527, "num_input_tokens_seen": 66066896, "step": 31630 }, { "epoch": 5.16069826250102, "grad_norm": 0.921875, "learning_rate": 5.205278203241254e-06, "loss": 1.3941, "num_input_tokens_seen": 66077888, "step": 31635 }, { "epoch": 5.161513989721837, "grad_norm": 9.0625, "learning_rate": 5.199283265005278e-06, "loss": 1.2259, "num_input_tokens_seen": 66088368, "step": 31640 }, { "epoch": 5.162329716942654, "grad_norm": 4.40625, "learning_rate": 5.193291380288648e-06, "loss": 2.1741, "num_input_tokens_seen": 66099520, "step": 31645 }, { "epoch": 5.163145444163471, "grad_norm": 7.09375, "learning_rate": 5.1873025500153995e-06, "loss": 2.6937, "num_input_tokens_seen": 66110880, "step": 31650 }, { "epoch": 5.1639611713842895, "grad_norm": 8.8125, "learning_rate": 5.181316775109071e-06, "loss": 1.6862, "num_input_tokens_seen": 66121072, "step": 31655 }, { "epoch": 5.164776898605107, "grad_norm": 6.375, "learning_rate": 5.1753340564927564e-06, "loss": 1.5433, "num_input_tokens_seen": 66130272, "step": 31660 }, { "epoch": 5.165592625825924, "grad_norm": 11.9375, "learning_rate": 5.169354395089068e-06, "loss": 2.9109, "num_input_tokens_seen": 66140704, "step": 31665 }, { "epoch": 5.166408353046741, "grad_norm": 4.8125, "learning_rate": 5.1633777918201346e-06, "loss": 1.189, "num_input_tokens_seen": 66151952, "step": 31670 }, { "epoch": 5.167224080267559, "grad_norm": 12.75, "learning_rate": 5.157404247607625e-06, "loss": 2.5385, "num_input_tokens_seen": 66161168, "step": 31675 }, { "epoch": 5.168039807488376, "grad_norm": 8.0625, "learning_rate": 5.1514337633727454e-06, "loss": 2.6702, "num_input_tokens_seen": 66170128, "step": 31680 }, { "epoch": 5.168855534709193, "grad_norm": 9.75, "learning_rate": 5.145466340036206e-06, "loss": 2.979, "num_input_tokens_seen": 66181056, "step": 31685 }, { "epoch": 5.16967126193001, "grad_norm": 12.4375, "learning_rate": 5.139501978518274e-06, "loss": 2.5643, "num_input_tokens_seen": 66192928, "step": 31690 }, { "epoch": 5.170486989150828, "grad_norm": 4.5625, "learning_rate": 5.133540679738716e-06, "loss": 3.7068, "num_input_tokens_seen": 66201744, "step": 31695 }, { "epoch": 5.171302716371645, "grad_norm": 8.75, "learning_rate": 5.127582444616838e-06, "loss": 2.7309, "num_input_tokens_seen": 66211488, "step": 31700 }, { "epoch": 5.1721184435924625, "grad_norm": 6.5, "learning_rate": 5.121627274071486e-06, "loss": 3.3983, "num_input_tokens_seen": 66222560, "step": 31705 }, { "epoch": 5.17293417081328, "grad_norm": 9.125, "learning_rate": 5.115675169021009e-06, "loss": 1.6988, "num_input_tokens_seen": 66233088, "step": 31710 }, { "epoch": 5.173749898034098, "grad_norm": 0.90625, "learning_rate": 5.1097261303832994e-06, "loss": 2.181, "num_input_tokens_seen": 66243952, "step": 31715 }, { "epoch": 5.174565625254915, "grad_norm": 0.07861328125, "learning_rate": 5.103780159075788e-06, "loss": 1.5375, "num_input_tokens_seen": 66253872, "step": 31720 }, { "epoch": 5.175381352475732, "grad_norm": 8.9375, "learning_rate": 5.0978372560154e-06, "loss": 3.1914, "num_input_tokens_seen": 66263152, "step": 31725 }, { "epoch": 5.176197079696549, "grad_norm": 11.0625, "learning_rate": 5.091897422118619e-06, "loss": 3.4578, "num_input_tokens_seen": 66273792, "step": 31730 }, { "epoch": 5.177012806917367, "grad_norm": 9.625, "learning_rate": 5.0859606583014305e-06, "loss": 2.2166, "num_input_tokens_seen": 66284208, "step": 31735 }, { "epoch": 5.177828534138184, "grad_norm": 5.75, "learning_rate": 5.080026965479365e-06, "loss": 2.4539, "num_input_tokens_seen": 66294992, "step": 31740 }, { "epoch": 5.178644261359001, "grad_norm": 20.25, "learning_rate": 5.074096344567475e-06, "loss": 1.4331, "num_input_tokens_seen": 66304032, "step": 31745 }, { "epoch": 5.1794599885798185, "grad_norm": 19.125, "learning_rate": 5.0681687964803294e-06, "loss": 4.2854, "num_input_tokens_seen": 66313328, "step": 31750 }, { "epoch": 5.1802757158006365, "grad_norm": 8.5, "learning_rate": 5.06224432213204e-06, "loss": 3.1259, "num_input_tokens_seen": 66321696, "step": 31755 }, { "epoch": 5.181091443021454, "grad_norm": 4.1875, "learning_rate": 5.056322922436224e-06, "loss": 1.9123, "num_input_tokens_seen": 66331744, "step": 31760 }, { "epoch": 5.181907170242271, "grad_norm": 11.6875, "learning_rate": 5.0504045983060465e-06, "loss": 2.8251, "num_input_tokens_seen": 66341456, "step": 31765 }, { "epoch": 5.182722897463089, "grad_norm": 11.0625, "learning_rate": 5.044489350654183e-06, "loss": 2.0641, "num_input_tokens_seen": 66350864, "step": 31770 }, { "epoch": 5.183538624683906, "grad_norm": 4.25, "learning_rate": 5.038577180392831e-06, "loss": 1.2624, "num_input_tokens_seen": 66361136, "step": 31775 }, { "epoch": 5.184354351904723, "grad_norm": 5.96875, "learning_rate": 5.032668088433729e-06, "loss": 2.4957, "num_input_tokens_seen": 66371056, "step": 31780 }, { "epoch": 5.18517007912554, "grad_norm": 10.0625, "learning_rate": 5.02676207568814e-06, "loss": 3.5543, "num_input_tokens_seen": 66381744, "step": 31785 }, { "epoch": 5.185985806346358, "grad_norm": 12.25, "learning_rate": 5.02085914306683e-06, "loss": 1.855, "num_input_tokens_seen": 66393056, "step": 31790 }, { "epoch": 5.186801533567175, "grad_norm": 6.9375, "learning_rate": 5.014959291480123e-06, "loss": 2.3199, "num_input_tokens_seen": 66405392, "step": 31795 }, { "epoch": 5.1876172607879925, "grad_norm": 9.375, "learning_rate": 5.009062521837835e-06, "loss": 3.6034, "num_input_tokens_seen": 66416480, "step": 31800 }, { "epoch": 5.1876172607879925, "eval_loss": 2.537795066833496, "eval_runtime": 134.81, "eval_samples_per_second": 20.214, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 66416480, "step": 31800 }, { "epoch": 5.18843298800881, "grad_norm": 7.15625, "learning_rate": 5.003168835049324e-06, "loss": 2.4163, "num_input_tokens_seen": 66427040, "step": 31805 }, { "epoch": 5.189248715229628, "grad_norm": 6.1875, "learning_rate": 4.997278232023483e-06, "loss": 4.4261, "num_input_tokens_seen": 66436864, "step": 31810 }, { "epoch": 5.190064442450445, "grad_norm": 11.9375, "learning_rate": 4.9913907136687036e-06, "loss": 5.244, "num_input_tokens_seen": 66446400, "step": 31815 }, { "epoch": 5.190880169671262, "grad_norm": 5.84375, "learning_rate": 4.985506280892918e-06, "loss": 2.8668, "num_input_tokens_seen": 66456784, "step": 31820 }, { "epoch": 5.191695896892079, "grad_norm": 4.65625, "learning_rate": 4.979624934603589e-06, "loss": 2.4323, "num_input_tokens_seen": 66466704, "step": 31825 }, { "epoch": 5.192511624112897, "grad_norm": 4.71875, "learning_rate": 4.97374667570768e-06, "loss": 3.0871, "num_input_tokens_seen": 66477280, "step": 31830 }, { "epoch": 5.193327351333714, "grad_norm": 2.5625, "learning_rate": 4.967871505111704e-06, "loss": 2.7279, "num_input_tokens_seen": 66487744, "step": 31835 }, { "epoch": 5.194143078554531, "grad_norm": 5.15625, "learning_rate": 4.961999423721686e-06, "loss": 3.3858, "num_input_tokens_seen": 66498080, "step": 31840 }, { "epoch": 5.194958805775348, "grad_norm": 2.921875, "learning_rate": 4.956130432443159e-06, "loss": 2.5598, "num_input_tokens_seen": 66509040, "step": 31845 }, { "epoch": 5.195774532996166, "grad_norm": 8.375, "learning_rate": 4.950264532181215e-06, "loss": 2.0359, "num_input_tokens_seen": 66517952, "step": 31850 }, { "epoch": 5.196590260216984, "grad_norm": 5.3125, "learning_rate": 4.944401723840433e-06, "loss": 3.504, "num_input_tokens_seen": 66529056, "step": 31855 }, { "epoch": 5.197405987437801, "grad_norm": 6.625, "learning_rate": 4.938542008324942e-06, "loss": 2.0019, "num_input_tokens_seen": 66538912, "step": 31860 }, { "epoch": 5.198221714658618, "grad_norm": 8.4375, "learning_rate": 4.9326853865383855e-06, "loss": 1.951, "num_input_tokens_seen": 66548272, "step": 31865 }, { "epoch": 5.199037441879436, "grad_norm": 6.75, "learning_rate": 4.926831859383918e-06, "loss": 3.8909, "num_input_tokens_seen": 66559104, "step": 31870 }, { "epoch": 5.199853169100253, "grad_norm": 12.5, "learning_rate": 4.92098142776424e-06, "loss": 3.1511, "num_input_tokens_seen": 66568928, "step": 31875 }, { "epoch": 5.20066889632107, "grad_norm": 3.28125, "learning_rate": 4.91513409258155e-06, "loss": 2.3576, "num_input_tokens_seen": 66579952, "step": 31880 }, { "epoch": 5.201484623541887, "grad_norm": 7.15625, "learning_rate": 4.909289854737581e-06, "loss": 2.5183, "num_input_tokens_seen": 66589232, "step": 31885 }, { "epoch": 5.202300350762705, "grad_norm": 9.3125, "learning_rate": 4.903448715133602e-06, "loss": 2.877, "num_input_tokens_seen": 66599728, "step": 31890 }, { "epoch": 5.203116077983522, "grad_norm": 7.0625, "learning_rate": 4.897610674670372e-06, "loss": 2.5958, "num_input_tokens_seen": 66609392, "step": 31895 }, { "epoch": 5.2039318052043395, "grad_norm": 3.578125, "learning_rate": 4.8917757342482e-06, "loss": 2.2184, "num_input_tokens_seen": 66618816, "step": 31900 }, { "epoch": 5.204747532425157, "grad_norm": 5.0625, "learning_rate": 4.885943894766909e-06, "loss": 2.4726, "num_input_tokens_seen": 66629344, "step": 31905 }, { "epoch": 5.205563259645975, "grad_norm": 6.09375, "learning_rate": 4.880115157125842e-06, "loss": 1.6628, "num_input_tokens_seen": 66641392, "step": 31910 }, { "epoch": 5.206378986866792, "grad_norm": 7.125, "learning_rate": 4.874289522223857e-06, "loss": 2.5919, "num_input_tokens_seen": 66652272, "step": 31915 }, { "epoch": 5.207194714087609, "grad_norm": 6.6875, "learning_rate": 4.868466990959339e-06, "loss": 2.502, "num_input_tokens_seen": 66662016, "step": 31920 }, { "epoch": 5.208010441308426, "grad_norm": 7.59375, "learning_rate": 4.8626475642301964e-06, "loss": 2.9058, "num_input_tokens_seen": 66672032, "step": 31925 }, { "epoch": 5.208826168529244, "grad_norm": 5.65625, "learning_rate": 4.856831242933871e-06, "loss": 2.4547, "num_input_tokens_seen": 66684560, "step": 31930 }, { "epoch": 5.209641895750061, "grad_norm": 17.875, "learning_rate": 4.851018027967294e-06, "loss": 3.6895, "num_input_tokens_seen": 66694736, "step": 31935 }, { "epoch": 5.210457622970878, "grad_norm": 5.25, "learning_rate": 4.845207920226946e-06, "loss": 1.7334, "num_input_tokens_seen": 66705392, "step": 31940 }, { "epoch": 5.211273350191696, "grad_norm": 5.625, "learning_rate": 4.839400920608825e-06, "loss": 1.8278, "num_input_tokens_seen": 66716144, "step": 31945 }, { "epoch": 5.2120890774125135, "grad_norm": 4.71875, "learning_rate": 4.83359703000843e-06, "loss": 1.5545, "num_input_tokens_seen": 66726768, "step": 31950 }, { "epoch": 5.212904804633331, "grad_norm": 2.296875, "learning_rate": 4.827796249320804e-06, "loss": 3.9527, "num_input_tokens_seen": 66737952, "step": 31955 }, { "epoch": 5.213720531854148, "grad_norm": 14.4375, "learning_rate": 4.82199857944049e-06, "loss": 2.9756, "num_input_tokens_seen": 66746976, "step": 31960 }, { "epoch": 5.214536259074965, "grad_norm": 10.8125, "learning_rate": 4.8162040212615695e-06, "loss": 2.7158, "num_input_tokens_seen": 66757648, "step": 31965 }, { "epoch": 5.215351986295783, "grad_norm": 4.40625, "learning_rate": 4.810412575677639e-06, "loss": 2.2416, "num_input_tokens_seen": 66768464, "step": 31970 }, { "epoch": 5.2161677135166, "grad_norm": 6.375, "learning_rate": 4.804624243581801e-06, "loss": 2.1075, "num_input_tokens_seen": 66778624, "step": 31975 }, { "epoch": 5.216983440737417, "grad_norm": 8.25, "learning_rate": 4.798839025866703e-06, "loss": 3.6147, "num_input_tokens_seen": 66789776, "step": 31980 }, { "epoch": 5.217799167958235, "grad_norm": 6.65625, "learning_rate": 4.793056923424491e-06, "loss": 1.8919, "num_input_tokens_seen": 66798656, "step": 31985 }, { "epoch": 5.218614895179052, "grad_norm": 7.53125, "learning_rate": 4.78727793714683e-06, "loss": 2.1721, "num_input_tokens_seen": 66808448, "step": 31990 }, { "epoch": 5.219430622399869, "grad_norm": 7.5625, "learning_rate": 4.7815020679249285e-06, "loss": 1.8113, "num_input_tokens_seen": 66818384, "step": 31995 }, { "epoch": 5.220246349620687, "grad_norm": 3.5, "learning_rate": 4.775729316649483e-06, "loss": 1.1157, "num_input_tokens_seen": 66829712, "step": 32000 }, { "epoch": 5.220246349620687, "eval_loss": 2.5336523056030273, "eval_runtime": 134.9659, "eval_samples_per_second": 20.19, "eval_steps_per_second": 10.099, "num_input_tokens_seen": 66829712, "step": 32000 }, { "epoch": 5.221062076841505, "grad_norm": 6.125, "learning_rate": 4.769959684210728e-06, "loss": 1.2541, "num_input_tokens_seen": 66839920, "step": 32005 }, { "epoch": 5.221877804062322, "grad_norm": 4.71875, "learning_rate": 4.764193171498426e-06, "loss": 1.5787, "num_input_tokens_seen": 66851664, "step": 32010 }, { "epoch": 5.222693531283139, "grad_norm": 5.125, "learning_rate": 4.75842977940183e-06, "loss": 2.7063, "num_input_tokens_seen": 66861888, "step": 32015 }, { "epoch": 5.223509258503956, "grad_norm": 7.03125, "learning_rate": 4.752669508809729e-06, "loss": 2.559, "num_input_tokens_seen": 66873440, "step": 32020 }, { "epoch": 5.224324985724774, "grad_norm": 3.71875, "learning_rate": 4.746912360610445e-06, "loss": 1.5809, "num_input_tokens_seen": 66884336, "step": 32025 }, { "epoch": 5.225140712945591, "grad_norm": 5.4375, "learning_rate": 4.741158335691781e-06, "loss": 1.2973, "num_input_tokens_seen": 66894608, "step": 32030 }, { "epoch": 5.225956440166408, "grad_norm": 8.9375, "learning_rate": 4.7354074349410994e-06, "loss": 2.3383, "num_input_tokens_seen": 66904624, "step": 32035 }, { "epoch": 5.226772167387225, "grad_norm": 4.34375, "learning_rate": 4.729659659245245e-06, "loss": 3.308, "num_input_tokens_seen": 66916560, "step": 32040 }, { "epoch": 5.227587894608043, "grad_norm": 8.8125, "learning_rate": 4.723915009490601e-06, "loss": 3.1977, "num_input_tokens_seen": 66926144, "step": 32045 }, { "epoch": 5.2284036218288605, "grad_norm": 6.46875, "learning_rate": 4.718173486563077e-06, "loss": 1.9637, "num_input_tokens_seen": 66936800, "step": 32050 }, { "epoch": 5.229219349049678, "grad_norm": 8.1875, "learning_rate": 4.71243509134808e-06, "loss": 2.5018, "num_input_tokens_seen": 66947168, "step": 32055 }, { "epoch": 5.230035076270495, "grad_norm": 6.28125, "learning_rate": 4.706699824730532e-06, "loss": 2.8128, "num_input_tokens_seen": 66958176, "step": 32060 }, { "epoch": 5.230850803491313, "grad_norm": 8.4375, "learning_rate": 4.700967687594901e-06, "loss": 2.9674, "num_input_tokens_seen": 66969808, "step": 32065 }, { "epoch": 5.23166653071213, "grad_norm": 7.6875, "learning_rate": 4.69523868082514e-06, "loss": 3.2826, "num_input_tokens_seen": 66981632, "step": 32070 }, { "epoch": 5.232482257932947, "grad_norm": 12.6875, "learning_rate": 4.689512805304747e-06, "loss": 2.8637, "num_input_tokens_seen": 66992208, "step": 32075 }, { "epoch": 5.233297985153764, "grad_norm": 7.875, "learning_rate": 4.683790061916707e-06, "loss": 2.0746, "num_input_tokens_seen": 67002880, "step": 32080 }, { "epoch": 5.234113712374582, "grad_norm": 7.0, "learning_rate": 4.678070451543551e-06, "loss": 1.8365, "num_input_tokens_seen": 67014272, "step": 32085 }, { "epoch": 5.234929439595399, "grad_norm": 14.875, "learning_rate": 4.6723539750673204e-06, "loss": 2.4565, "num_input_tokens_seen": 67024096, "step": 32090 }, { "epoch": 5.2357451668162165, "grad_norm": 2.984375, "learning_rate": 4.666640633369551e-06, "loss": 1.5985, "num_input_tokens_seen": 67033792, "step": 32095 }, { "epoch": 5.236560894037034, "grad_norm": 6.59375, "learning_rate": 4.660930427331323e-06, "loss": 4.1726, "num_input_tokens_seen": 67044784, "step": 32100 }, { "epoch": 5.237376621257852, "grad_norm": 0.29296875, "learning_rate": 4.6552233578332244e-06, "loss": 1.3576, "num_input_tokens_seen": 67056784, "step": 32105 }, { "epoch": 5.238192348478669, "grad_norm": 4.28125, "learning_rate": 4.649519425755347e-06, "loss": 1.5543, "num_input_tokens_seen": 67067072, "step": 32110 }, { "epoch": 5.239008075699486, "grad_norm": 10.4375, "learning_rate": 4.64381863197732e-06, "loss": 2.8785, "num_input_tokens_seen": 67076736, "step": 32115 }, { "epoch": 5.239823802920303, "grad_norm": 3.859375, "learning_rate": 4.638120977378269e-06, "loss": 2.0424, "num_input_tokens_seen": 67087712, "step": 32120 }, { "epoch": 5.240639530141121, "grad_norm": 5.125, "learning_rate": 4.632426462836848e-06, "loss": 2.3282, "num_input_tokens_seen": 67097968, "step": 32125 }, { "epoch": 5.241455257361938, "grad_norm": 12.0, "learning_rate": 4.626735089231224e-06, "loss": 3.935, "num_input_tokens_seen": 67108800, "step": 32130 }, { "epoch": 5.242270984582755, "grad_norm": 0.138671875, "learning_rate": 4.621046857439068e-06, "loss": 3.3208, "num_input_tokens_seen": 67119088, "step": 32135 }, { "epoch": 5.243086711803572, "grad_norm": 9.3125, "learning_rate": 4.615361768337587e-06, "loss": 2.145, "num_input_tokens_seen": 67130176, "step": 32140 }, { "epoch": 5.2439024390243905, "grad_norm": 6.0625, "learning_rate": 4.6096798228034946e-06, "loss": 1.6552, "num_input_tokens_seen": 67140512, "step": 32145 }, { "epoch": 5.244718166245208, "grad_norm": 2.46875, "learning_rate": 4.604001021713008e-06, "loss": 1.3819, "num_input_tokens_seen": 67150560, "step": 32150 }, { "epoch": 5.245533893466025, "grad_norm": 4.75, "learning_rate": 4.598325365941883e-06, "loss": 2.4702, "num_input_tokens_seen": 67160880, "step": 32155 }, { "epoch": 5.246349620686843, "grad_norm": 2.53125, "learning_rate": 4.5926528563653645e-06, "loss": 2.679, "num_input_tokens_seen": 67171072, "step": 32160 }, { "epoch": 5.24716534790766, "grad_norm": 2.4375, "learning_rate": 4.5869834938582295e-06, "loss": 2.5635, "num_input_tokens_seen": 67182512, "step": 32165 }, { "epoch": 5.247981075128477, "grad_norm": 6.90625, "learning_rate": 4.581317279294772e-06, "loss": 1.6906, "num_input_tokens_seen": 67192528, "step": 32170 }, { "epoch": 5.248796802349294, "grad_norm": 3.828125, "learning_rate": 4.57565421354878e-06, "loss": 2.5127, "num_input_tokens_seen": 67201712, "step": 32175 }, { "epoch": 5.249612529570112, "grad_norm": 8.1875, "learning_rate": 4.569994297493579e-06, "loss": 2.983, "num_input_tokens_seen": 67211488, "step": 32180 }, { "epoch": 5.250428256790929, "grad_norm": 10.3125, "learning_rate": 4.564337532002002e-06, "loss": 2.785, "num_input_tokens_seen": 67223168, "step": 32185 }, { "epoch": 5.251243984011746, "grad_norm": 11.0, "learning_rate": 4.55868391794638e-06, "loss": 2.3999, "num_input_tokens_seen": 67233920, "step": 32190 }, { "epoch": 5.2520597112325635, "grad_norm": 5.6875, "learning_rate": 4.553033456198588e-06, "loss": 2.4823, "num_input_tokens_seen": 67244016, "step": 32195 }, { "epoch": 5.252875438453382, "grad_norm": 6.625, "learning_rate": 4.54738614762999e-06, "loss": 2.284, "num_input_tokens_seen": 67253936, "step": 32200 }, { "epoch": 5.252875438453382, "eval_loss": 2.551058292388916, "eval_runtime": 134.808, "eval_samples_per_second": 20.214, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 67253936, "step": 32200 }, { "epoch": 5.253691165674199, "grad_norm": 16.25, "learning_rate": 4.541741993111465e-06, "loss": 4.9573, "num_input_tokens_seen": 67263344, "step": 32205 }, { "epoch": 5.254506892895016, "grad_norm": 6.46875, "learning_rate": 4.536100993513423e-06, "loss": 0.8461, "num_input_tokens_seen": 67272144, "step": 32210 }, { "epoch": 5.255322620115833, "grad_norm": 12.9375, "learning_rate": 4.530463149705768e-06, "loss": 1.5401, "num_input_tokens_seen": 67282128, "step": 32215 }, { "epoch": 5.256138347336651, "grad_norm": 8.9375, "learning_rate": 4.524828462557934e-06, "loss": 2.2447, "num_input_tokens_seen": 67293008, "step": 32220 }, { "epoch": 5.256954074557468, "grad_norm": 4.0, "learning_rate": 4.5191969329388625e-06, "loss": 2.0911, "num_input_tokens_seen": 67303792, "step": 32225 }, { "epoch": 5.257769801778285, "grad_norm": 11.0625, "learning_rate": 4.5135685617169965e-06, "loss": 3.7871, "num_input_tokens_seen": 67315088, "step": 32230 }, { "epoch": 5.258585528999102, "grad_norm": 9.625, "learning_rate": 4.507943349760313e-06, "loss": 1.454, "num_input_tokens_seen": 67325984, "step": 32235 }, { "epoch": 5.25940125621992, "grad_norm": 9.3125, "learning_rate": 4.502321297936277e-06, "loss": 3.54, "num_input_tokens_seen": 67336736, "step": 32240 }, { "epoch": 5.2602169834407375, "grad_norm": 6.28125, "learning_rate": 4.496702407111888e-06, "loss": 2.2236, "num_input_tokens_seen": 67348336, "step": 32245 }, { "epoch": 5.261032710661555, "grad_norm": 7.75, "learning_rate": 4.491086678153653e-06, "loss": 1.8731, "num_input_tokens_seen": 67358048, "step": 32250 }, { "epoch": 5.261848437882372, "grad_norm": 12.75, "learning_rate": 4.485474111927579e-06, "loss": 3.4648, "num_input_tokens_seen": 67367520, "step": 32255 }, { "epoch": 5.26266416510319, "grad_norm": 7.46875, "learning_rate": 4.479864709299197e-06, "loss": 1.6093, "num_input_tokens_seen": 67379680, "step": 32260 }, { "epoch": 5.263479892324007, "grad_norm": 5.15625, "learning_rate": 4.474258471133555e-06, "loss": 1.1844, "num_input_tokens_seen": 67389632, "step": 32265 }, { "epoch": 5.264295619544824, "grad_norm": 4.65625, "learning_rate": 4.4686553982952014e-06, "loss": 2.8365, "num_input_tokens_seen": 67400288, "step": 32270 }, { "epoch": 5.265111346765641, "grad_norm": 6.15625, "learning_rate": 4.463055491648191e-06, "loss": 3.1175, "num_input_tokens_seen": 67410336, "step": 32275 }, { "epoch": 5.265927073986459, "grad_norm": 6.84375, "learning_rate": 4.457458752056112e-06, "loss": 2.277, "num_input_tokens_seen": 67420944, "step": 32280 }, { "epoch": 5.266742801207276, "grad_norm": 5.90625, "learning_rate": 4.451865180382042e-06, "loss": 3.3839, "num_input_tokens_seen": 67431184, "step": 32285 }, { "epoch": 5.2675585284280935, "grad_norm": 9.25, "learning_rate": 4.4462747774885936e-06, "loss": 1.4052, "num_input_tokens_seen": 67442512, "step": 32290 }, { "epoch": 5.268374255648911, "grad_norm": 11.6875, "learning_rate": 4.440687544237859e-06, "loss": 2.108, "num_input_tokens_seen": 67454128, "step": 32295 }, { "epoch": 5.269189982869729, "grad_norm": 3.96875, "learning_rate": 4.435103481491471e-06, "loss": 1.6494, "num_input_tokens_seen": 67462240, "step": 32300 }, { "epoch": 5.270005710090546, "grad_norm": 2.203125, "learning_rate": 4.429522590110569e-06, "loss": 2.6513, "num_input_tokens_seen": 67473200, "step": 32305 }, { "epoch": 5.270821437311363, "grad_norm": 6.21875, "learning_rate": 4.423944870955779e-06, "loss": 1.4311, "num_input_tokens_seen": 67484400, "step": 32310 }, { "epoch": 5.27163716453218, "grad_norm": 4.71875, "learning_rate": 4.418370324887272e-06, "loss": 1.1231, "num_input_tokens_seen": 67494656, "step": 32315 }, { "epoch": 5.272452891752998, "grad_norm": 6.21875, "learning_rate": 4.412798952764699e-06, "loss": 3.011, "num_input_tokens_seen": 67504944, "step": 32320 }, { "epoch": 5.273268618973815, "grad_norm": 8.625, "learning_rate": 4.407230755447245e-06, "loss": 2.2878, "num_input_tokens_seen": 67516272, "step": 32325 }, { "epoch": 5.274084346194632, "grad_norm": 7.84375, "learning_rate": 4.401665733793598e-06, "loss": 1.6345, "num_input_tokens_seen": 67526672, "step": 32330 }, { "epoch": 5.27490007341545, "grad_norm": 5.53125, "learning_rate": 4.3961038886619425e-06, "loss": 1.953, "num_input_tokens_seen": 67537008, "step": 32335 }, { "epoch": 5.275715800636267, "grad_norm": 8.0, "learning_rate": 4.39054522091e-06, "loss": 2.5599, "num_input_tokens_seen": 67548528, "step": 32340 }, { "epoch": 5.276531527857085, "grad_norm": 15.625, "learning_rate": 4.384989731394979e-06, "loss": 2.9885, "num_input_tokens_seen": 67558592, "step": 32345 }, { "epoch": 5.277347255077902, "grad_norm": 5.25, "learning_rate": 4.379437420973598e-06, "loss": 2.4236, "num_input_tokens_seen": 67567600, "step": 32350 }, { "epoch": 5.27816298229872, "grad_norm": 7.875, "learning_rate": 4.373888290502107e-06, "loss": 2.086, "num_input_tokens_seen": 67579440, "step": 32355 }, { "epoch": 5.278978709519537, "grad_norm": 6.5, "learning_rate": 4.36834234083624e-06, "loss": 2.9554, "num_input_tokens_seen": 67591344, "step": 32360 }, { "epoch": 5.279794436740354, "grad_norm": 5.59375, "learning_rate": 4.362799572831258e-06, "loss": 2.9039, "num_input_tokens_seen": 67602144, "step": 32365 }, { "epoch": 5.280610163961171, "grad_norm": 5.5, "learning_rate": 4.35725998734193e-06, "loss": 2.2129, "num_input_tokens_seen": 67612032, "step": 32370 }, { "epoch": 5.281425891181989, "grad_norm": 20.125, "learning_rate": 4.3517235852225195e-06, "loss": 1.6613, "num_input_tokens_seen": 67621120, "step": 32375 }, { "epoch": 5.282241618402806, "grad_norm": 8.5, "learning_rate": 4.346190367326822e-06, "loss": 1.8807, "num_input_tokens_seen": 67631680, "step": 32380 }, { "epoch": 5.283057345623623, "grad_norm": 7.28125, "learning_rate": 4.340660334508115e-06, "loss": 2.5104, "num_input_tokens_seen": 67641568, "step": 32385 }, { "epoch": 5.2838730728444405, "grad_norm": 5.03125, "learning_rate": 4.335133487619206e-06, "loss": 3.0586, "num_input_tokens_seen": 67652336, "step": 32390 }, { "epoch": 5.2846888000652585, "grad_norm": 5.40625, "learning_rate": 4.329609827512409e-06, "loss": 2.4066, "num_input_tokens_seen": 67662848, "step": 32395 }, { "epoch": 5.285504527286076, "grad_norm": 5.53125, "learning_rate": 4.324089355039531e-06, "loss": 1.9202, "num_input_tokens_seen": 67674048, "step": 32400 }, { "epoch": 5.285504527286076, "eval_loss": 2.533475637435913, "eval_runtime": 134.8117, "eval_samples_per_second": 20.213, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 67674048, "step": 32400 }, { "epoch": 5.286320254506893, "grad_norm": 5.0625, "learning_rate": 4.3185720710519075e-06, "loss": 1.4574, "num_input_tokens_seen": 67685024, "step": 32405 }, { "epoch": 5.28713598172771, "grad_norm": 7.6875, "learning_rate": 4.3130579764003724e-06, "loss": 1.5337, "num_input_tokens_seen": 67695840, "step": 32410 }, { "epoch": 5.287951708948528, "grad_norm": 12.9375, "learning_rate": 4.307547071935267e-06, "loss": 2.3591, "num_input_tokens_seen": 67705408, "step": 32415 }, { "epoch": 5.288767436169345, "grad_norm": 11.5, "learning_rate": 4.302039358506435e-06, "loss": 1.9742, "num_input_tokens_seen": 67715168, "step": 32420 }, { "epoch": 5.289583163390162, "grad_norm": 8.25, "learning_rate": 4.296534836963245e-06, "loss": 3.153, "num_input_tokens_seen": 67725184, "step": 32425 }, { "epoch": 5.290398890610979, "grad_norm": 6.90625, "learning_rate": 4.291033508154555e-06, "loss": 1.133, "num_input_tokens_seen": 67735760, "step": 32430 }, { "epoch": 5.291214617831797, "grad_norm": 2.875, "learning_rate": 4.285535372928748e-06, "loss": 1.3177, "num_input_tokens_seen": 67746560, "step": 32435 }, { "epoch": 5.2920303450526145, "grad_norm": 5.125, "learning_rate": 4.280040432133695e-06, "loss": 2.1957, "num_input_tokens_seen": 67757488, "step": 32440 }, { "epoch": 5.292846072273432, "grad_norm": 7.375, "learning_rate": 4.274548686616789e-06, "loss": 2.8725, "num_input_tokens_seen": 67768752, "step": 32445 }, { "epoch": 5.293661799494249, "grad_norm": 8.8125, "learning_rate": 4.2690601372249364e-06, "loss": 2.4883, "num_input_tokens_seen": 67778272, "step": 32450 }, { "epoch": 5.294477526715067, "grad_norm": 7.15625, "learning_rate": 4.263574784804525e-06, "loss": 2.4971, "num_input_tokens_seen": 67790016, "step": 32455 }, { "epoch": 5.295293253935884, "grad_norm": 1.7421875, "learning_rate": 4.258092630201479e-06, "loss": 2.5902, "num_input_tokens_seen": 67799584, "step": 32460 }, { "epoch": 5.296108981156701, "grad_norm": 5.5625, "learning_rate": 4.252613674261202e-06, "loss": 3.9137, "num_input_tokens_seen": 67809680, "step": 32465 }, { "epoch": 5.296924708377518, "grad_norm": 5.78125, "learning_rate": 4.2471379178286224e-06, "loss": 2.5223, "num_input_tokens_seen": 67819696, "step": 32470 }, { "epoch": 5.297740435598336, "grad_norm": 4.09375, "learning_rate": 4.241665361748181e-06, "loss": 3.1281, "num_input_tokens_seen": 67829616, "step": 32475 }, { "epoch": 5.298556162819153, "grad_norm": 5.09375, "learning_rate": 4.2361960068637994e-06, "loss": 1.7755, "num_input_tokens_seen": 67838944, "step": 32480 }, { "epoch": 5.2993718900399704, "grad_norm": 8.25, "learning_rate": 4.230729854018933e-06, "loss": 2.3426, "num_input_tokens_seen": 67848096, "step": 32485 }, { "epoch": 5.300187617260788, "grad_norm": 4.15625, "learning_rate": 4.225266904056521e-06, "loss": 1.8267, "num_input_tokens_seen": 67859296, "step": 32490 }, { "epoch": 5.301003344481606, "grad_norm": 2.875, "learning_rate": 4.21980715781903e-06, "loss": 2.4412, "num_input_tokens_seen": 67870448, "step": 32495 }, { "epoch": 5.301819071702423, "grad_norm": 8.3125, "learning_rate": 4.214350616148416e-06, "loss": 1.9188, "num_input_tokens_seen": 67882608, "step": 32500 }, { "epoch": 5.30263479892324, "grad_norm": 2.9375, "learning_rate": 4.20889727988614e-06, "loss": 2.1714, "num_input_tokens_seen": 67893648, "step": 32505 }, { "epoch": 5.303450526144058, "grad_norm": 4.4375, "learning_rate": 4.20344714987318e-06, "loss": 1.9919, "num_input_tokens_seen": 67903344, "step": 32510 }, { "epoch": 5.304266253364875, "grad_norm": 7.5625, "learning_rate": 4.198000226950022e-06, "loss": 2.2972, "num_input_tokens_seen": 67915104, "step": 32515 }, { "epoch": 5.305081980585692, "grad_norm": 13.375, "learning_rate": 4.192556511956635e-06, "loss": 3.1468, "num_input_tokens_seen": 67924016, "step": 32520 }, { "epoch": 5.305897707806509, "grad_norm": 8.9375, "learning_rate": 4.18711600573252e-06, "loss": 3.1367, "num_input_tokens_seen": 67935280, "step": 32525 }, { "epoch": 5.306713435027326, "grad_norm": 5.6875, "learning_rate": 4.181678709116671e-06, "loss": 2.1795, "num_input_tokens_seen": 67945904, "step": 32530 }, { "epoch": 5.307529162248144, "grad_norm": 10.125, "learning_rate": 4.1762446229475785e-06, "loss": 1.7417, "num_input_tokens_seen": 67956576, "step": 32535 }, { "epoch": 5.3083448894689615, "grad_norm": 8.3125, "learning_rate": 4.17081374806326e-06, "loss": 1.4875, "num_input_tokens_seen": 67966880, "step": 32540 }, { "epoch": 5.309160616689779, "grad_norm": 3.640625, "learning_rate": 4.165386085301212e-06, "loss": 2.3882, "num_input_tokens_seen": 67977120, "step": 32545 }, { "epoch": 5.309976343910597, "grad_norm": 2.703125, "learning_rate": 4.1599616354984525e-06, "loss": 2.4555, "num_input_tokens_seen": 67987424, "step": 32550 }, { "epoch": 5.310792071131414, "grad_norm": 6.3125, "learning_rate": 4.154540399491508e-06, "loss": 1.6428, "num_input_tokens_seen": 67997840, "step": 32555 }, { "epoch": 5.311607798352231, "grad_norm": 8.3125, "learning_rate": 4.149122378116394e-06, "loss": 2.4289, "num_input_tokens_seen": 68008352, "step": 32560 }, { "epoch": 5.312423525573048, "grad_norm": 3.765625, "learning_rate": 4.14370757220863e-06, "loss": 2.1248, "num_input_tokens_seen": 68018960, "step": 32565 }, { "epoch": 5.313239252793866, "grad_norm": 9.75, "learning_rate": 4.138295982603263e-06, "loss": 1.7286, "num_input_tokens_seen": 68029760, "step": 32570 }, { "epoch": 5.314054980014683, "grad_norm": 7.78125, "learning_rate": 4.132887610134814e-06, "loss": 2.0278, "num_input_tokens_seen": 68041344, "step": 32575 }, { "epoch": 5.3148707072355, "grad_norm": 3.8125, "learning_rate": 4.127482455637335e-06, "loss": 2.4586, "num_input_tokens_seen": 68052192, "step": 32580 }, { "epoch": 5.3156864344563175, "grad_norm": 0.103515625, "learning_rate": 4.1220805199443545e-06, "loss": 1.3991, "num_input_tokens_seen": 68063104, "step": 32585 }, { "epoch": 5.3165021616771355, "grad_norm": 8.125, "learning_rate": 4.116681803888925e-06, "loss": 1.8607, "num_input_tokens_seen": 68074704, "step": 32590 }, { "epoch": 5.317317888897953, "grad_norm": 6.25, "learning_rate": 4.111286308303605e-06, "loss": 1.6929, "num_input_tokens_seen": 68085408, "step": 32595 }, { "epoch": 5.31813361611877, "grad_norm": 8.75, "learning_rate": 4.105894034020433e-06, "loss": 2.2426, "num_input_tokens_seen": 68096656, "step": 32600 }, { "epoch": 5.31813361611877, "eval_loss": 2.5445759296417236, "eval_runtime": 134.7521, "eval_samples_per_second": 20.222, "eval_steps_per_second": 10.115, "num_input_tokens_seen": 68096656, "step": 32600 }, { "epoch": 5.318949343339587, "grad_norm": 8.4375, "learning_rate": 4.100504981870975e-06, "loss": 2.0147, "num_input_tokens_seen": 68107840, "step": 32605 }, { "epoch": 5.319765070560405, "grad_norm": 6.90625, "learning_rate": 4.0951191526862915e-06, "loss": 3.1333, "num_input_tokens_seen": 68119552, "step": 32610 }, { "epoch": 5.320580797781222, "grad_norm": 1.65625, "learning_rate": 4.089736547296938e-06, "loss": 2.0765, "num_input_tokens_seen": 68131072, "step": 32615 }, { "epoch": 5.321396525002039, "grad_norm": 4.40625, "learning_rate": 4.08435716653299e-06, "loss": 3.0563, "num_input_tokens_seen": 68141904, "step": 32620 }, { "epoch": 5.322212252222856, "grad_norm": 8.5, "learning_rate": 4.0789810112240005e-06, "loss": 1.7536, "num_input_tokens_seen": 68153152, "step": 32625 }, { "epoch": 5.323027979443674, "grad_norm": 2.75, "learning_rate": 4.073608082199057e-06, "loss": 2.3633, "num_input_tokens_seen": 68162144, "step": 32630 }, { "epoch": 5.3238437066644915, "grad_norm": 8.75, "learning_rate": 4.068238380286718e-06, "loss": 2.5521, "num_input_tokens_seen": 68173824, "step": 32635 }, { "epoch": 5.324659433885309, "grad_norm": 12.9375, "learning_rate": 4.062871906315072e-06, "loss": 4.4844, "num_input_tokens_seen": 68185104, "step": 32640 }, { "epoch": 5.325475161106126, "grad_norm": 12.625, "learning_rate": 4.057508661111686e-06, "loss": 1.91, "num_input_tokens_seen": 68195792, "step": 32645 }, { "epoch": 5.326290888326944, "grad_norm": 3.703125, "learning_rate": 4.052148645503648e-06, "loss": 2.0108, "num_input_tokens_seen": 68206432, "step": 32650 }, { "epoch": 5.327106615547761, "grad_norm": 6.65625, "learning_rate": 4.046791860317531e-06, "loss": 1.552, "num_input_tokens_seen": 68216560, "step": 32655 }, { "epoch": 5.327922342768578, "grad_norm": 5.90625, "learning_rate": 4.041438306379431e-06, "loss": 2.4758, "num_input_tokens_seen": 68226208, "step": 32660 }, { "epoch": 5.328738069989395, "grad_norm": 2.265625, "learning_rate": 4.036087984514916e-06, "loss": 1.5428, "num_input_tokens_seen": 68237168, "step": 32665 }, { "epoch": 5.329553797210213, "grad_norm": 9.6875, "learning_rate": 4.030740895549084e-06, "loss": 2.2219, "num_input_tokens_seen": 68246800, "step": 32670 }, { "epoch": 5.33036952443103, "grad_norm": 7.125, "learning_rate": 4.025397040306531e-06, "loss": 1.5325, "num_input_tokens_seen": 68256720, "step": 32675 }, { "epoch": 5.331185251651847, "grad_norm": 2.265625, "learning_rate": 4.0200564196113285e-06, "loss": 1.7349, "num_input_tokens_seen": 68268000, "step": 32680 }, { "epoch": 5.332000978872665, "grad_norm": 7.0625, "learning_rate": 4.014719034287079e-06, "loss": 3.0456, "num_input_tokens_seen": 68278608, "step": 32685 }, { "epoch": 5.332816706093483, "grad_norm": 12.0625, "learning_rate": 4.0093848851568775e-06, "loss": 2.742, "num_input_tokens_seen": 68288064, "step": 32690 }, { "epoch": 5.3336324333143, "grad_norm": 7.96875, "learning_rate": 4.004053973043304e-06, "loss": 1.7614, "num_input_tokens_seen": 68300560, "step": 32695 }, { "epoch": 5.334448160535117, "grad_norm": 4.6875, "learning_rate": 3.998726298768465e-06, "loss": 1.8683, "num_input_tokens_seen": 68312400, "step": 32700 }, { "epoch": 5.335263887755934, "grad_norm": 7.59375, "learning_rate": 3.99340186315395e-06, "loss": 2.6379, "num_input_tokens_seen": 68323984, "step": 32705 }, { "epoch": 5.336079614976752, "grad_norm": 7.0, "learning_rate": 3.988080667020849e-06, "loss": 3.6827, "num_input_tokens_seen": 68334384, "step": 32710 }, { "epoch": 5.336895342197569, "grad_norm": 11.75, "learning_rate": 3.982762711189766e-06, "loss": 2.6002, "num_input_tokens_seen": 68345152, "step": 32715 }, { "epoch": 5.337711069418386, "grad_norm": 3.734375, "learning_rate": 3.977447996480785e-06, "loss": 2.3311, "num_input_tokens_seen": 68355280, "step": 32720 }, { "epoch": 5.338526796639204, "grad_norm": 5.90625, "learning_rate": 3.97213652371351e-06, "loss": 3.7039, "num_input_tokens_seen": 68364992, "step": 32725 }, { "epoch": 5.339342523860021, "grad_norm": 7.75, "learning_rate": 3.966828293707042e-06, "loss": 3.4688, "num_input_tokens_seen": 68375824, "step": 32730 }, { "epoch": 5.3401582510808385, "grad_norm": 10.5625, "learning_rate": 3.961523307279963e-06, "loss": 2.4766, "num_input_tokens_seen": 68384864, "step": 32735 }, { "epoch": 5.340973978301656, "grad_norm": 2.984375, "learning_rate": 3.956221565250382e-06, "loss": 2.6872, "num_input_tokens_seen": 68395744, "step": 32740 }, { "epoch": 5.341789705522474, "grad_norm": 7.34375, "learning_rate": 3.950923068435883e-06, "loss": 2.4929, "num_input_tokens_seen": 68405536, "step": 32745 }, { "epoch": 5.342605432743291, "grad_norm": 8.5625, "learning_rate": 3.945627817653566e-06, "loss": 2.6151, "num_input_tokens_seen": 68415440, "step": 32750 }, { "epoch": 5.343421159964108, "grad_norm": 11.6875, "learning_rate": 3.9403358137200335e-06, "loss": 3.74, "num_input_tokens_seen": 68426080, "step": 32755 }, { "epoch": 5.344236887184925, "grad_norm": 5.1875, "learning_rate": 3.9350470574513605e-06, "loss": 1.9137, "num_input_tokens_seen": 68437056, "step": 32760 }, { "epoch": 5.345052614405743, "grad_norm": 9.5, "learning_rate": 3.9297615496631525e-06, "loss": 1.9923, "num_input_tokens_seen": 68447056, "step": 32765 }, { "epoch": 5.34586834162656, "grad_norm": 6.09375, "learning_rate": 3.924479291170505e-06, "loss": 1.5642, "num_input_tokens_seen": 68457936, "step": 32770 }, { "epoch": 5.346684068847377, "grad_norm": 9.1875, "learning_rate": 3.919200282788002e-06, "loss": 2.637, "num_input_tokens_seen": 68468528, "step": 32775 }, { "epoch": 5.3474997960681945, "grad_norm": 7.53125, "learning_rate": 3.913924525329726e-06, "loss": 1.9295, "num_input_tokens_seen": 68480448, "step": 32780 }, { "epoch": 5.3483155232890125, "grad_norm": 7.96875, "learning_rate": 3.908652019609279e-06, "loss": 2.6697, "num_input_tokens_seen": 68492144, "step": 32785 }, { "epoch": 5.34913125050983, "grad_norm": 10.625, "learning_rate": 3.9033827664397364e-06, "loss": 2.2259, "num_input_tokens_seen": 68501632, "step": 32790 }, { "epoch": 5.349946977730647, "grad_norm": 9.8125, "learning_rate": 3.898116766633694e-06, "loss": 2.8508, "num_input_tokens_seen": 68512016, "step": 32795 }, { "epoch": 5.350762704951464, "grad_norm": 8.1875, "learning_rate": 3.8928540210032225e-06, "loss": 2.4398, "num_input_tokens_seen": 68521600, "step": 32800 }, { "epoch": 5.350762704951464, "eval_loss": 2.5286903381347656, "eval_runtime": 134.6307, "eval_samples_per_second": 20.241, "eval_steps_per_second": 10.124, "num_input_tokens_seen": 68521600, "step": 32800 }, { "epoch": 5.351578432172282, "grad_norm": 13.6875, "learning_rate": 3.887594530359909e-06, "loss": 3.2388, "num_input_tokens_seen": 68532432, "step": 32805 }, { "epoch": 5.352394159393099, "grad_norm": 11.125, "learning_rate": 3.88233829551484e-06, "loss": 1.8996, "num_input_tokens_seen": 68542336, "step": 32810 }, { "epoch": 5.353209886613916, "grad_norm": 6.625, "learning_rate": 3.877085317278581e-06, "loss": 1.8685, "num_input_tokens_seen": 68553520, "step": 32815 }, { "epoch": 5.354025613834733, "grad_norm": 8.3125, "learning_rate": 3.87183559646122e-06, "loss": 2.354, "num_input_tokens_seen": 68564048, "step": 32820 }, { "epoch": 5.354841341055551, "grad_norm": 5.875, "learning_rate": 3.866589133872317e-06, "loss": 2.4411, "num_input_tokens_seen": 68574256, "step": 32825 }, { "epoch": 5.3556570682763684, "grad_norm": 5.59375, "learning_rate": 3.861345930320948e-06, "loss": 1.5555, "num_input_tokens_seen": 68585120, "step": 32830 }, { "epoch": 5.356472795497186, "grad_norm": 4.9375, "learning_rate": 3.856105986615688e-06, "loss": 1.8829, "num_input_tokens_seen": 68594976, "step": 32835 }, { "epoch": 5.357288522718003, "grad_norm": 3.625, "learning_rate": 3.850869303564589e-06, "loss": 2.2718, "num_input_tokens_seen": 68605920, "step": 32840 }, { "epoch": 5.358104249938821, "grad_norm": 10.5, "learning_rate": 3.845635881975226e-06, "loss": 1.4601, "num_input_tokens_seen": 68615888, "step": 32845 }, { "epoch": 5.358919977159638, "grad_norm": 12.9375, "learning_rate": 3.840405722654647e-06, "loss": 3.2964, "num_input_tokens_seen": 68626112, "step": 32850 }, { "epoch": 5.359735704380455, "grad_norm": 5.8125, "learning_rate": 3.835178826409419e-06, "loss": 2.7309, "num_input_tokens_seen": 68636736, "step": 32855 }, { "epoch": 5.360551431601272, "grad_norm": 11.875, "learning_rate": 3.8299551940455895e-06, "loss": 2.652, "num_input_tokens_seen": 68647936, "step": 32860 }, { "epoch": 5.36136715882209, "grad_norm": 5.78125, "learning_rate": 3.824734826368703e-06, "loss": 1.6112, "num_input_tokens_seen": 68658336, "step": 32865 }, { "epoch": 5.362182886042907, "grad_norm": 4.96875, "learning_rate": 3.819517724183813e-06, "loss": 2.8202, "num_input_tokens_seen": 68669232, "step": 32870 }, { "epoch": 5.362998613263724, "grad_norm": 6.25, "learning_rate": 3.8143038882954648e-06, "loss": 1.8867, "num_input_tokens_seen": 68681264, "step": 32875 }, { "epoch": 5.3638143404845415, "grad_norm": 5.6875, "learning_rate": 3.8090933195076867e-06, "loss": 2.3306, "num_input_tokens_seen": 68690752, "step": 32880 }, { "epoch": 5.3646300677053596, "grad_norm": 7.1875, "learning_rate": 3.8038860186240198e-06, "loss": 2.892, "num_input_tokens_seen": 68701584, "step": 32885 }, { "epoch": 5.365445794926177, "grad_norm": 7.59375, "learning_rate": 3.7986819864475026e-06, "loss": 2.5, "num_input_tokens_seen": 68711952, "step": 32890 }, { "epoch": 5.366261522146994, "grad_norm": 9.125, "learning_rate": 3.793481223780651e-06, "loss": 2.4748, "num_input_tokens_seen": 68721808, "step": 32895 }, { "epoch": 5.367077249367812, "grad_norm": 8.375, "learning_rate": 3.788283731425496e-06, "loss": 2.6711, "num_input_tokens_seen": 68732656, "step": 32900 }, { "epoch": 5.367892976588629, "grad_norm": 8.8125, "learning_rate": 3.7830895101835488e-06, "loss": 2.3863, "num_input_tokens_seen": 68744704, "step": 32905 }, { "epoch": 5.368708703809446, "grad_norm": 10.9375, "learning_rate": 3.7778985608558274e-06, "loss": 2.5039, "num_input_tokens_seen": 68755264, "step": 32910 }, { "epoch": 5.369524431030263, "grad_norm": 9.9375, "learning_rate": 3.7727108842428443e-06, "loss": 1.7379, "num_input_tokens_seen": 68765280, "step": 32915 }, { "epoch": 5.370340158251081, "grad_norm": 9.125, "learning_rate": 3.7675264811446065e-06, "loss": 3.6476, "num_input_tokens_seen": 68775232, "step": 32920 }, { "epoch": 5.371155885471898, "grad_norm": 8.375, "learning_rate": 3.7623453523605994e-06, "loss": 2.0667, "num_input_tokens_seen": 68785760, "step": 32925 }, { "epoch": 5.3719716126927155, "grad_norm": 1.2734375, "learning_rate": 3.757167498689834e-06, "loss": 1.6156, "num_input_tokens_seen": 68795424, "step": 32930 }, { "epoch": 5.372787339913533, "grad_norm": 4.375, "learning_rate": 3.7519929209307914e-06, "loss": 1.691, "num_input_tokens_seen": 68805504, "step": 32935 }, { "epoch": 5.373603067134351, "grad_norm": 11.1875, "learning_rate": 3.746821619881463e-06, "loss": 2.1756, "num_input_tokens_seen": 68816416, "step": 32940 }, { "epoch": 5.374418794355168, "grad_norm": 7.78125, "learning_rate": 3.74165359633932e-06, "loss": 3.088, "num_input_tokens_seen": 68825600, "step": 32945 }, { "epoch": 5.375234521575985, "grad_norm": 4.25, "learning_rate": 3.736488851101341e-06, "loss": 2.5653, "num_input_tokens_seen": 68836592, "step": 32950 }, { "epoch": 5.376050248796802, "grad_norm": 14.1875, "learning_rate": 3.7313273849640035e-06, "loss": 3.7658, "num_input_tokens_seen": 68846784, "step": 32955 }, { "epoch": 5.37686597601762, "grad_norm": 6.125, "learning_rate": 3.7261691987232533e-06, "loss": 1.0972, "num_input_tokens_seen": 68859424, "step": 32960 }, { "epoch": 5.377681703238437, "grad_norm": 4.9375, "learning_rate": 3.7210142931745575e-06, "loss": 1.5178, "num_input_tokens_seen": 68869312, "step": 32965 }, { "epoch": 5.378497430459254, "grad_norm": 2.703125, "learning_rate": 3.7158626691128712e-06, "loss": 2.2884, "num_input_tokens_seen": 68880224, "step": 32970 }, { "epoch": 5.3793131576800715, "grad_norm": 6.0, "learning_rate": 3.710714327332629e-06, "loss": 2.0863, "num_input_tokens_seen": 68892112, "step": 32975 }, { "epoch": 5.3801288849008895, "grad_norm": 2.453125, "learning_rate": 3.7055692686277815e-06, "loss": 2.1197, "num_input_tokens_seen": 68904640, "step": 32980 }, { "epoch": 5.380944612121707, "grad_norm": 7.8125, "learning_rate": 3.70042749379175e-06, "loss": 2.9294, "num_input_tokens_seen": 68915632, "step": 32985 }, { "epoch": 5.381760339342524, "grad_norm": 7.59375, "learning_rate": 3.6952890036174693e-06, "loss": 2.8629, "num_input_tokens_seen": 68926288, "step": 32990 }, { "epoch": 5.382576066563341, "grad_norm": 6.21875, "learning_rate": 3.690153798897353e-06, "loss": 1.9659, "num_input_tokens_seen": 68937328, "step": 32995 }, { "epoch": 5.383391793784159, "grad_norm": 10.625, "learning_rate": 3.6850218804233225e-06, "loss": 3.0732, "num_input_tokens_seen": 68948064, "step": 33000 }, { "epoch": 5.383391793784159, "eval_loss": 2.550351858139038, "eval_runtime": 134.7768, "eval_samples_per_second": 20.219, "eval_steps_per_second": 10.113, "num_input_tokens_seen": 68948064, "step": 33000 }, { "epoch": 5.384207521004976, "grad_norm": 8.3125, "learning_rate": 3.679893248986779e-06, "loss": 2.0239, "num_input_tokens_seen": 68958288, "step": 33005 }, { "epoch": 5.385023248225793, "grad_norm": 5.1875, "learning_rate": 3.6747679053786147e-06, "loss": 2.2177, "num_input_tokens_seen": 68968576, "step": 33010 }, { "epoch": 5.38583897544661, "grad_norm": 6.09375, "learning_rate": 3.669645850389228e-06, "loss": 1.8081, "num_input_tokens_seen": 68978176, "step": 33015 }, { "epoch": 5.386654702667428, "grad_norm": 12.25, "learning_rate": 3.664527084808514e-06, "loss": 4.6706, "num_input_tokens_seen": 68988560, "step": 33020 }, { "epoch": 5.387470429888245, "grad_norm": 7.15625, "learning_rate": 3.6594116094258337e-06, "loss": 2.5089, "num_input_tokens_seen": 68998704, "step": 33025 }, { "epoch": 5.388286157109063, "grad_norm": 12.1875, "learning_rate": 3.6542994250300665e-06, "loss": 2.9712, "num_input_tokens_seen": 69009760, "step": 33030 }, { "epoch": 5.38910188432988, "grad_norm": 8.1875, "learning_rate": 3.6491905324095825e-06, "loss": 1.837, "num_input_tokens_seen": 69020944, "step": 33035 }, { "epoch": 5.389917611550698, "grad_norm": 5.6875, "learning_rate": 3.644084932352221e-06, "loss": 4.4004, "num_input_tokens_seen": 69031296, "step": 33040 }, { "epoch": 5.390733338771515, "grad_norm": 3.265625, "learning_rate": 3.6389826256453457e-06, "loss": 2.9087, "num_input_tokens_seen": 69042720, "step": 33045 }, { "epoch": 5.391549065992332, "grad_norm": 4.4375, "learning_rate": 3.633883613075781e-06, "loss": 2.2119, "num_input_tokens_seen": 69052672, "step": 33050 }, { "epoch": 5.392364793213149, "grad_norm": 11.6875, "learning_rate": 3.6287878954298693e-06, "loss": 1.8129, "num_input_tokens_seen": 69063568, "step": 33055 }, { "epoch": 5.393180520433967, "grad_norm": 3.6875, "learning_rate": 3.6236954734934354e-06, "loss": 1.7636, "num_input_tokens_seen": 69073744, "step": 33060 }, { "epoch": 5.393996247654784, "grad_norm": 4.9375, "learning_rate": 3.618606348051784e-06, "loss": 0.8543, "num_input_tokens_seen": 69082464, "step": 33065 }, { "epoch": 5.394811974875601, "grad_norm": 2.78125, "learning_rate": 3.6135205198897376e-06, "loss": 1.3975, "num_input_tokens_seen": 69093296, "step": 33070 }, { "epoch": 5.395627702096419, "grad_norm": 10.375, "learning_rate": 3.6084379897915854e-06, "loss": 3.1128, "num_input_tokens_seen": 69104224, "step": 33075 }, { "epoch": 5.3964434293172365, "grad_norm": 12.0625, "learning_rate": 3.6033587585411115e-06, "loss": 1.6582, "num_input_tokens_seen": 69113376, "step": 33080 }, { "epoch": 5.397259156538054, "grad_norm": 5.8125, "learning_rate": 3.5982828269216117e-06, "loss": 2.8507, "num_input_tokens_seen": 69124336, "step": 33085 }, { "epoch": 5.398074883758871, "grad_norm": 9.25, "learning_rate": 3.593210195715843e-06, "loss": 3.9888, "num_input_tokens_seen": 69135296, "step": 33090 }, { "epoch": 5.398890610979688, "grad_norm": 9.3125, "learning_rate": 3.5881408657060773e-06, "loss": 2.3739, "num_input_tokens_seen": 69146496, "step": 33095 }, { "epoch": 5.399706338200506, "grad_norm": 3.625, "learning_rate": 3.583074837674075e-06, "loss": 1.8028, "num_input_tokens_seen": 69155952, "step": 33100 }, { "epoch": 5.400522065421323, "grad_norm": 6.9375, "learning_rate": 3.578012112401069e-06, "loss": 1.9399, "num_input_tokens_seen": 69165904, "step": 33105 }, { "epoch": 5.40133779264214, "grad_norm": 8.625, "learning_rate": 3.5729526906677996e-06, "loss": 3.6244, "num_input_tokens_seen": 69176560, "step": 33110 }, { "epoch": 5.402153519862958, "grad_norm": 3.453125, "learning_rate": 3.5678965732545007e-06, "loss": 1.523, "num_input_tokens_seen": 69186592, "step": 33115 }, { "epoch": 5.402969247083775, "grad_norm": 10.75, "learning_rate": 3.562843760940876e-06, "loss": 2.187, "num_input_tokens_seen": 69195792, "step": 33120 }, { "epoch": 5.4037849743045925, "grad_norm": 5.71875, "learning_rate": 3.5577942545061473e-06, "loss": 1.3583, "num_input_tokens_seen": 69205552, "step": 33125 }, { "epoch": 5.40460070152541, "grad_norm": 9.5, "learning_rate": 3.5527480547289967e-06, "loss": 3.0487, "num_input_tokens_seen": 69216192, "step": 33130 }, { "epoch": 5.405416428746228, "grad_norm": 7.25, "learning_rate": 3.547705162387624e-06, "loss": 2.1176, "num_input_tokens_seen": 69226304, "step": 33135 }, { "epoch": 5.406232155967045, "grad_norm": 5.65625, "learning_rate": 3.542665578259699e-06, "loss": 3.3311, "num_input_tokens_seen": 69236736, "step": 33140 }, { "epoch": 5.407047883187862, "grad_norm": 4.3125, "learning_rate": 3.5376293031223945e-06, "loss": 2.5978, "num_input_tokens_seen": 69247776, "step": 33145 }, { "epoch": 5.407863610408679, "grad_norm": 5.53125, "learning_rate": 3.5325963377523614e-06, "loss": 1.7964, "num_input_tokens_seen": 69255792, "step": 33150 }, { "epoch": 5.408679337629497, "grad_norm": 8.5, "learning_rate": 3.5275666829257536e-06, "loss": 2.332, "num_input_tokens_seen": 69265888, "step": 33155 }, { "epoch": 5.409495064850314, "grad_norm": 7.84375, "learning_rate": 3.5225403394181955e-06, "loss": 2.1464, "num_input_tokens_seen": 69275840, "step": 33160 }, { "epoch": 5.410310792071131, "grad_norm": 1.0546875, "learning_rate": 3.517517308004828e-06, "loss": 2.1067, "num_input_tokens_seen": 69285936, "step": 33165 }, { "epoch": 5.411126519291948, "grad_norm": 8.8125, "learning_rate": 3.512497589460251e-06, "loss": 2.6663, "num_input_tokens_seen": 69296416, "step": 33170 }, { "epoch": 5.4119422465127665, "grad_norm": 9.6875, "learning_rate": 3.5074811845585727e-06, "loss": 2.9226, "num_input_tokens_seen": 69305872, "step": 33175 }, { "epoch": 5.412757973733584, "grad_norm": 8.1875, "learning_rate": 3.5024680940733937e-06, "loss": 3.3949, "num_input_tokens_seen": 69316816, "step": 33180 }, { "epoch": 5.413573700954401, "grad_norm": 8.875, "learning_rate": 3.4974583187777852e-06, "loss": 2.5279, "num_input_tokens_seen": 69326928, "step": 33185 }, { "epoch": 5.414389428175218, "grad_norm": 3.53125, "learning_rate": 3.4924518594443204e-06, "loss": 1.714, "num_input_tokens_seen": 69338448, "step": 33190 }, { "epoch": 5.415205155396036, "grad_norm": 4.59375, "learning_rate": 3.4874487168450682e-06, "loss": 1.8615, "num_input_tokens_seen": 69347312, "step": 33195 }, { "epoch": 5.416020882616853, "grad_norm": 7.78125, "learning_rate": 3.482448891751558e-06, "loss": 3.9775, "num_input_tokens_seen": 69357008, "step": 33200 }, { "epoch": 5.416020882616853, "eval_loss": 2.529320478439331, "eval_runtime": 134.8074, "eval_samples_per_second": 20.214, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 69357008, "step": 33200 }, { "epoch": 5.41683660983767, "grad_norm": 5.46875, "learning_rate": 3.477452384934843e-06, "loss": 1.408, "num_input_tokens_seen": 69367024, "step": 33205 }, { "epoch": 5.417652337058487, "grad_norm": 2.34375, "learning_rate": 3.472459197165434e-06, "loss": 1.645, "num_input_tokens_seen": 69376848, "step": 33210 }, { "epoch": 5.418468064279305, "grad_norm": 12.5625, "learning_rate": 3.4674693292133518e-06, "loss": 2.1229, "num_input_tokens_seen": 69386992, "step": 33215 }, { "epoch": 5.419283791500122, "grad_norm": 4.84375, "learning_rate": 3.4624827818480977e-06, "loss": 3.0874, "num_input_tokens_seen": 69395536, "step": 33220 }, { "epoch": 5.4200995187209395, "grad_norm": 9.5, "learning_rate": 3.4574995558386474e-06, "loss": 2.843, "num_input_tokens_seen": 69406144, "step": 33225 }, { "epoch": 5.420915245941757, "grad_norm": 10.5, "learning_rate": 3.452519651953487e-06, "loss": 2.2977, "num_input_tokens_seen": 69417696, "step": 33230 }, { "epoch": 5.421730973162575, "grad_norm": 11.4375, "learning_rate": 3.447543070960585e-06, "loss": 2.8949, "num_input_tokens_seen": 69427536, "step": 33235 }, { "epoch": 5.422546700383392, "grad_norm": 10.1875, "learning_rate": 3.4425698136273778e-06, "loss": 2.6095, "num_input_tokens_seen": 69437968, "step": 33240 }, { "epoch": 5.423362427604209, "grad_norm": 5.65625, "learning_rate": 3.437599880720821e-06, "loss": 1.8944, "num_input_tokens_seen": 69447936, "step": 33245 }, { "epoch": 5.424178154825027, "grad_norm": 8.375, "learning_rate": 3.4326332730073267e-06, "loss": 3.2479, "num_input_tokens_seen": 69458528, "step": 33250 }, { "epoch": 5.424993882045844, "grad_norm": 10.4375, "learning_rate": 3.427669991252813e-06, "loss": 2.2383, "num_input_tokens_seen": 69470064, "step": 33255 }, { "epoch": 5.425809609266661, "grad_norm": 5.53125, "learning_rate": 3.42271003622269e-06, "loss": 3.4743, "num_input_tokens_seen": 69480944, "step": 33260 }, { "epoch": 5.426625336487478, "grad_norm": 10.25, "learning_rate": 3.4177534086818286e-06, "loss": 2.3272, "num_input_tokens_seen": 69491008, "step": 33265 }, { "epoch": 5.4274410637082955, "grad_norm": 7.15625, "learning_rate": 3.412800109394612e-06, "loss": 2.808, "num_input_tokens_seen": 69501648, "step": 33270 }, { "epoch": 5.4282567909291135, "grad_norm": 5.59375, "learning_rate": 3.4078501391249044e-06, "loss": 2.886, "num_input_tokens_seen": 69512304, "step": 33275 }, { "epoch": 5.429072518149931, "grad_norm": 11.875, "learning_rate": 3.4029034986360453e-06, "loss": 2.659, "num_input_tokens_seen": 69522528, "step": 33280 }, { "epoch": 5.429888245370748, "grad_norm": 12.4375, "learning_rate": 3.397960188690877e-06, "loss": 2.4489, "num_input_tokens_seen": 69532704, "step": 33285 }, { "epoch": 5.430703972591566, "grad_norm": 6.15625, "learning_rate": 3.393020210051717e-06, "loss": 1.2056, "num_input_tokens_seen": 69542064, "step": 33290 }, { "epoch": 5.431519699812383, "grad_norm": 6.09375, "learning_rate": 3.3880835634803655e-06, "loss": 1.4297, "num_input_tokens_seen": 69552368, "step": 33295 }, { "epoch": 5.4323354270332, "grad_norm": 8.875, "learning_rate": 3.383150249738126e-06, "loss": 2.4705, "num_input_tokens_seen": 69562224, "step": 33300 }, { "epoch": 5.433151154254017, "grad_norm": 5.96875, "learning_rate": 3.3782202695857663e-06, "loss": 3.046, "num_input_tokens_seen": 69572112, "step": 33305 }, { "epoch": 5.433966881474835, "grad_norm": 8.4375, "learning_rate": 3.373293623783558e-06, "loss": 2.3007, "num_input_tokens_seen": 69584000, "step": 33310 }, { "epoch": 5.434782608695652, "grad_norm": 5.46875, "learning_rate": 3.368370313091257e-06, "loss": 1.2776, "num_input_tokens_seen": 69594048, "step": 33315 }, { "epoch": 5.4355983359164695, "grad_norm": 6.34375, "learning_rate": 3.363450338268087e-06, "loss": 1.4056, "num_input_tokens_seen": 69604560, "step": 33320 }, { "epoch": 5.436414063137287, "grad_norm": 8.6875, "learning_rate": 3.358533700072783e-06, "loss": 3.4181, "num_input_tokens_seen": 69614336, "step": 33325 }, { "epoch": 5.437229790358105, "grad_norm": 5.65625, "learning_rate": 3.3536203992635377e-06, "loss": 3.6387, "num_input_tokens_seen": 69625216, "step": 33330 }, { "epoch": 5.438045517578922, "grad_norm": 10.0625, "learning_rate": 3.348710436598057e-06, "loss": 2.6194, "num_input_tokens_seen": 69635632, "step": 33335 }, { "epoch": 5.438861244799739, "grad_norm": 6.21875, "learning_rate": 3.3438038128335155e-06, "loss": 2.3802, "num_input_tokens_seen": 69645936, "step": 33340 }, { "epoch": 5.439676972020556, "grad_norm": 14.625, "learning_rate": 3.338900528726571e-06, "loss": 3.2674, "num_input_tokens_seen": 69656928, "step": 33345 }, { "epoch": 5.440492699241374, "grad_norm": 4.8125, "learning_rate": 3.3340005850333812e-06, "loss": 2.105, "num_input_tokens_seen": 69668128, "step": 33350 }, { "epoch": 5.441308426462191, "grad_norm": 3.96875, "learning_rate": 3.329103982509568e-06, "loss": 3.1512, "num_input_tokens_seen": 69678864, "step": 33355 }, { "epoch": 5.442124153683008, "grad_norm": 8.5625, "learning_rate": 3.324210721910259e-06, "loss": 2.7581, "num_input_tokens_seen": 69689216, "step": 33360 }, { "epoch": 5.442939880903825, "grad_norm": 3.265625, "learning_rate": 3.319320803990053e-06, "loss": 2.2164, "num_input_tokens_seen": 69699936, "step": 33365 }, { "epoch": 5.443755608124643, "grad_norm": 4.71875, "learning_rate": 3.3144342295030274e-06, "loss": 1.7097, "num_input_tokens_seen": 69710640, "step": 33370 }, { "epoch": 5.444571335345461, "grad_norm": 6.90625, "learning_rate": 3.309550999202765e-06, "loss": 1.6203, "num_input_tokens_seen": 69721824, "step": 33375 }, { "epoch": 5.445387062566278, "grad_norm": 7.46875, "learning_rate": 3.3046711138423197e-06, "loss": 2.0463, "num_input_tokens_seen": 69730608, "step": 33380 }, { "epoch": 5.446202789787095, "grad_norm": 10.875, "learning_rate": 3.2997945741742255e-06, "loss": 2.4403, "num_input_tokens_seen": 69742128, "step": 33385 }, { "epoch": 5.447018517007913, "grad_norm": 8.4375, "learning_rate": 3.2949213809505082e-06, "loss": 2.7011, "num_input_tokens_seen": 69751952, "step": 33390 }, { "epoch": 5.44783424422873, "grad_norm": 9.25, "learning_rate": 3.2900515349226834e-06, "loss": 2.5441, "num_input_tokens_seen": 69762464, "step": 33395 }, { "epoch": 5.448649971449547, "grad_norm": 7.90625, "learning_rate": 3.285185036841731e-06, "loss": 2.9763, "num_input_tokens_seen": 69771824, "step": 33400 }, { "epoch": 5.448649971449547, "eval_loss": 2.5489401817321777, "eval_runtime": 134.7877, "eval_samples_per_second": 20.217, "eval_steps_per_second": 10.112, "num_input_tokens_seen": 69771824, "step": 33400 }, { "epoch": 5.449465698670364, "grad_norm": 7.1875, "learning_rate": 3.2803218874581377e-06, "loss": 2.7255, "num_input_tokens_seen": 69781600, "step": 33405 }, { "epoch": 5.450281425891182, "grad_norm": 11.1875, "learning_rate": 3.2754620875218494e-06, "loss": 4.1228, "num_input_tokens_seen": 69791936, "step": 33410 }, { "epoch": 5.451097153111999, "grad_norm": 1.9609375, "learning_rate": 3.2706056377823146e-06, "loss": 1.7119, "num_input_tokens_seen": 69801552, "step": 33415 }, { "epoch": 5.4519128803328165, "grad_norm": 8.1875, "learning_rate": 3.2657525389884647e-06, "loss": 2.1158, "num_input_tokens_seen": 69812304, "step": 33420 }, { "epoch": 5.4527286075536345, "grad_norm": 5.625, "learning_rate": 3.260902791888698e-06, "loss": 2.3729, "num_input_tokens_seen": 69822688, "step": 33425 }, { "epoch": 5.453544334774452, "grad_norm": 6.21875, "learning_rate": 3.2560563972309166e-06, "loss": 2.4317, "num_input_tokens_seen": 69833120, "step": 33430 }, { "epoch": 5.454360061995269, "grad_norm": 0.10595703125, "learning_rate": 3.251213355762489e-06, "loss": 2.6556, "num_input_tokens_seen": 69844432, "step": 33435 }, { "epoch": 5.455175789216086, "grad_norm": 3.484375, "learning_rate": 3.2463736682302707e-06, "loss": 2.3437, "num_input_tokens_seen": 69854976, "step": 33440 }, { "epoch": 5.455991516436903, "grad_norm": 9.0625, "learning_rate": 3.2415373353806124e-06, "loss": 2.554, "num_input_tokens_seen": 69864944, "step": 33445 }, { "epoch": 5.456807243657721, "grad_norm": 4.9375, "learning_rate": 3.236704357959322e-06, "loss": 1.8492, "num_input_tokens_seen": 69874608, "step": 33450 }, { "epoch": 5.457622970878538, "grad_norm": 5.125, "learning_rate": 3.2318747367117154e-06, "loss": 4.3637, "num_input_tokens_seen": 69884512, "step": 33455 }, { "epoch": 5.458438698099355, "grad_norm": 8.6875, "learning_rate": 3.227048472382585e-06, "loss": 2.8495, "num_input_tokens_seen": 69894128, "step": 33460 }, { "epoch": 5.459254425320173, "grad_norm": 10.25, "learning_rate": 3.2222255657161915e-06, "loss": 2.8073, "num_input_tokens_seen": 69903952, "step": 33465 }, { "epoch": 5.4600701525409905, "grad_norm": 6.25, "learning_rate": 3.2174060174562924e-06, "loss": 1.2857, "num_input_tokens_seen": 69915072, "step": 33470 }, { "epoch": 5.460885879761808, "grad_norm": 7.0625, "learning_rate": 3.2125898283461298e-06, "loss": 2.9893, "num_input_tokens_seen": 69926032, "step": 33475 }, { "epoch": 5.461701606982625, "grad_norm": 20.0, "learning_rate": 3.207776999128406e-06, "loss": 2.7275, "num_input_tokens_seen": 69936800, "step": 33480 }, { "epoch": 5.462517334203443, "grad_norm": 11.1875, "learning_rate": 3.202967530545331e-06, "loss": 2.1037, "num_input_tokens_seen": 69948208, "step": 33485 }, { "epoch": 5.46333306142426, "grad_norm": 6.5625, "learning_rate": 3.1981614233385778e-06, "loss": 2.4547, "num_input_tokens_seen": 69959552, "step": 33490 }, { "epoch": 5.464148788645077, "grad_norm": 5.1875, "learning_rate": 3.1933586782493115e-06, "loss": 1.4457, "num_input_tokens_seen": 69968208, "step": 33495 }, { "epoch": 5.464964515865894, "grad_norm": 8.3125, "learning_rate": 3.188559296018184e-06, "loss": 3.3946, "num_input_tokens_seen": 69978928, "step": 33500 }, { "epoch": 5.465780243086712, "grad_norm": 3.40625, "learning_rate": 3.1837632773853098e-06, "loss": 1.4608, "num_input_tokens_seen": 69988816, "step": 33505 }, { "epoch": 5.466595970307529, "grad_norm": 9.5625, "learning_rate": 3.178970623090294e-06, "loss": 2.8733, "num_input_tokens_seen": 69998800, "step": 33510 }, { "epoch": 5.467411697528346, "grad_norm": 4.46875, "learning_rate": 3.174181333872234e-06, "loss": 2.985, "num_input_tokens_seen": 70008688, "step": 33515 }, { "epoch": 5.468227424749164, "grad_norm": 0.19140625, "learning_rate": 3.169395410469686e-06, "loss": 1.2965, "num_input_tokens_seen": 70019232, "step": 33520 }, { "epoch": 5.469043151969982, "grad_norm": 6.1875, "learning_rate": 3.164612853620713e-06, "loss": 2.6284, "num_input_tokens_seen": 70030400, "step": 33525 }, { "epoch": 5.469858879190799, "grad_norm": 10.125, "learning_rate": 3.1598336640628333e-06, "loss": 2.1117, "num_input_tokens_seen": 70041184, "step": 33530 }, { "epoch": 5.470674606411616, "grad_norm": 13.1875, "learning_rate": 3.155057842533063e-06, "loss": 2.6915, "num_input_tokens_seen": 70052048, "step": 33535 }, { "epoch": 5.471490333632433, "grad_norm": 0.2734375, "learning_rate": 3.1502853897678984e-06, "loss": 2.4461, "num_input_tokens_seen": 70062160, "step": 33540 }, { "epoch": 5.472306060853251, "grad_norm": 11.5, "learning_rate": 3.1455163065033017e-06, "loss": 2.8573, "num_input_tokens_seen": 70071664, "step": 33545 }, { "epoch": 5.473121788074068, "grad_norm": 4.9375, "learning_rate": 3.140750593474734e-06, "loss": 1.3993, "num_input_tokens_seen": 70081184, "step": 33550 }, { "epoch": 5.473937515294885, "grad_norm": 6.125, "learning_rate": 3.1359882514171294e-06, "loss": 2.0756, "num_input_tokens_seen": 70092528, "step": 33555 }, { "epoch": 5.474753242515702, "grad_norm": 11.75, "learning_rate": 3.1312292810648903e-06, "loss": 2.6495, "num_input_tokens_seen": 70104464, "step": 33560 }, { "epoch": 5.47556896973652, "grad_norm": 5.6875, "learning_rate": 3.1264736831519204e-06, "loss": 1.9641, "num_input_tokens_seen": 70115648, "step": 33565 }, { "epoch": 5.4763846969573375, "grad_norm": 1.390625, "learning_rate": 3.1217214584115863e-06, "loss": 2.3778, "num_input_tokens_seen": 70126800, "step": 33570 }, { "epoch": 5.477200424178155, "grad_norm": 9.875, "learning_rate": 3.116972607576746e-06, "loss": 2.2255, "num_input_tokens_seen": 70137504, "step": 33575 }, { "epoch": 5.478016151398972, "grad_norm": 12.5, "learning_rate": 3.1122271313797303e-06, "loss": 3.8461, "num_input_tokens_seen": 70148528, "step": 33580 }, { "epoch": 5.47883187861979, "grad_norm": 11.375, "learning_rate": 3.107485030552343e-06, "loss": 3.4812, "num_input_tokens_seen": 70159392, "step": 33585 }, { "epoch": 5.479647605840607, "grad_norm": 6.75, "learning_rate": 3.1027463058258848e-06, "loss": 2.4764, "num_input_tokens_seen": 70169040, "step": 33590 }, { "epoch": 5.480463333061424, "grad_norm": 5.59375, "learning_rate": 3.0980109579311273e-06, "loss": 1.9224, "num_input_tokens_seen": 70178832, "step": 33595 }, { "epoch": 5.481279060282241, "grad_norm": 6.90625, "learning_rate": 3.093278987598314e-06, "loss": 4.1438, "num_input_tokens_seen": 70189824, "step": 33600 }, { "epoch": 5.481279060282241, "eval_loss": 2.542243003845215, "eval_runtime": 134.8092, "eval_samples_per_second": 20.214, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 70189824, "step": 33600 }, { "epoch": 5.482094787503059, "grad_norm": 8.3125, "learning_rate": 3.0885503955571826e-06, "loss": 4.2614, "num_input_tokens_seen": 70200560, "step": 33605 }, { "epoch": 5.482910514723876, "grad_norm": 5.125, "learning_rate": 3.0838251825369313e-06, "loss": 2.9884, "num_input_tokens_seen": 70211088, "step": 33610 }, { "epoch": 5.4837262419446935, "grad_norm": 4.53125, "learning_rate": 3.0791033492662517e-06, "loss": 2.5172, "num_input_tokens_seen": 70222144, "step": 33615 }, { "epoch": 5.484541969165511, "grad_norm": 0.2578125, "learning_rate": 3.0743848964733203e-06, "loss": 1.2972, "num_input_tokens_seen": 70233568, "step": 33620 }, { "epoch": 5.485357696386329, "grad_norm": 6.8125, "learning_rate": 3.0696698248857625e-06, "loss": 1.912, "num_input_tokens_seen": 70244336, "step": 33625 }, { "epoch": 5.486173423607146, "grad_norm": 12.8125, "learning_rate": 3.0649581352307192e-06, "loss": 2.5585, "num_input_tokens_seen": 70255184, "step": 33630 }, { "epoch": 5.486989150827963, "grad_norm": 1.84375, "learning_rate": 3.060249828234776e-06, "loss": 1.825, "num_input_tokens_seen": 70265920, "step": 33635 }, { "epoch": 5.487804878048781, "grad_norm": 6.28125, "learning_rate": 3.055544904624025e-06, "loss": 2.2059, "num_input_tokens_seen": 70276672, "step": 33640 }, { "epoch": 5.488620605269598, "grad_norm": 10.9375, "learning_rate": 3.050843365124026e-06, "loss": 2.0991, "num_input_tokens_seen": 70287824, "step": 33645 }, { "epoch": 5.489436332490415, "grad_norm": 8.625, "learning_rate": 3.0461452104598083e-06, "loss": 3.462, "num_input_tokens_seen": 70298064, "step": 33650 }, { "epoch": 5.490252059711232, "grad_norm": 4.59375, "learning_rate": 3.0414504413558836e-06, "loss": 2.6807, "num_input_tokens_seen": 70308208, "step": 33655 }, { "epoch": 5.49106778693205, "grad_norm": 8.0, "learning_rate": 3.0367590585362564e-06, "loss": 3.6352, "num_input_tokens_seen": 70317648, "step": 33660 }, { "epoch": 5.4918835141528675, "grad_norm": 8.375, "learning_rate": 3.0320710627243813e-06, "loss": 2.053, "num_input_tokens_seen": 70328688, "step": 33665 }, { "epoch": 5.492699241373685, "grad_norm": 5.34375, "learning_rate": 3.027386454643222e-06, "loss": 2.041, "num_input_tokens_seen": 70339792, "step": 33670 }, { "epoch": 5.493514968594502, "grad_norm": 5.125, "learning_rate": 3.0227052350151914e-06, "loss": 2.0909, "num_input_tokens_seen": 70350032, "step": 33675 }, { "epoch": 5.49433069581532, "grad_norm": 3.34375, "learning_rate": 3.0180274045621957e-06, "loss": 2.7052, "num_input_tokens_seen": 70360688, "step": 33680 }, { "epoch": 5.495146423036137, "grad_norm": 8.3125, "learning_rate": 3.013352964005625e-06, "loss": 2.6262, "num_input_tokens_seen": 70370272, "step": 33685 }, { "epoch": 5.495962150256954, "grad_norm": 7.84375, "learning_rate": 3.0086819140663218e-06, "loss": 1.4663, "num_input_tokens_seen": 70380080, "step": 33690 }, { "epoch": 5.496777877477771, "grad_norm": 17.0, "learning_rate": 3.0040142554646265e-06, "loss": 2.9524, "num_input_tokens_seen": 70390080, "step": 33695 }, { "epoch": 5.497593604698589, "grad_norm": 5.59375, "learning_rate": 2.999349988920361e-06, "loss": 1.8148, "num_input_tokens_seen": 70400720, "step": 33700 }, { "epoch": 5.498409331919406, "grad_norm": 11.1875, "learning_rate": 2.994689115152796e-06, "loss": 3.4553, "num_input_tokens_seen": 70410736, "step": 33705 }, { "epoch": 5.499225059140223, "grad_norm": 4.28125, "learning_rate": 2.9900316348807105e-06, "loss": 1.3946, "num_input_tokens_seen": 70420976, "step": 33710 }, { "epoch": 5.5000407863610405, "grad_norm": 16.5, "learning_rate": 2.985377548822338e-06, "loss": 2.8065, "num_input_tokens_seen": 70431280, "step": 33715 }, { "epoch": 5.500856513581859, "grad_norm": 8.25, "learning_rate": 2.980726857695404e-06, "loss": 2.0615, "num_input_tokens_seen": 70441936, "step": 33720 }, { "epoch": 5.501672240802676, "grad_norm": 12.0, "learning_rate": 2.9760795622171017e-06, "loss": 2.7577, "num_input_tokens_seen": 70450240, "step": 33725 }, { "epoch": 5.502487968023493, "grad_norm": 4.96875, "learning_rate": 2.971435663104094e-06, "loss": 1.4691, "num_input_tokens_seen": 70458880, "step": 33730 }, { "epoch": 5.50330369524431, "grad_norm": 7.84375, "learning_rate": 2.9667951610725385e-06, "loss": 2.7232, "num_input_tokens_seen": 70468288, "step": 33735 }, { "epoch": 5.504119422465128, "grad_norm": 12.4375, "learning_rate": 2.9621580568380575e-06, "loss": 2.0621, "num_input_tokens_seen": 70478384, "step": 33740 }, { "epoch": 5.504935149685945, "grad_norm": 6.5, "learning_rate": 2.9575243511157453e-06, "loss": 1.9384, "num_input_tokens_seen": 70489344, "step": 33745 }, { "epoch": 5.505750876906762, "grad_norm": 13.625, "learning_rate": 2.952894044620186e-06, "loss": 1.8743, "num_input_tokens_seen": 70500352, "step": 33750 }, { "epoch": 5.506566604127579, "grad_norm": 3.234375, "learning_rate": 2.948267138065419e-06, "loss": 2.2559, "num_input_tokens_seen": 70510560, "step": 33755 }, { "epoch": 5.507382331348397, "grad_norm": 0.69140625, "learning_rate": 2.943643632164983e-06, "loss": 1.3274, "num_input_tokens_seen": 70519920, "step": 33760 }, { "epoch": 5.5081980585692145, "grad_norm": 3.984375, "learning_rate": 2.939023527631879e-06, "loss": 3.6455, "num_input_tokens_seen": 70530768, "step": 33765 }, { "epoch": 5.509013785790032, "grad_norm": 9.0, "learning_rate": 2.934406825178576e-06, "loss": 2.5683, "num_input_tokens_seen": 70541920, "step": 33770 }, { "epoch": 5.50982951301085, "grad_norm": 6.875, "learning_rate": 2.9297935255170357e-06, "loss": 2.6147, "num_input_tokens_seen": 70551472, "step": 33775 }, { "epoch": 5.510645240231667, "grad_norm": 8.3125, "learning_rate": 2.925183629358691e-06, "loss": 2.1595, "num_input_tokens_seen": 70562112, "step": 33780 }, { "epoch": 5.511460967452484, "grad_norm": 7.65625, "learning_rate": 2.9205771374144346e-06, "loss": 1.6327, "num_input_tokens_seen": 70571520, "step": 33785 }, { "epoch": 5.512276694673301, "grad_norm": 7.375, "learning_rate": 2.915974050394657e-06, "loss": 2.1792, "num_input_tokens_seen": 70581776, "step": 33790 }, { "epoch": 5.513092421894118, "grad_norm": 11.0, "learning_rate": 2.9113743690092067e-06, "loss": 2.4173, "num_input_tokens_seen": 70592048, "step": 33795 }, { "epoch": 5.513908149114936, "grad_norm": 3.265625, "learning_rate": 2.906778093967402e-06, "loss": 1.7663, "num_input_tokens_seen": 70602704, "step": 33800 }, { "epoch": 5.513908149114936, "eval_loss": 2.5325217247009277, "eval_runtime": 134.7687, "eval_samples_per_second": 20.22, "eval_steps_per_second": 10.114, "num_input_tokens_seen": 70602704, "step": 33800 }, { "epoch": 5.514723876335753, "grad_norm": 4.21875, "learning_rate": 2.9021852259780656e-06, "loss": 1.7847, "num_input_tokens_seen": 70614112, "step": 33805 }, { "epoch": 5.5155396035565705, "grad_norm": 6.875, "learning_rate": 2.8975957657494583e-06, "loss": 2.2548, "num_input_tokens_seen": 70624400, "step": 33810 }, { "epoch": 5.5163553307773885, "grad_norm": 9.375, "learning_rate": 2.8930097139893417e-06, "loss": 3.6829, "num_input_tokens_seen": 70636048, "step": 33815 }, { "epoch": 5.517171057998206, "grad_norm": 15.25, "learning_rate": 2.888427071404945e-06, "loss": 3.1673, "num_input_tokens_seen": 70647168, "step": 33820 }, { "epoch": 5.517986785219023, "grad_norm": 4.34375, "learning_rate": 2.8838478387029606e-06, "loss": 2.8413, "num_input_tokens_seen": 70658976, "step": 33825 }, { "epoch": 5.51880251243984, "grad_norm": 9.75, "learning_rate": 2.8792720165895737e-06, "loss": 3.1115, "num_input_tokens_seen": 70670608, "step": 33830 }, { "epoch": 5.519618239660657, "grad_norm": 2.84375, "learning_rate": 2.874699605770423e-06, "loss": 2.2639, "num_input_tokens_seen": 70681008, "step": 33835 }, { "epoch": 5.520433966881475, "grad_norm": 8.9375, "learning_rate": 2.8701306069506383e-06, "loss": 1.9286, "num_input_tokens_seen": 70692576, "step": 33840 }, { "epoch": 5.521249694102292, "grad_norm": 8.4375, "learning_rate": 2.8655650208348178e-06, "loss": 1.8624, "num_input_tokens_seen": 70703616, "step": 33845 }, { "epoch": 5.522065421323109, "grad_norm": 2.578125, "learning_rate": 2.8610028481270257e-06, "loss": 3.7743, "num_input_tokens_seen": 70714848, "step": 33850 }, { "epoch": 5.522881148543927, "grad_norm": 4.3125, "learning_rate": 2.856444089530813e-06, "loss": 1.7127, "num_input_tokens_seen": 70726000, "step": 33855 }, { "epoch": 5.523696875764744, "grad_norm": 6.875, "learning_rate": 2.8518887457491955e-06, "loss": 2.2915, "num_input_tokens_seen": 70736960, "step": 33860 }, { "epoch": 5.524512602985562, "grad_norm": 8.8125, "learning_rate": 2.8473368174846666e-06, "loss": 2.4495, "num_input_tokens_seen": 70746688, "step": 33865 }, { "epoch": 5.525328330206379, "grad_norm": 1.578125, "learning_rate": 2.842788305439184e-06, "loss": 2.8257, "num_input_tokens_seen": 70758304, "step": 33870 }, { "epoch": 5.526144057427197, "grad_norm": 10.6875, "learning_rate": 2.8382432103141925e-06, "loss": 3.0169, "num_input_tokens_seen": 70768304, "step": 33875 }, { "epoch": 5.526959784648014, "grad_norm": 5.1875, "learning_rate": 2.833701532810598e-06, "loss": 2.1918, "num_input_tokens_seen": 70778944, "step": 33880 }, { "epoch": 5.527775511868831, "grad_norm": 5.0, "learning_rate": 2.8291632736287877e-06, "loss": 1.6675, "num_input_tokens_seen": 70790960, "step": 33885 }, { "epoch": 5.528591239089648, "grad_norm": 8.3125, "learning_rate": 2.824628433468615e-06, "loss": 2.6015, "num_input_tokens_seen": 70801744, "step": 33890 }, { "epoch": 5.529406966310466, "grad_norm": 9.8125, "learning_rate": 2.8200970130294073e-06, "loss": 3.1672, "num_input_tokens_seen": 70811344, "step": 33895 }, { "epoch": 5.530222693531283, "grad_norm": 5.375, "learning_rate": 2.8155690130099775e-06, "loss": 1.8632, "num_input_tokens_seen": 70820608, "step": 33900 }, { "epoch": 5.5310384207521, "grad_norm": 10.625, "learning_rate": 2.8110444341085895e-06, "loss": 3.0267, "num_input_tokens_seen": 70832208, "step": 33905 }, { "epoch": 5.5318541479729175, "grad_norm": 6.5, "learning_rate": 2.806523277022996e-06, "loss": 1.6474, "num_input_tokens_seen": 70842064, "step": 33910 }, { "epoch": 5.5326698751937355, "grad_norm": 7.40625, "learning_rate": 2.802005542450409e-06, "loss": 2.4329, "num_input_tokens_seen": 70851280, "step": 33915 }, { "epoch": 5.533485602414553, "grad_norm": 10.875, "learning_rate": 2.797491231087526e-06, "loss": 3.2764, "num_input_tokens_seen": 70860832, "step": 33920 }, { "epoch": 5.53430132963537, "grad_norm": 5.53125, "learning_rate": 2.7929803436305137e-06, "loss": 2.0809, "num_input_tokens_seen": 70871840, "step": 33925 }, { "epoch": 5.535117056856187, "grad_norm": 8.125, "learning_rate": 2.788472880774998e-06, "loss": 2.6532, "num_input_tokens_seen": 70881504, "step": 33930 }, { "epoch": 5.535932784077005, "grad_norm": 7.75, "learning_rate": 2.7839688432160977e-06, "loss": 2.5554, "num_input_tokens_seen": 70892464, "step": 33935 }, { "epoch": 5.536748511297822, "grad_norm": 10.625, "learning_rate": 2.779468231648383e-06, "loss": 3.6217, "num_input_tokens_seen": 70903888, "step": 33940 }, { "epoch": 5.537564238518639, "grad_norm": 6.59375, "learning_rate": 2.774971046765906e-06, "loss": 1.8997, "num_input_tokens_seen": 70914432, "step": 33945 }, { "epoch": 5.538379965739456, "grad_norm": 5.0, "learning_rate": 2.770477289262194e-06, "loss": 2.8753, "num_input_tokens_seen": 70926496, "step": 33950 }, { "epoch": 5.539195692960274, "grad_norm": 7.34375, "learning_rate": 2.765986959830233e-06, "loss": 3.0174, "num_input_tokens_seen": 70935280, "step": 33955 }, { "epoch": 5.5400114201810915, "grad_norm": 2.0, "learning_rate": 2.761500059162492e-06, "loss": 1.9065, "num_input_tokens_seen": 70943968, "step": 33960 }, { "epoch": 5.540827147401909, "grad_norm": 4.34375, "learning_rate": 2.757016587950914e-06, "loss": 2.7882, "num_input_tokens_seen": 70955440, "step": 33965 }, { "epoch": 5.541642874622726, "grad_norm": 12.0, "learning_rate": 2.752536546886897e-06, "loss": 2.7829, "num_input_tokens_seen": 70965936, "step": 33970 }, { "epoch": 5.542458601843544, "grad_norm": 3.453125, "learning_rate": 2.7480599366613234e-06, "loss": 2.091, "num_input_tokens_seen": 70976880, "step": 33975 }, { "epoch": 5.543274329064361, "grad_norm": 6.375, "learning_rate": 2.7435867579645473e-06, "loss": 1.7876, "num_input_tokens_seen": 70988896, "step": 33980 }, { "epoch": 5.544090056285178, "grad_norm": 2.671875, "learning_rate": 2.739117011486378e-06, "loss": 2.527, "num_input_tokens_seen": 70999376, "step": 33985 }, { "epoch": 5.544905783505996, "grad_norm": 5.125, "learning_rate": 2.7346506979161216e-06, "loss": 1.8347, "num_input_tokens_seen": 71011360, "step": 33990 }, { "epoch": 5.545721510726813, "grad_norm": 8.25, "learning_rate": 2.7301878179425227e-06, "loss": 1.6923, "num_input_tokens_seen": 71022528, "step": 33995 }, { "epoch": 5.54653723794763, "grad_norm": 4.09375, "learning_rate": 2.7257283722538244e-06, "loss": 2.9164, "num_input_tokens_seen": 71032768, "step": 34000 }, { "epoch": 5.54653723794763, "eval_loss": 2.547989845275879, "eval_runtime": 134.6502, "eval_samples_per_second": 20.238, "eval_steps_per_second": 10.123, "num_input_tokens_seen": 71032768, "step": 34000 }, { "epoch": 5.5473529651684474, "grad_norm": 14.625, "learning_rate": 2.7212723615377326e-06, "loss": 2.8876, "num_input_tokens_seen": 71043328, "step": 34005 }, { "epoch": 5.548168692389265, "grad_norm": 6.625, "learning_rate": 2.7168197864814145e-06, "loss": 3.3343, "num_input_tokens_seen": 71054096, "step": 34010 }, { "epoch": 5.548984419610083, "grad_norm": 6.5625, "learning_rate": 2.712370647771509e-06, "loss": 2.4487, "num_input_tokens_seen": 71064384, "step": 34015 }, { "epoch": 5.5498001468309, "grad_norm": 6.8125, "learning_rate": 2.707924946094137e-06, "loss": 2.6793, "num_input_tokens_seen": 71074272, "step": 34020 }, { "epoch": 5.550615874051717, "grad_norm": 14.125, "learning_rate": 2.7034826821348723e-06, "loss": 2.8552, "num_input_tokens_seen": 71083584, "step": 34025 }, { "epoch": 5.551431601272535, "grad_norm": 5.875, "learning_rate": 2.6990438565787786e-06, "loss": 1.2556, "num_input_tokens_seen": 71092560, "step": 34030 }, { "epoch": 5.552247328493352, "grad_norm": 9.625, "learning_rate": 2.6946084701103714e-06, "loss": 2.0561, "num_input_tokens_seen": 71103216, "step": 34035 }, { "epoch": 5.553063055714169, "grad_norm": 3.1875, "learning_rate": 2.6901765234136428e-06, "loss": 3.5946, "num_input_tokens_seen": 71113568, "step": 34040 }, { "epoch": 5.553878782934986, "grad_norm": 6.5625, "learning_rate": 2.685748017172063e-06, "loss": 1.2238, "num_input_tokens_seen": 71123872, "step": 34045 }, { "epoch": 5.554694510155803, "grad_norm": 9.375, "learning_rate": 2.681322952068549e-06, "loss": 3.0264, "num_input_tokens_seen": 71133792, "step": 34050 }, { "epoch": 5.555510237376621, "grad_norm": 9.4375, "learning_rate": 2.6769013287855137e-06, "loss": 2.726, "num_input_tokens_seen": 71143472, "step": 34055 }, { "epoch": 5.5563259645974385, "grad_norm": 12.0625, "learning_rate": 2.6724831480048286e-06, "loss": 2.5001, "num_input_tokens_seen": 71154528, "step": 34060 }, { "epoch": 5.557141691818256, "grad_norm": 11.0625, "learning_rate": 2.66806841040782e-06, "loss": 3.4923, "num_input_tokens_seen": 71166400, "step": 34065 }, { "epoch": 5.557957419039074, "grad_norm": 5.0625, "learning_rate": 2.6636571166753083e-06, "loss": 2.7179, "num_input_tokens_seen": 71178000, "step": 34070 }, { "epoch": 5.558773146259891, "grad_norm": 5.53125, "learning_rate": 2.6592492674875598e-06, "loss": 2.2232, "num_input_tokens_seen": 71187856, "step": 34075 }, { "epoch": 5.559588873480708, "grad_norm": 2.765625, "learning_rate": 2.6548448635243305e-06, "loss": 1.4215, "num_input_tokens_seen": 71198256, "step": 34080 }, { "epoch": 5.560404600701525, "grad_norm": 9.125, "learning_rate": 2.650443905464828e-06, "loss": 2.8933, "num_input_tokens_seen": 71208928, "step": 34085 }, { "epoch": 5.561220327922343, "grad_norm": 5.6875, "learning_rate": 2.646046393987739e-06, "loss": 1.9546, "num_input_tokens_seen": 71219168, "step": 34090 }, { "epoch": 5.56203605514316, "grad_norm": 10.3125, "learning_rate": 2.64165232977121e-06, "loss": 2.9019, "num_input_tokens_seen": 71229696, "step": 34095 }, { "epoch": 5.562851782363977, "grad_norm": 3.25, "learning_rate": 2.6372617134928695e-06, "loss": 1.7256, "num_input_tokens_seen": 71240736, "step": 34100 }, { "epoch": 5.5636675095847945, "grad_norm": 3.671875, "learning_rate": 2.6328745458297943e-06, "loss": 1.7602, "num_input_tokens_seen": 71249376, "step": 34105 }, { "epoch": 5.5644832368056125, "grad_norm": 14.25, "learning_rate": 2.6284908274585546e-06, "loss": 2.2534, "num_input_tokens_seen": 71258848, "step": 34110 }, { "epoch": 5.56529896402643, "grad_norm": 2.671875, "learning_rate": 2.6241105590551595e-06, "loss": 2.0429, "num_input_tokens_seen": 71267808, "step": 34115 }, { "epoch": 5.566114691247247, "grad_norm": 9.1875, "learning_rate": 2.6197337412951105e-06, "loss": 1.5822, "num_input_tokens_seen": 71279072, "step": 34120 }, { "epoch": 5.566930418468064, "grad_norm": 5.65625, "learning_rate": 2.6153603748533705e-06, "loss": 2.4592, "num_input_tokens_seen": 71289504, "step": 34125 }, { "epoch": 5.567746145688882, "grad_norm": 5.1875, "learning_rate": 2.6109904604043585e-06, "loss": 3.075, "num_input_tokens_seen": 71299392, "step": 34130 }, { "epoch": 5.568561872909699, "grad_norm": 3.484375, "learning_rate": 2.6066239986219765e-06, "loss": 1.5505, "num_input_tokens_seen": 71309664, "step": 34135 }, { "epoch": 5.569377600130516, "grad_norm": 7.53125, "learning_rate": 2.602260990179592e-06, "loss": 2.2735, "num_input_tokens_seen": 71320400, "step": 34140 }, { "epoch": 5.570193327351333, "grad_norm": 4.625, "learning_rate": 2.5979014357500248e-06, "loss": 2.2617, "num_input_tokens_seen": 71331072, "step": 34145 }, { "epoch": 5.571009054572151, "grad_norm": 3.734375, "learning_rate": 2.5935453360055844e-06, "loss": 2.1597, "num_input_tokens_seen": 71341824, "step": 34150 }, { "epoch": 5.5718247817929685, "grad_norm": 8.1875, "learning_rate": 2.5891926916180283e-06, "loss": 4.0901, "num_input_tokens_seen": 71352608, "step": 34155 }, { "epoch": 5.572640509013786, "grad_norm": 4.84375, "learning_rate": 2.5848435032585883e-06, "loss": 0.9527, "num_input_tokens_seen": 71362656, "step": 34160 }, { "epoch": 5.573456236234604, "grad_norm": 9.8125, "learning_rate": 2.58049777159797e-06, "loss": 2.4373, "num_input_tokens_seen": 71372496, "step": 34165 }, { "epoch": 5.574271963455421, "grad_norm": 0.18359375, "learning_rate": 2.576155497306332e-06, "loss": 1.6881, "num_input_tokens_seen": 71382528, "step": 34170 }, { "epoch": 5.575087690676238, "grad_norm": 7.28125, "learning_rate": 2.57181668105331e-06, "loss": 1.7478, "num_input_tokens_seen": 71393696, "step": 34175 }, { "epoch": 5.575903417897055, "grad_norm": 7.6875, "learning_rate": 2.567481323508014e-06, "loss": 3.3361, "num_input_tokens_seen": 71403216, "step": 34180 }, { "epoch": 5.576719145117872, "grad_norm": 9.125, "learning_rate": 2.5631494253389954e-06, "loss": 2.4501, "num_input_tokens_seen": 71414112, "step": 34185 }, { "epoch": 5.57753487233869, "grad_norm": 5.5, "learning_rate": 2.5588209872142997e-06, "loss": 3.7851, "num_input_tokens_seen": 71425344, "step": 34190 }, { "epoch": 5.578350599559507, "grad_norm": 3.21875, "learning_rate": 2.5544960098014186e-06, "loss": 1.1406, "num_input_tokens_seen": 71435504, "step": 34195 }, { "epoch": 5.579166326780324, "grad_norm": 11.6875, "learning_rate": 2.550174493767318e-06, "loss": 3.1158, "num_input_tokens_seen": 71445488, "step": 34200 }, { "epoch": 5.579166326780324, "eval_loss": 2.547051191329956, "eval_runtime": 134.6809, "eval_samples_per_second": 20.233, "eval_steps_per_second": 10.12, "num_input_tokens_seen": 71445488, "step": 34200 }, { "epoch": 5.5799820540011424, "grad_norm": 11.9375, "learning_rate": 2.545856439778438e-06, "loss": 1.919, "num_input_tokens_seen": 71455584, "step": 34205 }, { "epoch": 5.58079778122196, "grad_norm": 4.25, "learning_rate": 2.541541848500667e-06, "loss": 2.3945, "num_input_tokens_seen": 71466432, "step": 34210 }, { "epoch": 5.581613508442777, "grad_norm": 2.890625, "learning_rate": 2.5372307205993733e-06, "loss": 2.1849, "num_input_tokens_seen": 71476048, "step": 34215 }, { "epoch": 5.582429235663594, "grad_norm": 6.96875, "learning_rate": 2.5329230567393917e-06, "loss": 2.5195, "num_input_tokens_seen": 71486128, "step": 34220 }, { "epoch": 5.583244962884411, "grad_norm": 8.375, "learning_rate": 2.5286188575850164e-06, "loss": 1.8473, "num_input_tokens_seen": 71497328, "step": 34225 }, { "epoch": 5.584060690105229, "grad_norm": 12.4375, "learning_rate": 2.5243181237999984e-06, "loss": 2.645, "num_input_tokens_seen": 71507792, "step": 34230 }, { "epoch": 5.584876417326046, "grad_norm": 8.1875, "learning_rate": 2.520020856047578e-06, "loss": 1.9142, "num_input_tokens_seen": 71517600, "step": 34235 }, { "epoch": 5.585692144546863, "grad_norm": 6.96875, "learning_rate": 2.515727054990438e-06, "loss": 2.8674, "num_input_tokens_seen": 71529728, "step": 34240 }, { "epoch": 5.586507871767681, "grad_norm": 9.5625, "learning_rate": 2.511436721290747e-06, "loss": 1.9403, "num_input_tokens_seen": 71540784, "step": 34245 }, { "epoch": 5.587323598988498, "grad_norm": 5.8125, "learning_rate": 2.5071498556101164e-06, "loss": 3.4682, "num_input_tokens_seen": 71550576, "step": 34250 }, { "epoch": 5.5881393262093155, "grad_norm": 5.875, "learning_rate": 2.5028664586096485e-06, "loss": 2.4534, "num_input_tokens_seen": 71560656, "step": 34255 }, { "epoch": 5.588955053430133, "grad_norm": 7.9375, "learning_rate": 2.498586530949881e-06, "loss": 1.8271, "num_input_tokens_seen": 71571792, "step": 34260 }, { "epoch": 5.589770780650951, "grad_norm": 12.125, "learning_rate": 2.4943100732908427e-06, "loss": 2.5432, "num_input_tokens_seen": 71579712, "step": 34265 }, { "epoch": 5.590586507871768, "grad_norm": 1.5625, "learning_rate": 2.4900370862920188e-06, "loss": 1.4368, "num_input_tokens_seen": 71590704, "step": 34270 }, { "epoch": 5.591402235092585, "grad_norm": 1.8359375, "learning_rate": 2.4857675706123518e-06, "loss": 2.5995, "num_input_tokens_seen": 71600912, "step": 34275 }, { "epoch": 5.592217962313402, "grad_norm": 9.875, "learning_rate": 2.4815015269102543e-06, "loss": 1.6792, "num_input_tokens_seen": 71610848, "step": 34280 }, { "epoch": 5.59303368953422, "grad_norm": 8.375, "learning_rate": 2.477238955843611e-06, "loss": 2.04, "num_input_tokens_seen": 71620688, "step": 34285 }, { "epoch": 5.593849416755037, "grad_norm": 10.4375, "learning_rate": 2.4729798580697573e-06, "loss": 2.9295, "num_input_tokens_seen": 71631792, "step": 34290 }, { "epoch": 5.594665143975854, "grad_norm": 7.4375, "learning_rate": 2.4687242342455034e-06, "loss": 1.723, "num_input_tokens_seen": 71641664, "step": 34295 }, { "epoch": 5.5954808711966715, "grad_norm": 9.5, "learning_rate": 2.4644720850271196e-06, "loss": 3.2036, "num_input_tokens_seen": 71652096, "step": 34300 }, { "epoch": 5.5962965984174895, "grad_norm": 10.125, "learning_rate": 2.4602234110703364e-06, "loss": 3.1829, "num_input_tokens_seen": 71662576, "step": 34305 }, { "epoch": 5.597112325638307, "grad_norm": 10.9375, "learning_rate": 2.4559782130303576e-06, "loss": 2.9722, "num_input_tokens_seen": 71672560, "step": 34310 }, { "epoch": 5.597928052859124, "grad_norm": 10.0, "learning_rate": 2.451736491561843e-06, "loss": 2.5311, "num_input_tokens_seen": 71682288, "step": 34315 }, { "epoch": 5.598743780079941, "grad_norm": 10.0, "learning_rate": 2.4474982473189163e-06, "loss": 2.8814, "num_input_tokens_seen": 71693792, "step": 34320 }, { "epoch": 5.599559507300759, "grad_norm": 3.125, "learning_rate": 2.4432634809551796e-06, "loss": 1.737, "num_input_tokens_seen": 71702880, "step": 34325 }, { "epoch": 5.600375234521576, "grad_norm": 0.7421875, "learning_rate": 2.439032193123675e-06, "loss": 2.1028, "num_input_tokens_seen": 71714128, "step": 34330 }, { "epoch": 5.601190961742393, "grad_norm": 9.8125, "learning_rate": 2.4348043844769297e-06, "loss": 3.191, "num_input_tokens_seen": 71724640, "step": 34335 }, { "epoch": 5.602006688963211, "grad_norm": 3.5625, "learning_rate": 2.4305800556669146e-06, "loss": 1.7909, "num_input_tokens_seen": 71733648, "step": 34340 }, { "epoch": 5.602822416184028, "grad_norm": 9.1875, "learning_rate": 2.426359207345083e-06, "loss": 1.2112, "num_input_tokens_seen": 71744624, "step": 34345 }, { "epoch": 5.6036381434048455, "grad_norm": 9.1875, "learning_rate": 2.4221418401623396e-06, "loss": 2.5706, "num_input_tokens_seen": 71754624, "step": 34350 }, { "epoch": 5.604453870625663, "grad_norm": 12.4375, "learning_rate": 2.4179279547690557e-06, "loss": 2.6951, "num_input_tokens_seen": 71764720, "step": 34355 }, { "epoch": 5.60526959784648, "grad_norm": 3.90625, "learning_rate": 2.413717551815062e-06, "loss": 2.5574, "num_input_tokens_seen": 71774736, "step": 34360 }, { "epoch": 5.606085325067298, "grad_norm": 11.0, "learning_rate": 2.409510631949666e-06, "loss": 2.4681, "num_input_tokens_seen": 71785712, "step": 34365 }, { "epoch": 5.606901052288115, "grad_norm": 10.4375, "learning_rate": 2.405307195821618e-06, "loss": 3.8565, "num_input_tokens_seen": 71796608, "step": 34370 }, { "epoch": 5.607716779508932, "grad_norm": 6.15625, "learning_rate": 2.4011072440791372e-06, "loss": 2.0812, "num_input_tokens_seen": 71806048, "step": 34375 }, { "epoch": 5.60853250672975, "grad_norm": 9.75, "learning_rate": 2.3969107773699233e-06, "loss": 2.6423, "num_input_tokens_seen": 71816384, "step": 34380 }, { "epoch": 5.609348233950567, "grad_norm": 3.578125, "learning_rate": 2.3927177963411096e-06, "loss": 3.7692, "num_input_tokens_seen": 71827088, "step": 34385 }, { "epoch": 5.610163961171384, "grad_norm": 12.25, "learning_rate": 2.3885283016393144e-06, "loss": 1.9385, "num_input_tokens_seen": 71838288, "step": 34390 }, { "epoch": 5.610979688392201, "grad_norm": 5.625, "learning_rate": 2.3843422939106076e-06, "loss": 2.0691, "num_input_tokens_seen": 71848496, "step": 34395 }, { "epoch": 5.6117954156130185, "grad_norm": 2.8125, "learning_rate": 2.380159773800525e-06, "loss": 1.7717, "num_input_tokens_seen": 71858096, "step": 34400 }, { "epoch": 5.6117954156130185, "eval_loss": 2.5390069484710693, "eval_runtime": 134.7353, "eval_samples_per_second": 20.225, "eval_steps_per_second": 10.116, "num_input_tokens_seen": 71858096, "step": 34400 }, { "epoch": 5.6126111428338366, "grad_norm": 9.5625, "learning_rate": 2.3759807419540675e-06, "loss": 1.9853, "num_input_tokens_seen": 71868224, "step": 34405 }, { "epoch": 5.613426870054654, "grad_norm": 6.6875, "learning_rate": 2.3718051990156835e-06, "loss": 1.1833, "num_input_tokens_seen": 71877264, "step": 34410 }, { "epoch": 5.614242597275471, "grad_norm": 7.125, "learning_rate": 2.367633145629311e-06, "loss": 3.5393, "num_input_tokens_seen": 71888592, "step": 34415 }, { "epoch": 5.615058324496289, "grad_norm": 8.25, "learning_rate": 2.363464582438316e-06, "loss": 2.373, "num_input_tokens_seen": 71898640, "step": 34420 }, { "epoch": 5.615874051717106, "grad_norm": 5.625, "learning_rate": 2.3592995100855526e-06, "loss": 1.7478, "num_input_tokens_seen": 71909040, "step": 34425 }, { "epoch": 5.616689778937923, "grad_norm": 6.96875, "learning_rate": 2.3551379292133273e-06, "loss": 1.6532, "num_input_tokens_seen": 71919440, "step": 34430 }, { "epoch": 5.61750550615874, "grad_norm": 11.75, "learning_rate": 2.3509798404634047e-06, "loss": 2.5449, "num_input_tokens_seen": 71930384, "step": 34435 }, { "epoch": 5.618321233379558, "grad_norm": 9.125, "learning_rate": 2.346825244477019e-06, "loss": 2.4725, "num_input_tokens_seen": 71940016, "step": 34440 }, { "epoch": 5.619136960600375, "grad_norm": 5.53125, "learning_rate": 2.3426741418948545e-06, "loss": 2.2715, "num_input_tokens_seen": 71951776, "step": 34445 }, { "epoch": 5.6199526878211925, "grad_norm": 11.0625, "learning_rate": 2.3385265333570715e-06, "loss": 1.6268, "num_input_tokens_seen": 71962688, "step": 34450 }, { "epoch": 5.62076841504201, "grad_norm": 18.375, "learning_rate": 2.334382419503278e-06, "loss": 3.108, "num_input_tokens_seen": 71973040, "step": 34455 }, { "epoch": 5.621584142262828, "grad_norm": 5.71875, "learning_rate": 2.3302418009725465e-06, "loss": 3.6945, "num_input_tokens_seen": 71982976, "step": 34460 }, { "epoch": 5.622399869483645, "grad_norm": 5.03125, "learning_rate": 2.326104678403415e-06, "loss": 3.1249, "num_input_tokens_seen": 71994320, "step": 34465 }, { "epoch": 5.623215596704462, "grad_norm": 5.59375, "learning_rate": 2.321971052433883e-06, "loss": 2.1336, "num_input_tokens_seen": 72004960, "step": 34470 }, { "epoch": 5.624031323925279, "grad_norm": 15.125, "learning_rate": 2.3178409237014004e-06, "loss": 2.2603, "num_input_tokens_seen": 72015216, "step": 34475 }, { "epoch": 5.624847051146097, "grad_norm": 4.6875, "learning_rate": 2.313714292842889e-06, "loss": 2.7268, "num_input_tokens_seen": 72027328, "step": 34480 }, { "epoch": 5.625662778366914, "grad_norm": 4.5625, "learning_rate": 2.309591160494734e-06, "loss": 1.6208, "num_input_tokens_seen": 72037600, "step": 34485 }, { "epoch": 5.626478505587731, "grad_norm": 6.59375, "learning_rate": 2.305471527292763e-06, "loss": 2.2829, "num_input_tokens_seen": 72045760, "step": 34490 }, { "epoch": 5.6272942328085485, "grad_norm": 7.40625, "learning_rate": 2.3013553938722817e-06, "loss": 2.0435, "num_input_tokens_seen": 72056608, "step": 34495 }, { "epoch": 5.6281099600293665, "grad_norm": 8.1875, "learning_rate": 2.297242760868043e-06, "loss": 3.1942, "num_input_tokens_seen": 72066976, "step": 34500 }, { "epoch": 5.628925687250184, "grad_norm": 0.7265625, "learning_rate": 2.2931336289142735e-06, "loss": 0.8745, "num_input_tokens_seen": 72077504, "step": 34505 }, { "epoch": 5.629741414471001, "grad_norm": 7.9375, "learning_rate": 2.289027998644655e-06, "loss": 3.1736, "num_input_tokens_seen": 72086000, "step": 34510 }, { "epoch": 5.630557141691818, "grad_norm": 11.4375, "learning_rate": 2.2849258706923228e-06, "loss": 3.1483, "num_input_tokens_seen": 72095824, "step": 34515 }, { "epoch": 5.631372868912636, "grad_norm": 4.21875, "learning_rate": 2.2808272456898705e-06, "loss": 2.6973, "num_input_tokens_seen": 72106224, "step": 34520 }, { "epoch": 5.632188596133453, "grad_norm": 5.5625, "learning_rate": 2.2767321242693707e-06, "loss": 2.2799, "num_input_tokens_seen": 72117584, "step": 34525 }, { "epoch": 5.63300432335427, "grad_norm": 6.5, "learning_rate": 2.272640507062329e-06, "loss": 1.6744, "num_input_tokens_seen": 72126896, "step": 34530 }, { "epoch": 5.633820050575087, "grad_norm": 9.0625, "learning_rate": 2.2685523946997382e-06, "loss": 2.5545, "num_input_tokens_seen": 72138912, "step": 34535 }, { "epoch": 5.634635777795905, "grad_norm": 6.09375, "learning_rate": 2.2644677878120245e-06, "loss": 2.5968, "num_input_tokens_seen": 72150160, "step": 34540 }, { "epoch": 5.635451505016722, "grad_norm": 14.375, "learning_rate": 2.2603866870290897e-06, "loss": 2.964, "num_input_tokens_seen": 72161008, "step": 34545 }, { "epoch": 5.63626723223754, "grad_norm": 7.875, "learning_rate": 2.256309092980294e-06, "loss": 1.7179, "num_input_tokens_seen": 72172768, "step": 34550 }, { "epoch": 5.637082959458358, "grad_norm": 5.46875, "learning_rate": 2.252235006294448e-06, "loss": 2.795, "num_input_tokens_seen": 72183552, "step": 34555 }, { "epoch": 5.637898686679175, "grad_norm": 7.75, "learning_rate": 2.2481644275998333e-06, "loss": 2.7397, "num_input_tokens_seen": 72193664, "step": 34560 }, { "epoch": 5.638714413899992, "grad_norm": 6.5, "learning_rate": 2.2440973575241832e-06, "loss": 1.921, "num_input_tokens_seen": 72204592, "step": 34565 }, { "epoch": 5.639530141120809, "grad_norm": 4.28125, "learning_rate": 2.240033796694685e-06, "loss": 3.181, "num_input_tokens_seen": 72215760, "step": 34570 }, { "epoch": 5.640345868341626, "grad_norm": 4.53125, "learning_rate": 2.235973745737999e-06, "loss": 2.3728, "num_input_tokens_seen": 72226368, "step": 34575 }, { "epoch": 5.641161595562444, "grad_norm": 10.75, "learning_rate": 2.2319172052802263e-06, "loss": 2.4891, "num_input_tokens_seen": 72237360, "step": 34580 }, { "epoch": 5.641977322783261, "grad_norm": 8.875, "learning_rate": 2.2278641759469477e-06, "loss": 3.0054, "num_input_tokens_seen": 72246928, "step": 34585 }, { "epoch": 5.642793050004078, "grad_norm": 7.625, "learning_rate": 2.2238146583631825e-06, "loss": 1.6162, "num_input_tokens_seen": 72256400, "step": 34590 }, { "epoch": 5.643608777224896, "grad_norm": 5.25, "learning_rate": 2.2197686531534256e-06, "loss": 2.5231, "num_input_tokens_seen": 72265744, "step": 34595 }, { "epoch": 5.6444245044457135, "grad_norm": 9.25, "learning_rate": 2.2157261609416087e-06, "loss": 2.4476, "num_input_tokens_seen": 72276272, "step": 34600 }, { "epoch": 5.6444245044457135, "eval_loss": 2.53377103805542, "eval_runtime": 134.7344, "eval_samples_per_second": 20.225, "eval_steps_per_second": 10.116, "num_input_tokens_seen": 72276272, "step": 34600 }, { "epoch": 5.645240231666531, "grad_norm": 6.15625, "learning_rate": 2.211687182351149e-06, "loss": 2.8777, "num_input_tokens_seen": 72287232, "step": 34605 }, { "epoch": 5.646055958887348, "grad_norm": 8.4375, "learning_rate": 2.2076517180048993e-06, "loss": 1.5784, "num_input_tokens_seen": 72298224, "step": 34610 }, { "epoch": 5.646871686108166, "grad_norm": 0.054443359375, "learning_rate": 2.2036197685251834e-06, "loss": 3.3634, "num_input_tokens_seen": 72308384, "step": 34615 }, { "epoch": 5.647687413328983, "grad_norm": 3.25, "learning_rate": 2.199591334533771e-06, "loss": 1.9471, "num_input_tokens_seen": 72318208, "step": 34620 }, { "epoch": 5.6485031405498, "grad_norm": 2.5, "learning_rate": 2.1955664166519036e-06, "loss": 1.3057, "num_input_tokens_seen": 72329152, "step": 34625 }, { "epoch": 5.649318867770617, "grad_norm": 6.71875, "learning_rate": 2.1915450155002793e-06, "loss": 3.5041, "num_input_tokens_seen": 72340016, "step": 34630 }, { "epoch": 5.650134594991435, "grad_norm": 12.875, "learning_rate": 2.187527131699038e-06, "loss": 2.7681, "num_input_tokens_seen": 72351392, "step": 34635 }, { "epoch": 5.650950322212252, "grad_norm": 6.5, "learning_rate": 2.18351276586779e-06, "loss": 3.3497, "num_input_tokens_seen": 72361888, "step": 34640 }, { "epoch": 5.6517660494330695, "grad_norm": 10.75, "learning_rate": 2.1795019186256092e-06, "loss": 3.9367, "num_input_tokens_seen": 72372320, "step": 34645 }, { "epoch": 5.652581776653887, "grad_norm": 11.0, "learning_rate": 2.1754945905910094e-06, "loss": 2.655, "num_input_tokens_seen": 72382960, "step": 34650 }, { "epoch": 5.653397503874705, "grad_norm": 3.578125, "learning_rate": 2.171490782381977e-06, "loss": 1.4699, "num_input_tokens_seen": 72392912, "step": 34655 }, { "epoch": 5.654213231095522, "grad_norm": 4.78125, "learning_rate": 2.1674904946159425e-06, "loss": 1.5977, "num_input_tokens_seen": 72402496, "step": 34660 }, { "epoch": 5.655028958316339, "grad_norm": 3.359375, "learning_rate": 2.16349372790981e-06, "loss": 1.7798, "num_input_tokens_seen": 72413264, "step": 34665 }, { "epoch": 5.655844685537156, "grad_norm": 4.84375, "learning_rate": 2.159500482879928e-06, "loss": 2.6119, "num_input_tokens_seen": 72422864, "step": 34670 }, { "epoch": 5.656660412757974, "grad_norm": 8.875, "learning_rate": 2.155510760142096e-06, "loss": 3.4643, "num_input_tokens_seen": 72432944, "step": 34675 }, { "epoch": 5.657476139978791, "grad_norm": 5.78125, "learning_rate": 2.151524560311588e-06, "loss": 1.5527, "num_input_tokens_seen": 72443168, "step": 34680 }, { "epoch": 5.658291867199608, "grad_norm": 14.4375, "learning_rate": 2.147541884003129e-06, "loss": 3.2125, "num_input_tokens_seen": 72452128, "step": 34685 }, { "epoch": 5.659107594420425, "grad_norm": 4.3125, "learning_rate": 2.1435627318308895e-06, "loss": 2.0491, "num_input_tokens_seen": 72463520, "step": 34690 }, { "epoch": 5.6599233216412435, "grad_norm": 6.3125, "learning_rate": 2.139587104408511e-06, "loss": 2.391, "num_input_tokens_seen": 72474160, "step": 34695 }, { "epoch": 5.660739048862061, "grad_norm": 6.25, "learning_rate": 2.1356150023490783e-06, "loss": 1.7607, "num_input_tokens_seen": 72483936, "step": 34700 }, { "epoch": 5.661554776082878, "grad_norm": 10.125, "learning_rate": 2.1316464262651464e-06, "loss": 2.705, "num_input_tokens_seen": 72496208, "step": 34705 }, { "epoch": 5.662370503303695, "grad_norm": 6.75, "learning_rate": 2.1276813767687224e-06, "loss": 1.9463, "num_input_tokens_seen": 72506976, "step": 34710 }, { "epoch": 5.663186230524513, "grad_norm": 5.5625, "learning_rate": 2.123719854471254e-06, "loss": 2.936, "num_input_tokens_seen": 72517504, "step": 34715 }, { "epoch": 5.66400195774533, "grad_norm": 6.4375, "learning_rate": 2.119761859983668e-06, "loss": 1.6663, "num_input_tokens_seen": 72527792, "step": 34720 }, { "epoch": 5.664817684966147, "grad_norm": 9.25, "learning_rate": 2.1158073939163386e-06, "loss": 2.1571, "num_input_tokens_seen": 72538128, "step": 34725 }, { "epoch": 5.665633412186965, "grad_norm": 8.6875, "learning_rate": 2.111856456879088e-06, "loss": 2.1242, "num_input_tokens_seen": 72548000, "step": 34730 }, { "epoch": 5.666449139407782, "grad_norm": 8.8125, "learning_rate": 2.1079090494811993e-06, "loss": 2.4869, "num_input_tokens_seen": 72558096, "step": 34735 }, { "epoch": 5.667264866628599, "grad_norm": 6.3125, "learning_rate": 2.103965172331418e-06, "loss": 2.6868, "num_input_tokens_seen": 72568112, "step": 34740 }, { "epoch": 5.6680805938494165, "grad_norm": 5.59375, "learning_rate": 2.100024826037933e-06, "loss": 3.843, "num_input_tokens_seen": 72578112, "step": 34745 }, { "epoch": 5.668896321070234, "grad_norm": 8.3125, "learning_rate": 2.0960880112084027e-06, "loss": 1.8033, "num_input_tokens_seen": 72588816, "step": 34750 }, { "epoch": 5.669712048291052, "grad_norm": 13.625, "learning_rate": 2.092154728449927e-06, "loss": 2.2841, "num_input_tokens_seen": 72598864, "step": 34755 }, { "epoch": 5.670527775511869, "grad_norm": 0.15234375, "learning_rate": 2.0882249783690687e-06, "loss": 1.8777, "num_input_tokens_seen": 72608976, "step": 34760 }, { "epoch": 5.671343502732686, "grad_norm": 8.75, "learning_rate": 2.084298761571851e-06, "loss": 2.5514, "num_input_tokens_seen": 72620032, "step": 34765 }, { "epoch": 5.672159229953504, "grad_norm": 3.4375, "learning_rate": 2.080376078663737e-06, "loss": 1.5709, "num_input_tokens_seen": 72630608, "step": 34770 }, { "epoch": 5.672974957174321, "grad_norm": 4.0625, "learning_rate": 2.0764569302496593e-06, "loss": 1.651, "num_input_tokens_seen": 72642416, "step": 34775 }, { "epoch": 5.673790684395138, "grad_norm": 7.40625, "learning_rate": 2.0725413169339957e-06, "loss": 1.8498, "num_input_tokens_seen": 72653840, "step": 34780 }, { "epoch": 5.674606411615955, "grad_norm": 11.3125, "learning_rate": 2.068629239320588e-06, "loss": 4.7419, "num_input_tokens_seen": 72663408, "step": 34785 }, { "epoch": 5.6754221388367725, "grad_norm": 19.125, "learning_rate": 2.064720698012726e-06, "loss": 3.3115, "num_input_tokens_seen": 72673824, "step": 34790 }, { "epoch": 5.6762378660575905, "grad_norm": 16.375, "learning_rate": 2.0608156936131522e-06, "loss": 2.3557, "num_input_tokens_seen": 72684000, "step": 34795 }, { "epoch": 5.677053593278408, "grad_norm": 6.0625, "learning_rate": 2.056914226724074e-06, "loss": 1.8602, "num_input_tokens_seen": 72694032, "step": 34800 }, { "epoch": 5.677053593278408, "eval_loss": 2.5323657989501953, "eval_runtime": 134.7521, "eval_samples_per_second": 20.222, "eval_steps_per_second": 10.115, "num_input_tokens_seen": 72694032, "step": 34800 }, { "epoch": 5.677869320499225, "grad_norm": 6.40625, "learning_rate": 2.0530162979471385e-06, "loss": 1.9967, "num_input_tokens_seen": 72704208, "step": 34805 }, { "epoch": 5.678685047720043, "grad_norm": 6.78125, "learning_rate": 2.0491219078834667e-06, "loss": 2.0054, "num_input_tokens_seen": 72717472, "step": 34810 }, { "epoch": 5.67950077494086, "grad_norm": 4.8125, "learning_rate": 2.045231057133612e-06, "loss": 1.7196, "num_input_tokens_seen": 72728048, "step": 34815 }, { "epoch": 5.680316502161677, "grad_norm": 8.75, "learning_rate": 2.0413437462975944e-06, "loss": 2.2625, "num_input_tokens_seen": 72737696, "step": 34820 }, { "epoch": 5.681132229382494, "grad_norm": 6.9375, "learning_rate": 2.0374599759748843e-06, "loss": 2.6584, "num_input_tokens_seen": 72748304, "step": 34825 }, { "epoch": 5.681947956603312, "grad_norm": 11.0625, "learning_rate": 2.033579746764419e-06, "loss": 3.2357, "num_input_tokens_seen": 72757024, "step": 34830 }, { "epoch": 5.682763683824129, "grad_norm": 7.78125, "learning_rate": 2.029703059264565e-06, "loss": 2.3419, "num_input_tokens_seen": 72768320, "step": 34835 }, { "epoch": 5.6835794110449465, "grad_norm": 11.8125, "learning_rate": 2.02582991407316e-06, "loss": 3.2673, "num_input_tokens_seen": 72778656, "step": 34840 }, { "epoch": 5.684395138265764, "grad_norm": 4.21875, "learning_rate": 2.0219603117874992e-06, "loss": 1.9369, "num_input_tokens_seen": 72788544, "step": 34845 }, { "epoch": 5.685210865486582, "grad_norm": 6.53125, "learning_rate": 2.0180942530043156e-06, "loss": 1.7424, "num_input_tokens_seen": 72798352, "step": 34850 }, { "epoch": 5.686026592707399, "grad_norm": 15.8125, "learning_rate": 2.0142317383198107e-06, "loss": 2.663, "num_input_tokens_seen": 72809360, "step": 34855 }, { "epoch": 5.686842319928216, "grad_norm": 6.21875, "learning_rate": 2.0103727683296243e-06, "loss": 1.8504, "num_input_tokens_seen": 72820496, "step": 34860 }, { "epoch": 5.687658047149033, "grad_norm": 4.28125, "learning_rate": 2.0065173436288636e-06, "loss": 2.5163, "num_input_tokens_seen": 72830000, "step": 34865 }, { "epoch": 5.688473774369851, "grad_norm": 3.6875, "learning_rate": 2.002665464812087e-06, "loss": 1.7411, "num_input_tokens_seen": 72841584, "step": 34870 }, { "epoch": 5.689289501590668, "grad_norm": 5.375, "learning_rate": 1.998817132473291e-06, "loss": 1.6034, "num_input_tokens_seen": 72853072, "step": 34875 }, { "epoch": 5.690105228811485, "grad_norm": 12.5, "learning_rate": 1.9949723472059507e-06, "loss": 2.8051, "num_input_tokens_seen": 72863376, "step": 34880 }, { "epoch": 5.690920956032302, "grad_norm": 6.71875, "learning_rate": 1.9911311096029726e-06, "loss": 2.5916, "num_input_tokens_seen": 72875280, "step": 34885 }, { "epoch": 5.69173668325312, "grad_norm": 12.0625, "learning_rate": 1.9872934202567224e-06, "loss": 2.5245, "num_input_tokens_seen": 72884912, "step": 34890 }, { "epoch": 5.692552410473938, "grad_norm": 8.25, "learning_rate": 1.9834592797590257e-06, "loss": 4.7388, "num_input_tokens_seen": 72895264, "step": 34895 }, { "epoch": 5.693368137694755, "grad_norm": 12.625, "learning_rate": 1.979628688701149e-06, "loss": 3.1135, "num_input_tokens_seen": 72906160, "step": 34900 }, { "epoch": 5.694183864915573, "grad_norm": 7.125, "learning_rate": 1.9758016476738193e-06, "loss": 1.3217, "num_input_tokens_seen": 72918880, "step": 34905 }, { "epoch": 5.69499959213639, "grad_norm": 4.59375, "learning_rate": 1.971978157267221e-06, "loss": 2.2688, "num_input_tokens_seen": 72930736, "step": 34910 }, { "epoch": 5.695815319357207, "grad_norm": 9.625, "learning_rate": 1.968158218070973e-06, "loss": 3.54, "num_input_tokens_seen": 72941216, "step": 34915 }, { "epoch": 5.696631046578024, "grad_norm": 3.90625, "learning_rate": 1.9643418306741682e-06, "loss": 1.4181, "num_input_tokens_seen": 72952560, "step": 34920 }, { "epoch": 5.697446773798841, "grad_norm": 3.40625, "learning_rate": 1.9605289956653337e-06, "loss": 3.2634, "num_input_tokens_seen": 72962800, "step": 34925 }, { "epoch": 5.698262501019659, "grad_norm": 5.375, "learning_rate": 1.9567197136324626e-06, "loss": 3.3198, "num_input_tokens_seen": 72973664, "step": 34930 }, { "epoch": 5.699078228240476, "grad_norm": 4.3125, "learning_rate": 1.9529139851629935e-06, "loss": 2.6721, "num_input_tokens_seen": 72985504, "step": 34935 }, { "epoch": 5.6998939554612935, "grad_norm": 9.5625, "learning_rate": 1.949111810843812e-06, "loss": 2.2349, "num_input_tokens_seen": 72995904, "step": 34940 }, { "epoch": 5.7007096826821115, "grad_norm": 9.4375, "learning_rate": 1.9453131912612694e-06, "loss": 2.43, "num_input_tokens_seen": 73006592, "step": 34945 }, { "epoch": 5.701525409902929, "grad_norm": 8.0, "learning_rate": 1.941518127001149e-06, "loss": 2.3815, "num_input_tokens_seen": 73018432, "step": 34950 }, { "epoch": 5.702341137123746, "grad_norm": 7.21875, "learning_rate": 1.9377266186487107e-06, "loss": 1.8554, "num_input_tokens_seen": 73028768, "step": 34955 }, { "epoch": 5.703156864344563, "grad_norm": 5.5625, "learning_rate": 1.9339386667886483e-06, "loss": 2.602, "num_input_tokens_seen": 73040016, "step": 34960 }, { "epoch": 5.70397259156538, "grad_norm": 10.0625, "learning_rate": 1.9301542720051024e-06, "loss": 2.1533, "num_input_tokens_seen": 73049792, "step": 34965 }, { "epoch": 5.704788318786198, "grad_norm": 3.078125, "learning_rate": 1.926373434881684e-06, "loss": 2.168, "num_input_tokens_seen": 73058704, "step": 34970 }, { "epoch": 5.705604046007015, "grad_norm": 7.84375, "learning_rate": 1.9225961560014468e-06, "loss": 2.9266, "num_input_tokens_seen": 73067696, "step": 34975 }, { "epoch": 5.706419773227832, "grad_norm": 12.8125, "learning_rate": 1.918822435946885e-06, "loss": 3.9945, "num_input_tokens_seen": 73079440, "step": 34980 }, { "epoch": 5.70723550044865, "grad_norm": 2.5625, "learning_rate": 1.915052275299961e-06, "loss": 1.6514, "num_input_tokens_seen": 73089152, "step": 34985 }, { "epoch": 5.7080512276694675, "grad_norm": 9.9375, "learning_rate": 1.9112856746420854e-06, "loss": 2.1754, "num_input_tokens_seen": 73100144, "step": 34990 }, { "epoch": 5.708866954890285, "grad_norm": 11.8125, "learning_rate": 1.907522634554104e-06, "loss": 2.5818, "num_input_tokens_seen": 73110544, "step": 34995 }, { "epoch": 5.709682682111102, "grad_norm": 7.625, "learning_rate": 1.9037631556163337e-06, "loss": 2.1456, "num_input_tokens_seen": 73119856, "step": 35000 }, { "epoch": 5.709682682111102, "eval_loss": 2.546726703643799, "eval_runtime": 134.5612, "eval_samples_per_second": 20.251, "eval_steps_per_second": 10.129, "num_input_tokens_seen": 73119856, "step": 35000 }, { "epoch": 5.71049840933192, "grad_norm": 7.875, "learning_rate": 1.9000072384085272e-06, "loss": 2.3632, "num_input_tokens_seen": 73129952, "step": 35005 }, { "epoch": 5.711314136552737, "grad_norm": 7.375, "learning_rate": 1.8962548835098987e-06, "loss": 2.0775, "num_input_tokens_seen": 73138528, "step": 35010 }, { "epoch": 5.712129863773554, "grad_norm": 7.90625, "learning_rate": 1.8925060914991077e-06, "loss": 2.9103, "num_input_tokens_seen": 73148288, "step": 35015 }, { "epoch": 5.712945590994371, "grad_norm": 8.5625, "learning_rate": 1.888760862954264e-06, "loss": 2.1938, "num_input_tokens_seen": 73158304, "step": 35020 }, { "epoch": 5.713761318215189, "grad_norm": 4.5, "learning_rate": 1.8850191984529309e-06, "loss": 1.8173, "num_input_tokens_seen": 73167824, "step": 35025 }, { "epoch": 5.714577045436006, "grad_norm": 12.8125, "learning_rate": 1.8812810985721186e-06, "loss": 3.4717, "num_input_tokens_seen": 73179744, "step": 35030 }, { "epoch": 5.715392772656823, "grad_norm": 4.84375, "learning_rate": 1.8775465638882856e-06, "loss": 3.1615, "num_input_tokens_seen": 73189840, "step": 35035 }, { "epoch": 5.716208499877641, "grad_norm": 7.71875, "learning_rate": 1.8738155949773517e-06, "loss": 2.5083, "num_input_tokens_seen": 73199648, "step": 35040 }, { "epoch": 5.717024227098459, "grad_norm": 0.70703125, "learning_rate": 1.8700881924146707e-06, "loss": 1.502, "num_input_tokens_seen": 73208720, "step": 35045 }, { "epoch": 5.717839954319276, "grad_norm": 3.953125, "learning_rate": 1.8663643567750577e-06, "loss": 2.6797, "num_input_tokens_seen": 73220592, "step": 35050 }, { "epoch": 5.718655681540093, "grad_norm": 4.21875, "learning_rate": 1.8626440886327813e-06, "loss": 2.6372, "num_input_tokens_seen": 73230896, "step": 35055 }, { "epoch": 5.71947140876091, "grad_norm": 7.96875, "learning_rate": 1.8589273885615432e-06, "loss": 3.7997, "num_input_tokens_seen": 73242704, "step": 35060 }, { "epoch": 5.720287135981728, "grad_norm": 3.390625, "learning_rate": 1.8552142571345133e-06, "loss": 2.9957, "num_input_tokens_seen": 73252992, "step": 35065 }, { "epoch": 5.721102863202545, "grad_norm": 8.9375, "learning_rate": 1.8515046949243025e-06, "loss": 2.4913, "num_input_tokens_seen": 73263776, "step": 35070 }, { "epoch": 5.721918590423362, "grad_norm": 4.4375, "learning_rate": 1.8477987025029674e-06, "loss": 1.8598, "num_input_tokens_seen": 73275184, "step": 35075 }, { "epoch": 5.72273431764418, "grad_norm": 5.96875, "learning_rate": 1.8440962804420232e-06, "loss": 2.9535, "num_input_tokens_seen": 73285600, "step": 35080 }, { "epoch": 5.723550044864997, "grad_norm": 5.03125, "learning_rate": 1.8403974293124265e-06, "loss": 1.8279, "num_input_tokens_seen": 73294976, "step": 35085 }, { "epoch": 5.7243657720858145, "grad_norm": 3.984375, "learning_rate": 1.8367021496845854e-06, "loss": 3.7718, "num_input_tokens_seen": 73306224, "step": 35090 }, { "epoch": 5.725181499306632, "grad_norm": 7.53125, "learning_rate": 1.8330104421283662e-06, "loss": 1.6353, "num_input_tokens_seen": 73316448, "step": 35095 }, { "epoch": 5.725997226527449, "grad_norm": 8.625, "learning_rate": 1.8293223072130717e-06, "loss": 2.9711, "num_input_tokens_seen": 73327552, "step": 35100 }, { "epoch": 5.726812953748267, "grad_norm": 7.34375, "learning_rate": 1.8256377455074525e-06, "loss": 2.4942, "num_input_tokens_seen": 73338384, "step": 35105 }, { "epoch": 5.727628680969084, "grad_norm": 4.78125, "learning_rate": 1.8219567575797263e-06, "loss": 1.894, "num_input_tokens_seen": 73349216, "step": 35110 }, { "epoch": 5.728444408189901, "grad_norm": 1.9453125, "learning_rate": 1.8182793439975365e-06, "loss": 1.0833, "num_input_tokens_seen": 73359792, "step": 35115 }, { "epoch": 5.729260135410719, "grad_norm": 5.25, "learning_rate": 1.8146055053279958e-06, "loss": 2.4312, "num_input_tokens_seen": 73370544, "step": 35120 }, { "epoch": 5.730075862631536, "grad_norm": 6.15625, "learning_rate": 1.8109352421376486e-06, "loss": 2.0853, "num_input_tokens_seen": 73379712, "step": 35125 }, { "epoch": 5.730891589852353, "grad_norm": 13.0625, "learning_rate": 1.8072685549924972e-06, "loss": 2.7545, "num_input_tokens_seen": 73390752, "step": 35130 }, { "epoch": 5.7317073170731705, "grad_norm": 5.625, "learning_rate": 1.8036054444579982e-06, "loss": 2.143, "num_input_tokens_seen": 73400832, "step": 35135 }, { "epoch": 5.732523044293988, "grad_norm": 1.7890625, "learning_rate": 1.7999459110990407e-06, "loss": 1.5524, "num_input_tokens_seen": 73412800, "step": 35140 }, { "epoch": 5.733338771514806, "grad_norm": 7.6875, "learning_rate": 1.7962899554799712e-06, "loss": 1.6, "num_input_tokens_seen": 73423920, "step": 35145 }, { "epoch": 5.734154498735623, "grad_norm": 3.59375, "learning_rate": 1.7926375781645937e-06, "loss": 1.5129, "num_input_tokens_seen": 73435200, "step": 35150 }, { "epoch": 5.73497022595644, "grad_norm": 6.4375, "learning_rate": 1.7889887797161359e-06, "loss": 1.3796, "num_input_tokens_seen": 73447296, "step": 35155 }, { "epoch": 5.735785953177258, "grad_norm": 2.546875, "learning_rate": 1.7853435606973028e-06, "loss": 2.1926, "num_input_tokens_seen": 73458016, "step": 35160 }, { "epoch": 5.736601680398075, "grad_norm": 7.21875, "learning_rate": 1.781701921670223e-06, "loss": 2.1694, "num_input_tokens_seen": 73468784, "step": 35165 }, { "epoch": 5.737417407618892, "grad_norm": 8.4375, "learning_rate": 1.7780638631964886e-06, "loss": 1.8736, "num_input_tokens_seen": 73479536, "step": 35170 }, { "epoch": 5.738233134839709, "grad_norm": 6.28125, "learning_rate": 1.7744293858371314e-06, "loss": 2.0445, "num_input_tokens_seen": 73489024, "step": 35175 }, { "epoch": 5.739048862060527, "grad_norm": 6.0625, "learning_rate": 1.770798490152631e-06, "loss": 2.3042, "num_input_tokens_seen": 73499216, "step": 35180 }, { "epoch": 5.7398645892813445, "grad_norm": 2.46875, "learning_rate": 1.767171176702917e-06, "loss": 1.2762, "num_input_tokens_seen": 73509232, "step": 35185 }, { "epoch": 5.740680316502162, "grad_norm": 6.96875, "learning_rate": 1.7635474460473755e-06, "loss": 2.439, "num_input_tokens_seen": 73518752, "step": 35190 }, { "epoch": 5.741496043722979, "grad_norm": 8.4375, "learning_rate": 1.7599272987448206e-06, "loss": 2.539, "num_input_tokens_seen": 73527936, "step": 35195 }, { "epoch": 5.742311770943797, "grad_norm": 5.96875, "learning_rate": 1.7563107353535362e-06, "loss": 2.6911, "num_input_tokens_seen": 73537984, "step": 35200 }, { "epoch": 5.742311770943797, "eval_loss": 2.546726703643799, "eval_runtime": 134.8088, "eval_samples_per_second": 20.214, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 73537984, "step": 35200 }, { "epoch": 5.743127498164614, "grad_norm": 6.3125, "learning_rate": 1.7526977564312263e-06, "loss": 3.0883, "num_input_tokens_seen": 73548880, "step": 35205 }, { "epoch": 5.743943225385431, "grad_norm": 8.625, "learning_rate": 1.7490883625350701e-06, "loss": 3.3915, "num_input_tokens_seen": 73560624, "step": 35210 }, { "epoch": 5.744758952606248, "grad_norm": 4.8125, "learning_rate": 1.7454825542216807e-06, "loss": 1.0819, "num_input_tokens_seen": 73571536, "step": 35215 }, { "epoch": 5.745574679827066, "grad_norm": 8.875, "learning_rate": 1.7418803320471105e-06, "loss": 2.7838, "num_input_tokens_seen": 73582240, "step": 35220 }, { "epoch": 5.746390407047883, "grad_norm": 6.375, "learning_rate": 1.7382816965668737e-06, "loss": 3.5861, "num_input_tokens_seen": 73593120, "step": 35225 }, { "epoch": 5.7472061342687, "grad_norm": 4.625, "learning_rate": 1.7346866483359285e-06, "loss": 3.349, "num_input_tokens_seen": 73603680, "step": 35230 }, { "epoch": 5.7480218614895175, "grad_norm": 6.53125, "learning_rate": 1.7310951879086657e-06, "loss": 2.4038, "num_input_tokens_seen": 73615024, "step": 35235 }, { "epoch": 5.748837588710336, "grad_norm": 6.53125, "learning_rate": 1.7275073158389471e-06, "loss": 1.5901, "num_input_tokens_seen": 73625232, "step": 35240 }, { "epoch": 5.749653315931153, "grad_norm": 11.8125, "learning_rate": 1.723923032680061e-06, "loss": 4.5173, "num_input_tokens_seen": 73636096, "step": 35245 }, { "epoch": 5.75046904315197, "grad_norm": 7.375, "learning_rate": 1.7203423389847428e-06, "loss": 2.5933, "num_input_tokens_seen": 73645248, "step": 35250 }, { "epoch": 5.751284770372787, "grad_norm": 5.75, "learning_rate": 1.7167652353051928e-06, "loss": 1.8978, "num_input_tokens_seen": 73656912, "step": 35255 }, { "epoch": 5.752100497593605, "grad_norm": 13.5, "learning_rate": 1.7131917221930333e-06, "loss": 2.2594, "num_input_tokens_seen": 73667040, "step": 35260 }, { "epoch": 5.752916224814422, "grad_norm": 6.75, "learning_rate": 1.7096218001993513e-06, "loss": 2.3742, "num_input_tokens_seen": 73677072, "step": 35265 }, { "epoch": 5.753731952035239, "grad_norm": 2.171875, "learning_rate": 1.706055469874676e-06, "loss": 2.1362, "num_input_tokens_seen": 73686224, "step": 35270 }, { "epoch": 5.754547679256056, "grad_norm": 12.0, "learning_rate": 1.702492731768976e-06, "loss": 2.7959, "num_input_tokens_seen": 73697168, "step": 35275 }, { "epoch": 5.755363406476874, "grad_norm": 9.625, "learning_rate": 1.6989335864316724e-06, "loss": 1.4453, "num_input_tokens_seen": 73707232, "step": 35280 }, { "epoch": 5.7561791336976915, "grad_norm": 6.125, "learning_rate": 1.6953780344116265e-06, "loss": 0.9594, "num_input_tokens_seen": 73718160, "step": 35285 }, { "epoch": 5.756994860918509, "grad_norm": 9.8125, "learning_rate": 1.6918260762571497e-06, "loss": 2.923, "num_input_tokens_seen": 73729648, "step": 35290 }, { "epoch": 5.757810588139327, "grad_norm": 12.1875, "learning_rate": 1.6882777125160093e-06, "loss": 3.301, "num_input_tokens_seen": 73739568, "step": 35295 }, { "epoch": 5.758626315360144, "grad_norm": 7.4375, "learning_rate": 1.6847329437353899e-06, "loss": 1.9258, "num_input_tokens_seen": 73749776, "step": 35300 }, { "epoch": 5.759442042580961, "grad_norm": 9.4375, "learning_rate": 1.6811917704619511e-06, "loss": 3.2924, "num_input_tokens_seen": 73759600, "step": 35305 }, { "epoch": 5.760257769801778, "grad_norm": 5.6875, "learning_rate": 1.67765419324179e-06, "loss": 1.3106, "num_input_tokens_seen": 73768608, "step": 35310 }, { "epoch": 5.761073497022595, "grad_norm": 4.875, "learning_rate": 1.6741202126204364e-06, "loss": 1.9535, "num_input_tokens_seen": 73779168, "step": 35315 }, { "epoch": 5.761889224243413, "grad_norm": 11.25, "learning_rate": 1.6705898291428767e-06, "loss": 3.5144, "num_input_tokens_seen": 73789184, "step": 35320 }, { "epoch": 5.76270495146423, "grad_norm": 4.5625, "learning_rate": 1.6670630433535395e-06, "loss": 1.8432, "num_input_tokens_seen": 73800160, "step": 35325 }, { "epoch": 5.7635206786850475, "grad_norm": 9.6875, "learning_rate": 1.6635398557962979e-06, "loss": 3.3577, "num_input_tokens_seen": 73809072, "step": 35330 }, { "epoch": 5.7643364059058655, "grad_norm": 7.0, "learning_rate": 1.660020267014481e-06, "loss": 2.4257, "num_input_tokens_seen": 73819504, "step": 35335 }, { "epoch": 5.765152133126683, "grad_norm": 1.8203125, "learning_rate": 1.6565042775508438e-06, "loss": 2.9431, "num_input_tokens_seen": 73829472, "step": 35340 }, { "epoch": 5.7659678603475, "grad_norm": 12.125, "learning_rate": 1.6529918879475997e-06, "loss": 2.6111, "num_input_tokens_seen": 73840576, "step": 35345 }, { "epoch": 5.766783587568317, "grad_norm": 11.375, "learning_rate": 1.6494830987464043e-06, "loss": 3.5459, "num_input_tokens_seen": 73850752, "step": 35350 }, { "epoch": 5.767599314789135, "grad_norm": 3.828125, "learning_rate": 1.6459779104883555e-06, "loss": 2.3998, "num_input_tokens_seen": 73859776, "step": 35355 }, { "epoch": 5.768415042009952, "grad_norm": 4.9375, "learning_rate": 1.6424763237140013e-06, "loss": 2.5301, "num_input_tokens_seen": 73869104, "step": 35360 }, { "epoch": 5.769230769230769, "grad_norm": 9.4375, "learning_rate": 1.6389783389633207e-06, "loss": 1.3885, "num_input_tokens_seen": 73880352, "step": 35365 }, { "epoch": 5.770046496451586, "grad_norm": 10.4375, "learning_rate": 1.6354839567757546e-06, "loss": 1.858, "num_input_tokens_seen": 73891024, "step": 35370 }, { "epoch": 5.770862223672404, "grad_norm": 5.9375, "learning_rate": 1.6319931776901831e-06, "loss": 1.7119, "num_input_tokens_seen": 73902336, "step": 35375 }, { "epoch": 5.771677950893221, "grad_norm": 5.1875, "learning_rate": 1.6285060022449229e-06, "loss": 3.0047, "num_input_tokens_seen": 73912832, "step": 35380 }, { "epoch": 5.772493678114039, "grad_norm": 2.375, "learning_rate": 1.6250224309777434e-06, "loss": 2.1407, "num_input_tokens_seen": 73924496, "step": 35385 }, { "epoch": 5.773309405334856, "grad_norm": 10.5625, "learning_rate": 1.6215424644258515e-06, "loss": 1.5998, "num_input_tokens_seen": 73933136, "step": 35390 }, { "epoch": 5.774125132555674, "grad_norm": 9.6875, "learning_rate": 1.6180661031259036e-06, "loss": 1.894, "num_input_tokens_seen": 73944048, "step": 35395 }, { "epoch": 5.774940859776491, "grad_norm": 2.421875, "learning_rate": 1.614593347613999e-06, "loss": 1.2754, "num_input_tokens_seen": 73955216, "step": 35400 }, { "epoch": 5.774940859776491, "eval_loss": 2.546428680419922, "eval_runtime": 134.659, "eval_samples_per_second": 20.236, "eval_steps_per_second": 10.122, "num_input_tokens_seen": 73955216, "step": 35400 }, { "epoch": 5.775756586997308, "grad_norm": 7.875, "learning_rate": 1.6111241984256758e-06, "loss": 1.4236, "num_input_tokens_seen": 73965456, "step": 35405 }, { "epoch": 5.776572314218125, "grad_norm": 5.375, "learning_rate": 1.6076586560959257e-06, "loss": 3.0319, "num_input_tokens_seen": 73976192, "step": 35410 }, { "epoch": 5.777388041438943, "grad_norm": 8.8125, "learning_rate": 1.604196721159182e-06, "loss": 3.9086, "num_input_tokens_seen": 73985856, "step": 35415 }, { "epoch": 5.77820376865976, "grad_norm": 3.1875, "learning_rate": 1.6007383941493092e-06, "loss": 2.2557, "num_input_tokens_seen": 73996528, "step": 35420 }, { "epoch": 5.779019495880577, "grad_norm": 11.5625, "learning_rate": 1.5972836755996285e-06, "loss": 3.0283, "num_input_tokens_seen": 74005680, "step": 35425 }, { "epoch": 5.7798352231013945, "grad_norm": 5.09375, "learning_rate": 1.5938325660429076e-06, "loss": 2.8715, "num_input_tokens_seen": 74016384, "step": 35430 }, { "epoch": 5.7806509503222125, "grad_norm": 7.8125, "learning_rate": 1.5903850660113378e-06, "loss": 2.968, "num_input_tokens_seen": 74026704, "step": 35435 }, { "epoch": 5.78146667754303, "grad_norm": 4.8125, "learning_rate": 1.5869411760365826e-06, "loss": 1.6205, "num_input_tokens_seen": 74036544, "step": 35440 }, { "epoch": 5.782282404763847, "grad_norm": 3.28125, "learning_rate": 1.58350089664972e-06, "loss": 1.7098, "num_input_tokens_seen": 74047072, "step": 35445 }, { "epoch": 5.783098131984664, "grad_norm": 5.3125, "learning_rate": 1.5800642283812865e-06, "loss": 1.7302, "num_input_tokens_seen": 74058192, "step": 35450 }, { "epoch": 5.783913859205482, "grad_norm": 6.21875, "learning_rate": 1.5766311717612698e-06, "loss": 1.9517, "num_input_tokens_seen": 74068992, "step": 35455 }, { "epoch": 5.784729586426299, "grad_norm": 2.84375, "learning_rate": 1.5732017273190818e-06, "loss": 3.5216, "num_input_tokens_seen": 74080048, "step": 35460 }, { "epoch": 5.785545313647116, "grad_norm": 8.3125, "learning_rate": 1.5697758955835806e-06, "loss": 2.0481, "num_input_tokens_seen": 74090960, "step": 35465 }, { "epoch": 5.786361040867934, "grad_norm": 7.25, "learning_rate": 1.566353677083085e-06, "loss": 2.799, "num_input_tokens_seen": 74101232, "step": 35470 }, { "epoch": 5.787176768088751, "grad_norm": 6.46875, "learning_rate": 1.562935072345334e-06, "loss": 2.5027, "num_input_tokens_seen": 74111344, "step": 35475 }, { "epoch": 5.7879924953095685, "grad_norm": 5.8125, "learning_rate": 1.5595200818975281e-06, "loss": 2.4777, "num_input_tokens_seen": 74121104, "step": 35480 }, { "epoch": 5.788808222530386, "grad_norm": 6.78125, "learning_rate": 1.5561087062662905e-06, "loss": 2.2983, "num_input_tokens_seen": 74131136, "step": 35485 }, { "epoch": 5.789623949751203, "grad_norm": 5.0625, "learning_rate": 1.5527009459777087e-06, "loss": 2.568, "num_input_tokens_seen": 74141280, "step": 35490 }, { "epoch": 5.790439676972021, "grad_norm": 5.125, "learning_rate": 1.5492968015572984e-06, "loss": 1.637, "num_input_tokens_seen": 74153408, "step": 35495 }, { "epoch": 5.791255404192838, "grad_norm": 8.75, "learning_rate": 1.5458962735300203e-06, "loss": 1.8708, "num_input_tokens_seen": 74165248, "step": 35500 }, { "epoch": 5.792071131413655, "grad_norm": 6.34375, "learning_rate": 1.54249936242028e-06, "loss": 3.4596, "num_input_tokens_seen": 74175312, "step": 35505 }, { "epoch": 5.792886858634473, "grad_norm": 2.921875, "learning_rate": 1.5391060687519222e-06, "loss": 1.6101, "num_input_tokens_seen": 74186016, "step": 35510 }, { "epoch": 5.79370258585529, "grad_norm": 8.75, "learning_rate": 1.5357163930482367e-06, "loss": 1.4274, "num_input_tokens_seen": 74196992, "step": 35515 }, { "epoch": 5.794518313076107, "grad_norm": 5.25, "learning_rate": 1.532330335831955e-06, "loss": 2.471, "num_input_tokens_seen": 74207792, "step": 35520 }, { "epoch": 5.7953340402969244, "grad_norm": 14.875, "learning_rate": 1.5289478976252491e-06, "loss": 2.7722, "num_input_tokens_seen": 74217008, "step": 35525 }, { "epoch": 5.796149767517742, "grad_norm": 6.28125, "learning_rate": 1.5255690789497345e-06, "loss": 2.5851, "num_input_tokens_seen": 74227120, "step": 35530 }, { "epoch": 5.79696549473856, "grad_norm": 6.75, "learning_rate": 1.5221938803264641e-06, "loss": 3.8273, "num_input_tokens_seen": 74238304, "step": 35535 }, { "epoch": 5.797781221959377, "grad_norm": 3.234375, "learning_rate": 1.518822302275938e-06, "loss": 1.6338, "num_input_tokens_seen": 74249136, "step": 35540 }, { "epoch": 5.798596949180194, "grad_norm": 5.59375, "learning_rate": 1.5154543453180958e-06, "loss": 2.5172, "num_input_tokens_seen": 74258720, "step": 35545 }, { "epoch": 5.799412676401012, "grad_norm": 6.25, "learning_rate": 1.5120900099723167e-06, "loss": 2.5485, "num_input_tokens_seen": 74269584, "step": 35550 }, { "epoch": 5.800228403621829, "grad_norm": 5.78125, "learning_rate": 1.5087292967574273e-06, "loss": 2.9919, "num_input_tokens_seen": 74279088, "step": 35555 }, { "epoch": 5.801044130842646, "grad_norm": 1.65625, "learning_rate": 1.5053722061916908e-06, "loss": 1.7972, "num_input_tokens_seen": 74289312, "step": 35560 }, { "epoch": 5.801859858063463, "grad_norm": 7.15625, "learning_rate": 1.5020187387928124e-06, "loss": 2.6373, "num_input_tokens_seen": 74298640, "step": 35565 }, { "epoch": 5.802675585284281, "grad_norm": 0.07958984375, "learning_rate": 1.4986688950779343e-06, "loss": 1.1566, "num_input_tokens_seen": 74308160, "step": 35570 }, { "epoch": 5.803491312505098, "grad_norm": 8.6875, "learning_rate": 1.495322675563654e-06, "loss": 4.2447, "num_input_tokens_seen": 74319088, "step": 35575 }, { "epoch": 5.8043070397259156, "grad_norm": 8.4375, "learning_rate": 1.4919800807659922e-06, "loss": 2.2337, "num_input_tokens_seen": 74329712, "step": 35580 }, { "epoch": 5.805122766946733, "grad_norm": 6.28125, "learning_rate": 1.4886411112004255e-06, "loss": 2.1182, "num_input_tokens_seen": 74340640, "step": 35585 }, { "epoch": 5.805938494167551, "grad_norm": 1.109375, "learning_rate": 1.4853057673818588e-06, "loss": 1.9646, "num_input_tokens_seen": 74350544, "step": 35590 }, { "epoch": 5.806754221388368, "grad_norm": 4.46875, "learning_rate": 1.481974049824647e-06, "loss": 1.7603, "num_input_tokens_seen": 74360256, "step": 35595 }, { "epoch": 5.807569948609185, "grad_norm": 7.6875, "learning_rate": 1.4786459590425849e-06, "loss": 1.6987, "num_input_tokens_seen": 74371040, "step": 35600 }, { "epoch": 5.807569948609185, "eval_loss": 2.53898549079895, "eval_runtime": 134.6951, "eval_samples_per_second": 20.231, "eval_steps_per_second": 10.119, "num_input_tokens_seen": 74371040, "step": 35600 }, { "epoch": 5.808385675830002, "grad_norm": 2.140625, "learning_rate": 1.4753214955489036e-06, "loss": 2.4927, "num_input_tokens_seen": 74381664, "step": 35605 }, { "epoch": 5.80920140305082, "grad_norm": 9.0625, "learning_rate": 1.4720006598562737e-06, "loss": 3.0611, "num_input_tokens_seen": 74392912, "step": 35610 }, { "epoch": 5.810017130271637, "grad_norm": 2.609375, "learning_rate": 1.4686834524768185e-06, "loss": 1.2846, "num_input_tokens_seen": 74404320, "step": 35615 }, { "epoch": 5.810832857492454, "grad_norm": 5.5625, "learning_rate": 1.4653698739220844e-06, "loss": 2.6955, "num_input_tokens_seen": 74413904, "step": 35620 }, { "epoch": 5.8116485847132715, "grad_norm": 5.25, "learning_rate": 1.4620599247030715e-06, "loss": 1.5636, "num_input_tokens_seen": 74425440, "step": 35625 }, { "epoch": 5.8124643119340895, "grad_norm": 13.0, "learning_rate": 1.4587536053302125e-06, "loss": 1.8263, "num_input_tokens_seen": 74435184, "step": 35630 }, { "epoch": 5.813280039154907, "grad_norm": 9.625, "learning_rate": 1.4554509163133862e-06, "loss": 2.4736, "num_input_tokens_seen": 74444448, "step": 35635 }, { "epoch": 5.814095766375724, "grad_norm": 8.5, "learning_rate": 1.4521518581619098e-06, "loss": 1.7657, "num_input_tokens_seen": 74454960, "step": 35640 }, { "epoch": 5.814911493596542, "grad_norm": 4.84375, "learning_rate": 1.4488564313845348e-06, "loss": 1.7279, "num_input_tokens_seen": 74464624, "step": 35645 }, { "epoch": 5.815727220817359, "grad_norm": 10.0625, "learning_rate": 1.4455646364894603e-06, "loss": 2.0595, "num_input_tokens_seen": 74474800, "step": 35650 }, { "epoch": 5.816542948038176, "grad_norm": 10.25, "learning_rate": 1.4422764739843247e-06, "loss": 3.1276, "num_input_tokens_seen": 74484224, "step": 35655 }, { "epoch": 5.817358675258993, "grad_norm": 7.09375, "learning_rate": 1.4389919443762e-06, "loss": 2.675, "num_input_tokens_seen": 74493856, "step": 35660 }, { "epoch": 5.81817440247981, "grad_norm": 8.3125, "learning_rate": 1.4357110481716063e-06, "loss": 3.3379, "num_input_tokens_seen": 74506032, "step": 35665 }, { "epoch": 5.818990129700628, "grad_norm": 7.21875, "learning_rate": 1.4324337858764941e-06, "loss": 2.5635, "num_input_tokens_seen": 74515440, "step": 35670 }, { "epoch": 5.8198058569214455, "grad_norm": 13.625, "learning_rate": 1.4291601579962622e-06, "loss": 2.9844, "num_input_tokens_seen": 74526096, "step": 35675 }, { "epoch": 5.820621584142263, "grad_norm": 9.75, "learning_rate": 1.42589016503574e-06, "loss": 4.0098, "num_input_tokens_seen": 74536512, "step": 35680 }, { "epoch": 5.821437311363081, "grad_norm": 10.5625, "learning_rate": 1.4226238074992099e-06, "loss": 2.8852, "num_input_tokens_seen": 74548288, "step": 35685 }, { "epoch": 5.822253038583898, "grad_norm": 2.921875, "learning_rate": 1.4193610858903778e-06, "loss": 1.5366, "num_input_tokens_seen": 74558192, "step": 35690 }, { "epoch": 5.823068765804715, "grad_norm": 7.75, "learning_rate": 1.416102000712402e-06, "loss": 2.2539, "num_input_tokens_seen": 74569312, "step": 35695 }, { "epoch": 5.823884493025532, "grad_norm": 5.09375, "learning_rate": 1.4128465524678668e-06, "loss": 1.938, "num_input_tokens_seen": 74576928, "step": 35700 }, { "epoch": 5.824700220246349, "grad_norm": 11.3125, "learning_rate": 1.4095947416588124e-06, "loss": 2.3439, "num_input_tokens_seen": 74587664, "step": 35705 }, { "epoch": 5.825515947467167, "grad_norm": 7.375, "learning_rate": 1.4063465687866983e-06, "loss": 1.7678, "num_input_tokens_seen": 74600128, "step": 35710 }, { "epoch": 5.826331674687984, "grad_norm": 6.625, "learning_rate": 1.4031020343524438e-06, "loss": 4.0562, "num_input_tokens_seen": 74611136, "step": 35715 }, { "epoch": 5.827147401908801, "grad_norm": 4.8125, "learning_rate": 1.3998611388563926e-06, "loss": 0.8902, "num_input_tokens_seen": 74623008, "step": 35720 }, { "epoch": 5.8279631291296194, "grad_norm": 7.53125, "learning_rate": 1.3966238827983314e-06, "loss": 2.96, "num_input_tokens_seen": 74634848, "step": 35725 }, { "epoch": 5.828778856350437, "grad_norm": 11.6875, "learning_rate": 1.393390266677483e-06, "loss": 2.5517, "num_input_tokens_seen": 74644288, "step": 35730 }, { "epoch": 5.829594583571254, "grad_norm": 12.75, "learning_rate": 1.3901602909925204e-06, "loss": 2.3526, "num_input_tokens_seen": 74656336, "step": 35735 }, { "epoch": 5.830410310792071, "grad_norm": 2.3125, "learning_rate": 1.3869339562415373e-06, "loss": 1.8461, "num_input_tokens_seen": 74667408, "step": 35740 }, { "epoch": 5.831226038012889, "grad_norm": 4.65625, "learning_rate": 1.38371126292208e-06, "loss": 2.423, "num_input_tokens_seen": 74678768, "step": 35745 }, { "epoch": 5.832041765233706, "grad_norm": 7.875, "learning_rate": 1.3804922115311286e-06, "loss": 3.2759, "num_input_tokens_seen": 74688944, "step": 35750 }, { "epoch": 5.832857492454523, "grad_norm": 8.125, "learning_rate": 1.3772768025650945e-06, "loss": 2.0144, "num_input_tokens_seen": 74698720, "step": 35755 }, { "epoch": 5.83367321967534, "grad_norm": 5.46875, "learning_rate": 1.3740650365198448e-06, "loss": 1.7589, "num_input_tokens_seen": 74709936, "step": 35760 }, { "epoch": 5.834488946896158, "grad_norm": 8.5625, "learning_rate": 1.3708569138906612e-06, "loss": 2.3214, "num_input_tokens_seen": 74719792, "step": 35765 }, { "epoch": 5.835304674116975, "grad_norm": 12.0625, "learning_rate": 1.367652435172287e-06, "loss": 3.238, "num_input_tokens_seen": 74731040, "step": 35770 }, { "epoch": 5.8361204013377925, "grad_norm": 3.25, "learning_rate": 1.364451600858893e-06, "loss": 1.9436, "num_input_tokens_seen": 74742256, "step": 35775 }, { "epoch": 5.83693612855861, "grad_norm": 11.375, "learning_rate": 1.3612544114440823e-06, "loss": 2.1525, "num_input_tokens_seen": 74754048, "step": 35780 }, { "epoch": 5.837751855779428, "grad_norm": 7.4375, "learning_rate": 1.3580608674209072e-06, "loss": 2.0106, "num_input_tokens_seen": 74763952, "step": 35785 }, { "epoch": 5.838567583000245, "grad_norm": 6.875, "learning_rate": 1.3548709692818434e-06, "loss": 3.0906, "num_input_tokens_seen": 74772224, "step": 35790 }, { "epoch": 5.839383310221062, "grad_norm": 6.34375, "learning_rate": 1.3516847175188223e-06, "loss": 2.9262, "num_input_tokens_seen": 74784368, "step": 35795 }, { "epoch": 5.840199037441879, "grad_norm": 13.875, "learning_rate": 1.348502112623204e-06, "loss": 1.9141, "num_input_tokens_seen": 74795680, "step": 35800 }, { "epoch": 5.840199037441879, "eval_loss": 2.5437536239624023, "eval_runtime": 134.7998, "eval_samples_per_second": 20.215, "eval_steps_per_second": 10.111, "num_input_tokens_seen": 74795680, "step": 35800 }, { "epoch": 5.841014764662697, "grad_norm": 8.0625, "learning_rate": 1.3453231550857787e-06, "loss": 2.7885, "num_input_tokens_seen": 74806624, "step": 35805 }, { "epoch": 5.841830491883514, "grad_norm": 12.625, "learning_rate": 1.3421478453967878e-06, "loss": 2.7153, "num_input_tokens_seen": 74817232, "step": 35810 }, { "epoch": 5.842646219104331, "grad_norm": 7.8125, "learning_rate": 1.3389761840459065e-06, "loss": 3.4311, "num_input_tokens_seen": 74827904, "step": 35815 }, { "epoch": 5.8434619463251485, "grad_norm": 7.65625, "learning_rate": 1.3358081715222376e-06, "loss": 1.8821, "num_input_tokens_seen": 74838080, "step": 35820 }, { "epoch": 5.8442776735459665, "grad_norm": 11.6875, "learning_rate": 1.3326438083143295e-06, "loss": 2.6483, "num_input_tokens_seen": 74847616, "step": 35825 }, { "epoch": 5.845093400766784, "grad_norm": 4.0, "learning_rate": 1.3294830949101723e-06, "loss": 1.7411, "num_input_tokens_seen": 74857664, "step": 35830 }, { "epoch": 5.845909127987601, "grad_norm": 3.8125, "learning_rate": 1.3263260317971815e-06, "loss": 0.8441, "num_input_tokens_seen": 74867824, "step": 35835 }, { "epoch": 5.846724855208418, "grad_norm": 3.125, "learning_rate": 1.3231726194622208e-06, "loss": 2.2185, "num_input_tokens_seen": 74876944, "step": 35840 }, { "epoch": 5.847540582429236, "grad_norm": 2.140625, "learning_rate": 1.3200228583915814e-06, "loss": 3.4526, "num_input_tokens_seen": 74887088, "step": 35845 }, { "epoch": 5.848356309650053, "grad_norm": 0.80078125, "learning_rate": 1.3168767490709971e-06, "loss": 1.35, "num_input_tokens_seen": 74896656, "step": 35850 }, { "epoch": 5.84917203687087, "grad_norm": 4.53125, "learning_rate": 1.3137342919856437e-06, "loss": 1.8672, "num_input_tokens_seen": 74906880, "step": 35855 }, { "epoch": 5.849987764091688, "grad_norm": 8.5625, "learning_rate": 1.310595487620117e-06, "loss": 1.8657, "num_input_tokens_seen": 74917232, "step": 35860 }, { "epoch": 5.850803491312505, "grad_norm": 14.0625, "learning_rate": 1.3074603364584715e-06, "loss": 3.4065, "num_input_tokens_seen": 74926736, "step": 35865 }, { "epoch": 5.8516192185333225, "grad_norm": 11.1875, "learning_rate": 1.3043288389841758e-06, "loss": 2.9325, "num_input_tokens_seen": 74937584, "step": 35870 }, { "epoch": 5.85243494575414, "grad_norm": 2.390625, "learning_rate": 1.3012009956801546e-06, "loss": 1.7972, "num_input_tokens_seen": 74947008, "step": 35875 }, { "epoch": 5.853250672974957, "grad_norm": 6.96875, "learning_rate": 1.2980768070287586e-06, "loss": 2.7758, "num_input_tokens_seen": 74958480, "step": 35880 }, { "epoch": 5.854066400195775, "grad_norm": 9.125, "learning_rate": 1.2949562735117716e-06, "loss": 2.6259, "num_input_tokens_seen": 74967792, "step": 35885 }, { "epoch": 5.854882127416592, "grad_norm": 8.4375, "learning_rate": 1.291839395610428e-06, "loss": 3.4418, "num_input_tokens_seen": 74977520, "step": 35890 }, { "epoch": 5.855697854637409, "grad_norm": 8.75, "learning_rate": 1.2887261738053852e-06, "loss": 3.1822, "num_input_tokens_seen": 74988288, "step": 35895 }, { "epoch": 5.856513581858227, "grad_norm": 6.84375, "learning_rate": 1.2856166085767396e-06, "loss": 2.0171, "num_input_tokens_seen": 74999408, "step": 35900 }, { "epoch": 5.857329309079044, "grad_norm": 5.0625, "learning_rate": 1.2825107004040272e-06, "loss": 1.9911, "num_input_tokens_seen": 75008240, "step": 35905 }, { "epoch": 5.858145036299861, "grad_norm": 8.3125, "learning_rate": 1.2794084497662146e-06, "loss": 3.1941, "num_input_tokens_seen": 75018480, "step": 35910 }, { "epoch": 5.858960763520678, "grad_norm": 18.125, "learning_rate": 1.276309857141711e-06, "loss": 2.8652, "num_input_tokens_seen": 75029088, "step": 35915 }, { "epoch": 5.859776490741496, "grad_norm": 5.90625, "learning_rate": 1.273214923008359e-06, "loss": 2.3479, "num_input_tokens_seen": 75040048, "step": 35920 }, { "epoch": 5.8605922179623136, "grad_norm": 4.46875, "learning_rate": 1.2701236478434352e-06, "loss": 2.7387, "num_input_tokens_seen": 75048928, "step": 35925 }, { "epoch": 5.861407945183131, "grad_norm": 7.125, "learning_rate": 1.2670360321236502e-06, "loss": 1.8982, "num_input_tokens_seen": 75060000, "step": 35930 }, { "epoch": 5.862223672403948, "grad_norm": 6.15625, "learning_rate": 1.2639520763251617e-06, "loss": 2.7139, "num_input_tokens_seen": 75071168, "step": 35935 }, { "epoch": 5.863039399624766, "grad_norm": 11.4375, "learning_rate": 1.2608717809235448e-06, "loss": 1.9292, "num_input_tokens_seen": 75082272, "step": 35940 }, { "epoch": 5.863855126845583, "grad_norm": 0.08544921875, "learning_rate": 1.2577951463938282e-06, "loss": 0.8078, "num_input_tokens_seen": 75094512, "step": 35945 }, { "epoch": 5.8646708540664, "grad_norm": 3.078125, "learning_rate": 1.2547221732104569e-06, "loss": 2.3352, "num_input_tokens_seen": 75105952, "step": 35950 }, { "epoch": 5.865486581287217, "grad_norm": 3.890625, "learning_rate": 1.25165286184733e-06, "loss": 1.6971, "num_input_tokens_seen": 75115920, "step": 35955 }, { "epoch": 5.866302308508035, "grad_norm": 3.21875, "learning_rate": 1.248587212777777e-06, "loss": 1.4573, "num_input_tokens_seen": 75126304, "step": 35960 }, { "epoch": 5.867118035728852, "grad_norm": 6.6875, "learning_rate": 1.2455252264745532e-06, "loss": 2.2793, "num_input_tokens_seen": 75136544, "step": 35965 }, { "epoch": 5.8679337629496695, "grad_norm": 7.53125, "learning_rate": 1.2424669034098528e-06, "loss": 2.7574, "num_input_tokens_seen": 75147392, "step": 35970 }, { "epoch": 5.868749490170487, "grad_norm": 9.5, "learning_rate": 1.2394122440553185e-06, "loss": 1.9751, "num_input_tokens_seen": 75157632, "step": 35975 }, { "epoch": 5.869565217391305, "grad_norm": 5.8125, "learning_rate": 1.2363612488820037e-06, "loss": 1.771, "num_input_tokens_seen": 75168016, "step": 35980 }, { "epoch": 5.870380944612122, "grad_norm": 8.4375, "learning_rate": 1.2333139183604208e-06, "loss": 2.336, "num_input_tokens_seen": 75178096, "step": 35985 }, { "epoch": 5.871196671832939, "grad_norm": 10.125, "learning_rate": 1.2302702529604998e-06, "loss": 2.5398, "num_input_tokens_seen": 75188704, "step": 35990 }, { "epoch": 5.872012399053756, "grad_norm": 5.5625, "learning_rate": 1.227230253151615e-06, "loss": 2.8719, "num_input_tokens_seen": 75200320, "step": 35995 }, { "epoch": 5.872828126274574, "grad_norm": 11.0625, "learning_rate": 1.2241939194025748e-06, "loss": 2.1558, "num_input_tokens_seen": 75209824, "step": 36000 }, { "epoch": 5.872828126274574, "eval_loss": 2.543876886367798, "eval_runtime": 134.69, "eval_samples_per_second": 20.232, "eval_steps_per_second": 10.12, "num_input_tokens_seen": 75209824, "step": 36000 }, { "epoch": 5.873643853495391, "grad_norm": 4.9375, "learning_rate": 1.2211612521816156e-06, "loss": 3.0455, "num_input_tokens_seen": 75220640, "step": 36005 }, { "epoch": 5.874459580716208, "grad_norm": 12.0625, "learning_rate": 1.2181322519564137e-06, "loss": 3.4559, "num_input_tokens_seen": 75231136, "step": 36010 }, { "epoch": 5.8752753079370255, "grad_norm": 6.4375, "learning_rate": 1.2151069191940839e-06, "loss": 2.4734, "num_input_tokens_seen": 75243488, "step": 36015 }, { "epoch": 5.8760910351578435, "grad_norm": 8.5625, "learning_rate": 1.2120852543611644e-06, "loss": 1.5399, "num_input_tokens_seen": 75255216, "step": 36020 }, { "epoch": 5.876906762378661, "grad_norm": 3.953125, "learning_rate": 1.2090672579236379e-06, "loss": 1.9293, "num_input_tokens_seen": 75265920, "step": 36025 }, { "epoch": 5.877722489599478, "grad_norm": 3.0625, "learning_rate": 1.2060529303469126e-06, "loss": 2.5407, "num_input_tokens_seen": 75275520, "step": 36030 }, { "epoch": 5.878538216820296, "grad_norm": 9.0625, "learning_rate": 1.2030422720958445e-06, "loss": 2.7294, "num_input_tokens_seen": 75284864, "step": 36035 }, { "epoch": 5.879353944041113, "grad_norm": 14.5, "learning_rate": 1.200035283634704e-06, "loss": 2.2513, "num_input_tokens_seen": 75296912, "step": 36040 }, { "epoch": 5.88016967126193, "grad_norm": 6.3125, "learning_rate": 1.1970319654272144e-06, "loss": 1.4545, "num_input_tokens_seen": 75306672, "step": 36045 }, { "epoch": 5.880985398482747, "grad_norm": 2.5, "learning_rate": 1.1940323179365192e-06, "loss": 1.7513, "num_input_tokens_seen": 75316064, "step": 36050 }, { "epoch": 5.881801125703564, "grad_norm": 6.1875, "learning_rate": 1.1910363416252095e-06, "loss": 3.5684, "num_input_tokens_seen": 75325568, "step": 36055 }, { "epoch": 5.882616852924382, "grad_norm": 0.05224609375, "learning_rate": 1.1880440369552964e-06, "loss": 3.5902, "num_input_tokens_seen": 75336288, "step": 36060 }, { "epoch": 5.883432580145199, "grad_norm": 5.5625, "learning_rate": 1.1850554043882328e-06, "loss": 2.3769, "num_input_tokens_seen": 75347280, "step": 36065 }, { "epoch": 5.884248307366017, "grad_norm": 6.09375, "learning_rate": 1.1820704443849028e-06, "loss": 2.9168, "num_input_tokens_seen": 75358304, "step": 36070 }, { "epoch": 5.885064034586835, "grad_norm": 8.5, "learning_rate": 1.1790891574056219e-06, "loss": 3.3791, "num_input_tokens_seen": 75369184, "step": 36075 }, { "epoch": 5.885879761807652, "grad_norm": 5.4375, "learning_rate": 1.1761115439101523e-06, "loss": 1.8295, "num_input_tokens_seen": 75379824, "step": 36080 }, { "epoch": 5.886695489028469, "grad_norm": 6.34375, "learning_rate": 1.1731376043576659e-06, "loss": 1.8702, "num_input_tokens_seen": 75390688, "step": 36085 }, { "epoch": 5.887511216249286, "grad_norm": 4.03125, "learning_rate": 1.1701673392067875e-06, "loss": 1.1793, "num_input_tokens_seen": 75400640, "step": 36090 }, { "epoch": 5.888326943470103, "grad_norm": 6.5625, "learning_rate": 1.1672007489155757e-06, "loss": 1.9874, "num_input_tokens_seen": 75411088, "step": 36095 }, { "epoch": 5.889142670690921, "grad_norm": 11.375, "learning_rate": 1.164237833941506e-06, "loss": 3.1804, "num_input_tokens_seen": 75422512, "step": 36100 }, { "epoch": 5.889958397911738, "grad_norm": 7.5, "learning_rate": 1.1612785947415022e-06, "loss": 2.7894, "num_input_tokens_seen": 75433360, "step": 36105 }, { "epoch": 5.890774125132555, "grad_norm": 5.6875, "learning_rate": 1.1583230317719185e-06, "loss": 2.1666, "num_input_tokens_seen": 75444400, "step": 36110 }, { "epoch": 5.891589852353373, "grad_norm": 4.5625, "learning_rate": 1.1553711454885318e-06, "loss": 2.551, "num_input_tokens_seen": 75454544, "step": 36115 }, { "epoch": 5.8924055795741905, "grad_norm": 3.0, "learning_rate": 1.152422936346567e-06, "loss": 0.809, "num_input_tokens_seen": 75464624, "step": 36120 }, { "epoch": 5.893221306795008, "grad_norm": 1.4453125, "learning_rate": 1.1494784048006718e-06, "loss": 1.795, "num_input_tokens_seen": 75475104, "step": 36125 }, { "epoch": 5.894037034015825, "grad_norm": 5.25, "learning_rate": 1.1465375513049326e-06, "loss": 1.7194, "num_input_tokens_seen": 75485472, "step": 36130 }, { "epoch": 5.894852761236643, "grad_norm": 11.625, "learning_rate": 1.1436003763128616e-06, "loss": 3.6648, "num_input_tokens_seen": 75496656, "step": 36135 }, { "epoch": 5.89566848845746, "grad_norm": 3.828125, "learning_rate": 1.1406668802774106e-06, "loss": 1.8796, "num_input_tokens_seen": 75507232, "step": 36140 }, { "epoch": 5.896484215678277, "grad_norm": 7.96875, "learning_rate": 1.137737063650965e-06, "loss": 3.2753, "num_input_tokens_seen": 75516800, "step": 36145 }, { "epoch": 5.897299942899094, "grad_norm": 0.1611328125, "learning_rate": 1.1348109268853323e-06, "loss": 1.5365, "num_input_tokens_seen": 75528800, "step": 36150 }, { "epoch": 5.898115670119912, "grad_norm": 2.84375, "learning_rate": 1.1318884704317634e-06, "loss": 2.0436, "num_input_tokens_seen": 75540256, "step": 36155 }, { "epoch": 5.898931397340729, "grad_norm": 6.40625, "learning_rate": 1.1289696947409417e-06, "loss": 2.0902, "num_input_tokens_seen": 75552112, "step": 36160 }, { "epoch": 5.8997471245615465, "grad_norm": 9.375, "learning_rate": 1.126054600262974e-06, "loss": 3.0942, "num_input_tokens_seen": 75561888, "step": 36165 }, { "epoch": 5.900562851782364, "grad_norm": 6.0, "learning_rate": 1.1231431874474064e-06, "loss": 1.3848, "num_input_tokens_seen": 75571888, "step": 36170 }, { "epoch": 5.901378579003182, "grad_norm": 4.8125, "learning_rate": 1.12023545674321e-06, "loss": 2.1814, "num_input_tokens_seen": 75581872, "step": 36175 }, { "epoch": 5.902194306223999, "grad_norm": 9.375, "learning_rate": 1.117331408598804e-06, "loss": 2.7819, "num_input_tokens_seen": 75591600, "step": 36180 }, { "epoch": 5.903010033444816, "grad_norm": 9.5625, "learning_rate": 1.1144310434620191e-06, "loss": 2.5498, "num_input_tokens_seen": 75602912, "step": 36185 }, { "epoch": 5.903825760665633, "grad_norm": 7.09375, "learning_rate": 1.1115343617801365e-06, "loss": 1.4844, "num_input_tokens_seen": 75613792, "step": 36190 }, { "epoch": 5.904641487886451, "grad_norm": 2.71875, "learning_rate": 1.1086413639998515e-06, "loss": 2.5284, "num_input_tokens_seen": 75624720, "step": 36195 }, { "epoch": 5.905457215107268, "grad_norm": 11.875, "learning_rate": 1.1057520505673103e-06, "loss": 2.556, "num_input_tokens_seen": 75634096, "step": 36200 }, { "epoch": 5.905457215107268, "eval_loss": 2.5377860069274902, "eval_runtime": 134.8125, "eval_samples_per_second": 20.213, "eval_steps_per_second": 10.11, "num_input_tokens_seen": 75634096, "step": 36200 }, { "epoch": 5.906272942328085, "grad_norm": 5.5625, "learning_rate": 1.1028664219280727e-06, "loss": 2.7059, "num_input_tokens_seen": 75644848, "step": 36205 }, { "epoch": 5.907088669548903, "grad_norm": 2.953125, "learning_rate": 1.0999844785271468e-06, "loss": 2.6724, "num_input_tokens_seen": 75654928, "step": 36210 }, { "epoch": 5.9079043967697205, "grad_norm": 9.3125, "learning_rate": 1.097106220808955e-06, "loss": 3.5023, "num_input_tokens_seen": 75663952, "step": 36215 }, { "epoch": 5.908720123990538, "grad_norm": 11.3125, "learning_rate": 1.0942316492173698e-06, "loss": 2.9748, "num_input_tokens_seen": 75674032, "step": 36220 }, { "epoch": 5.909535851211355, "grad_norm": 8.375, "learning_rate": 1.0913607641956841e-06, "loss": 2.7445, "num_input_tokens_seen": 75684448, "step": 36225 }, { "epoch": 5.910351578432172, "grad_norm": 2.203125, "learning_rate": 1.0884935661866213e-06, "loss": 1.8918, "num_input_tokens_seen": 75694512, "step": 36230 }, { "epoch": 5.91116730565299, "grad_norm": 2.296875, "learning_rate": 1.0856300556323418e-06, "loss": 1.2326, "num_input_tokens_seen": 75705120, "step": 36235 }, { "epoch": 5.911983032873807, "grad_norm": 0.18359375, "learning_rate": 1.0827702329744365e-06, "loss": 2.3322, "num_input_tokens_seen": 75714496, "step": 36240 }, { "epoch": 5.912798760094624, "grad_norm": 14.25, "learning_rate": 1.0799140986539197e-06, "loss": 2.5611, "num_input_tokens_seen": 75726192, "step": 36245 }, { "epoch": 5.913614487315442, "grad_norm": 5.0625, "learning_rate": 1.0770616531112526e-06, "loss": 2.3584, "num_input_tokens_seen": 75736304, "step": 36250 }, { "epoch": 5.914430214536259, "grad_norm": 5.9375, "learning_rate": 1.0742128967863085e-06, "loss": 2.3596, "num_input_tokens_seen": 75745488, "step": 36255 }, { "epoch": 5.915245941757076, "grad_norm": 5.65625, "learning_rate": 1.071367830118411e-06, "loss": 1.9459, "num_input_tokens_seen": 75756176, "step": 36260 }, { "epoch": 5.9160616689778935, "grad_norm": 8.125, "learning_rate": 1.068526453546298e-06, "loss": 1.7194, "num_input_tokens_seen": 75767296, "step": 36265 }, { "epoch": 5.916877396198711, "grad_norm": 8.625, "learning_rate": 1.0656887675081467e-06, "loss": 2.5359, "num_input_tokens_seen": 75778384, "step": 36270 }, { "epoch": 5.917693123419529, "grad_norm": 7.8125, "learning_rate": 1.0628547724415628e-06, "loss": 1.9387, "num_input_tokens_seen": 75787296, "step": 36275 }, { "epoch": 5.918508850640346, "grad_norm": 2.6875, "learning_rate": 1.0600244687835881e-06, "loss": 1.1133, "num_input_tokens_seen": 75797280, "step": 36280 }, { "epoch": 5.919324577861163, "grad_norm": 4.875, "learning_rate": 1.0571978569706876e-06, "loss": 1.4102, "num_input_tokens_seen": 75807408, "step": 36285 }, { "epoch": 5.920140305081981, "grad_norm": 8.8125, "learning_rate": 1.0543749374387652e-06, "loss": 1.9541, "num_input_tokens_seen": 75816800, "step": 36290 }, { "epoch": 5.920956032302798, "grad_norm": 10.125, "learning_rate": 1.051555710623142e-06, "loss": 2.1324, "num_input_tokens_seen": 75827808, "step": 36295 }, { "epoch": 5.921771759523615, "grad_norm": 13.125, "learning_rate": 1.0487401769585847e-06, "loss": 4.4146, "num_input_tokens_seen": 75839584, "step": 36300 }, { "epoch": 5.922587486744432, "grad_norm": 5.84375, "learning_rate": 1.0459283368792845e-06, "loss": 1.9754, "num_input_tokens_seen": 75849248, "step": 36305 }, { "epoch": 5.92340321396525, "grad_norm": 4.875, "learning_rate": 1.043120190818858e-06, "loss": 2.6273, "num_input_tokens_seen": 75860112, "step": 36310 }, { "epoch": 5.9242189411860675, "grad_norm": 5.875, "learning_rate": 1.0403157392103596e-06, "loss": 2.5857, "num_input_tokens_seen": 75871200, "step": 36315 }, { "epoch": 5.925034668406885, "grad_norm": 7.03125, "learning_rate": 1.0375149824862735e-06, "loss": 2.8168, "num_input_tokens_seen": 75882256, "step": 36320 }, { "epoch": 5.925850395627702, "grad_norm": 2.828125, "learning_rate": 1.034717921078507e-06, "loss": 2.4316, "num_input_tokens_seen": 75893184, "step": 36325 }, { "epoch": 5.92666612284852, "grad_norm": 8.375, "learning_rate": 1.0319245554184009e-06, "loss": 3.6432, "num_input_tokens_seen": 75902352, "step": 36330 }, { "epoch": 5.927481850069337, "grad_norm": 7.3125, "learning_rate": 1.0291348859367361e-06, "loss": 1.7434, "num_input_tokens_seen": 75913216, "step": 36335 }, { "epoch": 5.928297577290154, "grad_norm": 6.9375, "learning_rate": 1.0263489130637016e-06, "loss": 1.9111, "num_input_tokens_seen": 75924304, "step": 36340 }, { "epoch": 5.929113304510971, "grad_norm": 10.875, "learning_rate": 1.0235666372289427e-06, "loss": 1.6741, "num_input_tokens_seen": 75934400, "step": 36345 }, { "epoch": 5.929929031731789, "grad_norm": 7.625, "learning_rate": 1.0207880588615076e-06, "loss": 2.5774, "num_input_tokens_seen": 75943616, "step": 36350 }, { "epoch": 5.930744758952606, "grad_norm": 10.0, "learning_rate": 1.0180131783898984e-06, "loss": 1.9461, "num_input_tokens_seen": 75954736, "step": 36355 }, { "epoch": 5.9315604861734235, "grad_norm": 6.125, "learning_rate": 1.0152419962420362e-06, "loss": 1.9901, "num_input_tokens_seen": 75965424, "step": 36360 }, { "epoch": 5.932376213394241, "grad_norm": 8.6875, "learning_rate": 1.0124745128452685e-06, "loss": 1.7863, "num_input_tokens_seen": 75975680, "step": 36365 }, { "epoch": 5.933191940615059, "grad_norm": 3.0, "learning_rate": 1.0097107286263758e-06, "loss": 1.6795, "num_input_tokens_seen": 75985360, "step": 36370 }, { "epoch": 5.934007667835876, "grad_norm": 0.77734375, "learning_rate": 1.00695064401157e-06, "loss": 2.202, "num_input_tokens_seen": 75994256, "step": 36375 }, { "epoch": 5.934823395056693, "grad_norm": 15.375, "learning_rate": 1.0041942594264886e-06, "loss": 2.1379, "num_input_tokens_seen": 76004416, "step": 36380 }, { "epoch": 5.935639122277511, "grad_norm": 3.515625, "learning_rate": 1.001441575296208e-06, "loss": 2.2854, "num_input_tokens_seen": 76013728, "step": 36385 }, { "epoch": 5.936454849498328, "grad_norm": 7.96875, "learning_rate": 9.986925920452139e-07, "loss": 1.5516, "num_input_tokens_seen": 76025168, "step": 36390 }, { "epoch": 5.937270576719145, "grad_norm": 5.625, "learning_rate": 9.959473100974475e-07, "loss": 2.4595, "num_input_tokens_seen": 76035776, "step": 36395 }, { "epoch": 5.938086303939962, "grad_norm": 8.1875, "learning_rate": 9.932057298762564e-07, "loss": 1.1027, "num_input_tokens_seen": 76046144, "step": 36400 }, { "epoch": 5.938086303939962, "eval_loss": 2.538872003555298, "eval_runtime": 134.7728, "eval_samples_per_second": 20.219, "eval_steps_per_second": 10.113, "num_input_tokens_seen": 76046144, "step": 36400 }, { "epoch": 5.938902031160779, "grad_norm": 4.34375, "learning_rate": 9.90467851804433e-07, "loss": 3.117, "num_input_tokens_seen": 76057040, "step": 36405 }, { "epoch": 5.939717758381597, "grad_norm": 10.25, "learning_rate": 9.877336763041895e-07, "loss": 4.3023, "num_input_tokens_seen": 76066784, "step": 36410 }, { "epoch": 5.940533485602415, "grad_norm": 5.5625, "learning_rate": 9.850032037971662e-07, "loss": 1.2294, "num_input_tokens_seen": 76076880, "step": 36415 }, { "epoch": 5.941349212823232, "grad_norm": 6.4375, "learning_rate": 9.822764347044406e-07, "loss": 2.5526, "num_input_tokens_seen": 76086960, "step": 36420 }, { "epoch": 5.94216494004405, "grad_norm": 10.8125, "learning_rate": 9.795533694465175e-07, "loss": 1.8407, "num_input_tokens_seen": 76098240, "step": 36425 }, { "epoch": 5.942980667264867, "grad_norm": 9.75, "learning_rate": 9.768340084433197e-07, "loss": 3.0187, "num_input_tokens_seen": 76105760, "step": 36430 }, { "epoch": 5.943796394485684, "grad_norm": 10.6875, "learning_rate": 9.741183521142143e-07, "loss": 1.9054, "num_input_tokens_seen": 76114832, "step": 36435 }, { "epoch": 5.944612121706501, "grad_norm": 2.546875, "learning_rate": 9.714064008779889e-07, "loss": 1.9391, "num_input_tokens_seen": 76124800, "step": 36440 }, { "epoch": 5.945427848927318, "grad_norm": 9.125, "learning_rate": 9.686981551528584e-07, "loss": 2.9405, "num_input_tokens_seen": 76136336, "step": 36445 }, { "epoch": 5.946243576148136, "grad_norm": 9.75, "learning_rate": 9.65993615356467e-07, "loss": 2.5187, "num_input_tokens_seen": 76147744, "step": 36450 }, { "epoch": 5.947059303368953, "grad_norm": 4.71875, "learning_rate": 9.632927819058917e-07, "loss": 1.441, "num_input_tokens_seen": 76157040, "step": 36455 }, { "epoch": 5.9478750305897705, "grad_norm": 3.875, "learning_rate": 9.605956552176305e-07, "loss": 2.0194, "num_input_tokens_seen": 76167424, "step": 36460 }, { "epoch": 5.9486907578105885, "grad_norm": 4.9375, "learning_rate": 9.579022357076223e-07, "loss": 1.8622, "num_input_tokens_seen": 76177216, "step": 36465 }, { "epoch": 5.949506485031406, "grad_norm": 6.71875, "learning_rate": 9.552125237912158e-07, "loss": 2.2342, "num_input_tokens_seen": 76186768, "step": 36470 }, { "epoch": 5.950322212252223, "grad_norm": 6.46875, "learning_rate": 9.525265198832096e-07, "loss": 1.7613, "num_input_tokens_seen": 76197264, "step": 36475 }, { "epoch": 5.95113793947304, "grad_norm": 3.46875, "learning_rate": 9.498442243978112e-07, "loss": 2.5866, "num_input_tokens_seen": 76207232, "step": 36480 }, { "epoch": 5.951953666693858, "grad_norm": 6.5, "learning_rate": 9.471656377486649e-07, "loss": 1.9161, "num_input_tokens_seen": 76217920, "step": 36485 }, { "epoch": 5.952769393914675, "grad_norm": 4.53125, "learning_rate": 9.444907603488456e-07, "loss": 2.4496, "num_input_tokens_seen": 76228608, "step": 36490 }, { "epoch": 5.953585121135492, "grad_norm": 5.0625, "learning_rate": 9.418195926108514e-07, "loss": 3.4008, "num_input_tokens_seen": 76238656, "step": 36495 }, { "epoch": 5.954400848356309, "grad_norm": 5.1875, "learning_rate": 9.391521349466053e-07, "loss": 2.6301, "num_input_tokens_seen": 76247696, "step": 36500 }, { "epoch": 5.955216575577127, "grad_norm": 5.15625, "learning_rate": 9.364883877674758e-07, "loss": 2.1221, "num_input_tokens_seen": 76258256, "step": 36505 }, { "epoch": 5.9560323027979445, "grad_norm": 10.5, "learning_rate": 9.33828351484231e-07, "loss": 2.3932, "num_input_tokens_seen": 76268544, "step": 36510 }, { "epoch": 5.956848030018762, "grad_norm": 7.84375, "learning_rate": 9.311720265070906e-07, "loss": 2.5697, "num_input_tokens_seen": 76277440, "step": 36515 }, { "epoch": 5.957663757239579, "grad_norm": 6.78125, "learning_rate": 9.285194132456931e-07, "loss": 3.8001, "num_input_tokens_seen": 76288512, "step": 36520 }, { "epoch": 5.958479484460397, "grad_norm": 6.21875, "learning_rate": 9.258705121091032e-07, "loss": 2.5367, "num_input_tokens_seen": 76297424, "step": 36525 }, { "epoch": 5.959295211681214, "grad_norm": 9.9375, "learning_rate": 9.232253235058136e-07, "loss": 1.6011, "num_input_tokens_seen": 76308160, "step": 36530 }, { "epoch": 5.960110938902031, "grad_norm": 7.0625, "learning_rate": 9.205838478437478e-07, "loss": 1.4005, "num_input_tokens_seen": 76320032, "step": 36535 }, { "epoch": 5.960926666122848, "grad_norm": 4.78125, "learning_rate": 9.179460855302524e-07, "loss": 1.4494, "num_input_tokens_seen": 76330368, "step": 36540 }, { "epoch": 5.961742393343666, "grad_norm": 6.4375, "learning_rate": 9.153120369721046e-07, "loss": 2.7868, "num_input_tokens_seen": 76340896, "step": 36545 }, { "epoch": 5.962558120564483, "grad_norm": 11.3125, "learning_rate": 9.126817025755103e-07, "loss": 2.6861, "num_input_tokens_seen": 76350032, "step": 36550 }, { "epoch": 5.9633738477853, "grad_norm": 8.625, "learning_rate": 9.100550827460947e-07, "loss": 1.9918, "num_input_tokens_seen": 76361312, "step": 36555 }, { "epoch": 5.964189575006118, "grad_norm": 8.4375, "learning_rate": 9.0743217788892e-07, "loss": 2.1494, "num_input_tokens_seen": 76370128, "step": 36560 }, { "epoch": 5.965005302226936, "grad_norm": 8.9375, "learning_rate": 9.048129884084683e-07, "loss": 1.7838, "num_input_tokens_seen": 76379952, "step": 36565 }, { "epoch": 5.965821029447753, "grad_norm": 8.25, "learning_rate": 9.021975147086553e-07, "loss": 3.7507, "num_input_tokens_seen": 76390992, "step": 36570 }, { "epoch": 5.96663675666857, "grad_norm": 4.875, "learning_rate": 8.995857571928141e-07, "loss": 1.2415, "num_input_tokens_seen": 76402128, "step": 36575 }, { "epoch": 5.967452483889387, "grad_norm": 3.46875, "learning_rate": 8.969777162637139e-07, "loss": 3.3688, "num_input_tokens_seen": 76412672, "step": 36580 }, { "epoch": 5.968268211110205, "grad_norm": 10.8125, "learning_rate": 8.943733923235525e-07, "loss": 2.6664, "num_input_tokens_seen": 76422704, "step": 36585 }, { "epoch": 5.969083938331022, "grad_norm": 5.4375, "learning_rate": 8.917727857739394e-07, "loss": 2.5651, "num_input_tokens_seen": 76432464, "step": 36590 }, { "epoch": 5.969899665551839, "grad_norm": 7.59375, "learning_rate": 8.891758970159258e-07, "loss": 1.8628, "num_input_tokens_seen": 76443008, "step": 36595 }, { "epoch": 5.970715392772657, "grad_norm": 4.375, "learning_rate": 8.86582726449986e-07, "loss": 2.3357, "num_input_tokens_seen": 76453936, "step": 36600 }, { "epoch": 5.970715392772657, "eval_loss": 2.5388035774230957, "eval_runtime": 134.7745, "eval_samples_per_second": 20.219, "eval_steps_per_second": 10.113, "num_input_tokens_seen": 76453936, "step": 36600 }, { "epoch": 5.971531119993474, "grad_norm": 6.71875, "learning_rate": 8.839932744760165e-07, "loss": 2.2442, "num_input_tokens_seen": 76464688, "step": 36605 }, { "epoch": 5.9723468472142915, "grad_norm": 7.3125, "learning_rate": 8.814075414933482e-07, "loss": 1.996, "num_input_tokens_seen": 76474640, "step": 36610 }, { "epoch": 5.973162574435109, "grad_norm": 8.125, "learning_rate": 8.788255279007257e-07, "loss": 3.8936, "num_input_tokens_seen": 76484944, "step": 36615 }, { "epoch": 5.973978301655926, "grad_norm": 5.75, "learning_rate": 8.762472340963362e-07, "loss": 1.64, "num_input_tokens_seen": 76495760, "step": 36620 }, { "epoch": 5.974794028876744, "grad_norm": 5.28125, "learning_rate": 8.736726604777811e-07, "loss": 2.0511, "num_input_tokens_seen": 76505008, "step": 36625 }, { "epoch": 5.975609756097561, "grad_norm": 8.5, "learning_rate": 8.711018074420901e-07, "loss": 2.9678, "num_input_tokens_seen": 76516160, "step": 36630 }, { "epoch": 5.976425483318378, "grad_norm": 11.75, "learning_rate": 8.685346753857209e-07, "loss": 3.3453, "num_input_tokens_seen": 76527360, "step": 36635 }, { "epoch": 5.977241210539196, "grad_norm": 2.15625, "learning_rate": 8.659712647045654e-07, "loss": 2.418, "num_input_tokens_seen": 76537792, "step": 36640 }, { "epoch": 5.978056937760013, "grad_norm": 6.90625, "learning_rate": 8.634115757939209e-07, "loss": 3.0223, "num_input_tokens_seen": 76548192, "step": 36645 }, { "epoch": 5.97887266498083, "grad_norm": 7.34375, "learning_rate": 8.608556090485387e-07, "loss": 2.4447, "num_input_tokens_seen": 76558736, "step": 36650 }, { "epoch": 5.9796883922016475, "grad_norm": 9.5625, "learning_rate": 8.583033648625671e-07, "loss": 2.1453, "num_input_tokens_seen": 76569808, "step": 36655 }, { "epoch": 5.9805041194224655, "grad_norm": 11.6875, "learning_rate": 8.557548436295998e-07, "loss": 3.1722, "num_input_tokens_seen": 76580848, "step": 36660 }, { "epoch": 5.981319846643283, "grad_norm": 6.375, "learning_rate": 8.532100457426556e-07, "loss": 2.6172, "num_input_tokens_seen": 76591184, "step": 36665 }, { "epoch": 5.9821355738641, "grad_norm": 19.75, "learning_rate": 8.506689715941679e-07, "loss": 3.0932, "num_input_tokens_seen": 76600016, "step": 36670 }, { "epoch": 5.982951301084917, "grad_norm": 6.96875, "learning_rate": 8.481316215760011e-07, "loss": 2.2336, "num_input_tokens_seen": 76611648, "step": 36675 }, { "epoch": 5.983767028305735, "grad_norm": 4.9375, "learning_rate": 8.455979960794558e-07, "loss": 2.5699, "num_input_tokens_seen": 76620976, "step": 36680 }, { "epoch": 5.984582755526552, "grad_norm": 7.09375, "learning_rate": 8.430680954952364e-07, "loss": 2.7929, "num_input_tokens_seen": 76631728, "step": 36685 }, { "epoch": 5.985398482747369, "grad_norm": 15.5625, "learning_rate": 8.405419202134974e-07, "loss": 1.7288, "num_input_tokens_seen": 76642432, "step": 36690 }, { "epoch": 5.986214209968186, "grad_norm": 7.90625, "learning_rate": 8.380194706237993e-07, "loss": 2.1495, "num_input_tokens_seen": 76653600, "step": 36695 }, { "epoch": 5.987029937189004, "grad_norm": 6.46875, "learning_rate": 8.355007471151366e-07, "loss": 2.1034, "num_input_tokens_seen": 76663904, "step": 36700 }, { "epoch": 5.9878456644098215, "grad_norm": 0.3203125, "learning_rate": 8.329857500759292e-07, "loss": 1.1883, "num_input_tokens_seen": 76674880, "step": 36705 }, { "epoch": 5.988661391630639, "grad_norm": 11.0, "learning_rate": 8.304744798940194e-07, "loss": 2.8017, "num_input_tokens_seen": 76685600, "step": 36710 }, { "epoch": 5.989477118851456, "grad_norm": 7.15625, "learning_rate": 8.279669369566756e-07, "loss": 3.378, "num_input_tokens_seen": 76695392, "step": 36715 }, { "epoch": 5.990292846072274, "grad_norm": 3.453125, "learning_rate": 8.254631216505993e-07, "loss": 3.5544, "num_input_tokens_seen": 76704784, "step": 36720 }, { "epoch": 5.991108573293091, "grad_norm": 6.09375, "learning_rate": 8.229630343619038e-07, "loss": 1.9652, "num_input_tokens_seen": 76715152, "step": 36725 }, { "epoch": 5.991924300513908, "grad_norm": 7.46875, "learning_rate": 8.204666754761392e-07, "loss": 1.965, "num_input_tokens_seen": 76726512, "step": 36730 }, { "epoch": 5.992740027734725, "grad_norm": 7.84375, "learning_rate": 8.179740453782669e-07, "loss": 2.1505, "num_input_tokens_seen": 76736720, "step": 36735 }, { "epoch": 5.993555754955543, "grad_norm": 3.0, "learning_rate": 8.154851444526907e-07, "loss": 3.4087, "num_input_tokens_seen": 76748640, "step": 36740 }, { "epoch": 5.99437148217636, "grad_norm": 7.125, "learning_rate": 8.129999730832283e-07, "loss": 1.96, "num_input_tokens_seen": 76758736, "step": 36745 }, { "epoch": 5.995187209397177, "grad_norm": 4.8125, "learning_rate": 8.105185316531178e-07, "loss": 1.8015, "num_input_tokens_seen": 76767664, "step": 36750 }, { "epoch": 5.9960029366179945, "grad_norm": 6.78125, "learning_rate": 8.08040820545039e-07, "loss": 2.0898, "num_input_tokens_seen": 76778704, "step": 36755 }, { "epoch": 5.996818663838813, "grad_norm": 3.6875, "learning_rate": 8.055668401410782e-07, "loss": 2.6966, "num_input_tokens_seen": 76789360, "step": 36760 }, { "epoch": 5.99763439105963, "grad_norm": 1.09375, "learning_rate": 8.030965908227578e-07, "loss": 1.8653, "num_input_tokens_seen": 76799728, "step": 36765 }, { "epoch": 5.998450118280447, "grad_norm": 4.46875, "learning_rate": 8.006300729710203e-07, "loss": 2.5859, "num_input_tokens_seen": 76809392, "step": 36770 }, { "epoch": 5.999265845501265, "grad_norm": 3.390625, "learning_rate": 7.981672869662337e-07, "loss": 0.9555, "num_input_tokens_seen": 76818672, "step": 36775 }, { "epoch": 6.0, "grad_norm": 5.6875, "learning_rate": 7.957082331881888e-07, "loss": 1.3854, "num_input_tokens_seen": 76827984, "step": 36780 }, { "epoch": 6.000815727220817, "grad_norm": 7.875, "learning_rate": 7.932529120161069e-07, "loss": 1.995, "num_input_tokens_seen": 76838816, "step": 36785 }, { "epoch": 6.001631454441635, "grad_norm": 6.65625, "learning_rate": 7.908013238286243e-07, "loss": 2.7211, "num_input_tokens_seen": 76850432, "step": 36790 }, { "epoch": 6.002447181662452, "grad_norm": 9.9375, "learning_rate": 7.883534690038136e-07, "loss": 2.9282, "num_input_tokens_seen": 76862288, "step": 36795 }, { "epoch": 6.003262908883269, "grad_norm": 9.6875, "learning_rate": 7.859093479191559e-07, "loss": 1.7899, "num_input_tokens_seen": 76873152, "step": 36800 }, { "epoch": 6.003262908883269, "eval_loss": 2.5388035774230957, "eval_runtime": 134.714, "eval_samples_per_second": 20.228, "eval_steps_per_second": 10.118, "num_input_tokens_seen": 76873152, "step": 36800 }, { "epoch": 6.0040786361040865, "grad_norm": 11.125, "learning_rate": 7.834689609515722e-07, "loss": 2.9852, "num_input_tokens_seen": 76883280, "step": 36805 }, { "epoch": 6.004894363324905, "grad_norm": 2.015625, "learning_rate": 7.810323084774002e-07, "loss": 1.0761, "num_input_tokens_seen": 76893440, "step": 36810 }, { "epoch": 6.005710090545722, "grad_norm": 10.125, "learning_rate": 7.785993908723976e-07, "loss": 2.0809, "num_input_tokens_seen": 76903248, "step": 36815 }, { "epoch": 6.006525817766539, "grad_norm": 6.8125, "learning_rate": 7.761702085117534e-07, "loss": 1.9467, "num_input_tokens_seen": 76912240, "step": 36820 }, { "epoch": 6.007341544987356, "grad_norm": 15.9375, "learning_rate": 7.737447617700844e-07, "loss": 2.7685, "num_input_tokens_seen": 76923536, "step": 36825 }, { "epoch": 6.008157272208174, "grad_norm": 4.5625, "learning_rate": 7.713230510214136e-07, "loss": 1.8432, "num_input_tokens_seen": 76933200, "step": 36830 }, { "epoch": 6.008972999428991, "grad_norm": 7.1875, "learning_rate": 7.689050766392092e-07, "loss": 3.5864, "num_input_tokens_seen": 76941984, "step": 36835 }, { "epoch": 6.009788726649808, "grad_norm": 2.578125, "learning_rate": 7.664908389963477e-07, "loss": 2.4026, "num_input_tokens_seen": 76952912, "step": 36840 }, { "epoch": 6.010604453870625, "grad_norm": 1.3671875, "learning_rate": 7.64080338465134e-07, "loss": 1.7915, "num_input_tokens_seen": 76962512, "step": 36845 }, { "epoch": 6.011420181091443, "grad_norm": 11.25, "learning_rate": 7.616735754173043e-07, "loss": 2.6586, "num_input_tokens_seen": 76973392, "step": 36850 }, { "epoch": 6.0122359083122605, "grad_norm": 6.78125, "learning_rate": 7.592705502240005e-07, "loss": 2.9429, "num_input_tokens_seen": 76984064, "step": 36855 }, { "epoch": 6.013051635533078, "grad_norm": 10.9375, "learning_rate": 7.568712632558095e-07, "loss": 2.6769, "num_input_tokens_seen": 76995280, "step": 36860 }, { "epoch": 6.013867362753895, "grad_norm": 7.25, "learning_rate": 7.544757148827297e-07, "loss": 1.708, "num_input_tokens_seen": 77006688, "step": 36865 }, { "epoch": 6.014683089974713, "grad_norm": 3.171875, "learning_rate": 7.520839054741797e-07, "loss": 1.639, "num_input_tokens_seen": 77018864, "step": 36870 }, { "epoch": 6.01549881719553, "grad_norm": 10.875, "learning_rate": 7.496958353990113e-07, "loss": 1.4967, "num_input_tokens_seen": 77030208, "step": 36875 }, { "epoch": 6.016314544416347, "grad_norm": 9.375, "learning_rate": 7.473115050254941e-07, "loss": 2.11, "num_input_tokens_seen": 77041664, "step": 36880 }, { "epoch": 6.017130271637164, "grad_norm": 1.5546875, "learning_rate": 7.449309147213173e-07, "loss": 3.1034, "num_input_tokens_seen": 77053616, "step": 36885 }, { "epoch": 6.017945998857982, "grad_norm": 6.53125, "learning_rate": 7.425540648536067e-07, "loss": 1.8416, "num_input_tokens_seen": 77062064, "step": 36890 }, { "epoch": 6.018761726078799, "grad_norm": 6.875, "learning_rate": 7.40180955788894e-07, "loss": 2.0576, "num_input_tokens_seen": 77072272, "step": 36895 }, { "epoch": 6.0195774532996165, "grad_norm": 8.25, "learning_rate": 7.378115878931474e-07, "loss": 3.5937, "num_input_tokens_seen": 77082320, "step": 36900 }, { "epoch": 6.020393180520434, "grad_norm": 1.953125, "learning_rate": 7.354459615317527e-07, "loss": 2.0419, "num_input_tokens_seen": 77091840, "step": 36905 }, { "epoch": 6.021208907741252, "grad_norm": 0.81640625, "learning_rate": 7.33084077069518e-07, "loss": 1.2786, "num_input_tokens_seen": 77101872, "step": 36910 }, { "epoch": 6.022024634962069, "grad_norm": 7.65625, "learning_rate": 7.307259348706768e-07, "loss": 2.5058, "num_input_tokens_seen": 77110336, "step": 36915 }, { "epoch": 6.022840362182886, "grad_norm": 6.84375, "learning_rate": 7.283715352988801e-07, "loss": 2.1307, "num_input_tokens_seen": 77120096, "step": 36920 }, { "epoch": 6.023656089403703, "grad_norm": 4.125, "learning_rate": 7.260208787172068e-07, "loss": 1.9174, "num_input_tokens_seen": 77131136, "step": 36925 }, { "epoch": 6.024471816624521, "grad_norm": 5.40625, "learning_rate": 7.23673965488167e-07, "loss": 1.6429, "num_input_tokens_seen": 77141296, "step": 36930 }, { "epoch": 6.025287543845338, "grad_norm": 6.0625, "learning_rate": 7.213307959736709e-07, "loss": 2.6221, "num_input_tokens_seen": 77153008, "step": 36935 }, { "epoch": 6.026103271066155, "grad_norm": 7.75, "learning_rate": 7.189913705350715e-07, "loss": 1.4906, "num_input_tokens_seen": 77162848, "step": 36940 }, { "epoch": 6.026918998286972, "grad_norm": 0.98046875, "learning_rate": 7.166556895331411e-07, "loss": 2.3008, "num_input_tokens_seen": 77173920, "step": 36945 }, { "epoch": 6.02773472550779, "grad_norm": 3.515625, "learning_rate": 7.143237533280639e-07, "loss": 2.6978, "num_input_tokens_seen": 77183936, "step": 36950 }, { "epoch": 6.028550452728608, "grad_norm": 4.53125, "learning_rate": 7.119955622794578e-07, "loss": 3.0205, "num_input_tokens_seen": 77194256, "step": 36955 }, { "epoch": 6.029366179949425, "grad_norm": 19.125, "learning_rate": 7.096711167463577e-07, "loss": 2.1577, "num_input_tokens_seen": 77205296, "step": 36960 }, { "epoch": 6.030181907170242, "grad_norm": 4.96875, "learning_rate": 7.073504170872213e-07, "loss": 4.1926, "num_input_tokens_seen": 77216608, "step": 36965 }, { "epoch": 6.03099763439106, "grad_norm": 0.73828125, "learning_rate": 7.05033463659932e-07, "loss": 2.1809, "num_input_tokens_seen": 77225840, "step": 36970 }, { "epoch": 6.031813361611877, "grad_norm": 4.875, "learning_rate": 7.027202568217928e-07, "loss": 2.6806, "num_input_tokens_seen": 77237120, "step": 36975 }, { "epoch": 6.032629088832694, "grad_norm": 11.1875, "learning_rate": 7.004107969295293e-07, "loss": 2.6365, "num_input_tokens_seen": 77246736, "step": 36980 }, { "epoch": 6.033444816053512, "grad_norm": 8.625, "learning_rate": 6.9810508433929e-07, "loss": 1.5145, "num_input_tokens_seen": 77258144, "step": 36985 }, { "epoch": 6.034260543274329, "grad_norm": 6.625, "learning_rate": 6.958031194066406e-07, "loss": 1.7922, "num_input_tokens_seen": 77268016, "step": 36990 }, { "epoch": 6.035076270495146, "grad_norm": 3.875, "learning_rate": 6.935049024865776e-07, "loss": 1.9024, "num_input_tokens_seen": 77279504, "step": 36995 }, { "epoch": 6.0358919977159635, "grad_norm": 9.6875, "learning_rate": 6.912104339335118e-07, "loss": 3.1736, "num_input_tokens_seen": 77290000, "step": 37000 }, { "epoch": 6.0358919977159635, "eval_loss": 2.5388035774230957, "eval_runtime": 134.7763, "eval_samples_per_second": 20.219, "eval_steps_per_second": 10.113, "num_input_tokens_seen": 77290000, "step": 37000 }, { "epoch": 6.0367077249367815, "grad_norm": 7.40625, "learning_rate": 6.889197141012799e-07, "loss": 3.0752, "num_input_tokens_seen": 77298624, "step": 37005 }, { "epoch": 6.037523452157599, "grad_norm": 7.53125, "learning_rate": 6.866327433431435e-07, "loss": 2.1342, "num_input_tokens_seen": 77308416, "step": 37010 }, { "epoch": 6.038339179378416, "grad_norm": 9.0, "learning_rate": 6.843495220117735e-07, "loss": 1.8866, "num_input_tokens_seen": 77319728, "step": 37015 }, { "epoch": 6.039154906599233, "grad_norm": 6.5625, "learning_rate": 6.820700504592798e-07, "loss": 2.0237, "num_input_tokens_seen": 77330864, "step": 37020 }, { "epoch": 6.039970633820051, "grad_norm": 9.6875, "learning_rate": 6.797943290371839e-07, "loss": 3.3293, "num_input_tokens_seen": 77341616, "step": 37025 }, { "epoch": 6.040786361040868, "grad_norm": 12.5625, "learning_rate": 6.775223580964274e-07, "loss": 2.8243, "num_input_tokens_seen": 77354208, "step": 37030 }, { "epoch": 6.041602088261685, "grad_norm": 8.0, "learning_rate": 6.7525413798738e-07, "loss": 3.509, "num_input_tokens_seen": 77365424, "step": 37035 }, { "epoch": 6.042417815482502, "grad_norm": 5.6875, "learning_rate": 6.729896690598259e-07, "loss": 1.7226, "num_input_tokens_seen": 77374656, "step": 37040 }, { "epoch": 6.04323354270332, "grad_norm": 5.15625, "learning_rate": 6.707289516629772e-07, "loss": 2.6891, "num_input_tokens_seen": 77385168, "step": 37045 }, { "epoch": 6.0440492699241375, "grad_norm": 0.95703125, "learning_rate": 6.684719861454692e-07, "loss": 1.5253, "num_input_tokens_seen": 77396656, "step": 37050 }, { "epoch": 6.044864997144955, "grad_norm": 6.75, "learning_rate": 6.662187728553481e-07, "loss": 1.3854, "num_input_tokens_seen": 77406192, "step": 37055 }, { "epoch": 6.045680724365772, "grad_norm": 4.25, "learning_rate": 6.639693121400892e-07, "loss": 2.197, "num_input_tokens_seen": 77418224, "step": 37060 }, { "epoch": 6.04649645158659, "grad_norm": 2.0625, "learning_rate": 6.617236043465868e-07, "loss": 1.475, "num_input_tokens_seen": 77429408, "step": 37065 }, { "epoch": 6.047312178807407, "grad_norm": 4.875, "learning_rate": 6.594816498211587e-07, "loss": 2.0617, "num_input_tokens_seen": 77439424, "step": 37070 }, { "epoch": 6.048127906028224, "grad_norm": 6.375, "learning_rate": 6.572434489095447e-07, "loss": 1.9578, "num_input_tokens_seen": 77450400, "step": 37075 }, { "epoch": 6.048943633249041, "grad_norm": 5.21875, "learning_rate": 6.550090019568994e-07, "loss": 1.9321, "num_input_tokens_seen": 77461232, "step": 37080 }, { "epoch": 6.049759360469859, "grad_norm": 9.125, "learning_rate": 6.527783093078027e-07, "loss": 0.9982, "num_input_tokens_seen": 77472656, "step": 37085 }, { "epoch": 6.050575087690676, "grad_norm": 8.9375, "learning_rate": 6.5055137130626e-07, "loss": 3.3248, "num_input_tokens_seen": 77482944, "step": 37090 }, { "epoch": 6.051390814911493, "grad_norm": 13.0, "learning_rate": 6.483281882956854e-07, "loss": 1.7543, "num_input_tokens_seen": 77494544, "step": 37095 }, { "epoch": 6.052206542132311, "grad_norm": 4.5, "learning_rate": 6.461087606189298e-07, "loss": 1.7041, "num_input_tokens_seen": 77503920, "step": 37100 }, { "epoch": 6.053022269353129, "grad_norm": 5.4375, "learning_rate": 6.438930886182554e-07, "loss": 2.4937, "num_input_tokens_seen": 77513344, "step": 37105 }, { "epoch": 6.053837996573946, "grad_norm": 5.3125, "learning_rate": 6.416811726353417e-07, "loss": 2.0693, "num_input_tokens_seen": 77523312, "step": 37110 }, { "epoch": 6.054653723794763, "grad_norm": 10.0625, "learning_rate": 6.394730130112991e-07, "loss": 2.884, "num_input_tokens_seen": 77533088, "step": 37115 }, { "epoch": 6.05546945101558, "grad_norm": 6.125, "learning_rate": 6.372686100866471e-07, "loss": 2.1008, "num_input_tokens_seen": 77542928, "step": 37120 }, { "epoch": 6.056285178236398, "grad_norm": 5.3125, "learning_rate": 6.350679642013413e-07, "loss": 1.6542, "num_input_tokens_seen": 77553328, "step": 37125 }, { "epoch": 6.057100905457215, "grad_norm": 13.5, "learning_rate": 6.328710756947437e-07, "loss": 1.963, "num_input_tokens_seen": 77563808, "step": 37130 }, { "epoch": 6.057916632678032, "grad_norm": 17.25, "learning_rate": 6.306779449056416e-07, "loss": 2.9676, "num_input_tokens_seen": 77574160, "step": 37135 }, { "epoch": 6.058732359898849, "grad_norm": 7.46875, "learning_rate": 6.284885721722422e-07, "loss": 2.2226, "num_input_tokens_seen": 77585888, "step": 37140 }, { "epoch": 6.059548087119667, "grad_norm": 2.984375, "learning_rate": 6.26302957832181e-07, "loss": 2.8989, "num_input_tokens_seen": 77597664, "step": 37145 }, { "epoch": 6.0603638143404845, "grad_norm": 4.59375, "learning_rate": 6.241211022224997e-07, "loss": 2.5862, "num_input_tokens_seen": 77608592, "step": 37150 }, { "epoch": 6.061179541561302, "grad_norm": 3.25, "learning_rate": 6.219430056796732e-07, "loss": 1.7921, "num_input_tokens_seen": 77618736, "step": 37155 }, { "epoch": 6.06199526878212, "grad_norm": 6.6875, "learning_rate": 6.19768668539586e-07, "loss": 3.1248, "num_input_tokens_seen": 77629808, "step": 37160 }, { "epoch": 6.062810996002937, "grad_norm": 7.96875, "learning_rate": 6.175980911375528e-07, "loss": 1.646, "num_input_tokens_seen": 77640416, "step": 37165 }, { "epoch": 6.063626723223754, "grad_norm": 11.0, "learning_rate": 6.154312738083034e-07, "loss": 2.3296, "num_input_tokens_seen": 77651008, "step": 37170 }, { "epoch": 6.064442450444571, "grad_norm": 10.875, "learning_rate": 6.132682168859843e-07, "loss": 2.558, "num_input_tokens_seen": 77660224, "step": 37175 }, { "epoch": 6.065258177665389, "grad_norm": 1.15625, "learning_rate": 6.111089207041704e-07, "loss": 0.9299, "num_input_tokens_seen": 77669808, "step": 37180 }, { "epoch": 6.066073904886206, "grad_norm": 6.75, "learning_rate": 6.089533855958507e-07, "loss": 3.2431, "num_input_tokens_seen": 77679744, "step": 37185 }, { "epoch": 6.066889632107023, "grad_norm": 12.5, "learning_rate": 6.068016118934372e-07, "loss": 3.0661, "num_input_tokens_seen": 77688512, "step": 37190 }, { "epoch": 6.0677053593278405, "grad_norm": 8.6875, "learning_rate": 6.04653599928759e-07, "loss": 1.8537, "num_input_tokens_seen": 77698336, "step": 37195 }, { "epoch": 6.0685210865486585, "grad_norm": 5.25, "learning_rate": 6.025093500330675e-07, "loss": 2.7029, "num_input_tokens_seen": 77708416, "step": 37200 }, { "epoch": 6.0685210865486585, "eval_loss": 2.5388035774230957, "eval_runtime": 134.755, "eval_samples_per_second": 20.222, "eval_steps_per_second": 10.115, "num_input_tokens_seen": 77708416, "step": 37200 }, { "epoch": 6.069336813769476, "grad_norm": 8.875, "learning_rate": 6.003688625370291e-07, "loss": 1.5637, "num_input_tokens_seen": 77718320, "step": 37205 }, { "epoch": 6.070152540990293, "grad_norm": 8.75, "learning_rate": 5.982321377707406e-07, "loss": 1.9735, "num_input_tokens_seen": 77728624, "step": 37210 }, { "epoch": 6.07096826821111, "grad_norm": 6.40625, "learning_rate": 5.96099176063708e-07, "loss": 2.1515, "num_input_tokens_seen": 77738368, "step": 37215 }, { "epoch": 6.071783995431928, "grad_norm": 11.1875, "learning_rate": 5.93969977744857e-07, "loss": 1.6543, "num_input_tokens_seen": 77749792, "step": 37220 }, { "epoch": 6.072599722652745, "grad_norm": 9.125, "learning_rate": 5.918445431425445e-07, "loss": 2.1792, "num_input_tokens_seen": 77759264, "step": 37225 }, { "epoch": 6.073415449873562, "grad_norm": 0.1708984375, "learning_rate": 5.897228725845333e-07, "loss": 2.9578, "num_input_tokens_seen": 77769888, "step": 37230 }, { "epoch": 6.074231177094379, "grad_norm": 6.375, "learning_rate": 5.876049663980171e-07, "loss": 2.6253, "num_input_tokens_seen": 77779120, "step": 37235 }, { "epoch": 6.075046904315197, "grad_norm": 7.71875, "learning_rate": 5.854908249095959e-07, "loss": 2.9896, "num_input_tokens_seen": 77788288, "step": 37240 }, { "epoch": 6.0758626315360145, "grad_norm": 6.09375, "learning_rate": 5.833804484453031e-07, "loss": 2.8922, "num_input_tokens_seen": 77798288, "step": 37245 }, { "epoch": 6.076678358756832, "grad_norm": 7.46875, "learning_rate": 5.81273837330587e-07, "loss": 1.3703, "num_input_tokens_seen": 77809072, "step": 37250 }, { "epoch": 6.077494085977649, "grad_norm": 1.6015625, "learning_rate": 5.791709918903071e-07, "loss": 0.9698, "num_input_tokens_seen": 77819184, "step": 37255 }, { "epoch": 6.078309813198467, "grad_norm": 6.625, "learning_rate": 5.770719124487483e-07, "loss": 2.8986, "num_input_tokens_seen": 77830048, "step": 37260 }, { "epoch": 6.079125540419284, "grad_norm": 2.796875, "learning_rate": 5.749765993296241e-07, "loss": 2.3397, "num_input_tokens_seen": 77840368, "step": 37265 }, { "epoch": 6.079941267640101, "grad_norm": 9.125, "learning_rate": 5.728850528560509e-07, "loss": 3.7692, "num_input_tokens_seen": 77851552, "step": 37270 }, { "epoch": 6.080756994860918, "grad_norm": 9.5, "learning_rate": 5.707972733505707e-07, "loss": 2.729, "num_input_tokens_seen": 77862192, "step": 37275 }, { "epoch": 6.081572722081736, "grad_norm": 3.28125, "learning_rate": 5.687132611351509e-07, "loss": 1.9384, "num_input_tokens_seen": 77872464, "step": 37280 }, { "epoch": 6.082388449302553, "grad_norm": 14.125, "learning_rate": 5.666330165311651e-07, "loss": 3.2284, "num_input_tokens_seen": 77883024, "step": 37285 }, { "epoch": 6.08320417652337, "grad_norm": 7.875, "learning_rate": 5.645565398594204e-07, "loss": 1.9556, "num_input_tokens_seen": 77893824, "step": 37290 }, { "epoch": 6.0840199037441876, "grad_norm": 9.0625, "learning_rate": 5.624838314401304e-07, "loss": 2.8368, "num_input_tokens_seen": 77903056, "step": 37295 }, { "epoch": 6.084835630965006, "grad_norm": 4.1875, "learning_rate": 5.604148915929336e-07, "loss": 2.5564, "num_input_tokens_seen": 77914144, "step": 37300 }, { "epoch": 6.085651358185823, "grad_norm": 4.65625, "learning_rate": 5.583497206368887e-07, "loss": 3.0775, "num_input_tokens_seen": 77924352, "step": 37305 }, { "epoch": 6.08646708540664, "grad_norm": 14.0, "learning_rate": 5.562883188904688e-07, "loss": 2.9592, "num_input_tokens_seen": 77935680, "step": 37310 }, { "epoch": 6.087282812627457, "grad_norm": 12.25, "learning_rate": 5.542306866715724e-07, "loss": 2.5899, "num_input_tokens_seen": 77946192, "step": 37315 }, { "epoch": 6.088098539848275, "grad_norm": 4.5, "learning_rate": 5.52176824297504e-07, "loss": 1.3172, "num_input_tokens_seen": 77957408, "step": 37320 }, { "epoch": 6.088914267069092, "grad_norm": 5.03125, "learning_rate": 5.501267320850018e-07, "loss": 2.8001, "num_input_tokens_seen": 77966896, "step": 37325 }, { "epoch": 6.089729994289909, "grad_norm": 9.875, "learning_rate": 5.480804103502157e-07, "loss": 2.823, "num_input_tokens_seen": 77978240, "step": 37330 }, { "epoch": 6.090545721510727, "grad_norm": 4.90625, "learning_rate": 5.460378594087101e-07, "loss": 2.3668, "num_input_tokens_seen": 77988144, "step": 37335 }, { "epoch": 6.091361448731544, "grad_norm": 0.62890625, "learning_rate": 5.439990795754773e-07, "loss": 1.5266, "num_input_tokens_seen": 77997536, "step": 37340 }, { "epoch": 6.0921771759523615, "grad_norm": 14.5, "learning_rate": 5.419640711649188e-07, "loss": 2.4564, "num_input_tokens_seen": 78007664, "step": 37345 }, { "epoch": 6.092992903173179, "grad_norm": 4.6875, "learning_rate": 5.399328344908583e-07, "loss": 3.1869, "num_input_tokens_seen": 78018256, "step": 37350 }, { "epoch": 6.093808630393997, "grad_norm": 3.125, "learning_rate": 5.379053698665399e-07, "loss": 1.4886, "num_input_tokens_seen": 78028672, "step": 37355 }, { "epoch": 6.094624357614814, "grad_norm": 13.0, "learning_rate": 5.358816776046216e-07, "loss": 1.144, "num_input_tokens_seen": 78039440, "step": 37360 }, { "epoch": 6.095440084835631, "grad_norm": 9.8125, "learning_rate": 5.338617580171817e-07, "loss": 4.3136, "num_input_tokens_seen": 78050848, "step": 37365 }, { "epoch": 6.096255812056448, "grad_norm": 7.15625, "learning_rate": 5.318456114157239e-07, "loss": 2.7774, "num_input_tokens_seen": 78060528, "step": 37370 }, { "epoch": 6.097071539277266, "grad_norm": 5.40625, "learning_rate": 5.298332381111576e-07, "loss": 1.6386, "num_input_tokens_seen": 78071344, "step": 37375 }, { "epoch": 6.097887266498083, "grad_norm": 6.6875, "learning_rate": 5.27824638413818e-07, "loss": 2.3291, "num_input_tokens_seen": 78082832, "step": 37380 }, { "epoch": 6.0987029937189, "grad_norm": 4.6875, "learning_rate": 5.258198126334546e-07, "loss": 2.0565, "num_input_tokens_seen": 78092896, "step": 37385 }, { "epoch": 6.0995187209397175, "grad_norm": 8.625, "learning_rate": 5.238187610792367e-07, "loss": 2.3567, "num_input_tokens_seen": 78103872, "step": 37390 }, { "epoch": 6.1003344481605355, "grad_norm": 4.78125, "learning_rate": 5.218214840597563e-07, "loss": 2.4276, "num_input_tokens_seen": 78113824, "step": 37395 }, { "epoch": 6.101150175381353, "grad_norm": 6.4375, "learning_rate": 5.198279818830115e-07, "loss": 1.6779, "num_input_tokens_seen": 78124432, "step": 37400 }, { "epoch": 6.101150175381353, "eval_loss": 2.5388035774230957, "eval_runtime": 134.7893, "eval_samples_per_second": 20.217, "eval_steps_per_second": 10.112, "num_input_tokens_seen": 78124432, "step": 37400 }, { "epoch": 6.10196590260217, "grad_norm": 5.9375, "learning_rate": 5.178382548564287e-07, "loss": 1.6727, "num_input_tokens_seen": 78133600, "step": 37405 }, { "epoch": 6.102781629822987, "grad_norm": 5.90625, "learning_rate": 5.15852303286854e-07, "loss": 2.3785, "num_input_tokens_seen": 78144688, "step": 37410 }, { "epoch": 6.103597357043805, "grad_norm": 12.4375, "learning_rate": 5.138701274805396e-07, "loss": 2.6792, "num_input_tokens_seen": 78155968, "step": 37415 }, { "epoch": 6.104413084264622, "grad_norm": 3.71875, "learning_rate": 5.118917277431606e-07, "loss": 1.6123, "num_input_tokens_seen": 78165616, "step": 37420 }, { "epoch": 6.105228811485439, "grad_norm": 11.75, "learning_rate": 5.099171043798145e-07, "loss": 1.8633, "num_input_tokens_seen": 78175168, "step": 37425 }, { "epoch": 6.106044538706256, "grad_norm": 3.359375, "learning_rate": 5.079462576950133e-07, "loss": 2.2778, "num_input_tokens_seen": 78184784, "step": 37430 }, { "epoch": 6.106860265927074, "grad_norm": 2.890625, "learning_rate": 5.059791879926862e-07, "loss": 2.5979, "num_input_tokens_seen": 78195328, "step": 37435 }, { "epoch": 6.1076759931478914, "grad_norm": 9.4375, "learning_rate": 5.040158955761793e-07, "loss": 1.1616, "num_input_tokens_seen": 78206128, "step": 37440 }, { "epoch": 6.108491720368709, "grad_norm": 10.5625, "learning_rate": 5.020563807482559e-07, "loss": 3.0559, "num_input_tokens_seen": 78215264, "step": 37445 }, { "epoch": 6.109307447589526, "grad_norm": 10.625, "learning_rate": 5.001006438110995e-07, "loss": 2.7784, "num_input_tokens_seen": 78224640, "step": 37450 }, { "epoch": 6.110123174810344, "grad_norm": 9.375, "learning_rate": 4.981486850663075e-07, "loss": 1.4426, "num_input_tokens_seen": 78235056, "step": 37455 }, { "epoch": 6.110938902031161, "grad_norm": 3.171875, "learning_rate": 4.962005048149005e-07, "loss": 1.2803, "num_input_tokens_seen": 78245984, "step": 37460 }, { "epoch": 6.111754629251978, "grad_norm": 14.0625, "learning_rate": 4.942561033573073e-07, "loss": 2.209, "num_input_tokens_seen": 78256080, "step": 37465 }, { "epoch": 6.112570356472795, "grad_norm": 5.375, "learning_rate": 4.923154809933827e-07, "loss": 2.2469, "num_input_tokens_seen": 78265104, "step": 37470 }, { "epoch": 6.113386083693613, "grad_norm": 5.0625, "learning_rate": 4.903786380223957e-07, "loss": 2.0929, "num_input_tokens_seen": 78274992, "step": 37475 }, { "epoch": 6.11420181091443, "grad_norm": 6.21875, "learning_rate": 4.884455747430266e-07, "loss": 2.3949, "num_input_tokens_seen": 78286320, "step": 37480 }, { "epoch": 6.115017538135247, "grad_norm": 3.9375, "learning_rate": 4.865162914533816e-07, "loss": 2.7219, "num_input_tokens_seen": 78298160, "step": 37485 }, { "epoch": 6.1158332653560645, "grad_norm": 17.5, "learning_rate": 4.845907884509809e-07, "loss": 2.1082, "num_input_tokens_seen": 78308624, "step": 37490 }, { "epoch": 6.1166489925768825, "grad_norm": 5.125, "learning_rate": 4.82669066032762e-07, "loss": 1.5608, "num_input_tokens_seen": 78318560, "step": 37495 }, { "epoch": 6.1174647197977, "grad_norm": 8.625, "learning_rate": 4.807511244950768e-07, "loss": 2.5177, "num_input_tokens_seen": 78329296, "step": 37500 }, { "epoch": 6.118280447018517, "grad_norm": 5.65625, "learning_rate": 4.788369641336943e-07, "loss": 2.8352, "num_input_tokens_seen": 78339648, "step": 37505 }, { "epoch": 6.119096174239334, "grad_norm": 4.71875, "learning_rate": 4.769265852438032e-07, "loss": 2.8495, "num_input_tokens_seen": 78350320, "step": 37510 }, { "epoch": 6.119911901460152, "grad_norm": 2.6875, "learning_rate": 4.750199881200124e-07, "loss": 1.3317, "num_input_tokens_seen": 78361168, "step": 37515 }, { "epoch": 6.120727628680969, "grad_norm": 7.21875, "learning_rate": 4.7311717305633664e-07, "loss": 2.4742, "num_input_tokens_seen": 78371328, "step": 37520 }, { "epoch": 6.121543355901786, "grad_norm": 3.46875, "learning_rate": 4.7121814034621623e-07, "loss": 2.2901, "num_input_tokens_seen": 78382928, "step": 37525 }, { "epoch": 6.122359083122603, "grad_norm": 6.3125, "learning_rate": 4.693228902825114e-07, "loss": 2.4296, "num_input_tokens_seen": 78393872, "step": 37530 }, { "epoch": 6.123174810343421, "grad_norm": 6.6875, "learning_rate": 4.6743142315748277e-07, "loss": 2.9927, "num_input_tokens_seen": 78405536, "step": 37535 }, { "epoch": 6.1239905375642385, "grad_norm": 8.5, "learning_rate": 4.655437392628276e-07, "loss": 2.7864, "num_input_tokens_seen": 78415200, "step": 37540 }, { "epoch": 6.124806264785056, "grad_norm": 9.6875, "learning_rate": 4.636598388896463e-07, "loss": 2.411, "num_input_tokens_seen": 78425984, "step": 37545 }, { "epoch": 6.125621992005874, "grad_norm": 9.6875, "learning_rate": 4.6177972232845925e-07, "loss": 2.0575, "num_input_tokens_seen": 78436480, "step": 37550 }, { "epoch": 6.126437719226691, "grad_norm": 10.5625, "learning_rate": 4.5990338986920953e-07, "loss": 2.6513, "num_input_tokens_seen": 78447504, "step": 37555 }, { "epoch": 6.127253446447508, "grad_norm": 10.6875, "learning_rate": 4.5803084180124633e-07, "loss": 2.9949, "num_input_tokens_seen": 78457632, "step": 37560 }, { "epoch": 6.128069173668325, "grad_norm": 4.0625, "learning_rate": 4.561620784133386e-07, "loss": 1.9786, "num_input_tokens_seen": 78469408, "step": 37565 }, { "epoch": 6.128884900889143, "grad_norm": 7.71875, "learning_rate": 4.5429709999367796e-07, "loss": 2.0313, "num_input_tokens_seen": 78481472, "step": 37570 }, { "epoch": 6.12970062810996, "grad_norm": 7.59375, "learning_rate": 4.5243590682986223e-07, "loss": 3.0817, "num_input_tokens_seen": 78490992, "step": 37575 }, { "epoch": 6.130516355330777, "grad_norm": 6.625, "learning_rate": 4.5057849920891735e-07, "loss": 1.8336, "num_input_tokens_seen": 78500864, "step": 37580 }, { "epoch": 6.1313320825515945, "grad_norm": 0.34375, "learning_rate": 4.487248774172698e-07, "loss": 2.036, "num_input_tokens_seen": 78511696, "step": 37585 }, { "epoch": 6.1321478097724125, "grad_norm": 9.5, "learning_rate": 4.4687504174077965e-07, "loss": 1.5751, "num_input_tokens_seen": 78522112, "step": 37590 }, { "epoch": 6.13296353699323, "grad_norm": 15.75, "learning_rate": 4.450289924647133e-07, "loss": 3.4028, "num_input_tokens_seen": 78532192, "step": 37595 }, { "epoch": 6.133779264214047, "grad_norm": 7.96875, "learning_rate": 4.431867298737513e-07, "loss": 3.2998, "num_input_tokens_seen": 78542400, "step": 37600 }, { "epoch": 6.133779264214047, "eval_loss": 2.5388035774230957, "eval_runtime": 134.7096, "eval_samples_per_second": 20.229, "eval_steps_per_second": 10.118, "num_input_tokens_seen": 78542400, "step": 37600 }, { "epoch": 6.134594991434864, "grad_norm": 4.90625, "learning_rate": 4.41348254251997e-07, "loss": 2.2696, "num_input_tokens_seen": 78552080, "step": 37605 }, { "epoch": 6.135410718655682, "grad_norm": 3.03125, "learning_rate": 4.395135658829652e-07, "loss": 0.615, "num_input_tokens_seen": 78561856, "step": 37610 }, { "epoch": 6.136226445876499, "grad_norm": 6.15625, "learning_rate": 4.376826650495852e-07, "loss": 2.2805, "num_input_tokens_seen": 78573328, "step": 37615 }, { "epoch": 6.137042173097316, "grad_norm": 8.1875, "learning_rate": 4.358555520342117e-07, "loss": 2.4129, "num_input_tokens_seen": 78582816, "step": 37620 }, { "epoch": 6.137857900318133, "grad_norm": 6.4375, "learning_rate": 4.3403222711860257e-07, "loss": 2.962, "num_input_tokens_seen": 78593616, "step": 37625 }, { "epoch": 6.138673627538951, "grad_norm": 5.71875, "learning_rate": 4.3221269058394133e-07, "loss": 2.5541, "num_input_tokens_seen": 78605536, "step": 37630 }, { "epoch": 6.139489354759768, "grad_norm": 2.375, "learning_rate": 4.303969427108173e-07, "loss": 2.1021, "num_input_tokens_seen": 78617152, "step": 37635 }, { "epoch": 6.1403050819805856, "grad_norm": 5.84375, "learning_rate": 4.2858498377924825e-07, "loss": 2.5952, "num_input_tokens_seen": 78629088, "step": 37640 }, { "epoch": 6.141120809201403, "grad_norm": 5.9375, "learning_rate": 4.267768140686579e-07, "loss": 0.9042, "num_input_tokens_seen": 78639680, "step": 37645 }, { "epoch": 6.141936536422221, "grad_norm": 13.3125, "learning_rate": 4.2497243385788975e-07, "loss": 3.7055, "num_input_tokens_seen": 78649680, "step": 37650 }, { "epoch": 6.142752263643038, "grad_norm": 8.75, "learning_rate": 4.231718434251991e-07, "loss": 3.3527, "num_input_tokens_seen": 78661200, "step": 37655 }, { "epoch": 6.143567990863855, "grad_norm": 11.3125, "learning_rate": 4.213750430482666e-07, "loss": 2.4022, "num_input_tokens_seen": 78671248, "step": 37660 }, { "epoch": 6.144383718084672, "grad_norm": 10.9375, "learning_rate": 4.1958203300417054e-07, "loss": 2.5857, "num_input_tokens_seen": 78681440, "step": 37665 }, { "epoch": 6.14519944530549, "grad_norm": 11.375, "learning_rate": 4.177928135694259e-07, "loss": 3.3705, "num_input_tokens_seen": 78693152, "step": 37670 }, { "epoch": 6.146015172526307, "grad_norm": 9.3125, "learning_rate": 4.1600738501994807e-07, "loss": 2.4941, "num_input_tokens_seen": 78703936, "step": 37675 }, { "epoch": 6.146830899747124, "grad_norm": 3.34375, "learning_rate": 4.1422574763107237e-07, "loss": 2.5601, "num_input_tokens_seen": 78714448, "step": 37680 }, { "epoch": 6.1476466269679415, "grad_norm": 7.1875, "learning_rate": 4.124479016775512e-07, "loss": 2.2415, "num_input_tokens_seen": 78725568, "step": 37685 }, { "epoch": 6.1484623541887595, "grad_norm": 11.1875, "learning_rate": 4.106738474335514e-07, "loss": 3.1499, "num_input_tokens_seen": 78736144, "step": 37690 }, { "epoch": 6.149278081409577, "grad_norm": 8.0625, "learning_rate": 4.089035851726486e-07, "loss": 2.6113, "num_input_tokens_seen": 78745888, "step": 37695 }, { "epoch": 6.150093808630394, "grad_norm": 7.53125, "learning_rate": 4.0713711516784937e-07, "loss": 3.1926, "num_input_tokens_seen": 78757104, "step": 37700 }, { "epoch": 6.150909535851211, "grad_norm": 4.875, "learning_rate": 4.05374437691558e-07, "loss": 1.6964, "num_input_tokens_seen": 78767344, "step": 37705 }, { "epoch": 6.151725263072029, "grad_norm": 9.9375, "learning_rate": 4.036155530156044e-07, "loss": 2.5212, "num_input_tokens_seen": 78777984, "step": 37710 }, { "epoch": 6.152540990292846, "grad_norm": 16.125, "learning_rate": 4.018604614112298e-07, "loss": 3.8754, "num_input_tokens_seen": 78788976, "step": 37715 }, { "epoch": 6.153356717513663, "grad_norm": 6.78125, "learning_rate": 4.0010916314908996e-07, "loss": 1.253, "num_input_tokens_seen": 78800800, "step": 37720 }, { "epoch": 6.154172444734481, "grad_norm": 10.8125, "learning_rate": 3.983616584992578e-07, "loss": 2.4841, "num_input_tokens_seen": 78811568, "step": 37725 }, { "epoch": 6.154988171955298, "grad_norm": 4.5, "learning_rate": 3.9661794773122595e-07, "loss": 2.8411, "num_input_tokens_seen": 78821792, "step": 37730 }, { "epoch": 6.1558038991761155, "grad_norm": 7.5, "learning_rate": 3.9487803111388777e-07, "loss": 1.961, "num_input_tokens_seen": 78832368, "step": 37735 }, { "epoch": 6.156619626396933, "grad_norm": 12.5625, "learning_rate": 3.9314190891556747e-07, "loss": 3.5449, "num_input_tokens_seen": 78842160, "step": 37740 }, { "epoch": 6.157435353617751, "grad_norm": 12.4375, "learning_rate": 3.914095814039925e-07, "loss": 2.9113, "num_input_tokens_seen": 78852560, "step": 37745 }, { "epoch": 6.158251080838568, "grad_norm": 7.78125, "learning_rate": 3.896810488463104e-07, "loss": 2.0581, "num_input_tokens_seen": 78863808, "step": 37750 }, { "epoch": 6.159066808059385, "grad_norm": 4.53125, "learning_rate": 3.8795631150908565e-07, "loss": 1.3254, "num_input_tokens_seen": 78875072, "step": 37755 }, { "epoch": 6.159882535280202, "grad_norm": 6.1875, "learning_rate": 3.862353696582888e-07, "loss": 2.9676, "num_input_tokens_seen": 78884928, "step": 37760 }, { "epoch": 6.16069826250102, "grad_norm": 11.0, "learning_rate": 3.8451822355931313e-07, "loss": 2.9994, "num_input_tokens_seen": 78895376, "step": 37765 }, { "epoch": 6.161513989721837, "grad_norm": 14.25, "learning_rate": 3.82804873476969e-07, "loss": 2.4806, "num_input_tokens_seen": 78906896, "step": 37770 }, { "epoch": 6.162329716942654, "grad_norm": 5.5, "learning_rate": 3.810953196754702e-07, "loss": 2.4156, "num_input_tokens_seen": 78916768, "step": 37775 }, { "epoch": 6.163145444163471, "grad_norm": 10.875, "learning_rate": 3.793895624184529e-07, "loss": 3.0739, "num_input_tokens_seen": 78927424, "step": 37780 }, { "epoch": 6.1639611713842895, "grad_norm": 3.296875, "learning_rate": 3.776876019689679e-07, "loss": 2.1586, "num_input_tokens_seen": 78938368, "step": 37785 }, { "epoch": 6.164776898605107, "grad_norm": 10.1875, "learning_rate": 3.7598943858947743e-07, "loss": 3.1265, "num_input_tokens_seen": 78947824, "step": 37790 }, { "epoch": 6.165592625825924, "grad_norm": 10.0625, "learning_rate": 3.742950725418637e-07, "loss": 2.5158, "num_input_tokens_seen": 78958352, "step": 37795 }, { "epoch": 6.166408353046741, "grad_norm": 2.609375, "learning_rate": 3.726045040874093e-07, "loss": 2.5407, "num_input_tokens_seen": 78968368, "step": 37800 }, { "epoch": 6.166408353046741, "eval_loss": 2.5388035774230957, "eval_runtime": 134.7131, "eval_samples_per_second": 20.228, "eval_steps_per_second": 10.118, "num_input_tokens_seen": 78968368, "step": 37800 }, { "epoch": 6.167224080267559, "grad_norm": 9.4375, "learning_rate": 3.709177334868308e-07, "loss": 1.8345, "num_input_tokens_seen": 78979024, "step": 37805 }, { "epoch": 6.168039807488376, "grad_norm": 8.0625, "learning_rate": 3.692347610002478e-07, "loss": 2.3452, "num_input_tokens_seen": 78988768, "step": 37810 }, { "epoch": 6.168855534709193, "grad_norm": 5.65625, "learning_rate": 3.675555868871916e-07, "loss": 3.0511, "num_input_tokens_seen": 78998128, "step": 37815 }, { "epoch": 6.16967126193001, "grad_norm": 10.375, "learning_rate": 3.658802114066162e-07, "loss": 2.3439, "num_input_tokens_seen": 79009536, "step": 37820 }, { "epoch": 6.170486989150828, "grad_norm": 5.0, "learning_rate": 3.6420863481688437e-07, "loss": 1.1631, "num_input_tokens_seen": 79020048, "step": 37825 }, { "epoch": 6.171302716371645, "grad_norm": 10.9375, "learning_rate": 3.625408573757705e-07, "loss": 1.6322, "num_input_tokens_seen": 79030624, "step": 37830 }, { "epoch": 6.1721184435924625, "grad_norm": 3.921875, "learning_rate": 3.608768793404743e-07, "loss": 2.5218, "num_input_tokens_seen": 79040512, "step": 37835 }, { "epoch": 6.17293417081328, "grad_norm": 5.21875, "learning_rate": 3.592167009675934e-07, "loss": 2.6723, "num_input_tokens_seen": 79050864, "step": 37840 }, { "epoch": 6.173749898034098, "grad_norm": 8.8125, "learning_rate": 3.575603225131563e-07, "loss": 2.3568, "num_input_tokens_seen": 79061008, "step": 37845 }, { "epoch": 6.174565625254915, "grad_norm": 9.125, "learning_rate": 3.55907744232592e-07, "loss": 2.4561, "num_input_tokens_seen": 79071696, "step": 37850 }, { "epoch": 6.175381352475732, "grad_norm": 5.9375, "learning_rate": 3.5425896638075217e-07, "loss": 1.9724, "num_input_tokens_seen": 79081568, "step": 37855 }, { "epoch": 6.176197079696549, "grad_norm": 12.25, "learning_rate": 3.5261398921189736e-07, "loss": 2.9743, "num_input_tokens_seen": 79090976, "step": 37860 }, { "epoch": 6.177012806917367, "grad_norm": 10.9375, "learning_rate": 3.509728129797024e-07, "loss": 2.4955, "num_input_tokens_seen": 79101808, "step": 37865 }, { "epoch": 6.177828534138184, "grad_norm": 4.78125, "learning_rate": 3.4933543793725656e-07, "loss": 2.5082, "num_input_tokens_seen": 79112272, "step": 37870 }, { "epoch": 6.178644261359001, "grad_norm": 11.0625, "learning_rate": 3.4770186433707163e-07, "loss": 3.1335, "num_input_tokens_seen": 79123088, "step": 37875 }, { "epoch": 6.1794599885798185, "grad_norm": 14.1875, "learning_rate": 3.4607209243105453e-07, "loss": 2.1164, "num_input_tokens_seen": 79133488, "step": 37880 }, { "epoch": 6.1802757158006365, "grad_norm": 5.96875, "learning_rate": 3.444461224705431e-07, "loss": 2.5156, "num_input_tokens_seen": 79144224, "step": 37885 }, { "epoch": 6.181091443021454, "grad_norm": 3.921875, "learning_rate": 3.4282395470628116e-07, "loss": 1.7246, "num_input_tokens_seen": 79153888, "step": 37890 }, { "epoch": 6.181907170242271, "grad_norm": 5.78125, "learning_rate": 3.4120558938842417e-07, "loss": 3.2096, "num_input_tokens_seen": 79164576, "step": 37895 }, { "epoch": 6.182722897463089, "grad_norm": 8.0625, "learning_rate": 3.395910267665503e-07, "loss": 2.5205, "num_input_tokens_seen": 79175104, "step": 37900 }, { "epoch": 6.183538624683906, "grad_norm": 8.9375, "learning_rate": 3.3798026708964094e-07, "loss": 2.7483, "num_input_tokens_seen": 79185472, "step": 37905 }, { "epoch": 6.184354351904723, "grad_norm": 7.96875, "learning_rate": 3.3637331060609456e-07, "loss": 3.2198, "num_input_tokens_seen": 79196848, "step": 37910 }, { "epoch": 6.18517007912554, "grad_norm": 6.75, "learning_rate": 3.3477015756372966e-07, "loss": 1.211, "num_input_tokens_seen": 79206096, "step": 37915 }, { "epoch": 6.185985806346358, "grad_norm": 3.875, "learning_rate": 3.3317080820976785e-07, "loss": 2.4185, "num_input_tokens_seen": 79215904, "step": 37920 }, { "epoch": 6.186801533567175, "grad_norm": 6.34375, "learning_rate": 3.315752627908508e-07, "loss": 3.0412, "num_input_tokens_seen": 79226176, "step": 37925 }, { "epoch": 6.1876172607879925, "grad_norm": 11.0, "learning_rate": 3.299835215530317e-07, "loss": 2.7692, "num_input_tokens_seen": 79236272, "step": 37930 }, { "epoch": 6.18843298800881, "grad_norm": 11.0625, "learning_rate": 3.2839558474177245e-07, "loss": 3.2939, "num_input_tokens_seen": 79246736, "step": 37935 }, { "epoch": 6.189248715229628, "grad_norm": 6.0, "learning_rate": 3.2681145260196056e-07, "loss": 2.7709, "num_input_tokens_seen": 79256384, "step": 37940 }, { "epoch": 6.190064442450445, "grad_norm": 3.203125, "learning_rate": 3.252311253778839e-07, "loss": 2.5421, "num_input_tokens_seen": 79268032, "step": 37945 }, { "epoch": 6.190880169671262, "grad_norm": 9.0, "learning_rate": 3.2365460331325034e-07, "loss": 2.6012, "num_input_tokens_seen": 79279328, "step": 37950 }, { "epoch": 6.191695896892079, "grad_norm": 7.125, "learning_rate": 3.2208188665117934e-07, "loss": 1.3524, "num_input_tokens_seen": 79289616, "step": 37955 }, { "epoch": 6.192511624112897, "grad_norm": 5.5625, "learning_rate": 3.205129756342018e-07, "loss": 2.0632, "num_input_tokens_seen": 79298368, "step": 37960 }, { "epoch": 6.193327351333714, "grad_norm": 8.1875, "learning_rate": 3.189478705042659e-07, "loss": 2.1626, "num_input_tokens_seen": 79306544, "step": 37965 }, { "epoch": 6.194143078554531, "grad_norm": 10.9375, "learning_rate": 3.173865715027341e-07, "loss": 2.0486, "num_input_tokens_seen": 79316144, "step": 37970 }, { "epoch": 6.194958805775348, "grad_norm": 9.4375, "learning_rate": 3.158290788703694e-07, "loss": 3.0197, "num_input_tokens_seen": 79325568, "step": 37975 }, { "epoch": 6.195774532996166, "grad_norm": 5.6875, "learning_rate": 3.1427539284736297e-07, "loss": 2.3251, "num_input_tokens_seen": 79337152, "step": 37980 }, { "epoch": 6.196590260216984, "grad_norm": 4.84375, "learning_rate": 3.127255136733093e-07, "loss": 2.7295, "num_input_tokens_seen": 79346832, "step": 37985 }, { "epoch": 6.197405987437801, "grad_norm": 6.5625, "learning_rate": 3.1117944158722544e-07, "loss": 1.8806, "num_input_tokens_seen": 79357984, "step": 37990 }, { "epoch": 6.198221714658618, "grad_norm": 9.875, "learning_rate": 3.0963717682752635e-07, "loss": 2.1924, "num_input_tokens_seen": 79368224, "step": 37995 }, { "epoch": 6.199037441879436, "grad_norm": 2.390625, "learning_rate": 3.080987196320578e-07, "loss": 1.2663, "num_input_tokens_seen": 79378528, "step": 38000 }, { "epoch": 6.199037441879436, "eval_loss": 2.5388035774230957, "eval_runtime": 134.6952, "eval_samples_per_second": 20.231, "eval_steps_per_second": 10.119, "num_input_tokens_seen": 79378528, "step": 38000 }, { "epoch": 6.199853169100253, "grad_norm": 0.2255859375, "learning_rate": 3.065640702380607e-07, "loss": 1.7786, "num_input_tokens_seen": 79389168, "step": 38005 }, { "epoch": 6.20066889632107, "grad_norm": 2.078125, "learning_rate": 3.050332288822011e-07, "loss": 1.9805, "num_input_tokens_seen": 79399120, "step": 38010 }, { "epoch": 6.201484623541887, "grad_norm": 3.984375, "learning_rate": 3.035061958005542e-07, "loss": 3.0661, "num_input_tokens_seen": 79411024, "step": 38015 }, { "epoch": 6.202300350762705, "grad_norm": 4.53125, "learning_rate": 3.019829712286093e-07, "loss": 1.5562, "num_input_tokens_seen": 79422688, "step": 38020 }, { "epoch": 6.203116077983522, "grad_norm": 10.5625, "learning_rate": 3.004635554012647e-07, "loss": 2.5306, "num_input_tokens_seen": 79432832, "step": 38025 }, { "epoch": 6.2039318052043395, "grad_norm": 9.5, "learning_rate": 2.9894794855283017e-07, "loss": 3.0771, "num_input_tokens_seen": 79444064, "step": 38030 }, { "epoch": 6.204747532425157, "grad_norm": 7.3125, "learning_rate": 2.9743615091703816e-07, "loss": 1.8517, "num_input_tokens_seen": 79453808, "step": 38035 }, { "epoch": 6.205563259645975, "grad_norm": 7.40625, "learning_rate": 2.959281627270216e-07, "loss": 2.0853, "num_input_tokens_seen": 79463552, "step": 38040 }, { "epoch": 6.206378986866792, "grad_norm": 0.47265625, "learning_rate": 2.944239842153362e-07, "loss": 1.8826, "num_input_tokens_seen": 79474240, "step": 38045 }, { "epoch": 6.207194714087609, "grad_norm": 8.6875, "learning_rate": 2.929236156139381e-07, "loss": 2.7772, "num_input_tokens_seen": 79484096, "step": 38050 }, { "epoch": 6.208010441308426, "grad_norm": 9.3125, "learning_rate": 2.9142705715420883e-07, "loss": 1.9524, "num_input_tokens_seen": 79494608, "step": 38055 }, { "epoch": 6.208826168529244, "grad_norm": 5.75, "learning_rate": 2.8993430906693595e-07, "loss": 1.8412, "num_input_tokens_seen": 79505360, "step": 38060 }, { "epoch": 6.209641895750061, "grad_norm": 5.65625, "learning_rate": 2.88445371582316e-07, "loss": 1.4115, "num_input_tokens_seen": 79515488, "step": 38065 }, { "epoch": 6.210457622970878, "grad_norm": 5.40625, "learning_rate": 2.8696024492996796e-07, "loss": 2.8746, "num_input_tokens_seen": 79525840, "step": 38070 }, { "epoch": 6.211273350191696, "grad_norm": 5.625, "learning_rate": 2.854789293389115e-07, "loss": 2.7681, "num_input_tokens_seen": 79536112, "step": 38075 }, { "epoch": 6.2120890774125135, "grad_norm": 7.84375, "learning_rate": 2.8400142503758606e-07, "loss": 1.0938, "num_input_tokens_seen": 79546720, "step": 38080 }, { "epoch": 6.212904804633331, "grad_norm": 4.46875, "learning_rate": 2.8252773225384276e-07, "loss": 2.302, "num_input_tokens_seen": 79557936, "step": 38085 }, { "epoch": 6.213720531854148, "grad_norm": 9.625, "learning_rate": 2.8105785121494143e-07, "loss": 3.8971, "num_input_tokens_seen": 79569584, "step": 38090 }, { "epoch": 6.214536259074965, "grad_norm": 8.6875, "learning_rate": 2.795917821475563e-07, "loss": 2.8693, "num_input_tokens_seen": 79579840, "step": 38095 }, { "epoch": 6.215351986295783, "grad_norm": 6.96875, "learning_rate": 2.78129525277776e-07, "loss": 1.7988, "num_input_tokens_seen": 79590464, "step": 38100 }, { "epoch": 6.2161677135166, "grad_norm": 5.15625, "learning_rate": 2.766710808310952e-07, "loss": 2.9476, "num_input_tokens_seen": 79600848, "step": 38105 }, { "epoch": 6.216983440737417, "grad_norm": 5.125, "learning_rate": 2.7521644903242827e-07, "loss": 2.1424, "num_input_tokens_seen": 79611408, "step": 38110 }, { "epoch": 6.217799167958235, "grad_norm": 8.375, "learning_rate": 2.7376563010609593e-07, "loss": 2.7264, "num_input_tokens_seen": 79621056, "step": 38115 }, { "epoch": 6.218614895179052, "grad_norm": 7.46875, "learning_rate": 2.72318624275833e-07, "loss": 1.365, "num_input_tokens_seen": 79631456, "step": 38120 }, { "epoch": 6.219430622399869, "grad_norm": 3.328125, "learning_rate": 2.7087543176478324e-07, "loss": 2.493, "num_input_tokens_seen": 79642448, "step": 38125 }, { "epoch": 6.220246349620687, "grad_norm": 4.46875, "learning_rate": 2.694360527955103e-07, "loss": 2.0805, "num_input_tokens_seen": 79652016, "step": 38130 }, { "epoch": 6.221062076841505, "grad_norm": 8.125, "learning_rate": 2.680004875899811e-07, "loss": 2.3353, "num_input_tokens_seen": 79663264, "step": 38135 }, { "epoch": 6.221877804062322, "grad_norm": 5.96875, "learning_rate": 2.665687363695768e-07, "loss": 3.5997, "num_input_tokens_seen": 79673168, "step": 38140 }, { "epoch": 6.222693531283139, "grad_norm": 13.0625, "learning_rate": 2.6514079935509584e-07, "loss": 3.0249, "num_input_tokens_seen": 79683728, "step": 38145 }, { "epoch": 6.223509258503956, "grad_norm": 7.96875, "learning_rate": 2.6371667676673983e-07, "loss": 2.2844, "num_input_tokens_seen": 79694672, "step": 38150 }, { "epoch": 6.224324985724774, "grad_norm": 10.625, "learning_rate": 2.6229636882412755e-07, "loss": 2.62, "num_input_tokens_seen": 79705216, "step": 38155 }, { "epoch": 6.225140712945591, "grad_norm": 5.78125, "learning_rate": 2.6087987574628935e-07, "loss": 1.8438, "num_input_tokens_seen": 79715984, "step": 38160 }, { "epoch": 6.225956440166408, "grad_norm": 1.8046875, "learning_rate": 2.5946719775166437e-07, "loss": 1.8779, "num_input_tokens_seen": 79725904, "step": 38165 }, { "epoch": 6.226772167387225, "grad_norm": 1.9140625, "learning_rate": 2.5805833505810616e-07, "loss": 2.4688, "num_input_tokens_seen": 79736944, "step": 38170 }, { "epoch": 6.227587894608043, "grad_norm": 6.4375, "learning_rate": 2.566532878828798e-07, "loss": 1.251, "num_input_tokens_seen": 79748304, "step": 38175 }, { "epoch": 6.2284036218288605, "grad_norm": 6.40625, "learning_rate": 2.552520564426619e-07, "loss": 1.9093, "num_input_tokens_seen": 79759728, "step": 38180 }, { "epoch": 6.229219349049678, "grad_norm": 10.5, "learning_rate": 2.5385464095353803e-07, "loss": 3.637, "num_input_tokens_seen": 79769776, "step": 38185 }, { "epoch": 6.230035076270495, "grad_norm": 6.3125, "learning_rate": 2.5246104163100804e-07, "loss": 2.6777, "num_input_tokens_seen": 79780208, "step": 38190 }, { "epoch": 6.230850803491313, "grad_norm": 7.375, "learning_rate": 2.510712586899833e-07, "loss": 1.783, "num_input_tokens_seen": 79791344, "step": 38195 }, { "epoch": 6.23166653071213, "grad_norm": 7.15625, "learning_rate": 2.4968529234478124e-07, "loss": 4.8279, "num_input_tokens_seen": 79802112, "step": 38200 }, { "epoch": 6.23166653071213, "eval_loss": 2.5388035774230957, "eval_runtime": 134.7528, "eval_samples_per_second": 20.222, "eval_steps_per_second": 10.115, "num_input_tokens_seen": 79802112, "step": 38200 }, { "epoch": 6.232482257932947, "grad_norm": 6.125, "learning_rate": 2.483031428091448e-07, "loss": 1.9544, "num_input_tokens_seen": 79812512, "step": 38205 }, { "epoch": 6.233297985153764, "grad_norm": 13.375, "learning_rate": 2.469248102962091e-07, "loss": 2.2185, "num_input_tokens_seen": 79823568, "step": 38210 }, { "epoch": 6.234113712374582, "grad_norm": 7.71875, "learning_rate": 2.4555029501853455e-07, "loss": 2.3884, "num_input_tokens_seen": 79834336, "step": 38215 }, { "epoch": 6.234929439595399, "grad_norm": 10.25, "learning_rate": 2.441795971880906e-07, "loss": 3.7727, "num_input_tokens_seen": 79844960, "step": 38220 }, { "epoch": 6.2357451668162165, "grad_norm": 8.75, "learning_rate": 2.4281271701625255e-07, "loss": 1.5325, "num_input_tokens_seen": 79855664, "step": 38225 }, { "epoch": 6.236560894037034, "grad_norm": 8.0, "learning_rate": 2.4144965471381007e-07, "loss": 2.7582, "num_input_tokens_seen": 79866848, "step": 38230 }, { "epoch": 6.237376621257852, "grad_norm": 8.4375, "learning_rate": 2.400904104909674e-07, "loss": 1.6745, "num_input_tokens_seen": 79877536, "step": 38235 }, { "epoch": 6.238192348478669, "grad_norm": 7.59375, "learning_rate": 2.3873498455733725e-07, "loss": 2.7208, "num_input_tokens_seen": 79887696, "step": 38240 }, { "epoch": 6.239008075699486, "grad_norm": 2.046875, "learning_rate": 2.3738337712194137e-07, "loss": 1.8004, "num_input_tokens_seen": 79897888, "step": 38245 }, { "epoch": 6.239823802920303, "grad_norm": 5.59375, "learning_rate": 2.3603558839321305e-07, "loss": 2.1685, "num_input_tokens_seen": 79908800, "step": 38250 }, { "epoch": 6.240639530141121, "grad_norm": 13.9375, "learning_rate": 2.3469161857900267e-07, "loss": 2.2996, "num_input_tokens_seen": 79919232, "step": 38255 }, { "epoch": 6.241455257361938, "grad_norm": 10.625, "learning_rate": 2.3335146788656393e-07, "loss": 2.6209, "num_input_tokens_seen": 79930480, "step": 38260 }, { "epoch": 6.242270984582755, "grad_norm": 9.8125, "learning_rate": 2.3201513652256757e-07, "loss": 2.1849, "num_input_tokens_seen": 79940784, "step": 38265 }, { "epoch": 6.243086711803572, "grad_norm": 7.5625, "learning_rate": 2.3068262469308766e-07, "loss": 0.9894, "num_input_tokens_seen": 79950208, "step": 38270 }, { "epoch": 6.2439024390243905, "grad_norm": 2.90625, "learning_rate": 2.2935393260362093e-07, "loss": 2.6364, "num_input_tokens_seen": 79960944, "step": 38275 }, { "epoch": 6.244718166245208, "grad_norm": 5.21875, "learning_rate": 2.2802906045906458e-07, "loss": 1.6578, "num_input_tokens_seen": 79970528, "step": 38280 }, { "epoch": 6.245533893466025, "grad_norm": 0.11083984375, "learning_rate": 2.2670800846373018e-07, "loss": 1.1059, "num_input_tokens_seen": 79979456, "step": 38285 }, { "epoch": 6.246349620686843, "grad_norm": 8.9375, "learning_rate": 2.2539077682134367e-07, "loss": 3.0101, "num_input_tokens_seen": 79990864, "step": 38290 }, { "epoch": 6.24716534790766, "grad_norm": 12.125, "learning_rate": 2.2407736573503423e-07, "loss": 1.4965, "num_input_tokens_seen": 80001856, "step": 38295 }, { "epoch": 6.247981075128477, "grad_norm": 5.53125, "learning_rate": 2.2276777540735093e-07, "loss": 1.2453, "num_input_tokens_seen": 80013392, "step": 38300 }, { "epoch": 6.248796802349294, "grad_norm": 3.796875, "learning_rate": 2.2146200604024613e-07, "loss": 0.9908, "num_input_tokens_seen": 80024736, "step": 38305 }, { "epoch": 6.249612529570112, "grad_norm": 10.75, "learning_rate": 2.2016005783508375e-07, "loss": 2.1107, "num_input_tokens_seen": 80034800, "step": 38310 }, { "epoch": 6.250428256790929, "grad_norm": 3.6875, "learning_rate": 2.1886193099264763e-07, "loss": 2.7397, "num_input_tokens_seen": 80044960, "step": 38315 }, { "epoch": 6.251243984011746, "grad_norm": 6.25, "learning_rate": 2.175676257131165e-07, "loss": 2.3341, "num_input_tokens_seen": 80056800, "step": 38320 }, { "epoch": 6.2520597112325635, "grad_norm": 12.9375, "learning_rate": 2.162771421960974e-07, "loss": 2.219, "num_input_tokens_seen": 80067328, "step": 38325 }, { "epoch": 6.252875438453382, "grad_norm": 8.5625, "learning_rate": 2.1499048064059224e-07, "loss": 1.6496, "num_input_tokens_seen": 80077776, "step": 38330 }, { "epoch": 6.253691165674199, "grad_norm": 5.40625, "learning_rate": 2.1370764124502285e-07, "loss": 2.8483, "num_input_tokens_seen": 80086320, "step": 38335 }, { "epoch": 6.254506892895016, "grad_norm": 2.734375, "learning_rate": 2.1242862420721988e-07, "loss": 2.7111, "num_input_tokens_seen": 80097920, "step": 38340 }, { "epoch": 6.255322620115833, "grad_norm": 9.75, "learning_rate": 2.1115342972442276e-07, "loss": 2.112, "num_input_tokens_seen": 80110304, "step": 38345 }, { "epoch": 6.256138347336651, "grad_norm": 5.75, "learning_rate": 2.0988205799328252e-07, "loss": 2.0897, "num_input_tokens_seen": 80120256, "step": 38350 }, { "epoch": 6.256954074557468, "grad_norm": 6.0, "learning_rate": 2.0861450920986182e-07, "loss": 2.1309, "num_input_tokens_seen": 80131072, "step": 38355 }, { "epoch": 6.257769801778285, "grad_norm": 8.6875, "learning_rate": 2.07350783569632e-07, "loss": 1.6689, "num_input_tokens_seen": 80142256, "step": 38360 }, { "epoch": 6.258585528999102, "grad_norm": 3.640625, "learning_rate": 2.060908812674761e-07, "loss": 1.4719, "num_input_tokens_seen": 80152928, "step": 38365 }, { "epoch": 6.25940125621992, "grad_norm": 5.21875, "learning_rate": 2.0483480249768317e-07, "loss": 1.9444, "num_input_tokens_seen": 80163472, "step": 38370 }, { "epoch": 6.2602169834407375, "grad_norm": 7.6875, "learning_rate": 2.035825474539621e-07, "loss": 1.719, "num_input_tokens_seen": 80174640, "step": 38375 }, { "epoch": 6.261032710661555, "grad_norm": 9.0625, "learning_rate": 2.0233411632942235e-07, "loss": 1.7242, "num_input_tokens_seen": 80184272, "step": 38380 }, { "epoch": 6.261848437882372, "grad_norm": 4.90625, "learning_rate": 2.0108950931658764e-07, "loss": 1.5439, "num_input_tokens_seen": 80196304, "step": 38385 }, { "epoch": 6.26266416510319, "grad_norm": 8.5625, "learning_rate": 1.998487266073934e-07, "loss": 3.0629, "num_input_tokens_seen": 80206544, "step": 38390 }, { "epoch": 6.263479892324007, "grad_norm": 1.21875, "learning_rate": 1.986117683931865e-07, "loss": 1.6194, "num_input_tokens_seen": 80217744, "step": 38395 }, { "epoch": 6.264295619544824, "grad_norm": 7.46875, "learning_rate": 1.9737863486471442e-07, "loss": 3.159, "num_input_tokens_seen": 80229344, "step": 38400 }, { "epoch": 6.264295619544824, "eval_loss": 2.5388035774230957, "eval_runtime": 134.7266, "eval_samples_per_second": 20.226, "eval_steps_per_second": 10.117, "num_input_tokens_seen": 80229344, "step": 38400 }, { "epoch": 6.265111346765641, "grad_norm": 13.25, "learning_rate": 1.9614932621215e-07, "loss": 2.3409, "num_input_tokens_seen": 80238736, "step": 38405 }, { "epoch": 6.265927073986459, "grad_norm": 3.9375, "learning_rate": 1.9492384262506102e-07, "loss": 1.9272, "num_input_tokens_seen": 80248368, "step": 38410 }, { "epoch": 6.266742801207276, "grad_norm": 8.4375, "learning_rate": 1.9370218429243524e-07, "loss": 2.7534, "num_input_tokens_seen": 80259360, "step": 38415 }, { "epoch": 6.2675585284280935, "grad_norm": 2.9375, "learning_rate": 1.9248435140267197e-07, "loss": 3.2606, "num_input_tokens_seen": 80270256, "step": 38420 }, { "epoch": 6.268374255648911, "grad_norm": 15.0, "learning_rate": 1.9127034414356814e-07, "loss": 2.7334, "num_input_tokens_seen": 80280272, "step": 38425 }, { "epoch": 6.269189982869729, "grad_norm": 10.1875, "learning_rate": 1.9006016270234627e-07, "loss": 2.5592, "num_input_tokens_seen": 80290000, "step": 38430 }, { "epoch": 6.270005710090546, "grad_norm": 6.90625, "learning_rate": 1.888538072656293e-07, "loss": 3.1807, "num_input_tokens_seen": 80301968, "step": 38435 }, { "epoch": 6.270821437311363, "grad_norm": 5.125, "learning_rate": 1.8765127801944893e-07, "loss": 2.5819, "num_input_tokens_seen": 80311936, "step": 38440 }, { "epoch": 6.27163716453218, "grad_norm": 7.5625, "learning_rate": 1.8645257514925406e-07, "loss": 1.8347, "num_input_tokens_seen": 80322560, "step": 38445 }, { "epoch": 6.272452891752998, "grad_norm": 9.25, "learning_rate": 1.8525769883989685e-07, "loss": 2.0879, "num_input_tokens_seen": 80333280, "step": 38450 }, { "epoch": 6.273268618973815, "grad_norm": 3.671875, "learning_rate": 1.8406664927564654e-07, "loss": 1.992, "num_input_tokens_seen": 80342384, "step": 38455 }, { "epoch": 6.274084346194632, "grad_norm": 7.3125, "learning_rate": 1.8287942664017566e-07, "loss": 3.3605, "num_input_tokens_seen": 80352576, "step": 38460 }, { "epoch": 6.27490007341545, "grad_norm": 5.34375, "learning_rate": 1.8169603111656552e-07, "loss": 2.1352, "num_input_tokens_seen": 80361920, "step": 38465 }, { "epoch": 6.275715800636267, "grad_norm": 5.25, "learning_rate": 1.805164628873146e-07, "loss": 3.5429, "num_input_tokens_seen": 80372112, "step": 38470 }, { "epoch": 6.276531527857085, "grad_norm": 3.75, "learning_rate": 1.793407221343274e-07, "loss": 3.2005, "num_input_tokens_seen": 80382368, "step": 38475 }, { "epoch": 6.277347255077902, "grad_norm": 9.0, "learning_rate": 1.781688090389172e-07, "loss": 2.701, "num_input_tokens_seen": 80391328, "step": 38480 }, { "epoch": 6.27816298229872, "grad_norm": 18.75, "learning_rate": 1.770007237818061e-07, "loss": 2.7116, "num_input_tokens_seen": 80402800, "step": 38485 }, { "epoch": 6.278978709519537, "grad_norm": 10.8125, "learning_rate": 1.7583646654313059e-07, "loss": 2.6534, "num_input_tokens_seen": 80413136, "step": 38490 }, { "epoch": 6.279794436740354, "grad_norm": 6.84375, "learning_rate": 1.7467603750242757e-07, "loss": 2.8355, "num_input_tokens_seen": 80423584, "step": 38495 }, { "epoch": 6.280610163961171, "grad_norm": 5.125, "learning_rate": 1.7351943683865944e-07, "loss": 0.8452, "num_input_tokens_seen": 80435376, "step": 38500 }, { "epoch": 6.281425891181989, "grad_norm": 7.71875, "learning_rate": 1.723666647301808e-07, "loss": 2.313, "num_input_tokens_seen": 80445904, "step": 38505 }, { "epoch": 6.282241618402806, "grad_norm": 11.5625, "learning_rate": 1.712177213547661e-07, "loss": 3.6515, "num_input_tokens_seen": 80455344, "step": 38510 }, { "epoch": 6.283057345623623, "grad_norm": 1.609375, "learning_rate": 1.7007260688959581e-07, "loss": 2.459, "num_input_tokens_seen": 80466160, "step": 38515 }, { "epoch": 6.2838730728444405, "grad_norm": 8.1875, "learning_rate": 1.68931321511262e-07, "loss": 2.7, "num_input_tokens_seen": 80476128, "step": 38520 }, { "epoch": 6.2846888000652585, "grad_norm": 10.9375, "learning_rate": 1.6779386539576835e-07, "loss": 2.4041, "num_input_tokens_seen": 80485872, "step": 38525 }, { "epoch": 6.285504527286076, "grad_norm": 0.232421875, "learning_rate": 1.666602387185162e-07, "loss": 1.1816, "num_input_tokens_seen": 80496016, "step": 38530 }, { "epoch": 6.286320254506893, "grad_norm": 3.609375, "learning_rate": 1.655304416543352e-07, "loss": 2.8938, "num_input_tokens_seen": 80505536, "step": 38535 }, { "epoch": 6.28713598172771, "grad_norm": 10.6875, "learning_rate": 1.6440447437744698e-07, "loss": 4.2738, "num_input_tokens_seen": 80515776, "step": 38540 }, { "epoch": 6.287951708948528, "grad_norm": 11.6875, "learning_rate": 1.6328233706149332e-07, "loss": 4.796, "num_input_tokens_seen": 80526752, "step": 38545 }, { "epoch": 6.288767436169345, "grad_norm": 3.578125, "learning_rate": 1.6216402987951906e-07, "loss": 2.3432, "num_input_tokens_seen": 80536496, "step": 38550 }, { "epoch": 6.289583163390162, "grad_norm": 7.09375, "learning_rate": 1.6104955300398627e-07, "loss": 2.9521, "num_input_tokens_seen": 80547424, "step": 38555 }, { "epoch": 6.290398890610979, "grad_norm": 3.015625, "learning_rate": 1.5993890660675748e-07, "loss": 2.4793, "num_input_tokens_seen": 80557984, "step": 38560 }, { "epoch": 6.291214617831797, "grad_norm": 9.5, "learning_rate": 1.5883209085910678e-07, "loss": 3.9188, "num_input_tokens_seen": 80569104, "step": 38565 }, { "epoch": 6.2920303450526145, "grad_norm": 5.65625, "learning_rate": 1.5772910593172264e-07, "loss": 2.5822, "num_input_tokens_seen": 80579328, "step": 38570 }, { "epoch": 6.292846072273432, "grad_norm": 6.90625, "learning_rate": 1.5662995199469954e-07, "loss": 2.4876, "num_input_tokens_seen": 80589808, "step": 38575 }, { "epoch": 6.293661799494249, "grad_norm": 10.25, "learning_rate": 1.5553462921753802e-07, "loss": 1.3114, "num_input_tokens_seen": 80599456, "step": 38580 }, { "epoch": 6.294477526715067, "grad_norm": 7.40625, "learning_rate": 1.544431377691502e-07, "loss": 2.4901, "num_input_tokens_seen": 80609312, "step": 38585 }, { "epoch": 6.295293253935884, "grad_norm": 6.5, "learning_rate": 1.5335547781785975e-07, "loss": 1.999, "num_input_tokens_seen": 80619904, "step": 38590 }, { "epoch": 6.296108981156701, "grad_norm": 6.375, "learning_rate": 1.5227164953139917e-07, "loss": 3.3656, "num_input_tokens_seen": 80632800, "step": 38595 }, { "epoch": 6.296924708377518, "grad_norm": 14.75, "learning_rate": 1.511916530769042e-07, "loss": 1.9346, "num_input_tokens_seen": 80643632, "step": 38600 }, { "epoch": 6.296924708377518, "eval_loss": 2.5388035774230957, "eval_runtime": 134.6117, "eval_samples_per_second": 20.243, "eval_steps_per_second": 10.125, "num_input_tokens_seen": 80643632, "step": 38600 }, { "epoch": 6.297740435598336, "grad_norm": 3.875, "learning_rate": 1.5011548862092773e-07, "loss": 1.5578, "num_input_tokens_seen": 80653280, "step": 38605 }, { "epoch": 6.298556162819153, "grad_norm": 12.0, "learning_rate": 1.490431563294231e-07, "loss": 2.8874, "num_input_tokens_seen": 80664608, "step": 38610 }, { "epoch": 6.2993718900399704, "grad_norm": 9.8125, "learning_rate": 1.4797465636776365e-07, "loss": 1.9634, "num_input_tokens_seen": 80675408, "step": 38615 }, { "epoch": 6.300187617260788, "grad_norm": 10.5625, "learning_rate": 1.4690998890072027e-07, "loss": 1.9092, "num_input_tokens_seen": 80686720, "step": 38620 }, { "epoch": 6.301003344481606, "grad_norm": 5.6875, "learning_rate": 1.4584915409248112e-07, "loss": 2.5565, "num_input_tokens_seen": 80696592, "step": 38625 }, { "epoch": 6.301819071702423, "grad_norm": 6.375, "learning_rate": 1.4479215210663754e-07, "loss": 2.9335, "num_input_tokens_seen": 80707376, "step": 38630 }, { "epoch": 6.30263479892324, "grad_norm": 7.0, "learning_rate": 1.4373898310619528e-07, "loss": 4.6723, "num_input_tokens_seen": 80717056, "step": 38635 }, { "epoch": 6.303450526144058, "grad_norm": 10.375, "learning_rate": 1.4268964725356604e-07, "loss": 2.2538, "num_input_tokens_seen": 80727728, "step": 38640 }, { "epoch": 6.304266253364875, "grad_norm": 10.625, "learning_rate": 1.4164414471056764e-07, "loss": 2.6737, "num_input_tokens_seen": 80736944, "step": 38645 }, { "epoch": 6.305081980585692, "grad_norm": 9.375, "learning_rate": 1.4060247563843497e-07, "loss": 2.6912, "num_input_tokens_seen": 80747520, "step": 38650 }, { "epoch": 6.305897707806509, "grad_norm": 11.0625, "learning_rate": 1.3956464019780068e-07, "loss": 1.8926, "num_input_tokens_seen": 80757600, "step": 38655 }, { "epoch": 6.306713435027326, "grad_norm": 5.09375, "learning_rate": 1.385306385487145e-07, "loss": 3.3906, "num_input_tokens_seen": 80767072, "step": 38660 }, { "epoch": 6.307529162248144, "grad_norm": 3.40625, "learning_rate": 1.3750047085063222e-07, "loss": 1.7267, "num_input_tokens_seen": 80777776, "step": 38665 }, { "epoch": 6.3083448894689615, "grad_norm": 2.5, "learning_rate": 1.3647413726242119e-07, "loss": 2.2446, "num_input_tokens_seen": 80787712, "step": 38670 }, { "epoch": 6.309160616689779, "grad_norm": 7.3125, "learning_rate": 1.3545163794235205e-07, "loss": 3.4725, "num_input_tokens_seen": 80798736, "step": 38675 }, { "epoch": 6.309976343910597, "grad_norm": 4.6875, "learning_rate": 1.3443297304810698e-07, "loss": 2.3188, "num_input_tokens_seen": 80809040, "step": 38680 }, { "epoch": 6.310792071131414, "grad_norm": 2.953125, "learning_rate": 1.3341814273677977e-07, "loss": 1.1266, "num_input_tokens_seen": 80818320, "step": 38685 }, { "epoch": 6.311607798352231, "grad_norm": 7.84375, "learning_rate": 1.324071471648647e-07, "loss": 2.4951, "num_input_tokens_seen": 80828656, "step": 38690 }, { "epoch": 6.312423525573048, "grad_norm": 10.4375, "learning_rate": 1.3139998648827312e-07, "loss": 1.5555, "num_input_tokens_seen": 80839120, "step": 38695 }, { "epoch": 6.313239252793866, "grad_norm": 4.875, "learning_rate": 1.3039666086232526e-07, "loss": 2.1156, "num_input_tokens_seen": 80850912, "step": 38700 }, { "epoch": 6.314054980014683, "grad_norm": 6.34375, "learning_rate": 1.2939717044174183e-07, "loss": 3.3882, "num_input_tokens_seen": 80860240, "step": 38705 }, { "epoch": 6.3148707072355, "grad_norm": 12.0625, "learning_rate": 1.284015153806578e-07, "loss": 2.1669, "num_input_tokens_seen": 80868112, "step": 38710 }, { "epoch": 6.3156864344563175, "grad_norm": 6.5, "learning_rate": 1.274096958326171e-07, "loss": 2.9756, "num_input_tokens_seen": 80878592, "step": 38715 }, { "epoch": 6.3165021616771355, "grad_norm": 11.9375, "learning_rate": 1.2642171195056952e-07, "loss": 3.5525, "num_input_tokens_seen": 80887792, "step": 38720 }, { "epoch": 6.317317888897953, "grad_norm": 5.03125, "learning_rate": 1.2543756388687377e-07, "loss": 2.1238, "num_input_tokens_seen": 80898208, "step": 38725 }, { "epoch": 6.31813361611877, "grad_norm": 5.8125, "learning_rate": 1.2445725179330014e-07, "loss": 2.6483, "num_input_tokens_seen": 80908928, "step": 38730 }, { "epoch": 6.318949343339587, "grad_norm": 9.75, "learning_rate": 1.2348077582102212e-07, "loss": 2.419, "num_input_tokens_seen": 80919424, "step": 38735 }, { "epoch": 6.319765070560405, "grad_norm": 5.5, "learning_rate": 1.2250813612062762e-07, "loss": 1.5029, "num_input_tokens_seen": 80930128, "step": 38740 }, { "epoch": 6.320580797781222, "grad_norm": 8.5625, "learning_rate": 1.215393328421105e-07, "loss": 2.4536, "num_input_tokens_seen": 80940208, "step": 38745 }, { "epoch": 6.321396525002039, "grad_norm": 6.4375, "learning_rate": 1.2057436613486796e-07, "loss": 3.1785, "num_input_tokens_seen": 80949808, "step": 38750 }, { "epoch": 6.322212252222856, "grad_norm": 8.5625, "learning_rate": 1.1961323614771424e-07, "loss": 2.3832, "num_input_tokens_seen": 80960672, "step": 38755 }, { "epoch": 6.323027979443674, "grad_norm": 5.5625, "learning_rate": 1.1865594302886418e-07, "loss": 3.8472, "num_input_tokens_seen": 80971040, "step": 38760 }, { "epoch": 6.3238437066644915, "grad_norm": 8.6875, "learning_rate": 1.1770248692594687e-07, "loss": 2.1042, "num_input_tokens_seen": 80980080, "step": 38765 }, { "epoch": 6.324659433885309, "grad_norm": 9.625, "learning_rate": 1.167528679859975e-07, "loss": 1.7199, "num_input_tokens_seen": 80990080, "step": 38770 }, { "epoch": 6.325475161106126, "grad_norm": 6.40625, "learning_rate": 1.1580708635545446e-07, "loss": 2.4792, "num_input_tokens_seen": 81000416, "step": 38775 }, { "epoch": 6.326290888326944, "grad_norm": 2.6875, "learning_rate": 1.1486514218017885e-07, "loss": 1.9909, "num_input_tokens_seen": 81011376, "step": 38780 }, { "epoch": 6.327106615547761, "grad_norm": 7.03125, "learning_rate": 1.1392703560542117e-07, "loss": 2.0435, "num_input_tokens_seen": 81021760, "step": 38785 }, { "epoch": 6.327922342768578, "grad_norm": 7.0625, "learning_rate": 1.129927667758518e-07, "loss": 3.628, "num_input_tokens_seen": 81032352, "step": 38790 }, { "epoch": 6.328738069989395, "grad_norm": 7.34375, "learning_rate": 1.1206233583554992e-07, "loss": 3.9584, "num_input_tokens_seen": 81042832, "step": 38795 }, { "epoch": 6.329553797210213, "grad_norm": 8.375, "learning_rate": 1.1113574292799523e-07, "loss": 1.6984, "num_input_tokens_seen": 81051936, "step": 38800 }, { "epoch": 6.329553797210213, "eval_loss": 2.5388035774230957, "eval_runtime": 134.3488, "eval_samples_per_second": 20.283, "eval_steps_per_second": 10.145, "num_input_tokens_seen": 81051936, "step": 38800 }, { "epoch": 6.33036952443103, "grad_norm": 9.0, "learning_rate": 1.1021298819608449e-07, "loss": 2.267, "num_input_tokens_seen": 81062848, "step": 38805 }, { "epoch": 6.331185251651847, "grad_norm": 5.8125, "learning_rate": 1.0929407178211226e-07, "loss": 2.1411, "num_input_tokens_seen": 81072736, "step": 38810 }, { "epoch": 6.332000978872665, "grad_norm": 5.53125, "learning_rate": 1.0837899382779293e-07, "loss": 2.4041, "num_input_tokens_seen": 81083248, "step": 38815 }, { "epoch": 6.332816706093483, "grad_norm": 0.09228515625, "learning_rate": 1.0746775447423862e-07, "loss": 1.9642, "num_input_tokens_seen": 81093264, "step": 38820 }, { "epoch": 6.3336324333143, "grad_norm": 6.125, "learning_rate": 1.0656035386197583e-07, "loss": 2.0931, "num_input_tokens_seen": 81105168, "step": 38825 }, { "epoch": 6.334448160535117, "grad_norm": 9.3125, "learning_rate": 1.0565679213093982e-07, "loss": 2.1358, "num_input_tokens_seen": 81116016, "step": 38830 }, { "epoch": 6.335263887755934, "grad_norm": 5.90625, "learning_rate": 1.0475706942046638e-07, "loss": 2.0985, "num_input_tokens_seen": 81128192, "step": 38835 }, { "epoch": 6.336079614976752, "grad_norm": 8.1875, "learning_rate": 1.0386118586930282e-07, "loss": 2.4736, "num_input_tokens_seen": 81138640, "step": 38840 }, { "epoch": 6.336895342197569, "grad_norm": 11.5625, "learning_rate": 1.0296914161561367e-07, "loss": 1.9331, "num_input_tokens_seen": 81150960, "step": 38845 }, { "epoch": 6.337711069418386, "grad_norm": 7.59375, "learning_rate": 1.0208093679695552e-07, "loss": 1.7168, "num_input_tokens_seen": 81162768, "step": 38850 }, { "epoch": 6.338526796639204, "grad_norm": 10.5, "learning_rate": 1.0119657155030493e-07, "loss": 2.0389, "num_input_tokens_seen": 81173856, "step": 38855 }, { "epoch": 6.339342523860021, "grad_norm": 7.125, "learning_rate": 1.003160460120417e-07, "loss": 1.7666, "num_input_tokens_seen": 81185152, "step": 38860 }, { "epoch": 6.3401582510808385, "grad_norm": 4.125, "learning_rate": 9.943936031795165e-08, "loss": 2.2515, "num_input_tokens_seen": 81196720, "step": 38865 }, { "epoch": 6.340973978301656, "grad_norm": 7.96875, "learning_rate": 9.856651460323219e-08, "loss": 2.0022, "num_input_tokens_seen": 81207584, "step": 38870 }, { "epoch": 6.341789705522474, "grad_norm": 5.59375, "learning_rate": 9.769750900248953e-08, "loss": 2.2842, "num_input_tokens_seen": 81217744, "step": 38875 }, { "epoch": 6.342605432743291, "grad_norm": 14.75, "learning_rate": 9.683234364973038e-08, "loss": 1.6487, "num_input_tokens_seen": 81226160, "step": 38880 }, { "epoch": 6.343421159964108, "grad_norm": 8.0625, "learning_rate": 9.597101867837854e-08, "loss": 2.1504, "num_input_tokens_seen": 81237264, "step": 38885 }, { "epoch": 6.344236887184925, "grad_norm": 14.875, "learning_rate": 9.511353422125835e-08, "loss": 2.5346, "num_input_tokens_seen": 81247472, "step": 38890 }, { "epoch": 6.345052614405743, "grad_norm": 4.625, "learning_rate": 9.42598904106029e-08, "loss": 1.9476, "num_input_tokens_seen": 81259264, "step": 38895 }, { "epoch": 6.34586834162656, "grad_norm": 7.6875, "learning_rate": 9.341008737806245e-08, "loss": 1.8462, "num_input_tokens_seen": 81269632, "step": 38900 }, { "epoch": 6.346684068847377, "grad_norm": 3.21875, "learning_rate": 9.256412525467661e-08, "loss": 2.6333, "num_input_tokens_seen": 81280368, "step": 38905 }, { "epoch": 6.3474997960681945, "grad_norm": 4.71875, "learning_rate": 9.172200417091326e-08, "loss": 1.434, "num_input_tokens_seen": 81291488, "step": 38910 }, { "epoch": 6.3483155232890125, "grad_norm": 6.03125, "learning_rate": 9.088372425663239e-08, "loss": 2.1019, "num_input_tokens_seen": 81300912, "step": 38915 }, { "epoch": 6.34913125050983, "grad_norm": 7.21875, "learning_rate": 9.004928564110837e-08, "loss": 2.4287, "num_input_tokens_seen": 81310944, "step": 38920 }, { "epoch": 6.349946977730647, "grad_norm": 2.34375, "learning_rate": 8.92186884530244e-08, "loss": 2.233, "num_input_tokens_seen": 81321136, "step": 38925 }, { "epoch": 6.350762704951464, "grad_norm": 14.1875, "learning_rate": 8.83919328204641e-08, "loss": 2.9086, "num_input_tokens_seen": 81332144, "step": 38930 }, { "epoch": 6.351578432172282, "grad_norm": 6.875, "learning_rate": 8.756901887093105e-08, "loss": 2.5815, "num_input_tokens_seen": 81343024, "step": 38935 }, { "epoch": 6.352394159393099, "grad_norm": 7.53125, "learning_rate": 8.674994673132098e-08, "loss": 1.8414, "num_input_tokens_seen": 81354464, "step": 38940 }, { "epoch": 6.353209886613916, "grad_norm": 3.703125, "learning_rate": 8.593471652794949e-08, "loss": 2.4025, "num_input_tokens_seen": 81365216, "step": 38945 }, { "epoch": 6.354025613834733, "grad_norm": 7.84375, "learning_rate": 8.512332838653548e-08, "loss": 3.3861, "num_input_tokens_seen": 81374624, "step": 38950 }, { "epoch": 6.354841341055551, "grad_norm": 6.5625, "learning_rate": 8.431578243220106e-08, "loss": 3.1781, "num_input_tokens_seen": 81384048, "step": 38955 }, { "epoch": 6.3556570682763684, "grad_norm": 5.15625, "learning_rate": 8.351207878948552e-08, "loss": 2.7992, "num_input_tokens_seen": 81393536, "step": 38960 }, { "epoch": 6.356472795497186, "grad_norm": 7.0, "learning_rate": 8.271221758232583e-08, "loss": 1.6178, "num_input_tokens_seen": 81404128, "step": 38965 }, { "epoch": 6.357288522718003, "grad_norm": 6.125, "learning_rate": 8.191619893407332e-08, "loss": 2.1865, "num_input_tokens_seen": 81415680, "step": 38970 }, { "epoch": 6.358104249938821, "grad_norm": 9.6875, "learning_rate": 8.112402296748534e-08, "loss": 1.545, "num_input_tokens_seen": 81425568, "step": 38975 }, { "epoch": 6.358919977159638, "grad_norm": 9.1875, "learning_rate": 8.033568980471973e-08, "loss": 2.2482, "num_input_tokens_seen": 81434080, "step": 38980 }, { "epoch": 6.359735704380455, "grad_norm": 3.15625, "learning_rate": 7.955119956735146e-08, "loss": 1.6244, "num_input_tokens_seen": 81445184, "step": 38985 }, { "epoch": 6.360551431601272, "grad_norm": 4.5, "learning_rate": 7.877055237636155e-08, "loss": 1.6888, "num_input_tokens_seen": 81456352, "step": 38990 }, { "epoch": 6.36136715882209, "grad_norm": 10.5625, "learning_rate": 7.79937483521287e-08, "loss": 1.603, "num_input_tokens_seen": 81466752, "step": 38995 }, { "epoch": 6.362182886042907, "grad_norm": 7.875, "learning_rate": 7.722078761444873e-08, "loss": 3.1322, "num_input_tokens_seen": 81475504, "step": 39000 }, { "epoch": 6.362182886042907, "eval_loss": 2.5388035774230957, "eval_runtime": 134.4246, "eval_samples_per_second": 20.272, "eval_steps_per_second": 10.14, "num_input_tokens_seen": 81475504, "step": 39000 }, { "epoch": 6.362998613263724, "grad_norm": 5.59375, "learning_rate": 7.645167028252631e-08, "loss": 2.1323, "num_input_tokens_seen": 81486336, "step": 39005 }, { "epoch": 6.3638143404845415, "grad_norm": 6.28125, "learning_rate": 7.568639647496379e-08, "loss": 0.714, "num_input_tokens_seen": 81496464, "step": 39010 }, { "epoch": 6.3646300677053596, "grad_norm": 14.375, "learning_rate": 7.492496630977508e-08, "loss": 2.0721, "num_input_tokens_seen": 81507248, "step": 39015 }, { "epoch": 6.365445794926177, "grad_norm": 6.71875, "learning_rate": 7.416737990438571e-08, "loss": 1.6645, "num_input_tokens_seen": 81517984, "step": 39020 }, { "epoch": 6.366261522146994, "grad_norm": 5.875, "learning_rate": 7.341363737562445e-08, "loss": 2.4416, "num_input_tokens_seen": 81527440, "step": 39025 }, { "epoch": 6.367077249367812, "grad_norm": 14.875, "learning_rate": 7.266373883972887e-08, "loss": 2.7039, "num_input_tokens_seen": 81537648, "step": 39030 }, { "epoch": 6.367892976588629, "grad_norm": 12.3125, "learning_rate": 7.191768441233981e-08, "loss": 3.2938, "num_input_tokens_seen": 81548528, "step": 39035 }, { "epoch": 6.368708703809446, "grad_norm": 8.75, "learning_rate": 7.11754742085069e-08, "loss": 2.153, "num_input_tokens_seen": 81557632, "step": 39040 }, { "epoch": 6.369524431030263, "grad_norm": 9.625, "learning_rate": 7.043710834269413e-08, "loss": 3.7709, "num_input_tokens_seen": 81568448, "step": 39045 }, { "epoch": 6.370340158251081, "grad_norm": 8.8125, "learning_rate": 6.970258692876319e-08, "loss": 1.7409, "num_input_tokens_seen": 81577360, "step": 39050 }, { "epoch": 6.371155885471898, "grad_norm": 10.9375, "learning_rate": 6.897191007998738e-08, "loss": 3.9392, "num_input_tokens_seen": 81588080, "step": 39055 }, { "epoch": 6.3719716126927155, "grad_norm": 5.75, "learning_rate": 6.824507790904599e-08, "loss": 2.0936, "num_input_tokens_seen": 81598816, "step": 39060 }, { "epoch": 6.372787339913533, "grad_norm": 0.83203125, "learning_rate": 6.752209052802439e-08, "loss": 1.4064, "num_input_tokens_seen": 81609840, "step": 39065 }, { "epoch": 6.373603067134351, "grad_norm": 3.5625, "learning_rate": 6.680294804841946e-08, "loss": 1.7898, "num_input_tokens_seen": 81620064, "step": 39070 }, { "epoch": 6.374418794355168, "grad_norm": 7.4375, "learning_rate": 6.608765058112865e-08, "loss": 2.1981, "num_input_tokens_seen": 81630000, "step": 39075 }, { "epoch": 6.375234521575985, "grad_norm": 6.5, "learning_rate": 6.537619823646368e-08, "loss": 1.7153, "num_input_tokens_seen": 81641088, "step": 39080 }, { "epoch": 6.376050248796802, "grad_norm": 19.0, "learning_rate": 6.466859112413404e-08, "loss": 2.3992, "num_input_tokens_seen": 81651056, "step": 39085 }, { "epoch": 6.37686597601762, "grad_norm": 8.9375, "learning_rate": 6.39648293532663e-08, "loss": 2.9019, "num_input_tokens_seen": 81661632, "step": 39090 }, { "epoch": 6.377681703238437, "grad_norm": 10.4375, "learning_rate": 6.32649130323848e-08, "loss": 2.648, "num_input_tokens_seen": 81670640, "step": 39095 }, { "epoch": 6.378497430459254, "grad_norm": 9.1875, "learning_rate": 6.256884226943094e-08, "loss": 2.683, "num_input_tokens_seen": 81679648, "step": 39100 }, { "epoch": 6.3793131576800715, "grad_norm": 8.625, "learning_rate": 6.187661717174386e-08, "loss": 1.8595, "num_input_tokens_seen": 81690768, "step": 39105 }, { "epoch": 6.3801288849008895, "grad_norm": 6.65625, "learning_rate": 6.118823784607708e-08, "loss": 1.1874, "num_input_tokens_seen": 81699984, "step": 39110 }, { "epoch": 6.380944612121707, "grad_norm": 10.625, "learning_rate": 6.050370439858178e-08, "loss": 2.9432, "num_input_tokens_seen": 81709840, "step": 39115 }, { "epoch": 6.381760339342524, "grad_norm": 5.125, "learning_rate": 5.98230169348235e-08, "loss": 2.3406, "num_input_tokens_seen": 81720080, "step": 39120 }, { "epoch": 6.382576066563341, "grad_norm": 8.375, "learning_rate": 5.914617555977664e-08, "loss": 2.5976, "num_input_tokens_seen": 81730688, "step": 39125 }, { "epoch": 6.383391793784159, "grad_norm": 6.90625, "learning_rate": 5.8473180377816017e-08, "loss": 1.4913, "num_input_tokens_seen": 81741984, "step": 39130 }, { "epoch": 6.384207521004976, "grad_norm": 7.15625, "learning_rate": 5.780403149272251e-08, "loss": 3.6142, "num_input_tokens_seen": 81753808, "step": 39135 }, { "epoch": 6.385023248225793, "grad_norm": 0.57421875, "learning_rate": 5.7138729007694126e-08, "loss": 3.0723, "num_input_tokens_seen": 81764848, "step": 39140 }, { "epoch": 6.38583897544661, "grad_norm": 7.1875, "learning_rate": 5.64772730253238e-08, "loss": 3.0741, "num_input_tokens_seen": 81776176, "step": 39145 }, { "epoch": 6.386654702667428, "grad_norm": 11.5625, "learning_rate": 5.5819663647618814e-08, "loss": 2.1012, "num_input_tokens_seen": 81786448, "step": 39150 }, { "epoch": 6.387470429888245, "grad_norm": 4.1875, "learning_rate": 5.5165900975989723e-08, "loss": 2.2497, "num_input_tokens_seen": 81797184, "step": 39155 }, { "epoch": 6.388286157109063, "grad_norm": 2.328125, "learning_rate": 5.451598511125311e-08, "loss": 2.5918, "num_input_tokens_seen": 81807152, "step": 39160 }, { "epoch": 6.38910188432988, "grad_norm": 8.125, "learning_rate": 5.3869916153637124e-08, "loss": 1.9328, "num_input_tokens_seen": 81818192, "step": 39165 }, { "epoch": 6.389917611550698, "grad_norm": 3.265625, "learning_rate": 5.322769420277318e-08, "loss": 2.3046, "num_input_tokens_seen": 81828800, "step": 39170 }, { "epoch": 6.390733338771515, "grad_norm": 14.6875, "learning_rate": 5.258931935769873e-08, "loss": 3.2759, "num_input_tokens_seen": 81839456, "step": 39175 }, { "epoch": 6.391549065992332, "grad_norm": 7.53125, "learning_rate": 5.19547917168628e-08, "loss": 2.7797, "num_input_tokens_seen": 81849616, "step": 39180 }, { "epoch": 6.392364793213149, "grad_norm": 5.1875, "learning_rate": 5.13241113781121e-08, "loss": 1.6261, "num_input_tokens_seen": 81860432, "step": 39185 }, { "epoch": 6.393180520433967, "grad_norm": 0.294921875, "learning_rate": 5.0697278438707755e-08, "loss": 1.3949, "num_input_tokens_seen": 81869312, "step": 39190 }, { "epoch": 6.393996247654784, "grad_norm": 11.0, "learning_rate": 5.0074292995316854e-08, "loss": 3.0205, "num_input_tokens_seen": 81880000, "step": 39195 }, { "epoch": 6.394811974875601, "grad_norm": 10.1875, "learning_rate": 4.945515514400978e-08, "loss": 3.2842, "num_input_tokens_seen": 81889856, "step": 39200 }, { "epoch": 6.394811974875601, "eval_loss": 2.5388035774230957, "eval_runtime": 134.5786, "eval_samples_per_second": 20.248, "eval_steps_per_second": 10.128, "num_input_tokens_seen": 81889856, "step": 39200 }, { "epoch": 6.395627702096419, "grad_norm": 5.75, "learning_rate": 4.883986498026571e-08, "loss": 3.0712, "num_input_tokens_seen": 81900016, "step": 39205 }, { "epoch": 6.3964434293172365, "grad_norm": 6.21875, "learning_rate": 4.822842259896987e-08, "loss": 3.04, "num_input_tokens_seen": 81910944, "step": 39210 }, { "epoch": 6.397259156538054, "grad_norm": 5.1875, "learning_rate": 4.762082809441626e-08, "loss": 1.8931, "num_input_tokens_seen": 81921936, "step": 39215 }, { "epoch": 6.398074883758871, "grad_norm": 8.9375, "learning_rate": 4.7017081560302156e-08, "loss": 2.0019, "num_input_tokens_seen": 81932640, "step": 39220 }, { "epoch": 6.398890610979688, "grad_norm": 9.1875, "learning_rate": 4.6417183089730866e-08, "loss": 3.0279, "num_input_tokens_seen": 81941872, "step": 39225 }, { "epoch": 6.399706338200506, "grad_norm": 4.21875, "learning_rate": 4.5821132775217265e-08, "loss": 1.8014, "num_input_tokens_seen": 81951376, "step": 39230 }, { "epoch": 6.400522065421323, "grad_norm": 4.625, "learning_rate": 4.5228930708679504e-08, "loss": 2.6987, "num_input_tokens_seen": 81960416, "step": 39235 }, { "epoch": 6.40133779264214, "grad_norm": 1.5, "learning_rate": 4.464057698144175e-08, "loss": 1.5615, "num_input_tokens_seen": 81970560, "step": 39240 }, { "epoch": 6.402153519862958, "grad_norm": 8.4375, "learning_rate": 4.4056071684236974e-08, "loss": 2.7491, "num_input_tokens_seen": 81980416, "step": 39245 }, { "epoch": 6.402969247083775, "grad_norm": 11.0625, "learning_rate": 4.347541490719864e-08, "loss": 2.8378, "num_input_tokens_seen": 81990944, "step": 39250 }, { "epoch": 6.4037849743045925, "grad_norm": 8.125, "learning_rate": 4.2898606739877336e-08, "loss": 2.958, "num_input_tokens_seen": 82002032, "step": 39255 }, { "epoch": 6.40460070152541, "grad_norm": 2.921875, "learning_rate": 4.232564727122135e-08, "loss": 2.1829, "num_input_tokens_seen": 82012352, "step": 39260 }, { "epoch": 6.405416428746228, "grad_norm": 6.125, "learning_rate": 4.1756536589585004e-08, "loss": 3.6478, "num_input_tokens_seen": 82022416, "step": 39265 }, { "epoch": 6.406232155967045, "grad_norm": 9.625, "learning_rate": 4.119127478273976e-08, "loss": 2.8536, "num_input_tokens_seen": 82032992, "step": 39270 }, { "epoch": 6.407047883187862, "grad_norm": 10.1875, "learning_rate": 4.062986193784923e-08, "loss": 2.2831, "num_input_tokens_seen": 82042000, "step": 39275 }, { "epoch": 6.407863610408679, "grad_norm": 10.625, "learning_rate": 4.007229814149416e-08, "loss": 2.9864, "num_input_tokens_seen": 82052288, "step": 39280 }, { "epoch": 6.408679337629497, "grad_norm": 4.375, "learning_rate": 3.951858347965576e-08, "loss": 2.6389, "num_input_tokens_seen": 82060864, "step": 39285 }, { "epoch": 6.409495064850314, "grad_norm": 4.90625, "learning_rate": 3.896871803772684e-08, "loss": 2.2337, "num_input_tokens_seen": 82071424, "step": 39290 }, { "epoch": 6.410310792071131, "grad_norm": 1.515625, "learning_rate": 3.842270190050068e-08, "loss": 2.713, "num_input_tokens_seen": 82081456, "step": 39295 }, { "epoch": 6.411126519291948, "grad_norm": 7.75, "learning_rate": 3.7880535152179376e-08, "loss": 2.2665, "num_input_tokens_seen": 82091264, "step": 39300 }, { "epoch": 6.4119422465127665, "grad_norm": 12.0, "learning_rate": 3.734221787637382e-08, "loss": 2.2764, "num_input_tokens_seen": 82100880, "step": 39305 }, { "epoch": 6.412757973733584, "grad_norm": 5.9375, "learning_rate": 3.680775015609817e-08, "loss": 3.8364, "num_input_tokens_seen": 82112976, "step": 39310 }, { "epoch": 6.413573700954401, "grad_norm": 8.1875, "learning_rate": 3.627713207377537e-08, "loss": 2.8558, "num_input_tokens_seen": 82123040, "step": 39315 }, { "epoch": 6.414389428175218, "grad_norm": 10.875, "learning_rate": 3.575036371123164e-08, "loss": 2.046, "num_input_tokens_seen": 82134928, "step": 39320 }, { "epoch": 6.415205155396036, "grad_norm": 10.3125, "learning_rate": 3.5227445149704776e-08, "loss": 2.1491, "num_input_tokens_seen": 82145008, "step": 39325 }, { "epoch": 6.416020882616853, "grad_norm": 4.9375, "learning_rate": 3.470837646983027e-08, "loss": 3.1118, "num_input_tokens_seen": 82155824, "step": 39330 }, { "epoch": 6.41683660983767, "grad_norm": 11.125, "learning_rate": 3.419315775165799e-08, "loss": 0.8422, "num_input_tokens_seen": 82166352, "step": 39335 }, { "epoch": 6.417652337058487, "grad_norm": 8.1875, "learning_rate": 3.368178907464103e-08, "loss": 2.106, "num_input_tokens_seen": 82177872, "step": 39340 }, { "epoch": 6.418468064279305, "grad_norm": 11.8125, "learning_rate": 3.317427051763855e-08, "loss": 3.1297, "num_input_tokens_seen": 82188000, "step": 39345 }, { "epoch": 6.419283791500122, "grad_norm": 4.28125, "learning_rate": 3.267060215891571e-08, "loss": 1.352, "num_input_tokens_seen": 82199440, "step": 39350 }, { "epoch": 6.4200995187209395, "grad_norm": 13.3125, "learning_rate": 3.217078407614649e-08, "loss": 2.5285, "num_input_tokens_seen": 82208624, "step": 39355 }, { "epoch": 6.420915245941757, "grad_norm": 6.78125, "learning_rate": 3.1674816346405345e-08, "loss": 2.2726, "num_input_tokens_seen": 82218160, "step": 39360 }, { "epoch": 6.421730973162575, "grad_norm": 4.65625, "learning_rate": 3.11826990461811e-08, "loss": 3.6438, "num_input_tokens_seen": 82230352, "step": 39365 }, { "epoch": 6.422546700383392, "grad_norm": 6.84375, "learning_rate": 3.069443225136304e-08, "loss": 2.4759, "num_input_tokens_seen": 82241536, "step": 39370 }, { "epoch": 6.423362427604209, "grad_norm": 4.09375, "learning_rate": 3.021001603724372e-08, "loss": 1.5967, "num_input_tokens_seen": 82252832, "step": 39375 }, { "epoch": 6.424178154825027, "grad_norm": 4.21875, "learning_rate": 2.9729450478532818e-08, "loss": 1.9682, "num_input_tokens_seen": 82263392, "step": 39380 }, { "epoch": 6.424993882045844, "grad_norm": 5.59375, "learning_rate": 2.9252735649337726e-08, "loss": 1.5289, "num_input_tokens_seen": 82274112, "step": 39385 }, { "epoch": 6.425809609266661, "grad_norm": 9.125, "learning_rate": 2.8779871623171863e-08, "loss": 3.4401, "num_input_tokens_seen": 82284160, "step": 39390 }, { "epoch": 6.426625336487478, "grad_norm": 5.28125, "learning_rate": 2.8310858472957448e-08, "loss": 2.5777, "num_input_tokens_seen": 82294448, "step": 39395 }, { "epoch": 6.4274410637082955, "grad_norm": 7.0625, "learning_rate": 2.784569627101996e-08, "loss": 2.8503, "num_input_tokens_seen": 82305408, "step": 39400 }, { "epoch": 6.4274410637082955, "eval_loss": 2.5388035774230957, "eval_runtime": 134.3679, "eval_samples_per_second": 20.28, "eval_steps_per_second": 10.144, "num_input_tokens_seen": 82305408, "step": 39400 }, { "epoch": 6.4282567909291135, "grad_norm": 5.78125, "learning_rate": 2.738438508909924e-08, "loss": 1.3175, "num_input_tokens_seen": 82315808, "step": 39405 }, { "epoch": 6.429072518149931, "grad_norm": 8.9375, "learning_rate": 2.692692499833005e-08, "loss": 3.011, "num_input_tokens_seen": 82326832, "step": 39410 }, { "epoch": 6.429888245370748, "grad_norm": 10.9375, "learning_rate": 2.647331606926151e-08, "loss": 1.8427, "num_input_tokens_seen": 82337600, "step": 39415 }, { "epoch": 6.430703972591566, "grad_norm": 5.03125, "learning_rate": 2.6023558371843225e-08, "loss": 1.9736, "num_input_tokens_seen": 82347360, "step": 39420 }, { "epoch": 6.431519699812383, "grad_norm": 10.4375, "learning_rate": 2.557765197543638e-08, "loss": 2.5894, "num_input_tokens_seen": 82358544, "step": 39425 }, { "epoch": 6.4323354270332, "grad_norm": 8.0625, "learning_rate": 2.513559694880263e-08, "loss": 2.0908, "num_input_tokens_seen": 82367168, "step": 39430 }, { "epoch": 6.433151154254017, "grad_norm": 9.25, "learning_rate": 2.469739336011523e-08, "loss": 2.4636, "num_input_tokens_seen": 82375824, "step": 39435 }, { "epoch": 6.433966881474835, "grad_norm": 6.4375, "learning_rate": 2.4263041276947894e-08, "loss": 3.0135, "num_input_tokens_seen": 82384320, "step": 39440 }, { "epoch": 6.434782608695652, "grad_norm": 7.46875, "learning_rate": 2.3832540766283164e-08, "loss": 1.4056, "num_input_tokens_seen": 82394512, "step": 39445 }, { "epoch": 6.4355983359164695, "grad_norm": 6.28125, "learning_rate": 2.3405891894512366e-08, "loss": 2.2534, "num_input_tokens_seen": 82403376, "step": 39450 }, { "epoch": 6.436414063137287, "grad_norm": 6.375, "learning_rate": 2.29830947274301e-08, "loss": 3.3404, "num_input_tokens_seen": 82414784, "step": 39455 }, { "epoch": 6.437229790358105, "grad_norm": 2.078125, "learning_rate": 2.2564149330231432e-08, "loss": 1.5016, "num_input_tokens_seen": 82427136, "step": 39460 }, { "epoch": 6.438045517578922, "grad_norm": 1.546875, "learning_rate": 2.2149055767528572e-08, "loss": 1.9384, "num_input_tokens_seen": 82436800, "step": 39465 }, { "epoch": 6.438861244799739, "grad_norm": 0.8671875, "learning_rate": 2.1737814103334197e-08, "loss": 1.5498, "num_input_tokens_seen": 82446528, "step": 39470 }, { "epoch": 6.439676972020556, "grad_norm": 12.0625, "learning_rate": 2.1330424401064253e-08, "loss": 2.8547, "num_input_tokens_seen": 82455792, "step": 39475 }, { "epoch": 6.440492699241374, "grad_norm": 9.1875, "learning_rate": 2.092688672354348e-08, "loss": 2.6919, "num_input_tokens_seen": 82466240, "step": 39480 }, { "epoch": 6.441308426462191, "grad_norm": 4.875, "learning_rate": 2.0527201133005435e-08, "loss": 1.3794, "num_input_tokens_seen": 82475008, "step": 39485 }, { "epoch": 6.442124153683008, "grad_norm": 2.859375, "learning_rate": 2.0131367691084148e-08, "loss": 2.2866, "num_input_tokens_seen": 82484880, "step": 39490 }, { "epoch": 6.442939880903825, "grad_norm": 4.96875, "learning_rate": 1.9739386458819675e-08, "loss": 1.718, "num_input_tokens_seen": 82494848, "step": 39495 }, { "epoch": 6.443755608124643, "grad_norm": 8.6875, "learning_rate": 1.9351257496666442e-08, "loss": 2.0118, "num_input_tokens_seen": 82506272, "step": 39500 }, { "epoch": 6.444571335345461, "grad_norm": 5.1875, "learning_rate": 1.896698086447657e-08, "loss": 1.312, "num_input_tokens_seen": 82518704, "step": 39505 }, { "epoch": 6.445387062566278, "grad_norm": 7.6875, "learning_rate": 1.8586556621505436e-08, "loss": 2.4795, "num_input_tokens_seen": 82527280, "step": 39510 }, { "epoch": 6.446202789787095, "grad_norm": 6.15625, "learning_rate": 1.820998482642833e-08, "loss": 3.2399, "num_input_tokens_seen": 82538816, "step": 39515 }, { "epoch": 6.447018517007913, "grad_norm": 8.5625, "learning_rate": 1.7837265537309912e-08, "loss": 2.8413, "num_input_tokens_seen": 82548464, "step": 39520 }, { "epoch": 6.44783424422873, "grad_norm": 9.9375, "learning_rate": 1.7468398811629206e-08, "loss": 2.4373, "num_input_tokens_seen": 82559328, "step": 39525 }, { "epoch": 6.448649971449547, "grad_norm": 12.8125, "learning_rate": 1.710338470627404e-08, "loss": 2.7398, "num_input_tokens_seen": 82569888, "step": 39530 }, { "epoch": 6.449465698670364, "grad_norm": 11.1875, "learning_rate": 1.6742223277529945e-08, "loss": 1.8654, "num_input_tokens_seen": 82580416, "step": 39535 }, { "epoch": 6.450281425891182, "grad_norm": 7.75, "learning_rate": 1.6384914581094036e-08, "loss": 2.1097, "num_input_tokens_seen": 82588496, "step": 39540 }, { "epoch": 6.451097153111999, "grad_norm": 6.46875, "learning_rate": 1.6031458672069455e-08, "loss": 1.5562, "num_input_tokens_seen": 82598880, "step": 39545 }, { "epoch": 6.4519128803328165, "grad_norm": 5.65625, "learning_rate": 1.5681855604962602e-08, "loss": 2.7466, "num_input_tokens_seen": 82609360, "step": 39550 }, { "epoch": 6.4527286075536345, "grad_norm": 10.1875, "learning_rate": 1.5336105433683135e-08, "loss": 2.6548, "num_input_tokens_seen": 82618112, "step": 39555 }, { "epoch": 6.453544334774452, "grad_norm": 5.65625, "learning_rate": 1.499420821155506e-08, "loss": 2.6135, "num_input_tokens_seen": 82628160, "step": 39560 }, { "epoch": 6.454360061995269, "grad_norm": 5.78125, "learning_rate": 1.4656163991302874e-08, "loss": 2.1513, "num_input_tokens_seen": 82639168, "step": 39565 }, { "epoch": 6.455175789216086, "grad_norm": 7.96875, "learning_rate": 1.4321972825051544e-08, "loss": 2.153, "num_input_tokens_seen": 82649872, "step": 39570 }, { "epoch": 6.455991516436903, "grad_norm": 6.5625, "learning_rate": 1.3991634764345951e-08, "loss": 3.4268, "num_input_tokens_seen": 82661040, "step": 39575 }, { "epoch": 6.456807243657721, "grad_norm": 7.03125, "learning_rate": 1.3665149860120352e-08, "loss": 1.7663, "num_input_tokens_seen": 82671712, "step": 39580 }, { "epoch": 6.457622970878538, "grad_norm": 9.5625, "learning_rate": 1.3342518162728912e-08, "loss": 2.5543, "num_input_tokens_seen": 82681328, "step": 39585 }, { "epoch": 6.458438698099355, "grad_norm": 3.421875, "learning_rate": 1.30237397219235e-08, "loss": 3.1577, "num_input_tokens_seen": 82691968, "step": 39590 }, { "epoch": 6.459254425320173, "grad_norm": 6.59375, "learning_rate": 1.2708814586862016e-08, "loss": 3.3014, "num_input_tokens_seen": 82702032, "step": 39595 }, { "epoch": 6.4600701525409905, "grad_norm": 8.375, "learning_rate": 1.2397742806111168e-08, "loss": 2.0501, "num_input_tokens_seen": 82712800, "step": 39600 }, { "epoch": 6.4600701525409905, "eval_loss": 2.5388035774230957, "eval_runtime": 134.3462, "eval_samples_per_second": 20.283, "eval_steps_per_second": 10.145, "num_input_tokens_seen": 82712800, "step": 39600 }, { "epoch": 6.460885879761808, "grad_norm": 5.59375, "learning_rate": 1.209052442764369e-08, "loss": 1.4544, "num_input_tokens_seen": 82724720, "step": 39605 }, { "epoch": 6.461701606982625, "grad_norm": 4.53125, "learning_rate": 1.17871594988328e-08, "loss": 2.4737, "num_input_tokens_seen": 82734720, "step": 39610 }, { "epoch": 6.462517334203443, "grad_norm": 10.875, "learning_rate": 1.1487648066466072e-08, "loss": 2.4534, "num_input_tokens_seen": 82746320, "step": 39615 }, { "epoch": 6.46333306142426, "grad_norm": 4.28125, "learning_rate": 1.1191990176728784e-08, "loss": 3.0193, "num_input_tokens_seen": 82756992, "step": 39620 }, { "epoch": 6.464148788645077, "grad_norm": 3.484375, "learning_rate": 1.0900185875215018e-08, "loss": 1.8533, "num_input_tokens_seen": 82768560, "step": 39625 }, { "epoch": 6.464964515865894, "grad_norm": 8.1875, "learning_rate": 1.0612235206924891e-08, "loss": 1.4567, "num_input_tokens_seen": 82777632, "step": 39630 }, { "epoch": 6.465780243086712, "grad_norm": 9.625, "learning_rate": 1.0328138216264549e-08, "loss": 3.0455, "num_input_tokens_seen": 82787616, "step": 39635 }, { "epoch": 6.466595970307529, "grad_norm": 5.4375, "learning_rate": 1.004789494704339e-08, "loss": 2.3409, "num_input_tokens_seen": 82796992, "step": 39640 }, { "epoch": 6.467411697528346, "grad_norm": 5.40625, "learning_rate": 9.771505442482397e-09, "loss": 3.6424, "num_input_tokens_seen": 82807456, "step": 39645 }, { "epoch": 6.468227424749164, "grad_norm": 2.0, "learning_rate": 9.498969745200259e-09, "loss": 1.2649, "num_input_tokens_seen": 82818016, "step": 39650 }, { "epoch": 6.469043151969982, "grad_norm": 6.25, "learning_rate": 9.230287897230017e-09, "loss": 1.6902, "num_input_tokens_seen": 82827872, "step": 39655 }, { "epoch": 6.469858879190799, "grad_norm": 5.5, "learning_rate": 8.965459940002419e-09, "loss": 2.9727, "num_input_tokens_seen": 82837984, "step": 39660 }, { "epoch": 6.470674606411616, "grad_norm": 9.1875, "learning_rate": 8.704485914357019e-09, "loss": 2.9865, "num_input_tokens_seen": 82847584, "step": 39665 }, { "epoch": 6.471490333632433, "grad_norm": 6.09375, "learning_rate": 8.447365860539402e-09, "loss": 3.0881, "num_input_tokens_seen": 82857392, "step": 39670 }, { "epoch": 6.472306060853251, "grad_norm": 13.6875, "learning_rate": 8.194099818201184e-09, "loss": 2.4919, "num_input_tokens_seen": 82868144, "step": 39675 }, { "epoch": 6.473121788074068, "grad_norm": 5.75, "learning_rate": 7.944687826400011e-09, "loss": 3.0366, "num_input_tokens_seen": 82879424, "step": 39680 }, { "epoch": 6.473937515294885, "grad_norm": 4.71875, "learning_rate": 7.699129923599557e-09, "loss": 1.9371, "num_input_tokens_seen": 82888720, "step": 39685 }, { "epoch": 6.474753242515702, "grad_norm": 5.125, "learning_rate": 7.457426147663982e-09, "loss": 1.9816, "num_input_tokens_seen": 82899440, "step": 39690 }, { "epoch": 6.47556896973652, "grad_norm": 11.75, "learning_rate": 7.219576535871797e-09, "loss": 1.7721, "num_input_tokens_seen": 82908896, "step": 39695 }, { "epoch": 6.4763846969573375, "grad_norm": 6.9375, "learning_rate": 6.985581124896445e-09, "loss": 1.6336, "num_input_tokens_seen": 82919536, "step": 39700 }, { "epoch": 6.477200424178155, "grad_norm": 7.21875, "learning_rate": 6.755439950828501e-09, "loss": 2.6977, "num_input_tokens_seen": 82930080, "step": 39705 }, { "epoch": 6.478016151398972, "grad_norm": 6.0625, "learning_rate": 6.5291530491562444e-09, "loss": 2.0725, "num_input_tokens_seen": 82940800, "step": 39710 }, { "epoch": 6.47883187861979, "grad_norm": 9.6875, "learning_rate": 6.3067204547739845e-09, "loss": 2.6733, "num_input_tokens_seen": 82949920, "step": 39715 }, { "epoch": 6.479647605840607, "grad_norm": 14.0, "learning_rate": 6.088142201987612e-09, "loss": 1.7935, "num_input_tokens_seen": 82961248, "step": 39720 }, { "epoch": 6.480463333061424, "grad_norm": 7.53125, "learning_rate": 5.873418324503499e-09, "loss": 2.5349, "num_input_tokens_seen": 82972816, "step": 39725 }, { "epoch": 6.481279060282241, "grad_norm": 1.21875, "learning_rate": 5.6625488554340465e-09, "loss": 1.1489, "num_input_tokens_seen": 82983440, "step": 39730 }, { "epoch": 6.482094787503059, "grad_norm": 6.46875, "learning_rate": 5.455533827297688e-09, "loss": 2.0956, "num_input_tokens_seen": 82994512, "step": 39735 }, { "epoch": 6.482910514723876, "grad_norm": 5.0, "learning_rate": 5.252373272018885e-09, "loss": 2.0931, "num_input_tokens_seen": 83005104, "step": 39740 }, { "epoch": 6.4837262419446935, "grad_norm": 5.25, "learning_rate": 5.053067220925356e-09, "loss": 2.2688, "num_input_tokens_seen": 83017120, "step": 39745 }, { "epoch": 6.484541969165511, "grad_norm": 10.6875, "learning_rate": 4.857615704759177e-09, "loss": 2.7728, "num_input_tokens_seen": 83025888, "step": 39750 }, { "epoch": 6.485357696386329, "grad_norm": 9.3125, "learning_rate": 4.666018753654577e-09, "loss": 2.358, "num_input_tokens_seen": 83036416, "step": 39755 }, { "epoch": 6.486173423607146, "grad_norm": 6.03125, "learning_rate": 4.478276397162917e-09, "loss": 2.7612, "num_input_tokens_seen": 83046304, "step": 39760 }, { "epoch": 6.486989150827963, "grad_norm": 1.875, "learning_rate": 4.294388664233262e-09, "loss": 2.0276, "num_input_tokens_seen": 83055904, "step": 39765 }, { "epoch": 6.487804878048781, "grad_norm": 9.375, "learning_rate": 4.114355583223484e-09, "loss": 3.4098, "num_input_tokens_seen": 83065600, "step": 39770 }, { "epoch": 6.488620605269598, "grad_norm": 4.4375, "learning_rate": 3.9381771818974845e-09, "loss": 2.1891, "num_input_tokens_seen": 83076960, "step": 39775 }, { "epoch": 6.489436332490415, "grad_norm": 5.25, "learning_rate": 3.765853487427973e-09, "loss": 1.0714, "num_input_tokens_seen": 83088112, "step": 39780 }, { "epoch": 6.490252059711232, "grad_norm": 9.3125, "learning_rate": 3.5973845263825857e-09, "loss": 2.4803, "num_input_tokens_seen": 83097584, "step": 39785 }, { "epoch": 6.49106778693205, "grad_norm": 6.4375, "learning_rate": 3.4327703247488684e-09, "loss": 1.7519, "num_input_tokens_seen": 83106912, "step": 39790 }, { "epoch": 6.4918835141528675, "grad_norm": 8.5, "learning_rate": 3.2720109079037443e-09, "loss": 2.3986, "num_input_tokens_seen": 83117280, "step": 39795 }, { "epoch": 6.492699241373685, "grad_norm": 8.125, "learning_rate": 3.1151063006468193e-09, "loss": 2.6112, "num_input_tokens_seen": 83128704, "step": 39800 }, { "epoch": 6.492699241373685, "eval_loss": 2.5388035774230957, "eval_runtime": 134.3078, "eval_samples_per_second": 20.289, "eval_steps_per_second": 10.148, "num_input_tokens_seen": 83128704, "step": 39800 }, { "epoch": 6.493514968594502, "grad_norm": 5.96875, "learning_rate": 2.962056527169854e-09, "loss": 2.7234, "num_input_tokens_seen": 83139264, "step": 39805 }, { "epoch": 6.49433069581532, "grad_norm": 4.84375, "learning_rate": 2.8128616110761898e-09, "loss": 2.7636, "num_input_tokens_seen": 83149616, "step": 39810 }, { "epoch": 6.495146423036137, "grad_norm": 8.6875, "learning_rate": 2.6675215753724223e-09, "loss": 1.7141, "num_input_tokens_seen": 83160880, "step": 39815 }, { "epoch": 6.495962150256954, "grad_norm": 0.06494140625, "learning_rate": 2.5260364424739557e-09, "loss": 1.15, "num_input_tokens_seen": 83171664, "step": 39820 }, { "epoch": 6.496777877477771, "grad_norm": 0.75, "learning_rate": 2.3884062341994475e-09, "loss": 1.2836, "num_input_tokens_seen": 83182000, "step": 39825 }, { "epoch": 6.497593604698589, "grad_norm": 0.7109375, "learning_rate": 2.25463097177081e-09, "loss": 1.5591, "num_input_tokens_seen": 83190688, "step": 39830 }, { "epoch": 6.498409331919406, "grad_norm": 9.8125, "learning_rate": 2.1247106758215397e-09, "loss": 2.2348, "num_input_tokens_seen": 83199120, "step": 39835 }, { "epoch": 6.499225059140223, "grad_norm": 9.1875, "learning_rate": 1.998645366382834e-09, "loss": 3.8195, "num_input_tokens_seen": 83210832, "step": 39840 }, { "epoch": 6.5000407863610405, "grad_norm": 2.125, "learning_rate": 1.876435062897475e-09, "loss": 3.2435, "num_input_tokens_seen": 83222880, "step": 39845 }, { "epoch": 6.500856513581859, "grad_norm": 11.375, "learning_rate": 1.758079784211497e-09, "loss": 2.74, "num_input_tokens_seen": 83233184, "step": 39850 }, { "epoch": 6.501672240802676, "grad_norm": 9.75, "learning_rate": 1.6435795485797434e-09, "loss": 1.5031, "num_input_tokens_seen": 83243648, "step": 39855 }, { "epoch": 6.502487968023493, "grad_norm": 4.34375, "learning_rate": 1.5329343736547596e-09, "loss": 3.1175, "num_input_tokens_seen": 83254464, "step": 39860 }, { "epoch": 6.50330369524431, "grad_norm": 10.875, "learning_rate": 1.4261442765006739e-09, "loss": 3.1733, "num_input_tokens_seen": 83264800, "step": 39865 }, { "epoch": 6.504119422465128, "grad_norm": 4.40625, "learning_rate": 1.3232092735876445e-09, "loss": 1.3792, "num_input_tokens_seen": 83275616, "step": 39870 }, { "epoch": 6.504935149685945, "grad_norm": 6.65625, "learning_rate": 1.2241293807918607e-09, "loss": 2.2311, "num_input_tokens_seen": 83285712, "step": 39875 }, { "epoch": 6.505750876906762, "grad_norm": 6.78125, "learning_rate": 1.128904613387216e-09, "loss": 2.0064, "num_input_tokens_seen": 83297248, "step": 39880 }, { "epoch": 6.506566604127579, "grad_norm": 7.1875, "learning_rate": 1.0375349860591853e-09, "loss": 2.6212, "num_input_tokens_seen": 83308752, "step": 39885 }, { "epoch": 6.507382331348397, "grad_norm": 7.8125, "learning_rate": 9.5002051290205e-10, "loss": 1.6593, "num_input_tokens_seen": 83317312, "step": 39890 }, { "epoch": 6.5081980585692145, "grad_norm": 5.28125, "learning_rate": 8.663612074077954e-10, "loss": 3.4763, "num_input_tokens_seen": 83327600, "step": 39895 }, { "epoch": 6.509013785790032, "grad_norm": 17.5, "learning_rate": 7.865570824799884e-10, "loss": 2.7803, "num_input_tokens_seen": 83338448, "step": 39900 }, { "epoch": 6.50982951301085, "grad_norm": 9.875, "learning_rate": 7.106081504254514e-10, "loss": 3.8732, "num_input_tokens_seen": 83348896, "step": 39905 }, { "epoch": 6.510645240231667, "grad_norm": 6.75, "learning_rate": 6.385144229570372e-10, "loss": 1.9264, "num_input_tokens_seen": 83359104, "step": 39910 }, { "epoch": 6.511460967452484, "grad_norm": 4.34375, "learning_rate": 5.70275911190854e-10, "loss": 2.5507, "num_input_tokens_seen": 83368384, "step": 39915 }, { "epoch": 6.512276694673301, "grad_norm": 10.8125, "learning_rate": 5.058926256490403e-10, "loss": 2.3565, "num_input_tokens_seen": 83378352, "step": 39920 }, { "epoch": 6.513092421894118, "grad_norm": 8.3125, "learning_rate": 4.4536457626254134e-10, "loss": 3.8525, "num_input_tokens_seen": 83388128, "step": 39925 }, { "epoch": 6.513908149114936, "grad_norm": 10.0, "learning_rate": 3.88691772365557e-10, "loss": 4.0576, "num_input_tokens_seen": 83398288, "step": 39930 }, { "epoch": 6.514723876335753, "grad_norm": 4.96875, "learning_rate": 3.358742226955425e-10, "loss": 1.1232, "num_input_tokens_seen": 83408256, "step": 39935 }, { "epoch": 6.5155396035565705, "grad_norm": 4.28125, "learning_rate": 2.8691193539875925e-10, "loss": 2.312, "num_input_tokens_seen": 83418560, "step": 39940 }, { "epoch": 6.5163553307773885, "grad_norm": 11.6875, "learning_rate": 2.418049180274995e-10, "loss": 2.6746, "num_input_tokens_seen": 83429440, "step": 39945 }, { "epoch": 6.517171057998206, "grad_norm": 9.875, "learning_rate": 2.005531775373104e-10, "loss": 3.634, "num_input_tokens_seen": 83439312, "step": 39950 }, { "epoch": 6.517986785219023, "grad_norm": 7.125, "learning_rate": 1.6315672028699435e-10, "loss": 1.9588, "num_input_tokens_seen": 83449776, "step": 39955 }, { "epoch": 6.51880251243984, "grad_norm": 12.1875, "learning_rate": 1.2961555204693555e-10, "loss": 2.1335, "num_input_tokens_seen": 83460704, "step": 39960 }, { "epoch": 6.519618239660657, "grad_norm": 7.125, "learning_rate": 9.992967798799768e-11, "loss": 2.24, "num_input_tokens_seen": 83470784, "step": 39965 }, { "epoch": 6.520433966881475, "grad_norm": 9.3125, "learning_rate": 7.409910268707521e-11, "loss": 3.2386, "num_input_tokens_seen": 83481760, "step": 39970 }, { "epoch": 6.521249694102292, "grad_norm": 5.53125, "learning_rate": 5.212383012986877e-11, "loss": 1.9327, "num_input_tokens_seen": 83490128, "step": 39975 }, { "epoch": 6.522065421323109, "grad_norm": 0.33984375, "learning_rate": 3.400386370533415e-11, "loss": 2.0816, "num_input_tokens_seen": 83499984, "step": 39980 }, { "epoch": 6.522881148543927, "grad_norm": 7.84375, "learning_rate": 1.9739206205682258e-11, "loss": 3.2304, "num_input_tokens_seen": 83511088, "step": 39985 }, { "epoch": 6.523696875764744, "grad_norm": 9.3125, "learning_rate": 9.329859829154685e-12, "loss": 4.2414, "num_input_tokens_seen": 83522192, "step": 39990 }, { "epoch": 6.524512602985562, "grad_norm": 12.3125, "learning_rate": 2.7758261855748148e-12, "loss": 2.0651, "num_input_tokens_seen": 83532624, "step": 39995 }, { "epoch": 6.525328330206379, "grad_norm": 6.90625, "learning_rate": 7.710628524559838e-14, "loss": 2.4207, "num_input_tokens_seen": 83543088, "step": 40000 }, { "epoch": 6.525328330206379, "eval_loss": 2.5388035774230957, "eval_runtime": 134.1733, "eval_samples_per_second": 20.31, "eval_steps_per_second": 10.159, "num_input_tokens_seen": 83543088, "step": 40000 }, { "epoch": 6.525328330206379, "num_input_tokens_seen": 83543088, "step": 40000, "total_flos": 3.565989290886562e+18, "train_loss": 2.3986957787156107, "train_runtime": 52463.916, "train_samples_per_second": 3.05, "train_steps_per_second": 0.762 } ], "logging_steps": 5, "max_steps": 40000, "num_input_tokens_seen": 83543088, "num_train_epochs": 7, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.565989290886562e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }