{ "best_global_step": 33400, "best_metric": 0.06660357117652893, "best_model_checkpoint": "saves/ia3/llama-3-8b-instruct/train_sst2_1744902621/checkpoint-33400", "epoch": 10.55694866041969, "eval_steps": 200, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013197835554968984, "grad_norm": 3.1349356174468994, "learning_rate": 4.999999876629946e-05, "loss": 0.9631, "num_input_tokens_seen": 4480, "step": 5 }, { "epoch": 0.002639567110993797, "grad_norm": 2.9442992210388184, "learning_rate": 4.999999375439123e-05, "loss": 0.9189, "num_input_tokens_seen": 9024, "step": 10 }, { "epoch": 0.0039593506664906955, "grad_norm": 3.237074136734009, "learning_rate": 4.9999984887169785e-05, "loss": 0.9132, "num_input_tokens_seen": 14176, "step": 15 }, { "epoch": 0.005279134221987594, "grad_norm": 2.991049289703369, "learning_rate": 4.9999972164636506e-05, "loss": 1.098, "num_input_tokens_seen": 18528, "step": 20 }, { "epoch": 0.006598917777484493, "grad_norm": 2.9857192039489746, "learning_rate": 4.999995558679334e-05, "loss": 1.026, "num_input_tokens_seen": 22848, "step": 25 }, { "epoch": 0.007918701332981391, "grad_norm": 3.1116230487823486, "learning_rate": 4.999993515364287e-05, "loss": 0.9916, "num_input_tokens_seen": 27648, "step": 30 }, { "epoch": 0.00923848488847829, "grad_norm": 2.9589507579803467, "learning_rate": 4.999991086518822e-05, "loss": 0.9495, "num_input_tokens_seen": 32256, "step": 35 }, { "epoch": 0.010558268443975187, "grad_norm": 2.8274903297424316, "learning_rate": 4.999988272143315e-05, "loss": 0.9066, "num_input_tokens_seen": 36832, "step": 40 }, { "epoch": 0.011878051999472087, "grad_norm": 2.7399206161499023, "learning_rate": 4.999985072238199e-05, "loss": 0.8802, "num_input_tokens_seen": 41216, "step": 45 }, { "epoch": 0.013197835554968985, "grad_norm": 1.9083120822906494, "learning_rate": 4.999981486803969e-05, "loss": 0.5723, "num_input_tokens_seen": 45536, "step": 50 }, { "epoch": 0.014517619110465884, "grad_norm": 1.5509576797485352, "learning_rate": 4.999977515841176e-05, "loss": 0.4125, "num_input_tokens_seen": 50112, "step": 55 }, { "epoch": 0.015837402665962782, "grad_norm": 0.4766233265399933, "learning_rate": 4.9999731593504344e-05, "loss": 0.22, "num_input_tokens_seen": 54336, "step": 60 }, { "epoch": 0.017157186221459682, "grad_norm": 0.9432277679443359, "learning_rate": 4.999968417332415e-05, "loss": 0.2582, "num_input_tokens_seen": 58688, "step": 65 }, { "epoch": 0.01847696977695658, "grad_norm": 0.5134417414665222, "learning_rate": 4.999963289787848e-05, "loss": 0.162, "num_input_tokens_seen": 63456, "step": 70 }, { "epoch": 0.019796753332453478, "grad_norm": 0.6916096806526184, "learning_rate": 4.999957776717526e-05, "loss": 0.1623, "num_input_tokens_seen": 67840, "step": 75 }, { "epoch": 0.021116536887950375, "grad_norm": 0.42286863923072815, "learning_rate": 4.9999518781222984e-05, "loss": 0.0522, "num_input_tokens_seen": 72192, "step": 80 }, { "epoch": 0.022436320443447275, "grad_norm": 0.8365755677223206, "learning_rate": 4.9999455940030746e-05, "loss": 0.2078, "num_input_tokens_seen": 76960, "step": 85 }, { "epoch": 0.023756103998944175, "grad_norm": 0.22348973155021667, "learning_rate": 4.999938924360824e-05, "loss": 0.1517, "num_input_tokens_seen": 81504, "step": 90 }, { "epoch": 0.02507588755444107, "grad_norm": 0.5905721783638, "learning_rate": 4.999931869196575e-05, "loss": 0.1214, "num_input_tokens_seen": 85696, "step": 95 }, { "epoch": 0.02639567110993797, "grad_norm": 0.593951940536499, "learning_rate": 4.999924428511416e-05, "loss": 0.1393, "num_input_tokens_seen": 90016, "step": 100 }, { "epoch": 0.027715454665434867, "grad_norm": 0.1924770474433899, "learning_rate": 4.999916602306494e-05, "loss": 0.1354, "num_input_tokens_seen": 94528, "step": 105 }, { "epoch": 0.029035238220931767, "grad_norm": 0.9475557208061218, "learning_rate": 4.999908390583016e-05, "loss": 0.1704, "num_input_tokens_seen": 98848, "step": 110 }, { "epoch": 0.030355021776428667, "grad_norm": 0.4281267821788788, "learning_rate": 4.999899793342247e-05, "loss": 0.0849, "num_input_tokens_seen": 103584, "step": 115 }, { "epoch": 0.031674805331925564, "grad_norm": 0.3587918281555176, "learning_rate": 4.999890810585516e-05, "loss": 0.0524, "num_input_tokens_seen": 108160, "step": 120 }, { "epoch": 0.03299458888742246, "grad_norm": 0.9242098331451416, "learning_rate": 4.999881442314206e-05, "loss": 0.1536, "num_input_tokens_seen": 112640, "step": 125 }, { "epoch": 0.034314372442919364, "grad_norm": 0.08543769270181656, "learning_rate": 4.9998716885297617e-05, "loss": 0.178, "num_input_tokens_seen": 116992, "step": 130 }, { "epoch": 0.03563415599841626, "grad_norm": 0.4117920398712158, "learning_rate": 4.999861549233688e-05, "loss": 0.0702, "num_input_tokens_seen": 121440, "step": 135 }, { "epoch": 0.03695393955391316, "grad_norm": 0.25010940432548523, "learning_rate": 4.999851024427548e-05, "loss": 0.067, "num_input_tokens_seen": 126016, "step": 140 }, { "epoch": 0.03827372310941006, "grad_norm": 0.4828072488307953, "learning_rate": 4.999840114112965e-05, "loss": 0.1186, "num_input_tokens_seen": 130304, "step": 145 }, { "epoch": 0.039593506664906956, "grad_norm": 0.5798388123512268, "learning_rate": 4.999828818291621e-05, "loss": 0.1314, "num_input_tokens_seen": 134688, "step": 150 }, { "epoch": 0.04091329022040385, "grad_norm": 0.533508837223053, "learning_rate": 4.999817136965259e-05, "loss": 0.1878, "num_input_tokens_seen": 138784, "step": 155 }, { "epoch": 0.04223307377590075, "grad_norm": 0.2037423849105835, "learning_rate": 4.9998050701356794e-05, "loss": 0.1773, "num_input_tokens_seen": 143392, "step": 160 }, { "epoch": 0.04355285733139765, "grad_norm": 0.3969312310218811, "learning_rate": 4.999792617804744e-05, "loss": 0.1016, "num_input_tokens_seen": 147520, "step": 165 }, { "epoch": 0.04487264088689455, "grad_norm": 0.5613162517547607, "learning_rate": 4.9997797799743724e-05, "loss": 0.1199, "num_input_tokens_seen": 151840, "step": 170 }, { "epoch": 0.046192424442391446, "grad_norm": 0.7909346222877502, "learning_rate": 4.999766556646545e-05, "loss": 0.1986, "num_input_tokens_seen": 156160, "step": 175 }, { "epoch": 0.04751220799788835, "grad_norm": 0.7198038101196289, "learning_rate": 4.9997529478232996e-05, "loss": 0.1177, "num_input_tokens_seen": 160480, "step": 180 }, { "epoch": 0.048831991553385246, "grad_norm": 0.8182074427604675, "learning_rate": 4.9997389535067365e-05, "loss": 0.1799, "num_input_tokens_seen": 164992, "step": 185 }, { "epoch": 0.05015177510888214, "grad_norm": 0.6495415568351746, "learning_rate": 4.999724573699012e-05, "loss": 0.2043, "num_input_tokens_seen": 169600, "step": 190 }, { "epoch": 0.05147155866437904, "grad_norm": 0.4754011929035187, "learning_rate": 4.9997098084023457e-05, "loss": 0.1368, "num_input_tokens_seen": 174048, "step": 195 }, { "epoch": 0.05279134221987594, "grad_norm": 0.1435583531856537, "learning_rate": 4.999694657619013e-05, "loss": 0.0985, "num_input_tokens_seen": 178464, "step": 200 }, { "epoch": 0.05279134221987594, "eval_loss": 0.12938973307609558, "eval_runtime": 64.7462, "eval_samples_per_second": 104.022, "eval_steps_per_second": 26.009, "num_input_tokens_seen": 178464, "step": 200 }, { "epoch": 0.05411112577537284, "grad_norm": 0.25542405247688293, "learning_rate": 4.999679121351352e-05, "loss": 0.2034, "num_input_tokens_seen": 182784, "step": 205 }, { "epoch": 0.055430909330869735, "grad_norm": 0.7231872081756592, "learning_rate": 4.9996631996017565e-05, "loss": 0.1956, "num_input_tokens_seen": 187616, "step": 210 }, { "epoch": 0.05675069288636664, "grad_norm": 0.4120265245437622, "learning_rate": 4.9996468923726835e-05, "loss": 0.0657, "num_input_tokens_seen": 192224, "step": 215 }, { "epoch": 0.058070476441863535, "grad_norm": 0.32709211111068726, "learning_rate": 4.999630199666647e-05, "loss": 0.1156, "num_input_tokens_seen": 196896, "step": 220 }, { "epoch": 0.05939025999736043, "grad_norm": 0.28501302003860474, "learning_rate": 4.999613121486222e-05, "loss": 0.094, "num_input_tokens_seen": 201184, "step": 225 }, { "epoch": 0.060710043552857335, "grad_norm": 0.29709622263908386, "learning_rate": 4.999595657834041e-05, "loss": 0.1614, "num_input_tokens_seen": 205664, "step": 230 }, { "epoch": 0.06202982710835423, "grad_norm": 0.8627525568008423, "learning_rate": 4.999577808712798e-05, "loss": 0.1521, "num_input_tokens_seen": 210112, "step": 235 }, { "epoch": 0.06334961066385113, "grad_norm": 0.5421628952026367, "learning_rate": 4.999559574125244e-05, "loss": 0.0883, "num_input_tokens_seen": 214656, "step": 240 }, { "epoch": 0.06466939421934803, "grad_norm": 0.393868625164032, "learning_rate": 4.9995409540741934e-05, "loss": 0.0978, "num_input_tokens_seen": 219072, "step": 245 }, { "epoch": 0.06598917777484492, "grad_norm": 0.28029927611351013, "learning_rate": 4.999521948562516e-05, "loss": 0.1211, "num_input_tokens_seen": 223520, "step": 250 }, { "epoch": 0.06730896133034182, "grad_norm": 0.622859001159668, "learning_rate": 4.999502557593143e-05, "loss": 0.0853, "num_input_tokens_seen": 227936, "step": 255 }, { "epoch": 0.06862874488583873, "grad_norm": 0.47244149446487427, "learning_rate": 4.999482781169066e-05, "loss": 0.1765, "num_input_tokens_seen": 232128, "step": 260 }, { "epoch": 0.06994852844133562, "grad_norm": 0.2308342307806015, "learning_rate": 4.9994626192933324e-05, "loss": 0.0921, "num_input_tokens_seen": 236800, "step": 265 }, { "epoch": 0.07126831199683252, "grad_norm": 0.16735781729221344, "learning_rate": 4.999442071969054e-05, "loss": 0.1109, "num_input_tokens_seen": 241280, "step": 270 }, { "epoch": 0.07258809555232942, "grad_norm": 0.29746511578559875, "learning_rate": 4.999421139199397e-05, "loss": 0.1201, "num_input_tokens_seen": 245888, "step": 275 }, { "epoch": 0.07390787910782631, "grad_norm": 0.47787079215049744, "learning_rate": 4.999399820987592e-05, "loss": 0.1735, "num_input_tokens_seen": 250496, "step": 280 }, { "epoch": 0.07522766266332322, "grad_norm": 0.6780644059181213, "learning_rate": 4.999378117336924e-05, "loss": 0.1371, "num_input_tokens_seen": 254944, "step": 285 }, { "epoch": 0.07654744621882012, "grad_norm": 0.8207125663757324, "learning_rate": 4.9993560282507415e-05, "loss": 0.1502, "num_input_tokens_seen": 259392, "step": 290 }, { "epoch": 0.07786722977431701, "grad_norm": 0.19681544601917267, "learning_rate": 4.9993335537324495e-05, "loss": 0.0913, "num_input_tokens_seen": 263872, "step": 295 }, { "epoch": 0.07918701332981391, "grad_norm": 0.2524445950984955, "learning_rate": 4.999310693785516e-05, "loss": 0.0636, "num_input_tokens_seen": 268384, "step": 300 }, { "epoch": 0.0805067968853108, "grad_norm": 0.28487133979797363, "learning_rate": 4.9992874484134653e-05, "loss": 0.0887, "num_input_tokens_seen": 272736, "step": 305 }, { "epoch": 0.0818265804408077, "grad_norm": 0.6542393565177917, "learning_rate": 4.999263817619882e-05, "loss": 0.0947, "num_input_tokens_seen": 277376, "step": 310 }, { "epoch": 0.08314636399630461, "grad_norm": 0.25458741188049316, "learning_rate": 4.9992398014084105e-05, "loss": 0.1472, "num_input_tokens_seen": 281664, "step": 315 }, { "epoch": 0.0844661475518015, "grad_norm": 0.16027264297008514, "learning_rate": 4.999215399782754e-05, "loss": 0.1076, "num_input_tokens_seen": 286304, "step": 320 }, { "epoch": 0.0857859311072984, "grad_norm": 0.4422883987426758, "learning_rate": 4.999190612746675e-05, "loss": 0.1528, "num_input_tokens_seen": 290368, "step": 325 }, { "epoch": 0.0871057146627953, "grad_norm": 0.8368133902549744, "learning_rate": 4.999165440303998e-05, "loss": 0.1126, "num_input_tokens_seen": 294624, "step": 330 }, { "epoch": 0.0884254982182922, "grad_norm": 0.23252594470977783, "learning_rate": 4.999139882458603e-05, "loss": 0.1385, "num_input_tokens_seen": 299168, "step": 335 }, { "epoch": 0.0897452817737891, "grad_norm": 0.4841640293598175, "learning_rate": 4.9991139392144314e-05, "loss": 0.1812, "num_input_tokens_seen": 303936, "step": 340 }, { "epoch": 0.091065065329286, "grad_norm": 0.1294574737548828, "learning_rate": 4.999087610575485e-05, "loss": 0.0741, "num_input_tokens_seen": 308384, "step": 345 }, { "epoch": 0.09238484888478289, "grad_norm": 0.4516092836856842, "learning_rate": 4.999060896545824e-05, "loss": 0.1173, "num_input_tokens_seen": 312768, "step": 350 }, { "epoch": 0.0937046324402798, "grad_norm": 0.5316590070724487, "learning_rate": 4.999033797129568e-05, "loss": 0.0749, "num_input_tokens_seen": 317248, "step": 355 }, { "epoch": 0.0950244159957767, "grad_norm": 0.5188689827919006, "learning_rate": 4.999006312330894e-05, "loss": 0.1695, "num_input_tokens_seen": 321760, "step": 360 }, { "epoch": 0.09634419955127359, "grad_norm": 0.2947482466697693, "learning_rate": 4.998978442154043e-05, "loss": 0.0884, "num_input_tokens_seen": 326176, "step": 365 }, { "epoch": 0.09766398310677049, "grad_norm": 0.44138023257255554, "learning_rate": 4.9989501866033125e-05, "loss": 0.1289, "num_input_tokens_seen": 330592, "step": 370 }, { "epoch": 0.0989837666622674, "grad_norm": 0.3255839943885803, "learning_rate": 4.998921545683059e-05, "loss": 0.0932, "num_input_tokens_seen": 334912, "step": 375 }, { "epoch": 0.10030355021776428, "grad_norm": 0.4621589183807373, "learning_rate": 4.9988925193976996e-05, "loss": 0.1511, "num_input_tokens_seen": 339840, "step": 380 }, { "epoch": 0.10162333377326119, "grad_norm": 0.33085736632347107, "learning_rate": 4.998863107751711e-05, "loss": 0.144, "num_input_tokens_seen": 344064, "step": 385 }, { "epoch": 0.10294311732875808, "grad_norm": 0.5871715545654297, "learning_rate": 4.998833310749629e-05, "loss": 0.1912, "num_input_tokens_seen": 348512, "step": 390 }, { "epoch": 0.10426290088425498, "grad_norm": 0.07381974905729294, "learning_rate": 4.998803128396047e-05, "loss": 0.0682, "num_input_tokens_seen": 352960, "step": 395 }, { "epoch": 0.10558268443975188, "grad_norm": 0.5194829702377319, "learning_rate": 4.9987725606956215e-05, "loss": 0.0784, "num_input_tokens_seen": 357184, "step": 400 }, { "epoch": 0.10558268443975188, "eval_loss": 0.12012168020009995, "eval_runtime": 64.7271, "eval_samples_per_second": 104.052, "eval_steps_per_second": 26.017, "num_input_tokens_seen": 357184, "step": 400 }, { "epoch": 0.10690246799524877, "grad_norm": 0.3671155869960785, "learning_rate": 4.998741607653066e-05, "loss": 0.0875, "num_input_tokens_seen": 361888, "step": 405 }, { "epoch": 0.10822225155074568, "grad_norm": 0.12836100161075592, "learning_rate": 4.9987102692731523e-05, "loss": 0.0726, "num_input_tokens_seen": 366464, "step": 410 }, { "epoch": 0.10954203510624258, "grad_norm": 0.20675607025623322, "learning_rate": 4.9986785455607157e-05, "loss": 0.1098, "num_input_tokens_seen": 371008, "step": 415 }, { "epoch": 0.11086181866173947, "grad_norm": 0.10741525143384933, "learning_rate": 4.9986464365206456e-05, "loss": 0.0859, "num_input_tokens_seen": 375424, "step": 420 }, { "epoch": 0.11218160221723637, "grad_norm": 0.44301462173461914, "learning_rate": 4.9986139421578956e-05, "loss": 0.0724, "num_input_tokens_seen": 379872, "step": 425 }, { "epoch": 0.11350138577273328, "grad_norm": 0.22900453209877014, "learning_rate": 4.998581062477477e-05, "loss": 0.0637, "num_input_tokens_seen": 384320, "step": 430 }, { "epoch": 0.11482116932823017, "grad_norm": 0.24082441627979279, "learning_rate": 4.998547797484458e-05, "loss": 0.1081, "num_input_tokens_seen": 388448, "step": 435 }, { "epoch": 0.11614095288372707, "grad_norm": 0.38517895340919495, "learning_rate": 4.9985141471839706e-05, "loss": 0.1165, "num_input_tokens_seen": 393280, "step": 440 }, { "epoch": 0.11746073643922397, "grad_norm": 0.15212547779083252, "learning_rate": 4.998480111581203e-05, "loss": 0.2165, "num_input_tokens_seen": 397792, "step": 445 }, { "epoch": 0.11878051999472086, "grad_norm": 0.19099442660808563, "learning_rate": 4.998445690681405e-05, "loss": 0.0758, "num_input_tokens_seen": 402208, "step": 450 }, { "epoch": 0.12010030355021777, "grad_norm": 0.8132048845291138, "learning_rate": 4.9984108844898834e-05, "loss": 0.2056, "num_input_tokens_seen": 406720, "step": 455 }, { "epoch": 0.12142008710571467, "grad_norm": 0.10390175133943558, "learning_rate": 4.9983756930120076e-05, "loss": 0.0776, "num_input_tokens_seen": 411040, "step": 460 }, { "epoch": 0.12273987066121156, "grad_norm": 0.234686940908432, "learning_rate": 4.9983401162532025e-05, "loss": 0.1167, "num_input_tokens_seen": 415648, "step": 465 }, { "epoch": 0.12405965421670846, "grad_norm": 0.6214133501052856, "learning_rate": 4.998304154218955e-05, "loss": 0.125, "num_input_tokens_seen": 420224, "step": 470 }, { "epoch": 0.12537943777220537, "grad_norm": 0.15896718204021454, "learning_rate": 4.998267806914812e-05, "loss": 0.0619, "num_input_tokens_seen": 424768, "step": 475 }, { "epoch": 0.12669922132770225, "grad_norm": 0.8189268708229065, "learning_rate": 4.998231074346378e-05, "loss": 0.1535, "num_input_tokens_seen": 429280, "step": 480 }, { "epoch": 0.12801900488319914, "grad_norm": 0.1861373335123062, "learning_rate": 4.998193956519317e-05, "loss": 0.1155, "num_input_tokens_seen": 433600, "step": 485 }, { "epoch": 0.12933878843869606, "grad_norm": 0.4415060579776764, "learning_rate": 4.9981564534393545e-05, "loss": 0.0966, "num_input_tokens_seen": 438048, "step": 490 }, { "epoch": 0.13065857199419295, "grad_norm": 0.5352500677108765, "learning_rate": 4.998118565112272e-05, "loss": 0.102, "num_input_tokens_seen": 442304, "step": 495 }, { "epoch": 0.13197835554968984, "grad_norm": 0.433537095785141, "learning_rate": 4.998080291543914e-05, "loss": 0.1439, "num_input_tokens_seen": 446976, "step": 500 }, { "epoch": 0.13329813910518676, "grad_norm": 0.5941283702850342, "learning_rate": 4.9980416327401826e-05, "loss": 0.0704, "num_input_tokens_seen": 451104, "step": 505 }, { "epoch": 0.13461792266068365, "grad_norm": 0.21125994622707367, "learning_rate": 4.998002588707038e-05, "loss": 0.0957, "num_input_tokens_seen": 455712, "step": 510 }, { "epoch": 0.13593770621618054, "grad_norm": 0.39476466178894043, "learning_rate": 4.997963159450503e-05, "loss": 0.0584, "num_input_tokens_seen": 460416, "step": 515 }, { "epoch": 0.13725748977167745, "grad_norm": 0.11853265762329102, "learning_rate": 4.9979233449766575e-05, "loss": 0.151, "num_input_tokens_seen": 464768, "step": 520 }, { "epoch": 0.13857727332717434, "grad_norm": 0.4697645902633667, "learning_rate": 4.997883145291641e-05, "loss": 0.1427, "num_input_tokens_seen": 469120, "step": 525 }, { "epoch": 0.13989705688267123, "grad_norm": 0.7779855132102966, "learning_rate": 4.9978425604016536e-05, "loss": 0.1142, "num_input_tokens_seen": 473376, "step": 530 }, { "epoch": 0.14121684043816815, "grad_norm": 0.25538647174835205, "learning_rate": 4.9978015903129536e-05, "loss": 0.0926, "num_input_tokens_seen": 477824, "step": 535 }, { "epoch": 0.14253662399366504, "grad_norm": 0.15424871444702148, "learning_rate": 4.997760235031859e-05, "loss": 0.1274, "num_input_tokens_seen": 482304, "step": 540 }, { "epoch": 0.14385640754916193, "grad_norm": 0.3539102375507355, "learning_rate": 4.9977184945647473e-05, "loss": 0.114, "num_input_tokens_seen": 486720, "step": 545 }, { "epoch": 0.14517619110465885, "grad_norm": 0.6351729035377502, "learning_rate": 4.997676368918055e-05, "loss": 0.1305, "num_input_tokens_seen": 491328, "step": 550 }, { "epoch": 0.14649597466015574, "grad_norm": 0.5514878630638123, "learning_rate": 4.9976338580982794e-05, "loss": 0.1498, "num_input_tokens_seen": 495616, "step": 555 }, { "epoch": 0.14781575821565263, "grad_norm": 0.1769968867301941, "learning_rate": 4.9975909621119755e-05, "loss": 0.0975, "num_input_tokens_seen": 499936, "step": 560 }, { "epoch": 0.14913554177114954, "grad_norm": 0.4349355101585388, "learning_rate": 4.997547680965758e-05, "loss": 0.1023, "num_input_tokens_seen": 504864, "step": 565 }, { "epoch": 0.15045532532664643, "grad_norm": 0.3109443783760071, "learning_rate": 4.997504014666302e-05, "loss": 0.0888, "num_input_tokens_seen": 509312, "step": 570 }, { "epoch": 0.15177510888214332, "grad_norm": 0.27755749225616455, "learning_rate": 4.997459963220342e-05, "loss": 0.0868, "num_input_tokens_seen": 513824, "step": 575 }, { "epoch": 0.15309489243764024, "grad_norm": 0.654373824596405, "learning_rate": 4.997415526634671e-05, "loss": 0.1699, "num_input_tokens_seen": 518208, "step": 580 }, { "epoch": 0.15441467599313713, "grad_norm": 0.65253746509552, "learning_rate": 4.99737070491614e-05, "loss": 0.1384, "num_input_tokens_seen": 522592, "step": 585 }, { "epoch": 0.15573445954863402, "grad_norm": 0.7733871340751648, "learning_rate": 4.997325498071663e-05, "loss": 0.2244, "num_input_tokens_seen": 527040, "step": 590 }, { "epoch": 0.15705424310413094, "grad_norm": 0.7715486288070679, "learning_rate": 4.997279906108211e-05, "loss": 0.1591, "num_input_tokens_seen": 531296, "step": 595 }, { "epoch": 0.15837402665962783, "grad_norm": 0.0990673080086708, "learning_rate": 4.9972339290328155e-05, "loss": 0.0533, "num_input_tokens_seen": 535488, "step": 600 }, { "epoch": 0.15837402665962783, "eval_loss": 0.1138034537434578, "eval_runtime": 64.6906, "eval_samples_per_second": 104.111, "eval_steps_per_second": 26.032, "num_input_tokens_seen": 535488, "step": 600 }, { "epoch": 0.15969381021512472, "grad_norm": 0.33363252878189087, "learning_rate": 4.9971875668525646e-05, "loss": 0.1122, "num_input_tokens_seen": 540000, "step": 605 }, { "epoch": 0.1610135937706216, "grad_norm": 0.49440857768058777, "learning_rate": 4.997140819574609e-05, "loss": 0.1298, "num_input_tokens_seen": 544256, "step": 610 }, { "epoch": 0.16233337732611852, "grad_norm": 0.2718019485473633, "learning_rate": 4.997093687206159e-05, "loss": 0.0871, "num_input_tokens_seen": 548704, "step": 615 }, { "epoch": 0.1636531608816154, "grad_norm": 0.13265176117420197, "learning_rate": 4.997046169754482e-05, "loss": 0.0931, "num_input_tokens_seen": 553184, "step": 620 }, { "epoch": 0.1649729444371123, "grad_norm": 0.25224459171295166, "learning_rate": 4.996998267226905e-05, "loss": 0.1366, "num_input_tokens_seen": 557664, "step": 625 }, { "epoch": 0.16629272799260922, "grad_norm": 0.31581202149391174, "learning_rate": 4.996949979630817e-05, "loss": 0.0719, "num_input_tokens_seen": 562080, "step": 630 }, { "epoch": 0.1676125115481061, "grad_norm": 0.4302205443382263, "learning_rate": 4.996901306973663e-05, "loss": 0.144, "num_input_tokens_seen": 566528, "step": 635 }, { "epoch": 0.168932295103603, "grad_norm": 0.27955928444862366, "learning_rate": 4.996852249262949e-05, "loss": 0.0589, "num_input_tokens_seen": 571040, "step": 640 }, { "epoch": 0.17025207865909991, "grad_norm": 0.2058146893978119, "learning_rate": 4.996802806506241e-05, "loss": 0.0682, "num_input_tokens_seen": 575488, "step": 645 }, { "epoch": 0.1715718622145968, "grad_norm": 0.29260969161987305, "learning_rate": 4.996752978711164e-05, "loss": 0.1596, "num_input_tokens_seen": 580192, "step": 650 }, { "epoch": 0.1728916457700937, "grad_norm": 0.3859860897064209, "learning_rate": 4.996702765885401e-05, "loss": 0.1154, "num_input_tokens_seen": 584768, "step": 655 }, { "epoch": 0.1742114293255906, "grad_norm": 0.34305039048194885, "learning_rate": 4.9966521680366964e-05, "loss": 0.1267, "num_input_tokens_seen": 588928, "step": 660 }, { "epoch": 0.1755312128810875, "grad_norm": 0.5077118277549744, "learning_rate": 4.9966011851728524e-05, "loss": 0.1181, "num_input_tokens_seen": 593248, "step": 665 }, { "epoch": 0.1768509964365844, "grad_norm": 0.15490347146987915, "learning_rate": 4.996549817301731e-05, "loss": 0.1128, "num_input_tokens_seen": 597728, "step": 670 }, { "epoch": 0.1781707799920813, "grad_norm": 0.15187938511371613, "learning_rate": 4.9964980644312544e-05, "loss": 0.0793, "num_input_tokens_seen": 601888, "step": 675 }, { "epoch": 0.1794905635475782, "grad_norm": 0.3256392478942871, "learning_rate": 4.996445926569403e-05, "loss": 0.1067, "num_input_tokens_seen": 606336, "step": 680 }, { "epoch": 0.1808103471030751, "grad_norm": 0.2006968855857849, "learning_rate": 4.996393403724218e-05, "loss": 0.0908, "num_input_tokens_seen": 610912, "step": 685 }, { "epoch": 0.182130130658572, "grad_norm": 0.24015803635120392, "learning_rate": 4.9963404959037985e-05, "loss": 0.076, "num_input_tokens_seen": 615392, "step": 690 }, { "epoch": 0.1834499142140689, "grad_norm": 0.7320178151130676, "learning_rate": 4.996287203116303e-05, "loss": 0.1118, "num_input_tokens_seen": 620000, "step": 695 }, { "epoch": 0.18476969776956578, "grad_norm": 0.22626900672912598, "learning_rate": 4.996233525369951e-05, "loss": 0.1456, "num_input_tokens_seen": 624704, "step": 700 }, { "epoch": 0.1860894813250627, "grad_norm": 0.4822400212287903, "learning_rate": 4.99617946267302e-05, "loss": 0.1091, "num_input_tokens_seen": 629088, "step": 705 }, { "epoch": 0.1874092648805596, "grad_norm": 0.17198865115642548, "learning_rate": 4.996125015033846e-05, "loss": 0.1028, "num_input_tokens_seen": 633632, "step": 710 }, { "epoch": 0.18872904843605648, "grad_norm": 0.2185218632221222, "learning_rate": 4.996070182460827e-05, "loss": 0.1406, "num_input_tokens_seen": 637888, "step": 715 }, { "epoch": 0.1900488319915534, "grad_norm": 0.12374971807003021, "learning_rate": 4.996014964962418e-05, "loss": 0.0775, "num_input_tokens_seen": 642240, "step": 720 }, { "epoch": 0.19136861554705029, "grad_norm": 0.33605867624282837, "learning_rate": 4.9959593625471344e-05, "loss": 0.1513, "num_input_tokens_seen": 646784, "step": 725 }, { "epoch": 0.19268839910254718, "grad_norm": 0.5107312798500061, "learning_rate": 4.995903375223552e-05, "loss": 0.115, "num_input_tokens_seen": 651424, "step": 730 }, { "epoch": 0.1940081826580441, "grad_norm": 0.3866291046142578, "learning_rate": 4.995847003000302e-05, "loss": 0.1214, "num_input_tokens_seen": 655712, "step": 735 }, { "epoch": 0.19532796621354098, "grad_norm": 0.19646111130714417, "learning_rate": 4.9957902458860804e-05, "loss": 0.1104, "num_input_tokens_seen": 660448, "step": 740 }, { "epoch": 0.19664774976903787, "grad_norm": 0.40927743911743164, "learning_rate": 4.995733103889639e-05, "loss": 0.1318, "num_input_tokens_seen": 664960, "step": 745 }, { "epoch": 0.1979675333245348, "grad_norm": 0.1660161167383194, "learning_rate": 4.99567557701979e-05, "loss": 0.1034, "num_input_tokens_seen": 669536, "step": 750 }, { "epoch": 0.19928731688003168, "grad_norm": 0.4235718250274658, "learning_rate": 4.995617665285403e-05, "loss": 0.0837, "num_input_tokens_seen": 674016, "step": 755 }, { "epoch": 0.20060710043552857, "grad_norm": 0.45117032527923584, "learning_rate": 4.99555936869541e-05, "loss": 0.0691, "num_input_tokens_seen": 678592, "step": 760 }, { "epoch": 0.20192688399102549, "grad_norm": 0.2279418706893921, "learning_rate": 4.995500687258803e-05, "loss": 0.1328, "num_input_tokens_seen": 682944, "step": 765 }, { "epoch": 0.20324666754652237, "grad_norm": 0.4901421070098877, "learning_rate": 4.995441620984628e-05, "loss": 0.1006, "num_input_tokens_seen": 687328, "step": 770 }, { "epoch": 0.20456645110201926, "grad_norm": 0.7021390199661255, "learning_rate": 4.995382169881996e-05, "loss": 0.1481, "num_input_tokens_seen": 691968, "step": 775 }, { "epoch": 0.20588623465751615, "grad_norm": 0.3093251585960388, "learning_rate": 4.9953223339600755e-05, "loss": 0.0627, "num_input_tokens_seen": 696512, "step": 780 }, { "epoch": 0.20720601821301307, "grad_norm": 0.23922395706176758, "learning_rate": 4.995262113228091e-05, "loss": 0.0906, "num_input_tokens_seen": 701088, "step": 785 }, { "epoch": 0.20852580176850996, "grad_norm": 0.4129209518432617, "learning_rate": 4.995201507695332e-05, "loss": 0.0968, "num_input_tokens_seen": 705472, "step": 790 }, { "epoch": 0.20984558532400685, "grad_norm": 0.3629891276359558, "learning_rate": 4.995140517371144e-05, "loss": 0.1401, "num_input_tokens_seen": 709984, "step": 795 }, { "epoch": 0.21116536887950377, "grad_norm": 0.18731938302516937, "learning_rate": 4.995079142264932e-05, "loss": 0.0606, "num_input_tokens_seen": 714592, "step": 800 }, { "epoch": 0.21116536887950377, "eval_loss": 0.10932759195566177, "eval_runtime": 64.7822, "eval_samples_per_second": 103.964, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 714592, "step": 800 }, { "epoch": 0.21248515243500066, "grad_norm": 0.2243635207414627, "learning_rate": 4.995017382386162e-05, "loss": 0.0957, "num_input_tokens_seen": 719168, "step": 805 }, { "epoch": 0.21380493599049755, "grad_norm": 0.24593189358711243, "learning_rate": 4.994955237744356e-05, "loss": 0.0957, "num_input_tokens_seen": 723840, "step": 810 }, { "epoch": 0.21512471954599446, "grad_norm": 0.21937479078769684, "learning_rate": 4.994892708349101e-05, "loss": 0.0539, "num_input_tokens_seen": 728096, "step": 815 }, { "epoch": 0.21644450310149135, "grad_norm": 0.24053339660167694, "learning_rate": 4.994829794210035e-05, "loss": 0.0675, "num_input_tokens_seen": 732736, "step": 820 }, { "epoch": 0.21776428665698824, "grad_norm": 0.2171311229467392, "learning_rate": 4.994766495336864e-05, "loss": 0.1277, "num_input_tokens_seen": 737088, "step": 825 }, { "epoch": 0.21908407021248516, "grad_norm": 0.2992972135543823, "learning_rate": 4.994702811739348e-05, "loss": 0.1271, "num_input_tokens_seen": 741664, "step": 830 }, { "epoch": 0.22040385376798205, "grad_norm": 0.3149014413356781, "learning_rate": 4.994638743427308e-05, "loss": 0.1334, "num_input_tokens_seen": 746048, "step": 835 }, { "epoch": 0.22172363732347894, "grad_norm": 0.17598281800746918, "learning_rate": 4.994574290410624e-05, "loss": 0.1007, "num_input_tokens_seen": 750880, "step": 840 }, { "epoch": 0.22304342087897586, "grad_norm": 0.20840731263160706, "learning_rate": 4.9945094526992364e-05, "loss": 0.1269, "num_input_tokens_seen": 755232, "step": 845 }, { "epoch": 0.22436320443447275, "grad_norm": 0.24480533599853516, "learning_rate": 4.994444230303142e-05, "loss": 0.1159, "num_input_tokens_seen": 759712, "step": 850 }, { "epoch": 0.22568298798996964, "grad_norm": 0.2270325869321823, "learning_rate": 4.994378623232402e-05, "loss": 0.0653, "num_input_tokens_seen": 764064, "step": 855 }, { "epoch": 0.22700277154546655, "grad_norm": 0.4034910798072815, "learning_rate": 4.99431263149713e-05, "loss": 0.1317, "num_input_tokens_seen": 768544, "step": 860 }, { "epoch": 0.22832255510096344, "grad_norm": 0.2449798285961151, "learning_rate": 4.9942462551075056e-05, "loss": 0.0562, "num_input_tokens_seen": 773152, "step": 865 }, { "epoch": 0.22964233865646033, "grad_norm": 0.5677728056907654, "learning_rate": 4.994179494073764e-05, "loss": 0.1719, "num_input_tokens_seen": 777440, "step": 870 }, { "epoch": 0.23096212221195725, "grad_norm": 0.22239822149276733, "learning_rate": 4.9941123484062e-05, "loss": 0.0922, "num_input_tokens_seen": 782016, "step": 875 }, { "epoch": 0.23228190576745414, "grad_norm": 0.5216113328933716, "learning_rate": 4.99404481811517e-05, "loss": 0.1556, "num_input_tokens_seen": 786336, "step": 880 }, { "epoch": 0.23360168932295103, "grad_norm": 0.14891619980335236, "learning_rate": 4.9939769032110864e-05, "loss": 0.1041, "num_input_tokens_seen": 790976, "step": 885 }, { "epoch": 0.23492147287844795, "grad_norm": 0.16395199298858643, "learning_rate": 4.993908603704423e-05, "loss": 0.079, "num_input_tokens_seen": 795456, "step": 890 }, { "epoch": 0.23624125643394484, "grad_norm": 0.5704664587974548, "learning_rate": 4.9938399196057126e-05, "loss": 0.1439, "num_input_tokens_seen": 799808, "step": 895 }, { "epoch": 0.23756103998944172, "grad_norm": 0.23360532522201538, "learning_rate": 4.993770850925547e-05, "loss": 0.0771, "num_input_tokens_seen": 804576, "step": 900 }, { "epoch": 0.23888082354493864, "grad_norm": 0.27320852875709534, "learning_rate": 4.993701397674577e-05, "loss": 0.0758, "num_input_tokens_seen": 809248, "step": 905 }, { "epoch": 0.24020060710043553, "grad_norm": 0.40846580266952515, "learning_rate": 4.993631559863515e-05, "loss": 0.1505, "num_input_tokens_seen": 813888, "step": 910 }, { "epoch": 0.24152039065593242, "grad_norm": 0.4350108802318573, "learning_rate": 4.9935613375031283e-05, "loss": 0.1104, "num_input_tokens_seen": 817952, "step": 915 }, { "epoch": 0.24284017421142934, "grad_norm": 0.4193384051322937, "learning_rate": 4.993490730604248e-05, "loss": 0.0823, "num_input_tokens_seen": 822368, "step": 920 }, { "epoch": 0.24415995776692623, "grad_norm": 0.25415298342704773, "learning_rate": 4.993419739177761e-05, "loss": 0.1079, "num_input_tokens_seen": 826816, "step": 925 }, { "epoch": 0.24547974132242312, "grad_norm": 0.10651128739118576, "learning_rate": 4.9933483632346164e-05, "loss": 0.1071, "num_input_tokens_seen": 831104, "step": 930 }, { "epoch": 0.24679952487792003, "grad_norm": 0.2735975980758667, "learning_rate": 4.993276602785821e-05, "loss": 0.1139, "num_input_tokens_seen": 835680, "step": 935 }, { "epoch": 0.24811930843341692, "grad_norm": 0.3815348148345947, "learning_rate": 4.993204457842441e-05, "loss": 0.0761, "num_input_tokens_seen": 840352, "step": 940 }, { "epoch": 0.2494390919889138, "grad_norm": 0.4771551191806793, "learning_rate": 4.993131928415602e-05, "loss": 0.0788, "num_input_tokens_seen": 844800, "step": 945 }, { "epoch": 0.25075887554441073, "grad_norm": 0.24847100675106049, "learning_rate": 4.993059014516489e-05, "loss": 0.0904, "num_input_tokens_seen": 849152, "step": 950 }, { "epoch": 0.2520786590999076, "grad_norm": 0.16576330363750458, "learning_rate": 4.9929857161563464e-05, "loss": 0.0881, "num_input_tokens_seen": 853632, "step": 955 }, { "epoch": 0.2533984426554045, "grad_norm": 0.12331245094537735, "learning_rate": 4.992912033346477e-05, "loss": 0.0996, "num_input_tokens_seen": 858336, "step": 960 }, { "epoch": 0.2547182262109014, "grad_norm": 0.28777411580085754, "learning_rate": 4.992837966098245e-05, "loss": 0.1248, "num_input_tokens_seen": 862528, "step": 965 }, { "epoch": 0.2560380097663983, "grad_norm": 0.22176648676395416, "learning_rate": 4.992763514423071e-05, "loss": 0.08, "num_input_tokens_seen": 866976, "step": 970 }, { "epoch": 0.25735779332189523, "grad_norm": 0.42294949293136597, "learning_rate": 4.992688678332437e-05, "loss": 0.0884, "num_input_tokens_seen": 871456, "step": 975 }, { "epoch": 0.2586775768773921, "grad_norm": 0.44421321153640747, "learning_rate": 4.992613457837884e-05, "loss": 0.1199, "num_input_tokens_seen": 876128, "step": 980 }, { "epoch": 0.259997360432889, "grad_norm": 0.5526481866836548, "learning_rate": 4.992537852951011e-05, "loss": 0.1402, "num_input_tokens_seen": 880352, "step": 985 }, { "epoch": 0.2613171439883859, "grad_norm": 0.32688695192337036, "learning_rate": 4.9924618636834785e-05, "loss": 0.1268, "num_input_tokens_seen": 884672, "step": 990 }, { "epoch": 0.2626369275438828, "grad_norm": 0.17850558459758759, "learning_rate": 4.9923854900470046e-05, "loss": 0.1902, "num_input_tokens_seen": 888800, "step": 995 }, { "epoch": 0.2639567110993797, "grad_norm": 0.15969976782798767, "learning_rate": 4.992308732053367e-05, "loss": 0.0847, "num_input_tokens_seen": 893216, "step": 1000 }, { "epoch": 0.2639567110993797, "eval_loss": 0.10668887197971344, "eval_runtime": 64.7406, "eval_samples_per_second": 104.031, "eval_steps_per_second": 26.012, "num_input_tokens_seen": 893216, "step": 1000 }, { "epoch": 0.2652764946548766, "grad_norm": 0.27260082960128784, "learning_rate": 4.992231589714402e-05, "loss": 0.0547, "num_input_tokens_seen": 897600, "step": 1005 }, { "epoch": 0.2665962782103735, "grad_norm": 0.22051632404327393, "learning_rate": 4.992154063042007e-05, "loss": 0.1193, "num_input_tokens_seen": 902112, "step": 1010 }, { "epoch": 0.2679160617658704, "grad_norm": 0.2527467608451843, "learning_rate": 4.992076152048136e-05, "loss": 0.1315, "num_input_tokens_seen": 906560, "step": 1015 }, { "epoch": 0.2692358453213673, "grad_norm": 0.22719652950763702, "learning_rate": 4.991997856744807e-05, "loss": 0.0928, "num_input_tokens_seen": 910720, "step": 1020 }, { "epoch": 0.2705556288768642, "grad_norm": 0.3573048412799835, "learning_rate": 4.9919191771440905e-05, "loss": 0.0639, "num_input_tokens_seen": 915040, "step": 1025 }, { "epoch": 0.2718754124323611, "grad_norm": 0.16758829355239868, "learning_rate": 4.991840113258122e-05, "loss": 0.1287, "num_input_tokens_seen": 919712, "step": 1030 }, { "epoch": 0.27319519598785796, "grad_norm": 0.3242450952529907, "learning_rate": 4.9917606650990933e-05, "loss": 0.0859, "num_input_tokens_seen": 924224, "step": 1035 }, { "epoch": 0.2745149795433549, "grad_norm": 0.08949166536331177, "learning_rate": 4.9916808326792566e-05, "loss": 0.1408, "num_input_tokens_seen": 928416, "step": 1040 }, { "epoch": 0.2758347630988518, "grad_norm": 0.29730647802352905, "learning_rate": 4.9916006160109235e-05, "loss": 0.136, "num_input_tokens_seen": 932800, "step": 1045 }, { "epoch": 0.2771545466543487, "grad_norm": 0.5288802981376648, "learning_rate": 4.991520015106464e-05, "loss": 0.1427, "num_input_tokens_seen": 937440, "step": 1050 }, { "epoch": 0.2784743302098456, "grad_norm": 0.1059739887714386, "learning_rate": 4.991439029978308e-05, "loss": 0.1003, "num_input_tokens_seen": 941952, "step": 1055 }, { "epoch": 0.27979411376534247, "grad_norm": 1.870263934135437, "learning_rate": 4.9913576606389434e-05, "loss": 0.0856, "num_input_tokens_seen": 946496, "step": 1060 }, { "epoch": 0.28111389732083936, "grad_norm": 0.20395955443382263, "learning_rate": 4.991275907100919e-05, "loss": 0.1319, "num_input_tokens_seen": 950496, "step": 1065 }, { "epoch": 0.2824336808763363, "grad_norm": 0.20579855144023895, "learning_rate": 4.9911937693768434e-05, "loss": 0.0463, "num_input_tokens_seen": 955104, "step": 1070 }, { "epoch": 0.2837534644318332, "grad_norm": 0.34967049956321716, "learning_rate": 4.991111247479382e-05, "loss": 0.1426, "num_input_tokens_seen": 959648, "step": 1075 }, { "epoch": 0.2850732479873301, "grad_norm": 0.33796805143356323, "learning_rate": 4.9910283414212605e-05, "loss": 0.0656, "num_input_tokens_seen": 964256, "step": 1080 }, { "epoch": 0.28639303154282697, "grad_norm": 0.38363730907440186, "learning_rate": 4.990945051215265e-05, "loss": 0.1718, "num_input_tokens_seen": 968608, "step": 1085 }, { "epoch": 0.28771281509832386, "grad_norm": 0.11821810901165009, "learning_rate": 4.99086137687424e-05, "loss": 0.0615, "num_input_tokens_seen": 973376, "step": 1090 }, { "epoch": 0.28903259865382075, "grad_norm": 0.5425934195518494, "learning_rate": 4.9907773184110874e-05, "loss": 0.1136, "num_input_tokens_seen": 977696, "step": 1095 }, { "epoch": 0.2903523822093177, "grad_norm": 0.24836581945419312, "learning_rate": 4.9906928758387715e-05, "loss": 0.0924, "num_input_tokens_seen": 981856, "step": 1100 }, { "epoch": 0.2916721657648146, "grad_norm": 0.2982899844646454, "learning_rate": 4.9906080491703146e-05, "loss": 0.1274, "num_input_tokens_seen": 986368, "step": 1105 }, { "epoch": 0.2929919493203115, "grad_norm": 0.2044144719839096, "learning_rate": 4.990522838418797e-05, "loss": 0.1227, "num_input_tokens_seen": 990848, "step": 1110 }, { "epoch": 0.29431173287580836, "grad_norm": 0.34760767221450806, "learning_rate": 4.9904372435973604e-05, "loss": 0.1054, "num_input_tokens_seen": 995488, "step": 1115 }, { "epoch": 0.29563151643130525, "grad_norm": 0.6031702160835266, "learning_rate": 4.990351264719203e-05, "loss": 0.0651, "num_input_tokens_seen": 999872, "step": 1120 }, { "epoch": 0.29695129998680214, "grad_norm": 0.30428245663642883, "learning_rate": 4.990264901797586e-05, "loss": 0.1183, "num_input_tokens_seen": 1004256, "step": 1125 }, { "epoch": 0.2982710835422991, "grad_norm": 0.26574578881263733, "learning_rate": 4.990178154845826e-05, "loss": 0.0815, "num_input_tokens_seen": 1008864, "step": 1130 }, { "epoch": 0.299590867097796, "grad_norm": 0.15483444929122925, "learning_rate": 4.9900910238773014e-05, "loss": 0.0753, "num_input_tokens_seen": 1013472, "step": 1135 }, { "epoch": 0.30091065065329287, "grad_norm": 0.17342835664749146, "learning_rate": 4.990003508905448e-05, "loss": 0.0752, "num_input_tokens_seen": 1018016, "step": 1140 }, { "epoch": 0.30223043420878976, "grad_norm": 0.15781110525131226, "learning_rate": 4.989915609943763e-05, "loss": 0.0635, "num_input_tokens_seen": 1022272, "step": 1145 }, { "epoch": 0.30355021776428665, "grad_norm": 0.28851866722106934, "learning_rate": 4.9898273270058e-05, "loss": 0.0783, "num_input_tokens_seen": 1026816, "step": 1150 }, { "epoch": 0.30487000131978353, "grad_norm": 0.0742795318365097, "learning_rate": 4.989738660105174e-05, "loss": 0.0932, "num_input_tokens_seen": 1031616, "step": 1155 }, { "epoch": 0.3061897848752805, "grad_norm": 0.460222065448761, "learning_rate": 4.989649609255559e-05, "loss": 0.1239, "num_input_tokens_seen": 1036320, "step": 1160 }, { "epoch": 0.30750956843077737, "grad_norm": 0.6675683259963989, "learning_rate": 4.989560174470687e-05, "loss": 0.0998, "num_input_tokens_seen": 1041248, "step": 1165 }, { "epoch": 0.30882935198627426, "grad_norm": 0.09712900221347809, "learning_rate": 4.989470355764351e-05, "loss": 0.1106, "num_input_tokens_seen": 1045824, "step": 1170 }, { "epoch": 0.31014913554177115, "grad_norm": 0.21646857261657715, "learning_rate": 4.9893801531504e-05, "loss": 0.0934, "num_input_tokens_seen": 1050368, "step": 1175 }, { "epoch": 0.31146891909726804, "grad_norm": 0.22017158567905426, "learning_rate": 4.9892895666427475e-05, "loss": 0.0898, "num_input_tokens_seen": 1054880, "step": 1180 }, { "epoch": 0.3127887026527649, "grad_norm": 0.2171737104654312, "learning_rate": 4.9891985962553606e-05, "loss": 0.0475, "num_input_tokens_seen": 1059168, "step": 1185 }, { "epoch": 0.3141084862082619, "grad_norm": 0.35547348856925964, "learning_rate": 4.989107242002269e-05, "loss": 0.1087, "num_input_tokens_seen": 1063584, "step": 1190 }, { "epoch": 0.31542826976375876, "grad_norm": 0.3880825936794281, "learning_rate": 4.989015503897561e-05, "loss": 0.104, "num_input_tokens_seen": 1067936, "step": 1195 }, { "epoch": 0.31674805331925565, "grad_norm": 0.09029817581176758, "learning_rate": 4.988923381955383e-05, "loss": 0.1027, "num_input_tokens_seen": 1072832, "step": 1200 }, { "epoch": 0.31674805331925565, "eval_loss": 0.10287895798683167, "eval_runtime": 64.7186, "eval_samples_per_second": 104.066, "eval_steps_per_second": 26.02, "num_input_tokens_seen": 1072832, "step": 1200 }, { "epoch": 0.31806783687475254, "grad_norm": 0.47186365723609924, "learning_rate": 4.988830876189942e-05, "loss": 0.127, "num_input_tokens_seen": 1077440, "step": 1205 }, { "epoch": 0.31938762043024943, "grad_norm": 0.2922489047050476, "learning_rate": 4.988737986615503e-05, "loss": 0.0803, "num_input_tokens_seen": 1081696, "step": 1210 }, { "epoch": 0.3207074039857463, "grad_norm": 0.19668005406856537, "learning_rate": 4.988644713246391e-05, "loss": 0.079, "num_input_tokens_seen": 1086112, "step": 1215 }, { "epoch": 0.3220271875412432, "grad_norm": 0.3127315044403076, "learning_rate": 4.988551056096991e-05, "loss": 0.0707, "num_input_tokens_seen": 1090528, "step": 1220 }, { "epoch": 0.32334697109674015, "grad_norm": 0.529601514339447, "learning_rate": 4.988457015181743e-05, "loss": 0.127, "num_input_tokens_seen": 1094944, "step": 1225 }, { "epoch": 0.32466675465223704, "grad_norm": 0.31953519582748413, "learning_rate": 4.988362590515153e-05, "loss": 0.07, "num_input_tokens_seen": 1099296, "step": 1230 }, { "epoch": 0.32598653820773393, "grad_norm": 0.22914747893810272, "learning_rate": 4.9882677821117805e-05, "loss": 0.0883, "num_input_tokens_seen": 1103552, "step": 1235 }, { "epoch": 0.3273063217632308, "grad_norm": 0.19504457712173462, "learning_rate": 4.988172589986246e-05, "loss": 0.176, "num_input_tokens_seen": 1108096, "step": 1240 }, { "epoch": 0.3286261053187277, "grad_norm": 0.25285378098487854, "learning_rate": 4.9880770141532304e-05, "loss": 0.087, "num_input_tokens_seen": 1112576, "step": 1245 }, { "epoch": 0.3299458888742246, "grad_norm": 0.04385167732834816, "learning_rate": 4.987981054627472e-05, "loss": 0.0911, "num_input_tokens_seen": 1116832, "step": 1250 }, { "epoch": 0.33126567242972155, "grad_norm": 0.31458359956741333, "learning_rate": 4.987884711423769e-05, "loss": 0.1843, "num_input_tokens_seen": 1121280, "step": 1255 }, { "epoch": 0.33258545598521844, "grad_norm": 0.6382271647453308, "learning_rate": 4.9877879845569784e-05, "loss": 0.1226, "num_input_tokens_seen": 1125728, "step": 1260 }, { "epoch": 0.3339052395407153, "grad_norm": 0.3020384907722473, "learning_rate": 4.9876908740420175e-05, "loss": 0.0935, "num_input_tokens_seen": 1130368, "step": 1265 }, { "epoch": 0.3352250230962122, "grad_norm": 0.15630574524402618, "learning_rate": 4.987593379893861e-05, "loss": 0.0766, "num_input_tokens_seen": 1134944, "step": 1270 }, { "epoch": 0.3365448066517091, "grad_norm": 0.13506488502025604, "learning_rate": 4.987495502127545e-05, "loss": 0.0957, "num_input_tokens_seen": 1139168, "step": 1275 }, { "epoch": 0.337864590207206, "grad_norm": 0.617756724357605, "learning_rate": 4.987397240758162e-05, "loss": 0.1222, "num_input_tokens_seen": 1143392, "step": 1280 }, { "epoch": 0.33918437376270294, "grad_norm": 0.395846962928772, "learning_rate": 4.9872985958008664e-05, "loss": 0.0828, "num_input_tokens_seen": 1147648, "step": 1285 }, { "epoch": 0.34050415731819983, "grad_norm": 0.5758973956108093, "learning_rate": 4.987199567270871e-05, "loss": 0.1162, "num_input_tokens_seen": 1151744, "step": 1290 }, { "epoch": 0.3418239408736967, "grad_norm": 0.5372743606567383, "learning_rate": 4.9871001551834444e-05, "loss": 0.125, "num_input_tokens_seen": 1156256, "step": 1295 }, { "epoch": 0.3431437244291936, "grad_norm": 0.3537611663341522, "learning_rate": 4.98700035955392e-05, "loss": 0.0959, "num_input_tokens_seen": 1160768, "step": 1300 }, { "epoch": 0.3444635079846905, "grad_norm": 0.30623751878738403, "learning_rate": 4.986900180397686e-05, "loss": 0.1389, "num_input_tokens_seen": 1165248, "step": 1305 }, { "epoch": 0.3457832915401874, "grad_norm": 0.3520481288433075, "learning_rate": 4.9867996177301926e-05, "loss": 0.0638, "num_input_tokens_seen": 1169504, "step": 1310 }, { "epoch": 0.34710307509568433, "grad_norm": 0.18603025376796722, "learning_rate": 4.9866986715669464e-05, "loss": 0.0638, "num_input_tokens_seen": 1173856, "step": 1315 }, { "epoch": 0.3484228586511812, "grad_norm": 0.11228612810373306, "learning_rate": 4.9865973419235155e-05, "loss": 0.0818, "num_input_tokens_seen": 1178432, "step": 1320 }, { "epoch": 0.3497426422066781, "grad_norm": 0.319702684879303, "learning_rate": 4.986495628815526e-05, "loss": 0.0846, "num_input_tokens_seen": 1182944, "step": 1325 }, { "epoch": 0.351062425762175, "grad_norm": 0.23791475594043732, "learning_rate": 4.986393532258663e-05, "loss": 0.1379, "num_input_tokens_seen": 1187456, "step": 1330 }, { "epoch": 0.3523822093176719, "grad_norm": 0.19384010136127472, "learning_rate": 4.986291052268671e-05, "loss": 0.1021, "num_input_tokens_seen": 1191968, "step": 1335 }, { "epoch": 0.3537019928731688, "grad_norm": 0.20502661168575287, "learning_rate": 4.986188188861355e-05, "loss": 0.0985, "num_input_tokens_seen": 1196416, "step": 1340 }, { "epoch": 0.3550217764286657, "grad_norm": 0.1767132580280304, "learning_rate": 4.9860849420525766e-05, "loss": 0.0543, "num_input_tokens_seen": 1201120, "step": 1345 }, { "epoch": 0.3563415599841626, "grad_norm": 0.17463809251785278, "learning_rate": 4.9859813118582575e-05, "loss": 0.0865, "num_input_tokens_seen": 1205952, "step": 1350 }, { "epoch": 0.3576613435396595, "grad_norm": 0.13529054820537567, "learning_rate": 4.98587729829438e-05, "loss": 0.1058, "num_input_tokens_seen": 1210368, "step": 1355 }, { "epoch": 0.3589811270951564, "grad_norm": 0.37380582094192505, "learning_rate": 4.985772901376983e-05, "loss": 0.1259, "num_input_tokens_seen": 1214848, "step": 1360 }, { "epoch": 0.3603009106506533, "grad_norm": 0.45343899726867676, "learning_rate": 4.9856681211221666e-05, "loss": 0.0779, "num_input_tokens_seen": 1219328, "step": 1365 }, { "epoch": 0.3616206942061502, "grad_norm": 0.5038014054298401, "learning_rate": 4.985562957546089e-05, "loss": 0.0932, "num_input_tokens_seen": 1224128, "step": 1370 }, { "epoch": 0.36294047776164706, "grad_norm": 0.09238982200622559, "learning_rate": 4.9854574106649686e-05, "loss": 0.0792, "num_input_tokens_seen": 1228512, "step": 1375 }, { "epoch": 0.364260261317144, "grad_norm": 0.17974753677845, "learning_rate": 4.985351480495081e-05, "loss": 0.1263, "num_input_tokens_seen": 1232864, "step": 1380 }, { "epoch": 0.3655800448726409, "grad_norm": 0.12978380918502808, "learning_rate": 4.985245167052762e-05, "loss": 0.0709, "num_input_tokens_seen": 1237248, "step": 1385 }, { "epoch": 0.3668998284281378, "grad_norm": 0.29572758078575134, "learning_rate": 4.9851384703544066e-05, "loss": 0.0769, "num_input_tokens_seen": 1241952, "step": 1390 }, { "epoch": 0.3682196119836347, "grad_norm": 0.227921724319458, "learning_rate": 4.985031390416469e-05, "loss": 0.0706, "num_input_tokens_seen": 1246048, "step": 1395 }, { "epoch": 0.36953939553913157, "grad_norm": 0.09136226773262024, "learning_rate": 4.984923927255461e-05, "loss": 0.0438, "num_input_tokens_seen": 1250688, "step": 1400 }, { "epoch": 0.36953939553913157, "eval_loss": 0.10004811733961105, "eval_runtime": 64.7302, "eval_samples_per_second": 104.047, "eval_steps_per_second": 26.016, "num_input_tokens_seen": 1250688, "step": 1400 }, { "epoch": 0.37085917909462845, "grad_norm": 0.12683330476284027, "learning_rate": 4.984816080887958e-05, "loss": 0.1363, "num_input_tokens_seen": 1255200, "step": 1405 }, { "epoch": 0.3721789626501254, "grad_norm": 0.45369547605514526, "learning_rate": 4.9847078513305875e-05, "loss": 0.1014, "num_input_tokens_seen": 1259872, "step": 1410 }, { "epoch": 0.3734987462056223, "grad_norm": 0.33496251702308655, "learning_rate": 4.984599238600043e-05, "loss": 0.1219, "num_input_tokens_seen": 1264352, "step": 1415 }, { "epoch": 0.3748185297611192, "grad_norm": 0.16501431167125702, "learning_rate": 4.9844902427130716e-05, "loss": 0.0759, "num_input_tokens_seen": 1268736, "step": 1420 }, { "epoch": 0.37613831331661607, "grad_norm": 0.21569593250751495, "learning_rate": 4.984380863686482e-05, "loss": 0.0653, "num_input_tokens_seen": 1273376, "step": 1425 }, { "epoch": 0.37745809687211296, "grad_norm": 0.45600253343582153, "learning_rate": 4.984271101537143e-05, "loss": 0.1084, "num_input_tokens_seen": 1277664, "step": 1430 }, { "epoch": 0.37877788042760985, "grad_norm": 0.16238021850585938, "learning_rate": 4.9841609562819816e-05, "loss": 0.0805, "num_input_tokens_seen": 1281856, "step": 1435 }, { "epoch": 0.3800976639831068, "grad_norm": 0.2542300224304199, "learning_rate": 4.984050427937983e-05, "loss": 0.0666, "num_input_tokens_seen": 1286112, "step": 1440 }, { "epoch": 0.3814174475386037, "grad_norm": 0.2706325948238373, "learning_rate": 4.983939516522191e-05, "loss": 0.1064, "num_input_tokens_seen": 1290208, "step": 1445 }, { "epoch": 0.38273723109410057, "grad_norm": 0.28565552830696106, "learning_rate": 4.983828222051711e-05, "loss": 0.1612, "num_input_tokens_seen": 1294656, "step": 1450 }, { "epoch": 0.38405701464959746, "grad_norm": 0.1407063603401184, "learning_rate": 4.983716544543705e-05, "loss": 0.0537, "num_input_tokens_seen": 1299296, "step": 1455 }, { "epoch": 0.38537679820509435, "grad_norm": 0.40513288974761963, "learning_rate": 4.983604484015395e-05, "loss": 0.0901, "num_input_tokens_seen": 1303776, "step": 1460 }, { "epoch": 0.38669658176059124, "grad_norm": 0.10866474360227585, "learning_rate": 4.983492040484064e-05, "loss": 0.0832, "num_input_tokens_seen": 1308160, "step": 1465 }, { "epoch": 0.3880163653160882, "grad_norm": 0.6641761064529419, "learning_rate": 4.98337921396705e-05, "loss": 0.1073, "num_input_tokens_seen": 1312736, "step": 1470 }, { "epoch": 0.3893361488715851, "grad_norm": 0.3927798569202423, "learning_rate": 4.983266004481753e-05, "loss": 0.1041, "num_input_tokens_seen": 1317248, "step": 1475 }, { "epoch": 0.39065593242708196, "grad_norm": 0.1425616443157196, "learning_rate": 4.9831524120456316e-05, "loss": 0.1212, "num_input_tokens_seen": 1321696, "step": 1480 }, { "epoch": 0.39197571598257885, "grad_norm": 0.22967137396335602, "learning_rate": 4.9830384366762026e-05, "loss": 0.0832, "num_input_tokens_seen": 1326080, "step": 1485 }, { "epoch": 0.39329549953807574, "grad_norm": 0.20993387699127197, "learning_rate": 4.9829240783910436e-05, "loss": 0.0565, "num_input_tokens_seen": 1330688, "step": 1490 }, { "epoch": 0.39461528309357263, "grad_norm": 0.36247509717941284, "learning_rate": 4.982809337207789e-05, "loss": 0.0726, "num_input_tokens_seen": 1335200, "step": 1495 }, { "epoch": 0.3959350666490696, "grad_norm": 0.30440714955329895, "learning_rate": 4.9826942131441337e-05, "loss": 0.1143, "num_input_tokens_seen": 1339520, "step": 1500 }, { "epoch": 0.39725485020456647, "grad_norm": 0.1928398609161377, "learning_rate": 4.9825787062178315e-05, "loss": 0.0962, "num_input_tokens_seen": 1343872, "step": 1505 }, { "epoch": 0.39857463376006336, "grad_norm": 0.17904697358608246, "learning_rate": 4.9824628164466945e-05, "loss": 0.079, "num_input_tokens_seen": 1348288, "step": 1510 }, { "epoch": 0.39989441731556025, "grad_norm": 0.2993811070919037, "learning_rate": 4.982346543848595e-05, "loss": 0.1054, "num_input_tokens_seen": 1352928, "step": 1515 }, { "epoch": 0.40121420087105714, "grad_norm": 0.4796639680862427, "learning_rate": 4.9822298884414626e-05, "loss": 0.1079, "num_input_tokens_seen": 1357312, "step": 1520 }, { "epoch": 0.402533984426554, "grad_norm": 0.19794157147407532, "learning_rate": 4.982112850243288e-05, "loss": 0.0935, "num_input_tokens_seen": 1361664, "step": 1525 }, { "epoch": 0.40385376798205097, "grad_norm": 0.0979548767209053, "learning_rate": 4.98199542927212e-05, "loss": 0.0595, "num_input_tokens_seen": 1366048, "step": 1530 }, { "epoch": 0.40517355153754786, "grad_norm": 0.8280842900276184, "learning_rate": 4.981877625546066e-05, "loss": 0.1531, "num_input_tokens_seen": 1370720, "step": 1535 }, { "epoch": 0.40649333509304475, "grad_norm": 0.051977045834064484, "learning_rate": 4.981759439083293e-05, "loss": 0.1009, "num_input_tokens_seen": 1375232, "step": 1540 }, { "epoch": 0.40781311864854164, "grad_norm": 0.25704362988471985, "learning_rate": 4.981640869902027e-05, "loss": 0.0584, "num_input_tokens_seen": 1379904, "step": 1545 }, { "epoch": 0.40913290220403853, "grad_norm": 0.3540825843811035, "learning_rate": 4.9815219180205517e-05, "loss": 0.1048, "num_input_tokens_seen": 1384544, "step": 1550 }, { "epoch": 0.4104526857595354, "grad_norm": 0.1316901594400406, "learning_rate": 4.9814025834572126e-05, "loss": 0.0881, "num_input_tokens_seen": 1388960, "step": 1555 }, { "epoch": 0.4117724693150323, "grad_norm": 0.1883913278579712, "learning_rate": 4.981282866230411e-05, "loss": 0.057, "num_input_tokens_seen": 1393408, "step": 1560 }, { "epoch": 0.41309225287052925, "grad_norm": 0.5797374248504639, "learning_rate": 4.981162766358611e-05, "loss": 0.131, "num_input_tokens_seen": 1398016, "step": 1565 }, { "epoch": 0.41441203642602614, "grad_norm": 0.45674562454223633, "learning_rate": 4.9810422838603316e-05, "loss": 0.0839, "num_input_tokens_seen": 1403008, "step": 1570 }, { "epoch": 0.41573181998152303, "grad_norm": 0.22351659834384918, "learning_rate": 4.9809214187541533e-05, "loss": 0.0663, "num_input_tokens_seen": 1407328, "step": 1575 }, { "epoch": 0.4170516035370199, "grad_norm": 0.30450570583343506, "learning_rate": 4.980800171058715e-05, "loss": 0.1239, "num_input_tokens_seen": 1411776, "step": 1580 }, { "epoch": 0.4183713870925168, "grad_norm": 0.49518051743507385, "learning_rate": 4.980678540792715e-05, "loss": 0.1054, "num_input_tokens_seen": 1415968, "step": 1585 }, { "epoch": 0.4196911706480137, "grad_norm": 0.3628959655761719, "learning_rate": 4.980556527974909e-05, "loss": 0.0749, "num_input_tokens_seen": 1420288, "step": 1590 }, { "epoch": 0.42101095420351065, "grad_norm": 0.20368343591690063, "learning_rate": 4.980434132624114e-05, "loss": 0.0727, "num_input_tokens_seen": 1425216, "step": 1595 }, { "epoch": 0.42233073775900754, "grad_norm": 0.42046883702278137, "learning_rate": 4.980311354759205e-05, "loss": 0.1518, "num_input_tokens_seen": 1429824, "step": 1600 }, { "epoch": 0.42233073775900754, "eval_loss": 0.09779907763004303, "eval_runtime": 64.8213, "eval_samples_per_second": 103.901, "eval_steps_per_second": 25.979, "num_input_tokens_seen": 1429824, "step": 1600 }, { "epoch": 0.4236505213145044, "grad_norm": 0.19637933373451233, "learning_rate": 4.980188194399116e-05, "loss": 0.0879, "num_input_tokens_seen": 1434144, "step": 1605 }, { "epoch": 0.4249703048700013, "grad_norm": 0.05599268525838852, "learning_rate": 4.9800646515628384e-05, "loss": 0.09, "num_input_tokens_seen": 1438240, "step": 1610 }, { "epoch": 0.4262900884254982, "grad_norm": 0.7357270121574402, "learning_rate": 4.979940726269426e-05, "loss": 0.133, "num_input_tokens_seen": 1442784, "step": 1615 }, { "epoch": 0.4276098719809951, "grad_norm": 0.1672726571559906, "learning_rate": 4.979816418537988e-05, "loss": 0.0401, "num_input_tokens_seen": 1447200, "step": 1620 }, { "epoch": 0.42892965553649204, "grad_norm": 0.10277122259140015, "learning_rate": 4.979691728387696e-05, "loss": 0.0731, "num_input_tokens_seen": 1451552, "step": 1625 }, { "epoch": 0.43024943909198893, "grad_norm": 0.38115474581718445, "learning_rate": 4.979566655837776e-05, "loss": 0.0873, "num_input_tokens_seen": 1455904, "step": 1630 }, { "epoch": 0.4315692226474858, "grad_norm": 0.1787000149488449, "learning_rate": 4.9794412009075184e-05, "loss": 0.0862, "num_input_tokens_seen": 1460288, "step": 1635 }, { "epoch": 0.4328890062029827, "grad_norm": 0.04687885567545891, "learning_rate": 4.979315363616269e-05, "loss": 0.101, "num_input_tokens_seen": 1464544, "step": 1640 }, { "epoch": 0.4342087897584796, "grad_norm": 0.3999486565589905, "learning_rate": 4.979189143983434e-05, "loss": 0.0551, "num_input_tokens_seen": 1468896, "step": 1645 }, { "epoch": 0.4355285733139765, "grad_norm": 0.32199081778526306, "learning_rate": 4.979062542028478e-05, "loss": 0.0381, "num_input_tokens_seen": 1473568, "step": 1650 }, { "epoch": 0.43684835686947343, "grad_norm": 0.7778474688529968, "learning_rate": 4.978935557770923e-05, "loss": 0.1042, "num_input_tokens_seen": 1478016, "step": 1655 }, { "epoch": 0.4381681404249703, "grad_norm": 0.12072020769119263, "learning_rate": 4.978808191230353e-05, "loss": 0.0517, "num_input_tokens_seen": 1482688, "step": 1660 }, { "epoch": 0.4394879239804672, "grad_norm": 0.34124454855918884, "learning_rate": 4.9786804424264085e-05, "loss": 0.0798, "num_input_tokens_seen": 1487104, "step": 1665 }, { "epoch": 0.4408077075359641, "grad_norm": 0.4942472577095032, "learning_rate": 4.978552311378792e-05, "loss": 0.0856, "num_input_tokens_seen": 1491712, "step": 1670 }, { "epoch": 0.442127491091461, "grad_norm": 0.2944345772266388, "learning_rate": 4.978423798107261e-05, "loss": 0.0666, "num_input_tokens_seen": 1495968, "step": 1675 }, { "epoch": 0.4434472746469579, "grad_norm": 0.5714668035507202, "learning_rate": 4.978294902631635e-05, "loss": 0.0922, "num_input_tokens_seen": 1500544, "step": 1680 }, { "epoch": 0.4447670582024548, "grad_norm": 0.13224996626377106, "learning_rate": 4.9781656249717914e-05, "loss": 0.0507, "num_input_tokens_seen": 1504864, "step": 1685 }, { "epoch": 0.4460868417579517, "grad_norm": 0.3308027684688568, "learning_rate": 4.9780359651476645e-05, "loss": 0.072, "num_input_tokens_seen": 1509184, "step": 1690 }, { "epoch": 0.4474066253134486, "grad_norm": 0.5095565319061279, "learning_rate": 4.977905923179251e-05, "loss": 0.157, "num_input_tokens_seen": 1513536, "step": 1695 }, { "epoch": 0.4487264088689455, "grad_norm": 0.18884052336215973, "learning_rate": 4.977775499086606e-05, "loss": 0.0998, "num_input_tokens_seen": 1517920, "step": 1700 }, { "epoch": 0.4500461924244424, "grad_norm": 0.0852644145488739, "learning_rate": 4.97764469288984e-05, "loss": 0.0829, "num_input_tokens_seen": 1522688, "step": 1705 }, { "epoch": 0.45136597597993927, "grad_norm": 0.6086108684539795, "learning_rate": 4.977513504609127e-05, "loss": 0.0793, "num_input_tokens_seen": 1527360, "step": 1710 }, { "epoch": 0.45268575953543616, "grad_norm": 0.30944541096687317, "learning_rate": 4.9773819342646965e-05, "loss": 0.096, "num_input_tokens_seen": 1531776, "step": 1715 }, { "epoch": 0.4540055430909331, "grad_norm": 0.5952541828155518, "learning_rate": 4.97724998187684e-05, "loss": 0.185, "num_input_tokens_seen": 1536384, "step": 1720 }, { "epoch": 0.45532532664643, "grad_norm": 0.259219229221344, "learning_rate": 4.9771176474659045e-05, "loss": 0.1026, "num_input_tokens_seen": 1541184, "step": 1725 }, { "epoch": 0.4566451102019269, "grad_norm": 0.25308936834335327, "learning_rate": 4.976984931052299e-05, "loss": 0.0588, "num_input_tokens_seen": 1545568, "step": 1730 }, { "epoch": 0.4579648937574238, "grad_norm": 0.2521771788597107, "learning_rate": 4.976851832656489e-05, "loss": 0.0782, "num_input_tokens_seen": 1550048, "step": 1735 }, { "epoch": 0.45928467731292066, "grad_norm": 0.11577163636684418, "learning_rate": 4.9767183522990004e-05, "loss": 0.0561, "num_input_tokens_seen": 1554656, "step": 1740 }, { "epoch": 0.46060446086841755, "grad_norm": 0.647648811340332, "learning_rate": 4.9765844900004176e-05, "loss": 0.1031, "num_input_tokens_seen": 1559232, "step": 1745 }, { "epoch": 0.4619242444239145, "grad_norm": 0.08472394943237305, "learning_rate": 4.9764502457813834e-05, "loss": 0.1291, "num_input_tokens_seen": 1564192, "step": 1750 }, { "epoch": 0.4632440279794114, "grad_norm": 0.07346078753471375, "learning_rate": 4.9763156196626005e-05, "loss": 0.0525, "num_input_tokens_seen": 1568672, "step": 1755 }, { "epoch": 0.4645638115349083, "grad_norm": 0.22509826719760895, "learning_rate": 4.97618061166483e-05, "loss": 0.0702, "num_input_tokens_seen": 1573184, "step": 1760 }, { "epoch": 0.46588359509040517, "grad_norm": 0.28262823820114136, "learning_rate": 4.9760452218088915e-05, "loss": 0.1343, "num_input_tokens_seen": 1577568, "step": 1765 }, { "epoch": 0.46720337864590206, "grad_norm": 0.3671574294567108, "learning_rate": 4.975909450115663e-05, "loss": 0.1119, "num_input_tokens_seen": 1582144, "step": 1770 }, { "epoch": 0.46852316220139895, "grad_norm": 0.07467343658208847, "learning_rate": 4.975773296606084e-05, "loss": 0.0904, "num_input_tokens_seen": 1586656, "step": 1775 }, { "epoch": 0.4698429457568959, "grad_norm": 0.33608803153038025, "learning_rate": 4.97563676130115e-05, "loss": 0.079, "num_input_tokens_seen": 1590976, "step": 1780 }, { "epoch": 0.4711627293123928, "grad_norm": 0.1629789173603058, "learning_rate": 4.9754998442219166e-05, "loss": 0.0366, "num_input_tokens_seen": 1595360, "step": 1785 }, { "epoch": 0.47248251286788967, "grad_norm": 0.5906822681427002, "learning_rate": 4.9753625453894984e-05, "loss": 0.1074, "num_input_tokens_seen": 1599744, "step": 1790 }, { "epoch": 0.47380229642338656, "grad_norm": 0.07769432663917542, "learning_rate": 4.975224864825068e-05, "loss": 0.0439, "num_input_tokens_seen": 1604096, "step": 1795 }, { "epoch": 0.47512207997888345, "grad_norm": 0.136910080909729, "learning_rate": 4.9750868025498576e-05, "loss": 0.0699, "num_input_tokens_seen": 1608736, "step": 1800 }, { "epoch": 0.47512207997888345, "eval_loss": 0.09629397094249725, "eval_runtime": 64.7424, "eval_samples_per_second": 104.028, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 1608736, "step": 1800 }, { "epoch": 0.47644186353438034, "grad_norm": 0.3345334529876709, "learning_rate": 4.974948358585158e-05, "loss": 0.1076, "num_input_tokens_seen": 1613440, "step": 1805 }, { "epoch": 0.4777616470898773, "grad_norm": 0.3737996816635132, "learning_rate": 4.9748095329523205e-05, "loss": 0.0784, "num_input_tokens_seen": 1618144, "step": 1810 }, { "epoch": 0.4790814306453742, "grad_norm": 0.14992079138755798, "learning_rate": 4.974670325672752e-05, "loss": 0.0843, "num_input_tokens_seen": 1622560, "step": 1815 }, { "epoch": 0.48040121420087106, "grad_norm": 0.22753696143627167, "learning_rate": 4.974530736767921e-05, "loss": 0.0533, "num_input_tokens_seen": 1626976, "step": 1820 }, { "epoch": 0.48172099775636795, "grad_norm": 0.48472854495048523, "learning_rate": 4.9743907662593524e-05, "loss": 0.0916, "num_input_tokens_seen": 1631616, "step": 1825 }, { "epoch": 0.48304078131186484, "grad_norm": 0.19427478313446045, "learning_rate": 4.974250414168633e-05, "loss": 0.0613, "num_input_tokens_seen": 1636224, "step": 1830 }, { "epoch": 0.48436056486736173, "grad_norm": 0.18898725509643555, "learning_rate": 4.974109680517407e-05, "loss": 0.1095, "num_input_tokens_seen": 1640896, "step": 1835 }, { "epoch": 0.4856803484228587, "grad_norm": 0.4109775424003601, "learning_rate": 4.973968565327376e-05, "loss": 0.1206, "num_input_tokens_seen": 1645504, "step": 1840 }, { "epoch": 0.48700013197835557, "grad_norm": 0.25475284457206726, "learning_rate": 4.973827068620303e-05, "loss": 0.1392, "num_input_tokens_seen": 1649952, "step": 1845 }, { "epoch": 0.48831991553385246, "grad_norm": 0.3834966719150543, "learning_rate": 4.973685190418008e-05, "loss": 0.0923, "num_input_tokens_seen": 1654432, "step": 1850 }, { "epoch": 0.48963969908934935, "grad_norm": 0.32690882682800293, "learning_rate": 4.97354293074237e-05, "loss": 0.0483, "num_input_tokens_seen": 1658720, "step": 1855 }, { "epoch": 0.49095948264484623, "grad_norm": 0.20059968531131744, "learning_rate": 4.9734002896153276e-05, "loss": 0.1034, "num_input_tokens_seen": 1663040, "step": 1860 }, { "epoch": 0.4922792662003431, "grad_norm": 0.22899504005908966, "learning_rate": 4.973257267058877e-05, "loss": 0.0954, "num_input_tokens_seen": 1667776, "step": 1865 }, { "epoch": 0.49359904975584007, "grad_norm": 0.24791018664836884, "learning_rate": 4.973113863095076e-05, "loss": 0.093, "num_input_tokens_seen": 1672384, "step": 1870 }, { "epoch": 0.49491883331133696, "grad_norm": 0.15553689002990723, "learning_rate": 4.9729700777460384e-05, "loss": 0.0956, "num_input_tokens_seen": 1676768, "step": 1875 }, { "epoch": 0.49623861686683385, "grad_norm": 0.3527602553367615, "learning_rate": 4.972825911033937e-05, "loss": 0.1073, "num_input_tokens_seen": 1681344, "step": 1880 }, { "epoch": 0.49755840042233074, "grad_norm": 0.6908682584762573, "learning_rate": 4.9726813629810056e-05, "loss": 0.0681, "num_input_tokens_seen": 1685824, "step": 1885 }, { "epoch": 0.4988781839778276, "grad_norm": 0.21874700486660004, "learning_rate": 4.9725364336095326e-05, "loss": 0.0817, "num_input_tokens_seen": 1690208, "step": 1890 }, { "epoch": 0.5001979675333246, "grad_norm": 0.4724104106426239, "learning_rate": 4.972391122941871e-05, "loss": 0.0702, "num_input_tokens_seen": 1694784, "step": 1895 }, { "epoch": 0.5015177510888215, "grad_norm": 0.49158525466918945, "learning_rate": 4.972245431000428e-05, "loss": 0.084, "num_input_tokens_seen": 1699232, "step": 1900 }, { "epoch": 0.5028375346443184, "grad_norm": 0.3119906187057495, "learning_rate": 4.972099357807671e-05, "loss": 0.1149, "num_input_tokens_seen": 1703712, "step": 1905 }, { "epoch": 0.5041573181998152, "grad_norm": 0.11862573027610779, "learning_rate": 4.971952903386127e-05, "loss": 0.1251, "num_input_tokens_seen": 1707968, "step": 1910 }, { "epoch": 0.5054771017553121, "grad_norm": 0.3632994592189789, "learning_rate": 4.971806067758381e-05, "loss": 0.0912, "num_input_tokens_seen": 1712224, "step": 1915 }, { "epoch": 0.506796885310809, "grad_norm": 0.4220863878726959, "learning_rate": 4.971658850947076e-05, "loss": 0.0933, "num_input_tokens_seen": 1716416, "step": 1920 }, { "epoch": 0.5081166688663059, "grad_norm": 0.13918663561344147, "learning_rate": 4.9715112529749165e-05, "loss": 0.0446, "num_input_tokens_seen": 1720992, "step": 1925 }, { "epoch": 0.5094364524218028, "grad_norm": 0.20145854353904724, "learning_rate": 4.9713632738646624e-05, "loss": 0.0952, "num_input_tokens_seen": 1725184, "step": 1930 }, { "epoch": 0.5107562359772997, "grad_norm": 0.2000434249639511, "learning_rate": 4.971214913639134e-05, "loss": 0.0966, "num_input_tokens_seen": 1729664, "step": 1935 }, { "epoch": 0.5120760195327966, "grad_norm": 0.13887614011764526, "learning_rate": 4.9710661723212104e-05, "loss": 0.0899, "num_input_tokens_seen": 1734208, "step": 1940 }, { "epoch": 0.5133958030882935, "grad_norm": 0.06072315573692322, "learning_rate": 4.9709170499338295e-05, "loss": 0.0528, "num_input_tokens_seen": 1738688, "step": 1945 }, { "epoch": 0.5147155866437905, "grad_norm": 0.4405849575996399, "learning_rate": 4.9707675464999895e-05, "loss": 0.0872, "num_input_tokens_seen": 1743296, "step": 1950 }, { "epoch": 0.5160353701992874, "grad_norm": 0.22208745777606964, "learning_rate": 4.970617662042743e-05, "loss": 0.0946, "num_input_tokens_seen": 1747776, "step": 1955 }, { "epoch": 0.5173551537547842, "grad_norm": 0.21280348300933838, "learning_rate": 4.970467396585206e-05, "loss": 0.0721, "num_input_tokens_seen": 1752096, "step": 1960 }, { "epoch": 0.5186749373102811, "grad_norm": 0.24638284742832184, "learning_rate": 4.97031675015055e-05, "loss": 0.0419, "num_input_tokens_seen": 1756416, "step": 1965 }, { "epoch": 0.519994720865778, "grad_norm": 0.10897671431303024, "learning_rate": 4.9701657227620075e-05, "loss": 0.0488, "num_input_tokens_seen": 1760864, "step": 1970 }, { "epoch": 0.5213145044212749, "grad_norm": 0.05196571350097656, "learning_rate": 4.9700143144428685e-05, "loss": 0.0409, "num_input_tokens_seen": 1765312, "step": 1975 }, { "epoch": 0.5226342879767718, "grad_norm": 0.3574368953704834, "learning_rate": 4.969862525216482e-05, "loss": 0.0749, "num_input_tokens_seen": 1769824, "step": 1980 }, { "epoch": 0.5239540715322687, "grad_norm": 0.22640113532543182, "learning_rate": 4.9697103551062556e-05, "loss": 0.0488, "num_input_tokens_seen": 1774208, "step": 1985 }, { "epoch": 0.5252738550877656, "grad_norm": 0.16414359211921692, "learning_rate": 4.9695578041356565e-05, "loss": 0.1292, "num_input_tokens_seen": 1778528, "step": 1990 }, { "epoch": 0.5265936386432625, "grad_norm": 0.6334508657455444, "learning_rate": 4.969404872328209e-05, "loss": 0.1417, "num_input_tokens_seen": 1783008, "step": 1995 }, { "epoch": 0.5279134221987594, "grad_norm": 0.840112030506134, "learning_rate": 4.969251559707498e-05, "loss": 0.1294, "num_input_tokens_seen": 1787552, "step": 2000 }, { "epoch": 0.5279134221987594, "eval_loss": 0.09432559460401535, "eval_runtime": 64.7151, "eval_samples_per_second": 104.072, "eval_steps_per_second": 26.022, "num_input_tokens_seen": 1787552, "step": 2000 }, { "epoch": 0.5292332057542563, "grad_norm": 0.39028042554855347, "learning_rate": 4.9690978662971674e-05, "loss": 0.0987, "num_input_tokens_seen": 1791936, "step": 2005 }, { "epoch": 0.5305529893097533, "grad_norm": 0.4104963541030884, "learning_rate": 4.968943792120916e-05, "loss": 0.0532, "num_input_tokens_seen": 1796480, "step": 2010 }, { "epoch": 0.5318727728652501, "grad_norm": 0.1568603515625, "learning_rate": 4.9687893372025046e-05, "loss": 0.0864, "num_input_tokens_seen": 1800416, "step": 2015 }, { "epoch": 0.533192556420747, "grad_norm": 0.5351195335388184, "learning_rate": 4.9686345015657535e-05, "loss": 0.0945, "num_input_tokens_seen": 1804896, "step": 2020 }, { "epoch": 0.5345123399762439, "grad_norm": 0.24083472788333893, "learning_rate": 4.968479285234538e-05, "loss": 0.0738, "num_input_tokens_seen": 1809568, "step": 2025 }, { "epoch": 0.5358321235317408, "grad_norm": 0.09859143197536469, "learning_rate": 4.9683236882327974e-05, "loss": 0.0811, "num_input_tokens_seen": 1814624, "step": 2030 }, { "epoch": 0.5371519070872377, "grad_norm": 0.3288732171058655, "learning_rate": 4.968167710584526e-05, "loss": 0.0854, "num_input_tokens_seen": 1819136, "step": 2035 }, { "epoch": 0.5384716906427346, "grad_norm": 0.16594676673412323, "learning_rate": 4.968011352313775e-05, "loss": 0.1078, "num_input_tokens_seen": 1823776, "step": 2040 }, { "epoch": 0.5397914741982315, "grad_norm": 0.090375155210495, "learning_rate": 4.967854613444659e-05, "loss": 0.0515, "num_input_tokens_seen": 1828576, "step": 2045 }, { "epoch": 0.5411112577537284, "grad_norm": 0.17054040729999542, "learning_rate": 4.967697494001349e-05, "loss": 0.0676, "num_input_tokens_seen": 1833152, "step": 2050 }, { "epoch": 0.5424310413092253, "grad_norm": 0.2564355432987213, "learning_rate": 4.9675399940080736e-05, "loss": 0.1324, "num_input_tokens_seen": 1837376, "step": 2055 }, { "epoch": 0.5437508248647221, "grad_norm": 0.09312942624092102, "learning_rate": 4.9673821134891226e-05, "loss": 0.1044, "num_input_tokens_seen": 1841824, "step": 2060 }, { "epoch": 0.545070608420219, "grad_norm": 0.27085360884666443, "learning_rate": 4.967223852468842e-05, "loss": 0.0729, "num_input_tokens_seen": 1846240, "step": 2065 }, { "epoch": 0.5463903919757159, "grad_norm": 0.1996983140707016, "learning_rate": 4.967065210971639e-05, "loss": 0.1298, "num_input_tokens_seen": 1850816, "step": 2070 }, { "epoch": 0.5477101755312129, "grad_norm": 0.10482937097549438, "learning_rate": 4.966906189021977e-05, "loss": 0.0835, "num_input_tokens_seen": 1855328, "step": 2075 }, { "epoch": 0.5490299590867098, "grad_norm": 0.3350965678691864, "learning_rate": 4.966746786644379e-05, "loss": 0.0997, "num_input_tokens_seen": 1859424, "step": 2080 }, { "epoch": 0.5503497426422067, "grad_norm": 0.1711457520723343, "learning_rate": 4.966587003863429e-05, "loss": 0.0979, "num_input_tokens_seen": 1864000, "step": 2085 }, { "epoch": 0.5516695261977036, "grad_norm": 0.42763251066207886, "learning_rate": 4.966426840703765e-05, "loss": 0.1011, "num_input_tokens_seen": 1868416, "step": 2090 }, { "epoch": 0.5529893097532005, "grad_norm": 0.22510138154029846, "learning_rate": 4.9662662971900875e-05, "loss": 0.0646, "num_input_tokens_seen": 1872864, "step": 2095 }, { "epoch": 0.5543090933086974, "grad_norm": 0.08934833109378815, "learning_rate": 4.9661053733471534e-05, "loss": 0.0668, "num_input_tokens_seen": 1877312, "step": 2100 }, { "epoch": 0.5556288768641943, "grad_norm": 0.11473236232995987, "learning_rate": 4.965944069199781e-05, "loss": 0.0643, "num_input_tokens_seen": 1881824, "step": 2105 }, { "epoch": 0.5569486604196912, "grad_norm": 0.2178570181131363, "learning_rate": 4.965782384772842e-05, "loss": 0.0881, "num_input_tokens_seen": 1886400, "step": 2110 }, { "epoch": 0.558268443975188, "grad_norm": 0.4115614593029022, "learning_rate": 4.9656203200912734e-05, "loss": 0.0571, "num_input_tokens_seen": 1891072, "step": 2115 }, { "epoch": 0.5595882275306849, "grad_norm": 0.08171983063220978, "learning_rate": 4.965457875180067e-05, "loss": 0.031, "num_input_tokens_seen": 1895328, "step": 2120 }, { "epoch": 0.5609080110861818, "grad_norm": 0.08847381174564362, "learning_rate": 4.9652950500642724e-05, "loss": 0.1079, "num_input_tokens_seen": 1899904, "step": 2125 }, { "epoch": 0.5622277946416787, "grad_norm": 0.12006539106369019, "learning_rate": 4.965131844769001e-05, "loss": 0.1674, "num_input_tokens_seen": 1904288, "step": 2130 }, { "epoch": 0.5635475781971757, "grad_norm": 0.24875101447105408, "learning_rate": 4.96496825931942e-05, "loss": 0.0609, "num_input_tokens_seen": 1908960, "step": 2135 }, { "epoch": 0.5648673617526726, "grad_norm": 0.2574158310890198, "learning_rate": 4.9648042937407566e-05, "loss": 0.1143, "num_input_tokens_seen": 1913248, "step": 2140 }, { "epoch": 0.5661871453081695, "grad_norm": 0.358491986989975, "learning_rate": 4.964639948058297e-05, "loss": 0.1472, "num_input_tokens_seen": 1917792, "step": 2145 }, { "epoch": 0.5675069288636664, "grad_norm": 0.5776029229164124, "learning_rate": 4.9644752222973846e-05, "loss": 0.1022, "num_input_tokens_seen": 1922464, "step": 2150 }, { "epoch": 0.5688267124191633, "grad_norm": 0.192543625831604, "learning_rate": 4.964310116483422e-05, "loss": 0.0435, "num_input_tokens_seen": 1927168, "step": 2155 }, { "epoch": 0.5701464959746602, "grad_norm": 0.4438026249408722, "learning_rate": 4.964144630641872e-05, "loss": 0.1011, "num_input_tokens_seen": 1932096, "step": 2160 }, { "epoch": 0.571466279530157, "grad_norm": 0.303320050239563, "learning_rate": 4.9639787647982525e-05, "loss": 0.0974, "num_input_tokens_seen": 1936384, "step": 2165 }, { "epoch": 0.5727860630856539, "grad_norm": 0.22063741087913513, "learning_rate": 4.963812518978143e-05, "loss": 0.089, "num_input_tokens_seen": 1940960, "step": 2170 }, { "epoch": 0.5741058466411508, "grad_norm": 0.4069156348705292, "learning_rate": 4.963645893207182e-05, "loss": 0.1086, "num_input_tokens_seen": 1945408, "step": 2175 }, { "epoch": 0.5754256301966477, "grad_norm": 0.16308940947055817, "learning_rate": 4.963478887511063e-05, "loss": 0.0715, "num_input_tokens_seen": 1949888, "step": 2180 }, { "epoch": 0.5767454137521446, "grad_norm": 0.1811479777097702, "learning_rate": 4.963311501915542e-05, "loss": 0.0517, "num_input_tokens_seen": 1954304, "step": 2185 }, { "epoch": 0.5780651973076415, "grad_norm": 0.08718302100896835, "learning_rate": 4.963143736446432e-05, "loss": 0.0984, "num_input_tokens_seen": 1958912, "step": 2190 }, { "epoch": 0.5793849808631385, "grad_norm": 0.35911914706230164, "learning_rate": 4.962975591129603e-05, "loss": 0.0937, "num_input_tokens_seen": 1963456, "step": 2195 }, { "epoch": 0.5807047644186354, "grad_norm": 0.18520918488502502, "learning_rate": 4.962807065990986e-05, "loss": 0.0825, "num_input_tokens_seen": 1968064, "step": 2200 }, { "epoch": 0.5807047644186354, "eval_loss": 0.09268565475940704, "eval_runtime": 64.744, "eval_samples_per_second": 104.025, "eval_steps_per_second": 26.01, "num_input_tokens_seen": 1968064, "step": 2200 }, { "epoch": 0.5820245479741323, "grad_norm": 0.27053436636924744, "learning_rate": 4.9626381610565714e-05, "loss": 0.0981, "num_input_tokens_seen": 1972480, "step": 2205 }, { "epoch": 0.5833443315296292, "grad_norm": 0.1914701610803604, "learning_rate": 4.9624688763524043e-05, "loss": 0.061, "num_input_tokens_seen": 1976832, "step": 2210 }, { "epoch": 0.5846641150851261, "grad_norm": 0.03107849694788456, "learning_rate": 4.962299211904591e-05, "loss": 0.0698, "num_input_tokens_seen": 1981344, "step": 2215 }, { "epoch": 0.585983898640623, "grad_norm": 0.2835747003555298, "learning_rate": 4.962129167739296e-05, "loss": 0.0847, "num_input_tokens_seen": 1986080, "step": 2220 }, { "epoch": 0.5873036821961198, "grad_norm": 0.6673892736434937, "learning_rate": 4.961958743882742e-05, "loss": 0.0914, "num_input_tokens_seen": 1990496, "step": 2225 }, { "epoch": 0.5886234657516167, "grad_norm": 0.4252258241176605, "learning_rate": 4.961787940361211e-05, "loss": 0.0917, "num_input_tokens_seen": 1994880, "step": 2230 }, { "epoch": 0.5899432493071136, "grad_norm": 0.29872697591781616, "learning_rate": 4.961616757201043e-05, "loss": 0.062, "num_input_tokens_seen": 1999232, "step": 2235 }, { "epoch": 0.5912630328626105, "grad_norm": 0.15087249875068665, "learning_rate": 4.961445194428637e-05, "loss": 0.0507, "num_input_tokens_seen": 2003680, "step": 2240 }, { "epoch": 0.5925828164181074, "grad_norm": 0.46493667364120483, "learning_rate": 4.9612732520704486e-05, "loss": 0.1081, "num_input_tokens_seen": 2008128, "step": 2245 }, { "epoch": 0.5939025999736043, "grad_norm": 0.12040568888187408, "learning_rate": 4.961100930152994e-05, "loss": 0.0699, "num_input_tokens_seen": 2012864, "step": 2250 }, { "epoch": 0.5952223835291012, "grad_norm": 0.3718259632587433, "learning_rate": 4.960928228702849e-05, "loss": 0.115, "num_input_tokens_seen": 2017248, "step": 2255 }, { "epoch": 0.5965421670845982, "grad_norm": 0.4353402853012085, "learning_rate": 4.960755147746645e-05, "loss": 0.1249, "num_input_tokens_seen": 2021216, "step": 2260 }, { "epoch": 0.5978619506400951, "grad_norm": 0.4015991985797882, "learning_rate": 4.9605816873110736e-05, "loss": 0.0889, "num_input_tokens_seen": 2025600, "step": 2265 }, { "epoch": 0.599181734195592, "grad_norm": 0.47197839617729187, "learning_rate": 4.960407847422883e-05, "loss": 0.1076, "num_input_tokens_seen": 2029760, "step": 2270 }, { "epoch": 0.6005015177510888, "grad_norm": 0.09070291370153427, "learning_rate": 4.960233628108885e-05, "loss": 0.0808, "num_input_tokens_seen": 2034272, "step": 2275 }, { "epoch": 0.6018213013065857, "grad_norm": 0.39558684825897217, "learning_rate": 4.960059029395942e-05, "loss": 0.1061, "num_input_tokens_seen": 2038464, "step": 2280 }, { "epoch": 0.6031410848620826, "grad_norm": 0.33149096369743347, "learning_rate": 4.959884051310983e-05, "loss": 0.1393, "num_input_tokens_seen": 2042528, "step": 2285 }, { "epoch": 0.6044608684175795, "grad_norm": 0.1789817363023758, "learning_rate": 4.959708693880991e-05, "loss": 0.0529, "num_input_tokens_seen": 2047008, "step": 2290 }, { "epoch": 0.6057806519730764, "grad_norm": 0.29706957936286926, "learning_rate": 4.9595329571330074e-05, "loss": 0.0279, "num_input_tokens_seen": 2051776, "step": 2295 }, { "epoch": 0.6071004355285733, "grad_norm": 0.22138841450214386, "learning_rate": 4.9593568410941326e-05, "loss": 0.0398, "num_input_tokens_seen": 2056384, "step": 2300 }, { "epoch": 0.6084202190840702, "grad_norm": 0.46790894865989685, "learning_rate": 4.959180345791528e-05, "loss": 0.1314, "num_input_tokens_seen": 2060512, "step": 2305 }, { "epoch": 0.6097400026395671, "grad_norm": 0.30988621711730957, "learning_rate": 4.9590034712524086e-05, "loss": 0.0953, "num_input_tokens_seen": 2064864, "step": 2310 }, { "epoch": 0.611059786195064, "grad_norm": 0.35101160407066345, "learning_rate": 4.958826217504053e-05, "loss": 0.0846, "num_input_tokens_seen": 2068896, "step": 2315 }, { "epoch": 0.612379569750561, "grad_norm": 0.3100353181362152, "learning_rate": 4.958648584573795e-05, "loss": 0.0538, "num_input_tokens_seen": 2073152, "step": 2320 }, { "epoch": 0.6136993533060578, "grad_norm": 0.3215196430683136, "learning_rate": 4.958470572489028e-05, "loss": 0.0922, "num_input_tokens_seen": 2077664, "step": 2325 }, { "epoch": 0.6150191368615547, "grad_norm": 0.1023888811469078, "learning_rate": 4.958292181277203e-05, "loss": 0.0528, "num_input_tokens_seen": 2082496, "step": 2330 }, { "epoch": 0.6163389204170516, "grad_norm": 0.7924720644950867, "learning_rate": 4.958113410965832e-05, "loss": 0.1278, "num_input_tokens_seen": 2086880, "step": 2335 }, { "epoch": 0.6176587039725485, "grad_norm": 0.07269216328859329, "learning_rate": 4.957934261582481e-05, "loss": 0.0495, "num_input_tokens_seen": 2091488, "step": 2340 }, { "epoch": 0.6189784875280454, "grad_norm": 0.12217990309000015, "learning_rate": 4.95775473315478e-05, "loss": 0.0602, "num_input_tokens_seen": 2096000, "step": 2345 }, { "epoch": 0.6202982710835423, "grad_norm": 0.33540695905685425, "learning_rate": 4.9575748257104124e-05, "loss": 0.0447, "num_input_tokens_seen": 2100544, "step": 2350 }, { "epoch": 0.6216180546390392, "grad_norm": 0.2551947236061096, "learning_rate": 4.9573945392771224e-05, "loss": 0.0528, "num_input_tokens_seen": 2104928, "step": 2355 }, { "epoch": 0.6229378381945361, "grad_norm": 0.26239874958992004, "learning_rate": 4.9572138738827134e-05, "loss": 0.0501, "num_input_tokens_seen": 2109120, "step": 2360 }, { "epoch": 0.624257621750033, "grad_norm": 0.15608693659305573, "learning_rate": 4.957032829555046e-05, "loss": 0.0537, "num_input_tokens_seen": 2113632, "step": 2365 }, { "epoch": 0.6255774053055299, "grad_norm": 0.15318377315998077, "learning_rate": 4.956851406322039e-05, "loss": 0.0799, "num_input_tokens_seen": 2118496, "step": 2370 }, { "epoch": 0.6268971888610267, "grad_norm": 0.17658647894859314, "learning_rate": 4.9566696042116704e-05, "loss": 0.0571, "num_input_tokens_seen": 2123008, "step": 2375 }, { "epoch": 0.6282169724165237, "grad_norm": 0.28433656692504883, "learning_rate": 4.9564874232519766e-05, "loss": 0.097, "num_input_tokens_seen": 2127488, "step": 2380 }, { "epoch": 0.6295367559720206, "grad_norm": 0.4256756007671356, "learning_rate": 4.9563048634710516e-05, "loss": 0.0586, "num_input_tokens_seen": 2132032, "step": 2385 }, { "epoch": 0.6308565395275175, "grad_norm": 0.1269175261259079, "learning_rate": 4.956121924897049e-05, "loss": 0.0535, "num_input_tokens_seen": 2136416, "step": 2390 }, { "epoch": 0.6321763230830144, "grad_norm": 0.11275062710046768, "learning_rate": 4.955938607558181e-05, "loss": 0.1297, "num_input_tokens_seen": 2140608, "step": 2395 }, { "epoch": 0.6334961066385113, "grad_norm": 0.26657289266586304, "learning_rate": 4.955754911482715e-05, "loss": 0.1149, "num_input_tokens_seen": 2145056, "step": 2400 }, { "epoch": 0.6334961066385113, "eval_loss": 0.09172502905130386, "eval_runtime": 64.8864, "eval_samples_per_second": 103.797, "eval_steps_per_second": 25.953, "num_input_tokens_seen": 2145056, "step": 2400 }, { "epoch": 0.6348158901940082, "grad_norm": 0.19227822124958038, "learning_rate": 4.9555708366989804e-05, "loss": 0.1678, "num_input_tokens_seen": 2149376, "step": 2405 }, { "epoch": 0.6361356737495051, "grad_norm": 0.25120624899864197, "learning_rate": 4.9553863832353655e-05, "loss": 0.081, "num_input_tokens_seen": 2153728, "step": 2410 }, { "epoch": 0.637455457305002, "grad_norm": 0.24276836216449738, "learning_rate": 4.955201551120313e-05, "loss": 0.0659, "num_input_tokens_seen": 2158464, "step": 2415 }, { "epoch": 0.6387752408604989, "grad_norm": 0.2187170386314392, "learning_rate": 4.955016340382328e-05, "loss": 0.079, "num_input_tokens_seen": 2162912, "step": 2420 }, { "epoch": 0.6400950244159958, "grad_norm": 0.3755386769771576, "learning_rate": 4.954830751049972e-05, "loss": 0.0792, "num_input_tokens_seen": 2167552, "step": 2425 }, { "epoch": 0.6414148079714926, "grad_norm": 0.48159822821617126, "learning_rate": 4.954644783151864e-05, "loss": 0.0781, "num_input_tokens_seen": 2171904, "step": 2430 }, { "epoch": 0.6427345915269895, "grad_norm": 0.2776750922203064, "learning_rate": 4.954458436716684e-05, "loss": 0.0618, "num_input_tokens_seen": 2176480, "step": 2435 }, { "epoch": 0.6440543750824864, "grad_norm": 0.2558130621910095, "learning_rate": 4.954271711773168e-05, "loss": 0.0658, "num_input_tokens_seen": 2181024, "step": 2440 }, { "epoch": 0.6453741586379834, "grad_norm": 0.21499189734458923, "learning_rate": 4.9540846083501115e-05, "loss": 0.0774, "num_input_tokens_seen": 2185376, "step": 2445 }, { "epoch": 0.6466939421934803, "grad_norm": 0.32245105504989624, "learning_rate": 4.953897126476369e-05, "loss": 0.0964, "num_input_tokens_seen": 2189792, "step": 2450 }, { "epoch": 0.6480137257489772, "grad_norm": 0.1765044629573822, "learning_rate": 4.9537092661808514e-05, "loss": 0.0775, "num_input_tokens_seen": 2194272, "step": 2455 }, { "epoch": 0.6493335093044741, "grad_norm": 0.21801163256168365, "learning_rate": 4.9535210274925306e-05, "loss": 0.0538, "num_input_tokens_seen": 2198496, "step": 2460 }, { "epoch": 0.650653292859971, "grad_norm": 0.18894007802009583, "learning_rate": 4.953332410440435e-05, "loss": 0.0871, "num_input_tokens_seen": 2202720, "step": 2465 }, { "epoch": 0.6519730764154679, "grad_norm": 0.30771195888519287, "learning_rate": 4.9531434150536496e-05, "loss": 0.1158, "num_input_tokens_seen": 2207488, "step": 2470 }, { "epoch": 0.6532928599709648, "grad_norm": 0.23108595609664917, "learning_rate": 4.952954041361322e-05, "loss": 0.0905, "num_input_tokens_seen": 2211904, "step": 2475 }, { "epoch": 0.6546126435264616, "grad_norm": 0.16950362920761108, "learning_rate": 4.952764289392655e-05, "loss": 0.0747, "num_input_tokens_seen": 2216128, "step": 2480 }, { "epoch": 0.6559324270819585, "grad_norm": 0.21451860666275024, "learning_rate": 4.952574159176912e-05, "loss": 0.0796, "num_input_tokens_seen": 2220736, "step": 2485 }, { "epoch": 0.6572522106374554, "grad_norm": 0.22987647354602814, "learning_rate": 4.952383650743413e-05, "loss": 0.0619, "num_input_tokens_seen": 2225248, "step": 2490 }, { "epoch": 0.6585719941929523, "grad_norm": 0.28814026713371277, "learning_rate": 4.952192764121536e-05, "loss": 0.0875, "num_input_tokens_seen": 2229664, "step": 2495 }, { "epoch": 0.6598917777484492, "grad_norm": 0.151066392660141, "learning_rate": 4.9520014993407185e-05, "loss": 0.0701, "num_input_tokens_seen": 2233984, "step": 2500 }, { "epoch": 0.6612115613039462, "grad_norm": 0.11902409046888351, "learning_rate": 4.951809856430456e-05, "loss": 0.0988, "num_input_tokens_seen": 2238176, "step": 2505 }, { "epoch": 0.6625313448594431, "grad_norm": 0.16067589819431305, "learning_rate": 4.951617835420303e-05, "loss": 0.0566, "num_input_tokens_seen": 2242528, "step": 2510 }, { "epoch": 0.66385112841494, "grad_norm": 0.3467065393924713, "learning_rate": 4.951425436339869e-05, "loss": 0.0933, "num_input_tokens_seen": 2247136, "step": 2515 }, { "epoch": 0.6651709119704369, "grad_norm": 0.06977517902851105, "learning_rate": 4.9512326592188274e-05, "loss": 0.0749, "num_input_tokens_seen": 2251328, "step": 2520 }, { "epoch": 0.6664906955259338, "grad_norm": 0.18142037093639374, "learning_rate": 4.9510395040869054e-05, "loss": 0.0498, "num_input_tokens_seen": 2255744, "step": 2525 }, { "epoch": 0.6678104790814307, "grad_norm": 0.30557191371917725, "learning_rate": 4.9508459709738905e-05, "loss": 0.0793, "num_input_tokens_seen": 2260384, "step": 2530 }, { "epoch": 0.6691302626369275, "grad_norm": 0.22357675433158875, "learning_rate": 4.950652059909627e-05, "loss": 0.0703, "num_input_tokens_seen": 2264800, "step": 2535 }, { "epoch": 0.6704500461924244, "grad_norm": 0.1345997303724289, "learning_rate": 4.95045777092402e-05, "loss": 0.0687, "num_input_tokens_seen": 2269376, "step": 2540 }, { "epoch": 0.6717698297479213, "grad_norm": 0.26949450373649597, "learning_rate": 4.950263104047031e-05, "loss": 0.0743, "num_input_tokens_seen": 2273888, "step": 2545 }, { "epoch": 0.6730896133034182, "grad_norm": 0.07178498059511185, "learning_rate": 4.9500680593086775e-05, "loss": 0.0689, "num_input_tokens_seen": 2278624, "step": 2550 }, { "epoch": 0.6744093968589151, "grad_norm": 0.1158762127161026, "learning_rate": 4.94987263673904e-05, "loss": 0.0643, "num_input_tokens_seen": 2283232, "step": 2555 }, { "epoch": 0.675729180414412, "grad_norm": 0.23397794365882874, "learning_rate": 4.949676836368256e-05, "loss": 0.0641, "num_input_tokens_seen": 2287456, "step": 2560 }, { "epoch": 0.677048963969909, "grad_norm": 0.27922961115837097, "learning_rate": 4.949480658226518e-05, "loss": 0.0903, "num_input_tokens_seen": 2292320, "step": 2565 }, { "epoch": 0.6783687475254059, "grad_norm": 0.10881134122610092, "learning_rate": 4.949284102344082e-05, "loss": 0.0472, "num_input_tokens_seen": 2296736, "step": 2570 }, { "epoch": 0.6796885310809028, "grad_norm": 0.12265644967556, "learning_rate": 4.9490871687512565e-05, "loss": 0.0784, "num_input_tokens_seen": 2301312, "step": 2575 }, { "epoch": 0.6810083146363997, "grad_norm": 0.22348056733608246, "learning_rate": 4.948889857478413e-05, "loss": 0.0526, "num_input_tokens_seen": 2305792, "step": 2580 }, { "epoch": 0.6823280981918965, "grad_norm": 0.1187608391046524, "learning_rate": 4.948692168555978e-05, "loss": 0.0558, "num_input_tokens_seen": 2310272, "step": 2585 }, { "epoch": 0.6836478817473934, "grad_norm": 0.18306167423725128, "learning_rate": 4.94849410201444e-05, "loss": 0.0959, "num_input_tokens_seen": 2314400, "step": 2590 }, { "epoch": 0.6849676653028903, "grad_norm": 0.34867310523986816, "learning_rate": 4.948295657884341e-05, "loss": 0.0849, "num_input_tokens_seen": 2319200, "step": 2595 }, { "epoch": 0.6862874488583872, "grad_norm": 0.10164482146501541, "learning_rate": 4.9480968361962835e-05, "loss": 0.0759, "num_input_tokens_seen": 2323552, "step": 2600 }, { "epoch": 0.6862874488583872, "eval_loss": 0.09086666256189346, "eval_runtime": 64.7204, "eval_samples_per_second": 104.063, "eval_steps_per_second": 26.02, "num_input_tokens_seen": 2323552, "step": 2600 }, { "epoch": 0.6876072324138841, "grad_norm": 0.39820751547813416, "learning_rate": 4.9478976369809305e-05, "loss": 0.1088, "num_input_tokens_seen": 2328352, "step": 2605 }, { "epoch": 0.688927015969381, "grad_norm": 0.2730591297149658, "learning_rate": 4.947698060268999e-05, "loss": 0.12, "num_input_tokens_seen": 2332864, "step": 2610 }, { "epoch": 0.6902467995248779, "grad_norm": 0.07695958018302917, "learning_rate": 4.9474981060912665e-05, "loss": 0.0602, "num_input_tokens_seen": 2337056, "step": 2615 }, { "epoch": 0.6915665830803748, "grad_norm": 0.05772676318883896, "learning_rate": 4.94729777447857e-05, "loss": 0.0681, "num_input_tokens_seen": 2341664, "step": 2620 }, { "epoch": 0.6928863666358717, "grad_norm": 0.5159198641777039, "learning_rate": 4.947097065461801e-05, "loss": 0.0939, "num_input_tokens_seen": 2346144, "step": 2625 }, { "epoch": 0.6942061501913687, "grad_norm": 0.27272090315818787, "learning_rate": 4.9468959790719125e-05, "loss": 0.0579, "num_input_tokens_seen": 2350496, "step": 2630 }, { "epoch": 0.6955259337468656, "grad_norm": 0.20330604910850525, "learning_rate": 4.9466945153399146e-05, "loss": 0.0753, "num_input_tokens_seen": 2354976, "step": 2635 }, { "epoch": 0.6968457173023624, "grad_norm": 0.3083922564983368, "learning_rate": 4.9464926742968755e-05, "loss": 0.0749, "num_input_tokens_seen": 2359520, "step": 2640 }, { "epoch": 0.6981655008578593, "grad_norm": 0.40600186586380005, "learning_rate": 4.946290455973921e-05, "loss": 0.1596, "num_input_tokens_seen": 2363808, "step": 2645 }, { "epoch": 0.6994852844133562, "grad_norm": 0.44406139850616455, "learning_rate": 4.9460878604022365e-05, "loss": 0.1028, "num_input_tokens_seen": 2368000, "step": 2650 }, { "epoch": 0.7008050679688531, "grad_norm": 0.37707364559173584, "learning_rate": 4.945884887613065e-05, "loss": 0.1023, "num_input_tokens_seen": 2372352, "step": 2655 }, { "epoch": 0.70212485152435, "grad_norm": 0.06486539542675018, "learning_rate": 4.9456815376377055e-05, "loss": 0.0739, "num_input_tokens_seen": 2376928, "step": 2660 }, { "epoch": 0.7034446350798469, "grad_norm": 0.13737806677818298, "learning_rate": 4.9454778105075195e-05, "loss": 0.0907, "num_input_tokens_seen": 2381568, "step": 2665 }, { "epoch": 0.7047644186353438, "grad_norm": 0.17064864933490753, "learning_rate": 4.945273706253924e-05, "loss": 0.0756, "num_input_tokens_seen": 2386144, "step": 2670 }, { "epoch": 0.7060842021908407, "grad_norm": 0.20768266916275024, "learning_rate": 4.9450692249083925e-05, "loss": 0.1401, "num_input_tokens_seen": 2390432, "step": 2675 }, { "epoch": 0.7074039857463376, "grad_norm": 0.2147536724805832, "learning_rate": 4.9448643665024605e-05, "loss": 0.0485, "num_input_tokens_seen": 2395008, "step": 2680 }, { "epoch": 0.7087237693018344, "grad_norm": 0.31932583451271057, "learning_rate": 4.944659131067719e-05, "loss": 0.0752, "num_input_tokens_seen": 2399424, "step": 2685 }, { "epoch": 0.7100435528573315, "grad_norm": 0.21582411229610443, "learning_rate": 4.944453518635818e-05, "loss": 0.1005, "num_input_tokens_seen": 2403840, "step": 2690 }, { "epoch": 0.7113633364128283, "grad_norm": 0.07626861333847046, "learning_rate": 4.944247529238465e-05, "loss": 0.0609, "num_input_tokens_seen": 2408000, "step": 2695 }, { "epoch": 0.7126831199683252, "grad_norm": 0.2858969569206238, "learning_rate": 4.944041162907427e-05, "loss": 0.0944, "num_input_tokens_seen": 2412320, "step": 2700 }, { "epoch": 0.7140029035238221, "grad_norm": 0.26525992155075073, "learning_rate": 4.943834419674529e-05, "loss": 0.0927, "num_input_tokens_seen": 2416864, "step": 2705 }, { "epoch": 0.715322687079319, "grad_norm": 0.31241464614868164, "learning_rate": 4.9436272995716506e-05, "loss": 0.0832, "num_input_tokens_seen": 2420992, "step": 2710 }, { "epoch": 0.7166424706348159, "grad_norm": 0.12966488301753998, "learning_rate": 4.943419802630735e-05, "loss": 0.0938, "num_input_tokens_seen": 2425408, "step": 2715 }, { "epoch": 0.7179622541903128, "grad_norm": 0.35114187002182007, "learning_rate": 4.94321192888378e-05, "loss": 0.1361, "num_input_tokens_seen": 2429888, "step": 2720 }, { "epoch": 0.7192820377458097, "grad_norm": 0.4297981858253479, "learning_rate": 4.943003678362842e-05, "loss": 0.0705, "num_input_tokens_seen": 2434144, "step": 2725 }, { "epoch": 0.7206018213013066, "grad_norm": 0.321590781211853, "learning_rate": 4.942795051100036e-05, "loss": 0.0879, "num_input_tokens_seen": 2438656, "step": 2730 }, { "epoch": 0.7219216048568035, "grad_norm": 0.1313585788011551, "learning_rate": 4.942586047127536e-05, "loss": 0.0497, "num_input_tokens_seen": 2443072, "step": 2735 }, { "epoch": 0.7232413884123003, "grad_norm": 0.33832108974456787, "learning_rate": 4.942376666477571e-05, "loss": 0.1593, "num_input_tokens_seen": 2447392, "step": 2740 }, { "epoch": 0.7245611719677972, "grad_norm": 0.3430827260017395, "learning_rate": 4.9421669091824304e-05, "loss": 0.0778, "num_input_tokens_seen": 2451744, "step": 2745 }, { "epoch": 0.7258809555232941, "grad_norm": 0.2370171844959259, "learning_rate": 4.9419567752744634e-05, "loss": 0.0927, "num_input_tokens_seen": 2456032, "step": 2750 }, { "epoch": 0.7272007390787911, "grad_norm": 0.3264555335044861, "learning_rate": 4.941746264786074e-05, "loss": 0.0611, "num_input_tokens_seen": 2460832, "step": 2755 }, { "epoch": 0.728520522634288, "grad_norm": 0.2719355821609497, "learning_rate": 4.9415353777497254e-05, "loss": 0.0921, "num_input_tokens_seen": 2465120, "step": 2760 }, { "epoch": 0.7298403061897849, "grad_norm": 0.3218131363391876, "learning_rate": 4.9413241141979394e-05, "loss": 0.0702, "num_input_tokens_seen": 2469760, "step": 2765 }, { "epoch": 0.7311600897452818, "grad_norm": 0.09677737951278687, "learning_rate": 4.9411124741632956e-05, "loss": 0.082, "num_input_tokens_seen": 2474208, "step": 2770 }, { "epoch": 0.7324798733007787, "grad_norm": 0.22909219563007355, "learning_rate": 4.940900457678431e-05, "loss": 0.0522, "num_input_tokens_seen": 2478656, "step": 2775 }, { "epoch": 0.7337996568562756, "grad_norm": 0.18849419057369232, "learning_rate": 4.9406880647760425e-05, "loss": 0.0643, "num_input_tokens_seen": 2483232, "step": 2780 }, { "epoch": 0.7351194404117725, "grad_norm": 0.14013653993606567, "learning_rate": 4.9404752954888824e-05, "loss": 0.0864, "num_input_tokens_seen": 2487872, "step": 2785 }, { "epoch": 0.7364392239672694, "grad_norm": 0.11565928161144257, "learning_rate": 4.940262149849762e-05, "loss": 0.0598, "num_input_tokens_seen": 2492640, "step": 2790 }, { "epoch": 0.7377590075227662, "grad_norm": 0.1926334649324417, "learning_rate": 4.9400486278915526e-05, "loss": 0.0519, "num_input_tokens_seen": 2497120, "step": 2795 }, { "epoch": 0.7390787910782631, "grad_norm": 0.62828528881073, "learning_rate": 4.939834729647181e-05, "loss": 0.1633, "num_input_tokens_seen": 2501632, "step": 2800 }, { "epoch": 0.7390787910782631, "eval_loss": 0.08939369022846222, "eval_runtime": 64.7515, "eval_samples_per_second": 104.013, "eval_steps_per_second": 26.007, "num_input_tokens_seen": 2501632, "step": 2800 }, { "epoch": 0.74039857463376, "grad_norm": 0.3150688409805298, "learning_rate": 4.9396204551496326e-05, "loss": 0.0693, "num_input_tokens_seen": 2506080, "step": 2805 }, { "epoch": 0.7417183581892569, "grad_norm": 0.17197035253047943, "learning_rate": 4.939405804431952e-05, "loss": 0.054, "num_input_tokens_seen": 2510272, "step": 2810 }, { "epoch": 0.7430381417447539, "grad_norm": 0.08846244215965271, "learning_rate": 4.9391907775272414e-05, "loss": 0.1341, "num_input_tokens_seen": 2514944, "step": 2815 }, { "epoch": 0.7443579253002508, "grad_norm": 0.3886677026748657, "learning_rate": 4.9389753744686604e-05, "loss": 0.0793, "num_input_tokens_seen": 2519424, "step": 2820 }, { "epoch": 0.7456777088557477, "grad_norm": 0.4647468626499176, "learning_rate": 4.938759595289426e-05, "loss": 0.0912, "num_input_tokens_seen": 2524000, "step": 2825 }, { "epoch": 0.7469974924112446, "grad_norm": 0.25056251883506775, "learning_rate": 4.938543440022815e-05, "loss": 0.1046, "num_input_tokens_seen": 2528448, "step": 2830 }, { "epoch": 0.7483172759667415, "grad_norm": 0.44536638259887695, "learning_rate": 4.938326908702161e-05, "loss": 0.1374, "num_input_tokens_seen": 2532672, "step": 2835 }, { "epoch": 0.7496370595222384, "grad_norm": 0.18656006455421448, "learning_rate": 4.9381100013608554e-05, "loss": 0.1024, "num_input_tokens_seen": 2536928, "step": 2840 }, { "epoch": 0.7509568430777352, "grad_norm": 0.32168400287628174, "learning_rate": 4.9378927180323485e-05, "loss": 0.1016, "num_input_tokens_seen": 2541280, "step": 2845 }, { "epoch": 0.7522766266332321, "grad_norm": 0.34983646869659424, "learning_rate": 4.937675058750148e-05, "loss": 0.0927, "num_input_tokens_seen": 2545856, "step": 2850 }, { "epoch": 0.753596410188729, "grad_norm": 0.3630096912384033, "learning_rate": 4.937457023547819e-05, "loss": 0.0772, "num_input_tokens_seen": 2550432, "step": 2855 }, { "epoch": 0.7549161937442259, "grad_norm": 0.1443314403295517, "learning_rate": 4.9372386124589876e-05, "loss": 0.0421, "num_input_tokens_seen": 2555008, "step": 2860 }, { "epoch": 0.7562359772997228, "grad_norm": 0.12975271046161652, "learning_rate": 4.937019825517333e-05, "loss": 0.0398, "num_input_tokens_seen": 2559424, "step": 2865 }, { "epoch": 0.7575557608552197, "grad_norm": 0.09849181026220322, "learning_rate": 4.9368006627565954e-05, "loss": 0.0476, "num_input_tokens_seen": 2563840, "step": 2870 }, { "epoch": 0.7588755444107167, "grad_norm": 0.2932078242301941, "learning_rate": 4.936581124210573e-05, "loss": 0.0717, "num_input_tokens_seen": 2568416, "step": 2875 }, { "epoch": 0.7601953279662136, "grad_norm": 0.13444513082504272, "learning_rate": 4.9363612099131216e-05, "loss": 0.1089, "num_input_tokens_seen": 2572768, "step": 2880 }, { "epoch": 0.7615151115217105, "grad_norm": 0.5697705149650574, "learning_rate": 4.936140919898155e-05, "loss": 0.0899, "num_input_tokens_seen": 2577120, "step": 2885 }, { "epoch": 0.7628348950772074, "grad_norm": 0.2412351816892624, "learning_rate": 4.9359202541996426e-05, "loss": 0.0836, "num_input_tokens_seen": 2581824, "step": 2890 }, { "epoch": 0.7641546786327043, "grad_norm": 0.2971256971359253, "learning_rate": 4.935699212851616e-05, "loss": 0.0955, "num_input_tokens_seen": 2586464, "step": 2895 }, { "epoch": 0.7654744621882011, "grad_norm": 0.3475034534931183, "learning_rate": 4.935477795888162e-05, "loss": 0.1162, "num_input_tokens_seen": 2591456, "step": 2900 }, { "epoch": 0.766794245743698, "grad_norm": 0.06647197157144547, "learning_rate": 4.935256003343426e-05, "loss": 0.0507, "num_input_tokens_seen": 2595840, "step": 2905 }, { "epoch": 0.7681140292991949, "grad_norm": 0.5598300695419312, "learning_rate": 4.93503383525161e-05, "loss": 0.0917, "num_input_tokens_seen": 2600096, "step": 2910 }, { "epoch": 0.7694338128546918, "grad_norm": 0.2598221004009247, "learning_rate": 4.934811291646977e-05, "loss": 0.0668, "num_input_tokens_seen": 2604608, "step": 2915 }, { "epoch": 0.7707535964101887, "grad_norm": 0.29899170994758606, "learning_rate": 4.934588372563845e-05, "loss": 0.1085, "num_input_tokens_seen": 2609056, "step": 2920 }, { "epoch": 0.7720733799656856, "grad_norm": 0.4705277383327484, "learning_rate": 4.93436507803659e-05, "loss": 0.1212, "num_input_tokens_seen": 2613440, "step": 2925 }, { "epoch": 0.7733931635211825, "grad_norm": 0.43246787786483765, "learning_rate": 4.934141408099649e-05, "loss": 0.0598, "num_input_tokens_seen": 2617920, "step": 2930 }, { "epoch": 0.7747129470766794, "grad_norm": 0.10244724154472351, "learning_rate": 4.9339173627875135e-05, "loss": 0.0838, "num_input_tokens_seen": 2622560, "step": 2935 }, { "epoch": 0.7760327306321764, "grad_norm": 0.2124890238046646, "learning_rate": 4.9336929421347335e-05, "loss": 0.0574, "num_input_tokens_seen": 2627200, "step": 2940 }, { "epoch": 0.7773525141876733, "grad_norm": 0.31476423144340515, "learning_rate": 4.933468146175918e-05, "loss": 0.1223, "num_input_tokens_seen": 2631904, "step": 2945 }, { "epoch": 0.7786722977431701, "grad_norm": 0.21239709854125977, "learning_rate": 4.933242974945734e-05, "loss": 0.052, "num_input_tokens_seen": 2636608, "step": 2950 }, { "epoch": 0.779992081298667, "grad_norm": 0.3374054431915283, "learning_rate": 4.933017428478906e-05, "loss": 0.0966, "num_input_tokens_seen": 2641184, "step": 2955 }, { "epoch": 0.7813118648541639, "grad_norm": 0.11836996674537659, "learning_rate": 4.932791506810214e-05, "loss": 0.0548, "num_input_tokens_seen": 2645792, "step": 2960 }, { "epoch": 0.7826316484096608, "grad_norm": 0.29594671726226807, "learning_rate": 4.932565209974499e-05, "loss": 0.1404, "num_input_tokens_seen": 2650080, "step": 2965 }, { "epoch": 0.7839514319651577, "grad_norm": 0.10724084079265594, "learning_rate": 4.93233853800666e-05, "loss": 0.0525, "num_input_tokens_seen": 2654560, "step": 2970 }, { "epoch": 0.7852712155206546, "grad_norm": 0.1988878697156906, "learning_rate": 4.932111490941651e-05, "loss": 0.0633, "num_input_tokens_seen": 2659264, "step": 2975 }, { "epoch": 0.7865909990761515, "grad_norm": 0.4171641767024994, "learning_rate": 4.9318840688144876e-05, "loss": 0.1021, "num_input_tokens_seen": 2663744, "step": 2980 }, { "epoch": 0.7879107826316484, "grad_norm": 0.5788004994392395, "learning_rate": 4.9316562716602387e-05, "loss": 0.051, "num_input_tokens_seen": 2668128, "step": 2985 }, { "epoch": 0.7892305661871453, "grad_norm": 0.30139848589897156, "learning_rate": 4.9314280995140346e-05, "loss": 0.0935, "num_input_tokens_seen": 2672544, "step": 2990 }, { "epoch": 0.7905503497426422, "grad_norm": 0.19543936848640442, "learning_rate": 4.931199552411063e-05, "loss": 0.0759, "num_input_tokens_seen": 2677120, "step": 2995 }, { "epoch": 0.7918701332981392, "grad_norm": 0.575272262096405, "learning_rate": 4.930970630386568e-05, "loss": 0.1118, "num_input_tokens_seen": 2681600, "step": 3000 }, { "epoch": 0.7918701332981392, "eval_loss": 0.08825664222240448, "eval_runtime": 64.717, "eval_samples_per_second": 104.068, "eval_steps_per_second": 26.021, "num_input_tokens_seen": 2681600, "step": 3000 }, { "epoch": 0.793189916853636, "grad_norm": 0.3676469922065735, "learning_rate": 4.9307413334758524e-05, "loss": 0.1311, "num_input_tokens_seen": 2685856, "step": 3005 }, { "epoch": 0.7945097004091329, "grad_norm": 0.5000125765800476, "learning_rate": 4.930511661714276e-05, "loss": 0.0976, "num_input_tokens_seen": 2690080, "step": 3010 }, { "epoch": 0.7958294839646298, "grad_norm": 0.32019558548927307, "learning_rate": 4.9302816151372576e-05, "loss": 0.1112, "num_input_tokens_seen": 2694208, "step": 3015 }, { "epoch": 0.7971492675201267, "grad_norm": 0.3597044050693512, "learning_rate": 4.930051193780274e-05, "loss": 0.1252, "num_input_tokens_seen": 2698880, "step": 3020 }, { "epoch": 0.7984690510756236, "grad_norm": 0.38136425614356995, "learning_rate": 4.929820397678858e-05, "loss": 0.1111, "num_input_tokens_seen": 2703424, "step": 3025 }, { "epoch": 0.7997888346311205, "grad_norm": 0.24444857239723206, "learning_rate": 4.9295892268686015e-05, "loss": 0.0973, "num_input_tokens_seen": 2708032, "step": 3030 }, { "epoch": 0.8011086181866174, "grad_norm": 0.07816655188798904, "learning_rate": 4.9293576813851536e-05, "loss": 0.1342, "num_input_tokens_seen": 2712320, "step": 3035 }, { "epoch": 0.8024284017421143, "grad_norm": 0.21107934415340424, "learning_rate": 4.929125761264223e-05, "loss": 0.0823, "num_input_tokens_seen": 2716896, "step": 3040 }, { "epoch": 0.8037481852976112, "grad_norm": 0.13296520709991455, "learning_rate": 4.928893466541573e-05, "loss": 0.0558, "num_input_tokens_seen": 2721568, "step": 3045 }, { "epoch": 0.805067968853108, "grad_norm": 0.1991877406835556, "learning_rate": 4.928660797253027e-05, "loss": 0.0501, "num_input_tokens_seen": 2726144, "step": 3050 }, { "epoch": 0.8063877524086049, "grad_norm": 0.32446032762527466, "learning_rate": 4.928427753434467e-05, "loss": 0.1009, "num_input_tokens_seen": 2730496, "step": 3055 }, { "epoch": 0.8077075359641019, "grad_norm": 0.5705958604812622, "learning_rate": 4.9281943351218286e-05, "loss": 0.1265, "num_input_tokens_seen": 2735104, "step": 3060 }, { "epoch": 0.8090273195195988, "grad_norm": 0.3894815444946289, "learning_rate": 4.9279605423511095e-05, "loss": 0.0707, "num_input_tokens_seen": 2739776, "step": 3065 }, { "epoch": 0.8103471030750957, "grad_norm": 0.202248215675354, "learning_rate": 4.927726375158363e-05, "loss": 0.0494, "num_input_tokens_seen": 2743968, "step": 3070 }, { "epoch": 0.8116668866305926, "grad_norm": 0.2518572509288788, "learning_rate": 4.9274918335797004e-05, "loss": 0.071, "num_input_tokens_seen": 2748512, "step": 3075 }, { "epoch": 0.8129866701860895, "grad_norm": 0.5856395363807678, "learning_rate": 4.927256917651292e-05, "loss": 0.0763, "num_input_tokens_seen": 2752800, "step": 3080 }, { "epoch": 0.8143064537415864, "grad_norm": 0.42110559344291687, "learning_rate": 4.927021627409364e-05, "loss": 0.1081, "num_input_tokens_seen": 2757344, "step": 3085 }, { "epoch": 0.8156262372970833, "grad_norm": 0.16488933563232422, "learning_rate": 4.9267859628902005e-05, "loss": 0.1161, "num_input_tokens_seen": 2761760, "step": 3090 }, { "epoch": 0.8169460208525802, "grad_norm": 0.24452261626720428, "learning_rate": 4.9265499241301454e-05, "loss": 0.0808, "num_input_tokens_seen": 2766016, "step": 3095 }, { "epoch": 0.8182658044080771, "grad_norm": 0.42030489444732666, "learning_rate": 4.926313511165598e-05, "loss": 0.0999, "num_input_tokens_seen": 2770592, "step": 3100 }, { "epoch": 0.819585587963574, "grad_norm": 0.3904484510421753, "learning_rate": 4.926076724033016e-05, "loss": 0.0856, "num_input_tokens_seen": 2774720, "step": 3105 }, { "epoch": 0.8209053715190708, "grad_norm": 0.40583816170692444, "learning_rate": 4.9258395627689146e-05, "loss": 0.0803, "num_input_tokens_seen": 2779104, "step": 3110 }, { "epoch": 0.8222251550745677, "grad_norm": 0.08078955113887787, "learning_rate": 4.925602027409868e-05, "loss": 0.0733, "num_input_tokens_seen": 2783488, "step": 3115 }, { "epoch": 0.8235449386300646, "grad_norm": 0.4615092873573303, "learning_rate": 4.925364117992507e-05, "loss": 0.0823, "num_input_tokens_seen": 2787712, "step": 3120 }, { "epoch": 0.8248647221855616, "grad_norm": 0.3310774266719818, "learning_rate": 4.92512583455352e-05, "loss": 0.0877, "num_input_tokens_seen": 2791776, "step": 3125 }, { "epoch": 0.8261845057410585, "grad_norm": 0.12238745391368866, "learning_rate": 4.9248871771296536e-05, "loss": 0.0923, "num_input_tokens_seen": 2796448, "step": 3130 }, { "epoch": 0.8275042892965554, "grad_norm": 0.20484626293182373, "learning_rate": 4.924648145757711e-05, "loss": 0.0543, "num_input_tokens_seen": 2800992, "step": 3135 }, { "epoch": 0.8288240728520523, "grad_norm": 0.7171216607093811, "learning_rate": 4.924408740474554e-05, "loss": 0.0834, "num_input_tokens_seen": 2805184, "step": 3140 }, { "epoch": 0.8301438564075492, "grad_norm": 0.09341349452733994, "learning_rate": 4.924168961317103e-05, "loss": 0.0823, "num_input_tokens_seen": 2809824, "step": 3145 }, { "epoch": 0.8314636399630461, "grad_norm": 0.7684889435768127, "learning_rate": 4.9239288083223334e-05, "loss": 0.1546, "num_input_tokens_seen": 2814432, "step": 3150 }, { "epoch": 0.832783423518543, "grad_norm": 0.15415146946907043, "learning_rate": 4.9236882815272803e-05, "loss": 0.0457, "num_input_tokens_seen": 2818688, "step": 3155 }, { "epoch": 0.8341032070740398, "grad_norm": 0.10995927453041077, "learning_rate": 4.9234473809690365e-05, "loss": 0.0538, "num_input_tokens_seen": 2823200, "step": 3160 }, { "epoch": 0.8354229906295367, "grad_norm": 0.5262004137039185, "learning_rate": 4.923206106684752e-05, "loss": 0.082, "num_input_tokens_seen": 2827616, "step": 3165 }, { "epoch": 0.8367427741850336, "grad_norm": 0.22380705177783966, "learning_rate": 4.922964458711634e-05, "loss": 0.0551, "num_input_tokens_seen": 2832320, "step": 3170 }, { "epoch": 0.8380625577405305, "grad_norm": 0.5028089284896851, "learning_rate": 4.9227224370869474e-05, "loss": 0.1208, "num_input_tokens_seen": 2836736, "step": 3175 }, { "epoch": 0.8393823412960274, "grad_norm": 0.20682218670845032, "learning_rate": 4.9224800418480155e-05, "loss": 0.096, "num_input_tokens_seen": 2841376, "step": 3180 }, { "epoch": 0.8407021248515244, "grad_norm": 0.3023107647895813, "learning_rate": 4.9222372730322176e-05, "loss": 0.0749, "num_input_tokens_seen": 2845920, "step": 3185 }, { "epoch": 0.8420219084070213, "grad_norm": 0.0918809026479721, "learning_rate": 4.921994130676993e-05, "loss": 0.0744, "num_input_tokens_seen": 2850144, "step": 3190 }, { "epoch": 0.8433416919625182, "grad_norm": 0.1666431427001953, "learning_rate": 4.9217506148198366e-05, "loss": 0.0717, "num_input_tokens_seen": 2854880, "step": 3195 }, { "epoch": 0.8446614755180151, "grad_norm": 0.41103827953338623, "learning_rate": 4.921506725498302e-05, "loss": 0.0884, "num_input_tokens_seen": 2859456, "step": 3200 }, { "epoch": 0.8446614755180151, "eval_loss": 0.08669708669185638, "eval_runtime": 64.7141, "eval_samples_per_second": 104.073, "eval_steps_per_second": 26.022, "num_input_tokens_seen": 2859456, "step": 3200 }, { "epoch": 0.845981259073512, "grad_norm": 0.8006001710891724, "learning_rate": 4.9212624627499994e-05, "loss": 0.1255, "num_input_tokens_seen": 2864320, "step": 3205 }, { "epoch": 0.8473010426290088, "grad_norm": 0.29559987783432007, "learning_rate": 4.921017826612597e-05, "loss": 0.088, "num_input_tokens_seen": 2868928, "step": 3210 }, { "epoch": 0.8486208261845057, "grad_norm": 0.14707621932029724, "learning_rate": 4.9207728171238223e-05, "loss": 0.0444, "num_input_tokens_seen": 2873536, "step": 3215 }, { "epoch": 0.8499406097400026, "grad_norm": 0.357501357793808, "learning_rate": 4.920527434321458e-05, "loss": 0.0755, "num_input_tokens_seen": 2878112, "step": 3220 }, { "epoch": 0.8512603932954995, "grad_norm": 0.3020641803741455, "learning_rate": 4.920281678243345e-05, "loss": 0.1027, "num_input_tokens_seen": 2882240, "step": 3225 }, { "epoch": 0.8525801768509964, "grad_norm": 0.19769532978534698, "learning_rate": 4.920035548927381e-05, "loss": 0.0494, "num_input_tokens_seen": 2886720, "step": 3230 }, { "epoch": 0.8538999604064933, "grad_norm": 0.29588621854782104, "learning_rate": 4.919789046411525e-05, "loss": 0.0806, "num_input_tokens_seen": 2891072, "step": 3235 }, { "epoch": 0.8552197439619902, "grad_norm": 0.31500303745269775, "learning_rate": 4.919542170733787e-05, "loss": 0.0845, "num_input_tokens_seen": 2895584, "step": 3240 }, { "epoch": 0.8565395275174872, "grad_norm": 0.25430646538734436, "learning_rate": 4.919294921932242e-05, "loss": 0.078, "num_input_tokens_seen": 2899808, "step": 3245 }, { "epoch": 0.8578593110729841, "grad_norm": 0.29362353682518005, "learning_rate": 4.919047300045016e-05, "loss": 0.0642, "num_input_tokens_seen": 2904288, "step": 3250 }, { "epoch": 0.859179094628481, "grad_norm": 0.16907300055027008, "learning_rate": 4.918799305110299e-05, "loss": 0.0351, "num_input_tokens_seen": 2908608, "step": 3255 }, { "epoch": 0.8604988781839779, "grad_norm": 0.07650022208690643, "learning_rate": 4.918550937166331e-05, "loss": 0.0619, "num_input_tokens_seen": 2912768, "step": 3260 }, { "epoch": 0.8618186617394747, "grad_norm": 0.1158524677157402, "learning_rate": 4.918302196251415e-05, "loss": 0.0775, "num_input_tokens_seen": 2917280, "step": 3265 }, { "epoch": 0.8631384452949716, "grad_norm": 0.2968883216381073, "learning_rate": 4.91805308240391e-05, "loss": 0.0798, "num_input_tokens_seen": 2922112, "step": 3270 }, { "epoch": 0.8644582288504685, "grad_norm": 0.40005528926849365, "learning_rate": 4.9178035956622326e-05, "loss": 0.0793, "num_input_tokens_seen": 2926496, "step": 3275 }, { "epoch": 0.8657780124059654, "grad_norm": 0.4341690242290497, "learning_rate": 4.917553736064857e-05, "loss": 0.1084, "num_input_tokens_seen": 2931040, "step": 3280 }, { "epoch": 0.8670977959614623, "grad_norm": 0.26169392466545105, "learning_rate": 4.917303503650314e-05, "loss": 0.0982, "num_input_tokens_seen": 2935648, "step": 3285 }, { "epoch": 0.8684175795169592, "grad_norm": 0.31261658668518066, "learning_rate": 4.917052898457194e-05, "loss": 0.1074, "num_input_tokens_seen": 2939968, "step": 3290 }, { "epoch": 0.8697373630724561, "grad_norm": 0.20674723386764526, "learning_rate": 4.916801920524141e-05, "loss": 0.0539, "num_input_tokens_seen": 2944288, "step": 3295 }, { "epoch": 0.871057146627953, "grad_norm": 0.2808683216571808, "learning_rate": 4.916550569889862e-05, "loss": 0.1111, "num_input_tokens_seen": 2948960, "step": 3300 }, { "epoch": 0.8723769301834499, "grad_norm": 0.6423003077507019, "learning_rate": 4.916298846593116e-05, "loss": 0.0774, "num_input_tokens_seen": 2953472, "step": 3305 }, { "epoch": 0.8736967137389469, "grad_norm": 0.15420837700366974, "learning_rate": 4.916046750672722e-05, "loss": 0.0539, "num_input_tokens_seen": 2957920, "step": 3310 }, { "epoch": 0.8750164972944438, "grad_norm": 0.42586788535118103, "learning_rate": 4.915794282167559e-05, "loss": 0.0668, "num_input_tokens_seen": 2962240, "step": 3315 }, { "epoch": 0.8763362808499406, "grad_norm": 0.2385115772485733, "learning_rate": 4.915541441116558e-05, "loss": 0.0593, "num_input_tokens_seen": 2966912, "step": 3320 }, { "epoch": 0.8776560644054375, "grad_norm": 0.29325252771377563, "learning_rate": 4.915288227558711e-05, "loss": 0.0695, "num_input_tokens_seen": 2971712, "step": 3325 }, { "epoch": 0.8789758479609344, "grad_norm": 0.06891842186450958, "learning_rate": 4.915034641533066e-05, "loss": 0.1283, "num_input_tokens_seen": 2976416, "step": 3330 }, { "epoch": 0.8802956315164313, "grad_norm": 0.5288883447647095, "learning_rate": 4.914780683078731e-05, "loss": 0.0787, "num_input_tokens_seen": 2981120, "step": 3335 }, { "epoch": 0.8816154150719282, "grad_norm": 0.24800622463226318, "learning_rate": 4.9145263522348695e-05, "loss": 0.0916, "num_input_tokens_seen": 2985472, "step": 3340 }, { "epoch": 0.8829351986274251, "grad_norm": 0.4597092866897583, "learning_rate": 4.9142716490407e-05, "loss": 0.115, "num_input_tokens_seen": 2989952, "step": 3345 }, { "epoch": 0.884254982182922, "grad_norm": 0.39968809485435486, "learning_rate": 4.914016573535504e-05, "loss": 0.0462, "num_input_tokens_seen": 2994624, "step": 3350 }, { "epoch": 0.8855747657384189, "grad_norm": 0.17291751503944397, "learning_rate": 4.9137611257586154e-05, "loss": 0.0699, "num_input_tokens_seen": 2999232, "step": 3355 }, { "epoch": 0.8868945492939158, "grad_norm": 0.35235458612442017, "learning_rate": 4.9135053057494274e-05, "loss": 0.1439, "num_input_tokens_seen": 3003648, "step": 3360 }, { "epoch": 0.8882143328494126, "grad_norm": 0.12908470630645752, "learning_rate": 4.913249113547392e-05, "loss": 0.064, "num_input_tokens_seen": 3008448, "step": 3365 }, { "epoch": 0.8895341164049096, "grad_norm": 0.17904619872570038, "learning_rate": 4.912992549192016e-05, "loss": 0.0539, "num_input_tokens_seen": 3012960, "step": 3370 }, { "epoch": 0.8908538999604065, "grad_norm": 0.06685513257980347, "learning_rate": 4.9127356127228665e-05, "loss": 0.1, "num_input_tokens_seen": 3017408, "step": 3375 }, { "epoch": 0.8921736835159034, "grad_norm": 0.19979850947856903, "learning_rate": 4.912478304179564e-05, "loss": 0.0454, "num_input_tokens_seen": 3021728, "step": 3380 }, { "epoch": 0.8934934670714003, "grad_norm": 0.378202348947525, "learning_rate": 4.9122206236017896e-05, "loss": 0.0868, "num_input_tokens_seen": 3026208, "step": 3385 }, { "epoch": 0.8948132506268972, "grad_norm": 0.09463414549827576, "learning_rate": 4.911962571029282e-05, "loss": 0.0613, "num_input_tokens_seen": 3030688, "step": 3390 }, { "epoch": 0.8961330341823941, "grad_norm": 0.45749804377555847, "learning_rate": 4.9117041465018353e-05, "loss": 0.127, "num_input_tokens_seen": 3035040, "step": 3395 }, { "epoch": 0.897452817737891, "grad_norm": 0.072332464158535, "learning_rate": 4.911445350059302e-05, "loss": 0.0661, "num_input_tokens_seen": 3039712, "step": 3400 }, { "epoch": 0.897452817737891, "eval_loss": 0.08539655804634094, "eval_runtime": 64.8209, "eval_samples_per_second": 103.902, "eval_steps_per_second": 25.979, "num_input_tokens_seen": 3039712, "step": 3400 }, { "epoch": 0.8987726012933879, "grad_norm": 0.5369530320167542, "learning_rate": 4.9111861817415905e-05, "loss": 0.0718, "num_input_tokens_seen": 3043904, "step": 3405 }, { "epoch": 0.9000923848488848, "grad_norm": 0.24395689368247986, "learning_rate": 4.91092664158867e-05, "loss": 0.0625, "num_input_tokens_seen": 3048384, "step": 3410 }, { "epoch": 0.9014121684043817, "grad_norm": 0.11107509583234787, "learning_rate": 4.910666729640563e-05, "loss": 0.0868, "num_input_tokens_seen": 3052736, "step": 3415 }, { "epoch": 0.9027319519598785, "grad_norm": 0.15976864099502563, "learning_rate": 4.910406445937353e-05, "loss": 0.0726, "num_input_tokens_seen": 3057376, "step": 3420 }, { "epoch": 0.9040517355153754, "grad_norm": 0.6188194751739502, "learning_rate": 4.9101457905191774e-05, "loss": 0.0803, "num_input_tokens_seen": 3062208, "step": 3425 }, { "epoch": 0.9053715190708723, "grad_norm": 0.2900412380695343, "learning_rate": 4.909884763426233e-05, "loss": 0.1013, "num_input_tokens_seen": 3066304, "step": 3430 }, { "epoch": 0.9066913026263693, "grad_norm": 0.12687575817108154, "learning_rate": 4.9096233646987736e-05, "loss": 0.0705, "num_input_tokens_seen": 3070720, "step": 3435 }, { "epoch": 0.9080110861818662, "grad_norm": 0.15802253782749176, "learning_rate": 4.9093615943771104e-05, "loss": 0.0421, "num_input_tokens_seen": 3075168, "step": 3440 }, { "epoch": 0.9093308697373631, "grad_norm": 0.42126262187957764, "learning_rate": 4.909099452501611e-05, "loss": 0.0763, "num_input_tokens_seen": 3079488, "step": 3445 }, { "epoch": 0.91065065329286, "grad_norm": 0.41974180936813354, "learning_rate": 4.908836939112702e-05, "loss": 0.1557, "num_input_tokens_seen": 3084224, "step": 3450 }, { "epoch": 0.9119704368483569, "grad_norm": 0.26655495166778564, "learning_rate": 4.908574054250865e-05, "loss": 0.1134, "num_input_tokens_seen": 3088800, "step": 3455 }, { "epoch": 0.9132902204038538, "grad_norm": 0.36356255412101746, "learning_rate": 4.9083107979566414e-05, "loss": 0.1157, "num_input_tokens_seen": 3093344, "step": 3460 }, { "epoch": 0.9146100039593507, "grad_norm": 0.2589225769042969, "learning_rate": 4.908047170270628e-05, "loss": 0.0839, "num_input_tokens_seen": 3097600, "step": 3465 }, { "epoch": 0.9159297875148475, "grad_norm": 0.34415295720100403, "learning_rate": 4.9077831712334784e-05, "loss": 0.0756, "num_input_tokens_seen": 3101792, "step": 3470 }, { "epoch": 0.9172495710703444, "grad_norm": 0.5704537034034729, "learning_rate": 4.907518800885907e-05, "loss": 0.095, "num_input_tokens_seen": 3106240, "step": 3475 }, { "epoch": 0.9185693546258413, "grad_norm": 0.37569931149482727, "learning_rate": 4.907254059268681e-05, "loss": 0.1136, "num_input_tokens_seen": 3110688, "step": 3480 }, { "epoch": 0.9198891381813382, "grad_norm": 0.14082498848438263, "learning_rate": 4.906988946422628e-05, "loss": 0.1023, "num_input_tokens_seen": 3115520, "step": 3485 }, { "epoch": 0.9212089217368351, "grad_norm": 0.2145296186208725, "learning_rate": 4.9067234623886315e-05, "loss": 0.0877, "num_input_tokens_seen": 3120128, "step": 3490 }, { "epoch": 0.9225287052923321, "grad_norm": 0.2338973879814148, "learning_rate": 4.9064576072076316e-05, "loss": 0.0814, "num_input_tokens_seen": 3124448, "step": 3495 }, { "epoch": 0.923848488847829, "grad_norm": 0.6379575729370117, "learning_rate": 4.906191380920628e-05, "loss": 0.1634, "num_input_tokens_seen": 3128960, "step": 3500 }, { "epoch": 0.9251682724033259, "grad_norm": 0.18910695612430573, "learning_rate": 4.905924783568675e-05, "loss": 0.1442, "num_input_tokens_seen": 3133312, "step": 3505 }, { "epoch": 0.9264880559588228, "grad_norm": 0.33646896481513977, "learning_rate": 4.905657815192886e-05, "loss": 0.0937, "num_input_tokens_seen": 3137792, "step": 3510 }, { "epoch": 0.9278078395143197, "grad_norm": 0.22252024710178375, "learning_rate": 4.90539047583443e-05, "loss": 0.0787, "num_input_tokens_seen": 3142304, "step": 3515 }, { "epoch": 0.9291276230698166, "grad_norm": 0.1181967705488205, "learning_rate": 4.905122765534534e-05, "loss": 0.079, "num_input_tokens_seen": 3146912, "step": 3520 }, { "epoch": 0.9304474066253134, "grad_norm": 0.4838852882385254, "learning_rate": 4.9048546843344846e-05, "loss": 0.0949, "num_input_tokens_seen": 3151424, "step": 3525 }, { "epoch": 0.9317671901808103, "grad_norm": 0.2745245397090912, "learning_rate": 4.9045862322756206e-05, "loss": 0.0588, "num_input_tokens_seen": 3155616, "step": 3530 }, { "epoch": 0.9330869737363072, "grad_norm": 0.10697958618402481, "learning_rate": 4.904317409399342e-05, "loss": 0.0605, "num_input_tokens_seen": 3159936, "step": 3535 }, { "epoch": 0.9344067572918041, "grad_norm": 0.09313882142305374, "learning_rate": 4.904048215747104e-05, "loss": 0.094, "num_input_tokens_seen": 3164736, "step": 3540 }, { "epoch": 0.935726540847301, "grad_norm": 0.3413710594177246, "learning_rate": 4.90377865136042e-05, "loss": 0.0694, "num_input_tokens_seen": 3169120, "step": 3545 }, { "epoch": 0.9370463244027979, "grad_norm": 0.18707430362701416, "learning_rate": 4.90350871628086e-05, "loss": 0.1152, "num_input_tokens_seen": 3173984, "step": 3550 }, { "epoch": 0.9383661079582949, "grad_norm": 0.38687068223953247, "learning_rate": 4.903238410550052e-05, "loss": 0.1112, "num_input_tokens_seen": 3178400, "step": 3555 }, { "epoch": 0.9396858915137918, "grad_norm": 0.18848969042301178, "learning_rate": 4.90296773420968e-05, "loss": 0.088, "num_input_tokens_seen": 3182656, "step": 3560 }, { "epoch": 0.9410056750692887, "grad_norm": 0.196629598736763, "learning_rate": 4.902696687301486e-05, "loss": 0.0566, "num_input_tokens_seen": 3187232, "step": 3565 }, { "epoch": 0.9423254586247856, "grad_norm": 0.1749841868877411, "learning_rate": 4.902425269867268e-05, "loss": 0.111, "num_input_tokens_seen": 3191616, "step": 3570 }, { "epoch": 0.9436452421802825, "grad_norm": 0.10314325988292694, "learning_rate": 4.902153481948883e-05, "loss": 0.0792, "num_input_tokens_seen": 3196096, "step": 3575 }, { "epoch": 0.9449650257357793, "grad_norm": 0.11345010250806808, "learning_rate": 4.901881323588244e-05, "loss": 0.0407, "num_input_tokens_seen": 3200640, "step": 3580 }, { "epoch": 0.9462848092912762, "grad_norm": 0.3500674068927765, "learning_rate": 4.90160879482732e-05, "loss": 0.1193, "num_input_tokens_seen": 3205088, "step": 3585 }, { "epoch": 0.9476045928467731, "grad_norm": 0.2619893252849579, "learning_rate": 4.9013358957081405e-05, "loss": 0.0553, "num_input_tokens_seen": 3209536, "step": 3590 }, { "epoch": 0.94892437640227, "grad_norm": 0.30071645975112915, "learning_rate": 4.901062626272789e-05, "loss": 0.0931, "num_input_tokens_seen": 3213856, "step": 3595 }, { "epoch": 0.9502441599577669, "grad_norm": 0.0862569659948349, "learning_rate": 4.900788986563406e-05, "loss": 0.1093, "num_input_tokens_seen": 3218400, "step": 3600 }, { "epoch": 0.9502441599577669, "eval_loss": 0.08537984639406204, "eval_runtime": 64.7456, "eval_samples_per_second": 104.022, "eval_steps_per_second": 26.009, "num_input_tokens_seen": 3218400, "step": 3600 }, { "epoch": 0.9515639435132638, "grad_norm": 0.18426357209682465, "learning_rate": 4.9005149766221915e-05, "loss": 0.0864, "num_input_tokens_seen": 3222912, "step": 3605 }, { "epoch": 0.9528837270687607, "grad_norm": 0.12116490304470062, "learning_rate": 4.9002405964914e-05, "loss": 0.0809, "num_input_tokens_seen": 3227104, "step": 3610 }, { "epoch": 0.9542035106242576, "grad_norm": 0.3477870523929596, "learning_rate": 4.899965846213346e-05, "loss": 0.0748, "num_input_tokens_seen": 3231552, "step": 3615 }, { "epoch": 0.9555232941797546, "grad_norm": 0.4415113627910614, "learning_rate": 4.899690725830399e-05, "loss": 0.0896, "num_input_tokens_seen": 3236448, "step": 3620 }, { "epoch": 0.9568430777352515, "grad_norm": 0.35103827714920044, "learning_rate": 4.899415235384985e-05, "loss": 0.1062, "num_input_tokens_seen": 3240576, "step": 3625 }, { "epoch": 0.9581628612907483, "grad_norm": 0.15636710822582245, "learning_rate": 4.899139374919589e-05, "loss": 0.0806, "num_input_tokens_seen": 3244896, "step": 3630 }, { "epoch": 0.9594826448462452, "grad_norm": 0.2798616886138916, "learning_rate": 4.898863144476752e-05, "loss": 0.0939, "num_input_tokens_seen": 3249376, "step": 3635 }, { "epoch": 0.9608024284017421, "grad_norm": 0.5134257078170776, "learning_rate": 4.898586544099072e-05, "loss": 0.1171, "num_input_tokens_seen": 3254080, "step": 3640 }, { "epoch": 0.962122211957239, "grad_norm": 0.36010757088661194, "learning_rate": 4.898309573829204e-05, "loss": 0.1185, "num_input_tokens_seen": 3258592, "step": 3645 }, { "epoch": 0.9634419955127359, "grad_norm": 0.38726165890693665, "learning_rate": 4.898032233709862e-05, "loss": 0.0594, "num_input_tokens_seen": 3263360, "step": 3650 }, { "epoch": 0.9647617790682328, "grad_norm": 0.12077130377292633, "learning_rate": 4.8977545237838123e-05, "loss": 0.0419, "num_input_tokens_seen": 3267872, "step": 3655 }, { "epoch": 0.9660815626237297, "grad_norm": 0.16140489280223846, "learning_rate": 4.8974764440938836e-05, "loss": 0.1001, "num_input_tokens_seen": 3272480, "step": 3660 }, { "epoch": 0.9674013461792266, "grad_norm": 0.20887038111686707, "learning_rate": 4.897197994682959e-05, "loss": 0.1052, "num_input_tokens_seen": 3276960, "step": 3665 }, { "epoch": 0.9687211297347235, "grad_norm": 0.12402015179395676, "learning_rate": 4.8969191755939786e-05, "loss": 0.0624, "num_input_tokens_seen": 3281216, "step": 3670 }, { "epoch": 0.9700409132902204, "grad_norm": 0.18823190033435822, "learning_rate": 4.8966399868699396e-05, "loss": 0.0509, "num_input_tokens_seen": 3285856, "step": 3675 }, { "epoch": 0.9713606968457174, "grad_norm": 0.23505936563014984, "learning_rate": 4.8963604285538965e-05, "loss": 0.05, "num_input_tokens_seen": 3290208, "step": 3680 }, { "epoch": 0.9726804804012142, "grad_norm": 0.19444938004016876, "learning_rate": 4.8960805006889604e-05, "loss": 0.0689, "num_input_tokens_seen": 3294464, "step": 3685 }, { "epoch": 0.9740002639567111, "grad_norm": 0.04791402816772461, "learning_rate": 4.8958002033183004e-05, "loss": 0.0916, "num_input_tokens_seen": 3298784, "step": 3690 }, { "epoch": 0.975320047512208, "grad_norm": 0.11152095347642899, "learning_rate": 4.8955195364851414e-05, "loss": 0.0521, "num_input_tokens_seen": 3303424, "step": 3695 }, { "epoch": 0.9766398310677049, "grad_norm": 0.14665816724300385, "learning_rate": 4.895238500232766e-05, "loss": 0.0699, "num_input_tokens_seen": 3308032, "step": 3700 }, { "epoch": 0.9779596146232018, "grad_norm": 0.09416782855987549, "learning_rate": 4.8949570946045143e-05, "loss": 0.0526, "num_input_tokens_seen": 3312640, "step": 3705 }, { "epoch": 0.9792793981786987, "grad_norm": 0.1455061435699463, "learning_rate": 4.89467531964378e-05, "loss": 0.0889, "num_input_tokens_seen": 3317024, "step": 3710 }, { "epoch": 0.9805991817341956, "grad_norm": 0.08683057129383087, "learning_rate": 4.894393175394019e-05, "loss": 0.0472, "num_input_tokens_seen": 3321568, "step": 3715 }, { "epoch": 0.9819189652896925, "grad_norm": 0.0437285378575325, "learning_rate": 4.8941106618987406e-05, "loss": 0.0516, "num_input_tokens_seen": 3325856, "step": 3720 }, { "epoch": 0.9832387488451894, "grad_norm": 0.47485217452049255, "learning_rate": 4.893827779201512e-05, "loss": 0.084, "num_input_tokens_seen": 3330240, "step": 3725 }, { "epoch": 0.9845585324006862, "grad_norm": 0.24226413667201996, "learning_rate": 4.893544527345957e-05, "loss": 0.0584, "num_input_tokens_seen": 3334560, "step": 3730 }, { "epoch": 0.9858783159561831, "grad_norm": 0.04881235957145691, "learning_rate": 4.8932609063757563e-05, "loss": 0.0393, "num_input_tokens_seen": 3338656, "step": 3735 }, { "epoch": 0.9871980995116801, "grad_norm": 0.17872962355613708, "learning_rate": 4.8929769163346484e-05, "loss": 0.0537, "num_input_tokens_seen": 3343104, "step": 3740 }, { "epoch": 0.988517883067177, "grad_norm": 0.2904002070426941, "learning_rate": 4.892692557266429e-05, "loss": 0.0579, "num_input_tokens_seen": 3347456, "step": 3745 }, { "epoch": 0.9898376666226739, "grad_norm": 0.11871351301670074, "learning_rate": 4.8924078292149464e-05, "loss": 0.0885, "num_input_tokens_seen": 3351712, "step": 3750 }, { "epoch": 0.9911574501781708, "grad_norm": 0.5551296472549438, "learning_rate": 4.892122732224114e-05, "loss": 0.0665, "num_input_tokens_seen": 3356192, "step": 3755 }, { "epoch": 0.9924772337336677, "grad_norm": 0.1894797831773758, "learning_rate": 4.8918372663378944e-05, "loss": 0.0602, "num_input_tokens_seen": 3360704, "step": 3760 }, { "epoch": 0.9937970172891646, "grad_norm": 0.3918653726577759, "learning_rate": 4.89155143160031e-05, "loss": 0.0651, "num_input_tokens_seen": 3365216, "step": 3765 }, { "epoch": 0.9951168008446615, "grad_norm": 0.5060731768608093, "learning_rate": 4.891265228055441e-05, "loss": 0.1076, "num_input_tokens_seen": 3369696, "step": 3770 }, { "epoch": 0.9964365844001584, "grad_norm": 0.1691993623971939, "learning_rate": 4.890978655747424e-05, "loss": 0.0398, "num_input_tokens_seen": 3373952, "step": 3775 }, { "epoch": 0.9977563679556553, "grad_norm": 0.2829606831073761, "learning_rate": 4.89069171472045e-05, "loss": 0.0805, "num_input_tokens_seen": 3378432, "step": 3780 }, { "epoch": 0.9990761515111521, "grad_norm": 0.07150144129991531, "learning_rate": 4.890404405018772e-05, "loss": 0.05, "num_input_tokens_seen": 3382528, "step": 3785 }, { "epoch": 1.0002639567110994, "grad_norm": 0.6697476506233215, "learning_rate": 4.8901167266866934e-05, "loss": 0.1111, "num_input_tokens_seen": 3386576, "step": 3790 }, { "epoch": 1.0015837402665964, "grad_norm": 0.48284265398979187, "learning_rate": 4.88982867976858e-05, "loss": 0.0761, "num_input_tokens_seen": 3391376, "step": 3795 }, { "epoch": 1.0029035238220931, "grad_norm": 0.13684047758579254, "learning_rate": 4.889540264308852e-05, "loss": 0.0835, "num_input_tokens_seen": 3395632, "step": 3800 }, { "epoch": 1.0029035238220931, "eval_loss": 0.0841970220208168, "eval_runtime": 64.6968, "eval_samples_per_second": 104.101, "eval_steps_per_second": 26.029, "num_input_tokens_seen": 3395632, "step": 3800 }, { "epoch": 1.0042233073775901, "grad_norm": 0.40840673446655273, "learning_rate": 4.889251480351986e-05, "loss": 0.0946, "num_input_tokens_seen": 3400144, "step": 3805 }, { "epoch": 1.005543090933087, "grad_norm": 0.1282190978527069, "learning_rate": 4.888962327942517e-05, "loss": 0.0289, "num_input_tokens_seen": 3404752, "step": 3810 }, { "epoch": 1.006862874488584, "grad_norm": 0.18420645594596863, "learning_rate": 4.8886728071250356e-05, "loss": 0.0692, "num_input_tokens_seen": 3409456, "step": 3815 }, { "epoch": 1.0081826580440807, "grad_norm": 0.176153764128685, "learning_rate": 4.8883829179441884e-05, "loss": 0.0756, "num_input_tokens_seen": 3413840, "step": 3820 }, { "epoch": 1.0095024415995777, "grad_norm": 0.4257528781890869, "learning_rate": 4.888092660444682e-05, "loss": 0.0789, "num_input_tokens_seen": 3418160, "step": 3825 }, { "epoch": 1.0108222251550745, "grad_norm": 0.13079734146595, "learning_rate": 4.887802034671276e-05, "loss": 0.0716, "num_input_tokens_seen": 3422672, "step": 3830 }, { "epoch": 1.0121420087105715, "grad_norm": 0.40562328696250916, "learning_rate": 4.88751104066879e-05, "loss": 0.0877, "num_input_tokens_seen": 3427216, "step": 3835 }, { "epoch": 1.0134617922660685, "grad_norm": 0.2332824319601059, "learning_rate": 4.887219678482098e-05, "loss": 0.1141, "num_input_tokens_seen": 3431952, "step": 3840 }, { "epoch": 1.0147815758215653, "grad_norm": 0.5409038066864014, "learning_rate": 4.8869279481561316e-05, "loss": 0.0626, "num_input_tokens_seen": 3436368, "step": 3845 }, { "epoch": 1.0161013593770623, "grad_norm": 0.28266409039497375, "learning_rate": 4.88663584973588e-05, "loss": 0.0901, "num_input_tokens_seen": 3440624, "step": 3850 }, { "epoch": 1.017421142932559, "grad_norm": 0.1674271821975708, "learning_rate": 4.8863433832663874e-05, "loss": 0.0598, "num_input_tokens_seen": 3445296, "step": 3855 }, { "epoch": 1.018740926488056, "grad_norm": 0.3472689986228943, "learning_rate": 4.886050548792757e-05, "loss": 0.0605, "num_input_tokens_seen": 3449552, "step": 3860 }, { "epoch": 1.0200607100435528, "grad_norm": 0.5386594533920288, "learning_rate": 4.8857573463601465e-05, "loss": 0.1133, "num_input_tokens_seen": 3454384, "step": 3865 }, { "epoch": 1.0213804935990498, "grad_norm": 0.4810366928577423, "learning_rate": 4.885463776013772e-05, "loss": 0.0849, "num_input_tokens_seen": 3458896, "step": 3870 }, { "epoch": 1.0227002771545466, "grad_norm": 0.3793335556983948, "learning_rate": 4.8851698377989056e-05, "loss": 0.0797, "num_input_tokens_seen": 3463184, "step": 3875 }, { "epoch": 1.0240200607100436, "grad_norm": 0.3431379497051239, "learning_rate": 4.884875531760876e-05, "loss": 0.076, "num_input_tokens_seen": 3467824, "step": 3880 }, { "epoch": 1.0253398442655404, "grad_norm": 0.24647897481918335, "learning_rate": 4.88458085794507e-05, "loss": 0.0458, "num_input_tokens_seen": 3471984, "step": 3885 }, { "epoch": 1.0266596278210374, "grad_norm": 0.20652002096176147, "learning_rate": 4.884285816396929e-05, "loss": 0.0714, "num_input_tokens_seen": 3476560, "step": 3890 }, { "epoch": 1.0279794113765341, "grad_norm": 0.2015364170074463, "learning_rate": 4.8839904071619526e-05, "loss": 0.0579, "num_input_tokens_seen": 3480912, "step": 3895 }, { "epoch": 1.0292991949320311, "grad_norm": 0.26247185468673706, "learning_rate": 4.8836946302856955e-05, "loss": 0.1009, "num_input_tokens_seen": 3485360, "step": 3900 }, { "epoch": 1.0306189784875281, "grad_norm": 0.33072471618652344, "learning_rate": 4.8833984858137715e-05, "loss": 0.0564, "num_input_tokens_seen": 3490000, "step": 3905 }, { "epoch": 1.031938762043025, "grad_norm": 0.10270272940397263, "learning_rate": 4.8831019737918494e-05, "loss": 0.1014, "num_input_tokens_seen": 3494480, "step": 3910 }, { "epoch": 1.033258545598522, "grad_norm": 0.23943404853343964, "learning_rate": 4.882805094265655e-05, "loss": 0.0916, "num_input_tokens_seen": 3499152, "step": 3915 }, { "epoch": 1.0345783291540187, "grad_norm": 0.42587146162986755, "learning_rate": 4.8825078472809706e-05, "loss": 0.1336, "num_input_tokens_seen": 3503568, "step": 3920 }, { "epoch": 1.0358981127095157, "grad_norm": 0.3739672899246216, "learning_rate": 4.882210232883635e-05, "loss": 0.0979, "num_input_tokens_seen": 3508016, "step": 3925 }, { "epoch": 1.0372178962650125, "grad_norm": 0.43589022755622864, "learning_rate": 4.881912251119546e-05, "loss": 0.0896, "num_input_tokens_seen": 3512208, "step": 3930 }, { "epoch": 1.0385376798205095, "grad_norm": 0.20134888589382172, "learning_rate": 4.881613902034654e-05, "loss": 0.0452, "num_input_tokens_seen": 3516816, "step": 3935 }, { "epoch": 1.0398574633760063, "grad_norm": 0.24141134321689606, "learning_rate": 4.88131518567497e-05, "loss": 0.1149, "num_input_tokens_seen": 3521264, "step": 3940 }, { "epoch": 1.0411772469315033, "grad_norm": 0.16694284975528717, "learning_rate": 4.881016102086558e-05, "loss": 0.1032, "num_input_tokens_seen": 3525872, "step": 3945 }, { "epoch": 1.042497030487, "grad_norm": 0.3523333668708801, "learning_rate": 4.8807166513155425e-05, "loss": 0.079, "num_input_tokens_seen": 3530576, "step": 3950 }, { "epoch": 1.043816814042497, "grad_norm": 0.13776417076587677, "learning_rate": 4.8804168334081004e-05, "loss": 0.0438, "num_input_tokens_seen": 3535056, "step": 3955 }, { "epoch": 1.0451365975979938, "grad_norm": 0.08046849071979523, "learning_rate": 4.880116648410468e-05, "loss": 0.0852, "num_input_tokens_seen": 3539248, "step": 3960 }, { "epoch": 1.0464563811534908, "grad_norm": 0.21321848034858704, "learning_rate": 4.879816096368939e-05, "loss": 0.0517, "num_input_tokens_seen": 3543344, "step": 3965 }, { "epoch": 1.0477761647089878, "grad_norm": 0.10912656038999557, "learning_rate": 4.879515177329861e-05, "loss": 0.0569, "num_input_tokens_seen": 3547696, "step": 3970 }, { "epoch": 1.0490959482644846, "grad_norm": 0.07434891909360886, "learning_rate": 4.8792138913396394e-05, "loss": 0.0599, "num_input_tokens_seen": 3552624, "step": 3975 }, { "epoch": 1.0504157318199816, "grad_norm": 0.35398176312446594, "learning_rate": 4.8789122384447374e-05, "loss": 0.0498, "num_input_tokens_seen": 3556848, "step": 3980 }, { "epoch": 1.0517355153754784, "grad_norm": 0.707976758480072, "learning_rate": 4.878610218691673e-05, "loss": 0.0968, "num_input_tokens_seen": 3561616, "step": 3985 }, { "epoch": 1.0530552989309754, "grad_norm": 0.2761719226837158, "learning_rate": 4.87830783212702e-05, "loss": 0.0637, "num_input_tokens_seen": 3566224, "step": 3990 }, { "epoch": 1.0543750824864722, "grad_norm": 0.15643230080604553, "learning_rate": 4.878005078797413e-05, "loss": 0.0662, "num_input_tokens_seen": 3570416, "step": 3995 }, { "epoch": 1.0556948660419692, "grad_norm": 0.2590215802192688, "learning_rate": 4.877701958749539e-05, "loss": 0.1081, "num_input_tokens_seen": 3575248, "step": 4000 }, { "epoch": 1.0556948660419692, "eval_loss": 0.08392225950956345, "eval_runtime": 64.7543, "eval_samples_per_second": 104.009, "eval_steps_per_second": 26.006, "num_input_tokens_seen": 3575248, "step": 4000 }, { "epoch": 1.057014649597466, "grad_norm": 0.5423316955566406, "learning_rate": 4.877398472030142e-05, "loss": 0.0845, "num_input_tokens_seen": 3579856, "step": 4005 }, { "epoch": 1.058334433152963, "grad_norm": 0.13931109011173248, "learning_rate": 4.877094618686024e-05, "loss": 0.0692, "num_input_tokens_seen": 3584304, "step": 4010 }, { "epoch": 1.0596542167084597, "grad_norm": 0.7526949644088745, "learning_rate": 4.876790398764045e-05, "loss": 0.0838, "num_input_tokens_seen": 3588784, "step": 4015 }, { "epoch": 1.0609740002639567, "grad_norm": 0.25323137640953064, "learning_rate": 4.8764858123111167e-05, "loss": 0.0526, "num_input_tokens_seen": 3593168, "step": 4020 }, { "epoch": 1.0622937838194537, "grad_norm": 0.1867348849773407, "learning_rate": 4.876180859374212e-05, "loss": 0.0951, "num_input_tokens_seen": 3597584, "step": 4025 }, { "epoch": 1.0636135673749505, "grad_norm": 0.5692929625511169, "learning_rate": 4.875875540000357e-05, "loss": 0.0821, "num_input_tokens_seen": 3602224, "step": 4030 }, { "epoch": 1.0649333509304475, "grad_norm": 0.27245578169822693, "learning_rate": 4.8755698542366376e-05, "loss": 0.0876, "num_input_tokens_seen": 3606576, "step": 4035 }, { "epoch": 1.0662531344859443, "grad_norm": 0.3179202973842621, "learning_rate": 4.875263802130193e-05, "loss": 0.0448, "num_input_tokens_seen": 3611216, "step": 4040 }, { "epoch": 1.0675729180414413, "grad_norm": 0.45173290371894836, "learning_rate": 4.8749573837282207e-05, "loss": 0.1154, "num_input_tokens_seen": 3615568, "step": 4045 }, { "epoch": 1.068892701596938, "grad_norm": 0.5114203095436096, "learning_rate": 4.874650599077974e-05, "loss": 0.1048, "num_input_tokens_seen": 3619984, "step": 4050 }, { "epoch": 1.070212485152435, "grad_norm": 0.08957991749048233, "learning_rate": 4.874343448226764e-05, "loss": 0.0358, "num_input_tokens_seen": 3624432, "step": 4055 }, { "epoch": 1.0715322687079318, "grad_norm": 0.19655869901180267, "learning_rate": 4.874035931221955e-05, "loss": 0.0685, "num_input_tokens_seen": 3628944, "step": 4060 }, { "epoch": 1.0728520522634288, "grad_norm": 0.14045877754688263, "learning_rate": 4.8737280481109724e-05, "loss": 0.0793, "num_input_tokens_seen": 3633648, "step": 4065 }, { "epoch": 1.0741718358189256, "grad_norm": 0.0934409350156784, "learning_rate": 4.873419798941294e-05, "loss": 0.0886, "num_input_tokens_seen": 3638192, "step": 4070 }, { "epoch": 1.0754916193744226, "grad_norm": 0.249934583902359, "learning_rate": 4.873111183760458e-05, "loss": 0.1038, "num_input_tokens_seen": 3642576, "step": 4075 }, { "epoch": 1.0768114029299194, "grad_norm": 0.16090010106563568, "learning_rate": 4.8728022026160537e-05, "loss": 0.0575, "num_input_tokens_seen": 3647088, "step": 4080 }, { "epoch": 1.0781311864854164, "grad_norm": 0.44176578521728516, "learning_rate": 4.872492855555732e-05, "loss": 0.0991, "num_input_tokens_seen": 3651664, "step": 4085 }, { "epoch": 1.0794509700409134, "grad_norm": 0.21694275736808777, "learning_rate": 4.8721831426271956e-05, "loss": 0.1181, "num_input_tokens_seen": 3656208, "step": 4090 }, { "epoch": 1.0807707535964102, "grad_norm": 0.20099543035030365, "learning_rate": 4.87187306387821e-05, "loss": 0.0768, "num_input_tokens_seen": 3660816, "step": 4095 }, { "epoch": 1.0820905371519072, "grad_norm": 0.16338351368904114, "learning_rate": 4.87156261935659e-05, "loss": 0.0663, "num_input_tokens_seen": 3665328, "step": 4100 }, { "epoch": 1.083410320707404, "grad_norm": 0.43167057633399963, "learning_rate": 4.871251809110211e-05, "loss": 0.1118, "num_input_tokens_seen": 3670032, "step": 4105 }, { "epoch": 1.084730104262901, "grad_norm": 0.5297998785972595, "learning_rate": 4.8709406331870044e-05, "loss": 0.1498, "num_input_tokens_seen": 3674256, "step": 4110 }, { "epoch": 1.0860498878183977, "grad_norm": 0.08996012806892395, "learning_rate": 4.8706290916349574e-05, "loss": 0.0795, "num_input_tokens_seen": 3678608, "step": 4115 }, { "epoch": 1.0873696713738947, "grad_norm": 0.08041474968194962, "learning_rate": 4.8703171845021134e-05, "loss": 0.1076, "num_input_tokens_seen": 3683120, "step": 4120 }, { "epoch": 1.0886894549293915, "grad_norm": 0.2204785794019699, "learning_rate": 4.870004911836572e-05, "loss": 0.0593, "num_input_tokens_seen": 3687568, "step": 4125 }, { "epoch": 1.0900092384848885, "grad_norm": 0.1558510959148407, "learning_rate": 4.869692273686489e-05, "loss": 0.0877, "num_input_tokens_seen": 3692112, "step": 4130 }, { "epoch": 1.0913290220403853, "grad_norm": 0.1930859535932541, "learning_rate": 4.869379270100079e-05, "loss": 0.1289, "num_input_tokens_seen": 3696688, "step": 4135 }, { "epoch": 1.0926488055958823, "grad_norm": 0.1563756763935089, "learning_rate": 4.86906590112561e-05, "loss": 0.0754, "num_input_tokens_seen": 3701456, "step": 4140 }, { "epoch": 1.093968589151379, "grad_norm": 0.2564115822315216, "learning_rate": 4.8687521668114064e-05, "loss": 0.0505, "num_input_tokens_seen": 3706064, "step": 4145 }, { "epoch": 1.095288372706876, "grad_norm": 0.10277988761663437, "learning_rate": 4.868438067205853e-05, "loss": 0.073, "num_input_tokens_seen": 3710768, "step": 4150 }, { "epoch": 1.096608156262373, "grad_norm": 0.31413015723228455, "learning_rate": 4.8681236023573844e-05, "loss": 0.0709, "num_input_tokens_seen": 3715216, "step": 4155 }, { "epoch": 1.0979279398178698, "grad_norm": 0.18851597607135773, "learning_rate": 4.867808772314497e-05, "loss": 0.0734, "num_input_tokens_seen": 3719632, "step": 4160 }, { "epoch": 1.0992477233733668, "grad_norm": 0.22655442357063293, "learning_rate": 4.867493577125741e-05, "loss": 0.1052, "num_input_tokens_seen": 3723600, "step": 4165 }, { "epoch": 1.1005675069288636, "grad_norm": 0.42436203360557556, "learning_rate": 4.867178016839725e-05, "loss": 0.0632, "num_input_tokens_seen": 3728080, "step": 4170 }, { "epoch": 1.1018872904843606, "grad_norm": 0.3642787039279938, "learning_rate": 4.8668620915051094e-05, "loss": 0.1083, "num_input_tokens_seen": 3732816, "step": 4175 }, { "epoch": 1.1032070740398574, "grad_norm": 0.2699391543865204, "learning_rate": 4.866545801170616e-05, "loss": 0.0979, "num_input_tokens_seen": 3737008, "step": 4180 }, { "epoch": 1.1045268575953544, "grad_norm": 0.4498985707759857, "learning_rate": 4.86622914588502e-05, "loss": 0.0847, "num_input_tokens_seen": 3741616, "step": 4185 }, { "epoch": 1.1058466411508512, "grad_norm": 0.16889449954032898, "learning_rate": 4.865912125697154e-05, "loss": 0.1105, "num_input_tokens_seen": 3746064, "step": 4190 }, { "epoch": 1.1071664247063482, "grad_norm": 0.5030496120452881, "learning_rate": 4.865594740655907e-05, "loss": 0.058, "num_input_tokens_seen": 3750800, "step": 4195 }, { "epoch": 1.108486208261845, "grad_norm": 0.16665053367614746, "learning_rate": 4.865276990810222e-05, "loss": 0.0535, "num_input_tokens_seen": 3754960, "step": 4200 }, { "epoch": 1.108486208261845, "eval_loss": 0.08246643096208572, "eval_runtime": 64.8096, "eval_samples_per_second": 103.92, "eval_steps_per_second": 25.984, "num_input_tokens_seen": 3754960, "step": 4200 }, { "epoch": 1.109805991817342, "grad_norm": 0.1580665558576584, "learning_rate": 4.8649588762091016e-05, "loss": 0.072, "num_input_tokens_seen": 3759248, "step": 4205 }, { "epoch": 1.1111257753728387, "grad_norm": 0.3454453647136688, "learning_rate": 4.8646403969016016e-05, "loss": 0.0758, "num_input_tokens_seen": 3763824, "step": 4210 }, { "epoch": 1.1124455589283357, "grad_norm": 0.16657036542892456, "learning_rate": 4.864321552936838e-05, "loss": 0.0639, "num_input_tokens_seen": 3768080, "step": 4215 }, { "epoch": 1.1137653424838327, "grad_norm": 0.12236739695072174, "learning_rate": 4.864002344363978e-05, "loss": 0.0587, "num_input_tokens_seen": 3772336, "step": 4220 }, { "epoch": 1.1150851260393295, "grad_norm": 0.19526484608650208, "learning_rate": 4.863682771232248e-05, "loss": 0.0965, "num_input_tokens_seen": 3777008, "step": 4225 }, { "epoch": 1.1164049095948265, "grad_norm": 0.11880189925432205, "learning_rate": 4.8633628335909324e-05, "loss": 0.0672, "num_input_tokens_seen": 3781520, "step": 4230 }, { "epoch": 1.1177246931503233, "grad_norm": 0.3267318904399872, "learning_rate": 4.8630425314893676e-05, "loss": 0.0824, "num_input_tokens_seen": 3785648, "step": 4235 }, { "epoch": 1.1190444767058203, "grad_norm": 0.14763163030147552, "learning_rate": 4.862721864976948e-05, "loss": 0.0586, "num_input_tokens_seen": 3790032, "step": 4240 }, { "epoch": 1.120364260261317, "grad_norm": 0.3275463581085205, "learning_rate": 4.862400834103125e-05, "loss": 0.0801, "num_input_tokens_seen": 3794384, "step": 4245 }, { "epoch": 1.121684043816814, "grad_norm": 0.19305244088172913, "learning_rate": 4.862079438917406e-05, "loss": 0.0361, "num_input_tokens_seen": 3798704, "step": 4250 }, { "epoch": 1.1230038273723109, "grad_norm": 0.16991020739078522, "learning_rate": 4.8617576794693536e-05, "loss": 0.0636, "num_input_tokens_seen": 3803152, "step": 4255 }, { "epoch": 1.1243236109278079, "grad_norm": 0.4030729830265045, "learning_rate": 4.8614355558085875e-05, "loss": 0.0661, "num_input_tokens_seen": 3807792, "step": 4260 }, { "epoch": 1.1256433944833049, "grad_norm": 0.13737830519676208, "learning_rate": 4.861113067984783e-05, "loss": 0.0432, "num_input_tokens_seen": 3812240, "step": 4265 }, { "epoch": 1.1269631780388016, "grad_norm": 0.06664368510246277, "learning_rate": 4.860790216047671e-05, "loss": 0.0399, "num_input_tokens_seen": 3816912, "step": 4270 }, { "epoch": 1.1282829615942984, "grad_norm": 0.5525670647621155, "learning_rate": 4.860467000047041e-05, "loss": 0.126, "num_input_tokens_seen": 3821392, "step": 4275 }, { "epoch": 1.1296027451497954, "grad_norm": 0.16312049329280853, "learning_rate": 4.860143420032737e-05, "loss": 0.0499, "num_input_tokens_seen": 3825904, "step": 4280 }, { "epoch": 1.1309225287052924, "grad_norm": 0.19475403428077698, "learning_rate": 4.859819476054657e-05, "loss": 0.1336, "num_input_tokens_seen": 3830224, "step": 4285 }, { "epoch": 1.1322423122607892, "grad_norm": 0.11151828616857529, "learning_rate": 4.859495168162758e-05, "loss": 0.0631, "num_input_tokens_seen": 3834864, "step": 4290 }, { "epoch": 1.1335620958162862, "grad_norm": 0.14119647443294525, "learning_rate": 4.859170496407054e-05, "loss": 0.0911, "num_input_tokens_seen": 3839216, "step": 4295 }, { "epoch": 1.134881879371783, "grad_norm": 0.141139954328537, "learning_rate": 4.8588454608376114e-05, "loss": 0.0753, "num_input_tokens_seen": 3844048, "step": 4300 }, { "epoch": 1.13620166292728, "grad_norm": 0.05400727689266205, "learning_rate": 4.8585200615045555e-05, "loss": 0.0902, "num_input_tokens_seen": 3848528, "step": 4305 }, { "epoch": 1.1375214464827768, "grad_norm": 0.18937702476978302, "learning_rate": 4.8581942984580674e-05, "loss": 0.0877, "num_input_tokens_seen": 3852720, "step": 4310 }, { "epoch": 1.1388412300382738, "grad_norm": 0.144100159406662, "learning_rate": 4.857868171748384e-05, "loss": 0.0517, "num_input_tokens_seen": 3857232, "step": 4315 }, { "epoch": 1.1401610135937705, "grad_norm": 0.21508780121803284, "learning_rate": 4.8575416814257976e-05, "loss": 0.0707, "num_input_tokens_seen": 3861680, "step": 4320 }, { "epoch": 1.1414807971492675, "grad_norm": 0.2570151388645172, "learning_rate": 4.857214827540657e-05, "loss": 0.046, "num_input_tokens_seen": 3866096, "step": 4325 }, { "epoch": 1.1428005807047645, "grad_norm": 0.13804972171783447, "learning_rate": 4.856887610143367e-05, "loss": 0.0656, "num_input_tokens_seen": 3870480, "step": 4330 }, { "epoch": 1.1441203642602613, "grad_norm": 0.21129173040390015, "learning_rate": 4.8565600292843896e-05, "loss": 0.0634, "num_input_tokens_seen": 3874736, "step": 4335 }, { "epoch": 1.145440147815758, "grad_norm": 0.42242226004600525, "learning_rate": 4.856232085014241e-05, "loss": 0.0766, "num_input_tokens_seen": 3879024, "step": 4340 }, { "epoch": 1.146759931371255, "grad_norm": 0.15966832637786865, "learning_rate": 4.855903777383495e-05, "loss": 0.055, "num_input_tokens_seen": 3883504, "step": 4345 }, { "epoch": 1.148079714926752, "grad_norm": 0.2897903323173523, "learning_rate": 4.85557510644278e-05, "loss": 0.0508, "num_input_tokens_seen": 3888144, "step": 4350 }, { "epoch": 1.1493994984822489, "grad_norm": 0.09813579171895981, "learning_rate": 4.855246072242782e-05, "loss": 0.0605, "num_input_tokens_seen": 3892432, "step": 4355 }, { "epoch": 1.1507192820377459, "grad_norm": 0.15595395863056183, "learning_rate": 4.8549166748342414e-05, "loss": 0.0493, "num_input_tokens_seen": 3897104, "step": 4360 }, { "epoch": 1.1520390655932427, "grad_norm": 0.31183916330337524, "learning_rate": 4.8545869142679556e-05, "loss": 0.0433, "num_input_tokens_seen": 3901552, "step": 4365 }, { "epoch": 1.1533588491487397, "grad_norm": 0.1998717039823532, "learning_rate": 4.8542567905947776e-05, "loss": 0.0343, "num_input_tokens_seen": 3906096, "step": 4370 }, { "epoch": 1.1546786327042364, "grad_norm": 0.1302712857723236, "learning_rate": 4.853926303865618e-05, "loss": 0.0721, "num_input_tokens_seen": 3910480, "step": 4375 }, { "epoch": 1.1559984162597334, "grad_norm": 0.45480620861053467, "learning_rate": 4.853595454131441e-05, "loss": 0.1382, "num_input_tokens_seen": 3914832, "step": 4380 }, { "epoch": 1.1573181998152302, "grad_norm": 0.34436386823654175, "learning_rate": 4.8532642414432674e-05, "loss": 0.0758, "num_input_tokens_seen": 3919280, "step": 4385 }, { "epoch": 1.1586379833707272, "grad_norm": 0.08240567892789841, "learning_rate": 4.8529326658521754e-05, "loss": 0.0235, "num_input_tokens_seen": 3924112, "step": 4390 }, { "epoch": 1.1599577669262242, "grad_norm": 0.21992577612400055, "learning_rate": 4.8526007274092965e-05, "loss": 0.0769, "num_input_tokens_seen": 3928464, "step": 4395 }, { "epoch": 1.161277550481721, "grad_norm": 0.22887659072875977, "learning_rate": 4.852268426165822e-05, "loss": 0.1007, "num_input_tokens_seen": 3932752, "step": 4400 }, { "epoch": 1.161277550481721, "eval_loss": 0.08230160176753998, "eval_runtime": 64.7534, "eval_samples_per_second": 104.01, "eval_steps_per_second": 26.006, "num_input_tokens_seen": 3932752, "step": 4400 }, { "epoch": 1.162597334037218, "grad_norm": 0.08518625050783157, "learning_rate": 4.851935762172995e-05, "loss": 0.0869, "num_input_tokens_seen": 3936976, "step": 4405 }, { "epoch": 1.1639171175927148, "grad_norm": 0.18197834491729736, "learning_rate": 4.8516027354821175e-05, "loss": 0.0516, "num_input_tokens_seen": 3941264, "step": 4410 }, { "epoch": 1.1652369011482118, "grad_norm": 0.21768225729465485, "learning_rate": 4.851269346144546e-05, "loss": 0.0781, "num_input_tokens_seen": 3945616, "step": 4415 }, { "epoch": 1.1665566847037085, "grad_norm": 0.25081053376197815, "learning_rate": 4.850935594211693e-05, "loss": 0.0513, "num_input_tokens_seen": 3950128, "step": 4420 }, { "epoch": 1.1678764682592055, "grad_norm": 0.42511430382728577, "learning_rate": 4.850601479735029e-05, "loss": 0.0924, "num_input_tokens_seen": 3954576, "step": 4425 }, { "epoch": 1.1691962518147023, "grad_norm": 0.2510102689266205, "learning_rate": 4.850267002766076e-05, "loss": 0.0965, "num_input_tokens_seen": 3958960, "step": 4430 }, { "epoch": 1.1705160353701993, "grad_norm": 0.18003112077713013, "learning_rate": 4.849932163356417e-05, "loss": 0.1153, "num_input_tokens_seen": 3963024, "step": 4435 }, { "epoch": 1.171835818925696, "grad_norm": 0.13992653787136078, "learning_rate": 4.8495969615576864e-05, "loss": 0.0667, "num_input_tokens_seen": 3967632, "step": 4440 }, { "epoch": 1.173155602481193, "grad_norm": 0.06868690997362137, "learning_rate": 4.849261397421577e-05, "loss": 0.0544, "num_input_tokens_seen": 3972400, "step": 4445 }, { "epoch": 1.1744753860366899, "grad_norm": 0.1193857491016388, "learning_rate": 4.848925470999839e-05, "loss": 0.0617, "num_input_tokens_seen": 3976784, "step": 4450 }, { "epoch": 1.1757951695921869, "grad_norm": 0.27131497859954834, "learning_rate": 4.848589182344273e-05, "loss": 0.0616, "num_input_tokens_seen": 3981296, "step": 4455 }, { "epoch": 1.1771149531476839, "grad_norm": 0.14180465042591095, "learning_rate": 4.848252531506742e-05, "loss": 0.0624, "num_input_tokens_seen": 3985712, "step": 4460 }, { "epoch": 1.1784347367031807, "grad_norm": 0.35425296425819397, "learning_rate": 4.847915518539161e-05, "loss": 0.0479, "num_input_tokens_seen": 3989872, "step": 4465 }, { "epoch": 1.1797545202586777, "grad_norm": 0.2901477515697479, "learning_rate": 4.847578143493501e-05, "loss": 0.1293, "num_input_tokens_seen": 3994352, "step": 4470 }, { "epoch": 1.1810743038141744, "grad_norm": 0.24459728598594666, "learning_rate": 4.847240406421789e-05, "loss": 0.0742, "num_input_tokens_seen": 3999056, "step": 4475 }, { "epoch": 1.1823940873696714, "grad_norm": 0.3255689740180969, "learning_rate": 4.84690230737611e-05, "loss": 0.0545, "num_input_tokens_seen": 4003504, "step": 4480 }, { "epoch": 1.1837138709251682, "grad_norm": 0.8029049038887024, "learning_rate": 4.846563846408602e-05, "loss": 0.1275, "num_input_tokens_seen": 4008112, "step": 4485 }, { "epoch": 1.1850336544806652, "grad_norm": 0.13837052881717682, "learning_rate": 4.84622502357146e-05, "loss": 0.0952, "num_input_tokens_seen": 4012496, "step": 4490 }, { "epoch": 1.186353438036162, "grad_norm": 0.15304633975028992, "learning_rate": 4.8458858389169345e-05, "loss": 0.0671, "num_input_tokens_seen": 4016720, "step": 4495 }, { "epoch": 1.187673221591659, "grad_norm": 0.3612307012081146, "learning_rate": 4.8455462924973334e-05, "loss": 0.094, "num_input_tokens_seen": 4021584, "step": 4500 }, { "epoch": 1.1889930051471558, "grad_norm": 0.11760736256837845, "learning_rate": 4.845206384365018e-05, "loss": 0.07, "num_input_tokens_seen": 4026032, "step": 4505 }, { "epoch": 1.1903127887026528, "grad_norm": 0.19630874693393707, "learning_rate": 4.844866114572405e-05, "loss": 0.0972, "num_input_tokens_seen": 4030512, "step": 4510 }, { "epoch": 1.1916325722581496, "grad_norm": 0.4415590763092041, "learning_rate": 4.8445254831719706e-05, "loss": 0.0737, "num_input_tokens_seen": 4035056, "step": 4515 }, { "epoch": 1.1929523558136466, "grad_norm": 0.2646600008010864, "learning_rate": 4.8441844902162434e-05, "loss": 0.1213, "num_input_tokens_seen": 4039408, "step": 4520 }, { "epoch": 1.1942721393691436, "grad_norm": 0.07634265720844269, "learning_rate": 4.843843135757809e-05, "loss": 0.0709, "num_input_tokens_seen": 4043408, "step": 4525 }, { "epoch": 1.1955919229246403, "grad_norm": 0.02797778695821762, "learning_rate": 4.843501419849308e-05, "loss": 0.0536, "num_input_tokens_seen": 4048560, "step": 4530 }, { "epoch": 1.1969117064801373, "grad_norm": 0.10018549859523773, "learning_rate": 4.8431593425434386e-05, "loss": 0.049, "num_input_tokens_seen": 4053360, "step": 4535 }, { "epoch": 1.1982314900356341, "grad_norm": 0.4374690651893616, "learning_rate": 4.8428169038929526e-05, "loss": 0.0937, "num_input_tokens_seen": 4057744, "step": 4540 }, { "epoch": 1.1995512735911311, "grad_norm": 0.19169427454471588, "learning_rate": 4.8424741039506575e-05, "loss": 0.061, "num_input_tokens_seen": 4062416, "step": 4545 }, { "epoch": 1.200871057146628, "grad_norm": 0.13905975222587585, "learning_rate": 4.842130942769419e-05, "loss": 0.114, "num_input_tokens_seen": 4067152, "step": 4550 }, { "epoch": 1.202190840702125, "grad_norm": 0.3515130579471588, "learning_rate": 4.841787420402156e-05, "loss": 0.0549, "num_input_tokens_seen": 4071760, "step": 4555 }, { "epoch": 1.2035106242576217, "grad_norm": 0.3192265033721924, "learning_rate": 4.841443536901844e-05, "loss": 0.0818, "num_input_tokens_seen": 4076240, "step": 4560 }, { "epoch": 1.2048304078131187, "grad_norm": 0.5361230373382568, "learning_rate": 4.841099292321514e-05, "loss": 0.0784, "num_input_tokens_seen": 4080432, "step": 4565 }, { "epoch": 1.2061501913686155, "grad_norm": 0.267772912979126, "learning_rate": 4.8407546867142525e-05, "loss": 0.1007, "num_input_tokens_seen": 4084976, "step": 4570 }, { "epoch": 1.2074699749241125, "grad_norm": 0.09927910566329956, "learning_rate": 4.840409720133203e-05, "loss": 0.0945, "num_input_tokens_seen": 4089936, "step": 4575 }, { "epoch": 1.2087897584796092, "grad_norm": 0.22060927748680115, "learning_rate": 4.8400643926315634e-05, "loss": 0.0556, "num_input_tokens_seen": 4094608, "step": 4580 }, { "epoch": 1.2101095420351062, "grad_norm": 0.4117361605167389, "learning_rate": 4.839718704262587e-05, "loss": 0.0993, "num_input_tokens_seen": 4099088, "step": 4585 }, { "epoch": 1.2114293255906032, "grad_norm": 0.5627230405807495, "learning_rate": 4.839372655079585e-05, "loss": 0.0969, "num_input_tokens_seen": 4103440, "step": 4590 }, { "epoch": 1.2127491091461, "grad_norm": 0.27293699979782104, "learning_rate": 4.83902624513592e-05, "loss": 0.068, "num_input_tokens_seen": 4108336, "step": 4595 }, { "epoch": 1.214068892701597, "grad_norm": 0.4113752245903015, "learning_rate": 4.838679474485014e-05, "loss": 0.0769, "num_input_tokens_seen": 4112272, "step": 4600 }, { "epoch": 1.214068892701597, "eval_loss": 0.08132775127887726, "eval_runtime": 64.7147, "eval_samples_per_second": 104.072, "eval_steps_per_second": 26.022, "num_input_tokens_seen": 4112272, "step": 4600 }, { "epoch": 1.2153886762570938, "grad_norm": 0.3161044716835022, "learning_rate": 4.838332343180343e-05, "loss": 0.1105, "num_input_tokens_seen": 4116496, "step": 4605 }, { "epoch": 1.2167084598125908, "grad_norm": 0.3680439889431, "learning_rate": 4.83798485127544e-05, "loss": 0.0962, "num_input_tokens_seen": 4121072, "step": 4610 }, { "epoch": 1.2180282433680876, "grad_norm": 0.7212788462638855, "learning_rate": 4.837636998823892e-05, "loss": 0.0922, "num_input_tokens_seen": 4125488, "step": 4615 }, { "epoch": 1.2193480269235846, "grad_norm": 0.18974103033542633, "learning_rate": 4.8372887858793414e-05, "loss": 0.1187, "num_input_tokens_seen": 4129936, "step": 4620 }, { "epoch": 1.2206678104790813, "grad_norm": 0.1769970804452896, "learning_rate": 4.836940212495489e-05, "loss": 0.1, "num_input_tokens_seen": 4133968, "step": 4625 }, { "epoch": 1.2219875940345784, "grad_norm": 0.443345844745636, "learning_rate": 4.836591278726087e-05, "loss": 0.0744, "num_input_tokens_seen": 4138544, "step": 4630 }, { "epoch": 1.2233073775900754, "grad_norm": 0.442806601524353, "learning_rate": 4.836241984624947e-05, "loss": 0.1084, "num_input_tokens_seen": 4143024, "step": 4635 }, { "epoch": 1.2246271611455721, "grad_norm": 0.2790873944759369, "learning_rate": 4.8358923302459336e-05, "loss": 0.1237, "num_input_tokens_seen": 4147472, "step": 4640 }, { "epoch": 1.225946944701069, "grad_norm": 0.18140792846679688, "learning_rate": 4.835542315642968e-05, "loss": 0.0984, "num_input_tokens_seen": 4152080, "step": 4645 }, { "epoch": 1.227266728256566, "grad_norm": 0.1695387065410614, "learning_rate": 4.8351919408700274e-05, "loss": 0.0643, "num_input_tokens_seen": 4156816, "step": 4650 }, { "epoch": 1.228586511812063, "grad_norm": 0.11536339670419693, "learning_rate": 4.834841205981144e-05, "loss": 0.0356, "num_input_tokens_seen": 4161072, "step": 4655 }, { "epoch": 1.2299062953675597, "grad_norm": 0.0577889047563076, "learning_rate": 4.8344901110304054e-05, "loss": 0.0418, "num_input_tokens_seen": 4165232, "step": 4660 }, { "epoch": 1.2312260789230567, "grad_norm": 0.27206772565841675, "learning_rate": 4.8341386560719534e-05, "loss": 0.0648, "num_input_tokens_seen": 4169840, "step": 4665 }, { "epoch": 1.2325458624785535, "grad_norm": 0.08307453989982605, "learning_rate": 4.833786841159989e-05, "loss": 0.1001, "num_input_tokens_seen": 4174640, "step": 4670 }, { "epoch": 1.2338656460340505, "grad_norm": 0.167721226811409, "learning_rate": 4.833434666348765e-05, "loss": 0.1026, "num_input_tokens_seen": 4179344, "step": 4675 }, { "epoch": 1.2351854295895472, "grad_norm": 0.15354494750499725, "learning_rate": 4.833082131692592e-05, "loss": 0.0398, "num_input_tokens_seen": 4184144, "step": 4680 }, { "epoch": 1.2365052131450442, "grad_norm": 0.19773054122924805, "learning_rate": 4.832729237245835e-05, "loss": 0.1026, "num_input_tokens_seen": 4189040, "step": 4685 }, { "epoch": 1.237824996700541, "grad_norm": 0.10577980428934097, "learning_rate": 4.8323759830629145e-05, "loss": 0.0501, "num_input_tokens_seen": 4193648, "step": 4690 }, { "epoch": 1.239144780256038, "grad_norm": 0.22638793289661407, "learning_rate": 4.8320223691983066e-05, "loss": 0.0612, "num_input_tokens_seen": 4197968, "step": 4695 }, { "epoch": 1.240464563811535, "grad_norm": 0.6550453305244446, "learning_rate": 4.831668395706544e-05, "loss": 0.1026, "num_input_tokens_seen": 4202512, "step": 4700 }, { "epoch": 1.2417843473670318, "grad_norm": 0.3708903193473816, "learning_rate": 4.8313140626422125e-05, "loss": 0.0881, "num_input_tokens_seen": 4207024, "step": 4705 }, { "epoch": 1.2431041309225286, "grad_norm": 0.26724153757095337, "learning_rate": 4.830959370059956e-05, "loss": 0.0744, "num_input_tokens_seen": 4211440, "step": 4710 }, { "epoch": 1.2444239144780256, "grad_norm": 0.09195451438426971, "learning_rate": 4.830604318014472e-05, "loss": 0.0639, "num_input_tokens_seen": 4215760, "step": 4715 }, { "epoch": 1.2457436980335226, "grad_norm": 0.30698201060295105, "learning_rate": 4.830248906560514e-05, "loss": 0.0521, "num_input_tokens_seen": 4220400, "step": 4720 }, { "epoch": 1.2470634815890194, "grad_norm": 0.3779836595058441, "learning_rate": 4.829893135752891e-05, "loss": 0.0583, "num_input_tokens_seen": 4224752, "step": 4725 }, { "epoch": 1.2483832651445164, "grad_norm": 0.24843882024288177, "learning_rate": 4.829537005646466e-05, "loss": 0.0934, "num_input_tokens_seen": 4229264, "step": 4730 }, { "epoch": 1.2497030487000131, "grad_norm": 0.3456835150718689, "learning_rate": 4.8291805162961615e-05, "loss": 0.12, "num_input_tokens_seen": 4233648, "step": 4735 }, { "epoch": 1.2510228322555101, "grad_norm": 0.3580072224140167, "learning_rate": 4.82882366775695e-05, "loss": 0.0628, "num_input_tokens_seen": 4237968, "step": 4740 }, { "epoch": 1.252342615811007, "grad_norm": 0.15434615314006805, "learning_rate": 4.828466460083864e-05, "loss": 0.0629, "num_input_tokens_seen": 4242448, "step": 4745 }, { "epoch": 1.253662399366504, "grad_norm": 0.213471457362175, "learning_rate": 4.8281088933319877e-05, "loss": 0.0734, "num_input_tokens_seen": 4246896, "step": 4750 }, { "epoch": 1.2549821829220007, "grad_norm": 0.22187449038028717, "learning_rate": 4.827750967556464e-05, "loss": 0.0871, "num_input_tokens_seen": 4251312, "step": 4755 }, { "epoch": 1.2563019664774977, "grad_norm": 0.20207275450229645, "learning_rate": 4.827392682812488e-05, "loss": 0.06, "num_input_tokens_seen": 4256016, "step": 4760 }, { "epoch": 1.2576217500329947, "grad_norm": 0.280529648065567, "learning_rate": 4.827034039155312e-05, "loss": 0.0529, "num_input_tokens_seen": 4260656, "step": 4765 }, { "epoch": 1.2589415335884915, "grad_norm": 0.10901498794555664, "learning_rate": 4.8266750366402445e-05, "loss": 0.0885, "num_input_tokens_seen": 4264848, "step": 4770 }, { "epoch": 1.2602613171439883, "grad_norm": 0.16844114661216736, "learning_rate": 4.8263156753226476e-05, "loss": 0.0859, "num_input_tokens_seen": 4269520, "step": 4775 }, { "epoch": 1.2615811006994853, "grad_norm": 0.1793612539768219, "learning_rate": 4.8259559552579394e-05, "loss": 0.0792, "num_input_tokens_seen": 4274128, "step": 4780 }, { "epoch": 1.2629008842549823, "grad_norm": 0.37080785632133484, "learning_rate": 4.825595876501593e-05, "loss": 0.0798, "num_input_tokens_seen": 4278416, "step": 4785 }, { "epoch": 1.264220667810479, "grad_norm": 0.26953259110450745, "learning_rate": 4.825235439109137e-05, "loss": 0.1067, "num_input_tokens_seen": 4282928, "step": 4790 }, { "epoch": 1.265540451365976, "grad_norm": 0.4278879761695862, "learning_rate": 4.824874643136156e-05, "loss": 0.1268, "num_input_tokens_seen": 4287216, "step": 4795 }, { "epoch": 1.2668602349214728, "grad_norm": 0.23623058199882507, "learning_rate": 4.824513488638288e-05, "loss": 0.0649, "num_input_tokens_seen": 4291792, "step": 4800 }, { "epoch": 1.2668602349214728, "eval_loss": 0.08161582052707672, "eval_runtime": 64.7596, "eval_samples_per_second": 104.0, "eval_steps_per_second": 26.004, "num_input_tokens_seen": 4291792, "step": 4800 }, { "epoch": 1.2681800184769698, "grad_norm": 0.13478226959705353, "learning_rate": 4.8241519756712293e-05, "loss": 0.0475, "num_input_tokens_seen": 4296176, "step": 4805 }, { "epoch": 1.2694998020324666, "grad_norm": 0.28952741622924805, "learning_rate": 4.8237901042907285e-05, "loss": 0.0383, "num_input_tokens_seen": 4300656, "step": 4810 }, { "epoch": 1.2708195855879636, "grad_norm": 0.21401876211166382, "learning_rate": 4.823427874552591e-05, "loss": 0.0513, "num_input_tokens_seen": 4305232, "step": 4815 }, { "epoch": 1.2721393691434604, "grad_norm": 0.1189020425081253, "learning_rate": 4.823065286512677e-05, "loss": 0.0498, "num_input_tokens_seen": 4310192, "step": 4820 }, { "epoch": 1.2734591526989574, "grad_norm": 0.03214819356799126, "learning_rate": 4.8227023402269025e-05, "loss": 0.0815, "num_input_tokens_seen": 4314768, "step": 4825 }, { "epoch": 1.2747789362544544, "grad_norm": 0.09765693545341492, "learning_rate": 4.822339035751239e-05, "loss": 0.0566, "num_input_tokens_seen": 4319120, "step": 4830 }, { "epoch": 1.2760987198099512, "grad_norm": 0.3685755431652069, "learning_rate": 4.8219753731417104e-05, "loss": 0.0589, "num_input_tokens_seen": 4323664, "step": 4835 }, { "epoch": 1.277418503365448, "grad_norm": 0.467263787984848, "learning_rate": 4.821611352454401e-05, "loss": 0.0694, "num_input_tokens_seen": 4328368, "step": 4840 }, { "epoch": 1.278738286920945, "grad_norm": 0.29263240098953247, "learning_rate": 4.8212469737454444e-05, "loss": 0.0774, "num_input_tokens_seen": 4333136, "step": 4845 }, { "epoch": 1.280058070476442, "grad_norm": 0.09253711998462677, "learning_rate": 4.820882237071035e-05, "loss": 0.0601, "num_input_tokens_seen": 4337616, "step": 4850 }, { "epoch": 1.2813778540319387, "grad_norm": 0.08086097985506058, "learning_rate": 4.820517142487417e-05, "loss": 0.0317, "num_input_tokens_seen": 4342192, "step": 4855 }, { "epoch": 1.2826976375874357, "grad_norm": 0.6061115860939026, "learning_rate": 4.8201516900508956e-05, "loss": 0.1437, "num_input_tokens_seen": 4346736, "step": 4860 }, { "epoch": 1.2840174211429325, "grad_norm": 0.1166522204875946, "learning_rate": 4.819785879817827e-05, "loss": 0.0267, "num_input_tokens_seen": 4351216, "step": 4865 }, { "epoch": 1.2853372046984295, "grad_norm": 0.17058229446411133, "learning_rate": 4.8194197118446226e-05, "loss": 0.0603, "num_input_tokens_seen": 4355408, "step": 4870 }, { "epoch": 1.2866569882539265, "grad_norm": 0.24322225153446198, "learning_rate": 4.819053186187752e-05, "loss": 0.0553, "num_input_tokens_seen": 4359888, "step": 4875 }, { "epoch": 1.2879767718094233, "grad_norm": 0.21464328467845917, "learning_rate": 4.818686302903736e-05, "loss": 0.0532, "num_input_tokens_seen": 4364272, "step": 4880 }, { "epoch": 1.28929655536492, "grad_norm": 0.20886161923408508, "learning_rate": 4.818319062049154e-05, "loss": 0.0609, "num_input_tokens_seen": 4368912, "step": 4885 }, { "epoch": 1.290616338920417, "grad_norm": 0.7577788233757019, "learning_rate": 4.817951463680639e-05, "loss": 0.0899, "num_input_tokens_seen": 4373232, "step": 4890 }, { "epoch": 1.291936122475914, "grad_norm": 0.03194143995642662, "learning_rate": 4.817583507854879e-05, "loss": 0.0404, "num_input_tokens_seen": 4377584, "step": 4895 }, { "epoch": 1.2932559060314108, "grad_norm": 0.16970330476760864, "learning_rate": 4.817215194628617e-05, "loss": 0.0438, "num_input_tokens_seen": 4381968, "step": 4900 }, { "epoch": 1.2945756895869078, "grad_norm": 0.18699966371059418, "learning_rate": 4.816846524058653e-05, "loss": 0.0953, "num_input_tokens_seen": 4386768, "step": 4905 }, { "epoch": 1.2958954731424046, "grad_norm": 0.26557132601737976, "learning_rate": 4.816477496201839e-05, "loss": 0.1242, "num_input_tokens_seen": 4391312, "step": 4910 }, { "epoch": 1.2972152566979016, "grad_norm": 0.4463260769844055, "learning_rate": 4.8161081111150845e-05, "loss": 0.0734, "num_input_tokens_seen": 4395856, "step": 4915 }, { "epoch": 1.2985350402533984, "grad_norm": 0.0483197383582592, "learning_rate": 4.815738368855354e-05, "loss": 0.0641, "num_input_tokens_seen": 4400240, "step": 4920 }, { "epoch": 1.2998548238088954, "grad_norm": 0.3342621922492981, "learning_rate": 4.815368269479664e-05, "loss": 0.1099, "num_input_tokens_seen": 4405008, "step": 4925 }, { "epoch": 1.3011746073643922, "grad_norm": 0.4109770655632019, "learning_rate": 4.814997813045092e-05, "loss": 0.0371, "num_input_tokens_seen": 4409872, "step": 4930 }, { "epoch": 1.3024943909198892, "grad_norm": 0.11418696492910385, "learning_rate": 4.814626999608764e-05, "loss": 0.0538, "num_input_tokens_seen": 4414256, "step": 4935 }, { "epoch": 1.3038141744753862, "grad_norm": 0.11664246767759323, "learning_rate": 4.814255829227865e-05, "loss": 0.0574, "num_input_tokens_seen": 4419024, "step": 4940 }, { "epoch": 1.305133958030883, "grad_norm": 0.6612749099731445, "learning_rate": 4.813884301959635e-05, "loss": 0.0862, "num_input_tokens_seen": 4423952, "step": 4945 }, { "epoch": 1.3064537415863797, "grad_norm": 0.6188772320747375, "learning_rate": 4.813512417861368e-05, "loss": 0.0893, "num_input_tokens_seen": 4428304, "step": 4950 }, { "epoch": 1.3077735251418767, "grad_norm": 0.6771420836448669, "learning_rate": 4.813140176990411e-05, "loss": 0.1015, "num_input_tokens_seen": 4432784, "step": 4955 }, { "epoch": 1.3090933086973737, "grad_norm": 0.16383981704711914, "learning_rate": 4.8127675794041714e-05, "loss": 0.0798, "num_input_tokens_seen": 4437456, "step": 4960 }, { "epoch": 1.3104130922528705, "grad_norm": 0.3050370514392853, "learning_rate": 4.812394625160107e-05, "loss": 0.0889, "num_input_tokens_seen": 4442320, "step": 4965 }, { "epoch": 1.3117328758083675, "grad_norm": 0.3932335078716278, "learning_rate": 4.812021314315732e-05, "loss": 0.0873, "num_input_tokens_seen": 4446832, "step": 4970 }, { "epoch": 1.3130526593638643, "grad_norm": 0.5249871611595154, "learning_rate": 4.811647646928616e-05, "loss": 0.0701, "num_input_tokens_seen": 4451184, "step": 4975 }, { "epoch": 1.3143724429193613, "grad_norm": 0.26924532651901245, "learning_rate": 4.8112736230563814e-05, "loss": 0.0858, "num_input_tokens_seen": 4455728, "step": 4980 }, { "epoch": 1.315692226474858, "grad_norm": 0.28766703605651855, "learning_rate": 4.81089924275671e-05, "loss": 0.0407, "num_input_tokens_seen": 4459920, "step": 4985 }, { "epoch": 1.317012010030355, "grad_norm": 0.16108350455760956, "learning_rate": 4.810524506087335e-05, "loss": 0.0803, "num_input_tokens_seen": 4464336, "step": 4990 }, { "epoch": 1.3183317935858518, "grad_norm": 0.11470439285039902, "learning_rate": 4.810149413106044e-05, "loss": 0.0522, "num_input_tokens_seen": 4468528, "step": 4995 }, { "epoch": 1.3196515771413488, "grad_norm": 0.27432647347450256, "learning_rate": 4.809773963870684e-05, "loss": 0.0851, "num_input_tokens_seen": 4472784, "step": 5000 }, { "epoch": 1.3196515771413488, "eval_loss": 0.08041132986545563, "eval_runtime": 64.7123, "eval_samples_per_second": 104.076, "eval_steps_per_second": 26.023, "num_input_tokens_seen": 4472784, "step": 5000 }, { "epoch": 1.3209713606968458, "grad_norm": 0.20007339119911194, "learning_rate": 4.809398158439151e-05, "loss": 0.1154, "num_input_tokens_seen": 4477232, "step": 5005 }, { "epoch": 1.3222911442523426, "grad_norm": 0.3440200686454773, "learning_rate": 4.8090219968694005e-05, "loss": 0.0779, "num_input_tokens_seen": 4481840, "step": 5010 }, { "epoch": 1.3236109278078394, "grad_norm": 0.2975916266441345, "learning_rate": 4.808645479219442e-05, "loss": 0.1149, "num_input_tokens_seen": 4486416, "step": 5015 }, { "epoch": 1.3249307113633364, "grad_norm": 0.22423741221427917, "learning_rate": 4.8082686055473375e-05, "loss": 0.0463, "num_input_tokens_seen": 4490928, "step": 5020 }, { "epoch": 1.3262504949188334, "grad_norm": 0.3804813027381897, "learning_rate": 4.8078913759112066e-05, "loss": 0.0856, "num_input_tokens_seen": 4495344, "step": 5025 }, { "epoch": 1.3275702784743302, "grad_norm": 0.6519297361373901, "learning_rate": 4.807513790369223e-05, "loss": 0.1374, "num_input_tokens_seen": 4499504, "step": 5030 }, { "epoch": 1.3288900620298272, "grad_norm": 0.2891515791416168, "learning_rate": 4.8071358489796145e-05, "loss": 0.056, "num_input_tokens_seen": 4504016, "step": 5035 }, { "epoch": 1.330209845585324, "grad_norm": 0.22226184606552124, "learning_rate": 4.806757551800665e-05, "loss": 0.0454, "num_input_tokens_seen": 4508144, "step": 5040 }, { "epoch": 1.331529629140821, "grad_norm": 0.08077623695135117, "learning_rate": 4.806378898890713e-05, "loss": 0.0368, "num_input_tokens_seen": 4512592, "step": 5045 }, { "epoch": 1.3328494126963177, "grad_norm": 0.5659134387969971, "learning_rate": 4.80599989030815e-05, "loss": 0.0953, "num_input_tokens_seen": 4516848, "step": 5050 }, { "epoch": 1.3341691962518147, "grad_norm": 0.1509067714214325, "learning_rate": 4.805620526111426e-05, "loss": 0.0622, "num_input_tokens_seen": 4521296, "step": 5055 }, { "epoch": 1.3354889798073115, "grad_norm": 0.2895316481590271, "learning_rate": 4.805240806359042e-05, "loss": 0.0744, "num_input_tokens_seen": 4525584, "step": 5060 }, { "epoch": 1.3368087633628085, "grad_norm": 0.3272639811038971, "learning_rate": 4.804860731109557e-05, "loss": 0.0784, "num_input_tokens_seen": 4530320, "step": 5065 }, { "epoch": 1.3381285469183055, "grad_norm": 0.2112981677055359, "learning_rate": 4.804480300421581e-05, "loss": 0.0753, "num_input_tokens_seen": 4534640, "step": 5070 }, { "epoch": 1.3394483304738023, "grad_norm": 0.3563102185726166, "learning_rate": 4.804099514353784e-05, "loss": 0.0505, "num_input_tokens_seen": 4538928, "step": 5075 }, { "epoch": 1.340768114029299, "grad_norm": 0.10504698008298874, "learning_rate": 4.8037183729648867e-05, "loss": 0.0672, "num_input_tokens_seen": 4543280, "step": 5080 }, { "epoch": 1.342087897584796, "grad_norm": 0.2180953025817871, "learning_rate": 4.803336876313666e-05, "loss": 0.0953, "num_input_tokens_seen": 4547504, "step": 5085 }, { "epoch": 1.343407681140293, "grad_norm": 0.2923080027103424, "learning_rate": 4.802955024458953e-05, "loss": 0.0855, "num_input_tokens_seen": 4551984, "step": 5090 }, { "epoch": 1.3447274646957899, "grad_norm": 0.06934521347284317, "learning_rate": 4.802572817459634e-05, "loss": 0.1016, "num_input_tokens_seen": 4556784, "step": 5095 }, { "epoch": 1.3460472482512869, "grad_norm": 0.0800878182053566, "learning_rate": 4.802190255374651e-05, "loss": 0.0685, "num_input_tokens_seen": 4561520, "step": 5100 }, { "epoch": 1.3473670318067836, "grad_norm": 0.06487201899290085, "learning_rate": 4.801807338263e-05, "loss": 0.0851, "num_input_tokens_seen": 4565648, "step": 5105 }, { "epoch": 1.3486868153622806, "grad_norm": 0.43343186378479004, "learning_rate": 4.8014240661837306e-05, "loss": 0.0713, "num_input_tokens_seen": 4570352, "step": 5110 }, { "epoch": 1.3500065989177774, "grad_norm": 0.24935777485370636, "learning_rate": 4.80104043919595e-05, "loss": 0.0494, "num_input_tokens_seen": 4575088, "step": 5115 }, { "epoch": 1.3513263824732744, "grad_norm": 0.165144145488739, "learning_rate": 4.800656457358815e-05, "loss": 0.048, "num_input_tokens_seen": 4579664, "step": 5120 }, { "epoch": 1.3526461660287712, "grad_norm": 0.19699345529079437, "learning_rate": 4.800272120731544e-05, "loss": 0.0619, "num_input_tokens_seen": 4584144, "step": 5125 }, { "epoch": 1.3539659495842682, "grad_norm": 0.10191094130277634, "learning_rate": 4.799887429373404e-05, "loss": 0.0585, "num_input_tokens_seen": 4588720, "step": 5130 }, { "epoch": 1.3552857331397652, "grad_norm": 0.11645718663930893, "learning_rate": 4.79950238334372e-05, "loss": 0.0855, "num_input_tokens_seen": 4593072, "step": 5135 }, { "epoch": 1.356605516695262, "grad_norm": 0.33851832151412964, "learning_rate": 4.799116982701872e-05, "loss": 0.0796, "num_input_tokens_seen": 4597680, "step": 5140 }, { "epoch": 1.3579253002507587, "grad_norm": 0.6722334027290344, "learning_rate": 4.7987312275072926e-05, "loss": 0.0784, "num_input_tokens_seen": 4602064, "step": 5145 }, { "epoch": 1.3592450838062557, "grad_norm": 0.2841826379299164, "learning_rate": 4.79834511781947e-05, "loss": 0.0869, "num_input_tokens_seen": 4606416, "step": 5150 }, { "epoch": 1.3605648673617527, "grad_norm": 0.3335546553134918, "learning_rate": 4.797958653697947e-05, "loss": 0.0664, "num_input_tokens_seen": 4610896, "step": 5155 }, { "epoch": 1.3618846509172495, "grad_norm": 0.030426545068621635, "learning_rate": 4.7975718352023225e-05, "loss": 0.0412, "num_input_tokens_seen": 4615376, "step": 5160 }, { "epoch": 1.3632044344727465, "grad_norm": 0.26046568155288696, "learning_rate": 4.7971846623922476e-05, "loss": 0.0753, "num_input_tokens_seen": 4619536, "step": 5165 }, { "epoch": 1.3645242180282433, "grad_norm": 0.0727534219622612, "learning_rate": 4.7967971353274294e-05, "loss": 0.0637, "num_input_tokens_seen": 4624080, "step": 5170 }, { "epoch": 1.3658440015837403, "grad_norm": 0.1809157282114029, "learning_rate": 4.79640925406763e-05, "loss": 0.0392, "num_input_tokens_seen": 4628432, "step": 5175 }, { "epoch": 1.367163785139237, "grad_norm": 0.2920337915420532, "learning_rate": 4.796021018672664e-05, "loss": 0.1042, "num_input_tokens_seen": 4633232, "step": 5180 }, { "epoch": 1.368483568694734, "grad_norm": 0.783295214176178, "learning_rate": 4.795632429202405e-05, "loss": 0.132, "num_input_tokens_seen": 4637840, "step": 5185 }, { "epoch": 1.3698033522502309, "grad_norm": 0.08794157207012177, "learning_rate": 4.795243485716775e-05, "loss": 0.061, "num_input_tokens_seen": 4642544, "step": 5190 }, { "epoch": 1.3711231358057279, "grad_norm": 0.20806986093521118, "learning_rate": 4.794854188275757e-05, "loss": 0.0675, "num_input_tokens_seen": 4647280, "step": 5195 }, { "epoch": 1.3724429193612249, "grad_norm": 0.23969388008117676, "learning_rate": 4.794464536939384e-05, "loss": 0.0771, "num_input_tokens_seen": 4651696, "step": 5200 }, { "epoch": 1.3724429193612249, "eval_loss": 0.07964098453521729, "eval_runtime": 64.6576, "eval_samples_per_second": 104.164, "eval_steps_per_second": 26.045, "num_input_tokens_seen": 4651696, "step": 5200 }, { "epoch": 1.3737627029167216, "grad_norm": 0.22421321272850037, "learning_rate": 4.794074531767745e-05, "loss": 0.0731, "num_input_tokens_seen": 4656144, "step": 5205 }, { "epoch": 1.3750824864722184, "grad_norm": 0.16617850959300995, "learning_rate": 4.7936841728209834e-05, "loss": 0.0593, "num_input_tokens_seen": 4660624, "step": 5210 }, { "epoch": 1.3764022700277154, "grad_norm": 0.020756810903549194, "learning_rate": 4.7932934601593e-05, "loss": 0.0844, "num_input_tokens_seen": 4665200, "step": 5215 }, { "epoch": 1.3777220535832124, "grad_norm": 0.21342386305332184, "learning_rate": 4.792902393842943e-05, "loss": 0.0632, "num_input_tokens_seen": 4669552, "step": 5220 }, { "epoch": 1.3790418371387092, "grad_norm": 0.46818751096725464, "learning_rate": 4.792510973932225e-05, "loss": 0.0742, "num_input_tokens_seen": 4673712, "step": 5225 }, { "epoch": 1.3803616206942062, "grad_norm": 0.33836469054222107, "learning_rate": 4.7921192004875036e-05, "loss": 0.0986, "num_input_tokens_seen": 4678160, "step": 5230 }, { "epoch": 1.381681404249703, "grad_norm": 0.3834356665611267, "learning_rate": 4.791727073569198e-05, "loss": 0.0744, "num_input_tokens_seen": 4682960, "step": 5235 }, { "epoch": 1.3830011878052, "grad_norm": 0.18832901120185852, "learning_rate": 4.7913345932377775e-05, "loss": 0.096, "num_input_tokens_seen": 4687408, "step": 5240 }, { "epoch": 1.384320971360697, "grad_norm": 0.16693943738937378, "learning_rate": 4.790941759553769e-05, "loss": 0.0689, "num_input_tokens_seen": 4691888, "step": 5245 }, { "epoch": 1.3856407549161938, "grad_norm": 0.07065894454717636, "learning_rate": 4.79054857257775e-05, "loss": 0.0468, "num_input_tokens_seen": 4696240, "step": 5250 }, { "epoch": 1.3869605384716905, "grad_norm": 0.2353706657886505, "learning_rate": 4.790155032370357e-05, "loss": 0.0738, "num_input_tokens_seen": 4700816, "step": 5255 }, { "epoch": 1.3882803220271875, "grad_norm": 0.03853423520922661, "learning_rate": 4.789761138992278e-05, "loss": 0.0612, "num_input_tokens_seen": 4705360, "step": 5260 }, { "epoch": 1.3896001055826845, "grad_norm": 0.14937782287597656, "learning_rate": 4.7893668925042565e-05, "loss": 0.0515, "num_input_tokens_seen": 4709712, "step": 5265 }, { "epoch": 1.3909198891381813, "grad_norm": 0.4335743188858032, "learning_rate": 4.78897229296709e-05, "loss": 0.1052, "num_input_tokens_seen": 4714032, "step": 5270 }, { "epoch": 1.392239672693678, "grad_norm": 0.2282271832227707, "learning_rate": 4.7885773404416315e-05, "loss": 0.0614, "num_input_tokens_seen": 4718384, "step": 5275 }, { "epoch": 1.393559456249175, "grad_norm": 0.11093328893184662, "learning_rate": 4.788182034988786e-05, "loss": 0.0418, "num_input_tokens_seen": 4722832, "step": 5280 }, { "epoch": 1.394879239804672, "grad_norm": 0.38227182626724243, "learning_rate": 4.787786376669516e-05, "loss": 0.1125, "num_input_tokens_seen": 4726992, "step": 5285 }, { "epoch": 1.3961990233601689, "grad_norm": 0.14109593629837036, "learning_rate": 4.787390365544837e-05, "loss": 0.0792, "num_input_tokens_seen": 4731216, "step": 5290 }, { "epoch": 1.3975188069156659, "grad_norm": 0.060291215777397156, "learning_rate": 4.786994001675818e-05, "loss": 0.0621, "num_input_tokens_seen": 4735504, "step": 5295 }, { "epoch": 1.3988385904711627, "grad_norm": 0.12635262310504913, "learning_rate": 4.786597285123584e-05, "loss": 0.0501, "num_input_tokens_seen": 4740144, "step": 5300 }, { "epoch": 1.4001583740266597, "grad_norm": 0.3112564980983734, "learning_rate": 4.7862002159493135e-05, "loss": 0.0879, "num_input_tokens_seen": 4744624, "step": 5305 }, { "epoch": 1.4014781575821567, "grad_norm": 0.34095343947410583, "learning_rate": 4.785802794214239e-05, "loss": 0.0942, "num_input_tokens_seen": 4749136, "step": 5310 }, { "epoch": 1.4027979411376534, "grad_norm": 0.32172417640686035, "learning_rate": 4.7854050199796495e-05, "loss": 0.1223, "num_input_tokens_seen": 4753552, "step": 5315 }, { "epoch": 1.4041177246931502, "grad_norm": 0.24310630559921265, "learning_rate": 4.7850068933068845e-05, "loss": 0.0923, "num_input_tokens_seen": 4758256, "step": 5320 }, { "epoch": 1.4054375082486472, "grad_norm": 0.32415255904197693, "learning_rate": 4.7846084142573425e-05, "loss": 0.1003, "num_input_tokens_seen": 4762832, "step": 5325 }, { "epoch": 1.4067572918041442, "grad_norm": 0.11996932327747345, "learning_rate": 4.7842095828924725e-05, "loss": 0.0731, "num_input_tokens_seen": 4766960, "step": 5330 }, { "epoch": 1.408077075359641, "grad_norm": 0.23023372888565063, "learning_rate": 4.783810399273779e-05, "loss": 0.0443, "num_input_tokens_seen": 4771248, "step": 5335 }, { "epoch": 1.409396858915138, "grad_norm": 0.1694478690624237, "learning_rate": 4.7834108634628226e-05, "loss": 0.056, "num_input_tokens_seen": 4775440, "step": 5340 }, { "epoch": 1.4107166424706348, "grad_norm": 0.19602078199386597, "learning_rate": 4.783010975521216e-05, "loss": 0.0389, "num_input_tokens_seen": 4780080, "step": 5345 }, { "epoch": 1.4120364260261318, "grad_norm": 0.08027384430170059, "learning_rate": 4.782610735510626e-05, "loss": 0.0612, "num_input_tokens_seen": 4784688, "step": 5350 }, { "epoch": 1.4133562095816286, "grad_norm": 0.16327489912509918, "learning_rate": 4.782210143492776e-05, "loss": 0.0563, "num_input_tokens_seen": 4789040, "step": 5355 }, { "epoch": 1.4146759931371256, "grad_norm": 0.18734019994735718, "learning_rate": 4.781809199529442e-05, "loss": 0.0939, "num_input_tokens_seen": 4793520, "step": 5360 }, { "epoch": 1.4159957766926223, "grad_norm": 0.16671977937221527, "learning_rate": 4.781407903682454e-05, "loss": 0.0475, "num_input_tokens_seen": 4797968, "step": 5365 }, { "epoch": 1.4173155602481193, "grad_norm": 0.1296880692243576, "learning_rate": 4.781006256013698e-05, "loss": 0.0719, "num_input_tokens_seen": 4802544, "step": 5370 }, { "epoch": 1.4186353438036163, "grad_norm": 0.36691993474960327, "learning_rate": 4.7806042565851115e-05, "loss": 0.1043, "num_input_tokens_seen": 4806800, "step": 5375 }, { "epoch": 1.4199551273591131, "grad_norm": 0.20597416162490845, "learning_rate": 4.7802019054586895e-05, "loss": 0.121, "num_input_tokens_seen": 4811344, "step": 5380 }, { "epoch": 1.42127491091461, "grad_norm": 0.10593754798173904, "learning_rate": 4.779799202696479e-05, "loss": 0.0333, "num_input_tokens_seen": 4816144, "step": 5385 }, { "epoch": 1.422594694470107, "grad_norm": 0.3683891296386719, "learning_rate": 4.779396148360581e-05, "loss": 0.1124, "num_input_tokens_seen": 4820528, "step": 5390 }, { "epoch": 1.423914478025604, "grad_norm": 0.3037063479423523, "learning_rate": 4.7789927425131517e-05, "loss": 0.1908, "num_input_tokens_seen": 4824976, "step": 5395 }, { "epoch": 1.4252342615811007, "grad_norm": 0.24169357120990753, "learning_rate": 4.778588985216403e-05, "loss": 0.1098, "num_input_tokens_seen": 4829360, "step": 5400 }, { "epoch": 1.4252342615811007, "eval_loss": 0.07894226163625717, "eval_runtime": 64.7772, "eval_samples_per_second": 103.972, "eval_steps_per_second": 25.997, "num_input_tokens_seen": 4829360, "step": 5400 }, { "epoch": 1.4265540451365977, "grad_norm": 0.20239225029945374, "learning_rate": 4.778184876532598e-05, "loss": 0.1145, "num_input_tokens_seen": 4833680, "step": 5405 }, { "epoch": 1.4278738286920944, "grad_norm": 0.20283399522304535, "learning_rate": 4.7777804165240556e-05, "loss": 0.0772, "num_input_tokens_seen": 4838000, "step": 5410 }, { "epoch": 1.4291936122475914, "grad_norm": 0.46933332085609436, "learning_rate": 4.7773756052531485e-05, "loss": 0.1082, "num_input_tokens_seen": 4842384, "step": 5415 }, { "epoch": 1.4305133958030882, "grad_norm": 0.16172149777412415, "learning_rate": 4.7769704427823035e-05, "loss": 0.0315, "num_input_tokens_seen": 4846768, "step": 5420 }, { "epoch": 1.4318331793585852, "grad_norm": 0.3040544092655182, "learning_rate": 4.776564929174003e-05, "loss": 0.0755, "num_input_tokens_seen": 4851440, "step": 5425 }, { "epoch": 1.433152962914082, "grad_norm": 0.16845110058784485, "learning_rate": 4.7761590644907806e-05, "loss": 0.0816, "num_input_tokens_seen": 4856304, "step": 5430 }, { "epoch": 1.434472746469579, "grad_norm": 0.33745089173316956, "learning_rate": 4.7757528487952263e-05, "loss": 0.0744, "num_input_tokens_seen": 4860784, "step": 5435 }, { "epoch": 1.435792530025076, "grad_norm": 0.2157873809337616, "learning_rate": 4.7753462821499836e-05, "loss": 0.1318, "num_input_tokens_seen": 4865104, "step": 5440 }, { "epoch": 1.4371123135805728, "grad_norm": 0.07641094923019409, "learning_rate": 4.774939364617751e-05, "loss": 0.0693, "num_input_tokens_seen": 4869680, "step": 5445 }, { "epoch": 1.4384320971360696, "grad_norm": 0.14259940385818481, "learning_rate": 4.7745320962612795e-05, "loss": 0.0669, "num_input_tokens_seen": 4873936, "step": 5450 }, { "epoch": 1.4397518806915666, "grad_norm": 0.1825931966304779, "learning_rate": 4.7741244771433756e-05, "loss": 0.0728, "num_input_tokens_seen": 4878352, "step": 5455 }, { "epoch": 1.4410716642470636, "grad_norm": 0.4214733839035034, "learning_rate": 4.7737165073268985e-05, "loss": 0.0802, "num_input_tokens_seen": 4882704, "step": 5460 }, { "epoch": 1.4423914478025603, "grad_norm": 0.17257174849510193, "learning_rate": 4.7733081868747626e-05, "loss": 0.0422, "num_input_tokens_seen": 4887152, "step": 5465 }, { "epoch": 1.4437112313580573, "grad_norm": 0.5272191166877747, "learning_rate": 4.772899515849936e-05, "loss": 0.1172, "num_input_tokens_seen": 4891280, "step": 5470 }, { "epoch": 1.4450310149135541, "grad_norm": 0.28314852714538574, "learning_rate": 4.7724904943154414e-05, "loss": 0.042, "num_input_tokens_seen": 4895664, "step": 5475 }, { "epoch": 1.4463507984690511, "grad_norm": 0.2088296264410019, "learning_rate": 4.772081122334354e-05, "loss": 0.1137, "num_input_tokens_seen": 4900208, "step": 5480 }, { "epoch": 1.447670582024548, "grad_norm": 0.3795498013496399, "learning_rate": 4.771671399969806e-05, "loss": 0.0738, "num_input_tokens_seen": 4904496, "step": 5485 }, { "epoch": 1.448990365580045, "grad_norm": 0.19824357330799103, "learning_rate": 4.7712613272849794e-05, "loss": 0.0873, "num_input_tokens_seen": 4908912, "step": 5490 }, { "epoch": 1.4503101491355417, "grad_norm": 0.3328114449977875, "learning_rate": 4.770850904343114e-05, "loss": 0.1139, "num_input_tokens_seen": 4913456, "step": 5495 }, { "epoch": 1.4516299326910387, "grad_norm": 0.1506216675043106, "learning_rate": 4.770440131207502e-05, "loss": 0.0326, "num_input_tokens_seen": 4918128, "step": 5500 }, { "epoch": 1.4529497162465357, "grad_norm": 0.1752959042787552, "learning_rate": 4.7700290079414896e-05, "loss": 0.0818, "num_input_tokens_seen": 4922576, "step": 5505 }, { "epoch": 1.4542694998020325, "grad_norm": 0.08696412295103073, "learning_rate": 4.769617534608477e-05, "loss": 0.0515, "num_input_tokens_seen": 4926928, "step": 5510 }, { "epoch": 1.4555892833575292, "grad_norm": 0.09538356959819794, "learning_rate": 4.7692057112719193e-05, "loss": 0.0304, "num_input_tokens_seen": 4931344, "step": 5515 }, { "epoch": 1.4569090669130262, "grad_norm": 0.058067869395017624, "learning_rate": 4.7687935379953234e-05, "loss": 0.0541, "num_input_tokens_seen": 4936016, "step": 5520 }, { "epoch": 1.4582288504685232, "grad_norm": 0.03800162300467491, "learning_rate": 4.7683810148422534e-05, "loss": 0.0559, "num_input_tokens_seen": 4940848, "step": 5525 }, { "epoch": 1.45954863402402, "grad_norm": 0.41322988271713257, "learning_rate": 4.767968141876324e-05, "loss": 0.0887, "num_input_tokens_seen": 4945200, "step": 5530 }, { "epoch": 1.460868417579517, "grad_norm": 0.21540051698684692, "learning_rate": 4.767554919161207e-05, "loss": 0.0549, "num_input_tokens_seen": 4949936, "step": 5535 }, { "epoch": 1.4621882011350138, "grad_norm": 0.0661291778087616, "learning_rate": 4.767141346760624e-05, "loss": 0.0715, "num_input_tokens_seen": 4954480, "step": 5540 }, { "epoch": 1.4635079846905108, "grad_norm": 0.41380563378334045, "learning_rate": 4.766727424738356e-05, "loss": 0.1085, "num_input_tokens_seen": 4958832, "step": 5545 }, { "epoch": 1.4648277682460076, "grad_norm": 0.19801203906536102, "learning_rate": 4.7663131531582325e-05, "loss": 0.0873, "num_input_tokens_seen": 4963504, "step": 5550 }, { "epoch": 1.4661475518015046, "grad_norm": 0.43371960520744324, "learning_rate": 4.765898532084142e-05, "loss": 0.1072, "num_input_tokens_seen": 4968208, "step": 5555 }, { "epoch": 1.4674673353570014, "grad_norm": 0.1632298082113266, "learning_rate": 4.765483561580022e-05, "loss": 0.0679, "num_input_tokens_seen": 4972528, "step": 5560 }, { "epoch": 1.4687871189124984, "grad_norm": 0.39230212569236755, "learning_rate": 4.7650682417098666e-05, "loss": 0.0693, "num_input_tokens_seen": 4977072, "step": 5565 }, { "epoch": 1.4701069024679954, "grad_norm": 0.1705838292837143, "learning_rate": 4.7646525725377244e-05, "loss": 0.155, "num_input_tokens_seen": 4981520, "step": 5570 }, { "epoch": 1.4714266860234921, "grad_norm": 0.17158518731594086, "learning_rate": 4.764236554127696e-05, "loss": 0.0766, "num_input_tokens_seen": 4986000, "step": 5575 }, { "epoch": 1.472746469578989, "grad_norm": 0.1324654370546341, "learning_rate": 4.7638201865439356e-05, "loss": 0.0496, "num_input_tokens_seen": 4990032, "step": 5580 }, { "epoch": 1.474066253134486, "grad_norm": 0.08171235024929047, "learning_rate": 4.7634034698506545e-05, "loss": 0.0781, "num_input_tokens_seen": 4994256, "step": 5585 }, { "epoch": 1.475386036689983, "grad_norm": 0.20350681245326996, "learning_rate": 4.762986404112115e-05, "loss": 0.1045, "num_input_tokens_seen": 4998672, "step": 5590 }, { "epoch": 1.4767058202454797, "grad_norm": 0.31480562686920166, "learning_rate": 4.762568989392633e-05, "loss": 0.0693, "num_input_tokens_seen": 5003312, "step": 5595 }, { "epoch": 1.4780256038009767, "grad_norm": 0.24262858927249908, "learning_rate": 4.76215122575658e-05, "loss": 0.0919, "num_input_tokens_seen": 5007920, "step": 5600 }, { "epoch": 1.4780256038009767, "eval_loss": 0.0785464346408844, "eval_runtime": 64.7571, "eval_samples_per_second": 104.004, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 5007920, "step": 5600 }, { "epoch": 1.4793453873564735, "grad_norm": 0.10564015805721283, "learning_rate": 4.7617331132683795e-05, "loss": 0.0317, "num_input_tokens_seen": 5012144, "step": 5605 }, { "epoch": 1.4806651709119705, "grad_norm": 0.06095464155077934, "learning_rate": 4.7613146519925105e-05, "loss": 0.0534, "num_input_tokens_seen": 5016624, "step": 5610 }, { "epoch": 1.4819849544674673, "grad_norm": 0.3032098710536957, "learning_rate": 4.7608958419935045e-05, "loss": 0.0462, "num_input_tokens_seen": 5021008, "step": 5615 }, { "epoch": 1.4833047380229643, "grad_norm": 0.30858275294303894, "learning_rate": 4.760476683335948e-05, "loss": 0.1104, "num_input_tokens_seen": 5025392, "step": 5620 }, { "epoch": 1.484624521578461, "grad_norm": 0.2372182309627533, "learning_rate": 4.760057176084479e-05, "loss": 0.0429, "num_input_tokens_seen": 5030096, "step": 5625 }, { "epoch": 1.485944305133958, "grad_norm": 0.10320360958576202, "learning_rate": 4.759637320303793e-05, "loss": 0.0219, "num_input_tokens_seen": 5035120, "step": 5630 }, { "epoch": 1.487264088689455, "grad_norm": 0.49019500613212585, "learning_rate": 4.759217116058635e-05, "loss": 0.0427, "num_input_tokens_seen": 5039408, "step": 5635 }, { "epoch": 1.4885838722449518, "grad_norm": 0.352451354265213, "learning_rate": 4.758796563413807e-05, "loss": 0.0823, "num_input_tokens_seen": 5043728, "step": 5640 }, { "epoch": 1.4899036558004486, "grad_norm": 0.05368480831384659, "learning_rate": 4.758375662434163e-05, "loss": 0.0223, "num_input_tokens_seen": 5048272, "step": 5645 }, { "epoch": 1.4912234393559456, "grad_norm": 0.27110958099365234, "learning_rate": 4.7579544131846114e-05, "loss": 0.0521, "num_input_tokens_seen": 5052880, "step": 5650 }, { "epoch": 1.4925432229114426, "grad_norm": 0.08626504987478256, "learning_rate": 4.757532815730114e-05, "loss": 0.0664, "num_input_tokens_seen": 5057328, "step": 5655 }, { "epoch": 1.4938630064669394, "grad_norm": 0.23050181567668915, "learning_rate": 4.7571108701356865e-05, "loss": 0.0639, "num_input_tokens_seen": 5061936, "step": 5660 }, { "epoch": 1.4951827900224364, "grad_norm": 0.4139077365398407, "learning_rate": 4.756688576466398e-05, "loss": 0.0505, "num_input_tokens_seen": 5066416, "step": 5665 }, { "epoch": 1.4965025735779331, "grad_norm": 0.3134812116622925, "learning_rate": 4.756265934787372e-05, "loss": 0.0639, "num_input_tokens_seen": 5070832, "step": 5670 }, { "epoch": 1.4978223571334301, "grad_norm": 0.06357229501008987, "learning_rate": 4.755842945163785e-05, "loss": 0.0911, "num_input_tokens_seen": 5075440, "step": 5675 }, { "epoch": 1.4991421406889271, "grad_norm": 0.07387790828943253, "learning_rate": 4.755419607660867e-05, "loss": 0.0791, "num_input_tokens_seen": 5079888, "step": 5680 }, { "epoch": 1.500461924244424, "grad_norm": 0.18835237622261047, "learning_rate": 4.7549959223439016e-05, "loss": 0.0879, "num_input_tokens_seen": 5084272, "step": 5685 }, { "epoch": 1.5017817077999207, "grad_norm": 0.11256110668182373, "learning_rate": 4.754571889278228e-05, "loss": 0.0636, "num_input_tokens_seen": 5089040, "step": 5690 }, { "epoch": 1.5031014913554177, "grad_norm": 0.07079589366912842, "learning_rate": 4.754147508529235e-05, "loss": 0.0817, "num_input_tokens_seen": 5093552, "step": 5695 }, { "epoch": 1.5044212749109147, "grad_norm": 0.18633604049682617, "learning_rate": 4.75372278016237e-05, "loss": 0.0742, "num_input_tokens_seen": 5098128, "step": 5700 }, { "epoch": 1.5057410584664115, "grad_norm": 0.07297640293836594, "learning_rate": 4.753297704243129e-05, "loss": 0.0672, "num_input_tokens_seen": 5102576, "step": 5705 }, { "epoch": 1.5070608420219083, "grad_norm": 0.20948012173175812, "learning_rate": 4.752872280837066e-05, "loss": 0.0883, "num_input_tokens_seen": 5106960, "step": 5710 }, { "epoch": 1.5083806255774053, "grad_norm": 0.24794456362724304, "learning_rate": 4.752446510009786e-05, "loss": 0.0656, "num_input_tokens_seen": 5111600, "step": 5715 }, { "epoch": 1.5097004091329023, "grad_norm": 0.273430198431015, "learning_rate": 4.7520203918269476e-05, "loss": 0.0959, "num_input_tokens_seen": 5116144, "step": 5720 }, { "epoch": 1.5110201926883993, "grad_norm": 0.20416802167892456, "learning_rate": 4.751593926354265e-05, "loss": 0.0456, "num_input_tokens_seen": 5120624, "step": 5725 }, { "epoch": 1.512339976243896, "grad_norm": 0.3313671052455902, "learning_rate": 4.751167113657503e-05, "loss": 0.0651, "num_input_tokens_seen": 5124944, "step": 5730 }, { "epoch": 1.5136597597993928, "grad_norm": 0.22932900488376617, "learning_rate": 4.7507399538024834e-05, "loss": 0.0552, "num_input_tokens_seen": 5129744, "step": 5735 }, { "epoch": 1.5149795433548898, "grad_norm": 0.16310812532901764, "learning_rate": 4.750312446855077e-05, "loss": 0.0675, "num_input_tokens_seen": 5134000, "step": 5740 }, { "epoch": 1.5162993269103868, "grad_norm": 0.1686880737543106, "learning_rate": 4.749884592881212e-05, "loss": 0.1054, "num_input_tokens_seen": 5138608, "step": 5745 }, { "epoch": 1.5176191104658836, "grad_norm": 0.03517255559563637, "learning_rate": 4.74945639194687e-05, "loss": 0.0889, "num_input_tokens_seen": 5143056, "step": 5750 }, { "epoch": 1.5189388940213804, "grad_norm": 0.26234811544418335, "learning_rate": 4.749027844118083e-05, "loss": 0.0334, "num_input_tokens_seen": 5147696, "step": 5755 }, { "epoch": 1.5202586775768774, "grad_norm": 0.02521154098212719, "learning_rate": 4.7485989494609395e-05, "loss": 0.0238, "num_input_tokens_seen": 5152208, "step": 5760 }, { "epoch": 1.5215784611323744, "grad_norm": 0.22014053165912628, "learning_rate": 4.748169708041581e-05, "loss": 0.0424, "num_input_tokens_seen": 5156592, "step": 5765 }, { "epoch": 1.5228982446878712, "grad_norm": 0.31146240234375, "learning_rate": 4.7477401199262004e-05, "loss": 0.047, "num_input_tokens_seen": 5161328, "step": 5770 }, { "epoch": 1.524218028243368, "grad_norm": 0.21080772578716278, "learning_rate": 4.747310185181048e-05, "loss": 0.1124, "num_input_tokens_seen": 5165968, "step": 5775 }, { "epoch": 1.525537811798865, "grad_norm": 0.05414314568042755, "learning_rate": 4.746879903872422e-05, "loss": 0.0258, "num_input_tokens_seen": 5170448, "step": 5780 }, { "epoch": 1.526857595354362, "grad_norm": 0.34524157643318176, "learning_rate": 4.746449276066679e-05, "loss": 0.0664, "num_input_tokens_seen": 5174928, "step": 5785 }, { "epoch": 1.528177378909859, "grad_norm": 0.15992657840251923, "learning_rate": 4.746018301830227e-05, "loss": 0.0423, "num_input_tokens_seen": 5179376, "step": 5790 }, { "epoch": 1.5294971624653557, "grad_norm": 0.376916766166687, "learning_rate": 4.7455869812295275e-05, "loss": 0.0563, "num_input_tokens_seen": 5183664, "step": 5795 }, { "epoch": 1.5308169460208525, "grad_norm": 0.34162667393684387, "learning_rate": 4.7451553143310964e-05, "loss": 0.082, "num_input_tokens_seen": 5188208, "step": 5800 }, { "epoch": 1.5308169460208525, "eval_loss": 0.07894091308116913, "eval_runtime": 64.7573, "eval_samples_per_second": 104.004, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 5188208, "step": 5800 }, { "epoch": 1.5321367295763495, "grad_norm": 0.20969893038272858, "learning_rate": 4.744723301201501e-05, "loss": 0.0539, "num_input_tokens_seen": 5192656, "step": 5805 }, { "epoch": 1.5334565131318465, "grad_norm": 0.30848294496536255, "learning_rate": 4.744290941907364e-05, "loss": 0.0601, "num_input_tokens_seen": 5197296, "step": 5810 }, { "epoch": 1.5347762966873433, "grad_norm": 0.49358293414115906, "learning_rate": 4.7438582365153594e-05, "loss": 0.0564, "num_input_tokens_seen": 5202128, "step": 5815 }, { "epoch": 1.53609608024284, "grad_norm": 0.09449385851621628, "learning_rate": 4.743425185092217e-05, "loss": 0.0511, "num_input_tokens_seen": 5206704, "step": 5820 }, { "epoch": 1.537415863798337, "grad_norm": 0.0785050019621849, "learning_rate": 4.742991787704719e-05, "loss": 0.0917, "num_input_tokens_seen": 5211088, "step": 5825 }, { "epoch": 1.538735647353834, "grad_norm": 0.2837215065956116, "learning_rate": 4.7425580444196994e-05, "loss": 0.0477, "num_input_tokens_seen": 5215568, "step": 5830 }, { "epoch": 1.5400554309093308, "grad_norm": 0.06769419461488724, "learning_rate": 4.742123955304048e-05, "loss": 0.0767, "num_input_tokens_seen": 5219760, "step": 5835 }, { "epoch": 1.5413752144648276, "grad_norm": 0.37260809540748596, "learning_rate": 4.741689520424706e-05, "loss": 0.0581, "num_input_tokens_seen": 5224016, "step": 5840 }, { "epoch": 1.5426949980203246, "grad_norm": 0.1141723021864891, "learning_rate": 4.741254739848669e-05, "loss": 0.07, "num_input_tokens_seen": 5228592, "step": 5845 }, { "epoch": 1.5440147815758216, "grad_norm": 0.08904385566711426, "learning_rate": 4.740819613642987e-05, "loss": 0.0628, "num_input_tokens_seen": 5232848, "step": 5850 }, { "epoch": 1.5453345651313186, "grad_norm": 0.055948033928871155, "learning_rate": 4.74038414187476e-05, "loss": 0.0292, "num_input_tokens_seen": 5237296, "step": 5855 }, { "epoch": 1.5466543486868154, "grad_norm": 0.3357619643211365, "learning_rate": 4.739948324611144e-05, "loss": 0.0703, "num_input_tokens_seen": 5241744, "step": 5860 }, { "epoch": 1.5479741322423122, "grad_norm": 0.26810574531555176, "learning_rate": 4.7395121619193465e-05, "loss": 0.0762, "num_input_tokens_seen": 5246352, "step": 5865 }, { "epoch": 1.5492939157978092, "grad_norm": 0.20834662020206451, "learning_rate": 4.7390756538666313e-05, "loss": 0.1291, "num_input_tokens_seen": 5250608, "step": 5870 }, { "epoch": 1.5506136993533062, "grad_norm": 0.26430609822273254, "learning_rate": 4.738638800520311e-05, "loss": 0.0611, "num_input_tokens_seen": 5254864, "step": 5875 }, { "epoch": 1.551933482908803, "grad_norm": 0.34068745374679565, "learning_rate": 4.738201601947757e-05, "loss": 0.1136, "num_input_tokens_seen": 5259024, "step": 5880 }, { "epoch": 1.5532532664642997, "grad_norm": 0.3501797914505005, "learning_rate": 4.7377640582163876e-05, "loss": 0.0324, "num_input_tokens_seen": 5263312, "step": 5885 }, { "epoch": 1.5545730500197967, "grad_norm": 0.20551666617393494, "learning_rate": 4.7373261693936786e-05, "loss": 0.0449, "num_input_tokens_seen": 5268016, "step": 5890 }, { "epoch": 1.5558928335752937, "grad_norm": 0.2281952202320099, "learning_rate": 4.7368879355471595e-05, "loss": 0.052, "num_input_tokens_seen": 5272432, "step": 5895 }, { "epoch": 1.5572126171307905, "grad_norm": 0.15988720953464508, "learning_rate": 4.736449356744409e-05, "loss": 0.0639, "num_input_tokens_seen": 5276944, "step": 5900 }, { "epoch": 1.5585324006862873, "grad_norm": 0.292346715927124, "learning_rate": 4.736010433053064e-05, "loss": 0.102, "num_input_tokens_seen": 5281360, "step": 5905 }, { "epoch": 1.5598521842417843, "grad_norm": 0.4188351035118103, "learning_rate": 4.73557116454081e-05, "loss": 0.0921, "num_input_tokens_seen": 5285776, "step": 5910 }, { "epoch": 1.5611719677972813, "grad_norm": 0.2938206195831299, "learning_rate": 4.735131551275389e-05, "loss": 0.109, "num_input_tokens_seen": 5290224, "step": 5915 }, { "epoch": 1.5624917513527783, "grad_norm": 0.3782735764980316, "learning_rate": 4.734691593324594e-05, "loss": 0.1301, "num_input_tokens_seen": 5294448, "step": 5920 }, { "epoch": 1.563811534908275, "grad_norm": 0.25608259439468384, "learning_rate": 4.734251290756272e-05, "loss": 0.05, "num_input_tokens_seen": 5298928, "step": 5925 }, { "epoch": 1.5651313184637718, "grad_norm": 0.16561755537986755, "learning_rate": 4.7338106436383246e-05, "loss": 0.048, "num_input_tokens_seen": 5303280, "step": 5930 }, { "epoch": 1.5664511020192688, "grad_norm": 0.3275984227657318, "learning_rate": 4.733369652038703e-05, "loss": 0.0962, "num_input_tokens_seen": 5307888, "step": 5935 }, { "epoch": 1.5677708855747658, "grad_norm": 0.3632076382637024, "learning_rate": 4.7329283160254156e-05, "loss": 0.0716, "num_input_tokens_seen": 5312240, "step": 5940 }, { "epoch": 1.5690906691302626, "grad_norm": 0.24520328640937805, "learning_rate": 4.732486635666521e-05, "loss": 0.0858, "num_input_tokens_seen": 5317008, "step": 5945 }, { "epoch": 1.5704104526857594, "grad_norm": 0.5132139325141907, "learning_rate": 4.732044611030132e-05, "loss": 0.0583, "num_input_tokens_seen": 5321712, "step": 5950 }, { "epoch": 1.5717302362412564, "grad_norm": 0.11438081413507462, "learning_rate": 4.731602242184414e-05, "loss": 0.0592, "num_input_tokens_seen": 5325968, "step": 5955 }, { "epoch": 1.5730500197967534, "grad_norm": 0.1153215691447258, "learning_rate": 4.7311595291975864e-05, "loss": 0.0693, "num_input_tokens_seen": 5330480, "step": 5960 }, { "epoch": 1.5743698033522502, "grad_norm": 0.1654602736234665, "learning_rate": 4.7307164721379216e-05, "loss": 0.0429, "num_input_tokens_seen": 5335184, "step": 5965 }, { "epoch": 1.575689586907747, "grad_norm": 0.06799542903900146, "learning_rate": 4.730273071073743e-05, "loss": 0.0775, "num_input_tokens_seen": 5339600, "step": 5970 }, { "epoch": 1.577009370463244, "grad_norm": 0.17956531047821045, "learning_rate": 4.729829326073429e-05, "loss": 0.0574, "num_input_tokens_seen": 5344336, "step": 5975 }, { "epoch": 1.578329154018741, "grad_norm": 0.17375290393829346, "learning_rate": 4.7293852372054126e-05, "loss": 0.0473, "num_input_tokens_seen": 5348688, "step": 5980 }, { "epoch": 1.579648937574238, "grad_norm": 0.0627119392156601, "learning_rate": 4.728940804538176e-05, "loss": 0.0576, "num_input_tokens_seen": 5353200, "step": 5985 }, { "epoch": 1.5809687211297347, "grad_norm": 0.11077764630317688, "learning_rate": 4.7284960281402556e-05, "loss": 0.0804, "num_input_tokens_seen": 5357616, "step": 5990 }, { "epoch": 1.5822885046852315, "grad_norm": 0.6453601717948914, "learning_rate": 4.728050908080244e-05, "loss": 0.0846, "num_input_tokens_seen": 5362288, "step": 5995 }, { "epoch": 1.5836082882407285, "grad_norm": 0.21794354915618896, "learning_rate": 4.727605444426782e-05, "loss": 0.0603, "num_input_tokens_seen": 5366384, "step": 6000 }, { "epoch": 1.5836082882407285, "eval_loss": 0.07824458926916122, "eval_runtime": 64.7806, "eval_samples_per_second": 103.966, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 5366384, "step": 6000 }, { "epoch": 1.5849280717962255, "grad_norm": 0.29802024364471436, "learning_rate": 4.727159637248567e-05, "loss": 0.0567, "num_input_tokens_seen": 5370576, "step": 6005 }, { "epoch": 1.5862478553517223, "grad_norm": 0.21445690095424652, "learning_rate": 4.7267134866143474e-05, "loss": 0.0558, "num_input_tokens_seen": 5375216, "step": 6010 }, { "epoch": 1.587567638907219, "grad_norm": 0.2629227638244629, "learning_rate": 4.726266992592926e-05, "loss": 0.0331, "num_input_tokens_seen": 5379568, "step": 6015 }, { "epoch": 1.588887422462716, "grad_norm": 0.2240363210439682, "learning_rate": 4.725820155253157e-05, "loss": 0.1439, "num_input_tokens_seen": 5383984, "step": 6020 }, { "epoch": 1.590207206018213, "grad_norm": 0.41963574290275574, "learning_rate": 4.725372974663948e-05, "loss": 0.0438, "num_input_tokens_seen": 5388528, "step": 6025 }, { "epoch": 1.5915269895737099, "grad_norm": 0.3622005879878998, "learning_rate": 4.724925450894262e-05, "loss": 0.0839, "num_input_tokens_seen": 5392784, "step": 6030 }, { "epoch": 1.5928467731292069, "grad_norm": 0.14770478010177612, "learning_rate": 4.72447758401311e-05, "loss": 0.0457, "num_input_tokens_seen": 5397488, "step": 6035 }, { "epoch": 1.5941665566847036, "grad_norm": 0.41996148228645325, "learning_rate": 4.7240293740895616e-05, "loss": 0.0977, "num_input_tokens_seen": 5401904, "step": 6040 }, { "epoch": 1.5954863402402006, "grad_norm": 0.18501776456832886, "learning_rate": 4.723580821192733e-05, "loss": 0.0705, "num_input_tokens_seen": 5406416, "step": 6045 }, { "epoch": 1.5968061237956976, "grad_norm": 0.2960822284221649, "learning_rate": 4.7231319253917996e-05, "loss": 0.0483, "num_input_tokens_seen": 5410864, "step": 6050 }, { "epoch": 1.5981259073511944, "grad_norm": 0.2859703600406647, "learning_rate": 4.722682686755986e-05, "loss": 0.1004, "num_input_tokens_seen": 5415312, "step": 6055 }, { "epoch": 1.5994456909066912, "grad_norm": 0.23860345780849457, "learning_rate": 4.722233105354569e-05, "loss": 0.0299, "num_input_tokens_seen": 5419824, "step": 6060 }, { "epoch": 1.6007654744621882, "grad_norm": 0.11686433106660843, "learning_rate": 4.7217831812568815e-05, "loss": 0.094, "num_input_tokens_seen": 5424208, "step": 6065 }, { "epoch": 1.6020852580176852, "grad_norm": 0.26035457849502563, "learning_rate": 4.721332914532307e-05, "loss": 0.0368, "num_input_tokens_seen": 5428816, "step": 6070 }, { "epoch": 1.603405041573182, "grad_norm": 0.15029872953891754, "learning_rate": 4.720882305250281e-05, "loss": 0.0942, "num_input_tokens_seen": 5433488, "step": 6075 }, { "epoch": 1.6047248251286788, "grad_norm": 0.06308599561452866, "learning_rate": 4.720431353480295e-05, "loss": 0.0574, "num_input_tokens_seen": 5437840, "step": 6080 }, { "epoch": 1.6060446086841758, "grad_norm": 0.27090874314308167, "learning_rate": 4.719980059291891e-05, "loss": 0.0603, "num_input_tokens_seen": 5442096, "step": 6085 }, { "epoch": 1.6073643922396728, "grad_norm": 0.45265886187553406, "learning_rate": 4.7195284227546634e-05, "loss": 0.0546, "num_input_tokens_seen": 5446384, "step": 6090 }, { "epoch": 1.6086841757951698, "grad_norm": 0.08372284471988678, "learning_rate": 4.7190764439382604e-05, "loss": 0.0475, "num_input_tokens_seen": 5450736, "step": 6095 }, { "epoch": 1.6100039593506665, "grad_norm": 0.046336691826581955, "learning_rate": 4.7186241229123826e-05, "loss": 0.0278, "num_input_tokens_seen": 5455248, "step": 6100 }, { "epoch": 1.6113237429061633, "grad_norm": 0.16240645945072174, "learning_rate": 4.718171459746785e-05, "loss": 0.0993, "num_input_tokens_seen": 5459440, "step": 6105 }, { "epoch": 1.6126435264616603, "grad_norm": 0.13885074853897095, "learning_rate": 4.717718454511273e-05, "loss": 0.098, "num_input_tokens_seen": 5464336, "step": 6110 }, { "epoch": 1.6139633100171573, "grad_norm": 0.10553596168756485, "learning_rate": 4.7172651072757056e-05, "loss": 0.0556, "num_input_tokens_seen": 5468656, "step": 6115 }, { "epoch": 1.615283093572654, "grad_norm": 0.12766800820827484, "learning_rate": 4.7168114181099945e-05, "loss": 0.0493, "num_input_tokens_seen": 5473136, "step": 6120 }, { "epoch": 1.6166028771281509, "grad_norm": 0.2556128203868866, "learning_rate": 4.716357387084105e-05, "loss": 0.0509, "num_input_tokens_seen": 5477168, "step": 6125 }, { "epoch": 1.6179226606836479, "grad_norm": 0.3456389904022217, "learning_rate": 4.715903014268054e-05, "loss": 0.0834, "num_input_tokens_seen": 5481776, "step": 6130 }, { "epoch": 1.6192424442391449, "grad_norm": 0.1864997148513794, "learning_rate": 4.715448299731911e-05, "loss": 0.0643, "num_input_tokens_seen": 5485968, "step": 6135 }, { "epoch": 1.6205622277946417, "grad_norm": 0.28685256838798523, "learning_rate": 4.7149932435457986e-05, "loss": 0.0686, "num_input_tokens_seen": 5490640, "step": 6140 }, { "epoch": 1.6218820113501384, "grad_norm": 0.3350217640399933, "learning_rate": 4.714537845779894e-05, "loss": 0.1057, "num_input_tokens_seen": 5495024, "step": 6145 }, { "epoch": 1.6232017949056354, "grad_norm": 0.2627733051776886, "learning_rate": 4.714082106504423e-05, "loss": 0.0878, "num_input_tokens_seen": 5499472, "step": 6150 }, { "epoch": 1.6245215784611324, "grad_norm": 0.07647845149040222, "learning_rate": 4.713626025789667e-05, "loss": 0.0549, "num_input_tokens_seen": 5504176, "step": 6155 }, { "epoch": 1.6258413620166294, "grad_norm": 0.1756701022386551, "learning_rate": 4.7131696037059606e-05, "loss": 0.0875, "num_input_tokens_seen": 5508656, "step": 6160 }, { "epoch": 1.6271611455721262, "grad_norm": 0.20765821635723114, "learning_rate": 4.712712840323689e-05, "loss": 0.12, "num_input_tokens_seen": 5512976, "step": 6165 }, { "epoch": 1.628480929127623, "grad_norm": 0.19779954850673676, "learning_rate": 4.71225573571329e-05, "loss": 0.0706, "num_input_tokens_seen": 5517616, "step": 6170 }, { "epoch": 1.62980071268312, "grad_norm": 0.28386735916137695, "learning_rate": 4.711798289945256e-05, "loss": 0.1184, "num_input_tokens_seen": 5522192, "step": 6175 }, { "epoch": 1.631120496238617, "grad_norm": 0.3031769096851349, "learning_rate": 4.71134050309013e-05, "loss": 0.1052, "num_input_tokens_seen": 5526768, "step": 6180 }, { "epoch": 1.6324402797941138, "grad_norm": 0.30241701006889343, "learning_rate": 4.710882375218509e-05, "loss": 0.0424, "num_input_tokens_seen": 5531152, "step": 6185 }, { "epoch": 1.6337600633496105, "grad_norm": 0.08450118452310562, "learning_rate": 4.7104239064010424e-05, "loss": 0.0457, "num_input_tokens_seen": 5535312, "step": 6190 }, { "epoch": 1.6350798469051075, "grad_norm": 0.18649108707904816, "learning_rate": 4.709965096708432e-05, "loss": 0.063, "num_input_tokens_seen": 5539760, "step": 6195 }, { "epoch": 1.6363996304606045, "grad_norm": 0.6834298372268677, "learning_rate": 4.709505946211431e-05, "loss": 0.1797, "num_input_tokens_seen": 5544176, "step": 6200 }, { "epoch": 1.6363996304606045, "eval_loss": 0.07735671103000641, "eval_runtime": 64.7177, "eval_samples_per_second": 104.067, "eval_steps_per_second": 26.021, "num_input_tokens_seen": 5544176, "step": 6200 }, { "epoch": 1.6377194140161013, "grad_norm": 0.10068556666374207, "learning_rate": 4.709046454980846e-05, "loss": 0.0704, "num_input_tokens_seen": 5548592, "step": 6205 }, { "epoch": 1.639039197571598, "grad_norm": 0.5610377788543701, "learning_rate": 4.708586623087538e-05, "loss": 0.1138, "num_input_tokens_seen": 5553072, "step": 6210 }, { "epoch": 1.640358981127095, "grad_norm": 0.07748523354530334, "learning_rate": 4.708126450602418e-05, "loss": 0.0602, "num_input_tokens_seen": 5557456, "step": 6215 }, { "epoch": 1.641678764682592, "grad_norm": 0.37537017464637756, "learning_rate": 4.7076659375964495e-05, "loss": 0.125, "num_input_tokens_seen": 5561712, "step": 6220 }, { "epoch": 1.642998548238089, "grad_norm": 0.6133891940116882, "learning_rate": 4.707205084140651e-05, "loss": 0.0615, "num_input_tokens_seen": 5566064, "step": 6225 }, { "epoch": 1.6443183317935859, "grad_norm": 0.14687713980674744, "learning_rate": 4.7067438903060904e-05, "loss": 0.0514, "num_input_tokens_seen": 5570704, "step": 6230 }, { "epoch": 1.6456381153490827, "grad_norm": 0.2580854892730713, "learning_rate": 4.70628235616389e-05, "loss": 0.0584, "num_input_tokens_seen": 5574928, "step": 6235 }, { "epoch": 1.6469578989045797, "grad_norm": 0.2071758210659027, "learning_rate": 4.7058204817852256e-05, "loss": 0.0687, "num_input_tokens_seen": 5579408, "step": 6240 }, { "epoch": 1.6482776824600767, "grad_norm": 0.323658287525177, "learning_rate": 4.705358267241322e-05, "loss": 0.0483, "num_input_tokens_seen": 5584176, "step": 6245 }, { "epoch": 1.6495974660155734, "grad_norm": 0.31353524327278137, "learning_rate": 4.704895712603459e-05, "loss": 0.0915, "num_input_tokens_seen": 5588688, "step": 6250 }, { "epoch": 1.6509172495710702, "grad_norm": 0.15379095077514648, "learning_rate": 4.704432817942969e-05, "loss": 0.0886, "num_input_tokens_seen": 5593264, "step": 6255 }, { "epoch": 1.6522370331265672, "grad_norm": 0.13582763075828552, "learning_rate": 4.703969583331236e-05, "loss": 0.0733, "num_input_tokens_seen": 5597840, "step": 6260 }, { "epoch": 1.6535568166820642, "grad_norm": 0.5205592513084412, "learning_rate": 4.7035060088396965e-05, "loss": 0.0891, "num_input_tokens_seen": 5602512, "step": 6265 }, { "epoch": 1.654876600237561, "grad_norm": 0.2897705137729645, "learning_rate": 4.703042094539839e-05, "loss": 0.0798, "num_input_tokens_seen": 5607088, "step": 6270 }, { "epoch": 1.6561963837930578, "grad_norm": 0.21647261083126068, "learning_rate": 4.702577840503206e-05, "loss": 0.0493, "num_input_tokens_seen": 5611600, "step": 6275 }, { "epoch": 1.6575161673485548, "grad_norm": 0.15820325911045074, "learning_rate": 4.70211324680139e-05, "loss": 0.0875, "num_input_tokens_seen": 5616304, "step": 6280 }, { "epoch": 1.6588359509040518, "grad_norm": 0.14594845473766327, "learning_rate": 4.7016483135060386e-05, "loss": 0.0422, "num_input_tokens_seen": 5620688, "step": 6285 }, { "epoch": 1.6601557344595488, "grad_norm": 0.21176597476005554, "learning_rate": 4.701183040688849e-05, "loss": 0.1025, "num_input_tokens_seen": 5625520, "step": 6290 }, { "epoch": 1.6614755180150456, "grad_norm": 0.24600651860237122, "learning_rate": 4.700717428421573e-05, "loss": 0.0587, "num_input_tokens_seen": 5630032, "step": 6295 }, { "epoch": 1.6627953015705423, "grad_norm": 0.12306217849254608, "learning_rate": 4.700251476776014e-05, "loss": 0.0574, "num_input_tokens_seen": 5634448, "step": 6300 }, { "epoch": 1.6641150851260393, "grad_norm": 0.29385313391685486, "learning_rate": 4.699785185824026e-05, "loss": 0.1126, "num_input_tokens_seen": 5639024, "step": 6305 }, { "epoch": 1.6654348686815363, "grad_norm": 0.1173398345708847, "learning_rate": 4.699318555637519e-05, "loss": 0.0562, "num_input_tokens_seen": 5643504, "step": 6310 }, { "epoch": 1.6667546522370331, "grad_norm": 0.0948636531829834, "learning_rate": 4.6988515862884525e-05, "loss": 0.0891, "num_input_tokens_seen": 5648048, "step": 6315 }, { "epoch": 1.66807443579253, "grad_norm": 0.28425514698028564, "learning_rate": 4.698384277848838e-05, "loss": 0.0972, "num_input_tokens_seen": 5652368, "step": 6320 }, { "epoch": 1.669394219348027, "grad_norm": 0.17787136137485504, "learning_rate": 4.6979166303907425e-05, "loss": 0.0966, "num_input_tokens_seen": 5656880, "step": 6325 }, { "epoch": 1.670714002903524, "grad_norm": 0.4983280599117279, "learning_rate": 4.697448643986281e-05, "loss": 0.1113, "num_input_tokens_seen": 5661392, "step": 6330 }, { "epoch": 1.6720337864590207, "grad_norm": 0.07861651480197906, "learning_rate": 4.696980318707624e-05, "loss": 0.0259, "num_input_tokens_seen": 5665808, "step": 6335 }, { "epoch": 1.6733535700145175, "grad_norm": 0.23327556252479553, "learning_rate": 4.6965116546269924e-05, "loss": 0.0605, "num_input_tokens_seen": 5670384, "step": 6340 }, { "epoch": 1.6746733535700145, "grad_norm": 0.3305080533027649, "learning_rate": 4.6960426518166615e-05, "loss": 0.066, "num_input_tokens_seen": 5674736, "step": 6345 }, { "epoch": 1.6759931371255115, "grad_norm": 0.13714560866355896, "learning_rate": 4.6955733103489556e-05, "loss": 0.0579, "num_input_tokens_seen": 5679088, "step": 6350 }, { "epoch": 1.6773129206810085, "grad_norm": 0.3895653486251831, "learning_rate": 4.695103630296255e-05, "loss": 0.0974, "num_input_tokens_seen": 5683536, "step": 6355 }, { "epoch": 1.6786327042365052, "grad_norm": 0.34837237000465393, "learning_rate": 4.694633611730988e-05, "loss": 0.04, "num_input_tokens_seen": 5687856, "step": 6360 }, { "epoch": 1.679952487792002, "grad_norm": 0.23766452074050903, "learning_rate": 4.694163254725639e-05, "loss": 0.0599, "num_input_tokens_seen": 5692496, "step": 6365 }, { "epoch": 1.681272271347499, "grad_norm": 0.08897238224744797, "learning_rate": 4.693692559352743e-05, "loss": 0.0903, "num_input_tokens_seen": 5696688, "step": 6370 }, { "epoch": 1.682592054902996, "grad_norm": 0.17052972316741943, "learning_rate": 4.693221525684886e-05, "loss": 0.0432, "num_input_tokens_seen": 5701136, "step": 6375 }, { "epoch": 1.6839118384584928, "grad_norm": 0.08918283879756927, "learning_rate": 4.6927501537947084e-05, "loss": 0.0443, "num_input_tokens_seen": 5705328, "step": 6380 }, { "epoch": 1.6852316220139896, "grad_norm": 0.06502197682857513, "learning_rate": 4.692278443754901e-05, "loss": 0.0938, "num_input_tokens_seen": 5709680, "step": 6385 }, { "epoch": 1.6865514055694866, "grad_norm": 0.17533433437347412, "learning_rate": 4.691806395638208e-05, "loss": 0.0624, "num_input_tokens_seen": 5714128, "step": 6390 }, { "epoch": 1.6878711891249836, "grad_norm": 0.13499194383621216, "learning_rate": 4.6913340095174255e-05, "loss": 0.0791, "num_input_tokens_seen": 5718640, "step": 6395 }, { "epoch": 1.6891909726804804, "grad_norm": 0.37257638573646545, "learning_rate": 4.690861285465399e-05, "loss": 0.0652, "num_input_tokens_seen": 5723216, "step": 6400 }, { "epoch": 1.6891909726804804, "eval_loss": 0.07763779163360596, "eval_runtime": 64.718, "eval_samples_per_second": 104.067, "eval_steps_per_second": 26.021, "num_input_tokens_seen": 5723216, "step": 6400 }, { "epoch": 1.6905107562359774, "grad_norm": 0.45880937576293945, "learning_rate": 4.690388223555031e-05, "loss": 0.1096, "num_input_tokens_seen": 5727984, "step": 6405 }, { "epoch": 1.6918305397914741, "grad_norm": 0.40928998589515686, "learning_rate": 4.689914823859273e-05, "loss": 0.0658, "num_input_tokens_seen": 5732816, "step": 6410 }, { "epoch": 1.6931503233469711, "grad_norm": 0.04590551182627678, "learning_rate": 4.689441086451129e-05, "loss": 0.0467, "num_input_tokens_seen": 5737424, "step": 6415 }, { "epoch": 1.6944701069024681, "grad_norm": 0.15273034572601318, "learning_rate": 4.688967011403655e-05, "loss": 0.0636, "num_input_tokens_seen": 5741616, "step": 6420 }, { "epoch": 1.695789890457965, "grad_norm": 0.12678445875644684, "learning_rate": 4.68849259878996e-05, "loss": 0.0527, "num_input_tokens_seen": 5746448, "step": 6425 }, { "epoch": 1.6971096740134617, "grad_norm": 0.18746735155582428, "learning_rate": 4.6880178486832036e-05, "loss": 0.0678, "num_input_tokens_seen": 5750768, "step": 6430 }, { "epoch": 1.6984294575689587, "grad_norm": 0.09261399507522583, "learning_rate": 4.687542761156598e-05, "loss": 0.0241, "num_input_tokens_seen": 5755088, "step": 6435 }, { "epoch": 1.6997492411244557, "grad_norm": 0.21981015801429749, "learning_rate": 4.6870673362834096e-05, "loss": 0.0882, "num_input_tokens_seen": 5759408, "step": 6440 }, { "epoch": 1.7010690246799525, "grad_norm": 0.21868005394935608, "learning_rate": 4.6865915741369526e-05, "loss": 0.0423, "num_input_tokens_seen": 5763824, "step": 6445 }, { "epoch": 1.7023888082354492, "grad_norm": 0.16330015659332275, "learning_rate": 4.686115474790597e-05, "loss": 0.0523, "num_input_tokens_seen": 5768240, "step": 6450 }, { "epoch": 1.7037085917909462, "grad_norm": 0.09738574922084808, "learning_rate": 4.685639038317762e-05, "loss": 0.0812, "num_input_tokens_seen": 5772784, "step": 6455 }, { "epoch": 1.7050283753464432, "grad_norm": 0.04816265404224396, "learning_rate": 4.685162264791921e-05, "loss": 0.0399, "num_input_tokens_seen": 5777200, "step": 6460 }, { "epoch": 1.70634815890194, "grad_norm": 0.037401869893074036, "learning_rate": 4.684685154286599e-05, "loss": 0.0399, "num_input_tokens_seen": 5781968, "step": 6465 }, { "epoch": 1.707667942457437, "grad_norm": 0.407725989818573, "learning_rate": 4.684207706875371e-05, "loss": 0.1491, "num_input_tokens_seen": 5786704, "step": 6470 }, { "epoch": 1.7089877260129338, "grad_norm": 0.5893194675445557, "learning_rate": 4.683729922631866e-05, "loss": 0.1009, "num_input_tokens_seen": 5791216, "step": 6475 }, { "epoch": 1.7103075095684308, "grad_norm": 0.13526876270771027, "learning_rate": 4.683251801629765e-05, "loss": 0.0367, "num_input_tokens_seen": 5795376, "step": 6480 }, { "epoch": 1.7116272931239278, "grad_norm": 0.14884258806705475, "learning_rate": 4.6827733439428e-05, "loss": 0.0475, "num_input_tokens_seen": 5799664, "step": 6485 }, { "epoch": 1.7129470766794246, "grad_norm": 0.3277275860309601, "learning_rate": 4.682294549644754e-05, "loss": 0.1186, "num_input_tokens_seen": 5804080, "step": 6490 }, { "epoch": 1.7142668602349214, "grad_norm": 0.27445316314697266, "learning_rate": 4.681815418809464e-05, "loss": 0.1054, "num_input_tokens_seen": 5808592, "step": 6495 }, { "epoch": 1.7155866437904184, "grad_norm": 0.37290483713150024, "learning_rate": 4.681335951510819e-05, "loss": 0.053, "num_input_tokens_seen": 5813072, "step": 6500 }, { "epoch": 1.7169064273459154, "grad_norm": 0.35742345452308655, "learning_rate": 4.6808561478227576e-05, "loss": 0.0821, "num_input_tokens_seen": 5817648, "step": 6505 }, { "epoch": 1.7182262109014121, "grad_norm": 0.18662406504154205, "learning_rate": 4.680376007819271e-05, "loss": 0.0398, "num_input_tokens_seen": 5822320, "step": 6510 }, { "epoch": 1.719545994456909, "grad_norm": 0.1383635401725769, "learning_rate": 4.679895531574405e-05, "loss": 0.0191, "num_input_tokens_seen": 5826512, "step": 6515 }, { "epoch": 1.720865778012406, "grad_norm": 0.19470007717609406, "learning_rate": 4.679414719162253e-05, "loss": 0.0435, "num_input_tokens_seen": 5831088, "step": 6520 }, { "epoch": 1.722185561567903, "grad_norm": 0.15924309194087982, "learning_rate": 4.6789335706569635e-05, "loss": 0.1296, "num_input_tokens_seen": 5835344, "step": 6525 }, { "epoch": 1.7235053451234, "grad_norm": 0.1498899608850479, "learning_rate": 4.678452086132734e-05, "loss": 0.1469, "num_input_tokens_seen": 5839728, "step": 6530 }, { "epoch": 1.7248251286788967, "grad_norm": 0.1980186104774475, "learning_rate": 4.677970265663818e-05, "loss": 0.0566, "num_input_tokens_seen": 5844176, "step": 6535 }, { "epoch": 1.7261449122343935, "grad_norm": 0.08850986510515213, "learning_rate": 4.677488109324517e-05, "loss": 0.0749, "num_input_tokens_seen": 5848880, "step": 6540 }, { "epoch": 1.7274646957898905, "grad_norm": 0.5354941487312317, "learning_rate": 4.6770056171891846e-05, "loss": 0.1066, "num_input_tokens_seen": 5853136, "step": 6545 }, { "epoch": 1.7287844793453875, "grad_norm": 0.17719906568527222, "learning_rate": 4.6765227893322286e-05, "loss": 0.0659, "num_input_tokens_seen": 5857392, "step": 6550 }, { "epoch": 1.7301042629008843, "grad_norm": 0.12624286115169525, "learning_rate": 4.676039625828107e-05, "loss": 0.0788, "num_input_tokens_seen": 5862000, "step": 6555 }, { "epoch": 1.731424046456381, "grad_norm": 0.07651223987340927, "learning_rate": 4.675556126751328e-05, "loss": 0.0593, "num_input_tokens_seen": 5866672, "step": 6560 }, { "epoch": 1.732743830011878, "grad_norm": 0.35980457067489624, "learning_rate": 4.6750722921764556e-05, "loss": 0.0749, "num_input_tokens_seen": 5871152, "step": 6565 }, { "epoch": 1.734063613567375, "grad_norm": 0.19881732761859894, "learning_rate": 4.674588122178102e-05, "loss": 0.0396, "num_input_tokens_seen": 5875536, "step": 6570 }, { "epoch": 1.7353833971228718, "grad_norm": 0.10140056908130646, "learning_rate": 4.674103616830931e-05, "loss": 0.047, "num_input_tokens_seen": 5880048, "step": 6575 }, { "epoch": 1.7367031806783686, "grad_norm": 0.3566274046897888, "learning_rate": 4.673618776209663e-05, "loss": 0.0466, "num_input_tokens_seen": 5884752, "step": 6580 }, { "epoch": 1.7380229642338656, "grad_norm": 0.3141302466392517, "learning_rate": 4.673133600389063e-05, "loss": 0.0737, "num_input_tokens_seen": 5889008, "step": 6585 }, { "epoch": 1.7393427477893626, "grad_norm": 0.1597874015569687, "learning_rate": 4.672648089443953e-05, "loss": 0.0521, "num_input_tokens_seen": 5893456, "step": 6590 }, { "epoch": 1.7406625313448596, "grad_norm": 0.23406513035297394, "learning_rate": 4.672162243449204e-05, "loss": 0.0862, "num_input_tokens_seen": 5898288, "step": 6595 }, { "epoch": 1.7419823149003564, "grad_norm": 0.2751595079898834, "learning_rate": 4.67167606247974e-05, "loss": 0.1077, "num_input_tokens_seen": 5902896, "step": 6600 }, { "epoch": 1.7419823149003564, "eval_loss": 0.07670366764068604, "eval_runtime": 64.778, "eval_samples_per_second": 103.97, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 5902896, "step": 6600 }, { "epoch": 1.7433020984558532, "grad_norm": 0.06079903244972229, "learning_rate": 4.671189546610536e-05, "loss": 0.0289, "num_input_tokens_seen": 5907536, "step": 6605 }, { "epoch": 1.7446218820113502, "grad_norm": 0.28400281071662903, "learning_rate": 4.67070269591662e-05, "loss": 0.0367, "num_input_tokens_seen": 5912048, "step": 6610 }, { "epoch": 1.7459416655668472, "grad_norm": 0.1595672369003296, "learning_rate": 4.670215510473068e-05, "loss": 0.0495, "num_input_tokens_seen": 5916496, "step": 6615 }, { "epoch": 1.747261449122344, "grad_norm": 0.10201507806777954, "learning_rate": 4.669727990355013e-05, "loss": 0.051, "num_input_tokens_seen": 5920784, "step": 6620 }, { "epoch": 1.7485812326778407, "grad_norm": 0.2828303277492523, "learning_rate": 4.669240135637635e-05, "loss": 0.0916, "num_input_tokens_seen": 5925680, "step": 6625 }, { "epoch": 1.7499010162333377, "grad_norm": 0.06887925416231155, "learning_rate": 4.6687519463961675e-05, "loss": 0.1076, "num_input_tokens_seen": 5930064, "step": 6630 }, { "epoch": 1.7512207997888347, "grad_norm": 0.22175997495651245, "learning_rate": 4.668263422705896e-05, "loss": 0.0671, "num_input_tokens_seen": 5934352, "step": 6635 }, { "epoch": 1.7525405833443315, "grad_norm": 0.05469990894198418, "learning_rate": 4.667774564642156e-05, "loss": 0.0565, "num_input_tokens_seen": 5938928, "step": 6640 }, { "epoch": 1.7538603668998283, "grad_norm": 0.06107835844159126, "learning_rate": 4.6672853722803365e-05, "loss": 0.0536, "num_input_tokens_seen": 5943760, "step": 6645 }, { "epoch": 1.7551801504553253, "grad_norm": 0.20193369686603546, "learning_rate": 4.666795845695877e-05, "loss": 0.0728, "num_input_tokens_seen": 5948496, "step": 6650 }, { "epoch": 1.7564999340108223, "grad_norm": 0.18798863887786865, "learning_rate": 4.666305984964269e-05, "loss": 0.0385, "num_input_tokens_seen": 5952624, "step": 6655 }, { "epoch": 1.7578197175663193, "grad_norm": 0.22653409838676453, "learning_rate": 4.6658157901610535e-05, "loss": 0.0441, "num_input_tokens_seen": 5957040, "step": 6660 }, { "epoch": 1.759139501121816, "grad_norm": 0.09003376215696335, "learning_rate": 4.665325261361826e-05, "loss": 0.0782, "num_input_tokens_seen": 5961328, "step": 6665 }, { "epoch": 1.7604592846773128, "grad_norm": 0.3066350817680359, "learning_rate": 4.664834398642232e-05, "loss": 0.092, "num_input_tokens_seen": 5966032, "step": 6670 }, { "epoch": 1.7617790682328098, "grad_norm": 0.9687243103981018, "learning_rate": 4.6643432020779686e-05, "loss": 0.036, "num_input_tokens_seen": 5970000, "step": 6675 }, { "epoch": 1.7630988517883068, "grad_norm": 0.4614083468914032, "learning_rate": 4.663851671744786e-05, "loss": 0.0614, "num_input_tokens_seen": 5974256, "step": 6680 }, { "epoch": 1.7644186353438036, "grad_norm": 0.034824226051568985, "learning_rate": 4.6633598077184815e-05, "loss": 0.0929, "num_input_tokens_seen": 5978384, "step": 6685 }, { "epoch": 1.7657384188993004, "grad_norm": 0.09816955029964447, "learning_rate": 4.662867610074908e-05, "loss": 0.0775, "num_input_tokens_seen": 5982896, "step": 6690 }, { "epoch": 1.7670582024547974, "grad_norm": 0.037977058440446854, "learning_rate": 4.6623750788899696e-05, "loss": 0.0671, "num_input_tokens_seen": 5987152, "step": 6695 }, { "epoch": 1.7683779860102944, "grad_norm": 0.14718440175056458, "learning_rate": 4.6618822142396195e-05, "loss": 0.0624, "num_input_tokens_seen": 5991632, "step": 6700 }, { "epoch": 1.7696977695657912, "grad_norm": 0.2078886479139328, "learning_rate": 4.661389016199864e-05, "loss": 0.0619, "num_input_tokens_seen": 5996208, "step": 6705 }, { "epoch": 1.771017553121288, "grad_norm": 0.3250770568847656, "learning_rate": 4.660895484846761e-05, "loss": 0.0644, "num_input_tokens_seen": 6000912, "step": 6710 }, { "epoch": 1.772337336676785, "grad_norm": 0.23032908141613007, "learning_rate": 4.660401620256418e-05, "loss": 0.0597, "num_input_tokens_seen": 6004848, "step": 6715 }, { "epoch": 1.773657120232282, "grad_norm": 0.23126226663589478, "learning_rate": 4.659907422504997e-05, "loss": 0.0656, "num_input_tokens_seen": 6009584, "step": 6720 }, { "epoch": 1.774976903787779, "grad_norm": 0.08596619963645935, "learning_rate": 4.6594128916687074e-05, "loss": 0.0352, "num_input_tokens_seen": 6013936, "step": 6725 }, { "epoch": 1.7762966873432757, "grad_norm": 0.08057559281587601, "learning_rate": 4.658918027823813e-05, "loss": 0.048, "num_input_tokens_seen": 6018160, "step": 6730 }, { "epoch": 1.7776164708987725, "grad_norm": 0.04312612861394882, "learning_rate": 4.658422831046628e-05, "loss": 0.0708, "num_input_tokens_seen": 6022896, "step": 6735 }, { "epoch": 1.7789362544542695, "grad_norm": 0.2413734644651413, "learning_rate": 4.657927301413518e-05, "loss": 0.0602, "num_input_tokens_seen": 6027536, "step": 6740 }, { "epoch": 1.7802560380097665, "grad_norm": 0.05529560148715973, "learning_rate": 4.657431439000901e-05, "loss": 0.0921, "num_input_tokens_seen": 6031952, "step": 6745 }, { "epoch": 1.7815758215652633, "grad_norm": 0.0844552218914032, "learning_rate": 4.656935243885243e-05, "loss": 0.0733, "num_input_tokens_seen": 6036656, "step": 6750 }, { "epoch": 1.78289560512076, "grad_norm": 0.28549104928970337, "learning_rate": 4.656438716143066e-05, "loss": 0.085, "num_input_tokens_seen": 6041136, "step": 6755 }, { "epoch": 1.784215388676257, "grad_norm": 0.15741398930549622, "learning_rate": 4.6559418558509384e-05, "loss": 0.0352, "num_input_tokens_seen": 6045872, "step": 6760 }, { "epoch": 1.785535172231754, "grad_norm": 0.28424903750419617, "learning_rate": 4.6554446630854833e-05, "loss": 0.0827, "num_input_tokens_seen": 6050384, "step": 6765 }, { "epoch": 1.7868549557872508, "grad_norm": 0.09712236374616623, "learning_rate": 4.654947137923374e-05, "loss": 0.0526, "num_input_tokens_seen": 6054896, "step": 6770 }, { "epoch": 1.7881747393427478, "grad_norm": 0.40514037013053894, "learning_rate": 4.654449280441335e-05, "loss": 0.1001, "num_input_tokens_seen": 6059440, "step": 6775 }, { "epoch": 1.7894945228982446, "grad_norm": 0.5149907469749451, "learning_rate": 4.653951090716143e-05, "loss": 0.0898, "num_input_tokens_seen": 6063664, "step": 6780 }, { "epoch": 1.7908143064537416, "grad_norm": 0.21506710350513458, "learning_rate": 4.653452568824625e-05, "loss": 0.0557, "num_input_tokens_seen": 6068208, "step": 6785 }, { "epoch": 1.7921340900092386, "grad_norm": 0.4691607356071472, "learning_rate": 4.6529537148436585e-05, "loss": 0.1057, "num_input_tokens_seen": 6072656, "step": 6790 }, { "epoch": 1.7934538735647354, "grad_norm": 0.05693565309047699, "learning_rate": 4.6524545288501734e-05, "loss": 0.1345, "num_input_tokens_seen": 6076784, "step": 6795 }, { "epoch": 1.7947736571202322, "grad_norm": 0.3669026792049408, "learning_rate": 4.6519550109211506e-05, "loss": 0.0641, "num_input_tokens_seen": 6081040, "step": 6800 }, { "epoch": 1.7947736571202322, "eval_loss": 0.07645101845264435, "eval_runtime": 64.7837, "eval_samples_per_second": 103.961, "eval_steps_per_second": 25.994, "num_input_tokens_seen": 6081040, "step": 6800 }, { "epoch": 1.7960934406757292, "grad_norm": 0.2074628472328186, "learning_rate": 4.651455161133622e-05, "loss": 0.0653, "num_input_tokens_seen": 6085680, "step": 6805 }, { "epoch": 1.7974132242312262, "grad_norm": 0.1918603479862213, "learning_rate": 4.6509549795646704e-05, "loss": 0.0449, "num_input_tokens_seen": 6090320, "step": 6810 }, { "epoch": 1.798733007786723, "grad_norm": 0.23296843469142914, "learning_rate": 4.6504544662914306e-05, "loss": 0.0568, "num_input_tokens_seen": 6094608, "step": 6815 }, { "epoch": 1.8000527913422197, "grad_norm": 0.15890564024448395, "learning_rate": 4.6499536213910876e-05, "loss": 0.0582, "num_input_tokens_seen": 6099280, "step": 6820 }, { "epoch": 1.8013725748977167, "grad_norm": 0.24076932668685913, "learning_rate": 4.6494524449408786e-05, "loss": 0.056, "num_input_tokens_seen": 6103888, "step": 6825 }, { "epoch": 1.8026923584532137, "grad_norm": 0.1924286186695099, "learning_rate": 4.6489509370180903e-05, "loss": 0.0296, "num_input_tokens_seen": 6108176, "step": 6830 }, { "epoch": 1.8040121420087105, "grad_norm": 0.14083442091941833, "learning_rate": 4.648449097700063e-05, "loss": 0.0614, "num_input_tokens_seen": 6112560, "step": 6835 }, { "epoch": 1.8053319255642075, "grad_norm": 0.0990385189652443, "learning_rate": 4.647946927064185e-05, "loss": 0.0684, "num_input_tokens_seen": 6117008, "step": 6840 }, { "epoch": 1.8066517091197043, "grad_norm": 0.2637682557106018, "learning_rate": 4.647444425187898e-05, "loss": 0.089, "num_input_tokens_seen": 6121264, "step": 6845 }, { "epoch": 1.8079714926752013, "grad_norm": 0.5847938656806946, "learning_rate": 4.646941592148695e-05, "loss": 0.0552, "num_input_tokens_seen": 6125744, "step": 6850 }, { "epoch": 1.8092912762306983, "grad_norm": 0.3137251138687134, "learning_rate": 4.646438428024117e-05, "loss": 0.1137, "num_input_tokens_seen": 6130480, "step": 6855 }, { "epoch": 1.810611059786195, "grad_norm": 0.24401280283927917, "learning_rate": 4.64593493289176e-05, "loss": 0.0263, "num_input_tokens_seen": 6134928, "step": 6860 }, { "epoch": 1.8119308433416919, "grad_norm": 0.21498775482177734, "learning_rate": 4.64543110682927e-05, "loss": 0.0916, "num_input_tokens_seen": 6139504, "step": 6865 }, { "epoch": 1.8132506268971889, "grad_norm": 0.47805729508399963, "learning_rate": 4.644926949914341e-05, "loss": 0.0592, "num_input_tokens_seen": 6143920, "step": 6870 }, { "epoch": 1.8145704104526859, "grad_norm": 0.11170867830514908, "learning_rate": 4.644422462224722e-05, "loss": 0.0434, "num_input_tokens_seen": 6148400, "step": 6875 }, { "epoch": 1.8158901940081826, "grad_norm": 0.11510006338357925, "learning_rate": 4.643917643838211e-05, "loss": 0.091, "num_input_tokens_seen": 6152720, "step": 6880 }, { "epoch": 1.8172099775636794, "grad_norm": 0.27835461497306824, "learning_rate": 4.6434124948326564e-05, "loss": 0.0787, "num_input_tokens_seen": 6157424, "step": 6885 }, { "epoch": 1.8185297611191764, "grad_norm": 0.36345091462135315, "learning_rate": 4.6429070152859594e-05, "loss": 0.1238, "num_input_tokens_seen": 6161840, "step": 6890 }, { "epoch": 1.8198495446746734, "grad_norm": 0.31958088278770447, "learning_rate": 4.6424012052760714e-05, "loss": 0.081, "num_input_tokens_seen": 6166352, "step": 6895 }, { "epoch": 1.8211693282301704, "grad_norm": 0.061690185219049454, "learning_rate": 4.6418950648809945e-05, "loss": 0.0728, "num_input_tokens_seen": 6170640, "step": 6900 }, { "epoch": 1.8224891117856672, "grad_norm": 0.15668371319770813, "learning_rate": 4.641388594178782e-05, "loss": 0.0666, "num_input_tokens_seen": 6175056, "step": 6905 }, { "epoch": 1.823808895341164, "grad_norm": 0.10498514026403427, "learning_rate": 4.640881793247538e-05, "loss": 0.0979, "num_input_tokens_seen": 6179472, "step": 6910 }, { "epoch": 1.825128678896661, "grad_norm": 0.32304131984710693, "learning_rate": 4.6403746621654173e-05, "loss": 0.0889, "num_input_tokens_seen": 6183664, "step": 6915 }, { "epoch": 1.826448462452158, "grad_norm": 0.06814830005168915, "learning_rate": 4.639867201010626e-05, "loss": 0.0544, "num_input_tokens_seen": 6188272, "step": 6920 }, { "epoch": 1.8277682460076548, "grad_norm": 0.4275568723678589, "learning_rate": 4.6393594098614204e-05, "loss": 0.0738, "num_input_tokens_seen": 6192336, "step": 6925 }, { "epoch": 1.8290880295631515, "grad_norm": 0.39397117495536804, "learning_rate": 4.63885128879611e-05, "loss": 0.0872, "num_input_tokens_seen": 6196624, "step": 6930 }, { "epoch": 1.8304078131186485, "grad_norm": 0.4087122976779938, "learning_rate": 4.638342837893052e-05, "loss": 0.0673, "num_input_tokens_seen": 6201424, "step": 6935 }, { "epoch": 1.8317275966741455, "grad_norm": 0.03938461095094681, "learning_rate": 4.6378340572306565e-05, "loss": 0.0348, "num_input_tokens_seen": 6205776, "step": 6940 }, { "epoch": 1.8330473802296423, "grad_norm": 0.17708829045295715, "learning_rate": 4.6373249468873833e-05, "loss": 0.0482, "num_input_tokens_seen": 6210416, "step": 6945 }, { "epoch": 1.834367163785139, "grad_norm": 0.23982881009578705, "learning_rate": 4.636815506941744e-05, "loss": 0.0521, "num_input_tokens_seen": 6214736, "step": 6950 }, { "epoch": 1.835686947340636, "grad_norm": 0.22591456770896912, "learning_rate": 4.6363057374723004e-05, "loss": 0.0775, "num_input_tokens_seen": 6219088, "step": 6955 }, { "epoch": 1.837006730896133, "grad_norm": 0.06749723851680756, "learning_rate": 4.635795638557666e-05, "loss": 0.0885, "num_input_tokens_seen": 6223568, "step": 6960 }, { "epoch": 1.83832651445163, "grad_norm": 0.039544425904750824, "learning_rate": 4.635285210276504e-05, "loss": 0.0342, "num_input_tokens_seen": 6227920, "step": 6965 }, { "epoch": 1.8396462980071269, "grad_norm": 0.20393970608711243, "learning_rate": 4.6347744527075295e-05, "loss": 0.0685, "num_input_tokens_seen": 6232144, "step": 6970 }, { "epoch": 1.8409660815626236, "grad_norm": 0.1472388356924057, "learning_rate": 4.634263365929506e-05, "loss": 0.0324, "num_input_tokens_seen": 6236624, "step": 6975 }, { "epoch": 1.8422858651181206, "grad_norm": 0.11189628392457962, "learning_rate": 4.6337519500212515e-05, "loss": 0.0247, "num_input_tokens_seen": 6241328, "step": 6980 }, { "epoch": 1.8436056486736176, "grad_norm": 0.25500476360321045, "learning_rate": 4.633240205061632e-05, "loss": 0.1124, "num_input_tokens_seen": 6245744, "step": 6985 }, { "epoch": 1.8449254322291144, "grad_norm": 0.1959000676870346, "learning_rate": 4.632728131129565e-05, "loss": 0.0771, "num_input_tokens_seen": 6250064, "step": 6990 }, { "epoch": 1.8462452157846112, "grad_norm": 0.12271429598331451, "learning_rate": 4.632215728304018e-05, "loss": 0.0404, "num_input_tokens_seen": 6254544, "step": 6995 }, { "epoch": 1.8475649993401082, "grad_norm": 0.38256895542144775, "learning_rate": 4.63170299666401e-05, "loss": 0.0756, "num_input_tokens_seen": 6259056, "step": 7000 }, { "epoch": 1.8475649993401082, "eval_loss": 0.0762862041592598, "eval_runtime": 64.7861, "eval_samples_per_second": 103.958, "eval_steps_per_second": 25.993, "num_input_tokens_seen": 6259056, "step": 7000 }, { "epoch": 1.8488847828956052, "grad_norm": 0.0857502892613411, "learning_rate": 4.631189936288612e-05, "loss": 0.1085, "num_input_tokens_seen": 6263632, "step": 7005 }, { "epoch": 1.850204566451102, "grad_norm": 0.13003812730312347, "learning_rate": 4.630676547256944e-05, "loss": 0.0876, "num_input_tokens_seen": 6268304, "step": 7010 }, { "epoch": 1.8515243500065988, "grad_norm": 0.014970618300139904, "learning_rate": 4.630162829648176e-05, "loss": 0.0554, "num_input_tokens_seen": 6272816, "step": 7015 }, { "epoch": 1.8528441335620958, "grad_norm": 0.13892458379268646, "learning_rate": 4.629648783541531e-05, "loss": 0.1056, "num_input_tokens_seen": 6277264, "step": 7020 }, { "epoch": 1.8541639171175928, "grad_norm": 0.1330774426460266, "learning_rate": 4.6291344090162804e-05, "loss": 0.0209, "num_input_tokens_seen": 6281552, "step": 7025 }, { "epoch": 1.8554837006730898, "grad_norm": 0.6068649291992188, "learning_rate": 4.628619706151748e-05, "loss": 0.0913, "num_input_tokens_seen": 6285712, "step": 7030 }, { "epoch": 1.8568034842285865, "grad_norm": 0.14965306222438812, "learning_rate": 4.628104675027306e-05, "loss": 0.0534, "num_input_tokens_seen": 6289840, "step": 7035 }, { "epoch": 1.8581232677840833, "grad_norm": 0.30804044008255005, "learning_rate": 4.6275893157223805e-05, "loss": 0.1643, "num_input_tokens_seen": 6294416, "step": 7040 }, { "epoch": 1.8594430513395803, "grad_norm": 0.11024698615074158, "learning_rate": 4.627073628316445e-05, "loss": 0.0571, "num_input_tokens_seen": 6298928, "step": 7045 }, { "epoch": 1.8607628348950773, "grad_norm": 0.3184722661972046, "learning_rate": 4.626557612889026e-05, "loss": 0.0867, "num_input_tokens_seen": 6303568, "step": 7050 }, { "epoch": 1.862082618450574, "grad_norm": 0.22575193643569946, "learning_rate": 4.626041269519699e-05, "loss": 0.0374, "num_input_tokens_seen": 6308048, "step": 7055 }, { "epoch": 1.8634024020060709, "grad_norm": 0.12449900805950165, "learning_rate": 4.6255245982880905e-05, "loss": 0.031, "num_input_tokens_seen": 6312720, "step": 7060 }, { "epoch": 1.8647221855615679, "grad_norm": 0.2679610848426819, "learning_rate": 4.625007599273879e-05, "loss": 0.1217, "num_input_tokens_seen": 6317296, "step": 7065 }, { "epoch": 1.8660419691170649, "grad_norm": 0.37533795833587646, "learning_rate": 4.6244902725567895e-05, "loss": 0.0827, "num_input_tokens_seen": 6321936, "step": 7070 }, { "epoch": 1.8673617526725617, "grad_norm": 0.22294484078884125, "learning_rate": 4.6239726182166024e-05, "loss": 0.0658, "num_input_tokens_seen": 6326640, "step": 7075 }, { "epoch": 1.8686815362280584, "grad_norm": 0.2395455688238144, "learning_rate": 4.623454636333147e-05, "loss": 0.075, "num_input_tokens_seen": 6331344, "step": 7080 }, { "epoch": 1.8700013197835554, "grad_norm": 0.3017948567867279, "learning_rate": 4.622936326986301e-05, "loss": 0.088, "num_input_tokens_seen": 6335600, "step": 7085 }, { "epoch": 1.8713211033390524, "grad_norm": 0.1621217131614685, "learning_rate": 4.6224176902559946e-05, "loss": 0.0766, "num_input_tokens_seen": 6339888, "step": 7090 }, { "epoch": 1.8726408868945494, "grad_norm": 0.14675092697143555, "learning_rate": 4.621898726222209e-05, "loss": 0.0466, "num_input_tokens_seen": 6344400, "step": 7095 }, { "epoch": 1.8739606704500462, "grad_norm": 0.33293139934539795, "learning_rate": 4.6213794349649744e-05, "loss": 0.1457, "num_input_tokens_seen": 6349008, "step": 7100 }, { "epoch": 1.875280454005543, "grad_norm": 0.4207361042499542, "learning_rate": 4.6208598165643715e-05, "loss": 0.0497, "num_input_tokens_seen": 6353072, "step": 7105 }, { "epoch": 1.87660023756104, "grad_norm": 0.28801441192626953, "learning_rate": 4.620339871100533e-05, "loss": 0.11, "num_input_tokens_seen": 6357968, "step": 7110 }, { "epoch": 1.877920021116537, "grad_norm": 0.21969226002693176, "learning_rate": 4.6198195986536394e-05, "loss": 0.0732, "num_input_tokens_seen": 6362736, "step": 7115 }, { "epoch": 1.8792398046720338, "grad_norm": 0.10012209415435791, "learning_rate": 4.619298999303926e-05, "loss": 0.0728, "num_input_tokens_seen": 6367216, "step": 7120 }, { "epoch": 1.8805595882275306, "grad_norm": 0.1143481582403183, "learning_rate": 4.618778073131673e-05, "loss": 0.0779, "num_input_tokens_seen": 6371696, "step": 7125 }, { "epoch": 1.8818793717830276, "grad_norm": 0.03170226514339447, "learning_rate": 4.618256820217215e-05, "loss": 0.0826, "num_input_tokens_seen": 6375888, "step": 7130 }, { "epoch": 1.8831991553385246, "grad_norm": 0.07537917792797089, "learning_rate": 4.617735240640936e-05, "loss": 0.0229, "num_input_tokens_seen": 6380176, "step": 7135 }, { "epoch": 1.8845189388940213, "grad_norm": 0.06016154587268829, "learning_rate": 4.6172133344832705e-05, "loss": 0.0323, "num_input_tokens_seen": 6384880, "step": 7140 }, { "epoch": 1.8858387224495183, "grad_norm": 0.17361405491828918, "learning_rate": 4.6166911018247004e-05, "loss": 0.0491, "num_input_tokens_seen": 6389296, "step": 7145 }, { "epoch": 1.8871585060050151, "grad_norm": 0.10163392871618271, "learning_rate": 4.616168542745764e-05, "loss": 0.0826, "num_input_tokens_seen": 6393712, "step": 7150 }, { "epoch": 1.8884782895605121, "grad_norm": 0.09090641140937805, "learning_rate": 4.6156456573270446e-05, "loss": 0.0435, "num_input_tokens_seen": 6398000, "step": 7155 }, { "epoch": 1.8897980731160091, "grad_norm": 0.12246877700090408, "learning_rate": 4.615122445649177e-05, "loss": 0.0424, "num_input_tokens_seen": 6402640, "step": 7160 }, { "epoch": 1.891117856671506, "grad_norm": 0.051915451884269714, "learning_rate": 4.6145989077928486e-05, "loss": 0.0613, "num_input_tokens_seen": 6407184, "step": 7165 }, { "epoch": 1.8924376402270027, "grad_norm": 0.08832618594169617, "learning_rate": 4.6140750438387953e-05, "loss": 0.0703, "num_input_tokens_seen": 6411568, "step": 7170 }, { "epoch": 1.8937574237824997, "grad_norm": 0.2811886966228485, "learning_rate": 4.613550853867803e-05, "loss": 0.0792, "num_input_tokens_seen": 6415760, "step": 7175 }, { "epoch": 1.8950772073379967, "grad_norm": 0.030251650139689445, "learning_rate": 4.613026337960708e-05, "loss": 0.0717, "num_input_tokens_seen": 6419952, "step": 7180 }, { "epoch": 1.8963969908934935, "grad_norm": 0.2344350516796112, "learning_rate": 4.612501496198398e-05, "loss": 0.0488, "num_input_tokens_seen": 6424592, "step": 7185 }, { "epoch": 1.8977167744489902, "grad_norm": 0.08863913267850876, "learning_rate": 4.61197632866181e-05, "loss": 0.0492, "num_input_tokens_seen": 6429040, "step": 7190 }, { "epoch": 1.8990365580044872, "grad_norm": 0.19203996658325195, "learning_rate": 4.611450835431931e-05, "loss": 0.0769, "num_input_tokens_seen": 6433552, "step": 7195 }, { "epoch": 1.9003563415599842, "grad_norm": 0.17268742620944977, "learning_rate": 4.6109250165898e-05, "loss": 0.0357, "num_input_tokens_seen": 6437904, "step": 7200 }, { "epoch": 1.9003563415599842, "eval_loss": 0.07598280161619186, "eval_runtime": 64.5813, "eval_samples_per_second": 104.287, "eval_steps_per_second": 26.076, "num_input_tokens_seen": 6437904, "step": 7200 }, { "epoch": 1.901676125115481, "grad_norm": 0.10872519016265869, "learning_rate": 4.610398872216503e-05, "loss": 0.0525, "num_input_tokens_seen": 6442672, "step": 7205 }, { "epoch": 1.902995908670978, "grad_norm": 0.28377804160118103, "learning_rate": 4.6098724023931796e-05, "loss": 0.078, "num_input_tokens_seen": 6447376, "step": 7210 }, { "epoch": 1.9043156922264748, "grad_norm": 0.31565606594085693, "learning_rate": 4.609345607201017e-05, "loss": 0.1399, "num_input_tokens_seen": 6451792, "step": 7215 }, { "epoch": 1.9056354757819718, "grad_norm": 0.25704529881477356, "learning_rate": 4.608818486721254e-05, "loss": 0.1148, "num_input_tokens_seen": 6456336, "step": 7220 }, { "epoch": 1.9069552593374688, "grad_norm": 0.047139327973127365, "learning_rate": 4.608291041035179e-05, "loss": 0.0437, "num_input_tokens_seen": 6460720, "step": 7225 }, { "epoch": 1.9082750428929656, "grad_norm": 0.06276936084032059, "learning_rate": 4.607763270224132e-05, "loss": 0.0508, "num_input_tokens_seen": 6465360, "step": 7230 }, { "epoch": 1.9095948264484623, "grad_norm": 0.1673480123281479, "learning_rate": 4.6072351743695e-05, "loss": 0.0258, "num_input_tokens_seen": 6469744, "step": 7235 }, { "epoch": 1.9109146100039593, "grad_norm": 0.07516109198331833, "learning_rate": 4.606706753552723e-05, "loss": 0.0731, "num_input_tokens_seen": 6474096, "step": 7240 }, { "epoch": 1.9122343935594563, "grad_norm": 0.06936344504356384, "learning_rate": 4.6061780078552906e-05, "loss": 0.1283, "num_input_tokens_seen": 6478576, "step": 7245 }, { "epoch": 1.9135541771149531, "grad_norm": 0.09838573634624481, "learning_rate": 4.605648937358742e-05, "loss": 0.0607, "num_input_tokens_seen": 6482928, "step": 7250 }, { "epoch": 1.91487396067045, "grad_norm": 0.09534476697444916, "learning_rate": 4.605119542144665e-05, "loss": 0.0559, "num_input_tokens_seen": 6487632, "step": 7255 }, { "epoch": 1.916193744225947, "grad_norm": 0.1923447996377945, "learning_rate": 4.604589822294701e-05, "loss": 0.0984, "num_input_tokens_seen": 6491952, "step": 7260 }, { "epoch": 1.917513527781444, "grad_norm": 0.041288550943136215, "learning_rate": 4.604059777890537e-05, "loss": 0.0359, "num_input_tokens_seen": 6496496, "step": 7265 }, { "epoch": 1.918833311336941, "grad_norm": 0.2149384319782257, "learning_rate": 4.6035294090139145e-05, "loss": 0.0827, "num_input_tokens_seen": 6500752, "step": 7270 }, { "epoch": 1.9201530948924377, "grad_norm": 0.1374235600233078, "learning_rate": 4.6029987157466226e-05, "loss": 0.0779, "num_input_tokens_seen": 6505008, "step": 7275 }, { "epoch": 1.9214728784479345, "grad_norm": 0.3556566536426544, "learning_rate": 4.602467698170502e-05, "loss": 0.0798, "num_input_tokens_seen": 6509392, "step": 7280 }, { "epoch": 1.9227926620034315, "grad_norm": 0.28320884704589844, "learning_rate": 4.601936356367439e-05, "loss": 0.0822, "num_input_tokens_seen": 6513936, "step": 7285 }, { "epoch": 1.9241124455589285, "grad_norm": 0.2886987626552582, "learning_rate": 4.601404690419377e-05, "loss": 0.0642, "num_input_tokens_seen": 6518256, "step": 7290 }, { "epoch": 1.9254322291144252, "grad_norm": 0.20514874160289764, "learning_rate": 4.600872700408303e-05, "loss": 0.0533, "num_input_tokens_seen": 6522704, "step": 7295 }, { "epoch": 1.926752012669922, "grad_norm": 0.3322639763355255, "learning_rate": 4.600340386416258e-05, "loss": 0.0485, "num_input_tokens_seen": 6527408, "step": 7300 }, { "epoch": 1.928071796225419, "grad_norm": 0.16277071833610535, "learning_rate": 4.5998077485253296e-05, "loss": 0.0457, "num_input_tokens_seen": 6531856, "step": 7305 }, { "epoch": 1.929391579780916, "grad_norm": 0.21233443915843964, "learning_rate": 4.59927478681766e-05, "loss": 0.088, "num_input_tokens_seen": 6536368, "step": 7310 }, { "epoch": 1.9307113633364128, "grad_norm": 0.36338627338409424, "learning_rate": 4.5987415013754366e-05, "loss": 0.0663, "num_input_tokens_seen": 6540688, "step": 7315 }, { "epoch": 1.9320311468919096, "grad_norm": 0.2929827868938446, "learning_rate": 4.598207892280899e-05, "loss": 0.0894, "num_input_tokens_seen": 6545200, "step": 7320 }, { "epoch": 1.9333509304474066, "grad_norm": 0.33678942918777466, "learning_rate": 4.597673959616337e-05, "loss": 0.079, "num_input_tokens_seen": 6549808, "step": 7325 }, { "epoch": 1.9346707140029036, "grad_norm": 0.09622807800769806, "learning_rate": 4.597139703464089e-05, "loss": 0.0452, "num_input_tokens_seen": 6553968, "step": 7330 }, { "epoch": 1.9359904975584006, "grad_norm": 0.1935141384601593, "learning_rate": 4.596605123906545e-05, "loss": 0.0758, "num_input_tokens_seen": 6558544, "step": 7335 }, { "epoch": 1.9373102811138974, "grad_norm": 0.30104488134384155, "learning_rate": 4.596070221026143e-05, "loss": 0.0974, "num_input_tokens_seen": 6562896, "step": 7340 }, { "epoch": 1.9386300646693941, "grad_norm": 0.44369760155677795, "learning_rate": 4.595534994905372e-05, "loss": 0.0785, "num_input_tokens_seen": 6567184, "step": 7345 }, { "epoch": 1.9399498482248911, "grad_norm": 0.4172000586986542, "learning_rate": 4.594999445626771e-05, "loss": 0.1166, "num_input_tokens_seen": 6571632, "step": 7350 }, { "epoch": 1.9412696317803881, "grad_norm": 0.16598327457904816, "learning_rate": 4.5944635732729276e-05, "loss": 0.0676, "num_input_tokens_seen": 6576208, "step": 7355 }, { "epoch": 1.942589415335885, "grad_norm": 0.4370569586753845, "learning_rate": 4.5939273779264804e-05, "loss": 0.1078, "num_input_tokens_seen": 6580944, "step": 7360 }, { "epoch": 1.9439091988913817, "grad_norm": 0.29714450240135193, "learning_rate": 4.593390859670118e-05, "loss": 0.0656, "num_input_tokens_seen": 6585168, "step": 7365 }, { "epoch": 1.9452289824468787, "grad_norm": 0.12176475673913956, "learning_rate": 4.5928540185865776e-05, "loss": 0.0451, "num_input_tokens_seen": 6589840, "step": 7370 }, { "epoch": 1.9465487660023757, "grad_norm": 0.15068376064300537, "learning_rate": 4.592316854758648e-05, "loss": 0.0672, "num_input_tokens_seen": 6594128, "step": 7375 }, { "epoch": 1.9478685495578725, "grad_norm": 0.09998653829097748, "learning_rate": 4.5917793682691646e-05, "loss": 0.0649, "num_input_tokens_seen": 6598640, "step": 7380 }, { "epoch": 1.9491883331133693, "grad_norm": 0.39897748827934265, "learning_rate": 4.5912415592010164e-05, "loss": 0.0591, "num_input_tokens_seen": 6603120, "step": 7385 }, { "epoch": 1.9505081166688663, "grad_norm": 0.25645193457603455, "learning_rate": 4.5907034276371386e-05, "loss": 0.0662, "num_input_tokens_seen": 6607632, "step": 7390 }, { "epoch": 1.9518279002243633, "grad_norm": 0.28610509634017944, "learning_rate": 4.5901649736605196e-05, "loss": 0.1295, "num_input_tokens_seen": 6611952, "step": 7395 }, { "epoch": 1.9531476837798603, "grad_norm": 0.6255216598510742, "learning_rate": 4.589626197354195e-05, "loss": 0.1075, "num_input_tokens_seen": 6616176, "step": 7400 }, { "epoch": 1.9531476837798603, "eval_loss": 0.07558133453130722, "eval_runtime": 64.7185, "eval_samples_per_second": 104.066, "eval_steps_per_second": 26.02, "num_input_tokens_seen": 6616176, "step": 7400 }, { "epoch": 1.954467467335357, "grad_norm": 0.08739921450614929, "learning_rate": 4.5890870988012504e-05, "loss": 0.1119, "num_input_tokens_seen": 6620592, "step": 7405 }, { "epoch": 1.9557872508908538, "grad_norm": 0.10449250042438507, "learning_rate": 4.5885476780848226e-05, "loss": 0.053, "num_input_tokens_seen": 6625136, "step": 7410 }, { "epoch": 1.9571070344463508, "grad_norm": 0.11688117682933807, "learning_rate": 4.5880079352880964e-05, "loss": 0.0819, "num_input_tokens_seen": 6629840, "step": 7415 }, { "epoch": 1.9584268180018478, "grad_norm": 0.20546916127204895, "learning_rate": 4.5874678704943065e-05, "loss": 0.0947, "num_input_tokens_seen": 6634512, "step": 7420 }, { "epoch": 1.9597466015573446, "grad_norm": 0.20570893585681915, "learning_rate": 4.5869274837867394e-05, "loss": 0.0402, "num_input_tokens_seen": 6638928, "step": 7425 }, { "epoch": 1.9610663851128414, "grad_norm": 0.2569142282009125, "learning_rate": 4.5863867752487275e-05, "loss": 0.0935, "num_input_tokens_seen": 6643472, "step": 7430 }, { "epoch": 1.9623861686683384, "grad_norm": 0.13570162653923035, "learning_rate": 4.5858457449636554e-05, "loss": 0.0843, "num_input_tokens_seen": 6648112, "step": 7435 }, { "epoch": 1.9637059522238354, "grad_norm": 0.16203832626342773, "learning_rate": 4.5853043930149574e-05, "loss": 0.0756, "num_input_tokens_seen": 6652496, "step": 7440 }, { "epoch": 1.9650257357793321, "grad_norm": 0.05498882755637169, "learning_rate": 4.584762719486117e-05, "loss": 0.0717, "num_input_tokens_seen": 6656656, "step": 7445 }, { "epoch": 1.966345519334829, "grad_norm": 0.16412782669067383, "learning_rate": 4.584220724460665e-05, "loss": 0.0756, "num_input_tokens_seen": 6661200, "step": 7450 }, { "epoch": 1.967665302890326, "grad_norm": 0.20030273497104645, "learning_rate": 4.5836784080221865e-05, "loss": 0.0403, "num_input_tokens_seen": 6665680, "step": 7455 }, { "epoch": 1.968985086445823, "grad_norm": 0.3757839798927307, "learning_rate": 4.583135770254312e-05, "loss": 0.0666, "num_input_tokens_seen": 6670128, "step": 7460 }, { "epoch": 1.97030487000132, "grad_norm": 0.24799558520317078, "learning_rate": 4.5825928112407236e-05, "loss": 0.0803, "num_input_tokens_seen": 6674352, "step": 7465 }, { "epoch": 1.9716246535568167, "grad_norm": 0.045685745775699615, "learning_rate": 4.582049531065152e-05, "loss": 0.0442, "num_input_tokens_seen": 6679088, "step": 7470 }, { "epoch": 1.9729444371123135, "grad_norm": 0.46948033571243286, "learning_rate": 4.5815059298113783e-05, "loss": 0.0952, "num_input_tokens_seen": 6683568, "step": 7475 }, { "epoch": 1.9742642206678105, "grad_norm": 0.12226312607526779, "learning_rate": 4.580962007563232e-05, "loss": 0.0272, "num_input_tokens_seen": 6688016, "step": 7480 }, { "epoch": 1.9755840042233075, "grad_norm": 0.09945758432149887, "learning_rate": 4.5804177644045935e-05, "loss": 0.0886, "num_input_tokens_seen": 6692240, "step": 7485 }, { "epoch": 1.9769037877788043, "grad_norm": 0.15116862952709198, "learning_rate": 4.579873200419391e-05, "loss": 0.0572, "num_input_tokens_seen": 6696592, "step": 7490 }, { "epoch": 1.978223571334301, "grad_norm": 0.1248839721083641, "learning_rate": 4.5793283156916046e-05, "loss": 0.0586, "num_input_tokens_seen": 6700784, "step": 7495 }, { "epoch": 1.979543354889798, "grad_norm": 0.4491346776485443, "learning_rate": 4.578783110305261e-05, "loss": 0.0732, "num_input_tokens_seen": 6705200, "step": 7500 }, { "epoch": 1.980863138445295, "grad_norm": 0.350924015045166, "learning_rate": 4.578237584344438e-05, "loss": 0.0822, "num_input_tokens_seen": 6709648, "step": 7505 }, { "epoch": 1.9821829220007918, "grad_norm": 0.21357512474060059, "learning_rate": 4.577691737893263e-05, "loss": 0.0383, "num_input_tokens_seen": 6714288, "step": 7510 }, { "epoch": 1.9835027055562886, "grad_norm": 0.389934778213501, "learning_rate": 4.577145571035912e-05, "loss": 0.1077, "num_input_tokens_seen": 6718896, "step": 7515 }, { "epoch": 1.9848224891117856, "grad_norm": 0.10635025054216385, "learning_rate": 4.576599083856611e-05, "loss": 0.0315, "num_input_tokens_seen": 6723248, "step": 7520 }, { "epoch": 1.9861422726672826, "grad_norm": 0.15491513907909393, "learning_rate": 4.576052276439635e-05, "loss": 0.0515, "num_input_tokens_seen": 6727856, "step": 7525 }, { "epoch": 1.9874620562227796, "grad_norm": 0.23030245304107666, "learning_rate": 4.575505148869308e-05, "loss": 0.0436, "num_input_tokens_seen": 6732496, "step": 7530 }, { "epoch": 1.9887818397782764, "grad_norm": 0.1251135915517807, "learning_rate": 4.574957701230006e-05, "loss": 0.1285, "num_input_tokens_seen": 6736976, "step": 7535 }, { "epoch": 1.9901016233337732, "grad_norm": 0.39714178442955017, "learning_rate": 4.57440993360615e-05, "loss": 0.0906, "num_input_tokens_seen": 6741456, "step": 7540 }, { "epoch": 1.9914214068892702, "grad_norm": 0.2053503841161728, "learning_rate": 4.5738618460822134e-05, "loss": 0.075, "num_input_tokens_seen": 6745776, "step": 7545 }, { "epoch": 1.9927411904447672, "grad_norm": 0.3128725588321686, "learning_rate": 4.573313438742719e-05, "loss": 0.1104, "num_input_tokens_seen": 6750256, "step": 7550 }, { "epoch": 1.994060974000264, "grad_norm": 0.08910363167524338, "learning_rate": 4.5727647116722374e-05, "loss": 0.0468, "num_input_tokens_seen": 6754704, "step": 7555 }, { "epoch": 1.9953807575557607, "grad_norm": 0.209807351231575, "learning_rate": 4.5722156649553884e-05, "loss": 0.0529, "num_input_tokens_seen": 6758896, "step": 7560 }, { "epoch": 1.9967005411112577, "grad_norm": 0.1922292411327362, "learning_rate": 4.571666298676843e-05, "loss": 0.0586, "num_input_tokens_seen": 6763248, "step": 7565 }, { "epoch": 1.9980203246667547, "grad_norm": 0.11855418980121613, "learning_rate": 4.571116612921321e-05, "loss": 0.1161, "num_input_tokens_seen": 6767472, "step": 7570 }, { "epoch": 1.9993401082222515, "grad_norm": 0.36413517594337463, "learning_rate": 4.57056660777359e-05, "loss": 0.0625, "num_input_tokens_seen": 6771952, "step": 7575 }, { "epoch": 2.0005279134221987, "grad_norm": 0.25645843148231506, "learning_rate": 4.5700162833184666e-05, "loss": 0.0627, "num_input_tokens_seen": 6775856, "step": 7580 }, { "epoch": 2.0018476969776957, "grad_norm": 0.4639410376548767, "learning_rate": 4.5694656396408195e-05, "loss": 0.0952, "num_input_tokens_seen": 6780080, "step": 7585 }, { "epoch": 2.0031674805331927, "grad_norm": 0.3608251214027405, "learning_rate": 4.5689146768255646e-05, "loss": 0.0649, "num_input_tokens_seen": 6784528, "step": 7590 }, { "epoch": 2.0044872640886893, "grad_norm": 0.07005508244037628, "learning_rate": 4.568363394957667e-05, "loss": 0.0382, "num_input_tokens_seen": 6789168, "step": 7595 }, { "epoch": 2.0058070476441863, "grad_norm": 0.060909345746040344, "learning_rate": 4.567811794122141e-05, "loss": 0.0756, "num_input_tokens_seen": 6793648, "step": 7600 }, { "epoch": 2.0058070476441863, "eval_loss": 0.07510706037282944, "eval_runtime": 64.7268, "eval_samples_per_second": 104.053, "eval_steps_per_second": 26.017, "num_input_tokens_seen": 6793648, "step": 7600 }, { "epoch": 2.0071268311996833, "grad_norm": 0.21742381155490875, "learning_rate": 4.56725987440405e-05, "loss": 0.0703, "num_input_tokens_seen": 6798160, "step": 7605 }, { "epoch": 2.0084466147551803, "grad_norm": 0.2879336476325989, "learning_rate": 4.566707635888508e-05, "loss": 0.0583, "num_input_tokens_seen": 6803024, "step": 7610 }, { "epoch": 2.009766398310677, "grad_norm": 0.21169108152389526, "learning_rate": 4.566155078660677e-05, "loss": 0.0593, "num_input_tokens_seen": 6807376, "step": 7615 }, { "epoch": 2.011086181866174, "grad_norm": 0.12386344373226166, "learning_rate": 4.565602202805768e-05, "loss": 0.083, "num_input_tokens_seen": 6811920, "step": 7620 }, { "epoch": 2.012405965421671, "grad_norm": 0.2798001170158386, "learning_rate": 4.56504900840904e-05, "loss": 0.0443, "num_input_tokens_seen": 6816208, "step": 7625 }, { "epoch": 2.013725748977168, "grad_norm": 0.4686562716960907, "learning_rate": 4.564495495555805e-05, "loss": 0.0672, "num_input_tokens_seen": 6820528, "step": 7630 }, { "epoch": 2.015045532532665, "grad_norm": 0.3561771810054779, "learning_rate": 4.5639416643314204e-05, "loss": 0.117, "num_input_tokens_seen": 6825200, "step": 7635 }, { "epoch": 2.0163653160881614, "grad_norm": 0.33920371532440186, "learning_rate": 4.5633875148212946e-05, "loss": 0.1183, "num_input_tokens_seen": 6829712, "step": 7640 }, { "epoch": 2.0176850996436584, "grad_norm": 0.36639538407325745, "learning_rate": 4.562833047110883e-05, "loss": 0.0598, "num_input_tokens_seen": 6834224, "step": 7645 }, { "epoch": 2.0190048831991554, "grad_norm": 0.06148790940642357, "learning_rate": 4.5622782612856923e-05, "loss": 0.0191, "num_input_tokens_seen": 6839152, "step": 7650 }, { "epoch": 2.0203246667546524, "grad_norm": 0.08195067942142487, "learning_rate": 4.561723157431278e-05, "loss": 0.0521, "num_input_tokens_seen": 6843568, "step": 7655 }, { "epoch": 2.021644450310149, "grad_norm": 0.2522927522659302, "learning_rate": 4.5611677356332435e-05, "loss": 0.1466, "num_input_tokens_seen": 6848112, "step": 7660 }, { "epoch": 2.022964233865646, "grad_norm": 0.03633113205432892, "learning_rate": 4.560611995977242e-05, "loss": 0.0614, "num_input_tokens_seen": 6852592, "step": 7665 }, { "epoch": 2.024284017421143, "grad_norm": 0.25907647609710693, "learning_rate": 4.560055938548975e-05, "loss": 0.0591, "num_input_tokens_seen": 6856784, "step": 7670 }, { "epoch": 2.02560380097664, "grad_norm": 0.20321422815322876, "learning_rate": 4.5594995634341944e-05, "loss": 0.0811, "num_input_tokens_seen": 6861296, "step": 7675 }, { "epoch": 2.026923584532137, "grad_norm": 0.5927806496620178, "learning_rate": 4.5589428707187e-05, "loss": 0.1077, "num_input_tokens_seen": 6865840, "step": 7680 }, { "epoch": 2.0282433680876335, "grad_norm": 0.16285324096679688, "learning_rate": 4.55838586048834e-05, "loss": 0.0692, "num_input_tokens_seen": 6870160, "step": 7685 }, { "epoch": 2.0295631516431305, "grad_norm": 0.07795558124780655, "learning_rate": 4.557828532829013e-05, "loss": 0.0175, "num_input_tokens_seen": 6874992, "step": 7690 }, { "epoch": 2.0308829351986275, "grad_norm": 0.10244237631559372, "learning_rate": 4.557270887826667e-05, "loss": 0.0578, "num_input_tokens_seen": 6879312, "step": 7695 }, { "epoch": 2.0322027187541245, "grad_norm": 0.20857179164886475, "learning_rate": 4.556712925567296e-05, "loss": 0.0349, "num_input_tokens_seen": 6883984, "step": 7700 }, { "epoch": 2.033522502309621, "grad_norm": 0.4328763782978058, "learning_rate": 4.5561546461369454e-05, "loss": 0.0972, "num_input_tokens_seen": 6888272, "step": 7705 }, { "epoch": 2.034842285865118, "grad_norm": 0.34909024834632874, "learning_rate": 4.55559604962171e-05, "loss": 0.0553, "num_input_tokens_seen": 6892688, "step": 7710 }, { "epoch": 2.036162069420615, "grad_norm": 0.40270885825157166, "learning_rate": 4.55503713610773e-05, "loss": 0.0881, "num_input_tokens_seen": 6897360, "step": 7715 }, { "epoch": 2.037481852976112, "grad_norm": 0.26024842262268066, "learning_rate": 4.5544779056812e-05, "loss": 0.0647, "num_input_tokens_seen": 6901840, "step": 7720 }, { "epoch": 2.0388016365316086, "grad_norm": 0.1849614530801773, "learning_rate": 4.553918358428358e-05, "loss": 0.0687, "num_input_tokens_seen": 6906160, "step": 7725 }, { "epoch": 2.0401214200871056, "grad_norm": 0.2401839643716812, "learning_rate": 4.553358494435494e-05, "loss": 0.0617, "num_input_tokens_seen": 6910704, "step": 7730 }, { "epoch": 2.0414412036426026, "grad_norm": 0.2517256438732147, "learning_rate": 4.5527983137889464e-05, "loss": 0.0465, "num_input_tokens_seen": 6915280, "step": 7735 }, { "epoch": 2.0427609871980996, "grad_norm": 0.1625821739435196, "learning_rate": 4.5522378165751015e-05, "loss": 0.0666, "num_input_tokens_seen": 6919824, "step": 7740 }, { "epoch": 2.0440807707535966, "grad_norm": 0.10548185557126999, "learning_rate": 4.5516770028803954e-05, "loss": 0.0744, "num_input_tokens_seen": 6924432, "step": 7745 }, { "epoch": 2.045400554309093, "grad_norm": 0.15737095475196838, "learning_rate": 4.5511158727913116e-05, "loss": 0.0395, "num_input_tokens_seen": 6929232, "step": 7750 }, { "epoch": 2.04672033786459, "grad_norm": 0.33843058347702026, "learning_rate": 4.5505544263943856e-05, "loss": 0.0471, "num_input_tokens_seen": 6933744, "step": 7755 }, { "epoch": 2.048040121420087, "grad_norm": 0.2520791292190552, "learning_rate": 4.549992663776197e-05, "loss": 0.0864, "num_input_tokens_seen": 6938128, "step": 7760 }, { "epoch": 2.049359904975584, "grad_norm": 0.04124077409505844, "learning_rate": 4.5494305850233786e-05, "loss": 0.0701, "num_input_tokens_seen": 6942512, "step": 7765 }, { "epoch": 2.0506796885310807, "grad_norm": 0.05362673103809357, "learning_rate": 4.5488681902226094e-05, "loss": 0.0542, "num_input_tokens_seen": 6946832, "step": 7770 }, { "epoch": 2.0519994720865777, "grad_norm": 0.20405462384223938, "learning_rate": 4.5483054794606174e-05, "loss": 0.0798, "num_input_tokens_seen": 6951184, "step": 7775 }, { "epoch": 2.0533192556420747, "grad_norm": 0.09370773285627365, "learning_rate": 4.547742452824179e-05, "loss": 0.0684, "num_input_tokens_seen": 6955696, "step": 7780 }, { "epoch": 2.0546390391975717, "grad_norm": 0.3412418067455292, "learning_rate": 4.5471791104001215e-05, "loss": 0.0735, "num_input_tokens_seen": 6960016, "step": 7785 }, { "epoch": 2.0559588227530683, "grad_norm": 0.784838080406189, "learning_rate": 4.546615452275319e-05, "loss": 0.0948, "num_input_tokens_seen": 6964496, "step": 7790 }, { "epoch": 2.0572786063085653, "grad_norm": 0.16238315403461456, "learning_rate": 4.5460514785366944e-05, "loss": 0.1247, "num_input_tokens_seen": 6969168, "step": 7795 }, { "epoch": 2.0585983898640623, "grad_norm": 0.10413187742233276, "learning_rate": 4.545487189271219e-05, "loss": 0.0988, "num_input_tokens_seen": 6973744, "step": 7800 }, { "epoch": 2.0585983898640623, "eval_loss": 0.0754496306180954, "eval_runtime": 64.7725, "eval_samples_per_second": 103.979, "eval_steps_per_second": 25.999, "num_input_tokens_seen": 6973744, "step": 7800 }, { "epoch": 2.0599181734195593, "grad_norm": 0.3889513313770294, "learning_rate": 4.544922584565914e-05, "loss": 0.0413, "num_input_tokens_seen": 6977968, "step": 7805 }, { "epoch": 2.0612379569750563, "grad_norm": 0.25308260321617126, "learning_rate": 4.544357664507848e-05, "loss": 0.0476, "num_input_tokens_seen": 6982416, "step": 7810 }, { "epoch": 2.062557740530553, "grad_norm": 0.13888294994831085, "learning_rate": 4.54379242918414e-05, "loss": 0.0388, "num_input_tokens_seen": 6986512, "step": 7815 }, { "epoch": 2.06387752408605, "grad_norm": 0.489170640707016, "learning_rate": 4.543226878681955e-05, "loss": 0.0682, "num_input_tokens_seen": 6990576, "step": 7820 }, { "epoch": 2.065197307641547, "grad_norm": 0.16548322141170502, "learning_rate": 4.5426610130885087e-05, "loss": 0.0507, "num_input_tokens_seen": 6994640, "step": 7825 }, { "epoch": 2.066517091197044, "grad_norm": 0.08078751713037491, "learning_rate": 4.542094832491064e-05, "loss": 0.0254, "num_input_tokens_seen": 6998864, "step": 7830 }, { "epoch": 2.0678368747525404, "grad_norm": 0.253131628036499, "learning_rate": 4.541528336976934e-05, "loss": 0.0619, "num_input_tokens_seen": 7003248, "step": 7835 }, { "epoch": 2.0691566583080374, "grad_norm": 0.1995379477739334, "learning_rate": 4.540961526633479e-05, "loss": 0.0575, "num_input_tokens_seen": 7007664, "step": 7840 }, { "epoch": 2.0704764418635344, "grad_norm": 0.29386454820632935, "learning_rate": 4.540394401548108e-05, "loss": 0.0554, "num_input_tokens_seen": 7012144, "step": 7845 }, { "epoch": 2.0717962254190314, "grad_norm": 0.12137898057699203, "learning_rate": 4.539826961808279e-05, "loss": 0.0342, "num_input_tokens_seen": 7016816, "step": 7850 }, { "epoch": 2.073116008974528, "grad_norm": 0.506098747253418, "learning_rate": 4.5392592075014994e-05, "loss": 0.0963, "num_input_tokens_seen": 7021072, "step": 7855 }, { "epoch": 2.074435792530025, "grad_norm": 0.09510201215744019, "learning_rate": 4.538691138715322e-05, "loss": 0.0492, "num_input_tokens_seen": 7025488, "step": 7860 }, { "epoch": 2.075755576085522, "grad_norm": 0.07047756761312485, "learning_rate": 4.5381227555373516e-05, "loss": 0.0509, "num_input_tokens_seen": 7030224, "step": 7865 }, { "epoch": 2.077075359641019, "grad_norm": 0.04694641754031181, "learning_rate": 4.537554058055239e-05, "loss": 0.0683, "num_input_tokens_seen": 7034448, "step": 7870 }, { "epoch": 2.078395143196516, "grad_norm": 0.24805070459842682, "learning_rate": 4.5369850463566865e-05, "loss": 0.0872, "num_input_tokens_seen": 7039280, "step": 7875 }, { "epoch": 2.0797149267520125, "grad_norm": 0.05336683616042137, "learning_rate": 4.5364157205294404e-05, "loss": 0.0561, "num_input_tokens_seen": 7043760, "step": 7880 }, { "epoch": 2.0810347103075095, "grad_norm": 0.3307097554206848, "learning_rate": 4.5358460806612996e-05, "loss": 0.1189, "num_input_tokens_seen": 7047856, "step": 7885 }, { "epoch": 2.0823544938630065, "grad_norm": 0.15268053114414215, "learning_rate": 4.535276126840109e-05, "loss": 0.0714, "num_input_tokens_seen": 7052400, "step": 7890 }, { "epoch": 2.0836742774185035, "grad_norm": 0.09284672886133194, "learning_rate": 4.5347058591537626e-05, "loss": 0.0798, "num_input_tokens_seen": 7056976, "step": 7895 }, { "epoch": 2.084994060974, "grad_norm": 0.49223774671554565, "learning_rate": 4.534135277690203e-05, "loss": 0.1102, "num_input_tokens_seen": 7061712, "step": 7900 }, { "epoch": 2.086313844529497, "grad_norm": 0.31551679968833923, "learning_rate": 4.533564382537421e-05, "loss": 0.0335, "num_input_tokens_seen": 7066096, "step": 7905 }, { "epoch": 2.087633628084994, "grad_norm": 0.06261837482452393, "learning_rate": 4.532993173783456e-05, "loss": 0.0537, "num_input_tokens_seen": 7070544, "step": 7910 }, { "epoch": 2.088953411640491, "grad_norm": 0.23710490763187408, "learning_rate": 4.5324216515163954e-05, "loss": 0.0584, "num_input_tokens_seen": 7074736, "step": 7915 }, { "epoch": 2.0902731951959876, "grad_norm": 0.2738867402076721, "learning_rate": 4.531849815824375e-05, "loss": 0.101, "num_input_tokens_seen": 7079216, "step": 7920 }, { "epoch": 2.0915929787514846, "grad_norm": 0.08283576369285583, "learning_rate": 4.5312776667955795e-05, "loss": 0.06, "num_input_tokens_seen": 7083856, "step": 7925 }, { "epoch": 2.0929127623069816, "grad_norm": 0.09812027215957642, "learning_rate": 4.5307052045182405e-05, "loss": 0.0511, "num_input_tokens_seen": 7088272, "step": 7930 }, { "epoch": 2.0942325458624786, "grad_norm": 0.10546258091926575, "learning_rate": 4.53013242908064e-05, "loss": 0.0222, "num_input_tokens_seen": 7092688, "step": 7935 }, { "epoch": 2.0955523294179756, "grad_norm": 0.16834400594234467, "learning_rate": 4.529559340571107e-05, "loss": 0.0855, "num_input_tokens_seen": 7096720, "step": 7940 }, { "epoch": 2.096872112973472, "grad_norm": 0.28702494502067566, "learning_rate": 4.528985939078018e-05, "loss": 0.0524, "num_input_tokens_seen": 7101232, "step": 7945 }, { "epoch": 2.098191896528969, "grad_norm": 0.06454991549253464, "learning_rate": 4.5284122246898e-05, "loss": 0.0575, "num_input_tokens_seen": 7106032, "step": 7950 }, { "epoch": 2.099511680084466, "grad_norm": 0.5221126079559326, "learning_rate": 4.527838197494926e-05, "loss": 0.1118, "num_input_tokens_seen": 7110800, "step": 7955 }, { "epoch": 2.100831463639963, "grad_norm": 0.3111208379268646, "learning_rate": 4.527263857581918e-05, "loss": 0.0604, "num_input_tokens_seen": 7115408, "step": 7960 }, { "epoch": 2.1021512471954598, "grad_norm": 0.5192667245864868, "learning_rate": 4.526689205039347e-05, "loss": 0.0788, "num_input_tokens_seen": 7120112, "step": 7965 }, { "epoch": 2.1034710307509568, "grad_norm": 0.3179950416088104, "learning_rate": 4.5261142399558324e-05, "loss": 0.1121, "num_input_tokens_seen": 7124624, "step": 7970 }, { "epoch": 2.1047908143064538, "grad_norm": 0.3168044090270996, "learning_rate": 4.525538962420041e-05, "loss": 0.0748, "num_input_tokens_seen": 7128784, "step": 7975 }, { "epoch": 2.1061105978619508, "grad_norm": 0.1840326189994812, "learning_rate": 4.524963372520685e-05, "loss": 0.0639, "num_input_tokens_seen": 7133168, "step": 7980 }, { "epoch": 2.1074303814174478, "grad_norm": 0.2840273976325989, "learning_rate": 4.524387470346531e-05, "loss": 0.0511, "num_input_tokens_seen": 7137616, "step": 7985 }, { "epoch": 2.1087501649729443, "grad_norm": 0.2445802539587021, "learning_rate": 4.5238112559863885e-05, "loss": 0.064, "num_input_tokens_seen": 7142096, "step": 7990 }, { "epoch": 2.1100699485284413, "grad_norm": 0.40466511249542236, "learning_rate": 4.5232347295291175e-05, "loss": 0.0763, "num_input_tokens_seen": 7146256, "step": 7995 }, { "epoch": 2.1113897320839383, "grad_norm": 0.06845315545797348, "learning_rate": 4.522657891063626e-05, "loss": 0.0776, "num_input_tokens_seen": 7150896, "step": 8000 }, { "epoch": 2.1113897320839383, "eval_loss": 0.07489334791898727, "eval_runtime": 64.7648, "eval_samples_per_second": 103.992, "eval_steps_per_second": 26.002, "num_input_tokens_seen": 7150896, "step": 8000 }, { "epoch": 2.1127095156394353, "grad_norm": 0.4186022877693176, "learning_rate": 4.52208074067887e-05, "loss": 0.0733, "num_input_tokens_seen": 7155792, "step": 8005 }, { "epoch": 2.114029299194932, "grad_norm": 0.1848691701889038, "learning_rate": 4.5215032784638516e-05, "loss": 0.0818, "num_input_tokens_seen": 7160272, "step": 8010 }, { "epoch": 2.115349082750429, "grad_norm": 0.24331854283809662, "learning_rate": 4.5209255045076245e-05, "loss": 0.0795, "num_input_tokens_seen": 7164592, "step": 8015 }, { "epoch": 2.116668866305926, "grad_norm": 0.9791966080665588, "learning_rate": 4.5203474188992875e-05, "loss": 0.0817, "num_input_tokens_seen": 7168944, "step": 8020 }, { "epoch": 2.117988649861423, "grad_norm": 0.1498843878507614, "learning_rate": 4.51976902172799e-05, "loss": 0.0506, "num_input_tokens_seen": 7173200, "step": 8025 }, { "epoch": 2.1193084334169194, "grad_norm": 0.13253352046012878, "learning_rate": 4.519190313082927e-05, "loss": 0.0463, "num_input_tokens_seen": 7177648, "step": 8030 }, { "epoch": 2.1206282169724164, "grad_norm": 0.18419434130191803, "learning_rate": 4.518611293053343e-05, "loss": 0.0445, "num_input_tokens_seen": 7182256, "step": 8035 }, { "epoch": 2.1219480005279134, "grad_norm": 0.22218145430088043, "learning_rate": 4.51803196172853e-05, "loss": 0.04, "num_input_tokens_seen": 7186800, "step": 8040 }, { "epoch": 2.1232677840834104, "grad_norm": 0.2679457664489746, "learning_rate": 4.517452319197828e-05, "loss": 0.0959, "num_input_tokens_seen": 7191184, "step": 8045 }, { "epoch": 2.1245875676389074, "grad_norm": 0.30709266662597656, "learning_rate": 4.5168723655506265e-05, "loss": 0.0838, "num_input_tokens_seen": 7195664, "step": 8050 }, { "epoch": 2.125907351194404, "grad_norm": 0.27189576625823975, "learning_rate": 4.51629210087636e-05, "loss": 0.0328, "num_input_tokens_seen": 7200080, "step": 8055 }, { "epoch": 2.127227134749901, "grad_norm": 0.2236274629831314, "learning_rate": 4.515711525264513e-05, "loss": 0.0589, "num_input_tokens_seen": 7204528, "step": 8060 }, { "epoch": 2.128546918305398, "grad_norm": 0.047212280333042145, "learning_rate": 4.5151306388046175e-05, "loss": 0.0941, "num_input_tokens_seen": 7209136, "step": 8065 }, { "epoch": 2.129866701860895, "grad_norm": 0.23198921978473663, "learning_rate": 4.514549441586255e-05, "loss": 0.0821, "num_input_tokens_seen": 7213648, "step": 8070 }, { "epoch": 2.1311864854163916, "grad_norm": 0.2576185464859009, "learning_rate": 4.513967933699051e-05, "loss": 0.1052, "num_input_tokens_seen": 7218480, "step": 8075 }, { "epoch": 2.1325062689718886, "grad_norm": 0.18668513000011444, "learning_rate": 4.513386115232684e-05, "loss": 0.1173, "num_input_tokens_seen": 7222960, "step": 8080 }, { "epoch": 2.1338260525273856, "grad_norm": 0.3644115626811981, "learning_rate": 4.5128039862768745e-05, "loss": 0.0437, "num_input_tokens_seen": 7227536, "step": 8085 }, { "epoch": 2.1351458360828826, "grad_norm": 0.18272079527378082, "learning_rate": 4.512221546921397e-05, "loss": 0.1388, "num_input_tokens_seen": 7232048, "step": 8090 }, { "epoch": 2.136465619638379, "grad_norm": 0.10349217802286148, "learning_rate": 4.5116387972560694e-05, "loss": 0.0644, "num_input_tokens_seen": 7236432, "step": 8095 }, { "epoch": 2.137785403193876, "grad_norm": 0.2785578966140747, "learning_rate": 4.511055737370759e-05, "loss": 0.0611, "num_input_tokens_seen": 7240752, "step": 8100 }, { "epoch": 2.139105186749373, "grad_norm": 0.17527374625205994, "learning_rate": 4.510472367355383e-05, "loss": 0.0973, "num_input_tokens_seen": 7245552, "step": 8105 }, { "epoch": 2.14042497030487, "grad_norm": 0.22334282100200653, "learning_rate": 4.509888687299901e-05, "loss": 0.0692, "num_input_tokens_seen": 7250128, "step": 8110 }, { "epoch": 2.141744753860367, "grad_norm": 0.2183559089899063, "learning_rate": 4.5093046972943266e-05, "loss": 0.0522, "num_input_tokens_seen": 7254256, "step": 8115 }, { "epoch": 2.1430645374158637, "grad_norm": 0.44457024335861206, "learning_rate": 4.508720397428717e-05, "loss": 0.0653, "num_input_tokens_seen": 7258736, "step": 8120 }, { "epoch": 2.1443843209713607, "grad_norm": 0.07362023741006851, "learning_rate": 4.508135787793178e-05, "loss": 0.0714, "num_input_tokens_seen": 7263280, "step": 8125 }, { "epoch": 2.1457041045268577, "grad_norm": 0.21433889865875244, "learning_rate": 4.5075508684778664e-05, "loss": 0.0354, "num_input_tokens_seen": 7267760, "step": 8130 }, { "epoch": 2.1470238880823547, "grad_norm": 0.07051128894090652, "learning_rate": 4.506965639572982e-05, "loss": 0.0501, "num_input_tokens_seen": 7272688, "step": 8135 }, { "epoch": 2.1483436716378512, "grad_norm": 0.36079487204551697, "learning_rate": 4.506380101168774e-05, "loss": 0.038, "num_input_tokens_seen": 7277136, "step": 8140 }, { "epoch": 2.1496634551933482, "grad_norm": 0.1364288032054901, "learning_rate": 4.505794253355542e-05, "loss": 0.0604, "num_input_tokens_seen": 7282064, "step": 8145 }, { "epoch": 2.1509832387488452, "grad_norm": 0.4952329397201538, "learning_rate": 4.5052080962236286e-05, "loss": 0.107, "num_input_tokens_seen": 7286416, "step": 8150 }, { "epoch": 2.1523030223043422, "grad_norm": 0.20140352845191956, "learning_rate": 4.504621629863428e-05, "loss": 0.0853, "num_input_tokens_seen": 7290640, "step": 8155 }, { "epoch": 2.153622805859839, "grad_norm": 0.10488418489694595, "learning_rate": 4.504034854365381e-05, "loss": 0.0463, "num_input_tokens_seen": 7294800, "step": 8160 }, { "epoch": 2.154942589415336, "grad_norm": 0.37058648467063904, "learning_rate": 4.503447769819974e-05, "loss": 0.0485, "num_input_tokens_seen": 7299312, "step": 8165 }, { "epoch": 2.156262372970833, "grad_norm": 0.21084456145763397, "learning_rate": 4.502860376317745e-05, "loss": 0.0772, "num_input_tokens_seen": 7303408, "step": 8170 }, { "epoch": 2.15758215652633, "grad_norm": 0.09452219307422638, "learning_rate": 4.502272673949276e-05, "loss": 0.0984, "num_input_tokens_seen": 7307920, "step": 8175 }, { "epoch": 2.158901940081827, "grad_norm": 0.038731399923563004, "learning_rate": 4.501684662805199e-05, "loss": 0.0532, "num_input_tokens_seen": 7312176, "step": 8180 }, { "epoch": 2.1602217236373233, "grad_norm": 0.28049367666244507, "learning_rate": 4.5010963429761924e-05, "loss": 0.0827, "num_input_tokens_seen": 7316848, "step": 8185 }, { "epoch": 2.1615415071928203, "grad_norm": 0.26468542218208313, "learning_rate": 4.500507714552982e-05, "loss": 0.0759, "num_input_tokens_seen": 7321264, "step": 8190 }, { "epoch": 2.1628612907483173, "grad_norm": 0.0818038359284401, "learning_rate": 4.499918777626342e-05, "loss": 0.0435, "num_input_tokens_seen": 7325872, "step": 8195 }, { "epoch": 2.1641810743038143, "grad_norm": 0.7742751836776733, "learning_rate": 4.499329532287093e-05, "loss": 0.0671, "num_input_tokens_seen": 7330032, "step": 8200 }, { "epoch": 2.1641810743038143, "eval_loss": 0.0747479721903801, "eval_runtime": 64.7526, "eval_samples_per_second": 104.011, "eval_steps_per_second": 26.007, "num_input_tokens_seen": 7330032, "step": 8200 }, { "epoch": 2.165500857859311, "grad_norm": 0.15500406920909882, "learning_rate": 4.4987399786261064e-05, "loss": 0.0562, "num_input_tokens_seen": 7334320, "step": 8205 }, { "epoch": 2.166820641414808, "grad_norm": 0.06565006822347641, "learning_rate": 4.498150116734297e-05, "loss": 0.0381, "num_input_tokens_seen": 7338640, "step": 8210 }, { "epoch": 2.168140424970305, "grad_norm": 0.18759655952453613, "learning_rate": 4.4975599467026294e-05, "loss": 0.0672, "num_input_tokens_seen": 7343312, "step": 8215 }, { "epoch": 2.169460208525802, "grad_norm": 0.1542542725801468, "learning_rate": 4.496969468622114e-05, "loss": 0.0328, "num_input_tokens_seen": 7347824, "step": 8220 }, { "epoch": 2.1707799920812985, "grad_norm": 0.270298033952713, "learning_rate": 4.496378682583813e-05, "loss": 0.0333, "num_input_tokens_seen": 7352528, "step": 8225 }, { "epoch": 2.1720997756367955, "grad_norm": 0.03728686273097992, "learning_rate": 4.495787588678829e-05, "loss": 0.0548, "num_input_tokens_seen": 7357008, "step": 8230 }, { "epoch": 2.1734195591922925, "grad_norm": 0.18597032129764557, "learning_rate": 4.4951961869983196e-05, "loss": 0.0678, "num_input_tokens_seen": 7361552, "step": 8235 }, { "epoch": 2.1747393427477895, "grad_norm": 0.13053740561008453, "learning_rate": 4.494604477633485e-05, "loss": 0.0334, "num_input_tokens_seen": 7366128, "step": 8240 }, { "epoch": 2.1760591263032865, "grad_norm": 0.38580843806266785, "learning_rate": 4.4940124606755734e-05, "loss": 0.0451, "num_input_tokens_seen": 7370384, "step": 8245 }, { "epoch": 2.177378909858783, "grad_norm": 0.24125556647777557, "learning_rate": 4.493420136215882e-05, "loss": 0.0449, "num_input_tokens_seen": 7374800, "step": 8250 }, { "epoch": 2.17869869341428, "grad_norm": 0.06357249617576599, "learning_rate": 4.492827504345756e-05, "loss": 0.0327, "num_input_tokens_seen": 7379152, "step": 8255 }, { "epoch": 2.180018476969777, "grad_norm": 0.07770431041717529, "learning_rate": 4.492234565156584e-05, "loss": 0.0351, "num_input_tokens_seen": 7383568, "step": 8260 }, { "epoch": 2.181338260525274, "grad_norm": 0.15502195060253143, "learning_rate": 4.491641318739807e-05, "loss": 0.0995, "num_input_tokens_seen": 7387760, "step": 8265 }, { "epoch": 2.1826580440807706, "grad_norm": 0.2789313793182373, "learning_rate": 4.4910477651869096e-05, "loss": 0.0262, "num_input_tokens_seen": 7391888, "step": 8270 }, { "epoch": 2.1839778276362676, "grad_norm": 0.22910848259925842, "learning_rate": 4.4904539045894254e-05, "loss": 0.0624, "num_input_tokens_seen": 7396144, "step": 8275 }, { "epoch": 2.1852976111917646, "grad_norm": 0.3092767894268036, "learning_rate": 4.4898597370389364e-05, "loss": 0.0846, "num_input_tokens_seen": 7400560, "step": 8280 }, { "epoch": 2.1866173947472616, "grad_norm": 0.25779837369918823, "learning_rate": 4.489265262627069e-05, "loss": 0.0762, "num_input_tokens_seen": 7405296, "step": 8285 }, { "epoch": 2.187937178302758, "grad_norm": 0.05394850671291351, "learning_rate": 4.488670481445499e-05, "loss": 0.0448, "num_input_tokens_seen": 7409680, "step": 8290 }, { "epoch": 2.189256961858255, "grad_norm": 0.22546350955963135, "learning_rate": 4.488075393585951e-05, "loss": 0.0434, "num_input_tokens_seen": 7414000, "step": 8295 }, { "epoch": 2.190576745413752, "grad_norm": 0.2602360248565674, "learning_rate": 4.487479999140193e-05, "loss": 0.0724, "num_input_tokens_seen": 7418768, "step": 8300 }, { "epoch": 2.191896528969249, "grad_norm": 0.11775390803813934, "learning_rate": 4.4868842982000425e-05, "loss": 0.0582, "num_input_tokens_seen": 7423184, "step": 8305 }, { "epoch": 2.193216312524746, "grad_norm": 0.24081440269947052, "learning_rate": 4.486288290857365e-05, "loss": 0.0827, "num_input_tokens_seen": 7427696, "step": 8310 }, { "epoch": 2.1945360960802427, "grad_norm": 0.23181308805942535, "learning_rate": 4.4856919772040715e-05, "loss": 0.0767, "num_input_tokens_seen": 7432272, "step": 8315 }, { "epoch": 2.1958558796357397, "grad_norm": 0.13985151052474976, "learning_rate": 4.485095357332122e-05, "loss": 0.0355, "num_input_tokens_seen": 7437072, "step": 8320 }, { "epoch": 2.1971756631912367, "grad_norm": 0.04570583254098892, "learning_rate": 4.484498431333521e-05, "loss": 0.0902, "num_input_tokens_seen": 7441488, "step": 8325 }, { "epoch": 2.1984954467467337, "grad_norm": 0.1922800987958908, "learning_rate": 4.4839011993003245e-05, "loss": 0.0602, "num_input_tokens_seen": 7446192, "step": 8330 }, { "epoch": 2.1998152303022303, "grad_norm": 0.39609408378601074, "learning_rate": 4.4833036613246305e-05, "loss": 0.0791, "num_input_tokens_seen": 7450512, "step": 8335 }, { "epoch": 2.2011350138577273, "grad_norm": 0.1092604473233223, "learning_rate": 4.482705817498589e-05, "loss": 0.0331, "num_input_tokens_seen": 7455024, "step": 8340 }, { "epoch": 2.2024547974132243, "grad_norm": 0.052072856575250626, "learning_rate": 4.4821076679143934e-05, "loss": 0.026, "num_input_tokens_seen": 7459536, "step": 8345 }, { "epoch": 2.2037745809687213, "grad_norm": 0.19168980419635773, "learning_rate": 4.481509212664288e-05, "loss": 0.033, "num_input_tokens_seen": 7463920, "step": 8350 }, { "epoch": 2.205094364524218, "grad_norm": 0.1736547201871872, "learning_rate": 4.480910451840559e-05, "loss": 0.0431, "num_input_tokens_seen": 7468688, "step": 8355 }, { "epoch": 2.206414148079715, "grad_norm": 0.37127041816711426, "learning_rate": 4.480311385535546e-05, "loss": 0.0717, "num_input_tokens_seen": 7473456, "step": 8360 }, { "epoch": 2.207733931635212, "grad_norm": 0.1207113191485405, "learning_rate": 4.47971201384163e-05, "loss": 0.0646, "num_input_tokens_seen": 7478096, "step": 8365 }, { "epoch": 2.209053715190709, "grad_norm": 0.14862358570098877, "learning_rate": 4.4791123368512446e-05, "loss": 0.0736, "num_input_tokens_seen": 7482256, "step": 8370 }, { "epoch": 2.210373498746206, "grad_norm": 0.04348155856132507, "learning_rate": 4.478512354656864e-05, "loss": 0.1002, "num_input_tokens_seen": 7486608, "step": 8375 }, { "epoch": 2.2116932823017024, "grad_norm": 0.11444752663373947, "learning_rate": 4.477912067351016e-05, "loss": 0.0573, "num_input_tokens_seen": 7491184, "step": 8380 }, { "epoch": 2.2130130658571994, "grad_norm": 0.25760114192962646, "learning_rate": 4.477311475026271e-05, "loss": 0.0883, "num_input_tokens_seen": 7495728, "step": 8385 }, { "epoch": 2.2143328494126964, "grad_norm": 0.6964402198791504, "learning_rate": 4.476710577775248e-05, "loss": 0.0743, "num_input_tokens_seen": 7500336, "step": 8390 }, { "epoch": 2.2156526329681934, "grad_norm": 0.1507468968629837, "learning_rate": 4.476109375690612e-05, "loss": 0.0513, "num_input_tokens_seen": 7504592, "step": 8395 }, { "epoch": 2.21697241652369, "grad_norm": 0.3025268018245697, "learning_rate": 4.4755078688650784e-05, "loss": 0.1147, "num_input_tokens_seen": 7508816, "step": 8400 }, { "epoch": 2.21697241652369, "eval_loss": 0.07559145987033844, "eval_runtime": 64.7224, "eval_samples_per_second": 104.06, "eval_steps_per_second": 26.019, "num_input_tokens_seen": 7508816, "step": 8400 }, { "epoch": 2.218292200079187, "grad_norm": 0.4827629625797272, "learning_rate": 4.474906057391406e-05, "loss": 0.0624, "num_input_tokens_seen": 7513232, "step": 8405 }, { "epoch": 2.219611983634684, "grad_norm": 0.38947463035583496, "learning_rate": 4.4743039413624e-05, "loss": 0.0685, "num_input_tokens_seen": 7518000, "step": 8410 }, { "epoch": 2.220931767190181, "grad_norm": 0.2349194586277008, "learning_rate": 4.473701520870916e-05, "loss": 0.0301, "num_input_tokens_seen": 7522544, "step": 8415 }, { "epoch": 2.2222515507456775, "grad_norm": 0.3964787423610687, "learning_rate": 4.4730987960098544e-05, "loss": 0.0877, "num_input_tokens_seen": 7526960, "step": 8420 }, { "epoch": 2.2235713343011745, "grad_norm": 0.044484756886959076, "learning_rate": 4.4724957668721635e-05, "loss": 0.0563, "num_input_tokens_seen": 7531280, "step": 8425 }, { "epoch": 2.2248911178566715, "grad_norm": 0.11051148921251297, "learning_rate": 4.471892433550836e-05, "loss": 0.0285, "num_input_tokens_seen": 7535728, "step": 8430 }, { "epoch": 2.2262109014121685, "grad_norm": 0.12696214020252228, "learning_rate": 4.471288796138916e-05, "loss": 0.0225, "num_input_tokens_seen": 7540560, "step": 8435 }, { "epoch": 2.2275306849676655, "grad_norm": 0.3541504144668579, "learning_rate": 4.470684854729491e-05, "loss": 0.0739, "num_input_tokens_seen": 7544912, "step": 8440 }, { "epoch": 2.228850468523162, "grad_norm": 0.07317245006561279, "learning_rate": 4.4700806094156955e-05, "loss": 0.0274, "num_input_tokens_seen": 7549552, "step": 8445 }, { "epoch": 2.230170252078659, "grad_norm": 0.271212100982666, "learning_rate": 4.469476060290713e-05, "loss": 0.0882, "num_input_tokens_seen": 7553904, "step": 8450 }, { "epoch": 2.231490035634156, "grad_norm": 0.25180765986442566, "learning_rate": 4.468871207447772e-05, "loss": 0.1045, "num_input_tokens_seen": 7558640, "step": 8455 }, { "epoch": 2.232809819189653, "grad_norm": 0.22470711171627045, "learning_rate": 4.4682660509801486e-05, "loss": 0.0664, "num_input_tokens_seen": 7562704, "step": 8460 }, { "epoch": 2.2341296027451496, "grad_norm": 0.043757762759923935, "learning_rate": 4.467660590981165e-05, "loss": 0.0685, "num_input_tokens_seen": 7567216, "step": 8465 }, { "epoch": 2.2354493863006466, "grad_norm": 0.24988308548927307, "learning_rate": 4.467054827544191e-05, "loss": 0.0833, "num_input_tokens_seen": 7571760, "step": 8470 }, { "epoch": 2.2367691698561436, "grad_norm": 0.13088427484035492, "learning_rate": 4.4664487607626434e-05, "loss": 0.0999, "num_input_tokens_seen": 7576144, "step": 8475 }, { "epoch": 2.2380889534116406, "grad_norm": 0.11584965139627457, "learning_rate": 4.4658423907299845e-05, "loss": 0.0398, "num_input_tokens_seen": 7580208, "step": 8480 }, { "epoch": 2.239408736967137, "grad_norm": 0.2219480723142624, "learning_rate": 4.465235717539725e-05, "loss": 0.0694, "num_input_tokens_seen": 7584880, "step": 8485 }, { "epoch": 2.240728520522634, "grad_norm": 0.28186845779418945, "learning_rate": 4.464628741285421e-05, "loss": 0.0965, "num_input_tokens_seen": 7589168, "step": 8490 }, { "epoch": 2.242048304078131, "grad_norm": 0.25733256340026855, "learning_rate": 4.4640214620606754e-05, "loss": 0.0993, "num_input_tokens_seen": 7593456, "step": 8495 }, { "epoch": 2.243368087633628, "grad_norm": 0.3034327030181885, "learning_rate": 4.46341387995914e-05, "loss": 0.0627, "num_input_tokens_seen": 7597936, "step": 8500 }, { "epoch": 2.244687871189125, "grad_norm": 0.7294071316719055, "learning_rate": 4.4628059950745106e-05, "loss": 0.0374, "num_input_tokens_seen": 7602256, "step": 8505 }, { "epoch": 2.2460076547446217, "grad_norm": 0.13351532816886902, "learning_rate": 4.4621978075005297e-05, "loss": 0.0862, "num_input_tokens_seen": 7607152, "step": 8510 }, { "epoch": 2.2473274383001187, "grad_norm": 0.1253606230020523, "learning_rate": 4.461589317330989e-05, "loss": 0.0285, "num_input_tokens_seen": 7611632, "step": 8515 }, { "epoch": 2.2486472218556157, "grad_norm": 0.2611004412174225, "learning_rate": 4.460980524659724e-05, "loss": 0.0967, "num_input_tokens_seen": 7616208, "step": 8520 }, { "epoch": 2.2499670054111127, "grad_norm": 0.06876882165670395, "learning_rate": 4.46037142958062e-05, "loss": 0.0504, "num_input_tokens_seen": 7620720, "step": 8525 }, { "epoch": 2.2512867889666097, "grad_norm": 0.049842096865177155, "learning_rate": 4.4597620321876046e-05, "loss": 0.0463, "num_input_tokens_seen": 7624720, "step": 8530 }, { "epoch": 2.2526065725221063, "grad_norm": 0.13951657712459564, "learning_rate": 4.459152332574656e-05, "loss": 0.0495, "num_input_tokens_seen": 7629072, "step": 8535 }, { "epoch": 2.2539263560776033, "grad_norm": 0.1306658834218979, "learning_rate": 4.4585423308357985e-05, "loss": 0.0354, "num_input_tokens_seen": 7633584, "step": 8540 }, { "epoch": 2.2552461396331003, "grad_norm": 0.07734635472297668, "learning_rate": 4.457932027065102e-05, "loss": 0.1035, "num_input_tokens_seen": 7637904, "step": 8545 }, { "epoch": 2.256565923188597, "grad_norm": 0.18720126152038574, "learning_rate": 4.45732142135668e-05, "loss": 0.0447, "num_input_tokens_seen": 7642352, "step": 8550 }, { "epoch": 2.257885706744094, "grad_norm": 0.18094687163829803, "learning_rate": 4.4567105138046986e-05, "loss": 0.0484, "num_input_tokens_seen": 7646640, "step": 8555 }, { "epoch": 2.259205490299591, "grad_norm": 0.4723154902458191, "learning_rate": 4.456099304503365e-05, "loss": 0.048, "num_input_tokens_seen": 7651088, "step": 8560 }, { "epoch": 2.260525273855088, "grad_norm": 0.1067311093211174, "learning_rate": 4.455487793546939e-05, "loss": 0.0852, "num_input_tokens_seen": 7655216, "step": 8565 }, { "epoch": 2.261845057410585, "grad_norm": 0.2313871830701828, "learning_rate": 4.454875981029719e-05, "loss": 0.1012, "num_input_tokens_seen": 7659824, "step": 8570 }, { "epoch": 2.2631648409660814, "grad_norm": 0.05004528909921646, "learning_rate": 4.454263867046057e-05, "loss": 0.108, "num_input_tokens_seen": 7664176, "step": 8575 }, { "epoch": 2.2644846245215784, "grad_norm": 0.15142562985420227, "learning_rate": 4.4536514516903484e-05, "loss": 0.0642, "num_input_tokens_seen": 7668592, "step": 8580 }, { "epoch": 2.2658044080770754, "grad_norm": 0.1581783890724182, "learning_rate": 4.453038735057034e-05, "loss": 0.0762, "num_input_tokens_seen": 7672912, "step": 8585 }, { "epoch": 2.2671241916325724, "grad_norm": 0.03706010431051254, "learning_rate": 4.4524257172406034e-05, "loss": 0.0884, "num_input_tokens_seen": 7677456, "step": 8590 }, { "epoch": 2.2684439751880694, "grad_norm": 0.1639820784330368, "learning_rate": 4.451812398335592e-05, "loss": 0.1045, "num_input_tokens_seen": 7681744, "step": 8595 }, { "epoch": 2.269763758743566, "grad_norm": 0.08108969777822495, "learning_rate": 4.4511987784365805e-05, "loss": 0.0355, "num_input_tokens_seen": 7686352, "step": 8600 }, { "epoch": 2.269763758743566, "eval_loss": 0.07421866804361343, "eval_runtime": 64.7842, "eval_samples_per_second": 103.961, "eval_steps_per_second": 25.994, "num_input_tokens_seen": 7686352, "step": 8600 }, { "epoch": 2.271083542299063, "grad_norm": 0.1125345379114151, "learning_rate": 4.450584857638197e-05, "loss": 0.0514, "num_input_tokens_seen": 7690896, "step": 8605 }, { "epoch": 2.27240332585456, "grad_norm": 0.15789508819580078, "learning_rate": 4.449970636035116e-05, "loss": 0.0649, "num_input_tokens_seen": 7695536, "step": 8610 }, { "epoch": 2.2737231094100565, "grad_norm": 0.5257883667945862, "learning_rate": 4.4493561137220574e-05, "loss": 0.0815, "num_input_tokens_seen": 7700048, "step": 8615 }, { "epoch": 2.2750428929655535, "grad_norm": 0.24736924469470978, "learning_rate": 4.44874129079379e-05, "loss": 0.0223, "num_input_tokens_seen": 7704528, "step": 8620 }, { "epoch": 2.2763626765210505, "grad_norm": 0.3959580361843109, "learning_rate": 4.4481261673451255e-05, "loss": 0.0936, "num_input_tokens_seen": 7708848, "step": 8625 }, { "epoch": 2.2776824600765475, "grad_norm": 0.29168596863746643, "learning_rate": 4.4475107434709245e-05, "loss": 0.0726, "num_input_tokens_seen": 7713552, "step": 8630 }, { "epoch": 2.2790022436320445, "grad_norm": 0.15896978974342346, "learning_rate": 4.446895019266093e-05, "loss": 0.1009, "num_input_tokens_seen": 7717808, "step": 8635 }, { "epoch": 2.280322027187541, "grad_norm": 0.4824647009372711, "learning_rate": 4.446278994825583e-05, "loss": 0.1167, "num_input_tokens_seen": 7722128, "step": 8640 }, { "epoch": 2.281641810743038, "grad_norm": 0.3201243281364441, "learning_rate": 4.445662670244394e-05, "loss": 0.1022, "num_input_tokens_seen": 7726320, "step": 8645 }, { "epoch": 2.282961594298535, "grad_norm": 0.20121006667613983, "learning_rate": 4.44504604561757e-05, "loss": 0.0697, "num_input_tokens_seen": 7730704, "step": 8650 }, { "epoch": 2.284281377854032, "grad_norm": 0.15775150060653687, "learning_rate": 4.4444291210402035e-05, "loss": 0.1065, "num_input_tokens_seen": 7735024, "step": 8655 }, { "epoch": 2.285601161409529, "grad_norm": 0.12930023670196533, "learning_rate": 4.443811896607431e-05, "loss": 0.0714, "num_input_tokens_seen": 7739472, "step": 8660 }, { "epoch": 2.2869209449650256, "grad_norm": 0.24057447910308838, "learning_rate": 4.443194372414436e-05, "loss": 0.0546, "num_input_tokens_seen": 7743952, "step": 8665 }, { "epoch": 2.2882407285205226, "grad_norm": 0.5563846826553345, "learning_rate": 4.442576548556449e-05, "loss": 0.0535, "num_input_tokens_seen": 7748464, "step": 8670 }, { "epoch": 2.2895605120760196, "grad_norm": 0.24843387305736542, "learning_rate": 4.441958425128747e-05, "loss": 0.0524, "num_input_tokens_seen": 7753168, "step": 8675 }, { "epoch": 2.290880295631516, "grad_norm": 0.20776672661304474, "learning_rate": 4.4413400022266515e-05, "loss": 0.0581, "num_input_tokens_seen": 7757712, "step": 8680 }, { "epoch": 2.292200079187013, "grad_norm": 0.0841105505824089, "learning_rate": 4.4407212799455313e-05, "loss": 0.0905, "num_input_tokens_seen": 7761904, "step": 8685 }, { "epoch": 2.29351986274251, "grad_norm": 0.10613290965557098, "learning_rate": 4.4401022583808003e-05, "loss": 0.0792, "num_input_tokens_seen": 7766480, "step": 8690 }, { "epoch": 2.294839646298007, "grad_norm": 0.29400667548179626, "learning_rate": 4.439482937627921e-05, "loss": 0.0533, "num_input_tokens_seen": 7771120, "step": 8695 }, { "epoch": 2.296159429853504, "grad_norm": 0.1684807986021042, "learning_rate": 4.4388633177824004e-05, "loss": 0.0374, "num_input_tokens_seen": 7775600, "step": 8700 }, { "epoch": 2.2974792134090007, "grad_norm": 0.4519040882587433, "learning_rate": 4.4382433989397895e-05, "loss": 0.0594, "num_input_tokens_seen": 7780112, "step": 8705 }, { "epoch": 2.2987989969644977, "grad_norm": 0.032076336443424225, "learning_rate": 4.4376231811956895e-05, "loss": 0.0625, "num_input_tokens_seen": 7784400, "step": 8710 }, { "epoch": 2.3001187805199947, "grad_norm": 0.5452390909194946, "learning_rate": 4.437002664645745e-05, "loss": 0.0954, "num_input_tokens_seen": 7789200, "step": 8715 }, { "epoch": 2.3014385640754917, "grad_norm": 0.13484683632850647, "learning_rate": 4.436381849385649e-05, "loss": 0.0277, "num_input_tokens_seen": 7793744, "step": 8720 }, { "epoch": 2.3027583476309887, "grad_norm": 0.029698144644498825, "learning_rate": 4.435760735511136e-05, "loss": 0.0796, "num_input_tokens_seen": 7798416, "step": 8725 }, { "epoch": 2.3040781311864853, "grad_norm": 0.07295767217874527, "learning_rate": 4.435139323117992e-05, "loss": 0.075, "num_input_tokens_seen": 7802960, "step": 8730 }, { "epoch": 2.3053979147419823, "grad_norm": 0.1978251338005066, "learning_rate": 4.434517612302046e-05, "loss": 0.113, "num_input_tokens_seen": 7807472, "step": 8735 }, { "epoch": 2.3067176982974793, "grad_norm": 0.20524822175502777, "learning_rate": 4.433895603159174e-05, "loss": 0.07, "num_input_tokens_seen": 7811856, "step": 8740 }, { "epoch": 2.3080374818529763, "grad_norm": 0.24219156801700592, "learning_rate": 4.433273295785296e-05, "loss": 0.052, "num_input_tokens_seen": 7816240, "step": 8745 }, { "epoch": 2.309357265408473, "grad_norm": 0.49947288632392883, "learning_rate": 4.432650690276382e-05, "loss": 0.0649, "num_input_tokens_seen": 7820336, "step": 8750 }, { "epoch": 2.31067704896397, "grad_norm": 0.5283124446868896, "learning_rate": 4.4320277867284435e-05, "loss": 0.108, "num_input_tokens_seen": 7825040, "step": 8755 }, { "epoch": 2.311996832519467, "grad_norm": 0.3377077281475067, "learning_rate": 4.431404585237541e-05, "loss": 0.0955, "num_input_tokens_seen": 7829616, "step": 8760 }, { "epoch": 2.313316616074964, "grad_norm": 0.18836532533168793, "learning_rate": 4.43078108589978e-05, "loss": 0.0554, "num_input_tokens_seen": 7833968, "step": 8765 }, { "epoch": 2.3146363996304604, "grad_norm": 0.1997460573911667, "learning_rate": 4.4301572888113116e-05, "loss": 0.0683, "num_input_tokens_seen": 7838320, "step": 8770 }, { "epoch": 2.3159561831859574, "grad_norm": 0.3314087986946106, "learning_rate": 4.4295331940683337e-05, "loss": 0.0966, "num_input_tokens_seen": 7842352, "step": 8775 }, { "epoch": 2.3172759667414544, "grad_norm": 0.19467756152153015, "learning_rate": 4.428908801767089e-05, "loss": 0.0731, "num_input_tokens_seen": 7846480, "step": 8780 }, { "epoch": 2.3185957502969514, "grad_norm": 0.11510217934846878, "learning_rate": 4.428284112003868e-05, "loss": 0.0426, "num_input_tokens_seen": 7850992, "step": 8785 }, { "epoch": 2.3199155338524484, "grad_norm": 0.3122459948062897, "learning_rate": 4.4276591248750033e-05, "loss": 0.0385, "num_input_tokens_seen": 7855248, "step": 8790 }, { "epoch": 2.321235317407945, "grad_norm": 0.38800936937332153, "learning_rate": 4.4270338404768774e-05, "loss": 0.0761, "num_input_tokens_seen": 7859536, "step": 8795 }, { "epoch": 2.322555100963442, "grad_norm": 0.8272725939750671, "learning_rate": 4.426408258905917e-05, "loss": 0.0903, "num_input_tokens_seen": 7864080, "step": 8800 }, { "epoch": 2.322555100963442, "eval_loss": 0.07382544875144958, "eval_runtime": 64.7374, "eval_samples_per_second": 104.036, "eval_steps_per_second": 26.013, "num_input_tokens_seen": 7864080, "step": 8800 }, { "epoch": 2.323874884518939, "grad_norm": 0.22693680226802826, "learning_rate": 4.425782380258594e-05, "loss": 0.0564, "num_input_tokens_seen": 7868720, "step": 8805 }, { "epoch": 2.325194668074436, "grad_norm": 0.24192887544631958, "learning_rate": 4.425156204631427e-05, "loss": 0.0481, "num_input_tokens_seen": 7873200, "step": 8810 }, { "epoch": 2.3265144516299325, "grad_norm": 0.17911675572395325, "learning_rate": 4.424529732120981e-05, "loss": 0.0858, "num_input_tokens_seen": 7877840, "step": 8815 }, { "epoch": 2.3278342351854295, "grad_norm": 0.16295243799686432, "learning_rate": 4.423902962823864e-05, "loss": 0.0476, "num_input_tokens_seen": 7882416, "step": 8820 }, { "epoch": 2.3291540187409265, "grad_norm": 0.3257666230201721, "learning_rate": 4.423275896836733e-05, "loss": 0.1314, "num_input_tokens_seen": 7886864, "step": 8825 }, { "epoch": 2.3304738022964235, "grad_norm": 0.18956910073757172, "learning_rate": 4.42264853425629e-05, "loss": 0.1574, "num_input_tokens_seen": 7891024, "step": 8830 }, { "epoch": 2.33179358585192, "grad_norm": 0.2562328577041626, "learning_rate": 4.4220208751792816e-05, "loss": 0.1251, "num_input_tokens_seen": 7895280, "step": 8835 }, { "epoch": 2.333113369407417, "grad_norm": 0.23893992602825165, "learning_rate": 4.421392919702499e-05, "loss": 0.0977, "num_input_tokens_seen": 7900176, "step": 8840 }, { "epoch": 2.334433152962914, "grad_norm": 0.07824353128671646, "learning_rate": 4.4207646679227846e-05, "loss": 0.0346, "num_input_tokens_seen": 7904560, "step": 8845 }, { "epoch": 2.335752936518411, "grad_norm": 0.15438447892665863, "learning_rate": 4.42013611993702e-05, "loss": 0.0857, "num_input_tokens_seen": 7909072, "step": 8850 }, { "epoch": 2.337072720073908, "grad_norm": 0.16404542326927185, "learning_rate": 4.419507275842135e-05, "loss": 0.118, "num_input_tokens_seen": 7913680, "step": 8855 }, { "epoch": 2.3383925036294047, "grad_norm": 0.23074950277805328, "learning_rate": 4.418878135735106e-05, "loss": 0.0901, "num_input_tokens_seen": 7918032, "step": 8860 }, { "epoch": 2.3397122871849017, "grad_norm": 0.13195401430130005, "learning_rate": 4.418248699712955e-05, "loss": 0.0509, "num_input_tokens_seen": 7922704, "step": 8865 }, { "epoch": 2.3410320707403987, "grad_norm": 0.15457263588905334, "learning_rate": 4.417618967872748e-05, "loss": 0.0744, "num_input_tokens_seen": 7927184, "step": 8870 }, { "epoch": 2.3423518542958957, "grad_norm": 0.263943612575531, "learning_rate": 4.4169889403115985e-05, "loss": 0.0532, "num_input_tokens_seen": 7931152, "step": 8875 }, { "epoch": 2.343671637851392, "grad_norm": 0.062111757695674896, "learning_rate": 4.4163586171266627e-05, "loss": 0.0704, "num_input_tokens_seen": 7935536, "step": 8880 }, { "epoch": 2.344991421406889, "grad_norm": 0.31245023012161255, "learning_rate": 4.415727998415147e-05, "loss": 0.0524, "num_input_tokens_seen": 7939952, "step": 8885 }, { "epoch": 2.346311204962386, "grad_norm": 0.32612842321395874, "learning_rate": 4.4150970842742985e-05, "loss": 0.0447, "num_input_tokens_seen": 7944304, "step": 8890 }, { "epoch": 2.347630988517883, "grad_norm": 0.1825343668460846, "learning_rate": 4.4144658748014134e-05, "loss": 0.045, "num_input_tokens_seen": 7948912, "step": 8895 }, { "epoch": 2.3489507720733798, "grad_norm": 0.16124267876148224, "learning_rate": 4.413834370093831e-05, "loss": 0.0486, "num_input_tokens_seen": 7953232, "step": 8900 }, { "epoch": 2.3502705556288768, "grad_norm": 0.28025346994400024, "learning_rate": 4.413202570248939e-05, "loss": 0.056, "num_input_tokens_seen": 7957488, "step": 8905 }, { "epoch": 2.3515903391843738, "grad_norm": 0.23958459496498108, "learning_rate": 4.412570475364167e-05, "loss": 0.0988, "num_input_tokens_seen": 7962064, "step": 8910 }, { "epoch": 2.3529101227398708, "grad_norm": 0.3731449842453003, "learning_rate": 4.411938085536994e-05, "loss": 0.0892, "num_input_tokens_seen": 7966640, "step": 8915 }, { "epoch": 2.3542299062953678, "grad_norm": 0.08741065859794617, "learning_rate": 4.41130540086494e-05, "loss": 0.0558, "num_input_tokens_seen": 7971216, "step": 8920 }, { "epoch": 2.3555496898508643, "grad_norm": 0.17499606311321259, "learning_rate": 4.4106724214455754e-05, "loss": 0.0601, "num_input_tokens_seen": 7975952, "step": 8925 }, { "epoch": 2.3568694734063613, "grad_norm": 0.2939094305038452, "learning_rate": 4.4100391473765115e-05, "loss": 0.0529, "num_input_tokens_seen": 7980464, "step": 8930 }, { "epoch": 2.3581892569618583, "grad_norm": 0.2896695137023926, "learning_rate": 4.409405578755408e-05, "loss": 0.055, "num_input_tokens_seen": 7984912, "step": 8935 }, { "epoch": 2.3595090405173553, "grad_norm": 0.34834980964660645, "learning_rate": 4.4087717156799705e-05, "loss": 0.0281, "num_input_tokens_seen": 7989616, "step": 8940 }, { "epoch": 2.360828824072852, "grad_norm": 0.2766324579715729, "learning_rate": 4.408137558247946e-05, "loss": 0.0783, "num_input_tokens_seen": 7994032, "step": 8945 }, { "epoch": 2.362148607628349, "grad_norm": 0.13774330914020538, "learning_rate": 4.4075031065571306e-05, "loss": 0.0319, "num_input_tokens_seen": 7998672, "step": 8950 }, { "epoch": 2.363468391183846, "grad_norm": 0.1742262840270996, "learning_rate": 4.406868360705366e-05, "loss": 0.0843, "num_input_tokens_seen": 8003088, "step": 8955 }, { "epoch": 2.364788174739343, "grad_norm": 0.2502993643283844, "learning_rate": 4.406233320790536e-05, "loss": 0.0781, "num_input_tokens_seen": 8007440, "step": 8960 }, { "epoch": 2.3661079582948394, "grad_norm": 0.1978713572025299, "learning_rate": 4.4055979869105734e-05, "loss": 0.0676, "num_input_tokens_seen": 8011728, "step": 8965 }, { "epoch": 2.3674277418503364, "grad_norm": 0.13060395419597626, "learning_rate": 4.404962359163454e-05, "loss": 0.0962, "num_input_tokens_seen": 8016112, "step": 8970 }, { "epoch": 2.3687475254058334, "grad_norm": 0.19114427268505096, "learning_rate": 4.404326437647199e-05, "loss": 0.0879, "num_input_tokens_seen": 8020944, "step": 8975 }, { "epoch": 2.3700673089613304, "grad_norm": 0.06981606036424637, "learning_rate": 4.403690222459877e-05, "loss": 0.0301, "num_input_tokens_seen": 8025392, "step": 8980 }, { "epoch": 2.3713870925168274, "grad_norm": 0.20018890500068665, "learning_rate": 4.4030537136995984e-05, "loss": 0.0416, "num_input_tokens_seen": 8029744, "step": 8985 }, { "epoch": 2.372706876072324, "grad_norm": 0.05702696740627289, "learning_rate": 4.402416911464523e-05, "loss": 0.0426, "num_input_tokens_seen": 8033968, "step": 8990 }, { "epoch": 2.374026659627821, "grad_norm": 0.11251654475927353, "learning_rate": 4.4017798158528516e-05, "loss": 0.1073, "num_input_tokens_seen": 8038448, "step": 8995 }, { "epoch": 2.375346443183318, "grad_norm": 0.2353343665599823, "learning_rate": 4.401142426962834e-05, "loss": 0.0498, "num_input_tokens_seen": 8042928, "step": 9000 }, { "epoch": 2.375346443183318, "eval_loss": 0.07366812229156494, "eval_runtime": 64.7068, "eval_samples_per_second": 104.085, "eval_steps_per_second": 26.025, "num_input_tokens_seen": 8042928, "step": 9000 }, { "epoch": 2.376666226738815, "grad_norm": 0.458644300699234, "learning_rate": 4.400504744892763e-05, "loss": 0.079, "num_input_tokens_seen": 8047472, "step": 9005 }, { "epoch": 2.3779860102943116, "grad_norm": 0.26188164949417114, "learning_rate": 4.399866769740975e-05, "loss": 0.0751, "num_input_tokens_seen": 8051792, "step": 9010 }, { "epoch": 2.3793057938498086, "grad_norm": 0.17202936112880707, "learning_rate": 4.399228501605859e-05, "loss": 0.1041, "num_input_tokens_seen": 8056432, "step": 9015 }, { "epoch": 2.3806255774053056, "grad_norm": 0.41313886642456055, "learning_rate": 4.398589940585839e-05, "loss": 0.0584, "num_input_tokens_seen": 8060784, "step": 9020 }, { "epoch": 2.3819453609608026, "grad_norm": 0.20247070491313934, "learning_rate": 4.3979510867793917e-05, "loss": 0.0458, "num_input_tokens_seen": 8065136, "step": 9025 }, { "epoch": 2.383265144516299, "grad_norm": 0.3748520612716675, "learning_rate": 4.3973119402850346e-05, "loss": 0.0865, "num_input_tokens_seen": 8069200, "step": 9030 }, { "epoch": 2.384584928071796, "grad_norm": 0.16561299562454224, "learning_rate": 4.396672501201334e-05, "loss": 0.0487, "num_input_tokens_seen": 8073744, "step": 9035 }, { "epoch": 2.385904711627293, "grad_norm": 0.17081308364868164, "learning_rate": 4.396032769626899e-05, "loss": 0.0732, "num_input_tokens_seen": 8078128, "step": 9040 }, { "epoch": 2.38722449518279, "grad_norm": 0.07910822331905365, "learning_rate": 4.395392745660384e-05, "loss": 0.0258, "num_input_tokens_seen": 8082544, "step": 9045 }, { "epoch": 2.388544278738287, "grad_norm": 0.33812302350997925, "learning_rate": 4.394752429400488e-05, "loss": 0.0782, "num_input_tokens_seen": 8086960, "step": 9050 }, { "epoch": 2.3898640622937837, "grad_norm": 0.341682106256485, "learning_rate": 4.394111820945957e-05, "loss": 0.0674, "num_input_tokens_seen": 8091504, "step": 9055 }, { "epoch": 2.3911838458492807, "grad_norm": 0.2222287654876709, "learning_rate": 4.393470920395579e-05, "loss": 0.0549, "num_input_tokens_seen": 8095632, "step": 9060 }, { "epoch": 2.3925036294047777, "grad_norm": 0.2893441617488861, "learning_rate": 4.392829727848192e-05, "loss": 0.0662, "num_input_tokens_seen": 8100048, "step": 9065 }, { "epoch": 2.3938234129602747, "grad_norm": 0.24630434811115265, "learning_rate": 4.392188243402673e-05, "loss": 0.0406, "num_input_tokens_seen": 8104752, "step": 9070 }, { "epoch": 2.3951431965157712, "grad_norm": 0.30344417691230774, "learning_rate": 4.391546467157949e-05, "loss": 0.0473, "num_input_tokens_seen": 8109200, "step": 9075 }, { "epoch": 2.3964629800712682, "grad_norm": 0.42786845564842224, "learning_rate": 4.390904399212988e-05, "loss": 0.1073, "num_input_tokens_seen": 8113712, "step": 9080 }, { "epoch": 2.3977827636267652, "grad_norm": 0.49901145696640015, "learning_rate": 4.390262039666807e-05, "loss": 0.0673, "num_input_tokens_seen": 8118256, "step": 9085 }, { "epoch": 2.3991025471822622, "grad_norm": 0.32027795910835266, "learning_rate": 4.389619388618464e-05, "loss": 0.0858, "num_input_tokens_seen": 8122768, "step": 9090 }, { "epoch": 2.400422330737759, "grad_norm": 0.43628621101379395, "learning_rate": 4.3889764461670655e-05, "loss": 0.0836, "num_input_tokens_seen": 8127216, "step": 9095 }, { "epoch": 2.401742114293256, "grad_norm": 0.2572704553604126, "learning_rate": 4.38833321241176e-05, "loss": 0.0877, "num_input_tokens_seen": 8131792, "step": 9100 }, { "epoch": 2.403061897848753, "grad_norm": 0.10141805559396744, "learning_rate": 4.3876896874517434e-05, "loss": 0.0481, "num_input_tokens_seen": 8136464, "step": 9105 }, { "epoch": 2.40438168140425, "grad_norm": 0.18329687416553497, "learning_rate": 4.3870458713862554e-05, "loss": 0.0813, "num_input_tokens_seen": 8141424, "step": 9110 }, { "epoch": 2.405701464959747, "grad_norm": 0.06782520562410355, "learning_rate": 4.386401764314579e-05, "loss": 0.0576, "num_input_tokens_seen": 8145904, "step": 9115 }, { "epoch": 2.4070212485152434, "grad_norm": 0.23425708711147308, "learning_rate": 4.385757366336045e-05, "loss": 0.045, "num_input_tokens_seen": 8150480, "step": 9120 }, { "epoch": 2.4083410320707404, "grad_norm": 0.25675007700920105, "learning_rate": 4.385112677550027e-05, "loss": 0.0407, "num_input_tokens_seen": 8154448, "step": 9125 }, { "epoch": 2.4096608156262374, "grad_norm": 0.23761184513568878, "learning_rate": 4.384467698055945e-05, "loss": 0.0297, "num_input_tokens_seen": 8158832, "step": 9130 }, { "epoch": 2.4109805991817344, "grad_norm": 0.058695238083601, "learning_rate": 4.383822427953261e-05, "loss": 0.0437, "num_input_tokens_seen": 8163664, "step": 9135 }, { "epoch": 2.412300382737231, "grad_norm": 0.1232694461941719, "learning_rate": 4.3831768673414864e-05, "loss": 0.068, "num_input_tokens_seen": 8168432, "step": 9140 }, { "epoch": 2.413620166292728, "grad_norm": 0.159526988863945, "learning_rate": 4.382531016320173e-05, "loss": 0.0993, "num_input_tokens_seen": 8173264, "step": 9145 }, { "epoch": 2.414939949848225, "grad_norm": 0.1258520632982254, "learning_rate": 4.3818848749889184e-05, "loss": 0.0352, "num_input_tokens_seen": 8177808, "step": 9150 }, { "epoch": 2.416259733403722, "grad_norm": 0.1542961299419403, "learning_rate": 4.381238443447368e-05, "loss": 0.0345, "num_input_tokens_seen": 8182512, "step": 9155 }, { "epoch": 2.4175795169592185, "grad_norm": 0.40725672245025635, "learning_rate": 4.380591721795208e-05, "loss": 0.0777, "num_input_tokens_seen": 8187152, "step": 9160 }, { "epoch": 2.4188993005147155, "grad_norm": 0.08722355961799622, "learning_rate": 4.3799447101321723e-05, "loss": 0.0594, "num_input_tokens_seen": 8191920, "step": 9165 }, { "epoch": 2.4202190840702125, "grad_norm": 0.2834886312484741, "learning_rate": 4.379297408558036e-05, "loss": 0.0475, "num_input_tokens_seen": 8196592, "step": 9170 }, { "epoch": 2.4215388676257095, "grad_norm": 0.035566672682762146, "learning_rate": 4.378649817172624e-05, "loss": 0.0386, "num_input_tokens_seen": 8201008, "step": 9175 }, { "epoch": 2.4228586511812065, "grad_norm": 0.2594910264015198, "learning_rate": 4.378001936075801e-05, "loss": 0.0667, "num_input_tokens_seen": 8205808, "step": 9180 }, { "epoch": 2.424178434736703, "grad_norm": 0.4560912251472473, "learning_rate": 4.377353765367479e-05, "loss": 0.0715, "num_input_tokens_seen": 8210352, "step": 9185 }, { "epoch": 2.4254982182922, "grad_norm": 0.4761858284473419, "learning_rate": 4.376705305147614e-05, "loss": 0.0629, "num_input_tokens_seen": 8214672, "step": 9190 }, { "epoch": 2.426818001847697, "grad_norm": 0.09230410307645798, "learning_rate": 4.376056555516206e-05, "loss": 0.0455, "num_input_tokens_seen": 8219184, "step": 9195 }, { "epoch": 2.428137785403194, "grad_norm": 0.14981140196323395, "learning_rate": 4.375407516573302e-05, "loss": 0.0614, "num_input_tokens_seen": 8223824, "step": 9200 }, { "epoch": 2.428137785403194, "eval_loss": 0.07391242682933807, "eval_runtime": 64.7357, "eval_samples_per_second": 104.038, "eval_steps_per_second": 26.013, "num_input_tokens_seen": 8223824, "step": 9200 }, { "epoch": 2.4294575689586906, "grad_norm": 0.19377146661281586, "learning_rate": 4.3747581884189913e-05, "loss": 0.043, "num_input_tokens_seen": 8228272, "step": 9205 }, { "epoch": 2.4307773525141876, "grad_norm": 0.3537294864654541, "learning_rate": 4.374108571153408e-05, "loss": 0.0873, "num_input_tokens_seen": 8233040, "step": 9210 }, { "epoch": 2.4320971360696846, "grad_norm": 0.26429229974746704, "learning_rate": 4.3734586648767316e-05, "loss": 0.1141, "num_input_tokens_seen": 8237424, "step": 9215 }, { "epoch": 2.4334169196251816, "grad_norm": 0.03198317810893059, "learning_rate": 4.372808469689186e-05, "loss": 0.0492, "num_input_tokens_seen": 8241968, "step": 9220 }, { "epoch": 2.434736703180678, "grad_norm": 0.22830308973789215, "learning_rate": 4.372157985691039e-05, "loss": 0.0924, "num_input_tokens_seen": 8246288, "step": 9225 }, { "epoch": 2.436056486736175, "grad_norm": 0.36861100792884827, "learning_rate": 4.371507212982603e-05, "loss": 0.067, "num_input_tokens_seen": 8250832, "step": 9230 }, { "epoch": 2.437376270291672, "grad_norm": 0.17639894783496857, "learning_rate": 4.370856151664236e-05, "loss": 0.0763, "num_input_tokens_seen": 8255408, "step": 9235 }, { "epoch": 2.438696053847169, "grad_norm": 0.04036502167582512, "learning_rate": 4.3702048018363404e-05, "loss": 0.0378, "num_input_tokens_seen": 8259504, "step": 9240 }, { "epoch": 2.440015837402666, "grad_norm": 0.17586800456047058, "learning_rate": 4.369553163599362e-05, "loss": 0.0498, "num_input_tokens_seen": 8263984, "step": 9245 }, { "epoch": 2.4413356209581627, "grad_norm": 0.22516407072544098, "learning_rate": 4.3689012370537904e-05, "loss": 0.1002, "num_input_tokens_seen": 8268240, "step": 9250 }, { "epoch": 2.4426554045136597, "grad_norm": 0.15411430597305298, "learning_rate": 4.368249022300164e-05, "loss": 0.0607, "num_input_tokens_seen": 8272656, "step": 9255 }, { "epoch": 2.4439751880691567, "grad_norm": 0.07034123688936234, "learning_rate": 4.367596519439059e-05, "loss": 0.0347, "num_input_tokens_seen": 8276752, "step": 9260 }, { "epoch": 2.4452949716246537, "grad_norm": 0.27482086420059204, "learning_rate": 4.366943728571101e-05, "loss": 0.073, "num_input_tokens_seen": 8281104, "step": 9265 }, { "epoch": 2.4466147551801507, "grad_norm": 0.1325758844614029, "learning_rate": 4.366290649796959e-05, "loss": 0.0317, "num_input_tokens_seen": 8285808, "step": 9270 }, { "epoch": 2.4479345387356473, "grad_norm": 0.026735510677099228, "learning_rate": 4.3656372832173456e-05, "loss": 0.065, "num_input_tokens_seen": 8290256, "step": 9275 }, { "epoch": 2.4492543222911443, "grad_norm": 0.28094732761383057, "learning_rate": 4.364983628933017e-05, "loss": 0.0843, "num_input_tokens_seen": 8294832, "step": 9280 }, { "epoch": 2.4505741058466413, "grad_norm": 0.43103864789009094, "learning_rate": 4.364329687044777e-05, "loss": 0.0866, "num_input_tokens_seen": 8299088, "step": 9285 }, { "epoch": 2.451893889402138, "grad_norm": 0.10007283836603165, "learning_rate": 4.36367545765347e-05, "loss": 0.0644, "num_input_tokens_seen": 8303856, "step": 9290 }, { "epoch": 2.453213672957635, "grad_norm": 0.17797701060771942, "learning_rate": 4.363020940859988e-05, "loss": 0.0573, "num_input_tokens_seen": 8308336, "step": 9295 }, { "epoch": 2.454533456513132, "grad_norm": 0.2130151242017746, "learning_rate": 4.362366136765263e-05, "loss": 0.0442, "num_input_tokens_seen": 8312944, "step": 9300 }, { "epoch": 2.455853240068629, "grad_norm": 0.15052422881126404, "learning_rate": 4.361711045470278e-05, "loss": 0.0791, "num_input_tokens_seen": 8317424, "step": 9305 }, { "epoch": 2.457173023624126, "grad_norm": 0.1745966672897339, "learning_rate": 4.3610556670760524e-05, "loss": 0.1217, "num_input_tokens_seen": 8321840, "step": 9310 }, { "epoch": 2.4584928071796224, "grad_norm": 0.08775819838047028, "learning_rate": 4.360400001683657e-05, "loss": 0.1181, "num_input_tokens_seen": 8326480, "step": 9315 }, { "epoch": 2.4598125907351194, "grad_norm": 0.1912098079919815, "learning_rate": 4.3597440493942e-05, "loss": 0.0686, "num_input_tokens_seen": 8331024, "step": 9320 }, { "epoch": 2.4611323742906164, "grad_norm": 0.11903802305459976, "learning_rate": 4.3590878103088405e-05, "loss": 0.0584, "num_input_tokens_seen": 8335664, "step": 9325 }, { "epoch": 2.4624521578461134, "grad_norm": 0.09403789043426514, "learning_rate": 4.358431284528779e-05, "loss": 0.0572, "num_input_tokens_seen": 8340016, "step": 9330 }, { "epoch": 2.4637719414016104, "grad_norm": 0.0589064322412014, "learning_rate": 4.357774472155257e-05, "loss": 0.0876, "num_input_tokens_seen": 8344368, "step": 9335 }, { "epoch": 2.465091724957107, "grad_norm": 0.09838778525590897, "learning_rate": 4.3571173732895664e-05, "loss": 0.0298, "num_input_tokens_seen": 8348592, "step": 9340 }, { "epoch": 2.466411508512604, "grad_norm": 0.11654862016439438, "learning_rate": 4.356459988033039e-05, "loss": 0.0518, "num_input_tokens_seen": 8352880, "step": 9345 }, { "epoch": 2.467731292068101, "grad_norm": 0.2738012373447418, "learning_rate": 4.355802316487051e-05, "loss": 0.0729, "num_input_tokens_seen": 8357360, "step": 9350 }, { "epoch": 2.4690510756235975, "grad_norm": 0.6542018055915833, "learning_rate": 4.355144358753025e-05, "loss": 0.0703, "num_input_tokens_seen": 8361840, "step": 9355 }, { "epoch": 2.4703708591790945, "grad_norm": 0.05664927139878273, "learning_rate": 4.354486114932425e-05, "loss": 0.0481, "num_input_tokens_seen": 8366128, "step": 9360 }, { "epoch": 2.4716906427345915, "grad_norm": 0.028456710278987885, "learning_rate": 4.353827585126762e-05, "loss": 0.048, "num_input_tokens_seen": 8370864, "step": 9365 }, { "epoch": 2.4730104262900885, "grad_norm": 0.12671597301959991, "learning_rate": 4.353168769437588e-05, "loss": 0.0378, "num_input_tokens_seen": 8375440, "step": 9370 }, { "epoch": 2.4743302098455855, "grad_norm": 0.20276258885860443, "learning_rate": 4.3525096679665014e-05, "loss": 0.0683, "num_input_tokens_seen": 8379856, "step": 9375 }, { "epoch": 2.475649993401082, "grad_norm": 0.2866382896900177, "learning_rate": 4.351850280815144e-05, "loss": 0.054, "num_input_tokens_seen": 8384144, "step": 9380 }, { "epoch": 2.476969776956579, "grad_norm": 0.23208369314670563, "learning_rate": 4.3511906080852014e-05, "loss": 0.0728, "num_input_tokens_seen": 8388592, "step": 9385 }, { "epoch": 2.478289560512076, "grad_norm": 0.1869475096464157, "learning_rate": 4.350530649878404e-05, "loss": 0.0681, "num_input_tokens_seen": 8393136, "step": 9390 }, { "epoch": 2.479609344067573, "grad_norm": 0.24110490083694458, "learning_rate": 4.3498704062965246e-05, "loss": 0.0608, "num_input_tokens_seen": 8397648, "step": 9395 }, { "epoch": 2.48092912762307, "grad_norm": 0.027560560032725334, "learning_rate": 4.3492098774413815e-05, "loss": 0.0712, "num_input_tokens_seen": 8402448, "step": 9400 }, { "epoch": 2.48092912762307, "eval_loss": 0.07444201409816742, "eval_runtime": 64.7265, "eval_samples_per_second": 104.053, "eval_steps_per_second": 26.017, "num_input_tokens_seen": 8402448, "step": 9400 }, { "epoch": 2.4822489111785666, "grad_norm": 0.32648807764053345, "learning_rate": 4.3485490634148375e-05, "loss": 0.0951, "num_input_tokens_seen": 8406704, "step": 9405 }, { "epoch": 2.4835686947340636, "grad_norm": 0.3031865060329437, "learning_rate": 4.347887964318797e-05, "loss": 0.0696, "num_input_tokens_seen": 8411312, "step": 9410 }, { "epoch": 2.4848884782895606, "grad_norm": 0.16181257367134094, "learning_rate": 4.34722658025521e-05, "loss": 0.055, "num_input_tokens_seen": 8415472, "step": 9415 }, { "epoch": 2.486208261845057, "grad_norm": 0.24239273369312286, "learning_rate": 4.346564911326071e-05, "loss": 0.0513, "num_input_tokens_seen": 8419952, "step": 9420 }, { "epoch": 2.487528045400554, "grad_norm": 0.3218967616558075, "learning_rate": 4.345902957633418e-05, "loss": 0.1117, "num_input_tokens_seen": 8424688, "step": 9425 }, { "epoch": 2.488847828956051, "grad_norm": 0.21964791417121887, "learning_rate": 4.345240719279331e-05, "loss": 0.0942, "num_input_tokens_seen": 8429168, "step": 9430 }, { "epoch": 2.490167612511548, "grad_norm": 0.3412536680698395, "learning_rate": 4.3445781963659374e-05, "loss": 0.0482, "num_input_tokens_seen": 8434032, "step": 9435 }, { "epoch": 2.491487396067045, "grad_norm": 0.3344704806804657, "learning_rate": 4.3439153889954045e-05, "loss": 0.0828, "num_input_tokens_seen": 8438256, "step": 9440 }, { "epoch": 2.4928071796225417, "grad_norm": 0.09304055571556091, "learning_rate": 4.343252297269946e-05, "loss": 0.0672, "num_input_tokens_seen": 8443152, "step": 9445 }, { "epoch": 2.4941269631780387, "grad_norm": 0.14669226109981537, "learning_rate": 4.342588921291821e-05, "loss": 0.0399, "num_input_tokens_seen": 8447728, "step": 9450 }, { "epoch": 2.4954467467335357, "grad_norm": 0.12506449222564697, "learning_rate": 4.341925261163328e-05, "loss": 0.033, "num_input_tokens_seen": 8452208, "step": 9455 }, { "epoch": 2.4967665302890327, "grad_norm": 0.26568660140037537, "learning_rate": 4.341261316986813e-05, "loss": 0.0618, "num_input_tokens_seen": 8456912, "step": 9460 }, { "epoch": 2.4980863138445297, "grad_norm": 0.3317663073539734, "learning_rate": 4.340597088864664e-05, "loss": 0.0626, "num_input_tokens_seen": 8461136, "step": 9465 }, { "epoch": 2.4994060974000263, "grad_norm": 0.22316816449165344, "learning_rate": 4.339932576899313e-05, "loss": 0.053, "num_input_tokens_seen": 8465296, "step": 9470 }, { "epoch": 2.5007258809555233, "grad_norm": 0.3642165958881378, "learning_rate": 4.3392677811932375e-05, "loss": 0.0496, "num_input_tokens_seen": 8469584, "step": 9475 }, { "epoch": 2.5020456645110203, "grad_norm": 0.22838711738586426, "learning_rate": 4.338602701848956e-05, "loss": 0.0402, "num_input_tokens_seen": 8474352, "step": 9480 }, { "epoch": 2.503365448066517, "grad_norm": 0.12118610739707947, "learning_rate": 4.337937338969033e-05, "loss": 0.075, "num_input_tokens_seen": 8478768, "step": 9485 }, { "epoch": 2.504685231622014, "grad_norm": 0.535918116569519, "learning_rate": 4.337271692656075e-05, "loss": 0.1302, "num_input_tokens_seen": 8483120, "step": 9490 }, { "epoch": 2.506005015177511, "grad_norm": 0.05672985315322876, "learning_rate": 4.336605763012733e-05, "loss": 0.0632, "num_input_tokens_seen": 8487440, "step": 9495 }, { "epoch": 2.507324798733008, "grad_norm": 0.4549371302127838, "learning_rate": 4.3359395501417026e-05, "loss": 0.0958, "num_input_tokens_seen": 8491696, "step": 9500 }, { "epoch": 2.508644582288505, "grad_norm": 0.05192483216524124, "learning_rate": 4.335273054145722e-05, "loss": 0.0716, "num_input_tokens_seen": 8496176, "step": 9505 }, { "epoch": 2.5099643658440014, "grad_norm": 0.331556111574173, "learning_rate": 4.334606275127572e-05, "loss": 0.0415, "num_input_tokens_seen": 8501008, "step": 9510 }, { "epoch": 2.5112841493994984, "grad_norm": 0.18953146040439606, "learning_rate": 4.33393921319008e-05, "loss": 0.0461, "num_input_tokens_seen": 8505424, "step": 9515 }, { "epoch": 2.5126039329549954, "grad_norm": 0.13035956025123596, "learning_rate": 4.3332718684361146e-05, "loss": 0.0591, "num_input_tokens_seen": 8510064, "step": 9520 }, { "epoch": 2.5139237165104924, "grad_norm": 0.15347784757614136, "learning_rate": 4.332604240968588e-05, "loss": 0.0849, "num_input_tokens_seen": 8514384, "step": 9525 }, { "epoch": 2.5152435000659894, "grad_norm": 0.2641039788722992, "learning_rate": 4.331936330890459e-05, "loss": 0.0552, "num_input_tokens_seen": 8518864, "step": 9530 }, { "epoch": 2.516563283621486, "grad_norm": 0.25328975915908813, "learning_rate": 4.331268138304725e-05, "loss": 0.042, "num_input_tokens_seen": 8523088, "step": 9535 }, { "epoch": 2.517883067176983, "grad_norm": 0.24716410040855408, "learning_rate": 4.330599663314431e-05, "loss": 0.0485, "num_input_tokens_seen": 8527568, "step": 9540 }, { "epoch": 2.51920285073248, "grad_norm": 0.4119676351547241, "learning_rate": 4.329930906022665e-05, "loss": 0.0653, "num_input_tokens_seen": 8531728, "step": 9545 }, { "epoch": 2.5205226342879765, "grad_norm": 0.1101699247956276, "learning_rate": 4.3292618665325564e-05, "loss": 0.0725, "num_input_tokens_seen": 8536688, "step": 9550 }, { "epoch": 2.5218424178434735, "grad_norm": 0.08412696421146393, "learning_rate": 4.3285925449472796e-05, "loss": 0.0206, "num_input_tokens_seen": 8541296, "step": 9555 }, { "epoch": 2.5231622013989705, "grad_norm": 0.31384074687957764, "learning_rate": 4.327922941370054e-05, "loss": 0.0771, "num_input_tokens_seen": 8545776, "step": 9560 }, { "epoch": 2.5244819849544675, "grad_norm": 0.27171245217323303, "learning_rate": 4.3272530559041384e-05, "loss": 0.0534, "num_input_tokens_seen": 8550288, "step": 9565 }, { "epoch": 2.5258017685099645, "grad_norm": 0.1897088885307312, "learning_rate": 4.32658288865284e-05, "loss": 0.0429, "num_input_tokens_seen": 8554704, "step": 9570 }, { "epoch": 2.527121552065461, "grad_norm": 0.49644559621810913, "learning_rate": 4.325912439719505e-05, "loss": 0.1047, "num_input_tokens_seen": 8559120, "step": 9575 }, { "epoch": 2.528441335620958, "grad_norm": 0.19385695457458496, "learning_rate": 4.3252417092075266e-05, "loss": 0.0575, "num_input_tokens_seen": 8563632, "step": 9580 }, { "epoch": 2.529761119176455, "grad_norm": 0.4870379865169525, "learning_rate": 4.3245706972203385e-05, "loss": 0.0996, "num_input_tokens_seen": 8568304, "step": 9585 }, { "epoch": 2.531080902731952, "grad_norm": 0.5403352975845337, "learning_rate": 4.323899403861421e-05, "loss": 0.0771, "num_input_tokens_seen": 8572976, "step": 9590 }, { "epoch": 2.532400686287449, "grad_norm": 0.13458994030952454, "learning_rate": 4.3232278292342935e-05, "loss": 0.1178, "num_input_tokens_seen": 8577200, "step": 9595 }, { "epoch": 2.5337204698429456, "grad_norm": 0.40126243233680725, "learning_rate": 4.322555973442524e-05, "loss": 0.081, "num_input_tokens_seen": 8581936, "step": 9600 }, { "epoch": 2.5337204698429456, "eval_loss": 0.07336914539337158, "eval_runtime": 64.792, "eval_samples_per_second": 103.948, "eval_steps_per_second": 25.991, "num_input_tokens_seen": 8581936, "step": 9600 }, { "epoch": 2.5350402533984426, "grad_norm": 0.3336700201034546, "learning_rate": 4.3218838365897184e-05, "loss": 0.1019, "num_input_tokens_seen": 8586352, "step": 9605 }, { "epoch": 2.5363600369539396, "grad_norm": 0.23010782897472382, "learning_rate": 4.3212114187795306e-05, "loss": 0.0668, "num_input_tokens_seen": 8590640, "step": 9610 }, { "epoch": 2.537679820509436, "grad_norm": 0.18952596187591553, "learning_rate": 4.320538720115656e-05, "loss": 0.0636, "num_input_tokens_seen": 8595152, "step": 9615 }, { "epoch": 2.538999604064933, "grad_norm": 0.07515153288841248, "learning_rate": 4.319865740701831e-05, "loss": 0.1215, "num_input_tokens_seen": 8599344, "step": 9620 }, { "epoch": 2.54031938762043, "grad_norm": 0.11823656409978867, "learning_rate": 4.3191924806418396e-05, "loss": 0.0415, "num_input_tokens_seen": 8604112, "step": 9625 }, { "epoch": 2.541639171175927, "grad_norm": 0.08906187117099762, "learning_rate": 4.318518940039507e-05, "loss": 0.0818, "num_input_tokens_seen": 8608944, "step": 9630 }, { "epoch": 2.542958954731424, "grad_norm": 0.22747233510017395, "learning_rate": 4.3178451189987e-05, "loss": 0.0511, "num_input_tokens_seen": 8613552, "step": 9635 }, { "epoch": 2.5442787382869207, "grad_norm": 0.1265621781349182, "learning_rate": 4.3171710176233315e-05, "loss": 0.0562, "num_input_tokens_seen": 8618064, "step": 9640 }, { "epoch": 2.5455985218424177, "grad_norm": 0.3723251223564148, "learning_rate": 4.316496636017355e-05, "loss": 0.0757, "num_input_tokens_seen": 8622256, "step": 9645 }, { "epoch": 2.5469183053979148, "grad_norm": 0.11644499003887177, "learning_rate": 4.315821974284771e-05, "loss": 0.0515, "num_input_tokens_seen": 8626704, "step": 9650 }, { "epoch": 2.5482380889534118, "grad_norm": 0.4974287450313568, "learning_rate": 4.315147032529619e-05, "loss": 0.1267, "num_input_tokens_seen": 8631504, "step": 9655 }, { "epoch": 2.5495578725089088, "grad_norm": 0.37407687306404114, "learning_rate": 4.3144718108559845e-05, "loss": 0.0704, "num_input_tokens_seen": 8636016, "step": 9660 }, { "epoch": 2.5508776560644053, "grad_norm": 0.14029084146022797, "learning_rate": 4.3137963093679945e-05, "loss": 0.0557, "num_input_tokens_seen": 8640688, "step": 9665 }, { "epoch": 2.5521974396199023, "grad_norm": 0.3493426740169525, "learning_rate": 4.31312052816982e-05, "loss": 0.126, "num_input_tokens_seen": 8644976, "step": 9670 }, { "epoch": 2.5535172231753993, "grad_norm": 0.20684921741485596, "learning_rate": 4.312444467365675e-05, "loss": 0.1218, "num_input_tokens_seen": 8649232, "step": 9675 }, { "epoch": 2.554837006730896, "grad_norm": 0.5462194681167603, "learning_rate": 4.311768127059816e-05, "loss": 0.0774, "num_input_tokens_seen": 8653744, "step": 9680 }, { "epoch": 2.556156790286393, "grad_norm": 0.1233847364783287, "learning_rate": 4.3110915073565444e-05, "loss": 0.0567, "num_input_tokens_seen": 8658384, "step": 9685 }, { "epoch": 2.55747657384189, "grad_norm": 0.13533592224121094, "learning_rate": 4.310414608360203e-05, "loss": 0.0652, "num_input_tokens_seen": 8662704, "step": 9690 }, { "epoch": 2.558796357397387, "grad_norm": 0.2691023051738739, "learning_rate": 4.309737430175177e-05, "loss": 0.1036, "num_input_tokens_seen": 8667216, "step": 9695 }, { "epoch": 2.560116140952884, "grad_norm": 0.20291408896446228, "learning_rate": 4.309059972905897e-05, "loss": 0.0677, "num_input_tokens_seen": 8671760, "step": 9700 }, { "epoch": 2.5614359245083804, "grad_norm": 0.03975309431552887, "learning_rate": 4.308382236656836e-05, "loss": 0.0301, "num_input_tokens_seen": 8676304, "step": 9705 }, { "epoch": 2.5627557080638774, "grad_norm": 0.3087990880012512, "learning_rate": 4.307704221532507e-05, "loss": 0.0604, "num_input_tokens_seen": 8680880, "step": 9710 }, { "epoch": 2.5640754916193744, "grad_norm": 0.22949576377868652, "learning_rate": 4.307025927637471e-05, "loss": 0.074, "num_input_tokens_seen": 8685520, "step": 9715 }, { "epoch": 2.5653952751748714, "grad_norm": 0.27851688861846924, "learning_rate": 4.306347355076328e-05, "loss": 0.0763, "num_input_tokens_seen": 8690224, "step": 9720 }, { "epoch": 2.5667150587303684, "grad_norm": 0.13673822581768036, "learning_rate": 4.305668503953724e-05, "loss": 0.0412, "num_input_tokens_seen": 8694736, "step": 9725 }, { "epoch": 2.568034842285865, "grad_norm": 0.08895297348499298, "learning_rate": 4.3049893743743436e-05, "loss": 0.0639, "num_input_tokens_seen": 8699568, "step": 9730 }, { "epoch": 2.569354625841362, "grad_norm": 0.13384582102298737, "learning_rate": 4.304309966442919e-05, "loss": 0.058, "num_input_tokens_seen": 8703920, "step": 9735 }, { "epoch": 2.570674409396859, "grad_norm": 0.24064113199710846, "learning_rate": 4.303630280264224e-05, "loss": 0.0577, "num_input_tokens_seen": 8708400, "step": 9740 }, { "epoch": 2.5719941929523555, "grad_norm": 0.03973736986517906, "learning_rate": 4.302950315943074e-05, "loss": 0.0243, "num_input_tokens_seen": 8713136, "step": 9745 }, { "epoch": 2.573313976507853, "grad_norm": 0.1885768324136734, "learning_rate": 4.3022700735843275e-05, "loss": 0.0614, "num_input_tokens_seen": 8717392, "step": 9750 }, { "epoch": 2.5746337600633495, "grad_norm": 0.3195228576660156, "learning_rate": 4.301589553292887e-05, "loss": 0.1086, "num_input_tokens_seen": 8721936, "step": 9755 }, { "epoch": 2.5759535436188465, "grad_norm": 0.3497832417488098, "learning_rate": 4.300908755173697e-05, "loss": 0.0424, "num_input_tokens_seen": 8726704, "step": 9760 }, { "epoch": 2.5772733271743435, "grad_norm": 0.3529016375541687, "learning_rate": 4.300227679331745e-05, "loss": 0.047, "num_input_tokens_seen": 8731024, "step": 9765 }, { "epoch": 2.57859311072984, "grad_norm": 0.16406171023845673, "learning_rate": 4.299546325872063e-05, "loss": 0.0841, "num_input_tokens_seen": 8735504, "step": 9770 }, { "epoch": 2.579912894285337, "grad_norm": 0.14302510023117065, "learning_rate": 4.2988646948997225e-05, "loss": 0.0399, "num_input_tokens_seen": 8739760, "step": 9775 }, { "epoch": 2.581232677840834, "grad_norm": 0.198823481798172, "learning_rate": 4.29818278651984e-05, "loss": 0.0929, "num_input_tokens_seen": 8744208, "step": 9780 }, { "epoch": 2.582552461396331, "grad_norm": 0.32735323905944824, "learning_rate": 4.297500600837574e-05, "loss": 0.1111, "num_input_tokens_seen": 8748592, "step": 9785 }, { "epoch": 2.583872244951828, "grad_norm": 0.6087290644645691, "learning_rate": 4.2968181379581276e-05, "loss": 0.0585, "num_input_tokens_seen": 8753072, "step": 9790 }, { "epoch": 2.5851920285073247, "grad_norm": 0.18827207386493683, "learning_rate": 4.296135397986743e-05, "loss": 0.0906, "num_input_tokens_seen": 8757648, "step": 9795 }, { "epoch": 2.5865118120628217, "grad_norm": 0.24756808578968048, "learning_rate": 4.295452381028709e-05, "loss": 0.0724, "num_input_tokens_seen": 8762128, "step": 9800 }, { "epoch": 2.5865118120628217, "eval_loss": 0.07314550876617432, "eval_runtime": 64.7604, "eval_samples_per_second": 103.999, "eval_steps_per_second": 26.004, "num_input_tokens_seen": 8762128, "step": 9800 }, { "epoch": 2.5878315956183187, "grad_norm": 0.31770050525665283, "learning_rate": 4.294769087189354e-05, "loss": 0.0596, "num_input_tokens_seen": 8766416, "step": 9805 }, { "epoch": 2.5891513791738157, "grad_norm": 0.44346433877944946, "learning_rate": 4.294085516574052e-05, "loss": 0.1309, "num_input_tokens_seen": 8770928, "step": 9810 }, { "epoch": 2.5904711627293127, "grad_norm": 0.1290636658668518, "learning_rate": 4.2934016692882176e-05, "loss": 0.0487, "num_input_tokens_seen": 8775248, "step": 9815 }, { "epoch": 2.591790946284809, "grad_norm": 0.24320003390312195, "learning_rate": 4.292717545437308e-05, "loss": 0.0883, "num_input_tokens_seen": 8779728, "step": 9820 }, { "epoch": 2.593110729840306, "grad_norm": 0.4271619915962219, "learning_rate": 4.292033145126825e-05, "loss": 0.0989, "num_input_tokens_seen": 8784208, "step": 9825 }, { "epoch": 2.594430513395803, "grad_norm": 0.07138428837060928, "learning_rate": 4.29134846846231e-05, "loss": 0.0818, "num_input_tokens_seen": 8788688, "step": 9830 }, { "epoch": 2.5957502969512998, "grad_norm": 0.19680219888687134, "learning_rate": 4.29066351554935e-05, "loss": 0.0691, "num_input_tokens_seen": 8793008, "step": 9835 }, { "epoch": 2.5970700805067968, "grad_norm": 0.1529364138841629, "learning_rate": 4.289978286493574e-05, "loss": 0.0556, "num_input_tokens_seen": 8797424, "step": 9840 }, { "epoch": 2.5983898640622938, "grad_norm": 0.334871381521225, "learning_rate": 4.28929278140065e-05, "loss": 0.0688, "num_input_tokens_seen": 8802192, "step": 9845 }, { "epoch": 2.5997096476177908, "grad_norm": 0.3508867621421814, "learning_rate": 4.288607000376295e-05, "loss": 0.0822, "num_input_tokens_seen": 8806576, "step": 9850 }, { "epoch": 2.6010294311732878, "grad_norm": 0.38575175404548645, "learning_rate": 4.2879209435262624e-05, "loss": 0.1316, "num_input_tokens_seen": 8810896, "step": 9855 }, { "epoch": 2.6023492147287843, "grad_norm": 0.20000772178173065, "learning_rate": 4.287234610956353e-05, "loss": 0.0497, "num_input_tokens_seen": 8815152, "step": 9860 }, { "epoch": 2.6036689982842813, "grad_norm": 0.15364022552967072, "learning_rate": 4.2865480027724056e-05, "loss": 0.0301, "num_input_tokens_seen": 8819440, "step": 9865 }, { "epoch": 2.6049887818397783, "grad_norm": 0.20096606016159058, "learning_rate": 4.285861119080306e-05, "loss": 0.0373, "num_input_tokens_seen": 8824048, "step": 9870 }, { "epoch": 2.6063085653952753, "grad_norm": 0.09594684094190598, "learning_rate": 4.2851739599859784e-05, "loss": 0.0573, "num_input_tokens_seen": 8828688, "step": 9875 }, { "epoch": 2.6076283489507723, "grad_norm": 0.24546343088150024, "learning_rate": 4.2844865255953934e-05, "loss": 0.0587, "num_input_tokens_seen": 8833360, "step": 9880 }, { "epoch": 2.608948132506269, "grad_norm": 0.3628990054130554, "learning_rate": 4.2837988160145605e-05, "loss": 0.0817, "num_input_tokens_seen": 8837744, "step": 9885 }, { "epoch": 2.610267916061766, "grad_norm": 0.2777470350265503, "learning_rate": 4.2831108313495336e-05, "loss": 0.0787, "num_input_tokens_seen": 8842256, "step": 9890 }, { "epoch": 2.611587699617263, "grad_norm": 0.36594706773757935, "learning_rate": 4.282422571706408e-05, "loss": 0.0772, "num_input_tokens_seen": 8846672, "step": 9895 }, { "epoch": 2.6129074831727594, "grad_norm": 0.10054849088191986, "learning_rate": 4.281734037191323e-05, "loss": 0.082, "num_input_tokens_seen": 8851216, "step": 9900 }, { "epoch": 2.6142272667282564, "grad_norm": 0.3801535964012146, "learning_rate": 4.281045227910459e-05, "loss": 0.0376, "num_input_tokens_seen": 8855408, "step": 9905 }, { "epoch": 2.6155470502837534, "grad_norm": 0.08858103305101395, "learning_rate": 4.280356143970038e-05, "loss": 0.0325, "num_input_tokens_seen": 8859696, "step": 9910 }, { "epoch": 2.6168668338392505, "grad_norm": 0.188070610165596, "learning_rate": 4.279666785476327e-05, "loss": 0.084, "num_input_tokens_seen": 8864176, "step": 9915 }, { "epoch": 2.6181866173947475, "grad_norm": 0.27322542667388916, "learning_rate": 4.2789771525356325e-05, "loss": 0.0686, "num_input_tokens_seen": 8868560, "step": 9920 }, { "epoch": 2.619506400950244, "grad_norm": 0.20771068334579468, "learning_rate": 4.2782872452543056e-05, "loss": 0.0302, "num_input_tokens_seen": 8873008, "step": 9925 }, { "epoch": 2.620826184505741, "grad_norm": 0.40944206714630127, "learning_rate": 4.2775970637387376e-05, "loss": 0.1286, "num_input_tokens_seen": 8877488, "step": 9930 }, { "epoch": 2.622145968061238, "grad_norm": 0.14011326432228088, "learning_rate": 4.276906608095363e-05, "loss": 0.0618, "num_input_tokens_seen": 8881680, "step": 9935 }, { "epoch": 2.623465751616735, "grad_norm": 0.0863652303814888, "learning_rate": 4.276215878430661e-05, "loss": 0.1036, "num_input_tokens_seen": 8886000, "step": 9940 }, { "epoch": 2.624785535172232, "grad_norm": 0.07602347433567047, "learning_rate": 4.275524874851149e-05, "loss": 0.0431, "num_input_tokens_seen": 8890480, "step": 9945 }, { "epoch": 2.6261053187277286, "grad_norm": 0.3011094033718109, "learning_rate": 4.274833597463388e-05, "loss": 0.0496, "num_input_tokens_seen": 8895088, "step": 9950 }, { "epoch": 2.6274251022832256, "grad_norm": 0.2365679293870926, "learning_rate": 4.2741420463739824e-05, "loss": 0.0391, "num_input_tokens_seen": 8899600, "step": 9955 }, { "epoch": 2.6287448858387226, "grad_norm": 0.2474764585494995, "learning_rate": 4.273450221689578e-05, "loss": 0.0476, "num_input_tokens_seen": 8903920, "step": 9960 }, { "epoch": 2.630064669394219, "grad_norm": 0.3711318373680115, "learning_rate": 4.272758123516863e-05, "loss": 0.0843, "num_input_tokens_seen": 8908464, "step": 9965 }, { "epoch": 2.631384452949716, "grad_norm": 0.22012832760810852, "learning_rate": 4.272065751962567e-05, "loss": 0.0939, "num_input_tokens_seen": 8912752, "step": 9970 }, { "epoch": 2.632704236505213, "grad_norm": 0.1438518613576889, "learning_rate": 4.271373107133464e-05, "loss": 0.0468, "num_input_tokens_seen": 8917360, "step": 9975 }, { "epoch": 2.63402402006071, "grad_norm": 0.07493764162063599, "learning_rate": 4.270680189136366e-05, "loss": 0.0982, "num_input_tokens_seen": 8921808, "step": 9980 }, { "epoch": 2.635343803616207, "grad_norm": 0.297380656003952, "learning_rate": 4.269986998078132e-05, "loss": 0.0626, "num_input_tokens_seen": 8926096, "step": 9985 }, { "epoch": 2.6366635871717037, "grad_norm": 0.19269724190235138, "learning_rate": 4.2692935340656595e-05, "loss": 0.0716, "num_input_tokens_seen": 8930800, "step": 9990 }, { "epoch": 2.6379833707272007, "grad_norm": 0.19477353990077972, "learning_rate": 4.26859979720589e-05, "loss": 0.1146, "num_input_tokens_seen": 8935152, "step": 9995 }, { "epoch": 2.6393031542826977, "grad_norm": 0.36513808369636536, "learning_rate": 4.267905787605806e-05, "loss": 0.1073, "num_input_tokens_seen": 8939600, "step": 10000 }, { "epoch": 2.6393031542826977, "eval_loss": 0.0731375515460968, "eval_runtime": 64.7622, "eval_samples_per_second": 103.996, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 8939600, "step": 10000 }, { "epoch": 2.6406229378381947, "grad_norm": 0.09440498799085617, "learning_rate": 4.267211505372433e-05, "loss": 0.0776, "num_input_tokens_seen": 8944016, "step": 10005 }, { "epoch": 2.6419427213936917, "grad_norm": 0.19922661781311035, "learning_rate": 4.266516950612837e-05, "loss": 0.0495, "num_input_tokens_seen": 8948080, "step": 10010 }, { "epoch": 2.6432625049491882, "grad_norm": 0.39145827293395996, "learning_rate": 4.265822123434128e-05, "loss": 0.1014, "num_input_tokens_seen": 8952400, "step": 10015 }, { "epoch": 2.6445822885046852, "grad_norm": 0.4069858491420746, "learning_rate": 4.265127023943457e-05, "loss": 0.1017, "num_input_tokens_seen": 8956912, "step": 10020 }, { "epoch": 2.6459020720601822, "grad_norm": 0.1430160403251648, "learning_rate": 4.2644316522480176e-05, "loss": 0.0376, "num_input_tokens_seen": 8961520, "step": 10025 }, { "epoch": 2.647221855615679, "grad_norm": 0.039314936846494675, "learning_rate": 4.263736008455044e-05, "loss": 0.037, "num_input_tokens_seen": 8965680, "step": 10030 }, { "epoch": 2.648541639171176, "grad_norm": 0.2309594452381134, "learning_rate": 4.2630400926718125e-05, "loss": 0.095, "num_input_tokens_seen": 8969776, "step": 10035 }, { "epoch": 2.649861422726673, "grad_norm": 0.47212281823158264, "learning_rate": 4.262343905005644e-05, "loss": 0.0684, "num_input_tokens_seen": 8974128, "step": 10040 }, { "epoch": 2.65118120628217, "grad_norm": 0.3808099031448364, "learning_rate": 4.261647445563897e-05, "loss": 0.0853, "num_input_tokens_seen": 8978320, "step": 10045 }, { "epoch": 2.652500989837667, "grad_norm": 0.33051803708076477, "learning_rate": 4.260950714453976e-05, "loss": 0.0384, "num_input_tokens_seen": 8982768, "step": 10050 }, { "epoch": 2.6538207733931634, "grad_norm": 0.16358062624931335, "learning_rate": 4.2602537117833266e-05, "loss": 0.0292, "num_input_tokens_seen": 8987536, "step": 10055 }, { "epoch": 2.6551405569486604, "grad_norm": 0.14569436013698578, "learning_rate": 4.259556437659433e-05, "loss": 0.0438, "num_input_tokens_seen": 8992144, "step": 10060 }, { "epoch": 2.6564603405041574, "grad_norm": 0.3683777451515198, "learning_rate": 4.258858892189825e-05, "loss": 0.1177, "num_input_tokens_seen": 8996752, "step": 10065 }, { "epoch": 2.6577801240596544, "grad_norm": 0.19314073026180267, "learning_rate": 4.2581610754820725e-05, "loss": 0.0433, "num_input_tokens_seen": 9001392, "step": 10070 }, { "epoch": 2.6590999076151514, "grad_norm": 0.2471987009048462, "learning_rate": 4.2574629876437876e-05, "loss": 0.066, "num_input_tokens_seen": 9005936, "step": 10075 }, { "epoch": 2.660419691170648, "grad_norm": 0.3301261067390442, "learning_rate": 4.256764628782625e-05, "loss": 0.0765, "num_input_tokens_seen": 9010640, "step": 10080 }, { "epoch": 2.661739474726145, "grad_norm": 0.18878726661205292, "learning_rate": 4.256065999006279e-05, "loss": 0.0296, "num_input_tokens_seen": 9015120, "step": 10085 }, { "epoch": 2.663059258281642, "grad_norm": 0.1538233757019043, "learning_rate": 4.2553670984224885e-05, "loss": 0.0453, "num_input_tokens_seen": 9019440, "step": 10090 }, { "epoch": 2.6643790418371385, "grad_norm": 0.13958032429218292, "learning_rate": 4.254667927139032e-05, "loss": 0.0499, "num_input_tokens_seen": 9023696, "step": 10095 }, { "epoch": 2.6656988253926355, "grad_norm": 0.10961255431175232, "learning_rate": 4.2539684852637295e-05, "loss": 0.027, "num_input_tokens_seen": 9027888, "step": 10100 }, { "epoch": 2.6670186089481325, "grad_norm": 0.10923238098621368, "learning_rate": 4.253268772904446e-05, "loss": 0.0326, "num_input_tokens_seen": 9032368, "step": 10105 }, { "epoch": 2.6683383925036295, "grad_norm": 0.34692806005477905, "learning_rate": 4.252568790169085e-05, "loss": 0.0935, "num_input_tokens_seen": 9036784, "step": 10110 }, { "epoch": 2.6696581760591265, "grad_norm": 0.13993752002716064, "learning_rate": 4.251868537165592e-05, "loss": 0.0485, "num_input_tokens_seen": 9041264, "step": 10115 }, { "epoch": 2.670977959614623, "grad_norm": 0.3688141703605652, "learning_rate": 4.251168014001955e-05, "loss": 0.0418, "num_input_tokens_seen": 9045872, "step": 10120 }, { "epoch": 2.67229774317012, "grad_norm": 0.09629227221012115, "learning_rate": 4.250467220786204e-05, "loss": 0.0648, "num_input_tokens_seen": 9050320, "step": 10125 }, { "epoch": 2.673617526725617, "grad_norm": 0.17815323173999786, "learning_rate": 4.249766157626409e-05, "loss": 0.0278, "num_input_tokens_seen": 9054960, "step": 10130 }, { "epoch": 2.674937310281114, "grad_norm": 0.1155419647693634, "learning_rate": 4.249064824630684e-05, "loss": 0.0499, "num_input_tokens_seen": 9059632, "step": 10135 }, { "epoch": 2.676257093836611, "grad_norm": 0.31764718890190125, "learning_rate": 4.248363221907183e-05, "loss": 0.0675, "num_input_tokens_seen": 9064368, "step": 10140 }, { "epoch": 2.6775768773921076, "grad_norm": 0.13647539913654327, "learning_rate": 4.2476613495641026e-05, "loss": 0.0554, "num_input_tokens_seen": 9068912, "step": 10145 }, { "epoch": 2.6788966609476046, "grad_norm": 0.37233930826187134, "learning_rate": 4.246959207709679e-05, "loss": 0.0485, "num_input_tokens_seen": 9073424, "step": 10150 }, { "epoch": 2.6802164445031016, "grad_norm": 0.056413568556308746, "learning_rate": 4.246256796452192e-05, "loss": 0.055, "num_input_tokens_seen": 9077680, "step": 10155 }, { "epoch": 2.681536228058598, "grad_norm": 0.2447153776884079, "learning_rate": 4.245554115899962e-05, "loss": 0.0899, "num_input_tokens_seen": 9081968, "step": 10160 }, { "epoch": 2.682856011614095, "grad_norm": 0.1946895271539688, "learning_rate": 4.2448511661613514e-05, "loss": 0.0297, "num_input_tokens_seen": 9086864, "step": 10165 }, { "epoch": 2.684175795169592, "grad_norm": 0.10948691517114639, "learning_rate": 4.2441479473447635e-05, "loss": 0.0602, "num_input_tokens_seen": 9091056, "step": 10170 }, { "epoch": 2.685495578725089, "grad_norm": 0.097696952521801, "learning_rate": 4.243444459558644e-05, "loss": 0.0592, "num_input_tokens_seen": 9095440, "step": 10175 }, { "epoch": 2.686815362280586, "grad_norm": 0.2656266987323761, "learning_rate": 4.24274070291148e-05, "loss": 0.063, "num_input_tokens_seen": 9099856, "step": 10180 }, { "epoch": 2.6881351458360827, "grad_norm": 0.0735374167561531, "learning_rate": 4.242036677511798e-05, "loss": 0.0613, "num_input_tokens_seen": 9103888, "step": 10185 }, { "epoch": 2.6894549293915797, "grad_norm": 0.10560992360115051, "learning_rate": 4.241332383468169e-05, "loss": 0.0527, "num_input_tokens_seen": 9108496, "step": 10190 }, { "epoch": 2.6907747129470767, "grad_norm": 0.04510555788874626, "learning_rate": 4.2406278208892034e-05, "loss": 0.0689, "num_input_tokens_seen": 9112752, "step": 10195 }, { "epoch": 2.6920944965025737, "grad_norm": 0.345093697309494, "learning_rate": 4.2399229898835536e-05, "loss": 0.0992, "num_input_tokens_seen": 9117424, "step": 10200 }, { "epoch": 2.6920944965025737, "eval_loss": 0.07295124977827072, "eval_runtime": 64.7565, "eval_samples_per_second": 104.005, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 9117424, "step": 10200 }, { "epoch": 2.6934142800580707, "grad_norm": 0.23191717267036438, "learning_rate": 4.239217890559914e-05, "loss": 0.0545, "num_input_tokens_seen": 9121488, "step": 10205 }, { "epoch": 2.6947340636135673, "grad_norm": 0.09339532256126404, "learning_rate": 4.238512523027019e-05, "loss": 0.0642, "num_input_tokens_seen": 9126288, "step": 10210 }, { "epoch": 2.6960538471690643, "grad_norm": 0.029894517734646797, "learning_rate": 4.237806887393645e-05, "loss": 0.0512, "num_input_tokens_seen": 9130768, "step": 10215 }, { "epoch": 2.6973736307245613, "grad_norm": 0.043799567967653275, "learning_rate": 4.237100983768611e-05, "loss": 0.0515, "num_input_tokens_seen": 9135440, "step": 10220 }, { "epoch": 2.698693414280058, "grad_norm": 0.14154568314552307, "learning_rate": 4.2363948122607756e-05, "loss": 0.0451, "num_input_tokens_seen": 9139984, "step": 10225 }, { "epoch": 2.700013197835555, "grad_norm": 0.1434115171432495, "learning_rate": 4.235688372979039e-05, "loss": 0.0427, "num_input_tokens_seen": 9144688, "step": 10230 }, { "epoch": 2.701332981391052, "grad_norm": 0.022823425009846687, "learning_rate": 4.234981666032343e-05, "loss": 0.0784, "num_input_tokens_seen": 9149488, "step": 10235 }, { "epoch": 2.702652764946549, "grad_norm": 0.29769209027290344, "learning_rate": 4.2342746915296704e-05, "loss": 0.131, "num_input_tokens_seen": 9154288, "step": 10240 }, { "epoch": 2.703972548502046, "grad_norm": 0.5064764022827148, "learning_rate": 4.233567449580047e-05, "loss": 0.1072, "num_input_tokens_seen": 9158960, "step": 10245 }, { "epoch": 2.7052923320575424, "grad_norm": 0.0794043019413948, "learning_rate": 4.232859940292537e-05, "loss": 0.057, "num_input_tokens_seen": 9163536, "step": 10250 }, { "epoch": 2.7066121156130394, "grad_norm": 0.53033047914505, "learning_rate": 4.232152163776248e-05, "loss": 0.091, "num_input_tokens_seen": 9168144, "step": 10255 }, { "epoch": 2.7079318991685364, "grad_norm": 0.1327446848154068, "learning_rate": 4.231444120140328e-05, "loss": 0.0602, "num_input_tokens_seen": 9172464, "step": 10260 }, { "epoch": 2.7092516827240334, "grad_norm": 0.23512215912342072, "learning_rate": 4.230735809493967e-05, "loss": 0.0504, "num_input_tokens_seen": 9177168, "step": 10265 }, { "epoch": 2.7105714662795304, "grad_norm": 0.3205755352973938, "learning_rate": 4.2300272319463926e-05, "loss": 0.1353, "num_input_tokens_seen": 9181616, "step": 10270 }, { "epoch": 2.711891249835027, "grad_norm": 0.2373356968164444, "learning_rate": 4.2293183876068786e-05, "loss": 0.0574, "num_input_tokens_seen": 9186160, "step": 10275 }, { "epoch": 2.713211033390524, "grad_norm": 0.25926467776298523, "learning_rate": 4.228609276584737e-05, "loss": 0.061, "num_input_tokens_seen": 9190864, "step": 10280 }, { "epoch": 2.714530816946021, "grad_norm": 0.07786808907985687, "learning_rate": 4.227899898989323e-05, "loss": 0.0357, "num_input_tokens_seen": 9195600, "step": 10285 }, { "epoch": 2.7158506005015175, "grad_norm": 0.0353267639875412, "learning_rate": 4.2271902549300293e-05, "loss": 0.0652, "num_input_tokens_seen": 9199984, "step": 10290 }, { "epoch": 2.7171703840570145, "grad_norm": 0.16221003234386444, "learning_rate": 4.226480344516294e-05, "loss": 0.0645, "num_input_tokens_seen": 9204624, "step": 10295 }, { "epoch": 2.7184901676125115, "grad_norm": 0.07511204481124878, "learning_rate": 4.2257701678575925e-05, "loss": 0.0465, "num_input_tokens_seen": 9209072, "step": 10300 }, { "epoch": 2.7198099511680085, "grad_norm": 0.05817507579922676, "learning_rate": 4.225059725063444e-05, "loss": 0.068, "num_input_tokens_seen": 9213776, "step": 10305 }, { "epoch": 2.7211297347235055, "grad_norm": 0.44674259424209595, "learning_rate": 4.2243490162434074e-05, "loss": 0.1059, "num_input_tokens_seen": 9218352, "step": 10310 }, { "epoch": 2.722449518279002, "grad_norm": 0.34451398253440857, "learning_rate": 4.223638041507083e-05, "loss": 0.0761, "num_input_tokens_seen": 9222768, "step": 10315 }, { "epoch": 2.723769301834499, "grad_norm": 0.47843116521835327, "learning_rate": 4.2229268009641124e-05, "loss": 0.1112, "num_input_tokens_seen": 9227344, "step": 10320 }, { "epoch": 2.725089085389996, "grad_norm": 0.20910105109214783, "learning_rate": 4.222215294724177e-05, "loss": 0.0669, "num_input_tokens_seen": 9231440, "step": 10325 }, { "epoch": 2.726408868945493, "grad_norm": 0.09851108491420746, "learning_rate": 4.2215035228970005e-05, "loss": 0.0573, "num_input_tokens_seen": 9235632, "step": 10330 }, { "epoch": 2.72772865250099, "grad_norm": 0.21298708021640778, "learning_rate": 4.2207914855923464e-05, "loss": 0.0802, "num_input_tokens_seen": 9240016, "step": 10335 }, { "epoch": 2.7290484360564866, "grad_norm": 0.19436372816562653, "learning_rate": 4.220079182920021e-05, "loss": 0.0507, "num_input_tokens_seen": 9244560, "step": 10340 }, { "epoch": 2.7303682196119836, "grad_norm": 0.3786255121231079, "learning_rate": 4.2193666149898705e-05, "loss": 0.0646, "num_input_tokens_seen": 9248912, "step": 10345 }, { "epoch": 2.7316880031674806, "grad_norm": 0.13327337801456451, "learning_rate": 4.21865378191178e-05, "loss": 0.0524, "num_input_tokens_seen": 9253648, "step": 10350 }, { "epoch": 2.733007786722977, "grad_norm": 0.24992167949676514, "learning_rate": 4.217940683795678e-05, "loss": 0.0552, "num_input_tokens_seen": 9258416, "step": 10355 }, { "epoch": 2.734327570278474, "grad_norm": 0.11601991206407547, "learning_rate": 4.217227320751534e-05, "loss": 0.0575, "num_input_tokens_seen": 9262736, "step": 10360 }, { "epoch": 2.735647353833971, "grad_norm": 0.3245983123779297, "learning_rate": 4.216513692889358e-05, "loss": 0.0578, "num_input_tokens_seen": 9267760, "step": 10365 }, { "epoch": 2.736967137389468, "grad_norm": 0.5606169700622559, "learning_rate": 4.215799800319199e-05, "loss": 0.0971, "num_input_tokens_seen": 9272240, "step": 10370 }, { "epoch": 2.738286920944965, "grad_norm": 0.10975401103496552, "learning_rate": 4.2150856431511485e-05, "loss": 0.0754, "num_input_tokens_seen": 9276720, "step": 10375 }, { "epoch": 2.7396067045004617, "grad_norm": 0.21813541650772095, "learning_rate": 4.214371221495339e-05, "loss": 0.0322, "num_input_tokens_seen": 9281424, "step": 10380 }, { "epoch": 2.7409264880559587, "grad_norm": 0.06182369589805603, "learning_rate": 4.213656535461942e-05, "loss": 0.0482, "num_input_tokens_seen": 9285840, "step": 10385 }, { "epoch": 2.7422462716114557, "grad_norm": 0.2782314419746399, "learning_rate": 4.2129415851611734e-05, "loss": 0.0998, "num_input_tokens_seen": 9290160, "step": 10390 }, { "epoch": 2.7435660551669527, "grad_norm": 0.3484974503517151, "learning_rate": 4.2122263707032855e-05, "loss": 0.1228, "num_input_tokens_seen": 9294576, "step": 10395 }, { "epoch": 2.7448858387224497, "grad_norm": 0.1426055133342743, "learning_rate": 4.211510892198574e-05, "loss": 0.0661, "num_input_tokens_seen": 9299120, "step": 10400 }, { "epoch": 2.7448858387224497, "eval_loss": 0.07247276604175568, "eval_runtime": 64.8426, "eval_samples_per_second": 103.867, "eval_steps_per_second": 25.971, "num_input_tokens_seen": 9299120, "step": 10400 }, { "epoch": 2.7462056222779463, "grad_norm": 0.27401527762413025, "learning_rate": 4.210795149757375e-05, "loss": 0.0899, "num_input_tokens_seen": 9303952, "step": 10405 }, { "epoch": 2.7475254058334433, "grad_norm": 0.0694328099489212, "learning_rate": 4.210079143490065e-05, "loss": 0.0472, "num_input_tokens_seen": 9308368, "step": 10410 }, { "epoch": 2.7488451893889403, "grad_norm": 0.21915288269519806, "learning_rate": 4.2093628735070604e-05, "loss": 0.1334, "num_input_tokens_seen": 9312432, "step": 10415 }, { "epoch": 2.750164972944437, "grad_norm": 0.33886176347732544, "learning_rate": 4.208646339918819e-05, "loss": 0.0487, "num_input_tokens_seen": 9316784, "step": 10420 }, { "epoch": 2.751484756499934, "grad_norm": 0.28061583638191223, "learning_rate": 4.2079295428358414e-05, "loss": 0.077, "num_input_tokens_seen": 9321488, "step": 10425 }, { "epoch": 2.752804540055431, "grad_norm": 0.23319807648658752, "learning_rate": 4.207212482368664e-05, "loss": 0.0493, "num_input_tokens_seen": 9325936, "step": 10430 }, { "epoch": 2.754124323610928, "grad_norm": 0.18096014857292175, "learning_rate": 4.206495158627867e-05, "loss": 0.0646, "num_input_tokens_seen": 9330256, "step": 10435 }, { "epoch": 2.755444107166425, "grad_norm": 0.2003723680973053, "learning_rate": 4.205777571724073e-05, "loss": 0.0592, "num_input_tokens_seen": 9334832, "step": 10440 }, { "epoch": 2.7567638907219214, "grad_norm": 0.04375162720680237, "learning_rate": 4.20505972176794e-05, "loss": 0.0748, "num_input_tokens_seen": 9339504, "step": 10445 }, { "epoch": 2.7580836742774184, "grad_norm": 0.21019849181175232, "learning_rate": 4.204341608870171e-05, "loss": 0.0544, "num_input_tokens_seen": 9343920, "step": 10450 }, { "epoch": 2.7594034578329154, "grad_norm": 0.12451523542404175, "learning_rate": 4.203623233141508e-05, "loss": 0.0465, "num_input_tokens_seen": 9348528, "step": 10455 }, { "epoch": 2.7607232413884124, "grad_norm": 0.12882545590400696, "learning_rate": 4.2029045946927334e-05, "loss": 0.057, "num_input_tokens_seen": 9353456, "step": 10460 }, { "epoch": 2.7620430249439094, "grad_norm": 0.2153378427028656, "learning_rate": 4.20218569363467e-05, "loss": 0.0772, "num_input_tokens_seen": 9357936, "step": 10465 }, { "epoch": 2.763362808499406, "grad_norm": 0.21429449319839478, "learning_rate": 4.2014665300781834e-05, "loss": 0.0486, "num_input_tokens_seen": 9362416, "step": 10470 }, { "epoch": 2.764682592054903, "grad_norm": 0.15040379762649536, "learning_rate": 4.200747104134174e-05, "loss": 0.0762, "num_input_tokens_seen": 9366736, "step": 10475 }, { "epoch": 2.7660023756104, "grad_norm": 0.3279176354408264, "learning_rate": 4.200027415913588e-05, "loss": 0.0894, "num_input_tokens_seen": 9371024, "step": 10480 }, { "epoch": 2.7673221591658965, "grad_norm": 0.08205234259366989, "learning_rate": 4.1993074655274126e-05, "loss": 0.0246, "num_input_tokens_seen": 9375728, "step": 10485 }, { "epoch": 2.768641942721394, "grad_norm": 0.10314976423978806, "learning_rate": 4.198587253086669e-05, "loss": 0.0515, "num_input_tokens_seen": 9379920, "step": 10490 }, { "epoch": 2.7699617262768905, "grad_norm": 0.3758162558078766, "learning_rate": 4.197866778702426e-05, "loss": 0.1241, "num_input_tokens_seen": 9384560, "step": 10495 }, { "epoch": 2.7712815098323875, "grad_norm": 0.2838219106197357, "learning_rate": 4.197146042485789e-05, "loss": 0.0438, "num_input_tokens_seen": 9388816, "step": 10500 }, { "epoch": 2.7726012933878845, "grad_norm": 0.15853995084762573, "learning_rate": 4.1964250445479046e-05, "loss": 0.1027, "num_input_tokens_seen": 9393584, "step": 10505 }, { "epoch": 2.773921076943381, "grad_norm": 0.21835008263587952, "learning_rate": 4.19570378499996e-05, "loss": 0.0512, "num_input_tokens_seen": 9397968, "step": 10510 }, { "epoch": 2.775240860498878, "grad_norm": 0.14547650516033173, "learning_rate": 4.194982263953182e-05, "loss": 0.059, "num_input_tokens_seen": 9402768, "step": 10515 }, { "epoch": 2.776560644054375, "grad_norm": 0.2022438496351242, "learning_rate": 4.194260481518838e-05, "loss": 0.0792, "num_input_tokens_seen": 9407248, "step": 10520 }, { "epoch": 2.777880427609872, "grad_norm": 0.1448986828327179, "learning_rate": 4.1935384378082366e-05, "loss": 0.0976, "num_input_tokens_seen": 9411600, "step": 10525 }, { "epoch": 2.779200211165369, "grad_norm": 0.4270556569099426, "learning_rate": 4.1928161329327267e-05, "loss": 0.0912, "num_input_tokens_seen": 9415952, "step": 10530 }, { "epoch": 2.7805199947208656, "grad_norm": 0.4278147518634796, "learning_rate": 4.1920935670036945e-05, "loss": 0.0856, "num_input_tokens_seen": 9420848, "step": 10535 }, { "epoch": 2.7818397782763626, "grad_norm": 0.23939765989780426, "learning_rate": 4.1913707401325705e-05, "loss": 0.072, "num_input_tokens_seen": 9425072, "step": 10540 }, { "epoch": 2.7831595618318596, "grad_norm": 0.18642324209213257, "learning_rate": 4.1906476524308235e-05, "loss": 0.0732, "num_input_tokens_seen": 9429008, "step": 10545 }, { "epoch": 2.784479345387356, "grad_norm": 0.26902902126312256, "learning_rate": 4.189924304009962e-05, "loss": 0.0592, "num_input_tokens_seen": 9433456, "step": 10550 }, { "epoch": 2.7857991289428536, "grad_norm": 0.33873802423477173, "learning_rate": 4.189200694981537e-05, "loss": 0.0751, "num_input_tokens_seen": 9437680, "step": 10555 }, { "epoch": 2.78711891249835, "grad_norm": 0.13564440608024597, "learning_rate": 4.188476825457136e-05, "loss": 0.04, "num_input_tokens_seen": 9442448, "step": 10560 }, { "epoch": 2.788438696053847, "grad_norm": 0.11616267263889313, "learning_rate": 4.18775269554839e-05, "loss": 0.0516, "num_input_tokens_seen": 9446704, "step": 10565 }, { "epoch": 2.789758479609344, "grad_norm": 0.39329490065574646, "learning_rate": 4.187028305366969e-05, "loss": 0.1252, "num_input_tokens_seen": 9451312, "step": 10570 }, { "epoch": 2.7910782631648408, "grad_norm": 0.2904492914676666, "learning_rate": 4.1863036550245824e-05, "loss": 0.0703, "num_input_tokens_seen": 9456016, "step": 10575 }, { "epoch": 2.7923980467203378, "grad_norm": 0.26169899106025696, "learning_rate": 4.1855787446329806e-05, "loss": 0.0539, "num_input_tokens_seen": 9460368, "step": 10580 }, { "epoch": 2.7937178302758348, "grad_norm": 0.20216087996959686, "learning_rate": 4.184853574303955e-05, "loss": 0.0676, "num_input_tokens_seen": 9464816, "step": 10585 }, { "epoch": 2.7950376138313318, "grad_norm": 0.1291237324476242, "learning_rate": 4.184128144149334e-05, "loss": 0.0827, "num_input_tokens_seen": 9469200, "step": 10590 }, { "epoch": 2.7963573973868288, "grad_norm": 0.5450581908226013, "learning_rate": 4.1834024542809896e-05, "loss": 0.0704, "num_input_tokens_seen": 9473872, "step": 10595 }, { "epoch": 2.7976771809423253, "grad_norm": 0.15312978625297546, "learning_rate": 4.1826765048108315e-05, "loss": 0.0533, "num_input_tokens_seen": 9477968, "step": 10600 }, { "epoch": 2.7976771809423253, "eval_loss": 0.07258456945419312, "eval_runtime": 64.7758, "eval_samples_per_second": 103.974, "eval_steps_per_second": 25.997, "num_input_tokens_seen": 9477968, "step": 10600 }, { "epoch": 2.7989969644978223, "grad_norm": 0.23226916790008545, "learning_rate": 4.181950295850811e-05, "loss": 0.0511, "num_input_tokens_seen": 9482288, "step": 10605 }, { "epoch": 2.8003167480533193, "grad_norm": 0.2744671404361725, "learning_rate": 4.181223827512918e-05, "loss": 0.0809, "num_input_tokens_seen": 9486704, "step": 10610 }, { "epoch": 2.8016365316088163, "grad_norm": 0.11475303769111633, "learning_rate": 4.180497099909183e-05, "loss": 0.0279, "num_input_tokens_seen": 9491440, "step": 10615 }, { "epoch": 2.8029563151643133, "grad_norm": 0.4354880154132843, "learning_rate": 4.179770113151677e-05, "loss": 0.0873, "num_input_tokens_seen": 9496304, "step": 10620 }, { "epoch": 2.80427609871981, "grad_norm": 0.20914198458194733, "learning_rate": 4.179042867352511e-05, "loss": 0.0577, "num_input_tokens_seen": 9500528, "step": 10625 }, { "epoch": 2.805595882275307, "grad_norm": 0.22657379508018494, "learning_rate": 4.1783153626238334e-05, "loss": 0.0642, "num_input_tokens_seen": 9504944, "step": 10630 }, { "epoch": 2.806915665830804, "grad_norm": 0.24687549471855164, "learning_rate": 4.177587599077836e-05, "loss": 0.0455, "num_input_tokens_seen": 9509264, "step": 10635 }, { "epoch": 2.8082354493863004, "grad_norm": 0.3574235141277313, "learning_rate": 4.1768595768267494e-05, "loss": 0.0541, "num_input_tokens_seen": 9513744, "step": 10640 }, { "epoch": 2.8095552329417974, "grad_norm": 0.08993898332118988, "learning_rate": 4.176131295982843e-05, "loss": 0.0501, "num_input_tokens_seen": 9518352, "step": 10645 }, { "epoch": 2.8108750164972944, "grad_norm": 0.10981207340955734, "learning_rate": 4.1754027566584276e-05, "loss": 0.0555, "num_input_tokens_seen": 9522896, "step": 10650 }, { "epoch": 2.8121948000527914, "grad_norm": 0.40158671140670776, "learning_rate": 4.174673958965852e-05, "loss": 0.0779, "num_input_tokens_seen": 9527440, "step": 10655 }, { "epoch": 2.8135145836082884, "grad_norm": 0.1160324439406395, "learning_rate": 4.173944903017507e-05, "loss": 0.0202, "num_input_tokens_seen": 9531888, "step": 10660 }, { "epoch": 2.814834367163785, "grad_norm": 0.09010399878025055, "learning_rate": 4.173215588925822e-05, "loss": 0.071, "num_input_tokens_seen": 9536496, "step": 10665 }, { "epoch": 2.816154150719282, "grad_norm": 0.06260544061660767, "learning_rate": 4.172486016803266e-05, "loss": 0.0894, "num_input_tokens_seen": 9540880, "step": 10670 }, { "epoch": 2.817473934274779, "grad_norm": 0.17468859255313873, "learning_rate": 4.171756186762349e-05, "loss": 0.0524, "num_input_tokens_seen": 9545520, "step": 10675 }, { "epoch": 2.818793717830276, "grad_norm": 0.1733817756175995, "learning_rate": 4.171026098915619e-05, "loss": 0.0201, "num_input_tokens_seen": 9549872, "step": 10680 }, { "epoch": 2.820113501385773, "grad_norm": 0.10742655396461487, "learning_rate": 4.170295753375665e-05, "loss": 0.0813, "num_input_tokens_seen": 9554096, "step": 10685 }, { "epoch": 2.8214332849412695, "grad_norm": 0.14247892796993256, "learning_rate": 4.169565150255117e-05, "loss": 0.0613, "num_input_tokens_seen": 9558448, "step": 10690 }, { "epoch": 2.8227530684967665, "grad_norm": 0.04745020344853401, "learning_rate": 4.16883428966664e-05, "loss": 0.0588, "num_input_tokens_seen": 9563216, "step": 10695 }, { "epoch": 2.8240728520522635, "grad_norm": 0.26827290654182434, "learning_rate": 4.168103171722944e-05, "loss": 0.0912, "num_input_tokens_seen": 9567696, "step": 10700 }, { "epoch": 2.82539263560776, "grad_norm": 0.3079434037208557, "learning_rate": 4.167371796536777e-05, "loss": 0.0413, "num_input_tokens_seen": 9572016, "step": 10705 }, { "epoch": 2.826712419163257, "grad_norm": 0.40891391038894653, "learning_rate": 4.166640164220924e-05, "loss": 0.0727, "num_input_tokens_seen": 9576304, "step": 10710 }, { "epoch": 2.828032202718754, "grad_norm": 0.3849615752696991, "learning_rate": 4.1659082748882144e-05, "loss": 0.0932, "num_input_tokens_seen": 9581136, "step": 10715 }, { "epoch": 2.829351986274251, "grad_norm": 0.36571359634399414, "learning_rate": 4.1651761286515135e-05, "loss": 0.0721, "num_input_tokens_seen": 9585744, "step": 10720 }, { "epoch": 2.830671769829748, "grad_norm": 0.25078001618385315, "learning_rate": 4.164443725623728e-05, "loss": 0.0212, "num_input_tokens_seen": 9590064, "step": 10725 }, { "epoch": 2.8319915533852447, "grad_norm": 0.09294269233942032, "learning_rate": 4.163711065917802e-05, "loss": 0.0708, "num_input_tokens_seen": 9594640, "step": 10730 }, { "epoch": 2.8333113369407417, "grad_norm": 0.06363637000322342, "learning_rate": 4.1629781496467234e-05, "loss": 0.0342, "num_input_tokens_seen": 9599280, "step": 10735 }, { "epoch": 2.8346311204962387, "grad_norm": 0.08495685458183289, "learning_rate": 4.1622449769235164e-05, "loss": 0.052, "num_input_tokens_seen": 9603952, "step": 10740 }, { "epoch": 2.8359509040517357, "grad_norm": 0.09065207093954086, "learning_rate": 4.161511547861243e-05, "loss": 0.034, "num_input_tokens_seen": 9608752, "step": 10745 }, { "epoch": 2.8372706876072327, "grad_norm": 0.16980881989002228, "learning_rate": 4.1607778625730104e-05, "loss": 0.0597, "num_input_tokens_seen": 9613520, "step": 10750 }, { "epoch": 2.8385904711627292, "grad_norm": 0.28378695249557495, "learning_rate": 4.160043921171961e-05, "loss": 0.0681, "num_input_tokens_seen": 9617872, "step": 10755 }, { "epoch": 2.8399102547182262, "grad_norm": 0.45464298129081726, "learning_rate": 4.159309723771276e-05, "loss": 0.0868, "num_input_tokens_seen": 9622224, "step": 10760 }, { "epoch": 2.8412300382737232, "grad_norm": 0.34361666440963745, "learning_rate": 4.158575270484181e-05, "loss": 0.0405, "num_input_tokens_seen": 9626480, "step": 10765 }, { "epoch": 2.84254982182922, "grad_norm": 0.17745482921600342, "learning_rate": 4.157840561423936e-05, "loss": 0.0824, "num_input_tokens_seen": 9631312, "step": 10770 }, { "epoch": 2.843869605384717, "grad_norm": 0.2776324450969696, "learning_rate": 4.1571055967038416e-05, "loss": 0.0351, "num_input_tokens_seen": 9636016, "step": 10775 }, { "epoch": 2.845189388940214, "grad_norm": 0.19971998035907745, "learning_rate": 4.156370376437241e-05, "loss": 0.0433, "num_input_tokens_seen": 9640368, "step": 10780 }, { "epoch": 2.846509172495711, "grad_norm": 0.23342804610729218, "learning_rate": 4.155634900737513e-05, "loss": 0.0463, "num_input_tokens_seen": 9644912, "step": 10785 }, { "epoch": 2.847828956051208, "grad_norm": 0.08116573840379715, "learning_rate": 4.1548991697180764e-05, "loss": 0.0339, "num_input_tokens_seen": 9649264, "step": 10790 }, { "epoch": 2.8491487396067043, "grad_norm": 0.05584760755300522, "learning_rate": 4.1541631834923914e-05, "loss": 0.0423, "num_input_tokens_seen": 9653712, "step": 10795 }, { "epoch": 2.8504685231622013, "grad_norm": 0.18666653335094452, "learning_rate": 4.153426942173956e-05, "loss": 0.0602, "num_input_tokens_seen": 9658192, "step": 10800 }, { "epoch": 2.8504685231622013, "eval_loss": 0.07364362478256226, "eval_runtime": 64.767, "eval_samples_per_second": 103.988, "eval_steps_per_second": 26.001, "num_input_tokens_seen": 9658192, "step": 10800 }, { "epoch": 2.8517883067176983, "grad_norm": 0.28547418117523193, "learning_rate": 4.152690445876308e-05, "loss": 0.1068, "num_input_tokens_seen": 9662672, "step": 10805 }, { "epoch": 2.8531080902731953, "grad_norm": 0.45269665122032166, "learning_rate": 4.1519536947130245e-05, "loss": 0.0514, "num_input_tokens_seen": 9667312, "step": 10810 }, { "epoch": 2.8544278738286923, "grad_norm": 0.1670391857624054, "learning_rate": 4.151216688797722e-05, "loss": 0.0908, "num_input_tokens_seen": 9671920, "step": 10815 }, { "epoch": 2.855747657384189, "grad_norm": 0.14056283235549927, "learning_rate": 4.150479428244054e-05, "loss": 0.043, "num_input_tokens_seen": 9676368, "step": 10820 }, { "epoch": 2.857067440939686, "grad_norm": 0.5301304459571838, "learning_rate": 4.1497419131657176e-05, "loss": 0.0875, "num_input_tokens_seen": 9680944, "step": 10825 }, { "epoch": 2.858387224495183, "grad_norm": 0.606558084487915, "learning_rate": 4.149004143676447e-05, "loss": 0.0968, "num_input_tokens_seen": 9685296, "step": 10830 }, { "epoch": 2.8597070080506795, "grad_norm": 0.13543011248111725, "learning_rate": 4.148266119890015e-05, "loss": 0.0751, "num_input_tokens_seen": 9689904, "step": 10835 }, { "epoch": 2.8610267916061765, "grad_norm": 0.10696174949407578, "learning_rate": 4.1475278419202324e-05, "loss": 0.0693, "num_input_tokens_seen": 9694288, "step": 10840 }, { "epoch": 2.8623465751616735, "grad_norm": 0.3865920901298523, "learning_rate": 4.146789309880953e-05, "loss": 0.0831, "num_input_tokens_seen": 9699152, "step": 10845 }, { "epoch": 2.8636663587171705, "grad_norm": 0.21165058016777039, "learning_rate": 4.146050523886068e-05, "loss": 0.0335, "num_input_tokens_seen": 9703440, "step": 10850 }, { "epoch": 2.8649861422726675, "grad_norm": 0.4811539947986603, "learning_rate": 4.1453114840495055e-05, "loss": 0.0976, "num_input_tokens_seen": 9707984, "step": 10855 }, { "epoch": 2.866305925828164, "grad_norm": 0.08387497812509537, "learning_rate": 4.1445721904852364e-05, "loss": 0.0507, "num_input_tokens_seen": 9712496, "step": 10860 }, { "epoch": 2.867625709383661, "grad_norm": 0.22065885365009308, "learning_rate": 4.143832643307269e-05, "loss": 0.0785, "num_input_tokens_seen": 9716912, "step": 10865 }, { "epoch": 2.868945492939158, "grad_norm": 0.10948526114225388, "learning_rate": 4.1430928426296503e-05, "loss": 0.0156, "num_input_tokens_seen": 9721520, "step": 10870 }, { "epoch": 2.870265276494655, "grad_norm": 0.1257033497095108, "learning_rate": 4.142352788566466e-05, "loss": 0.1046, "num_input_tokens_seen": 9725904, "step": 10875 }, { "epoch": 2.871585060050152, "grad_norm": 0.4825247824192047, "learning_rate": 4.1416124812318424e-05, "loss": 0.1068, "num_input_tokens_seen": 9730480, "step": 10880 }, { "epoch": 2.8729048436056486, "grad_norm": 0.2748614549636841, "learning_rate": 4.1408719207399453e-05, "loss": 0.1398, "num_input_tokens_seen": 9735088, "step": 10885 }, { "epoch": 2.8742246271611456, "grad_norm": 0.155390664935112, "learning_rate": 4.140131107204978e-05, "loss": 0.0685, "num_input_tokens_seen": 9739536, "step": 10890 }, { "epoch": 2.8755444107166426, "grad_norm": 0.5951839685440063, "learning_rate": 4.139390040741182e-05, "loss": 0.0835, "num_input_tokens_seen": 9743792, "step": 10895 }, { "epoch": 2.876864194272139, "grad_norm": 0.06289207190275192, "learning_rate": 4.1386487214628396e-05, "loss": 0.0686, "num_input_tokens_seen": 9748304, "step": 10900 }, { "epoch": 2.878183977827636, "grad_norm": 0.17405596375465393, "learning_rate": 4.137907149484272e-05, "loss": 0.0458, "num_input_tokens_seen": 9752528, "step": 10905 }, { "epoch": 2.879503761383133, "grad_norm": 0.043563876301050186, "learning_rate": 4.137165324919839e-05, "loss": 0.0738, "num_input_tokens_seen": 9757232, "step": 10910 }, { "epoch": 2.88082354493863, "grad_norm": 0.4010505676269531, "learning_rate": 4.136423247883939e-05, "loss": 0.0694, "num_input_tokens_seen": 9761680, "step": 10915 }, { "epoch": 2.882143328494127, "grad_norm": 0.1429624706506729, "learning_rate": 4.135680918491009e-05, "loss": 0.0478, "num_input_tokens_seen": 9766064, "step": 10920 }, { "epoch": 2.8834631120496237, "grad_norm": 0.26946377754211426, "learning_rate": 4.1349383368555265e-05, "loss": 0.0356, "num_input_tokens_seen": 9770768, "step": 10925 }, { "epoch": 2.8847828956051207, "grad_norm": 0.1430450975894928, "learning_rate": 4.1341955030920065e-05, "loss": 0.0524, "num_input_tokens_seen": 9775216, "step": 10930 }, { "epoch": 2.8861026791606177, "grad_norm": 0.15326353907585144, "learning_rate": 4.1334524173150036e-05, "loss": 0.0308, "num_input_tokens_seen": 9779504, "step": 10935 }, { "epoch": 2.8874224627161147, "grad_norm": 0.2492009699344635, "learning_rate": 4.13270907963911e-05, "loss": 0.0637, "num_input_tokens_seen": 9783696, "step": 10940 }, { "epoch": 2.8887422462716117, "grad_norm": 0.42271658778190613, "learning_rate": 4.131965490178959e-05, "loss": 0.0702, "num_input_tokens_seen": 9788368, "step": 10945 }, { "epoch": 2.8900620298271082, "grad_norm": 0.08454377204179764, "learning_rate": 4.131221649049222e-05, "loss": 0.0352, "num_input_tokens_seen": 9792848, "step": 10950 }, { "epoch": 2.8913818133826052, "grad_norm": 0.36784616112709045, "learning_rate": 4.130477556364606e-05, "loss": 0.0398, "num_input_tokens_seen": 9797136, "step": 10955 }, { "epoch": 2.8927015969381022, "grad_norm": 0.19283534586429596, "learning_rate": 4.129733212239861e-05, "loss": 0.0913, "num_input_tokens_seen": 9801648, "step": 10960 }, { "epoch": 2.894021380493599, "grad_norm": 0.42087846994400024, "learning_rate": 4.128988616789774e-05, "loss": 0.087, "num_input_tokens_seen": 9806224, "step": 10965 }, { "epoch": 2.895341164049096, "grad_norm": 0.29365837574005127, "learning_rate": 4.1282437701291724e-05, "loss": 0.0849, "num_input_tokens_seen": 9810768, "step": 10970 }, { "epoch": 2.896660947604593, "grad_norm": 0.1813187450170517, "learning_rate": 4.1274986723729184e-05, "loss": 0.1038, "num_input_tokens_seen": 9815568, "step": 10975 }, { "epoch": 2.89798073116009, "grad_norm": 0.12784017622470856, "learning_rate": 4.126753323635917e-05, "loss": 0.1166, "num_input_tokens_seen": 9820112, "step": 10980 }, { "epoch": 2.899300514715587, "grad_norm": 0.4426461160182953, "learning_rate": 4.12600772403311e-05, "loss": 0.0919, "num_input_tokens_seen": 9824272, "step": 10985 }, { "epoch": 2.9006202982710834, "grad_norm": 0.03414088487625122, "learning_rate": 4.125261873679479e-05, "loss": 0.1085, "num_input_tokens_seen": 9828368, "step": 10990 }, { "epoch": 2.9019400818265804, "grad_norm": 0.36862286925315857, "learning_rate": 4.124515772690042e-05, "loss": 0.1163, "num_input_tokens_seen": 9833008, "step": 10995 }, { "epoch": 2.9032598653820774, "grad_norm": 0.31738120317459106, "learning_rate": 4.123769421179858e-05, "loss": 0.0609, "num_input_tokens_seen": 9837392, "step": 11000 }, { "epoch": 2.9032598653820774, "eval_loss": 0.0721987709403038, "eval_runtime": 64.7228, "eval_samples_per_second": 104.059, "eval_steps_per_second": 26.019, "num_input_tokens_seen": 9837392, "step": 11000 }, { "epoch": 2.9045796489375744, "grad_norm": 0.08383646607398987, "learning_rate": 4.1230228192640236e-05, "loss": 0.0458, "num_input_tokens_seen": 9841744, "step": 11005 }, { "epoch": 2.9058994324930714, "grad_norm": 0.1444535255432129, "learning_rate": 4.122275967057675e-05, "loss": 0.0292, "num_input_tokens_seen": 9846224, "step": 11010 }, { "epoch": 2.907219216048568, "grad_norm": 0.3993277847766876, "learning_rate": 4.1215288646759846e-05, "loss": 0.0906, "num_input_tokens_seen": 9850416, "step": 11015 }, { "epoch": 2.908538999604065, "grad_norm": 0.11769595742225647, "learning_rate": 4.120781512234166e-05, "loss": 0.0471, "num_input_tokens_seen": 9854800, "step": 11020 }, { "epoch": 2.909858783159562, "grad_norm": 0.2832094132900238, "learning_rate": 4.120033909847471e-05, "loss": 0.1087, "num_input_tokens_seen": 9859440, "step": 11025 }, { "epoch": 2.9111785667150585, "grad_norm": 0.34016942977905273, "learning_rate": 4.119286057631187e-05, "loss": 0.0594, "num_input_tokens_seen": 9863792, "step": 11030 }, { "epoch": 2.9124983502705555, "grad_norm": 0.09394952654838562, "learning_rate": 4.118537955700646e-05, "loss": 0.0913, "num_input_tokens_seen": 9868272, "step": 11035 }, { "epoch": 2.9138181338260525, "grad_norm": 0.12728965282440186, "learning_rate": 4.11778960417121e-05, "loss": 0.0513, "num_input_tokens_seen": 9872624, "step": 11040 }, { "epoch": 2.9151379173815495, "grad_norm": 0.3959643840789795, "learning_rate": 4.117041003158288e-05, "loss": 0.0883, "num_input_tokens_seen": 9877232, "step": 11045 }, { "epoch": 2.9164577009370465, "grad_norm": 0.19259177148342133, "learning_rate": 4.1162921527773215e-05, "loss": 0.0765, "num_input_tokens_seen": 9881616, "step": 11050 }, { "epoch": 2.917777484492543, "grad_norm": 0.11183683574199677, "learning_rate": 4.115543053143794e-05, "loss": 0.0646, "num_input_tokens_seen": 9886096, "step": 11055 }, { "epoch": 2.91909726804804, "grad_norm": 0.5395477414131165, "learning_rate": 4.114793704373226e-05, "loss": 0.1072, "num_input_tokens_seen": 9890512, "step": 11060 }, { "epoch": 2.920417051603537, "grad_norm": 0.320205420255661, "learning_rate": 4.114044106581175e-05, "loss": 0.0388, "num_input_tokens_seen": 9895088, "step": 11065 }, { "epoch": 2.921736835159034, "grad_norm": 0.10865378379821777, "learning_rate": 4.11329425988324e-05, "loss": 0.0845, "num_input_tokens_seen": 9899472, "step": 11070 }, { "epoch": 2.923056618714531, "grad_norm": 0.09640852361917496, "learning_rate": 4.112544164395056e-05, "loss": 0.03, "num_input_tokens_seen": 9904240, "step": 11075 }, { "epoch": 2.9243764022700276, "grad_norm": 0.49801191687583923, "learning_rate": 4.111793820232297e-05, "loss": 0.0555, "num_input_tokens_seen": 9908720, "step": 11080 }, { "epoch": 2.9256961858255246, "grad_norm": 0.2555314600467682, "learning_rate": 4.1110432275106767e-05, "loss": 0.0918, "num_input_tokens_seen": 9913168, "step": 11085 }, { "epoch": 2.9270159693810216, "grad_norm": 0.07466030865907669, "learning_rate": 4.110292386345944e-05, "loss": 0.0509, "num_input_tokens_seen": 9917712, "step": 11090 }, { "epoch": 2.928335752936518, "grad_norm": 0.4928416609764099, "learning_rate": 4.109541296853891e-05, "loss": 0.0898, "num_input_tokens_seen": 9921872, "step": 11095 }, { "epoch": 2.929655536492015, "grad_norm": 0.07516459375619888, "learning_rate": 4.108789959150341e-05, "loss": 0.0537, "num_input_tokens_seen": 9926224, "step": 11100 }, { "epoch": 2.930975320047512, "grad_norm": 0.04099766165018082, "learning_rate": 4.108038373351163e-05, "loss": 0.0509, "num_input_tokens_seen": 9930864, "step": 11105 }, { "epoch": 2.932295103603009, "grad_norm": 0.11317934095859528, "learning_rate": 4.10728653957226e-05, "loss": 0.0345, "num_input_tokens_seen": 9935536, "step": 11110 }, { "epoch": 2.933614887158506, "grad_norm": 0.1413199007511139, "learning_rate": 4.106534457929575e-05, "loss": 0.0369, "num_input_tokens_seen": 9939824, "step": 11115 }, { "epoch": 2.9349346707140027, "grad_norm": 0.4590227007865906, "learning_rate": 4.105782128539086e-05, "loss": 0.0402, "num_input_tokens_seen": 9943792, "step": 11120 }, { "epoch": 2.9362544542694997, "grad_norm": 0.433015912771225, "learning_rate": 4.1050295515168144e-05, "loss": 0.099, "num_input_tokens_seen": 9948240, "step": 11125 }, { "epoch": 2.9375742378249967, "grad_norm": 0.20044876635074615, "learning_rate": 4.1042767269788155e-05, "loss": 0.0439, "num_input_tokens_seen": 9952656, "step": 11130 }, { "epoch": 2.9388940213804937, "grad_norm": 0.13858526945114136, "learning_rate": 4.103523655041185e-05, "loss": 0.072, "num_input_tokens_seen": 9956880, "step": 11135 }, { "epoch": 2.9402138049359907, "grad_norm": 0.2685445547103882, "learning_rate": 4.102770335820055e-05, "loss": 0.0779, "num_input_tokens_seen": 9961296, "step": 11140 }, { "epoch": 2.9415335884914873, "grad_norm": 0.21399009227752686, "learning_rate": 4.1020167694315984e-05, "loss": 0.0782, "num_input_tokens_seen": 9965520, "step": 11145 }, { "epoch": 2.9428533720469843, "grad_norm": 0.24000246822834015, "learning_rate": 4.101262955992023e-05, "loss": 0.1536, "num_input_tokens_seen": 9970128, "step": 11150 }, { "epoch": 2.9441731556024813, "grad_norm": 0.26567238569259644, "learning_rate": 4.100508895617578e-05, "loss": 0.0705, "num_input_tokens_seen": 9974608, "step": 11155 }, { "epoch": 2.945492939157978, "grad_norm": 0.06241559982299805, "learning_rate": 4.099754588424547e-05, "loss": 0.0336, "num_input_tokens_seen": 9979056, "step": 11160 }, { "epoch": 2.946812722713475, "grad_norm": 0.03150666877627373, "learning_rate": 4.0990000345292546e-05, "loss": 0.0802, "num_input_tokens_seen": 9983568, "step": 11165 }, { "epoch": 2.948132506268972, "grad_norm": 0.2828163802623749, "learning_rate": 4.098245234048064e-05, "loss": 0.0454, "num_input_tokens_seen": 9987792, "step": 11170 }, { "epoch": 2.949452289824469, "grad_norm": 0.295186311006546, "learning_rate": 4.0974901870973726e-05, "loss": 0.0562, "num_input_tokens_seen": 9992272, "step": 11175 }, { "epoch": 2.950772073379966, "grad_norm": 0.17542459070682526, "learning_rate": 4.096734893793619e-05, "loss": 0.0588, "num_input_tokens_seen": 9996528, "step": 11180 }, { "epoch": 2.9520918569354624, "grad_norm": 0.3067108690738678, "learning_rate": 4.095979354253279e-05, "loss": 0.1364, "num_input_tokens_seen": 10001104, "step": 11185 }, { "epoch": 2.9534116404909594, "grad_norm": 0.08225727826356888, "learning_rate": 4.0952235685928656e-05, "loss": 0.0483, "num_input_tokens_seen": 10005520, "step": 11190 }, { "epoch": 2.9547314240464564, "grad_norm": 0.3279702961444855, "learning_rate": 4.094467536928932e-05, "loss": 0.0756, "num_input_tokens_seen": 10009936, "step": 11195 }, { "epoch": 2.9560512076019534, "grad_norm": 0.22426649928092957, "learning_rate": 4.093711259378067e-05, "loss": 0.0602, "num_input_tokens_seen": 10014320, "step": 11200 }, { "epoch": 2.9560512076019534, "eval_loss": 0.07229285687208176, "eval_runtime": 64.829, "eval_samples_per_second": 103.889, "eval_steps_per_second": 25.976, "num_input_tokens_seen": 10014320, "step": 11200 }, { "epoch": 2.9573709911574504, "grad_norm": 0.23876067996025085, "learning_rate": 4.092954736056897e-05, "loss": 0.0655, "num_input_tokens_seen": 10018640, "step": 11205 }, { "epoch": 2.958690774712947, "grad_norm": 0.04636706784367561, "learning_rate": 4.09219796708209e-05, "loss": 0.0398, "num_input_tokens_seen": 10023248, "step": 11210 }, { "epoch": 2.960010558268444, "grad_norm": 0.1479640007019043, "learning_rate": 4.0914409525703464e-05, "loss": 0.0785, "num_input_tokens_seen": 10027408, "step": 11215 }, { "epoch": 2.961330341823941, "grad_norm": 0.2845725119113922, "learning_rate": 4.090683692638408e-05, "loss": 0.0892, "num_input_tokens_seen": 10031632, "step": 11220 }, { "epoch": 2.9626501253794375, "grad_norm": 0.19546422362327576, "learning_rate": 4.089926187403056e-05, "loss": 0.0711, "num_input_tokens_seen": 10036464, "step": 11225 }, { "epoch": 2.9639699089349345, "grad_norm": 0.2750297486782074, "learning_rate": 4.0891684369811044e-05, "loss": 0.0512, "num_input_tokens_seen": 10041040, "step": 11230 }, { "epoch": 2.9652896924904315, "grad_norm": 0.1769384741783142, "learning_rate": 4.0884104414894107e-05, "loss": 0.0914, "num_input_tokens_seen": 10045392, "step": 11235 }, { "epoch": 2.9666094760459285, "grad_norm": 0.41263535618782043, "learning_rate": 4.087652201044864e-05, "loss": 0.1031, "num_input_tokens_seen": 10049744, "step": 11240 }, { "epoch": 2.9679292596014255, "grad_norm": 0.26773661375045776, "learning_rate": 4.086893715764397e-05, "loss": 0.0698, "num_input_tokens_seen": 10054000, "step": 11245 }, { "epoch": 2.969249043156922, "grad_norm": 0.3321188986301422, "learning_rate": 4.086134985764977e-05, "loss": 0.0639, "num_input_tokens_seen": 10058448, "step": 11250 }, { "epoch": 2.970568826712419, "grad_norm": 0.3590265214443207, "learning_rate": 4.0853760111636085e-05, "loss": 0.1117, "num_input_tokens_seen": 10062960, "step": 11255 }, { "epoch": 2.971888610267916, "grad_norm": 0.28318166732788086, "learning_rate": 4.084616792077337e-05, "loss": 0.0465, "num_input_tokens_seen": 10067376, "step": 11260 }, { "epoch": 2.973208393823413, "grad_norm": 0.13597536087036133, "learning_rate": 4.083857328623243e-05, "loss": 0.0365, "num_input_tokens_seen": 10071888, "step": 11265 }, { "epoch": 2.97452817737891, "grad_norm": 0.056540802121162415, "learning_rate": 4.083097620918444e-05, "loss": 0.0443, "num_input_tokens_seen": 10076368, "step": 11270 }, { "epoch": 2.9758479609344066, "grad_norm": 0.15361592173576355, "learning_rate": 4.082337669080097e-05, "loss": 0.0455, "num_input_tokens_seen": 10080656, "step": 11275 }, { "epoch": 2.9771677444899036, "grad_norm": 0.112300805747509, "learning_rate": 4.081577473225398e-05, "loss": 0.0324, "num_input_tokens_seen": 10085360, "step": 11280 }, { "epoch": 2.9784875280454006, "grad_norm": 0.443852037191391, "learning_rate": 4.080817033471577e-05, "loss": 0.0795, "num_input_tokens_seen": 10089424, "step": 11285 }, { "epoch": 2.979807311600897, "grad_norm": 0.035306625068187714, "learning_rate": 4.080056349935903e-05, "loss": 0.046, "num_input_tokens_seen": 10093808, "step": 11290 }, { "epoch": 2.9811270951563946, "grad_norm": 0.06907152384519577, "learning_rate": 4.079295422735684e-05, "loss": 0.0231, "num_input_tokens_seen": 10098320, "step": 11295 }, { "epoch": 2.982446878711891, "grad_norm": 0.027408909052610397, "learning_rate": 4.078534251988264e-05, "loss": 0.04, "num_input_tokens_seen": 10102672, "step": 11300 }, { "epoch": 2.983766662267388, "grad_norm": 0.15732643008232117, "learning_rate": 4.077772837811025e-05, "loss": 0.1244, "num_input_tokens_seen": 10107184, "step": 11305 }, { "epoch": 2.985086445822885, "grad_norm": 0.2766321301460266, "learning_rate": 4.0770111803213874e-05, "loss": 0.1018, "num_input_tokens_seen": 10111696, "step": 11310 }, { "epoch": 2.9864062293783817, "grad_norm": 0.05397824943065643, "learning_rate": 4.076249279636807e-05, "loss": 0.0931, "num_input_tokens_seen": 10116048, "step": 11315 }, { "epoch": 2.9877260129338787, "grad_norm": 0.1666417419910431, "learning_rate": 4.075487135874781e-05, "loss": 0.0945, "num_input_tokens_seen": 10120496, "step": 11320 }, { "epoch": 2.9890457964893757, "grad_norm": 0.15698044002056122, "learning_rate": 4.074724749152837e-05, "loss": 0.0355, "num_input_tokens_seen": 10124720, "step": 11325 }, { "epoch": 2.9903655800448727, "grad_norm": 0.2472420632839203, "learning_rate": 4.07396211958855e-05, "loss": 0.1004, "num_input_tokens_seen": 10128816, "step": 11330 }, { "epoch": 2.9916853636003697, "grad_norm": 0.38140153884887695, "learning_rate": 4.073199247299523e-05, "loss": 0.0495, "num_input_tokens_seen": 10133168, "step": 11335 }, { "epoch": 2.9930051471558663, "grad_norm": 0.27673792839050293, "learning_rate": 4.072436132403403e-05, "loss": 0.0679, "num_input_tokens_seen": 10137680, "step": 11340 }, { "epoch": 2.9943249307113633, "grad_norm": 0.22438573837280273, "learning_rate": 4.0716727750178704e-05, "loss": 0.0674, "num_input_tokens_seen": 10142480, "step": 11345 }, { "epoch": 2.9956447142668603, "grad_norm": 0.043681684881448746, "learning_rate": 4.0709091752606455e-05, "loss": 0.1434, "num_input_tokens_seen": 10147120, "step": 11350 }, { "epoch": 2.9969644978223573, "grad_norm": 0.37502187490463257, "learning_rate": 4.070145333249484e-05, "loss": 0.0914, "num_input_tokens_seen": 10151792, "step": 11355 }, { "epoch": 2.9982842813778543, "grad_norm": 0.1907288134098053, "learning_rate": 4.069381249102181e-05, "loss": 0.0263, "num_input_tokens_seen": 10156208, "step": 11360 }, { "epoch": 2.999604064933351, "grad_norm": 0.08892754465341568, "learning_rate": 4.0686169229365665e-05, "loss": 0.0428, "num_input_tokens_seen": 10160688, "step": 11365 }, { "epoch": 3.0007918701332983, "grad_norm": 0.2761468291282654, "learning_rate": 4.067852354870511e-05, "loss": 0.0395, "num_input_tokens_seen": 10164384, "step": 11370 }, { "epoch": 3.002111653688795, "grad_norm": 0.320089727640152, "learning_rate": 4.067087545021919e-05, "loss": 0.0616, "num_input_tokens_seen": 10169312, "step": 11375 }, { "epoch": 3.003431437244292, "grad_norm": 0.035226814448833466, "learning_rate": 4.066322493508734e-05, "loss": 0.0321, "num_input_tokens_seen": 10173664, "step": 11380 }, { "epoch": 3.004751220799789, "grad_norm": 0.22582793235778809, "learning_rate": 4.065557200448937e-05, "loss": 0.076, "num_input_tokens_seen": 10178432, "step": 11385 }, { "epoch": 3.006071004355286, "grad_norm": 0.31661298871040344, "learning_rate": 4.064791665960546e-05, "loss": 0.0813, "num_input_tokens_seen": 10182752, "step": 11390 }, { "epoch": 3.007390787910783, "grad_norm": 0.38629651069641113, "learning_rate": 4.064025890161615e-05, "loss": 0.0811, "num_input_tokens_seen": 10187488, "step": 11395 }, { "epoch": 3.0087105714662794, "grad_norm": 0.09677337855100632, "learning_rate": 4.0632598731702373e-05, "loss": 0.078, "num_input_tokens_seen": 10191904, "step": 11400 }, { "epoch": 3.0087105714662794, "eval_loss": 0.07147658616304398, "eval_runtime": 64.7462, "eval_samples_per_second": 104.022, "eval_steps_per_second": 26.009, "num_input_tokens_seen": 10191904, "step": 11400 }, { "epoch": 3.0100303550217764, "grad_norm": 0.11812586337327957, "learning_rate": 4.0624936151045426e-05, "loss": 0.0525, "num_input_tokens_seen": 10196096, "step": 11405 }, { "epoch": 3.0113501385772734, "grad_norm": 0.06322041898965836, "learning_rate": 4.061727116082696e-05, "loss": 0.0558, "num_input_tokens_seen": 10200384, "step": 11410 }, { "epoch": 3.0126699221327704, "grad_norm": 0.09015189111232758, "learning_rate": 4.060960376222903e-05, "loss": 0.1108, "num_input_tokens_seen": 10204736, "step": 11415 }, { "epoch": 3.013989705688267, "grad_norm": 0.05702092498540878, "learning_rate": 4.0601933956434034e-05, "loss": 0.0484, "num_input_tokens_seen": 10209120, "step": 11420 }, { "epoch": 3.015309489243764, "grad_norm": 0.20381246507167816, "learning_rate": 4.059426174462476e-05, "loss": 0.0663, "num_input_tokens_seen": 10213856, "step": 11425 }, { "epoch": 3.016629272799261, "grad_norm": 0.10250519961118698, "learning_rate": 4.058658712798435e-05, "loss": 0.0575, "num_input_tokens_seen": 10217984, "step": 11430 }, { "epoch": 3.017949056354758, "grad_norm": 0.2268906682729721, "learning_rate": 4.0578910107696336e-05, "loss": 0.0564, "num_input_tokens_seen": 10222592, "step": 11435 }, { "epoch": 3.0192688399102545, "grad_norm": 0.08170004934072495, "learning_rate": 4.05712306849446e-05, "loss": 0.0618, "num_input_tokens_seen": 10227072, "step": 11440 }, { "epoch": 3.0205886234657515, "grad_norm": 0.3212871253490448, "learning_rate": 4.0563548860913415e-05, "loss": 0.0375, "num_input_tokens_seen": 10231424, "step": 11445 }, { "epoch": 3.0219084070212485, "grad_norm": 0.47481027245521545, "learning_rate": 4.0555864636787414e-05, "loss": 0.078, "num_input_tokens_seen": 10235936, "step": 11450 }, { "epoch": 3.0232281905767455, "grad_norm": 0.5487993359565735, "learning_rate": 4.054817801375159e-05, "loss": 0.0906, "num_input_tokens_seen": 10240352, "step": 11455 }, { "epoch": 3.0245479741322425, "grad_norm": 0.028782833367586136, "learning_rate": 4.054048899299134e-05, "loss": 0.0235, "num_input_tokens_seen": 10244768, "step": 11460 }, { "epoch": 3.025867757687739, "grad_norm": 0.238408625125885, "learning_rate": 4.0532797575692385e-05, "loss": 0.0552, "num_input_tokens_seen": 10249280, "step": 11465 }, { "epoch": 3.027187541243236, "grad_norm": 0.08297261595726013, "learning_rate": 4.052510376304085e-05, "loss": 0.0441, "num_input_tokens_seen": 10253632, "step": 11470 }, { "epoch": 3.028507324798733, "grad_norm": 0.21018020808696747, "learning_rate": 4.051740755622321e-05, "loss": 0.0658, "num_input_tokens_seen": 10257856, "step": 11475 }, { "epoch": 3.02982710835423, "grad_norm": 0.11486421525478363, "learning_rate": 4.050970895642632e-05, "loss": 0.0597, "num_input_tokens_seen": 10262624, "step": 11480 }, { "epoch": 3.0311468919097266, "grad_norm": 0.0838986337184906, "learning_rate": 4.050200796483741e-05, "loss": 0.0478, "num_input_tokens_seen": 10267072, "step": 11485 }, { "epoch": 3.0324666754652236, "grad_norm": 0.14056506752967834, "learning_rate": 4.049430458264405e-05, "loss": 0.0713, "num_input_tokens_seen": 10271744, "step": 11490 }, { "epoch": 3.0337864590207206, "grad_norm": 0.04700998216867447, "learning_rate": 4.048659881103422e-05, "loss": 0.0331, "num_input_tokens_seen": 10275936, "step": 11495 }, { "epoch": 3.0351062425762176, "grad_norm": 0.2795446217060089, "learning_rate": 4.0478890651196235e-05, "loss": 0.067, "num_input_tokens_seen": 10280384, "step": 11500 }, { "epoch": 3.036426026131714, "grad_norm": 0.05636051297187805, "learning_rate": 4.047118010431879e-05, "loss": 0.0822, "num_input_tokens_seen": 10284736, "step": 11505 }, { "epoch": 3.037745809687211, "grad_norm": 0.2649874985218048, "learning_rate": 4.046346717159094e-05, "loss": 0.0828, "num_input_tokens_seen": 10289184, "step": 11510 }, { "epoch": 3.039065593242708, "grad_norm": 0.08017101883888245, "learning_rate": 4.045575185420214e-05, "loss": 0.0371, "num_input_tokens_seen": 10293728, "step": 11515 }, { "epoch": 3.040385376798205, "grad_norm": 0.197988823056221, "learning_rate": 4.0448034153342165e-05, "loss": 0.0355, "num_input_tokens_seen": 10298272, "step": 11520 }, { "epoch": 3.041705160353702, "grad_norm": 0.2966569662094116, "learning_rate": 4.0440314070201194e-05, "loss": 0.0786, "num_input_tokens_seen": 10302688, "step": 11525 }, { "epoch": 3.0430249439091988, "grad_norm": 0.1451355367898941, "learning_rate": 4.043259160596976e-05, "loss": 0.0593, "num_input_tokens_seen": 10307008, "step": 11530 }, { "epoch": 3.0443447274646958, "grad_norm": 0.43196558952331543, "learning_rate": 4.0424866761838767e-05, "loss": 0.0622, "num_input_tokens_seen": 10311712, "step": 11535 }, { "epoch": 3.0456645110201928, "grad_norm": 0.06707499176263809, "learning_rate": 4.041713953899948e-05, "loss": 0.0428, "num_input_tokens_seen": 10316288, "step": 11540 }, { "epoch": 3.0469842945756898, "grad_norm": 0.16918694972991943, "learning_rate": 4.0409409938643515e-05, "loss": 0.0507, "num_input_tokens_seen": 10320832, "step": 11545 }, { "epoch": 3.0483040781311863, "grad_norm": 0.4091033637523651, "learning_rate": 4.0401677961962904e-05, "loss": 0.1196, "num_input_tokens_seen": 10324896, "step": 11550 }, { "epoch": 3.0496238616866833, "grad_norm": 0.1273830533027649, "learning_rate": 4.039394361015001e-05, "loss": 0.0465, "num_input_tokens_seen": 10329440, "step": 11555 }, { "epoch": 3.0509436452421803, "grad_norm": 0.08937449753284454, "learning_rate": 4.038620688439755e-05, "loss": 0.0269, "num_input_tokens_seen": 10334272, "step": 11560 }, { "epoch": 3.0522634287976773, "grad_norm": 0.42565733194351196, "learning_rate": 4.037846778589862e-05, "loss": 0.1062, "num_input_tokens_seen": 10338912, "step": 11565 }, { "epoch": 3.053583212353174, "grad_norm": 0.18027301132678986, "learning_rate": 4.0370726315846715e-05, "loss": 0.0766, "num_input_tokens_seen": 10343296, "step": 11570 }, { "epoch": 3.054902995908671, "grad_norm": 0.140743687748909, "learning_rate": 4.036298247543565e-05, "loss": 0.0552, "num_input_tokens_seen": 10347840, "step": 11575 }, { "epoch": 3.056222779464168, "grad_norm": 0.26292720437049866, "learning_rate": 4.035523626585962e-05, "loss": 0.094, "num_input_tokens_seen": 10352448, "step": 11580 }, { "epoch": 3.057542563019665, "grad_norm": 0.11770360916852951, "learning_rate": 4.0347487688313194e-05, "loss": 0.0578, "num_input_tokens_seen": 10356992, "step": 11585 }, { "epoch": 3.058862346575162, "grad_norm": 0.21183274686336517, "learning_rate": 4.0339736743991296e-05, "loss": 0.0627, "num_input_tokens_seen": 10361152, "step": 11590 }, { "epoch": 3.0601821301306584, "grad_norm": 0.05844496563076973, "learning_rate": 4.0331983434089227e-05, "loss": 0.0477, "num_input_tokens_seen": 10365472, "step": 11595 }, { "epoch": 3.0615019136861554, "grad_norm": 0.09435346722602844, "learning_rate": 4.032422775980264e-05, "loss": 0.0541, "num_input_tokens_seen": 10369824, "step": 11600 }, { "epoch": 3.0615019136861554, "eval_loss": 0.0714099109172821, "eval_runtime": 64.7291, "eval_samples_per_second": 104.049, "eval_steps_per_second": 26.016, "num_input_tokens_seen": 10369824, "step": 11600 }, { "epoch": 3.0628216972416524, "grad_norm": 0.14890049397945404, "learning_rate": 4.031646972232754e-05, "loss": 0.1142, "num_input_tokens_seen": 10374400, "step": 11605 }, { "epoch": 3.0641414807971494, "grad_norm": 0.3251514732837677, "learning_rate": 4.0308709322860344e-05, "loss": 0.0992, "num_input_tokens_seen": 10378976, "step": 11610 }, { "epoch": 3.065461264352646, "grad_norm": 0.1879168599843979, "learning_rate": 4.0300946562597784e-05, "loss": 0.0482, "num_input_tokens_seen": 10383456, "step": 11615 }, { "epoch": 3.066781047908143, "grad_norm": 0.2131122648715973, "learning_rate": 4.029318144273698e-05, "loss": 0.0359, "num_input_tokens_seen": 10388000, "step": 11620 }, { "epoch": 3.06810083146364, "grad_norm": 0.25327858328819275, "learning_rate": 4.0285413964475415e-05, "loss": 0.0878, "num_input_tokens_seen": 10392576, "step": 11625 }, { "epoch": 3.069420615019137, "grad_norm": 0.15716034173965454, "learning_rate": 4.0277644129010927e-05, "loss": 0.1017, "num_input_tokens_seen": 10396992, "step": 11630 }, { "epoch": 3.0707403985746335, "grad_norm": 0.22831644117832184, "learning_rate": 4.0269871937541724e-05, "loss": 0.0685, "num_input_tokens_seen": 10401184, "step": 11635 }, { "epoch": 3.0720601821301305, "grad_norm": 0.30151504278182983, "learning_rate": 4.026209739126637e-05, "loss": 0.1281, "num_input_tokens_seen": 10405280, "step": 11640 }, { "epoch": 3.0733799656856275, "grad_norm": 0.35103875398635864, "learning_rate": 4.025432049138381e-05, "loss": 0.0602, "num_input_tokens_seen": 10410112, "step": 11645 }, { "epoch": 3.0746997492411245, "grad_norm": 0.17283488810062408, "learning_rate": 4.0246541239093325e-05, "loss": 0.058, "num_input_tokens_seen": 10414528, "step": 11650 }, { "epoch": 3.0760195327966215, "grad_norm": 0.14907215535640717, "learning_rate": 4.023875963559459e-05, "loss": 0.0319, "num_input_tokens_seen": 10418976, "step": 11655 }, { "epoch": 3.077339316352118, "grad_norm": 0.09874022752046585, "learning_rate": 4.023097568208761e-05, "loss": 0.0914, "num_input_tokens_seen": 10423648, "step": 11660 }, { "epoch": 3.078659099907615, "grad_norm": 0.10067681968212128, "learning_rate": 4.022318937977277e-05, "loss": 0.0404, "num_input_tokens_seen": 10427968, "step": 11665 }, { "epoch": 3.079978883463112, "grad_norm": 0.057094722986221313, "learning_rate": 4.021540072985084e-05, "loss": 0.1123, "num_input_tokens_seen": 10432384, "step": 11670 }, { "epoch": 3.081298667018609, "grad_norm": 0.24319683015346527, "learning_rate": 4.020760973352289e-05, "loss": 0.0448, "num_input_tokens_seen": 10436768, "step": 11675 }, { "epoch": 3.0826184505741057, "grad_norm": 0.10271193832159042, "learning_rate": 4.019981639199042e-05, "loss": 0.0746, "num_input_tokens_seen": 10441152, "step": 11680 }, { "epoch": 3.0839382341296027, "grad_norm": 0.11218240112066269, "learning_rate": 4.0192020706455245e-05, "loss": 0.0546, "num_input_tokens_seen": 10445408, "step": 11685 }, { "epoch": 3.0852580176850997, "grad_norm": 0.2968849837779999, "learning_rate": 4.018422267811956e-05, "loss": 0.0834, "num_input_tokens_seen": 10449920, "step": 11690 }, { "epoch": 3.0865778012405967, "grad_norm": 0.060434628278017044, "learning_rate": 4.017642230818592e-05, "loss": 0.0497, "num_input_tokens_seen": 10454560, "step": 11695 }, { "epoch": 3.087897584796093, "grad_norm": 0.38852328062057495, "learning_rate": 4.0168619597857246e-05, "loss": 0.0578, "num_input_tokens_seen": 10458752, "step": 11700 }, { "epoch": 3.08921736835159, "grad_norm": 0.10935933142900467, "learning_rate": 4.016081454833681e-05, "loss": 0.0257, "num_input_tokens_seen": 10463200, "step": 11705 }, { "epoch": 3.090537151907087, "grad_norm": 0.12661324441432953, "learning_rate": 4.0153007160828245e-05, "loss": 0.0558, "num_input_tokens_seen": 10467552, "step": 11710 }, { "epoch": 3.091856935462584, "grad_norm": 0.05486902967095375, "learning_rate": 4.0145197436535555e-05, "loss": 0.0846, "num_input_tokens_seen": 10471904, "step": 11715 }, { "epoch": 3.0931767190180812, "grad_norm": 0.6579387187957764, "learning_rate": 4.0137385376663095e-05, "loss": 0.0906, "num_input_tokens_seen": 10476672, "step": 11720 }, { "epoch": 3.094496502573578, "grad_norm": 0.13793013989925385, "learning_rate": 4.012957098241558e-05, "loss": 0.0513, "num_input_tokens_seen": 10481152, "step": 11725 }, { "epoch": 3.095816286129075, "grad_norm": 0.23525623977184296, "learning_rate": 4.0121754254998076e-05, "loss": 0.0809, "num_input_tokens_seen": 10485632, "step": 11730 }, { "epoch": 3.097136069684572, "grad_norm": 0.29484468698501587, "learning_rate": 4.011393519561606e-05, "loss": 0.0824, "num_input_tokens_seen": 10490208, "step": 11735 }, { "epoch": 3.098455853240069, "grad_norm": 0.08719450980424881, "learning_rate": 4.010611380547529e-05, "loss": 0.0309, "num_input_tokens_seen": 10494496, "step": 11740 }, { "epoch": 3.0997756367955653, "grad_norm": 0.4006711542606354, "learning_rate": 4.009829008578192e-05, "loss": 0.0744, "num_input_tokens_seen": 10499008, "step": 11745 }, { "epoch": 3.1010954203510623, "grad_norm": 0.06774111837148666, "learning_rate": 4.00904640377425e-05, "loss": 0.0437, "num_input_tokens_seen": 10503296, "step": 11750 }, { "epoch": 3.1024152039065593, "grad_norm": 0.4838588535785675, "learning_rate": 4.0082635662563886e-05, "loss": 0.0538, "num_input_tokens_seen": 10507616, "step": 11755 }, { "epoch": 3.1037349874620563, "grad_norm": 0.20226526260375977, "learning_rate": 4.007480496145331e-05, "loss": 0.0453, "num_input_tokens_seen": 10512032, "step": 11760 }, { "epoch": 3.105054771017553, "grad_norm": 0.09381644427776337, "learning_rate": 4.006697193561837e-05, "loss": 0.0651, "num_input_tokens_seen": 10516544, "step": 11765 }, { "epoch": 3.10637455457305, "grad_norm": 0.2197590470314026, "learning_rate": 4.005913658626701e-05, "loss": 0.0507, "num_input_tokens_seen": 10520928, "step": 11770 }, { "epoch": 3.107694338128547, "grad_norm": 0.15821683406829834, "learning_rate": 4.005129891460754e-05, "loss": 0.1076, "num_input_tokens_seen": 10525472, "step": 11775 }, { "epoch": 3.109014121684044, "grad_norm": 0.1322133094072342, "learning_rate": 4.004345892184864e-05, "loss": 0.0388, "num_input_tokens_seen": 10529920, "step": 11780 }, { "epoch": 3.110333905239541, "grad_norm": 0.22874826192855835, "learning_rate": 4.003561660919932e-05, "loss": 0.0745, "num_input_tokens_seen": 10534048, "step": 11785 }, { "epoch": 3.1116536887950375, "grad_norm": 0.07344454526901245, "learning_rate": 4.002777197786897e-05, "loss": 0.0423, "num_input_tokens_seen": 10538784, "step": 11790 }, { "epoch": 3.1129734723505345, "grad_norm": 0.2665601074695587, "learning_rate": 4.0019925029067326e-05, "loss": 0.0405, "num_input_tokens_seen": 10543104, "step": 11795 }, { "epoch": 3.1142932559060315, "grad_norm": 0.39506542682647705, "learning_rate": 4.0012075764004495e-05, "loss": 0.0574, "num_input_tokens_seen": 10547296, "step": 11800 }, { "epoch": 3.1142932559060315, "eval_loss": 0.07149194180965424, "eval_runtime": 64.7435, "eval_samples_per_second": 104.026, "eval_steps_per_second": 26.01, "num_input_tokens_seen": 10547296, "step": 11800 }, { "epoch": 3.1156130394615285, "grad_norm": 0.1312103420495987, "learning_rate": 4.000422418389094e-05, "loss": 0.1032, "num_input_tokens_seen": 10551968, "step": 11805 }, { "epoch": 3.116932823017025, "grad_norm": 0.1038178876042366, "learning_rate": 3.999637028993744e-05, "loss": 0.051, "num_input_tokens_seen": 10556512, "step": 11810 }, { "epoch": 3.118252606572522, "grad_norm": 0.12283166497945786, "learning_rate": 3.99885140833552e-05, "loss": 0.0735, "num_input_tokens_seen": 10560832, "step": 11815 }, { "epoch": 3.119572390128019, "grad_norm": 0.0407571904361248, "learning_rate": 3.998065556535572e-05, "loss": 0.0499, "num_input_tokens_seen": 10565440, "step": 11820 }, { "epoch": 3.120892173683516, "grad_norm": 0.40160784125328064, "learning_rate": 3.9972794737150895e-05, "loss": 0.0719, "num_input_tokens_seen": 10569824, "step": 11825 }, { "epoch": 3.122211957239013, "grad_norm": 0.2049623280763626, "learning_rate": 3.996493159995297e-05, "loss": 0.0578, "num_input_tokens_seen": 10573984, "step": 11830 }, { "epoch": 3.1235317407945096, "grad_norm": 0.20785745978355408, "learning_rate": 3.995706615497453e-05, "loss": 0.0581, "num_input_tokens_seen": 10578272, "step": 11835 }, { "epoch": 3.1248515243500066, "grad_norm": 0.3975113034248352, "learning_rate": 3.994919840342852e-05, "loss": 0.0634, "num_input_tokens_seen": 10582752, "step": 11840 }, { "epoch": 3.1261713079055036, "grad_norm": 0.7879478335380554, "learning_rate": 3.994132834652825e-05, "loss": 0.1033, "num_input_tokens_seen": 10586944, "step": 11845 }, { "epoch": 3.1274910914610006, "grad_norm": 0.20513638854026794, "learning_rate": 3.99334559854874e-05, "loss": 0.1151, "num_input_tokens_seen": 10591424, "step": 11850 }, { "epoch": 3.128810875016497, "grad_norm": 0.3745201826095581, "learning_rate": 3.9925581321519955e-05, "loss": 0.0983, "num_input_tokens_seen": 10596320, "step": 11855 }, { "epoch": 3.130130658571994, "grad_norm": 0.024210810661315918, "learning_rate": 3.991770435584031e-05, "loss": 0.0665, "num_input_tokens_seen": 10600896, "step": 11860 }, { "epoch": 3.131450442127491, "grad_norm": 0.3471834063529968, "learning_rate": 3.990982508966319e-05, "loss": 0.0641, "num_input_tokens_seen": 10605600, "step": 11865 }, { "epoch": 3.132770225682988, "grad_norm": 0.10622302442789078, "learning_rate": 3.990194352420367e-05, "loss": 0.0543, "num_input_tokens_seen": 10609952, "step": 11870 }, { "epoch": 3.1340900092384847, "grad_norm": 0.23643702268600464, "learning_rate": 3.9894059660677184e-05, "loss": 0.054, "num_input_tokens_seen": 10614784, "step": 11875 }, { "epoch": 3.1354097927939817, "grad_norm": 0.10862113535404205, "learning_rate": 3.9886173500299526e-05, "loss": 0.0881, "num_input_tokens_seen": 10619264, "step": 11880 }, { "epoch": 3.1367295763494787, "grad_norm": 0.10649561882019043, "learning_rate": 3.987828504428685e-05, "loss": 0.0607, "num_input_tokens_seen": 10623808, "step": 11885 }, { "epoch": 3.1380493599049757, "grad_norm": 0.4502353370189667, "learning_rate": 3.987039429385565e-05, "loss": 0.0798, "num_input_tokens_seen": 10628352, "step": 11890 }, { "epoch": 3.1393691434604727, "grad_norm": 0.24622097611427307, "learning_rate": 3.986250125022277e-05, "loss": 0.0619, "num_input_tokens_seen": 10632800, "step": 11895 }, { "epoch": 3.1406889270159692, "grad_norm": 0.13194748759269714, "learning_rate": 3.985460591460544e-05, "loss": 0.0613, "num_input_tokens_seen": 10637248, "step": 11900 }, { "epoch": 3.1420087105714662, "grad_norm": 0.24656416475772858, "learning_rate": 3.984670828822118e-05, "loss": 0.079, "num_input_tokens_seen": 10641632, "step": 11905 }, { "epoch": 3.1433284941269632, "grad_norm": 0.23687931895256042, "learning_rate": 3.983880837228794e-05, "loss": 0.0425, "num_input_tokens_seen": 10645664, "step": 11910 }, { "epoch": 3.1446482776824602, "grad_norm": 0.11159971356391907, "learning_rate": 3.983090616802396e-05, "loss": 0.0625, "num_input_tokens_seen": 10649792, "step": 11915 }, { "epoch": 3.145968061237957, "grad_norm": 0.02664664015173912, "learning_rate": 3.982300167664788e-05, "loss": 0.0492, "num_input_tokens_seen": 10654528, "step": 11920 }, { "epoch": 3.147287844793454, "grad_norm": 0.12186938524246216, "learning_rate": 3.981509489937868e-05, "loss": 0.0294, "num_input_tokens_seen": 10659168, "step": 11925 }, { "epoch": 3.148607628348951, "grad_norm": 0.10504790395498276, "learning_rate": 3.9807185837435643e-05, "loss": 0.03, "num_input_tokens_seen": 10663776, "step": 11930 }, { "epoch": 3.149927411904448, "grad_norm": 0.3318347632884979, "learning_rate": 3.9799274492038484e-05, "loss": 0.0572, "num_input_tokens_seen": 10668320, "step": 11935 }, { "epoch": 3.1512471954599444, "grad_norm": 0.2635408937931061, "learning_rate": 3.979136086440722e-05, "loss": 0.0674, "num_input_tokens_seen": 10672640, "step": 11940 }, { "epoch": 3.1525669790154414, "grad_norm": 0.2946043014526367, "learning_rate": 3.9783444955762226e-05, "loss": 0.0895, "num_input_tokens_seen": 10676704, "step": 11945 }, { "epoch": 3.1538867625709384, "grad_norm": 0.28410494327545166, "learning_rate": 3.977552676732424e-05, "loss": 0.0795, "num_input_tokens_seen": 10681056, "step": 11950 }, { "epoch": 3.1552065461264354, "grad_norm": 0.054054390639066696, "learning_rate": 3.976760630031435e-05, "loss": 0.0616, "num_input_tokens_seen": 10685472, "step": 11955 }, { "epoch": 3.1565263296819324, "grad_norm": 0.23285432159900665, "learning_rate": 3.975968355595398e-05, "loss": 0.0264, "num_input_tokens_seen": 10689856, "step": 11960 }, { "epoch": 3.157846113237429, "grad_norm": 0.169199138879776, "learning_rate": 3.9751758535464935e-05, "loss": 0.0704, "num_input_tokens_seen": 10694560, "step": 11965 }, { "epoch": 3.159165896792926, "grad_norm": 0.26040729880332947, "learning_rate": 3.9743831240069326e-05, "loss": 0.0616, "num_input_tokens_seen": 10699232, "step": 11970 }, { "epoch": 3.160485680348423, "grad_norm": 0.058169495314359665, "learning_rate": 3.9735901670989675e-05, "loss": 0.0251, "num_input_tokens_seen": 10703808, "step": 11975 }, { "epoch": 3.16180546390392, "grad_norm": 0.24905069172382355, "learning_rate": 3.97279698294488e-05, "loss": 0.091, "num_input_tokens_seen": 10708160, "step": 11980 }, { "epoch": 3.1631252474594165, "grad_norm": 0.06303642690181732, "learning_rate": 3.9720035716669876e-05, "loss": 0.0551, "num_input_tokens_seen": 10712736, "step": 11985 }, { "epoch": 3.1644450310149135, "grad_norm": 0.26368066668510437, "learning_rate": 3.9712099333876474e-05, "loss": 0.0594, "num_input_tokens_seen": 10717440, "step": 11990 }, { "epoch": 3.1657648145704105, "grad_norm": 0.40200507640838623, "learning_rate": 3.9704160682292475e-05, "loss": 0.0911, "num_input_tokens_seen": 10722016, "step": 11995 }, { "epoch": 3.1670845981259075, "grad_norm": 0.175954669713974, "learning_rate": 3.9696219763142106e-05, "loss": 0.0681, "num_input_tokens_seen": 10726592, "step": 12000 }, { "epoch": 3.1670845981259075, "eval_loss": 0.07133092731237411, "eval_runtime": 64.7285, "eval_samples_per_second": 104.05, "eval_steps_per_second": 26.016, "num_input_tokens_seen": 10726592, "step": 12000 }, { "epoch": 3.1684043816814045, "grad_norm": 0.06245870888233185, "learning_rate": 3.968827657764997e-05, "loss": 0.0329, "num_input_tokens_seen": 10731200, "step": 12005 }, { "epoch": 3.169724165236901, "grad_norm": 0.3634347915649414, "learning_rate": 3.9680331127041e-05, "loss": 0.0448, "num_input_tokens_seen": 10735552, "step": 12010 }, { "epoch": 3.171043948792398, "grad_norm": 0.2791411280632019, "learning_rate": 3.9672383412540495e-05, "loss": 0.1529, "num_input_tokens_seen": 10739872, "step": 12015 }, { "epoch": 3.172363732347895, "grad_norm": 0.5884649157524109, "learning_rate": 3.966443343537407e-05, "loss": 0.0787, "num_input_tokens_seen": 10744448, "step": 12020 }, { "epoch": 3.173683515903392, "grad_norm": 0.14738459885120392, "learning_rate": 3.965648119676772e-05, "loss": 0.0243, "num_input_tokens_seen": 10748800, "step": 12025 }, { "epoch": 3.1750032994588886, "grad_norm": 0.12158913910388947, "learning_rate": 3.96485266979478e-05, "loss": 0.094, "num_input_tokens_seen": 10753056, "step": 12030 }, { "epoch": 3.1763230830143856, "grad_norm": 0.36428841948509216, "learning_rate": 3.9640569940140974e-05, "loss": 0.0721, "num_input_tokens_seen": 10757728, "step": 12035 }, { "epoch": 3.1776428665698826, "grad_norm": 0.29071515798568726, "learning_rate": 3.963261092457428e-05, "loss": 0.0489, "num_input_tokens_seen": 10762400, "step": 12040 }, { "epoch": 3.1789626501253796, "grad_norm": 0.23073220252990723, "learning_rate": 3.962464965247509e-05, "loss": 0.0581, "num_input_tokens_seen": 10767072, "step": 12045 }, { "epoch": 3.180282433680876, "grad_norm": 0.0712454691529274, "learning_rate": 3.9616686125071135e-05, "loss": 0.0376, "num_input_tokens_seen": 10771424, "step": 12050 }, { "epoch": 3.181602217236373, "grad_norm": 0.11387949436903, "learning_rate": 3.9608720343590506e-05, "loss": 0.041, "num_input_tokens_seen": 10775840, "step": 12055 }, { "epoch": 3.18292200079187, "grad_norm": 0.2871323525905609, "learning_rate": 3.960075230926161e-05, "loss": 0.0797, "num_input_tokens_seen": 10780384, "step": 12060 }, { "epoch": 3.184241784347367, "grad_norm": 0.2290744036436081, "learning_rate": 3.959278202331322e-05, "loss": 0.1167, "num_input_tokens_seen": 10784800, "step": 12065 }, { "epoch": 3.185561567902864, "grad_norm": 0.1367940753698349, "learning_rate": 3.958480948697446e-05, "loss": 0.0721, "num_input_tokens_seen": 10789120, "step": 12070 }, { "epoch": 3.1868813514583607, "grad_norm": 0.37057942152023315, "learning_rate": 3.95768347014748e-05, "loss": 0.1003, "num_input_tokens_seen": 10793408, "step": 12075 }, { "epoch": 3.1882011350138577, "grad_norm": 0.42554354667663574, "learning_rate": 3.956885766804404e-05, "loss": 0.1077, "num_input_tokens_seen": 10797600, "step": 12080 }, { "epoch": 3.1895209185693547, "grad_norm": 0.19073957204818726, "learning_rate": 3.956087838791235e-05, "loss": 0.0477, "num_input_tokens_seen": 10802112, "step": 12085 }, { "epoch": 3.1908407021248517, "grad_norm": 0.3140142560005188, "learning_rate": 3.955289686231022e-05, "loss": 0.0589, "num_input_tokens_seen": 10806624, "step": 12090 }, { "epoch": 3.1921604856803483, "grad_norm": 0.4150667190551758, "learning_rate": 3.9544913092468504e-05, "loss": 0.1311, "num_input_tokens_seen": 10811232, "step": 12095 }, { "epoch": 3.1934802692358453, "grad_norm": 0.18257953226566315, "learning_rate": 3.9536927079618425e-05, "loss": 0.0448, "num_input_tokens_seen": 10815648, "step": 12100 }, { "epoch": 3.1948000527913423, "grad_norm": 0.2698443531990051, "learning_rate": 3.9528938824991494e-05, "loss": 0.0617, "num_input_tokens_seen": 10820000, "step": 12105 }, { "epoch": 3.1961198363468393, "grad_norm": 0.17314468324184418, "learning_rate": 3.952094832981962e-05, "loss": 0.0419, "num_input_tokens_seen": 10824416, "step": 12110 }, { "epoch": 3.197439619902336, "grad_norm": 0.4345317780971527, "learning_rate": 3.951295559533503e-05, "loss": 0.101, "num_input_tokens_seen": 10828992, "step": 12115 }, { "epoch": 3.198759403457833, "grad_norm": 0.27941009402275085, "learning_rate": 3.95049606227703e-05, "loss": 0.102, "num_input_tokens_seen": 10833856, "step": 12120 }, { "epoch": 3.20007918701333, "grad_norm": 0.11721979826688766, "learning_rate": 3.949696341335838e-05, "loss": 0.0307, "num_input_tokens_seen": 10838368, "step": 12125 }, { "epoch": 3.201398970568827, "grad_norm": 0.2687836289405823, "learning_rate": 3.9488963968332503e-05, "loss": 0.091, "num_input_tokens_seen": 10842880, "step": 12130 }, { "epoch": 3.202718754124324, "grad_norm": 0.0599936805665493, "learning_rate": 3.948096228892631e-05, "loss": 0.0904, "num_input_tokens_seen": 10847552, "step": 12135 }, { "epoch": 3.2040385376798204, "grad_norm": 0.3186212182044983, "learning_rate": 3.947295837637375e-05, "loss": 0.0881, "num_input_tokens_seen": 10851936, "step": 12140 }, { "epoch": 3.2053583212353174, "grad_norm": 0.25621819496154785, "learning_rate": 3.9464952231909135e-05, "loss": 0.0651, "num_input_tokens_seen": 10856640, "step": 12145 }, { "epoch": 3.2066781047908144, "grad_norm": 0.19948351383209229, "learning_rate": 3.945694385676711e-05, "loss": 0.0751, "num_input_tokens_seen": 10860960, "step": 12150 }, { "epoch": 3.2079978883463114, "grad_norm": 0.25004154443740845, "learning_rate": 3.944893325218265e-05, "loss": 0.0393, "num_input_tokens_seen": 10865504, "step": 12155 }, { "epoch": 3.209317671901808, "grad_norm": 0.3196832537651062, "learning_rate": 3.944092041939112e-05, "loss": 0.057, "num_input_tokens_seen": 10870016, "step": 12160 }, { "epoch": 3.210637455457305, "grad_norm": 0.22929449379444122, "learning_rate": 3.943290535962818e-05, "loss": 0.0415, "num_input_tokens_seen": 10874240, "step": 12165 }, { "epoch": 3.211957239012802, "grad_norm": 0.038272034376859665, "learning_rate": 3.942488807412985e-05, "loss": 0.0626, "num_input_tokens_seen": 10878752, "step": 12170 }, { "epoch": 3.213277022568299, "grad_norm": 0.2779572010040283, "learning_rate": 3.941686856413251e-05, "loss": 0.0547, "num_input_tokens_seen": 10883104, "step": 12175 }, { "epoch": 3.2145968061237955, "grad_norm": 0.41146060824394226, "learning_rate": 3.9408846830872874e-05, "loss": 0.045, "num_input_tokens_seen": 10887904, "step": 12180 }, { "epoch": 3.2159165896792925, "grad_norm": 0.11652016639709473, "learning_rate": 3.940082287558798e-05, "loss": 0.0487, "num_input_tokens_seen": 10892448, "step": 12185 }, { "epoch": 3.2172363732347895, "grad_norm": 0.09325562417507172, "learning_rate": 3.939279669951522e-05, "loss": 0.1289, "num_input_tokens_seen": 10896896, "step": 12190 }, { "epoch": 3.2185561567902865, "grad_norm": 0.052444543689489365, "learning_rate": 3.938476830389234e-05, "loss": 0.0798, "num_input_tokens_seen": 10901536, "step": 12195 }, { "epoch": 3.2198759403457835, "grad_norm": 0.08701413869857788, "learning_rate": 3.937673768995742e-05, "loss": 0.0118, "num_input_tokens_seen": 10905760, "step": 12200 }, { "epoch": 3.2198759403457835, "eval_loss": 0.07127954065799713, "eval_runtime": 64.8005, "eval_samples_per_second": 103.934, "eval_steps_per_second": 25.987, "num_input_tokens_seen": 10905760, "step": 12200 }, { "epoch": 3.22119572390128, "grad_norm": 0.24336403608322144, "learning_rate": 3.936870485894888e-05, "loss": 0.0732, "num_input_tokens_seen": 10909856, "step": 12205 }, { "epoch": 3.222515507456777, "grad_norm": 0.10953997820615768, "learning_rate": 3.9360669812105475e-05, "loss": 0.0522, "num_input_tokens_seen": 10914624, "step": 12210 }, { "epoch": 3.223835291012274, "grad_norm": 0.07805051654577255, "learning_rate": 3.9352632550666325e-05, "loss": 0.0264, "num_input_tokens_seen": 10919392, "step": 12215 }, { "epoch": 3.225155074567771, "grad_norm": 0.16056789457798004, "learning_rate": 3.9344593075870866e-05, "loss": 0.0373, "num_input_tokens_seen": 10923808, "step": 12220 }, { "epoch": 3.2264748581232676, "grad_norm": 0.3008278012275696, "learning_rate": 3.933655138895889e-05, "loss": 0.0592, "num_input_tokens_seen": 10928256, "step": 12225 }, { "epoch": 3.2277946416787646, "grad_norm": 0.12557865679264069, "learning_rate": 3.932850749117053e-05, "loss": 0.1159, "num_input_tokens_seen": 10932768, "step": 12230 }, { "epoch": 3.2291144252342616, "grad_norm": 0.1652558594942093, "learning_rate": 3.932046138374624e-05, "loss": 0.0401, "num_input_tokens_seen": 10937024, "step": 12235 }, { "epoch": 3.2304342087897586, "grad_norm": 0.13291266560554504, "learning_rate": 3.9312413067926854e-05, "loss": 0.0286, "num_input_tokens_seen": 10941632, "step": 12240 }, { "epoch": 3.231753992345255, "grad_norm": 0.39460572600364685, "learning_rate": 3.9304362544953506e-05, "loss": 0.0593, "num_input_tokens_seen": 10946016, "step": 12245 }, { "epoch": 3.233073775900752, "grad_norm": 0.10827259719371796, "learning_rate": 3.929630981606769e-05, "loss": 0.1123, "num_input_tokens_seen": 10950464, "step": 12250 }, { "epoch": 3.234393559456249, "grad_norm": 0.25769445300102234, "learning_rate": 3.928825488251124e-05, "loss": 0.0391, "num_input_tokens_seen": 10954912, "step": 12255 }, { "epoch": 3.235713343011746, "grad_norm": 0.35614487528800964, "learning_rate": 3.9280197745526344e-05, "loss": 0.0597, "num_input_tokens_seen": 10959392, "step": 12260 }, { "epoch": 3.237033126567243, "grad_norm": 0.28672635555267334, "learning_rate": 3.9272138406355495e-05, "loss": 0.066, "num_input_tokens_seen": 10963584, "step": 12265 }, { "epoch": 3.2383529101227397, "grad_norm": 0.16827726364135742, "learning_rate": 3.926407686624154e-05, "loss": 0.0429, "num_input_tokens_seen": 10968064, "step": 12270 }, { "epoch": 3.2396726936782367, "grad_norm": 0.1468430608510971, "learning_rate": 3.9256013126427684e-05, "loss": 0.0204, "num_input_tokens_seen": 10972800, "step": 12275 }, { "epoch": 3.2409924772337337, "grad_norm": 0.8434406518936157, "learning_rate": 3.9247947188157455e-05, "loss": 0.147, "num_input_tokens_seen": 10977696, "step": 12280 }, { "epoch": 3.2423122607892307, "grad_norm": 0.07937264442443848, "learning_rate": 3.9239879052674715e-05, "loss": 0.0643, "num_input_tokens_seen": 10982496, "step": 12285 }, { "epoch": 3.2436320443447273, "grad_norm": 0.6772339344024658, "learning_rate": 3.9231808721223673e-05, "loss": 0.0914, "num_input_tokens_seen": 10986944, "step": 12290 }, { "epoch": 3.2449518279002243, "grad_norm": 0.3079594075679779, "learning_rate": 3.9223736195048886e-05, "loss": 0.0261, "num_input_tokens_seen": 10991232, "step": 12295 }, { "epoch": 3.2462716114557213, "grad_norm": 0.12396866083145142, "learning_rate": 3.921566147539523e-05, "loss": 0.0622, "num_input_tokens_seen": 10995520, "step": 12300 }, { "epoch": 3.2475913950112183, "grad_norm": 0.5206104516983032, "learning_rate": 3.920758456350792e-05, "loss": 0.1253, "num_input_tokens_seen": 11000192, "step": 12305 }, { "epoch": 3.248911178566715, "grad_norm": 0.2579779624938965, "learning_rate": 3.919950546063253e-05, "loss": 0.1148, "num_input_tokens_seen": 11004480, "step": 12310 }, { "epoch": 3.250230962122212, "grad_norm": 0.20088797807693481, "learning_rate": 3.919142416801496e-05, "loss": 0.0872, "num_input_tokens_seen": 11009184, "step": 12315 }, { "epoch": 3.251550745677709, "grad_norm": 0.2147436887025833, "learning_rate": 3.918334068690144e-05, "loss": 0.0453, "num_input_tokens_seen": 11013664, "step": 12320 }, { "epoch": 3.252870529233206, "grad_norm": 0.07938262820243835, "learning_rate": 3.917525501853855e-05, "loss": 0.0673, "num_input_tokens_seen": 11018336, "step": 12325 }, { "epoch": 3.254190312788703, "grad_norm": 0.13699668645858765, "learning_rate": 3.916716716417319e-05, "loss": 0.0563, "num_input_tokens_seen": 11022848, "step": 12330 }, { "epoch": 3.2555100963441994, "grad_norm": 0.11194593459367752, "learning_rate": 3.915907712505263e-05, "loss": 0.0303, "num_input_tokens_seen": 11027648, "step": 12335 }, { "epoch": 3.2568298798996964, "grad_norm": 0.2792606055736542, "learning_rate": 3.915098490242444e-05, "loss": 0.0436, "num_input_tokens_seen": 11031968, "step": 12340 }, { "epoch": 3.2581496634551934, "grad_norm": 0.42174407839775085, "learning_rate": 3.914289049753654e-05, "loss": 0.0763, "num_input_tokens_seen": 11036448, "step": 12345 }, { "epoch": 3.2594694470106904, "grad_norm": 0.2800735831260681, "learning_rate": 3.913479391163719e-05, "loss": 0.081, "num_input_tokens_seen": 11040704, "step": 12350 }, { "epoch": 3.260789230566187, "grad_norm": 0.16416536271572113, "learning_rate": 3.9126695145975e-05, "loss": 0.1102, "num_input_tokens_seen": 11045120, "step": 12355 }, { "epoch": 3.262109014121684, "grad_norm": 0.18411599099636078, "learning_rate": 3.911859420179889e-05, "loss": 0.0659, "num_input_tokens_seen": 11049568, "step": 12360 }, { "epoch": 3.263428797677181, "grad_norm": 0.0886845737695694, "learning_rate": 3.911049108035813e-05, "loss": 0.0349, "num_input_tokens_seen": 11054176, "step": 12365 }, { "epoch": 3.264748581232678, "grad_norm": 0.07868367433547974, "learning_rate": 3.910238578290232e-05, "loss": 0.0385, "num_input_tokens_seen": 11058656, "step": 12370 }, { "epoch": 3.2660683647881745, "grad_norm": 0.3242507874965668, "learning_rate": 3.90942783106814e-05, "loss": 0.0476, "num_input_tokens_seen": 11063328, "step": 12375 }, { "epoch": 3.2673881483436715, "grad_norm": 0.19056358933448792, "learning_rate": 3.908616866494564e-05, "loss": 0.1009, "num_input_tokens_seen": 11067776, "step": 12380 }, { "epoch": 3.2687079318991685, "grad_norm": 0.06202366575598717, "learning_rate": 3.907805684694566e-05, "loss": 0.0445, "num_input_tokens_seen": 11072896, "step": 12385 }, { "epoch": 3.2700277154546655, "grad_norm": 0.2809184789657593, "learning_rate": 3.90699428579324e-05, "loss": 0.0483, "num_input_tokens_seen": 11077248, "step": 12390 }, { "epoch": 3.2713474990101625, "grad_norm": 0.27314749360084534, "learning_rate": 3.906182669915713e-05, "loss": 0.0326, "num_input_tokens_seen": 11081856, "step": 12395 }, { "epoch": 3.272667282565659, "grad_norm": 0.3536410927772522, "learning_rate": 3.9053708371871476e-05, "loss": 0.0654, "num_input_tokens_seen": 11086528, "step": 12400 }, { "epoch": 3.272667282565659, "eval_loss": 0.07118018716573715, "eval_runtime": 64.7659, "eval_samples_per_second": 103.99, "eval_steps_per_second": 26.001, "num_input_tokens_seen": 11086528, "step": 12400 }, { "epoch": 3.273987066121156, "grad_norm": 0.07660038769245148, "learning_rate": 3.904558787732738e-05, "loss": 0.0897, "num_input_tokens_seen": 11091360, "step": 12405 }, { "epoch": 3.275306849676653, "grad_norm": 0.3222607672214508, "learning_rate": 3.9037465216777135e-05, "loss": 0.0735, "num_input_tokens_seen": 11095488, "step": 12410 }, { "epoch": 3.27662663323215, "grad_norm": 0.537509024143219, "learning_rate": 3.902934039147334e-05, "loss": 0.1332, "num_input_tokens_seen": 11100000, "step": 12415 }, { "epoch": 3.2779464167876466, "grad_norm": 0.1443837285041809, "learning_rate": 3.902121340266894e-05, "loss": 0.0366, "num_input_tokens_seen": 11104256, "step": 12420 }, { "epoch": 3.2792662003431436, "grad_norm": 0.156450554728508, "learning_rate": 3.9013084251617246e-05, "loss": 0.041, "num_input_tokens_seen": 11108800, "step": 12425 }, { "epoch": 3.2805859838986406, "grad_norm": 0.03396226093173027, "learning_rate": 3.9004952939571865e-05, "loss": 0.0185, "num_input_tokens_seen": 11113120, "step": 12430 }, { "epoch": 3.2819057674541376, "grad_norm": 0.3223204016685486, "learning_rate": 3.899681946778673e-05, "loss": 0.0586, "num_input_tokens_seen": 11117664, "step": 12435 }, { "epoch": 3.283225551009634, "grad_norm": 0.08191344887018204, "learning_rate": 3.898868383751615e-05, "loss": 0.0721, "num_input_tokens_seen": 11122336, "step": 12440 }, { "epoch": 3.284545334565131, "grad_norm": 0.060574185103178024, "learning_rate": 3.8980546050014724e-05, "loss": 0.0466, "num_input_tokens_seen": 11126432, "step": 12445 }, { "epoch": 3.285865118120628, "grad_norm": 0.4308983087539673, "learning_rate": 3.897240610653741e-05, "loss": 0.1267, "num_input_tokens_seen": 11130784, "step": 12450 }, { "epoch": 3.287184901676125, "grad_norm": 0.027273230254650116, "learning_rate": 3.896426400833948e-05, "loss": 0.0734, "num_input_tokens_seen": 11135168, "step": 12455 }, { "epoch": 3.288504685231622, "grad_norm": 0.3020021915435791, "learning_rate": 3.895611975667656e-05, "loss": 0.0957, "num_input_tokens_seen": 11139648, "step": 12460 }, { "epoch": 3.2898244687871188, "grad_norm": 0.11137300729751587, "learning_rate": 3.8947973352804584e-05, "loss": 0.0467, "num_input_tokens_seen": 11144032, "step": 12465 }, { "epoch": 3.2911442523426158, "grad_norm": 0.1464376002550125, "learning_rate": 3.893982479797984e-05, "loss": 0.0614, "num_input_tokens_seen": 11148416, "step": 12470 }, { "epoch": 3.2924640358981128, "grad_norm": 0.0498352125287056, "learning_rate": 3.8931674093458926e-05, "loss": 0.034, "num_input_tokens_seen": 11152736, "step": 12475 }, { "epoch": 3.2937838194536098, "grad_norm": 0.4102264940738678, "learning_rate": 3.89235212404988e-05, "loss": 0.1037, "num_input_tokens_seen": 11157472, "step": 12480 }, { "epoch": 3.2951036030091063, "grad_norm": 0.3158841133117676, "learning_rate": 3.891536624035672e-05, "loss": 0.0467, "num_input_tokens_seen": 11161760, "step": 12485 }, { "epoch": 3.2964233865646033, "grad_norm": 0.2993851900100708, "learning_rate": 3.8907209094290295e-05, "loss": 0.0557, "num_input_tokens_seen": 11166176, "step": 12490 }, { "epoch": 3.2977431701201003, "grad_norm": 0.18618209660053253, "learning_rate": 3.8899049803557466e-05, "loss": 0.0971, "num_input_tokens_seen": 11170560, "step": 12495 }, { "epoch": 3.2990629536755973, "grad_norm": 0.30372899770736694, "learning_rate": 3.889088836941648e-05, "loss": 0.0429, "num_input_tokens_seen": 11175136, "step": 12500 }, { "epoch": 3.300382737231094, "grad_norm": 0.11637644469738007, "learning_rate": 3.8882724793125946e-05, "loss": 0.0707, "num_input_tokens_seen": 11179904, "step": 12505 }, { "epoch": 3.301702520786591, "grad_norm": 0.132761150598526, "learning_rate": 3.8874559075944794e-05, "loss": 0.0511, "num_input_tokens_seen": 11184032, "step": 12510 }, { "epoch": 3.303022304342088, "grad_norm": 0.08603093028068542, "learning_rate": 3.886639121913227e-05, "loss": 0.087, "num_input_tokens_seen": 11188416, "step": 12515 }, { "epoch": 3.304342087897585, "grad_norm": 0.44068774580955505, "learning_rate": 3.885822122394797e-05, "loss": 0.081, "num_input_tokens_seen": 11192832, "step": 12520 }, { "epoch": 3.305661871453082, "grad_norm": 0.22403687238693237, "learning_rate": 3.8850049091651794e-05, "loss": 0.075, "num_input_tokens_seen": 11197248, "step": 12525 }, { "epoch": 3.3069816550085784, "grad_norm": 0.10143960267305374, "learning_rate": 3.8841874823504e-05, "loss": 0.0731, "num_input_tokens_seen": 11201888, "step": 12530 }, { "epoch": 3.3083014385640754, "grad_norm": 0.18605142831802368, "learning_rate": 3.8833698420765157e-05, "loss": 0.0613, "num_input_tokens_seen": 11206400, "step": 12535 }, { "epoch": 3.3096212221195724, "grad_norm": 0.1609012931585312, "learning_rate": 3.882551988469618e-05, "loss": 0.0655, "num_input_tokens_seen": 11210976, "step": 12540 }, { "epoch": 3.3109410056750694, "grad_norm": 0.32810917496681213, "learning_rate": 3.881733921655829e-05, "loss": 0.0828, "num_input_tokens_seen": 11215776, "step": 12545 }, { "epoch": 3.312260789230566, "grad_norm": 0.1068665161728859, "learning_rate": 3.8809156417613054e-05, "loss": 0.0624, "num_input_tokens_seen": 11220128, "step": 12550 }, { "epoch": 3.313580572786063, "grad_norm": 0.11308085173368454, "learning_rate": 3.8800971489122364e-05, "loss": 0.0676, "num_input_tokens_seen": 11224480, "step": 12555 }, { "epoch": 3.31490035634156, "grad_norm": 0.28777286410331726, "learning_rate": 3.8792784432348434e-05, "loss": 0.0605, "num_input_tokens_seen": 11229088, "step": 12560 }, { "epoch": 3.316220139897057, "grad_norm": 0.14732003211975098, "learning_rate": 3.878459524855381e-05, "loss": 0.0702, "num_input_tokens_seen": 11233664, "step": 12565 }, { "epoch": 3.3175399234525536, "grad_norm": 0.10012771934270859, "learning_rate": 3.8776403939001384e-05, "loss": 0.0504, "num_input_tokens_seen": 11238176, "step": 12570 }, { "epoch": 3.3188597070080506, "grad_norm": 0.02408447675406933, "learning_rate": 3.876821050495433e-05, "loss": 0.0412, "num_input_tokens_seen": 11242560, "step": 12575 }, { "epoch": 3.3201794905635476, "grad_norm": 0.24276204407215118, "learning_rate": 3.87600149476762e-05, "loss": 0.039, "num_input_tokens_seen": 11247456, "step": 12580 }, { "epoch": 3.3214992741190446, "grad_norm": 0.23817916214466095, "learning_rate": 3.8751817268430843e-05, "loss": 0.0385, "num_input_tokens_seen": 11252224, "step": 12585 }, { "epoch": 3.3228190576745416, "grad_norm": 0.30738565325737, "learning_rate": 3.8743617468482464e-05, "loss": 0.0489, "num_input_tokens_seen": 11256608, "step": 12590 }, { "epoch": 3.324138841230038, "grad_norm": 0.29069724678993225, "learning_rate": 3.8735415549095535e-05, "loss": 0.0873, "num_input_tokens_seen": 11261376, "step": 12595 }, { "epoch": 3.325458624785535, "grad_norm": 0.312551885843277, "learning_rate": 3.8727211511534934e-05, "loss": 0.0981, "num_input_tokens_seen": 11266208, "step": 12600 }, { "epoch": 3.325458624785535, "eval_loss": 0.07108491659164429, "eval_runtime": 64.7375, "eval_samples_per_second": 104.036, "eval_steps_per_second": 26.013, "num_input_tokens_seen": 11266208, "step": 12600 }, { "epoch": 3.326778408341032, "grad_norm": 0.606665849685669, "learning_rate": 3.8719005357065804e-05, "loss": 0.0852, "num_input_tokens_seen": 11270784, "step": 12605 }, { "epoch": 3.328098191896529, "grad_norm": 0.3690475523471832, "learning_rate": 3.8710797086953645e-05, "loss": 0.0835, "num_input_tokens_seen": 11275200, "step": 12610 }, { "epoch": 3.329417975452026, "grad_norm": 0.08787667006254196, "learning_rate": 3.870258670246427e-05, "loss": 0.0288, "num_input_tokens_seen": 11279744, "step": 12615 }, { "epoch": 3.3307377590075227, "grad_norm": 0.17705772817134857, "learning_rate": 3.869437420486384e-05, "loss": 0.0787, "num_input_tokens_seen": 11284000, "step": 12620 }, { "epoch": 3.3320575425630197, "grad_norm": 0.35230889916419983, "learning_rate": 3.8686159595418805e-05, "loss": 0.076, "num_input_tokens_seen": 11288576, "step": 12625 }, { "epoch": 3.3333773261185167, "grad_norm": 0.46084660291671753, "learning_rate": 3.867794287539597e-05, "loss": 0.0832, "num_input_tokens_seen": 11292928, "step": 12630 }, { "epoch": 3.3346971096740132, "grad_norm": 0.42620107531547546, "learning_rate": 3.866972404606245e-05, "loss": 0.0796, "num_input_tokens_seen": 11297280, "step": 12635 }, { "epoch": 3.3360168932295102, "grad_norm": 0.059471841901540756, "learning_rate": 3.866150310868571e-05, "loss": 0.0551, "num_input_tokens_seen": 11302272, "step": 12640 }, { "epoch": 3.3373366767850072, "grad_norm": 0.2551204264163971, "learning_rate": 3.8653280064533506e-05, "loss": 0.0361, "num_input_tokens_seen": 11306496, "step": 12645 }, { "epoch": 3.3386564603405042, "grad_norm": 0.235077366232872, "learning_rate": 3.864505491487394e-05, "loss": 0.0373, "num_input_tokens_seen": 11311040, "step": 12650 }, { "epoch": 3.3399762438960012, "grad_norm": 0.23913665115833282, "learning_rate": 3.8636827660975414e-05, "loss": 0.0638, "num_input_tokens_seen": 11315616, "step": 12655 }, { "epoch": 3.341296027451498, "grad_norm": 0.24818812310695648, "learning_rate": 3.862859830410671e-05, "loss": 0.0816, "num_input_tokens_seen": 11319872, "step": 12660 }, { "epoch": 3.342615811006995, "grad_norm": 0.0719914510846138, "learning_rate": 3.862036684553688e-05, "loss": 0.0717, "num_input_tokens_seen": 11324160, "step": 12665 }, { "epoch": 3.343935594562492, "grad_norm": 0.28848952054977417, "learning_rate": 3.8612133286535314e-05, "loss": 0.0738, "num_input_tokens_seen": 11328352, "step": 12670 }, { "epoch": 3.345255378117989, "grad_norm": 0.15879443287849426, "learning_rate": 3.860389762837173e-05, "loss": 0.0807, "num_input_tokens_seen": 11332768, "step": 12675 }, { "epoch": 3.346575161673486, "grad_norm": 0.5777024626731873, "learning_rate": 3.859565987231618e-05, "loss": 0.1126, "num_input_tokens_seen": 11336896, "step": 12680 }, { "epoch": 3.3478949452289823, "grad_norm": 0.2672886550426483, "learning_rate": 3.858742001963902e-05, "loss": 0.1037, "num_input_tokens_seen": 11341376, "step": 12685 }, { "epoch": 3.3492147287844793, "grad_norm": 0.1624869853258133, "learning_rate": 3.857917807161094e-05, "loss": 0.0434, "num_input_tokens_seen": 11345888, "step": 12690 }, { "epoch": 3.3505345123399763, "grad_norm": 0.25792065262794495, "learning_rate": 3.857093402950296e-05, "loss": 0.0482, "num_input_tokens_seen": 11350592, "step": 12695 }, { "epoch": 3.351854295895473, "grad_norm": 0.08953763544559479, "learning_rate": 3.8562687894586414e-05, "loss": 0.0451, "num_input_tokens_seen": 11354816, "step": 12700 }, { "epoch": 3.35317407945097, "grad_norm": 0.21978148818016052, "learning_rate": 3.8554439668132946e-05, "loss": 0.0546, "num_input_tokens_seen": 11359552, "step": 12705 }, { "epoch": 3.354493863006467, "grad_norm": 0.19123108685016632, "learning_rate": 3.854618935141455e-05, "loss": 0.0434, "num_input_tokens_seen": 11364352, "step": 12710 }, { "epoch": 3.355813646561964, "grad_norm": 0.2923915386199951, "learning_rate": 3.8537936945703525e-05, "loss": 0.0422, "num_input_tokens_seen": 11368800, "step": 12715 }, { "epoch": 3.357133430117461, "grad_norm": 0.05438604950904846, "learning_rate": 3.852968245227249e-05, "loss": 0.028, "num_input_tokens_seen": 11373408, "step": 12720 }, { "epoch": 3.3584532136729575, "grad_norm": 0.4267398715019226, "learning_rate": 3.85214258723944e-05, "loss": 0.0824, "num_input_tokens_seen": 11378208, "step": 12725 }, { "epoch": 3.3597729972284545, "grad_norm": 0.06874749809503555, "learning_rate": 3.8513167207342524e-05, "loss": 0.0892, "num_input_tokens_seen": 11382688, "step": 12730 }, { "epoch": 3.3610927807839515, "grad_norm": 0.5170417428016663, "learning_rate": 3.850490645839044e-05, "loss": 0.0477, "num_input_tokens_seen": 11386944, "step": 12735 }, { "epoch": 3.3624125643394485, "grad_norm": 0.09361166507005692, "learning_rate": 3.849664362681207e-05, "loss": 0.0698, "num_input_tokens_seen": 11391424, "step": 12740 }, { "epoch": 3.3637323478949455, "grad_norm": 0.1086655855178833, "learning_rate": 3.848837871388165e-05, "loss": 0.0415, "num_input_tokens_seen": 11396032, "step": 12745 }, { "epoch": 3.365052131450442, "grad_norm": 0.030960671603679657, "learning_rate": 3.848011172087371e-05, "loss": 0.0348, "num_input_tokens_seen": 11400864, "step": 12750 }, { "epoch": 3.366371915005939, "grad_norm": 0.40509742498397827, "learning_rate": 3.847184264906315e-05, "loss": 0.078, "num_input_tokens_seen": 11405312, "step": 12755 }, { "epoch": 3.367691698561436, "grad_norm": 0.14501312375068665, "learning_rate": 3.846357149972516e-05, "loss": 0.1218, "num_input_tokens_seen": 11409824, "step": 12760 }, { "epoch": 3.3690114821169326, "grad_norm": 0.13309578597545624, "learning_rate": 3.8455298274135246e-05, "loss": 0.03, "num_input_tokens_seen": 11414496, "step": 12765 }, { "epoch": 3.3703312656724296, "grad_norm": 0.32120785117149353, "learning_rate": 3.8447022973569254e-05, "loss": 0.062, "num_input_tokens_seen": 11418784, "step": 12770 }, { "epoch": 3.3716510492279266, "grad_norm": 0.11422805488109589, "learning_rate": 3.843874559930332e-05, "loss": 0.0516, "num_input_tokens_seen": 11423520, "step": 12775 }, { "epoch": 3.3729708327834236, "grad_norm": 0.7025569081306458, "learning_rate": 3.843046615261394e-05, "loss": 0.1251, "num_input_tokens_seen": 11427552, "step": 12780 }, { "epoch": 3.3742906163389206, "grad_norm": 0.10289428383111954, "learning_rate": 3.842218463477791e-05, "loss": 0.0971, "num_input_tokens_seen": 11432000, "step": 12785 }, { "epoch": 3.375610399894417, "grad_norm": 0.22900713980197906, "learning_rate": 3.841390104707233e-05, "loss": 0.0313, "num_input_tokens_seen": 11436160, "step": 12790 }, { "epoch": 3.376930183449914, "grad_norm": 0.18482773005962372, "learning_rate": 3.8405615390774643e-05, "loss": 0.0418, "num_input_tokens_seen": 11440416, "step": 12795 }, { "epoch": 3.378249967005411, "grad_norm": 0.3416880965232849, "learning_rate": 3.839732766716259e-05, "loss": 0.043, "num_input_tokens_seen": 11445184, "step": 12800 }, { "epoch": 3.378249967005411, "eval_loss": 0.07112780958414078, "eval_runtime": 64.7412, "eval_samples_per_second": 104.03, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 11445184, "step": 12800 }, { "epoch": 3.379569750560908, "grad_norm": 0.2003156542778015, "learning_rate": 3.838903787751425e-05, "loss": 0.0367, "num_input_tokens_seen": 11449728, "step": 12805 }, { "epoch": 3.380889534116405, "grad_norm": 0.07657336443662643, "learning_rate": 3.838074602310802e-05, "loss": 0.0923, "num_input_tokens_seen": 11454144, "step": 12810 }, { "epoch": 3.3822093176719017, "grad_norm": 0.3678944408893585, "learning_rate": 3.837245210522258e-05, "loss": 0.0734, "num_input_tokens_seen": 11458496, "step": 12815 }, { "epoch": 3.3835291012273987, "grad_norm": 0.05436988174915314, "learning_rate": 3.8364156125136996e-05, "loss": 0.0277, "num_input_tokens_seen": 11463200, "step": 12820 }, { "epoch": 3.3848488847828957, "grad_norm": 0.7360002398490906, "learning_rate": 3.835585808413059e-05, "loss": 0.1179, "num_input_tokens_seen": 11467968, "step": 12825 }, { "epoch": 3.3861686683383927, "grad_norm": 0.1318444311618805, "learning_rate": 3.8347557983483024e-05, "loss": 0.0898, "num_input_tokens_seen": 11472608, "step": 12830 }, { "epoch": 3.3874884518938893, "grad_norm": 0.17243808507919312, "learning_rate": 3.833925582447428e-05, "loss": 0.0616, "num_input_tokens_seen": 11477472, "step": 12835 }, { "epoch": 3.3888082354493863, "grad_norm": 0.18860001862049103, "learning_rate": 3.8330951608384656e-05, "loss": 0.0181, "num_input_tokens_seen": 11481952, "step": 12840 }, { "epoch": 3.3901280190048833, "grad_norm": 0.17510107159614563, "learning_rate": 3.832264533649477e-05, "loss": 0.0684, "num_input_tokens_seen": 11485920, "step": 12845 }, { "epoch": 3.3914478025603803, "grad_norm": 0.24049000442028046, "learning_rate": 3.8314337010085555e-05, "loss": 0.0721, "num_input_tokens_seen": 11490272, "step": 12850 }, { "epoch": 3.392767586115877, "grad_norm": 0.030918072909116745, "learning_rate": 3.830602663043824e-05, "loss": 0.0518, "num_input_tokens_seen": 11494592, "step": 12855 }, { "epoch": 3.394087369671374, "grad_norm": 0.33452993631362915, "learning_rate": 3.8297714198834414e-05, "loss": 0.0547, "num_input_tokens_seen": 11499136, "step": 12860 }, { "epoch": 3.395407153226871, "grad_norm": 0.25054389238357544, "learning_rate": 3.828939971655595e-05, "loss": 0.0484, "num_input_tokens_seen": 11503456, "step": 12865 }, { "epoch": 3.396726936782368, "grad_norm": 0.06659526377916336, "learning_rate": 3.828108318488505e-05, "loss": 0.0667, "num_input_tokens_seen": 11507712, "step": 12870 }, { "epoch": 3.398046720337865, "grad_norm": 0.28113067150115967, "learning_rate": 3.8272764605104216e-05, "loss": 0.0645, "num_input_tokens_seen": 11511936, "step": 12875 }, { "epoch": 3.3993665038933614, "grad_norm": 0.35489577054977417, "learning_rate": 3.826444397849628e-05, "loss": 0.0525, "num_input_tokens_seen": 11516576, "step": 12880 }, { "epoch": 3.4006862874488584, "grad_norm": 0.03772253170609474, "learning_rate": 3.825612130634439e-05, "loss": 0.0568, "num_input_tokens_seen": 11520928, "step": 12885 }, { "epoch": 3.4020060710043554, "grad_norm": 0.05010586604475975, "learning_rate": 3.824779658993202e-05, "loss": 0.046, "num_input_tokens_seen": 11525632, "step": 12890 }, { "epoch": 3.4033258545598524, "grad_norm": 0.3769335150718689, "learning_rate": 3.823946983054292e-05, "loss": 0.0563, "num_input_tokens_seen": 11530368, "step": 12895 }, { "epoch": 3.404645638115349, "grad_norm": 0.2078799158334732, "learning_rate": 3.82311410294612e-05, "loss": 0.0525, "num_input_tokens_seen": 11535072, "step": 12900 }, { "epoch": 3.405965421670846, "grad_norm": 0.2232269048690796, "learning_rate": 3.822281018797127e-05, "loss": 0.1072, "num_input_tokens_seen": 11539392, "step": 12905 }, { "epoch": 3.407285205226343, "grad_norm": 0.10016993433237076, "learning_rate": 3.821447730735783e-05, "loss": 0.07, "num_input_tokens_seen": 11543904, "step": 12910 }, { "epoch": 3.40860498878184, "grad_norm": 0.2646557688713074, "learning_rate": 3.820614238890592e-05, "loss": 0.0851, "num_input_tokens_seen": 11548416, "step": 12915 }, { "epoch": 3.4099247723373365, "grad_norm": 0.05379759520292282, "learning_rate": 3.819780543390091e-05, "loss": 0.0873, "num_input_tokens_seen": 11552832, "step": 12920 }, { "epoch": 3.4112445558928335, "grad_norm": 0.20758458971977234, "learning_rate": 3.818946644362844e-05, "loss": 0.0328, "num_input_tokens_seen": 11557696, "step": 12925 }, { "epoch": 3.4125643394483305, "grad_norm": 0.09507401287555695, "learning_rate": 3.81811254193745e-05, "loss": 0.0231, "num_input_tokens_seen": 11562176, "step": 12930 }, { "epoch": 3.4138841230038275, "grad_norm": 0.2030310183763504, "learning_rate": 3.8172782362425366e-05, "loss": 0.0591, "num_input_tokens_seen": 11566464, "step": 12935 }, { "epoch": 3.4152039065593245, "grad_norm": 0.09187600761651993, "learning_rate": 3.816443727406765e-05, "loss": 0.0407, "num_input_tokens_seen": 11570976, "step": 12940 }, { "epoch": 3.416523690114821, "grad_norm": 0.4140663146972656, "learning_rate": 3.815609015558829e-05, "loss": 0.0627, "num_input_tokens_seen": 11575552, "step": 12945 }, { "epoch": 3.417843473670318, "grad_norm": 0.17092835903167725, "learning_rate": 3.814774100827448e-05, "loss": 0.0524, "num_input_tokens_seen": 11579808, "step": 12950 }, { "epoch": 3.419163257225815, "grad_norm": 0.6382081508636475, "learning_rate": 3.813938983341379e-05, "loss": 0.0862, "num_input_tokens_seen": 11584224, "step": 12955 }, { "epoch": 3.420483040781312, "grad_norm": 0.05937276408076286, "learning_rate": 3.813103663229407e-05, "loss": 0.0996, "num_input_tokens_seen": 11588704, "step": 12960 }, { "epoch": 3.4218028243368086, "grad_norm": 0.239060640335083, "learning_rate": 3.812268140620349e-05, "loss": 0.0758, "num_input_tokens_seen": 11593216, "step": 12965 }, { "epoch": 3.4231226078923056, "grad_norm": 0.07066410034894943, "learning_rate": 3.811432415643051e-05, "loss": 0.0647, "num_input_tokens_seen": 11597568, "step": 12970 }, { "epoch": 3.4244423914478026, "grad_norm": 0.07205714285373688, "learning_rate": 3.8105964884263954e-05, "loss": 0.0487, "num_input_tokens_seen": 11602080, "step": 12975 }, { "epoch": 3.4257621750032996, "grad_norm": 0.41819435358047485, "learning_rate": 3.809760359099291e-05, "loss": 0.0969, "num_input_tokens_seen": 11606496, "step": 12980 }, { "epoch": 3.427081958558796, "grad_norm": 0.4587157368659973, "learning_rate": 3.8089240277906804e-05, "loss": 0.0832, "num_input_tokens_seen": 11610592, "step": 12985 }, { "epoch": 3.428401742114293, "grad_norm": 0.2852168679237366, "learning_rate": 3.808087494629535e-05, "loss": 0.0825, "num_input_tokens_seen": 11614976, "step": 12990 }, { "epoch": 3.42972152566979, "grad_norm": 0.4607377350330353, "learning_rate": 3.8072507597448595e-05, "loss": 0.0864, "num_input_tokens_seen": 11619488, "step": 12995 }, { "epoch": 3.431041309225287, "grad_norm": 0.160573810338974, "learning_rate": 3.806413823265689e-05, "loss": 0.0349, "num_input_tokens_seen": 11623936, "step": 13000 }, { "epoch": 3.431041309225287, "eval_loss": 0.07065212726593018, "eval_runtime": 64.8025, "eval_samples_per_second": 103.931, "eval_steps_per_second": 25.987, "num_input_tokens_seen": 11623936, "step": 13000 }, { "epoch": 3.432361092780784, "grad_norm": 0.12693430483341217, "learning_rate": 3.805576685321089e-05, "loss": 0.0574, "num_input_tokens_seen": 11628192, "step": 13005 }, { "epoch": 3.4336808763362807, "grad_norm": 0.15356528759002686, "learning_rate": 3.804739346040158e-05, "loss": 0.0394, "num_input_tokens_seen": 11632512, "step": 13010 }, { "epoch": 3.4350006598917777, "grad_norm": 0.2569732666015625, "learning_rate": 3.8039018055520234e-05, "loss": 0.0474, "num_input_tokens_seen": 11636832, "step": 13015 }, { "epoch": 3.4363204434472747, "grad_norm": 0.4036039710044861, "learning_rate": 3.803064063985844e-05, "loss": 0.068, "num_input_tokens_seen": 11641344, "step": 13020 }, { "epoch": 3.4376402270027717, "grad_norm": 0.05075610801577568, "learning_rate": 3.802226121470811e-05, "loss": 0.0435, "num_input_tokens_seen": 11645920, "step": 13025 }, { "epoch": 3.4389600105582683, "grad_norm": 0.39474213123321533, "learning_rate": 3.801387978136145e-05, "loss": 0.0943, "num_input_tokens_seen": 11650656, "step": 13030 }, { "epoch": 3.4402797941137653, "grad_norm": 0.23534747958183289, "learning_rate": 3.800549634111099e-05, "loss": 0.0992, "num_input_tokens_seen": 11655104, "step": 13035 }, { "epoch": 3.4415995776692623, "grad_norm": 0.13698719441890717, "learning_rate": 3.799711089524955e-05, "loss": 0.0567, "num_input_tokens_seen": 11659680, "step": 13040 }, { "epoch": 3.4429193612247593, "grad_norm": 0.44572871923446655, "learning_rate": 3.7988723445070285e-05, "loss": 0.0826, "num_input_tokens_seen": 11664256, "step": 13045 }, { "epoch": 3.444239144780256, "grad_norm": 0.17778794467449188, "learning_rate": 3.798033399186663e-05, "loss": 0.07, "num_input_tokens_seen": 11668544, "step": 13050 }, { "epoch": 3.445558928335753, "grad_norm": 0.3145613968372345, "learning_rate": 3.797194253693237e-05, "loss": 0.0691, "num_input_tokens_seen": 11672832, "step": 13055 }, { "epoch": 3.44687871189125, "grad_norm": 0.1257438212633133, "learning_rate": 3.796354908156153e-05, "loss": 0.0346, "num_input_tokens_seen": 11677344, "step": 13060 }, { "epoch": 3.448198495446747, "grad_norm": 0.07033568620681763, "learning_rate": 3.795515362704853e-05, "loss": 0.0637, "num_input_tokens_seen": 11681760, "step": 13065 }, { "epoch": 3.449518279002244, "grad_norm": 0.5977985858917236, "learning_rate": 3.794675617468803e-05, "loss": 0.0514, "num_input_tokens_seen": 11686048, "step": 13070 }, { "epoch": 3.4508380625577404, "grad_norm": 0.16040270030498505, "learning_rate": 3.793835672577503e-05, "loss": 0.0161, "num_input_tokens_seen": 11690496, "step": 13075 }, { "epoch": 3.4521578461132374, "grad_norm": 0.10661104321479797, "learning_rate": 3.7929955281604826e-05, "loss": 0.0358, "num_input_tokens_seen": 11694688, "step": 13080 }, { "epoch": 3.4534776296687344, "grad_norm": 0.1306701898574829, "learning_rate": 3.7921551843473036e-05, "loss": 0.043, "num_input_tokens_seen": 11699264, "step": 13085 }, { "epoch": 3.4547974132242314, "grad_norm": 0.08522362262010574, "learning_rate": 3.791314641267557e-05, "loss": 0.0559, "num_input_tokens_seen": 11703488, "step": 13090 }, { "epoch": 3.456117196779728, "grad_norm": 0.3868141770362854, "learning_rate": 3.790473899050864e-05, "loss": 0.0522, "num_input_tokens_seen": 11707648, "step": 13095 }, { "epoch": 3.457436980335225, "grad_norm": 0.03684745728969574, "learning_rate": 3.7896329578268794e-05, "loss": 0.0266, "num_input_tokens_seen": 11712192, "step": 13100 }, { "epoch": 3.458756763890722, "grad_norm": 0.34080183506011963, "learning_rate": 3.7887918177252855e-05, "loss": 0.0483, "num_input_tokens_seen": 11716576, "step": 13105 }, { "epoch": 3.460076547446219, "grad_norm": 0.2844870090484619, "learning_rate": 3.787950478875798e-05, "loss": 0.0572, "num_input_tokens_seen": 11721088, "step": 13110 }, { "epoch": 3.4613963310017155, "grad_norm": 0.10446255654096603, "learning_rate": 3.787108941408162e-05, "loss": 0.0486, "num_input_tokens_seen": 11725664, "step": 13115 }, { "epoch": 3.4627161145572125, "grad_norm": 0.15243922173976898, "learning_rate": 3.786267205452151e-05, "loss": 0.0241, "num_input_tokens_seen": 11729856, "step": 13120 }, { "epoch": 3.4640358981127095, "grad_norm": 0.18912728130817413, "learning_rate": 3.785425271137573e-05, "loss": 0.0754, "num_input_tokens_seen": 11734400, "step": 13125 }, { "epoch": 3.4653556816682065, "grad_norm": 0.1190740317106247, "learning_rate": 3.7845831385942655e-05, "loss": 0.0778, "num_input_tokens_seen": 11738944, "step": 13130 }, { "epoch": 3.4666754652237035, "grad_norm": 0.19124087691307068, "learning_rate": 3.7837408079520944e-05, "loss": 0.0331, "num_input_tokens_seen": 11743456, "step": 13135 }, { "epoch": 3.4679952487792, "grad_norm": 0.2830105125904083, "learning_rate": 3.782898279340957e-05, "loss": 0.0667, "num_input_tokens_seen": 11747744, "step": 13140 }, { "epoch": 3.469315032334697, "grad_norm": 0.2466481775045395, "learning_rate": 3.782055552890784e-05, "loss": 0.0619, "num_input_tokens_seen": 11752256, "step": 13145 }, { "epoch": 3.470634815890194, "grad_norm": 0.17591819167137146, "learning_rate": 3.781212628731534e-05, "loss": 0.0724, "num_input_tokens_seen": 11756832, "step": 13150 }, { "epoch": 3.471954599445691, "grad_norm": 0.08080969750881195, "learning_rate": 3.7803695069931946e-05, "loss": 0.0787, "num_input_tokens_seen": 11760960, "step": 13155 }, { "epoch": 3.4732743830011876, "grad_norm": 0.28955063223838806, "learning_rate": 3.779526187805789e-05, "loss": 0.0469, "num_input_tokens_seen": 11765408, "step": 13160 }, { "epoch": 3.4745941665566846, "grad_norm": 0.15552116930484772, "learning_rate": 3.778682671299364e-05, "loss": 0.0221, "num_input_tokens_seen": 11769600, "step": 13165 }, { "epoch": 3.4759139501121816, "grad_norm": 0.3238231837749481, "learning_rate": 3.777838957604003e-05, "loss": 0.0712, "num_input_tokens_seen": 11774176, "step": 13170 }, { "epoch": 3.4772337336676786, "grad_norm": 0.4140813648700714, "learning_rate": 3.776995046849816e-05, "loss": 0.0613, "num_input_tokens_seen": 11778496, "step": 13175 }, { "epoch": 3.478553517223175, "grad_norm": 0.5008772611618042, "learning_rate": 3.776150939166945e-05, "loss": 0.0807, "num_input_tokens_seen": 11783040, "step": 13180 }, { "epoch": 3.479873300778672, "grad_norm": 0.3332741856575012, "learning_rate": 3.775306634685562e-05, "loss": 0.0565, "num_input_tokens_seen": 11787520, "step": 13185 }, { "epoch": 3.481193084334169, "grad_norm": 0.24662064015865326, "learning_rate": 3.7744621335358696e-05, "loss": 0.0503, "num_input_tokens_seen": 11792288, "step": 13190 }, { "epoch": 3.482512867889666, "grad_norm": 0.13576284050941467, "learning_rate": 3.7736174358481e-05, "loss": 0.0478, "num_input_tokens_seen": 11796800, "step": 13195 }, { "epoch": 3.483832651445163, "grad_norm": 0.13013531267642975, "learning_rate": 3.7727725417525175e-05, "loss": 0.0351, "num_input_tokens_seen": 11801312, "step": 13200 }, { "epoch": 3.483832651445163, "eval_loss": 0.07120130211114883, "eval_runtime": 64.7201, "eval_samples_per_second": 104.063, "eval_steps_per_second": 26.02, "num_input_tokens_seen": 11801312, "step": 13200 }, { "epoch": 3.4851524350006597, "grad_norm": 0.2128872573375702, "learning_rate": 3.771927451379414e-05, "loss": 0.0712, "num_input_tokens_seen": 11805760, "step": 13205 }, { "epoch": 3.4864722185561567, "grad_norm": 0.3668399751186371, "learning_rate": 3.7710821648591135e-05, "loss": 0.1133, "num_input_tokens_seen": 11810464, "step": 13210 }, { "epoch": 3.4877920021116537, "grad_norm": 0.3235186040401459, "learning_rate": 3.7702366823219694e-05, "loss": 0.0885, "num_input_tokens_seen": 11814656, "step": 13215 }, { "epoch": 3.4891117856671507, "grad_norm": 0.24456529319286346, "learning_rate": 3.769391003898366e-05, "loss": 0.046, "num_input_tokens_seen": 11819424, "step": 13220 }, { "epoch": 3.4904315692226473, "grad_norm": 0.10153931379318237, "learning_rate": 3.768545129718718e-05, "loss": 0.0604, "num_input_tokens_seen": 11823680, "step": 13225 }, { "epoch": 3.4917513527781443, "grad_norm": 0.7076449394226074, "learning_rate": 3.7676990599134686e-05, "loss": 0.074, "num_input_tokens_seen": 11828096, "step": 13230 }, { "epoch": 3.4930711363336413, "grad_norm": 0.24989350140094757, "learning_rate": 3.766852794613095e-05, "loss": 0.0524, "num_input_tokens_seen": 11832576, "step": 13235 }, { "epoch": 3.4943909198891383, "grad_norm": 0.25572073459625244, "learning_rate": 3.766006333948099e-05, "loss": 0.0727, "num_input_tokens_seen": 11836896, "step": 13240 }, { "epoch": 3.495710703444635, "grad_norm": 0.125401109457016, "learning_rate": 3.765159678049017e-05, "loss": 0.0709, "num_input_tokens_seen": 11841280, "step": 13245 }, { "epoch": 3.497030487000132, "grad_norm": 0.18690870702266693, "learning_rate": 3.7643128270464134e-05, "loss": 0.032, "num_input_tokens_seen": 11845760, "step": 13250 }, { "epoch": 3.498350270555629, "grad_norm": 0.05348171666264534, "learning_rate": 3.763465781070884e-05, "loss": 0.0414, "num_input_tokens_seen": 11850240, "step": 13255 }, { "epoch": 3.499670054111126, "grad_norm": 0.1838684231042862, "learning_rate": 3.762618540253052e-05, "loss": 0.051, "num_input_tokens_seen": 11855104, "step": 13260 }, { "epoch": 3.500989837666623, "grad_norm": 0.486880898475647, "learning_rate": 3.761771104723576e-05, "loss": 0.067, "num_input_tokens_seen": 11859616, "step": 13265 }, { "epoch": 3.5023096212221194, "grad_norm": 0.33088698983192444, "learning_rate": 3.7609234746131386e-05, "loss": 0.0564, "num_input_tokens_seen": 11863872, "step": 13270 }, { "epoch": 3.5036294047776164, "grad_norm": 0.33012714982032776, "learning_rate": 3.7600756500524556e-05, "loss": 0.0412, "num_input_tokens_seen": 11867968, "step": 13275 }, { "epoch": 3.5049491883331134, "grad_norm": 0.10354222357273102, "learning_rate": 3.759227631172271e-05, "loss": 0.0232, "num_input_tokens_seen": 11872064, "step": 13280 }, { "epoch": 3.5062689718886104, "grad_norm": 0.08376798778772354, "learning_rate": 3.758379418103363e-05, "loss": 0.042, "num_input_tokens_seen": 11876544, "step": 13285 }, { "epoch": 3.5075887554441074, "grad_norm": 0.05344199761748314, "learning_rate": 3.757531010976534e-05, "loss": 0.0282, "num_input_tokens_seen": 11881088, "step": 13290 }, { "epoch": 3.508908538999604, "grad_norm": 0.0856684222817421, "learning_rate": 3.75668240992262e-05, "loss": 0.0487, "num_input_tokens_seen": 11885408, "step": 13295 }, { "epoch": 3.510228322555101, "grad_norm": 0.09118033200502396, "learning_rate": 3.7558336150724865e-05, "loss": 0.0443, "num_input_tokens_seen": 11889728, "step": 13300 }, { "epoch": 3.511548106110598, "grad_norm": 0.06639908999204636, "learning_rate": 3.754984626557028e-05, "loss": 0.044, "num_input_tokens_seen": 11894016, "step": 13305 }, { "epoch": 3.5128678896660945, "grad_norm": 0.06480544060468674, "learning_rate": 3.754135444507168e-05, "loss": 0.0859, "num_input_tokens_seen": 11898720, "step": 13310 }, { "epoch": 3.5141876732215915, "grad_norm": 0.03230544924736023, "learning_rate": 3.753286069053863e-05, "loss": 0.0597, "num_input_tokens_seen": 11903232, "step": 13315 }, { "epoch": 3.5155074567770885, "grad_norm": 0.05360060930252075, "learning_rate": 3.7524365003280945e-05, "loss": 0.0576, "num_input_tokens_seen": 11907904, "step": 13320 }, { "epoch": 3.5168272403325855, "grad_norm": 0.09171462804079056, "learning_rate": 3.75158673846088e-05, "loss": 0.0614, "num_input_tokens_seen": 11912224, "step": 13325 }, { "epoch": 3.5181470238880825, "grad_norm": 0.1222505271434784, "learning_rate": 3.750736783583262e-05, "loss": 0.0345, "num_input_tokens_seen": 11916608, "step": 13330 }, { "epoch": 3.519466807443579, "grad_norm": 0.5553958415985107, "learning_rate": 3.7498866358263144e-05, "loss": 0.155, "num_input_tokens_seen": 11921408, "step": 13335 }, { "epoch": 3.520786590999076, "grad_norm": 0.0767131894826889, "learning_rate": 3.74903629532114e-05, "loss": 0.0421, "num_input_tokens_seen": 11926016, "step": 13340 }, { "epoch": 3.522106374554573, "grad_norm": 0.19707342982292175, "learning_rate": 3.748185762198873e-05, "loss": 0.0904, "num_input_tokens_seen": 11930016, "step": 13345 }, { "epoch": 3.52342615811007, "grad_norm": 0.3796919882297516, "learning_rate": 3.747335036590676e-05, "loss": 0.1239, "num_input_tokens_seen": 11934400, "step": 13350 }, { "epoch": 3.524745941665567, "grad_norm": 0.13117749989032745, "learning_rate": 3.7464841186277405e-05, "loss": 0.0901, "num_input_tokens_seen": 11938816, "step": 13355 }, { "epoch": 3.5260657252210637, "grad_norm": 0.11444339156150818, "learning_rate": 3.7456330084412896e-05, "loss": 0.072, "num_input_tokens_seen": 11943552, "step": 13360 }, { "epoch": 3.5273855087765607, "grad_norm": 0.21108338236808777, "learning_rate": 3.744781706162576e-05, "loss": 0.0444, "num_input_tokens_seen": 11948032, "step": 13365 }, { "epoch": 3.5287052923320577, "grad_norm": 0.28620192408561707, "learning_rate": 3.743930211922879e-05, "loss": 0.047, "num_input_tokens_seen": 11952768, "step": 13370 }, { "epoch": 3.530025075887554, "grad_norm": 0.20223495364189148, "learning_rate": 3.743078525853513e-05, "loss": 0.079, "num_input_tokens_seen": 11956992, "step": 13375 }, { "epoch": 3.531344859443051, "grad_norm": 0.2729811668395996, "learning_rate": 3.7422266480858154e-05, "loss": 0.0419, "num_input_tokens_seen": 11961344, "step": 13380 }, { "epoch": 3.532664642998548, "grad_norm": 0.343795508146286, "learning_rate": 3.741374578751158e-05, "loss": 0.0901, "num_input_tokens_seen": 11965888, "step": 13385 }, { "epoch": 3.533984426554045, "grad_norm": 0.2854900360107422, "learning_rate": 3.740522317980941e-05, "loss": 0.0684, "num_input_tokens_seen": 11970368, "step": 13390 }, { "epoch": 3.535304210109542, "grad_norm": 0.09296902269124985, "learning_rate": 3.739669865906593e-05, "loss": 0.0717, "num_input_tokens_seen": 11974880, "step": 13395 }, { "epoch": 3.5366239936650388, "grad_norm": 0.2642110586166382, "learning_rate": 3.738817222659573e-05, "loss": 0.0612, "num_input_tokens_seen": 11979552, "step": 13400 }, { "epoch": 3.5366239936650388, "eval_loss": 0.0715508833527565, "eval_runtime": 64.7286, "eval_samples_per_second": 104.05, "eval_steps_per_second": 26.016, "num_input_tokens_seen": 11979552, "step": 13400 }, { "epoch": 3.5379437772205358, "grad_norm": 0.2166488617658615, "learning_rate": 3.73796438837137e-05, "loss": 0.0606, "num_input_tokens_seen": 11984384, "step": 13405 }, { "epoch": 3.5392635607760328, "grad_norm": 0.10222763568162918, "learning_rate": 3.7371113631735e-05, "loss": 0.0374, "num_input_tokens_seen": 11988864, "step": 13410 }, { "epoch": 3.5405833443315298, "grad_norm": 0.40142932534217834, "learning_rate": 3.736258147197512e-05, "loss": 0.057, "num_input_tokens_seen": 11993088, "step": 13415 }, { "epoch": 3.5419031278870268, "grad_norm": 0.1158529669046402, "learning_rate": 3.735404740574981e-05, "loss": 0.0621, "num_input_tokens_seen": 11997600, "step": 13420 }, { "epoch": 3.5432229114425233, "grad_norm": 0.19183537364006042, "learning_rate": 3.7345511434375145e-05, "loss": 0.127, "num_input_tokens_seen": 12002048, "step": 13425 }, { "epoch": 3.5445426949980203, "grad_norm": 0.06798561662435532, "learning_rate": 3.733697355916748e-05, "loss": 0.0276, "num_input_tokens_seen": 12006752, "step": 13430 }, { "epoch": 3.5458624785535173, "grad_norm": 0.2004760503768921, "learning_rate": 3.732843378144345e-05, "loss": 0.0462, "num_input_tokens_seen": 12011360, "step": 13435 }, { "epoch": 3.547182262109014, "grad_norm": 0.19981902837753296, "learning_rate": 3.7319892102519995e-05, "loss": 0.0332, "num_input_tokens_seen": 12015616, "step": 13440 }, { "epoch": 3.548502045664511, "grad_norm": 0.23517592251300812, "learning_rate": 3.731134852371436e-05, "loss": 0.0474, "num_input_tokens_seen": 12019904, "step": 13445 }, { "epoch": 3.549821829220008, "grad_norm": 0.12135408073663712, "learning_rate": 3.730280304634408e-05, "loss": 0.0749, "num_input_tokens_seen": 12024096, "step": 13450 }, { "epoch": 3.551141612775505, "grad_norm": 0.31607285141944885, "learning_rate": 3.729425567172696e-05, "loss": 0.0356, "num_input_tokens_seen": 12028608, "step": 13455 }, { "epoch": 3.552461396331002, "grad_norm": 0.13649743795394897, "learning_rate": 3.728570640118111e-05, "loss": 0.0357, "num_input_tokens_seen": 12032896, "step": 13460 }, { "epoch": 3.5537811798864984, "grad_norm": 0.2105242758989334, "learning_rate": 3.727715523602494e-05, "loss": 0.1471, "num_input_tokens_seen": 12037088, "step": 13465 }, { "epoch": 3.5551009634419954, "grad_norm": 0.030522378161549568, "learning_rate": 3.726860217757715e-05, "loss": 0.0672, "num_input_tokens_seen": 12041664, "step": 13470 }, { "epoch": 3.5564207469974924, "grad_norm": 0.2716744542121887, "learning_rate": 3.726004722715673e-05, "loss": 0.0347, "num_input_tokens_seen": 12046336, "step": 13475 }, { "epoch": 3.5577405305529894, "grad_norm": 0.08288174122571945, "learning_rate": 3.725149038608296e-05, "loss": 0.0262, "num_input_tokens_seen": 12050656, "step": 13480 }, { "epoch": 3.5590603141084864, "grad_norm": 0.3584168553352356, "learning_rate": 3.7242931655675404e-05, "loss": 0.1119, "num_input_tokens_seen": 12054944, "step": 13485 }, { "epoch": 3.560380097663983, "grad_norm": 0.4059719145298004, "learning_rate": 3.7234371037253937e-05, "loss": 0.0649, "num_input_tokens_seen": 12059392, "step": 13490 }, { "epoch": 3.56169988121948, "grad_norm": 0.2573276460170746, "learning_rate": 3.7225808532138705e-05, "loss": 0.0472, "num_input_tokens_seen": 12064096, "step": 13495 }, { "epoch": 3.563019664774977, "grad_norm": 0.16586817800998688, "learning_rate": 3.721724414165016e-05, "loss": 0.0562, "num_input_tokens_seen": 12068640, "step": 13500 }, { "epoch": 3.5643394483304736, "grad_norm": 0.31450891494750977, "learning_rate": 3.720867786710904e-05, "loss": 0.0723, "num_input_tokens_seen": 12073536, "step": 13505 }, { "epoch": 3.5656592318859706, "grad_norm": 0.11625517159700394, "learning_rate": 3.7200109709836366e-05, "loss": 0.0695, "num_input_tokens_seen": 12078048, "step": 13510 }, { "epoch": 3.5669790154414676, "grad_norm": 0.04604598879814148, "learning_rate": 3.7191539671153465e-05, "loss": 0.0671, "num_input_tokens_seen": 12082688, "step": 13515 }, { "epoch": 3.5682987989969646, "grad_norm": 0.21834248304367065, "learning_rate": 3.718296775238193e-05, "loss": 0.0324, "num_input_tokens_seen": 12086848, "step": 13520 }, { "epoch": 3.5696185825524616, "grad_norm": 0.30584001541137695, "learning_rate": 3.7174393954843675e-05, "loss": 0.0735, "num_input_tokens_seen": 12091552, "step": 13525 }, { "epoch": 3.570938366107958, "grad_norm": 0.281668484210968, "learning_rate": 3.716581827986087e-05, "loss": 0.0503, "num_input_tokens_seen": 12096064, "step": 13530 }, { "epoch": 3.572258149663455, "grad_norm": 0.09635057300329208, "learning_rate": 3.7157240728756004e-05, "loss": 0.0716, "num_input_tokens_seen": 12100544, "step": 13535 }, { "epoch": 3.573577933218952, "grad_norm": 0.4518418610095978, "learning_rate": 3.714866130285184e-05, "loss": 0.0808, "num_input_tokens_seen": 12105056, "step": 13540 }, { "epoch": 3.574897716774449, "grad_norm": 0.32803046703338623, "learning_rate": 3.714008000347143e-05, "loss": 0.0523, "num_input_tokens_seen": 12109696, "step": 13545 }, { "epoch": 3.576217500329946, "grad_norm": 0.027123291045427322, "learning_rate": 3.7131496831938126e-05, "loss": 0.081, "num_input_tokens_seen": 12114528, "step": 13550 }, { "epoch": 3.5775372838854427, "grad_norm": 0.3130221664905548, "learning_rate": 3.7122911789575565e-05, "loss": 0.0508, "num_input_tokens_seen": 12118976, "step": 13555 }, { "epoch": 3.5788570674409397, "grad_norm": 0.1424226611852646, "learning_rate": 3.711432487770765e-05, "loss": 0.0847, "num_input_tokens_seen": 12123616, "step": 13560 }, { "epoch": 3.5801768509964367, "grad_norm": 0.14411087334156036, "learning_rate": 3.710573609765861e-05, "loss": 0.0933, "num_input_tokens_seen": 12128160, "step": 13565 }, { "epoch": 3.5814966345519332, "grad_norm": 0.19305528700351715, "learning_rate": 3.709714545075292e-05, "loss": 0.0748, "num_input_tokens_seen": 12132352, "step": 13570 }, { "epoch": 3.5828164181074302, "grad_norm": 0.1429882049560547, "learning_rate": 3.708855293831538e-05, "loss": 0.0621, "num_input_tokens_seen": 12136832, "step": 13575 }, { "epoch": 3.5841362016629272, "grad_norm": 0.07496651262044907, "learning_rate": 3.707995856167107e-05, "loss": 0.0584, "num_input_tokens_seen": 12141152, "step": 13580 }, { "epoch": 3.5854559852184242, "grad_norm": 0.2872995436191559, "learning_rate": 3.707136232214534e-05, "loss": 0.0457, "num_input_tokens_seen": 12145408, "step": 13585 }, { "epoch": 3.5867757687739212, "grad_norm": 0.08361658453941345, "learning_rate": 3.7062764221063844e-05, "loss": 0.1092, "num_input_tokens_seen": 12149632, "step": 13590 }, { "epoch": 3.588095552329418, "grad_norm": 0.22108393907546997, "learning_rate": 3.705416425975252e-05, "loss": 0.0573, "num_input_tokens_seen": 12153856, "step": 13595 }, { "epoch": 3.589415335884915, "grad_norm": 0.08470819890499115, "learning_rate": 3.704556243953758e-05, "loss": 0.058, "num_input_tokens_seen": 12158496, "step": 13600 }, { "epoch": 3.589415335884915, "eval_loss": 0.07047203183174133, "eval_runtime": 64.7974, "eval_samples_per_second": 103.939, "eval_steps_per_second": 25.989, "num_input_tokens_seen": 12158496, "step": 13600 }, { "epoch": 3.590735119440412, "grad_norm": 0.059494469314813614, "learning_rate": 3.7036958761745535e-05, "loss": 0.1091, "num_input_tokens_seen": 12162976, "step": 13605 }, { "epoch": 3.592054902995909, "grad_norm": 0.18056075274944305, "learning_rate": 3.702835322770318e-05, "loss": 0.064, "num_input_tokens_seen": 12167424, "step": 13610 }, { "epoch": 3.593374686551406, "grad_norm": 0.13806694746017456, "learning_rate": 3.701974583873761e-05, "loss": 0.0672, "num_input_tokens_seen": 12172064, "step": 13615 }, { "epoch": 3.5946944701069024, "grad_norm": 0.2815007269382477, "learning_rate": 3.701113659617618e-05, "loss": 0.0704, "num_input_tokens_seen": 12176608, "step": 13620 }, { "epoch": 3.5960142536623994, "grad_norm": 0.08173112571239471, "learning_rate": 3.7002525501346535e-05, "loss": 0.059, "num_input_tokens_seen": 12180608, "step": 13625 }, { "epoch": 3.5973340372178964, "grad_norm": 0.27852824330329895, "learning_rate": 3.699391255557664e-05, "loss": 0.0915, "num_input_tokens_seen": 12185088, "step": 13630 }, { "epoch": 3.598653820773393, "grad_norm": 0.13107930123806, "learning_rate": 3.69852977601947e-05, "loss": 0.0611, "num_input_tokens_seen": 12189504, "step": 13635 }, { "epoch": 3.59997360432889, "grad_norm": 0.5061914920806885, "learning_rate": 3.697668111652922e-05, "loss": 0.0545, "num_input_tokens_seen": 12193760, "step": 13640 }, { "epoch": 3.601293387884387, "grad_norm": 0.1426459699869156, "learning_rate": 3.6968062625909005e-05, "loss": 0.0478, "num_input_tokens_seen": 12198208, "step": 13645 }, { "epoch": 3.602613171439884, "grad_norm": 0.13407565653324127, "learning_rate": 3.6959442289663135e-05, "loss": 0.0975, "num_input_tokens_seen": 12202624, "step": 13650 }, { "epoch": 3.603932954995381, "grad_norm": 0.11071648448705673, "learning_rate": 3.695082010912098e-05, "loss": 0.0665, "num_input_tokens_seen": 12207200, "step": 13655 }, { "epoch": 3.6052527385508775, "grad_norm": 0.3224470913410187, "learning_rate": 3.694219608561217e-05, "loss": 0.0867, "num_input_tokens_seen": 12211776, "step": 13660 }, { "epoch": 3.6065725221063745, "grad_norm": 0.21437466144561768, "learning_rate": 3.693357022046665e-05, "loss": 0.1175, "num_input_tokens_seen": 12216352, "step": 13665 }, { "epoch": 3.6078923056618715, "grad_norm": 0.1642499417066574, "learning_rate": 3.6924942515014644e-05, "loss": 0.0941, "num_input_tokens_seen": 12220832, "step": 13670 }, { "epoch": 3.6092120892173685, "grad_norm": 0.35209134221076965, "learning_rate": 3.691631297058664e-05, "loss": 0.0372, "num_input_tokens_seen": 12225344, "step": 13675 }, { "epoch": 3.6105318727728655, "grad_norm": 0.3754257261753082, "learning_rate": 3.6907681588513424e-05, "loss": 0.1059, "num_input_tokens_seen": 12229696, "step": 13680 }, { "epoch": 3.611851656328362, "grad_norm": 0.20746377110481262, "learning_rate": 3.689904837012606e-05, "loss": 0.0633, "num_input_tokens_seen": 12234176, "step": 13685 }, { "epoch": 3.613171439883859, "grad_norm": 0.0922841876745224, "learning_rate": 3.689041331675591e-05, "loss": 0.0395, "num_input_tokens_seen": 12238688, "step": 13690 }, { "epoch": 3.614491223439356, "grad_norm": 0.2862519919872284, "learning_rate": 3.688177642973461e-05, "loss": 0.0573, "num_input_tokens_seen": 12243200, "step": 13695 }, { "epoch": 3.6158110069948526, "grad_norm": 0.24763265252113342, "learning_rate": 3.687313771039406e-05, "loss": 0.0436, "num_input_tokens_seen": 12247712, "step": 13700 }, { "epoch": 3.6171307905503496, "grad_norm": 0.3014957904815674, "learning_rate": 3.686449716006647e-05, "loss": 0.1052, "num_input_tokens_seen": 12252256, "step": 13705 }, { "epoch": 3.6184505741058466, "grad_norm": 0.11168388277292252, "learning_rate": 3.685585478008432e-05, "loss": 0.0703, "num_input_tokens_seen": 12256960, "step": 13710 }, { "epoch": 3.6197703576613436, "grad_norm": 0.06092044711112976, "learning_rate": 3.6847210571780364e-05, "loss": 0.0299, "num_input_tokens_seen": 12261376, "step": 13715 }, { "epoch": 3.6210901412168406, "grad_norm": 0.32533618807792664, "learning_rate": 3.683856453648767e-05, "loss": 0.0888, "num_input_tokens_seen": 12265952, "step": 13720 }, { "epoch": 3.622409924772337, "grad_norm": 0.3705858588218689, "learning_rate": 3.682991667553954e-05, "loss": 0.0587, "num_input_tokens_seen": 12270528, "step": 13725 }, { "epoch": 3.623729708327834, "grad_norm": 0.23150433599948883, "learning_rate": 3.6821266990269606e-05, "loss": 0.0593, "num_input_tokens_seen": 12274912, "step": 13730 }, { "epoch": 3.625049491883331, "grad_norm": 0.17243900895118713, "learning_rate": 3.681261548201174e-05, "loss": 0.0473, "num_input_tokens_seen": 12279360, "step": 13735 }, { "epoch": 3.626369275438828, "grad_norm": 0.17127864062786102, "learning_rate": 3.6803962152100125e-05, "loss": 0.0756, "num_input_tokens_seen": 12283840, "step": 13740 }, { "epoch": 3.627689058994325, "grad_norm": 0.4241301715373993, "learning_rate": 3.67953070018692e-05, "loss": 0.0656, "num_input_tokens_seen": 12288352, "step": 13745 }, { "epoch": 3.6290088425498217, "grad_norm": 0.21278217434883118, "learning_rate": 3.678665003265371e-05, "loss": 0.0906, "num_input_tokens_seen": 12292576, "step": 13750 }, { "epoch": 3.6303286261053187, "grad_norm": 0.04317848011851311, "learning_rate": 3.677799124578867e-05, "loss": 0.0414, "num_input_tokens_seen": 12296896, "step": 13755 }, { "epoch": 3.6316484096608157, "grad_norm": 0.09849812835454941, "learning_rate": 3.676933064260937e-05, "loss": 0.0427, "num_input_tokens_seen": 12301184, "step": 13760 }, { "epoch": 3.6329681932163123, "grad_norm": 0.18287646770477295, "learning_rate": 3.6760668224451365e-05, "loss": 0.1015, "num_input_tokens_seen": 12305760, "step": 13765 }, { "epoch": 3.6342879767718093, "grad_norm": 0.09911791980266571, "learning_rate": 3.675200399265054e-05, "loss": 0.0362, "num_input_tokens_seen": 12310240, "step": 13770 }, { "epoch": 3.6356077603273063, "grad_norm": 0.2794325649738312, "learning_rate": 3.6743337948543014e-05, "loss": 0.1068, "num_input_tokens_seen": 12314624, "step": 13775 }, { "epoch": 3.6369275438828033, "grad_norm": 0.08199980854988098, "learning_rate": 3.6734670093465204e-05, "loss": 0.0741, "num_input_tokens_seen": 12319072, "step": 13780 }, { "epoch": 3.6382473274383003, "grad_norm": 0.22547121345996857, "learning_rate": 3.672600042875379e-05, "loss": 0.0651, "num_input_tokens_seen": 12323392, "step": 13785 }, { "epoch": 3.639567110993797, "grad_norm": 0.549271285533905, "learning_rate": 3.671732895574575e-05, "loss": 0.1036, "num_input_tokens_seen": 12327776, "step": 13790 }, { "epoch": 3.640886894549294, "grad_norm": 0.2668686807155609, "learning_rate": 3.670865567577834e-05, "loss": 0.0337, "num_input_tokens_seen": 12332512, "step": 13795 }, { "epoch": 3.642206678104791, "grad_norm": 0.27998656034469604, "learning_rate": 3.669998059018909e-05, "loss": 0.0441, "num_input_tokens_seen": 12336928, "step": 13800 }, { "epoch": 3.642206678104791, "eval_loss": 0.07020489126443863, "eval_runtime": 64.7322, "eval_samples_per_second": 104.044, "eval_steps_per_second": 26.015, "num_input_tokens_seen": 12336928, "step": 13800 }, { "epoch": 3.643526461660288, "grad_norm": 0.20211559534072876, "learning_rate": 3.6691303700315796e-05, "loss": 0.0923, "num_input_tokens_seen": 12341632, "step": 13805 }, { "epoch": 3.644846245215785, "grad_norm": 0.46745795011520386, "learning_rate": 3.668262500749655e-05, "loss": 0.0544, "num_input_tokens_seen": 12346272, "step": 13810 }, { "epoch": 3.6461660287712814, "grad_norm": 0.08376599103212357, "learning_rate": 3.667394451306971e-05, "loss": 0.0286, "num_input_tokens_seen": 12350816, "step": 13815 }, { "epoch": 3.6474858123267784, "grad_norm": 0.27907389402389526, "learning_rate": 3.666526221837393e-05, "loss": 0.115, "num_input_tokens_seen": 12355136, "step": 13820 }, { "epoch": 3.6488055958822754, "grad_norm": 0.24345633387565613, "learning_rate": 3.665657812474812e-05, "loss": 0.0496, "num_input_tokens_seen": 12360064, "step": 13825 }, { "epoch": 3.650125379437772, "grad_norm": 0.03641634061932564, "learning_rate": 3.664789223353147e-05, "loss": 0.0144, "num_input_tokens_seen": 12364704, "step": 13830 }, { "epoch": 3.6514451629932694, "grad_norm": 0.04545697197318077, "learning_rate": 3.663920454606347e-05, "loss": 0.0386, "num_input_tokens_seen": 12369248, "step": 13835 }, { "epoch": 3.652764946548766, "grad_norm": 0.09697481989860535, "learning_rate": 3.6630515063683856e-05, "loss": 0.0593, "num_input_tokens_seen": 12373632, "step": 13840 }, { "epoch": 3.654084730104263, "grad_norm": 0.07207610458135605, "learning_rate": 3.662182378773267e-05, "loss": 0.0402, "num_input_tokens_seen": 12377952, "step": 13845 }, { "epoch": 3.65540451365976, "grad_norm": 0.24276262521743774, "learning_rate": 3.66131307195502e-05, "loss": 0.0484, "num_input_tokens_seen": 12382688, "step": 13850 }, { "epoch": 3.6567242972152565, "grad_norm": 0.28794342279434204, "learning_rate": 3.6604435860477034e-05, "loss": 0.0315, "num_input_tokens_seen": 12386848, "step": 13855 }, { "epoch": 3.6580440807707535, "grad_norm": 0.07739073783159256, "learning_rate": 3.6595739211854025e-05, "loss": 0.0351, "num_input_tokens_seen": 12391264, "step": 13860 }, { "epoch": 3.6593638643262505, "grad_norm": 0.38529080152511597, "learning_rate": 3.658704077502231e-05, "loss": 0.0808, "num_input_tokens_seen": 12395616, "step": 13865 }, { "epoch": 3.6606836478817475, "grad_norm": 0.5888718366622925, "learning_rate": 3.65783405513233e-05, "loss": 0.1052, "num_input_tokens_seen": 12400224, "step": 13870 }, { "epoch": 3.6620034314372445, "grad_norm": 0.5400272011756897, "learning_rate": 3.656963854209867e-05, "loss": 0.0385, "num_input_tokens_seen": 12404704, "step": 13875 }, { "epoch": 3.663323214992741, "grad_norm": 0.10038716346025467, "learning_rate": 3.656093474869038e-05, "loss": 0.0368, "num_input_tokens_seen": 12409280, "step": 13880 }, { "epoch": 3.664642998548238, "grad_norm": 0.34208759665489197, "learning_rate": 3.655222917244068e-05, "loss": 0.0557, "num_input_tokens_seen": 12413664, "step": 13885 }, { "epoch": 3.665962782103735, "grad_norm": 0.3931296765804291, "learning_rate": 3.6543521814692054e-05, "loss": 0.1318, "num_input_tokens_seen": 12418336, "step": 13890 }, { "epoch": 3.667282565659232, "grad_norm": 0.3020230829715729, "learning_rate": 3.653481267678731e-05, "loss": 0.0843, "num_input_tokens_seen": 12422720, "step": 13895 }, { "epoch": 3.668602349214729, "grad_norm": 0.1967891901731491, "learning_rate": 3.652610176006949e-05, "loss": 0.0459, "num_input_tokens_seen": 12427104, "step": 13900 }, { "epoch": 3.6699221327702256, "grad_norm": 0.08594027906656265, "learning_rate": 3.6517389065881925e-05, "loss": 0.0345, "num_input_tokens_seen": 12431616, "step": 13905 }, { "epoch": 3.6712419163257226, "grad_norm": 0.2151394784450531, "learning_rate": 3.650867459556824e-05, "loss": 0.0371, "num_input_tokens_seen": 12436064, "step": 13910 }, { "epoch": 3.6725616998812196, "grad_norm": 0.05408387631177902, "learning_rate": 3.64999583504723e-05, "loss": 0.0194, "num_input_tokens_seen": 12440320, "step": 13915 }, { "epoch": 3.673881483436716, "grad_norm": 0.038490179926157, "learning_rate": 3.649124033193827e-05, "loss": 0.0301, "num_input_tokens_seen": 12444960, "step": 13920 }, { "epoch": 3.675201266992213, "grad_norm": 0.24906596541404724, "learning_rate": 3.648252054131057e-05, "loss": 0.065, "num_input_tokens_seen": 12449216, "step": 13925 }, { "epoch": 3.67652105054771, "grad_norm": 0.3120221197605133, "learning_rate": 3.647379897993391e-05, "loss": 0.0464, "num_input_tokens_seen": 12454176, "step": 13930 }, { "epoch": 3.677840834103207, "grad_norm": 0.08492430299520493, "learning_rate": 3.646507564915325e-05, "loss": 0.0423, "num_input_tokens_seen": 12458592, "step": 13935 }, { "epoch": 3.679160617658704, "grad_norm": 0.28929901123046875, "learning_rate": 3.645635055031385e-05, "loss": 0.0381, "num_input_tokens_seen": 12463296, "step": 13940 }, { "epoch": 3.6804804012142007, "grad_norm": 0.05457812175154686, "learning_rate": 3.6447623684761224e-05, "loss": 0.1091, "num_input_tokens_seen": 12467744, "step": 13945 }, { "epoch": 3.6818001847696977, "grad_norm": 0.09262394160032272, "learning_rate": 3.643889505384117e-05, "loss": 0.0505, "num_input_tokens_seen": 12471936, "step": 13950 }, { "epoch": 3.6831199683251947, "grad_norm": 0.31971409916877747, "learning_rate": 3.6430164658899744e-05, "loss": 0.0517, "num_input_tokens_seen": 12476064, "step": 13955 }, { "epoch": 3.6844397518806917, "grad_norm": 0.34457284212112427, "learning_rate": 3.642143250128329e-05, "loss": 0.1019, "num_input_tokens_seen": 12480512, "step": 13960 }, { "epoch": 3.6857595354361887, "grad_norm": 0.2229415327310562, "learning_rate": 3.641269858233841e-05, "loss": 0.047, "num_input_tokens_seen": 12485088, "step": 13965 }, { "epoch": 3.6870793189916853, "grad_norm": 0.22079087793827057, "learning_rate": 3.640396290341199e-05, "loss": 0.0823, "num_input_tokens_seen": 12489696, "step": 13970 }, { "epoch": 3.6883991025471823, "grad_norm": 0.2719988226890564, "learning_rate": 3.639522546585118e-05, "loss": 0.1033, "num_input_tokens_seen": 12494080, "step": 13975 }, { "epoch": 3.6897188861026793, "grad_norm": 0.24038533866405487, "learning_rate": 3.6386486271003404e-05, "loss": 0.0843, "num_input_tokens_seen": 12498688, "step": 13980 }, { "epoch": 3.691038669658176, "grad_norm": 0.19189786911010742, "learning_rate": 3.6377745320216346e-05, "loss": 0.1292, "num_input_tokens_seen": 12502976, "step": 13985 }, { "epoch": 3.692358453213673, "grad_norm": 0.29798004031181335, "learning_rate": 3.636900261483798e-05, "loss": 0.0718, "num_input_tokens_seen": 12507648, "step": 13990 }, { "epoch": 3.69367823676917, "grad_norm": 0.24173560738563538, "learning_rate": 3.636025815621654e-05, "loss": 0.0375, "num_input_tokens_seen": 12512352, "step": 13995 }, { "epoch": 3.694998020324667, "grad_norm": 0.23445792496204376, "learning_rate": 3.635151194570054e-05, "loss": 0.1011, "num_input_tokens_seen": 12516800, "step": 14000 }, { "epoch": 3.694998020324667, "eval_loss": 0.07012901455163956, "eval_runtime": 64.7621, "eval_samples_per_second": 103.996, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 12516800, "step": 14000 }, { "epoch": 3.696317803880164, "grad_norm": 0.3708011507987976, "learning_rate": 3.634276398463873e-05, "loss": 0.075, "num_input_tokens_seen": 12521632, "step": 14005 }, { "epoch": 3.6976375874356604, "grad_norm": 0.06707891821861267, "learning_rate": 3.633401427438018e-05, "loss": 0.0582, "num_input_tokens_seen": 12525920, "step": 14010 }, { "epoch": 3.6989573709911574, "grad_norm": 0.5043233036994934, "learning_rate": 3.63252628162742e-05, "loss": 0.1065, "num_input_tokens_seen": 12530368, "step": 14015 }, { "epoch": 3.7002771545466544, "grad_norm": 0.15913519263267517, "learning_rate": 3.6316509611670364e-05, "loss": 0.0416, "num_input_tokens_seen": 12534848, "step": 14020 }, { "epoch": 3.7015969381021514, "grad_norm": 0.05630084499716759, "learning_rate": 3.630775466191854e-05, "loss": 0.0342, "num_input_tokens_seen": 12539296, "step": 14025 }, { "epoch": 3.7029167216576484, "grad_norm": 0.0715685710310936, "learning_rate": 3.629899796836884e-05, "loss": 0.0393, "num_input_tokens_seen": 12544000, "step": 14030 }, { "epoch": 3.704236505213145, "grad_norm": 0.10906235873699188, "learning_rate": 3.6290239532371666e-05, "loss": 0.0863, "num_input_tokens_seen": 12548608, "step": 14035 }, { "epoch": 3.705556288768642, "grad_norm": 0.2301749587059021, "learning_rate": 3.628147935527767e-05, "loss": 0.1001, "num_input_tokens_seen": 12553536, "step": 14040 }, { "epoch": 3.706876072324139, "grad_norm": 0.03506452217698097, "learning_rate": 3.627271743843779e-05, "loss": 0.0465, "num_input_tokens_seen": 12557920, "step": 14045 }, { "epoch": 3.7081958558796355, "grad_norm": 0.28077152371406555, "learning_rate": 3.626395378320321e-05, "loss": 0.0571, "num_input_tokens_seen": 12562560, "step": 14050 }, { "epoch": 3.7095156394351325, "grad_norm": 0.047922611236572266, "learning_rate": 3.625518839092541e-05, "loss": 0.0316, "num_input_tokens_seen": 12566816, "step": 14055 }, { "epoch": 3.7108354229906295, "grad_norm": 0.19901736080646515, "learning_rate": 3.624642126295612e-05, "loss": 0.0854, "num_input_tokens_seen": 12571456, "step": 14060 }, { "epoch": 3.7121552065461265, "grad_norm": 0.12431244552135468, "learning_rate": 3.6237652400647345e-05, "loss": 0.0737, "num_input_tokens_seen": 12575680, "step": 14065 }, { "epoch": 3.7134749901016235, "grad_norm": 0.05594848096370697, "learning_rate": 3.622888180535134e-05, "loss": 0.0282, "num_input_tokens_seen": 12580160, "step": 14070 }, { "epoch": 3.71479477365712, "grad_norm": 0.1305118203163147, "learning_rate": 3.6220109478420655e-05, "loss": 0.0272, "num_input_tokens_seen": 12584608, "step": 14075 }, { "epoch": 3.716114557212617, "grad_norm": 0.25873133540153503, "learning_rate": 3.6211335421208084e-05, "loss": 0.0373, "num_input_tokens_seen": 12589120, "step": 14080 }, { "epoch": 3.717434340768114, "grad_norm": 0.443569540977478, "learning_rate": 3.62025596350667e-05, "loss": 0.1054, "num_input_tokens_seen": 12593440, "step": 14085 }, { "epoch": 3.718754124323611, "grad_norm": 0.11574344336986542, "learning_rate": 3.619378212134984e-05, "loss": 0.05, "num_input_tokens_seen": 12597856, "step": 14090 }, { "epoch": 3.720073907879108, "grad_norm": 0.11435899883508682, "learning_rate": 3.618500288141111e-05, "loss": 0.0885, "num_input_tokens_seen": 12602400, "step": 14095 }, { "epoch": 3.7213936914346046, "grad_norm": 0.4830787777900696, "learning_rate": 3.617622191660438e-05, "loss": 0.1237, "num_input_tokens_seen": 12606912, "step": 14100 }, { "epoch": 3.7227134749901016, "grad_norm": 0.25740718841552734, "learning_rate": 3.616743922828377e-05, "loss": 0.0826, "num_input_tokens_seen": 12611072, "step": 14105 }, { "epoch": 3.7240332585455986, "grad_norm": 0.05857228860259056, "learning_rate": 3.615865481780371e-05, "loss": 0.0468, "num_input_tokens_seen": 12615488, "step": 14110 }, { "epoch": 3.725353042101095, "grad_norm": 0.1826494336128235, "learning_rate": 3.614986868651883e-05, "loss": 0.0291, "num_input_tokens_seen": 12619936, "step": 14115 }, { "epoch": 3.726672825656592, "grad_norm": 0.2047070562839508, "learning_rate": 3.614108083578409e-05, "loss": 0.0974, "num_input_tokens_seen": 12624512, "step": 14120 }, { "epoch": 3.727992609212089, "grad_norm": 0.2482721507549286, "learning_rate": 3.613229126695467e-05, "loss": 0.0378, "num_input_tokens_seen": 12628768, "step": 14125 }, { "epoch": 3.729312392767586, "grad_norm": 0.12914778292179108, "learning_rate": 3.612349998138605e-05, "loss": 0.045, "num_input_tokens_seen": 12633344, "step": 14130 }, { "epoch": 3.730632176323083, "grad_norm": 0.11181078106164932, "learning_rate": 3.6114706980433946e-05, "loss": 0.1314, "num_input_tokens_seen": 12638048, "step": 14135 }, { "epoch": 3.7319519598785797, "grad_norm": 0.2526971101760864, "learning_rate": 3.610591226545435e-05, "loss": 0.0546, "num_input_tokens_seen": 12642464, "step": 14140 }, { "epoch": 3.7332717434340768, "grad_norm": 0.06576159596443176, "learning_rate": 3.6097115837803505e-05, "loss": 0.044, "num_input_tokens_seen": 12646880, "step": 14145 }, { "epoch": 3.7345915269895738, "grad_norm": 0.3647424578666687, "learning_rate": 3.608831769883795e-05, "loss": 0.076, "num_input_tokens_seen": 12651072, "step": 14150 }, { "epoch": 3.7359113105450708, "grad_norm": 0.35942384600639343, "learning_rate": 3.607951784991446e-05, "loss": 0.0822, "num_input_tokens_seen": 12655776, "step": 14155 }, { "epoch": 3.7372310941005678, "grad_norm": 0.1862003356218338, "learning_rate": 3.6070716292390085e-05, "loss": 0.0487, "num_input_tokens_seen": 12660000, "step": 14160 }, { "epoch": 3.7385508776560643, "grad_norm": 0.5439689755439758, "learning_rate": 3.606191302762213e-05, "loss": 0.1412, "num_input_tokens_seen": 12664864, "step": 14165 }, { "epoch": 3.7398706612115613, "grad_norm": 0.2243514358997345, "learning_rate": 3.605310805696818e-05, "loss": 0.0679, "num_input_tokens_seen": 12669376, "step": 14170 }, { "epoch": 3.7411904447670583, "grad_norm": 0.24459627270698547, "learning_rate": 3.6044301381786067e-05, "loss": 0.1037, "num_input_tokens_seen": 12673600, "step": 14175 }, { "epoch": 3.742510228322555, "grad_norm": 0.2941947281360626, "learning_rate": 3.6035493003433883e-05, "loss": 0.0771, "num_input_tokens_seen": 12677760, "step": 14180 }, { "epoch": 3.743830011878052, "grad_norm": 0.250480055809021, "learning_rate": 3.6026682923269994e-05, "loss": 0.1254, "num_input_tokens_seen": 12682240, "step": 14185 }, { "epoch": 3.745149795433549, "grad_norm": 0.2735949158668518, "learning_rate": 3.6017871142653034e-05, "loss": 0.0543, "num_input_tokens_seen": 12686784, "step": 14190 }, { "epoch": 3.746469578989046, "grad_norm": 0.21321110427379608, "learning_rate": 3.600905766294189e-05, "loss": 0.0499, "num_input_tokens_seen": 12691360, "step": 14195 }, { "epoch": 3.747789362544543, "grad_norm": 0.04726054519414902, "learning_rate": 3.60002424854957e-05, "loss": 0.0558, "num_input_tokens_seen": 12695648, "step": 14200 }, { "epoch": 3.747789362544543, "eval_loss": 0.07010029256343842, "eval_runtime": 65.0734, "eval_samples_per_second": 103.499, "eval_steps_per_second": 25.878, "num_input_tokens_seen": 12695648, "step": 14200 }, { "epoch": 3.7491091461000394, "grad_norm": 0.19516563415527344, "learning_rate": 3.5991425611673876e-05, "loss": 0.0656, "num_input_tokens_seen": 12700288, "step": 14205 }, { "epoch": 3.7504289296555364, "grad_norm": 0.40891969203948975, "learning_rate": 3.5982607042836105e-05, "loss": 0.057, "num_input_tokens_seen": 12704544, "step": 14210 }, { "epoch": 3.7517487132110334, "grad_norm": 0.7194133400917053, "learning_rate": 3.597378678034231e-05, "loss": 0.1373, "num_input_tokens_seen": 12709184, "step": 14215 }, { "epoch": 3.7530684967665304, "grad_norm": 0.24140232801437378, "learning_rate": 3.596496482555269e-05, "loss": 0.0535, "num_input_tokens_seen": 12713920, "step": 14220 }, { "epoch": 3.7543882803220274, "grad_norm": 0.1821589171886444, "learning_rate": 3.595614117982769e-05, "loss": 0.0601, "num_input_tokens_seen": 12718336, "step": 14225 }, { "epoch": 3.755708063877524, "grad_norm": 0.09190386533737183, "learning_rate": 3.594731584452805e-05, "loss": 0.0758, "num_input_tokens_seen": 12722848, "step": 14230 }, { "epoch": 3.757027847433021, "grad_norm": 0.5025379061698914, "learning_rate": 3.593848882101472e-05, "loss": 0.0941, "num_input_tokens_seen": 12727200, "step": 14235 }, { "epoch": 3.758347630988518, "grad_norm": 0.1722545623779297, "learning_rate": 3.592966011064896e-05, "loss": 0.0301, "num_input_tokens_seen": 12731872, "step": 14240 }, { "epoch": 3.7596674145440145, "grad_norm": 0.2402086704969406, "learning_rate": 3.592082971479226e-05, "loss": 0.0596, "num_input_tokens_seen": 12736000, "step": 14245 }, { "epoch": 3.7609871980995115, "grad_norm": 0.17915412783622742, "learning_rate": 3.5911997634806385e-05, "loss": 0.0585, "num_input_tokens_seen": 12740512, "step": 14250 }, { "epoch": 3.7623069816550085, "grad_norm": 0.18790321052074432, "learning_rate": 3.5903163872053336e-05, "loss": 0.0923, "num_input_tokens_seen": 12745056, "step": 14255 }, { "epoch": 3.7636267652105055, "grad_norm": 0.06330185383558273, "learning_rate": 3.58943284278954e-05, "loss": 0.0484, "num_input_tokens_seen": 12749376, "step": 14260 }, { "epoch": 3.7649465487660025, "grad_norm": 0.072992704808712, "learning_rate": 3.588549130369512e-05, "loss": 0.0595, "num_input_tokens_seen": 12753888, "step": 14265 }, { "epoch": 3.766266332321499, "grad_norm": 0.10988523811101913, "learning_rate": 3.5876652500815274e-05, "loss": 0.0579, "num_input_tokens_seen": 12758432, "step": 14270 }, { "epoch": 3.767586115876996, "grad_norm": 0.20684972405433655, "learning_rate": 3.586781202061894e-05, "loss": 0.0329, "num_input_tokens_seen": 12763200, "step": 14275 }, { "epoch": 3.768905899432493, "grad_norm": 0.21513719856739044, "learning_rate": 3.585896986446942e-05, "loss": 0.0587, "num_input_tokens_seen": 12767680, "step": 14280 }, { "epoch": 3.77022568298799, "grad_norm": 0.15895283222198486, "learning_rate": 3.585012603373028e-05, "loss": 0.0443, "num_input_tokens_seen": 12771872, "step": 14285 }, { "epoch": 3.771545466543487, "grad_norm": 0.22698146104812622, "learning_rate": 3.584128052976535e-05, "loss": 0.0686, "num_input_tokens_seen": 12776224, "step": 14290 }, { "epoch": 3.7728652500989837, "grad_norm": 0.34538212418556213, "learning_rate": 3.5832433353938724e-05, "loss": 0.0744, "num_input_tokens_seen": 12780640, "step": 14295 }, { "epoch": 3.7741850336544807, "grad_norm": 0.11162048578262329, "learning_rate": 3.5823584507614746e-05, "loss": 0.0603, "num_input_tokens_seen": 12785344, "step": 14300 }, { "epoch": 3.7755048172099777, "grad_norm": 0.2139614224433899, "learning_rate": 3.581473399215802e-05, "loss": 0.0748, "num_input_tokens_seen": 12789600, "step": 14305 }, { "epoch": 3.776824600765474, "grad_norm": 0.08781816065311432, "learning_rate": 3.580588180893341e-05, "loss": 0.0374, "num_input_tokens_seen": 12794272, "step": 14310 }, { "epoch": 3.778144384320971, "grad_norm": 0.2113962024450302, "learning_rate": 3.579702795930602e-05, "loss": 0.0496, "num_input_tokens_seen": 12798720, "step": 14315 }, { "epoch": 3.779464167876468, "grad_norm": 0.2857036590576172, "learning_rate": 3.578817244464125e-05, "loss": 0.0755, "num_input_tokens_seen": 12803392, "step": 14320 }, { "epoch": 3.780783951431965, "grad_norm": 0.33682477474212646, "learning_rate": 3.577931526630471e-05, "loss": 0.0343, "num_input_tokens_seen": 12807968, "step": 14325 }, { "epoch": 3.782103734987462, "grad_norm": 0.07584571093320847, "learning_rate": 3.577045642566229e-05, "loss": 0.0263, "num_input_tokens_seen": 12812224, "step": 14330 }, { "epoch": 3.7834235185429588, "grad_norm": 0.0941348597407341, "learning_rate": 3.576159592408014e-05, "loss": 0.0816, "num_input_tokens_seen": 12816576, "step": 14335 }, { "epoch": 3.7847433020984558, "grad_norm": 0.07598311454057693, "learning_rate": 3.575273376292466e-05, "loss": 0.0457, "num_input_tokens_seen": 12820672, "step": 14340 }, { "epoch": 3.7860630856539528, "grad_norm": 0.5496885180473328, "learning_rate": 3.574386994356251e-05, "loss": 0.0839, "num_input_tokens_seen": 12824960, "step": 14345 }, { "epoch": 3.7873828692094498, "grad_norm": 0.42316362261772156, "learning_rate": 3.573500446736059e-05, "loss": 0.0627, "num_input_tokens_seen": 12829504, "step": 14350 }, { "epoch": 3.7887026527649468, "grad_norm": 0.3387746214866638, "learning_rate": 3.5726137335686094e-05, "loss": 0.0852, "num_input_tokens_seen": 12833888, "step": 14355 }, { "epoch": 3.7900224363204433, "grad_norm": 0.2717480957508087, "learning_rate": 3.571726854990642e-05, "loss": 0.0704, "num_input_tokens_seen": 12838624, "step": 14360 }, { "epoch": 3.7913422198759403, "grad_norm": 0.19734783470630646, "learning_rate": 3.570839811138925e-05, "loss": 0.0934, "num_input_tokens_seen": 12843008, "step": 14365 }, { "epoch": 3.7926620034314373, "grad_norm": 0.22105637192726135, "learning_rate": 3.569952602150252e-05, "loss": 0.0725, "num_input_tokens_seen": 12847264, "step": 14370 }, { "epoch": 3.793981786986934, "grad_norm": 0.13598424196243286, "learning_rate": 3.569065228161442e-05, "loss": 0.0974, "num_input_tokens_seen": 12852000, "step": 14375 }, { "epoch": 3.795301570542431, "grad_norm": 0.23259256780147552, "learning_rate": 3.5681776893093395e-05, "loss": 0.0491, "num_input_tokens_seen": 12856608, "step": 14380 }, { "epoch": 3.796621354097928, "grad_norm": 0.13966037333011627, "learning_rate": 3.5672899857308134e-05, "loss": 0.0603, "num_input_tokens_seen": 12861024, "step": 14385 }, { "epoch": 3.797941137653425, "grad_norm": 0.13671354949474335, "learning_rate": 3.566402117562759e-05, "loss": 0.0387, "num_input_tokens_seen": 12865600, "step": 14390 }, { "epoch": 3.799260921208922, "grad_norm": 0.1927642673254013, "learning_rate": 3.565514084942097e-05, "loss": 0.0592, "num_input_tokens_seen": 12870080, "step": 14395 }, { "epoch": 3.8005807047644184, "grad_norm": 0.3473242521286011, "learning_rate": 3.564625888005773e-05, "loss": 0.1227, "num_input_tokens_seen": 12874656, "step": 14400 }, { "epoch": 3.8005807047644184, "eval_loss": 0.07020100206136703, "eval_runtime": 64.7445, "eval_samples_per_second": 104.024, "eval_steps_per_second": 26.01, "num_input_tokens_seen": 12874656, "step": 14400 }, { "epoch": 3.8019004883199154, "grad_norm": 0.0446460098028183, "learning_rate": 3.563737526890759e-05, "loss": 0.0314, "num_input_tokens_seen": 12879392, "step": 14405 }, { "epoch": 3.8032202718754125, "grad_norm": 0.21624614298343658, "learning_rate": 3.562849001734049e-05, "loss": 0.0686, "num_input_tokens_seen": 12883744, "step": 14410 }, { "epoch": 3.8045400554309095, "grad_norm": 0.12420845031738281, "learning_rate": 3.561960312672667e-05, "loss": 0.073, "num_input_tokens_seen": 12887936, "step": 14415 }, { "epoch": 3.8058598389864065, "grad_norm": 0.223580002784729, "learning_rate": 3.5610714598436596e-05, "loss": 0.0431, "num_input_tokens_seen": 12892256, "step": 14420 }, { "epoch": 3.807179622541903, "grad_norm": 0.11566367745399475, "learning_rate": 3.5601824433840986e-05, "loss": 0.0336, "num_input_tokens_seen": 12897152, "step": 14425 }, { "epoch": 3.8084994060974, "grad_norm": 0.06418372690677643, "learning_rate": 3.559293263431082e-05, "loss": 0.0289, "num_input_tokens_seen": 12901504, "step": 14430 }, { "epoch": 3.809819189652897, "grad_norm": 0.04414607957005501, "learning_rate": 3.558403920121732e-05, "loss": 0.0638, "num_input_tokens_seen": 12905920, "step": 14435 }, { "epoch": 3.8111389732083936, "grad_norm": 0.12241875380277634, "learning_rate": 3.557514413593197e-05, "loss": 0.0489, "num_input_tokens_seen": 12910432, "step": 14440 }, { "epoch": 3.8124587567638906, "grad_norm": 0.06376060098409653, "learning_rate": 3.55662474398265e-05, "loss": 0.094, "num_input_tokens_seen": 12915040, "step": 14445 }, { "epoch": 3.8137785403193876, "grad_norm": 0.1842741072177887, "learning_rate": 3.555734911427288e-05, "loss": 0.0481, "num_input_tokens_seen": 12919328, "step": 14450 }, { "epoch": 3.8150983238748846, "grad_norm": 0.06489312648773193, "learning_rate": 3.5548449160643363e-05, "loss": 0.0994, "num_input_tokens_seen": 12923936, "step": 14455 }, { "epoch": 3.8164181074303816, "grad_norm": 0.1813816875219345, "learning_rate": 3.553954758031043e-05, "loss": 0.0556, "num_input_tokens_seen": 12928672, "step": 14460 }, { "epoch": 3.817737890985878, "grad_norm": 0.3094000220298767, "learning_rate": 3.5530644374646815e-05, "loss": 0.0531, "num_input_tokens_seen": 12932960, "step": 14465 }, { "epoch": 3.819057674541375, "grad_norm": 0.3110158145427704, "learning_rate": 3.552173954502549e-05, "loss": 0.0958, "num_input_tokens_seen": 12937568, "step": 14470 }, { "epoch": 3.820377458096872, "grad_norm": 0.08041540533304214, "learning_rate": 3.55128330928197e-05, "loss": 0.0202, "num_input_tokens_seen": 12941856, "step": 14475 }, { "epoch": 3.821697241652369, "grad_norm": 0.22771687805652618, "learning_rate": 3.550392501940294e-05, "loss": 0.0848, "num_input_tokens_seen": 12946560, "step": 14480 }, { "epoch": 3.823017025207866, "grad_norm": 0.18116185069084167, "learning_rate": 3.5495015326148945e-05, "loss": 0.0467, "num_input_tokens_seen": 12951072, "step": 14485 }, { "epoch": 3.8243368087633627, "grad_norm": 0.1734246015548706, "learning_rate": 3.548610401443169e-05, "loss": 0.0898, "num_input_tokens_seen": 12955744, "step": 14490 }, { "epoch": 3.8256565923188597, "grad_norm": 0.21937955915927887, "learning_rate": 3.547719108562543e-05, "loss": 0.0562, "num_input_tokens_seen": 12960448, "step": 14495 }, { "epoch": 3.8269763758743567, "grad_norm": 0.2469603419303894, "learning_rate": 3.546827654110464e-05, "loss": 0.0712, "num_input_tokens_seen": 12965184, "step": 14500 }, { "epoch": 3.8282961594298532, "grad_norm": 0.33697938919067383, "learning_rate": 3.545936038224405e-05, "loss": 0.1247, "num_input_tokens_seen": 12969568, "step": 14505 }, { "epoch": 3.8296159429853502, "grad_norm": 0.23930227756500244, "learning_rate": 3.545044261041864e-05, "loss": 0.0643, "num_input_tokens_seen": 12974112, "step": 14510 }, { "epoch": 3.8309357265408472, "grad_norm": 0.05051136389374733, "learning_rate": 3.5441523227003657e-05, "loss": 0.03, "num_input_tokens_seen": 12978432, "step": 14515 }, { "epoch": 3.8322555100963442, "grad_norm": 0.15185415744781494, "learning_rate": 3.543260223337459e-05, "loss": 0.055, "num_input_tokens_seen": 12982816, "step": 14520 }, { "epoch": 3.8335752936518412, "grad_norm": 0.0468449629843235, "learning_rate": 3.542367963090714e-05, "loss": 0.0627, "num_input_tokens_seen": 12987488, "step": 14525 }, { "epoch": 3.834895077207338, "grad_norm": 0.1370970457792282, "learning_rate": 3.5414755420977295e-05, "loss": 0.0447, "num_input_tokens_seen": 12992064, "step": 14530 }, { "epoch": 3.836214860762835, "grad_norm": 0.2558385729789734, "learning_rate": 3.54058296049613e-05, "loss": 0.0926, "num_input_tokens_seen": 12996128, "step": 14535 }, { "epoch": 3.837534644318332, "grad_norm": 0.1799997091293335, "learning_rate": 3.53969021842356e-05, "loss": 0.0341, "num_input_tokens_seen": 13000352, "step": 14540 }, { "epoch": 3.838854427873829, "grad_norm": 0.12324324250221252, "learning_rate": 3.5387973160176926e-05, "loss": 0.022, "num_input_tokens_seen": 13005056, "step": 14545 }, { "epoch": 3.840174211429326, "grad_norm": 0.28142648935317993, "learning_rate": 3.537904253416224e-05, "loss": 0.1321, "num_input_tokens_seen": 13009472, "step": 14550 }, { "epoch": 3.8414939949848224, "grad_norm": 0.31049129366874695, "learning_rate": 3.537011030756878e-05, "loss": 0.0656, "num_input_tokens_seen": 13013728, "step": 14555 }, { "epoch": 3.8428137785403194, "grad_norm": 0.04380623623728752, "learning_rate": 3.536117648177399e-05, "loss": 0.0518, "num_input_tokens_seen": 13018144, "step": 14560 }, { "epoch": 3.8441335620958164, "grad_norm": 0.3408360779285431, "learning_rate": 3.535224105815558e-05, "loss": 0.0733, "num_input_tokens_seen": 13022528, "step": 14565 }, { "epoch": 3.845453345651313, "grad_norm": 0.30420738458633423, "learning_rate": 3.5343304038091494e-05, "loss": 0.0785, "num_input_tokens_seen": 13026816, "step": 14570 }, { "epoch": 3.8467731292068104, "grad_norm": 0.0713987797498703, "learning_rate": 3.5334365422959955e-05, "loss": 0.0727, "num_input_tokens_seen": 13031264, "step": 14575 }, { "epoch": 3.848092912762307, "grad_norm": 0.232406347990036, "learning_rate": 3.5325425214139396e-05, "loss": 0.0634, "num_input_tokens_seen": 13035488, "step": 14580 }, { "epoch": 3.849412696317804, "grad_norm": 0.2916276156902313, "learning_rate": 3.531648341300851e-05, "loss": 0.079, "num_input_tokens_seen": 13040032, "step": 14585 }, { "epoch": 3.850732479873301, "grad_norm": 0.14552929997444153, "learning_rate": 3.530754002094623e-05, "loss": 0.0474, "num_input_tokens_seen": 13044448, "step": 14590 }, { "epoch": 3.8520522634287975, "grad_norm": 0.14198054373264313, "learning_rate": 3.529859503933175e-05, "loss": 0.074, "num_input_tokens_seen": 13048736, "step": 14595 }, { "epoch": 3.8533720469842945, "grad_norm": 0.3221355378627777, "learning_rate": 3.52896484695445e-05, "loss": 0.0549, "num_input_tokens_seen": 13053184, "step": 14600 }, { "epoch": 3.8533720469842945, "eval_loss": 0.06999914348125458, "eval_runtime": 64.7764, "eval_samples_per_second": 103.973, "eval_steps_per_second": 25.997, "num_input_tokens_seen": 13053184, "step": 14600 }, { "epoch": 3.8546918305397915, "grad_norm": 0.04182983189821243, "learning_rate": 3.528070031296414e-05, "loss": 0.0349, "num_input_tokens_seen": 13058240, "step": 14605 }, { "epoch": 3.8560116140952885, "grad_norm": 0.48399922251701355, "learning_rate": 3.5271750570970605e-05, "loss": 0.0868, "num_input_tokens_seen": 13063008, "step": 14610 }, { "epoch": 3.8573313976507855, "grad_norm": 0.24412967264652252, "learning_rate": 3.526279924494405e-05, "loss": 0.0478, "num_input_tokens_seen": 13067296, "step": 14615 }, { "epoch": 3.858651181206282, "grad_norm": 0.21276412904262543, "learning_rate": 3.5253846336264874e-05, "loss": 0.1046, "num_input_tokens_seen": 13071840, "step": 14620 }, { "epoch": 3.859970964761779, "grad_norm": 0.08351841568946838, "learning_rate": 3.5244891846313736e-05, "loss": 0.0711, "num_input_tokens_seen": 13076192, "step": 14625 }, { "epoch": 3.861290748317276, "grad_norm": 0.22587162256240845, "learning_rate": 3.5235935776471527e-05, "loss": 0.0951, "num_input_tokens_seen": 13080768, "step": 14630 }, { "epoch": 3.8626105318727726, "grad_norm": 0.10556302964687347, "learning_rate": 3.522697812811939e-05, "loss": 0.0694, "num_input_tokens_seen": 13085440, "step": 14635 }, { "epoch": 3.86393031542827, "grad_norm": 0.1070423275232315, "learning_rate": 3.521801890263871e-05, "loss": 0.0327, "num_input_tokens_seen": 13089952, "step": 14640 }, { "epoch": 3.8652500989837666, "grad_norm": 0.13928936421871185, "learning_rate": 3.5209058101411114e-05, "loss": 0.02, "num_input_tokens_seen": 13094336, "step": 14645 }, { "epoch": 3.8665698825392636, "grad_norm": 0.07713605463504791, "learning_rate": 3.520009572581845e-05, "loss": 0.0231, "num_input_tokens_seen": 13099136, "step": 14650 }, { "epoch": 3.8678896660947606, "grad_norm": 0.044993143528699875, "learning_rate": 3.519113177724285e-05, "loss": 0.036, "num_input_tokens_seen": 13103520, "step": 14655 }, { "epoch": 3.869209449650257, "grad_norm": 0.2854921519756317, "learning_rate": 3.5182166257066656e-05, "loss": 0.1164, "num_input_tokens_seen": 13107648, "step": 14660 }, { "epoch": 3.870529233205754, "grad_norm": 0.11618240177631378, "learning_rate": 3.517319916667247e-05, "loss": 0.0751, "num_input_tokens_seen": 13112256, "step": 14665 }, { "epoch": 3.871849016761251, "grad_norm": 0.07345984876155853, "learning_rate": 3.516423050744313e-05, "loss": 0.0355, "num_input_tokens_seen": 13117088, "step": 14670 }, { "epoch": 3.873168800316748, "grad_norm": 0.13963858783245087, "learning_rate": 3.5155260280761704e-05, "loss": 0.0322, "num_input_tokens_seen": 13121472, "step": 14675 }, { "epoch": 3.874488583872245, "grad_norm": 0.0901065468788147, "learning_rate": 3.514628848801154e-05, "loss": 0.0344, "num_input_tokens_seen": 13125856, "step": 14680 }, { "epoch": 3.8758083674277417, "grad_norm": 0.14650169014930725, "learning_rate": 3.5137315130576174e-05, "loss": 0.0785, "num_input_tokens_seen": 13130240, "step": 14685 }, { "epoch": 3.8771281509832387, "grad_norm": 0.4479154646396637, "learning_rate": 3.512834020983942e-05, "loss": 0.0671, "num_input_tokens_seen": 13134592, "step": 14690 }, { "epoch": 3.8784479345387357, "grad_norm": 0.2864721417427063, "learning_rate": 3.5119363727185334e-05, "loss": 0.0675, "num_input_tokens_seen": 13139040, "step": 14695 }, { "epoch": 3.8797677180942327, "grad_norm": 0.045504871755838394, "learning_rate": 3.511038568399819e-05, "loss": 0.0539, "num_input_tokens_seen": 13143360, "step": 14700 }, { "epoch": 3.8810875016497297, "grad_norm": 0.17049123346805573, "learning_rate": 3.510140608166251e-05, "loss": 0.0207, "num_input_tokens_seen": 13147552, "step": 14705 }, { "epoch": 3.8824072852052263, "grad_norm": 0.059922803193330765, "learning_rate": 3.509242492156308e-05, "loss": 0.0119, "num_input_tokens_seen": 13152096, "step": 14710 }, { "epoch": 3.8837270687607233, "grad_norm": 0.11769622564315796, "learning_rate": 3.5083442205084896e-05, "loss": 0.0838, "num_input_tokens_seen": 13156480, "step": 14715 }, { "epoch": 3.8850468523162203, "grad_norm": 0.23994484543800354, "learning_rate": 3.507445793361321e-05, "loss": 0.0915, "num_input_tokens_seen": 13160864, "step": 14720 }, { "epoch": 3.886366635871717, "grad_norm": 0.32888466119766235, "learning_rate": 3.5065472108533505e-05, "loss": 0.0803, "num_input_tokens_seen": 13165152, "step": 14725 }, { "epoch": 3.887686419427214, "grad_norm": 0.4879530370235443, "learning_rate": 3.5056484731231504e-05, "loss": 0.1403, "num_input_tokens_seen": 13169664, "step": 14730 }, { "epoch": 3.889006202982711, "grad_norm": 0.13911044597625732, "learning_rate": 3.504749580309319e-05, "loss": 0.0659, "num_input_tokens_seen": 13174144, "step": 14735 }, { "epoch": 3.890325986538208, "grad_norm": 0.26912108063697815, "learning_rate": 3.5038505325504753e-05, "loss": 0.0456, "num_input_tokens_seen": 13178720, "step": 14740 }, { "epoch": 3.891645770093705, "grad_norm": 0.05984117090702057, "learning_rate": 3.502951329985264e-05, "loss": 0.0696, "num_input_tokens_seen": 13183072, "step": 14745 }, { "epoch": 3.8929655536492014, "grad_norm": 0.39965179562568665, "learning_rate": 3.502051972752354e-05, "loss": 0.0582, "num_input_tokens_seen": 13187424, "step": 14750 }, { "epoch": 3.8942853372046984, "grad_norm": 0.03279373422265053, "learning_rate": 3.5011524609904374e-05, "loss": 0.0361, "num_input_tokens_seen": 13191936, "step": 14755 }, { "epoch": 3.8956051207601954, "grad_norm": 0.10416101664304733, "learning_rate": 3.50025279483823e-05, "loss": 0.0444, "num_input_tokens_seen": 13196160, "step": 14760 }, { "epoch": 3.8969249043156924, "grad_norm": 0.29175421595573425, "learning_rate": 3.499352974434472e-05, "loss": 0.0881, "num_input_tokens_seen": 13200608, "step": 14765 }, { "epoch": 3.8982446878711894, "grad_norm": 0.15377305448055267, "learning_rate": 3.498452999917926e-05, "loss": 0.1065, "num_input_tokens_seen": 13205248, "step": 14770 }, { "epoch": 3.899564471426686, "grad_norm": 0.793651819229126, "learning_rate": 3.4975528714273795e-05, "loss": 0.1181, "num_input_tokens_seen": 13209984, "step": 14775 }, { "epoch": 3.900884254982183, "grad_norm": 0.06444971263408661, "learning_rate": 3.4966525891016454e-05, "loss": 0.0661, "num_input_tokens_seen": 13214592, "step": 14780 }, { "epoch": 3.90220403853768, "grad_norm": 0.1338070034980774, "learning_rate": 3.495752153079557e-05, "loss": 0.0722, "num_input_tokens_seen": 13218912, "step": 14785 }, { "epoch": 3.9035238220931765, "grad_norm": 0.037054501473903656, "learning_rate": 3.494851563499974e-05, "loss": 0.0388, "num_input_tokens_seen": 13223616, "step": 14790 }, { "epoch": 3.9048436056486735, "grad_norm": 0.219995379447937, "learning_rate": 3.493950820501777e-05, "loss": 0.056, "num_input_tokens_seen": 13228064, "step": 14795 }, { "epoch": 3.9061633892041705, "grad_norm": 0.10364536195993423, "learning_rate": 3.493049924223872e-05, "loss": 0.035, "num_input_tokens_seen": 13232576, "step": 14800 }, { "epoch": 3.9061633892041705, "eval_loss": 0.07008786499500275, "eval_runtime": 64.8015, "eval_samples_per_second": 103.933, "eval_steps_per_second": 25.987, "num_input_tokens_seen": 13232576, "step": 14800 }, { "epoch": 3.9074831727596675, "grad_norm": 0.10320975631475449, "learning_rate": 3.49214887480519e-05, "loss": 0.0571, "num_input_tokens_seen": 13236992, "step": 14805 }, { "epoch": 3.9088029563151645, "grad_norm": 0.13766777515411377, "learning_rate": 3.4912476723846834e-05, "loss": 0.0586, "num_input_tokens_seen": 13241216, "step": 14810 }, { "epoch": 3.910122739870661, "grad_norm": 0.2176133692264557, "learning_rate": 3.490346317101328e-05, "loss": 0.0468, "num_input_tokens_seen": 13245280, "step": 14815 }, { "epoch": 3.911442523426158, "grad_norm": 0.14945746958255768, "learning_rate": 3.4894448090941266e-05, "loss": 0.0405, "num_input_tokens_seen": 13249632, "step": 14820 }, { "epoch": 3.912762306981655, "grad_norm": 0.11111060529947281, "learning_rate": 3.488543148502101e-05, "loss": 0.0498, "num_input_tokens_seen": 13253952, "step": 14825 }, { "epoch": 3.914082090537152, "grad_norm": 0.21643146872520447, "learning_rate": 3.487641335464299e-05, "loss": 0.0677, "num_input_tokens_seen": 13258464, "step": 14830 }, { "epoch": 3.915401874092649, "grad_norm": 0.2436448633670807, "learning_rate": 3.4867393701197914e-05, "loss": 0.0915, "num_input_tokens_seen": 13262880, "step": 14835 }, { "epoch": 3.9167216576481456, "grad_norm": 0.5105699300765991, "learning_rate": 3.485837252607673e-05, "loss": 0.0514, "num_input_tokens_seen": 13267520, "step": 14840 }, { "epoch": 3.9180414412036426, "grad_norm": 0.1857278198003769, "learning_rate": 3.4849349830670615e-05, "loss": 0.0863, "num_input_tokens_seen": 13272256, "step": 14845 }, { "epoch": 3.9193612247591396, "grad_norm": 0.022915441542863846, "learning_rate": 3.4840325616370976e-05, "loss": 0.0463, "num_input_tokens_seen": 13276320, "step": 14850 }, { "epoch": 3.920681008314636, "grad_norm": 0.032792024314403534, "learning_rate": 3.483129988456947e-05, "loss": 0.0412, "num_input_tokens_seen": 13280704, "step": 14855 }, { "epoch": 3.922000791870133, "grad_norm": 0.14063799381256104, "learning_rate": 3.482227263665797e-05, "loss": 0.0214, "num_input_tokens_seen": 13284992, "step": 14860 }, { "epoch": 3.92332057542563, "grad_norm": 0.12707214057445526, "learning_rate": 3.48132438740286e-05, "loss": 0.0515, "num_input_tokens_seen": 13289664, "step": 14865 }, { "epoch": 3.924640358981127, "grad_norm": 0.5708481073379517, "learning_rate": 3.48042135980737e-05, "loss": 0.0401, "num_input_tokens_seen": 13294304, "step": 14870 }, { "epoch": 3.925960142536624, "grad_norm": 0.05529450997710228, "learning_rate": 3.479518181018586e-05, "loss": 0.0388, "num_input_tokens_seen": 13298848, "step": 14875 }, { "epoch": 3.9272799260921207, "grad_norm": 0.266022264957428, "learning_rate": 3.4786148511757886e-05, "loss": 0.0572, "num_input_tokens_seen": 13303200, "step": 14880 }, { "epoch": 3.9285997096476177, "grad_norm": 0.19059279561042786, "learning_rate": 3.477711370418284e-05, "loss": 0.1014, "num_input_tokens_seen": 13307520, "step": 14885 }, { "epoch": 3.9299194932031147, "grad_norm": 0.17849835753440857, "learning_rate": 3.476807738885399e-05, "loss": 0.044, "num_input_tokens_seen": 13312096, "step": 14890 }, { "epoch": 3.9312392767586117, "grad_norm": 0.22950884699821472, "learning_rate": 3.475903956716485e-05, "loss": 0.0254, "num_input_tokens_seen": 13316608, "step": 14895 }, { "epoch": 3.9325590603141087, "grad_norm": 0.5360502004623413, "learning_rate": 3.475000024050917e-05, "loss": 0.0754, "num_input_tokens_seen": 13320800, "step": 14900 }, { "epoch": 3.9338788438696053, "grad_norm": 0.2870543897151947, "learning_rate": 3.4740959410280926e-05, "loss": 0.0599, "num_input_tokens_seen": 13325152, "step": 14905 }, { "epoch": 3.9351986274251023, "grad_norm": 0.171759694814682, "learning_rate": 3.4731917077874324e-05, "loss": 0.0485, "num_input_tokens_seen": 13329696, "step": 14910 }, { "epoch": 3.9365184109805993, "grad_norm": 0.3141838014125824, "learning_rate": 3.4722873244683816e-05, "loss": 0.0677, "num_input_tokens_seen": 13334336, "step": 14915 }, { "epoch": 3.937838194536096, "grad_norm": 0.22930733859539032, "learning_rate": 3.4713827912104065e-05, "loss": 0.0619, "num_input_tokens_seen": 13338560, "step": 14920 }, { "epoch": 3.939157978091593, "grad_norm": 0.030559547245502472, "learning_rate": 3.470478108152998e-05, "loss": 0.0358, "num_input_tokens_seen": 13343456, "step": 14925 }, { "epoch": 3.94047776164709, "grad_norm": 0.0794212594628334, "learning_rate": 3.4695732754356695e-05, "loss": 0.1, "num_input_tokens_seen": 13348064, "step": 14930 }, { "epoch": 3.941797545202587, "grad_norm": 0.24997788667678833, "learning_rate": 3.4686682931979576e-05, "loss": 0.0802, "num_input_tokens_seen": 13352352, "step": 14935 }, { "epoch": 3.943117328758084, "grad_norm": 0.04639951139688492, "learning_rate": 3.467763161579422e-05, "loss": 0.0941, "num_input_tokens_seen": 13356928, "step": 14940 }, { "epoch": 3.9444371123135804, "grad_norm": 0.113305002450943, "learning_rate": 3.466857880719645e-05, "loss": 0.0388, "num_input_tokens_seen": 13361184, "step": 14945 }, { "epoch": 3.9457568958690774, "grad_norm": 0.44650429487228394, "learning_rate": 3.465952450758233e-05, "loss": 0.0799, "num_input_tokens_seen": 13365696, "step": 14950 }, { "epoch": 3.9470766794245744, "grad_norm": 0.12692926824092865, "learning_rate": 3.4650468718348126e-05, "loss": 0.0297, "num_input_tokens_seen": 13370080, "step": 14955 }, { "epoch": 3.9483964629800714, "grad_norm": 0.15140819549560547, "learning_rate": 3.464141144089038e-05, "loss": 0.0907, "num_input_tokens_seen": 13374560, "step": 14960 }, { "epoch": 3.9497162465355684, "grad_norm": 0.03895285353064537, "learning_rate": 3.463235267660583e-05, "loss": 0.0405, "num_input_tokens_seen": 13378880, "step": 14965 }, { "epoch": 3.951036030091065, "grad_norm": 0.1654711216688156, "learning_rate": 3.462329242689145e-05, "loss": 0.0952, "num_input_tokens_seen": 13383232, "step": 14970 }, { "epoch": 3.952355813646562, "grad_norm": 0.19105668365955353, "learning_rate": 3.461423069314444e-05, "loss": 0.0726, "num_input_tokens_seen": 13387872, "step": 14975 }, { "epoch": 3.953675597202059, "grad_norm": 0.05818028748035431, "learning_rate": 3.460516747676224e-05, "loss": 0.0157, "num_input_tokens_seen": 13392320, "step": 14980 }, { "epoch": 3.9549953807575555, "grad_norm": 0.14191877841949463, "learning_rate": 3.459610277914251e-05, "loss": 0.0507, "num_input_tokens_seen": 13396448, "step": 14985 }, { "epoch": 3.9563151643130525, "grad_norm": 0.31594961881637573, "learning_rate": 3.458703660168314e-05, "loss": 0.0626, "num_input_tokens_seen": 13401056, "step": 14990 }, { "epoch": 3.9576349478685495, "grad_norm": 0.3360626995563507, "learning_rate": 3.457796894578224e-05, "loss": 0.1145, "num_input_tokens_seen": 13405632, "step": 14995 }, { "epoch": 3.9589547314240465, "grad_norm": 0.13125957548618317, "learning_rate": 3.456889981283817e-05, "loss": 0.038, "num_input_tokens_seen": 13410176, "step": 15000 }, { "epoch": 3.9589547314240465, "eval_loss": 0.0699625238776207, "eval_runtime": 64.7956, "eval_samples_per_second": 103.942, "eval_steps_per_second": 25.989, "num_input_tokens_seen": 13410176, "step": 15000 }, { "epoch": 3.9602745149795435, "grad_norm": 0.278329074382782, "learning_rate": 3.45598292042495e-05, "loss": 0.0367, "num_input_tokens_seen": 13414656, "step": 15005 }, { "epoch": 3.96159429853504, "grad_norm": 0.21147602796554565, "learning_rate": 3.4550757121415035e-05, "loss": 0.0537, "num_input_tokens_seen": 13418976, "step": 15010 }, { "epoch": 3.962914082090537, "grad_norm": 0.2862236797809601, "learning_rate": 3.454168356573378e-05, "loss": 0.0874, "num_input_tokens_seen": 13423136, "step": 15015 }, { "epoch": 3.964233865646034, "grad_norm": 0.11077149957418442, "learning_rate": 3.453260853860503e-05, "loss": 0.046, "num_input_tokens_seen": 13427552, "step": 15020 }, { "epoch": 3.965553649201531, "grad_norm": 0.04835366830229759, "learning_rate": 3.452353204142824e-05, "loss": 0.0609, "num_input_tokens_seen": 13432384, "step": 15025 }, { "epoch": 3.966873432757028, "grad_norm": 0.07445228844881058, "learning_rate": 3.4514454075603136e-05, "loss": 0.0309, "num_input_tokens_seen": 13436864, "step": 15030 }, { "epoch": 3.9681932163125246, "grad_norm": 0.22780664265155792, "learning_rate": 3.450537464252964e-05, "loss": 0.0621, "num_input_tokens_seen": 13441216, "step": 15035 }, { "epoch": 3.9695129998680216, "grad_norm": 0.2804659605026245, "learning_rate": 3.4496293743607925e-05, "loss": 0.0779, "num_input_tokens_seen": 13445792, "step": 15040 }, { "epoch": 3.9708327834235186, "grad_norm": 0.1564619541168213, "learning_rate": 3.448721138023838e-05, "loss": 0.0783, "num_input_tokens_seen": 13450016, "step": 15045 }, { "epoch": 3.972152566979015, "grad_norm": 0.44536006450653076, "learning_rate": 3.447812755382162e-05, "loss": 0.1, "num_input_tokens_seen": 13454176, "step": 15050 }, { "epoch": 3.973472350534512, "grad_norm": 0.3413216769695282, "learning_rate": 3.446904226575847e-05, "loss": 0.0712, "num_input_tokens_seen": 13458880, "step": 15055 }, { "epoch": 3.974792134090009, "grad_norm": 0.33574607968330383, "learning_rate": 3.445995551745002e-05, "loss": 0.0574, "num_input_tokens_seen": 13463232, "step": 15060 }, { "epoch": 3.976111917645506, "grad_norm": 0.07973798364400864, "learning_rate": 3.445086731029753e-05, "loss": 0.0583, "num_input_tokens_seen": 13467904, "step": 15065 }, { "epoch": 3.977431701201003, "grad_norm": 0.21384595334529877, "learning_rate": 3.444177764570255e-05, "loss": 0.0643, "num_input_tokens_seen": 13472512, "step": 15070 }, { "epoch": 3.9787514847564998, "grad_norm": 0.37786194682121277, "learning_rate": 3.44326865250668e-05, "loss": 0.056, "num_input_tokens_seen": 13476960, "step": 15075 }, { "epoch": 3.9800712683119968, "grad_norm": 0.3112943768501282, "learning_rate": 3.442359394979225e-05, "loss": 0.1247, "num_input_tokens_seen": 13481312, "step": 15080 }, { "epoch": 3.9813910518674938, "grad_norm": 0.1531369686126709, "learning_rate": 3.441449992128108e-05, "loss": 0.0426, "num_input_tokens_seen": 13485984, "step": 15085 }, { "epoch": 3.9827108354229908, "grad_norm": 0.12034045159816742, "learning_rate": 3.440540444093573e-05, "loss": 0.0727, "num_input_tokens_seen": 13490496, "step": 15090 }, { "epoch": 3.9840306189784878, "grad_norm": 0.2524610161781311, "learning_rate": 3.43963075101588e-05, "loss": 0.0758, "num_input_tokens_seen": 13494720, "step": 15095 }, { "epoch": 3.9853504025339843, "grad_norm": 0.2634822726249695, "learning_rate": 3.438720913035318e-05, "loss": 0.1152, "num_input_tokens_seen": 13499136, "step": 15100 }, { "epoch": 3.9866701860894813, "grad_norm": 0.12332644313573837, "learning_rate": 3.437810930292195e-05, "loss": 0.0803, "num_input_tokens_seen": 13503584, "step": 15105 }, { "epoch": 3.9879899696449783, "grad_norm": 0.06713055074214935, "learning_rate": 3.43690080292684e-05, "loss": 0.0683, "num_input_tokens_seen": 13508160, "step": 15110 }, { "epoch": 3.989309753200475, "grad_norm": 0.03950841724872589, "learning_rate": 3.435990531079608e-05, "loss": 0.0333, "num_input_tokens_seen": 13512960, "step": 15115 }, { "epoch": 3.990629536755972, "grad_norm": 0.21860715746879578, "learning_rate": 3.435080114890874e-05, "loss": 0.0396, "num_input_tokens_seen": 13516992, "step": 15120 }, { "epoch": 3.991949320311469, "grad_norm": 0.11835494637489319, "learning_rate": 3.434169554501035e-05, "loss": 0.0467, "num_input_tokens_seen": 13521472, "step": 15125 }, { "epoch": 3.993269103866966, "grad_norm": 0.11930891126394272, "learning_rate": 3.433258850050511e-05, "loss": 0.0633, "num_input_tokens_seen": 13525920, "step": 15130 }, { "epoch": 3.994588887422463, "grad_norm": 0.16619794070720673, "learning_rate": 3.4323480016797446e-05, "loss": 0.0943, "num_input_tokens_seen": 13530976, "step": 15135 }, { "epoch": 3.9959086709779594, "grad_norm": 0.0904647633433342, "learning_rate": 3.4314370095291995e-05, "loss": 0.067, "num_input_tokens_seen": 13535424, "step": 15140 }, { "epoch": 3.9972284545334564, "grad_norm": 0.35754939913749695, "learning_rate": 3.430525873739363e-05, "loss": 0.0778, "num_input_tokens_seen": 13539808, "step": 15145 }, { "epoch": 3.9985482380889534, "grad_norm": 0.1245490312576294, "learning_rate": 3.429614594450743e-05, "loss": 0.0545, "num_input_tokens_seen": 13544416, "step": 15150 }, { "epoch": 3.9998680216444504, "grad_norm": 0.2929128408432007, "learning_rate": 3.428703171803869e-05, "loss": 0.0529, "num_input_tokens_seen": 13548832, "step": 15155 }, { "epoch": 4.001055826844397, "grad_norm": 0.24679122865200043, "learning_rate": 3.4277916059392964e-05, "loss": 0.0389, "num_input_tokens_seen": 13552688, "step": 15160 }, { "epoch": 4.002375610399894, "grad_norm": 0.44888702034950256, "learning_rate": 3.426879896997598e-05, "loss": 0.0918, "num_input_tokens_seen": 13557168, "step": 15165 }, { "epoch": 4.003695393955391, "grad_norm": 0.18724696338176727, "learning_rate": 3.425968045119372e-05, "loss": 0.0863, "num_input_tokens_seen": 13561424, "step": 15170 }, { "epoch": 4.005015177510888, "grad_norm": 0.18873801827430725, "learning_rate": 3.425056050445237e-05, "loss": 0.0354, "num_input_tokens_seen": 13565968, "step": 15175 }, { "epoch": 4.006334961066385, "grad_norm": 0.10356069356203079, "learning_rate": 3.4241439131158336e-05, "loss": 0.0811, "num_input_tokens_seen": 13570352, "step": 15180 }, { "epoch": 4.007654744621882, "grad_norm": 0.2512379288673401, "learning_rate": 3.423231633271825e-05, "loss": 0.0808, "num_input_tokens_seen": 13574736, "step": 15185 }, { "epoch": 4.0089745281773785, "grad_norm": 0.14024873077869415, "learning_rate": 3.4223192110538985e-05, "loss": 0.035, "num_input_tokens_seen": 13579376, "step": 15190 }, { "epoch": 4.010294311732876, "grad_norm": 0.0481589175760746, "learning_rate": 3.4214066466027575e-05, "loss": 0.0666, "num_input_tokens_seen": 13583856, "step": 15195 }, { "epoch": 4.0116140952883725, "grad_norm": 0.26079243421554565, "learning_rate": 3.4204939400591325e-05, "loss": 0.0795, "num_input_tokens_seen": 13588176, "step": 15200 }, { "epoch": 4.0116140952883725, "eval_loss": 0.06972558051347733, "eval_runtime": 64.7979, "eval_samples_per_second": 103.939, "eval_steps_per_second": 25.989, "num_input_tokens_seen": 13588176, "step": 15200 }, { "epoch": 4.01293387884387, "grad_norm": 0.370058536529541, "learning_rate": 3.419581091563775e-05, "loss": 0.0485, "num_input_tokens_seen": 13592560, "step": 15205 }, { "epoch": 4.0142536623993665, "grad_norm": 0.11809702962636948, "learning_rate": 3.418668101257456e-05, "loss": 0.0639, "num_input_tokens_seen": 13596816, "step": 15210 }, { "epoch": 4.015573445954863, "grad_norm": 0.09288965910673141, "learning_rate": 3.417754969280971e-05, "loss": 0.0388, "num_input_tokens_seen": 13601424, "step": 15215 }, { "epoch": 4.0168932295103605, "grad_norm": 0.32196182012557983, "learning_rate": 3.416841695775137e-05, "loss": 0.042, "num_input_tokens_seen": 13606000, "step": 15220 }, { "epoch": 4.018213013065857, "grad_norm": 0.2525179386138916, "learning_rate": 3.415928280880792e-05, "loss": 0.0435, "num_input_tokens_seen": 13610640, "step": 15225 }, { "epoch": 4.019532796621354, "grad_norm": 0.18314425647258759, "learning_rate": 3.4150147247387965e-05, "loss": 0.0336, "num_input_tokens_seen": 13615440, "step": 15230 }, { "epoch": 4.020852580176851, "grad_norm": 0.48517295718193054, "learning_rate": 3.4141010274900306e-05, "loss": 0.0727, "num_input_tokens_seen": 13619920, "step": 15235 }, { "epoch": 4.022172363732348, "grad_norm": 0.444707989692688, "learning_rate": 3.413187189275399e-05, "loss": 0.0837, "num_input_tokens_seen": 13624400, "step": 15240 }, { "epoch": 4.023492147287845, "grad_norm": 0.3515114188194275, "learning_rate": 3.4122732102358265e-05, "loss": 0.0782, "num_input_tokens_seen": 13629072, "step": 15245 }, { "epoch": 4.024811930843342, "grad_norm": 0.39779072999954224, "learning_rate": 3.411359090512261e-05, "loss": 0.0559, "num_input_tokens_seen": 13633616, "step": 15250 }, { "epoch": 4.026131714398838, "grad_norm": 0.3236246407032013, "learning_rate": 3.410444830245672e-05, "loss": 0.0925, "num_input_tokens_seen": 13638224, "step": 15255 }, { "epoch": 4.027451497954336, "grad_norm": 0.020479105412960052, "learning_rate": 3.409530429577048e-05, "loss": 0.0609, "num_input_tokens_seen": 13642768, "step": 15260 }, { "epoch": 4.028771281509832, "grad_norm": 0.25240564346313477, "learning_rate": 3.408615888647402e-05, "loss": 0.0888, "num_input_tokens_seen": 13647056, "step": 15265 }, { "epoch": 4.03009106506533, "grad_norm": 0.0888763815164566, "learning_rate": 3.4077012075977675e-05, "loss": 0.0583, "num_input_tokens_seen": 13651408, "step": 15270 }, { "epoch": 4.031410848620826, "grad_norm": 0.19299018383026123, "learning_rate": 3.4067863865692e-05, "loss": 0.1245, "num_input_tokens_seen": 13655824, "step": 15275 }, { "epoch": 4.032730632176323, "grad_norm": 0.06421855837106705, "learning_rate": 3.4058714257027755e-05, "loss": 0.048, "num_input_tokens_seen": 13660304, "step": 15280 }, { "epoch": 4.03405041573182, "grad_norm": 0.115789033472538, "learning_rate": 3.404956325139594e-05, "loss": 0.0869, "num_input_tokens_seen": 13664624, "step": 15285 }, { "epoch": 4.035370199287317, "grad_norm": 0.3456934690475464, "learning_rate": 3.404041085020775e-05, "loss": 0.0832, "num_input_tokens_seen": 13669104, "step": 15290 }, { "epoch": 4.036689982842814, "grad_norm": 0.3957151472568512, "learning_rate": 3.403125705487459e-05, "loss": 0.0518, "num_input_tokens_seen": 13673520, "step": 15295 }, { "epoch": 4.038009766398311, "grad_norm": 0.1952756643295288, "learning_rate": 3.402210186680811e-05, "loss": 0.0846, "num_input_tokens_seen": 13677744, "step": 15300 }, { "epoch": 4.039329549953807, "grad_norm": 0.1507146805524826, "learning_rate": 3.4012945287420137e-05, "loss": 0.0471, "num_input_tokens_seen": 13681872, "step": 15305 }, { "epoch": 4.040649333509305, "grad_norm": 0.09474373608827591, "learning_rate": 3.400378731812274e-05, "loss": 0.0564, "num_input_tokens_seen": 13686384, "step": 15310 }, { "epoch": 4.041969117064801, "grad_norm": 0.18544119596481323, "learning_rate": 3.399462796032817e-05, "loss": 0.0883, "num_input_tokens_seen": 13690704, "step": 15315 }, { "epoch": 4.043288900620298, "grad_norm": 0.2785351276397705, "learning_rate": 3.3985467215448954e-05, "loss": 0.035, "num_input_tokens_seen": 13694992, "step": 15320 }, { "epoch": 4.044608684175795, "grad_norm": 0.4678272604942322, "learning_rate": 3.3976305084897776e-05, "loss": 0.123, "num_input_tokens_seen": 13699056, "step": 15325 }, { "epoch": 4.045928467731292, "grad_norm": 0.2782832384109497, "learning_rate": 3.3967141570087544e-05, "loss": 0.067, "num_input_tokens_seen": 13703824, "step": 15330 }, { "epoch": 4.047248251286789, "grad_norm": 0.27288955450057983, "learning_rate": 3.39579766724314e-05, "loss": 0.0751, "num_input_tokens_seen": 13708304, "step": 15335 }, { "epoch": 4.048568034842286, "grad_norm": 0.22154158353805542, "learning_rate": 3.3948810393342677e-05, "loss": 0.0451, "num_input_tokens_seen": 13712720, "step": 15340 }, { "epoch": 4.0498878183977824, "grad_norm": 0.16392801702022552, "learning_rate": 3.3939642734234936e-05, "loss": 0.0733, "num_input_tokens_seen": 13717040, "step": 15345 }, { "epoch": 4.05120760195328, "grad_norm": 0.2467198520898819, "learning_rate": 3.393047369652194e-05, "loss": 0.0634, "num_input_tokens_seen": 13721584, "step": 15350 }, { "epoch": 4.0525273855087764, "grad_norm": 0.07231663167476654, "learning_rate": 3.3921303281617664e-05, "loss": 0.0403, "num_input_tokens_seen": 13726128, "step": 15355 }, { "epoch": 4.053847169064274, "grad_norm": 0.07332615554332733, "learning_rate": 3.391213149093632e-05, "loss": 0.1195, "num_input_tokens_seen": 13730640, "step": 15360 }, { "epoch": 4.0551669526197704, "grad_norm": 0.16019225120544434, "learning_rate": 3.3902958325892303e-05, "loss": 0.0468, "num_input_tokens_seen": 13734896, "step": 15365 }, { "epoch": 4.056486736175267, "grad_norm": 0.2087184190750122, "learning_rate": 3.389378378790023e-05, "loss": 0.0674, "num_input_tokens_seen": 13739504, "step": 15370 }, { "epoch": 4.0578065197307644, "grad_norm": 0.35674214363098145, "learning_rate": 3.388460787837493e-05, "loss": 0.0774, "num_input_tokens_seen": 13744112, "step": 15375 }, { "epoch": 4.059126303286261, "grad_norm": 0.23047539591789246, "learning_rate": 3.387543059873145e-05, "loss": 0.0636, "num_input_tokens_seen": 13748528, "step": 15380 }, { "epoch": 4.060446086841758, "grad_norm": 0.17903640866279602, "learning_rate": 3.386625195038503e-05, "loss": 0.0548, "num_input_tokens_seen": 13752944, "step": 15385 }, { "epoch": 4.061765870397255, "grad_norm": 0.17652344703674316, "learning_rate": 3.3857071934751136e-05, "loss": 0.1132, "num_input_tokens_seen": 13757392, "step": 15390 }, { "epoch": 4.063085653952752, "grad_norm": 0.1955309361219406, "learning_rate": 3.384789055324544e-05, "loss": 0.0561, "num_input_tokens_seen": 13761904, "step": 15395 }, { "epoch": 4.064405437508249, "grad_norm": 0.13614407181739807, "learning_rate": 3.3838707807283843e-05, "loss": 0.0595, "num_input_tokens_seen": 13766160, "step": 15400 }, { "epoch": 4.064405437508249, "eval_loss": 0.06941487640142441, "eval_runtime": 64.7667, "eval_samples_per_second": 103.989, "eval_steps_per_second": 26.001, "num_input_tokens_seen": 13766160, "step": 15400 }, { "epoch": 4.065725221063746, "grad_norm": 0.052259884774684906, "learning_rate": 3.382952369828243e-05, "loss": 0.0809, "num_input_tokens_seen": 13770672, "step": 15405 }, { "epoch": 4.067045004619242, "grad_norm": 0.34939345717430115, "learning_rate": 3.38203382276575e-05, "loss": 0.063, "num_input_tokens_seen": 13775088, "step": 15410 }, { "epoch": 4.06836478817474, "grad_norm": 0.09233438968658447, "learning_rate": 3.381115139682557e-05, "loss": 0.0764, "num_input_tokens_seen": 13779376, "step": 15415 }, { "epoch": 4.069684571730236, "grad_norm": 0.046208497136831284, "learning_rate": 3.3801963207203366e-05, "loss": 0.0566, "num_input_tokens_seen": 13783984, "step": 15420 }, { "epoch": 4.071004355285734, "grad_norm": 0.23162056505680084, "learning_rate": 3.379277366020782e-05, "loss": 0.0943, "num_input_tokens_seen": 13788432, "step": 15425 }, { "epoch": 4.07232413884123, "grad_norm": 0.31733816862106323, "learning_rate": 3.3783582757256085e-05, "loss": 0.0732, "num_input_tokens_seen": 13793104, "step": 15430 }, { "epoch": 4.073643922396727, "grad_norm": 0.3538193702697754, "learning_rate": 3.3774390499765504e-05, "loss": 0.0888, "num_input_tokens_seen": 13797200, "step": 15435 }, { "epoch": 4.074963705952224, "grad_norm": 0.19041797518730164, "learning_rate": 3.376519688915364e-05, "loss": 0.0457, "num_input_tokens_seen": 13801680, "step": 15440 }, { "epoch": 4.076283489507721, "grad_norm": 0.23964625597000122, "learning_rate": 3.3756001926838273e-05, "loss": 0.0694, "num_input_tokens_seen": 13806544, "step": 15445 }, { "epoch": 4.077603273063217, "grad_norm": 0.19917507469654083, "learning_rate": 3.374680561423737e-05, "loss": 0.0615, "num_input_tokens_seen": 13810896, "step": 15450 }, { "epoch": 4.078923056618715, "grad_norm": 0.10634046047925949, "learning_rate": 3.373760795276912e-05, "loss": 0.0317, "num_input_tokens_seen": 13815536, "step": 15455 }, { "epoch": 4.080242840174211, "grad_norm": 0.1096583753824234, "learning_rate": 3.372840894385192e-05, "loss": 0.0434, "num_input_tokens_seen": 13820336, "step": 15460 }, { "epoch": 4.081562623729709, "grad_norm": 0.09835968166589737, "learning_rate": 3.3719208588904375e-05, "loss": 0.0685, "num_input_tokens_seen": 13825104, "step": 15465 }, { "epoch": 4.082882407285205, "grad_norm": 0.5322155952453613, "learning_rate": 3.371000688934529e-05, "loss": 0.0704, "num_input_tokens_seen": 13829200, "step": 15470 }, { "epoch": 4.084202190840702, "grad_norm": 0.2679766118526459, "learning_rate": 3.370080384659369e-05, "loss": 0.0582, "num_input_tokens_seen": 13833584, "step": 15475 }, { "epoch": 4.085521974396199, "grad_norm": 0.1547887772321701, "learning_rate": 3.36915994620688e-05, "loss": 0.0248, "num_input_tokens_seen": 13837936, "step": 15480 }, { "epoch": 4.086841757951696, "grad_norm": 0.088457390666008, "learning_rate": 3.3682393737190035e-05, "loss": 0.036, "num_input_tokens_seen": 13842320, "step": 15485 }, { "epoch": 4.088161541507193, "grad_norm": 0.21799533069133759, "learning_rate": 3.3673186673377054e-05, "loss": 0.0814, "num_input_tokens_seen": 13847120, "step": 15490 }, { "epoch": 4.08948132506269, "grad_norm": 0.18188966810703278, "learning_rate": 3.366397827204969e-05, "loss": 0.0318, "num_input_tokens_seen": 13851696, "step": 15495 }, { "epoch": 4.090801108618186, "grad_norm": 0.08275601267814636, "learning_rate": 3.3654768534628e-05, "loss": 0.0247, "num_input_tokens_seen": 13856240, "step": 15500 }, { "epoch": 4.092120892173684, "grad_norm": 0.27553048729896545, "learning_rate": 3.3645557462532245e-05, "loss": 0.0311, "num_input_tokens_seen": 13860880, "step": 15505 }, { "epoch": 4.09344067572918, "grad_norm": 0.31351494789123535, "learning_rate": 3.363634505718288e-05, "loss": 0.0955, "num_input_tokens_seen": 13865264, "step": 15510 }, { "epoch": 4.094760459284677, "grad_norm": 0.3622507154941559, "learning_rate": 3.362713132000057e-05, "loss": 0.0695, "num_input_tokens_seen": 13869552, "step": 15515 }, { "epoch": 4.096080242840174, "grad_norm": 0.2192094773054123, "learning_rate": 3.36179162524062e-05, "loss": 0.0993, "num_input_tokens_seen": 13873904, "step": 15520 }, { "epoch": 4.097400026395671, "grad_norm": 0.2093241959810257, "learning_rate": 3.3608699855820846e-05, "loss": 0.0723, "num_input_tokens_seen": 13878352, "step": 15525 }, { "epoch": 4.098719809951168, "grad_norm": 0.1291126012802124, "learning_rate": 3.359948213166578e-05, "loss": 0.0619, "num_input_tokens_seen": 13882608, "step": 15530 }, { "epoch": 4.100039593506665, "grad_norm": 0.38856059312820435, "learning_rate": 3.359026308136252e-05, "loss": 0.0546, "num_input_tokens_seen": 13887088, "step": 15535 }, { "epoch": 4.1013593770621615, "grad_norm": 0.20325766503810883, "learning_rate": 3.358104270633272e-05, "loss": 0.0437, "num_input_tokens_seen": 13891600, "step": 15540 }, { "epoch": 4.102679160617659, "grad_norm": 0.32808127999305725, "learning_rate": 3.357182100799831e-05, "loss": 0.0735, "num_input_tokens_seen": 13896176, "step": 15545 }, { "epoch": 4.1039989441731555, "grad_norm": 0.3997575044631958, "learning_rate": 3.3562597987781384e-05, "loss": 0.068, "num_input_tokens_seen": 13900848, "step": 15550 }, { "epoch": 4.105318727728653, "grad_norm": 0.38658228516578674, "learning_rate": 3.355337364710424e-05, "loss": 0.1324, "num_input_tokens_seen": 13905616, "step": 15555 }, { "epoch": 4.1066385112841495, "grad_norm": 0.1041800007224083, "learning_rate": 3.354414798738939e-05, "loss": 0.0373, "num_input_tokens_seen": 13909680, "step": 15560 }, { "epoch": 4.107958294839646, "grad_norm": 0.05422928184270859, "learning_rate": 3.353492101005955e-05, "loss": 0.0553, "num_input_tokens_seen": 13914352, "step": 15565 }, { "epoch": 4.1092780783951435, "grad_norm": 0.07094903290271759, "learning_rate": 3.352569271653763e-05, "loss": 0.0681, "num_input_tokens_seen": 13918896, "step": 15570 }, { "epoch": 4.11059786195064, "grad_norm": 0.44785088300704956, "learning_rate": 3.351646310824675e-05, "loss": 0.0764, "num_input_tokens_seen": 13923376, "step": 15575 }, { "epoch": 4.111917645506137, "grad_norm": 0.19541163742542267, "learning_rate": 3.350723218661023e-05, "loss": 0.0549, "num_input_tokens_seen": 13927696, "step": 15580 }, { "epoch": 4.113237429061634, "grad_norm": 0.1924787312746048, "learning_rate": 3.349799995305162e-05, "loss": 0.0588, "num_input_tokens_seen": 13932304, "step": 15585 }, { "epoch": 4.114557212617131, "grad_norm": 0.04075240343809128, "learning_rate": 3.348876640899461e-05, "loss": 0.0995, "num_input_tokens_seen": 13936944, "step": 15590 }, { "epoch": 4.115876996172628, "grad_norm": 0.08251579850912094, "learning_rate": 3.3479531555863144e-05, "loss": 0.0244, "num_input_tokens_seen": 13941424, "step": 15595 }, { "epoch": 4.117196779728125, "grad_norm": 0.12373322248458862, "learning_rate": 3.3470295395081344e-05, "loss": 0.0552, "num_input_tokens_seen": 13945776, "step": 15600 }, { "epoch": 4.117196779728125, "eval_loss": 0.06979098916053772, "eval_runtime": 64.7782, "eval_samples_per_second": 103.97, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 13945776, "step": 15600 }, { "epoch": 4.118516563283621, "grad_norm": 0.17074279487133026, "learning_rate": 3.3461057928073556e-05, "loss": 0.0619, "num_input_tokens_seen": 13950128, "step": 15605 }, { "epoch": 4.119836346839119, "grad_norm": 0.3415358066558838, "learning_rate": 3.345181915626431e-05, "loss": 0.0325, "num_input_tokens_seen": 13954416, "step": 15610 }, { "epoch": 4.121156130394615, "grad_norm": 0.15275879204273224, "learning_rate": 3.344257908107834e-05, "loss": 0.0276, "num_input_tokens_seen": 13958544, "step": 15615 }, { "epoch": 4.122475913950113, "grad_norm": 0.23040013015270233, "learning_rate": 3.343333770394058e-05, "loss": 0.053, "num_input_tokens_seen": 13962928, "step": 15620 }, { "epoch": 4.123795697505609, "grad_norm": 0.04406331852078438, "learning_rate": 3.342409502627616e-05, "loss": 0.058, "num_input_tokens_seen": 13967152, "step": 15625 }, { "epoch": 4.125115481061106, "grad_norm": 0.3055570423603058, "learning_rate": 3.341485104951043e-05, "loss": 0.053, "num_input_tokens_seen": 13971408, "step": 15630 }, { "epoch": 4.126435264616603, "grad_norm": 0.4481905698776245, "learning_rate": 3.340560577506892e-05, "loss": 0.065, "num_input_tokens_seen": 13975856, "step": 15635 }, { "epoch": 4.1277550481721, "grad_norm": 0.27619317173957825, "learning_rate": 3.339635920437735e-05, "loss": 0.1077, "num_input_tokens_seen": 13980560, "step": 15640 }, { "epoch": 4.129074831727596, "grad_norm": 0.24358342587947845, "learning_rate": 3.338711133886169e-05, "loss": 0.0718, "num_input_tokens_seen": 13985040, "step": 15645 }, { "epoch": 4.130394615283094, "grad_norm": 0.14848750829696655, "learning_rate": 3.3377862179948064e-05, "loss": 0.0727, "num_input_tokens_seen": 13989712, "step": 15650 }, { "epoch": 4.13171439883859, "grad_norm": 0.12414304912090302, "learning_rate": 3.336861172906281e-05, "loss": 0.0398, "num_input_tokens_seen": 13994160, "step": 15655 }, { "epoch": 4.133034182394088, "grad_norm": 0.2855603098869324, "learning_rate": 3.335935998763245e-05, "loss": 0.0557, "num_input_tokens_seen": 13998736, "step": 15660 }, { "epoch": 4.134353965949584, "grad_norm": 0.14758923649787903, "learning_rate": 3.3350106957083744e-05, "loss": 0.0605, "num_input_tokens_seen": 14003312, "step": 15665 }, { "epoch": 4.135673749505081, "grad_norm": 0.32446083426475525, "learning_rate": 3.33408526388436e-05, "loss": 0.0389, "num_input_tokens_seen": 14007760, "step": 15670 }, { "epoch": 4.136993533060578, "grad_norm": 0.5859151482582092, "learning_rate": 3.3331597034339166e-05, "loss": 0.0695, "num_input_tokens_seen": 14012560, "step": 15675 }, { "epoch": 4.138313316616075, "grad_norm": 0.22588935494422913, "learning_rate": 3.3322340144997764e-05, "loss": 0.047, "num_input_tokens_seen": 14016816, "step": 15680 }, { "epoch": 4.139633100171572, "grad_norm": 0.274039626121521, "learning_rate": 3.331308197224693e-05, "loss": 0.0758, "num_input_tokens_seen": 14020880, "step": 15685 }, { "epoch": 4.140952883727069, "grad_norm": 0.05712407827377319, "learning_rate": 3.330382251751438e-05, "loss": 0.0877, "num_input_tokens_seen": 14025584, "step": 15690 }, { "epoch": 4.142272667282565, "grad_norm": 0.20152127742767334, "learning_rate": 3.3294561782228054e-05, "loss": 0.0744, "num_input_tokens_seen": 14029904, "step": 15695 }, { "epoch": 4.143592450838063, "grad_norm": 0.16317099332809448, "learning_rate": 3.328529976781607e-05, "loss": 0.0693, "num_input_tokens_seen": 14034544, "step": 15700 }, { "epoch": 4.144912234393559, "grad_norm": 0.30084744095802307, "learning_rate": 3.327603647570673e-05, "loss": 0.0584, "num_input_tokens_seen": 14039152, "step": 15705 }, { "epoch": 4.146232017949056, "grad_norm": 0.2822813093662262, "learning_rate": 3.326677190732857e-05, "loss": 0.0948, "num_input_tokens_seen": 14043504, "step": 15710 }, { "epoch": 4.147551801504553, "grad_norm": 0.058278538286685944, "learning_rate": 3.325750606411029e-05, "loss": 0.067, "num_input_tokens_seen": 14048016, "step": 15715 }, { "epoch": 4.14887158506005, "grad_norm": 0.19492752850055695, "learning_rate": 3.3248238947480804e-05, "loss": 0.0819, "num_input_tokens_seen": 14052400, "step": 15720 }, { "epoch": 4.150191368615547, "grad_norm": 0.14393779635429382, "learning_rate": 3.323897055886922e-05, "loss": 0.0306, "num_input_tokens_seen": 14057104, "step": 15725 }, { "epoch": 4.151511152171044, "grad_norm": 0.4962517023086548, "learning_rate": 3.322970089970484e-05, "loss": 0.0736, "num_input_tokens_seen": 14061648, "step": 15730 }, { "epoch": 4.1528309357265405, "grad_norm": 0.19396886229515076, "learning_rate": 3.3220429971417165e-05, "loss": 0.0323, "num_input_tokens_seen": 14066192, "step": 15735 }, { "epoch": 4.154150719282038, "grad_norm": 0.5927832722663879, "learning_rate": 3.321115777543588e-05, "loss": 0.0694, "num_input_tokens_seen": 14070704, "step": 15740 }, { "epoch": 4.1554705028375345, "grad_norm": 0.11209552735090256, "learning_rate": 3.320188431319088e-05, "loss": 0.0546, "num_input_tokens_seen": 14075152, "step": 15745 }, { "epoch": 4.156790286393032, "grad_norm": 0.24031676352024078, "learning_rate": 3.319260958611224e-05, "loss": 0.0731, "num_input_tokens_seen": 14079376, "step": 15750 }, { "epoch": 4.1581100699485285, "grad_norm": 0.36269837617874146, "learning_rate": 3.3183333595630256e-05, "loss": 0.0458, "num_input_tokens_seen": 14083728, "step": 15755 }, { "epoch": 4.159429853504025, "grad_norm": 0.23398347198963165, "learning_rate": 3.317405634317538e-05, "loss": 0.0995, "num_input_tokens_seen": 14088400, "step": 15760 }, { "epoch": 4.1607496370595225, "grad_norm": 0.17716103792190552, "learning_rate": 3.3164777830178315e-05, "loss": 0.0795, "num_input_tokens_seen": 14092688, "step": 15765 }, { "epoch": 4.162069420615019, "grad_norm": 0.10484951734542847, "learning_rate": 3.315549805806989e-05, "loss": 0.061, "num_input_tokens_seen": 14097296, "step": 15770 }, { "epoch": 4.163389204170516, "grad_norm": 0.10782454162836075, "learning_rate": 3.314621702828118e-05, "loss": 0.0617, "num_input_tokens_seen": 14101744, "step": 15775 }, { "epoch": 4.164708987726013, "grad_norm": 0.0995270311832428, "learning_rate": 3.313693474224342e-05, "loss": 0.078, "num_input_tokens_seen": 14105872, "step": 15780 }, { "epoch": 4.16602877128151, "grad_norm": 0.29905152320861816, "learning_rate": 3.312765120138809e-05, "loss": 0.0449, "num_input_tokens_seen": 14110000, "step": 15785 }, { "epoch": 4.167348554837007, "grad_norm": 0.3319566249847412, "learning_rate": 3.311836640714679e-05, "loss": 0.0768, "num_input_tokens_seen": 14114192, "step": 15790 }, { "epoch": 4.168668338392504, "grad_norm": 0.049213580787181854, "learning_rate": 3.310908036095137e-05, "loss": 0.052, "num_input_tokens_seen": 14118640, "step": 15795 }, { "epoch": 4.169988121948, "grad_norm": 0.2921159267425537, "learning_rate": 3.309979306423386e-05, "loss": 0.0544, "num_input_tokens_seen": 14123120, "step": 15800 }, { "epoch": 4.169988121948, "eval_loss": 0.06947094947099686, "eval_runtime": 64.7206, "eval_samples_per_second": 104.063, "eval_steps_per_second": 26.02, "num_input_tokens_seen": 14123120, "step": 15800 }, { "epoch": 4.171307905503498, "grad_norm": 0.5583330988883972, "learning_rate": 3.309050451842647e-05, "loss": 0.0824, "num_input_tokens_seen": 14127664, "step": 15805 }, { "epoch": 4.172627689058994, "grad_norm": 0.09117178618907928, "learning_rate": 3.3081214724961604e-05, "loss": 0.0225, "num_input_tokens_seen": 14132016, "step": 15810 }, { "epoch": 4.173947472614492, "grad_norm": 0.36706581711769104, "learning_rate": 3.307192368527188e-05, "loss": 0.0979, "num_input_tokens_seen": 14136336, "step": 15815 }, { "epoch": 4.175267256169988, "grad_norm": 0.369368314743042, "learning_rate": 3.306263140079008e-05, "loss": 0.0553, "num_input_tokens_seen": 14140784, "step": 15820 }, { "epoch": 4.176587039725485, "grad_norm": 0.22923415899276733, "learning_rate": 3.30533378729492e-05, "loss": 0.0554, "num_input_tokens_seen": 14145360, "step": 15825 }, { "epoch": 4.177906823280982, "grad_norm": 0.2262612134218216, "learning_rate": 3.304404310318242e-05, "loss": 0.0509, "num_input_tokens_seen": 14149904, "step": 15830 }, { "epoch": 4.179226606836479, "grad_norm": 0.07125329971313477, "learning_rate": 3.3034747092923105e-05, "loss": 0.0762, "num_input_tokens_seen": 14154320, "step": 15835 }, { "epoch": 4.180546390391975, "grad_norm": 0.2851259410381317, "learning_rate": 3.3025449843604806e-05, "loss": 0.0511, "num_input_tokens_seen": 14158832, "step": 15840 }, { "epoch": 4.181866173947473, "grad_norm": 0.16051997244358063, "learning_rate": 3.30161513566613e-05, "loss": 0.0404, "num_input_tokens_seen": 14163536, "step": 15845 }, { "epoch": 4.183185957502969, "grad_norm": 0.12674543261528015, "learning_rate": 3.3006851633526506e-05, "loss": 0.0574, "num_input_tokens_seen": 14167312, "step": 15850 }, { "epoch": 4.184505741058467, "grad_norm": 0.31514954566955566, "learning_rate": 3.2997550675634584e-05, "loss": 0.0569, "num_input_tokens_seen": 14171824, "step": 15855 }, { "epoch": 4.185825524613963, "grad_norm": 0.28195643424987793, "learning_rate": 3.2988248484419825e-05, "loss": 0.0298, "num_input_tokens_seen": 14176368, "step": 15860 }, { "epoch": 4.18714530816946, "grad_norm": 0.27311739325523376, "learning_rate": 3.2978945061316776e-05, "loss": 0.0409, "num_input_tokens_seen": 14180848, "step": 15865 }, { "epoch": 4.188465091724957, "grad_norm": 0.16340073943138123, "learning_rate": 3.296964040776013e-05, "loss": 0.0428, "num_input_tokens_seen": 14185360, "step": 15870 }, { "epoch": 4.189784875280454, "grad_norm": 0.24772635102272034, "learning_rate": 3.296033452518478e-05, "loss": 0.043, "num_input_tokens_seen": 14189808, "step": 15875 }, { "epoch": 4.191104658835951, "grad_norm": 0.16504834592342377, "learning_rate": 3.2951027415025806e-05, "loss": 0.0802, "num_input_tokens_seen": 14194128, "step": 15880 }, { "epoch": 4.192424442391448, "grad_norm": 0.2937238812446594, "learning_rate": 3.294171907871849e-05, "loss": 0.0474, "num_input_tokens_seen": 14198640, "step": 15885 }, { "epoch": 4.193744225946944, "grad_norm": 0.04853179305791855, "learning_rate": 3.293240951769828e-05, "loss": 0.0844, "num_input_tokens_seen": 14202736, "step": 15890 }, { "epoch": 4.195064009502442, "grad_norm": 0.325665682554245, "learning_rate": 3.2923098733400846e-05, "loss": 0.0815, "num_input_tokens_seen": 14207376, "step": 15895 }, { "epoch": 4.196383793057938, "grad_norm": 0.14807242155075073, "learning_rate": 3.291378672726202e-05, "loss": 0.0631, "num_input_tokens_seen": 14211856, "step": 15900 }, { "epoch": 4.197703576613435, "grad_norm": 0.13883376121520996, "learning_rate": 3.2904473500717824e-05, "loss": 0.0539, "num_input_tokens_seen": 14216368, "step": 15905 }, { "epoch": 4.199023360168932, "grad_norm": 0.08326022326946259, "learning_rate": 3.289515905520449e-05, "loss": 0.0411, "num_input_tokens_seen": 14220848, "step": 15910 }, { "epoch": 4.200343143724429, "grad_norm": 0.0963999554514885, "learning_rate": 3.288584339215841e-05, "loss": 0.0879, "num_input_tokens_seen": 14225456, "step": 15915 }, { "epoch": 4.201662927279926, "grad_norm": 0.3117450177669525, "learning_rate": 3.287652651301617e-05, "loss": 0.079, "num_input_tokens_seen": 14230384, "step": 15920 }, { "epoch": 4.202982710835423, "grad_norm": 0.059780437499284744, "learning_rate": 3.286720841921457e-05, "loss": 0.0544, "num_input_tokens_seen": 14234704, "step": 15925 }, { "epoch": 4.2043024943909195, "grad_norm": 0.08157684653997421, "learning_rate": 3.285788911219056e-05, "loss": 0.0623, "num_input_tokens_seen": 14238928, "step": 15930 }, { "epoch": 4.205622277946417, "grad_norm": 0.19843405485153198, "learning_rate": 3.284856859338131e-05, "loss": 0.0324, "num_input_tokens_seen": 14243216, "step": 15935 }, { "epoch": 4.2069420615019135, "grad_norm": 0.3667726516723633, "learning_rate": 3.283924686422414e-05, "loss": 0.1147, "num_input_tokens_seen": 14247536, "step": 15940 }, { "epoch": 4.208261845057411, "grad_norm": 0.5752120018005371, "learning_rate": 3.282992392615659e-05, "loss": 0.1574, "num_input_tokens_seen": 14252432, "step": 15945 }, { "epoch": 4.2095816286129075, "grad_norm": 0.15009431540966034, "learning_rate": 3.282059978061638e-05, "loss": 0.0689, "num_input_tokens_seen": 14257040, "step": 15950 }, { "epoch": 4.210901412168404, "grad_norm": 0.38392356038093567, "learning_rate": 3.28112744290414e-05, "loss": 0.0657, "num_input_tokens_seen": 14260816, "step": 15955 }, { "epoch": 4.2122211957239015, "grad_norm": 0.22872377932071686, "learning_rate": 3.280194787286974e-05, "loss": 0.0642, "num_input_tokens_seen": 14265392, "step": 15960 }, { "epoch": 4.213540979279398, "grad_norm": 0.10152777284383774, "learning_rate": 3.2792620113539674e-05, "loss": 0.0393, "num_input_tokens_seen": 14269872, "step": 15965 }, { "epoch": 4.2148607628348955, "grad_norm": 0.530021607875824, "learning_rate": 3.278329115248966e-05, "loss": 0.0677, "num_input_tokens_seen": 14274416, "step": 15970 }, { "epoch": 4.216180546390392, "grad_norm": 0.11004350334405899, "learning_rate": 3.277396099115834e-05, "loss": 0.0552, "num_input_tokens_seen": 14278768, "step": 15975 }, { "epoch": 4.217500329945889, "grad_norm": 0.09730465710163116, "learning_rate": 3.276462963098454e-05, "loss": 0.0829, "num_input_tokens_seen": 14283280, "step": 15980 }, { "epoch": 4.218820113501386, "grad_norm": 0.40982648730278015, "learning_rate": 3.275529707340728e-05, "loss": 0.0747, "num_input_tokens_seen": 14287728, "step": 15985 }, { "epoch": 4.220139897056883, "grad_norm": 0.20734404027462006, "learning_rate": 3.274596331986574e-05, "loss": 0.0401, "num_input_tokens_seen": 14292080, "step": 15990 }, { "epoch": 4.221459680612379, "grad_norm": 0.11304210871458054, "learning_rate": 3.273662837179932e-05, "loss": 0.0383, "num_input_tokens_seen": 14296272, "step": 15995 }, { "epoch": 4.222779464167877, "grad_norm": 0.14152880012989044, "learning_rate": 3.272729223064758e-05, "loss": 0.0521, "num_input_tokens_seen": 14300816, "step": 16000 }, { "epoch": 4.222779464167877, "eval_loss": 0.06940384209156036, "eval_runtime": 64.7909, "eval_samples_per_second": 103.95, "eval_steps_per_second": 25.991, "num_input_tokens_seen": 14300816, "step": 16000 }, { "epoch": 4.224099247723373, "grad_norm": 0.21146002411842346, "learning_rate": 3.2717954897850264e-05, "loss": 0.0211, "num_input_tokens_seen": 14305296, "step": 16005 }, { "epoch": 4.225419031278871, "grad_norm": 0.085642009973526, "learning_rate": 3.270861637484733e-05, "loss": 0.0278, "num_input_tokens_seen": 14309616, "step": 16010 }, { "epoch": 4.226738814834367, "grad_norm": 0.09676684439182281, "learning_rate": 3.2699276663078867e-05, "loss": 0.0342, "num_input_tokens_seen": 14314128, "step": 16015 }, { "epoch": 4.228058598389864, "grad_norm": 0.2536756694316864, "learning_rate": 3.268993576398519e-05, "loss": 0.052, "num_input_tokens_seen": 14318256, "step": 16020 }, { "epoch": 4.229378381945361, "grad_norm": 0.09631747752428055, "learning_rate": 3.268059367900678e-05, "loss": 0.0286, "num_input_tokens_seen": 14322480, "step": 16025 }, { "epoch": 4.230698165500858, "grad_norm": 0.14271752536296844, "learning_rate": 3.26712504095843e-05, "loss": 0.0571, "num_input_tokens_seen": 14327024, "step": 16030 }, { "epoch": 4.232017949056354, "grad_norm": 0.11234790831804276, "learning_rate": 3.2661905957158615e-05, "loss": 0.0391, "num_input_tokens_seen": 14331504, "step": 16035 }, { "epoch": 4.233337732611852, "grad_norm": 0.19353853166103363, "learning_rate": 3.2652560323170734e-05, "loss": 0.0411, "num_input_tokens_seen": 14336016, "step": 16040 }, { "epoch": 4.234657516167348, "grad_norm": 0.05595673993229866, "learning_rate": 3.264321350906189e-05, "loss": 0.0657, "num_input_tokens_seen": 14340752, "step": 16045 }, { "epoch": 4.235977299722846, "grad_norm": 0.15499264001846313, "learning_rate": 3.263386551627346e-05, "loss": 0.0496, "num_input_tokens_seen": 14345168, "step": 16050 }, { "epoch": 4.237297083278342, "grad_norm": 0.25461477041244507, "learning_rate": 3.2624516346247055e-05, "loss": 0.0798, "num_input_tokens_seen": 14349552, "step": 16055 }, { "epoch": 4.238616866833839, "grad_norm": 0.05314868688583374, "learning_rate": 3.2615166000424404e-05, "loss": 0.0461, "num_input_tokens_seen": 14354288, "step": 16060 }, { "epoch": 4.239936650389336, "grad_norm": 0.11047704517841339, "learning_rate": 3.260581448024745e-05, "loss": 0.0964, "num_input_tokens_seen": 14358800, "step": 16065 }, { "epoch": 4.241256433944833, "grad_norm": 0.36406680941581726, "learning_rate": 3.2596461787158335e-05, "loss": 0.0543, "num_input_tokens_seen": 14363248, "step": 16070 }, { "epoch": 4.24257621750033, "grad_norm": 0.14550025761127472, "learning_rate": 3.258710792259934e-05, "loss": 0.0509, "num_input_tokens_seen": 14367632, "step": 16075 }, { "epoch": 4.243896001055827, "grad_norm": 0.0645856112241745, "learning_rate": 3.257775288801296e-05, "loss": 0.0512, "num_input_tokens_seen": 14372272, "step": 16080 }, { "epoch": 4.245215784611323, "grad_norm": 0.17523810267448425, "learning_rate": 3.256839668484186e-05, "loss": 0.0624, "num_input_tokens_seen": 14376432, "step": 16085 }, { "epoch": 4.246535568166821, "grad_norm": 0.09288103878498077, "learning_rate": 3.255903931452888e-05, "loss": 0.0557, "num_input_tokens_seen": 14381008, "step": 16090 }, { "epoch": 4.247855351722317, "grad_norm": 0.16668589413166046, "learning_rate": 3.2549680778517045e-05, "loss": 0.1116, "num_input_tokens_seen": 14385488, "step": 16095 }, { "epoch": 4.249175135277815, "grad_norm": 0.08292868733406067, "learning_rate": 3.2540321078249556e-05, "loss": 0.0572, "num_input_tokens_seen": 14389680, "step": 16100 }, { "epoch": 4.250494918833311, "grad_norm": 0.3844326436519623, "learning_rate": 3.2530960215169795e-05, "loss": 0.0698, "num_input_tokens_seen": 14394160, "step": 16105 }, { "epoch": 4.251814702388808, "grad_norm": 0.135445699095726, "learning_rate": 3.2521598190721345e-05, "loss": 0.0453, "num_input_tokens_seen": 14398832, "step": 16110 }, { "epoch": 4.253134485944305, "grad_norm": 0.05857017636299133, "learning_rate": 3.251223500634792e-05, "loss": 0.0712, "num_input_tokens_seen": 14403056, "step": 16115 }, { "epoch": 4.254454269499802, "grad_norm": 0.16728360950946808, "learning_rate": 3.2502870663493445e-05, "loss": 0.0888, "num_input_tokens_seen": 14407248, "step": 16120 }, { "epoch": 4.2557740530552985, "grad_norm": 0.0856282189488411, "learning_rate": 3.249350516360203e-05, "loss": 0.0783, "num_input_tokens_seen": 14411408, "step": 16125 }, { "epoch": 4.257093836610796, "grad_norm": 0.024195149540901184, "learning_rate": 3.248413850811797e-05, "loss": 0.0485, "num_input_tokens_seen": 14415824, "step": 16130 }, { "epoch": 4.2584136201662925, "grad_norm": 0.23630072176456451, "learning_rate": 3.2474770698485677e-05, "loss": 0.0685, "num_input_tokens_seen": 14420368, "step": 16135 }, { "epoch": 4.25973340372179, "grad_norm": 0.22448642551898956, "learning_rate": 3.246540173614983e-05, "loss": 0.054, "num_input_tokens_seen": 14425104, "step": 16140 }, { "epoch": 4.2610531872772865, "grad_norm": 0.20074792206287384, "learning_rate": 3.2456031622555197e-05, "loss": 0.0538, "num_input_tokens_seen": 14429360, "step": 16145 }, { "epoch": 4.262372970832783, "grad_norm": 0.10946731269359589, "learning_rate": 3.2446660359146794e-05, "loss": 0.0749, "num_input_tokens_seen": 14433680, "step": 16150 }, { "epoch": 4.2636927543882805, "grad_norm": 0.35188889503479004, "learning_rate": 3.2437287947369786e-05, "loss": 0.0494, "num_input_tokens_seen": 14438352, "step": 16155 }, { "epoch": 4.265012537943777, "grad_norm": 0.46400460600852966, "learning_rate": 3.2427914388669525e-05, "loss": 0.0685, "num_input_tokens_seen": 14442800, "step": 16160 }, { "epoch": 4.266332321499274, "grad_norm": 0.21883760392665863, "learning_rate": 3.241853968449151e-05, "loss": 0.0323, "num_input_tokens_seen": 14447568, "step": 16165 }, { "epoch": 4.267652105054771, "grad_norm": 0.3653423488140106, "learning_rate": 3.240916383628144e-05, "loss": 0.0524, "num_input_tokens_seen": 14451920, "step": 16170 }, { "epoch": 4.268971888610268, "grad_norm": 0.48588311672210693, "learning_rate": 3.239978684548521e-05, "loss": 0.0763, "num_input_tokens_seen": 14456464, "step": 16175 }, { "epoch": 4.270291672165765, "grad_norm": 0.3170522451400757, "learning_rate": 3.239040871354885e-05, "loss": 0.0686, "num_input_tokens_seen": 14460944, "step": 16180 }, { "epoch": 4.271611455721262, "grad_norm": 0.330966979265213, "learning_rate": 3.2381029441918596e-05, "loss": 0.0434, "num_input_tokens_seen": 14465648, "step": 16185 }, { "epoch": 4.272931239276758, "grad_norm": 0.187547966837883, "learning_rate": 3.2371649032040845e-05, "loss": 0.0876, "num_input_tokens_seen": 14470128, "step": 16190 }, { "epoch": 4.274251022832256, "grad_norm": 0.3601433336734772, "learning_rate": 3.2362267485362174e-05, "loss": 0.114, "num_input_tokens_seen": 14474768, "step": 16195 }, { "epoch": 4.275570806387752, "grad_norm": 0.06938692927360535, "learning_rate": 3.235288480332934e-05, "loss": 0.0532, "num_input_tokens_seen": 14479248, "step": 16200 }, { "epoch": 4.275570806387752, "eval_loss": 0.06928060948848724, "eval_runtime": 64.7828, "eval_samples_per_second": 103.963, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 14479248, "step": 16200 }, { "epoch": 4.27689058994325, "grad_norm": 0.08146052062511444, "learning_rate": 3.234350098738927e-05, "loss": 0.0474, "num_input_tokens_seen": 14483504, "step": 16205 }, { "epoch": 4.278210373498746, "grad_norm": 0.18147023022174835, "learning_rate": 3.233411603898906e-05, "loss": 0.0401, "num_input_tokens_seen": 14487888, "step": 16210 }, { "epoch": 4.279530157054243, "grad_norm": 0.2658587396144867, "learning_rate": 3.232472995957599e-05, "loss": 0.0935, "num_input_tokens_seen": 14492912, "step": 16215 }, { "epoch": 4.28084994060974, "grad_norm": 0.16622881591320038, "learning_rate": 3.231534275059751e-05, "loss": 0.0591, "num_input_tokens_seen": 14497520, "step": 16220 }, { "epoch": 4.282169724165237, "grad_norm": 0.05868997052311897, "learning_rate": 3.230595441350125e-05, "loss": 0.0959, "num_input_tokens_seen": 14502128, "step": 16225 }, { "epoch": 4.283489507720734, "grad_norm": 0.15182790160179138, "learning_rate": 3.2296564949735e-05, "loss": 0.0647, "num_input_tokens_seen": 14506640, "step": 16230 }, { "epoch": 4.284809291276231, "grad_norm": 0.17041447758674622, "learning_rate": 3.228717436074675e-05, "loss": 0.0438, "num_input_tokens_seen": 14511376, "step": 16235 }, { "epoch": 4.286129074831727, "grad_norm": 0.09544391930103302, "learning_rate": 3.227778264798463e-05, "loss": 0.0606, "num_input_tokens_seen": 14515696, "step": 16240 }, { "epoch": 4.287448858387225, "grad_norm": 0.08945830166339874, "learning_rate": 3.226838981289698e-05, "loss": 0.0687, "num_input_tokens_seen": 14520336, "step": 16245 }, { "epoch": 4.288768641942721, "grad_norm": 0.13536033034324646, "learning_rate": 3.225899585693227e-05, "loss": 0.0653, "num_input_tokens_seen": 14524592, "step": 16250 }, { "epoch": 4.290088425498218, "grad_norm": 0.03366810828447342, "learning_rate": 3.224960078153918e-05, "loss": 0.1031, "num_input_tokens_seen": 14529200, "step": 16255 }, { "epoch": 4.291408209053715, "grad_norm": 0.20745255053043365, "learning_rate": 3.224020458816655e-05, "loss": 0.031, "num_input_tokens_seen": 14534000, "step": 16260 }, { "epoch": 4.292727992609212, "grad_norm": 0.4370614290237427, "learning_rate": 3.223080727826337e-05, "loss": 0.0757, "num_input_tokens_seen": 14538480, "step": 16265 }, { "epoch": 4.294047776164709, "grad_norm": 0.5274820923805237, "learning_rate": 3.222140885327885e-05, "loss": 0.1127, "num_input_tokens_seen": 14543152, "step": 16270 }, { "epoch": 4.295367559720206, "grad_norm": 0.31907156109809875, "learning_rate": 3.221200931466234e-05, "loss": 0.0644, "num_input_tokens_seen": 14547696, "step": 16275 }, { "epoch": 4.2966873432757025, "grad_norm": 0.24983958899974823, "learning_rate": 3.220260866386336e-05, "loss": 0.0795, "num_input_tokens_seen": 14552176, "step": 16280 }, { "epoch": 4.2980071268312, "grad_norm": 0.2035602480173111, "learning_rate": 3.21932069023316e-05, "loss": 0.0489, "num_input_tokens_seen": 14556912, "step": 16285 }, { "epoch": 4.2993269103866965, "grad_norm": 0.33681946992874146, "learning_rate": 3.218380403151695e-05, "loss": 0.083, "num_input_tokens_seen": 14561168, "step": 16290 }, { "epoch": 4.300646693942193, "grad_norm": 0.1661912202835083, "learning_rate": 3.217440005286943e-05, "loss": 0.0814, "num_input_tokens_seen": 14565584, "step": 16295 }, { "epoch": 4.3019664774976905, "grad_norm": 0.19048462808132172, "learning_rate": 3.216499496783928e-05, "loss": 0.0456, "num_input_tokens_seen": 14570096, "step": 16300 }, { "epoch": 4.303286261053187, "grad_norm": 0.38355356454849243, "learning_rate": 3.2155588777876856e-05, "loss": 0.0581, "num_input_tokens_seen": 14574576, "step": 16305 }, { "epoch": 4.3046060446086845, "grad_norm": 0.26800528168678284, "learning_rate": 3.214618148443273e-05, "loss": 0.0714, "num_input_tokens_seen": 14579184, "step": 16310 }, { "epoch": 4.305925828164181, "grad_norm": 0.3803464472293854, "learning_rate": 3.2136773088957595e-05, "loss": 0.0862, "num_input_tokens_seen": 14583792, "step": 16315 }, { "epoch": 4.307245611719678, "grad_norm": 0.1807071566581726, "learning_rate": 3.2127363592902374e-05, "loss": 0.0632, "num_input_tokens_seen": 14588304, "step": 16320 }, { "epoch": 4.308565395275175, "grad_norm": 0.20470990240573883, "learning_rate": 3.211795299771812e-05, "loss": 0.105, "num_input_tokens_seen": 14592912, "step": 16325 }, { "epoch": 4.309885178830672, "grad_norm": 0.18536344170570374, "learning_rate": 3.210854130485605e-05, "loss": 0.0342, "num_input_tokens_seen": 14597424, "step": 16330 }, { "epoch": 4.311204962386169, "grad_norm": 0.12596353888511658, "learning_rate": 3.209912851576759e-05, "loss": 0.0811, "num_input_tokens_seen": 14601808, "step": 16335 }, { "epoch": 4.312524745941666, "grad_norm": 0.21420417726039886, "learning_rate": 3.208971463190431e-05, "loss": 0.0636, "num_input_tokens_seen": 14606448, "step": 16340 }, { "epoch": 4.313844529497162, "grad_norm": 0.15049614012241364, "learning_rate": 3.208029965471793e-05, "loss": 0.0608, "num_input_tokens_seen": 14611056, "step": 16345 }, { "epoch": 4.31516431305266, "grad_norm": 0.46172210574150085, "learning_rate": 3.2070883585660364e-05, "loss": 0.0498, "num_input_tokens_seen": 14615568, "step": 16350 }, { "epoch": 4.316484096608156, "grad_norm": 0.1275540590286255, "learning_rate": 3.20614664261837e-05, "loss": 0.0588, "num_input_tokens_seen": 14620688, "step": 16355 }, { "epoch": 4.317803880163654, "grad_norm": 0.25704100728034973, "learning_rate": 3.205204817774016e-05, "loss": 0.0776, "num_input_tokens_seen": 14625424, "step": 16360 }, { "epoch": 4.31912366371915, "grad_norm": 0.20196960866451263, "learning_rate": 3.204262884178218e-05, "loss": 0.0854, "num_input_tokens_seen": 14629840, "step": 16365 }, { "epoch": 4.320443447274647, "grad_norm": 0.374894380569458, "learning_rate": 3.2033208419762314e-05, "loss": 0.0577, "num_input_tokens_seen": 14634224, "step": 16370 }, { "epoch": 4.321763230830144, "grad_norm": 0.047620516270399094, "learning_rate": 3.2023786913133344e-05, "loss": 0.0516, "num_input_tokens_seen": 14638672, "step": 16375 }, { "epoch": 4.323083014385641, "grad_norm": 0.06040734425187111, "learning_rate": 3.201436432334816e-05, "loss": 0.035, "num_input_tokens_seen": 14642896, "step": 16380 }, { "epoch": 4.324402797941137, "grad_norm": 0.1091158390045166, "learning_rate": 3.2004940651859844e-05, "loss": 0.033, "num_input_tokens_seen": 14647120, "step": 16385 }, { "epoch": 4.325722581496635, "grad_norm": 0.14791551232337952, "learning_rate": 3.1995515900121655e-05, "loss": 0.0457, "num_input_tokens_seen": 14651824, "step": 16390 }, { "epoch": 4.327042365052131, "grad_norm": 0.06846854835748672, "learning_rate": 3.1986090069587e-05, "loss": 0.0451, "num_input_tokens_seen": 14656464, "step": 16395 }, { "epoch": 4.328362148607629, "grad_norm": 0.2865278720855713, "learning_rate": 3.1976663161709466e-05, "loss": 0.0322, "num_input_tokens_seen": 14660976, "step": 16400 }, { "epoch": 4.328362148607629, "eval_loss": 0.0692586898803711, "eval_runtime": 64.7386, "eval_samples_per_second": 104.034, "eval_steps_per_second": 26.012, "num_input_tokens_seen": 14660976, "step": 16400 }, { "epoch": 4.329681932163125, "grad_norm": 0.279731422662735, "learning_rate": 3.196723517794279e-05, "loss": 0.0623, "num_input_tokens_seen": 14665392, "step": 16405 }, { "epoch": 4.331001715718622, "grad_norm": 0.17093487083911896, "learning_rate": 3.19578061197409e-05, "loss": 0.0506, "num_input_tokens_seen": 14669712, "step": 16410 }, { "epoch": 4.332321499274119, "grad_norm": 0.26023542881011963, "learning_rate": 3.194837598855787e-05, "loss": 0.0301, "num_input_tokens_seen": 14673808, "step": 16415 }, { "epoch": 4.333641282829616, "grad_norm": 0.25157028436660767, "learning_rate": 3.193894478584794e-05, "loss": 0.0607, "num_input_tokens_seen": 14678256, "step": 16420 }, { "epoch": 4.334961066385113, "grad_norm": 0.31848573684692383, "learning_rate": 3.192951251306553e-05, "loss": 0.0431, "num_input_tokens_seen": 14682640, "step": 16425 }, { "epoch": 4.33628084994061, "grad_norm": 0.10602187365293503, "learning_rate": 3.192007917166521e-05, "loss": 0.0609, "num_input_tokens_seen": 14687376, "step": 16430 }, { "epoch": 4.337600633496106, "grad_norm": 0.19057434797286987, "learning_rate": 3.191064476310171e-05, "loss": 0.0337, "num_input_tokens_seen": 14692048, "step": 16435 }, { "epoch": 4.338920417051604, "grad_norm": 0.2442363053560257, "learning_rate": 3.1901209288829944e-05, "loss": 0.0477, "num_input_tokens_seen": 14696720, "step": 16440 }, { "epoch": 4.3402402006071, "grad_norm": 0.24493849277496338, "learning_rate": 3.1891772750304985e-05, "loss": 0.0645, "num_input_tokens_seen": 14701072, "step": 16445 }, { "epoch": 4.341559984162597, "grad_norm": 0.2435508817434311, "learning_rate": 3.188233514898206e-05, "loss": 0.0671, "num_input_tokens_seen": 14705744, "step": 16450 }, { "epoch": 4.342879767718094, "grad_norm": 0.287352055311203, "learning_rate": 3.187289648631657e-05, "loss": 0.1139, "num_input_tokens_seen": 14710352, "step": 16455 }, { "epoch": 4.344199551273591, "grad_norm": 0.5548743605613708, "learning_rate": 3.186345676376406e-05, "loss": 0.1067, "num_input_tokens_seen": 14714544, "step": 16460 }, { "epoch": 4.345519334829088, "grad_norm": 0.08118835091590881, "learning_rate": 3.1854015982780275e-05, "loss": 0.0291, "num_input_tokens_seen": 14718960, "step": 16465 }, { "epoch": 4.346839118384585, "grad_norm": 0.5737006664276123, "learning_rate": 3.1844574144821084e-05, "loss": 0.0853, "num_input_tokens_seen": 14723504, "step": 16470 }, { "epoch": 4.3481589019400815, "grad_norm": 0.3589685559272766, "learning_rate": 3.1835131251342554e-05, "loss": 0.0691, "num_input_tokens_seen": 14727696, "step": 16475 }, { "epoch": 4.349478685495579, "grad_norm": 0.3090875446796417, "learning_rate": 3.182568730380089e-05, "loss": 0.0507, "num_input_tokens_seen": 14731792, "step": 16480 }, { "epoch": 4.3507984690510755, "grad_norm": 0.6478426456451416, "learning_rate": 3.181624230365245e-05, "loss": 0.0856, "num_input_tokens_seen": 14736496, "step": 16485 }, { "epoch": 4.352118252606573, "grad_norm": 0.3426189422607422, "learning_rate": 3.180679625235381e-05, "loss": 0.0338, "num_input_tokens_seen": 14740784, "step": 16490 }, { "epoch": 4.3534380361620695, "grad_norm": 0.5377569198608398, "learning_rate": 3.1797349151361646e-05, "loss": 0.1396, "num_input_tokens_seen": 14745200, "step": 16495 }, { "epoch": 4.354757819717566, "grad_norm": 0.272971510887146, "learning_rate": 3.178790100213281e-05, "loss": 0.0408, "num_input_tokens_seen": 14749744, "step": 16500 }, { "epoch": 4.3560776032730635, "grad_norm": 0.2793237864971161, "learning_rate": 3.1778451806124346e-05, "loss": 0.0705, "num_input_tokens_seen": 14754096, "step": 16505 }, { "epoch": 4.35739738682856, "grad_norm": 0.35539141297340393, "learning_rate": 3.176900156479342e-05, "loss": 0.0715, "num_input_tokens_seen": 14758640, "step": 16510 }, { "epoch": 4.3587171703840575, "grad_norm": 0.19726309180259705, "learning_rate": 3.17595502795974e-05, "loss": 0.0523, "num_input_tokens_seen": 14762928, "step": 16515 }, { "epoch": 4.360036953939554, "grad_norm": 0.1303650289773941, "learning_rate": 3.175009795199377e-05, "loss": 0.0495, "num_input_tokens_seen": 14767440, "step": 16520 }, { "epoch": 4.361356737495051, "grad_norm": 0.21357670426368713, "learning_rate": 3.1740644583440224e-05, "loss": 0.0501, "num_input_tokens_seen": 14772144, "step": 16525 }, { "epoch": 4.362676521050548, "grad_norm": 0.4475732147693634, "learning_rate": 3.173119017539457e-05, "loss": 0.1034, "num_input_tokens_seen": 14776624, "step": 16530 }, { "epoch": 4.363996304606045, "grad_norm": 0.20076723396778107, "learning_rate": 3.172173472931479e-05, "loss": 0.0553, "num_input_tokens_seen": 14781232, "step": 16535 }, { "epoch": 4.365316088161541, "grad_norm": 0.11919048428535461, "learning_rate": 3.1712278246659055e-05, "loss": 0.0635, "num_input_tokens_seen": 14785648, "step": 16540 }, { "epoch": 4.366635871717039, "grad_norm": 0.18762211501598358, "learning_rate": 3.170282072888566e-05, "loss": 0.0569, "num_input_tokens_seen": 14790256, "step": 16545 }, { "epoch": 4.367955655272535, "grad_norm": 0.06658461689949036, "learning_rate": 3.169336217745307e-05, "loss": 0.0327, "num_input_tokens_seen": 14794736, "step": 16550 }, { "epoch": 4.369275438828033, "grad_norm": 0.1254887729883194, "learning_rate": 3.1683902593819924e-05, "loss": 0.0243, "num_input_tokens_seen": 14799408, "step": 16555 }, { "epoch": 4.370595222383529, "grad_norm": 0.2671094536781311, "learning_rate": 3.1674441979445e-05, "loss": 0.11, "num_input_tokens_seen": 14803760, "step": 16560 }, { "epoch": 4.371915005939026, "grad_norm": 0.3242781162261963, "learning_rate": 3.166498033578725e-05, "loss": 0.0746, "num_input_tokens_seen": 14808112, "step": 16565 }, { "epoch": 4.373234789494523, "grad_norm": 0.2077052742242813, "learning_rate": 3.165551766430578e-05, "loss": 0.0764, "num_input_tokens_seen": 14812528, "step": 16570 }, { "epoch": 4.37455457305002, "grad_norm": 0.03699875250458717, "learning_rate": 3.164605396645984e-05, "loss": 0.0631, "num_input_tokens_seen": 14816816, "step": 16575 }, { "epoch": 4.375874356605516, "grad_norm": 0.10970988869667053, "learning_rate": 3.163658924370886e-05, "loss": 0.1444, "num_input_tokens_seen": 14821424, "step": 16580 }, { "epoch": 4.377194140161014, "grad_norm": 0.19884027540683746, "learning_rate": 3.1627123497512415e-05, "loss": 0.0583, "num_input_tokens_seen": 14826096, "step": 16585 }, { "epoch": 4.37851392371651, "grad_norm": 0.26470932364463806, "learning_rate": 3.1617656729330245e-05, "loss": 0.0692, "num_input_tokens_seen": 14830352, "step": 16590 }, { "epoch": 4.379833707272008, "grad_norm": 0.2684254050254822, "learning_rate": 3.1608188940622255e-05, "loss": 0.0456, "num_input_tokens_seen": 14834736, "step": 16595 }, { "epoch": 4.381153490827504, "grad_norm": 0.30212345719337463, "learning_rate": 3.159872013284847e-05, "loss": 0.0679, "num_input_tokens_seen": 14839056, "step": 16600 }, { "epoch": 4.381153490827504, "eval_loss": 0.06891518831253052, "eval_runtime": 64.8211, "eval_samples_per_second": 103.901, "eval_steps_per_second": 25.979, "num_input_tokens_seen": 14839056, "step": 16600 }, { "epoch": 4.382473274383001, "grad_norm": 0.4185463786125183, "learning_rate": 3.1589250307469134e-05, "loss": 0.0723, "num_input_tokens_seen": 14843440, "step": 16605 }, { "epoch": 4.383793057938498, "grad_norm": 0.1523248702287674, "learning_rate": 3.1579779465944586e-05, "loss": 0.0571, "num_input_tokens_seen": 14847664, "step": 16610 }, { "epoch": 4.385112841493995, "grad_norm": 0.08444304019212723, "learning_rate": 3.1570307609735363e-05, "loss": 0.0576, "num_input_tokens_seen": 14852080, "step": 16615 }, { "epoch": 4.386432625049492, "grad_norm": 0.32412415742874146, "learning_rate": 3.156083474030213e-05, "loss": 0.0809, "num_input_tokens_seen": 14856336, "step": 16620 }, { "epoch": 4.387752408604989, "grad_norm": 0.39373818039894104, "learning_rate": 3.155136085910573e-05, "loss": 0.0625, "num_input_tokens_seen": 14861040, "step": 16625 }, { "epoch": 4.389072192160485, "grad_norm": 0.10800865292549133, "learning_rate": 3.154188596760717e-05, "loss": 0.0375, "num_input_tokens_seen": 14865680, "step": 16630 }, { "epoch": 4.390391975715983, "grad_norm": 0.04714430496096611, "learning_rate": 3.153241006726757e-05, "loss": 0.0481, "num_input_tokens_seen": 14870256, "step": 16635 }, { "epoch": 4.391711759271479, "grad_norm": 0.16838154196739197, "learning_rate": 3.152293315954825e-05, "loss": 0.0419, "num_input_tokens_seen": 14874544, "step": 16640 }, { "epoch": 4.393031542826977, "grad_norm": 0.16970589756965637, "learning_rate": 3.1513455245910666e-05, "loss": 0.0609, "num_input_tokens_seen": 14879088, "step": 16645 }, { "epoch": 4.394351326382473, "grad_norm": 0.06012123450636864, "learning_rate": 3.150397632781643e-05, "loss": 0.0577, "num_input_tokens_seen": 14883696, "step": 16650 }, { "epoch": 4.39567110993797, "grad_norm": 0.43462684750556946, "learning_rate": 3.149449640672731e-05, "loss": 0.0805, "num_input_tokens_seen": 14887952, "step": 16655 }, { "epoch": 4.396990893493467, "grad_norm": 0.14801432192325592, "learning_rate": 3.148501548410523e-05, "loss": 0.0649, "num_input_tokens_seen": 14892304, "step": 16660 }, { "epoch": 4.398310677048964, "grad_norm": 0.20283685624599457, "learning_rate": 3.1475533561412256e-05, "loss": 0.0712, "num_input_tokens_seen": 14896880, "step": 16665 }, { "epoch": 4.3996304606044605, "grad_norm": 0.3539446294307709, "learning_rate": 3.146605064011065e-05, "loss": 0.0695, "num_input_tokens_seen": 14901488, "step": 16670 }, { "epoch": 4.400950244159958, "grad_norm": 0.4137662947177887, "learning_rate": 3.145656672166277e-05, "loss": 0.1694, "num_input_tokens_seen": 14905872, "step": 16675 }, { "epoch": 4.4022700277154545, "grad_norm": 0.13412520289421082, "learning_rate": 3.144708180753116e-05, "loss": 0.0461, "num_input_tokens_seen": 14909872, "step": 16680 }, { "epoch": 4.403589811270952, "grad_norm": 0.07847852259874344, "learning_rate": 3.143759589917851e-05, "loss": 0.0696, "num_input_tokens_seen": 14914096, "step": 16685 }, { "epoch": 4.4049095948264485, "grad_norm": 0.06109490245580673, "learning_rate": 3.142810899806768e-05, "loss": 0.0339, "num_input_tokens_seen": 14918320, "step": 16690 }, { "epoch": 4.406229378381945, "grad_norm": 0.25303465127944946, "learning_rate": 3.141862110566166e-05, "loss": 0.0686, "num_input_tokens_seen": 14922608, "step": 16695 }, { "epoch": 4.4075491619374425, "grad_norm": 0.18817973136901855, "learning_rate": 3.1409132223423606e-05, "loss": 0.0417, "num_input_tokens_seen": 14927248, "step": 16700 }, { "epoch": 4.408868945492939, "grad_norm": 0.3521624505519867, "learning_rate": 3.139964235281682e-05, "loss": 0.0491, "num_input_tokens_seen": 14931664, "step": 16705 }, { "epoch": 4.410188729048436, "grad_norm": 0.20132042467594147, "learning_rate": 3.139015149530476e-05, "loss": 0.0627, "num_input_tokens_seen": 14935792, "step": 16710 }, { "epoch": 4.411508512603933, "grad_norm": 0.1608770489692688, "learning_rate": 3.1380659652351034e-05, "loss": 0.0551, "num_input_tokens_seen": 14940208, "step": 16715 }, { "epoch": 4.41282829615943, "grad_norm": 0.11961697787046432, "learning_rate": 3.137116682541941e-05, "loss": 0.0451, "num_input_tokens_seen": 14944752, "step": 16720 }, { "epoch": 4.414148079714927, "grad_norm": 0.38208794593811035, "learning_rate": 3.136167301597379e-05, "loss": 0.1234, "num_input_tokens_seen": 14949136, "step": 16725 }, { "epoch": 4.415467863270424, "grad_norm": 0.2332635074853897, "learning_rate": 3.1352178225478254e-05, "loss": 0.0622, "num_input_tokens_seen": 14953488, "step": 16730 }, { "epoch": 4.41678764682592, "grad_norm": 0.11598386615514755, "learning_rate": 3.1342682455396996e-05, "loss": 0.0749, "num_input_tokens_seen": 14958096, "step": 16735 }, { "epoch": 4.418107430381418, "grad_norm": 0.10811329632997513, "learning_rate": 3.133318570719441e-05, "loss": 0.0429, "num_input_tokens_seen": 14962640, "step": 16740 }, { "epoch": 4.419427213936914, "grad_norm": 0.12695413827896118, "learning_rate": 3.132368798233499e-05, "loss": 0.0458, "num_input_tokens_seen": 14967088, "step": 16745 }, { "epoch": 4.420746997492412, "grad_norm": 0.3933844566345215, "learning_rate": 3.131418928228342e-05, "loss": 0.0357, "num_input_tokens_seen": 14971696, "step": 16750 }, { "epoch": 4.422066781047908, "grad_norm": 0.22276704013347626, "learning_rate": 3.1304689608504514e-05, "loss": 0.0601, "num_input_tokens_seen": 14976272, "step": 16755 }, { "epoch": 4.423386564603405, "grad_norm": 0.2870182991027832, "learning_rate": 3.129518896246324e-05, "loss": 0.0646, "num_input_tokens_seen": 14980752, "step": 16760 }, { "epoch": 4.424706348158902, "grad_norm": 0.10488896816968918, "learning_rate": 3.128568734562472e-05, "loss": 0.0386, "num_input_tokens_seen": 14985488, "step": 16765 }, { "epoch": 4.426026131714399, "grad_norm": 0.07172037661075592, "learning_rate": 3.127618475945421e-05, "loss": 0.0397, "num_input_tokens_seen": 14989616, "step": 16770 }, { "epoch": 4.427345915269896, "grad_norm": 0.24923862516880035, "learning_rate": 3.126668120541715e-05, "loss": 0.0605, "num_input_tokens_seen": 14993936, "step": 16775 }, { "epoch": 4.428665698825393, "grad_norm": 0.0857858955860138, "learning_rate": 3.1257176684979096e-05, "loss": 0.053, "num_input_tokens_seen": 14998768, "step": 16780 }, { "epoch": 4.429985482380889, "grad_norm": 0.05052291229367256, "learning_rate": 3.124767119960576e-05, "loss": 0.0367, "num_input_tokens_seen": 15003184, "step": 16785 }, { "epoch": 4.431305265936387, "grad_norm": 0.2406575083732605, "learning_rate": 3.123816475076301e-05, "loss": 0.0504, "num_input_tokens_seen": 15007376, "step": 16790 }, { "epoch": 4.432625049491883, "grad_norm": 0.42316174507141113, "learning_rate": 3.122865733991687e-05, "loss": 0.0659, "num_input_tokens_seen": 15011536, "step": 16795 }, { "epoch": 4.43394483304738, "grad_norm": 0.357875257730484, "learning_rate": 3.1219148968533486e-05, "loss": 0.0803, "num_input_tokens_seen": 15016048, "step": 16800 }, { "epoch": 4.43394483304738, "eval_loss": 0.06896334141492844, "eval_runtime": 64.7731, "eval_samples_per_second": 103.978, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 15016048, "step": 16800 }, { "epoch": 4.435264616602877, "grad_norm": 0.1395978480577469, "learning_rate": 3.120963963807918e-05, "loss": 0.0958, "num_input_tokens_seen": 15020720, "step": 16805 }, { "epoch": 4.436584400158374, "grad_norm": 0.15530188381671906, "learning_rate": 3.12001293500204e-05, "loss": 0.0302, "num_input_tokens_seen": 15025168, "step": 16810 }, { "epoch": 4.437904183713871, "grad_norm": 0.10831574350595474, "learning_rate": 3.1190618105823765e-05, "loss": 0.0433, "num_input_tokens_seen": 15029360, "step": 16815 }, { "epoch": 4.439223967269368, "grad_norm": 0.2043435126543045, "learning_rate": 3.118110590695603e-05, "loss": 0.0656, "num_input_tokens_seen": 15034064, "step": 16820 }, { "epoch": 4.440543750824864, "grad_norm": 0.12220444530248642, "learning_rate": 3.117159275488407e-05, "loss": 0.0485, "num_input_tokens_seen": 15038672, "step": 16825 }, { "epoch": 4.441863534380362, "grad_norm": 0.15089628100395203, "learning_rate": 3.1162078651074956e-05, "loss": 0.0246, "num_input_tokens_seen": 15043408, "step": 16830 }, { "epoch": 4.443183317935858, "grad_norm": 0.04405693709850311, "learning_rate": 3.1152563596995885e-05, "loss": 0.0389, "num_input_tokens_seen": 15048112, "step": 16835 }, { "epoch": 4.444503101491355, "grad_norm": 0.13576270639896393, "learning_rate": 3.1143047594114186e-05, "loss": 0.0532, "num_input_tokens_seen": 15052912, "step": 16840 }, { "epoch": 4.445822885046852, "grad_norm": 0.08532340824604034, "learning_rate": 3.113353064389734e-05, "loss": 0.0545, "num_input_tokens_seen": 15057744, "step": 16845 }, { "epoch": 4.447142668602349, "grad_norm": 0.1438058614730835, "learning_rate": 3.1124012747812993e-05, "loss": 0.0524, "num_input_tokens_seen": 15062128, "step": 16850 }, { "epoch": 4.448462452157846, "grad_norm": 0.13937461376190186, "learning_rate": 3.1114493907328936e-05, "loss": 0.0778, "num_input_tokens_seen": 15066640, "step": 16855 }, { "epoch": 4.449782235713343, "grad_norm": 0.1294962465763092, "learning_rate": 3.110497412391306e-05, "loss": 0.1046, "num_input_tokens_seen": 15071216, "step": 16860 }, { "epoch": 4.4511020192688395, "grad_norm": 0.11827854067087173, "learning_rate": 3.1095453399033466e-05, "loss": 0.0491, "num_input_tokens_seen": 15075984, "step": 16865 }, { "epoch": 4.452421802824337, "grad_norm": 0.17024582624435425, "learning_rate": 3.108593173415835e-05, "loss": 0.0672, "num_input_tokens_seen": 15080592, "step": 16870 }, { "epoch": 4.4537415863798335, "grad_norm": 0.4058993458747864, "learning_rate": 3.107640913075609e-05, "loss": 0.0513, "num_input_tokens_seen": 15085136, "step": 16875 }, { "epoch": 4.455061369935331, "grad_norm": 0.11020553112030029, "learning_rate": 3.106688559029517e-05, "loss": 0.0417, "num_input_tokens_seen": 15089776, "step": 16880 }, { "epoch": 4.4563811534908275, "grad_norm": 0.4051533341407776, "learning_rate": 3.105736111424425e-05, "loss": 0.0594, "num_input_tokens_seen": 15094288, "step": 16885 }, { "epoch": 4.457700937046324, "grad_norm": 0.22695191204547882, "learning_rate": 3.1047835704072136e-05, "loss": 0.0794, "num_input_tokens_seen": 15098512, "step": 16890 }, { "epoch": 4.4590207206018215, "grad_norm": 0.27577629685401917, "learning_rate": 3.103830936124775e-05, "loss": 0.0371, "num_input_tokens_seen": 15102896, "step": 16895 }, { "epoch": 4.460340504157318, "grad_norm": 0.22035843133926392, "learning_rate": 3.102878208724018e-05, "loss": 0.0612, "num_input_tokens_seen": 15107568, "step": 16900 }, { "epoch": 4.4616602877128155, "grad_norm": 0.279906690120697, "learning_rate": 3.101925388351865e-05, "loss": 0.053, "num_input_tokens_seen": 15111920, "step": 16905 }, { "epoch": 4.462980071268312, "grad_norm": 0.40887123346328735, "learning_rate": 3.1009724751552515e-05, "loss": 0.0419, "num_input_tokens_seen": 15116528, "step": 16910 }, { "epoch": 4.464299854823809, "grad_norm": 0.19499571621418, "learning_rate": 3.100019469281131e-05, "loss": 0.0229, "num_input_tokens_seen": 15120944, "step": 16915 }, { "epoch": 4.465619638379306, "grad_norm": 0.16979607939720154, "learning_rate": 3.0990663708764685e-05, "loss": 0.0598, "num_input_tokens_seen": 15125328, "step": 16920 }, { "epoch": 4.466939421934803, "grad_norm": 0.14564411342144012, "learning_rate": 3.098113180088243e-05, "loss": 0.0433, "num_input_tokens_seen": 15129584, "step": 16925 }, { "epoch": 4.468259205490299, "grad_norm": 0.2107173055410385, "learning_rate": 3.097159897063448e-05, "loss": 0.0781, "num_input_tokens_seen": 15133808, "step": 16930 }, { "epoch": 4.469578989045797, "grad_norm": 0.22117169201374054, "learning_rate": 3.096206521949094e-05, "loss": 0.0659, "num_input_tokens_seen": 15138672, "step": 16935 }, { "epoch": 4.470898772601293, "grad_norm": 0.1824360489845276, "learning_rate": 3.0952530548922006e-05, "loss": 0.0594, "num_input_tokens_seen": 15143472, "step": 16940 }, { "epoch": 4.472218556156791, "grad_norm": 0.20703795552253723, "learning_rate": 3.0942994960398064e-05, "loss": 0.0425, "num_input_tokens_seen": 15147984, "step": 16945 }, { "epoch": 4.473538339712287, "grad_norm": 0.5699824690818787, "learning_rate": 3.093345845538961e-05, "loss": 0.0957, "num_input_tokens_seen": 15152336, "step": 16950 }, { "epoch": 4.474858123267784, "grad_norm": 0.1892164796590805, "learning_rate": 3.09239210353673e-05, "loss": 0.071, "num_input_tokens_seen": 15156720, "step": 16955 }, { "epoch": 4.476177906823281, "grad_norm": 0.12792544066905975, "learning_rate": 3.0914382701801926e-05, "loss": 0.0692, "num_input_tokens_seen": 15161104, "step": 16960 }, { "epoch": 4.477497690378778, "grad_norm": 0.06650001555681229, "learning_rate": 3.090484345616441e-05, "loss": 0.0328, "num_input_tokens_seen": 15165744, "step": 16965 }, { "epoch": 4.478817473934274, "grad_norm": 0.6658869981765747, "learning_rate": 3.0895303299925825e-05, "loss": 0.0881, "num_input_tokens_seen": 15170384, "step": 16970 }, { "epoch": 4.480137257489772, "grad_norm": 0.24367541074752808, "learning_rate": 3.0885762234557393e-05, "loss": 0.0598, "num_input_tokens_seen": 15174544, "step": 16975 }, { "epoch": 4.481457041045268, "grad_norm": 0.2661319077014923, "learning_rate": 3.087622026153045e-05, "loss": 0.1055, "num_input_tokens_seen": 15178736, "step": 16980 }, { "epoch": 4.482776824600766, "grad_norm": 0.12424825876951218, "learning_rate": 3.086667738231651e-05, "loss": 0.0714, "num_input_tokens_seen": 15183024, "step": 16985 }, { "epoch": 4.484096608156262, "grad_norm": 0.14623700082302094, "learning_rate": 3.085713359838718e-05, "loss": 0.1048, "num_input_tokens_seen": 15187664, "step": 16990 }, { "epoch": 4.485416391711759, "grad_norm": 0.11514277756214142, "learning_rate": 3.084758891121425e-05, "loss": 0.0272, "num_input_tokens_seen": 15192048, "step": 16995 }, { "epoch": 4.486736175267256, "grad_norm": 0.032668016850948334, "learning_rate": 3.083804332226963e-05, "loss": 0.0332, "num_input_tokens_seen": 15196432, "step": 17000 }, { "epoch": 4.486736175267256, "eval_loss": 0.06966648995876312, "eval_runtime": 64.7628, "eval_samples_per_second": 103.995, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 15196432, "step": 17000 }, { "epoch": 4.488055958822753, "grad_norm": 0.2274118810892105, "learning_rate": 3.082849683302536e-05, "loss": 0.0579, "num_input_tokens_seen": 15200880, "step": 17005 }, { "epoch": 4.48937574237825, "grad_norm": 0.03433378413319588, "learning_rate": 3.081894944495363e-05, "loss": 0.0952, "num_input_tokens_seen": 15205296, "step": 17010 }, { "epoch": 4.490695525933747, "grad_norm": 0.08687478303909302, "learning_rate": 3.080940115952677e-05, "loss": 0.1081, "num_input_tokens_seen": 15210032, "step": 17015 }, { "epoch": 4.492015309489243, "grad_norm": 0.14901818335056305, "learning_rate": 3.0799851978217245e-05, "loss": 0.0805, "num_input_tokens_seen": 15214384, "step": 17020 }, { "epoch": 4.493335093044741, "grad_norm": 0.25301143527030945, "learning_rate": 3.0790301902497666e-05, "loss": 0.0519, "num_input_tokens_seen": 15218992, "step": 17025 }, { "epoch": 4.494654876600237, "grad_norm": 0.09659424424171448, "learning_rate": 3.078075093384076e-05, "loss": 0.1262, "num_input_tokens_seen": 15223408, "step": 17030 }, { "epoch": 4.495974660155735, "grad_norm": 0.24253039062023163, "learning_rate": 3.077119907371942e-05, "loss": 0.0959, "num_input_tokens_seen": 15227664, "step": 17035 }, { "epoch": 4.497294443711231, "grad_norm": 0.10424157232046127, "learning_rate": 3.076164632360666e-05, "loss": 0.0446, "num_input_tokens_seen": 15232368, "step": 17040 }, { "epoch": 4.498614227266728, "grad_norm": 0.1829790621995926, "learning_rate": 3.075209268497563e-05, "loss": 0.0526, "num_input_tokens_seen": 15236496, "step": 17045 }, { "epoch": 4.499934010822225, "grad_norm": 0.13616782426834106, "learning_rate": 3.074253815929961e-05, "loss": 0.0312, "num_input_tokens_seen": 15240976, "step": 17050 }, { "epoch": 4.501253794377722, "grad_norm": 0.05685742199420929, "learning_rate": 3.0732982748052054e-05, "loss": 0.0372, "num_input_tokens_seen": 15245456, "step": 17055 }, { "epoch": 4.502573577933219, "grad_norm": 0.12395831942558289, "learning_rate": 3.072342645270651e-05, "loss": 0.0825, "num_input_tokens_seen": 15249872, "step": 17060 }, { "epoch": 4.503893361488716, "grad_norm": 0.2523971199989319, "learning_rate": 3.071386927473668e-05, "loss": 0.0635, "num_input_tokens_seen": 15254480, "step": 17065 }, { "epoch": 4.5052131450442126, "grad_norm": 0.12605805695056915, "learning_rate": 3.0704311215616404e-05, "loss": 0.028, "num_input_tokens_seen": 15258704, "step": 17070 }, { "epoch": 4.50653292859971, "grad_norm": 0.21279558539390564, "learning_rate": 3.0694752276819656e-05, "loss": 0.0457, "num_input_tokens_seen": 15263120, "step": 17075 }, { "epoch": 4.5078527121552066, "grad_norm": 0.09739082306623459, "learning_rate": 3.068519245982054e-05, "loss": 0.0437, "num_input_tokens_seen": 15267664, "step": 17080 }, { "epoch": 4.509172495710703, "grad_norm": 0.21431399881839752, "learning_rate": 3.0675631766093304e-05, "loss": 0.0471, "num_input_tokens_seen": 15272144, "step": 17085 }, { "epoch": 4.5104922792662006, "grad_norm": 0.06275000423192978, "learning_rate": 3.066607019711232e-05, "loss": 0.026, "num_input_tokens_seen": 15276816, "step": 17090 }, { "epoch": 4.511812062821697, "grad_norm": 0.2777114510536194, "learning_rate": 3.065650775435211e-05, "loss": 0.0401, "num_input_tokens_seen": 15281424, "step": 17095 }, { "epoch": 4.513131846377194, "grad_norm": 0.06710651516914368, "learning_rate": 3.0646944439287326e-05, "loss": 0.0463, "num_input_tokens_seen": 15286064, "step": 17100 }, { "epoch": 4.514451629932691, "grad_norm": 0.23893192410469055, "learning_rate": 3.0637380253392736e-05, "loss": 0.056, "num_input_tokens_seen": 15290736, "step": 17105 }, { "epoch": 4.515771413488188, "grad_norm": 0.1470186412334442, "learning_rate": 3.062781519814327e-05, "loss": 0.0893, "num_input_tokens_seen": 15294960, "step": 17110 }, { "epoch": 4.517091197043685, "grad_norm": 0.1950264424085617, "learning_rate": 3.0618249275013985e-05, "loss": 0.0896, "num_input_tokens_seen": 15299088, "step": 17115 }, { "epoch": 4.518410980599182, "grad_norm": 0.3128533363342285, "learning_rate": 3.060868248548005e-05, "loss": 0.0691, "num_input_tokens_seen": 15303760, "step": 17120 }, { "epoch": 4.519730764154678, "grad_norm": 0.20763787627220154, "learning_rate": 3.0599114831016796e-05, "loss": 0.0469, "num_input_tokens_seen": 15308208, "step": 17125 }, { "epoch": 4.521050547710176, "grad_norm": 0.10270621627569199, "learning_rate": 3.0589546313099666e-05, "loss": 0.0719, "num_input_tokens_seen": 15312496, "step": 17130 }, { "epoch": 4.522370331265672, "grad_norm": 0.10230451822280884, "learning_rate": 3.0579976933204255e-05, "loss": 0.095, "num_input_tokens_seen": 15317040, "step": 17135 }, { "epoch": 4.52369011482117, "grad_norm": 0.06859953701496124, "learning_rate": 3.0570406692806284e-05, "loss": 0.0375, "num_input_tokens_seen": 15321424, "step": 17140 }, { "epoch": 4.525009898376666, "grad_norm": 0.5696061849594116, "learning_rate": 3.05608355933816e-05, "loss": 0.0929, "num_input_tokens_seen": 15325808, "step": 17145 }, { "epoch": 4.526329681932163, "grad_norm": 0.07461816072463989, "learning_rate": 3.055126363640618e-05, "loss": 0.0575, "num_input_tokens_seen": 15329936, "step": 17150 }, { "epoch": 4.52764946548766, "grad_norm": 0.23385633528232574, "learning_rate": 3.0541690823356146e-05, "loss": 0.0574, "num_input_tokens_seen": 15334576, "step": 17155 }, { "epoch": 4.528969249043157, "grad_norm": 0.24719227850437164, "learning_rate": 3.053211715570775e-05, "loss": 0.0461, "num_input_tokens_seen": 15339024, "step": 17160 }, { "epoch": 4.530289032598654, "grad_norm": 0.15709394216537476, "learning_rate": 3.052254263493736e-05, "loss": 0.051, "num_input_tokens_seen": 15343280, "step": 17165 }, { "epoch": 4.531608816154151, "grad_norm": 0.26371175050735474, "learning_rate": 3.0512967262521498e-05, "loss": 0.0924, "num_input_tokens_seen": 15347792, "step": 17170 }, { "epoch": 4.532928599709647, "grad_norm": 0.08827108144760132, "learning_rate": 3.0503391039936803e-05, "loss": 0.0644, "num_input_tokens_seen": 15352176, "step": 17175 }, { "epoch": 4.534248383265145, "grad_norm": 0.14745686948299408, "learning_rate": 3.0493813968660056e-05, "loss": 0.0433, "num_input_tokens_seen": 15356528, "step": 17180 }, { "epoch": 4.535568166820641, "grad_norm": 0.13035918772220612, "learning_rate": 3.0484236050168153e-05, "loss": 0.0203, "num_input_tokens_seen": 15360880, "step": 17185 }, { "epoch": 4.536887950376139, "grad_norm": 0.23638279736042023, "learning_rate": 3.0474657285938123e-05, "loss": 0.0971, "num_input_tokens_seen": 15365104, "step": 17190 }, { "epoch": 4.538207733931635, "grad_norm": 0.24518848955631256, "learning_rate": 3.046507767744715e-05, "loss": 0.0427, "num_input_tokens_seen": 15369680, "step": 17195 }, { "epoch": 4.539527517487132, "grad_norm": 0.129789799451828, "learning_rate": 3.045549722617252e-05, "loss": 0.0381, "num_input_tokens_seen": 15374128, "step": 17200 }, { "epoch": 4.539527517487132, "eval_loss": 0.06896604597568512, "eval_runtime": 64.7282, "eval_samples_per_second": 104.05, "eval_steps_per_second": 26.016, "num_input_tokens_seen": 15374128, "step": 17200 }, { "epoch": 4.540847301042629, "grad_norm": 0.21089038252830505, "learning_rate": 3.0445915933591658e-05, "loss": 0.0896, "num_input_tokens_seen": 15378640, "step": 17205 }, { "epoch": 4.542167084598126, "grad_norm": 0.2550269663333893, "learning_rate": 3.0436333801182114e-05, "loss": 0.0232, "num_input_tokens_seen": 15383504, "step": 17210 }, { "epoch": 4.5434868681536225, "grad_norm": 0.27197185158729553, "learning_rate": 3.0426750830421596e-05, "loss": 0.1017, "num_input_tokens_seen": 15388272, "step": 17215 }, { "epoch": 4.54480665170912, "grad_norm": 0.23533427715301514, "learning_rate": 3.0417167022787897e-05, "loss": 0.059, "num_input_tokens_seen": 15392816, "step": 17220 }, { "epoch": 4.5461264352646165, "grad_norm": 0.15790709853172302, "learning_rate": 3.0407582379758966e-05, "loss": 0.0453, "num_input_tokens_seen": 15397296, "step": 17225 }, { "epoch": 4.547446218820113, "grad_norm": 0.25732430815696716, "learning_rate": 3.039799690281287e-05, "loss": 0.0694, "num_input_tokens_seen": 15402032, "step": 17230 }, { "epoch": 4.5487660023756105, "grad_norm": 0.6158699989318848, "learning_rate": 3.0388410593427823e-05, "loss": 0.08, "num_input_tokens_seen": 15406480, "step": 17235 }, { "epoch": 4.550085785931107, "grad_norm": 0.07004453986883163, "learning_rate": 3.0378823453082146e-05, "loss": 0.0155, "num_input_tokens_seen": 15410768, "step": 17240 }, { "epoch": 4.5514055694866045, "grad_norm": 0.4746168255805969, "learning_rate": 3.03692354832543e-05, "loss": 0.0572, "num_input_tokens_seen": 15415248, "step": 17245 }, { "epoch": 4.552725353042101, "grad_norm": 0.23729097843170166, "learning_rate": 3.0359646685422865e-05, "loss": 0.0728, "num_input_tokens_seen": 15419984, "step": 17250 }, { "epoch": 4.554045136597598, "grad_norm": 0.02623366378247738, "learning_rate": 3.035005706106656e-05, "loss": 0.0395, "num_input_tokens_seen": 15424208, "step": 17255 }, { "epoch": 4.555364920153095, "grad_norm": 0.25380733609199524, "learning_rate": 3.034046661166422e-05, "loss": 0.0562, "num_input_tokens_seen": 15428368, "step": 17260 }, { "epoch": 4.556684703708592, "grad_norm": 0.16102853417396545, "learning_rate": 3.033087533869482e-05, "loss": 0.0286, "num_input_tokens_seen": 15433040, "step": 17265 }, { "epoch": 4.558004487264089, "grad_norm": 0.1977294385433197, "learning_rate": 3.0321283243637444e-05, "loss": 0.039, "num_input_tokens_seen": 15437648, "step": 17270 }, { "epoch": 4.559324270819586, "grad_norm": 0.3743496537208557, "learning_rate": 3.0311690327971326e-05, "loss": 0.1182, "num_input_tokens_seen": 15442448, "step": 17275 }, { "epoch": 4.560644054375082, "grad_norm": 0.28324317932128906, "learning_rate": 3.030209659317581e-05, "loss": 0.0678, "num_input_tokens_seen": 15447024, "step": 17280 }, { "epoch": 4.56196383793058, "grad_norm": 0.08828887343406677, "learning_rate": 3.0292502040730362e-05, "loss": 0.0295, "num_input_tokens_seen": 15451472, "step": 17285 }, { "epoch": 4.563283621486076, "grad_norm": 0.3007761240005493, "learning_rate": 3.0282906672114597e-05, "loss": 0.1016, "num_input_tokens_seen": 15455824, "step": 17290 }, { "epoch": 4.564603405041574, "grad_norm": 0.2322336882352829, "learning_rate": 3.027331048880823e-05, "loss": 0.0881, "num_input_tokens_seen": 15460336, "step": 17295 }, { "epoch": 4.56592318859707, "grad_norm": 0.11608325690031052, "learning_rate": 3.0263713492291123e-05, "loss": 0.0297, "num_input_tokens_seen": 15464784, "step": 17300 }, { "epoch": 4.567242972152567, "grad_norm": 0.315477579832077, "learning_rate": 3.0254115684043242e-05, "loss": 0.0716, "num_input_tokens_seen": 15469200, "step": 17305 }, { "epoch": 4.568562755708064, "grad_norm": 0.4073043465614319, "learning_rate": 3.024451706554469e-05, "loss": 0.1258, "num_input_tokens_seen": 15473552, "step": 17310 }, { "epoch": 4.569882539263561, "grad_norm": 0.032944973558187485, "learning_rate": 3.0234917638275705e-05, "loss": 0.0714, "num_input_tokens_seen": 15478256, "step": 17315 }, { "epoch": 4.571202322819058, "grad_norm": 0.1115405410528183, "learning_rate": 3.0225317403716635e-05, "loss": 0.052, "num_input_tokens_seen": 15482448, "step": 17320 }, { "epoch": 4.572522106374555, "grad_norm": 0.2591233551502228, "learning_rate": 3.0215716363347956e-05, "loss": 0.0707, "num_input_tokens_seen": 15486896, "step": 17325 }, { "epoch": 4.573841889930051, "grad_norm": 0.15288959443569183, "learning_rate": 3.0206114518650275e-05, "loss": 0.0961, "num_input_tokens_seen": 15491472, "step": 17330 }, { "epoch": 4.575161673485549, "grad_norm": 0.18196724355220795, "learning_rate": 3.0196511871104304e-05, "loss": 0.0347, "num_input_tokens_seen": 15495696, "step": 17335 }, { "epoch": 4.576481457041045, "grad_norm": 0.3367064893245697, "learning_rate": 3.01869084221909e-05, "loss": 0.063, "num_input_tokens_seen": 15499920, "step": 17340 }, { "epoch": 4.577801240596542, "grad_norm": 0.2511852979660034, "learning_rate": 3.0177304173391037e-05, "loss": 0.0449, "num_input_tokens_seen": 15504272, "step": 17345 }, { "epoch": 4.579121024152039, "grad_norm": 0.374184787273407, "learning_rate": 3.01676991261858e-05, "loss": 0.0704, "num_input_tokens_seen": 15508624, "step": 17350 }, { "epoch": 4.580440807707536, "grad_norm": 0.20775195956230164, "learning_rate": 3.015809328205642e-05, "loss": 0.054, "num_input_tokens_seen": 15513040, "step": 17355 }, { "epoch": 4.581760591263032, "grad_norm": 0.23273865878582, "learning_rate": 3.0148486642484248e-05, "loss": 0.0575, "num_input_tokens_seen": 15517360, "step": 17360 }, { "epoch": 4.58308037481853, "grad_norm": 0.09257908910512924, "learning_rate": 3.0138879208950722e-05, "loss": 0.0176, "num_input_tokens_seen": 15522128, "step": 17365 }, { "epoch": 4.584400158374026, "grad_norm": 0.11899735033512115, "learning_rate": 3.012927098293744e-05, "loss": 0.0383, "num_input_tokens_seen": 15526608, "step": 17370 }, { "epoch": 4.585719941929524, "grad_norm": 0.23780351877212524, "learning_rate": 3.0119661965926123e-05, "loss": 0.0625, "num_input_tokens_seen": 15530928, "step": 17375 }, { "epoch": 4.58703972548502, "grad_norm": 0.156488835811615, "learning_rate": 3.0110052159398587e-05, "loss": 0.1041, "num_input_tokens_seen": 15535792, "step": 17380 }, { "epoch": 4.588359509040517, "grad_norm": 0.04205513000488281, "learning_rate": 3.0100441564836802e-05, "loss": 0.0774, "num_input_tokens_seen": 15540080, "step": 17385 }, { "epoch": 4.589679292596014, "grad_norm": 0.3483344614505768, "learning_rate": 3.0090830183722817e-05, "loss": 0.0981, "num_input_tokens_seen": 15544528, "step": 17390 }, { "epoch": 4.590999076151511, "grad_norm": 0.30300524830818176, "learning_rate": 3.0081218017538852e-05, "loss": 0.0938, "num_input_tokens_seen": 15549328, "step": 17395 }, { "epoch": 4.592318859707008, "grad_norm": 0.22110016644001007, "learning_rate": 3.0071605067767212e-05, "loss": 0.0947, "num_input_tokens_seen": 15553776, "step": 17400 }, { "epoch": 4.592318859707008, "eval_loss": 0.06884676963090897, "eval_runtime": 64.8367, "eval_samples_per_second": 103.876, "eval_steps_per_second": 25.973, "num_input_tokens_seen": 15553776, "step": 17400 }, { "epoch": 4.593638643262505, "grad_norm": 0.1262214630842209, "learning_rate": 3.006199133589034e-05, "loss": 0.0242, "num_input_tokens_seen": 15558384, "step": 17405 }, { "epoch": 4.5949584268180015, "grad_norm": 0.1509024202823639, "learning_rate": 3.005237682339079e-05, "loss": 0.0251, "num_input_tokens_seen": 15562736, "step": 17410 }, { "epoch": 4.596278210373499, "grad_norm": 0.33995139598846436, "learning_rate": 3.0042761531751228e-05, "loss": 0.059, "num_input_tokens_seen": 15567312, "step": 17415 }, { "epoch": 4.5975979939289955, "grad_norm": 0.40181964635849, "learning_rate": 3.0033145462454482e-05, "loss": 0.1055, "num_input_tokens_seen": 15571632, "step": 17420 }, { "epoch": 4.598917777484493, "grad_norm": 0.2430763840675354, "learning_rate": 3.002352861698345e-05, "loss": 0.0645, "num_input_tokens_seen": 15576048, "step": 17425 }, { "epoch": 4.6002375610399895, "grad_norm": 0.2577477693557739, "learning_rate": 3.0013910996821178e-05, "loss": 0.0555, "num_input_tokens_seen": 15581104, "step": 17430 }, { "epoch": 4.601557344595486, "grad_norm": 0.0804043859243393, "learning_rate": 3.0004292603450817e-05, "loss": 0.0373, "num_input_tokens_seen": 15585456, "step": 17435 }, { "epoch": 4.6028771281509835, "grad_norm": 0.26149576902389526, "learning_rate": 2.9994673438355653e-05, "loss": 0.0514, "num_input_tokens_seen": 15589776, "step": 17440 }, { "epoch": 4.60419691170648, "grad_norm": 0.33292320370674133, "learning_rate": 2.9985053503019078e-05, "loss": 0.0578, "num_input_tokens_seen": 15594320, "step": 17445 }, { "epoch": 4.6055166952619775, "grad_norm": 0.06740719825029373, "learning_rate": 2.99754327989246e-05, "loss": 0.0495, "num_input_tokens_seen": 15598576, "step": 17450 }, { "epoch": 4.606836478817474, "grad_norm": 0.3843245804309845, "learning_rate": 2.9965811327555864e-05, "loss": 0.0742, "num_input_tokens_seen": 15602928, "step": 17455 }, { "epoch": 4.608156262372971, "grad_norm": 0.42473652958869934, "learning_rate": 2.995618909039662e-05, "loss": 0.0838, "num_input_tokens_seen": 15607504, "step": 17460 }, { "epoch": 4.609476045928468, "grad_norm": 0.23682084679603577, "learning_rate": 2.9946566088930727e-05, "loss": 0.0493, "num_input_tokens_seen": 15612080, "step": 17465 }, { "epoch": 4.610795829483965, "grad_norm": 0.062377359718084335, "learning_rate": 2.9936942324642192e-05, "loss": 0.0186, "num_input_tokens_seen": 15616496, "step": 17470 }, { "epoch": 4.612115613039461, "grad_norm": 0.2805708348751068, "learning_rate": 2.9927317799015097e-05, "loss": 0.0475, "num_input_tokens_seen": 15620848, "step": 17475 }, { "epoch": 4.613435396594959, "grad_norm": 0.3321366608142853, "learning_rate": 2.9917692513533685e-05, "loss": 0.0987, "num_input_tokens_seen": 15625168, "step": 17480 }, { "epoch": 4.614755180150455, "grad_norm": 0.1830660104751587, "learning_rate": 2.990806646968229e-05, "loss": 0.0628, "num_input_tokens_seen": 15629680, "step": 17485 }, { "epoch": 4.616074963705953, "grad_norm": 0.4089912176132202, "learning_rate": 2.989843966894536e-05, "loss": 0.0657, "num_input_tokens_seen": 15634224, "step": 17490 }, { "epoch": 4.617394747261449, "grad_norm": 0.23219004273414612, "learning_rate": 2.9888812112807472e-05, "loss": 0.0908, "num_input_tokens_seen": 15638608, "step": 17495 }, { "epoch": 4.618714530816946, "grad_norm": 0.17908340692520142, "learning_rate": 2.987918380275333e-05, "loss": 0.0307, "num_input_tokens_seen": 15643280, "step": 17500 }, { "epoch": 4.620034314372443, "grad_norm": 0.13531553745269775, "learning_rate": 2.9869554740267724e-05, "loss": 0.0588, "num_input_tokens_seen": 15647536, "step": 17505 }, { "epoch": 4.62135409792794, "grad_norm": 0.08422496914863586, "learning_rate": 2.9859924926835585e-05, "loss": 0.0418, "num_input_tokens_seen": 15652080, "step": 17510 }, { "epoch": 4.622673881483436, "grad_norm": 0.08091544359922409, "learning_rate": 2.9850294363941944e-05, "loss": 0.0301, "num_input_tokens_seen": 15656848, "step": 17515 }, { "epoch": 4.623993665038934, "grad_norm": 0.10226783901453018, "learning_rate": 2.9840663053071967e-05, "loss": 0.0709, "num_input_tokens_seen": 15661584, "step": 17520 }, { "epoch": 4.62531344859443, "grad_norm": 0.24376888573169708, "learning_rate": 2.983103099571091e-05, "loss": 0.0922, "num_input_tokens_seen": 15665872, "step": 17525 }, { "epoch": 4.626633232149928, "grad_norm": 0.1510409116744995, "learning_rate": 2.9821398193344164e-05, "loss": 0.0729, "num_input_tokens_seen": 15670544, "step": 17530 }, { "epoch": 4.627953015705424, "grad_norm": 0.10583268851041794, "learning_rate": 2.9811764647457226e-05, "loss": 0.0402, "num_input_tokens_seen": 15675088, "step": 17535 }, { "epoch": 4.629272799260921, "grad_norm": 0.1281079202890396, "learning_rate": 2.9802130359535714e-05, "loss": 0.0328, "num_input_tokens_seen": 15679504, "step": 17540 }, { "epoch": 4.630592582816418, "grad_norm": 0.22123976051807404, "learning_rate": 2.979249533106535e-05, "loss": 0.0716, "num_input_tokens_seen": 15684112, "step": 17545 }, { "epoch": 4.631912366371915, "grad_norm": 0.19087405502796173, "learning_rate": 2.9782859563531986e-05, "loss": 0.0177, "num_input_tokens_seen": 15688816, "step": 17550 }, { "epoch": 4.633232149927412, "grad_norm": 0.09876422584056854, "learning_rate": 2.977322305842156e-05, "loss": 0.1122, "num_input_tokens_seen": 15693264, "step": 17555 }, { "epoch": 4.634551933482909, "grad_norm": 0.10813527554273605, "learning_rate": 2.9763585817220162e-05, "loss": 0.0452, "num_input_tokens_seen": 15697392, "step": 17560 }, { "epoch": 4.635871717038405, "grad_norm": 0.15994390845298767, "learning_rate": 2.975394784141397e-05, "loss": 0.0489, "num_input_tokens_seen": 15701840, "step": 17565 }, { "epoch": 4.637191500593903, "grad_norm": 0.048398297280073166, "learning_rate": 2.974430913248928e-05, "loss": 0.0207, "num_input_tokens_seen": 15706384, "step": 17570 }, { "epoch": 4.638511284149399, "grad_norm": 0.3100918233394623, "learning_rate": 2.9734669691932497e-05, "loss": 0.0286, "num_input_tokens_seen": 15710864, "step": 17575 }, { "epoch": 4.639831067704897, "grad_norm": 0.4119435250759125, "learning_rate": 2.9725029521230147e-05, "loss": 0.0691, "num_input_tokens_seen": 15715472, "step": 17580 }, { "epoch": 4.641150851260393, "grad_norm": 0.590843141078949, "learning_rate": 2.9715388621868873e-05, "loss": 0.0861, "num_input_tokens_seen": 15720016, "step": 17585 }, { "epoch": 4.64247063481589, "grad_norm": 0.34467923641204834, "learning_rate": 2.970574699533541e-05, "loss": 0.0686, "num_input_tokens_seen": 15724560, "step": 17590 }, { "epoch": 4.643790418371387, "grad_norm": 0.12566322088241577, "learning_rate": 2.969610464311662e-05, "loss": 0.0577, "num_input_tokens_seen": 15729008, "step": 17595 }, { "epoch": 4.645110201926884, "grad_norm": 0.23434963822364807, "learning_rate": 2.9686461566699487e-05, "loss": 0.0702, "num_input_tokens_seen": 15733520, "step": 17600 }, { "epoch": 4.645110201926884, "eval_loss": 0.06890065968036652, "eval_runtime": 64.7883, "eval_samples_per_second": 103.954, "eval_steps_per_second": 25.992, "num_input_tokens_seen": 15733520, "step": 17600 }, { "epoch": 4.6464299854823805, "grad_norm": 0.47985199093818665, "learning_rate": 2.9676817767571086e-05, "loss": 0.1122, "num_input_tokens_seen": 15737808, "step": 17605 }, { "epoch": 4.647749769037878, "grad_norm": 0.3714390695095062, "learning_rate": 2.966717324721861e-05, "loss": 0.08, "num_input_tokens_seen": 15742320, "step": 17610 }, { "epoch": 4.6490695525933745, "grad_norm": 0.05435813218355179, "learning_rate": 2.9657528007129366e-05, "loss": 0.0558, "num_input_tokens_seen": 15746928, "step": 17615 }, { "epoch": 4.650389336148872, "grad_norm": 0.29634612798690796, "learning_rate": 2.9647882048790777e-05, "loss": 0.0515, "num_input_tokens_seen": 15751792, "step": 17620 }, { "epoch": 4.6517091197043685, "grad_norm": 0.3147405982017517, "learning_rate": 2.963823537369037e-05, "loss": 0.0716, "num_input_tokens_seen": 15756240, "step": 17625 }, { "epoch": 4.653028903259865, "grad_norm": 0.11478442698717117, "learning_rate": 2.9628587983315775e-05, "loss": 0.1111, "num_input_tokens_seen": 15760656, "step": 17630 }, { "epoch": 4.6543486868153625, "grad_norm": 0.13129062950611115, "learning_rate": 2.9618939879154746e-05, "loss": 0.0189, "num_input_tokens_seen": 15765040, "step": 17635 }, { "epoch": 4.655668470370859, "grad_norm": 0.3957226276397705, "learning_rate": 2.9609291062695143e-05, "loss": 0.0454, "num_input_tokens_seen": 15769264, "step": 17640 }, { "epoch": 4.656988253926356, "grad_norm": 0.13761353492736816, "learning_rate": 2.9599641535424938e-05, "loss": 0.0447, "num_input_tokens_seen": 15773520, "step": 17645 }, { "epoch": 4.658308037481853, "grad_norm": 0.05479968339204788, "learning_rate": 2.9589991298832202e-05, "loss": 0.0452, "num_input_tokens_seen": 15777872, "step": 17650 }, { "epoch": 4.65962782103735, "grad_norm": 0.2791734039783478, "learning_rate": 2.958034035440513e-05, "loss": 0.0521, "num_input_tokens_seen": 15782608, "step": 17655 }, { "epoch": 4.660947604592847, "grad_norm": 0.08973439782857895, "learning_rate": 2.957068870363201e-05, "loss": 0.0711, "num_input_tokens_seen": 15786896, "step": 17660 }, { "epoch": 4.662267388148344, "grad_norm": 0.05793585255742073, "learning_rate": 2.956103634800126e-05, "loss": 0.0641, "num_input_tokens_seen": 15791184, "step": 17665 }, { "epoch": 4.66358717170384, "grad_norm": 0.4634318947792053, "learning_rate": 2.9551383289001384e-05, "loss": 0.1204, "num_input_tokens_seen": 15795440, "step": 17670 }, { "epoch": 4.664906955259338, "grad_norm": 0.2288123518228531, "learning_rate": 2.9541729528121005e-05, "loss": 0.0712, "num_input_tokens_seen": 15799824, "step": 17675 }, { "epoch": 4.666226738814834, "grad_norm": 0.17236408591270447, "learning_rate": 2.9532075066848856e-05, "loss": 0.0575, "num_input_tokens_seen": 15804240, "step": 17680 }, { "epoch": 4.667546522370332, "grad_norm": 0.31086984276771545, "learning_rate": 2.9522419906673786e-05, "loss": 0.1438, "num_input_tokens_seen": 15808720, "step": 17685 }, { "epoch": 4.668866305925828, "grad_norm": 0.08635164797306061, "learning_rate": 2.951276404908474e-05, "loss": 0.0267, "num_input_tokens_seen": 15813200, "step": 17690 }, { "epoch": 4.670186089481325, "grad_norm": 0.10308081656694412, "learning_rate": 2.9503107495570752e-05, "loss": 0.0914, "num_input_tokens_seen": 15817648, "step": 17695 }, { "epoch": 4.671505873036822, "grad_norm": 0.06466356664896011, "learning_rate": 2.9493450247621003e-05, "loss": 0.0718, "num_input_tokens_seen": 15822032, "step": 17700 }, { "epoch": 4.672825656592319, "grad_norm": 0.2598475515842438, "learning_rate": 2.948379230672476e-05, "loss": 0.062, "num_input_tokens_seen": 15826544, "step": 17705 }, { "epoch": 4.674145440147816, "grad_norm": 0.07291156053543091, "learning_rate": 2.9474133674371396e-05, "loss": 0.0864, "num_input_tokens_seen": 15831280, "step": 17710 }, { "epoch": 4.675465223703313, "grad_norm": 0.27887532114982605, "learning_rate": 2.9464474352050387e-05, "loss": 0.0767, "num_input_tokens_seen": 15835824, "step": 17715 }, { "epoch": 4.676785007258809, "grad_norm": 0.0346548855304718, "learning_rate": 2.9454814341251336e-05, "loss": 0.0403, "num_input_tokens_seen": 15840496, "step": 17720 }, { "epoch": 4.678104790814307, "grad_norm": 0.42805132269859314, "learning_rate": 2.9445153643463942e-05, "loss": 0.0543, "num_input_tokens_seen": 15845104, "step": 17725 }, { "epoch": 4.679424574369803, "grad_norm": 0.1374506801366806, "learning_rate": 2.943549226017798e-05, "loss": 0.078, "num_input_tokens_seen": 15849456, "step": 17730 }, { "epoch": 4.680744357925301, "grad_norm": 0.027222875505685806, "learning_rate": 2.942583019288337e-05, "loss": 0.0517, "num_input_tokens_seen": 15853744, "step": 17735 }, { "epoch": 4.682064141480797, "grad_norm": 0.19105306267738342, "learning_rate": 2.9416167443070132e-05, "loss": 0.0389, "num_input_tokens_seen": 15858128, "step": 17740 }, { "epoch": 4.683383925036294, "grad_norm": 0.0392376072704792, "learning_rate": 2.9406504012228375e-05, "loss": 0.0299, "num_input_tokens_seen": 15862608, "step": 17745 }, { "epoch": 4.684703708591791, "grad_norm": 0.40456998348236084, "learning_rate": 2.939683990184832e-05, "loss": 0.0402, "num_input_tokens_seen": 15866928, "step": 17750 }, { "epoch": 4.686023492147288, "grad_norm": 0.1575268805027008, "learning_rate": 2.93871751134203e-05, "loss": 0.0629, "num_input_tokens_seen": 15871184, "step": 17755 }, { "epoch": 4.687343275702784, "grad_norm": 0.14288605749607086, "learning_rate": 2.9377509648434752e-05, "loss": 0.038, "num_input_tokens_seen": 15875408, "step": 17760 }, { "epoch": 4.688663059258282, "grad_norm": 0.08281012624502182, "learning_rate": 2.9367843508382203e-05, "loss": 0.067, "num_input_tokens_seen": 15879728, "step": 17765 }, { "epoch": 4.689982842813778, "grad_norm": 0.024431154131889343, "learning_rate": 2.9358176694753293e-05, "loss": 0.0554, "num_input_tokens_seen": 15884528, "step": 17770 }, { "epoch": 4.691302626369275, "grad_norm": 0.23028463125228882, "learning_rate": 2.9348509209038766e-05, "loss": 0.0606, "num_input_tokens_seen": 15888944, "step": 17775 }, { "epoch": 4.692622409924772, "grad_norm": 0.08968193829059601, "learning_rate": 2.933884105272947e-05, "loss": 0.0133, "num_input_tokens_seen": 15893840, "step": 17780 }, { "epoch": 4.693942193480269, "grad_norm": 0.4847165644168854, "learning_rate": 2.9329172227316366e-05, "loss": 0.0556, "num_input_tokens_seen": 15898224, "step": 17785 }, { "epoch": 4.695261977035766, "grad_norm": 0.3025650680065155, "learning_rate": 2.93195027342905e-05, "loss": 0.0733, "num_input_tokens_seen": 15902832, "step": 17790 }, { "epoch": 4.696581760591263, "grad_norm": 0.22033993899822235, "learning_rate": 2.9309832575143024e-05, "loss": 0.0943, "num_input_tokens_seen": 15907280, "step": 17795 }, { "epoch": 4.6979015441467595, "grad_norm": 0.28822341561317444, "learning_rate": 2.930016175136521e-05, "loss": 0.0904, "num_input_tokens_seen": 15911728, "step": 17800 }, { "epoch": 4.6979015441467595, "eval_loss": 0.06856284290552139, "eval_runtime": 64.781, "eval_samples_per_second": 103.966, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 15911728, "step": 17800 }, { "epoch": 4.699221327702257, "grad_norm": 0.30911681056022644, "learning_rate": 2.9290490264448412e-05, "loss": 0.1562, "num_input_tokens_seen": 15916208, "step": 17805 }, { "epoch": 4.7005411112577535, "grad_norm": 0.1361420899629593, "learning_rate": 2.9280818115884094e-05, "loss": 0.0576, "num_input_tokens_seen": 15920496, "step": 17810 }, { "epoch": 4.701860894813251, "grad_norm": 0.04205937311053276, "learning_rate": 2.9271145307163828e-05, "loss": 0.1267, "num_input_tokens_seen": 15924880, "step": 17815 }, { "epoch": 4.7031806783687475, "grad_norm": 0.0512765608727932, "learning_rate": 2.9261471839779287e-05, "loss": 0.0633, "num_input_tokens_seen": 15929744, "step": 17820 }, { "epoch": 4.704500461924244, "grad_norm": 0.18293075263500214, "learning_rate": 2.925179771522223e-05, "loss": 0.0366, "num_input_tokens_seen": 15934352, "step": 17825 }, { "epoch": 4.7058202454797415, "grad_norm": 0.23262397944927216, "learning_rate": 2.9242122934984535e-05, "loss": 0.0336, "num_input_tokens_seen": 15938640, "step": 17830 }, { "epoch": 4.707140029035238, "grad_norm": 0.2571221590042114, "learning_rate": 2.9232447500558176e-05, "loss": 0.0734, "num_input_tokens_seen": 15943152, "step": 17835 }, { "epoch": 4.7084598125907355, "grad_norm": 0.05716380104422569, "learning_rate": 2.9222771413435225e-05, "loss": 0.0194, "num_input_tokens_seen": 15947408, "step": 17840 }, { "epoch": 4.709779596146232, "grad_norm": 0.09259232133626938, "learning_rate": 2.9213094675107848e-05, "loss": 0.0652, "num_input_tokens_seen": 15951664, "step": 17845 }, { "epoch": 4.711099379701729, "grad_norm": 0.03979388624429703, "learning_rate": 2.9203417287068335e-05, "loss": 0.014, "num_input_tokens_seen": 15955824, "step": 17850 }, { "epoch": 4.712419163257226, "grad_norm": 0.061700619757175446, "learning_rate": 2.9193739250809042e-05, "loss": 0.064, "num_input_tokens_seen": 15959856, "step": 17855 }, { "epoch": 4.713738946812723, "grad_norm": 0.16454225778579712, "learning_rate": 2.9184060567822463e-05, "loss": 0.0422, "num_input_tokens_seen": 15964560, "step": 17860 }, { "epoch": 4.71505873036822, "grad_norm": 0.25724831223487854, "learning_rate": 2.9174381239601166e-05, "loss": 0.0404, "num_input_tokens_seen": 15969040, "step": 17865 }, { "epoch": 4.716378513923717, "grad_norm": 0.24976512789726257, "learning_rate": 2.916470126763783e-05, "loss": 0.0716, "num_input_tokens_seen": 15973328, "step": 17870 }, { "epoch": 4.717698297479213, "grad_norm": 0.26603972911834717, "learning_rate": 2.9155020653425203e-05, "loss": 0.045, "num_input_tokens_seen": 15978000, "step": 17875 }, { "epoch": 4.719018081034711, "grad_norm": 0.23804716765880585, "learning_rate": 2.9145339398456184e-05, "loss": 0.0567, "num_input_tokens_seen": 15982320, "step": 17880 }, { "epoch": 4.720337864590207, "grad_norm": 0.08322809636592865, "learning_rate": 2.913565750422374e-05, "loss": 0.0363, "num_input_tokens_seen": 15987216, "step": 17885 }, { "epoch": 4.721657648145704, "grad_norm": 0.21547013521194458, "learning_rate": 2.9125974972220938e-05, "loss": 0.0594, "num_input_tokens_seen": 15991600, "step": 17890 }, { "epoch": 4.722977431701201, "grad_norm": 0.2432098239660263, "learning_rate": 2.9116291803940932e-05, "loss": 0.0559, "num_input_tokens_seen": 15995984, "step": 17895 }, { "epoch": 4.724297215256698, "grad_norm": 0.03919065743684769, "learning_rate": 2.910660800087701e-05, "loss": 0.0234, "num_input_tokens_seen": 16000688, "step": 17900 }, { "epoch": 4.725616998812194, "grad_norm": 0.11776836961507797, "learning_rate": 2.909692356452254e-05, "loss": 0.0568, "num_input_tokens_seen": 16005136, "step": 17905 }, { "epoch": 4.726936782367692, "grad_norm": 0.18365944921970367, "learning_rate": 2.9087238496370962e-05, "loss": 0.0985, "num_input_tokens_seen": 16009392, "step": 17910 }, { "epoch": 4.728256565923188, "grad_norm": 0.5774285197257996, "learning_rate": 2.907755279791583e-05, "loss": 0.0994, "num_input_tokens_seen": 16013648, "step": 17915 }, { "epoch": 4.729576349478686, "grad_norm": 0.09283986687660217, "learning_rate": 2.906786647065083e-05, "loss": 0.053, "num_input_tokens_seen": 16018192, "step": 17920 }, { "epoch": 4.730896133034182, "grad_norm": 0.01985105499625206, "learning_rate": 2.9058179516069695e-05, "loss": 0.0188, "num_input_tokens_seen": 16023088, "step": 17925 }, { "epoch": 4.732215916589679, "grad_norm": 0.6372429728507996, "learning_rate": 2.9048491935666282e-05, "loss": 0.0861, "num_input_tokens_seen": 16027536, "step": 17930 }, { "epoch": 4.733535700145176, "grad_norm": 0.26704344153404236, "learning_rate": 2.9038803730934534e-05, "loss": 0.0622, "num_input_tokens_seen": 16032208, "step": 17935 }, { "epoch": 4.734855483700673, "grad_norm": 0.20905055105686188, "learning_rate": 2.9029114903368503e-05, "loss": 0.0531, "num_input_tokens_seen": 16036976, "step": 17940 }, { "epoch": 4.73617526725617, "grad_norm": 0.27750009298324585, "learning_rate": 2.9019425454462318e-05, "loss": 0.0598, "num_input_tokens_seen": 16041392, "step": 17945 }, { "epoch": 4.737495050811667, "grad_norm": 0.18839259445667267, "learning_rate": 2.9009735385710212e-05, "loss": 0.0615, "num_input_tokens_seen": 16046032, "step": 17950 }, { "epoch": 4.738814834367163, "grad_norm": 0.26710817217826843, "learning_rate": 2.900004469860652e-05, "loss": 0.0612, "num_input_tokens_seen": 16050672, "step": 17955 }, { "epoch": 4.740134617922661, "grad_norm": 0.20179186761379242, "learning_rate": 2.8990353394645668e-05, "loss": 0.0833, "num_input_tokens_seen": 16055216, "step": 17960 }, { "epoch": 4.741454401478157, "grad_norm": 0.2130323350429535, "learning_rate": 2.8980661475322186e-05, "loss": 0.0424, "num_input_tokens_seen": 16059664, "step": 17965 }, { "epoch": 4.742774185033655, "grad_norm": 0.15826711058616638, "learning_rate": 2.897096894213067e-05, "loss": 0.0644, "num_input_tokens_seen": 16064464, "step": 17970 }, { "epoch": 4.7440939685891514, "grad_norm": 0.34615734219551086, "learning_rate": 2.8961275796565845e-05, "loss": 0.0418, "num_input_tokens_seen": 16069072, "step": 17975 }, { "epoch": 4.745413752144648, "grad_norm": 0.03634057939052582, "learning_rate": 2.8951582040122517e-05, "loss": 0.033, "num_input_tokens_seen": 16073744, "step": 17980 }, { "epoch": 4.7467335357001454, "grad_norm": 0.2439921349287033, "learning_rate": 2.894188767429557e-05, "loss": 0.0966, "num_input_tokens_seen": 16078384, "step": 17985 }, { "epoch": 4.748053319255642, "grad_norm": 0.030153362080454826, "learning_rate": 2.8932192700580014e-05, "loss": 0.059, "num_input_tokens_seen": 16082832, "step": 17990 }, { "epoch": 4.7493731028111394, "grad_norm": 0.1239658072590828, "learning_rate": 2.8922497120470916e-05, "loss": 0.0287, "num_input_tokens_seen": 16087280, "step": 17995 }, { "epoch": 4.750692886366636, "grad_norm": 0.13337016105651855, "learning_rate": 2.891280093546348e-05, "loss": 0.0531, "num_input_tokens_seen": 16091728, "step": 18000 }, { "epoch": 4.750692886366636, "eval_loss": 0.06902699917554855, "eval_runtime": 64.7828, "eval_samples_per_second": 103.963, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 16091728, "step": 18000 }, { "epoch": 4.752012669922133, "grad_norm": 0.08740201592445374, "learning_rate": 2.890310414705297e-05, "loss": 0.083, "num_input_tokens_seen": 16096112, "step": 18005 }, { "epoch": 4.75333245347763, "grad_norm": 0.10230229794979095, "learning_rate": 2.8893406756734742e-05, "loss": 0.0601, "num_input_tokens_seen": 16100272, "step": 18010 }, { "epoch": 4.754652237033127, "grad_norm": 0.23941786587238312, "learning_rate": 2.888370876600427e-05, "loss": 0.0389, "num_input_tokens_seen": 16104592, "step": 18015 }, { "epoch": 4.755972020588623, "grad_norm": 0.09854938089847565, "learning_rate": 2.8874010176357104e-05, "loss": 0.0967, "num_input_tokens_seen": 16108688, "step": 18020 }, { "epoch": 4.757291804144121, "grad_norm": 0.12685872614383698, "learning_rate": 2.886431098928888e-05, "loss": 0.0681, "num_input_tokens_seen": 16113584, "step": 18025 }, { "epoch": 4.758611587699617, "grad_norm": 0.16588932275772095, "learning_rate": 2.885461120629534e-05, "loss": 0.0876, "num_input_tokens_seen": 16118000, "step": 18030 }, { "epoch": 4.759931371255114, "grad_norm": 0.13890865445137024, "learning_rate": 2.8844910828872317e-05, "loss": 0.0256, "num_input_tokens_seen": 16122608, "step": 18035 }, { "epoch": 4.761251154810611, "grad_norm": 0.042833294719457626, "learning_rate": 2.8835209858515715e-05, "loss": 0.1106, "num_input_tokens_seen": 16127088, "step": 18040 }, { "epoch": 4.762570938366108, "grad_norm": 0.04848520830273628, "learning_rate": 2.8825508296721566e-05, "loss": 0.063, "num_input_tokens_seen": 16131440, "step": 18045 }, { "epoch": 4.763890721921605, "grad_norm": 0.07415361702442169, "learning_rate": 2.881580614498596e-05, "loss": 0.0518, "num_input_tokens_seen": 16135824, "step": 18050 }, { "epoch": 4.765210505477102, "grad_norm": 0.3715346157550812, "learning_rate": 2.8806103404805103e-05, "loss": 0.0832, "num_input_tokens_seen": 16140240, "step": 18055 }, { "epoch": 4.766530289032598, "grad_norm": 0.15694566071033478, "learning_rate": 2.8796400077675257e-05, "loss": 0.0537, "num_input_tokens_seen": 16144336, "step": 18060 }, { "epoch": 4.767850072588096, "grad_norm": 0.1628815233707428, "learning_rate": 2.8786696165092812e-05, "loss": 0.0623, "num_input_tokens_seen": 16149072, "step": 18065 }, { "epoch": 4.769169856143592, "grad_norm": 0.46819257736206055, "learning_rate": 2.8776991668554236e-05, "loss": 0.0831, "num_input_tokens_seen": 16153616, "step": 18070 }, { "epoch": 4.77048963969909, "grad_norm": 0.2627697288990021, "learning_rate": 2.876728658955608e-05, "loss": 0.0892, "num_input_tokens_seen": 16157744, "step": 18075 }, { "epoch": 4.771809423254586, "grad_norm": 0.25993549823760986, "learning_rate": 2.8757580929594986e-05, "loss": 0.0409, "num_input_tokens_seen": 16162320, "step": 18080 }, { "epoch": 4.773129206810083, "grad_norm": 0.1118636205792427, "learning_rate": 2.87478746901677e-05, "loss": 0.0548, "num_input_tokens_seen": 16166480, "step": 18085 }, { "epoch": 4.77444899036558, "grad_norm": 0.16295285522937775, "learning_rate": 2.873816787277103e-05, "loss": 0.0829, "num_input_tokens_seen": 16170544, "step": 18090 }, { "epoch": 4.775768773921077, "grad_norm": 0.3596700131893158, "learning_rate": 2.8728460478901903e-05, "loss": 0.0572, "num_input_tokens_seen": 16174672, "step": 18095 }, { "epoch": 4.777088557476574, "grad_norm": 0.32202497124671936, "learning_rate": 2.8718752510057307e-05, "loss": 0.0949, "num_input_tokens_seen": 16179216, "step": 18100 }, { "epoch": 4.778408341032071, "grad_norm": 0.1442602425813675, "learning_rate": 2.870904396773435e-05, "loss": 0.0256, "num_input_tokens_seen": 16183632, "step": 18105 }, { "epoch": 4.779728124587567, "grad_norm": 0.2890265882015228, "learning_rate": 2.86993348534302e-05, "loss": 0.0851, "num_input_tokens_seen": 16188304, "step": 18110 }, { "epoch": 4.781047908143065, "grad_norm": 0.034286852926015854, "learning_rate": 2.868962516864212e-05, "loss": 0.0135, "num_input_tokens_seen": 16192848, "step": 18115 }, { "epoch": 4.782367691698561, "grad_norm": 0.21903228759765625, "learning_rate": 2.8679914914867477e-05, "loss": 0.0349, "num_input_tokens_seen": 16197264, "step": 18120 }, { "epoch": 4.783687475254059, "grad_norm": 0.1597234606742859, "learning_rate": 2.8670204093603713e-05, "loss": 0.0591, "num_input_tokens_seen": 16201968, "step": 18125 }, { "epoch": 4.785007258809555, "grad_norm": 0.44640466570854187, "learning_rate": 2.8660492706348357e-05, "loss": 0.0416, "num_input_tokens_seen": 16206320, "step": 18130 }, { "epoch": 4.786327042365052, "grad_norm": 0.5237805843353271, "learning_rate": 2.8650780754599022e-05, "loss": 0.0775, "num_input_tokens_seen": 16210736, "step": 18135 }, { "epoch": 4.787646825920549, "grad_norm": 0.28884586691856384, "learning_rate": 2.8641068239853407e-05, "loss": 0.0551, "num_input_tokens_seen": 16215280, "step": 18140 }, { "epoch": 4.788966609476046, "grad_norm": 0.2676484286785126, "learning_rate": 2.863135516360932e-05, "loss": 0.063, "num_input_tokens_seen": 16219792, "step": 18145 }, { "epoch": 4.7902863930315425, "grad_norm": 0.21679319441318512, "learning_rate": 2.8621641527364633e-05, "loss": 0.1207, "num_input_tokens_seen": 16224304, "step": 18150 }, { "epoch": 4.79160617658704, "grad_norm": 0.08065123111009598, "learning_rate": 2.8611927332617313e-05, "loss": 0.0855, "num_input_tokens_seen": 16228720, "step": 18155 }, { "epoch": 4.7929259601425365, "grad_norm": 0.03950365632772446, "learning_rate": 2.8602212580865405e-05, "loss": 0.0677, "num_input_tokens_seen": 16233200, "step": 18160 }, { "epoch": 4.794245743698033, "grad_norm": 0.10011640191078186, "learning_rate": 2.859249727360705e-05, "loss": 0.0282, "num_input_tokens_seen": 16237648, "step": 18165 }, { "epoch": 4.7955655272535305, "grad_norm": 0.37731754779815674, "learning_rate": 2.8582781412340465e-05, "loss": 0.0525, "num_input_tokens_seen": 16242064, "step": 18170 }, { "epoch": 4.796885310809027, "grad_norm": 0.28352510929107666, "learning_rate": 2.857306499856397e-05, "loss": 0.0543, "num_input_tokens_seen": 16246256, "step": 18175 }, { "epoch": 4.7982050943645245, "grad_norm": 0.2848483622074127, "learning_rate": 2.856334803377594e-05, "loss": 0.0528, "num_input_tokens_seen": 16250544, "step": 18180 }, { "epoch": 4.799524877920021, "grad_norm": 0.2371665984392166, "learning_rate": 2.8553630519474867e-05, "loss": 0.0555, "num_input_tokens_seen": 16254864, "step": 18185 }, { "epoch": 4.800844661475518, "grad_norm": 0.06103622540831566, "learning_rate": 2.8543912457159317e-05, "loss": 0.0628, "num_input_tokens_seen": 16259376, "step": 18190 }, { "epoch": 4.802164445031015, "grad_norm": 0.3026491701602936, "learning_rate": 2.853419384832792e-05, "loss": 0.0362, "num_input_tokens_seen": 16263792, "step": 18195 }, { "epoch": 4.803484228586512, "grad_norm": 0.7308284044265747, "learning_rate": 2.8524474694479423e-05, "loss": 0.0737, "num_input_tokens_seen": 16268208, "step": 18200 }, { "epoch": 4.803484228586512, "eval_loss": 0.06847847253084183, "eval_runtime": 64.7227, "eval_samples_per_second": 104.059, "eval_steps_per_second": 26.019, "num_input_tokens_seen": 16268208, "step": 18200 }, { "epoch": 4.804804012142009, "grad_norm": 0.23003126680850983, "learning_rate": 2.851475499711264e-05, "loss": 0.0401, "num_input_tokens_seen": 16272624, "step": 18205 }, { "epoch": 4.806123795697506, "grad_norm": 0.19622860848903656, "learning_rate": 2.8505034757726468e-05, "loss": 0.0325, "num_input_tokens_seen": 16277104, "step": 18210 }, { "epoch": 4.807443579253002, "grad_norm": 0.15494847297668457, "learning_rate": 2.8495313977819886e-05, "loss": 0.0253, "num_input_tokens_seen": 16281424, "step": 18215 }, { "epoch": 4.8087633628085, "grad_norm": 0.29320693016052246, "learning_rate": 2.8485592658891956e-05, "loss": 0.0747, "num_input_tokens_seen": 16285968, "step": 18220 }, { "epoch": 4.810083146363996, "grad_norm": 0.2857033312320709, "learning_rate": 2.8475870802441844e-05, "loss": 0.0961, "num_input_tokens_seen": 16290512, "step": 18225 }, { "epoch": 4.811402929919494, "grad_norm": 0.2897636890411377, "learning_rate": 2.8466148409968774e-05, "loss": 0.0373, "num_input_tokens_seen": 16294928, "step": 18230 }, { "epoch": 4.81272271347499, "grad_norm": 0.27795130014419556, "learning_rate": 2.8456425482972067e-05, "loss": 0.0575, "num_input_tokens_seen": 16299088, "step": 18235 }, { "epoch": 4.814042497030487, "grad_norm": 0.44196945428848267, "learning_rate": 2.84467020229511e-05, "loss": 0.073, "num_input_tokens_seen": 16303920, "step": 18240 }, { "epoch": 4.815362280585984, "grad_norm": 0.14868836104869843, "learning_rate": 2.8436978031405375e-05, "loss": 0.0412, "num_input_tokens_seen": 16308432, "step": 18245 }, { "epoch": 4.816682064141481, "grad_norm": 0.2528722584247589, "learning_rate": 2.842725350983445e-05, "loss": 0.0445, "num_input_tokens_seen": 16313168, "step": 18250 }, { "epoch": 4.818001847696978, "grad_norm": 0.2338588982820511, "learning_rate": 2.8417528459737957e-05, "loss": 0.065, "num_input_tokens_seen": 16317552, "step": 18255 }, { "epoch": 4.819321631252475, "grad_norm": 0.37647566199302673, "learning_rate": 2.8407802882615624e-05, "loss": 0.073, "num_input_tokens_seen": 16321904, "step": 18260 }, { "epoch": 4.820641414807971, "grad_norm": 0.20688609778881073, "learning_rate": 2.8398076779967277e-05, "loss": 0.0763, "num_input_tokens_seen": 16326320, "step": 18265 }, { "epoch": 4.821961198363469, "grad_norm": 0.19370673596858978, "learning_rate": 2.8388350153292774e-05, "loss": 0.0802, "num_input_tokens_seen": 16331024, "step": 18270 }, { "epoch": 4.823280981918965, "grad_norm": 0.08151532709598541, "learning_rate": 2.8378623004092103e-05, "loss": 0.083, "num_input_tokens_seen": 16335280, "step": 18275 }, { "epoch": 4.824600765474462, "grad_norm": 0.15876011550426483, "learning_rate": 2.8368895333865302e-05, "loss": 0.0271, "num_input_tokens_seen": 16339472, "step": 18280 }, { "epoch": 4.825920549029959, "grad_norm": 0.4037726819515228, "learning_rate": 2.835916714411251e-05, "loss": 0.0416, "num_input_tokens_seen": 16344176, "step": 18285 }, { "epoch": 4.827240332585456, "grad_norm": 0.1780112236738205, "learning_rate": 2.8349438436333926e-05, "loss": 0.0247, "num_input_tokens_seen": 16348656, "step": 18290 }, { "epoch": 4.828560116140953, "grad_norm": 0.5652247071266174, "learning_rate": 2.833970921202984e-05, "loss": 0.064, "num_input_tokens_seen": 16352912, "step": 18295 }, { "epoch": 4.82987989969645, "grad_norm": 0.19574569165706635, "learning_rate": 2.8329979472700628e-05, "loss": 0.0402, "num_input_tokens_seen": 16357136, "step": 18300 }, { "epoch": 4.831199683251946, "grad_norm": 0.2805270850658417, "learning_rate": 2.832024921984674e-05, "loss": 0.0532, "num_input_tokens_seen": 16361648, "step": 18305 }, { "epoch": 4.832519466807444, "grad_norm": 0.20967717468738556, "learning_rate": 2.8310518454968693e-05, "loss": 0.0314, "num_input_tokens_seen": 16366352, "step": 18310 }, { "epoch": 4.83383925036294, "grad_norm": 0.13764281570911407, "learning_rate": 2.8300787179567095e-05, "loss": 0.0504, "num_input_tokens_seen": 16370640, "step": 18315 }, { "epoch": 4.835159033918437, "grad_norm": 0.4281664490699768, "learning_rate": 2.8291055395142636e-05, "loss": 0.0833, "num_input_tokens_seen": 16375280, "step": 18320 }, { "epoch": 4.836478817473934, "grad_norm": 0.30913692712783813, "learning_rate": 2.8281323103196073e-05, "loss": 0.0829, "num_input_tokens_seen": 16380048, "step": 18325 }, { "epoch": 4.837798601029431, "grad_norm": 0.43322595953941345, "learning_rate": 2.8271590305228256e-05, "loss": 0.0646, "num_input_tokens_seen": 16384560, "step": 18330 }, { "epoch": 4.839118384584928, "grad_norm": 0.4322415888309479, "learning_rate": 2.82618570027401e-05, "loss": 0.0849, "num_input_tokens_seen": 16388816, "step": 18335 }, { "epoch": 4.840438168140425, "grad_norm": 0.20907701551914215, "learning_rate": 2.8252123197232604e-05, "loss": 0.0249, "num_input_tokens_seen": 16393424, "step": 18340 }, { "epoch": 4.8417579516959215, "grad_norm": 0.40784138441085815, "learning_rate": 2.8242388890206843e-05, "loss": 0.1337, "num_input_tokens_seen": 16397872, "step": 18345 }, { "epoch": 4.843077735251419, "grad_norm": 0.3420793116092682, "learning_rate": 2.8232654083163967e-05, "loss": 0.0943, "num_input_tokens_seen": 16402224, "step": 18350 }, { "epoch": 4.8443975188069155, "grad_norm": 0.5291674137115479, "learning_rate": 2.822291877760521e-05, "loss": 0.1612, "num_input_tokens_seen": 16406672, "step": 18355 }, { "epoch": 4.845717302362413, "grad_norm": 0.2153484970331192, "learning_rate": 2.8213182975031864e-05, "loss": 0.051, "num_input_tokens_seen": 16411024, "step": 18360 }, { "epoch": 4.8470370859179095, "grad_norm": 0.19715672731399536, "learning_rate": 2.8203446676945337e-05, "loss": 0.0772, "num_input_tokens_seen": 16415600, "step": 18365 }, { "epoch": 4.848356869473406, "grad_norm": 0.5209136605262756, "learning_rate": 2.8193709884847075e-05, "loss": 0.0885, "num_input_tokens_seen": 16420048, "step": 18370 }, { "epoch": 4.8496766530289035, "grad_norm": 0.23625631630420685, "learning_rate": 2.8183972600238605e-05, "loss": 0.0862, "num_input_tokens_seen": 16424656, "step": 18375 }, { "epoch": 4.8509964365844, "grad_norm": 0.34111475944519043, "learning_rate": 2.817423482462156e-05, "loss": 0.0276, "num_input_tokens_seen": 16428880, "step": 18380 }, { "epoch": 4.8523162201398975, "grad_norm": 0.16406036913394928, "learning_rate": 2.8164496559497605e-05, "loss": 0.0745, "num_input_tokens_seen": 16433360, "step": 18385 }, { "epoch": 4.853636003695394, "grad_norm": 0.11744712293148041, "learning_rate": 2.815475780636852e-05, "loss": 0.0666, "num_input_tokens_seen": 16437680, "step": 18390 }, { "epoch": 4.854955787250891, "grad_norm": 0.3759215176105499, "learning_rate": 2.814501856673613e-05, "loss": 0.0762, "num_input_tokens_seen": 16442224, "step": 18395 }, { "epoch": 4.856275570806388, "grad_norm": 0.22512462735176086, "learning_rate": 2.8135278842102353e-05, "loss": 0.12, "num_input_tokens_seen": 16446704, "step": 18400 }, { "epoch": 4.856275570806388, "eval_loss": 0.06849317997694016, "eval_runtime": 64.7706, "eval_samples_per_second": 103.982, "eval_steps_per_second": 25.999, "num_input_tokens_seen": 16446704, "step": 18400 }, { "epoch": 4.857595354361885, "grad_norm": 0.0899285227060318, "learning_rate": 2.8125538633969183e-05, "loss": 0.0455, "num_input_tokens_seen": 16451120, "step": 18405 }, { "epoch": 4.858915137917381, "grad_norm": 0.30360764265060425, "learning_rate": 2.8115797943838677e-05, "loss": 0.065, "num_input_tokens_seen": 16455792, "step": 18410 }, { "epoch": 4.860234921472879, "grad_norm": 0.3270409107208252, "learning_rate": 2.810605677321298e-05, "loss": 0.0574, "num_input_tokens_seen": 16460080, "step": 18415 }, { "epoch": 4.861554705028375, "grad_norm": 0.11539432406425476, "learning_rate": 2.809631512359428e-05, "loss": 0.0261, "num_input_tokens_seen": 16464432, "step": 18420 }, { "epoch": 4.862874488583873, "grad_norm": 0.1628473997116089, "learning_rate": 2.8086572996484884e-05, "loss": 0.0436, "num_input_tokens_seen": 16468848, "step": 18425 }, { "epoch": 4.864194272139369, "grad_norm": 0.579860270023346, "learning_rate": 2.8076830393387143e-05, "loss": 0.0712, "num_input_tokens_seen": 16473232, "step": 18430 }, { "epoch": 4.865514055694866, "grad_norm": 0.2550118565559387, "learning_rate": 2.8067087315803497e-05, "loss": 0.0609, "num_input_tokens_seen": 16477840, "step": 18435 }, { "epoch": 4.866833839250363, "grad_norm": 0.11569822579622269, "learning_rate": 2.8057343765236433e-05, "loss": 0.0751, "num_input_tokens_seen": 16482256, "step": 18440 }, { "epoch": 4.86815362280586, "grad_norm": 0.11786375939846039, "learning_rate": 2.804759974318854e-05, "loss": 0.0637, "num_input_tokens_seen": 16486704, "step": 18445 }, { "epoch": 4.869473406361356, "grad_norm": 0.2913278341293335, "learning_rate": 2.8037855251162482e-05, "loss": 0.0878, "num_input_tokens_seen": 16491184, "step": 18450 }, { "epoch": 4.870793189916854, "grad_norm": 0.16652972996234894, "learning_rate": 2.802811029066096e-05, "loss": 0.0404, "num_input_tokens_seen": 16495888, "step": 18455 }, { "epoch": 4.87211297347235, "grad_norm": 0.10203798115253448, "learning_rate": 2.8018364863186764e-05, "loss": 0.0666, "num_input_tokens_seen": 16500368, "step": 18460 }, { "epoch": 4.873432757027848, "grad_norm": 0.1026599109172821, "learning_rate": 2.800861897024279e-05, "loss": 0.0333, "num_input_tokens_seen": 16504912, "step": 18465 }, { "epoch": 4.874752540583344, "grad_norm": 0.01670807972550392, "learning_rate": 2.799887261333196e-05, "loss": 0.0441, "num_input_tokens_seen": 16509584, "step": 18470 }, { "epoch": 4.876072324138841, "grad_norm": 0.15002210438251495, "learning_rate": 2.798912579395728e-05, "loss": 0.0512, "num_input_tokens_seen": 16514064, "step": 18475 }, { "epoch": 4.877392107694338, "grad_norm": 0.14212659001350403, "learning_rate": 2.797937851362185e-05, "loss": 0.0504, "num_input_tokens_seen": 16518512, "step": 18480 }, { "epoch": 4.878711891249835, "grad_norm": 0.15426969528198242, "learning_rate": 2.7969630773828802e-05, "loss": 0.0351, "num_input_tokens_seen": 16523056, "step": 18485 }, { "epoch": 4.880031674805332, "grad_norm": 0.18571124970912933, "learning_rate": 2.7959882576081382e-05, "loss": 0.0564, "num_input_tokens_seen": 16527632, "step": 18490 }, { "epoch": 4.881351458360829, "grad_norm": 0.553921103477478, "learning_rate": 2.795013392188286e-05, "loss": 0.104, "num_input_tokens_seen": 16531920, "step": 18495 }, { "epoch": 4.882671241916325, "grad_norm": 0.28923413157463074, "learning_rate": 2.7940384812736614e-05, "loss": 0.0508, "num_input_tokens_seen": 16536656, "step": 18500 }, { "epoch": 4.883991025471823, "grad_norm": 0.11596778780221939, "learning_rate": 2.7930635250146087e-05, "loss": 0.0354, "num_input_tokens_seen": 16540944, "step": 18505 }, { "epoch": 4.885310809027319, "grad_norm": 0.0761212557554245, "learning_rate": 2.792088523561477e-05, "loss": 0.0362, "num_input_tokens_seen": 16545616, "step": 18510 }, { "epoch": 4.886630592582817, "grad_norm": 0.09057457745075226, "learning_rate": 2.7911134770646246e-05, "loss": 0.0131, "num_input_tokens_seen": 16550160, "step": 18515 }, { "epoch": 4.887950376138313, "grad_norm": 0.15105414390563965, "learning_rate": 2.7901383856744157e-05, "loss": 0.0445, "num_input_tokens_seen": 16554768, "step": 18520 }, { "epoch": 4.88927015969381, "grad_norm": 0.2015790492296219, "learning_rate": 2.7891632495412217e-05, "loss": 0.0551, "num_input_tokens_seen": 16559248, "step": 18525 }, { "epoch": 4.890589943249307, "grad_norm": 0.10020840167999268, "learning_rate": 2.7881880688154205e-05, "loss": 0.0685, "num_input_tokens_seen": 16563568, "step": 18530 }, { "epoch": 4.891909726804804, "grad_norm": 0.017234444618225098, "learning_rate": 2.7872128436473977e-05, "loss": 0.0314, "num_input_tokens_seen": 16568016, "step": 18535 }, { "epoch": 4.893229510360301, "grad_norm": 0.3275602459907532, "learning_rate": 2.7862375741875448e-05, "loss": 0.0514, "num_input_tokens_seen": 16572688, "step": 18540 }, { "epoch": 4.894549293915798, "grad_norm": 0.217233806848526, "learning_rate": 2.785262260586261e-05, "loss": 0.0605, "num_input_tokens_seen": 16577296, "step": 18545 }, { "epoch": 4.8958690774712945, "grad_norm": 0.24206528067588806, "learning_rate": 2.7842869029939517e-05, "loss": 0.0997, "num_input_tokens_seen": 16581872, "step": 18550 }, { "epoch": 4.897188861026792, "grad_norm": 0.16623778641223907, "learning_rate": 2.7833115015610296e-05, "loss": 0.09, "num_input_tokens_seen": 16586544, "step": 18555 }, { "epoch": 4.8985086445822885, "grad_norm": 0.410235196352005, "learning_rate": 2.7823360564379136e-05, "loss": 0.1413, "num_input_tokens_seen": 16590960, "step": 18560 }, { "epoch": 4.899828428137785, "grad_norm": 0.1687140166759491, "learning_rate": 2.7813605677750297e-05, "loss": 0.0493, "num_input_tokens_seen": 16595440, "step": 18565 }, { "epoch": 4.9011482116932825, "grad_norm": 0.1012502908706665, "learning_rate": 2.7803850357228102e-05, "loss": 0.0333, "num_input_tokens_seen": 16600336, "step": 18570 }, { "epoch": 4.902467995248779, "grad_norm": 0.16951428353786469, "learning_rate": 2.779409460431695e-05, "loss": 0.0439, "num_input_tokens_seen": 16604752, "step": 18575 }, { "epoch": 4.903787778804276, "grad_norm": 0.2252022624015808, "learning_rate": 2.778433842052129e-05, "loss": 0.0373, "num_input_tokens_seen": 16609360, "step": 18580 }, { "epoch": 4.905107562359773, "grad_norm": 0.15820682048797607, "learning_rate": 2.7774581807345664e-05, "loss": 0.0718, "num_input_tokens_seen": 16614000, "step": 18585 }, { "epoch": 4.90642734591527, "grad_norm": 0.050562042742967606, "learning_rate": 2.776482476629465e-05, "loss": 0.0241, "num_input_tokens_seen": 16618032, "step": 18590 }, { "epoch": 4.907747129470767, "grad_norm": 0.4059712886810303, "learning_rate": 2.7755067298872924e-05, "loss": 0.0782, "num_input_tokens_seen": 16622576, "step": 18595 }, { "epoch": 4.909066913026264, "grad_norm": 0.14000160992145538, "learning_rate": 2.774530940658518e-05, "loss": 0.0508, "num_input_tokens_seen": 16627152, "step": 18600 }, { "epoch": 4.909066913026264, "eval_loss": 0.06863165646791458, "eval_runtime": 64.7735, "eval_samples_per_second": 103.978, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 16627152, "step": 18600 }, { "epoch": 4.91038669658176, "grad_norm": 0.5590384602546692, "learning_rate": 2.7735551090936236e-05, "loss": 0.0474, "num_input_tokens_seen": 16631568, "step": 18605 }, { "epoch": 4.911706480137258, "grad_norm": 0.15937602519989014, "learning_rate": 2.7725792353430934e-05, "loss": 0.0239, "num_input_tokens_seen": 16635888, "step": 18610 }, { "epoch": 4.913026263692754, "grad_norm": 0.20967191457748413, "learning_rate": 2.77160331955742e-05, "loss": 0.0564, "num_input_tokens_seen": 16640560, "step": 18615 }, { "epoch": 4.914346047248252, "grad_norm": 0.32316821813583374, "learning_rate": 2.7706273618871008e-05, "loss": 0.0844, "num_input_tokens_seen": 16644848, "step": 18620 }, { "epoch": 4.915665830803748, "grad_norm": 0.11100181192159653, "learning_rate": 2.769651362482642e-05, "loss": 0.0527, "num_input_tokens_seen": 16649424, "step": 18625 }, { "epoch": 4.916985614359245, "grad_norm": 0.04020622745156288, "learning_rate": 2.768675321494555e-05, "loss": 0.0608, "num_input_tokens_seen": 16653808, "step": 18630 }, { "epoch": 4.918305397914742, "grad_norm": 0.2699630558490753, "learning_rate": 2.7676992390733565e-05, "loss": 0.0613, "num_input_tokens_seen": 16658480, "step": 18635 }, { "epoch": 4.919625181470239, "grad_norm": 0.23626990616321564, "learning_rate": 2.766723115369571e-05, "loss": 0.0641, "num_input_tokens_seen": 16662928, "step": 18640 }, { "epoch": 4.920944965025736, "grad_norm": 0.07744468748569489, "learning_rate": 2.765746950533729e-05, "loss": 0.0671, "num_input_tokens_seen": 16667440, "step": 18645 }, { "epoch": 4.922264748581233, "grad_norm": 0.15091878175735474, "learning_rate": 2.7647707447163684e-05, "loss": 0.0947, "num_input_tokens_seen": 16672048, "step": 18650 }, { "epoch": 4.923584532136729, "grad_norm": 0.04735223576426506, "learning_rate": 2.7637944980680315e-05, "loss": 0.048, "num_input_tokens_seen": 16676336, "step": 18655 }, { "epoch": 4.924904315692227, "grad_norm": 0.18337567150592804, "learning_rate": 2.762818210739268e-05, "loss": 0.059, "num_input_tokens_seen": 16680816, "step": 18660 }, { "epoch": 4.926224099247723, "grad_norm": 0.134157195687294, "learning_rate": 2.7618418828806332e-05, "loss": 0.0534, "num_input_tokens_seen": 16685200, "step": 18665 }, { "epoch": 4.927543882803221, "grad_norm": 0.20517902076244354, "learning_rate": 2.76086551464269e-05, "loss": 0.0211, "num_input_tokens_seen": 16689360, "step": 18670 }, { "epoch": 4.928863666358717, "grad_norm": 0.13440768420696259, "learning_rate": 2.759889106176006e-05, "loss": 0.0477, "num_input_tokens_seen": 16693776, "step": 18675 }, { "epoch": 4.930183449914214, "grad_norm": 0.037327684462070465, "learning_rate": 2.758912657631156e-05, "loss": 0.0416, "num_input_tokens_seen": 16698256, "step": 18680 }, { "epoch": 4.931503233469711, "grad_norm": 0.05003780126571655, "learning_rate": 2.7579361691587198e-05, "loss": 0.0353, "num_input_tokens_seen": 16702736, "step": 18685 }, { "epoch": 4.932823017025208, "grad_norm": 0.18410705029964447, "learning_rate": 2.756959640909285e-05, "loss": 0.0512, "num_input_tokens_seen": 16707312, "step": 18690 }, { "epoch": 4.934142800580704, "grad_norm": 0.09078793227672577, "learning_rate": 2.7559830730334452e-05, "loss": 0.0951, "num_input_tokens_seen": 16711664, "step": 18695 }, { "epoch": 4.935462584136202, "grad_norm": 0.24783697724342346, "learning_rate": 2.7550064656817988e-05, "loss": 0.0743, "num_input_tokens_seen": 16715952, "step": 18700 }, { "epoch": 4.936782367691698, "grad_norm": 0.17165496945381165, "learning_rate": 2.7540298190049503e-05, "loss": 0.0587, "num_input_tokens_seen": 16720528, "step": 18705 }, { "epoch": 4.938102151247195, "grad_norm": 0.36842474341392517, "learning_rate": 2.7530531331535107e-05, "loss": 0.1076, "num_input_tokens_seen": 16724944, "step": 18710 }, { "epoch": 4.939421934802692, "grad_norm": 0.1413727104663849, "learning_rate": 2.752076408278099e-05, "loss": 0.0504, "num_input_tokens_seen": 16729648, "step": 18715 }, { "epoch": 4.940741718358189, "grad_norm": 0.046087827533483505, "learning_rate": 2.751099644529337e-05, "loss": 0.0485, "num_input_tokens_seen": 16734288, "step": 18720 }, { "epoch": 4.942061501913686, "grad_norm": 0.06575822830200195, "learning_rate": 2.7501228420578533e-05, "loss": 0.0257, "num_input_tokens_seen": 16738768, "step": 18725 }, { "epoch": 4.943381285469183, "grad_norm": 0.2496006339788437, "learning_rate": 2.7491460010142857e-05, "loss": 0.0804, "num_input_tokens_seen": 16743632, "step": 18730 }, { "epoch": 4.9447010690246795, "grad_norm": 0.3014904260635376, "learning_rate": 2.7481691215492727e-05, "loss": 0.0453, "num_input_tokens_seen": 16747728, "step": 18735 }, { "epoch": 4.946020852580177, "grad_norm": 0.29435354471206665, "learning_rate": 2.747192203813463e-05, "loss": 0.1188, "num_input_tokens_seen": 16752368, "step": 18740 }, { "epoch": 4.9473406361356735, "grad_norm": 0.2004237174987793, "learning_rate": 2.7462152479575087e-05, "loss": 0.081, "num_input_tokens_seen": 16757104, "step": 18745 }, { "epoch": 4.948660419691171, "grad_norm": 0.41868647933006287, "learning_rate": 2.7452382541320697e-05, "loss": 0.0548, "num_input_tokens_seen": 16761520, "step": 18750 }, { "epoch": 4.9499802032466675, "grad_norm": 0.0709867998957634, "learning_rate": 2.7442612224878096e-05, "loss": 0.0748, "num_input_tokens_seen": 16766128, "step": 18755 }, { "epoch": 4.951299986802164, "grad_norm": 0.04019893333315849, "learning_rate": 2.7432841531753994e-05, "loss": 0.0663, "num_input_tokens_seen": 16770512, "step": 18760 }, { "epoch": 4.9526197703576615, "grad_norm": 0.2900136411190033, "learning_rate": 2.7423070463455147e-05, "loss": 0.0633, "num_input_tokens_seen": 16774960, "step": 18765 }, { "epoch": 4.953939553913158, "grad_norm": 0.07563133537769318, "learning_rate": 2.7413299021488397e-05, "loss": 0.0413, "num_input_tokens_seen": 16779440, "step": 18770 }, { "epoch": 4.9552593374686555, "grad_norm": 0.04926672577857971, "learning_rate": 2.7403527207360615e-05, "loss": 0.0712, "num_input_tokens_seen": 16783920, "step": 18775 }, { "epoch": 4.956579121024152, "grad_norm": 0.11770081520080566, "learning_rate": 2.7393755022578722e-05, "loss": 0.0384, "num_input_tokens_seen": 16788496, "step": 18780 }, { "epoch": 4.957898904579649, "grad_norm": 0.24784019589424133, "learning_rate": 2.7383982468649714e-05, "loss": 0.1081, "num_input_tokens_seen": 16792880, "step": 18785 }, { "epoch": 4.959218688135146, "grad_norm": 0.10999373346567154, "learning_rate": 2.7374209547080665e-05, "loss": 0.0218, "num_input_tokens_seen": 16797168, "step": 18790 }, { "epoch": 4.960538471690643, "grad_norm": 0.21823538839817047, "learning_rate": 2.7364436259378663e-05, "loss": 0.0644, "num_input_tokens_seen": 16801680, "step": 18795 }, { "epoch": 4.96185825524614, "grad_norm": 0.6799261569976807, "learning_rate": 2.735466260705088e-05, "loss": 0.0784, "num_input_tokens_seen": 16806032, "step": 18800 }, { "epoch": 4.96185825524614, "eval_loss": 0.06862161308526993, "eval_runtime": 64.7769, "eval_samples_per_second": 103.972, "eval_steps_per_second": 25.997, "num_input_tokens_seen": 16806032, "step": 18800 }, { "epoch": 4.963178038801637, "grad_norm": 0.09917201846837997, "learning_rate": 2.7344888591604524e-05, "loss": 0.0666, "num_input_tokens_seen": 16810384, "step": 18805 }, { "epoch": 4.964497822357133, "grad_norm": 0.1404743492603302, "learning_rate": 2.7335114214546893e-05, "loss": 0.0916, "num_input_tokens_seen": 16815152, "step": 18810 }, { "epoch": 4.965817605912631, "grad_norm": 0.198434516787529, "learning_rate": 2.7325339477385293e-05, "loss": 0.0334, "num_input_tokens_seen": 16819888, "step": 18815 }, { "epoch": 4.967137389468127, "grad_norm": 0.10376154631376266, "learning_rate": 2.7315564381627128e-05, "loss": 0.0443, "num_input_tokens_seen": 16824432, "step": 18820 }, { "epoch": 4.968457173023624, "grad_norm": 0.060383908450603485, "learning_rate": 2.7305788928779835e-05, "loss": 0.0539, "num_input_tokens_seen": 16829008, "step": 18825 }, { "epoch": 4.969776956579121, "grad_norm": 0.13487784564495087, "learning_rate": 2.729601312035091e-05, "loss": 0.0802, "num_input_tokens_seen": 16833584, "step": 18830 }, { "epoch": 4.971096740134618, "grad_norm": 0.20110732316970825, "learning_rate": 2.7286236957847915e-05, "loss": 0.0394, "num_input_tokens_seen": 16838032, "step": 18835 }, { "epoch": 4.972416523690114, "grad_norm": 0.1296185702085495, "learning_rate": 2.7276460442778446e-05, "loss": 0.0428, "num_input_tokens_seen": 16842448, "step": 18840 }, { "epoch": 4.973736307245612, "grad_norm": 0.27330026030540466, "learning_rate": 2.726668357665017e-05, "loss": 0.0604, "num_input_tokens_seen": 16846832, "step": 18845 }, { "epoch": 4.975056090801108, "grad_norm": 0.07113484293222427, "learning_rate": 2.7256906360970808e-05, "loss": 0.0438, "num_input_tokens_seen": 16851472, "step": 18850 }, { "epoch": 4.976375874356606, "grad_norm": 0.4439070224761963, "learning_rate": 2.7247128797248117e-05, "loss": 0.0735, "num_input_tokens_seen": 16855920, "step": 18855 }, { "epoch": 4.977695657912102, "grad_norm": 0.07135985791683197, "learning_rate": 2.7237350886989925e-05, "loss": 0.0643, "num_input_tokens_seen": 16860176, "step": 18860 }, { "epoch": 4.979015441467599, "grad_norm": 0.17006325721740723, "learning_rate": 2.7227572631704107e-05, "loss": 0.0604, "num_input_tokens_seen": 16864560, "step": 18865 }, { "epoch": 4.980335225023096, "grad_norm": 0.22113637626171112, "learning_rate": 2.7217794032898596e-05, "loss": 0.0828, "num_input_tokens_seen": 16869008, "step": 18870 }, { "epoch": 4.981655008578593, "grad_norm": 0.27087342739105225, "learning_rate": 2.7208015092081384e-05, "loss": 0.1003, "num_input_tokens_seen": 16873232, "step": 18875 }, { "epoch": 4.98297479213409, "grad_norm": 0.1871531456708908, "learning_rate": 2.719823581076049e-05, "loss": 0.052, "num_input_tokens_seen": 16877680, "step": 18880 }, { "epoch": 4.984294575689587, "grad_norm": 0.5451454520225525, "learning_rate": 2.718845619044401e-05, "loss": 0.0527, "num_input_tokens_seen": 16882256, "step": 18885 }, { "epoch": 4.9856143592450834, "grad_norm": 0.35073158144950867, "learning_rate": 2.7178676232640088e-05, "loss": 0.122, "num_input_tokens_seen": 16886512, "step": 18890 }, { "epoch": 4.986934142800581, "grad_norm": 0.23698747158050537, "learning_rate": 2.716889593885691e-05, "loss": 0.0427, "num_input_tokens_seen": 16890928, "step": 18895 }, { "epoch": 4.9882539263560775, "grad_norm": 0.3007442355155945, "learning_rate": 2.7159115310602716e-05, "loss": 0.1175, "num_input_tokens_seen": 16895248, "step": 18900 }, { "epoch": 4.989573709911575, "grad_norm": 0.059046510607004166, "learning_rate": 2.7149334349385814e-05, "loss": 0.0389, "num_input_tokens_seen": 16899952, "step": 18905 }, { "epoch": 4.9908934934670715, "grad_norm": 0.3999674618244171, "learning_rate": 2.713955305671454e-05, "loss": 0.042, "num_input_tokens_seen": 16904592, "step": 18910 }, { "epoch": 4.992213277022568, "grad_norm": 0.42112573981285095, "learning_rate": 2.71297714340973e-05, "loss": 0.0805, "num_input_tokens_seen": 16909040, "step": 18915 }, { "epoch": 4.9935330605780655, "grad_norm": 0.2466246485710144, "learning_rate": 2.7119989483042545e-05, "loss": 0.0485, "num_input_tokens_seen": 16913808, "step": 18920 }, { "epoch": 4.994852844133562, "grad_norm": 0.0232480950653553, "learning_rate": 2.7110207205058768e-05, "loss": 0.0394, "num_input_tokens_seen": 16918640, "step": 18925 }, { "epoch": 4.9961726276890595, "grad_norm": 0.1594805270433426, "learning_rate": 2.7100424601654517e-05, "loss": 0.0916, "num_input_tokens_seen": 16922992, "step": 18930 }, { "epoch": 4.997492411244556, "grad_norm": 0.2975085973739624, "learning_rate": 2.7090641674338403e-05, "loss": 0.0729, "num_input_tokens_seen": 16927152, "step": 18935 }, { "epoch": 4.998812194800053, "grad_norm": 0.19951516389846802, "learning_rate": 2.7080858424619072e-05, "loss": 0.0616, "num_input_tokens_seen": 16931664, "step": 18940 }, { "epoch": 5.0, "grad_norm": 0.22755782306194305, "learning_rate": 2.707107485400521e-05, "loss": 0.0264, "num_input_tokens_seen": 16935568, "step": 18945 }, { "epoch": 5.001319783555497, "grad_norm": 0.48928946256637573, "learning_rate": 2.7061290964005586e-05, "loss": 0.0935, "num_input_tokens_seen": 16940176, "step": 18950 }, { "epoch": 5.002639567110994, "grad_norm": 0.15697601437568665, "learning_rate": 2.7051506756129e-05, "loss": 0.064, "num_input_tokens_seen": 16944720, "step": 18955 }, { "epoch": 5.003959350666491, "grad_norm": 0.3455692529678345, "learning_rate": 2.704172223188428e-05, "loss": 0.0818, "num_input_tokens_seen": 16949200, "step": 18960 }, { "epoch": 5.005279134221988, "grad_norm": 0.22429300844669342, "learning_rate": 2.7031937392780334e-05, "loss": 0.04, "num_input_tokens_seen": 16953968, "step": 18965 }, { "epoch": 5.006598917777485, "grad_norm": 0.27705511450767517, "learning_rate": 2.702215224032611e-05, "loss": 0.0712, "num_input_tokens_seen": 16958448, "step": 18970 }, { "epoch": 5.007918701332981, "grad_norm": 0.21976374089717865, "learning_rate": 2.70123667760306e-05, "loss": 0.0537, "num_input_tokens_seen": 16963024, "step": 18975 }, { "epoch": 5.009238484888479, "grad_norm": 0.1555311381816864, "learning_rate": 2.7002581001402845e-05, "loss": 0.0695, "num_input_tokens_seen": 16967376, "step": 18980 }, { "epoch": 5.010558268443975, "grad_norm": 0.06559067964553833, "learning_rate": 2.6992794917951923e-05, "loss": 0.0981, "num_input_tokens_seen": 16971952, "step": 18985 }, { "epoch": 5.011878051999472, "grad_norm": 0.09878981113433838, "learning_rate": 2.6983008527187e-05, "loss": 0.0416, "num_input_tokens_seen": 16976720, "step": 18990 }, { "epoch": 5.013197835554969, "grad_norm": 0.027351541444659233, "learning_rate": 2.697322183061723e-05, "loss": 0.0197, "num_input_tokens_seen": 16981328, "step": 18995 }, { "epoch": 5.014517619110466, "grad_norm": 0.28916311264038086, "learning_rate": 2.696343482975186e-05, "loss": 0.057, "num_input_tokens_seen": 16986160, "step": 19000 }, { "epoch": 5.014517619110466, "eval_loss": 0.06854243576526642, "eval_runtime": 64.7706, "eval_samples_per_second": 103.982, "eval_steps_per_second": 25.999, "num_input_tokens_seen": 16986160, "step": 19000 }, { "epoch": 5.015837402665963, "grad_norm": 0.07987211644649506, "learning_rate": 2.695364752610016e-05, "loss": 0.0274, "num_input_tokens_seen": 16990352, "step": 19005 }, { "epoch": 5.01715718622146, "grad_norm": 0.3318217992782593, "learning_rate": 2.6943859921171467e-05, "loss": 0.124, "num_input_tokens_seen": 16994960, "step": 19010 }, { "epoch": 5.018476969776956, "grad_norm": 0.11486142873764038, "learning_rate": 2.6934072016475143e-05, "loss": 0.0531, "num_input_tokens_seen": 16999376, "step": 19015 }, { "epoch": 5.019796753332454, "grad_norm": 0.4915933907032013, "learning_rate": 2.6924283813520606e-05, "loss": 0.1208, "num_input_tokens_seen": 17004048, "step": 19020 }, { "epoch": 5.02111653688795, "grad_norm": 0.17441192269325256, "learning_rate": 2.691449531381733e-05, "loss": 0.0371, "num_input_tokens_seen": 17008464, "step": 19025 }, { "epoch": 5.022436320443448, "grad_norm": 0.16601507365703583, "learning_rate": 2.6904706518874816e-05, "loss": 0.0186, "num_input_tokens_seen": 17013072, "step": 19030 }, { "epoch": 5.023756103998944, "grad_norm": 0.26146870851516724, "learning_rate": 2.6894917430202615e-05, "loss": 0.039, "num_input_tokens_seen": 17017424, "step": 19035 }, { "epoch": 5.025075887554441, "grad_norm": 0.2478986233472824, "learning_rate": 2.6885128049310343e-05, "loss": 0.0473, "num_input_tokens_seen": 17021744, "step": 19040 }, { "epoch": 5.026395671109938, "grad_norm": 0.19030870497226715, "learning_rate": 2.687533837770762e-05, "loss": 0.072, "num_input_tokens_seen": 17026128, "step": 19045 }, { "epoch": 5.027715454665435, "grad_norm": 0.22293902933597565, "learning_rate": 2.6865548416904162e-05, "loss": 0.0674, "num_input_tokens_seen": 17030576, "step": 19050 }, { "epoch": 5.029035238220931, "grad_norm": 0.17587222158908844, "learning_rate": 2.68557581684097e-05, "loss": 0.0403, "num_input_tokens_seen": 17035120, "step": 19055 }, { "epoch": 5.030355021776429, "grad_norm": 0.05642015114426613, "learning_rate": 2.6845967633733998e-05, "loss": 0.0895, "num_input_tokens_seen": 17039568, "step": 19060 }, { "epoch": 5.031674805331925, "grad_norm": 0.43870484828948975, "learning_rate": 2.683617681438689e-05, "loss": 0.1014, "num_input_tokens_seen": 17043984, "step": 19065 }, { "epoch": 5.032994588887423, "grad_norm": 0.11381293088197708, "learning_rate": 2.682638571187825e-05, "loss": 0.0725, "num_input_tokens_seen": 17048432, "step": 19070 }, { "epoch": 5.034314372442919, "grad_norm": 0.4731496572494507, "learning_rate": 2.6816594327717976e-05, "loss": 0.0612, "num_input_tokens_seen": 17053104, "step": 19075 }, { "epoch": 5.035634155998416, "grad_norm": 0.34311068058013916, "learning_rate": 2.680680266341603e-05, "loss": 0.0414, "num_input_tokens_seen": 17057424, "step": 19080 }, { "epoch": 5.036953939553913, "grad_norm": 0.20074953138828278, "learning_rate": 2.67970107204824e-05, "loss": 0.0635, "num_input_tokens_seen": 17061776, "step": 19085 }, { "epoch": 5.03827372310941, "grad_norm": 0.232033371925354, "learning_rate": 2.6787218500427142e-05, "loss": 0.0737, "num_input_tokens_seen": 17066640, "step": 19090 }, { "epoch": 5.039593506664907, "grad_norm": 0.16428549587726593, "learning_rate": 2.6777426004760332e-05, "loss": 0.0472, "num_input_tokens_seen": 17071280, "step": 19095 }, { "epoch": 5.040913290220404, "grad_norm": 0.1406024694442749, "learning_rate": 2.6767633234992094e-05, "loss": 0.0525, "num_input_tokens_seen": 17075696, "step": 19100 }, { "epoch": 5.0422330737759005, "grad_norm": 0.34932538866996765, "learning_rate": 2.6757840192632598e-05, "loss": 0.0315, "num_input_tokens_seen": 17080048, "step": 19105 }, { "epoch": 5.043552857331398, "grad_norm": 0.1890619695186615, "learning_rate": 2.6748046879192052e-05, "loss": 0.0212, "num_input_tokens_seen": 17084432, "step": 19110 }, { "epoch": 5.0448726408868945, "grad_norm": 0.14648717641830444, "learning_rate": 2.673825329618071e-05, "loss": 0.0644, "num_input_tokens_seen": 17088464, "step": 19115 }, { "epoch": 5.046192424442391, "grad_norm": 0.29285961389541626, "learning_rate": 2.6728459445108866e-05, "loss": 0.0632, "num_input_tokens_seen": 17093168, "step": 19120 }, { "epoch": 5.0475122079978885, "grad_norm": 0.11911913752555847, "learning_rate": 2.6718665327486854e-05, "loss": 0.088, "num_input_tokens_seen": 17097712, "step": 19125 }, { "epoch": 5.048831991553385, "grad_norm": 0.1464732140302658, "learning_rate": 2.6708870944825048e-05, "loss": 0.0812, "num_input_tokens_seen": 17102096, "step": 19130 }, { "epoch": 5.0501517751088825, "grad_norm": 0.3286367356777191, "learning_rate": 2.6699076298633874e-05, "loss": 0.1149, "num_input_tokens_seen": 17106992, "step": 19135 }, { "epoch": 5.051471558664379, "grad_norm": 0.08369673788547516, "learning_rate": 2.6689281390423788e-05, "loss": 0.0391, "num_input_tokens_seen": 17111344, "step": 19140 }, { "epoch": 5.052791342219876, "grad_norm": 0.13633674383163452, "learning_rate": 2.667948622170527e-05, "loss": 0.0493, "num_input_tokens_seen": 17115504, "step": 19145 }, { "epoch": 5.054111125775373, "grad_norm": 0.18322749435901642, "learning_rate": 2.6669690793988873e-05, "loss": 0.0361, "num_input_tokens_seen": 17120368, "step": 19150 }, { "epoch": 5.05543090933087, "grad_norm": 0.20799368619918823, "learning_rate": 2.665989510878518e-05, "loss": 0.0323, "num_input_tokens_seen": 17124720, "step": 19155 }, { "epoch": 5.056750692886367, "grad_norm": 0.3070172369480133, "learning_rate": 2.6650099167604793e-05, "loss": 0.0564, "num_input_tokens_seen": 17129296, "step": 19160 }, { "epoch": 5.058070476441864, "grad_norm": 0.32486045360565186, "learning_rate": 2.6640302971958376e-05, "loss": 0.0677, "num_input_tokens_seen": 17133936, "step": 19165 }, { "epoch": 5.05939025999736, "grad_norm": 0.48955875635147095, "learning_rate": 2.6630506523356635e-05, "loss": 0.0587, "num_input_tokens_seen": 17138352, "step": 19170 }, { "epoch": 5.060710043552858, "grad_norm": 0.3325125575065613, "learning_rate": 2.6620709823310297e-05, "loss": 0.1193, "num_input_tokens_seen": 17142832, "step": 19175 }, { "epoch": 5.062029827108354, "grad_norm": 0.03684444725513458, "learning_rate": 2.661091287333014e-05, "loss": 0.0289, "num_input_tokens_seen": 17147440, "step": 19180 }, { "epoch": 5.063349610663851, "grad_norm": 0.09186553955078125, "learning_rate": 2.660111567492696e-05, "loss": 0.0361, "num_input_tokens_seen": 17151984, "step": 19185 }, { "epoch": 5.064669394219348, "grad_norm": 0.2506871521472931, "learning_rate": 2.6591318229611635e-05, "loss": 0.0543, "num_input_tokens_seen": 17156240, "step": 19190 }, { "epoch": 5.065989177774845, "grad_norm": 0.3182862401008606, "learning_rate": 2.6581520538895037e-05, "loss": 0.084, "num_input_tokens_seen": 17160432, "step": 19195 }, { "epoch": 5.067308961330342, "grad_norm": 0.25975215435028076, "learning_rate": 2.6571722604288102e-05, "loss": 0.0367, "num_input_tokens_seen": 17164848, "step": 19200 }, { "epoch": 5.067308961330342, "eval_loss": 0.06870897114276886, "eval_runtime": 64.7963, "eval_samples_per_second": 103.941, "eval_steps_per_second": 25.989, "num_input_tokens_seen": 17164848, "step": 19200 }, { "epoch": 5.068628744885839, "grad_norm": 0.10210270434617996, "learning_rate": 2.656192442730179e-05, "loss": 0.0763, "num_input_tokens_seen": 17169168, "step": 19205 }, { "epoch": 5.069948528441335, "grad_norm": 0.033850397914648056, "learning_rate": 2.6552126009447098e-05, "loss": 0.0389, "num_input_tokens_seen": 17173712, "step": 19210 }, { "epoch": 5.071268311996833, "grad_norm": 0.44324642419815063, "learning_rate": 2.654232735223507e-05, "loss": 0.0564, "num_input_tokens_seen": 17178192, "step": 19215 }, { "epoch": 5.072588095552329, "grad_norm": 0.05681411549448967, "learning_rate": 2.6532528457176787e-05, "loss": 0.0482, "num_input_tokens_seen": 17182896, "step": 19220 }, { "epoch": 5.073907879107827, "grad_norm": 0.26370126008987427, "learning_rate": 2.6522729325783348e-05, "loss": 0.0704, "num_input_tokens_seen": 17187280, "step": 19225 }, { "epoch": 5.075227662663323, "grad_norm": 0.2961690127849579, "learning_rate": 2.6512929959565914e-05, "loss": 0.0424, "num_input_tokens_seen": 17191888, "step": 19230 }, { "epoch": 5.07654744621882, "grad_norm": 0.20757268369197845, "learning_rate": 2.6503130360035673e-05, "loss": 0.0317, "num_input_tokens_seen": 17196368, "step": 19235 }, { "epoch": 5.077867229774317, "grad_norm": 0.07291275262832642, "learning_rate": 2.6493330528703835e-05, "loss": 0.0902, "num_input_tokens_seen": 17200656, "step": 19240 }, { "epoch": 5.079187013329814, "grad_norm": 0.0547778382897377, "learning_rate": 2.648353046708167e-05, "loss": 0.0669, "num_input_tokens_seen": 17204976, "step": 19245 }, { "epoch": 5.08050679688531, "grad_norm": 0.21507373452186584, "learning_rate": 2.647373017668046e-05, "loss": 0.0321, "num_input_tokens_seen": 17209136, "step": 19250 }, { "epoch": 5.081826580440808, "grad_norm": 0.3954898715019226, "learning_rate": 2.6463929659011537e-05, "loss": 0.0759, "num_input_tokens_seen": 17213552, "step": 19255 }, { "epoch": 5.083146363996304, "grad_norm": 0.22904185950756073, "learning_rate": 2.6454128915586262e-05, "loss": 0.0771, "num_input_tokens_seen": 17217872, "step": 19260 }, { "epoch": 5.084466147551802, "grad_norm": 0.20396392047405243, "learning_rate": 2.6444327947916036e-05, "loss": 0.0723, "num_input_tokens_seen": 17222096, "step": 19265 }, { "epoch": 5.085785931107298, "grad_norm": 0.14217182993888855, "learning_rate": 2.6434526757512292e-05, "loss": 0.0409, "num_input_tokens_seen": 17226608, "step": 19270 }, { "epoch": 5.087105714662795, "grad_norm": 0.07806000113487244, "learning_rate": 2.6424725345886486e-05, "loss": 0.0451, "num_input_tokens_seen": 17231440, "step": 19275 }, { "epoch": 5.088425498218292, "grad_norm": 0.12203270196914673, "learning_rate": 2.641492371455014e-05, "loss": 0.0384, "num_input_tokens_seen": 17235760, "step": 19280 }, { "epoch": 5.089745281773789, "grad_norm": 0.03367949649691582, "learning_rate": 2.640512186501477e-05, "loss": 0.0348, "num_input_tokens_seen": 17240240, "step": 19285 }, { "epoch": 5.091065065329286, "grad_norm": 0.16777314245700836, "learning_rate": 2.639531979879195e-05, "loss": 0.0532, "num_input_tokens_seen": 17245104, "step": 19290 }, { "epoch": 5.092384848884783, "grad_norm": 0.25858274102211, "learning_rate": 2.638551751739328e-05, "loss": 0.0357, "num_input_tokens_seen": 17249680, "step": 19295 }, { "epoch": 5.0937046324402795, "grad_norm": 0.14053095877170563, "learning_rate": 2.6375715022330404e-05, "loss": 0.0235, "num_input_tokens_seen": 17254160, "step": 19300 }, { "epoch": 5.095024415995777, "grad_norm": 0.19657224416732788, "learning_rate": 2.6365912315114976e-05, "loss": 0.0721, "num_input_tokens_seen": 17258832, "step": 19305 }, { "epoch": 5.0963441995512735, "grad_norm": 0.14245790243148804, "learning_rate": 2.6356109397258704e-05, "loss": 0.0288, "num_input_tokens_seen": 17263120, "step": 19310 }, { "epoch": 5.09766398310677, "grad_norm": 0.3223910331726074, "learning_rate": 2.6346306270273325e-05, "loss": 0.0471, "num_input_tokens_seen": 17267472, "step": 19315 }, { "epoch": 5.0989837666622675, "grad_norm": 0.4378238618373871, "learning_rate": 2.6336502935670608e-05, "loss": 0.0541, "num_input_tokens_seen": 17271632, "step": 19320 }, { "epoch": 5.100303550217764, "grad_norm": 0.3300918936729431, "learning_rate": 2.6326699394962333e-05, "loss": 0.0933, "num_input_tokens_seen": 17275760, "step": 19325 }, { "epoch": 5.1016233337732615, "grad_norm": 0.3612113893032074, "learning_rate": 2.6316895649660334e-05, "loss": 0.1097, "num_input_tokens_seen": 17280368, "step": 19330 }, { "epoch": 5.102943117328758, "grad_norm": 0.360798180103302, "learning_rate": 2.6307091701276486e-05, "loss": 0.0715, "num_input_tokens_seen": 17284688, "step": 19335 }, { "epoch": 5.104262900884255, "grad_norm": 0.09335128217935562, "learning_rate": 2.629728755132267e-05, "loss": 0.0221, "num_input_tokens_seen": 17289168, "step": 19340 }, { "epoch": 5.105582684439752, "grad_norm": 0.08992238342761993, "learning_rate": 2.628748320131081e-05, "loss": 0.0692, "num_input_tokens_seen": 17293680, "step": 19345 }, { "epoch": 5.106902467995249, "grad_norm": 0.21821841597557068, "learning_rate": 2.6277678652752856e-05, "loss": 0.0641, "num_input_tokens_seen": 17298096, "step": 19350 }, { "epoch": 5.108222251550746, "grad_norm": 0.3959363102912903, "learning_rate": 2.6267873907160807e-05, "loss": 0.0718, "num_input_tokens_seen": 17302672, "step": 19355 }, { "epoch": 5.109542035106243, "grad_norm": 0.2679448425769806, "learning_rate": 2.6258068966046668e-05, "loss": 0.0623, "num_input_tokens_seen": 17307568, "step": 19360 }, { "epoch": 5.110861818661739, "grad_norm": 0.08793846517801285, "learning_rate": 2.6248263830922475e-05, "loss": 0.0411, "num_input_tokens_seen": 17312176, "step": 19365 }, { "epoch": 5.112181602217237, "grad_norm": 0.10815593600273132, "learning_rate": 2.6238458503300318e-05, "loss": 0.0866, "num_input_tokens_seen": 17316688, "step": 19370 }, { "epoch": 5.113501385772733, "grad_norm": 0.08007234334945679, "learning_rate": 2.6228652984692292e-05, "loss": 0.0232, "num_input_tokens_seen": 17320880, "step": 19375 }, { "epoch": 5.114821169328231, "grad_norm": 0.34647461771965027, "learning_rate": 2.621884727661054e-05, "loss": 0.1262, "num_input_tokens_seen": 17325104, "step": 19380 }, { "epoch": 5.116140952883727, "grad_norm": 0.23384162783622742, "learning_rate": 2.6209041380567222e-05, "loss": 0.0529, "num_input_tokens_seen": 17329360, "step": 19385 }, { "epoch": 5.117460736439224, "grad_norm": 0.2360575944185257, "learning_rate": 2.6199235298074527e-05, "loss": 0.0798, "num_input_tokens_seen": 17333776, "step": 19390 }, { "epoch": 5.118780519994721, "grad_norm": 0.09644157439470291, "learning_rate": 2.618942903064468e-05, "loss": 0.0519, "num_input_tokens_seen": 17338544, "step": 19395 }, { "epoch": 5.120100303550218, "grad_norm": 0.1589479297399521, "learning_rate": 2.6179622579789932e-05, "loss": 0.0418, "num_input_tokens_seen": 17342800, "step": 19400 }, { "epoch": 5.120100303550218, "eval_loss": 0.06855374574661255, "eval_runtime": 64.7426, "eval_samples_per_second": 104.027, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 17342800, "step": 19400 }, { "epoch": 5.121420087105714, "grad_norm": 0.31678342819213867, "learning_rate": 2.6169815947022553e-05, "loss": 0.043, "num_input_tokens_seen": 17347152, "step": 19405 }, { "epoch": 5.122739870661212, "grad_norm": 0.14495675265789032, "learning_rate": 2.6160009133854853e-05, "loss": 0.0402, "num_input_tokens_seen": 17351984, "step": 19410 }, { "epoch": 5.124059654216708, "grad_norm": 0.24549011886119843, "learning_rate": 2.6150202141799168e-05, "loss": 0.0839, "num_input_tokens_seen": 17356432, "step": 19415 }, { "epoch": 5.125379437772206, "grad_norm": 0.1788676381111145, "learning_rate": 2.614039497236786e-05, "loss": 0.1072, "num_input_tokens_seen": 17361104, "step": 19420 }, { "epoch": 5.126699221327702, "grad_norm": 0.2293304204940796, "learning_rate": 2.6130587627073315e-05, "loss": 0.0394, "num_input_tokens_seen": 17365488, "step": 19425 }, { "epoch": 5.128019004883199, "grad_norm": 0.10142004489898682, "learning_rate": 2.6120780107427956e-05, "loss": 0.0286, "num_input_tokens_seen": 17369808, "step": 19430 }, { "epoch": 5.129338788438696, "grad_norm": 0.41322341561317444, "learning_rate": 2.6110972414944214e-05, "loss": 0.038, "num_input_tokens_seen": 17374256, "step": 19435 }, { "epoch": 5.130658571994193, "grad_norm": 0.026502422988414764, "learning_rate": 2.6101164551134565e-05, "loss": 0.0491, "num_input_tokens_seen": 17378800, "step": 19440 }, { "epoch": 5.131978355549689, "grad_norm": 0.26221224665641785, "learning_rate": 2.6091356517511505e-05, "loss": 0.0498, "num_input_tokens_seen": 17383248, "step": 19445 }, { "epoch": 5.133298139105187, "grad_norm": 0.23653709888458252, "learning_rate": 2.608154831558755e-05, "loss": 0.0823, "num_input_tokens_seen": 17387536, "step": 19450 }, { "epoch": 5.134617922660683, "grad_norm": 0.10292716324329376, "learning_rate": 2.607173994687526e-05, "loss": 0.0547, "num_input_tokens_seen": 17392208, "step": 19455 }, { "epoch": 5.135937706216181, "grad_norm": 0.22351998090744019, "learning_rate": 2.6061931412887196e-05, "loss": 0.0505, "num_input_tokens_seen": 17396816, "step": 19460 }, { "epoch": 5.137257489771677, "grad_norm": 0.2204088568687439, "learning_rate": 2.6052122715135973e-05, "loss": 0.0618, "num_input_tokens_seen": 17401232, "step": 19465 }, { "epoch": 5.138577273327174, "grad_norm": 0.2688142657279968, "learning_rate": 2.60423138551342e-05, "loss": 0.0581, "num_input_tokens_seen": 17405648, "step": 19470 }, { "epoch": 5.139897056882671, "grad_norm": 0.08223710209131241, "learning_rate": 2.6032504834394527e-05, "loss": 0.0563, "num_input_tokens_seen": 17410064, "step": 19475 }, { "epoch": 5.141216840438168, "grad_norm": 0.31379571557044983, "learning_rate": 2.602269565442964e-05, "loss": 0.0722, "num_input_tokens_seen": 17414448, "step": 19480 }, { "epoch": 5.142536623993665, "grad_norm": 0.06925041973590851, "learning_rate": 2.6012886316752227e-05, "loss": 0.0605, "num_input_tokens_seen": 17418864, "step": 19485 }, { "epoch": 5.143856407549162, "grad_norm": 0.4157066345214844, "learning_rate": 2.6003076822875018e-05, "loss": 0.0623, "num_input_tokens_seen": 17422960, "step": 19490 }, { "epoch": 5.1451761911046585, "grad_norm": 0.08373609185218811, "learning_rate": 2.5993267174310755e-05, "loss": 0.0334, "num_input_tokens_seen": 17427568, "step": 19495 }, { "epoch": 5.146495974660156, "grad_norm": 0.3729817271232605, "learning_rate": 2.5983457372572218e-05, "loss": 0.0709, "num_input_tokens_seen": 17432400, "step": 19500 }, { "epoch": 5.1478157582156525, "grad_norm": 0.10966972261667252, "learning_rate": 2.597364741917219e-05, "loss": 0.0356, "num_input_tokens_seen": 17436848, "step": 19505 }, { "epoch": 5.14913554177115, "grad_norm": 0.44847801327705383, "learning_rate": 2.5963837315623492e-05, "loss": 0.1241, "num_input_tokens_seen": 17441168, "step": 19510 }, { "epoch": 5.1504553253266465, "grad_norm": 0.186454176902771, "learning_rate": 2.595402706343897e-05, "loss": 0.0319, "num_input_tokens_seen": 17445392, "step": 19515 }, { "epoch": 5.151775108882143, "grad_norm": 0.20327343046665192, "learning_rate": 2.594421666413148e-05, "loss": 0.0699, "num_input_tokens_seen": 17449968, "step": 19520 }, { "epoch": 5.1530948924376405, "grad_norm": 0.1192198246717453, "learning_rate": 2.5934406119213928e-05, "loss": 0.0345, "num_input_tokens_seen": 17454192, "step": 19525 }, { "epoch": 5.154414675993137, "grad_norm": 0.13682861626148224, "learning_rate": 2.5924595430199193e-05, "loss": 0.0447, "num_input_tokens_seen": 17458416, "step": 19530 }, { "epoch": 5.155734459548634, "grad_norm": 0.1316082924604416, "learning_rate": 2.5914784598600238e-05, "loss": 0.0308, "num_input_tokens_seen": 17462672, "step": 19535 }, { "epoch": 5.157054243104131, "grad_norm": 0.2377040833234787, "learning_rate": 2.5904973625930002e-05, "loss": 0.0288, "num_input_tokens_seen": 17467376, "step": 19540 }, { "epoch": 5.158374026659628, "grad_norm": 0.14712467789649963, "learning_rate": 2.5895162513701456e-05, "loss": 0.04, "num_input_tokens_seen": 17471792, "step": 19545 }, { "epoch": 5.159693810215125, "grad_norm": 0.25829941034317017, "learning_rate": 2.5885351263427593e-05, "loss": 0.0818, "num_input_tokens_seen": 17476304, "step": 19550 }, { "epoch": 5.161013593770622, "grad_norm": 0.1244855746626854, "learning_rate": 2.5875539876621448e-05, "loss": 0.039, "num_input_tokens_seen": 17480592, "step": 19555 }, { "epoch": 5.162333377326118, "grad_norm": 0.2959396541118622, "learning_rate": 2.586572835479605e-05, "loss": 0.068, "num_input_tokens_seen": 17485168, "step": 19560 }, { "epoch": 5.163653160881616, "grad_norm": 0.2747514247894287, "learning_rate": 2.585591669946446e-05, "loss": 0.0894, "num_input_tokens_seen": 17489456, "step": 19565 }, { "epoch": 5.164972944437112, "grad_norm": 0.4393116235733032, "learning_rate": 2.5846104912139756e-05, "loss": 0.0995, "num_input_tokens_seen": 17493584, "step": 19570 }, { "epoch": 5.16629272799261, "grad_norm": 0.3752378821372986, "learning_rate": 2.583629299433505e-05, "loss": 0.0802, "num_input_tokens_seen": 17497968, "step": 19575 }, { "epoch": 5.167612511548106, "grad_norm": 0.11787020415067673, "learning_rate": 2.582648094756345e-05, "loss": 0.0787, "num_input_tokens_seen": 17502224, "step": 19580 }, { "epoch": 5.168932295103603, "grad_norm": 0.16222721338272095, "learning_rate": 2.5816668773338098e-05, "loss": 0.1079, "num_input_tokens_seen": 17506768, "step": 19585 }, { "epoch": 5.1702520786591, "grad_norm": 0.06564243882894516, "learning_rate": 2.580685647317216e-05, "loss": 0.0929, "num_input_tokens_seen": 17511056, "step": 19590 }, { "epoch": 5.171571862214597, "grad_norm": 0.594566822052002, "learning_rate": 2.5797044048578818e-05, "loss": 0.0751, "num_input_tokens_seen": 17515568, "step": 19595 }, { "epoch": 5.172891645770093, "grad_norm": 0.18579146265983582, "learning_rate": 2.5787231501071262e-05, "loss": 0.0873, "num_input_tokens_seen": 17520144, "step": 19600 }, { "epoch": 5.172891645770093, "eval_loss": 0.06836747378110886, "eval_runtime": 64.7298, "eval_samples_per_second": 104.048, "eval_steps_per_second": 26.016, "num_input_tokens_seen": 17520144, "step": 19600 }, { "epoch": 5.174211429325591, "grad_norm": 0.2781524658203125, "learning_rate": 2.577741883216272e-05, "loss": 0.0398, "num_input_tokens_seen": 17524592, "step": 19605 }, { "epoch": 5.175531212881087, "grad_norm": 0.08001847565174103, "learning_rate": 2.576760604336642e-05, "loss": 0.0291, "num_input_tokens_seen": 17529072, "step": 19610 }, { "epoch": 5.176850996436585, "grad_norm": 0.21498465538024902, "learning_rate": 2.575779313619563e-05, "loss": 0.0459, "num_input_tokens_seen": 17533776, "step": 19615 }, { "epoch": 5.178170779992081, "grad_norm": 0.10978720337152481, "learning_rate": 2.5747980112163605e-05, "loss": 0.0343, "num_input_tokens_seen": 17537872, "step": 19620 }, { "epoch": 5.179490563547578, "grad_norm": 0.18835125863552094, "learning_rate": 2.5738166972783656e-05, "loss": 0.0695, "num_input_tokens_seen": 17542640, "step": 19625 }, { "epoch": 5.180810347103075, "grad_norm": 0.12763911485671997, "learning_rate": 2.5728353719569075e-05, "loss": 0.0842, "num_input_tokens_seen": 17547088, "step": 19630 }, { "epoch": 5.182130130658572, "grad_norm": 0.15005506575107574, "learning_rate": 2.57185403540332e-05, "loss": 0.051, "num_input_tokens_seen": 17551600, "step": 19635 }, { "epoch": 5.183449914214069, "grad_norm": 0.18843179941177368, "learning_rate": 2.5708726877689375e-05, "loss": 0.0368, "num_input_tokens_seen": 17555888, "step": 19640 }, { "epoch": 5.184769697769566, "grad_norm": 0.027912145480513573, "learning_rate": 2.5698913292050964e-05, "loss": 0.0608, "num_input_tokens_seen": 17560464, "step": 19645 }, { "epoch": 5.186089481325062, "grad_norm": 0.334033340215683, "learning_rate": 2.568909959863133e-05, "loss": 0.1307, "num_input_tokens_seen": 17564784, "step": 19650 }, { "epoch": 5.18740926488056, "grad_norm": 0.1925666481256485, "learning_rate": 2.5679285798943887e-05, "loss": 0.0998, "num_input_tokens_seen": 17569296, "step": 19655 }, { "epoch": 5.188729048436056, "grad_norm": 0.30758169293403625, "learning_rate": 2.5669471894502035e-05, "loss": 0.098, "num_input_tokens_seen": 17573552, "step": 19660 }, { "epoch": 5.190048831991553, "grad_norm": 0.2587870657444, "learning_rate": 2.56596578868192e-05, "loss": 0.033, "num_input_tokens_seen": 17578448, "step": 19665 }, { "epoch": 5.19136861554705, "grad_norm": 0.049315791577100754, "learning_rate": 2.564984377740883e-05, "loss": 0.0622, "num_input_tokens_seen": 17583024, "step": 19670 }, { "epoch": 5.192688399102547, "grad_norm": 0.6486089825630188, "learning_rate": 2.564002956778438e-05, "loss": 0.0902, "num_input_tokens_seen": 17587376, "step": 19675 }, { "epoch": 5.194008182658044, "grad_norm": 0.3426949679851532, "learning_rate": 2.563021525945934e-05, "loss": 0.0532, "num_input_tokens_seen": 17591888, "step": 19680 }, { "epoch": 5.195327966213541, "grad_norm": 0.21619224548339844, "learning_rate": 2.562040085394718e-05, "loss": 0.1097, "num_input_tokens_seen": 17596272, "step": 19685 }, { "epoch": 5.1966477497690375, "grad_norm": 0.04153769835829735, "learning_rate": 2.56105863527614e-05, "loss": 0.0792, "num_input_tokens_seen": 17600720, "step": 19690 }, { "epoch": 5.197967533324535, "grad_norm": 0.2781229615211487, "learning_rate": 2.5600771757415548e-05, "loss": 0.0426, "num_input_tokens_seen": 17605008, "step": 19695 }, { "epoch": 5.1992873168800315, "grad_norm": 0.30697721242904663, "learning_rate": 2.5590957069423134e-05, "loss": 0.0378, "num_input_tokens_seen": 17609392, "step": 19700 }, { "epoch": 5.200607100435529, "grad_norm": 0.34575575590133667, "learning_rate": 2.5581142290297716e-05, "loss": 0.0641, "num_input_tokens_seen": 17613904, "step": 19705 }, { "epoch": 5.2019268839910255, "grad_norm": 0.4239283800125122, "learning_rate": 2.557132742155285e-05, "loss": 0.0874, "num_input_tokens_seen": 17618160, "step": 19710 }, { "epoch": 5.203246667546522, "grad_norm": 0.3491196036338806, "learning_rate": 2.556151246470212e-05, "loss": 0.0436, "num_input_tokens_seen": 17622576, "step": 19715 }, { "epoch": 5.2045664511020195, "grad_norm": 0.052440203726291656, "learning_rate": 2.5551697421259114e-05, "loss": 0.0645, "num_input_tokens_seen": 17627248, "step": 19720 }, { "epoch": 5.205886234657516, "grad_norm": 0.24077579379081726, "learning_rate": 2.554188229273743e-05, "loss": 0.0521, "num_input_tokens_seen": 17631664, "step": 19725 }, { "epoch": 5.207206018213013, "grad_norm": 0.0738476887345314, "learning_rate": 2.5532067080650678e-05, "loss": 0.0397, "num_input_tokens_seen": 17635920, "step": 19730 }, { "epoch": 5.20852580176851, "grad_norm": 0.10256332159042358, "learning_rate": 2.55222517865125e-05, "loss": 0.047, "num_input_tokens_seen": 17640272, "step": 19735 }, { "epoch": 5.209845585324007, "grad_norm": 0.09147744625806808, "learning_rate": 2.5512436411836538e-05, "loss": 0.0652, "num_input_tokens_seen": 17644784, "step": 19740 }, { "epoch": 5.211165368879504, "grad_norm": 0.11400903016328812, "learning_rate": 2.5502620958136443e-05, "loss": 0.0187, "num_input_tokens_seen": 17649456, "step": 19745 }, { "epoch": 5.212485152435001, "grad_norm": 0.41656580567359924, "learning_rate": 2.5492805426925874e-05, "loss": 0.0529, "num_input_tokens_seen": 17653520, "step": 19750 }, { "epoch": 5.213804935990497, "grad_norm": 0.24157501757144928, "learning_rate": 2.5482989819718523e-05, "loss": 0.0929, "num_input_tokens_seen": 17657712, "step": 19755 }, { "epoch": 5.215124719545995, "grad_norm": 0.0967484563589096, "learning_rate": 2.5473174138028065e-05, "loss": 0.0464, "num_input_tokens_seen": 17662224, "step": 19760 }, { "epoch": 5.216444503101491, "grad_norm": 0.261910617351532, "learning_rate": 2.5463358383368212e-05, "loss": 0.0715, "num_input_tokens_seen": 17666896, "step": 19765 }, { "epoch": 5.217764286656989, "grad_norm": 0.10458493232727051, "learning_rate": 2.545354255725267e-05, "loss": 0.0567, "num_input_tokens_seen": 17671536, "step": 19770 }, { "epoch": 5.219084070212485, "grad_norm": 0.11341068893671036, "learning_rate": 2.5443726661195165e-05, "loss": 0.0544, "num_input_tokens_seen": 17675888, "step": 19775 }, { "epoch": 5.220403853767982, "grad_norm": 0.26071885228157043, "learning_rate": 2.543391069670944e-05, "loss": 0.0734, "num_input_tokens_seen": 17680848, "step": 19780 }, { "epoch": 5.221723637323479, "grad_norm": 0.17147307097911835, "learning_rate": 2.5424094665309228e-05, "loss": 0.0478, "num_input_tokens_seen": 17685200, "step": 19785 }, { "epoch": 5.223043420878976, "grad_norm": 0.22167818248271942, "learning_rate": 2.5414278568508292e-05, "loss": 0.0753, "num_input_tokens_seen": 17689488, "step": 19790 }, { "epoch": 5.224363204434472, "grad_norm": 0.278536856174469, "learning_rate": 2.540446240782039e-05, "loss": 0.0554, "num_input_tokens_seen": 17693744, "step": 19795 }, { "epoch": 5.22568298798997, "grad_norm": 0.21481439471244812, "learning_rate": 2.5394646184759307e-05, "loss": 0.0997, "num_input_tokens_seen": 17697936, "step": 19800 }, { "epoch": 5.22568298798997, "eval_loss": 0.06815442442893982, "eval_runtime": 64.7625, "eval_samples_per_second": 103.995, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 17697936, "step": 19800 }, { "epoch": 5.227002771545466, "grad_norm": 0.04391780495643616, "learning_rate": 2.538482990083882e-05, "loss": 0.0361, "num_input_tokens_seen": 17702416, "step": 19805 }, { "epoch": 5.228322555100964, "grad_norm": 0.5930991172790527, "learning_rate": 2.5375013557572725e-05, "loss": 0.0504, "num_input_tokens_seen": 17707216, "step": 19810 }, { "epoch": 5.22964233865646, "grad_norm": 0.10810256004333496, "learning_rate": 2.536519715647483e-05, "loss": 0.0554, "num_input_tokens_seen": 17712048, "step": 19815 }, { "epoch": 5.230962122211957, "grad_norm": 0.14258959889411926, "learning_rate": 2.535538069905894e-05, "loss": 0.1121, "num_input_tokens_seen": 17716528, "step": 19820 }, { "epoch": 5.232281905767454, "grad_norm": 0.0782979428768158, "learning_rate": 2.534556418683888e-05, "loss": 0.057, "num_input_tokens_seen": 17720784, "step": 19825 }, { "epoch": 5.233601689322951, "grad_norm": 0.19077028334140778, "learning_rate": 2.5335747621328486e-05, "loss": 0.0215, "num_input_tokens_seen": 17725072, "step": 19830 }, { "epoch": 5.234921472878448, "grad_norm": 0.06264678388834, "learning_rate": 2.5325931004041586e-05, "loss": 0.0619, "num_input_tokens_seen": 17729552, "step": 19835 }, { "epoch": 5.236241256433945, "grad_norm": 0.1429455280303955, "learning_rate": 2.5316114336492032e-05, "loss": 0.0936, "num_input_tokens_seen": 17734000, "step": 19840 }, { "epoch": 5.2375610399894414, "grad_norm": 0.11153917759656906, "learning_rate": 2.530629762019367e-05, "loss": 0.0537, "num_input_tokens_seen": 17738576, "step": 19845 }, { "epoch": 5.238880823544939, "grad_norm": 0.0435015968978405, "learning_rate": 2.5296480856660364e-05, "loss": 0.0477, "num_input_tokens_seen": 17742992, "step": 19850 }, { "epoch": 5.2402006071004354, "grad_norm": 0.19637437164783478, "learning_rate": 2.528666404740599e-05, "loss": 0.0988, "num_input_tokens_seen": 17747440, "step": 19855 }, { "epoch": 5.241520390655932, "grad_norm": 0.2633301615715027, "learning_rate": 2.527684719394442e-05, "loss": 0.0375, "num_input_tokens_seen": 17751984, "step": 19860 }, { "epoch": 5.2428401742114294, "grad_norm": 0.21777556836605072, "learning_rate": 2.526703029778953e-05, "loss": 0.0687, "num_input_tokens_seen": 17756368, "step": 19865 }, { "epoch": 5.244159957766926, "grad_norm": 0.16902273893356323, "learning_rate": 2.5257213360455208e-05, "loss": 0.0654, "num_input_tokens_seen": 17761008, "step": 19870 }, { "epoch": 5.2454797413224235, "grad_norm": 0.09370040148496628, "learning_rate": 2.5247396383455353e-05, "loss": 0.0344, "num_input_tokens_seen": 17765360, "step": 19875 }, { "epoch": 5.24679952487792, "grad_norm": 0.2592080235481262, "learning_rate": 2.523757936830387e-05, "loss": 0.0478, "num_input_tokens_seen": 17769616, "step": 19880 }, { "epoch": 5.248119308433417, "grad_norm": 0.060568470507860184, "learning_rate": 2.5227762316514662e-05, "loss": 0.0214, "num_input_tokens_seen": 17774096, "step": 19885 }, { "epoch": 5.249439091988914, "grad_norm": 0.20296640694141388, "learning_rate": 2.5217945229601648e-05, "loss": 0.0294, "num_input_tokens_seen": 17778512, "step": 19890 }, { "epoch": 5.250758875544411, "grad_norm": 0.057543933391571045, "learning_rate": 2.5208128109078738e-05, "loss": 0.0146, "num_input_tokens_seen": 17782832, "step": 19895 }, { "epoch": 5.252078659099908, "grad_norm": 0.2966860234737396, "learning_rate": 2.5198310956459853e-05, "loss": 0.0608, "num_input_tokens_seen": 17787504, "step": 19900 }, { "epoch": 5.253398442655405, "grad_norm": 0.10221467912197113, "learning_rate": 2.518849377325893e-05, "loss": 0.0883, "num_input_tokens_seen": 17791920, "step": 19905 }, { "epoch": 5.254718226210901, "grad_norm": 0.15956884622573853, "learning_rate": 2.51786765609899e-05, "loss": 0.0781, "num_input_tokens_seen": 17796688, "step": 19910 }, { "epoch": 5.256038009766399, "grad_norm": 0.044606950134038925, "learning_rate": 2.5168859321166694e-05, "loss": 0.0849, "num_input_tokens_seen": 17801072, "step": 19915 }, { "epoch": 5.257357793321895, "grad_norm": 0.21111707389354706, "learning_rate": 2.515904205530326e-05, "loss": 0.0746, "num_input_tokens_seen": 17805200, "step": 19920 }, { "epoch": 5.258677576877393, "grad_norm": 0.1135886162519455, "learning_rate": 2.514922476491355e-05, "loss": 0.0471, "num_input_tokens_seen": 17809552, "step": 19925 }, { "epoch": 5.259997360432889, "grad_norm": 0.2286888062953949, "learning_rate": 2.51394074515115e-05, "loss": 0.0531, "num_input_tokens_seen": 17814064, "step": 19930 }, { "epoch": 5.261317143988386, "grad_norm": 0.41446393728256226, "learning_rate": 2.5129590116611067e-05, "loss": 0.0922, "num_input_tokens_seen": 17818256, "step": 19935 }, { "epoch": 5.262636927543883, "grad_norm": 0.27736788988113403, "learning_rate": 2.5119772761726212e-05, "loss": 0.0462, "num_input_tokens_seen": 17822768, "step": 19940 }, { "epoch": 5.26395671109938, "grad_norm": 0.03485806658864021, "learning_rate": 2.5109955388370893e-05, "loss": 0.0367, "num_input_tokens_seen": 17827312, "step": 19945 }, { "epoch": 5.265276494654876, "grad_norm": 0.20494556427001953, "learning_rate": 2.510013799805907e-05, "loss": 0.06, "num_input_tokens_seen": 17831760, "step": 19950 }, { "epoch": 5.266596278210374, "grad_norm": 0.27125608921051025, "learning_rate": 2.5090320592304706e-05, "loss": 0.0524, "num_input_tokens_seen": 17836336, "step": 19955 }, { "epoch": 5.26791606176587, "grad_norm": 0.18720531463623047, "learning_rate": 2.5080503172621777e-05, "loss": 0.0193, "num_input_tokens_seen": 17840720, "step": 19960 }, { "epoch": 5.269235845321368, "grad_norm": 0.11960024386644363, "learning_rate": 2.5070685740524246e-05, "loss": 0.1166, "num_input_tokens_seen": 17845008, "step": 19965 }, { "epoch": 5.270555628876864, "grad_norm": 0.07140699774026871, "learning_rate": 2.5060868297526084e-05, "loss": 0.0706, "num_input_tokens_seen": 17849584, "step": 19970 }, { "epoch": 5.271875412432361, "grad_norm": 0.5706459283828735, "learning_rate": 2.5051050845141267e-05, "loss": 0.0943, "num_input_tokens_seen": 17853776, "step": 19975 }, { "epoch": 5.273195195987858, "grad_norm": 0.22265766561031342, "learning_rate": 2.5041233384883765e-05, "loss": 0.0381, "num_input_tokens_seen": 17858512, "step": 19980 }, { "epoch": 5.274514979543355, "grad_norm": 0.1970408856868744, "learning_rate": 2.5031415918267564e-05, "loss": 0.0529, "num_input_tokens_seen": 17863056, "step": 19985 }, { "epoch": 5.275834763098851, "grad_norm": 0.21469996869564056, "learning_rate": 2.5021598446806626e-05, "loss": 0.0309, "num_input_tokens_seen": 17867440, "step": 19990 }, { "epoch": 5.277154546654349, "grad_norm": 0.1320730745792389, "learning_rate": 2.5011780972014937e-05, "loss": 0.0675, "num_input_tokens_seen": 17871920, "step": 19995 }, { "epoch": 5.278474330209845, "grad_norm": 0.28926241397857666, "learning_rate": 2.5001963495406478e-05, "loss": 0.0607, "num_input_tokens_seen": 17876496, "step": 20000 }, { "epoch": 5.278474330209845, "eval_loss": 0.06812906265258789, "eval_runtime": 64.813, "eval_samples_per_second": 103.914, "eval_steps_per_second": 25.982, "num_input_tokens_seen": 17876496, "step": 20000 }, { "epoch": 5.279794113765343, "grad_norm": 0.12972353398799896, "learning_rate": 2.499214601849522e-05, "loss": 0.0655, "num_input_tokens_seen": 17881040, "step": 20005 }, { "epoch": 5.281113897320839, "grad_norm": 0.12967248260974884, "learning_rate": 2.4982328542795148e-05, "loss": 0.0442, "num_input_tokens_seen": 17885680, "step": 20010 }, { "epoch": 5.282433680876336, "grad_norm": 0.2909599840641022, "learning_rate": 2.497251106982024e-05, "loss": 0.0664, "num_input_tokens_seen": 17890416, "step": 20015 }, { "epoch": 5.283753464431833, "grad_norm": 0.06966737657785416, "learning_rate": 2.4962693601084458e-05, "loss": 0.0534, "num_input_tokens_seen": 17894928, "step": 20020 }, { "epoch": 5.28507324798733, "grad_norm": 0.15251995623111725, "learning_rate": 2.4952876138101794e-05, "loss": 0.0153, "num_input_tokens_seen": 17899632, "step": 20025 }, { "epoch": 5.286393031542827, "grad_norm": 0.49357345700263977, "learning_rate": 2.4943058682386233e-05, "loss": 0.0975, "num_input_tokens_seen": 17904048, "step": 20030 }, { "epoch": 5.287712815098324, "grad_norm": 0.2558630108833313, "learning_rate": 2.493324123545173e-05, "loss": 0.0773, "num_input_tokens_seen": 17908464, "step": 20035 }, { "epoch": 5.2890325986538205, "grad_norm": 0.38097602128982544, "learning_rate": 2.4923423798812272e-05, "loss": 0.0794, "num_input_tokens_seen": 17912944, "step": 20040 }, { "epoch": 5.290352382209318, "grad_norm": 0.28355035185813904, "learning_rate": 2.4913606373981825e-05, "loss": 0.0746, "num_input_tokens_seen": 17917360, "step": 20045 }, { "epoch": 5.2916721657648145, "grad_norm": 0.08607621490955353, "learning_rate": 2.4903788962474357e-05, "loss": 0.0388, "num_input_tokens_seen": 17921840, "step": 20050 }, { "epoch": 5.292991949320312, "grad_norm": 0.0862642303109169, "learning_rate": 2.489397156580385e-05, "loss": 0.0495, "num_input_tokens_seen": 17926192, "step": 20055 }, { "epoch": 5.2943117328758085, "grad_norm": 0.09222869575023651, "learning_rate": 2.4884154185484246e-05, "loss": 0.062, "num_input_tokens_seen": 17931024, "step": 20060 }, { "epoch": 5.295631516431305, "grad_norm": 0.08120808750391006, "learning_rate": 2.4874336823029526e-05, "loss": 0.0804, "num_input_tokens_seen": 17935344, "step": 20065 }, { "epoch": 5.2969512999868025, "grad_norm": 0.1858755201101303, "learning_rate": 2.4864519479953656e-05, "loss": 0.0607, "num_input_tokens_seen": 17939728, "step": 20070 }, { "epoch": 5.298271083542299, "grad_norm": 0.06332714110612869, "learning_rate": 2.485470215777058e-05, "loss": 0.0554, "num_input_tokens_seen": 17944368, "step": 20075 }, { "epoch": 5.299590867097796, "grad_norm": 0.17160849273204803, "learning_rate": 2.4844884857994258e-05, "loss": 0.0404, "num_input_tokens_seen": 17948336, "step": 20080 }, { "epoch": 5.300910650653293, "grad_norm": 0.20850728452205658, "learning_rate": 2.4835067582138638e-05, "loss": 0.0727, "num_input_tokens_seen": 17952496, "step": 20085 }, { "epoch": 5.30223043420879, "grad_norm": 0.10861066728830338, "learning_rate": 2.4825250331717666e-05, "loss": 0.0534, "num_input_tokens_seen": 17957168, "step": 20090 }, { "epoch": 5.303550217764287, "grad_norm": 0.07189831882715225, "learning_rate": 2.4815433108245298e-05, "loss": 0.0805, "num_input_tokens_seen": 17962064, "step": 20095 }, { "epoch": 5.304870001319784, "grad_norm": 0.21253249049186707, "learning_rate": 2.4805615913235456e-05, "loss": 0.053, "num_input_tokens_seen": 17966480, "step": 20100 }, { "epoch": 5.30618978487528, "grad_norm": 0.049017295241355896, "learning_rate": 2.479579874820208e-05, "loss": 0.0372, "num_input_tokens_seen": 17970960, "step": 20105 }, { "epoch": 5.307509568430778, "grad_norm": 0.04952039197087288, "learning_rate": 2.4785981614659115e-05, "loss": 0.0194, "num_input_tokens_seen": 17975440, "step": 20110 }, { "epoch": 5.308829351986274, "grad_norm": 0.23236462473869324, "learning_rate": 2.477616451412047e-05, "loss": 0.0528, "num_input_tokens_seen": 17979824, "step": 20115 }, { "epoch": 5.310149135541771, "grad_norm": 0.2543785274028778, "learning_rate": 2.476634744810007e-05, "loss": 0.1068, "num_input_tokens_seen": 17984336, "step": 20120 }, { "epoch": 5.311468919097268, "grad_norm": 0.03549548238515854, "learning_rate": 2.475653041811183e-05, "loss": 0.0357, "num_input_tokens_seen": 17988816, "step": 20125 }, { "epoch": 5.312788702652765, "grad_norm": 0.07516444474458694, "learning_rate": 2.4746713425669652e-05, "loss": 0.0157, "num_input_tokens_seen": 17993232, "step": 20130 }, { "epoch": 5.314108486208262, "grad_norm": 0.3256787359714508, "learning_rate": 2.4736896472287458e-05, "loss": 0.0545, "num_input_tokens_seen": 17997584, "step": 20135 }, { "epoch": 5.315428269763759, "grad_norm": 0.3003339469432831, "learning_rate": 2.4727079559479124e-05, "loss": 0.063, "num_input_tokens_seen": 18001936, "step": 20140 }, { "epoch": 5.316748053319255, "grad_norm": 0.2503361105918884, "learning_rate": 2.4717262688758557e-05, "loss": 0.1288, "num_input_tokens_seen": 18006128, "step": 20145 }, { "epoch": 5.318067836874753, "grad_norm": 0.49126237630844116, "learning_rate": 2.4707445861639637e-05, "loss": 0.0738, "num_input_tokens_seen": 18010320, "step": 20150 }, { "epoch": 5.319387620430249, "grad_norm": 0.04810823127627373, "learning_rate": 2.4697629079636244e-05, "loss": 0.043, "num_input_tokens_seen": 18014704, "step": 20155 }, { "epoch": 5.320707403985747, "grad_norm": 0.06196076050400734, "learning_rate": 2.4687812344262244e-05, "loss": 0.0336, "num_input_tokens_seen": 18019152, "step": 20160 }, { "epoch": 5.322027187541243, "grad_norm": 0.26994767785072327, "learning_rate": 2.46779956570315e-05, "loss": 0.0924, "num_input_tokens_seen": 18023600, "step": 20165 }, { "epoch": 5.32334697109674, "grad_norm": 0.170233353972435, "learning_rate": 2.466817901945787e-05, "loss": 0.041, "num_input_tokens_seen": 18028016, "step": 20170 }, { "epoch": 5.324666754652237, "grad_norm": 0.03940499201416969, "learning_rate": 2.4658362433055217e-05, "loss": 0.074, "num_input_tokens_seen": 18032592, "step": 20175 }, { "epoch": 5.325986538207734, "grad_norm": 0.28275907039642334, "learning_rate": 2.4648545899337356e-05, "loss": 0.0558, "num_input_tokens_seen": 18036912, "step": 20180 }, { "epoch": 5.327306321763231, "grad_norm": 0.059048958122730255, "learning_rate": 2.4638729419818143e-05, "loss": 0.0653, "num_input_tokens_seen": 18041328, "step": 20185 }, { "epoch": 5.328626105318728, "grad_norm": 0.3982534110546112, "learning_rate": 2.46289129960114e-05, "loss": 0.0843, "num_input_tokens_seen": 18046032, "step": 20190 }, { "epoch": 5.329945888874224, "grad_norm": 0.24517352879047394, "learning_rate": 2.4619096629430924e-05, "loss": 0.0375, "num_input_tokens_seen": 18050256, "step": 20195 }, { "epoch": 5.331265672429722, "grad_norm": 0.2974519431591034, "learning_rate": 2.4609280321590543e-05, "loss": 0.0185, "num_input_tokens_seen": 18054800, "step": 20200 }, { "epoch": 5.331265672429722, "eval_loss": 0.06868622452020645, "eval_runtime": 64.778, "eval_samples_per_second": 103.971, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 18054800, "step": 20200 }, { "epoch": 5.332585455985218, "grad_norm": 0.30404600501060486, "learning_rate": 2.4599464074004037e-05, "loss": 0.0715, "num_input_tokens_seen": 18059056, "step": 20205 }, { "epoch": 5.333905239540715, "grad_norm": 0.39369818568229675, "learning_rate": 2.4589647888185204e-05, "loss": 0.1144, "num_input_tokens_seen": 18063504, "step": 20210 }, { "epoch": 5.335225023096212, "grad_norm": 0.2401280701160431, "learning_rate": 2.4579831765647836e-05, "loss": 0.0615, "num_input_tokens_seen": 18067856, "step": 20215 }, { "epoch": 5.336544806651709, "grad_norm": 0.15014079213142395, "learning_rate": 2.4570015707905676e-05, "loss": 0.1261, "num_input_tokens_seen": 18072144, "step": 20220 }, { "epoch": 5.337864590207206, "grad_norm": 0.30884531140327454, "learning_rate": 2.4560199716472508e-05, "loss": 0.0483, "num_input_tokens_seen": 18076688, "step": 20225 }, { "epoch": 5.339184373762703, "grad_norm": 0.046297959983348846, "learning_rate": 2.455038379286207e-05, "loss": 0.0602, "num_input_tokens_seen": 18081104, "step": 20230 }, { "epoch": 5.3405041573181995, "grad_norm": 0.2118537724018097, "learning_rate": 2.4540567938588095e-05, "loss": 0.0737, "num_input_tokens_seen": 18085680, "step": 20235 }, { "epoch": 5.341823940873697, "grad_norm": 0.08932926505804062, "learning_rate": 2.4530752155164328e-05, "loss": 0.0876, "num_input_tokens_seen": 18089808, "step": 20240 }, { "epoch": 5.3431437244291935, "grad_norm": 0.24854230880737305, "learning_rate": 2.4520936444104463e-05, "loss": 0.09, "num_input_tokens_seen": 18094224, "step": 20245 }, { "epoch": 5.34446350798469, "grad_norm": 0.23663578927516937, "learning_rate": 2.4511120806922218e-05, "loss": 0.0388, "num_input_tokens_seen": 18098640, "step": 20250 }, { "epoch": 5.3457832915401875, "grad_norm": 0.28054967522621155, "learning_rate": 2.45013052451313e-05, "loss": 0.0991, "num_input_tokens_seen": 18103088, "step": 20255 }, { "epoch": 5.347103075095684, "grad_norm": 0.14086765050888062, "learning_rate": 2.4491489760245376e-05, "loss": 0.0706, "num_input_tokens_seen": 18107536, "step": 20260 }, { "epoch": 5.3484228586511815, "grad_norm": 0.3821715712547302, "learning_rate": 2.4481674353778115e-05, "loss": 0.0473, "num_input_tokens_seen": 18111824, "step": 20265 }, { "epoch": 5.349742642206678, "grad_norm": 0.3472025394439697, "learning_rate": 2.447185902724319e-05, "loss": 0.0294, "num_input_tokens_seen": 18116048, "step": 20270 }, { "epoch": 5.351062425762175, "grad_norm": 0.4772235155105591, "learning_rate": 2.4462043782154233e-05, "loss": 0.0801, "num_input_tokens_seen": 18120368, "step": 20275 }, { "epoch": 5.352382209317672, "grad_norm": 0.2287965565919876, "learning_rate": 2.4452228620024895e-05, "loss": 0.1044, "num_input_tokens_seen": 18124880, "step": 20280 }, { "epoch": 5.353701992873169, "grad_norm": 0.052422914654016495, "learning_rate": 2.4442413542368776e-05, "loss": 0.0989, "num_input_tokens_seen": 18129456, "step": 20285 }, { "epoch": 5.355021776428666, "grad_norm": 0.34070533514022827, "learning_rate": 2.4432598550699502e-05, "loss": 0.0809, "num_input_tokens_seen": 18133616, "step": 20290 }, { "epoch": 5.356341559984163, "grad_norm": 0.3570539355278015, "learning_rate": 2.4422783646530663e-05, "loss": 0.0974, "num_input_tokens_seen": 18138032, "step": 20295 }, { "epoch": 5.357661343539659, "grad_norm": 0.20943960547447205, "learning_rate": 2.441296883137584e-05, "loss": 0.049, "num_input_tokens_seen": 18142480, "step": 20300 }, { "epoch": 5.358981127095157, "grad_norm": 0.1433006376028061, "learning_rate": 2.4403154106748592e-05, "loss": 0.0306, "num_input_tokens_seen": 18147184, "step": 20305 }, { "epoch": 5.360300910650653, "grad_norm": 0.20047394931316376, "learning_rate": 2.4393339474162494e-05, "loss": 0.0681, "num_input_tokens_seen": 18151600, "step": 20310 }, { "epoch": 5.361620694206151, "grad_norm": 0.154203400015831, "learning_rate": 2.4383524935131062e-05, "loss": 0.0405, "num_input_tokens_seen": 18155792, "step": 20315 }, { "epoch": 5.362940477761647, "grad_norm": 0.1663346141576767, "learning_rate": 2.437371049116784e-05, "loss": 0.0482, "num_input_tokens_seen": 18159952, "step": 20320 }, { "epoch": 5.364260261317144, "grad_norm": 0.24978007376194, "learning_rate": 2.436389614378632e-05, "loss": 0.0443, "num_input_tokens_seen": 18164272, "step": 20325 }, { "epoch": 5.365580044872641, "grad_norm": 0.23309314250946045, "learning_rate": 2.435408189450002e-05, "loss": 0.0654, "num_input_tokens_seen": 18168976, "step": 20330 }, { "epoch": 5.366899828428138, "grad_norm": 0.24592380225658417, "learning_rate": 2.4344267744822406e-05, "loss": 0.0318, "num_input_tokens_seen": 18173456, "step": 20335 }, { "epoch": 5.368219611983634, "grad_norm": 0.26433998346328735, "learning_rate": 2.4334453696266944e-05, "loss": 0.0782, "num_input_tokens_seen": 18177936, "step": 20340 }, { "epoch": 5.369539395539132, "grad_norm": 0.2110581248998642, "learning_rate": 2.432463975034708e-05, "loss": 0.0645, "num_input_tokens_seen": 18182320, "step": 20345 }, { "epoch": 5.370859179094628, "grad_norm": 0.08799154311418533, "learning_rate": 2.4314825908576265e-05, "loss": 0.0794, "num_input_tokens_seen": 18186768, "step": 20350 }, { "epoch": 5.372178962650126, "grad_norm": 0.4860915243625641, "learning_rate": 2.4305012172467897e-05, "loss": 0.067, "num_input_tokens_seen": 18191504, "step": 20355 }, { "epoch": 5.373498746205622, "grad_norm": 0.04163866490125656, "learning_rate": 2.4295198543535393e-05, "loss": 0.0435, "num_input_tokens_seen": 18195824, "step": 20360 }, { "epoch": 5.374818529761119, "grad_norm": 0.05976337939500809, "learning_rate": 2.4285385023292124e-05, "loss": 0.0492, "num_input_tokens_seen": 18200176, "step": 20365 }, { "epoch": 5.376138313316616, "grad_norm": 0.14654403924942017, "learning_rate": 2.427557161325147e-05, "loss": 0.0184, "num_input_tokens_seen": 18204624, "step": 20370 }, { "epoch": 5.377458096872113, "grad_norm": 0.21097059547901154, "learning_rate": 2.4265758314926778e-05, "loss": 0.0444, "num_input_tokens_seen": 18209136, "step": 20375 }, { "epoch": 5.378777880427609, "grad_norm": 0.2892642617225647, "learning_rate": 2.4255945129831373e-05, "loss": 0.0647, "num_input_tokens_seen": 18213712, "step": 20380 }, { "epoch": 5.380097663983107, "grad_norm": 0.3796842396259308, "learning_rate": 2.4246132059478578e-05, "loss": 0.0848, "num_input_tokens_seen": 18218352, "step": 20385 }, { "epoch": 5.381417447538603, "grad_norm": 0.15289348363876343, "learning_rate": 2.4236319105381706e-05, "loss": 0.0332, "num_input_tokens_seen": 18223088, "step": 20390 }, { "epoch": 5.382737231094101, "grad_norm": 0.5039880871772766, "learning_rate": 2.422650626905401e-05, "loss": 0.1049, "num_input_tokens_seen": 18227152, "step": 20395 }, { "epoch": 5.384057014649597, "grad_norm": 0.13780230283737183, "learning_rate": 2.4216693552008785e-05, "loss": 0.064, "num_input_tokens_seen": 18232176, "step": 20400 }, { "epoch": 5.384057014649597, "eval_loss": 0.06840289384126663, "eval_runtime": 64.7619, "eval_samples_per_second": 103.996, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 18232176, "step": 20400 }, { "epoch": 5.385376798205094, "grad_norm": 0.13231101632118225, "learning_rate": 2.4206880955759247e-05, "loss": 0.0654, "num_input_tokens_seen": 18236432, "step": 20405 }, { "epoch": 5.386696581760591, "grad_norm": 0.1054055392742157, "learning_rate": 2.419706848181863e-05, "loss": 0.0507, "num_input_tokens_seen": 18241136, "step": 20410 }, { "epoch": 5.388016365316088, "grad_norm": 0.2758007049560547, "learning_rate": 2.4187256131700153e-05, "loss": 0.0492, "num_input_tokens_seen": 18245744, "step": 20415 }, { "epoch": 5.389336148871585, "grad_norm": 0.08499789983034134, "learning_rate": 2.4177443906916985e-05, "loss": 0.0577, "num_input_tokens_seen": 18250160, "step": 20420 }, { "epoch": 5.390655932427082, "grad_norm": 0.14825163781642914, "learning_rate": 2.4167631808982303e-05, "loss": 0.1089, "num_input_tokens_seen": 18254416, "step": 20425 }, { "epoch": 5.3919757159825785, "grad_norm": 0.19929571449756622, "learning_rate": 2.4157819839409264e-05, "loss": 0.0332, "num_input_tokens_seen": 18258928, "step": 20430 }, { "epoch": 5.393295499538076, "grad_norm": 0.10604074597358704, "learning_rate": 2.414800799971098e-05, "loss": 0.0685, "num_input_tokens_seen": 18263344, "step": 20435 }, { "epoch": 5.3946152830935725, "grad_norm": 0.17447803914546967, "learning_rate": 2.4138196291400582e-05, "loss": 0.0942, "num_input_tokens_seen": 18267696, "step": 20440 }, { "epoch": 5.39593506664907, "grad_norm": 0.19025644659996033, "learning_rate": 2.412838471599114e-05, "loss": 0.0527, "num_input_tokens_seen": 18272240, "step": 20445 }, { "epoch": 5.3972548502045665, "grad_norm": 0.7038541436195374, "learning_rate": 2.411857327499572e-05, "loss": 0.0853, "num_input_tokens_seen": 18276880, "step": 20450 }, { "epoch": 5.398574633760063, "grad_norm": 0.2678239345550537, "learning_rate": 2.410876196992739e-05, "loss": 0.1093, "num_input_tokens_seen": 18281456, "step": 20455 }, { "epoch": 5.3998944173155605, "grad_norm": 0.10129847377538681, "learning_rate": 2.4098950802299156e-05, "loss": 0.0386, "num_input_tokens_seen": 18285776, "step": 20460 }, { "epoch": 5.401214200871057, "grad_norm": 0.47548508644104004, "learning_rate": 2.4089139773624027e-05, "loss": 0.028, "num_input_tokens_seen": 18290320, "step": 20465 }, { "epoch": 5.402533984426554, "grad_norm": 0.10117387026548386, "learning_rate": 2.4079328885415007e-05, "loss": 0.0785, "num_input_tokens_seen": 18294608, "step": 20470 }, { "epoch": 5.403853767982051, "grad_norm": 0.05529160052537918, "learning_rate": 2.4069518139185036e-05, "loss": 0.054, "num_input_tokens_seen": 18298928, "step": 20475 }, { "epoch": 5.405173551537548, "grad_norm": 0.02918175421655178, "learning_rate": 2.405970753644706e-05, "loss": 0.099, "num_input_tokens_seen": 18303600, "step": 20480 }, { "epoch": 5.406493335093045, "grad_norm": 0.34676364064216614, "learning_rate": 2.4049897078714e-05, "loss": 0.0736, "num_input_tokens_seen": 18308304, "step": 20485 }, { "epoch": 5.407813118648542, "grad_norm": 0.492438405752182, "learning_rate": 2.404008676749874e-05, "loss": 0.1382, "num_input_tokens_seen": 18312912, "step": 20490 }, { "epoch": 5.409132902204038, "grad_norm": 0.36976322531700134, "learning_rate": 2.403027660431418e-05, "loss": 0.0838, "num_input_tokens_seen": 18317296, "step": 20495 }, { "epoch": 5.410452685759536, "grad_norm": 0.11185924708843231, "learning_rate": 2.402046659067314e-05, "loss": 0.0256, "num_input_tokens_seen": 18321968, "step": 20500 }, { "epoch": 5.411772469315032, "grad_norm": 0.03092212975025177, "learning_rate": 2.401065672808847e-05, "loss": 0.0428, "num_input_tokens_seen": 18326448, "step": 20505 }, { "epoch": 5.41309225287053, "grad_norm": 0.04172932729125023, "learning_rate": 2.400084701807296e-05, "loss": 0.0259, "num_input_tokens_seen": 18331216, "step": 20510 }, { "epoch": 5.414412036426026, "grad_norm": 0.13894903659820557, "learning_rate": 2.39910374621394e-05, "loss": 0.1127, "num_input_tokens_seen": 18336016, "step": 20515 }, { "epoch": 5.415731819981523, "grad_norm": 0.35350263118743896, "learning_rate": 2.3981228061800544e-05, "loss": 0.0385, "num_input_tokens_seen": 18340368, "step": 20520 }, { "epoch": 5.41705160353702, "grad_norm": 0.0676608756184578, "learning_rate": 2.3971418818569115e-05, "loss": 0.0505, "num_input_tokens_seen": 18344784, "step": 20525 }, { "epoch": 5.418371387092517, "grad_norm": 0.40052247047424316, "learning_rate": 2.3961609733957832e-05, "loss": 0.0931, "num_input_tokens_seen": 18349136, "step": 20530 }, { "epoch": 5.419691170648013, "grad_norm": 0.44407305121421814, "learning_rate": 2.395180080947939e-05, "loss": 0.0811, "num_input_tokens_seen": 18353872, "step": 20535 }, { "epoch": 5.421010954203511, "grad_norm": 0.12540219724178314, "learning_rate": 2.394199204664642e-05, "loss": 0.0832, "num_input_tokens_seen": 18358256, "step": 20540 }, { "epoch": 5.422330737759007, "grad_norm": 0.0647035464644432, "learning_rate": 2.3932183446971583e-05, "loss": 0.0573, "num_input_tokens_seen": 18362512, "step": 20545 }, { "epoch": 5.423650521314505, "grad_norm": 0.34237101674079895, "learning_rate": 2.3922375011967473e-05, "loss": 0.0565, "num_input_tokens_seen": 18367024, "step": 20550 }, { "epoch": 5.424970304870001, "grad_norm": 0.057282138615846634, "learning_rate": 2.3912566743146676e-05, "loss": 0.0534, "num_input_tokens_seen": 18371568, "step": 20555 }, { "epoch": 5.426290088425498, "grad_norm": 0.15275301039218903, "learning_rate": 2.390275864202176e-05, "loss": 0.0339, "num_input_tokens_seen": 18375952, "step": 20560 }, { "epoch": 5.427609871980995, "grad_norm": 0.15629975497722626, "learning_rate": 2.3892950710105243e-05, "loss": 0.0523, "num_input_tokens_seen": 18380528, "step": 20565 }, { "epoch": 5.428929655536492, "grad_norm": 0.5885166525840759, "learning_rate": 2.3883142948909635e-05, "loss": 0.0691, "num_input_tokens_seen": 18384880, "step": 20570 }, { "epoch": 5.430249439091989, "grad_norm": 0.34113261103630066, "learning_rate": 2.3873335359947433e-05, "loss": 0.0484, "num_input_tokens_seen": 18389328, "step": 20575 }, { "epoch": 5.431569222647486, "grad_norm": 0.40408459305763245, "learning_rate": 2.3863527944731066e-05, "loss": 0.076, "num_input_tokens_seen": 18393840, "step": 20580 }, { "epoch": 5.432889006202982, "grad_norm": 0.09386181086301804, "learning_rate": 2.385372070477298e-05, "loss": 0.0347, "num_input_tokens_seen": 18398256, "step": 20585 }, { "epoch": 5.43420878975848, "grad_norm": 0.7162038683891296, "learning_rate": 2.384391364158556e-05, "loss": 0.1166, "num_input_tokens_seen": 18402608, "step": 20590 }, { "epoch": 5.435528573313976, "grad_norm": 0.10104630887508392, "learning_rate": 2.3834106756681185e-05, "loss": 0.0504, "num_input_tokens_seen": 18407120, "step": 20595 }, { "epoch": 5.436848356869474, "grad_norm": 0.09121514856815338, "learning_rate": 2.3824300051572206e-05, "loss": 0.0329, "num_input_tokens_seen": 18411760, "step": 20600 }, { "epoch": 5.436848356869474, "eval_loss": 0.06782509386539459, "eval_runtime": 64.773, "eval_samples_per_second": 103.979, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 18411760, "step": 20600 }, { "epoch": 5.43816814042497, "grad_norm": 0.25113779306411743, "learning_rate": 2.3814493527770923e-05, "loss": 0.1132, "num_input_tokens_seen": 18416464, "step": 20605 }, { "epoch": 5.439487923980467, "grad_norm": 0.0962737500667572, "learning_rate": 2.3804687186789637e-05, "loss": 0.089, "num_input_tokens_seen": 18421136, "step": 20610 }, { "epoch": 5.440807707535964, "grad_norm": 0.22451110184192657, "learning_rate": 2.379488103014062e-05, "loss": 0.0997, "num_input_tokens_seen": 18425712, "step": 20615 }, { "epoch": 5.442127491091461, "grad_norm": 0.26961997151374817, "learning_rate": 2.3785075059336086e-05, "loss": 0.037, "num_input_tokens_seen": 18430480, "step": 20620 }, { "epoch": 5.4434472746469575, "grad_norm": 0.15414807200431824, "learning_rate": 2.3775269275888248e-05, "loss": 0.0317, "num_input_tokens_seen": 18435056, "step": 20625 }, { "epoch": 5.444767058202455, "grad_norm": 0.3021535873413086, "learning_rate": 2.3765463681309274e-05, "loss": 0.0922, "num_input_tokens_seen": 18439440, "step": 20630 }, { "epoch": 5.4460868417579515, "grad_norm": 0.27356576919555664, "learning_rate": 2.3755658277111313e-05, "loss": 0.0509, "num_input_tokens_seen": 18444144, "step": 20635 }, { "epoch": 5.447406625313449, "grad_norm": 0.29202792048454285, "learning_rate": 2.374585306480649e-05, "loss": 0.1203, "num_input_tokens_seen": 18448688, "step": 20640 }, { "epoch": 5.4487264088689455, "grad_norm": 0.12554433941841125, "learning_rate": 2.3736048045906877e-05, "loss": 0.0198, "num_input_tokens_seen": 18453264, "step": 20645 }, { "epoch": 5.450046192424442, "grad_norm": 0.28185394406318665, "learning_rate": 2.372624322192454e-05, "loss": 0.0407, "num_input_tokens_seen": 18457584, "step": 20650 }, { "epoch": 5.4513659759799395, "grad_norm": 0.18556854128837585, "learning_rate": 2.3716438594371516e-05, "loss": 0.0718, "num_input_tokens_seen": 18462032, "step": 20655 }, { "epoch": 5.452685759535436, "grad_norm": 0.37454450130462646, "learning_rate": 2.3706634164759784e-05, "loss": 0.0922, "num_input_tokens_seen": 18466576, "step": 20660 }, { "epoch": 5.454005543090933, "grad_norm": 0.1118970513343811, "learning_rate": 2.3696829934601323e-05, "loss": 0.037, "num_input_tokens_seen": 18470928, "step": 20665 }, { "epoch": 5.45532532664643, "grad_norm": 0.18228600919246674, "learning_rate": 2.3687025905408053e-05, "loss": 0.0732, "num_input_tokens_seen": 18475056, "step": 20670 }, { "epoch": 5.456645110201927, "grad_norm": 0.4120979607105255, "learning_rate": 2.3677222078691886e-05, "loss": 0.0816, "num_input_tokens_seen": 18479664, "step": 20675 }, { "epoch": 5.457964893757424, "grad_norm": 0.2520212233066559, "learning_rate": 2.366741845596471e-05, "loss": 0.029, "num_input_tokens_seen": 18484016, "step": 20680 }, { "epoch": 5.459284677312921, "grad_norm": 0.15789498388767242, "learning_rate": 2.3657615038738343e-05, "loss": 0.0688, "num_input_tokens_seen": 18488496, "step": 20685 }, { "epoch": 5.460604460868417, "grad_norm": 0.2324552834033966, "learning_rate": 2.3647811828524614e-05, "loss": 0.0519, "num_input_tokens_seen": 18492880, "step": 20690 }, { "epoch": 5.461924244423915, "grad_norm": 0.08744845539331436, "learning_rate": 2.363800882683529e-05, "loss": 0.0854, "num_input_tokens_seen": 18497360, "step": 20695 }, { "epoch": 5.463244027979411, "grad_norm": 0.2076427936553955, "learning_rate": 2.3628206035182125e-05, "loss": 0.0727, "num_input_tokens_seen": 18501904, "step": 20700 }, { "epoch": 5.464563811534909, "grad_norm": 0.33741968870162964, "learning_rate": 2.361840345507683e-05, "loss": 0.0635, "num_input_tokens_seen": 18506320, "step": 20705 }, { "epoch": 5.465883595090405, "grad_norm": 0.1196742057800293, "learning_rate": 2.3608601088031073e-05, "loss": 0.0363, "num_input_tokens_seen": 18510640, "step": 20710 }, { "epoch": 5.467203378645902, "grad_norm": 0.21732641756534576, "learning_rate": 2.3598798935556516e-05, "loss": 0.0483, "num_input_tokens_seen": 18515280, "step": 20715 }, { "epoch": 5.468523162201399, "grad_norm": 0.20192818343639374, "learning_rate": 2.3588996999164784e-05, "loss": 0.0455, "num_input_tokens_seen": 18519728, "step": 20720 }, { "epoch": 5.469842945756896, "grad_norm": 0.226049542427063, "learning_rate": 2.3579195280367434e-05, "loss": 0.066, "num_input_tokens_seen": 18523984, "step": 20725 }, { "epoch": 5.471162729312393, "grad_norm": 0.19992336630821228, "learning_rate": 2.356939378067603e-05, "loss": 0.0545, "num_input_tokens_seen": 18528560, "step": 20730 }, { "epoch": 5.47248251286789, "grad_norm": 0.38813725113868713, "learning_rate": 2.3559592501602092e-05, "loss": 0.077, "num_input_tokens_seen": 18533040, "step": 20735 }, { "epoch": 5.473802296423386, "grad_norm": 0.23195841908454895, "learning_rate": 2.3549791444657076e-05, "loss": 0.0454, "num_input_tokens_seen": 18537744, "step": 20740 }, { "epoch": 5.475122079978884, "grad_norm": 0.11163752526044846, "learning_rate": 2.353999061135246e-05, "loss": 0.0273, "num_input_tokens_seen": 18542000, "step": 20745 }, { "epoch": 5.47644186353438, "grad_norm": 0.2662763297557831, "learning_rate": 2.3530190003199626e-05, "loss": 0.0376, "num_input_tokens_seen": 18546576, "step": 20750 }, { "epoch": 5.477761647089877, "grad_norm": 0.06427902728319168, "learning_rate": 2.3520389621709965e-05, "loss": 0.0301, "num_input_tokens_seen": 18551152, "step": 20755 }, { "epoch": 5.479081430645374, "grad_norm": 0.17020420730113983, "learning_rate": 2.351058946839483e-05, "loss": 0.0755, "num_input_tokens_seen": 18555632, "step": 20760 }, { "epoch": 5.480401214200871, "grad_norm": 0.17938990890979767, "learning_rate": 2.350078954476551e-05, "loss": 0.0362, "num_input_tokens_seen": 18559920, "step": 20765 }, { "epoch": 5.481720997756368, "grad_norm": 0.0953431949019432, "learning_rate": 2.3490989852333272e-05, "loss": 0.0301, "num_input_tokens_seen": 18564080, "step": 20770 }, { "epoch": 5.483040781311865, "grad_norm": 0.18800514936447144, "learning_rate": 2.3481190392609377e-05, "loss": 0.0687, "num_input_tokens_seen": 18568272, "step": 20775 }, { "epoch": 5.4843605648673615, "grad_norm": 0.35476887226104736, "learning_rate": 2.3471391167105e-05, "loss": 0.074, "num_input_tokens_seen": 18572720, "step": 20780 }, { "epoch": 5.485680348422859, "grad_norm": 0.056558284908533096, "learning_rate": 2.3461592177331325e-05, "loss": 0.082, "num_input_tokens_seen": 18577008, "step": 20785 }, { "epoch": 5.4870001319783555, "grad_norm": 0.0970536395907402, "learning_rate": 2.345179342479946e-05, "loss": 0.062, "num_input_tokens_seen": 18581424, "step": 20790 }, { "epoch": 5.488319915533852, "grad_norm": 0.25388211011886597, "learning_rate": 2.3441994911020503e-05, "loss": 0.0281, "num_input_tokens_seen": 18586192, "step": 20795 }, { "epoch": 5.4896396990893495, "grad_norm": 0.03132476657629013, "learning_rate": 2.3432196637505522e-05, "loss": 0.0557, "num_input_tokens_seen": 18590672, "step": 20800 }, { "epoch": 5.4896396990893495, "eval_loss": 0.06822258979082108, "eval_runtime": 64.7514, "eval_samples_per_second": 104.013, "eval_steps_per_second": 26.007, "num_input_tokens_seen": 18590672, "step": 20800 }, { "epoch": 5.490959482644846, "grad_norm": 0.051991067826747894, "learning_rate": 2.3422398605765515e-05, "loss": 0.0234, "num_input_tokens_seen": 18595216, "step": 20805 }, { "epoch": 5.4922792662003435, "grad_norm": 0.12102445214986801, "learning_rate": 2.3412600817311462e-05, "loss": 0.1101, "num_input_tokens_seen": 18599728, "step": 20810 }, { "epoch": 5.49359904975584, "grad_norm": 0.1357477307319641, "learning_rate": 2.3402803273654326e-05, "loss": 0.0766, "num_input_tokens_seen": 18604304, "step": 20815 }, { "epoch": 5.494918833311337, "grad_norm": 0.2408204823732376, "learning_rate": 2.3393005976304983e-05, "loss": 0.0824, "num_input_tokens_seen": 18608944, "step": 20820 }, { "epoch": 5.496238616866834, "grad_norm": 0.249470517039299, "learning_rate": 2.338320892677432e-05, "loss": 0.0872, "num_input_tokens_seen": 18613552, "step": 20825 }, { "epoch": 5.497558400422331, "grad_norm": 0.06542292982339859, "learning_rate": 2.3373412126573155e-05, "loss": 0.0393, "num_input_tokens_seen": 18618160, "step": 20830 }, { "epoch": 5.498878183977828, "grad_norm": 0.44577333331108093, "learning_rate": 2.3363615577212285e-05, "loss": 0.0493, "num_input_tokens_seen": 18622320, "step": 20835 }, { "epoch": 5.500197967533325, "grad_norm": 0.46828821301460266, "learning_rate": 2.3353819280202455e-05, "loss": 0.0936, "num_input_tokens_seen": 18626896, "step": 20840 }, { "epoch": 5.501517751088821, "grad_norm": 0.32352742552757263, "learning_rate": 2.334402323705438e-05, "loss": 0.0974, "num_input_tokens_seen": 18631088, "step": 20845 }, { "epoch": 5.502837534644319, "grad_norm": 0.21690423786640167, "learning_rate": 2.3334227449278725e-05, "loss": 0.0342, "num_input_tokens_seen": 18635504, "step": 20850 }, { "epoch": 5.504157318199815, "grad_norm": 0.05946047604084015, "learning_rate": 2.3324431918386143e-05, "loss": 0.0254, "num_input_tokens_seen": 18639984, "step": 20855 }, { "epoch": 5.505477101755313, "grad_norm": 0.2893528342247009, "learning_rate": 2.3314636645887207e-05, "loss": 0.076, "num_input_tokens_seen": 18644656, "step": 20860 }, { "epoch": 5.506796885310809, "grad_norm": 0.08694219589233398, "learning_rate": 2.3304841633292487e-05, "loss": 0.1027, "num_input_tokens_seen": 18648944, "step": 20865 }, { "epoch": 5.508116668866306, "grad_norm": 0.42229074239730835, "learning_rate": 2.329504688211248e-05, "loss": 0.1129, "num_input_tokens_seen": 18653168, "step": 20870 }, { "epoch": 5.509436452421803, "grad_norm": 0.19916792213916779, "learning_rate": 2.3285252393857677e-05, "loss": 0.0995, "num_input_tokens_seen": 18657392, "step": 20875 }, { "epoch": 5.5107562359773, "grad_norm": 0.09439694136381149, "learning_rate": 2.327545817003851e-05, "loss": 0.0579, "num_input_tokens_seen": 18661840, "step": 20880 }, { "epoch": 5.512076019532796, "grad_norm": 0.10494999587535858, "learning_rate": 2.326566421216535e-05, "loss": 0.0458, "num_input_tokens_seen": 18666320, "step": 20885 }, { "epoch": 5.513395803088294, "grad_norm": 0.03207461163401604, "learning_rate": 2.3255870521748565e-05, "loss": 0.0404, "num_input_tokens_seen": 18670800, "step": 20890 }, { "epoch": 5.51471558664379, "grad_norm": 0.1524760127067566, "learning_rate": 2.3246077100298474e-05, "loss": 0.0903, "num_input_tokens_seen": 18675472, "step": 20895 }, { "epoch": 5.516035370199288, "grad_norm": 0.23691290616989136, "learning_rate": 2.3236283949325328e-05, "loss": 0.0688, "num_input_tokens_seen": 18679856, "step": 20900 }, { "epoch": 5.517355153754784, "grad_norm": 0.2641846239566803, "learning_rate": 2.3226491070339368e-05, "loss": 0.037, "num_input_tokens_seen": 18684304, "step": 20905 }, { "epoch": 5.518674937310281, "grad_norm": 0.09911259263753891, "learning_rate": 2.3216698464850762e-05, "loss": 0.0505, "num_input_tokens_seen": 18688656, "step": 20910 }, { "epoch": 5.519994720865778, "grad_norm": 0.18849235773086548, "learning_rate": 2.320690613436967e-05, "loss": 0.0366, "num_input_tokens_seen": 18693008, "step": 20915 }, { "epoch": 5.521314504421275, "grad_norm": 0.0780048817396164, "learning_rate": 2.3197114080406192e-05, "loss": 0.022, "num_input_tokens_seen": 18697584, "step": 20920 }, { "epoch": 5.522634287976771, "grad_norm": 0.45181116461753845, "learning_rate": 2.3187322304470365e-05, "loss": 0.0994, "num_input_tokens_seen": 18702160, "step": 20925 }, { "epoch": 5.523954071532269, "grad_norm": 0.11812064796686172, "learning_rate": 2.3177530808072222e-05, "loss": 0.034, "num_input_tokens_seen": 18706224, "step": 20930 }, { "epoch": 5.525273855087765, "grad_norm": 0.33413559198379517, "learning_rate": 2.316773959272174e-05, "loss": 0.11, "num_input_tokens_seen": 18710800, "step": 20935 }, { "epoch": 5.526593638643263, "grad_norm": 0.06064317375421524, "learning_rate": 2.3157948659928823e-05, "loss": 0.0407, "num_input_tokens_seen": 18715376, "step": 20940 }, { "epoch": 5.527913422198759, "grad_norm": 0.11536934971809387, "learning_rate": 2.3148158011203388e-05, "loss": 0.054, "num_input_tokens_seen": 18719792, "step": 20945 }, { "epoch": 5.529233205754256, "grad_norm": 0.12524516880512238, "learning_rate": 2.3138367648055253e-05, "loss": 0.0332, "num_input_tokens_seen": 18724336, "step": 20950 }, { "epoch": 5.530552989309753, "grad_norm": 0.20421750843524933, "learning_rate": 2.312857757199422e-05, "loss": 0.0567, "num_input_tokens_seen": 18729072, "step": 20955 }, { "epoch": 5.53187277286525, "grad_norm": 0.35203099250793457, "learning_rate": 2.3118787784530048e-05, "loss": 0.051, "num_input_tokens_seen": 18733936, "step": 20960 }, { "epoch": 5.533192556420747, "grad_norm": 0.04522039741277695, "learning_rate": 2.310899828717243e-05, "loss": 0.0444, "num_input_tokens_seen": 18738416, "step": 20965 }, { "epoch": 5.534512339976244, "grad_norm": 0.3366948962211609, "learning_rate": 2.309920908143104e-05, "loss": 0.0344, "num_input_tokens_seen": 18742864, "step": 20970 }, { "epoch": 5.5358321235317405, "grad_norm": 0.019371913745999336, "learning_rate": 2.308942016881551e-05, "loss": 0.0408, "num_input_tokens_seen": 18747312, "step": 20975 }, { "epoch": 5.537151907087238, "grad_norm": 0.17780911922454834, "learning_rate": 2.307963155083539e-05, "loss": 0.0656, "num_input_tokens_seen": 18751824, "step": 20980 }, { "epoch": 5.5384716906427345, "grad_norm": 0.05368199571967125, "learning_rate": 2.306984322900022e-05, "loss": 0.0405, "num_input_tokens_seen": 18756208, "step": 20985 }, { "epoch": 5.539791474198232, "grad_norm": 0.03777335211634636, "learning_rate": 2.3060055204819482e-05, "loss": 0.051, "num_input_tokens_seen": 18760720, "step": 20990 }, { "epoch": 5.5411112577537285, "grad_norm": 0.10467682033777237, "learning_rate": 2.3050267479802604e-05, "loss": 0.0353, "num_input_tokens_seen": 18765360, "step": 20995 }, { "epoch": 5.542431041309225, "grad_norm": 0.19103550910949707, "learning_rate": 2.304048005545899e-05, "loss": 0.0487, "num_input_tokens_seen": 18770000, "step": 21000 }, { "epoch": 5.542431041309225, "eval_loss": 0.06886310875415802, "eval_runtime": 64.8064, "eval_samples_per_second": 103.925, "eval_steps_per_second": 25.985, "num_input_tokens_seen": 18770000, "step": 21000 }, { "epoch": 5.5437508248647225, "grad_norm": 0.045127011835575104, "learning_rate": 2.3030692933297972e-05, "loss": 0.0659, "num_input_tokens_seen": 18774704, "step": 21005 }, { "epoch": 5.545070608420219, "grad_norm": 0.04276011884212494, "learning_rate": 2.3020906114828843e-05, "loss": 0.0501, "num_input_tokens_seen": 18779184, "step": 21010 }, { "epoch": 5.546390391975716, "grad_norm": 0.17357252538204193, "learning_rate": 2.301111960156088e-05, "loss": 0.0482, "num_input_tokens_seen": 18783536, "step": 21015 }, { "epoch": 5.547710175531213, "grad_norm": 0.11120226979255676, "learning_rate": 2.300133339500326e-05, "loss": 0.0641, "num_input_tokens_seen": 18788048, "step": 21020 }, { "epoch": 5.54902995908671, "grad_norm": 0.1699865758419037, "learning_rate": 2.2991547496665148e-05, "loss": 0.062, "num_input_tokens_seen": 18792624, "step": 21025 }, { "epoch": 5.550349742642207, "grad_norm": 0.2163824588060379, "learning_rate": 2.298176190805565e-05, "loss": 0.1077, "num_input_tokens_seen": 18797264, "step": 21030 }, { "epoch": 5.551669526197704, "grad_norm": 0.4397863447666168, "learning_rate": 2.2971976630683826e-05, "loss": 0.073, "num_input_tokens_seen": 18801584, "step": 21035 }, { "epoch": 5.5529893097532, "grad_norm": 0.37010493874549866, "learning_rate": 2.29621916660587e-05, "loss": 0.0805, "num_input_tokens_seen": 18806192, "step": 21040 }, { "epoch": 5.554309093308698, "grad_norm": 0.15090349316596985, "learning_rate": 2.295240701568922e-05, "loss": 0.0475, "num_input_tokens_seen": 18810576, "step": 21045 }, { "epoch": 5.555628876864194, "grad_norm": 0.44576624035835266, "learning_rate": 2.2942622681084312e-05, "loss": 0.0656, "num_input_tokens_seen": 18814896, "step": 21050 }, { "epoch": 5.556948660419691, "grad_norm": 0.3156450390815735, "learning_rate": 2.293283866375284e-05, "loss": 0.1195, "num_input_tokens_seen": 18819152, "step": 21055 }, { "epoch": 5.558268443975188, "grad_norm": 0.13547496497631073, "learning_rate": 2.2923054965203627e-05, "loss": 0.0593, "num_input_tokens_seen": 18823472, "step": 21060 }, { "epoch": 5.559588227530685, "grad_norm": 0.2864745855331421, "learning_rate": 2.2913271586945443e-05, "loss": 0.0539, "num_input_tokens_seen": 18828112, "step": 21065 }, { "epoch": 5.560908011086182, "grad_norm": 0.08022721111774445, "learning_rate": 2.290348853048699e-05, "loss": 0.0449, "num_input_tokens_seen": 18832592, "step": 21070 }, { "epoch": 5.562227794641679, "grad_norm": 0.1406233161687851, "learning_rate": 2.2893705797336956e-05, "loss": 0.0345, "num_input_tokens_seen": 18837328, "step": 21075 }, { "epoch": 5.563547578197175, "grad_norm": 0.29012757539749146, "learning_rate": 2.288392338900397e-05, "loss": 0.0925, "num_input_tokens_seen": 18841936, "step": 21080 }, { "epoch": 5.564867361752673, "grad_norm": 0.15457555651664734, "learning_rate": 2.2874141306996576e-05, "loss": 0.0633, "num_input_tokens_seen": 18846096, "step": 21085 }, { "epoch": 5.566187145308169, "grad_norm": 0.09114647656679153, "learning_rate": 2.2864359552823312e-05, "loss": 0.091, "num_input_tokens_seen": 18850352, "step": 21090 }, { "epoch": 5.567506928863667, "grad_norm": 0.28845515847206116, "learning_rate": 2.2854578127992648e-05, "loss": 0.0493, "num_input_tokens_seen": 18854544, "step": 21095 }, { "epoch": 5.568826712419163, "grad_norm": 0.014079189859330654, "learning_rate": 2.2844797034012988e-05, "loss": 0.0539, "num_input_tokens_seen": 18858928, "step": 21100 }, { "epoch": 5.57014649597466, "grad_norm": 0.191579207777977, "learning_rate": 2.2835016272392722e-05, "loss": 0.0258, "num_input_tokens_seen": 18863440, "step": 21105 }, { "epoch": 5.571466279530157, "grad_norm": 0.24062155187129974, "learning_rate": 2.2825235844640142e-05, "loss": 0.0363, "num_input_tokens_seen": 18867728, "step": 21110 }, { "epoch": 5.572786063085654, "grad_norm": 0.0639524832367897, "learning_rate": 2.2815455752263522e-05, "loss": 0.0421, "num_input_tokens_seen": 18872400, "step": 21115 }, { "epoch": 5.574105846641151, "grad_norm": 0.10769195854663849, "learning_rate": 2.2805675996771092e-05, "loss": 0.0715, "num_input_tokens_seen": 18876784, "step": 21120 }, { "epoch": 5.575425630196648, "grad_norm": 0.1816895604133606, "learning_rate": 2.2795896579670987e-05, "loss": 0.0763, "num_input_tokens_seen": 18881008, "step": 21125 }, { "epoch": 5.576745413752144, "grad_norm": 0.2661650776863098, "learning_rate": 2.2786117502471337e-05, "loss": 0.0971, "num_input_tokens_seen": 18885808, "step": 21130 }, { "epoch": 5.578065197307642, "grad_norm": 0.4620969593524933, "learning_rate": 2.2776338766680185e-05, "loss": 0.0681, "num_input_tokens_seen": 18890192, "step": 21135 }, { "epoch": 5.579384980863138, "grad_norm": 0.17995209991931915, "learning_rate": 2.2766560373805533e-05, "loss": 0.0422, "num_input_tokens_seen": 18894512, "step": 21140 }, { "epoch": 5.580704764418636, "grad_norm": 0.1734919399023056, "learning_rate": 2.2756782325355353e-05, "loss": 0.0675, "num_input_tokens_seen": 18898768, "step": 21145 }, { "epoch": 5.582024547974132, "grad_norm": 0.11131265014410019, "learning_rate": 2.2747004622837514e-05, "loss": 0.1139, "num_input_tokens_seen": 18903344, "step": 21150 }, { "epoch": 5.583344331529629, "grad_norm": 0.44984909892082214, "learning_rate": 2.2737227267759878e-05, "loss": 0.0832, "num_input_tokens_seen": 18907920, "step": 21155 }, { "epoch": 5.584664115085126, "grad_norm": 0.04744124785065651, "learning_rate": 2.272745026163024e-05, "loss": 0.034, "num_input_tokens_seen": 18912432, "step": 21160 }, { "epoch": 5.585983898640623, "grad_norm": 0.24574008584022522, "learning_rate": 2.271767360595633e-05, "loss": 0.0469, "num_input_tokens_seen": 18916688, "step": 21165 }, { "epoch": 5.5873036821961195, "grad_norm": 0.15691885352134705, "learning_rate": 2.270789730224583e-05, "loss": 0.0314, "num_input_tokens_seen": 18921072, "step": 21170 }, { "epoch": 5.588623465751617, "grad_norm": 0.043993137776851654, "learning_rate": 2.2698121352006367e-05, "loss": 0.0611, "num_input_tokens_seen": 18925360, "step": 21175 }, { "epoch": 5.5899432493071135, "grad_norm": 0.0546710267663002, "learning_rate": 2.2688345756745517e-05, "loss": 0.023, "num_input_tokens_seen": 18929840, "step": 21180 }, { "epoch": 5.59126303286261, "grad_norm": 0.1551498919725418, "learning_rate": 2.267857051797081e-05, "loss": 0.0873, "num_input_tokens_seen": 18934128, "step": 21185 }, { "epoch": 5.5925828164181075, "grad_norm": 0.26744258403778076, "learning_rate": 2.2668795637189695e-05, "loss": 0.0481, "num_input_tokens_seen": 18938544, "step": 21190 }, { "epoch": 5.593902599973604, "grad_norm": 0.2521877586841583, "learning_rate": 2.2659021115909586e-05, "loss": 0.051, "num_input_tokens_seen": 18943344, "step": 21195 }, { "epoch": 5.5952223835291015, "grad_norm": 0.34809041023254395, "learning_rate": 2.2649246955637847e-05, "loss": 0.0486, "num_input_tokens_seen": 18947664, "step": 21200 }, { "epoch": 5.5952223835291015, "eval_loss": 0.06824265420436859, "eval_runtime": 64.7426, "eval_samples_per_second": 104.027, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 18947664, "step": 21200 }, { "epoch": 5.596542167084598, "grad_norm": 0.08779877424240112, "learning_rate": 2.2639473157881766e-05, "loss": 0.0382, "num_input_tokens_seen": 18952176, "step": 21205 }, { "epoch": 5.597861950640095, "grad_norm": 0.28717777132987976, "learning_rate": 2.2629699724148594e-05, "loss": 0.0718, "num_input_tokens_seen": 18956720, "step": 21210 }, { "epoch": 5.599181734195592, "grad_norm": 0.15190094709396362, "learning_rate": 2.26199266559455e-05, "loss": 0.0443, "num_input_tokens_seen": 18961200, "step": 21215 }, { "epoch": 5.600501517751089, "grad_norm": 0.13326530158519745, "learning_rate": 2.2610153954779625e-05, "loss": 0.0512, "num_input_tokens_seen": 18965616, "step": 21220 }, { "epoch": 5.601821301306586, "grad_norm": 0.0784735456109047, "learning_rate": 2.2600381622158056e-05, "loss": 0.029, "num_input_tokens_seen": 18970448, "step": 21225 }, { "epoch": 5.603141084862083, "grad_norm": 0.35813286900520325, "learning_rate": 2.2590609659587783e-05, "loss": 0.0767, "num_input_tokens_seen": 18975024, "step": 21230 }, { "epoch": 5.604460868417579, "grad_norm": 0.2475057691335678, "learning_rate": 2.2580838068575787e-05, "loss": 0.0705, "num_input_tokens_seen": 18979504, "step": 21235 }, { "epoch": 5.605780651973077, "grad_norm": 0.06599078327417374, "learning_rate": 2.257106685062896e-05, "loss": 0.079, "num_input_tokens_seen": 18983856, "step": 21240 }, { "epoch": 5.607100435528573, "grad_norm": 0.24672354757785797, "learning_rate": 2.256129600725415e-05, "loss": 0.0551, "num_input_tokens_seen": 18988400, "step": 21245 }, { "epoch": 5.608420219084071, "grad_norm": 0.20182178914546967, "learning_rate": 2.2551525539958145e-05, "loss": 0.044, "num_input_tokens_seen": 18993136, "step": 21250 }, { "epoch": 5.609740002639567, "grad_norm": 0.4448821246623993, "learning_rate": 2.2541755450247663e-05, "loss": 0.0398, "num_input_tokens_seen": 18997552, "step": 21255 }, { "epoch": 5.611059786195064, "grad_norm": 0.22393456101417542, "learning_rate": 2.2531985739629382e-05, "loss": 0.0502, "num_input_tokens_seen": 19001936, "step": 21260 }, { "epoch": 5.612379569750561, "grad_norm": 0.2718956470489502, "learning_rate": 2.2522216409609924e-05, "loss": 0.0635, "num_input_tokens_seen": 19006320, "step": 21265 }, { "epoch": 5.613699353306058, "grad_norm": 0.21783769130706787, "learning_rate": 2.2512447461695826e-05, "loss": 0.0858, "num_input_tokens_seen": 19010544, "step": 21270 }, { "epoch": 5.615019136861555, "grad_norm": 0.23383860290050507, "learning_rate": 2.2502678897393593e-05, "loss": 0.0846, "num_input_tokens_seen": 19014896, "step": 21275 }, { "epoch": 5.616338920417052, "grad_norm": 0.06760753691196442, "learning_rate": 2.2492910718209665e-05, "loss": 0.0209, "num_input_tokens_seen": 19019632, "step": 21280 }, { "epoch": 5.617658703972548, "grad_norm": 0.17811067402362823, "learning_rate": 2.2483142925650398e-05, "loss": 0.0294, "num_input_tokens_seen": 19024240, "step": 21285 }, { "epoch": 5.618978487528046, "grad_norm": 0.11221028119325638, "learning_rate": 2.247337552122213e-05, "loss": 0.0502, "num_input_tokens_seen": 19029008, "step": 21290 }, { "epoch": 5.620298271083542, "grad_norm": 0.4110196530818939, "learning_rate": 2.24636085064311e-05, "loss": 0.0514, "num_input_tokens_seen": 19033584, "step": 21295 }, { "epoch": 5.621618054639039, "grad_norm": 0.18568487465381622, "learning_rate": 2.245384188278351e-05, "loss": 0.0348, "num_input_tokens_seen": 19038224, "step": 21300 }, { "epoch": 5.622937838194536, "grad_norm": 0.20742888748645782, "learning_rate": 2.2444075651785513e-05, "loss": 0.0206, "num_input_tokens_seen": 19042384, "step": 21305 }, { "epoch": 5.624257621750033, "grad_norm": 0.6016423106193542, "learning_rate": 2.243430981494316e-05, "loss": 0.0833, "num_input_tokens_seen": 19046896, "step": 21310 }, { "epoch": 5.625577405305529, "grad_norm": 0.6026350259780884, "learning_rate": 2.2424544373762475e-05, "loss": 0.0968, "num_input_tokens_seen": 19051440, "step": 21315 }, { "epoch": 5.626897188861027, "grad_norm": 0.04013843461871147, "learning_rate": 2.2414779329749418e-05, "loss": 0.0558, "num_input_tokens_seen": 19055760, "step": 21320 }, { "epoch": 5.628216972416523, "grad_norm": 0.3939080238342285, "learning_rate": 2.2405014684409873e-05, "loss": 0.0469, "num_input_tokens_seen": 19060240, "step": 21325 }, { "epoch": 5.629536755972021, "grad_norm": 0.18597635626792908, "learning_rate": 2.239525043924968e-05, "loss": 0.0328, "num_input_tokens_seen": 19064336, "step": 21330 }, { "epoch": 5.630856539527517, "grad_norm": 0.30208033323287964, "learning_rate": 2.2385486595774592e-05, "loss": 0.0291, "num_input_tokens_seen": 19068720, "step": 21335 }, { "epoch": 5.632176323083014, "grad_norm": 0.09490006417036057, "learning_rate": 2.237572315549033e-05, "loss": 0.064, "num_input_tokens_seen": 19073360, "step": 21340 }, { "epoch": 5.633496106638511, "grad_norm": 0.5111339688301086, "learning_rate": 2.2365960119902545e-05, "loss": 0.0734, "num_input_tokens_seen": 19077776, "step": 21345 }, { "epoch": 5.634815890194008, "grad_norm": 0.17606085538864136, "learning_rate": 2.2356197490516806e-05, "loss": 0.0335, "num_input_tokens_seen": 19082064, "step": 21350 }, { "epoch": 5.636135673749505, "grad_norm": 0.24307148158550262, "learning_rate": 2.234643526883863e-05, "loss": 0.0701, "num_input_tokens_seen": 19086576, "step": 21355 }, { "epoch": 5.637455457305002, "grad_norm": 0.4026353657245636, "learning_rate": 2.2336673456373497e-05, "loss": 0.0806, "num_input_tokens_seen": 19091056, "step": 21360 }, { "epoch": 5.6387752408604985, "grad_norm": 0.2667510509490967, "learning_rate": 2.2326912054626772e-05, "loss": 0.0408, "num_input_tokens_seen": 19095568, "step": 21365 }, { "epoch": 5.640095024415996, "grad_norm": 0.045207541435956955, "learning_rate": 2.2317151065103813e-05, "loss": 0.0313, "num_input_tokens_seen": 19100304, "step": 21370 }, { "epoch": 5.6414148079714925, "grad_norm": 0.2859058082103729, "learning_rate": 2.2307390489309865e-05, "loss": 0.0481, "num_input_tokens_seen": 19105072, "step": 21375 }, { "epoch": 5.64273459152699, "grad_norm": 0.08890268206596375, "learning_rate": 2.2297630328750146e-05, "loss": 0.0449, "num_input_tokens_seen": 19109488, "step": 21380 }, { "epoch": 5.6440543750824865, "grad_norm": 0.08340633660554886, "learning_rate": 2.228787058492979e-05, "loss": 0.0359, "num_input_tokens_seen": 19114128, "step": 21385 }, { "epoch": 5.645374158637983, "grad_norm": 0.20061054825782776, "learning_rate": 2.2278111259353875e-05, "loss": 0.0894, "num_input_tokens_seen": 19118704, "step": 21390 }, { "epoch": 5.6466939421934805, "grad_norm": 0.047674551606178284, "learning_rate": 2.2268352353527395e-05, "loss": 0.0308, "num_input_tokens_seen": 19123088, "step": 21395 }, { "epoch": 5.648013725748977, "grad_norm": 0.13338921964168549, "learning_rate": 2.225859386895533e-05, "loss": 0.0357, "num_input_tokens_seen": 19127344, "step": 21400 }, { "epoch": 5.648013725748977, "eval_loss": 0.06848660111427307, "eval_runtime": 64.7802, "eval_samples_per_second": 103.967, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 19127344, "step": 21400 }, { "epoch": 5.6493335093044745, "grad_norm": 0.4650416672229767, "learning_rate": 2.2248835807142525e-05, "loss": 0.0905, "num_input_tokens_seen": 19131856, "step": 21405 }, { "epoch": 5.650653292859971, "grad_norm": 0.06632301211357117, "learning_rate": 2.2239078169593826e-05, "loss": 0.053, "num_input_tokens_seen": 19136400, "step": 21410 }, { "epoch": 5.651973076415468, "grad_norm": 0.15004661679267883, "learning_rate": 2.222932095781396e-05, "loss": 0.0573, "num_input_tokens_seen": 19140944, "step": 21415 }, { "epoch": 5.653292859970965, "grad_norm": 0.1269366443157196, "learning_rate": 2.221956417330762e-05, "loss": 0.048, "num_input_tokens_seen": 19145520, "step": 21420 }, { "epoch": 5.654612643526462, "grad_norm": 0.11003465950489044, "learning_rate": 2.2209807817579438e-05, "loss": 0.0622, "num_input_tokens_seen": 19149680, "step": 21425 }, { "epoch": 5.655932427081958, "grad_norm": 0.5999953150749207, "learning_rate": 2.220005189213394e-05, "loss": 0.0569, "num_input_tokens_seen": 19154032, "step": 21430 }, { "epoch": 5.657252210637456, "grad_norm": 0.07152295857667923, "learning_rate": 2.2190296398475624e-05, "loss": 0.0615, "num_input_tokens_seen": 19158704, "step": 21435 }, { "epoch": 5.658571994192952, "grad_norm": 0.2252178192138672, "learning_rate": 2.2180541338108926e-05, "loss": 0.0578, "num_input_tokens_seen": 19163280, "step": 21440 }, { "epoch": 5.659891777748449, "grad_norm": 0.3121447265148163, "learning_rate": 2.2170786712538176e-05, "loss": 0.1096, "num_input_tokens_seen": 19167632, "step": 21445 }, { "epoch": 5.661211561303946, "grad_norm": 0.5204066634178162, "learning_rate": 2.216103252326768e-05, "loss": 0.0657, "num_input_tokens_seen": 19171952, "step": 21450 }, { "epoch": 5.662531344859443, "grad_norm": 0.19713424146175385, "learning_rate": 2.2151278771801635e-05, "loss": 0.0753, "num_input_tokens_seen": 19176336, "step": 21455 }, { "epoch": 5.66385112841494, "grad_norm": 0.17127105593681335, "learning_rate": 2.21415254596442e-05, "loss": 0.0445, "num_input_tokens_seen": 19180880, "step": 21460 }, { "epoch": 5.665170911970437, "grad_norm": 0.23283417522907257, "learning_rate": 2.213177258829947e-05, "loss": 0.0712, "num_input_tokens_seen": 19185584, "step": 21465 }, { "epoch": 5.666490695525933, "grad_norm": 0.21299847960472107, "learning_rate": 2.2122020159271445e-05, "loss": 0.1122, "num_input_tokens_seen": 19189936, "step": 21470 }, { "epoch": 5.667810479081431, "grad_norm": 0.21029940247535706, "learning_rate": 2.2112268174064075e-05, "loss": 0.0624, "num_input_tokens_seen": 19194448, "step": 21475 }, { "epoch": 5.669130262636927, "grad_norm": 0.14166143536567688, "learning_rate": 2.2102516634181253e-05, "loss": 0.085, "num_input_tokens_seen": 19198864, "step": 21480 }, { "epoch": 5.670450046192425, "grad_norm": 0.17571035027503967, "learning_rate": 2.209276554112677e-05, "loss": 0.0316, "num_input_tokens_seen": 19203280, "step": 21485 }, { "epoch": 5.671769829747921, "grad_norm": 0.13505153357982635, "learning_rate": 2.2083014896404384e-05, "loss": 0.0259, "num_input_tokens_seen": 19208176, "step": 21490 }, { "epoch": 5.673089613303418, "grad_norm": 0.13763467967510223, "learning_rate": 2.207326470151775e-05, "loss": 0.0425, "num_input_tokens_seen": 19213040, "step": 21495 }, { "epoch": 5.674409396858915, "grad_norm": 0.24686755239963531, "learning_rate": 2.2063514957970477e-05, "loss": 0.057, "num_input_tokens_seen": 19217744, "step": 21500 }, { "epoch": 5.675729180414412, "grad_norm": 0.04710644856095314, "learning_rate": 2.205376566726611e-05, "loss": 0.0221, "num_input_tokens_seen": 19222192, "step": 21505 }, { "epoch": 5.677048963969909, "grad_norm": 0.16848784685134888, "learning_rate": 2.204401683090809e-05, "loss": 0.0319, "num_input_tokens_seen": 19227152, "step": 21510 }, { "epoch": 5.678368747525406, "grad_norm": 0.22841808199882507, "learning_rate": 2.203426845039982e-05, "loss": 0.1192, "num_input_tokens_seen": 19231472, "step": 21515 }, { "epoch": 5.679688531080902, "grad_norm": 0.23605351150035858, "learning_rate": 2.202452052724464e-05, "loss": 0.0491, "num_input_tokens_seen": 19235888, "step": 21520 }, { "epoch": 5.6810083146364, "grad_norm": 0.3673918545246124, "learning_rate": 2.2014773062945777e-05, "loss": 0.0635, "num_input_tokens_seen": 19240464, "step": 21525 }, { "epoch": 5.682328098191896, "grad_norm": 0.3230617344379425, "learning_rate": 2.2005026059006427e-05, "loss": 0.0739, "num_input_tokens_seen": 19244880, "step": 21530 }, { "epoch": 5.683647881747394, "grad_norm": 0.13052794337272644, "learning_rate": 2.1995279516929695e-05, "loss": 0.0399, "num_input_tokens_seen": 19249008, "step": 21535 }, { "epoch": 5.68496766530289, "grad_norm": 0.7355924248695374, "learning_rate": 2.1985533438218613e-05, "loss": 0.1195, "num_input_tokens_seen": 19253328, "step": 21540 }, { "epoch": 5.686287448858387, "grad_norm": 0.29957613348960876, "learning_rate": 2.197578782437617e-05, "loss": 0.0458, "num_input_tokens_seen": 19258064, "step": 21545 }, { "epoch": 5.687607232413884, "grad_norm": 0.18086326122283936, "learning_rate": 2.196604267690524e-05, "loss": 0.0658, "num_input_tokens_seen": 19262320, "step": 21550 }, { "epoch": 5.688927015969381, "grad_norm": 0.16441041231155396, "learning_rate": 2.195629799730865e-05, "loss": 0.0637, "num_input_tokens_seen": 19266704, "step": 21555 }, { "epoch": 5.6902467995248776, "grad_norm": 0.4081951975822449, "learning_rate": 2.1946553787089173e-05, "loss": 0.0459, "num_input_tokens_seen": 19271504, "step": 21560 }, { "epoch": 5.691566583080375, "grad_norm": 0.05400290712714195, "learning_rate": 2.193681004774947e-05, "loss": 0.0369, "num_input_tokens_seen": 19275696, "step": 21565 }, { "epoch": 5.6928863666358716, "grad_norm": 0.16085407137870789, "learning_rate": 2.1927066780792154e-05, "loss": 0.1119, "num_input_tokens_seen": 19280272, "step": 21570 }, { "epoch": 5.694206150191369, "grad_norm": 0.1212693378329277, "learning_rate": 2.191732398771975e-05, "loss": 0.066, "num_input_tokens_seen": 19284720, "step": 21575 }, { "epoch": 5.6955259337468656, "grad_norm": 0.1213379055261612, "learning_rate": 2.1907581670034725e-05, "loss": 0.0549, "num_input_tokens_seen": 19289040, "step": 21580 }, { "epoch": 5.696845717302362, "grad_norm": 0.040082138031721115, "learning_rate": 2.189783982923948e-05, "loss": 0.0869, "num_input_tokens_seen": 19293552, "step": 21585 }, { "epoch": 5.6981655008578596, "grad_norm": 0.0658331885933876, "learning_rate": 2.1888098466836303e-05, "loss": 0.0766, "num_input_tokens_seen": 19297808, "step": 21590 }, { "epoch": 5.699485284413356, "grad_norm": 0.18205708265304565, "learning_rate": 2.1878357584327457e-05, "loss": 0.0241, "num_input_tokens_seen": 19302352, "step": 21595 }, { "epoch": 5.700805067968853, "grad_norm": 0.1503007858991623, "learning_rate": 2.1868617183215103e-05, "loss": 0.03, "num_input_tokens_seen": 19306864, "step": 21600 }, { "epoch": 5.700805067968853, "eval_loss": 0.06780818849802017, "eval_runtime": 64.7836, "eval_samples_per_second": 103.962, "eval_steps_per_second": 25.994, "num_input_tokens_seen": 19306864, "step": 21600 }, { "epoch": 5.70212485152435, "grad_norm": 0.09879620373249054, "learning_rate": 2.1858877265001327e-05, "loss": 0.0672, "num_input_tokens_seen": 19310992, "step": 21605 }, { "epoch": 5.703444635079847, "grad_norm": 0.4708520472049713, "learning_rate": 2.184913783118816e-05, "loss": 0.0613, "num_input_tokens_seen": 19315632, "step": 21610 }, { "epoch": 5.704764418635344, "grad_norm": 0.22414879500865936, "learning_rate": 2.1839398883277522e-05, "loss": 0.0538, "num_input_tokens_seen": 19320080, "step": 21615 }, { "epoch": 5.706084202190841, "grad_norm": 0.27883031964302063, "learning_rate": 2.182966042277129e-05, "loss": 0.0765, "num_input_tokens_seen": 19324176, "step": 21620 }, { "epoch": 5.707403985746337, "grad_norm": 0.19263526797294617, "learning_rate": 2.181992245117128e-05, "loss": 0.0786, "num_input_tokens_seen": 19328784, "step": 21625 }, { "epoch": 5.708723769301835, "grad_norm": 0.1222437396645546, "learning_rate": 2.181018496997918e-05, "loss": 0.0755, "num_input_tokens_seen": 19332912, "step": 21630 }, { "epoch": 5.710043552857331, "grad_norm": 0.20359845459461212, "learning_rate": 2.1800447980696648e-05, "loss": 0.0294, "num_input_tokens_seen": 19337360, "step": 21635 }, { "epoch": 5.711363336412829, "grad_norm": 0.4364054501056671, "learning_rate": 2.1790711484825248e-05, "loss": 0.0622, "num_input_tokens_seen": 19341648, "step": 21640 }, { "epoch": 5.712683119968325, "grad_norm": 0.24508921802043915, "learning_rate": 2.178097548386646e-05, "loss": 0.0903, "num_input_tokens_seen": 19346448, "step": 21645 }, { "epoch": 5.714002903523822, "grad_norm": 0.0940992683172226, "learning_rate": 2.1771239979321712e-05, "loss": 0.0432, "num_input_tokens_seen": 19350800, "step": 21650 }, { "epoch": 5.715322687079319, "grad_norm": 0.09137574583292007, "learning_rate": 2.1761504972692327e-05, "loss": 0.0344, "num_input_tokens_seen": 19355152, "step": 21655 }, { "epoch": 5.716642470634816, "grad_norm": 0.06133875995874405, "learning_rate": 2.1751770465479572e-05, "loss": 0.0255, "num_input_tokens_seen": 19359856, "step": 21660 }, { "epoch": 5.717962254190313, "grad_norm": 0.2081190049648285, "learning_rate": 2.174203645918464e-05, "loss": 0.1054, "num_input_tokens_seen": 19364144, "step": 21665 }, { "epoch": 5.71928203774581, "grad_norm": 0.3296848237514496, "learning_rate": 2.1732302955308624e-05, "loss": 0.0737, "num_input_tokens_seen": 19368656, "step": 21670 }, { "epoch": 5.720601821301306, "grad_norm": 0.14269819855690002, "learning_rate": 2.172256995535255e-05, "loss": 0.0599, "num_input_tokens_seen": 19373264, "step": 21675 }, { "epoch": 5.721921604856804, "grad_norm": 0.16286475956439972, "learning_rate": 2.171283746081739e-05, "loss": 0.0563, "num_input_tokens_seen": 19377680, "step": 21680 }, { "epoch": 5.7232413884123, "grad_norm": 0.05276135355234146, "learning_rate": 2.1703105473203988e-05, "loss": 0.0414, "num_input_tokens_seen": 19382064, "step": 21685 }, { "epoch": 5.724561171967797, "grad_norm": 0.24705389142036438, "learning_rate": 2.1693373994013168e-05, "loss": 0.069, "num_input_tokens_seen": 19386384, "step": 21690 }, { "epoch": 5.725880955523294, "grad_norm": 0.10040836781263351, "learning_rate": 2.168364302474562e-05, "loss": 0.0897, "num_input_tokens_seen": 19391088, "step": 21695 }, { "epoch": 5.727200739078791, "grad_norm": 0.1523878574371338, "learning_rate": 2.167391256690199e-05, "loss": 0.0631, "num_input_tokens_seen": 19395504, "step": 21700 }, { "epoch": 5.728520522634288, "grad_norm": 0.15728230774402618, "learning_rate": 2.1664182621982855e-05, "loss": 0.0539, "num_input_tokens_seen": 19400048, "step": 21705 }, { "epoch": 5.729840306189785, "grad_norm": 0.1888779103755951, "learning_rate": 2.1654453191488673e-05, "loss": 0.0893, "num_input_tokens_seen": 19404624, "step": 21710 }, { "epoch": 5.7311600897452815, "grad_norm": 0.0433732308447361, "learning_rate": 2.1644724276919846e-05, "loss": 0.0272, "num_input_tokens_seen": 19409008, "step": 21715 }, { "epoch": 5.732479873300779, "grad_norm": 0.1991540491580963, "learning_rate": 2.1634995879776715e-05, "loss": 0.0281, "num_input_tokens_seen": 19413872, "step": 21720 }, { "epoch": 5.7337996568562755, "grad_norm": 0.3610057830810547, "learning_rate": 2.162526800155949e-05, "loss": 0.1048, "num_input_tokens_seen": 19417904, "step": 21725 }, { "epoch": 5.735119440411772, "grad_norm": 0.13592877984046936, "learning_rate": 2.1615540643768363e-05, "loss": 0.0272, "num_input_tokens_seen": 19422576, "step": 21730 }, { "epoch": 5.7364392239672695, "grad_norm": 0.19473311305046082, "learning_rate": 2.160581380790339e-05, "loss": 0.0316, "num_input_tokens_seen": 19426864, "step": 21735 }, { "epoch": 5.737759007522766, "grad_norm": 0.6999038457870483, "learning_rate": 2.1596087495464586e-05, "loss": 0.0846, "num_input_tokens_seen": 19431408, "step": 21740 }, { "epoch": 5.7390787910782635, "grad_norm": 0.06212056055665016, "learning_rate": 2.1586361707951866e-05, "loss": 0.0444, "num_input_tokens_seen": 19435760, "step": 21745 }, { "epoch": 5.74039857463376, "grad_norm": 0.21014398336410522, "learning_rate": 2.157663644686507e-05, "loss": 0.0333, "num_input_tokens_seen": 19440336, "step": 21750 }, { "epoch": 5.741718358189257, "grad_norm": 0.285422682762146, "learning_rate": 2.156691171370396e-05, "loss": 0.0616, "num_input_tokens_seen": 19444624, "step": 21755 }, { "epoch": 5.743038141744754, "grad_norm": 0.14195486903190613, "learning_rate": 2.1557187509968195e-05, "loss": 0.0729, "num_input_tokens_seen": 19449168, "step": 21760 }, { "epoch": 5.744357925300251, "grad_norm": 0.19457361102104187, "learning_rate": 2.1547463837157382e-05, "loss": 0.028, "num_input_tokens_seen": 19453744, "step": 21765 }, { "epoch": 5.745677708855748, "grad_norm": 0.27239757776260376, "learning_rate": 2.1537740696771045e-05, "loss": 0.1174, "num_input_tokens_seen": 19458192, "step": 21770 }, { "epoch": 5.746997492411245, "grad_norm": 0.053028497844934464, "learning_rate": 2.1528018090308587e-05, "loss": 0.0325, "num_input_tokens_seen": 19462416, "step": 21775 }, { "epoch": 5.748317275966741, "grad_norm": 0.11894752085208893, "learning_rate": 2.151829601926938e-05, "loss": 0.0298, "num_input_tokens_seen": 19467024, "step": 21780 }, { "epoch": 5.749637059522239, "grad_norm": 0.2708461582660675, "learning_rate": 2.1508574485152684e-05, "loss": 0.0916, "num_input_tokens_seen": 19471856, "step": 21785 }, { "epoch": 5.750956843077735, "grad_norm": 0.40038877725601196, "learning_rate": 2.1498853489457667e-05, "loss": 0.052, "num_input_tokens_seen": 19476336, "step": 21790 }, { "epoch": 5.752276626633233, "grad_norm": 0.239413782954216, "learning_rate": 2.1489133033683455e-05, "loss": 0.0871, "num_input_tokens_seen": 19480816, "step": 21795 }, { "epoch": 5.753596410188729, "grad_norm": 0.13613425195217133, "learning_rate": 2.1479413119329038e-05, "loss": 0.0772, "num_input_tokens_seen": 19485200, "step": 21800 }, { "epoch": 5.753596410188729, "eval_loss": 0.06795632094144821, "eval_runtime": 64.8305, "eval_samples_per_second": 103.886, "eval_steps_per_second": 25.975, "num_input_tokens_seen": 19485200, "step": 21800 }, { "epoch": 5.754916193744226, "grad_norm": 0.09529406577348709, "learning_rate": 2.1469693747893355e-05, "loss": 0.0212, "num_input_tokens_seen": 19489616, "step": 21805 }, { "epoch": 5.756235977299723, "grad_norm": 0.048222169280052185, "learning_rate": 2.1459974920875274e-05, "loss": 0.0278, "num_input_tokens_seen": 19494192, "step": 21810 }, { "epoch": 5.75755576085522, "grad_norm": 0.10930148512125015, "learning_rate": 2.145025663977354e-05, "loss": 0.1443, "num_input_tokens_seen": 19498640, "step": 21815 }, { "epoch": 5.758875544410717, "grad_norm": 0.22699697315692902, "learning_rate": 2.1440538906086844e-05, "loss": 0.0676, "num_input_tokens_seen": 19503152, "step": 21820 }, { "epoch": 5.760195327966214, "grad_norm": 0.06325110048055649, "learning_rate": 2.1430821721313782e-05, "loss": 0.0385, "num_input_tokens_seen": 19507664, "step": 21825 }, { "epoch": 5.76151511152171, "grad_norm": 0.09122063219547272, "learning_rate": 2.142110508695286e-05, "loss": 0.0335, "num_input_tokens_seen": 19511888, "step": 21830 }, { "epoch": 5.762834895077208, "grad_norm": 0.25243082642555237, "learning_rate": 2.1411389004502515e-05, "loss": 0.0682, "num_input_tokens_seen": 19516400, "step": 21835 }, { "epoch": 5.764154678632704, "grad_norm": 0.1664630025625229, "learning_rate": 2.140167347546107e-05, "loss": 0.0772, "num_input_tokens_seen": 19521008, "step": 21840 }, { "epoch": 5.765474462188201, "grad_norm": 0.08837154507637024, "learning_rate": 2.1391958501326793e-05, "loss": 0.0373, "num_input_tokens_seen": 19525488, "step": 21845 }, { "epoch": 5.766794245743698, "grad_norm": 0.3557136654853821, "learning_rate": 2.1382244083597873e-05, "loss": 0.1501, "num_input_tokens_seen": 19529744, "step": 21850 }, { "epoch": 5.768114029299195, "grad_norm": 0.06833229959011078, "learning_rate": 2.137253022377237e-05, "loss": 0.0637, "num_input_tokens_seen": 19533840, "step": 21855 }, { "epoch": 5.769433812854691, "grad_norm": 0.1684255301952362, "learning_rate": 2.136281692334829e-05, "loss": 0.0645, "num_input_tokens_seen": 19538192, "step": 21860 }, { "epoch": 5.770753596410189, "grad_norm": 0.2881527245044708, "learning_rate": 2.135310418382356e-05, "loss": 0.0488, "num_input_tokens_seen": 19543024, "step": 21865 }, { "epoch": 5.772073379965685, "grad_norm": 0.029740381985902786, "learning_rate": 2.134339200669598e-05, "loss": 0.031, "num_input_tokens_seen": 19547504, "step": 21870 }, { "epoch": 5.773393163521183, "grad_norm": 0.26382920145988464, "learning_rate": 2.133368039346331e-05, "loss": 0.051, "num_input_tokens_seen": 19552080, "step": 21875 }, { "epoch": 5.774712947076679, "grad_norm": 0.07745521515607834, "learning_rate": 2.1323969345623195e-05, "loss": 0.0842, "num_input_tokens_seen": 19556496, "step": 21880 }, { "epoch": 5.776032730632176, "grad_norm": 0.28076961636543274, "learning_rate": 2.1314258864673207e-05, "loss": 0.0884, "num_input_tokens_seen": 19560848, "step": 21885 }, { "epoch": 5.777352514187673, "grad_norm": 0.07795155793428421, "learning_rate": 2.130454895211082e-05, "loss": 0.0403, "num_input_tokens_seen": 19565296, "step": 21890 }, { "epoch": 5.77867229774317, "grad_norm": 0.12019369751214981, "learning_rate": 2.129483960943342e-05, "loss": 0.0262, "num_input_tokens_seen": 19569712, "step": 21895 }, { "epoch": 5.779992081298667, "grad_norm": 0.43047478795051575, "learning_rate": 2.128513083813831e-05, "loss": 0.1247, "num_input_tokens_seen": 19574000, "step": 21900 }, { "epoch": 5.781311864854164, "grad_norm": 0.1759934425354004, "learning_rate": 2.1275422639722724e-05, "loss": 0.058, "num_input_tokens_seen": 19578384, "step": 21905 }, { "epoch": 5.7826316484096605, "grad_norm": 0.3217279613018036, "learning_rate": 2.126571501568376e-05, "loss": 0.0739, "num_input_tokens_seen": 19583280, "step": 21910 }, { "epoch": 5.783951431965158, "grad_norm": 0.27746906876564026, "learning_rate": 2.1256007967518478e-05, "loss": 0.0569, "num_input_tokens_seen": 19587632, "step": 21915 }, { "epoch": 5.7852712155206545, "grad_norm": 0.1901746541261673, "learning_rate": 2.124630149672381e-05, "loss": 0.1087, "num_input_tokens_seen": 19592368, "step": 21920 }, { "epoch": 5.786590999076152, "grad_norm": 0.3257935345172882, "learning_rate": 2.1236595604796624e-05, "loss": 0.0786, "num_input_tokens_seen": 19596848, "step": 21925 }, { "epoch": 5.7879107826316485, "grad_norm": 0.27395591139793396, "learning_rate": 2.1226890293233693e-05, "loss": 0.0847, "num_input_tokens_seen": 19601296, "step": 21930 }, { "epoch": 5.789230566187145, "grad_norm": 0.17663167417049408, "learning_rate": 2.1217185563531694e-05, "loss": 0.079, "num_input_tokens_seen": 19605872, "step": 21935 }, { "epoch": 5.7905503497426425, "grad_norm": 0.2962487041950226, "learning_rate": 2.120748141718721e-05, "loss": 0.0436, "num_input_tokens_seen": 19610352, "step": 21940 }, { "epoch": 5.791870133298139, "grad_norm": 0.508523166179657, "learning_rate": 2.1197777855696765e-05, "loss": 0.0513, "num_input_tokens_seen": 19614800, "step": 21945 }, { "epoch": 5.7931899168536365, "grad_norm": 0.038787566125392914, "learning_rate": 2.1188074880556746e-05, "loss": 0.0761, "num_input_tokens_seen": 19619216, "step": 21950 }, { "epoch": 5.794509700409133, "grad_norm": 0.022323528304696083, "learning_rate": 2.1178372493263495e-05, "loss": 0.0291, "num_input_tokens_seen": 19623792, "step": 21955 }, { "epoch": 5.79582948396463, "grad_norm": 0.31188657879829407, "learning_rate": 2.116867069531322e-05, "loss": 0.0767, "num_input_tokens_seen": 19628400, "step": 21960 }, { "epoch": 5.797149267520127, "grad_norm": 0.05209731310606003, "learning_rate": 2.1158969488202073e-05, "loss": 0.0469, "num_input_tokens_seen": 19632784, "step": 21965 }, { "epoch": 5.798469051075624, "grad_norm": 0.09773904085159302, "learning_rate": 2.114926887342611e-05, "loss": 0.051, "num_input_tokens_seen": 19637488, "step": 21970 }, { "epoch": 5.79978883463112, "grad_norm": 0.1102888211607933, "learning_rate": 2.113956885248127e-05, "loss": 0.0929, "num_input_tokens_seen": 19641680, "step": 21975 }, { "epoch": 5.801108618186618, "grad_norm": 0.15374262630939484, "learning_rate": 2.112986942686342e-05, "loss": 0.0217, "num_input_tokens_seen": 19646160, "step": 21980 }, { "epoch": 5.802428401742114, "grad_norm": 0.024703998118638992, "learning_rate": 2.112017059806835e-05, "loss": 0.0417, "num_input_tokens_seen": 19650480, "step": 21985 }, { "epoch": 5.803748185297611, "grad_norm": 0.5820759534835815, "learning_rate": 2.1110472367591724e-05, "loss": 0.0939, "num_input_tokens_seen": 19654992, "step": 21990 }, { "epoch": 5.805067968853108, "grad_norm": 0.15050703287124634, "learning_rate": 2.1100774736929145e-05, "loss": 0.0427, "num_input_tokens_seen": 19659472, "step": 21995 }, { "epoch": 5.806387752408605, "grad_norm": 0.34700727462768555, "learning_rate": 2.10910777075761e-05, "loss": 0.061, "num_input_tokens_seen": 19664112, "step": 22000 }, { "epoch": 5.806387752408605, "eval_loss": 0.06791354715824127, "eval_runtime": 64.7415, "eval_samples_per_second": 104.029, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 19664112, "step": 22000 }, { "epoch": 5.807707535964102, "grad_norm": 0.29714256525039673, "learning_rate": 2.108138128102799e-05, "loss": 0.0748, "num_input_tokens_seen": 19668784, "step": 22005 }, { "epoch": 5.809027319519599, "grad_norm": 0.2102889120578766, "learning_rate": 2.107168545878014e-05, "loss": 0.0395, "num_input_tokens_seen": 19673296, "step": 22010 }, { "epoch": 5.810347103075095, "grad_norm": 0.48718398809432983, "learning_rate": 2.106199024232775e-05, "loss": 0.0901, "num_input_tokens_seen": 19677680, "step": 22015 }, { "epoch": 5.811666886630593, "grad_norm": 0.07027366012334824, "learning_rate": 2.105229563316595e-05, "loss": 0.0246, "num_input_tokens_seen": 19682256, "step": 22020 }, { "epoch": 5.812986670186089, "grad_norm": 0.2676008343696594, "learning_rate": 2.1042601632789784e-05, "loss": 0.039, "num_input_tokens_seen": 19686640, "step": 22025 }, { "epoch": 5.814306453741587, "grad_norm": 0.05093734711408615, "learning_rate": 2.103290824269417e-05, "loss": 0.0323, "num_input_tokens_seen": 19691088, "step": 22030 }, { "epoch": 5.815626237297083, "grad_norm": 0.3033090829849243, "learning_rate": 2.1023215464373965e-05, "loss": 0.0602, "num_input_tokens_seen": 19695760, "step": 22035 }, { "epoch": 5.81694602085258, "grad_norm": 0.18333011865615845, "learning_rate": 2.1013523299323908e-05, "loss": 0.0901, "num_input_tokens_seen": 19700144, "step": 22040 }, { "epoch": 5.818265804408077, "grad_norm": 0.18397235870361328, "learning_rate": 2.1003831749038654e-05, "loss": 0.0288, "num_input_tokens_seen": 19704560, "step": 22045 }, { "epoch": 5.819585587963574, "grad_norm": 0.163334459066391, "learning_rate": 2.099414081501277e-05, "loss": 0.0751, "num_input_tokens_seen": 19709040, "step": 22050 }, { "epoch": 5.820905371519071, "grad_norm": 0.5005264282226562, "learning_rate": 2.09844504987407e-05, "loss": 0.0498, "num_input_tokens_seen": 19713296, "step": 22055 }, { "epoch": 5.822225155074568, "grad_norm": 0.3711463212966919, "learning_rate": 2.097476080171683e-05, "loss": 0.0499, "num_input_tokens_seen": 19717904, "step": 22060 }, { "epoch": 5.823544938630064, "grad_norm": 0.09006980061531067, "learning_rate": 2.0965071725435436e-05, "loss": 0.0729, "num_input_tokens_seen": 19722224, "step": 22065 }, { "epoch": 5.824864722185562, "grad_norm": 0.20882688462734222, "learning_rate": 2.0955383271390684e-05, "loss": 0.0839, "num_input_tokens_seen": 19726544, "step": 22070 }, { "epoch": 5.826184505741058, "grad_norm": 0.27874624729156494, "learning_rate": 2.094569544107666e-05, "loss": 0.0519, "num_input_tokens_seen": 19731056, "step": 22075 }, { "epoch": 5.827504289296556, "grad_norm": 0.11233468353748322, "learning_rate": 2.093600823598735e-05, "loss": 0.055, "num_input_tokens_seen": 19735440, "step": 22080 }, { "epoch": 5.828824072852052, "grad_norm": 0.05246885120868683, "learning_rate": 2.092632165761663e-05, "loss": 0.1071, "num_input_tokens_seen": 19739632, "step": 22085 }, { "epoch": 5.830143856407549, "grad_norm": 0.27770912647247314, "learning_rate": 2.091663570745832e-05, "loss": 0.0541, "num_input_tokens_seen": 19744304, "step": 22090 }, { "epoch": 5.831463639963046, "grad_norm": 0.152957946062088, "learning_rate": 2.0906950387006086e-05, "loss": 0.04, "num_input_tokens_seen": 19748656, "step": 22095 }, { "epoch": 5.832783423518543, "grad_norm": 0.6110996603965759, "learning_rate": 2.0897265697753543e-05, "loss": 0.0908, "num_input_tokens_seen": 19753168, "step": 22100 }, { "epoch": 5.8341032070740395, "grad_norm": 0.08941664546728134, "learning_rate": 2.088758164119419e-05, "loss": 0.038, "num_input_tokens_seen": 19757552, "step": 22105 }, { "epoch": 5.835422990629537, "grad_norm": 0.12796351313591003, "learning_rate": 2.0877898218821428e-05, "loss": 0.043, "num_input_tokens_seen": 19762064, "step": 22110 }, { "epoch": 5.8367427741850335, "grad_norm": 0.059940509498119354, "learning_rate": 2.0868215432128565e-05, "loss": 0.035, "num_input_tokens_seen": 19766320, "step": 22115 }, { "epoch": 5.83806255774053, "grad_norm": 0.15658944845199585, "learning_rate": 2.0858533282608796e-05, "loss": 0.0405, "num_input_tokens_seen": 19770544, "step": 22120 }, { "epoch": 5.8393823412960275, "grad_norm": 0.08887448161840439, "learning_rate": 2.084885177175524e-05, "loss": 0.0639, "num_input_tokens_seen": 19775408, "step": 22125 }, { "epoch": 5.840702124851524, "grad_norm": 0.19172848761081696, "learning_rate": 2.0839170901060917e-05, "loss": 0.0559, "num_input_tokens_seen": 19779760, "step": 22130 }, { "epoch": 5.8420219084070215, "grad_norm": 0.22904475033283234, "learning_rate": 2.082949067201872e-05, "loss": 0.1024, "num_input_tokens_seen": 19784464, "step": 22135 }, { "epoch": 5.843341691962518, "grad_norm": 0.17465868592262268, "learning_rate": 2.0819811086121475e-05, "loss": 0.03, "num_input_tokens_seen": 19789072, "step": 22140 }, { "epoch": 5.844661475518015, "grad_norm": 0.31478258967399597, "learning_rate": 2.08101321448619e-05, "loss": 0.067, "num_input_tokens_seen": 19793456, "step": 22145 }, { "epoch": 5.845981259073512, "grad_norm": 0.07886745035648346, "learning_rate": 2.080045384973259e-05, "loss": 0.0544, "num_input_tokens_seen": 19798032, "step": 22150 }, { "epoch": 5.847301042629009, "grad_norm": 0.10830595344305038, "learning_rate": 2.0790776202226082e-05, "loss": 0.0391, "num_input_tokens_seen": 19802640, "step": 22155 }, { "epoch": 5.848620826184506, "grad_norm": 0.47462987899780273, "learning_rate": 2.078109920383477e-05, "loss": 0.054, "num_input_tokens_seen": 19807152, "step": 22160 }, { "epoch": 5.849940609740003, "grad_norm": 0.2559426724910736, "learning_rate": 2.0771422856050978e-05, "loss": 0.0727, "num_input_tokens_seen": 19811472, "step": 22165 }, { "epoch": 5.851260393295499, "grad_norm": 0.06618694961071014, "learning_rate": 2.076174716036693e-05, "loss": 0.0622, "num_input_tokens_seen": 19816048, "step": 22170 }, { "epoch": 5.852580176850997, "grad_norm": 0.28502368927001953, "learning_rate": 2.075207211827472e-05, "loss": 0.087, "num_input_tokens_seen": 19820432, "step": 22175 }, { "epoch": 5.853899960406493, "grad_norm": 0.1422804445028305, "learning_rate": 2.074239773126638e-05, "loss": 0.0268, "num_input_tokens_seen": 19825232, "step": 22180 }, { "epoch": 5.855219743961991, "grad_norm": 0.1599172204732895, "learning_rate": 2.073272400083382e-05, "loss": 0.0517, "num_input_tokens_seen": 19829488, "step": 22185 }, { "epoch": 5.856539527517487, "grad_norm": 0.17204247415065765, "learning_rate": 2.072305092846883e-05, "loss": 0.0724, "num_input_tokens_seen": 19834224, "step": 22190 }, { "epoch": 5.857859311072984, "grad_norm": 0.3195152282714844, "learning_rate": 2.0713378515663152e-05, "loss": 0.0743, "num_input_tokens_seen": 19838896, "step": 22195 }, { "epoch": 5.859179094628481, "grad_norm": 0.3729403614997864, "learning_rate": 2.070370676390836e-05, "loss": 0.0942, "num_input_tokens_seen": 19843216, "step": 22200 }, { "epoch": 5.859179094628481, "eval_loss": 0.06794528663158417, "eval_runtime": 64.7917, "eval_samples_per_second": 103.948, "eval_steps_per_second": 25.991, "num_input_tokens_seen": 19843216, "step": 22200 }, { "epoch": 5.860498878183978, "grad_norm": 0.21294929087162018, "learning_rate": 2.0694035674695974e-05, "loss": 0.0506, "num_input_tokens_seen": 19847632, "step": 22205 }, { "epoch": 5.861818661739475, "grad_norm": 0.20426017045974731, "learning_rate": 2.0684365249517416e-05, "loss": 0.0359, "num_input_tokens_seen": 19852048, "step": 22210 }, { "epoch": 5.863138445294972, "grad_norm": 0.22793236374855042, "learning_rate": 2.067469548986396e-05, "loss": 0.0479, "num_input_tokens_seen": 19856400, "step": 22215 }, { "epoch": 5.864458228850468, "grad_norm": 0.11497310549020767, "learning_rate": 2.066502639722681e-05, "loss": 0.0331, "num_input_tokens_seen": 19860656, "step": 22220 }, { "epoch": 5.865778012405966, "grad_norm": 0.20773261785507202, "learning_rate": 2.065535797309708e-05, "loss": 0.0473, "num_input_tokens_seen": 19865200, "step": 22225 }, { "epoch": 5.867097795961462, "grad_norm": 0.09034489840269089, "learning_rate": 2.0645690218965736e-05, "loss": 0.071, "num_input_tokens_seen": 19869456, "step": 22230 }, { "epoch": 5.868417579516959, "grad_norm": 0.0734708234667778, "learning_rate": 2.063602313632369e-05, "loss": 0.0204, "num_input_tokens_seen": 19873872, "step": 22235 }, { "epoch": 5.869737363072456, "grad_norm": 0.0815160870552063, "learning_rate": 2.0626356726661704e-05, "loss": 0.0289, "num_input_tokens_seen": 19878320, "step": 22240 }, { "epoch": 5.871057146627953, "grad_norm": 0.04125366359949112, "learning_rate": 2.0616690991470477e-05, "loss": 0.1512, "num_input_tokens_seen": 19883056, "step": 22245 }, { "epoch": 5.872376930183449, "grad_norm": 0.15904560685157776, "learning_rate": 2.0607025932240595e-05, "loss": 0.0314, "num_input_tokens_seen": 19887536, "step": 22250 }, { "epoch": 5.873696713738947, "grad_norm": 0.0795137956738472, "learning_rate": 2.059736155046251e-05, "loss": 0.0436, "num_input_tokens_seen": 19892112, "step": 22255 }, { "epoch": 5.875016497294443, "grad_norm": 0.06374108046293259, "learning_rate": 2.0587697847626603e-05, "loss": 0.0305, "num_input_tokens_seen": 19896336, "step": 22260 }, { "epoch": 5.876336280849941, "grad_norm": 0.2981642782688141, "learning_rate": 2.057803482522314e-05, "loss": 0.0845, "num_input_tokens_seen": 19900592, "step": 22265 }, { "epoch": 5.877656064405437, "grad_norm": 0.206697478890419, "learning_rate": 2.056837248474227e-05, "loss": 0.059, "num_input_tokens_seen": 19905456, "step": 22270 }, { "epoch": 5.878975847960934, "grad_norm": 0.40407702326774597, "learning_rate": 2.0558710827674064e-05, "loss": 0.0736, "num_input_tokens_seen": 19909968, "step": 22275 }, { "epoch": 5.880295631516431, "grad_norm": 0.643403172492981, "learning_rate": 2.054904985550845e-05, "loss": 0.1173, "num_input_tokens_seen": 19914544, "step": 22280 }, { "epoch": 5.881615415071928, "grad_norm": 0.04485026001930237, "learning_rate": 2.0539389569735287e-05, "loss": 0.0482, "num_input_tokens_seen": 19919088, "step": 22285 }, { "epoch": 5.882935198627425, "grad_norm": 0.37334638833999634, "learning_rate": 2.052972997184431e-05, "loss": 0.0825, "num_input_tokens_seen": 19923280, "step": 22290 }, { "epoch": 5.884254982182922, "grad_norm": 0.3444516658782959, "learning_rate": 2.0520071063325146e-05, "loss": 0.0841, "num_input_tokens_seen": 19928080, "step": 22295 }, { "epoch": 5.8855747657384185, "grad_norm": 0.12093238532543182, "learning_rate": 2.051041284566732e-05, "loss": 0.0728, "num_input_tokens_seen": 19932592, "step": 22300 }, { "epoch": 5.886894549293916, "grad_norm": 0.11455899477005005, "learning_rate": 2.050075532036026e-05, "loss": 0.0732, "num_input_tokens_seen": 19936944, "step": 22305 }, { "epoch": 5.8882143328494125, "grad_norm": 0.36267268657684326, "learning_rate": 2.0491098488893264e-05, "loss": 0.0699, "num_input_tokens_seen": 19941616, "step": 22310 }, { "epoch": 5.88953411640491, "grad_norm": 0.17083221673965454, "learning_rate": 2.0481442352755546e-05, "loss": 0.0902, "num_input_tokens_seen": 19946160, "step": 22315 }, { "epoch": 5.8908538999604065, "grad_norm": 0.13374413549900055, "learning_rate": 2.0471786913436198e-05, "loss": 0.0391, "num_input_tokens_seen": 19951024, "step": 22320 }, { "epoch": 5.892173683515903, "grad_norm": 0.637289822101593, "learning_rate": 2.0462132172424218e-05, "loss": 0.1144, "num_input_tokens_seen": 19955632, "step": 22325 }, { "epoch": 5.8934934670714005, "grad_norm": 0.19507372379302979, "learning_rate": 2.0452478131208484e-05, "loss": 0.0686, "num_input_tokens_seen": 19960048, "step": 22330 }, { "epoch": 5.894813250626897, "grad_norm": 0.2867993712425232, "learning_rate": 2.0442824791277765e-05, "loss": 0.0589, "num_input_tokens_seen": 19964496, "step": 22335 }, { "epoch": 5.8961330341823945, "grad_norm": 0.18662673234939575, "learning_rate": 2.0433172154120727e-05, "loss": 0.0954, "num_input_tokens_seen": 19969008, "step": 22340 }, { "epoch": 5.897452817737891, "grad_norm": 0.22194775938987732, "learning_rate": 2.0423520221225947e-05, "loss": 0.0613, "num_input_tokens_seen": 19973616, "step": 22345 }, { "epoch": 5.898772601293388, "grad_norm": 0.42935243248939514, "learning_rate": 2.0413868994081848e-05, "loss": 0.1126, "num_input_tokens_seen": 19977872, "step": 22350 }, { "epoch": 5.900092384848885, "grad_norm": 0.04585396498441696, "learning_rate": 2.0404218474176795e-05, "loss": 0.0892, "num_input_tokens_seen": 19982704, "step": 22355 }, { "epoch": 5.901412168404382, "grad_norm": 0.07347700744867325, "learning_rate": 2.0394568662999002e-05, "loss": 0.0404, "num_input_tokens_seen": 19987088, "step": 22360 }, { "epoch": 5.902731951959878, "grad_norm": 0.10140080749988556, "learning_rate": 2.0384919562036593e-05, "loss": 0.0423, "num_input_tokens_seen": 19991664, "step": 22365 }, { "epoch": 5.904051735515376, "grad_norm": 0.2450418621301651, "learning_rate": 2.0375271172777593e-05, "loss": 0.0803, "num_input_tokens_seen": 19995888, "step": 22370 }, { "epoch": 5.905371519070872, "grad_norm": 0.08728967607021332, "learning_rate": 2.0365623496709885e-05, "loss": 0.0227, "num_input_tokens_seen": 20000240, "step": 22375 }, { "epoch": 5.90669130262637, "grad_norm": 0.07881025224924088, "learning_rate": 2.0355976535321283e-05, "loss": 0.0476, "num_input_tokens_seen": 20004688, "step": 22380 }, { "epoch": 5.908011086181866, "grad_norm": 0.3610896170139313, "learning_rate": 2.034633029009945e-05, "loss": 0.0737, "num_input_tokens_seen": 20009264, "step": 22385 }, { "epoch": 5.909330869737363, "grad_norm": 0.26583799719810486, "learning_rate": 2.0336684762531972e-05, "loss": 0.0979, "num_input_tokens_seen": 20013712, "step": 22390 }, { "epoch": 5.91065065329286, "grad_norm": 0.1178184300661087, "learning_rate": 2.032703995410631e-05, "loss": 0.0693, "num_input_tokens_seen": 20018288, "step": 22395 }, { "epoch": 5.911970436848357, "grad_norm": 0.19758890569210052, "learning_rate": 2.031739586630981e-05, "loss": 0.031, "num_input_tokens_seen": 20022672, "step": 22400 }, { "epoch": 5.911970436848357, "eval_loss": 0.06776358187198639, "eval_runtime": 64.7323, "eval_samples_per_second": 104.044, "eval_steps_per_second": 26.015, "num_input_tokens_seen": 20022672, "step": 22400 }, { "epoch": 5.913290220403853, "grad_norm": 0.34361952543258667, "learning_rate": 2.0307752500629707e-05, "loss": 0.0628, "num_input_tokens_seen": 20027120, "step": 22405 }, { "epoch": 5.914610003959351, "grad_norm": 0.2447052150964737, "learning_rate": 2.0298109858553144e-05, "loss": 0.0901, "num_input_tokens_seen": 20031504, "step": 22410 }, { "epoch": 5.915929787514847, "grad_norm": 0.17563025653362274, "learning_rate": 2.028846794156712e-05, "loss": 0.0348, "num_input_tokens_seen": 20035952, "step": 22415 }, { "epoch": 5.917249571070345, "grad_norm": 0.06616902351379395, "learning_rate": 2.027882675115856e-05, "loss": 0.1116, "num_input_tokens_seen": 20040464, "step": 22420 }, { "epoch": 5.918569354625841, "grad_norm": 0.12638072669506073, "learning_rate": 2.026918628881423e-05, "loss": 0.0387, "num_input_tokens_seen": 20045136, "step": 22425 }, { "epoch": 5.919889138181338, "grad_norm": 0.11560352146625519, "learning_rate": 2.0259546556020833e-05, "loss": 0.0455, "num_input_tokens_seen": 20049680, "step": 22430 }, { "epoch": 5.921208921736835, "grad_norm": 0.11841684579849243, "learning_rate": 2.024990755426493e-05, "loss": 0.0943, "num_input_tokens_seen": 20053840, "step": 22435 }, { "epoch": 5.922528705292332, "grad_norm": 0.39667266607284546, "learning_rate": 2.0240269285032975e-05, "loss": 0.067, "num_input_tokens_seen": 20058160, "step": 22440 }, { "epoch": 5.923848488847829, "grad_norm": 0.06110318377614021, "learning_rate": 2.0230631749811306e-05, "loss": 0.0384, "num_input_tokens_seen": 20062640, "step": 22445 }, { "epoch": 5.925168272403326, "grad_norm": 0.07051922380924225, "learning_rate": 2.0220994950086162e-05, "loss": 0.0578, "num_input_tokens_seen": 20066800, "step": 22450 }, { "epoch": 5.926488055958822, "grad_norm": 0.07160522043704987, "learning_rate": 2.021135888734365e-05, "loss": 0.0433, "num_input_tokens_seen": 20071152, "step": 22455 }, { "epoch": 5.92780783951432, "grad_norm": 0.19556304812431335, "learning_rate": 2.0201723563069783e-05, "loss": 0.0696, "num_input_tokens_seen": 20075376, "step": 22460 }, { "epoch": 5.929127623069816, "grad_norm": 0.09849676489830017, "learning_rate": 2.0192088978750433e-05, "loss": 0.0291, "num_input_tokens_seen": 20080048, "step": 22465 }, { "epoch": 5.930447406625314, "grad_norm": 0.352900892496109, "learning_rate": 2.0182455135871385e-05, "loss": 0.0893, "num_input_tokens_seen": 20084400, "step": 22470 }, { "epoch": 5.9317671901808104, "grad_norm": 0.07026854157447815, "learning_rate": 2.0172822035918305e-05, "loss": 0.0524, "num_input_tokens_seen": 20089264, "step": 22475 }, { "epoch": 5.933086973736307, "grad_norm": 0.4231656491756439, "learning_rate": 2.016318968037671e-05, "loss": 0.0689, "num_input_tokens_seen": 20093808, "step": 22480 }, { "epoch": 5.9344067572918044, "grad_norm": 0.07129927724599838, "learning_rate": 2.015355807073206e-05, "loss": 0.0459, "num_input_tokens_seen": 20098544, "step": 22485 }, { "epoch": 5.935726540847301, "grad_norm": 0.07132765650749207, "learning_rate": 2.0143927208469664e-05, "loss": 0.0801, "num_input_tokens_seen": 20102992, "step": 22490 }, { "epoch": 5.937046324402798, "grad_norm": 0.225108340382576, "learning_rate": 2.0134297095074708e-05, "loss": 0.0351, "num_input_tokens_seen": 20107696, "step": 22495 }, { "epoch": 5.938366107958295, "grad_norm": 0.12584401667118073, "learning_rate": 2.0124667732032297e-05, "loss": 0.0588, "num_input_tokens_seen": 20112336, "step": 22500 }, { "epoch": 5.939685891513792, "grad_norm": 0.2663365602493286, "learning_rate": 2.011503912082738e-05, "loss": 0.1231, "num_input_tokens_seen": 20116368, "step": 22505 }, { "epoch": 5.941005675069289, "grad_norm": 0.18252389132976532, "learning_rate": 2.0105411262944823e-05, "loss": 0.0514, "num_input_tokens_seen": 20120976, "step": 22510 }, { "epoch": 5.942325458624786, "grad_norm": 0.24875110387802124, "learning_rate": 2.0095784159869366e-05, "loss": 0.0851, "num_input_tokens_seen": 20125392, "step": 22515 }, { "epoch": 5.943645242180282, "grad_norm": 0.07982087880373001, "learning_rate": 2.0086157813085608e-05, "loss": 0.0601, "num_input_tokens_seen": 20129776, "step": 22520 }, { "epoch": 5.94496502573578, "grad_norm": 0.17628490924835205, "learning_rate": 2.0076532224078068e-05, "loss": 0.0512, "num_input_tokens_seen": 20134320, "step": 22525 }, { "epoch": 5.946284809291276, "grad_norm": 0.17296676337718964, "learning_rate": 2.0066907394331142e-05, "loss": 0.0761, "num_input_tokens_seen": 20138992, "step": 22530 }, { "epoch": 5.947604592846773, "grad_norm": 0.3404502272605896, "learning_rate": 2.0057283325329077e-05, "loss": 0.0623, "num_input_tokens_seen": 20143504, "step": 22535 }, { "epoch": 5.94892437640227, "grad_norm": 0.07860489934682846, "learning_rate": 2.0047660018556047e-05, "loss": 0.0791, "num_input_tokens_seen": 20147472, "step": 22540 }, { "epoch": 5.950244159957767, "grad_norm": 0.13407465815544128, "learning_rate": 2.0038037475496075e-05, "loss": 0.0486, "num_input_tokens_seen": 20151824, "step": 22545 }, { "epoch": 5.951563943513264, "grad_norm": 0.2956903576850891, "learning_rate": 2.0028415697633073e-05, "loss": 0.0651, "num_input_tokens_seen": 20156464, "step": 22550 }, { "epoch": 5.952883727068761, "grad_norm": 0.07711460441350937, "learning_rate": 2.0018794686450858e-05, "loss": 0.0438, "num_input_tokens_seen": 20161072, "step": 22555 }, { "epoch": 5.954203510624257, "grad_norm": 0.04942665994167328, "learning_rate": 2.0009174443433088e-05, "loss": 0.0362, "num_input_tokens_seen": 20165808, "step": 22560 }, { "epoch": 5.955523294179755, "grad_norm": 0.20319174230098724, "learning_rate": 1.999955497006334e-05, "loss": 0.0435, "num_input_tokens_seen": 20170160, "step": 22565 }, { "epoch": 5.956843077735251, "grad_norm": 0.17572377622127533, "learning_rate": 1.9989936267825067e-05, "loss": 0.0229, "num_input_tokens_seen": 20174768, "step": 22570 }, { "epoch": 5.958162861290749, "grad_norm": 0.027589555829763412, "learning_rate": 1.9980318338201572e-05, "loss": 0.0807, "num_input_tokens_seen": 20179344, "step": 22575 }, { "epoch": 5.959482644846245, "grad_norm": 0.1353382170200348, "learning_rate": 1.997070118267607e-05, "loss": 0.0327, "num_input_tokens_seen": 20183952, "step": 22580 }, { "epoch": 5.960802428401742, "grad_norm": 0.27334097027778625, "learning_rate": 1.9961084802731654e-05, "loss": 0.0676, "num_input_tokens_seen": 20188336, "step": 22585 }, { "epoch": 5.962122211957239, "grad_norm": 0.11162400990724564, "learning_rate": 1.9951469199851273e-05, "loss": 0.0322, "num_input_tokens_seen": 20192688, "step": 22590 }, { "epoch": 5.963441995512736, "grad_norm": 0.08960824459791183, "learning_rate": 1.99418543755178e-05, "loss": 0.02, "num_input_tokens_seen": 20196848, "step": 22595 }, { "epoch": 5.964761779068233, "grad_norm": 0.2577708959579468, "learning_rate": 1.9932240331213936e-05, "loss": 0.0648, "num_input_tokens_seen": 20201808, "step": 22600 }, { "epoch": 5.964761779068233, "eval_loss": 0.06784471869468689, "eval_runtime": 64.8003, "eval_samples_per_second": 103.935, "eval_steps_per_second": 25.988, "num_input_tokens_seen": 20201808, "step": 22600 }, { "epoch": 5.96608156262373, "grad_norm": 0.11876357346773148, "learning_rate": 1.9922627068422297e-05, "loss": 0.0545, "num_input_tokens_seen": 20206064, "step": 22605 }, { "epoch": 5.967401346179226, "grad_norm": 0.09185434877872467, "learning_rate": 1.991301458862538e-05, "loss": 0.0882, "num_input_tokens_seen": 20210480, "step": 22610 }, { "epoch": 5.968721129734724, "grad_norm": 0.21985825896263123, "learning_rate": 1.9903402893305536e-05, "loss": 0.0806, "num_input_tokens_seen": 20215024, "step": 22615 }, { "epoch": 5.97004091329022, "grad_norm": 0.15698637068271637, "learning_rate": 1.9893791983945016e-05, "loss": 0.0478, "num_input_tokens_seen": 20219312, "step": 22620 }, { "epoch": 5.971360696845718, "grad_norm": 0.14252899587154388, "learning_rate": 1.988418186202594e-05, "loss": 0.0242, "num_input_tokens_seen": 20223920, "step": 22625 }, { "epoch": 5.972680480401214, "grad_norm": 0.17126336693763733, "learning_rate": 1.98745725290303e-05, "loss": 0.058, "num_input_tokens_seen": 20228144, "step": 22630 }, { "epoch": 5.974000263956711, "grad_norm": 0.1142592802643776, "learning_rate": 1.986496398644e-05, "loss": 0.0362, "num_input_tokens_seen": 20232752, "step": 22635 }, { "epoch": 5.975320047512208, "grad_norm": 0.07449564337730408, "learning_rate": 1.9855356235736777e-05, "loss": 0.038, "num_input_tokens_seen": 20237648, "step": 22640 }, { "epoch": 5.976639831067705, "grad_norm": 0.596913754940033, "learning_rate": 1.9845749278402277e-05, "loss": 0.0427, "num_input_tokens_seen": 20242064, "step": 22645 }, { "epoch": 5.9779596146232015, "grad_norm": 0.22089427709579468, "learning_rate": 1.9836143115918006e-05, "loss": 0.0739, "num_input_tokens_seen": 20246608, "step": 22650 }, { "epoch": 5.979279398178699, "grad_norm": 0.06661208719015121, "learning_rate": 1.9826537749765367e-05, "loss": 0.0609, "num_input_tokens_seen": 20251120, "step": 22655 }, { "epoch": 5.9805991817341955, "grad_norm": 0.1210612803697586, "learning_rate": 1.9816933181425625e-05, "loss": 0.0279, "num_input_tokens_seen": 20255440, "step": 22660 }, { "epoch": 5.981918965289692, "grad_norm": 0.23772338032722473, "learning_rate": 1.9807329412379903e-05, "loss": 0.0486, "num_input_tokens_seen": 20259760, "step": 22665 }, { "epoch": 5.9832387488451895, "grad_norm": 0.05137418210506439, "learning_rate": 1.9797726444109247e-05, "loss": 0.0659, "num_input_tokens_seen": 20264272, "step": 22670 }, { "epoch": 5.984558532400686, "grad_norm": 0.4696442782878876, "learning_rate": 1.9788124278094557e-05, "loss": 0.0922, "num_input_tokens_seen": 20268592, "step": 22675 }, { "epoch": 5.9858783159561835, "grad_norm": 0.03975774347782135, "learning_rate": 1.9778522915816594e-05, "loss": 0.044, "num_input_tokens_seen": 20272944, "step": 22680 }, { "epoch": 5.98719809951168, "grad_norm": 0.2779412269592285, "learning_rate": 1.9768922358756014e-05, "loss": 0.0425, "num_input_tokens_seen": 20277584, "step": 22685 }, { "epoch": 5.988517883067177, "grad_norm": 0.10319958627223969, "learning_rate": 1.9759322608393353e-05, "loss": 0.0913, "num_input_tokens_seen": 20282256, "step": 22690 }, { "epoch": 5.989837666622674, "grad_norm": 0.08342566341161728, "learning_rate": 1.9749723666208992e-05, "loss": 0.096, "num_input_tokens_seen": 20286416, "step": 22695 }, { "epoch": 5.991157450178171, "grad_norm": 0.22150297462940216, "learning_rate": 1.9740125533683235e-05, "loss": 0.0861, "num_input_tokens_seen": 20290864, "step": 22700 }, { "epoch": 5.992477233733668, "grad_norm": 0.2464013397693634, "learning_rate": 1.9730528212296208e-05, "loss": 0.0231, "num_input_tokens_seen": 20295440, "step": 22705 }, { "epoch": 5.993797017289165, "grad_norm": 0.024801168590784073, "learning_rate": 1.9720931703527945e-05, "loss": 0.0192, "num_input_tokens_seen": 20299984, "step": 22710 }, { "epoch": 5.995116800844661, "grad_norm": 0.10164182633161545, "learning_rate": 1.9711336008858373e-05, "loss": 0.0544, "num_input_tokens_seen": 20304560, "step": 22715 }, { "epoch": 5.996436584400159, "grad_norm": 0.2264719307422638, "learning_rate": 1.9701741129767233e-05, "loss": 0.062, "num_input_tokens_seen": 20309200, "step": 22720 }, { "epoch": 5.997756367955655, "grad_norm": 0.047787144780159, "learning_rate": 1.9692147067734202e-05, "loss": 0.0684, "num_input_tokens_seen": 20313744, "step": 22725 }, { "epoch": 5.999076151511153, "grad_norm": 0.24971498548984528, "learning_rate": 1.96825538242388e-05, "loss": 0.0419, "num_input_tokens_seen": 20318000, "step": 22730 }, { "epoch": 6.000263956711099, "grad_norm": 0.1991625726222992, "learning_rate": 1.967296140076041e-05, "loss": 0.0843, "num_input_tokens_seen": 20321824, "step": 22735 }, { "epoch": 6.001583740266597, "grad_norm": 0.4505075216293335, "learning_rate": 1.966336979877833e-05, "loss": 0.0678, "num_input_tokens_seen": 20326432, "step": 22740 }, { "epoch": 6.002903523822093, "grad_norm": 0.05449704825878143, "learning_rate": 1.9653779019771678e-05, "loss": 0.0397, "num_input_tokens_seen": 20331008, "step": 22745 }, { "epoch": 6.00422330737759, "grad_norm": 0.18573202192783356, "learning_rate": 1.9644189065219488e-05, "loss": 0.0431, "num_input_tokens_seen": 20335520, "step": 22750 }, { "epoch": 6.005543090933087, "grad_norm": 0.2909197211265564, "learning_rate": 1.9634599936600655e-05, "loss": 0.0547, "num_input_tokens_seen": 20340064, "step": 22755 }, { "epoch": 6.006862874488584, "grad_norm": 0.3009883165359497, "learning_rate": 1.9625011635393935e-05, "loss": 0.0771, "num_input_tokens_seen": 20344736, "step": 22760 }, { "epoch": 6.008182658044081, "grad_norm": 0.22672471404075623, "learning_rate": 1.9615424163077963e-05, "loss": 0.0432, "num_input_tokens_seen": 20349536, "step": 22765 }, { "epoch": 6.009502441599578, "grad_norm": 0.03663812577724457, "learning_rate": 1.9605837521131263e-05, "loss": 0.0466, "num_input_tokens_seen": 20354048, "step": 22770 }, { "epoch": 6.010822225155074, "grad_norm": 0.37975892424583435, "learning_rate": 1.9596251711032192e-05, "loss": 0.0538, "num_input_tokens_seen": 20358592, "step": 22775 }, { "epoch": 6.012142008710572, "grad_norm": 0.37234705686569214, "learning_rate": 1.958666673425903e-05, "loss": 0.0822, "num_input_tokens_seen": 20362752, "step": 22780 }, { "epoch": 6.013461792266068, "grad_norm": 0.0928589329123497, "learning_rate": 1.957708259228987e-05, "loss": 0.0291, "num_input_tokens_seen": 20367232, "step": 22785 }, { "epoch": 6.014781575821566, "grad_norm": 0.3026881814002991, "learning_rate": 1.956749928660273e-05, "loss": 0.0849, "num_input_tokens_seen": 20371712, "step": 22790 }, { "epoch": 6.016101359377062, "grad_norm": 0.34005528688430786, "learning_rate": 1.955791681867547e-05, "loss": 0.0679, "num_input_tokens_seen": 20376096, "step": 22795 }, { "epoch": 6.017421142932559, "grad_norm": 0.47880837321281433, "learning_rate": 1.9548335189985824e-05, "loss": 0.0647, "num_input_tokens_seen": 20380512, "step": 22800 }, { "epoch": 6.017421142932559, "eval_loss": 0.06773233413696289, "eval_runtime": 64.7924, "eval_samples_per_second": 103.947, "eval_steps_per_second": 25.991, "num_input_tokens_seen": 20380512, "step": 22800 }, { "epoch": 6.018740926488056, "grad_norm": 0.2146192044019699, "learning_rate": 1.9538754402011396e-05, "loss": 0.0558, "num_input_tokens_seen": 20384864, "step": 22805 }, { "epoch": 6.020060710043553, "grad_norm": 0.26803654432296753, "learning_rate": 1.952917445622968e-05, "loss": 0.1158, "num_input_tokens_seen": 20389280, "step": 22810 }, { "epoch": 6.021380493599049, "grad_norm": 0.08758390694856644, "learning_rate": 1.9519595354118005e-05, "loss": 0.0199, "num_input_tokens_seen": 20393632, "step": 22815 }, { "epoch": 6.022700277154547, "grad_norm": 0.07868046313524246, "learning_rate": 1.951001709715361e-05, "loss": 0.044, "num_input_tokens_seen": 20398016, "step": 22820 }, { "epoch": 6.024020060710043, "grad_norm": 0.04365616664290428, "learning_rate": 1.9500439686813556e-05, "loss": 0.0461, "num_input_tokens_seen": 20402432, "step": 22825 }, { "epoch": 6.025339844265541, "grad_norm": 0.09426572918891907, "learning_rate": 1.949086312457482e-05, "loss": 0.0232, "num_input_tokens_seen": 20406656, "step": 22830 }, { "epoch": 6.026659627821037, "grad_norm": 0.034761928021907806, "learning_rate": 1.9481287411914223e-05, "loss": 0.0396, "num_input_tokens_seen": 20411136, "step": 22835 }, { "epoch": 6.027979411376534, "grad_norm": 0.15311692655086517, "learning_rate": 1.9471712550308457e-05, "loss": 0.0767, "num_input_tokens_seen": 20415584, "step": 22840 }, { "epoch": 6.029299194932031, "grad_norm": 0.4278508126735687, "learning_rate": 1.946213854123409e-05, "loss": 0.0714, "num_input_tokens_seen": 20420096, "step": 22845 }, { "epoch": 6.030618978487528, "grad_norm": 0.06508508324623108, "learning_rate": 1.9452565386167554e-05, "loss": 0.0538, "num_input_tokens_seen": 20424640, "step": 22850 }, { "epoch": 6.031938762043025, "grad_norm": 0.31825318932533264, "learning_rate": 1.9442993086585142e-05, "loss": 0.0592, "num_input_tokens_seen": 20429312, "step": 22855 }, { "epoch": 6.033258545598522, "grad_norm": 0.028642315417528152, "learning_rate": 1.9433421643963043e-05, "loss": 0.0687, "num_input_tokens_seen": 20433792, "step": 22860 }, { "epoch": 6.0345783291540185, "grad_norm": 0.25238311290740967, "learning_rate": 1.942385105977727e-05, "loss": 0.075, "num_input_tokens_seen": 20438720, "step": 22865 }, { "epoch": 6.035898112709516, "grad_norm": 0.15502043068408966, "learning_rate": 1.9414281335503743e-05, "loss": 0.0343, "num_input_tokens_seen": 20443232, "step": 22870 }, { "epoch": 6.0372178962650125, "grad_norm": 0.4300287663936615, "learning_rate": 1.9404712472618232e-05, "loss": 0.0732, "num_input_tokens_seen": 20447680, "step": 22875 }, { "epoch": 6.038537679820509, "grad_norm": 0.24386338889598846, "learning_rate": 1.939514447259636e-05, "loss": 0.0643, "num_input_tokens_seen": 20452160, "step": 22880 }, { "epoch": 6.0398574633760065, "grad_norm": 0.16614852845668793, "learning_rate": 1.938557733691365e-05, "loss": 0.0438, "num_input_tokens_seen": 20456544, "step": 22885 }, { "epoch": 6.041177246931503, "grad_norm": 0.08854377269744873, "learning_rate": 1.9376011067045476e-05, "loss": 0.0363, "num_input_tokens_seen": 20461248, "step": 22890 }, { "epoch": 6.0424970304870005, "grad_norm": 0.11368018388748169, "learning_rate": 1.9366445664467065e-05, "loss": 0.0451, "num_input_tokens_seen": 20465632, "step": 22895 }, { "epoch": 6.043816814042497, "grad_norm": 0.05930309742689133, "learning_rate": 1.9356881130653533e-05, "loss": 0.0614, "num_input_tokens_seen": 20470304, "step": 22900 }, { "epoch": 6.045136597597994, "grad_norm": 0.340002179145813, "learning_rate": 1.9347317467079846e-05, "loss": 0.0355, "num_input_tokens_seen": 20474784, "step": 22905 }, { "epoch": 6.046456381153491, "grad_norm": 0.11450517177581787, "learning_rate": 1.9337754675220836e-05, "loss": 0.0624, "num_input_tokens_seen": 20479296, "step": 22910 }, { "epoch": 6.047776164708988, "grad_norm": 0.11731050908565521, "learning_rate": 1.9328192756551218e-05, "loss": 0.0461, "num_input_tokens_seen": 20483584, "step": 22915 }, { "epoch": 6.049095948264485, "grad_norm": 0.33171775937080383, "learning_rate": 1.931863171254555e-05, "loss": 0.106, "num_input_tokens_seen": 20488320, "step": 22920 }, { "epoch": 6.050415731819982, "grad_norm": 0.2961711287498474, "learning_rate": 1.930907154467826e-05, "loss": 0.0737, "num_input_tokens_seen": 20492736, "step": 22925 }, { "epoch": 6.051735515375478, "grad_norm": 0.2513962984085083, "learning_rate": 1.9299512254423673e-05, "loss": 0.0529, "num_input_tokens_seen": 20497216, "step": 22930 }, { "epoch": 6.053055298930976, "grad_norm": 0.03956704959273338, "learning_rate": 1.9289953843255914e-05, "loss": 0.0236, "num_input_tokens_seen": 20501696, "step": 22935 }, { "epoch": 6.054375082486472, "grad_norm": 0.2110985964536667, "learning_rate": 1.9280396312649048e-05, "loss": 0.0449, "num_input_tokens_seen": 20506368, "step": 22940 }, { "epoch": 6.055694866041969, "grad_norm": 0.12699942290782928, "learning_rate": 1.9270839664076936e-05, "loss": 0.0345, "num_input_tokens_seen": 20510688, "step": 22945 }, { "epoch": 6.057014649597466, "grad_norm": 0.08285914361476898, "learning_rate": 1.9261283899013345e-05, "loss": 0.0432, "num_input_tokens_seen": 20515424, "step": 22950 }, { "epoch": 6.058334433152963, "grad_norm": 0.10564824938774109, "learning_rate": 1.92517290189319e-05, "loss": 0.0468, "num_input_tokens_seen": 20520000, "step": 22955 }, { "epoch": 6.05965421670846, "grad_norm": 0.11849465221166611, "learning_rate": 1.924217502530607e-05, "loss": 0.0378, "num_input_tokens_seen": 20524480, "step": 22960 }, { "epoch": 6.060974000263957, "grad_norm": 0.21841368079185486, "learning_rate": 1.9232621919609207e-05, "loss": 0.061, "num_input_tokens_seen": 20529120, "step": 22965 }, { "epoch": 6.062293783819453, "grad_norm": 0.039363421499729156, "learning_rate": 1.9223069703314534e-05, "loss": 0.0854, "num_input_tokens_seen": 20533824, "step": 22970 }, { "epoch": 6.063613567374951, "grad_norm": 0.17661263048648834, "learning_rate": 1.92135183778951e-05, "loss": 0.0256, "num_input_tokens_seen": 20538240, "step": 22975 }, { "epoch": 6.064933350930447, "grad_norm": 0.13207556307315826, "learning_rate": 1.9203967944823857e-05, "loss": 0.0279, "num_input_tokens_seen": 20542624, "step": 22980 }, { "epoch": 6.066253134485945, "grad_norm": 0.33705660700798035, "learning_rate": 1.9194418405573588e-05, "loss": 0.0559, "num_input_tokens_seen": 20547040, "step": 22985 }, { "epoch": 6.067572918041441, "grad_norm": 0.12375576794147491, "learning_rate": 1.9184869761616954e-05, "loss": 0.0613, "num_input_tokens_seen": 20551616, "step": 22990 }, { "epoch": 6.068892701596938, "grad_norm": 0.08161190897226334, "learning_rate": 1.9175322014426495e-05, "loss": 0.0623, "num_input_tokens_seen": 20555712, "step": 22995 }, { "epoch": 6.070212485152435, "grad_norm": 0.5069274306297302, "learning_rate": 1.9165775165474565e-05, "loss": 0.0986, "num_input_tokens_seen": 20560608, "step": 23000 }, { "epoch": 6.070212485152435, "eval_loss": 0.06800545006990433, "eval_runtime": 64.7414, "eval_samples_per_second": 104.029, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 20560608, "step": 23000 }, { "epoch": 6.071532268707932, "grad_norm": 0.1637074500322342, "learning_rate": 1.9156229216233434e-05, "loss": 0.0629, "num_input_tokens_seen": 20565088, "step": 23005 }, { "epoch": 6.072852052263428, "grad_norm": 0.354707807302475, "learning_rate": 1.9146684168175184e-05, "loss": 0.0749, "num_input_tokens_seen": 20569888, "step": 23010 }, { "epoch": 6.074171835818926, "grad_norm": 0.1474297046661377, "learning_rate": 1.9137140022771796e-05, "loss": 0.0748, "num_input_tokens_seen": 20574080, "step": 23015 }, { "epoch": 6.075491619374422, "grad_norm": 0.2152809053659439, "learning_rate": 1.9127596781495103e-05, "loss": 0.0804, "num_input_tokens_seen": 20578528, "step": 23020 }, { "epoch": 6.07681140292992, "grad_norm": 0.04411572590470314, "learning_rate": 1.9118054445816767e-05, "loss": 0.0547, "num_input_tokens_seen": 20583104, "step": 23025 }, { "epoch": 6.078131186485416, "grad_norm": 0.22132724523544312, "learning_rate": 1.9108513017208356e-05, "loss": 0.0564, "num_input_tokens_seen": 20587744, "step": 23030 }, { "epoch": 6.079450970040913, "grad_norm": 0.11532240360975266, "learning_rate": 1.9098972497141287e-05, "loss": 0.0384, "num_input_tokens_seen": 20591872, "step": 23035 }, { "epoch": 6.08077075359641, "grad_norm": 0.04132236912846565, "learning_rate": 1.9089432887086806e-05, "loss": 0.0493, "num_input_tokens_seen": 20596032, "step": 23040 }, { "epoch": 6.082090537151907, "grad_norm": 0.20819427073001862, "learning_rate": 1.9079894188516056e-05, "loss": 0.0429, "num_input_tokens_seen": 20600448, "step": 23045 }, { "epoch": 6.083410320707404, "grad_norm": 0.18943600356578827, "learning_rate": 1.907035640290002e-05, "loss": 0.1047, "num_input_tokens_seen": 20604832, "step": 23050 }, { "epoch": 6.084730104262901, "grad_norm": 0.055999405682086945, "learning_rate": 1.9060819531709534e-05, "loss": 0.0365, "num_input_tokens_seen": 20609152, "step": 23055 }, { "epoch": 6.0860498878183975, "grad_norm": 0.13046874105930328, "learning_rate": 1.9051283576415325e-05, "loss": 0.0532, "num_input_tokens_seen": 20613696, "step": 23060 }, { "epoch": 6.087369671373895, "grad_norm": 0.22738541662693024, "learning_rate": 1.904174853848793e-05, "loss": 0.077, "num_input_tokens_seen": 20618496, "step": 23065 }, { "epoch": 6.0886894549293915, "grad_norm": 0.14440608024597168, "learning_rate": 1.903221441939779e-05, "loss": 0.0343, "num_input_tokens_seen": 20622976, "step": 23070 }, { "epoch": 6.090009238484888, "grad_norm": 0.39488643407821655, "learning_rate": 1.9022681220615194e-05, "loss": 0.0372, "num_input_tokens_seen": 20627648, "step": 23075 }, { "epoch": 6.0913290220403855, "grad_norm": 0.23189353942871094, "learning_rate": 1.9013148943610255e-05, "loss": 0.1224, "num_input_tokens_seen": 20632128, "step": 23080 }, { "epoch": 6.092648805595882, "grad_norm": 0.24145640432834625, "learning_rate": 1.9003617589852998e-05, "loss": 0.0754, "num_input_tokens_seen": 20636672, "step": 23085 }, { "epoch": 6.0939685891513795, "grad_norm": 0.1598062366247177, "learning_rate": 1.899408716081326e-05, "loss": 0.0356, "num_input_tokens_seen": 20640992, "step": 23090 }, { "epoch": 6.095288372706876, "grad_norm": 0.1900138109922409, "learning_rate": 1.898455765796075e-05, "loss": 0.0242, "num_input_tokens_seen": 20645440, "step": 23095 }, { "epoch": 6.096608156262373, "grad_norm": 0.0436834841966629, "learning_rate": 1.8975029082765053e-05, "loss": 0.0406, "num_input_tokens_seen": 20650048, "step": 23100 }, { "epoch": 6.09792793981787, "grad_norm": 0.15502937138080597, "learning_rate": 1.8965501436695577e-05, "loss": 0.0593, "num_input_tokens_seen": 20654688, "step": 23105 }, { "epoch": 6.099247723373367, "grad_norm": 0.14915795624256134, "learning_rate": 1.895597472122161e-05, "loss": 0.0658, "num_input_tokens_seen": 20659488, "step": 23110 }, { "epoch": 6.100567506928864, "grad_norm": 0.27350491285324097, "learning_rate": 1.894644893781231e-05, "loss": 0.0444, "num_input_tokens_seen": 20663968, "step": 23115 }, { "epoch": 6.101887290484361, "grad_norm": 0.08406408131122589, "learning_rate": 1.893692408793665e-05, "loss": 0.0936, "num_input_tokens_seen": 20668576, "step": 23120 }, { "epoch": 6.103207074039857, "grad_norm": 0.24237757921218872, "learning_rate": 1.8927400173063493e-05, "loss": 0.0403, "num_input_tokens_seen": 20673248, "step": 23125 }, { "epoch": 6.104526857595355, "grad_norm": 0.07170090824365616, "learning_rate": 1.891787719466154e-05, "loss": 0.0446, "num_input_tokens_seen": 20677728, "step": 23130 }, { "epoch": 6.105846641150851, "grad_norm": 0.14027707278728485, "learning_rate": 1.8908355154199346e-05, "loss": 0.031, "num_input_tokens_seen": 20682112, "step": 23135 }, { "epoch": 6.107166424706348, "grad_norm": 0.06461618095636368, "learning_rate": 1.8898834053145357e-05, "loss": 0.0523, "num_input_tokens_seen": 20686304, "step": 23140 }, { "epoch": 6.108486208261845, "grad_norm": 0.14769235253334045, "learning_rate": 1.8889313892967813e-05, "loss": 0.0781, "num_input_tokens_seen": 20690752, "step": 23145 }, { "epoch": 6.109805991817342, "grad_norm": 0.2689723074436188, "learning_rate": 1.8879794675134863e-05, "loss": 0.0481, "num_input_tokens_seen": 20695392, "step": 23150 }, { "epoch": 6.111125775372839, "grad_norm": 0.3610045909881592, "learning_rate": 1.8870276401114494e-05, "loss": 0.0756, "num_input_tokens_seen": 20699712, "step": 23155 }, { "epoch": 6.112445558928336, "grad_norm": 0.3233611583709717, "learning_rate": 1.886075907237453e-05, "loss": 0.0502, "num_input_tokens_seen": 20704160, "step": 23160 }, { "epoch": 6.113765342483832, "grad_norm": 0.15273818373680115, "learning_rate": 1.8851242690382672e-05, "loss": 0.0411, "num_input_tokens_seen": 20708672, "step": 23165 }, { "epoch": 6.11508512603933, "grad_norm": 0.25217145681381226, "learning_rate": 1.884172725660645e-05, "loss": 0.0396, "num_input_tokens_seen": 20713056, "step": 23170 }, { "epoch": 6.116404909594826, "grad_norm": 0.175252765417099, "learning_rate": 1.8832212772513277e-05, "loss": 0.0966, "num_input_tokens_seen": 20717344, "step": 23175 }, { "epoch": 6.117724693150324, "grad_norm": 0.05805163457989693, "learning_rate": 1.8822699239570414e-05, "loss": 0.0528, "num_input_tokens_seen": 20721472, "step": 23180 }, { "epoch": 6.11904447670582, "grad_norm": 0.06072873994708061, "learning_rate": 1.8813186659244943e-05, "loss": 0.0655, "num_input_tokens_seen": 20725792, "step": 23185 }, { "epoch": 6.120364260261317, "grad_norm": 0.16884228587150574, "learning_rate": 1.880367503300385e-05, "loss": 0.0598, "num_input_tokens_seen": 20730144, "step": 23190 }, { "epoch": 6.121684043816814, "grad_norm": 0.47738370299339294, "learning_rate": 1.8794164362313927e-05, "loss": 0.075, "num_input_tokens_seen": 20734656, "step": 23195 }, { "epoch": 6.123003827372311, "grad_norm": 0.3757287859916687, "learning_rate": 1.878465464864185e-05, "loss": 0.1126, "num_input_tokens_seen": 20739200, "step": 23200 }, { "epoch": 6.123003827372311, "eval_loss": 0.06785519421100616, "eval_runtime": 64.7514, "eval_samples_per_second": 104.013, "eval_steps_per_second": 26.007, "num_input_tokens_seen": 20739200, "step": 23200 }, { "epoch": 6.124323610927807, "grad_norm": 0.47358763217926025, "learning_rate": 1.877514589345414e-05, "loss": 0.111, "num_input_tokens_seen": 20743648, "step": 23205 }, { "epoch": 6.125643394483305, "grad_norm": 0.16667725145816803, "learning_rate": 1.876563809821715e-05, "loss": 0.0482, "num_input_tokens_seen": 20748000, "step": 23210 }, { "epoch": 6.126963178038801, "grad_norm": 0.5252594351768494, "learning_rate": 1.8756131264397106e-05, "loss": 0.0642, "num_input_tokens_seen": 20752576, "step": 23215 }, { "epoch": 6.128282961594299, "grad_norm": 0.08749470859766006, "learning_rate": 1.87466253934601e-05, "loss": 0.0565, "num_input_tokens_seen": 20756832, "step": 23220 }, { "epoch": 6.129602745149795, "grad_norm": 0.20065639913082123, "learning_rate": 1.8737120486872033e-05, "loss": 0.0178, "num_input_tokens_seen": 20761472, "step": 23225 }, { "epoch": 6.130922528705292, "grad_norm": 0.10488150268793106, "learning_rate": 1.8727616546098696e-05, "loss": 0.0592, "num_input_tokens_seen": 20765984, "step": 23230 }, { "epoch": 6.132242312260789, "grad_norm": 0.4782564043998718, "learning_rate": 1.8718113572605716e-05, "loss": 0.0973, "num_input_tokens_seen": 20770496, "step": 23235 }, { "epoch": 6.133562095816286, "grad_norm": 0.0979752317070961, "learning_rate": 1.8708611567858554e-05, "loss": 0.0651, "num_input_tokens_seen": 20775168, "step": 23240 }, { "epoch": 6.134881879371783, "grad_norm": 0.07275248318910599, "learning_rate": 1.8699110533322565e-05, "loss": 0.049, "num_input_tokens_seen": 20779840, "step": 23245 }, { "epoch": 6.13620166292728, "grad_norm": 0.13235941529273987, "learning_rate": 1.8689610470462897e-05, "loss": 0.0411, "num_input_tokens_seen": 20784192, "step": 23250 }, { "epoch": 6.1375214464827765, "grad_norm": 0.31818288564682007, "learning_rate": 1.8680111380744604e-05, "loss": 0.0618, "num_input_tokens_seen": 20788736, "step": 23255 }, { "epoch": 6.138841230038274, "grad_norm": 0.294825941324234, "learning_rate": 1.8670613265632564e-05, "loss": 0.0436, "num_input_tokens_seen": 20793248, "step": 23260 }, { "epoch": 6.1401610135937705, "grad_norm": 0.0656796470284462, "learning_rate": 1.866111612659149e-05, "loss": 0.043, "num_input_tokens_seen": 20797728, "step": 23265 }, { "epoch": 6.141480797149267, "grad_norm": 0.06125950813293457, "learning_rate": 1.8651619965085967e-05, "loss": 0.1042, "num_input_tokens_seen": 20802304, "step": 23270 }, { "epoch": 6.1428005807047645, "grad_norm": 0.18593533337116241, "learning_rate": 1.8642124782580433e-05, "loss": 0.0499, "num_input_tokens_seen": 20806432, "step": 23275 }, { "epoch": 6.144120364260261, "grad_norm": 0.05149536952376366, "learning_rate": 1.8632630580539144e-05, "loss": 0.0514, "num_input_tokens_seen": 20810976, "step": 23280 }, { "epoch": 6.1454401478157585, "grad_norm": 0.6380911469459534, "learning_rate": 1.862313736042625e-05, "loss": 0.0676, "num_input_tokens_seen": 20815584, "step": 23285 }, { "epoch": 6.146759931371255, "grad_norm": 0.24208343029022217, "learning_rate": 1.8613645123705703e-05, "loss": 0.0357, "num_input_tokens_seen": 20819904, "step": 23290 }, { "epoch": 6.148079714926752, "grad_norm": 0.2303633689880371, "learning_rate": 1.8604153871841328e-05, "loss": 0.0524, "num_input_tokens_seen": 20824608, "step": 23295 }, { "epoch": 6.149399498482249, "grad_norm": 0.34356626868247986, "learning_rate": 1.859466360629682e-05, "loss": 0.0346, "num_input_tokens_seen": 20828896, "step": 23300 }, { "epoch": 6.150719282037746, "grad_norm": 0.21986468136310577, "learning_rate": 1.8585174328535666e-05, "loss": 0.0633, "num_input_tokens_seen": 20833024, "step": 23305 }, { "epoch": 6.152039065593243, "grad_norm": 0.16313406825065613, "learning_rate": 1.857568604002124e-05, "loss": 0.0385, "num_input_tokens_seen": 20837632, "step": 23310 }, { "epoch": 6.15335884914874, "grad_norm": 0.2151777744293213, "learning_rate": 1.8566198742216774e-05, "loss": 0.1034, "num_input_tokens_seen": 20841728, "step": 23315 }, { "epoch": 6.154678632704236, "grad_norm": 0.03583676740527153, "learning_rate": 1.85567124365853e-05, "loss": 0.0202, "num_input_tokens_seen": 20846400, "step": 23320 }, { "epoch": 6.155998416259734, "grad_norm": 0.17931176722049713, "learning_rate": 1.854722712458975e-05, "loss": 0.0806, "num_input_tokens_seen": 20850912, "step": 23325 }, { "epoch": 6.15731819981523, "grad_norm": 0.04011833667755127, "learning_rate": 1.853774280769286e-05, "loss": 0.0592, "num_input_tokens_seen": 20855488, "step": 23330 }, { "epoch": 6.158637983370728, "grad_norm": 0.3229314088821411, "learning_rate": 1.852825948735724e-05, "loss": 0.0486, "num_input_tokens_seen": 20860288, "step": 23335 }, { "epoch": 6.159957766926224, "grad_norm": 0.17152704298496246, "learning_rate": 1.851877716504534e-05, "loss": 0.0417, "num_input_tokens_seen": 20864416, "step": 23340 }, { "epoch": 6.161277550481721, "grad_norm": 0.21125110983848572, "learning_rate": 1.8509295842219448e-05, "loss": 0.078, "num_input_tokens_seen": 20868864, "step": 23345 }, { "epoch": 6.162597334037218, "grad_norm": 0.5406584143638611, "learning_rate": 1.8499815520341697e-05, "loss": 0.0916, "num_input_tokens_seen": 20873472, "step": 23350 }, { "epoch": 6.163917117592715, "grad_norm": 0.14122548699378967, "learning_rate": 1.8490336200874094e-05, "loss": 0.0353, "num_input_tokens_seen": 20877568, "step": 23355 }, { "epoch": 6.165236901148211, "grad_norm": 0.032147087156772614, "learning_rate": 1.848085788527844e-05, "loss": 0.0379, "num_input_tokens_seen": 20882336, "step": 23360 }, { "epoch": 6.166556684703709, "grad_norm": 0.2314123958349228, "learning_rate": 1.847138057501644e-05, "loss": 0.0431, "num_input_tokens_seen": 20886912, "step": 23365 }, { "epoch": 6.167876468259205, "grad_norm": 0.3218337893486023, "learning_rate": 1.8461904271549582e-05, "loss": 0.0731, "num_input_tokens_seen": 20891200, "step": 23370 }, { "epoch": 6.169196251814703, "grad_norm": 0.2519700825214386, "learning_rate": 1.845242897633926e-05, "loss": 0.0471, "num_input_tokens_seen": 20895840, "step": 23375 }, { "epoch": 6.170516035370199, "grad_norm": 0.10085026919841766, "learning_rate": 1.844295469084667e-05, "loss": 0.0335, "num_input_tokens_seen": 20900384, "step": 23380 }, { "epoch": 6.171835818925696, "grad_norm": 0.08631719648838043, "learning_rate": 1.843348141653286e-05, "loss": 0.043, "num_input_tokens_seen": 20904640, "step": 23385 }, { "epoch": 6.173155602481193, "grad_norm": 0.32672932744026184, "learning_rate": 1.842400915485874e-05, "loss": 0.0335, "num_input_tokens_seen": 20909120, "step": 23390 }, { "epoch": 6.17447538603669, "grad_norm": 0.41695156693458557, "learning_rate": 1.8414537907285053e-05, "loss": 0.0757, "num_input_tokens_seen": 20913376, "step": 23395 }, { "epoch": 6.175795169592186, "grad_norm": 0.0174611397087574, "learning_rate": 1.840506767527237e-05, "loss": 0.0812, "num_input_tokens_seen": 20917728, "step": 23400 }, { "epoch": 6.175795169592186, "eval_loss": 0.06792458891868591, "eval_runtime": 64.7372, "eval_samples_per_second": 104.036, "eval_steps_per_second": 26.013, "num_input_tokens_seen": 20917728, "step": 23400 }, { "epoch": 6.177114953147684, "grad_norm": 0.09049680083990097, "learning_rate": 1.8395598460281137e-05, "loss": 0.0318, "num_input_tokens_seen": 20922176, "step": 23405 }, { "epoch": 6.17843473670318, "grad_norm": 0.06453737616539001, "learning_rate": 1.838613026377161e-05, "loss": 0.0446, "num_input_tokens_seen": 20926496, "step": 23410 }, { "epoch": 6.179754520258678, "grad_norm": 0.3907078206539154, "learning_rate": 1.8376663087203917e-05, "loss": 0.0688, "num_input_tokens_seen": 20931296, "step": 23415 }, { "epoch": 6.181074303814174, "grad_norm": 0.3931455612182617, "learning_rate": 1.8367196932038014e-05, "loss": 0.0582, "num_input_tokens_seen": 20935424, "step": 23420 }, { "epoch": 6.182394087369671, "grad_norm": 0.09347866475582123, "learning_rate": 1.8357731799733686e-05, "loss": 0.056, "num_input_tokens_seen": 20939872, "step": 23425 }, { "epoch": 6.183713870925168, "grad_norm": 0.14979027211666107, "learning_rate": 1.8348267691750586e-05, "loss": 0.0983, "num_input_tokens_seen": 20944576, "step": 23430 }, { "epoch": 6.185033654480665, "grad_norm": 0.09732377529144287, "learning_rate": 1.833880460954821e-05, "loss": 0.023, "num_input_tokens_seen": 20949472, "step": 23435 }, { "epoch": 6.1863534380361624, "grad_norm": 0.10456091910600662, "learning_rate": 1.8329342554585866e-05, "loss": 0.0527, "num_input_tokens_seen": 20953856, "step": 23440 }, { "epoch": 6.187673221591659, "grad_norm": 0.5026780366897583, "learning_rate": 1.8319881528322735e-05, "loss": 0.1131, "num_input_tokens_seen": 20958496, "step": 23445 }, { "epoch": 6.188993005147156, "grad_norm": 0.2576005756855011, "learning_rate": 1.8310421532217815e-05, "loss": 0.0651, "num_input_tokens_seen": 20962784, "step": 23450 }, { "epoch": 6.190312788702653, "grad_norm": 0.21271757781505585, "learning_rate": 1.8300962567729958e-05, "loss": 0.0309, "num_input_tokens_seen": 20967360, "step": 23455 }, { "epoch": 6.19163257225815, "grad_norm": 0.13919296860694885, "learning_rate": 1.8291504636317866e-05, "loss": 0.022, "num_input_tokens_seen": 20971936, "step": 23460 }, { "epoch": 6.192952355813647, "grad_norm": 0.21724295616149902, "learning_rate": 1.8282047739440055e-05, "loss": 0.0594, "num_input_tokens_seen": 20976576, "step": 23465 }, { "epoch": 6.194272139369144, "grad_norm": 0.22113631665706635, "learning_rate": 1.8272591878554903e-05, "loss": 0.0535, "num_input_tokens_seen": 20980960, "step": 23470 }, { "epoch": 6.19559192292464, "grad_norm": 0.2735648453235626, "learning_rate": 1.8263137055120638e-05, "loss": 0.1389, "num_input_tokens_seen": 20985472, "step": 23475 }, { "epoch": 6.196911706480138, "grad_norm": 0.0804530531167984, "learning_rate": 1.8253683270595295e-05, "loss": 0.0479, "num_input_tokens_seen": 20989760, "step": 23480 }, { "epoch": 6.198231490035634, "grad_norm": 0.03825506567955017, "learning_rate": 1.824423052643677e-05, "loss": 0.0588, "num_input_tokens_seen": 20994112, "step": 23485 }, { "epoch": 6.199551273591131, "grad_norm": 0.18740999698638916, "learning_rate": 1.82347788241028e-05, "loss": 0.039, "num_input_tokens_seen": 20998720, "step": 23490 }, { "epoch": 6.200871057146628, "grad_norm": 0.16862717270851135, "learning_rate": 1.8225328165050942e-05, "loss": 0.0772, "num_input_tokens_seen": 21002976, "step": 23495 }, { "epoch": 6.202190840702125, "grad_norm": 0.04621472954750061, "learning_rate": 1.821587855073863e-05, "loss": 0.0419, "num_input_tokens_seen": 21007424, "step": 23500 }, { "epoch": 6.203510624257622, "grad_norm": 0.06968823820352554, "learning_rate": 1.8206429982623086e-05, "loss": 0.061, "num_input_tokens_seen": 21011936, "step": 23505 }, { "epoch": 6.204830407813119, "grad_norm": 0.05870481953024864, "learning_rate": 1.8196982462161416e-05, "loss": 0.0712, "num_input_tokens_seen": 21016352, "step": 23510 }, { "epoch": 6.206150191368615, "grad_norm": 0.283359169960022, "learning_rate": 1.818753599081055e-05, "loss": 0.0572, "num_input_tokens_seen": 21021120, "step": 23515 }, { "epoch": 6.207469974924113, "grad_norm": 0.42665475606918335, "learning_rate": 1.817809057002724e-05, "loss": 0.0801, "num_input_tokens_seen": 21025280, "step": 23520 }, { "epoch": 6.208789758479609, "grad_norm": 0.17830058932304382, "learning_rate": 1.8168646201268096e-05, "loss": 0.017, "num_input_tokens_seen": 21029600, "step": 23525 }, { "epoch": 6.210109542035106, "grad_norm": 0.19644585251808167, "learning_rate": 1.8159202885989557e-05, "loss": 0.0623, "num_input_tokens_seen": 21034112, "step": 23530 }, { "epoch": 6.211429325590603, "grad_norm": 0.3622327148914337, "learning_rate": 1.814976062564789e-05, "loss": 0.0885, "num_input_tokens_seen": 21038368, "step": 23535 }, { "epoch": 6.2127491091461, "grad_norm": 0.030862197279930115, "learning_rate": 1.8140319421699234e-05, "loss": 0.0406, "num_input_tokens_seen": 21043232, "step": 23540 }, { "epoch": 6.214068892701597, "grad_norm": 0.28243887424468994, "learning_rate": 1.8130879275599515e-05, "loss": 0.0454, "num_input_tokens_seen": 21047680, "step": 23545 }, { "epoch": 6.215388676257094, "grad_norm": 0.11172232031822205, "learning_rate": 1.8121440188804544e-05, "loss": 0.0839, "num_input_tokens_seen": 21052128, "step": 23550 }, { "epoch": 6.21670845981259, "grad_norm": 0.0934629812836647, "learning_rate": 1.811200216276993e-05, "loss": 0.069, "num_input_tokens_seen": 21056832, "step": 23555 }, { "epoch": 6.218028243368088, "grad_norm": 0.39137664437294006, "learning_rate": 1.810256519895115e-05, "loss": 0.107, "num_input_tokens_seen": 21061024, "step": 23560 }, { "epoch": 6.219348026923584, "grad_norm": 0.14804111421108246, "learning_rate": 1.8093129298803494e-05, "loss": 0.0541, "num_input_tokens_seen": 21065568, "step": 23565 }, { "epoch": 6.220667810479082, "grad_norm": 0.272752970457077, "learning_rate": 1.808369446378209e-05, "loss": 0.0559, "num_input_tokens_seen": 21069952, "step": 23570 }, { "epoch": 6.221987594034578, "grad_norm": 0.09275731444358826, "learning_rate": 1.8074260695341914e-05, "loss": 0.053, "num_input_tokens_seen": 21074496, "step": 23575 }, { "epoch": 6.223307377590075, "grad_norm": 0.1410493552684784, "learning_rate": 1.8064827994937782e-05, "loss": 0.0836, "num_input_tokens_seen": 21078784, "step": 23580 }, { "epoch": 6.224627161145572, "grad_norm": 0.2554750144481659, "learning_rate": 1.8055396364024317e-05, "loss": 0.0939, "num_input_tokens_seen": 21083328, "step": 23585 }, { "epoch": 6.225946944701069, "grad_norm": 0.05474372208118439, "learning_rate": 1.804596580405601e-05, "loss": 0.0569, "num_input_tokens_seen": 21087936, "step": 23590 }, { "epoch": 6.227266728256566, "grad_norm": 0.24490174651145935, "learning_rate": 1.8036536316487174e-05, "loss": 0.0386, "num_input_tokens_seen": 21092576, "step": 23595 }, { "epoch": 6.228586511812063, "grad_norm": 0.21805471181869507, "learning_rate": 1.802710790277193e-05, "loss": 0.0304, "num_input_tokens_seen": 21097088, "step": 23600 }, { "epoch": 6.228586511812063, "eval_loss": 0.06765655428171158, "eval_runtime": 64.8117, "eval_samples_per_second": 103.916, "eval_steps_per_second": 25.983, "num_input_tokens_seen": 21097088, "step": 23600 }, { "epoch": 6.2299062953675595, "grad_norm": 0.10086227208375931, "learning_rate": 1.801768056436429e-05, "loss": 0.052, "num_input_tokens_seen": 21101568, "step": 23605 }, { "epoch": 6.231226078923057, "grad_norm": 0.3419232964515686, "learning_rate": 1.8008254302718035e-05, "loss": 0.0535, "num_input_tokens_seen": 21106272, "step": 23610 }, { "epoch": 6.2325458624785535, "grad_norm": 0.40463292598724365, "learning_rate": 1.7998829119286837e-05, "loss": 0.0813, "num_input_tokens_seen": 21110688, "step": 23615 }, { "epoch": 6.23386564603405, "grad_norm": 0.2521134912967682, "learning_rate": 1.798940501552418e-05, "loss": 0.052, "num_input_tokens_seen": 21115520, "step": 23620 }, { "epoch": 6.2351854295895475, "grad_norm": 0.08768836408853531, "learning_rate": 1.797998199288336e-05, "loss": 0.0573, "num_input_tokens_seen": 21119904, "step": 23625 }, { "epoch": 6.236505213145044, "grad_norm": 0.51609206199646, "learning_rate": 1.7970560052817543e-05, "loss": 0.0996, "num_input_tokens_seen": 21124224, "step": 23630 }, { "epoch": 6.2378249967005415, "grad_norm": 0.35435980558395386, "learning_rate": 1.7961139196779702e-05, "loss": 0.0666, "num_input_tokens_seen": 21128800, "step": 23635 }, { "epoch": 6.239144780256038, "grad_norm": 0.2641701400279999, "learning_rate": 1.7951719426222647e-05, "loss": 0.0805, "num_input_tokens_seen": 21133088, "step": 23640 }, { "epoch": 6.240464563811535, "grad_norm": 0.040627576410770416, "learning_rate": 1.794230074259904e-05, "loss": 0.0464, "num_input_tokens_seen": 21137696, "step": 23645 }, { "epoch": 6.241784347367032, "grad_norm": 0.06558166444301605, "learning_rate": 1.7932883147361336e-05, "loss": 0.1099, "num_input_tokens_seen": 21142080, "step": 23650 }, { "epoch": 6.243104130922529, "grad_norm": 0.234835684299469, "learning_rate": 1.7923466641961865e-05, "loss": 0.0893, "num_input_tokens_seen": 21146688, "step": 23655 }, { "epoch": 6.244423914478026, "grad_norm": 0.3145919740200043, "learning_rate": 1.791405122785278e-05, "loss": 0.1082, "num_input_tokens_seen": 21151424, "step": 23660 }, { "epoch": 6.245743698033523, "grad_norm": 0.06777172535657883, "learning_rate": 1.7904636906486037e-05, "loss": 0.0494, "num_input_tokens_seen": 21155808, "step": 23665 }, { "epoch": 6.247063481589019, "grad_norm": 0.2834625840187073, "learning_rate": 1.7895223679313448e-05, "loss": 0.0575, "num_input_tokens_seen": 21160160, "step": 23670 }, { "epoch": 6.248383265144517, "grad_norm": 0.3051503300666809, "learning_rate": 1.7885811547786653e-05, "loss": 0.0579, "num_input_tokens_seen": 21164256, "step": 23675 }, { "epoch": 6.249703048700013, "grad_norm": 0.43011030554771423, "learning_rate": 1.7876400513357115e-05, "loss": 0.0974, "num_input_tokens_seen": 21168448, "step": 23680 }, { "epoch": 6.25102283225551, "grad_norm": 0.12648195028305054, "learning_rate": 1.7866990577476146e-05, "loss": 0.0732, "num_input_tokens_seen": 21172704, "step": 23685 }, { "epoch": 6.252342615811007, "grad_norm": 0.1834063082933426, "learning_rate": 1.7857581741594863e-05, "loss": 0.0609, "num_input_tokens_seen": 21177344, "step": 23690 }, { "epoch": 6.253662399366504, "grad_norm": 0.4432027339935303, "learning_rate": 1.7848174007164237e-05, "loss": 0.0649, "num_input_tokens_seen": 21181568, "step": 23695 }, { "epoch": 6.254982182922001, "grad_norm": 0.3047706186771393, "learning_rate": 1.7838767375635052e-05, "loss": 0.0423, "num_input_tokens_seen": 21185920, "step": 23700 }, { "epoch": 6.256301966477498, "grad_norm": 0.032131265848875046, "learning_rate": 1.782936184845793e-05, "loss": 0.0459, "num_input_tokens_seen": 21190432, "step": 23705 }, { "epoch": 6.257621750032994, "grad_norm": 0.33897316455841064, "learning_rate": 1.7819957427083334e-05, "loss": 0.081, "num_input_tokens_seen": 21194976, "step": 23710 }, { "epoch": 6.258941533588492, "grad_norm": 0.2434198260307312, "learning_rate": 1.7810554112961516e-05, "loss": 0.1128, "num_input_tokens_seen": 21199392, "step": 23715 }, { "epoch": 6.260261317143988, "grad_norm": 0.26061710715293884, "learning_rate": 1.7801151907542607e-05, "loss": 0.0572, "num_input_tokens_seen": 21203680, "step": 23720 }, { "epoch": 6.261581100699486, "grad_norm": 0.278072327375412, "learning_rate": 1.7791750812276547e-05, "loss": 0.0636, "num_input_tokens_seen": 21207744, "step": 23725 }, { "epoch": 6.262900884254982, "grad_norm": 0.3654395043849945, "learning_rate": 1.778235082861309e-05, "loss": 0.0793, "num_input_tokens_seen": 21211968, "step": 23730 }, { "epoch": 6.264220667810479, "grad_norm": 0.14277417957782745, "learning_rate": 1.777295195800184e-05, "loss": 0.0857, "num_input_tokens_seen": 21216320, "step": 23735 }, { "epoch": 6.265540451365976, "grad_norm": 0.31520983576774597, "learning_rate": 1.7763554201892215e-05, "loss": 0.0583, "num_input_tokens_seen": 21221024, "step": 23740 }, { "epoch": 6.266860234921473, "grad_norm": 0.047309454530477524, "learning_rate": 1.7754157561733476e-05, "loss": 0.0567, "num_input_tokens_seen": 21225568, "step": 23745 }, { "epoch": 6.268180018476969, "grad_norm": 0.3717719316482544, "learning_rate": 1.7744762038974702e-05, "loss": 0.0835, "num_input_tokens_seen": 21230016, "step": 23750 }, { "epoch": 6.269499802032467, "grad_norm": 0.20820075273513794, "learning_rate": 1.7735367635064788e-05, "loss": 0.0642, "num_input_tokens_seen": 21234400, "step": 23755 }, { "epoch": 6.270819585587963, "grad_norm": 0.16844941675662994, "learning_rate": 1.7725974351452474e-05, "loss": 0.0339, "num_input_tokens_seen": 21239136, "step": 23760 }, { "epoch": 6.272139369143461, "grad_norm": 0.12091757357120514, "learning_rate": 1.771658218958634e-05, "loss": 0.0685, "num_input_tokens_seen": 21243904, "step": 23765 }, { "epoch": 6.273459152698957, "grad_norm": 0.1907871514558792, "learning_rate": 1.770719115091475e-05, "loss": 0.1082, "num_input_tokens_seen": 21248576, "step": 23770 }, { "epoch": 6.274778936254454, "grad_norm": 0.21142634749412537, "learning_rate": 1.7697801236885935e-05, "loss": 0.0806, "num_input_tokens_seen": 21253120, "step": 23775 }, { "epoch": 6.276098719809951, "grad_norm": 0.468583881855011, "learning_rate": 1.7688412448947944e-05, "loss": 0.0729, "num_input_tokens_seen": 21257568, "step": 23780 }, { "epoch": 6.277418503365448, "grad_norm": 0.20885169506072998, "learning_rate": 1.767902478854862e-05, "loss": 0.063, "num_input_tokens_seen": 21261984, "step": 23785 }, { "epoch": 6.278738286920945, "grad_norm": 0.16629639267921448, "learning_rate": 1.766963825713569e-05, "loss": 0.0672, "num_input_tokens_seen": 21266336, "step": 23790 }, { "epoch": 6.280058070476442, "grad_norm": 0.2886351943016052, "learning_rate": 1.766025285615665e-05, "loss": 0.0889, "num_input_tokens_seen": 21270816, "step": 23795 }, { "epoch": 6.2813778540319385, "grad_norm": 0.39841577410697937, "learning_rate": 1.7650868587058854e-05, "loss": 0.0544, "num_input_tokens_seen": 21275360, "step": 23800 }, { "epoch": 6.2813778540319385, "eval_loss": 0.06739436835050583, "eval_runtime": 64.7756, "eval_samples_per_second": 103.974, "eval_steps_per_second": 25.997, "num_input_tokens_seen": 21275360, "step": 23800 }, { "epoch": 6.282697637587436, "grad_norm": 0.04047234356403351, "learning_rate": 1.7641485451289484e-05, "loss": 0.0902, "num_input_tokens_seen": 21279808, "step": 23805 }, { "epoch": 6.2840174211429325, "grad_norm": 0.04487486183643341, "learning_rate": 1.7632103450295534e-05, "loss": 0.0362, "num_input_tokens_seen": 21284224, "step": 23810 }, { "epoch": 6.285337204698429, "grad_norm": 0.2171248495578766, "learning_rate": 1.762272258552381e-05, "loss": 0.0521, "num_input_tokens_seen": 21288768, "step": 23815 }, { "epoch": 6.2866569882539265, "grad_norm": 0.034815799444913864, "learning_rate": 1.7613342858420988e-05, "loss": 0.0825, "num_input_tokens_seen": 21293536, "step": 23820 }, { "epoch": 6.287976771809423, "grad_norm": 0.2077503651380539, "learning_rate": 1.760396427043351e-05, "loss": 0.0491, "num_input_tokens_seen": 21298016, "step": 23825 }, { "epoch": 6.2892965553649205, "grad_norm": 0.06525364518165588, "learning_rate": 1.7594586823007696e-05, "loss": 0.0261, "num_input_tokens_seen": 21302528, "step": 23830 }, { "epoch": 6.290616338920417, "grad_norm": 0.295021116733551, "learning_rate": 1.7585210517589646e-05, "loss": 0.0431, "num_input_tokens_seen": 21307168, "step": 23835 }, { "epoch": 6.291936122475914, "grad_norm": 0.302420973777771, "learning_rate": 1.7575835355625314e-05, "loss": 0.0384, "num_input_tokens_seen": 21311712, "step": 23840 }, { "epoch": 6.293255906031411, "grad_norm": 0.1583237200975418, "learning_rate": 1.756646133856048e-05, "loss": 0.0713, "num_input_tokens_seen": 21316160, "step": 23845 }, { "epoch": 6.294575689586908, "grad_norm": 0.07820507138967514, "learning_rate": 1.7557088467840714e-05, "loss": 0.0409, "num_input_tokens_seen": 21320512, "step": 23850 }, { "epoch": 6.295895473142405, "grad_norm": 0.05903852358460426, "learning_rate": 1.7547716744911438e-05, "loss": 0.0772, "num_input_tokens_seen": 21325056, "step": 23855 }, { "epoch": 6.297215256697902, "grad_norm": 0.163959339261055, "learning_rate": 1.7538346171217902e-05, "loss": 0.0296, "num_input_tokens_seen": 21329824, "step": 23860 }, { "epoch": 6.298535040253398, "grad_norm": 0.2888582944869995, "learning_rate": 1.7528976748205146e-05, "loss": 0.0864, "num_input_tokens_seen": 21334272, "step": 23865 }, { "epoch": 6.299854823808896, "grad_norm": 0.13921359181404114, "learning_rate": 1.751960847731807e-05, "loss": 0.0613, "num_input_tokens_seen": 21338720, "step": 23870 }, { "epoch": 6.301174607364392, "grad_norm": 0.41786932945251465, "learning_rate": 1.7510241360001362e-05, "loss": 0.0621, "num_input_tokens_seen": 21343232, "step": 23875 }, { "epoch": 6.302494390919889, "grad_norm": 0.04720594361424446, "learning_rate": 1.7500875397699562e-05, "loss": 0.0881, "num_input_tokens_seen": 21347744, "step": 23880 }, { "epoch": 6.303814174475386, "grad_norm": 0.12537865340709686, "learning_rate": 1.7491510591857015e-05, "loss": 0.0317, "num_input_tokens_seen": 21352352, "step": 23885 }, { "epoch": 6.305133958030883, "grad_norm": 0.21012453734874725, "learning_rate": 1.7482146943917896e-05, "loss": 0.0384, "num_input_tokens_seen": 21357024, "step": 23890 }, { "epoch": 6.30645374158638, "grad_norm": 0.3014776408672333, "learning_rate": 1.7472784455326185e-05, "loss": 0.095, "num_input_tokens_seen": 21361376, "step": 23895 }, { "epoch": 6.307773525141877, "grad_norm": 0.20395205914974213, "learning_rate": 1.746342312752572e-05, "loss": 0.0328, "num_input_tokens_seen": 21365696, "step": 23900 }, { "epoch": 6.309093308697373, "grad_norm": 0.05851678177714348, "learning_rate": 1.74540629619601e-05, "loss": 0.0201, "num_input_tokens_seen": 21370304, "step": 23905 }, { "epoch": 6.310413092252871, "grad_norm": 0.510770857334137, "learning_rate": 1.7444703960072815e-05, "loss": 0.0648, "num_input_tokens_seen": 21374400, "step": 23910 }, { "epoch": 6.311732875808367, "grad_norm": 0.30311185121536255, "learning_rate": 1.7435346123307118e-05, "loss": 0.0567, "num_input_tokens_seen": 21378816, "step": 23915 }, { "epoch": 6.313052659363865, "grad_norm": 0.08239750564098358, "learning_rate": 1.742598945310611e-05, "loss": 0.0311, "num_input_tokens_seen": 21383200, "step": 23920 }, { "epoch": 6.314372442919361, "grad_norm": 0.3715181350708008, "learning_rate": 1.741663395091272e-05, "loss": 0.0561, "num_input_tokens_seen": 21387584, "step": 23925 }, { "epoch": 6.315692226474858, "grad_norm": 0.447177529335022, "learning_rate": 1.7407279618169657e-05, "loss": 0.1002, "num_input_tokens_seen": 21392384, "step": 23930 }, { "epoch": 6.317012010030355, "grad_norm": 0.11280747503042221, "learning_rate": 1.73979264563195e-05, "loss": 0.0292, "num_input_tokens_seen": 21396704, "step": 23935 }, { "epoch": 6.318331793585852, "grad_norm": 0.3707388937473297, "learning_rate": 1.7388574466804625e-05, "loss": 0.0844, "num_input_tokens_seen": 21401184, "step": 23940 }, { "epoch": 6.319651577141348, "grad_norm": 0.12804940342903137, "learning_rate": 1.7379223651067207e-05, "loss": 0.0577, "num_input_tokens_seen": 21405888, "step": 23945 }, { "epoch": 6.320971360696846, "grad_norm": 0.18792009353637695, "learning_rate": 1.736987401054928e-05, "loss": 0.0911, "num_input_tokens_seen": 21410112, "step": 23950 }, { "epoch": 6.322291144252342, "grad_norm": 0.22138424217700958, "learning_rate": 1.736052554669266e-05, "loss": 0.0769, "num_input_tokens_seen": 21414240, "step": 23955 }, { "epoch": 6.32361092780784, "grad_norm": 0.1386704444885254, "learning_rate": 1.7351178260939007e-05, "loss": 0.0356, "num_input_tokens_seen": 21418912, "step": 23960 }, { "epoch": 6.324930711363336, "grad_norm": 0.2715069651603699, "learning_rate": 1.7341832154729794e-05, "loss": 0.0385, "num_input_tokens_seen": 21423136, "step": 23965 }, { "epoch": 6.326250494918833, "grad_norm": 0.1740335375070572, "learning_rate": 1.7332487229506286e-05, "loss": 0.0239, "num_input_tokens_seen": 21427520, "step": 23970 }, { "epoch": 6.32757027847433, "grad_norm": 0.18758241832256317, "learning_rate": 1.732314348670961e-05, "loss": 0.0646, "num_input_tokens_seen": 21432352, "step": 23975 }, { "epoch": 6.328890062029827, "grad_norm": 0.3596736788749695, "learning_rate": 1.7313800927780686e-05, "loss": 0.1002, "num_input_tokens_seen": 21436640, "step": 23980 }, { "epoch": 6.330209845585324, "grad_norm": 0.3624059557914734, "learning_rate": 1.7304459554160245e-05, "loss": 0.052, "num_input_tokens_seen": 21441056, "step": 23985 }, { "epoch": 6.331529629140821, "grad_norm": 0.20522797107696533, "learning_rate": 1.7295119367288853e-05, "loss": 0.0599, "num_input_tokens_seen": 21445440, "step": 23990 }, { "epoch": 6.3328494126963175, "grad_norm": 0.16960488259792328, "learning_rate": 1.728578036860688e-05, "loss": 0.063, "num_input_tokens_seen": 21449728, "step": 23995 }, { "epoch": 6.334169196251815, "grad_norm": 0.3596837818622589, "learning_rate": 1.7276442559554513e-05, "loss": 0.0673, "num_input_tokens_seen": 21454048, "step": 24000 }, { "epoch": 6.334169196251815, "eval_loss": 0.06759290397167206, "eval_runtime": 64.7681, "eval_samples_per_second": 103.986, "eval_steps_per_second": 26.0, "num_input_tokens_seen": 21454048, "step": 24000 }, { "epoch": 6.3354889798073115, "grad_norm": 0.11799906194210052, "learning_rate": 1.726710594157177e-05, "loss": 0.0574, "num_input_tokens_seen": 21458976, "step": 24005 }, { "epoch": 6.336808763362809, "grad_norm": 0.1542358547449112, "learning_rate": 1.725777051609846e-05, "loss": 0.0427, "num_input_tokens_seen": 21463488, "step": 24010 }, { "epoch": 6.3381285469183055, "grad_norm": 0.13928136229515076, "learning_rate": 1.7248436284574228e-05, "loss": 0.0196, "num_input_tokens_seen": 21468064, "step": 24015 }, { "epoch": 6.339448330473802, "grad_norm": 0.17160825431346893, "learning_rate": 1.723910324843855e-05, "loss": 0.0635, "num_input_tokens_seen": 21472512, "step": 24020 }, { "epoch": 6.3407681140292995, "grad_norm": 0.10874205082654953, "learning_rate": 1.722977140913067e-05, "loss": 0.0483, "num_input_tokens_seen": 21476832, "step": 24025 }, { "epoch": 6.342087897584796, "grad_norm": 0.22802071273326874, "learning_rate": 1.7220440768089688e-05, "loss": 0.0277, "num_input_tokens_seen": 21481696, "step": 24030 }, { "epoch": 6.343407681140293, "grad_norm": 0.09431540220975876, "learning_rate": 1.7211111326754505e-05, "loss": 0.0456, "num_input_tokens_seen": 21486048, "step": 24035 }, { "epoch": 6.34472746469579, "grad_norm": 0.1326177418231964, "learning_rate": 1.720178308656383e-05, "loss": 0.1008, "num_input_tokens_seen": 21490656, "step": 24040 }, { "epoch": 6.346047248251287, "grad_norm": 0.0991852805018425, "learning_rate": 1.719245604895621e-05, "loss": 0.0278, "num_input_tokens_seen": 21495168, "step": 24045 }, { "epoch": 6.347367031806784, "grad_norm": 0.5619545578956604, "learning_rate": 1.7183130215369972e-05, "loss": 0.072, "num_input_tokens_seen": 21499584, "step": 24050 }, { "epoch": 6.348686815362281, "grad_norm": 0.26978424191474915, "learning_rate": 1.7173805587243292e-05, "loss": 0.0701, "num_input_tokens_seen": 21503808, "step": 24055 }, { "epoch": 6.350006598917777, "grad_norm": 0.13514144718647003, "learning_rate": 1.7164482166014147e-05, "loss": 0.0595, "num_input_tokens_seen": 21508320, "step": 24060 }, { "epoch": 6.351326382473275, "grad_norm": 0.3354870080947876, "learning_rate": 1.7155159953120313e-05, "loss": 0.1056, "num_input_tokens_seen": 21512672, "step": 24065 }, { "epoch": 6.352646166028771, "grad_norm": 0.3133455216884613, "learning_rate": 1.714583894999941e-05, "loss": 0.0477, "num_input_tokens_seen": 21517152, "step": 24070 }, { "epoch": 6.353965949584268, "grad_norm": 0.06239788606762886, "learning_rate": 1.7136519158088826e-05, "loss": 0.0586, "num_input_tokens_seen": 21521216, "step": 24075 }, { "epoch": 6.355285733139765, "grad_norm": 0.47262364625930786, "learning_rate": 1.712720057882581e-05, "loss": 0.0589, "num_input_tokens_seen": 21525632, "step": 24080 }, { "epoch": 6.356605516695262, "grad_norm": 0.3394815921783447, "learning_rate": 1.7117883213647413e-05, "loss": 0.0973, "num_input_tokens_seen": 21530400, "step": 24085 }, { "epoch": 6.357925300250759, "grad_norm": 0.22428442537784576, "learning_rate": 1.710856706399046e-05, "loss": 0.0893, "num_input_tokens_seen": 21534944, "step": 24090 }, { "epoch": 6.359245083806256, "grad_norm": 0.27164584398269653, "learning_rate": 1.7099252131291648e-05, "loss": 0.0978, "num_input_tokens_seen": 21539520, "step": 24095 }, { "epoch": 6.360564867361752, "grad_norm": 0.14305263757705688, "learning_rate": 1.708993841698744e-05, "loss": 0.0257, "num_input_tokens_seen": 21543936, "step": 24100 }, { "epoch": 6.36188465091725, "grad_norm": 0.0484158955514431, "learning_rate": 1.7080625922514132e-05, "loss": 0.012, "num_input_tokens_seen": 21548448, "step": 24105 }, { "epoch": 6.363204434472746, "grad_norm": 0.30051058530807495, "learning_rate": 1.7071314649307836e-05, "loss": 0.053, "num_input_tokens_seen": 21552576, "step": 24110 }, { "epoch": 6.364524218028244, "grad_norm": 0.1470867246389389, "learning_rate": 1.7062004598804448e-05, "loss": 0.0559, "num_input_tokens_seen": 21556896, "step": 24115 }, { "epoch": 6.36584400158374, "grad_norm": 0.22805409133434296, "learning_rate": 1.7052695772439702e-05, "loss": 0.0799, "num_input_tokens_seen": 21561504, "step": 24120 }, { "epoch": 6.367163785139237, "grad_norm": 0.2706526815891266, "learning_rate": 1.7043388171649154e-05, "loss": 0.0727, "num_input_tokens_seen": 21565952, "step": 24125 }, { "epoch": 6.368483568694734, "grad_norm": 0.11722206324338913, "learning_rate": 1.7034081797868127e-05, "loss": 0.0461, "num_input_tokens_seen": 21570496, "step": 24130 }, { "epoch": 6.369803352250231, "grad_norm": 0.07856651395559311, "learning_rate": 1.70247766525318e-05, "loss": 0.0855, "num_input_tokens_seen": 21575232, "step": 24135 }, { "epoch": 6.371123135805728, "grad_norm": 0.2467634230852127, "learning_rate": 1.701547273707514e-05, "loss": 0.1884, "num_input_tokens_seen": 21579456, "step": 24140 }, { "epoch": 6.372442919361225, "grad_norm": 0.08213524520397186, "learning_rate": 1.7006170052932916e-05, "loss": 0.0227, "num_input_tokens_seen": 21583840, "step": 24145 }, { "epoch": 6.373762702916721, "grad_norm": 0.1368740350008011, "learning_rate": 1.6996868601539735e-05, "loss": 0.081, "num_input_tokens_seen": 21588064, "step": 24150 }, { "epoch": 6.375082486472219, "grad_norm": 0.24248552322387695, "learning_rate": 1.6987568384329977e-05, "loss": 0.0526, "num_input_tokens_seen": 21592224, "step": 24155 }, { "epoch": 6.376402270027715, "grad_norm": 0.41842830181121826, "learning_rate": 1.6978269402737866e-05, "loss": 0.1242, "num_input_tokens_seen": 21596608, "step": 24160 }, { "epoch": 6.377722053583212, "grad_norm": 0.3097465932369232, "learning_rate": 1.696897165819743e-05, "loss": 0.0461, "num_input_tokens_seen": 21600832, "step": 24165 }, { "epoch": 6.379041837138709, "grad_norm": 0.11658819019794464, "learning_rate": 1.6959675152142487e-05, "loss": 0.0635, "num_input_tokens_seen": 21605312, "step": 24170 }, { "epoch": 6.380361620694206, "grad_norm": 0.3191256523132324, "learning_rate": 1.6950379886006667e-05, "loss": 0.1081, "num_input_tokens_seen": 21609472, "step": 24175 }, { "epoch": 6.381681404249703, "grad_norm": 0.21096785366535187, "learning_rate": 1.6941085861223438e-05, "loss": 0.0839, "num_input_tokens_seen": 21613888, "step": 24180 }, { "epoch": 6.3830011878052, "grad_norm": 0.19102483987808228, "learning_rate": 1.6931793079226034e-05, "loss": 0.0533, "num_input_tokens_seen": 21618496, "step": 24185 }, { "epoch": 6.3843209713606965, "grad_norm": 0.06051099672913551, "learning_rate": 1.692250154144754e-05, "loss": 0.0267, "num_input_tokens_seen": 21622656, "step": 24190 }, { "epoch": 6.385640754916194, "grad_norm": 0.0768212378025055, "learning_rate": 1.6913211249320807e-05, "loss": 0.0372, "num_input_tokens_seen": 21626720, "step": 24195 }, { "epoch": 6.3869605384716905, "grad_norm": 0.30191799998283386, "learning_rate": 1.6903922204278522e-05, "loss": 0.0503, "num_input_tokens_seen": 21631232, "step": 24200 }, { "epoch": 6.3869605384716905, "eval_loss": 0.0673343613743782, "eval_runtime": 64.7615, "eval_samples_per_second": 103.997, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 21631232, "step": 24200 }, { "epoch": 6.388280322027187, "grad_norm": 0.1655900776386261, "learning_rate": 1.6894634407753186e-05, "loss": 0.0386, "num_input_tokens_seen": 21636000, "step": 24205 }, { "epoch": 6.3896001055826845, "grad_norm": 0.7960726022720337, "learning_rate": 1.6885347861177077e-05, "loss": 0.1011, "num_input_tokens_seen": 21640352, "step": 24210 }, { "epoch": 6.390919889138181, "grad_norm": 0.06886322051286697, "learning_rate": 1.6876062565982298e-05, "loss": 0.0727, "num_input_tokens_seen": 21645120, "step": 24215 }, { "epoch": 6.3922396726936785, "grad_norm": 0.17605438828468323, "learning_rate": 1.6866778523600774e-05, "loss": 0.04, "num_input_tokens_seen": 21649312, "step": 24220 }, { "epoch": 6.393559456249175, "grad_norm": 0.45297589898109436, "learning_rate": 1.6857495735464195e-05, "loss": 0.085, "num_input_tokens_seen": 21653664, "step": 24225 }, { "epoch": 6.394879239804672, "grad_norm": 0.27499887347221375, "learning_rate": 1.6848214203004115e-05, "loss": 0.1189, "num_input_tokens_seen": 21658208, "step": 24230 }, { "epoch": 6.396199023360169, "grad_norm": 0.30921897292137146, "learning_rate": 1.6838933927651835e-05, "loss": 0.0444, "num_input_tokens_seen": 21662624, "step": 24235 }, { "epoch": 6.397518806915666, "grad_norm": 0.5695245862007141, "learning_rate": 1.6829654910838506e-05, "loss": 0.1108, "num_input_tokens_seen": 21667072, "step": 24240 }, { "epoch": 6.398838590471163, "grad_norm": 0.6028729677200317, "learning_rate": 1.6820377153995065e-05, "loss": 0.1081, "num_input_tokens_seen": 21671776, "step": 24245 }, { "epoch": 6.40015837402666, "grad_norm": 0.18353049457073212, "learning_rate": 1.681110065855226e-05, "loss": 0.085, "num_input_tokens_seen": 21676352, "step": 24250 }, { "epoch": 6.401478157582156, "grad_norm": 0.1644231528043747, "learning_rate": 1.6801825425940642e-05, "loss": 0.027, "num_input_tokens_seen": 21680640, "step": 24255 }, { "epoch": 6.402797941137654, "grad_norm": 0.11812327057123184, "learning_rate": 1.679255145759056e-05, "loss": 0.0834, "num_input_tokens_seen": 21685056, "step": 24260 }, { "epoch": 6.40411772469315, "grad_norm": 0.13153013586997986, "learning_rate": 1.6783278754932187e-05, "loss": 0.0586, "num_input_tokens_seen": 21689568, "step": 24265 }, { "epoch": 6.405437508248648, "grad_norm": 0.061284635215997696, "learning_rate": 1.6774007319395496e-05, "loss": 0.0306, "num_input_tokens_seen": 21693728, "step": 24270 }, { "epoch": 6.406757291804144, "grad_norm": 0.14705680310726166, "learning_rate": 1.6764737152410243e-05, "loss": 0.0304, "num_input_tokens_seen": 21697792, "step": 24275 }, { "epoch": 6.408077075359641, "grad_norm": 0.06740880757570267, "learning_rate": 1.6755468255406016e-05, "loss": 0.0555, "num_input_tokens_seen": 21702240, "step": 24280 }, { "epoch": 6.409396858915138, "grad_norm": 0.09752621501684189, "learning_rate": 1.674620062981219e-05, "loss": 0.07, "num_input_tokens_seen": 21706400, "step": 24285 }, { "epoch": 6.410716642470635, "grad_norm": 0.07922883331775665, "learning_rate": 1.6736934277057947e-05, "loss": 0.0426, "num_input_tokens_seen": 21710624, "step": 24290 }, { "epoch": 6.412036426026131, "grad_norm": 0.25735238194465637, "learning_rate": 1.6727669198572286e-05, "loss": 0.0784, "num_input_tokens_seen": 21714688, "step": 24295 }, { "epoch": 6.413356209581629, "grad_norm": 0.31563568115234375, "learning_rate": 1.6718405395783984e-05, "loss": 0.0601, "num_input_tokens_seen": 21719136, "step": 24300 }, { "epoch": 6.414675993137125, "grad_norm": 0.22938284277915955, "learning_rate": 1.6709142870121643e-05, "loss": 0.1273, "num_input_tokens_seen": 21723552, "step": 24305 }, { "epoch": 6.415995776692623, "grad_norm": 0.1776706576347351, "learning_rate": 1.669988162301367e-05, "loss": 0.0351, "num_input_tokens_seen": 21728512, "step": 24310 }, { "epoch": 6.417315560248119, "grad_norm": 0.17078739404678345, "learning_rate": 1.6690621655888243e-05, "loss": 0.0242, "num_input_tokens_seen": 21733024, "step": 24315 }, { "epoch": 6.418635343803616, "grad_norm": 0.4248080253601074, "learning_rate": 1.6681362970173386e-05, "loss": 0.0626, "num_input_tokens_seen": 21737504, "step": 24320 }, { "epoch": 6.419955127359113, "grad_norm": 0.22397294640541077, "learning_rate": 1.6672105567296904e-05, "loss": 0.0592, "num_input_tokens_seen": 21741920, "step": 24325 }, { "epoch": 6.42127491091461, "grad_norm": 0.13692547380924225, "learning_rate": 1.666284944868639e-05, "loss": 0.0603, "num_input_tokens_seen": 21746272, "step": 24330 }, { "epoch": 6.4225946944701064, "grad_norm": 0.16578029096126556, "learning_rate": 1.665359461576927e-05, "loss": 0.1302, "num_input_tokens_seen": 21750816, "step": 24335 }, { "epoch": 6.423914478025604, "grad_norm": 0.3957580029964447, "learning_rate": 1.6644341069972736e-05, "loss": 0.1244, "num_input_tokens_seen": 21755168, "step": 24340 }, { "epoch": 6.4252342615811004, "grad_norm": 0.12295303493738174, "learning_rate": 1.6635088812723813e-05, "loss": 0.0192, "num_input_tokens_seen": 21759968, "step": 24345 }, { "epoch": 6.426554045136598, "grad_norm": 0.17227862775325775, "learning_rate": 1.6625837845449328e-05, "loss": 0.0586, "num_input_tokens_seen": 21764608, "step": 24350 }, { "epoch": 6.4278738286920944, "grad_norm": 0.16177645325660706, "learning_rate": 1.6616588169575874e-05, "loss": 0.0321, "num_input_tokens_seen": 21769344, "step": 24355 }, { "epoch": 6.429193612247591, "grad_norm": 0.3222702443599701, "learning_rate": 1.6607339786529878e-05, "loss": 0.0663, "num_input_tokens_seen": 21773760, "step": 24360 }, { "epoch": 6.4305133958030885, "grad_norm": 0.151154026389122, "learning_rate": 1.659809269773756e-05, "loss": 0.0223, "num_input_tokens_seen": 21778272, "step": 24365 }, { "epoch": 6.431833179358585, "grad_norm": 0.1167394146323204, "learning_rate": 1.658884690462493e-05, "loss": 0.0372, "num_input_tokens_seen": 21782752, "step": 24370 }, { "epoch": 6.4331529629140825, "grad_norm": 0.1073404848575592, "learning_rate": 1.6579602408617813e-05, "loss": 0.0619, "num_input_tokens_seen": 21787136, "step": 24375 }, { "epoch": 6.434472746469579, "grad_norm": 0.37267884612083435, "learning_rate": 1.657035921114181e-05, "loss": 0.1361, "num_input_tokens_seen": 21791712, "step": 24380 }, { "epoch": 6.435792530025076, "grad_norm": 0.196873277425766, "learning_rate": 1.656111731362236e-05, "loss": 0.0574, "num_input_tokens_seen": 21796128, "step": 24385 }, { "epoch": 6.437112313580573, "grad_norm": 0.11980720609426498, "learning_rate": 1.6551876717484666e-05, "loss": 0.111, "num_input_tokens_seen": 21800640, "step": 24390 }, { "epoch": 6.43843209713607, "grad_norm": 0.2921040952205658, "learning_rate": 1.6542637424153752e-05, "loss": 0.0529, "num_input_tokens_seen": 21804960, "step": 24395 }, { "epoch": 6.439751880691567, "grad_norm": 0.24739579856395721, "learning_rate": 1.6533399435054418e-05, "loss": 0.0772, "num_input_tokens_seen": 21809632, "step": 24400 }, { "epoch": 6.439751880691567, "eval_loss": 0.06740210950374603, "eval_runtime": 64.7862, "eval_samples_per_second": 103.957, "eval_steps_per_second": 25.993, "num_input_tokens_seen": 21809632, "step": 24400 }, { "epoch": 6.441071664247064, "grad_norm": 0.3582490384578705, "learning_rate": 1.6524162751611304e-05, "loss": 0.0821, "num_input_tokens_seen": 21813632, "step": 24405 }, { "epoch": 6.44239144780256, "grad_norm": 0.2821758985519409, "learning_rate": 1.6514927375248796e-05, "loss": 0.0378, "num_input_tokens_seen": 21817920, "step": 24410 }, { "epoch": 6.443711231358058, "grad_norm": 0.3133186399936676, "learning_rate": 1.6505693307391127e-05, "loss": 0.0598, "num_input_tokens_seen": 21822048, "step": 24415 }, { "epoch": 6.445031014913554, "grad_norm": 0.0899176225066185, "learning_rate": 1.6496460549462288e-05, "loss": 0.0659, "num_input_tokens_seen": 21826656, "step": 24420 }, { "epoch": 6.446350798469051, "grad_norm": 0.09695081412792206, "learning_rate": 1.6487229102886097e-05, "loss": 0.0502, "num_input_tokens_seen": 21831104, "step": 24425 }, { "epoch": 6.447670582024548, "grad_norm": 0.09089024364948273, "learning_rate": 1.6477998969086155e-05, "loss": 0.0384, "num_input_tokens_seen": 21835264, "step": 24430 }, { "epoch": 6.448990365580045, "grad_norm": 0.10365107655525208, "learning_rate": 1.646877014948587e-05, "loss": 0.0394, "num_input_tokens_seen": 21839744, "step": 24435 }, { "epoch": 6.450310149135542, "grad_norm": 0.0416833758354187, "learning_rate": 1.6459542645508433e-05, "loss": 0.0833, "num_input_tokens_seen": 21844512, "step": 24440 }, { "epoch": 6.451629932691039, "grad_norm": 0.0596088208258152, "learning_rate": 1.6450316458576852e-05, "loss": 0.0256, "num_input_tokens_seen": 21849152, "step": 24445 }, { "epoch": 6.452949716246535, "grad_norm": 0.07115906476974487, "learning_rate": 1.6441091590113912e-05, "loss": 0.0368, "num_input_tokens_seen": 21853696, "step": 24450 }, { "epoch": 6.454269499802033, "grad_norm": 0.4276336133480072, "learning_rate": 1.6431868041542213e-05, "loss": 0.099, "num_input_tokens_seen": 21857952, "step": 24455 }, { "epoch": 6.455589283357529, "grad_norm": 0.10453805327415466, "learning_rate": 1.6422645814284123e-05, "loss": 0.0155, "num_input_tokens_seen": 21862528, "step": 24460 }, { "epoch": 6.456909066913026, "grad_norm": 0.2848013639450073, "learning_rate": 1.6413424909761846e-05, "loss": 0.087, "num_input_tokens_seen": 21866912, "step": 24465 }, { "epoch": 6.458228850468523, "grad_norm": 0.04527978226542473, "learning_rate": 1.640420532939736e-05, "loss": 0.0214, "num_input_tokens_seen": 21871296, "step": 24470 }, { "epoch": 6.45954863402402, "grad_norm": 0.13420382142066956, "learning_rate": 1.639498707461242e-05, "loss": 0.0625, "num_input_tokens_seen": 21875488, "step": 24475 }, { "epoch": 6.460868417579517, "grad_norm": 0.10216771811246872, "learning_rate": 1.6385770146828614e-05, "loss": 0.0748, "num_input_tokens_seen": 21879712, "step": 24480 }, { "epoch": 6.462188201135014, "grad_norm": 0.08184622973203659, "learning_rate": 1.637655454746731e-05, "loss": 0.0379, "num_input_tokens_seen": 21884384, "step": 24485 }, { "epoch": 6.46350798469051, "grad_norm": 0.02707076072692871, "learning_rate": 1.6367340277949658e-05, "loss": 0.0315, "num_input_tokens_seen": 21888800, "step": 24490 }, { "epoch": 6.464827768246008, "grad_norm": 0.034223467111587524, "learning_rate": 1.635812733969663e-05, "loss": 0.0205, "num_input_tokens_seen": 21893344, "step": 24495 }, { "epoch": 6.466147551801504, "grad_norm": 0.025290852412581444, "learning_rate": 1.634891573412896e-05, "loss": 0.0289, "num_input_tokens_seen": 21897664, "step": 24500 }, { "epoch": 6.467467335357002, "grad_norm": 0.1661483496427536, "learning_rate": 1.6339705462667196e-05, "loss": 0.0264, "num_input_tokens_seen": 21902048, "step": 24505 }, { "epoch": 6.468787118912498, "grad_norm": 0.2683849036693573, "learning_rate": 1.633049652673169e-05, "loss": 0.043, "num_input_tokens_seen": 21906752, "step": 24510 }, { "epoch": 6.470106902467995, "grad_norm": 0.07957678288221359, "learning_rate": 1.632128892774256e-05, "loss": 0.035, "num_input_tokens_seen": 21911360, "step": 24515 }, { "epoch": 6.471426686023492, "grad_norm": 0.28816160559654236, "learning_rate": 1.6312082667119737e-05, "loss": 0.0765, "num_input_tokens_seen": 21915744, "step": 24520 }, { "epoch": 6.472746469578989, "grad_norm": 0.36751312017440796, "learning_rate": 1.630287774628296e-05, "loss": 0.0835, "num_input_tokens_seen": 21920480, "step": 24525 }, { "epoch": 6.474066253134486, "grad_norm": 0.04958411678671837, "learning_rate": 1.6293674166651718e-05, "loss": 0.0111, "num_input_tokens_seen": 21924960, "step": 24530 }, { "epoch": 6.475386036689983, "grad_norm": 0.3844093978404999, "learning_rate": 1.6284471929645338e-05, "loss": 0.1046, "num_input_tokens_seen": 21929472, "step": 24535 }, { "epoch": 6.4767058202454795, "grad_norm": 0.24479833245277405, "learning_rate": 1.627527103668291e-05, "loss": 0.0882, "num_input_tokens_seen": 21934208, "step": 24540 }, { "epoch": 6.478025603800977, "grad_norm": 0.19606706500053406, "learning_rate": 1.6266071489183327e-05, "loss": 0.0262, "num_input_tokens_seen": 21938880, "step": 24545 }, { "epoch": 6.4793453873564735, "grad_norm": 0.31466829776763916, "learning_rate": 1.6256873288565283e-05, "loss": 0.0497, "num_input_tokens_seen": 21942912, "step": 24550 }, { "epoch": 6.48066517091197, "grad_norm": 0.23563647270202637, "learning_rate": 1.6247676436247245e-05, "loss": 0.089, "num_input_tokens_seen": 21947520, "step": 24555 }, { "epoch": 6.4819849544674675, "grad_norm": 0.2779657244682312, "learning_rate": 1.6238480933647486e-05, "loss": 0.0743, "num_input_tokens_seen": 21952192, "step": 24560 }, { "epoch": 6.483304738022964, "grad_norm": 0.04489078000187874, "learning_rate": 1.6229286782184083e-05, "loss": 0.1057, "num_input_tokens_seen": 21956640, "step": 24565 }, { "epoch": 6.4846245215784615, "grad_norm": 0.03793451935052872, "learning_rate": 1.622009398327487e-05, "loss": 0.0614, "num_input_tokens_seen": 21961184, "step": 24570 }, { "epoch": 6.485944305133958, "grad_norm": 0.21921348571777344, "learning_rate": 1.6210902538337502e-05, "loss": 0.0682, "num_input_tokens_seen": 21965568, "step": 24575 }, { "epoch": 6.487264088689455, "grad_norm": 0.14004310965538025, "learning_rate": 1.6201712448789413e-05, "loss": 0.1301, "num_input_tokens_seen": 21970432, "step": 24580 }, { "epoch": 6.488583872244952, "grad_norm": 0.08286130428314209, "learning_rate": 1.6192523716047827e-05, "loss": 0.064, "num_input_tokens_seen": 21974784, "step": 24585 }, { "epoch": 6.489903655800449, "grad_norm": 0.19946348667144775, "learning_rate": 1.6183336341529776e-05, "loss": 0.0169, "num_input_tokens_seen": 21979456, "step": 24590 }, { "epoch": 6.491223439355946, "grad_norm": 0.041830938309431076, "learning_rate": 1.6174150326652047e-05, "loss": 0.0508, "num_input_tokens_seen": 21983872, "step": 24595 }, { "epoch": 6.492543222911443, "grad_norm": 0.06078316271305084, "learning_rate": 1.6164965672831256e-05, "loss": 0.0463, "num_input_tokens_seen": 21988192, "step": 24600 }, { "epoch": 6.492543222911443, "eval_loss": 0.06740046292543411, "eval_runtime": 64.7464, "eval_samples_per_second": 104.021, "eval_steps_per_second": 26.009, "num_input_tokens_seen": 21988192, "step": 24600 }, { "epoch": 6.493863006466939, "grad_norm": 0.09459691494703293, "learning_rate": 1.6155782381483784e-05, "loss": 0.0326, "num_input_tokens_seen": 21992928, "step": 24605 }, { "epoch": 6.495182790022437, "grad_norm": 0.373667448759079, "learning_rate": 1.6146600454025813e-05, "loss": 0.0736, "num_input_tokens_seen": 21997376, "step": 24610 }, { "epoch": 6.496502573577933, "grad_norm": 0.15895818173885345, "learning_rate": 1.6137419891873317e-05, "loss": 0.0347, "num_input_tokens_seen": 22001952, "step": 24615 }, { "epoch": 6.49782235713343, "grad_norm": 0.26278477907180786, "learning_rate": 1.6128240696442038e-05, "loss": 0.0915, "num_input_tokens_seen": 22006432, "step": 24620 }, { "epoch": 6.499142140688927, "grad_norm": 0.11717765033245087, "learning_rate": 1.611906286914753e-05, "loss": 0.0892, "num_input_tokens_seen": 22010784, "step": 24625 }, { "epoch": 6.500461924244424, "grad_norm": 0.1504027098417282, "learning_rate": 1.6109886411405144e-05, "loss": 0.04, "num_input_tokens_seen": 22015072, "step": 24630 }, { "epoch": 6.501781707799921, "grad_norm": 0.4211582541465759, "learning_rate": 1.6100711324629985e-05, "loss": 0.0782, "num_input_tokens_seen": 22019360, "step": 24635 }, { "epoch": 6.503101491355418, "grad_norm": 0.21142257750034332, "learning_rate": 1.609153761023698e-05, "loss": 0.0295, "num_input_tokens_seen": 22023648, "step": 24640 }, { "epoch": 6.504421274910914, "grad_norm": 0.11218905448913574, "learning_rate": 1.608236526964083e-05, "loss": 0.0825, "num_input_tokens_seen": 22028576, "step": 24645 }, { "epoch": 6.505741058466412, "grad_norm": 0.3195367753505707, "learning_rate": 1.607319430425601e-05, "loss": 0.1029, "num_input_tokens_seen": 22033216, "step": 24650 }, { "epoch": 6.507060842021908, "grad_norm": 0.12473928183317184, "learning_rate": 1.606402471549682e-05, "loss": 0.0749, "num_input_tokens_seen": 22038176, "step": 24655 }, { "epoch": 6.508380625577406, "grad_norm": 0.2630380392074585, "learning_rate": 1.6054856504777312e-05, "loss": 0.0894, "num_input_tokens_seen": 22042944, "step": 24660 }, { "epoch": 6.509700409132902, "grad_norm": 0.3162872791290283, "learning_rate": 1.6045689673511334e-05, "loss": 0.0609, "num_input_tokens_seen": 22047360, "step": 24665 }, { "epoch": 6.511020192688399, "grad_norm": 0.0589139498770237, "learning_rate": 1.6036524223112548e-05, "loss": 0.0617, "num_input_tokens_seen": 22051680, "step": 24670 }, { "epoch": 6.512339976243896, "grad_norm": 0.1132378876209259, "learning_rate": 1.602736015499436e-05, "loss": 0.0448, "num_input_tokens_seen": 22056064, "step": 24675 }, { "epoch": 6.513659759799393, "grad_norm": 0.41450002789497375, "learning_rate": 1.601819747057e-05, "loss": 0.0719, "num_input_tokens_seen": 22060512, "step": 24680 }, { "epoch": 6.51497954335489, "grad_norm": 0.3552088439464569, "learning_rate": 1.6009036171252465e-05, "loss": 0.0545, "num_input_tokens_seen": 22065088, "step": 24685 }, { "epoch": 6.516299326910387, "grad_norm": 0.1403990238904953, "learning_rate": 1.599987625845453e-05, "loss": 0.0666, "num_input_tokens_seen": 22069696, "step": 24690 }, { "epoch": 6.517619110465883, "grad_norm": 0.20467954874038696, "learning_rate": 1.599071773358879e-05, "loss": 0.0559, "num_input_tokens_seen": 22074272, "step": 24695 }, { "epoch": 6.518938894021381, "grad_norm": 0.38528159260749817, "learning_rate": 1.598156059806758e-05, "loss": 0.0912, "num_input_tokens_seen": 22078880, "step": 24700 }, { "epoch": 6.520258677576877, "grad_norm": 0.0723615288734436, "learning_rate": 1.5972404853303062e-05, "loss": 0.0384, "num_input_tokens_seen": 22083328, "step": 24705 }, { "epoch": 6.521578461132374, "grad_norm": 0.1609569489955902, "learning_rate": 1.5963250500707172e-05, "loss": 0.0359, "num_input_tokens_seen": 22087776, "step": 24710 }, { "epoch": 6.522898244687871, "grad_norm": 0.0764029324054718, "learning_rate": 1.5954097541691612e-05, "loss": 0.0354, "num_input_tokens_seen": 22092192, "step": 24715 }, { "epoch": 6.524218028243368, "grad_norm": 0.4174157977104187, "learning_rate": 1.5944945977667884e-05, "loss": 0.0543, "num_input_tokens_seen": 22096768, "step": 24720 }, { "epoch": 6.5255378117988645, "grad_norm": 0.1853865534067154, "learning_rate": 1.593579581004729e-05, "loss": 0.0549, "num_input_tokens_seen": 22101376, "step": 24725 }, { "epoch": 6.526857595354362, "grad_norm": 0.38852012157440186, "learning_rate": 1.592664704024088e-05, "loss": 0.076, "num_input_tokens_seen": 22105696, "step": 24730 }, { "epoch": 6.5281773789098585, "grad_norm": 0.24551379680633545, "learning_rate": 1.591749966965953e-05, "loss": 0.0397, "num_input_tokens_seen": 22109888, "step": 24735 }, { "epoch": 6.529497162465356, "grad_norm": 0.12772448360919952, "learning_rate": 1.5908353699713856e-05, "loss": 0.0631, "num_input_tokens_seen": 22114240, "step": 24740 }, { "epoch": 6.5308169460208525, "grad_norm": 0.2763034999370575, "learning_rate": 1.5899209131814298e-05, "loss": 0.0884, "num_input_tokens_seen": 22118464, "step": 24745 }, { "epoch": 6.532136729576349, "grad_norm": 0.26864516735076904, "learning_rate": 1.5890065967371067e-05, "loss": 0.0704, "num_input_tokens_seen": 22123008, "step": 24750 }, { "epoch": 6.5334565131318465, "grad_norm": 0.45077452063560486, "learning_rate": 1.5880924207794144e-05, "loss": 0.0652, "num_input_tokens_seen": 22127616, "step": 24755 }, { "epoch": 6.534776296687343, "grad_norm": 0.06777957826852798, "learning_rate": 1.5871783854493298e-05, "loss": 0.0397, "num_input_tokens_seen": 22132512, "step": 24760 }, { "epoch": 6.5360960802428405, "grad_norm": 0.05977372080087662, "learning_rate": 1.5862644908878106e-05, "loss": 0.0222, "num_input_tokens_seen": 22137120, "step": 24765 }, { "epoch": 6.537415863798337, "grad_norm": 0.1738738864660263, "learning_rate": 1.5853507372357885e-05, "loss": 0.0284, "num_input_tokens_seen": 22141600, "step": 24770 }, { "epoch": 6.538735647353834, "grad_norm": 0.07651665806770325, "learning_rate": 1.5844371246341776e-05, "loss": 0.0569, "num_input_tokens_seen": 22146240, "step": 24775 }, { "epoch": 6.540055430909331, "grad_norm": 0.05778427794575691, "learning_rate": 1.5835236532238674e-05, "loss": 0.0359, "num_input_tokens_seen": 22150528, "step": 24780 }, { "epoch": 6.541375214464828, "grad_norm": 0.2825198769569397, "learning_rate": 1.582610323145727e-05, "loss": 0.062, "num_input_tokens_seen": 22155168, "step": 24785 }, { "epoch": 6.542694998020325, "grad_norm": 0.3366435766220093, "learning_rate": 1.5816971345406035e-05, "loss": 0.0728, "num_input_tokens_seen": 22159840, "step": 24790 }, { "epoch": 6.544014781575822, "grad_norm": 0.21497344970703125, "learning_rate": 1.5807840875493225e-05, "loss": 0.071, "num_input_tokens_seen": 22164608, "step": 24795 }, { "epoch": 6.545334565131318, "grad_norm": 0.23767434060573578, "learning_rate": 1.5798711823126854e-05, "loss": 0.0817, "num_input_tokens_seen": 22168864, "step": 24800 }, { "epoch": 6.545334565131318, "eval_loss": 0.06724140048027039, "eval_runtime": 64.737, "eval_samples_per_second": 104.036, "eval_steps_per_second": 26.013, "num_input_tokens_seen": 22168864, "step": 24800 }, { "epoch": 6.546654348686816, "grad_norm": 0.3482906222343445, "learning_rate": 1.578958418971477e-05, "loss": 0.0916, "num_input_tokens_seen": 22173344, "step": 24805 }, { "epoch": 6.547974132242312, "grad_norm": 0.1272256225347519, "learning_rate": 1.578045797666453e-05, "loss": 0.0485, "num_input_tokens_seen": 22177856, "step": 24810 }, { "epoch": 6.54929391579781, "grad_norm": 0.16916589438915253, "learning_rate": 1.5771333185383548e-05, "loss": 0.0389, "num_input_tokens_seen": 22182304, "step": 24815 }, { "epoch": 6.550613699353306, "grad_norm": 0.25160858035087585, "learning_rate": 1.576220981727895e-05, "loss": 0.1231, "num_input_tokens_seen": 22186528, "step": 24820 }, { "epoch": 6.551933482908803, "grad_norm": 0.20918305218219757, "learning_rate": 1.575308787375769e-05, "loss": 0.0427, "num_input_tokens_seen": 22191040, "step": 24825 }, { "epoch": 6.5532532664643, "grad_norm": 0.3416500985622406, "learning_rate": 1.5743967356226492e-05, "loss": 0.113, "num_input_tokens_seen": 22195360, "step": 24830 }, { "epoch": 6.554573050019797, "grad_norm": 0.2278144508600235, "learning_rate": 1.5734848266091835e-05, "loss": 0.0342, "num_input_tokens_seen": 22199904, "step": 24835 }, { "epoch": 6.555892833575293, "grad_norm": 0.15138262510299683, "learning_rate": 1.572573060476001e-05, "loss": 0.0617, "num_input_tokens_seen": 22204608, "step": 24840 }, { "epoch": 6.557212617130791, "grad_norm": 0.24951870739459991, "learning_rate": 1.5716614373637085e-05, "loss": 0.0623, "num_input_tokens_seen": 22208736, "step": 24845 }, { "epoch": 6.558532400686287, "grad_norm": 0.08371890336275101, "learning_rate": 1.570749957412887e-05, "loss": 0.074, "num_input_tokens_seen": 22213152, "step": 24850 }, { "epoch": 6.559852184241785, "grad_norm": 0.16481614112854004, "learning_rate": 1.5698386207641013e-05, "loss": 0.0322, "num_input_tokens_seen": 22217664, "step": 24855 }, { "epoch": 6.561171967797281, "grad_norm": 0.19218680262565613, "learning_rate": 1.5689274275578884e-05, "loss": 0.0352, "num_input_tokens_seen": 22222016, "step": 24860 }, { "epoch": 6.562491751352778, "grad_norm": 0.2483195662498474, "learning_rate": 1.5680163779347667e-05, "loss": 0.1442, "num_input_tokens_seen": 22226400, "step": 24865 }, { "epoch": 6.563811534908275, "grad_norm": 0.13551373779773712, "learning_rate": 1.5671054720352327e-05, "loss": 0.0374, "num_input_tokens_seen": 22230912, "step": 24870 }, { "epoch": 6.565131318463772, "grad_norm": 0.3919602334499359, "learning_rate": 1.566194709999757e-05, "loss": 0.1095, "num_input_tokens_seen": 22235264, "step": 24875 }, { "epoch": 6.566451102019268, "grad_norm": 0.10560186952352524, "learning_rate": 1.5652840919687933e-05, "loss": 0.0475, "num_input_tokens_seen": 22239840, "step": 24880 }, { "epoch": 6.567770885574766, "grad_norm": 0.20058444142341614, "learning_rate": 1.5643736180827676e-05, "loss": 0.04, "num_input_tokens_seen": 22244288, "step": 24885 }, { "epoch": 6.569090669130262, "grad_norm": 0.5639130473136902, "learning_rate": 1.5634632884820878e-05, "loss": 0.0747, "num_input_tokens_seen": 22248896, "step": 24890 }, { "epoch": 6.57041045268576, "grad_norm": 0.20073877274990082, "learning_rate": 1.5625531033071395e-05, "loss": 0.0184, "num_input_tokens_seen": 22253408, "step": 24895 }, { "epoch": 6.571730236241256, "grad_norm": 0.03171442076563835, "learning_rate": 1.5616430626982828e-05, "loss": 0.0337, "num_input_tokens_seen": 22257792, "step": 24900 }, { "epoch": 6.573050019796753, "grad_norm": 0.5195639729499817, "learning_rate": 1.5607331667958575e-05, "loss": 0.0518, "num_input_tokens_seen": 22262080, "step": 24905 }, { "epoch": 6.57436980335225, "grad_norm": 0.07130123674869537, "learning_rate": 1.5598234157401824e-05, "loss": 0.0524, "num_input_tokens_seen": 22266656, "step": 24910 }, { "epoch": 6.575689586907747, "grad_norm": 0.245599627494812, "learning_rate": 1.5589138096715503e-05, "loss": 0.0362, "num_input_tokens_seen": 22271296, "step": 24915 }, { "epoch": 6.577009370463244, "grad_norm": 0.07163049280643463, "learning_rate": 1.5580043487302365e-05, "loss": 0.0329, "num_input_tokens_seen": 22275744, "step": 24920 }, { "epoch": 6.578329154018741, "grad_norm": 0.3264152407646179, "learning_rate": 1.5570950330564888e-05, "loss": 0.0346, "num_input_tokens_seen": 22280320, "step": 24925 }, { "epoch": 6.5796489375742375, "grad_norm": 0.09860634803771973, "learning_rate": 1.5561858627905367e-05, "loss": 0.0824, "num_input_tokens_seen": 22284512, "step": 24930 }, { "epoch": 6.580968721129735, "grad_norm": 0.16872276365756989, "learning_rate": 1.5552768380725857e-05, "loss": 0.0504, "num_input_tokens_seen": 22289376, "step": 24935 }, { "epoch": 6.5822885046852315, "grad_norm": 0.36777904629707336, "learning_rate": 1.5543679590428183e-05, "loss": 0.0433, "num_input_tokens_seen": 22293696, "step": 24940 }, { "epoch": 6.583608288240729, "grad_norm": 0.24464012682437897, "learning_rate": 1.5534592258413943e-05, "loss": 0.0602, "num_input_tokens_seen": 22298144, "step": 24945 }, { "epoch": 6.5849280717962255, "grad_norm": 0.3762947916984558, "learning_rate": 1.5525506386084538e-05, "loss": 0.0958, "num_input_tokens_seen": 22302784, "step": 24950 }, { "epoch": 6.586247855351722, "grad_norm": 0.15514510869979858, "learning_rate": 1.55164219748411e-05, "loss": 0.0988, "num_input_tokens_seen": 22307232, "step": 24955 }, { "epoch": 6.5875676389072195, "grad_norm": 0.24981392920017242, "learning_rate": 1.550733902608459e-05, "loss": 0.0416, "num_input_tokens_seen": 22311712, "step": 24960 }, { "epoch": 6.588887422462716, "grad_norm": 0.13734064996242523, "learning_rate": 1.549825754121568e-05, "loss": 0.0609, "num_input_tokens_seen": 22316160, "step": 24965 }, { "epoch": 6.590207206018213, "grad_norm": 0.028818149119615555, "learning_rate": 1.5489177521634864e-05, "loss": 0.1445, "num_input_tokens_seen": 22320704, "step": 24970 }, { "epoch": 6.59152698957371, "grad_norm": 0.20127154886722565, "learning_rate": 1.5480098968742402e-05, "loss": 0.0605, "num_input_tokens_seen": 22325024, "step": 24975 }, { "epoch": 6.592846773129207, "grad_norm": 0.23138581216335297, "learning_rate": 1.5471021883938304e-05, "loss": 0.0331, "num_input_tokens_seen": 22329568, "step": 24980 }, { "epoch": 6.594166556684704, "grad_norm": 0.3950396776199341, "learning_rate": 1.546194626862238e-05, "loss": 0.0502, "num_input_tokens_seen": 22334016, "step": 24985 }, { "epoch": 6.595486340240201, "grad_norm": 0.3257615268230438, "learning_rate": 1.5452872124194216e-05, "loss": 0.0926, "num_input_tokens_seen": 22338336, "step": 24990 }, { "epoch": 6.596806123795697, "grad_norm": 0.2152331918478012, "learning_rate": 1.5443799452053136e-05, "loss": 0.0437, "num_input_tokens_seen": 22342688, "step": 24995 }, { "epoch": 6.598125907351195, "grad_norm": 0.10007510334253311, "learning_rate": 1.543472825359828e-05, "loss": 0.09, "num_input_tokens_seen": 22347392, "step": 25000 }, { "epoch": 6.598125907351195, "eval_loss": 0.06756851077079773, "eval_runtime": 64.7562, "eval_samples_per_second": 104.005, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 22347392, "step": 25000 }, { "epoch": 6.599445690906691, "grad_norm": 0.3414227068424225, "learning_rate": 1.5425658530228522e-05, "loss": 0.0419, "num_input_tokens_seen": 22351712, "step": 25005 }, { "epoch": 6.600765474462188, "grad_norm": 0.024705443531274796, "learning_rate": 1.5416590283342546e-05, "loss": 0.0137, "num_input_tokens_seen": 22356192, "step": 25010 }, { "epoch": 6.602085258017685, "grad_norm": 0.3722876310348511, "learning_rate": 1.5407523514338783e-05, "loss": 0.1056, "num_input_tokens_seen": 22360960, "step": 25015 }, { "epoch": 6.603405041573182, "grad_norm": 0.3392707109451294, "learning_rate": 1.539845822461543e-05, "loss": 0.0719, "num_input_tokens_seen": 22365888, "step": 25020 }, { "epoch": 6.604724825128679, "grad_norm": 0.022637538611888885, "learning_rate": 1.538939441557048e-05, "loss": 0.0275, "num_input_tokens_seen": 22370176, "step": 25025 }, { "epoch": 6.606044608684176, "grad_norm": 0.15895098447799683, "learning_rate": 1.5380332088601696e-05, "loss": 0.0203, "num_input_tokens_seen": 22374368, "step": 25030 }, { "epoch": 6.607364392239672, "grad_norm": 0.1940978318452835, "learning_rate": 1.537127124510658e-05, "loss": 0.0442, "num_input_tokens_seen": 22379008, "step": 25035 }, { "epoch": 6.60868417579517, "grad_norm": 0.10255209356546402, "learning_rate": 1.5362211886482457e-05, "loss": 0.0844, "num_input_tokens_seen": 22383296, "step": 25040 }, { "epoch": 6.610003959350666, "grad_norm": 0.9258224368095398, "learning_rate": 1.5353154014126363e-05, "loss": 0.0284, "num_input_tokens_seen": 22387776, "step": 25045 }, { "epoch": 6.611323742906164, "grad_norm": 0.39313894510269165, "learning_rate": 1.534409762943515e-05, "loss": 0.0512, "num_input_tokens_seen": 22392480, "step": 25050 }, { "epoch": 6.61264352646166, "grad_norm": 0.2638201415538788, "learning_rate": 1.5335042733805438e-05, "loss": 0.0696, "num_input_tokens_seen": 22396896, "step": 25055 }, { "epoch": 6.613963310017157, "grad_norm": 0.2537665069103241, "learning_rate": 1.532598932863358e-05, "loss": 0.0419, "num_input_tokens_seen": 22401312, "step": 25060 }, { "epoch": 6.615283093572654, "grad_norm": 0.37750327587127686, "learning_rate": 1.531693741531574e-05, "loss": 0.0676, "num_input_tokens_seen": 22405888, "step": 25065 }, { "epoch": 6.616602877128151, "grad_norm": 0.3170725405216217, "learning_rate": 1.5307886995247844e-05, "loss": 0.087, "num_input_tokens_seen": 22410560, "step": 25070 }, { "epoch": 6.617922660683648, "grad_norm": 0.07566087692975998, "learning_rate": 1.529883806982557e-05, "loss": 0.0308, "num_input_tokens_seen": 22415072, "step": 25075 }, { "epoch": 6.619242444239145, "grad_norm": 0.040788568556308746, "learning_rate": 1.5289790640444376e-05, "loss": 0.045, "num_input_tokens_seen": 22419264, "step": 25080 }, { "epoch": 6.620562227794641, "grad_norm": 0.08925555646419525, "learning_rate": 1.5280744708499494e-05, "loss": 0.0408, "num_input_tokens_seen": 22423424, "step": 25085 }, { "epoch": 6.621882011350139, "grad_norm": 0.0710161030292511, "learning_rate": 1.527170027538591e-05, "loss": 0.0227, "num_input_tokens_seen": 22427680, "step": 25090 }, { "epoch": 6.623201794905635, "grad_norm": 0.14255192875862122, "learning_rate": 1.5262657342498407e-05, "loss": 0.0356, "num_input_tokens_seen": 22432096, "step": 25095 }, { "epoch": 6.624521578461132, "grad_norm": 0.3095634877681732, "learning_rate": 1.52536159112315e-05, "loss": 0.0565, "num_input_tokens_seen": 22436608, "step": 25100 }, { "epoch": 6.625841362016629, "grad_norm": 0.08310741186141968, "learning_rate": 1.5244575982979497e-05, "loss": 0.0394, "num_input_tokens_seen": 22440992, "step": 25105 }, { "epoch": 6.627161145572126, "grad_norm": 0.1763300746679306, "learning_rate": 1.5235537559136487e-05, "loss": 0.0715, "num_input_tokens_seen": 22445120, "step": 25110 }, { "epoch": 6.628480929127623, "grad_norm": 0.31509190797805786, "learning_rate": 1.5226500641096286e-05, "loss": 0.1056, "num_input_tokens_seen": 22449696, "step": 25115 }, { "epoch": 6.62980071268312, "grad_norm": 0.35046854615211487, "learning_rate": 1.5217465230252509e-05, "loss": 0.0551, "num_input_tokens_seen": 22454080, "step": 25120 }, { "epoch": 6.6311204962386165, "grad_norm": 0.28767433762550354, "learning_rate": 1.5208431327998523e-05, "loss": 0.031, "num_input_tokens_seen": 22458528, "step": 25125 }, { "epoch": 6.632440279794114, "grad_norm": 0.3922910988330841, "learning_rate": 1.5199398935727477e-05, "loss": 0.0745, "num_input_tokens_seen": 22462720, "step": 25130 }, { "epoch": 6.6337600633496105, "grad_norm": 0.19528095424175262, "learning_rate": 1.5190368054832282e-05, "loss": 0.045, "num_input_tokens_seen": 22467264, "step": 25135 }, { "epoch": 6.635079846905107, "grad_norm": 0.36180612444877625, "learning_rate": 1.5181338686705601e-05, "loss": 0.0811, "num_input_tokens_seen": 22471648, "step": 25140 }, { "epoch": 6.6363996304606045, "grad_norm": 0.4259493350982666, "learning_rate": 1.5172310832739889e-05, "loss": 0.064, "num_input_tokens_seen": 22476320, "step": 25145 }, { "epoch": 6.637719414016101, "grad_norm": 0.15420609712600708, "learning_rate": 1.5163284494327346e-05, "loss": 0.0689, "num_input_tokens_seen": 22480672, "step": 25150 }, { "epoch": 6.6390391975715985, "grad_norm": 0.06095863878726959, "learning_rate": 1.5154259672859952e-05, "loss": 0.0821, "num_input_tokens_seen": 22485088, "step": 25155 }, { "epoch": 6.640358981127095, "grad_norm": 0.09854001551866531, "learning_rate": 1.5145236369729452e-05, "loss": 0.06, "num_input_tokens_seen": 22490112, "step": 25160 }, { "epoch": 6.641678764682592, "grad_norm": 0.23597709834575653, "learning_rate": 1.5136214586327335e-05, "loss": 0.0814, "num_input_tokens_seen": 22494208, "step": 25165 }, { "epoch": 6.642998548238089, "grad_norm": 0.17460519075393677, "learning_rate": 1.5127194324044885e-05, "loss": 0.0318, "num_input_tokens_seen": 22498624, "step": 25170 }, { "epoch": 6.644318331793586, "grad_norm": 0.21749743819236755, "learning_rate": 1.5118175584273148e-05, "loss": 0.0832, "num_input_tokens_seen": 22503360, "step": 25175 }, { "epoch": 6.645638115349083, "grad_norm": 0.3704953193664551, "learning_rate": 1.5109158368402909e-05, "loss": 0.1004, "num_input_tokens_seen": 22507840, "step": 25180 }, { "epoch": 6.64695789890458, "grad_norm": 0.0787973701953888, "learning_rate": 1.5100142677824753e-05, "loss": 0.0235, "num_input_tokens_seen": 22512512, "step": 25185 }, { "epoch": 6.648277682460076, "grad_norm": 0.21899017691612244, "learning_rate": 1.509112851392901e-05, "loss": 0.0755, "num_input_tokens_seen": 22517248, "step": 25190 }, { "epoch": 6.649597466015574, "grad_norm": 0.08616723865270615, "learning_rate": 1.5082115878105763e-05, "loss": 0.0311, "num_input_tokens_seen": 22521760, "step": 25195 }, { "epoch": 6.65091724957107, "grad_norm": 0.23156920075416565, "learning_rate": 1.5073104771744892e-05, "loss": 0.0447, "num_input_tokens_seen": 22526048, "step": 25200 }, { "epoch": 6.65091724957107, "eval_loss": 0.0673813447356224, "eval_runtime": 64.7733, "eval_samples_per_second": 103.978, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 22526048, "step": 25200 }, { "epoch": 6.652237033126568, "grad_norm": 0.21575644612312317, "learning_rate": 1.5064095196236006e-05, "loss": 0.0685, "num_input_tokens_seen": 22530560, "step": 25205 }, { "epoch": 6.653556816682064, "grad_norm": 0.2730697989463806, "learning_rate": 1.50550871529685e-05, "loss": 0.0556, "num_input_tokens_seen": 22535296, "step": 25210 }, { "epoch": 6.654876600237561, "grad_norm": 0.23770639300346375, "learning_rate": 1.5046080643331546e-05, "loss": 0.0655, "num_input_tokens_seen": 22539392, "step": 25215 }, { "epoch": 6.656196383793058, "grad_norm": 0.14808988571166992, "learning_rate": 1.5037075668714028e-05, "loss": 0.0447, "num_input_tokens_seen": 22543872, "step": 25220 }, { "epoch": 6.657516167348555, "grad_norm": 0.09054088592529297, "learning_rate": 1.5028072230504656e-05, "loss": 0.0925, "num_input_tokens_seen": 22548512, "step": 25225 }, { "epoch": 6.658835950904052, "grad_norm": 0.023256955668330193, "learning_rate": 1.5019070330091861e-05, "loss": 0.0401, "num_input_tokens_seen": 22552864, "step": 25230 }, { "epoch": 6.660155734459549, "grad_norm": 0.15788036584854126, "learning_rate": 1.5010069968863843e-05, "loss": 0.0501, "num_input_tokens_seen": 22557120, "step": 25235 }, { "epoch": 6.661475518015045, "grad_norm": 0.3684157133102417, "learning_rate": 1.5001071148208584e-05, "loss": 0.0568, "num_input_tokens_seen": 22561696, "step": 25240 }, { "epoch": 6.662795301570543, "grad_norm": 0.041230540722608566, "learning_rate": 1.49920738695138e-05, "loss": 0.0558, "num_input_tokens_seen": 22566240, "step": 25245 }, { "epoch": 6.664115085126039, "grad_norm": 0.1836252361536026, "learning_rate": 1.4983078134166995e-05, "loss": 0.0588, "num_input_tokens_seen": 22570720, "step": 25250 }, { "epoch": 6.665434868681536, "grad_norm": 0.17589899897575378, "learning_rate": 1.4974083943555428e-05, "loss": 0.0175, "num_input_tokens_seen": 22575136, "step": 25255 }, { "epoch": 6.666754652237033, "grad_norm": 0.16102515161037445, "learning_rate": 1.496509129906611e-05, "loss": 0.1505, "num_input_tokens_seen": 22579424, "step": 25260 }, { "epoch": 6.66807443579253, "grad_norm": 0.5027807354927063, "learning_rate": 1.4956100202085809e-05, "loss": 0.0881, "num_input_tokens_seen": 22584352, "step": 25265 }, { "epoch": 6.6693942193480265, "grad_norm": 0.25196483731269836, "learning_rate": 1.4947110654001093e-05, "loss": 0.0651, "num_input_tokens_seen": 22588896, "step": 25270 }, { "epoch": 6.670714002903524, "grad_norm": 0.11714130640029907, "learning_rate": 1.4938122656198234e-05, "loss": 0.038, "num_input_tokens_seen": 22593280, "step": 25275 }, { "epoch": 6.6720337864590205, "grad_norm": 0.25152313709259033, "learning_rate": 1.4929136210063316e-05, "loss": 0.0527, "num_input_tokens_seen": 22597536, "step": 25280 }, { "epoch": 6.673353570014518, "grad_norm": 0.13583233952522278, "learning_rate": 1.4920151316982146e-05, "loss": 0.0476, "num_input_tokens_seen": 22602048, "step": 25285 }, { "epoch": 6.6746733535700145, "grad_norm": 0.1612614542245865, "learning_rate": 1.4911167978340312e-05, "loss": 0.0246, "num_input_tokens_seen": 22606528, "step": 25290 }, { "epoch": 6.675993137125511, "grad_norm": 0.306769996881485, "learning_rate": 1.4902186195523166e-05, "loss": 0.042, "num_input_tokens_seen": 22610816, "step": 25295 }, { "epoch": 6.6773129206810085, "grad_norm": 0.3543424606323242, "learning_rate": 1.4893205969915805e-05, "loss": 0.0489, "num_input_tokens_seen": 22615296, "step": 25300 }, { "epoch": 6.678632704236505, "grad_norm": 0.41265955567359924, "learning_rate": 1.4884227302903086e-05, "loss": 0.0593, "num_input_tokens_seen": 22619712, "step": 25305 }, { "epoch": 6.6799524877920025, "grad_norm": 0.12850342690944672, "learning_rate": 1.4875250195869653e-05, "loss": 0.0549, "num_input_tokens_seen": 22624352, "step": 25310 }, { "epoch": 6.681272271347499, "grad_norm": 0.15705634653568268, "learning_rate": 1.4866274650199862e-05, "loss": 0.0446, "num_input_tokens_seen": 22628608, "step": 25315 }, { "epoch": 6.682592054902996, "grad_norm": 0.13536995649337769, "learning_rate": 1.485730066727788e-05, "loss": 0.0803, "num_input_tokens_seen": 22633088, "step": 25320 }, { "epoch": 6.683911838458493, "grad_norm": 0.27390095591545105, "learning_rate": 1.4848328248487586e-05, "loss": 0.0974, "num_input_tokens_seen": 22637440, "step": 25325 }, { "epoch": 6.68523162201399, "grad_norm": 0.2808273434638977, "learning_rate": 1.4839357395212656e-05, "loss": 0.0559, "num_input_tokens_seen": 22641984, "step": 25330 }, { "epoch": 6.686551405569487, "grad_norm": 0.15451383590698242, "learning_rate": 1.4830388108836502e-05, "loss": 0.0766, "num_input_tokens_seen": 22646464, "step": 25335 }, { "epoch": 6.687871189124984, "grad_norm": 0.0476226881146431, "learning_rate": 1.4821420390742299e-05, "loss": 0.0473, "num_input_tokens_seen": 22651424, "step": 25340 }, { "epoch": 6.68919097268048, "grad_norm": 0.06523685157299042, "learning_rate": 1.4812454242312979e-05, "loss": 0.0864, "num_input_tokens_seen": 22655968, "step": 25345 }, { "epoch": 6.690510756235978, "grad_norm": 0.09467204660177231, "learning_rate": 1.4803489664931253e-05, "loss": 0.0715, "num_input_tokens_seen": 22660384, "step": 25350 }, { "epoch": 6.691830539791474, "grad_norm": 0.16191187500953674, "learning_rate": 1.4794526659979544e-05, "loss": 0.0685, "num_input_tokens_seen": 22664896, "step": 25355 }, { "epoch": 6.693150323346972, "grad_norm": 0.2829194962978363, "learning_rate": 1.4785565228840086e-05, "loss": 0.0247, "num_input_tokens_seen": 22669312, "step": 25360 }, { "epoch": 6.694470106902468, "grad_norm": 0.15123167634010315, "learning_rate": 1.4776605372894819e-05, "loss": 0.0417, "num_input_tokens_seen": 22673760, "step": 25365 }, { "epoch": 6.695789890457965, "grad_norm": 0.10001382231712341, "learning_rate": 1.4767647093525488e-05, "loss": 0.0634, "num_input_tokens_seen": 22678464, "step": 25370 }, { "epoch": 6.697109674013462, "grad_norm": 0.4783877432346344, "learning_rate": 1.4758690392113566e-05, "loss": 0.0878, "num_input_tokens_seen": 22682976, "step": 25375 }, { "epoch": 6.698429457568959, "grad_norm": 0.20336632430553436, "learning_rate": 1.4749735270040276e-05, "loss": 0.0335, "num_input_tokens_seen": 22687616, "step": 25380 }, { "epoch": 6.699749241124455, "grad_norm": 0.1614319086074829, "learning_rate": 1.4740781728686623e-05, "loss": 0.1117, "num_input_tokens_seen": 22692128, "step": 25385 }, { "epoch": 6.701069024679953, "grad_norm": 0.0773206576704979, "learning_rate": 1.4731829769433358e-05, "loss": 0.0275, "num_input_tokens_seen": 22696416, "step": 25390 }, { "epoch": 6.702388808235449, "grad_norm": 0.08341587334871292, "learning_rate": 1.4722879393660976e-05, "loss": 0.0266, "num_input_tokens_seen": 22700512, "step": 25395 }, { "epoch": 6.703708591790946, "grad_norm": 0.2792700529098511, "learning_rate": 1.4713930602749748e-05, "loss": 0.1004, "num_input_tokens_seen": 22704800, "step": 25400 }, { "epoch": 6.703708591790946, "eval_loss": 0.06725440174341202, "eval_runtime": 64.8237, "eval_samples_per_second": 103.897, "eval_steps_per_second": 25.978, "num_input_tokens_seen": 22704800, "step": 25400 }, { "epoch": 6.705028375346443, "grad_norm": 0.17292864620685577, "learning_rate": 1.470498339807968e-05, "loss": 0.0727, "num_input_tokens_seen": 22709152, "step": 25405 }, { "epoch": 6.70634815890194, "grad_norm": 0.21330343186855316, "learning_rate": 1.4696037781030542e-05, "loss": 0.0954, "num_input_tokens_seen": 22713408, "step": 25410 }, { "epoch": 6.707667942457437, "grad_norm": 0.23509669303894043, "learning_rate": 1.4687093752981876e-05, "loss": 0.0626, "num_input_tokens_seen": 22717696, "step": 25415 }, { "epoch": 6.708987726012934, "grad_norm": 0.18300104141235352, "learning_rate": 1.4678151315312943e-05, "loss": 0.0365, "num_input_tokens_seen": 22722560, "step": 25420 }, { "epoch": 6.71030750956843, "grad_norm": 0.295907199382782, "learning_rate": 1.4669210469402789e-05, "loss": 0.037, "num_input_tokens_seen": 22727104, "step": 25425 }, { "epoch": 6.711627293123928, "grad_norm": 0.2507394254207611, "learning_rate": 1.4660271216630218e-05, "loss": 0.0271, "num_input_tokens_seen": 22731712, "step": 25430 }, { "epoch": 6.712947076679424, "grad_norm": 0.40745922923088074, "learning_rate": 1.4651333558373748e-05, "loss": 0.0891, "num_input_tokens_seen": 22736224, "step": 25435 }, { "epoch": 6.714266860234922, "grad_norm": 0.03623587638139725, "learning_rate": 1.4642397496011707e-05, "loss": 0.0552, "num_input_tokens_seen": 22740736, "step": 25440 }, { "epoch": 6.715586643790418, "grad_norm": 0.08631671965122223, "learning_rate": 1.4633463030922129e-05, "loss": 0.0401, "num_input_tokens_seen": 22744992, "step": 25445 }, { "epoch": 6.716906427345915, "grad_norm": 0.2634826898574829, "learning_rate": 1.462453016448282e-05, "loss": 0.0413, "num_input_tokens_seen": 22749184, "step": 25450 }, { "epoch": 6.718226210901412, "grad_norm": 0.19099149107933044, "learning_rate": 1.4615598898071354e-05, "loss": 0.0921, "num_input_tokens_seen": 22753472, "step": 25455 }, { "epoch": 6.719545994456909, "grad_norm": 0.33130350708961487, "learning_rate": 1.4606669233065026e-05, "loss": 0.0277, "num_input_tokens_seen": 22757984, "step": 25460 }, { "epoch": 6.720865778012406, "grad_norm": 0.25199878215789795, "learning_rate": 1.4597741170840914e-05, "loss": 0.0505, "num_input_tokens_seen": 22762240, "step": 25465 }, { "epoch": 6.722185561567903, "grad_norm": 0.1882053017616272, "learning_rate": 1.4588814712775853e-05, "loss": 0.0575, "num_input_tokens_seen": 22767008, "step": 25470 }, { "epoch": 6.7235053451233995, "grad_norm": 0.18165087699890137, "learning_rate": 1.4579889860246382e-05, "loss": 0.0844, "num_input_tokens_seen": 22771648, "step": 25475 }, { "epoch": 6.724825128678897, "grad_norm": 0.15447381138801575, "learning_rate": 1.457096661462885e-05, "loss": 0.0391, "num_input_tokens_seen": 22776000, "step": 25480 }, { "epoch": 6.7261449122343935, "grad_norm": 0.2860298156738281, "learning_rate": 1.4562044977299322e-05, "loss": 0.1682, "num_input_tokens_seen": 22780416, "step": 25485 }, { "epoch": 6.727464695789891, "grad_norm": 0.0845583975315094, "learning_rate": 1.4553124949633623e-05, "loss": 0.1032, "num_input_tokens_seen": 22785088, "step": 25490 }, { "epoch": 6.7287844793453875, "grad_norm": 0.3871162235736847, "learning_rate": 1.4544206533007354e-05, "loss": 0.1165, "num_input_tokens_seen": 22789760, "step": 25495 }, { "epoch": 6.730104262900884, "grad_norm": 0.0702676922082901, "learning_rate": 1.4535289728795821e-05, "loss": 0.0351, "num_input_tokens_seen": 22794464, "step": 25500 }, { "epoch": 6.7314240464563815, "grad_norm": 0.07201121747493744, "learning_rate": 1.4526374538374132e-05, "loss": 0.0796, "num_input_tokens_seen": 22799072, "step": 25505 }, { "epoch": 6.732743830011878, "grad_norm": 0.14322853088378906, "learning_rate": 1.4517460963117097e-05, "loss": 0.0392, "num_input_tokens_seen": 22803424, "step": 25510 }, { "epoch": 6.734063613567375, "grad_norm": 0.11913283169269562, "learning_rate": 1.4508549004399314e-05, "loss": 0.0412, "num_input_tokens_seen": 22808064, "step": 25515 }, { "epoch": 6.735383397122872, "grad_norm": 0.2387559562921524, "learning_rate": 1.449963866359513e-05, "loss": 0.0354, "num_input_tokens_seen": 22812608, "step": 25520 }, { "epoch": 6.736703180678369, "grad_norm": 0.05257706344127655, "learning_rate": 1.4490729942078607e-05, "loss": 0.0273, "num_input_tokens_seen": 22816960, "step": 25525 }, { "epoch": 6.738022964233865, "grad_norm": 0.03567688539624214, "learning_rate": 1.4481822841223608e-05, "loss": 0.0138, "num_input_tokens_seen": 22821120, "step": 25530 }, { "epoch": 6.739342747789363, "grad_norm": 0.1068384200334549, "learning_rate": 1.4472917362403704e-05, "loss": 0.0371, "num_input_tokens_seen": 22825216, "step": 25535 }, { "epoch": 6.740662531344859, "grad_norm": 0.04199467599391937, "learning_rate": 1.4464013506992224e-05, "loss": 0.0332, "num_input_tokens_seen": 22829952, "step": 25540 }, { "epoch": 6.741982314900357, "grad_norm": 0.21400941908359528, "learning_rate": 1.4455111276362277e-05, "loss": 0.1039, "num_input_tokens_seen": 22834368, "step": 25545 }, { "epoch": 6.743302098455853, "grad_norm": 0.4346313774585724, "learning_rate": 1.4446210671886676e-05, "loss": 0.0494, "num_input_tokens_seen": 22838784, "step": 25550 }, { "epoch": 6.74462188201135, "grad_norm": 0.11817231774330139, "learning_rate": 1.4437311694938015e-05, "loss": 0.0229, "num_input_tokens_seen": 22843232, "step": 25555 }, { "epoch": 6.745941665566847, "grad_norm": 0.060409996658563614, "learning_rate": 1.442841434688864e-05, "loss": 0.0628, "num_input_tokens_seen": 22847808, "step": 25560 }, { "epoch": 6.747261449122344, "grad_norm": 0.33465641736984253, "learning_rate": 1.4419518629110615e-05, "loss": 0.1271, "num_input_tokens_seen": 22852224, "step": 25565 }, { "epoch": 6.748581232677841, "grad_norm": 0.18665072321891785, "learning_rate": 1.4410624542975778e-05, "loss": 0.0256, "num_input_tokens_seen": 22856736, "step": 25570 }, { "epoch": 6.749901016233338, "grad_norm": 0.11938192695379257, "learning_rate": 1.4401732089855724e-05, "loss": 0.0338, "num_input_tokens_seen": 22861088, "step": 25575 }, { "epoch": 6.751220799788834, "grad_norm": 0.24802088737487793, "learning_rate": 1.4392841271121754e-05, "loss": 0.0664, "num_input_tokens_seen": 22865440, "step": 25580 }, { "epoch": 6.752540583344332, "grad_norm": 0.09298299252986908, "learning_rate": 1.438395208814497e-05, "loss": 0.0373, "num_input_tokens_seen": 22869728, "step": 25585 }, { "epoch": 6.753860366899828, "grad_norm": 0.08434057980775833, "learning_rate": 1.4375064542296174e-05, "loss": 0.1036, "num_input_tokens_seen": 22874400, "step": 25590 }, { "epoch": 6.755180150455326, "grad_norm": 0.43707507848739624, "learning_rate": 1.4366178634945946e-05, "loss": 0.0596, "num_input_tokens_seen": 22878784, "step": 25595 }, { "epoch": 6.756499934010822, "grad_norm": 0.4586452543735504, "learning_rate": 1.4357294367464616e-05, "loss": 0.0837, "num_input_tokens_seen": 22883200, "step": 25600 }, { "epoch": 6.756499934010822, "eval_loss": 0.06741667538881302, "eval_runtime": 65.0615, "eval_samples_per_second": 103.517, "eval_steps_per_second": 25.883, "num_input_tokens_seen": 22883200, "step": 25600 }, { "epoch": 6.757819717566319, "grad_norm": 0.18190863728523254, "learning_rate": 1.434841174122224e-05, "loss": 0.031, "num_input_tokens_seen": 22887648, "step": 25605 }, { "epoch": 6.759139501121816, "grad_norm": 0.05596841871738434, "learning_rate": 1.4339530757588615e-05, "loss": 0.0221, "num_input_tokens_seen": 22891968, "step": 25610 }, { "epoch": 6.760459284677313, "grad_norm": 0.07477214932441711, "learning_rate": 1.433065141793333e-05, "loss": 0.0494, "num_input_tokens_seen": 22896512, "step": 25615 }, { "epoch": 6.76177906823281, "grad_norm": 0.08816975355148315, "learning_rate": 1.4321773723625665e-05, "loss": 0.0728, "num_input_tokens_seen": 22900960, "step": 25620 }, { "epoch": 6.763098851788307, "grad_norm": 0.2804015874862671, "learning_rate": 1.4312897676034693e-05, "loss": 0.058, "num_input_tokens_seen": 22905600, "step": 25625 }, { "epoch": 6.764418635343803, "grad_norm": 0.10673972964286804, "learning_rate": 1.4304023276529188e-05, "loss": 0.0219, "num_input_tokens_seen": 22909728, "step": 25630 }, { "epoch": 6.765738418899301, "grad_norm": 0.1238364577293396, "learning_rate": 1.4295150526477712e-05, "loss": 0.0528, "num_input_tokens_seen": 22914336, "step": 25635 }, { "epoch": 6.767058202454797, "grad_norm": 0.39629462361335754, "learning_rate": 1.4286279427248562e-05, "loss": 0.0808, "num_input_tokens_seen": 22919104, "step": 25640 }, { "epoch": 6.768377986010294, "grad_norm": 0.3609510660171509, "learning_rate": 1.4277409980209747e-05, "loss": 0.0557, "num_input_tokens_seen": 22923680, "step": 25645 }, { "epoch": 6.769697769565791, "grad_norm": 0.429675430059433, "learning_rate": 1.4268542186729061e-05, "loss": 0.0777, "num_input_tokens_seen": 22928320, "step": 25650 }, { "epoch": 6.771017553121288, "grad_norm": 0.23814165592193604, "learning_rate": 1.4259676048174043e-05, "loss": 0.0296, "num_input_tokens_seen": 22932800, "step": 25655 }, { "epoch": 6.772337336676785, "grad_norm": 0.034748248755931854, "learning_rate": 1.4250811565911937e-05, "loss": 0.0864, "num_input_tokens_seen": 22937152, "step": 25660 }, { "epoch": 6.773657120232282, "grad_norm": 0.5204176902770996, "learning_rate": 1.4241948741309782e-05, "loss": 0.0682, "num_input_tokens_seen": 22941728, "step": 25665 }, { "epoch": 6.7749769037877785, "grad_norm": 0.1144670844078064, "learning_rate": 1.4233087575734317e-05, "loss": 0.0311, "num_input_tokens_seen": 22946112, "step": 25670 }, { "epoch": 6.776296687343276, "grad_norm": 0.06653739511966705, "learning_rate": 1.422422807055206e-05, "loss": 0.0438, "num_input_tokens_seen": 22950784, "step": 25675 }, { "epoch": 6.7776164708987725, "grad_norm": 0.16192932426929474, "learning_rate": 1.4215370227129243e-05, "loss": 0.0547, "num_input_tokens_seen": 22954880, "step": 25680 }, { "epoch": 6.778936254454269, "grad_norm": 0.06337298452854156, "learning_rate": 1.4206514046831876e-05, "loss": 0.0495, "num_input_tokens_seen": 22959648, "step": 25685 }, { "epoch": 6.7802560380097665, "grad_norm": 0.1323336809873581, "learning_rate": 1.419765953102567e-05, "loss": 0.0689, "num_input_tokens_seen": 22964192, "step": 25690 }, { "epoch": 6.781575821565263, "grad_norm": 0.039876069873571396, "learning_rate": 1.4188806681076125e-05, "loss": 0.0273, "num_input_tokens_seen": 22968608, "step": 25695 }, { "epoch": 6.7828956051207605, "grad_norm": 0.17432242631912231, "learning_rate": 1.4179955498348443e-05, "loss": 0.0612, "num_input_tokens_seen": 22973024, "step": 25700 }, { "epoch": 6.784215388676257, "grad_norm": 0.15356789529323578, "learning_rate": 1.4171105984207605e-05, "loss": 0.0791, "num_input_tokens_seen": 22977504, "step": 25705 }, { "epoch": 6.785535172231754, "grad_norm": 0.43902450799942017, "learning_rate": 1.4162258140018304e-05, "loss": 0.0558, "num_input_tokens_seen": 22981888, "step": 25710 }, { "epoch": 6.786854955787251, "grad_norm": 0.08482851833105087, "learning_rate": 1.4153411967144986e-05, "loss": 0.0897, "num_input_tokens_seen": 22986624, "step": 25715 }, { "epoch": 6.788174739342748, "grad_norm": 0.35725027322769165, "learning_rate": 1.4144567466951864e-05, "loss": 0.057, "num_input_tokens_seen": 22991008, "step": 25720 }, { "epoch": 6.789494522898245, "grad_norm": 0.25625497102737427, "learning_rate": 1.4135724640802844e-05, "loss": 0.1019, "num_input_tokens_seen": 22995680, "step": 25725 }, { "epoch": 6.790814306453742, "grad_norm": 0.22969114780426025, "learning_rate": 1.4126883490061615e-05, "loss": 0.0516, "num_input_tokens_seen": 23000128, "step": 25730 }, { "epoch": 6.792134090009238, "grad_norm": 0.24468931555747986, "learning_rate": 1.4118044016091603e-05, "loss": 0.0535, "num_input_tokens_seen": 23004832, "step": 25735 }, { "epoch": 6.793453873564736, "grad_norm": 0.09296294301748276, "learning_rate": 1.410920622025594e-05, "loss": 0.0455, "num_input_tokens_seen": 23009184, "step": 25740 }, { "epoch": 6.794773657120232, "grad_norm": 0.1221107766032219, "learning_rate": 1.4100370103917554e-05, "loss": 0.0196, "num_input_tokens_seen": 23013920, "step": 25745 }, { "epoch": 6.79609344067573, "grad_norm": 0.05854547768831253, "learning_rate": 1.409153566843907e-05, "loss": 0.1117, "num_input_tokens_seen": 23018400, "step": 25750 }, { "epoch": 6.797413224231226, "grad_norm": 0.052798379212617874, "learning_rate": 1.408270291518286e-05, "loss": 0.0168, "num_input_tokens_seen": 23022912, "step": 25755 }, { "epoch": 6.798733007786723, "grad_norm": 0.12160706520080566, "learning_rate": 1.407387184551107e-05, "loss": 0.0814, "num_input_tokens_seen": 23027296, "step": 25760 }, { "epoch": 6.80005279134222, "grad_norm": 0.5155794620513916, "learning_rate": 1.4065042460785532e-05, "loss": 0.0808, "num_input_tokens_seen": 23031392, "step": 25765 }, { "epoch": 6.801372574897717, "grad_norm": 0.11116791516542435, "learning_rate": 1.405621476236787e-05, "loss": 0.0427, "num_input_tokens_seen": 23035584, "step": 25770 }, { "epoch": 6.802692358453213, "grad_norm": 0.2727549076080322, "learning_rate": 1.4047388751619423e-05, "loss": 0.0506, "num_input_tokens_seen": 23040288, "step": 25775 }, { "epoch": 6.804012142008711, "grad_norm": 0.18266388773918152, "learning_rate": 1.4038564429901264e-05, "loss": 0.0335, "num_input_tokens_seen": 23044800, "step": 25780 }, { "epoch": 6.805331925564207, "grad_norm": 0.0636642724275589, "learning_rate": 1.4029741798574227e-05, "loss": 0.0189, "num_input_tokens_seen": 23049184, "step": 25785 }, { "epoch": 6.806651709119705, "grad_norm": 0.1771543323993683, "learning_rate": 1.402092085899886e-05, "loss": 0.0471, "num_input_tokens_seen": 23053952, "step": 25790 }, { "epoch": 6.807971492675201, "grad_norm": 0.12185461074113846, "learning_rate": 1.4012101612535464e-05, "loss": 0.0866, "num_input_tokens_seen": 23058720, "step": 25795 }, { "epoch": 6.809291276230698, "grad_norm": 0.20256946980953217, "learning_rate": 1.4003284060544092e-05, "loss": 0.0482, "num_input_tokens_seen": 23063104, "step": 25800 }, { "epoch": 6.809291276230698, "eval_loss": 0.06739328056573868, "eval_runtime": 64.7738, "eval_samples_per_second": 103.977, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 23063104, "step": 25800 }, { "epoch": 6.810611059786195, "grad_norm": 0.4812100827693939, "learning_rate": 1.3994468204384504e-05, "loss": 0.0887, "num_input_tokens_seen": 23067360, "step": 25805 }, { "epoch": 6.811930843341692, "grad_norm": 0.23098504543304443, "learning_rate": 1.398565404541622e-05, "loss": 0.0704, "num_input_tokens_seen": 23071520, "step": 25810 }, { "epoch": 6.813250626897188, "grad_norm": 0.39465436339378357, "learning_rate": 1.3976841584998513e-05, "loss": 0.074, "num_input_tokens_seen": 23076384, "step": 25815 }, { "epoch": 6.814570410452686, "grad_norm": 0.23772825300693512, "learning_rate": 1.3968030824490352e-05, "loss": 0.0705, "num_input_tokens_seen": 23080896, "step": 25820 }, { "epoch": 6.815890194008182, "grad_norm": 0.22949117422103882, "learning_rate": 1.3959221765250469e-05, "loss": 0.0496, "num_input_tokens_seen": 23085216, "step": 25825 }, { "epoch": 6.81720997756368, "grad_norm": 0.3033384084701538, "learning_rate": 1.3950414408637343e-05, "loss": 0.0634, "num_input_tokens_seen": 23089472, "step": 25830 }, { "epoch": 6.818529761119176, "grad_norm": 0.4588758051395416, "learning_rate": 1.3941608756009166e-05, "loss": 0.0624, "num_input_tokens_seen": 23093824, "step": 25835 }, { "epoch": 6.819849544674673, "grad_norm": 0.18282847106456757, "learning_rate": 1.3932804808723898e-05, "loss": 0.0225, "num_input_tokens_seen": 23098304, "step": 25840 }, { "epoch": 6.82116932823017, "grad_norm": 0.05114181712269783, "learning_rate": 1.3924002568139194e-05, "loss": 0.0769, "num_input_tokens_seen": 23102752, "step": 25845 }, { "epoch": 6.822489111785667, "grad_norm": 0.2718784511089325, "learning_rate": 1.3915202035612485e-05, "loss": 0.0323, "num_input_tokens_seen": 23107360, "step": 25850 }, { "epoch": 6.823808895341164, "grad_norm": 0.09425494074821472, "learning_rate": 1.3906403212500935e-05, "loss": 0.0421, "num_input_tokens_seen": 23111968, "step": 25855 }, { "epoch": 6.825128678896661, "grad_norm": 0.18809205293655396, "learning_rate": 1.3897606100161409e-05, "loss": 0.0424, "num_input_tokens_seen": 23116416, "step": 25860 }, { "epoch": 6.8264484624521575, "grad_norm": 0.08508098870515823, "learning_rate": 1.388881069995055e-05, "loss": 0.0574, "num_input_tokens_seen": 23121184, "step": 25865 }, { "epoch": 6.827768246007655, "grad_norm": 0.3545161783695221, "learning_rate": 1.3880017013224708e-05, "loss": 0.1181, "num_input_tokens_seen": 23125536, "step": 25870 }, { "epoch": 6.8290880295631515, "grad_norm": 0.3881031572818756, "learning_rate": 1.3871225041339984e-05, "loss": 0.1011, "num_input_tokens_seen": 23129824, "step": 25875 }, { "epoch": 6.830407813118649, "grad_norm": 0.09362603724002838, "learning_rate": 1.386243478565222e-05, "loss": 0.0491, "num_input_tokens_seen": 23134496, "step": 25880 }, { "epoch": 6.8317275966741455, "grad_norm": 0.1557009518146515, "learning_rate": 1.3853646247516966e-05, "loss": 0.0402, "num_input_tokens_seen": 23138848, "step": 25885 }, { "epoch": 6.833047380229642, "grad_norm": 0.2811470031738281, "learning_rate": 1.3844859428289545e-05, "loss": 0.0365, "num_input_tokens_seen": 23143488, "step": 25890 }, { "epoch": 6.8343671637851395, "grad_norm": 0.15471263229846954, "learning_rate": 1.3836074329324984e-05, "loss": 0.0275, "num_input_tokens_seen": 23147712, "step": 25895 }, { "epoch": 6.835686947340636, "grad_norm": 0.3714939057826996, "learning_rate": 1.3827290951978044e-05, "loss": 0.0427, "num_input_tokens_seen": 23152192, "step": 25900 }, { "epoch": 6.8370067308961335, "grad_norm": 0.19825437664985657, "learning_rate": 1.381850929760326e-05, "loss": 0.0449, "num_input_tokens_seen": 23156576, "step": 25905 }, { "epoch": 6.83832651445163, "grad_norm": 0.12204756587743759, "learning_rate": 1.3809729367554842e-05, "loss": 0.0371, "num_input_tokens_seen": 23161184, "step": 25910 }, { "epoch": 6.839646298007127, "grad_norm": 0.06243292614817619, "learning_rate": 1.3800951163186784e-05, "loss": 0.0158, "num_input_tokens_seen": 23165632, "step": 25915 }, { "epoch": 6.840966081562624, "grad_norm": 0.16030694544315338, "learning_rate": 1.3792174685852801e-05, "loss": 0.1162, "num_input_tokens_seen": 23170240, "step": 25920 }, { "epoch": 6.842285865118121, "grad_norm": 0.06883686035871506, "learning_rate": 1.378339993690632e-05, "loss": 0.0678, "num_input_tokens_seen": 23174880, "step": 25925 }, { "epoch": 6.843605648673617, "grad_norm": 0.19646264612674713, "learning_rate": 1.3774626917700523e-05, "loss": 0.0542, "num_input_tokens_seen": 23179488, "step": 25930 }, { "epoch": 6.844925432229115, "grad_norm": 0.34963467717170715, "learning_rate": 1.3765855629588334e-05, "loss": 0.0593, "num_input_tokens_seen": 23183744, "step": 25935 }, { "epoch": 6.846245215784611, "grad_norm": 0.7438236474990845, "learning_rate": 1.3757086073922374e-05, "loss": 0.1082, "num_input_tokens_seen": 23188416, "step": 25940 }, { "epoch": 6.847564999340108, "grad_norm": 0.2681655287742615, "learning_rate": 1.3748318252055038e-05, "loss": 0.0999, "num_input_tokens_seen": 23192992, "step": 25945 }, { "epoch": 6.848884782895605, "grad_norm": 0.44821324944496155, "learning_rate": 1.3739552165338416e-05, "loss": 0.071, "num_input_tokens_seen": 23197600, "step": 25950 }, { "epoch": 6.850204566451102, "grad_norm": 0.13945230841636658, "learning_rate": 1.3730787815124354e-05, "loss": 0.0465, "num_input_tokens_seen": 23202144, "step": 25955 }, { "epoch": 6.851524350006599, "grad_norm": 0.1082872673869133, "learning_rate": 1.3722025202764443e-05, "loss": 0.0956, "num_input_tokens_seen": 23206624, "step": 25960 }, { "epoch": 6.852844133562096, "grad_norm": 0.1621987372636795, "learning_rate": 1.371326432960997e-05, "loss": 0.0342, "num_input_tokens_seen": 23211264, "step": 25965 }, { "epoch": 6.854163917117592, "grad_norm": 0.2991095781326294, "learning_rate": 1.3704505197011969e-05, "loss": 0.0364, "num_input_tokens_seen": 23215648, "step": 25970 }, { "epoch": 6.85548370067309, "grad_norm": 0.3187922239303589, "learning_rate": 1.3695747806321224e-05, "loss": 0.0538, "num_input_tokens_seen": 23220032, "step": 25975 }, { "epoch": 6.856803484228586, "grad_norm": 0.3495848476886749, "learning_rate": 1.3686992158888212e-05, "loss": 0.0715, "num_input_tokens_seen": 23224448, "step": 25980 }, { "epoch": 6.858123267784084, "grad_norm": 0.20014162361621857, "learning_rate": 1.367823825606319e-05, "loss": 0.0622, "num_input_tokens_seen": 23228864, "step": 25985 }, { "epoch": 6.85944305133958, "grad_norm": 0.10503510385751724, "learning_rate": 1.36694860991961e-05, "loss": 0.1124, "num_input_tokens_seen": 23233312, "step": 25990 }, { "epoch": 6.860762834895077, "grad_norm": 0.3018341064453125, "learning_rate": 1.3660735689636636e-05, "loss": 0.0642, "num_input_tokens_seen": 23237792, "step": 25995 }, { "epoch": 6.862082618450574, "grad_norm": 0.36216849088668823, "learning_rate": 1.365198702873424e-05, "loss": 0.0759, "num_input_tokens_seen": 23242080, "step": 26000 }, { "epoch": 6.862082618450574, "eval_loss": 0.06706790626049042, "eval_runtime": 64.7579, "eval_samples_per_second": 104.003, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 23242080, "step": 26000 }, { "epoch": 6.863402402006071, "grad_norm": 0.18120232224464417, "learning_rate": 1.364324011783804e-05, "loss": 0.0346, "num_input_tokens_seen": 23246496, "step": 26005 }, { "epoch": 6.864722185561568, "grad_norm": 0.09264125674962997, "learning_rate": 1.3634494958296934e-05, "loss": 0.048, "num_input_tokens_seen": 23250944, "step": 26010 }, { "epoch": 6.866041969117065, "grad_norm": 0.04794415459036827, "learning_rate": 1.3625751551459542e-05, "loss": 0.0393, "num_input_tokens_seen": 23254912, "step": 26015 }, { "epoch": 6.867361752672561, "grad_norm": 0.057788681238889694, "learning_rate": 1.3617009898674188e-05, "loss": 0.0153, "num_input_tokens_seen": 23259552, "step": 26020 }, { "epoch": 6.868681536228059, "grad_norm": 0.39926236867904663, "learning_rate": 1.3608270001288967e-05, "loss": 0.0815, "num_input_tokens_seen": 23264416, "step": 26025 }, { "epoch": 6.870001319783555, "grad_norm": 0.06625698506832123, "learning_rate": 1.359953186065166e-05, "loss": 0.0626, "num_input_tokens_seen": 23269056, "step": 26030 }, { "epoch": 6.871321103339053, "grad_norm": 0.18107272684574127, "learning_rate": 1.3590795478109814e-05, "loss": 0.0469, "num_input_tokens_seen": 23273408, "step": 26035 }, { "epoch": 6.872640886894549, "grad_norm": 0.22623808681964874, "learning_rate": 1.3582060855010675e-05, "loss": 0.0633, "num_input_tokens_seen": 23278080, "step": 26040 }, { "epoch": 6.873960670450046, "grad_norm": 0.49687936902046204, "learning_rate": 1.3573327992701245e-05, "loss": 0.0774, "num_input_tokens_seen": 23282784, "step": 26045 }, { "epoch": 6.875280454005543, "grad_norm": 0.21760812401771545, "learning_rate": 1.356459689252823e-05, "loss": 0.071, "num_input_tokens_seen": 23287392, "step": 26050 }, { "epoch": 6.87660023756104, "grad_norm": 0.08660692721605301, "learning_rate": 1.3555867555838087e-05, "loss": 0.0135, "num_input_tokens_seen": 23291872, "step": 26055 }, { "epoch": 6.8779200211165366, "grad_norm": 0.1054881289601326, "learning_rate": 1.3547139983976975e-05, "loss": 0.0481, "num_input_tokens_seen": 23296128, "step": 26060 }, { "epoch": 6.879239804672034, "grad_norm": 0.3173291087150574, "learning_rate": 1.3538414178290815e-05, "loss": 0.0329, "num_input_tokens_seen": 23300352, "step": 26065 }, { "epoch": 6.8805595882275306, "grad_norm": 0.20991815626621246, "learning_rate": 1.3529690140125209e-05, "loss": 0.0322, "num_input_tokens_seen": 23304864, "step": 26070 }, { "epoch": 6.881879371783027, "grad_norm": 0.2373851090669632, "learning_rate": 1.352096787082553e-05, "loss": 0.0758, "num_input_tokens_seen": 23309280, "step": 26075 }, { "epoch": 6.8831991553385246, "grad_norm": 0.11811939626932144, "learning_rate": 1.3512247371736871e-05, "loss": 0.035, "num_input_tokens_seen": 23313664, "step": 26080 }, { "epoch": 6.884518938894021, "grad_norm": 0.22317059338092804, "learning_rate": 1.3503528644204022e-05, "loss": 0.0816, "num_input_tokens_seen": 23318688, "step": 26085 }, { "epoch": 6.885838722449519, "grad_norm": 0.17305819690227509, "learning_rate": 1.349481168957153e-05, "loss": 0.0268, "num_input_tokens_seen": 23323008, "step": 26090 }, { "epoch": 6.887158506005015, "grad_norm": 0.21702373027801514, "learning_rate": 1.3486096509183665e-05, "loss": 0.0476, "num_input_tokens_seen": 23327360, "step": 26095 }, { "epoch": 6.888478289560512, "grad_norm": 0.24821020662784576, "learning_rate": 1.3477383104384406e-05, "loss": 0.0543, "num_input_tokens_seen": 23331968, "step": 26100 }, { "epoch": 6.889798073116009, "grad_norm": 0.5861372351646423, "learning_rate": 1.3468671476517481e-05, "loss": 0.0629, "num_input_tokens_seen": 23336640, "step": 26105 }, { "epoch": 6.891117856671506, "grad_norm": 0.28352785110473633, "learning_rate": 1.3459961626926326e-05, "loss": 0.0278, "num_input_tokens_seen": 23340992, "step": 26110 }, { "epoch": 6.892437640227003, "grad_norm": 0.03513330593705177, "learning_rate": 1.3451253556954101e-05, "loss": 0.034, "num_input_tokens_seen": 23345632, "step": 26115 }, { "epoch": 6.8937574237825, "grad_norm": 0.35132622718811035, "learning_rate": 1.3442547267943717e-05, "loss": 0.0504, "num_input_tokens_seen": 23350112, "step": 26120 }, { "epoch": 6.895077207337996, "grad_norm": 0.3103977143764496, "learning_rate": 1.3433842761237774e-05, "loss": 0.0445, "num_input_tokens_seen": 23354528, "step": 26125 }, { "epoch": 6.896396990893494, "grad_norm": 0.4659053385257721, "learning_rate": 1.3425140038178639e-05, "loss": 0.0976, "num_input_tokens_seen": 23358816, "step": 26130 }, { "epoch": 6.89771677444899, "grad_norm": 0.24028538167476654, "learning_rate": 1.3416439100108358e-05, "loss": 0.0427, "num_input_tokens_seen": 23363392, "step": 26135 }, { "epoch": 6.899036558004488, "grad_norm": 0.10888324677944183, "learning_rate": 1.3407739948368734e-05, "loss": 0.0597, "num_input_tokens_seen": 23367712, "step": 26140 }, { "epoch": 6.900356341559984, "grad_norm": 0.08844508975744247, "learning_rate": 1.3399042584301298e-05, "loss": 0.0478, "num_input_tokens_seen": 23372384, "step": 26145 }, { "epoch": 6.901676125115481, "grad_norm": 0.2642793655395508, "learning_rate": 1.3390347009247272e-05, "loss": 0.0543, "num_input_tokens_seen": 23376544, "step": 26150 }, { "epoch": 6.902995908670978, "grad_norm": 0.0783705785870552, "learning_rate": 1.3381653224547635e-05, "loss": 0.0741, "num_input_tokens_seen": 23380928, "step": 26155 }, { "epoch": 6.904315692226475, "grad_norm": 0.3414996862411499, "learning_rate": 1.3372961231543086e-05, "loss": 0.0691, "num_input_tokens_seen": 23385568, "step": 26160 }, { "epoch": 6.905635475781972, "grad_norm": 0.20869261026382446, "learning_rate": 1.3364271031574016e-05, "loss": 0.0785, "num_input_tokens_seen": 23390176, "step": 26165 }, { "epoch": 6.906955259337469, "grad_norm": 0.37239232659339905, "learning_rate": 1.335558262598059e-05, "loss": 0.0375, "num_input_tokens_seen": 23394656, "step": 26170 }, { "epoch": 6.908275042892965, "grad_norm": 0.051340557634830475, "learning_rate": 1.3346896016102645e-05, "loss": 0.0835, "num_input_tokens_seen": 23399296, "step": 26175 }, { "epoch": 6.909594826448463, "grad_norm": 0.33996284008026123, "learning_rate": 1.3338211203279788e-05, "loss": 0.0502, "num_input_tokens_seen": 23403776, "step": 26180 }, { "epoch": 6.910914610003959, "grad_norm": 0.050017885863780975, "learning_rate": 1.3329528188851303e-05, "loss": 0.0569, "num_input_tokens_seen": 23408192, "step": 26185 }, { "epoch": 6.912234393559456, "grad_norm": 0.2997513711452484, "learning_rate": 1.3320846974156242e-05, "loss": 0.0473, "num_input_tokens_seen": 23412416, "step": 26190 }, { "epoch": 6.913554177114953, "grad_norm": 0.2910624146461487, "learning_rate": 1.3312167560533337e-05, "loss": 0.1263, "num_input_tokens_seen": 23416800, "step": 26195 }, { "epoch": 6.91487396067045, "grad_norm": 0.02987534925341606, "learning_rate": 1.3303489949321082e-05, "loss": 0.0229, "num_input_tokens_seen": 23421312, "step": 26200 }, { "epoch": 6.91487396067045, "eval_loss": 0.06737001985311508, "eval_runtime": 64.7831, "eval_samples_per_second": 103.962, "eval_steps_per_second": 25.994, "num_input_tokens_seen": 23421312, "step": 26200 }, { "epoch": 6.9161937442259465, "grad_norm": 0.10874485969543457, "learning_rate": 1.3294814141857653e-05, "loss": 0.034, "num_input_tokens_seen": 23425888, "step": 26205 }, { "epoch": 6.917513527781444, "grad_norm": 0.02958831377327442, "learning_rate": 1.3286140139480992e-05, "loss": 0.0718, "num_input_tokens_seen": 23430432, "step": 26210 }, { "epoch": 6.9188333113369405, "grad_norm": 0.2245890349149704, "learning_rate": 1.3277467943528719e-05, "loss": 0.0641, "num_input_tokens_seen": 23435264, "step": 26215 }, { "epoch": 6.920153094892438, "grad_norm": 0.03356247395277023, "learning_rate": 1.3268797555338203e-05, "loss": 0.083, "num_input_tokens_seen": 23439808, "step": 26220 }, { "epoch": 6.9214728784479345, "grad_norm": 0.4324404299259186, "learning_rate": 1.3260128976246533e-05, "loss": 0.0527, "num_input_tokens_seen": 23444128, "step": 26225 }, { "epoch": 6.922792662003431, "grad_norm": 0.25428101420402527, "learning_rate": 1.32514622075905e-05, "loss": 0.0668, "num_input_tokens_seen": 23448544, "step": 26230 }, { "epoch": 6.9241124455589285, "grad_norm": 0.19580595195293427, "learning_rate": 1.3242797250706638e-05, "loss": 0.0457, "num_input_tokens_seen": 23453216, "step": 26235 }, { "epoch": 6.925432229114425, "grad_norm": 0.45693421363830566, "learning_rate": 1.3234134106931195e-05, "loss": 0.0888, "num_input_tokens_seen": 23457856, "step": 26240 }, { "epoch": 6.9267520126699225, "grad_norm": 0.17358365654945374, "learning_rate": 1.322547277760013e-05, "loss": 0.0417, "num_input_tokens_seen": 23462400, "step": 26245 }, { "epoch": 6.928071796225419, "grad_norm": 0.1895885169506073, "learning_rate": 1.3216813264049132e-05, "loss": 0.0184, "num_input_tokens_seen": 23466848, "step": 26250 }, { "epoch": 6.929391579780916, "grad_norm": 0.062415365129709244, "learning_rate": 1.32081555676136e-05, "loss": 0.0439, "num_input_tokens_seen": 23471200, "step": 26255 }, { "epoch": 6.930711363336413, "grad_norm": 0.12897033989429474, "learning_rate": 1.3199499689628674e-05, "loss": 0.0369, "num_input_tokens_seen": 23475488, "step": 26260 }, { "epoch": 6.93203114689191, "grad_norm": 0.38613736629486084, "learning_rate": 1.3190845631429192e-05, "loss": 0.0587, "num_input_tokens_seen": 23479872, "step": 26265 }, { "epoch": 6.933350930447407, "grad_norm": 0.14430344104766846, "learning_rate": 1.3182193394349704e-05, "loss": 0.0704, "num_input_tokens_seen": 23484512, "step": 26270 }, { "epoch": 6.934670714002904, "grad_norm": 0.285412073135376, "learning_rate": 1.3173542979724507e-05, "loss": 0.0751, "num_input_tokens_seen": 23488992, "step": 26275 }, { "epoch": 6.9359904975584, "grad_norm": 0.08865345269441605, "learning_rate": 1.3164894388887617e-05, "loss": 0.0248, "num_input_tokens_seen": 23493216, "step": 26280 }, { "epoch": 6.937310281113898, "grad_norm": 0.37336641550064087, "learning_rate": 1.3156247623172727e-05, "loss": 0.0952, "num_input_tokens_seen": 23497536, "step": 26285 }, { "epoch": 6.938630064669394, "grad_norm": 0.17545442283153534, "learning_rate": 1.3147602683913302e-05, "loss": 0.0303, "num_input_tokens_seen": 23501888, "step": 26290 }, { "epoch": 6.939949848224892, "grad_norm": 0.06161971762776375, "learning_rate": 1.3138959572442481e-05, "loss": 0.0259, "num_input_tokens_seen": 23506560, "step": 26295 }, { "epoch": 6.941269631780388, "grad_norm": 0.2512925863265991, "learning_rate": 1.3130318290093146e-05, "loss": 0.0403, "num_input_tokens_seen": 23511104, "step": 26300 }, { "epoch": 6.942589415335885, "grad_norm": 0.38723674416542053, "learning_rate": 1.3121678838197909e-05, "loss": 0.0401, "num_input_tokens_seen": 23515392, "step": 26305 }, { "epoch": 6.943909198891382, "grad_norm": 0.03177139163017273, "learning_rate": 1.3113041218089056e-05, "loss": 0.1097, "num_input_tokens_seen": 23519680, "step": 26310 }, { "epoch": 6.945228982446879, "grad_norm": 0.11637531965970993, "learning_rate": 1.3104405431098626e-05, "loss": 0.0674, "num_input_tokens_seen": 23524192, "step": 26315 }, { "epoch": 6.946548766002375, "grad_norm": 0.34624460339546204, "learning_rate": 1.3095771478558377e-05, "loss": 0.0349, "num_input_tokens_seen": 23528576, "step": 26320 }, { "epoch": 6.947868549557873, "grad_norm": 0.3507158160209656, "learning_rate": 1.3087139361799766e-05, "loss": 0.0425, "num_input_tokens_seen": 23533376, "step": 26325 }, { "epoch": 6.949188333113369, "grad_norm": 0.2701948881149292, "learning_rate": 1.3078509082153964e-05, "loss": 0.0395, "num_input_tokens_seen": 23538048, "step": 26330 }, { "epoch": 6.950508116668866, "grad_norm": 0.11996275931596756, "learning_rate": 1.3069880640951885e-05, "loss": 0.0299, "num_input_tokens_seen": 23542336, "step": 26335 }, { "epoch": 6.951827900224363, "grad_norm": 0.2698823809623718, "learning_rate": 1.3061254039524123e-05, "loss": 0.037, "num_input_tokens_seen": 23546624, "step": 26340 }, { "epoch": 6.95314768377986, "grad_norm": 0.2736412286758423, "learning_rate": 1.3052629279201028e-05, "loss": 0.0554, "num_input_tokens_seen": 23551104, "step": 26345 }, { "epoch": 6.954467467335357, "grad_norm": 0.10672712326049805, "learning_rate": 1.3044006361312633e-05, "loss": 0.0763, "num_input_tokens_seen": 23555104, "step": 26350 }, { "epoch": 6.955787250890854, "grad_norm": 0.22339585423469543, "learning_rate": 1.30353852871887e-05, "loss": 0.0722, "num_input_tokens_seen": 23559616, "step": 26355 }, { "epoch": 6.95710703444635, "grad_norm": 0.4189163148403168, "learning_rate": 1.302676605815873e-05, "loss": 0.0339, "num_input_tokens_seen": 23564064, "step": 26360 }, { "epoch": 6.958426818001848, "grad_norm": 0.2866857945919037, "learning_rate": 1.3018148675551884e-05, "loss": 0.0778, "num_input_tokens_seen": 23568320, "step": 26365 }, { "epoch": 6.959746601557344, "grad_norm": 0.24438057839870453, "learning_rate": 1.3009533140697094e-05, "loss": 0.1338, "num_input_tokens_seen": 23572640, "step": 26370 }, { "epoch": 6.961066385112842, "grad_norm": 0.5954139828681946, "learning_rate": 1.3000919454922966e-05, "loss": 0.0846, "num_input_tokens_seen": 23576992, "step": 26375 }, { "epoch": 6.962386168668338, "grad_norm": 0.21665580570697784, "learning_rate": 1.299230761955785e-05, "loss": 0.0604, "num_input_tokens_seen": 23581312, "step": 26380 }, { "epoch": 6.963705952223835, "grad_norm": 0.02980898879468441, "learning_rate": 1.2983697635929807e-05, "loss": 0.0488, "num_input_tokens_seen": 23586240, "step": 26385 }, { "epoch": 6.965025735779332, "grad_norm": 0.10365483164787292, "learning_rate": 1.2975089505366584e-05, "loss": 0.0295, "num_input_tokens_seen": 23590560, "step": 26390 }, { "epoch": 6.966345519334829, "grad_norm": 0.05207838490605354, "learning_rate": 1.2966483229195683e-05, "loss": 0.0127, "num_input_tokens_seen": 23594624, "step": 26395 }, { "epoch": 6.967665302890326, "grad_norm": 0.21801352500915527, "learning_rate": 1.2957878808744283e-05, "loss": 0.0648, "num_input_tokens_seen": 23599008, "step": 26400 }, { "epoch": 6.967665302890326, "eval_loss": 0.06753312051296234, "eval_runtime": 64.7619, "eval_samples_per_second": 103.996, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 23599008, "step": 26400 }, { "epoch": 6.968985086445823, "grad_norm": 0.22268319129943848, "learning_rate": 1.294927624533931e-05, "loss": 0.0582, "num_input_tokens_seen": 23603296, "step": 26405 }, { "epoch": 6.9703048700013195, "grad_norm": 0.1794978231191635, "learning_rate": 1.2940675540307378e-05, "loss": 0.0366, "num_input_tokens_seen": 23607584, "step": 26410 }, { "epoch": 6.971624653556817, "grad_norm": 0.14915111660957336, "learning_rate": 1.2932076694974814e-05, "loss": 0.0237, "num_input_tokens_seen": 23611840, "step": 26415 }, { "epoch": 6.9729444371123135, "grad_norm": 0.05516654998064041, "learning_rate": 1.2923479710667682e-05, "loss": 0.1428, "num_input_tokens_seen": 23616448, "step": 26420 }, { "epoch": 6.974264220667811, "grad_norm": 0.23909355700016022, "learning_rate": 1.2914884588711751e-05, "loss": 0.036, "num_input_tokens_seen": 23620928, "step": 26425 }, { "epoch": 6.9755840042233075, "grad_norm": 0.06487134099006653, "learning_rate": 1.2906291330432475e-05, "loss": 0.0409, "num_input_tokens_seen": 23625504, "step": 26430 }, { "epoch": 6.976903787778804, "grad_norm": 0.01848084293305874, "learning_rate": 1.2897699937155055e-05, "loss": 0.081, "num_input_tokens_seen": 23630176, "step": 26435 }, { "epoch": 6.9782235713343015, "grad_norm": 0.2839110493659973, "learning_rate": 1.2889110410204403e-05, "loss": 0.1151, "num_input_tokens_seen": 23634528, "step": 26440 }, { "epoch": 6.979543354889798, "grad_norm": 0.10108304768800735, "learning_rate": 1.2880522750905111e-05, "loss": 0.0656, "num_input_tokens_seen": 23638816, "step": 26445 }, { "epoch": 6.980863138445295, "grad_norm": 0.2128608524799347, "learning_rate": 1.2871936960581523e-05, "loss": 0.0797, "num_input_tokens_seen": 23643360, "step": 26450 }, { "epoch": 6.982182922000792, "grad_norm": 0.09511413425207138, "learning_rate": 1.2863353040557658e-05, "loss": 0.0432, "num_input_tokens_seen": 23647872, "step": 26455 }, { "epoch": 6.983502705556289, "grad_norm": 0.35618311166763306, "learning_rate": 1.2854770992157273e-05, "loss": 0.084, "num_input_tokens_seen": 23652352, "step": 26460 }, { "epoch": 6.984822489111786, "grad_norm": 0.288748174905777, "learning_rate": 1.2846190816703835e-05, "loss": 0.0667, "num_input_tokens_seen": 23656864, "step": 26465 }, { "epoch": 6.986142272667283, "grad_norm": 0.3650299906730652, "learning_rate": 1.2837612515520498e-05, "loss": 0.105, "num_input_tokens_seen": 23661344, "step": 26470 }, { "epoch": 6.987462056222779, "grad_norm": 0.19632835686206818, "learning_rate": 1.2829036089930163e-05, "loss": 0.0326, "num_input_tokens_seen": 23665952, "step": 26475 }, { "epoch": 6.988781839778277, "grad_norm": 0.0808877944946289, "learning_rate": 1.2820461541255412e-05, "loss": 0.0342, "num_input_tokens_seen": 23670752, "step": 26480 }, { "epoch": 6.990101623333773, "grad_norm": 0.19864478707313538, "learning_rate": 1.2811888870818543e-05, "loss": 0.0382, "num_input_tokens_seen": 23675584, "step": 26485 }, { "epoch": 6.99142140688927, "grad_norm": 0.1210024431347847, "learning_rate": 1.2803318079941581e-05, "loss": 0.0506, "num_input_tokens_seen": 23679840, "step": 26490 }, { "epoch": 6.992741190444767, "grad_norm": 0.0403180867433548, "learning_rate": 1.2794749169946235e-05, "loss": 0.0432, "num_input_tokens_seen": 23684192, "step": 26495 }, { "epoch": 6.994060974000264, "grad_norm": 0.3332643508911133, "learning_rate": 1.2786182142153952e-05, "loss": 0.1321, "num_input_tokens_seen": 23688512, "step": 26500 }, { "epoch": 6.995380757555761, "grad_norm": 0.46026432514190674, "learning_rate": 1.2777616997885878e-05, "loss": 0.057, "num_input_tokens_seen": 23693152, "step": 26505 }, { "epoch": 6.996700541111258, "grad_norm": 0.3616117537021637, "learning_rate": 1.2769053738462847e-05, "loss": 0.0525, "num_input_tokens_seen": 23697600, "step": 26510 }, { "epoch": 6.998020324666754, "grad_norm": 0.15544871985912323, "learning_rate": 1.2760492365205434e-05, "loss": 0.0402, "num_input_tokens_seen": 23702208, "step": 26515 }, { "epoch": 6.999340108222252, "grad_norm": 0.13193447887897491, "learning_rate": 1.2751932879433919e-05, "loss": 0.0384, "num_input_tokens_seen": 23706720, "step": 26520 }, { "epoch": 7.000527913422199, "grad_norm": 0.09995996206998825, "learning_rate": 1.2743375282468267e-05, "loss": 0.0951, "num_input_tokens_seen": 23710928, "step": 26525 }, { "epoch": 7.001847696977696, "grad_norm": 0.3785555362701416, "learning_rate": 1.2734819575628182e-05, "loss": 0.0359, "num_input_tokens_seen": 23715696, "step": 26530 }, { "epoch": 7.003167480533192, "grad_norm": 0.07407134026288986, "learning_rate": 1.2726265760233039e-05, "loss": 0.0513, "num_input_tokens_seen": 23720176, "step": 26535 }, { "epoch": 7.00448726408869, "grad_norm": 0.040014345198869705, "learning_rate": 1.271771383760197e-05, "loss": 0.0443, "num_input_tokens_seen": 23724464, "step": 26540 }, { "epoch": 7.005807047644186, "grad_norm": 0.21336564421653748, "learning_rate": 1.2709163809053764e-05, "loss": 0.0212, "num_input_tokens_seen": 23728976, "step": 26545 }, { "epoch": 7.007126831199683, "grad_norm": 0.3592245578765869, "learning_rate": 1.2700615675906963e-05, "loss": 0.0476, "num_input_tokens_seen": 23733072, "step": 26550 }, { "epoch": 7.00844661475518, "grad_norm": 0.07405228912830353, "learning_rate": 1.269206943947978e-05, "loss": 0.0434, "num_input_tokens_seen": 23737616, "step": 26555 }, { "epoch": 7.009766398310677, "grad_norm": 0.1678626984357834, "learning_rate": 1.2683525101090177e-05, "loss": 0.038, "num_input_tokens_seen": 23742064, "step": 26560 }, { "epoch": 7.011086181866174, "grad_norm": 0.2535615861415863, "learning_rate": 1.2674982662055765e-05, "loss": 0.0813, "num_input_tokens_seen": 23746224, "step": 26565 }, { "epoch": 7.012405965421671, "grad_norm": 0.17748519778251648, "learning_rate": 1.2666442123693922e-05, "loss": 0.0424, "num_input_tokens_seen": 23750736, "step": 26570 }, { "epoch": 7.013725748977167, "grad_norm": 0.22192297875881195, "learning_rate": 1.265790348732169e-05, "loss": 0.1172, "num_input_tokens_seen": 23754864, "step": 26575 }, { "epoch": 7.015045532532665, "grad_norm": 0.599966824054718, "learning_rate": 1.264936675425584e-05, "loss": 0.1043, "num_input_tokens_seen": 23759504, "step": 26580 }, { "epoch": 7.016365316088161, "grad_norm": 0.2450350522994995, "learning_rate": 1.2640831925812852e-05, "loss": 0.029, "num_input_tokens_seen": 23764176, "step": 26585 }, { "epoch": 7.017685099643659, "grad_norm": 0.14555564522743225, "learning_rate": 1.263229900330889e-05, "loss": 0.0372, "num_input_tokens_seen": 23768784, "step": 26590 }, { "epoch": 7.019004883199155, "grad_norm": 0.1665077954530716, "learning_rate": 1.2623767988059843e-05, "loss": 0.0356, "num_input_tokens_seen": 23773008, "step": 26595 }, { "epoch": 7.020324666754652, "grad_norm": 0.3017309010028839, "learning_rate": 1.2615238881381309e-05, "loss": 0.0823, "num_input_tokens_seen": 23777520, "step": 26600 }, { "epoch": 7.020324666754652, "eval_loss": 0.06739455461502075, "eval_runtime": 64.778, "eval_samples_per_second": 103.97, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 23777520, "step": 26600 }, { "epoch": 7.021644450310149, "grad_norm": 0.20437005162239075, "learning_rate": 1.2606711684588568e-05, "loss": 0.0437, "num_input_tokens_seen": 23782160, "step": 26605 }, { "epoch": 7.022964233865646, "grad_norm": 0.18186326324939728, "learning_rate": 1.2598186398996636e-05, "loss": 0.0204, "num_input_tokens_seen": 23786416, "step": 26610 }, { "epoch": 7.0242840174211425, "grad_norm": 0.20386849343776703, "learning_rate": 1.2589663025920207e-05, "loss": 0.0673, "num_input_tokens_seen": 23791056, "step": 26615 }, { "epoch": 7.02560380097664, "grad_norm": 0.24426443874835968, "learning_rate": 1.2581141566673705e-05, "loss": 0.0767, "num_input_tokens_seen": 23795344, "step": 26620 }, { "epoch": 7.0269235845321365, "grad_norm": 0.30892109870910645, "learning_rate": 1.257262202257124e-05, "loss": 0.0613, "num_input_tokens_seen": 23799792, "step": 26625 }, { "epoch": 7.028243368087634, "grad_norm": 0.07429211586713791, "learning_rate": 1.2564104394926618e-05, "loss": 0.0286, "num_input_tokens_seen": 23804336, "step": 26630 }, { "epoch": 7.0295631516431305, "grad_norm": 0.28406187891960144, "learning_rate": 1.2555588685053383e-05, "loss": 0.107, "num_input_tokens_seen": 23808880, "step": 26635 }, { "epoch": 7.030882935198627, "grad_norm": 0.3646852970123291, "learning_rate": 1.2547074894264762e-05, "loss": 0.0889, "num_input_tokens_seen": 23813456, "step": 26640 }, { "epoch": 7.0322027187541245, "grad_norm": 0.2474670708179474, "learning_rate": 1.2538563023873679e-05, "loss": 0.0501, "num_input_tokens_seen": 23817712, "step": 26645 }, { "epoch": 7.033522502309621, "grad_norm": 0.19502291083335876, "learning_rate": 1.2530053075192789e-05, "loss": 0.0481, "num_input_tokens_seen": 23822160, "step": 26650 }, { "epoch": 7.0348422858651185, "grad_norm": 0.34203797578811646, "learning_rate": 1.252154504953441e-05, "loss": 0.0964, "num_input_tokens_seen": 23826480, "step": 26655 }, { "epoch": 7.036162069420615, "grad_norm": 0.1713544875383377, "learning_rate": 1.25130389482106e-05, "loss": 0.0356, "num_input_tokens_seen": 23830960, "step": 26660 }, { "epoch": 7.037481852976112, "grad_norm": 0.11495032161474228, "learning_rate": 1.2504534772533116e-05, "loss": 0.0405, "num_input_tokens_seen": 23835408, "step": 26665 }, { "epoch": 7.038801636531609, "grad_norm": 0.04386752471327782, "learning_rate": 1.2496032523813387e-05, "loss": 0.0255, "num_input_tokens_seen": 23839856, "step": 26670 }, { "epoch": 7.040121420087106, "grad_norm": 0.16792790591716766, "learning_rate": 1.2487532203362576e-05, "loss": 0.1079, "num_input_tokens_seen": 23844528, "step": 26675 }, { "epoch": 7.041441203642602, "grad_norm": 0.06166511029005051, "learning_rate": 1.247903381249155e-05, "loss": 0.033, "num_input_tokens_seen": 23849264, "step": 26680 }, { "epoch": 7.0427609871981, "grad_norm": 0.23144420981407166, "learning_rate": 1.2470537352510853e-05, "loss": 0.1167, "num_input_tokens_seen": 23853712, "step": 26685 }, { "epoch": 7.044080770753596, "grad_norm": 0.24693745374679565, "learning_rate": 1.2462042824730758e-05, "loss": 0.0815, "num_input_tokens_seen": 23857968, "step": 26690 }, { "epoch": 7.045400554309094, "grad_norm": 0.36573684215545654, "learning_rate": 1.245355023046122e-05, "loss": 0.0735, "num_input_tokens_seen": 23862352, "step": 26695 }, { "epoch": 7.04672033786459, "grad_norm": 0.057069532573223114, "learning_rate": 1.2445059571011896e-05, "loss": 0.051, "num_input_tokens_seen": 23866640, "step": 26700 }, { "epoch": 7.048040121420087, "grad_norm": 0.15891991555690765, "learning_rate": 1.2436570847692173e-05, "loss": 0.1009, "num_input_tokens_seen": 23870896, "step": 26705 }, { "epoch": 7.049359904975584, "grad_norm": 0.065348781645298, "learning_rate": 1.2428084061811096e-05, "loss": 0.026, "num_input_tokens_seen": 23875152, "step": 26710 }, { "epoch": 7.050679688531081, "grad_norm": 0.1477564126253128, "learning_rate": 1.2419599214677447e-05, "loss": 0.0394, "num_input_tokens_seen": 23879504, "step": 26715 }, { "epoch": 7.051999472086578, "grad_norm": 0.37308457493782043, "learning_rate": 1.2411116307599702e-05, "loss": 0.0823, "num_input_tokens_seen": 23883920, "step": 26720 }, { "epoch": 7.053319255642075, "grad_norm": 0.3137248158454895, "learning_rate": 1.2402635341886016e-05, "loss": 0.0603, "num_input_tokens_seen": 23888400, "step": 26725 }, { "epoch": 7.054639039197571, "grad_norm": 0.2803465723991394, "learning_rate": 1.2394156318844278e-05, "loss": 0.0513, "num_input_tokens_seen": 23892944, "step": 26730 }, { "epoch": 7.055958822753069, "grad_norm": 0.3562689423561096, "learning_rate": 1.2385679239782039e-05, "loss": 0.073, "num_input_tokens_seen": 23897552, "step": 26735 }, { "epoch": 7.057278606308565, "grad_norm": 0.21104729175567627, "learning_rate": 1.2377204106006585e-05, "loss": 0.0272, "num_input_tokens_seen": 23901872, "step": 26740 }, { "epoch": 7.058598389864063, "grad_norm": 0.2214275747537613, "learning_rate": 1.2368730918824891e-05, "loss": 0.0922, "num_input_tokens_seen": 23906192, "step": 26745 }, { "epoch": 7.059918173419559, "grad_norm": 0.407850056886673, "learning_rate": 1.236025967954362e-05, "loss": 0.0524, "num_input_tokens_seen": 23910192, "step": 26750 }, { "epoch": 7.061237956975056, "grad_norm": 0.14235419034957886, "learning_rate": 1.2351790389469153e-05, "loss": 0.0477, "num_input_tokens_seen": 23914768, "step": 26755 }, { "epoch": 7.062557740530553, "grad_norm": 0.10189440101385117, "learning_rate": 1.234332304990755e-05, "loss": 0.0406, "num_input_tokens_seen": 23919152, "step": 26760 }, { "epoch": 7.06387752408605, "grad_norm": 0.04264217987656593, "learning_rate": 1.2334857662164593e-05, "loss": 0.0448, "num_input_tokens_seen": 23923728, "step": 26765 }, { "epoch": 7.065197307641546, "grad_norm": 0.21801042556762695, "learning_rate": 1.2326394227545743e-05, "loss": 0.0998, "num_input_tokens_seen": 23928272, "step": 26770 }, { "epoch": 7.066517091197044, "grad_norm": 0.2024906724691391, "learning_rate": 1.2317932747356162e-05, "loss": 0.0424, "num_input_tokens_seen": 23932400, "step": 26775 }, { "epoch": 7.06783687475254, "grad_norm": 0.2778661549091339, "learning_rate": 1.2309473222900726e-05, "loss": 0.054, "num_input_tokens_seen": 23936656, "step": 26780 }, { "epoch": 7.069156658308038, "grad_norm": 0.3980865478515625, "learning_rate": 1.2301015655484006e-05, "loss": 0.0854, "num_input_tokens_seen": 23941040, "step": 26785 }, { "epoch": 7.070476441863534, "grad_norm": 0.23265591263771057, "learning_rate": 1.2292560046410245e-05, "loss": 0.0875, "num_input_tokens_seen": 23945328, "step": 26790 }, { "epoch": 7.071796225419031, "grad_norm": 0.18941642343997955, "learning_rate": 1.228410639698343e-05, "loss": 0.0349, "num_input_tokens_seen": 23949904, "step": 26795 }, { "epoch": 7.073116008974528, "grad_norm": 0.47927021980285645, "learning_rate": 1.2275654708507195e-05, "loss": 0.0555, "num_input_tokens_seen": 23954320, "step": 26800 }, { "epoch": 7.073116008974528, "eval_loss": 0.06721608340740204, "eval_runtime": 64.779, "eval_samples_per_second": 103.969, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 23954320, "step": 26800 }, { "epoch": 7.074435792530025, "grad_norm": 0.030394135043025017, "learning_rate": 1.2267204982284908e-05, "loss": 0.0396, "num_input_tokens_seen": 23958704, "step": 26805 }, { "epoch": 7.0757555760855215, "grad_norm": 0.05200476571917534, "learning_rate": 1.2258757219619635e-05, "loss": 0.0389, "num_input_tokens_seen": 23963376, "step": 26810 }, { "epoch": 7.077075359641019, "grad_norm": 0.17522476613521576, "learning_rate": 1.2250311421814104e-05, "loss": 0.0386, "num_input_tokens_seen": 23968080, "step": 26815 }, { "epoch": 7.0783951431965155, "grad_norm": 0.030085133388638496, "learning_rate": 1.2241867590170772e-05, "loss": 0.1179, "num_input_tokens_seen": 23972592, "step": 26820 }, { "epoch": 7.079714926752013, "grad_norm": 0.1663040816783905, "learning_rate": 1.2233425725991799e-05, "loss": 0.0941, "num_input_tokens_seen": 23977232, "step": 26825 }, { "epoch": 7.0810347103075095, "grad_norm": 0.11070884764194489, "learning_rate": 1.2224985830579003e-05, "loss": 0.039, "num_input_tokens_seen": 23981648, "step": 26830 }, { "epoch": 7.082354493863006, "grad_norm": 0.1555173695087433, "learning_rate": 1.2216547905233944e-05, "loss": 0.0583, "num_input_tokens_seen": 23986096, "step": 26835 }, { "epoch": 7.0836742774185035, "grad_norm": 0.17865745723247528, "learning_rate": 1.2208111951257842e-05, "loss": 0.0722, "num_input_tokens_seen": 23990800, "step": 26840 }, { "epoch": 7.084994060974, "grad_norm": 0.20165173709392548, "learning_rate": 1.2199677969951622e-05, "loss": 0.0638, "num_input_tokens_seen": 23995312, "step": 26845 }, { "epoch": 7.0863138445294975, "grad_norm": 0.08363146334886551, "learning_rate": 1.2191245962615927e-05, "loss": 0.0463, "num_input_tokens_seen": 23999440, "step": 26850 }, { "epoch": 7.087633628084994, "grad_norm": 0.03990233317017555, "learning_rate": 1.218281593055106e-05, "loss": 0.026, "num_input_tokens_seen": 24004144, "step": 26855 }, { "epoch": 7.088953411640491, "grad_norm": 0.2977573871612549, "learning_rate": 1.217438787505705e-05, "loss": 0.0874, "num_input_tokens_seen": 24008592, "step": 26860 }, { "epoch": 7.090273195195988, "grad_norm": 0.1399790197610855, "learning_rate": 1.2165961797433615e-05, "loss": 0.101, "num_input_tokens_seen": 24013136, "step": 26865 }, { "epoch": 7.091592978751485, "grad_norm": 0.14798660576343536, "learning_rate": 1.215753769898014e-05, "loss": 0.0791, "num_input_tokens_seen": 24017680, "step": 26870 }, { "epoch": 7.092912762306982, "grad_norm": 0.1649623066186905, "learning_rate": 1.2149115580995755e-05, "loss": 0.0299, "num_input_tokens_seen": 24022384, "step": 26875 }, { "epoch": 7.094232545862479, "grad_norm": 0.23406080901622772, "learning_rate": 1.2140695444779227e-05, "loss": 0.0376, "num_input_tokens_seen": 24026672, "step": 26880 }, { "epoch": 7.095552329417975, "grad_norm": 0.10084214806556702, "learning_rate": 1.2132277291629066e-05, "loss": 0.0652, "num_input_tokens_seen": 24031184, "step": 26885 }, { "epoch": 7.096872112973473, "grad_norm": 0.03310735151171684, "learning_rate": 1.2123861122843458e-05, "loss": 0.0623, "num_input_tokens_seen": 24035728, "step": 26890 }, { "epoch": 7.098191896528969, "grad_norm": 0.06398425251245499, "learning_rate": 1.2115446939720271e-05, "loss": 0.0437, "num_input_tokens_seen": 24040144, "step": 26895 }, { "epoch": 7.099511680084466, "grad_norm": 0.35846656560897827, "learning_rate": 1.210703474355708e-05, "loss": 0.0579, "num_input_tokens_seen": 24044464, "step": 26900 }, { "epoch": 7.100831463639963, "grad_norm": 0.10406621545553207, "learning_rate": 1.2098624535651164e-05, "loss": 0.0508, "num_input_tokens_seen": 24048976, "step": 26905 }, { "epoch": 7.10215124719546, "grad_norm": 0.3346163332462311, "learning_rate": 1.2090216317299477e-05, "loss": 0.0447, "num_input_tokens_seen": 24053232, "step": 26910 }, { "epoch": 7.103471030750957, "grad_norm": 0.07963395863771439, "learning_rate": 1.2081810089798668e-05, "loss": 0.0275, "num_input_tokens_seen": 24057680, "step": 26915 }, { "epoch": 7.104790814306454, "grad_norm": 0.06073209270834923, "learning_rate": 1.2073405854445072e-05, "loss": 0.036, "num_input_tokens_seen": 24062384, "step": 26920 }, { "epoch": 7.10611059786195, "grad_norm": 0.3106163442134857, "learning_rate": 1.206500361253474e-05, "loss": 0.0938, "num_input_tokens_seen": 24066736, "step": 26925 }, { "epoch": 7.107430381417448, "grad_norm": 0.1260586529970169, "learning_rate": 1.2056603365363409e-05, "loss": 0.0406, "num_input_tokens_seen": 24071152, "step": 26930 }, { "epoch": 7.108750164972944, "grad_norm": 0.21250569820404053, "learning_rate": 1.2048205114226487e-05, "loss": 0.0587, "num_input_tokens_seen": 24075472, "step": 26935 }, { "epoch": 7.110069948528442, "grad_norm": 0.09910744428634644, "learning_rate": 1.2039808860419102e-05, "loss": 0.0544, "num_input_tokens_seen": 24079728, "step": 26940 }, { "epoch": 7.111389732083938, "grad_norm": 0.2636645436286926, "learning_rate": 1.2031414605236066e-05, "loss": 0.0539, "num_input_tokens_seen": 24084240, "step": 26945 }, { "epoch": 7.112709515639435, "grad_norm": 0.08178254961967468, "learning_rate": 1.2023022349971862e-05, "loss": 0.0503, "num_input_tokens_seen": 24089136, "step": 26950 }, { "epoch": 7.114029299194932, "grad_norm": 0.1292632818222046, "learning_rate": 1.20146320959207e-05, "loss": 0.0622, "num_input_tokens_seen": 24093840, "step": 26955 }, { "epoch": 7.115349082750429, "grad_norm": 0.2928435504436493, "learning_rate": 1.2006243844376445e-05, "loss": 0.0723, "num_input_tokens_seen": 24098864, "step": 26960 }, { "epoch": 7.116668866305925, "grad_norm": 0.041891586035490036, "learning_rate": 1.1997857596632678e-05, "loss": 0.0279, "num_input_tokens_seen": 24103216, "step": 26965 }, { "epoch": 7.117988649861423, "grad_norm": 0.13848808407783508, "learning_rate": 1.1989473353982672e-05, "loss": 0.0515, "num_input_tokens_seen": 24107728, "step": 26970 }, { "epoch": 7.119308433416919, "grad_norm": 0.323123961687088, "learning_rate": 1.198109111771937e-05, "loss": 0.153, "num_input_tokens_seen": 24112176, "step": 26975 }, { "epoch": 7.120628216972417, "grad_norm": 0.09815607964992523, "learning_rate": 1.197271088913543e-05, "loss": 0.081, "num_input_tokens_seen": 24116848, "step": 26980 }, { "epoch": 7.121948000527913, "grad_norm": 0.38988539576530457, "learning_rate": 1.1964332669523182e-05, "loss": 0.0784, "num_input_tokens_seen": 24121360, "step": 26985 }, { "epoch": 7.12326778408341, "grad_norm": 0.24771583080291748, "learning_rate": 1.1955956460174645e-05, "loss": 0.0696, "num_input_tokens_seen": 24125872, "step": 26990 }, { "epoch": 7.124587567638907, "grad_norm": 0.25931790471076965, "learning_rate": 1.1947582262381552e-05, "loss": 0.0527, "num_input_tokens_seen": 24130288, "step": 26995 }, { "epoch": 7.125907351194404, "grad_norm": 0.24531014263629913, "learning_rate": 1.1939210077435293e-05, "loss": 0.0412, "num_input_tokens_seen": 24134608, "step": 27000 }, { "epoch": 7.125907351194404, "eval_loss": 0.06724131852388382, "eval_runtime": 64.7599, "eval_samples_per_second": 104.0, "eval_steps_per_second": 26.004, "num_input_tokens_seen": 24134608, "step": 27000 }, { "epoch": 7.127227134749901, "grad_norm": 0.31773126125335693, "learning_rate": 1.193083990662697e-05, "loss": 0.0591, "num_input_tokens_seen": 24139184, "step": 27005 }, { "epoch": 7.128546918305398, "grad_norm": 0.38315433263778687, "learning_rate": 1.192247175124738e-05, "loss": 0.0843, "num_input_tokens_seen": 24143440, "step": 27010 }, { "epoch": 7.1298667018608946, "grad_norm": 0.1428283452987671, "learning_rate": 1.191410561258698e-05, "loss": 0.05, "num_input_tokens_seen": 24147760, "step": 27015 }, { "epoch": 7.131186485416392, "grad_norm": 0.22901950776576996, "learning_rate": 1.1905741491935944e-05, "loss": 0.0346, "num_input_tokens_seen": 24152112, "step": 27020 }, { "epoch": 7.1325062689718886, "grad_norm": 0.10218995064496994, "learning_rate": 1.1897379390584129e-05, "loss": 0.0521, "num_input_tokens_seen": 24156176, "step": 27025 }, { "epoch": 7.133826052527385, "grad_norm": 0.19707873463630676, "learning_rate": 1.1889019309821062e-05, "loss": 0.0528, "num_input_tokens_seen": 24160720, "step": 27030 }, { "epoch": 7.1351458360828826, "grad_norm": 0.06293908506631851, "learning_rate": 1.188066125093599e-05, "loss": 0.0406, "num_input_tokens_seen": 24165104, "step": 27035 }, { "epoch": 7.136465619638379, "grad_norm": 0.2122800648212433, "learning_rate": 1.1872305215217811e-05, "loss": 0.0489, "num_input_tokens_seen": 24169616, "step": 27040 }, { "epoch": 7.1377854031938766, "grad_norm": 0.0775170847773552, "learning_rate": 1.186395120395514e-05, "loss": 0.033, "num_input_tokens_seen": 24174096, "step": 27045 }, { "epoch": 7.139105186749373, "grad_norm": 0.29120853543281555, "learning_rate": 1.1855599218436283e-05, "loss": 0.0479, "num_input_tokens_seen": 24178544, "step": 27050 }, { "epoch": 7.14042497030487, "grad_norm": 0.04172292724251747, "learning_rate": 1.1847249259949209e-05, "loss": 0.0494, "num_input_tokens_seen": 24183344, "step": 27055 }, { "epoch": 7.141744753860367, "grad_norm": 0.224239319562912, "learning_rate": 1.1838901329781574e-05, "loss": 0.0519, "num_input_tokens_seen": 24187728, "step": 27060 }, { "epoch": 7.143064537415864, "grad_norm": 0.19917020201683044, "learning_rate": 1.1830555429220758e-05, "loss": 0.0657, "num_input_tokens_seen": 24192368, "step": 27065 }, { "epoch": 7.144384320971361, "grad_norm": 0.2386878877878189, "learning_rate": 1.1822211559553784e-05, "loss": 0.0478, "num_input_tokens_seen": 24196688, "step": 27070 }, { "epoch": 7.145704104526858, "grad_norm": 0.06777238100767136, "learning_rate": 1.18138697220674e-05, "loss": 0.0239, "num_input_tokens_seen": 24201360, "step": 27075 }, { "epoch": 7.147023888082354, "grad_norm": 0.1428188532590866, "learning_rate": 1.1805529918048e-05, "loss": 0.0309, "num_input_tokens_seen": 24205840, "step": 27080 }, { "epoch": 7.148343671637852, "grad_norm": 0.0450635589659214, "learning_rate": 1.1797192148781702e-05, "loss": 0.0647, "num_input_tokens_seen": 24210000, "step": 27085 }, { "epoch": 7.149663455193348, "grad_norm": 0.5597084760665894, "learning_rate": 1.1788856415554297e-05, "loss": 0.0969, "num_input_tokens_seen": 24214544, "step": 27090 }, { "epoch": 7.150983238748845, "grad_norm": 0.31675687432289124, "learning_rate": 1.1780522719651249e-05, "loss": 0.1274, "num_input_tokens_seen": 24218736, "step": 27095 }, { "epoch": 7.152303022304342, "grad_norm": 0.26600244641304016, "learning_rate": 1.1772191062357721e-05, "loss": 0.0801, "num_input_tokens_seen": 24223376, "step": 27100 }, { "epoch": 7.153622805859839, "grad_norm": 0.2309029996395111, "learning_rate": 1.1763861444958573e-05, "loss": 0.0671, "num_input_tokens_seen": 24227984, "step": 27105 }, { "epoch": 7.154942589415336, "grad_norm": 0.46515825390815735, "learning_rate": 1.1755533868738317e-05, "loss": 0.045, "num_input_tokens_seen": 24232368, "step": 27110 }, { "epoch": 7.156262372970833, "grad_norm": 0.04308098554611206, "learning_rate": 1.1747208334981185e-05, "loss": 0.0224, "num_input_tokens_seen": 24236944, "step": 27115 }, { "epoch": 7.157582156526329, "grad_norm": 0.3021142780780792, "learning_rate": 1.1738884844971067e-05, "loss": 0.0454, "num_input_tokens_seen": 24241488, "step": 27120 }, { "epoch": 7.158901940081827, "grad_norm": 0.23209217190742493, "learning_rate": 1.1730563399991563e-05, "loss": 0.0569, "num_input_tokens_seen": 24245904, "step": 27125 }, { "epoch": 7.160221723637323, "grad_norm": 0.10624974220991135, "learning_rate": 1.1722244001325938e-05, "loss": 0.0308, "num_input_tokens_seen": 24250288, "step": 27130 }, { "epoch": 7.161541507192821, "grad_norm": 0.04418506845831871, "learning_rate": 1.1713926650257137e-05, "loss": 0.012, "num_input_tokens_seen": 24255056, "step": 27135 }, { "epoch": 7.162861290748317, "grad_norm": 0.28927895426750183, "learning_rate": 1.170561134806781e-05, "loss": 0.0472, "num_input_tokens_seen": 24259184, "step": 27140 }, { "epoch": 7.164181074303814, "grad_norm": 0.327822744846344, "learning_rate": 1.1697298096040287e-05, "loss": 0.0835, "num_input_tokens_seen": 24263664, "step": 27145 }, { "epoch": 7.165500857859311, "grad_norm": 0.1626204401254654, "learning_rate": 1.1688986895456567e-05, "loss": 0.0239, "num_input_tokens_seen": 24267664, "step": 27150 }, { "epoch": 7.166820641414808, "grad_norm": 0.1624266803264618, "learning_rate": 1.1680677747598349e-05, "loss": 0.0356, "num_input_tokens_seen": 24271920, "step": 27155 }, { "epoch": 7.1681404249703045, "grad_norm": 0.3008864223957062, "learning_rate": 1.1672370653746995e-05, "loss": 0.0882, "num_input_tokens_seen": 24276368, "step": 27160 }, { "epoch": 7.169460208525802, "grad_norm": 0.3909413814544678, "learning_rate": 1.166406561518357e-05, "loss": 0.0764, "num_input_tokens_seen": 24280880, "step": 27165 }, { "epoch": 7.1707799920812985, "grad_norm": 0.23474417626857758, "learning_rate": 1.1655762633188826e-05, "loss": 0.0732, "num_input_tokens_seen": 24285488, "step": 27170 }, { "epoch": 7.172099775636796, "grad_norm": 0.2852865755558014, "learning_rate": 1.1647461709043172e-05, "loss": 0.085, "num_input_tokens_seen": 24290128, "step": 27175 }, { "epoch": 7.1734195591922925, "grad_norm": 0.07236635684967041, "learning_rate": 1.1639162844026722e-05, "loss": 0.0372, "num_input_tokens_seen": 24294480, "step": 27180 }, { "epoch": 7.174739342747789, "grad_norm": 0.42925286293029785, "learning_rate": 1.163086603941927e-05, "loss": 0.1157, "num_input_tokens_seen": 24299088, "step": 27185 }, { "epoch": 7.1760591263032865, "grad_norm": 0.14288103580474854, "learning_rate": 1.1622571296500273e-05, "loss": 0.0659, "num_input_tokens_seen": 24303120, "step": 27190 }, { "epoch": 7.177378909858783, "grad_norm": 0.2055729180574417, "learning_rate": 1.1614278616548904e-05, "loss": 0.0639, "num_input_tokens_seen": 24307760, "step": 27195 }, { "epoch": 7.1786986934142805, "grad_norm": 0.17746588587760925, "learning_rate": 1.1605988000843986e-05, "loss": 0.0743, "num_input_tokens_seen": 24312464, "step": 27200 }, { "epoch": 7.1786986934142805, "eval_loss": 0.0671549066901207, "eval_runtime": 64.7709, "eval_samples_per_second": 103.982, "eval_steps_per_second": 25.999, "num_input_tokens_seen": 24312464, "step": 27200 }, { "epoch": 7.180018476969777, "grad_norm": 0.1453004628419876, "learning_rate": 1.1597699450664028e-05, "loss": 0.0671, "num_input_tokens_seen": 24316976, "step": 27205 }, { "epoch": 7.181338260525274, "grad_norm": 0.0941695049405098, "learning_rate": 1.1589412967287252e-05, "loss": 0.078, "num_input_tokens_seen": 24321104, "step": 27210 }, { "epoch": 7.182658044080771, "grad_norm": 0.2976839542388916, "learning_rate": 1.1581128551991514e-05, "loss": 0.0748, "num_input_tokens_seen": 24325840, "step": 27215 }, { "epoch": 7.183977827636268, "grad_norm": 0.044943131506443024, "learning_rate": 1.1572846206054383e-05, "loss": 0.0499, "num_input_tokens_seen": 24330512, "step": 27220 }, { "epoch": 7.185297611191764, "grad_norm": 0.0658232718706131, "learning_rate": 1.1564565930753113e-05, "loss": 0.0296, "num_input_tokens_seen": 24335088, "step": 27225 }, { "epoch": 7.186617394747262, "grad_norm": 0.06307104974985123, "learning_rate": 1.1556287727364606e-05, "loss": 0.0559, "num_input_tokens_seen": 24339728, "step": 27230 }, { "epoch": 7.187937178302758, "grad_norm": 0.1420002430677414, "learning_rate": 1.1548011597165489e-05, "loss": 0.049, "num_input_tokens_seen": 24344368, "step": 27235 }, { "epoch": 7.189256961858256, "grad_norm": 0.18477608263492584, "learning_rate": 1.1539737541432019e-05, "loss": 0.0929, "num_input_tokens_seen": 24348720, "step": 27240 }, { "epoch": 7.190576745413752, "grad_norm": 0.11105629801750183, "learning_rate": 1.1531465561440174e-05, "loss": 0.0749, "num_input_tokens_seen": 24353040, "step": 27245 }, { "epoch": 7.191896528969249, "grad_norm": 0.20076912641525269, "learning_rate": 1.1523195658465605e-05, "loss": 0.0746, "num_input_tokens_seen": 24357680, "step": 27250 }, { "epoch": 7.193216312524746, "grad_norm": 0.16600032150745392, "learning_rate": 1.1514927833783618e-05, "loss": 0.0656, "num_input_tokens_seen": 24362256, "step": 27255 }, { "epoch": 7.194536096080243, "grad_norm": 0.3126922845840454, "learning_rate": 1.150666208866922e-05, "loss": 0.0288, "num_input_tokens_seen": 24366992, "step": 27260 }, { "epoch": 7.19585587963574, "grad_norm": 0.13824398815631866, "learning_rate": 1.1498398424397106e-05, "loss": 0.0749, "num_input_tokens_seen": 24371280, "step": 27265 }, { "epoch": 7.197175663191237, "grad_norm": 0.09463849663734436, "learning_rate": 1.1490136842241628e-05, "loss": 0.0561, "num_input_tokens_seen": 24375504, "step": 27270 }, { "epoch": 7.198495446746733, "grad_norm": 0.411231130361557, "learning_rate": 1.1481877343476813e-05, "loss": 0.1084, "num_input_tokens_seen": 24379824, "step": 27275 }, { "epoch": 7.199815230302231, "grad_norm": 0.23730726540088654, "learning_rate": 1.14736199293764e-05, "loss": 0.0697, "num_input_tokens_seen": 24384336, "step": 27280 }, { "epoch": 7.201135013857727, "grad_norm": 0.1440921127796173, "learning_rate": 1.1465364601213771e-05, "loss": 0.0892, "num_input_tokens_seen": 24388880, "step": 27285 }, { "epoch": 7.202454797413224, "grad_norm": 0.17842113971710205, "learning_rate": 1.1457111360262012e-05, "loss": 0.0812, "num_input_tokens_seen": 24393072, "step": 27290 }, { "epoch": 7.203774580968721, "grad_norm": 0.15534725785255432, "learning_rate": 1.1448860207793869e-05, "loss": 0.0607, "num_input_tokens_seen": 24397360, "step": 27295 }, { "epoch": 7.205094364524218, "grad_norm": 0.2299571931362152, "learning_rate": 1.144061114508177e-05, "loss": 0.0672, "num_input_tokens_seen": 24402160, "step": 27300 }, { "epoch": 7.206414148079715, "grad_norm": 0.0235295407474041, "learning_rate": 1.1432364173397842e-05, "loss": 0.0398, "num_input_tokens_seen": 24406576, "step": 27305 }, { "epoch": 7.207733931635212, "grad_norm": 0.07702226936817169, "learning_rate": 1.1424119294013852e-05, "loss": 0.0479, "num_input_tokens_seen": 24411504, "step": 27310 }, { "epoch": 7.209053715190708, "grad_norm": 0.11302943527698517, "learning_rate": 1.1415876508201279e-05, "loss": 0.047, "num_input_tokens_seen": 24416048, "step": 27315 }, { "epoch": 7.210373498746206, "grad_norm": 0.0733208954334259, "learning_rate": 1.140763581723125e-05, "loss": 0.0718, "num_input_tokens_seen": 24420112, "step": 27320 }, { "epoch": 7.211693282301702, "grad_norm": 0.062075842171907425, "learning_rate": 1.1399397222374588e-05, "loss": 0.0977, "num_input_tokens_seen": 24424432, "step": 27325 }, { "epoch": 7.2130130658572, "grad_norm": 0.07009834051132202, "learning_rate": 1.1391160724901804e-05, "loss": 0.0673, "num_input_tokens_seen": 24428784, "step": 27330 }, { "epoch": 7.214332849412696, "grad_norm": 0.08232706040143967, "learning_rate": 1.138292632608304e-05, "loss": 0.0756, "num_input_tokens_seen": 24433136, "step": 27335 }, { "epoch": 7.215652632968193, "grad_norm": 0.39854931831359863, "learning_rate": 1.1374694027188174e-05, "loss": 0.0813, "num_input_tokens_seen": 24437456, "step": 27340 }, { "epoch": 7.21697241652369, "grad_norm": 0.18461249768733978, "learning_rate": 1.1366463829486711e-05, "loss": 0.0643, "num_input_tokens_seen": 24441936, "step": 27345 }, { "epoch": 7.218292200079187, "grad_norm": 0.14270691573619843, "learning_rate": 1.1358235734247849e-05, "loss": 0.0528, "num_input_tokens_seen": 24446640, "step": 27350 }, { "epoch": 7.2196119836346835, "grad_norm": 0.3645656704902649, "learning_rate": 1.1350009742740478e-05, "loss": 0.0782, "num_input_tokens_seen": 24450992, "step": 27355 }, { "epoch": 7.220931767190181, "grad_norm": 0.255948007106781, "learning_rate": 1.134178585623313e-05, "loss": 0.0622, "num_input_tokens_seen": 24455376, "step": 27360 }, { "epoch": 7.2222515507456775, "grad_norm": 0.16897998750209808, "learning_rate": 1.1333564075994047e-05, "loss": 0.0684, "num_input_tokens_seen": 24459952, "step": 27365 }, { "epoch": 7.223571334301175, "grad_norm": 0.18294745683670044, "learning_rate": 1.1325344403291133e-05, "loss": 0.0412, "num_input_tokens_seen": 24464336, "step": 27370 }, { "epoch": 7.2248911178566715, "grad_norm": 0.403415709733963, "learning_rate": 1.1317126839391951e-05, "loss": 0.0385, "num_input_tokens_seen": 24468944, "step": 27375 }, { "epoch": 7.226210901412168, "grad_norm": 0.473206102848053, "learning_rate": 1.1308911385563766e-05, "loss": 0.0655, "num_input_tokens_seen": 24473744, "step": 27380 }, { "epoch": 7.2275306849676655, "grad_norm": 0.048293884843587875, "learning_rate": 1.1300698043073494e-05, "loss": 0.0524, "num_input_tokens_seen": 24478256, "step": 27385 }, { "epoch": 7.228850468523162, "grad_norm": 0.04211829975247383, "learning_rate": 1.1292486813187736e-05, "loss": 0.0315, "num_input_tokens_seen": 24482832, "step": 27390 }, { "epoch": 7.2301702520786595, "grad_norm": 0.2053355872631073, "learning_rate": 1.1284277697172782e-05, "loss": 0.0576, "num_input_tokens_seen": 24487248, "step": 27395 }, { "epoch": 7.231490035634156, "grad_norm": 0.262908399105072, "learning_rate": 1.127607069629456e-05, "loss": 0.0658, "num_input_tokens_seen": 24491696, "step": 27400 }, { "epoch": 7.231490035634156, "eval_loss": 0.06733673065900803, "eval_runtime": 64.7926, "eval_samples_per_second": 103.947, "eval_steps_per_second": 25.991, "num_input_tokens_seen": 24491696, "step": 27400 }, { "epoch": 7.232809819189653, "grad_norm": 0.37509220838546753, "learning_rate": 1.1267865811818701e-05, "loss": 0.0912, "num_input_tokens_seen": 24495856, "step": 27405 }, { "epoch": 7.23412960274515, "grad_norm": 0.385707288980484, "learning_rate": 1.1259663045010513e-05, "loss": 0.0826, "num_input_tokens_seen": 24500496, "step": 27410 }, { "epoch": 7.235449386300647, "grad_norm": 0.045231495052576065, "learning_rate": 1.1251462397134957e-05, "loss": 0.0196, "num_input_tokens_seen": 24504816, "step": 27415 }, { "epoch": 7.236769169856144, "grad_norm": 0.06550495326519012, "learning_rate": 1.1243263869456664e-05, "loss": 0.0472, "num_input_tokens_seen": 24509424, "step": 27420 }, { "epoch": 7.238088953411641, "grad_norm": 0.07850643992424011, "learning_rate": 1.1235067463239967e-05, "loss": 0.0795, "num_input_tokens_seen": 24513712, "step": 27425 }, { "epoch": 7.239408736967137, "grad_norm": 0.338356614112854, "learning_rate": 1.122687317974884e-05, "loss": 0.081, "num_input_tokens_seen": 24517936, "step": 27430 }, { "epoch": 7.240728520522635, "grad_norm": 0.5090805292129517, "learning_rate": 1.1218681020246963e-05, "loss": 0.0909, "num_input_tokens_seen": 24522448, "step": 27435 }, { "epoch": 7.242048304078131, "grad_norm": 0.10686513036489487, "learning_rate": 1.1210490985997652e-05, "loss": 0.0977, "num_input_tokens_seen": 24526736, "step": 27440 }, { "epoch": 7.243368087633628, "grad_norm": 0.038170576095581055, "learning_rate": 1.1202303078263917e-05, "loss": 0.0476, "num_input_tokens_seen": 24531088, "step": 27445 }, { "epoch": 7.244687871189125, "grad_norm": 0.04953524470329285, "learning_rate": 1.1194117298308451e-05, "loss": 0.0481, "num_input_tokens_seen": 24535760, "step": 27450 }, { "epoch": 7.246007654744622, "grad_norm": 0.3726954758167267, "learning_rate": 1.1185933647393585e-05, "loss": 0.0682, "num_input_tokens_seen": 24540048, "step": 27455 }, { "epoch": 7.247327438300119, "grad_norm": 0.31466010212898254, "learning_rate": 1.1177752126781354e-05, "loss": 0.1195, "num_input_tokens_seen": 24544464, "step": 27460 }, { "epoch": 7.248647221855616, "grad_norm": 0.11652236431837082, "learning_rate": 1.1169572737733441e-05, "loss": 0.0716, "num_input_tokens_seen": 24548720, "step": 27465 }, { "epoch": 7.249967005411112, "grad_norm": 0.038556769490242004, "learning_rate": 1.1161395481511216e-05, "loss": 0.1219, "num_input_tokens_seen": 24553136, "step": 27470 }, { "epoch": 7.25128678896661, "grad_norm": 0.3688308000564575, "learning_rate": 1.1153220359375722e-05, "loss": 0.0876, "num_input_tokens_seen": 24557904, "step": 27475 }, { "epoch": 7.252606572522106, "grad_norm": 0.1164795458316803, "learning_rate": 1.114504737258765e-05, "loss": 0.0478, "num_input_tokens_seen": 24562448, "step": 27480 }, { "epoch": 7.253926356077603, "grad_norm": 0.4749835133552551, "learning_rate": 1.1136876522407393e-05, "loss": 0.0852, "num_input_tokens_seen": 24566928, "step": 27485 }, { "epoch": 7.2552461396331, "grad_norm": 0.18950961530208588, "learning_rate": 1.1128707810094985e-05, "loss": 0.0556, "num_input_tokens_seen": 24571216, "step": 27490 }, { "epoch": 7.256565923188597, "grad_norm": 0.29345831274986267, "learning_rate": 1.1120541236910157e-05, "loss": 0.0521, "num_input_tokens_seen": 24575728, "step": 27495 }, { "epoch": 7.257885706744094, "grad_norm": 0.2807425856590271, "learning_rate": 1.111237680411229e-05, "loss": 0.0409, "num_input_tokens_seen": 24580240, "step": 27500 }, { "epoch": 7.259205490299591, "grad_norm": 0.558468222618103, "learning_rate": 1.1104214512960433e-05, "loss": 0.1002, "num_input_tokens_seen": 24584656, "step": 27505 }, { "epoch": 7.260525273855087, "grad_norm": 0.07164488732814789, "learning_rate": 1.1096054364713327e-05, "loss": 0.0649, "num_input_tokens_seen": 24589520, "step": 27510 }, { "epoch": 7.261845057410585, "grad_norm": 0.4163208603858948, "learning_rate": 1.1087896360629371e-05, "loss": 0.0591, "num_input_tokens_seen": 24593936, "step": 27515 }, { "epoch": 7.263164840966081, "grad_norm": 0.06254494935274124, "learning_rate": 1.107974050196662e-05, "loss": 0.0327, "num_input_tokens_seen": 24598416, "step": 27520 }, { "epoch": 7.264484624521579, "grad_norm": 0.173482283949852, "learning_rate": 1.1071586789982816e-05, "loss": 0.0893, "num_input_tokens_seen": 24602896, "step": 27525 }, { "epoch": 7.265804408077075, "grad_norm": 0.3245493471622467, "learning_rate": 1.1063435225935373e-05, "loss": 0.0387, "num_input_tokens_seen": 24607504, "step": 27530 }, { "epoch": 7.267124191632572, "grad_norm": 0.3367374837398529, "learning_rate": 1.1055285811081348e-05, "loss": 0.0658, "num_input_tokens_seen": 24611856, "step": 27535 }, { "epoch": 7.268443975188069, "grad_norm": 0.11486507207155228, "learning_rate": 1.1047138546677499e-05, "loss": 0.0599, "num_input_tokens_seen": 24616816, "step": 27540 }, { "epoch": 7.269763758743566, "grad_norm": 0.29035088419914246, "learning_rate": 1.1038993433980219e-05, "loss": 0.0711, "num_input_tokens_seen": 24621456, "step": 27545 }, { "epoch": 7.271083542299063, "grad_norm": 0.08633492887020111, "learning_rate": 1.1030850474245597e-05, "loss": 0.0608, "num_input_tokens_seen": 24626096, "step": 27550 }, { "epoch": 7.27240332585456, "grad_norm": 0.3409574627876282, "learning_rate": 1.102270966872939e-05, "loss": 0.0778, "num_input_tokens_seen": 24630384, "step": 27555 }, { "epoch": 7.2737231094100565, "grad_norm": 0.2998826801776886, "learning_rate": 1.1014571018687e-05, "loss": 0.0279, "num_input_tokens_seen": 24634608, "step": 27560 }, { "epoch": 7.275042892965554, "grad_norm": 0.06862879544496536, "learning_rate": 1.1006434525373502e-05, "loss": 0.0424, "num_input_tokens_seen": 24639024, "step": 27565 }, { "epoch": 7.2763626765210505, "grad_norm": 0.11353106796741486, "learning_rate": 1.0998300190043664e-05, "loss": 0.111, "num_input_tokens_seen": 24643632, "step": 27570 }, { "epoch": 7.277682460076547, "grad_norm": 0.3312112092971802, "learning_rate": 1.0990168013951882e-05, "loss": 0.075, "num_input_tokens_seen": 24648080, "step": 27575 }, { "epoch": 7.2790022436320445, "grad_norm": 0.5687589645385742, "learning_rate": 1.0982037998352263e-05, "loss": 0.0983, "num_input_tokens_seen": 24652624, "step": 27580 }, { "epoch": 7.280322027187541, "grad_norm": 0.34436213970184326, "learning_rate": 1.0973910144498534e-05, "loss": 0.0486, "num_input_tokens_seen": 24657072, "step": 27585 }, { "epoch": 7.2816418107430385, "grad_norm": 0.1591799408197403, "learning_rate": 1.0965784453644123e-05, "loss": 0.0427, "num_input_tokens_seen": 24661424, "step": 27590 }, { "epoch": 7.282961594298535, "grad_norm": 0.20682141184806824, "learning_rate": 1.0957660927042127e-05, "loss": 0.0455, "num_input_tokens_seen": 24665808, "step": 27595 }, { "epoch": 7.284281377854032, "grad_norm": 0.1414584368467331, "learning_rate": 1.094953956594527e-05, "loss": 0.0596, "num_input_tokens_seen": 24670160, "step": 27600 }, { "epoch": 7.284281377854032, "eval_loss": 0.0670321062207222, "eval_runtime": 64.7425, "eval_samples_per_second": 104.027, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 24670160, "step": 27600 }, { "epoch": 7.285601161409529, "grad_norm": 0.10209903120994568, "learning_rate": 1.0941420371605981e-05, "loss": 0.0805, "num_input_tokens_seen": 24674672, "step": 27605 }, { "epoch": 7.286920944965026, "grad_norm": 0.2846114933490753, "learning_rate": 1.0933303345276354e-05, "loss": 0.063, "num_input_tokens_seen": 24678992, "step": 27610 }, { "epoch": 7.288240728520522, "grad_norm": 0.04901299625635147, "learning_rate": 1.0925188488208112e-05, "loss": 0.0372, "num_input_tokens_seen": 24683472, "step": 27615 }, { "epoch": 7.28956051207602, "grad_norm": 0.3092661499977112, "learning_rate": 1.0917075801652694e-05, "loss": 0.0798, "num_input_tokens_seen": 24687536, "step": 27620 }, { "epoch": 7.290880295631516, "grad_norm": 0.09395746141672134, "learning_rate": 1.0908965286861151e-05, "loss": 0.0446, "num_input_tokens_seen": 24691696, "step": 27625 }, { "epoch": 7.292200079187014, "grad_norm": 0.47397541999816895, "learning_rate": 1.090085694508425e-05, "loss": 0.1218, "num_input_tokens_seen": 24696080, "step": 27630 }, { "epoch": 7.29351986274251, "grad_norm": 0.4713501036167145, "learning_rate": 1.089275077757238e-05, "loss": 0.1008, "num_input_tokens_seen": 24700560, "step": 27635 }, { "epoch": 7.294839646298007, "grad_norm": 0.06048964336514473, "learning_rate": 1.0884646785575633e-05, "loss": 0.032, "num_input_tokens_seen": 24705104, "step": 27640 }, { "epoch": 7.296159429853504, "grad_norm": 0.04711132124066353, "learning_rate": 1.0876544970343728e-05, "loss": 0.0354, "num_input_tokens_seen": 24709584, "step": 27645 }, { "epoch": 7.297479213409001, "grad_norm": 0.2942809462547302, "learning_rate": 1.0868445333126082e-05, "loss": 0.0563, "num_input_tokens_seen": 24714064, "step": 27650 }, { "epoch": 7.298798996964498, "grad_norm": 0.053972892463207245, "learning_rate": 1.0860347875171745e-05, "loss": 0.0921, "num_input_tokens_seen": 24718672, "step": 27655 }, { "epoch": 7.300118780519995, "grad_norm": 0.08592488616704941, "learning_rate": 1.0852252597729465e-05, "loss": 0.0618, "num_input_tokens_seen": 24722800, "step": 27660 }, { "epoch": 7.301438564075491, "grad_norm": 0.22755536437034607, "learning_rate": 1.0844159502047615e-05, "loss": 0.0324, "num_input_tokens_seen": 24727120, "step": 27665 }, { "epoch": 7.302758347630989, "grad_norm": 0.3051374852657318, "learning_rate": 1.0836068589374265e-05, "loss": 0.0556, "num_input_tokens_seen": 24731696, "step": 27670 }, { "epoch": 7.304078131186485, "grad_norm": 0.6084445714950562, "learning_rate": 1.0827979860957144e-05, "loss": 0.0622, "num_input_tokens_seen": 24736048, "step": 27675 }, { "epoch": 7.305397914741983, "grad_norm": 0.08674095571041107, "learning_rate": 1.0819893318043615e-05, "loss": 0.0638, "num_input_tokens_seen": 24740784, "step": 27680 }, { "epoch": 7.306717698297479, "grad_norm": 0.2730005085468292, "learning_rate": 1.0811808961880734e-05, "loss": 0.0846, "num_input_tokens_seen": 24745520, "step": 27685 }, { "epoch": 7.308037481852976, "grad_norm": 0.14386364817619324, "learning_rate": 1.080372679371522e-05, "loss": 0.0744, "num_input_tokens_seen": 24750352, "step": 27690 }, { "epoch": 7.309357265408473, "grad_norm": 0.1752229630947113, "learning_rate": 1.0795646814793428e-05, "loss": 0.0422, "num_input_tokens_seen": 24754800, "step": 27695 }, { "epoch": 7.31067704896397, "grad_norm": 0.40007075667381287, "learning_rate": 1.078756902636141e-05, "loss": 0.0643, "num_input_tokens_seen": 24759216, "step": 27700 }, { "epoch": 7.311996832519466, "grad_norm": 0.24604114890098572, "learning_rate": 1.077949342966485e-05, "loss": 0.0502, "num_input_tokens_seen": 24763504, "step": 27705 }, { "epoch": 7.313316616074964, "grad_norm": 0.1704041212797165, "learning_rate": 1.0771420025949103e-05, "loss": 0.0238, "num_input_tokens_seen": 24768112, "step": 27710 }, { "epoch": 7.31463639963046, "grad_norm": 0.15443898737430573, "learning_rate": 1.0763348816459204e-05, "loss": 0.0599, "num_input_tokens_seen": 24772464, "step": 27715 }, { "epoch": 7.315956183185958, "grad_norm": 0.1411323994398117, "learning_rate": 1.0755279802439816e-05, "loss": 0.0449, "num_input_tokens_seen": 24776816, "step": 27720 }, { "epoch": 7.317275966741454, "grad_norm": 0.1422702968120575, "learning_rate": 1.0747212985135293e-05, "loss": 0.0405, "num_input_tokens_seen": 24781328, "step": 27725 }, { "epoch": 7.318595750296951, "grad_norm": 0.27496621012687683, "learning_rate": 1.073914836578965e-05, "loss": 0.0538, "num_input_tokens_seen": 24785744, "step": 27730 }, { "epoch": 7.319915533852448, "grad_norm": 0.2235908955335617, "learning_rate": 1.0731085945646529e-05, "loss": 0.0394, "num_input_tokens_seen": 24790288, "step": 27735 }, { "epoch": 7.321235317407945, "grad_norm": 0.17384664714336395, "learning_rate": 1.0723025725949285e-05, "loss": 0.0407, "num_input_tokens_seen": 24794768, "step": 27740 }, { "epoch": 7.3225551009634415, "grad_norm": 0.09952058643102646, "learning_rate": 1.0714967707940875e-05, "loss": 0.069, "num_input_tokens_seen": 24799504, "step": 27745 }, { "epoch": 7.323874884518939, "grad_norm": 0.08403459191322327, "learning_rate": 1.0706911892863963e-05, "loss": 0.0567, "num_input_tokens_seen": 24804272, "step": 27750 }, { "epoch": 7.3251946680744355, "grad_norm": 0.09434563666582108, "learning_rate": 1.0698858281960866e-05, "loss": 0.0455, "num_input_tokens_seen": 24808720, "step": 27755 }, { "epoch": 7.326514451629933, "grad_norm": 0.372171550989151, "learning_rate": 1.069080687647353e-05, "loss": 0.0654, "num_input_tokens_seen": 24813040, "step": 27760 }, { "epoch": 7.3278342351854295, "grad_norm": 0.09060271084308624, "learning_rate": 1.0682757677643596e-05, "loss": 0.0537, "num_input_tokens_seen": 24817680, "step": 27765 }, { "epoch": 7.329154018740926, "grad_norm": 0.16516268253326416, "learning_rate": 1.0674710686712359e-05, "loss": 0.0485, "num_input_tokens_seen": 24822224, "step": 27770 }, { "epoch": 7.3304738022964235, "grad_norm": 0.12392836809158325, "learning_rate": 1.0666665904920756e-05, "loss": 0.0401, "num_input_tokens_seen": 24826704, "step": 27775 }, { "epoch": 7.33179358585192, "grad_norm": 0.02740810438990593, "learning_rate": 1.0658623333509385e-05, "loss": 0.0433, "num_input_tokens_seen": 24831024, "step": 27780 }, { "epoch": 7.3331133694074175, "grad_norm": 0.13148590922355652, "learning_rate": 1.0650582973718532e-05, "loss": 0.0834, "num_input_tokens_seen": 24835536, "step": 27785 }, { "epoch": 7.334433152962914, "grad_norm": 0.21084579825401306, "learning_rate": 1.0642544826788098e-05, "loss": 0.0523, "num_input_tokens_seen": 24839760, "step": 27790 }, { "epoch": 7.335752936518411, "grad_norm": 0.13928169012069702, "learning_rate": 1.063450889395769e-05, "loss": 0.0773, "num_input_tokens_seen": 24844368, "step": 27795 }, { "epoch": 7.337072720073908, "grad_norm": 0.21707572042942047, "learning_rate": 1.062647517646653e-05, "loss": 0.0333, "num_input_tokens_seen": 24848976, "step": 27800 }, { "epoch": 7.337072720073908, "eval_loss": 0.06726846098899841, "eval_runtime": 64.7517, "eval_samples_per_second": 104.013, "eval_steps_per_second": 26.007, "num_input_tokens_seen": 24848976, "step": 27800 }, { "epoch": 7.338392503629405, "grad_norm": 0.055873699486255646, "learning_rate": 1.0618443675553527e-05, "loss": 0.0551, "num_input_tokens_seen": 24853168, "step": 27805 }, { "epoch": 7.339712287184902, "grad_norm": 0.06328243762254715, "learning_rate": 1.0610414392457247e-05, "loss": 0.0693, "num_input_tokens_seen": 24857456, "step": 27810 }, { "epoch": 7.341032070740399, "grad_norm": 0.24468226730823517, "learning_rate": 1.0602387328415888e-05, "loss": 0.0701, "num_input_tokens_seen": 24861712, "step": 27815 }, { "epoch": 7.342351854295895, "grad_norm": 0.7732455134391785, "learning_rate": 1.0594362484667347e-05, "loss": 0.1038, "num_input_tokens_seen": 24866320, "step": 27820 }, { "epoch": 7.343671637851393, "grad_norm": 0.27959728240966797, "learning_rate": 1.0586339862449132e-05, "loss": 0.0467, "num_input_tokens_seen": 24870864, "step": 27825 }, { "epoch": 7.344991421406889, "grad_norm": 0.2536408603191376, "learning_rate": 1.0578319462998445e-05, "loss": 0.1302, "num_input_tokens_seen": 24875312, "step": 27830 }, { "epoch": 7.346311204962386, "grad_norm": 0.38055500388145447, "learning_rate": 1.057030128755214e-05, "loss": 0.0959, "num_input_tokens_seen": 24879312, "step": 27835 }, { "epoch": 7.347630988517883, "grad_norm": 0.07191559672355652, "learning_rate": 1.0562285337346703e-05, "loss": 0.041, "num_input_tokens_seen": 24884048, "step": 27840 }, { "epoch": 7.34895077207338, "grad_norm": 0.15985798835754395, "learning_rate": 1.0554271613618308e-05, "loss": 0.0649, "num_input_tokens_seen": 24888720, "step": 27845 }, { "epoch": 7.350270555628877, "grad_norm": 0.05079872906208038, "learning_rate": 1.054626011760276e-05, "loss": 0.0409, "num_input_tokens_seen": 24893104, "step": 27850 }, { "epoch": 7.351590339184374, "grad_norm": 0.16903164982795715, "learning_rate": 1.0538250850535549e-05, "loss": 0.0742, "num_input_tokens_seen": 24897552, "step": 27855 }, { "epoch": 7.35291012273987, "grad_norm": 0.267366886138916, "learning_rate": 1.0530243813651794e-05, "loss": 0.053, "num_input_tokens_seen": 24901776, "step": 27860 }, { "epoch": 7.354229906295368, "grad_norm": 0.0450739823281765, "learning_rate": 1.0522239008186271e-05, "loss": 0.0628, "num_input_tokens_seen": 24906320, "step": 27865 }, { "epoch": 7.355549689850864, "grad_norm": 0.09771493822336197, "learning_rate": 1.0514236435373434e-05, "loss": 0.05, "num_input_tokens_seen": 24910736, "step": 27870 }, { "epoch": 7.356869473406362, "grad_norm": 0.04377949610352516, "learning_rate": 1.0506236096447386e-05, "loss": 0.0671, "num_input_tokens_seen": 24915344, "step": 27875 }, { "epoch": 7.358189256961858, "grad_norm": 0.4896205961704254, "learning_rate": 1.049823799264186e-05, "loss": 0.0504, "num_input_tokens_seen": 24919920, "step": 27880 }, { "epoch": 7.359509040517355, "grad_norm": 0.26843783259391785, "learning_rate": 1.049024212519028e-05, "loss": 0.0308, "num_input_tokens_seen": 24924432, "step": 27885 }, { "epoch": 7.360828824072852, "grad_norm": 0.10107819736003876, "learning_rate": 1.0482248495325713e-05, "loss": 0.0392, "num_input_tokens_seen": 24928880, "step": 27890 }, { "epoch": 7.362148607628349, "grad_norm": 0.047026123851537704, "learning_rate": 1.047425710428086e-05, "loss": 0.0177, "num_input_tokens_seen": 24933616, "step": 27895 }, { "epoch": 7.363468391183845, "grad_norm": 0.5455624461174011, "learning_rate": 1.0466267953288114e-05, "loss": 0.0882, "num_input_tokens_seen": 24938000, "step": 27900 }, { "epoch": 7.364788174739343, "grad_norm": 0.1768079400062561, "learning_rate": 1.0458281043579482e-05, "loss": 0.0451, "num_input_tokens_seen": 24942352, "step": 27905 }, { "epoch": 7.366107958294839, "grad_norm": 0.2302229404449463, "learning_rate": 1.0450296376386657e-05, "loss": 0.0694, "num_input_tokens_seen": 24947216, "step": 27910 }, { "epoch": 7.367427741850337, "grad_norm": 0.09673712402582169, "learning_rate": 1.044231395294098e-05, "loss": 0.0437, "num_input_tokens_seen": 24951888, "step": 27915 }, { "epoch": 7.368747525405833, "grad_norm": 0.021349217742681503, "learning_rate": 1.0434333774473435e-05, "loss": 0.0212, "num_input_tokens_seen": 24956464, "step": 27920 }, { "epoch": 7.37006730896133, "grad_norm": 0.06207912415266037, "learning_rate": 1.0426355842214657e-05, "loss": 0.0378, "num_input_tokens_seen": 24960752, "step": 27925 }, { "epoch": 7.371387092516827, "grad_norm": 0.36458393931388855, "learning_rate": 1.0418380157394963e-05, "loss": 0.0953, "num_input_tokens_seen": 24965072, "step": 27930 }, { "epoch": 7.372706876072324, "grad_norm": 0.143574520945549, "learning_rate": 1.0410406721244281e-05, "loss": 0.0414, "num_input_tokens_seen": 24969520, "step": 27935 }, { "epoch": 7.3740266596278214, "grad_norm": 0.20523373782634735, "learning_rate": 1.0402435534992238e-05, "loss": 0.0593, "num_input_tokens_seen": 24973744, "step": 27940 }, { "epoch": 7.375346443183318, "grad_norm": 0.25944244861602783, "learning_rate": 1.0394466599868071e-05, "loss": 0.0272, "num_input_tokens_seen": 24978256, "step": 27945 }, { "epoch": 7.376666226738815, "grad_norm": 0.048216287046670914, "learning_rate": 1.0386499917100697e-05, "loss": 0.0582, "num_input_tokens_seen": 24983024, "step": 27950 }, { "epoch": 7.377986010294312, "grad_norm": 0.2627435326576233, "learning_rate": 1.0378535487918692e-05, "loss": 0.0494, "num_input_tokens_seen": 24987440, "step": 27955 }, { "epoch": 7.379305793849809, "grad_norm": 0.2561013996601105, "learning_rate": 1.037057331355025e-05, "loss": 0.0852, "num_input_tokens_seen": 24991920, "step": 27960 }, { "epoch": 7.380625577405305, "grad_norm": 0.41921135783195496, "learning_rate": 1.0362613395223247e-05, "loss": 0.0446, "num_input_tokens_seen": 24996688, "step": 27965 }, { "epoch": 7.381945360960803, "grad_norm": 0.16882388293743134, "learning_rate": 1.0354655734165212e-05, "loss": 0.0313, "num_input_tokens_seen": 25000976, "step": 27970 }, { "epoch": 7.383265144516299, "grad_norm": 0.32885047793388367, "learning_rate": 1.03467003316033e-05, "loss": 0.0776, "num_input_tokens_seen": 25005264, "step": 27975 }, { "epoch": 7.384584928071797, "grad_norm": 0.2750985324382782, "learning_rate": 1.033874718876435e-05, "loss": 0.0614, "num_input_tokens_seen": 25009776, "step": 27980 }, { "epoch": 7.385904711627293, "grad_norm": 0.4717799127101898, "learning_rate": 1.0330796306874818e-05, "loss": 0.0823, "num_input_tokens_seen": 25014288, "step": 27985 }, { "epoch": 7.38722449518279, "grad_norm": 0.2063411921262741, "learning_rate": 1.032284768716085e-05, "loss": 0.0188, "num_input_tokens_seen": 25018736, "step": 27990 }, { "epoch": 7.388544278738287, "grad_norm": 0.22211600840091705, "learning_rate": 1.0314901330848206e-05, "loss": 0.0612, "num_input_tokens_seen": 25023280, "step": 27995 }, { "epoch": 7.389864062293784, "grad_norm": 0.0464959554374218, "learning_rate": 1.030695723916233e-05, "loss": 0.0169, "num_input_tokens_seen": 25027536, "step": 28000 }, { "epoch": 7.389864062293784, "eval_loss": 0.06733068078756332, "eval_runtime": 64.819, "eval_samples_per_second": 103.905, "eval_steps_per_second": 25.98, "num_input_tokens_seen": 25027536, "step": 28000 }, { "epoch": 7.391183845849281, "grad_norm": 0.37597766518592834, "learning_rate": 1.0299015413328289e-05, "loss": 0.0564, "num_input_tokens_seen": 25031856, "step": 28005 }, { "epoch": 7.392503629404778, "grad_norm": 0.25106000900268555, "learning_rate": 1.0291075854570809e-05, "loss": 0.0536, "num_input_tokens_seen": 25036464, "step": 28010 }, { "epoch": 7.393823412960274, "grad_norm": 0.289782851934433, "learning_rate": 1.0283138564114275e-05, "loss": 0.0787, "num_input_tokens_seen": 25040752, "step": 28015 }, { "epoch": 7.395143196515772, "grad_norm": 0.07802927494049072, "learning_rate": 1.027520354318273e-05, "loss": 0.0833, "num_input_tokens_seen": 25045392, "step": 28020 }, { "epoch": 7.396462980071268, "grad_norm": 0.24733249843120575, "learning_rate": 1.0267270792999828e-05, "loss": 0.0378, "num_input_tokens_seen": 25049744, "step": 28025 }, { "epoch": 7.397782763626765, "grad_norm": 0.6040419936180115, "learning_rate": 1.0259340314788919e-05, "loss": 0.0825, "num_input_tokens_seen": 25053968, "step": 28030 }, { "epoch": 7.399102547182262, "grad_norm": 0.1716860681772232, "learning_rate": 1.0251412109772979e-05, "loss": 0.0423, "num_input_tokens_seen": 25058384, "step": 28035 }, { "epoch": 7.400422330737759, "grad_norm": 0.2641092538833618, "learning_rate": 1.0243486179174627e-05, "loss": 0.0444, "num_input_tokens_seen": 25063152, "step": 28040 }, { "epoch": 7.401742114293256, "grad_norm": 0.07201898843050003, "learning_rate": 1.0235562524216158e-05, "loss": 0.0348, "num_input_tokens_seen": 25067632, "step": 28045 }, { "epoch": 7.403061897848753, "grad_norm": 0.293312668800354, "learning_rate": 1.022764114611948e-05, "loss": 0.0412, "num_input_tokens_seen": 25071824, "step": 28050 }, { "epoch": 7.404381681404249, "grad_norm": 0.3897400200366974, "learning_rate": 1.0219722046106178e-05, "loss": 0.0753, "num_input_tokens_seen": 25076336, "step": 28055 }, { "epoch": 7.405701464959747, "grad_norm": 0.16458198428153992, "learning_rate": 1.0211805225397486e-05, "loss": 0.0515, "num_input_tokens_seen": 25080816, "step": 28060 }, { "epoch": 7.407021248515243, "grad_norm": 0.7404221892356873, "learning_rate": 1.020389068521426e-05, "loss": 0.0993, "num_input_tokens_seen": 25085200, "step": 28065 }, { "epoch": 7.408341032070741, "grad_norm": 0.21583284437656403, "learning_rate": 1.0195978426777039e-05, "loss": 0.0269, "num_input_tokens_seen": 25089232, "step": 28070 }, { "epoch": 7.409660815626237, "grad_norm": 0.2122388333082199, "learning_rate": 1.0188068451305982e-05, "loss": 0.0725, "num_input_tokens_seen": 25093552, "step": 28075 }, { "epoch": 7.410980599181734, "grad_norm": 0.07161913067102432, "learning_rate": 1.0180160760020902e-05, "loss": 0.0529, "num_input_tokens_seen": 25097968, "step": 28080 }, { "epoch": 7.412300382737231, "grad_norm": 0.3532697558403015, "learning_rate": 1.0172255354141278e-05, "loss": 0.0821, "num_input_tokens_seen": 25101808, "step": 28085 }, { "epoch": 7.413620166292728, "grad_norm": 0.17360340058803558, "learning_rate": 1.0164352234886205e-05, "loss": 0.0713, "num_input_tokens_seen": 25106320, "step": 28090 }, { "epoch": 7.414939949848225, "grad_norm": 0.13384853303432465, "learning_rate": 1.0156451403474454e-05, "loss": 0.0576, "num_input_tokens_seen": 25110800, "step": 28095 }, { "epoch": 7.416259733403722, "grad_norm": 0.36028075218200684, "learning_rate": 1.0148552861124443e-05, "loss": 0.0592, "num_input_tokens_seen": 25115376, "step": 28100 }, { "epoch": 7.4175795169592185, "grad_norm": 0.24620217084884644, "learning_rate": 1.0140656609054205e-05, "loss": 0.0506, "num_input_tokens_seen": 25119952, "step": 28105 }, { "epoch": 7.418899300514716, "grad_norm": 0.34116989374160767, "learning_rate": 1.0132762648481455e-05, "loss": 0.0643, "num_input_tokens_seen": 25124496, "step": 28110 }, { "epoch": 7.4202190840702125, "grad_norm": 0.05062805116176605, "learning_rate": 1.0124870980623543e-05, "loss": 0.1242, "num_input_tokens_seen": 25128816, "step": 28115 }, { "epoch": 7.421538867625709, "grad_norm": 0.05739118531346321, "learning_rate": 1.0116981606697453e-05, "loss": 0.0311, "num_input_tokens_seen": 25133328, "step": 28120 }, { "epoch": 7.4228586511812065, "grad_norm": 0.9214297533035278, "learning_rate": 1.0109094527919838e-05, "loss": 0.0675, "num_input_tokens_seen": 25137680, "step": 28125 }, { "epoch": 7.424178434736703, "grad_norm": 0.2304917573928833, "learning_rate": 1.010120974550697e-05, "loss": 0.0293, "num_input_tokens_seen": 25141872, "step": 28130 }, { "epoch": 7.4254982182922005, "grad_norm": 0.1777554601430893, "learning_rate": 1.0093327260674795e-05, "loss": 0.0195, "num_input_tokens_seen": 25146192, "step": 28135 }, { "epoch": 7.426818001847697, "grad_norm": 0.28297916054725647, "learning_rate": 1.0085447074638878e-05, "loss": 0.0886, "num_input_tokens_seen": 25150992, "step": 28140 }, { "epoch": 7.428137785403194, "grad_norm": 0.2068546712398529, "learning_rate": 1.0077569188614461e-05, "loss": 0.0139, "num_input_tokens_seen": 25155696, "step": 28145 }, { "epoch": 7.429457568958691, "grad_norm": 0.13197863101959229, "learning_rate": 1.0069693603816393e-05, "loss": 0.0708, "num_input_tokens_seen": 25160432, "step": 28150 }, { "epoch": 7.430777352514188, "grad_norm": 0.5150306224822998, "learning_rate": 1.0061820321459204e-05, "loss": 0.0648, "num_input_tokens_seen": 25165040, "step": 28155 }, { "epoch": 7.432097136069684, "grad_norm": 0.15374921262264252, "learning_rate": 1.0053949342757038e-05, "loss": 0.0648, "num_input_tokens_seen": 25169584, "step": 28160 }, { "epoch": 7.433416919625182, "grad_norm": 0.15092085301876068, "learning_rate": 1.0046080668923717e-05, "loss": 0.0471, "num_input_tokens_seen": 25174064, "step": 28165 }, { "epoch": 7.434736703180678, "grad_norm": 0.05484118312597275, "learning_rate": 1.003821430117267e-05, "loss": 0.0925, "num_input_tokens_seen": 25178672, "step": 28170 }, { "epoch": 7.436056486736176, "grad_norm": 0.4082276225090027, "learning_rate": 1.0030350240716999e-05, "loss": 0.0579, "num_input_tokens_seen": 25182992, "step": 28175 }, { "epoch": 7.437376270291672, "grad_norm": 0.2505885660648346, "learning_rate": 1.0022488488769449e-05, "loss": 0.1014, "num_input_tokens_seen": 25187664, "step": 28180 }, { "epoch": 7.438696053847169, "grad_norm": 0.23509156703948975, "learning_rate": 1.0014629046542387e-05, "loss": 0.0689, "num_input_tokens_seen": 25192304, "step": 28185 }, { "epoch": 7.440015837402666, "grad_norm": 0.04883621633052826, "learning_rate": 1.0006771915247842e-05, "loss": 0.1295, "num_input_tokens_seen": 25196880, "step": 28190 }, { "epoch": 7.441335620958163, "grad_norm": 0.19336022436618805, "learning_rate": 9.998917096097495e-06, "loss": 0.0524, "num_input_tokens_seen": 25201296, "step": 28195 }, { "epoch": 7.44265540451366, "grad_norm": 0.23239991068840027, "learning_rate": 9.991064590302638e-06, "loss": 0.0395, "num_input_tokens_seen": 25205648, "step": 28200 }, { "epoch": 7.44265540451366, "eval_loss": 0.06724493950605392, "eval_runtime": 64.7838, "eval_samples_per_second": 103.961, "eval_steps_per_second": 25.994, "num_input_tokens_seen": 25205648, "step": 28200 }, { "epoch": 7.443975188069157, "grad_norm": 0.1710798293352127, "learning_rate": 9.983214399074241e-06, "loss": 0.0303, "num_input_tokens_seen": 25209968, "step": 28205 }, { "epoch": 7.445294971624653, "grad_norm": 0.4014093279838562, "learning_rate": 9.975366523622893e-06, "loss": 0.1064, "num_input_tokens_seen": 25214992, "step": 28210 }, { "epoch": 7.446614755180151, "grad_norm": 0.2548074722290039, "learning_rate": 9.967520965158841e-06, "loss": 0.0506, "num_input_tokens_seen": 25219312, "step": 28215 }, { "epoch": 7.447934538735647, "grad_norm": 0.17595715820789337, "learning_rate": 9.95967772489197e-06, "loss": 0.0481, "num_input_tokens_seen": 25223888, "step": 28220 }, { "epoch": 7.449254322291145, "grad_norm": 0.19337214529514313, "learning_rate": 9.951836804031794e-06, "loss": 0.0455, "num_input_tokens_seen": 25228208, "step": 28225 }, { "epoch": 7.450574105846641, "grad_norm": 0.20132915675640106, "learning_rate": 9.943998203787489e-06, "loss": 0.1062, "num_input_tokens_seen": 25232912, "step": 28230 }, { "epoch": 7.451893889402138, "grad_norm": 0.19663147628307343, "learning_rate": 9.936161925367874e-06, "loss": 0.0517, "num_input_tokens_seen": 25237488, "step": 28235 }, { "epoch": 7.453213672957635, "grad_norm": 0.05027906969189644, "learning_rate": 9.928327969981386e-06, "loss": 0.05, "num_input_tokens_seen": 25241840, "step": 28240 }, { "epoch": 7.454533456513132, "grad_norm": 0.036288391798734665, "learning_rate": 9.920496338836135e-06, "loss": 0.0708, "num_input_tokens_seen": 25246384, "step": 28245 }, { "epoch": 7.455853240068628, "grad_norm": 0.29450228810310364, "learning_rate": 9.912667033139844e-06, "loss": 0.0427, "num_input_tokens_seen": 25250928, "step": 28250 }, { "epoch": 7.457173023624126, "grad_norm": 0.2381192296743393, "learning_rate": 9.904840054099893e-06, "loss": 0.0492, "num_input_tokens_seen": 25255248, "step": 28255 }, { "epoch": 7.458492807179622, "grad_norm": 0.19136130809783936, "learning_rate": 9.897015402923312e-06, "loss": 0.0557, "num_input_tokens_seen": 25259280, "step": 28260 }, { "epoch": 7.45981259073512, "grad_norm": 0.2605243921279907, "learning_rate": 9.889193080816744e-06, "loss": 0.065, "num_input_tokens_seen": 25263728, "step": 28265 }, { "epoch": 7.461132374290616, "grad_norm": 0.4398597776889801, "learning_rate": 9.881373088986498e-06, "loss": 0.0522, "num_input_tokens_seen": 25268208, "step": 28270 }, { "epoch": 7.462452157846113, "grad_norm": 0.0907437726855278, "learning_rate": 9.873555428638523e-06, "loss": 0.0545, "num_input_tokens_seen": 25272816, "step": 28275 }, { "epoch": 7.46377194140161, "grad_norm": 0.1638597697019577, "learning_rate": 9.865740100978383e-06, "loss": 0.1038, "num_input_tokens_seen": 25277392, "step": 28280 }, { "epoch": 7.465091724957107, "grad_norm": 0.3589237928390503, "learning_rate": 9.857927107211315e-06, "loss": 0.0287, "num_input_tokens_seen": 25281648, "step": 28285 }, { "epoch": 7.4664115085126035, "grad_norm": 0.43596720695495605, "learning_rate": 9.850116448542177e-06, "loss": 0.0853, "num_input_tokens_seen": 25286064, "step": 28290 }, { "epoch": 7.467731292068101, "grad_norm": 0.18783985078334808, "learning_rate": 9.842308126175457e-06, "loss": 0.0349, "num_input_tokens_seen": 25290384, "step": 28295 }, { "epoch": 7.4690510756235975, "grad_norm": 0.18272094428539276, "learning_rate": 9.834502141315315e-06, "loss": 0.0515, "num_input_tokens_seen": 25294736, "step": 28300 }, { "epoch": 7.470370859179095, "grad_norm": 0.06580014526844025, "learning_rate": 9.82669849516552e-06, "loss": 0.0217, "num_input_tokens_seen": 25299280, "step": 28305 }, { "epoch": 7.4716906427345915, "grad_norm": 0.07384705543518066, "learning_rate": 9.818897188929493e-06, "loss": 0.0576, "num_input_tokens_seen": 25303728, "step": 28310 }, { "epoch": 7.473010426290088, "grad_norm": 0.2425065040588379, "learning_rate": 9.811098223810309e-06, "loss": 0.0571, "num_input_tokens_seen": 25308112, "step": 28315 }, { "epoch": 7.4743302098455855, "grad_norm": 0.14675264060497284, "learning_rate": 9.803301601010641e-06, "loss": 0.0592, "num_input_tokens_seen": 25312656, "step": 28320 }, { "epoch": 7.475649993401082, "grad_norm": 0.20976220071315765, "learning_rate": 9.795507321732853e-06, "loss": 0.0526, "num_input_tokens_seen": 25317008, "step": 28325 }, { "epoch": 7.4769697769565795, "grad_norm": 0.2456665337085724, "learning_rate": 9.787715387178898e-06, "loss": 0.0381, "num_input_tokens_seen": 25321520, "step": 28330 }, { "epoch": 7.478289560512076, "grad_norm": 0.15069472789764404, "learning_rate": 9.779925798550399e-06, "loss": 0.0789, "num_input_tokens_seen": 25326032, "step": 28335 }, { "epoch": 7.479609344067573, "grad_norm": 0.26236090064048767, "learning_rate": 9.772138557048619e-06, "loss": 0.075, "num_input_tokens_seen": 25330320, "step": 28340 }, { "epoch": 7.48092912762307, "grad_norm": 0.13382378220558167, "learning_rate": 9.764353663874426e-06, "loss": 0.0592, "num_input_tokens_seen": 25334704, "step": 28345 }, { "epoch": 7.482248911178567, "grad_norm": 0.12811772525310516, "learning_rate": 9.756571120228375e-06, "loss": 0.034, "num_input_tokens_seen": 25339088, "step": 28350 }, { "epoch": 7.483568694734064, "grad_norm": 0.1899196207523346, "learning_rate": 9.748790927310605e-06, "loss": 0.0544, "num_input_tokens_seen": 25343728, "step": 28355 }, { "epoch": 7.484888478289561, "grad_norm": 0.7166891694068909, "learning_rate": 9.741013086320946e-06, "loss": 0.0926, "num_input_tokens_seen": 25348304, "step": 28360 }, { "epoch": 7.486208261845057, "grad_norm": 0.05646941438317299, "learning_rate": 9.733237598458821e-06, "loss": 0.0129, "num_input_tokens_seen": 25352720, "step": 28365 }, { "epoch": 7.487528045400555, "grad_norm": 0.07555802166461945, "learning_rate": 9.725464464923308e-06, "loss": 0.026, "num_input_tokens_seen": 25357232, "step": 28370 }, { "epoch": 7.488847828956051, "grad_norm": 0.14126692712306976, "learning_rate": 9.717693686913123e-06, "loss": 0.0162, "num_input_tokens_seen": 25361872, "step": 28375 }, { "epoch": 7.490167612511548, "grad_norm": 0.269870787858963, "learning_rate": 9.709925265626632e-06, "loss": 0.0754, "num_input_tokens_seen": 25366256, "step": 28380 }, { "epoch": 7.491487396067045, "grad_norm": 0.05480840057134628, "learning_rate": 9.702159202261801e-06, "loss": 0.0924, "num_input_tokens_seen": 25371056, "step": 28385 }, { "epoch": 7.492807179622542, "grad_norm": 0.27994129061698914, "learning_rate": 9.694395498016268e-06, "loss": 0.0468, "num_input_tokens_seen": 25375696, "step": 28390 }, { "epoch": 7.494126963178039, "grad_norm": 0.10177845507860184, "learning_rate": 9.686634154087298e-06, "loss": 0.0582, "num_input_tokens_seen": 25379888, "step": 28395 }, { "epoch": 7.495446746733536, "grad_norm": 0.44680944085121155, "learning_rate": 9.678875171671776e-06, "loss": 0.0467, "num_input_tokens_seen": 25384496, "step": 28400 }, { "epoch": 7.495446746733536, "eval_loss": 0.06719040870666504, "eval_runtime": 64.7755, "eval_samples_per_second": 103.975, "eval_steps_per_second": 25.997, "num_input_tokens_seen": 25384496, "step": 28400 }, { "epoch": 7.496766530289032, "grad_norm": 0.18420284986495972, "learning_rate": 9.671118551966246e-06, "loss": 0.0279, "num_input_tokens_seen": 25389200, "step": 28405 }, { "epoch": 7.49808631384453, "grad_norm": 0.09603328257799149, "learning_rate": 9.66336429616686e-06, "loss": 0.0736, "num_input_tokens_seen": 25393744, "step": 28410 }, { "epoch": 7.499406097400026, "grad_norm": 0.07899913191795349, "learning_rate": 9.655612405469436e-06, "loss": 0.031, "num_input_tokens_seen": 25398192, "step": 28415 }, { "epoch": 7.500725880955523, "grad_norm": 0.12213881313800812, "learning_rate": 9.647862881069413e-06, "loss": 0.1169, "num_input_tokens_seen": 25402960, "step": 28420 }, { "epoch": 7.50204566451102, "grad_norm": 0.3203723430633545, "learning_rate": 9.640115724161855e-06, "loss": 0.0516, "num_input_tokens_seen": 25407696, "step": 28425 }, { "epoch": 7.503365448066517, "grad_norm": 0.30350419878959656, "learning_rate": 9.632370935941483e-06, "loss": 0.1009, "num_input_tokens_seen": 25412272, "step": 28430 }, { "epoch": 7.504685231622014, "grad_norm": 0.20776693522930145, "learning_rate": 9.624628517602634e-06, "loss": 0.0332, "num_input_tokens_seen": 25416624, "step": 28435 }, { "epoch": 7.506005015177511, "grad_norm": 0.24014948308467865, "learning_rate": 9.61688847033928e-06, "loss": 0.0268, "num_input_tokens_seen": 25421200, "step": 28440 }, { "epoch": 7.507324798733007, "grad_norm": 0.07349469512701035, "learning_rate": 9.609150795345051e-06, "loss": 0.0585, "num_input_tokens_seen": 25425904, "step": 28445 }, { "epoch": 7.508644582288505, "grad_norm": 0.05070003122091293, "learning_rate": 9.601415493813171e-06, "loss": 0.0319, "num_input_tokens_seen": 25430512, "step": 28450 }, { "epoch": 7.509964365844001, "grad_norm": 0.06833989918231964, "learning_rate": 9.593682566936533e-06, "loss": 0.0255, "num_input_tokens_seen": 25435216, "step": 28455 }, { "epoch": 7.511284149399499, "grad_norm": 0.05357332527637482, "learning_rate": 9.58595201590766e-06, "loss": 0.0502, "num_input_tokens_seen": 25439856, "step": 28460 }, { "epoch": 7.512603932954995, "grad_norm": 0.12030693143606186, "learning_rate": 9.578223841918681e-06, "loss": 0.0202, "num_input_tokens_seen": 25444496, "step": 28465 }, { "epoch": 7.513923716510492, "grad_norm": 0.23244842886924744, "learning_rate": 9.570498046161389e-06, "loss": 0.0548, "num_input_tokens_seen": 25448976, "step": 28470 }, { "epoch": 7.515243500065989, "grad_norm": 0.11157065629959106, "learning_rate": 9.562774629827206e-06, "loss": 0.0439, "num_input_tokens_seen": 25453328, "step": 28475 }, { "epoch": 7.516563283621486, "grad_norm": 0.1607305109500885, "learning_rate": 9.555053594107163e-06, "loss": 0.0295, "num_input_tokens_seen": 25457712, "step": 28480 }, { "epoch": 7.517883067176983, "grad_norm": 0.08802018314599991, "learning_rate": 9.547334940191957e-06, "loss": 0.0411, "num_input_tokens_seen": 25462512, "step": 28485 }, { "epoch": 7.51920285073248, "grad_norm": 0.3966522514820099, "learning_rate": 9.539618669271886e-06, "loss": 0.1033, "num_input_tokens_seen": 25466864, "step": 28490 }, { "epoch": 7.5205226342879765, "grad_norm": 0.15384790301322937, "learning_rate": 9.531904782536904e-06, "loss": 0.0305, "num_input_tokens_seen": 25471248, "step": 28495 }, { "epoch": 7.521842417843474, "grad_norm": 0.056325383484363556, "learning_rate": 9.524193281176597e-06, "loss": 0.0997, "num_input_tokens_seen": 25475696, "step": 28500 }, { "epoch": 7.5231622013989705, "grad_norm": 0.18907099962234497, "learning_rate": 9.516484166380165e-06, "loss": 0.0446, "num_input_tokens_seen": 25480112, "step": 28505 }, { "epoch": 7.524481984954468, "grad_norm": 0.08811620622873306, "learning_rate": 9.508777439336447e-06, "loss": 0.043, "num_input_tokens_seen": 25484368, "step": 28510 }, { "epoch": 7.5258017685099645, "grad_norm": 0.0339387021958828, "learning_rate": 9.50107310123393e-06, "loss": 0.0315, "num_input_tokens_seen": 25488624, "step": 28515 }, { "epoch": 7.527121552065461, "grad_norm": 0.09460266679525375, "learning_rate": 9.493371153260702e-06, "loss": 0.0585, "num_input_tokens_seen": 25492912, "step": 28520 }, { "epoch": 7.5284413356209585, "grad_norm": 0.20555604994297028, "learning_rate": 9.485671596604523e-06, "loss": 0.0395, "num_input_tokens_seen": 25497424, "step": 28525 }, { "epoch": 7.529761119176455, "grad_norm": 0.07615350931882858, "learning_rate": 9.477974432452738e-06, "loss": 0.052, "num_input_tokens_seen": 25502064, "step": 28530 }, { "epoch": 7.531080902731952, "grad_norm": 0.08699978142976761, "learning_rate": 9.470279661992356e-06, "loss": 0.0224, "num_input_tokens_seen": 25506288, "step": 28535 }, { "epoch": 7.532400686287449, "grad_norm": 0.14378374814987183, "learning_rate": 9.462587286410021e-06, "loss": 0.0754, "num_input_tokens_seen": 25510960, "step": 28540 }, { "epoch": 7.533720469842946, "grad_norm": 0.20876257121562958, "learning_rate": 9.454897306891972e-06, "loss": 0.0441, "num_input_tokens_seen": 25515408, "step": 28545 }, { "epoch": 7.535040253398442, "grad_norm": 0.4181246757507324, "learning_rate": 9.44720972462411e-06, "loss": 0.0853, "num_input_tokens_seen": 25519696, "step": 28550 }, { "epoch": 7.53636003695394, "grad_norm": 0.05389338359236717, "learning_rate": 9.439524540791964e-06, "loss": 0.0575, "num_input_tokens_seen": 25524144, "step": 28555 }, { "epoch": 7.537679820509436, "grad_norm": 0.5026947259902954, "learning_rate": 9.431841756580673e-06, "loss": 0.0588, "num_input_tokens_seen": 25528560, "step": 28560 }, { "epoch": 7.538999604064934, "grad_norm": 0.34509214758872986, "learning_rate": 9.42416137317503e-06, "loss": 0.091, "num_input_tokens_seen": 25532912, "step": 28565 }, { "epoch": 7.54031938762043, "grad_norm": 0.30521687865257263, "learning_rate": 9.416483391759437e-06, "loss": 0.0618, "num_input_tokens_seen": 25537296, "step": 28570 }, { "epoch": 7.541639171175927, "grad_norm": 0.16312257945537567, "learning_rate": 9.408807813517945e-06, "loss": 0.0295, "num_input_tokens_seen": 25541424, "step": 28575 }, { "epoch": 7.542958954731424, "grad_norm": 0.2657415568828583, "learning_rate": 9.401134639634221e-06, "loss": 0.0453, "num_input_tokens_seen": 25545936, "step": 28580 }, { "epoch": 7.544278738286921, "grad_norm": 0.1762724369764328, "learning_rate": 9.393463871291555e-06, "loss": 0.0468, "num_input_tokens_seen": 25550448, "step": 28585 }, { "epoch": 7.545598521842418, "grad_norm": 0.05827685818076134, "learning_rate": 9.385795509672881e-06, "loss": 0.0714, "num_input_tokens_seen": 25554640, "step": 28590 }, { "epoch": 7.546918305397915, "grad_norm": 0.12334321439266205, "learning_rate": 9.378129555960771e-06, "loss": 0.0349, "num_input_tokens_seen": 25559248, "step": 28595 }, { "epoch": 7.548238088953411, "grad_norm": 0.06427537649869919, "learning_rate": 9.370466011337392e-06, "loss": 0.0817, "num_input_tokens_seen": 25563856, "step": 28600 }, { "epoch": 7.548238088953411, "eval_loss": 0.06737852096557617, "eval_runtime": 64.7733, "eval_samples_per_second": 103.978, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 25563856, "step": 28600 }, { "epoch": 7.549557872508909, "grad_norm": 0.12501057982444763, "learning_rate": 9.362804876984573e-06, "loss": 0.044, "num_input_tokens_seen": 25568464, "step": 28605 }, { "epoch": 7.550877656064405, "grad_norm": 0.5499928593635559, "learning_rate": 9.355146154083747e-06, "loss": 0.0488, "num_input_tokens_seen": 25572784, "step": 28610 }, { "epoch": 7.552197439619903, "grad_norm": 0.22544871270656586, "learning_rate": 9.347489843815987e-06, "loss": 0.0492, "num_input_tokens_seen": 25577168, "step": 28615 }, { "epoch": 7.553517223175399, "grad_norm": 0.2568769156932831, "learning_rate": 9.339835947362002e-06, "loss": 0.0713, "num_input_tokens_seen": 25581680, "step": 28620 }, { "epoch": 7.554837006730896, "grad_norm": 0.28132879734039307, "learning_rate": 9.332184465902105e-06, "loss": 0.0458, "num_input_tokens_seen": 25586384, "step": 28625 }, { "epoch": 7.556156790286393, "grad_norm": 0.11143406480550766, "learning_rate": 9.324535400616266e-06, "loss": 0.0603, "num_input_tokens_seen": 25590928, "step": 28630 }, { "epoch": 7.55747657384189, "grad_norm": 0.3229554295539856, "learning_rate": 9.31688875268405e-06, "loss": 0.0357, "num_input_tokens_seen": 25595600, "step": 28635 }, { "epoch": 7.558796357397387, "grad_norm": 0.14877256751060486, "learning_rate": 9.309244523284674e-06, "loss": 0.0649, "num_input_tokens_seen": 25599920, "step": 28640 }, { "epoch": 7.560116140952884, "grad_norm": 0.1517946869134903, "learning_rate": 9.301602713596982e-06, "loss": 0.0166, "num_input_tokens_seen": 25604304, "step": 28645 }, { "epoch": 7.56143592450838, "grad_norm": 0.07871007174253464, "learning_rate": 9.293963324799432e-06, "loss": 0.0343, "num_input_tokens_seen": 25608912, "step": 28650 }, { "epoch": 7.562755708063878, "grad_norm": 0.11559867858886719, "learning_rate": 9.286326358070104e-06, "loss": 0.034, "num_input_tokens_seen": 25613360, "step": 28655 }, { "epoch": 7.564075491619374, "grad_norm": 0.262851357460022, "learning_rate": 9.278691814586729e-06, "loss": 0.0743, "num_input_tokens_seen": 25617552, "step": 28660 }, { "epoch": 7.565395275174871, "grad_norm": 0.21714268624782562, "learning_rate": 9.271059695526635e-06, "loss": 0.0658, "num_input_tokens_seen": 25621840, "step": 28665 }, { "epoch": 7.566715058730368, "grad_norm": 0.24854934215545654, "learning_rate": 9.263430002066805e-06, "loss": 0.1493, "num_input_tokens_seen": 25626288, "step": 28670 }, { "epoch": 7.568034842285865, "grad_norm": 0.30293935537338257, "learning_rate": 9.25580273538382e-06, "loss": 0.0663, "num_input_tokens_seen": 25630960, "step": 28675 }, { "epoch": 7.5693546258413615, "grad_norm": 0.37781822681427, "learning_rate": 9.248177896653907e-06, "loss": 0.0591, "num_input_tokens_seen": 25635632, "step": 28680 }, { "epoch": 7.570674409396859, "grad_norm": 0.3661993741989136, "learning_rate": 9.240555487052918e-06, "loss": 0.0207, "num_input_tokens_seen": 25640336, "step": 28685 }, { "epoch": 7.5719941929523555, "grad_norm": 0.2354733943939209, "learning_rate": 9.232935507756313e-06, "loss": 0.0524, "num_input_tokens_seen": 25644912, "step": 28690 }, { "epoch": 7.573313976507853, "grad_norm": 0.2871819734573364, "learning_rate": 9.225317959939193e-06, "loss": 0.0704, "num_input_tokens_seen": 25649296, "step": 28695 }, { "epoch": 7.5746337600633495, "grad_norm": 0.0757763460278511, "learning_rate": 9.217702844776287e-06, "loss": 0.066, "num_input_tokens_seen": 25653552, "step": 28700 }, { "epoch": 7.575953543618846, "grad_norm": 0.16725897789001465, "learning_rate": 9.210090163441929e-06, "loss": 0.0361, "num_input_tokens_seen": 25657808, "step": 28705 }, { "epoch": 7.5772733271743435, "grad_norm": 0.08254332840442657, "learning_rate": 9.202479917110105e-06, "loss": 0.0787, "num_input_tokens_seen": 25662128, "step": 28710 }, { "epoch": 7.57859311072984, "grad_norm": 0.06442095339298248, "learning_rate": 9.194872106954392e-06, "loss": 0.0762, "num_input_tokens_seen": 25666960, "step": 28715 }, { "epoch": 7.5799128942853375, "grad_norm": 0.5059494376182556, "learning_rate": 9.187266734148029e-06, "loss": 0.1057, "num_input_tokens_seen": 25671632, "step": 28720 }, { "epoch": 7.581232677840834, "grad_norm": 0.32733625173568726, "learning_rate": 9.179663799863849e-06, "loss": 0.0536, "num_input_tokens_seen": 25676016, "step": 28725 }, { "epoch": 7.582552461396331, "grad_norm": 0.33237534761428833, "learning_rate": 9.172063305274317e-06, "loss": 0.0835, "num_input_tokens_seen": 25680752, "step": 28730 }, { "epoch": 7.583872244951828, "grad_norm": 0.5302887558937073, "learning_rate": 9.164465251551527e-06, "loss": 0.183, "num_input_tokens_seen": 25685232, "step": 28735 }, { "epoch": 7.585192028507325, "grad_norm": 0.07520095258951187, "learning_rate": 9.156869639867205e-06, "loss": 0.0432, "num_input_tokens_seen": 25689552, "step": 28740 }, { "epoch": 7.586511812062822, "grad_norm": 0.029999621212482452, "learning_rate": 9.149276471392677e-06, "loss": 0.0412, "num_input_tokens_seen": 25694224, "step": 28745 }, { "epoch": 7.587831595618319, "grad_norm": 0.2688811123371124, "learning_rate": 9.141685747298914e-06, "loss": 0.0778, "num_input_tokens_seen": 25699056, "step": 28750 }, { "epoch": 7.589151379173815, "grad_norm": 0.20439714193344116, "learning_rate": 9.13409746875649e-06, "loss": 0.0251, "num_input_tokens_seen": 25703312, "step": 28755 }, { "epoch": 7.590471162729313, "grad_norm": 0.045238979160785675, "learning_rate": 9.12651163693562e-06, "loss": 0.0348, "num_input_tokens_seen": 25707824, "step": 28760 }, { "epoch": 7.591790946284809, "grad_norm": 0.5429162383079529, "learning_rate": 9.11892825300614e-06, "loss": 0.0979, "num_input_tokens_seen": 25712528, "step": 28765 }, { "epoch": 7.593110729840307, "grad_norm": 0.21179956197738647, "learning_rate": 9.111347318137491e-06, "loss": 0.0966, "num_input_tokens_seen": 25717328, "step": 28770 }, { "epoch": 7.594430513395803, "grad_norm": 0.03599433973431587, "learning_rate": 9.103768833498755e-06, "loss": 0.0695, "num_input_tokens_seen": 25721424, "step": 28775 }, { "epoch": 7.5957502969513, "grad_norm": 0.0834731012582779, "learning_rate": 9.096192800258639e-06, "loss": 0.0437, "num_input_tokens_seen": 25725808, "step": 28780 }, { "epoch": 7.597070080506797, "grad_norm": 0.1773025542497635, "learning_rate": 9.088619219585443e-06, "loss": 0.0428, "num_input_tokens_seen": 25730192, "step": 28785 }, { "epoch": 7.598389864062294, "grad_norm": 0.3067278265953064, "learning_rate": 9.081048092647127e-06, "loss": 0.0459, "num_input_tokens_seen": 25734512, "step": 28790 }, { "epoch": 7.59970964761779, "grad_norm": 0.1530575156211853, "learning_rate": 9.073479420611245e-06, "loss": 0.0679, "num_input_tokens_seen": 25738864, "step": 28795 }, { "epoch": 7.601029431173288, "grad_norm": 0.20797239243984222, "learning_rate": 9.065913204644974e-06, "loss": 0.0346, "num_input_tokens_seen": 25743536, "step": 28800 }, { "epoch": 7.601029431173288, "eval_loss": 0.06708266586065292, "eval_runtime": 64.8109, "eval_samples_per_second": 103.918, "eval_steps_per_second": 25.983, "num_input_tokens_seen": 25743536, "step": 28800 }, { "epoch": 7.602349214728784, "grad_norm": 0.20771774649620056, "learning_rate": 9.058349445915135e-06, "loss": 0.0457, "num_input_tokens_seen": 25747984, "step": 28805 }, { "epoch": 7.603668998284281, "grad_norm": 0.14603017270565033, "learning_rate": 9.050788145588138e-06, "loss": 0.0607, "num_input_tokens_seen": 25752400, "step": 28810 }, { "epoch": 7.604988781839778, "grad_norm": 0.18702679872512817, "learning_rate": 9.043229304830039e-06, "loss": 0.039, "num_input_tokens_seen": 25756976, "step": 28815 }, { "epoch": 7.606308565395275, "grad_norm": 0.46001186966896057, "learning_rate": 9.035672924806515e-06, "loss": 0.1038, "num_input_tokens_seen": 25761040, "step": 28820 }, { "epoch": 7.607628348950772, "grad_norm": 0.24940703809261322, "learning_rate": 9.028119006682839e-06, "loss": 0.0602, "num_input_tokens_seen": 25765712, "step": 28825 }, { "epoch": 7.608948132506269, "grad_norm": 0.33831915259361267, "learning_rate": 9.020567551623935e-06, "loss": 0.0244, "num_input_tokens_seen": 25770352, "step": 28830 }, { "epoch": 7.6102679160617654, "grad_norm": 0.14558133482933044, "learning_rate": 9.013018560794318e-06, "loss": 0.0321, "num_input_tokens_seen": 25774864, "step": 28835 }, { "epoch": 7.611587699617263, "grad_norm": 0.0739254280924797, "learning_rate": 9.005472035358139e-06, "loss": 0.0175, "num_input_tokens_seen": 25779408, "step": 28840 }, { "epoch": 7.6129074831727594, "grad_norm": 0.4416583180427551, "learning_rate": 8.997927976479185e-06, "loss": 0.0717, "num_input_tokens_seen": 25783632, "step": 28845 }, { "epoch": 7.614227266728257, "grad_norm": 0.3231644034385681, "learning_rate": 8.99038638532082e-06, "loss": 0.0565, "num_input_tokens_seen": 25788368, "step": 28850 }, { "epoch": 7.6155470502837534, "grad_norm": 0.053276024758815765, "learning_rate": 8.982847263046065e-06, "loss": 0.0483, "num_input_tokens_seen": 25793104, "step": 28855 }, { "epoch": 7.61686683383925, "grad_norm": 0.23859666287899017, "learning_rate": 8.975310610817555e-06, "loss": 0.0449, "num_input_tokens_seen": 25797296, "step": 28860 }, { "epoch": 7.6181866173947475, "grad_norm": 0.07422548532485962, "learning_rate": 8.967776429797528e-06, "loss": 0.0756, "num_input_tokens_seen": 25801488, "step": 28865 }, { "epoch": 7.619506400950244, "grad_norm": 0.06326866149902344, "learning_rate": 8.960244721147842e-06, "loss": 0.0353, "num_input_tokens_seen": 25805776, "step": 28870 }, { "epoch": 7.6208261845057415, "grad_norm": 0.1297316998243332, "learning_rate": 8.952715486029995e-06, "loss": 0.0215, "num_input_tokens_seen": 25810000, "step": 28875 }, { "epoch": 7.622145968061238, "grad_norm": 0.03616291657090187, "learning_rate": 8.945188725605075e-06, "loss": 0.0784, "num_input_tokens_seen": 25814512, "step": 28880 }, { "epoch": 7.623465751616735, "grad_norm": 0.10423058271408081, "learning_rate": 8.937664441033817e-06, "loss": 0.0249, "num_input_tokens_seen": 25818640, "step": 28885 }, { "epoch": 7.624785535172232, "grad_norm": 0.024196263402700424, "learning_rate": 8.930142633476549e-06, "loss": 0.0465, "num_input_tokens_seen": 25822736, "step": 28890 }, { "epoch": 7.626105318727729, "grad_norm": 0.26560425758361816, "learning_rate": 8.92262330409323e-06, "loss": 0.0853, "num_input_tokens_seen": 25827024, "step": 28895 }, { "epoch": 7.627425102283226, "grad_norm": 0.14553707838058472, "learning_rate": 8.915106454043448e-06, "loss": 0.0271, "num_input_tokens_seen": 25831536, "step": 28900 }, { "epoch": 7.628744885838723, "grad_norm": 0.25742220878601074, "learning_rate": 8.90759208448638e-06, "loss": 0.0823, "num_input_tokens_seen": 25836144, "step": 28905 }, { "epoch": 7.630064669394219, "grad_norm": 0.23736485838890076, "learning_rate": 8.900080196580848e-06, "loss": 0.0552, "num_input_tokens_seen": 25840496, "step": 28910 }, { "epoch": 7.631384452949717, "grad_norm": 0.16533923149108887, "learning_rate": 8.892570791485267e-06, "loss": 0.0572, "num_input_tokens_seen": 25845072, "step": 28915 }, { "epoch": 7.632704236505213, "grad_norm": 0.18870149552822113, "learning_rate": 8.885063870357688e-06, "loss": 0.0344, "num_input_tokens_seen": 25849392, "step": 28920 }, { "epoch": 7.63402402006071, "grad_norm": 0.2535285949707031, "learning_rate": 8.87755943435578e-06, "loss": 0.045, "num_input_tokens_seen": 25854064, "step": 28925 }, { "epoch": 7.635343803616207, "grad_norm": 0.05172240361571312, "learning_rate": 8.87005748463681e-06, "loss": 0.0509, "num_input_tokens_seen": 25858640, "step": 28930 }, { "epoch": 7.636663587171704, "grad_norm": 0.1622723489999771, "learning_rate": 8.862558022357681e-06, "loss": 0.0698, "num_input_tokens_seen": 25862800, "step": 28935 }, { "epoch": 7.637983370727201, "grad_norm": 0.06722614169120789, "learning_rate": 8.855061048674903e-06, "loss": 0.024, "num_input_tokens_seen": 25866992, "step": 28940 }, { "epoch": 7.639303154282698, "grad_norm": 0.5979558229446411, "learning_rate": 8.847566564744595e-06, "loss": 0.0502, "num_input_tokens_seen": 25871600, "step": 28945 }, { "epoch": 7.640622937838194, "grad_norm": 0.10905921459197998, "learning_rate": 8.840074571722512e-06, "loss": 0.1028, "num_input_tokens_seen": 25876240, "step": 28950 }, { "epoch": 7.641942721393692, "grad_norm": 0.3263806700706482, "learning_rate": 8.832585070764002e-06, "loss": 0.058, "num_input_tokens_seen": 25880912, "step": 28955 }, { "epoch": 7.643262504949188, "grad_norm": 0.4214063584804535, "learning_rate": 8.825098063024045e-06, "loss": 0.0343, "num_input_tokens_seen": 25885328, "step": 28960 }, { "epoch": 7.644582288504685, "grad_norm": 0.1956067681312561, "learning_rate": 8.817613549657244e-06, "loss": 0.0391, "num_input_tokens_seen": 25889744, "step": 28965 }, { "epoch": 7.645902072060182, "grad_norm": 0.19761741161346436, "learning_rate": 8.810131531817783e-06, "loss": 0.0548, "num_input_tokens_seen": 25894352, "step": 28970 }, { "epoch": 7.647221855615679, "grad_norm": 0.48980140686035156, "learning_rate": 8.802652010659496e-06, "loss": 0.0541, "num_input_tokens_seen": 25898480, "step": 28975 }, { "epoch": 7.648541639171176, "grad_norm": 0.3829546272754669, "learning_rate": 8.795174987335827e-06, "loss": 0.1288, "num_input_tokens_seen": 25902864, "step": 28980 }, { "epoch": 7.649861422726673, "grad_norm": 0.23275887966156006, "learning_rate": 8.787700462999807e-06, "loss": 0.063, "num_input_tokens_seen": 25907792, "step": 28985 }, { "epoch": 7.651181206282169, "grad_norm": 0.25544312596321106, "learning_rate": 8.780228438804122e-06, "loss": 0.0397, "num_input_tokens_seen": 25912432, "step": 28990 }, { "epoch": 7.652500989837667, "grad_norm": 0.2271474301815033, "learning_rate": 8.772758915901032e-06, "loss": 0.0665, "num_input_tokens_seen": 25917136, "step": 28995 }, { "epoch": 7.653820773393163, "grad_norm": 0.22674846649169922, "learning_rate": 8.765291895442443e-06, "loss": 0.0321, "num_input_tokens_seen": 25921616, "step": 29000 }, { "epoch": 7.653820773393163, "eval_loss": 0.06706395745277405, "eval_runtime": 64.8256, "eval_samples_per_second": 103.894, "eval_steps_per_second": 25.977, "num_input_tokens_seen": 25921616, "step": 29000 }, { "epoch": 7.655140556948661, "grad_norm": 0.15786947309970856, "learning_rate": 8.75782737857987e-06, "loss": 0.0828, "num_input_tokens_seen": 25926160, "step": 29005 }, { "epoch": 7.656460340504157, "grad_norm": 0.32487648725509644, "learning_rate": 8.750365366464425e-06, "loss": 0.1307, "num_input_tokens_seen": 25930736, "step": 29010 }, { "epoch": 7.657780124059654, "grad_norm": 0.12603232264518738, "learning_rate": 8.742905860246838e-06, "loss": 0.0688, "num_input_tokens_seen": 25935184, "step": 29015 }, { "epoch": 7.659099907615151, "grad_norm": 0.3401598036289215, "learning_rate": 8.735448861077478e-06, "loss": 0.1034, "num_input_tokens_seen": 25939408, "step": 29020 }, { "epoch": 7.660419691170648, "grad_norm": 0.03564178943634033, "learning_rate": 8.727994370106288e-06, "loss": 0.0389, "num_input_tokens_seen": 25943760, "step": 29025 }, { "epoch": 7.661739474726145, "grad_norm": 0.12660466134548187, "learning_rate": 8.720542388482861e-06, "loss": 0.0298, "num_input_tokens_seen": 25948304, "step": 29030 }, { "epoch": 7.663059258281642, "grad_norm": 0.12257567793130875, "learning_rate": 8.71309291735637e-06, "loss": 0.0539, "num_input_tokens_seen": 25952624, "step": 29035 }, { "epoch": 7.6643790418371385, "grad_norm": 0.09583305567502975, "learning_rate": 8.705645957875621e-06, "loss": 0.0418, "num_input_tokens_seen": 25957072, "step": 29040 }, { "epoch": 7.665698825392636, "grad_norm": 0.0794573649764061, "learning_rate": 8.698201511189048e-06, "loss": 0.0414, "num_input_tokens_seen": 25961712, "step": 29045 }, { "epoch": 7.6670186089481325, "grad_norm": 0.045155834406614304, "learning_rate": 8.690759578444649e-06, "loss": 0.0336, "num_input_tokens_seen": 25966288, "step": 29050 }, { "epoch": 7.668338392503629, "grad_norm": 0.207078754901886, "learning_rate": 8.68332016079008e-06, "loss": 0.044, "num_input_tokens_seen": 25970704, "step": 29055 }, { "epoch": 7.6696581760591265, "grad_norm": 0.2654801309108734, "learning_rate": 8.6758832593726e-06, "loss": 0.0818, "num_input_tokens_seen": 25975440, "step": 29060 }, { "epoch": 7.670977959614623, "grad_norm": 0.06456955522298813, "learning_rate": 8.668448875339053e-06, "loss": 0.0588, "num_input_tokens_seen": 25980016, "step": 29065 }, { "epoch": 7.6722977431701205, "grad_norm": 0.08655328303575516, "learning_rate": 8.661017009835933e-06, "loss": 0.0741, "num_input_tokens_seen": 25984624, "step": 29070 }, { "epoch": 7.673617526725617, "grad_norm": 0.16649572551250458, "learning_rate": 8.653587664009311e-06, "loss": 0.0315, "num_input_tokens_seen": 25989168, "step": 29075 }, { "epoch": 7.674937310281114, "grad_norm": 0.16485533118247986, "learning_rate": 8.646160839004902e-06, "loss": 0.0487, "num_input_tokens_seen": 25993552, "step": 29080 }, { "epoch": 7.676257093836611, "grad_norm": 0.4182368218898773, "learning_rate": 8.638736535967998e-06, "loss": 0.0618, "num_input_tokens_seen": 25998160, "step": 29085 }, { "epoch": 7.677576877392108, "grad_norm": 0.4864540696144104, "learning_rate": 8.631314756043535e-06, "loss": 0.1188, "num_input_tokens_seen": 26003056, "step": 29090 }, { "epoch": 7.678896660947604, "grad_norm": 0.3631393313407898, "learning_rate": 8.62389550037603e-06, "loss": 0.0752, "num_input_tokens_seen": 26007568, "step": 29095 }, { "epoch": 7.680216444503102, "grad_norm": 0.06107712537050247, "learning_rate": 8.616478770109646e-06, "loss": 0.0465, "num_input_tokens_seen": 26011984, "step": 29100 }, { "epoch": 7.681536228058598, "grad_norm": 0.20940859615802765, "learning_rate": 8.609064566388111e-06, "loss": 0.0452, "num_input_tokens_seen": 26016368, "step": 29105 }, { "epoch": 7.682856011614096, "grad_norm": 0.26821354031562805, "learning_rate": 8.601652890354815e-06, "loss": 0.0652, "num_input_tokens_seen": 26020848, "step": 29110 }, { "epoch": 7.684175795169592, "grad_norm": 0.17707842588424683, "learning_rate": 8.594243743152705e-06, "loss": 0.0542, "num_input_tokens_seen": 26025680, "step": 29115 }, { "epoch": 7.685495578725089, "grad_norm": 0.21193699538707733, "learning_rate": 8.58683712592438e-06, "loss": 0.0383, "num_input_tokens_seen": 26030416, "step": 29120 }, { "epoch": 7.686815362280586, "grad_norm": 0.45034798979759216, "learning_rate": 8.579433039812037e-06, "loss": 0.1112, "num_input_tokens_seen": 26034864, "step": 29125 }, { "epoch": 7.688135145836083, "grad_norm": 0.2201678305864334, "learning_rate": 8.572031485957466e-06, "loss": 0.0381, "num_input_tokens_seen": 26039408, "step": 29130 }, { "epoch": 7.68945492939158, "grad_norm": 0.16068312525749207, "learning_rate": 8.564632465502084e-06, "loss": 0.0636, "num_input_tokens_seen": 26044112, "step": 29135 }, { "epoch": 7.690774712947077, "grad_norm": 0.07296279072761536, "learning_rate": 8.557235979586928e-06, "loss": 0.0382, "num_input_tokens_seen": 26048848, "step": 29140 }, { "epoch": 7.692094496502573, "grad_norm": 0.5406801104545593, "learning_rate": 8.549842029352606e-06, "loss": 0.1161, "num_input_tokens_seen": 26053328, "step": 29145 }, { "epoch": 7.693414280058071, "grad_norm": 0.04569797217845917, "learning_rate": 8.542450615939376e-06, "loss": 0.0634, "num_input_tokens_seen": 26057488, "step": 29150 }, { "epoch": 7.694734063613567, "grad_norm": 0.08818359673023224, "learning_rate": 8.535061740487082e-06, "loss": 0.0728, "num_input_tokens_seen": 26061968, "step": 29155 }, { "epoch": 7.696053847169065, "grad_norm": 0.2057567685842514, "learning_rate": 8.527675404135168e-06, "loss": 0.0492, "num_input_tokens_seen": 26066288, "step": 29160 }, { "epoch": 7.697373630724561, "grad_norm": 0.47638198733329773, "learning_rate": 8.520291608022724e-06, "loss": 0.0322, "num_input_tokens_seen": 26070832, "step": 29165 }, { "epoch": 7.698693414280058, "grad_norm": 0.2535324692726135, "learning_rate": 8.512910353288398e-06, "loss": 0.0435, "num_input_tokens_seen": 26075440, "step": 29170 }, { "epoch": 7.700013197835555, "grad_norm": 0.36763906478881836, "learning_rate": 8.505531641070486e-06, "loss": 0.0809, "num_input_tokens_seen": 26079984, "step": 29175 }, { "epoch": 7.701332981391052, "grad_norm": 0.12091916799545288, "learning_rate": 8.498155472506885e-06, "loss": 0.0215, "num_input_tokens_seen": 26084592, "step": 29180 }, { "epoch": 7.702652764946548, "grad_norm": 0.3094196021556854, "learning_rate": 8.49078184873508e-06, "loss": 0.1353, "num_input_tokens_seen": 26089168, "step": 29185 }, { "epoch": 7.703972548502046, "grad_norm": 0.3009713888168335, "learning_rate": 8.483410770892188e-06, "loss": 0.0358, "num_input_tokens_seen": 26093808, "step": 29190 }, { "epoch": 7.705292332057542, "grad_norm": 0.28877606987953186, "learning_rate": 8.476042240114909e-06, "loss": 0.0667, "num_input_tokens_seen": 26098544, "step": 29195 }, { "epoch": 7.70661211561304, "grad_norm": 0.42427170276641846, "learning_rate": 8.468676257539568e-06, "loss": 0.091, "num_input_tokens_seen": 26103376, "step": 29200 }, { "epoch": 7.70661211561304, "eval_loss": 0.06723228842020035, "eval_runtime": 64.8353, "eval_samples_per_second": 103.879, "eval_steps_per_second": 25.974, "num_input_tokens_seen": 26103376, "step": 29200 }, { "epoch": 7.707931899168536, "grad_norm": 0.15870288014411926, "learning_rate": 8.4613128243021e-06, "loss": 0.0553, "num_input_tokens_seen": 26107696, "step": 29205 }, { "epoch": 7.709251682724033, "grad_norm": 0.10355361551046371, "learning_rate": 8.453951941538028e-06, "loss": 0.0596, "num_input_tokens_seen": 26112240, "step": 29210 }, { "epoch": 7.71057146627953, "grad_norm": 0.036306701600551605, "learning_rate": 8.446593610382495e-06, "loss": 0.084, "num_input_tokens_seen": 26116752, "step": 29215 }, { "epoch": 7.711891249835027, "grad_norm": 0.4973589777946472, "learning_rate": 8.439237831970259e-06, "loss": 0.0624, "num_input_tokens_seen": 26121072, "step": 29220 }, { "epoch": 7.7132110333905235, "grad_norm": 0.40243661403656006, "learning_rate": 8.431884607435667e-06, "loss": 0.0664, "num_input_tokens_seen": 26125840, "step": 29225 }, { "epoch": 7.714530816946021, "grad_norm": 0.14700132608413696, "learning_rate": 8.424533937912665e-06, "loss": 0.0527, "num_input_tokens_seen": 26130192, "step": 29230 }, { "epoch": 7.7158506005015175, "grad_norm": 0.195792555809021, "learning_rate": 8.41718582453484e-06, "loss": 0.0539, "num_input_tokens_seen": 26134416, "step": 29235 }, { "epoch": 7.717170384057015, "grad_norm": 0.17996478080749512, "learning_rate": 8.409840268435346e-06, "loss": 0.0468, "num_input_tokens_seen": 26138928, "step": 29240 }, { "epoch": 7.7184901676125115, "grad_norm": 0.21712270379066467, "learning_rate": 8.402497270746976e-06, "loss": 0.0442, "num_input_tokens_seen": 26143632, "step": 29245 }, { "epoch": 7.719809951168008, "grad_norm": 0.1893381029367447, "learning_rate": 8.395156832602095e-06, "loss": 0.0629, "num_input_tokens_seen": 26148336, "step": 29250 }, { "epoch": 7.7211297347235055, "grad_norm": 0.048457589000463486, "learning_rate": 8.387818955132707e-06, "loss": 0.0517, "num_input_tokens_seen": 26153104, "step": 29255 }, { "epoch": 7.722449518279002, "grad_norm": 0.058434098958969116, "learning_rate": 8.38048363947039e-06, "loss": 0.1124, "num_input_tokens_seen": 26157712, "step": 29260 }, { "epoch": 7.7237693018344995, "grad_norm": 0.09891604632139206, "learning_rate": 8.373150886746351e-06, "loss": 0.0805, "num_input_tokens_seen": 26162384, "step": 29265 }, { "epoch": 7.725089085389996, "grad_norm": 0.30577352643013, "learning_rate": 8.365820698091397e-06, "loss": 0.0487, "num_input_tokens_seen": 26166960, "step": 29270 }, { "epoch": 7.726408868945493, "grad_norm": 0.03424977511167526, "learning_rate": 8.358493074635922e-06, "loss": 0.0665, "num_input_tokens_seen": 26171600, "step": 29275 }, { "epoch": 7.72772865250099, "grad_norm": 0.04744609072804451, "learning_rate": 8.351168017509948e-06, "loss": 0.068, "num_input_tokens_seen": 26175888, "step": 29280 }, { "epoch": 7.729048436056487, "grad_norm": 0.32994920015335083, "learning_rate": 8.343845527843094e-06, "loss": 0.0524, "num_input_tokens_seen": 26180432, "step": 29285 }, { "epoch": 7.730368219611984, "grad_norm": 0.2943839430809021, "learning_rate": 8.336525606764566e-06, "loss": 0.065, "num_input_tokens_seen": 26184880, "step": 29290 }, { "epoch": 7.731688003167481, "grad_norm": 0.29377907514572144, "learning_rate": 8.329208255403204e-06, "loss": 0.0589, "num_input_tokens_seen": 26189392, "step": 29295 }, { "epoch": 7.733007786722977, "grad_norm": 0.015558062121272087, "learning_rate": 8.321893474887426e-06, "loss": 0.0326, "num_input_tokens_seen": 26193904, "step": 29300 }, { "epoch": 7.734327570278475, "grad_norm": 0.18897658586502075, "learning_rate": 8.31458126634526e-06, "loss": 0.0692, "num_input_tokens_seen": 26198576, "step": 29305 }, { "epoch": 7.735647353833971, "grad_norm": 0.14393861591815948, "learning_rate": 8.30727163090435e-06, "loss": 0.0711, "num_input_tokens_seen": 26203152, "step": 29310 }, { "epoch": 7.736967137389469, "grad_norm": 0.39964500069618225, "learning_rate": 8.29996456969192e-06, "loss": 0.0453, "num_input_tokens_seen": 26207664, "step": 29315 }, { "epoch": 7.738286920944965, "grad_norm": 0.15900760889053345, "learning_rate": 8.292660083834818e-06, "loss": 0.0333, "num_input_tokens_seen": 26211888, "step": 29320 }, { "epoch": 7.739606704500462, "grad_norm": 0.06413280963897705, "learning_rate": 8.2853581744595e-06, "loss": 0.063, "num_input_tokens_seen": 26216592, "step": 29325 }, { "epoch": 7.740926488055959, "grad_norm": 0.12114482372999191, "learning_rate": 8.278058842691991e-06, "loss": 0.056, "num_input_tokens_seen": 26220976, "step": 29330 }, { "epoch": 7.742246271611456, "grad_norm": 0.09418950974941254, "learning_rate": 8.27076208965796e-06, "loss": 0.0407, "num_input_tokens_seen": 26225488, "step": 29335 }, { "epoch": 7.743566055166952, "grad_norm": 0.25682172179222107, "learning_rate": 8.263467916482637e-06, "loss": 0.0629, "num_input_tokens_seen": 26229904, "step": 29340 }, { "epoch": 7.74488583872245, "grad_norm": 0.2081858366727829, "learning_rate": 8.256176324290885e-06, "loss": 0.0875, "num_input_tokens_seen": 26234416, "step": 29345 }, { "epoch": 7.746205622277946, "grad_norm": 0.03535403311252594, "learning_rate": 8.248887314207168e-06, "loss": 0.0654, "num_input_tokens_seen": 26238992, "step": 29350 }, { "epoch": 7.747525405833443, "grad_norm": 0.09094540774822235, "learning_rate": 8.24160088735553e-06, "loss": 0.0248, "num_input_tokens_seen": 26243344, "step": 29355 }, { "epoch": 7.74884518938894, "grad_norm": 0.07507632672786713, "learning_rate": 8.234317044859629e-06, "loss": 0.0734, "num_input_tokens_seen": 26247888, "step": 29360 }, { "epoch": 7.750164972944437, "grad_norm": 0.08941983431577682, "learning_rate": 8.227035787842744e-06, "loss": 0.0419, "num_input_tokens_seen": 26252176, "step": 29365 }, { "epoch": 7.751484756499934, "grad_norm": 0.09300301223993301, "learning_rate": 8.219757117427721e-06, "loss": 0.0334, "num_input_tokens_seen": 26256848, "step": 29370 }, { "epoch": 7.752804540055431, "grad_norm": 0.30176451802253723, "learning_rate": 8.212481034737014e-06, "loss": 0.0725, "num_input_tokens_seen": 26261456, "step": 29375 }, { "epoch": 7.754124323610927, "grad_norm": 0.2823893129825592, "learning_rate": 8.205207540892707e-06, "loss": 0.0464, "num_input_tokens_seen": 26265840, "step": 29380 }, { "epoch": 7.755444107166425, "grad_norm": 0.16713373363018036, "learning_rate": 8.197936637016442e-06, "loss": 0.0487, "num_input_tokens_seen": 26270256, "step": 29385 }, { "epoch": 7.756763890721921, "grad_norm": 0.21369557082653046, "learning_rate": 8.190668324229508e-06, "loss": 0.0413, "num_input_tokens_seen": 26274672, "step": 29390 }, { "epoch": 7.758083674277419, "grad_norm": 0.12407999485731125, "learning_rate": 8.183402603652749e-06, "loss": 0.0726, "num_input_tokens_seen": 26279088, "step": 29395 }, { "epoch": 7.759403457832915, "grad_norm": 0.06376142054796219, "learning_rate": 8.176139476406635e-06, "loss": 0.0146, "num_input_tokens_seen": 26283664, "step": 29400 }, { "epoch": 7.759403457832915, "eval_loss": 0.0671762153506279, "eval_runtime": 64.7781, "eval_samples_per_second": 103.97, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 26283664, "step": 29400 }, { "epoch": 7.760723241388412, "grad_norm": 0.18328458070755005, "learning_rate": 8.16887894361125e-06, "loss": 0.0426, "num_input_tokens_seen": 26288464, "step": 29405 }, { "epoch": 7.762043024943909, "grad_norm": 0.07232607901096344, "learning_rate": 8.161621006386233e-06, "loss": 0.1119, "num_input_tokens_seen": 26292912, "step": 29410 }, { "epoch": 7.763362808499406, "grad_norm": 0.40565112233161926, "learning_rate": 8.154365665850869e-06, "loss": 0.0982, "num_input_tokens_seen": 26297200, "step": 29415 }, { "epoch": 7.764682592054903, "grad_norm": 0.014998036436736584, "learning_rate": 8.147112923124005e-06, "loss": 0.0935, "num_input_tokens_seen": 26301648, "step": 29420 }, { "epoch": 7.7660023756104, "grad_norm": 0.08331792056560516, "learning_rate": 8.13986277932412e-06, "loss": 0.0325, "num_input_tokens_seen": 26306192, "step": 29425 }, { "epoch": 7.7673221591658965, "grad_norm": 0.33068928122520447, "learning_rate": 8.132615235569277e-06, "loss": 0.0836, "num_input_tokens_seen": 26310608, "step": 29430 }, { "epoch": 7.768641942721394, "grad_norm": 0.3390275239944458, "learning_rate": 8.125370292977124e-06, "loss": 0.0638, "num_input_tokens_seen": 26315248, "step": 29435 }, { "epoch": 7.7699617262768905, "grad_norm": 0.05445443466305733, "learning_rate": 8.118127952664944e-06, "loss": 0.0289, "num_input_tokens_seen": 26319888, "step": 29440 }, { "epoch": 7.771281509832388, "grad_norm": 0.11204639077186584, "learning_rate": 8.110888215749574e-06, "loss": 0.0326, "num_input_tokens_seen": 26324336, "step": 29445 }, { "epoch": 7.7726012933878845, "grad_norm": 0.036024268716573715, "learning_rate": 8.10365108334749e-06, "loss": 0.0692, "num_input_tokens_seen": 26328688, "step": 29450 }, { "epoch": 7.773921076943381, "grad_norm": 0.13047192990779877, "learning_rate": 8.096416556574743e-06, "loss": 0.0234, "num_input_tokens_seen": 26333104, "step": 29455 }, { "epoch": 7.7752408604988785, "grad_norm": 0.316723495721817, "learning_rate": 8.08918463654698e-06, "loss": 0.0621, "num_input_tokens_seen": 26337776, "step": 29460 }, { "epoch": 7.776560644054375, "grad_norm": 0.2604009211063385, "learning_rate": 8.081955324379458e-06, "loss": 0.0768, "num_input_tokens_seen": 26342224, "step": 29465 }, { "epoch": 7.777880427609872, "grad_norm": 0.28379932045936584, "learning_rate": 8.074728621187039e-06, "loss": 0.0914, "num_input_tokens_seen": 26346832, "step": 29470 }, { "epoch": 7.779200211165369, "grad_norm": 0.0667523443698883, "learning_rate": 8.067504528084158e-06, "loss": 0.071, "num_input_tokens_seen": 26351312, "step": 29475 }, { "epoch": 7.780519994720866, "grad_norm": 0.3414291441440582, "learning_rate": 8.060283046184861e-06, "loss": 0.108, "num_input_tokens_seen": 26355920, "step": 29480 }, { "epoch": 7.781839778276362, "grad_norm": 0.2726857364177704, "learning_rate": 8.053064176602806e-06, "loss": 0.1423, "num_input_tokens_seen": 26360400, "step": 29485 }, { "epoch": 7.78315956183186, "grad_norm": 0.05482906848192215, "learning_rate": 8.045847920451216e-06, "loss": 0.0547, "num_input_tokens_seen": 26364752, "step": 29490 }, { "epoch": 7.784479345387356, "grad_norm": 0.2447459101676941, "learning_rate": 8.038634278842944e-06, "loss": 0.0461, "num_input_tokens_seen": 26369552, "step": 29495 }, { "epoch": 7.785799128942854, "grad_norm": 0.14525727927684784, "learning_rate": 8.031423252890408e-06, "loss": 0.0371, "num_input_tokens_seen": 26374064, "step": 29500 }, { "epoch": 7.78711891249835, "grad_norm": 0.31675413250923157, "learning_rate": 8.024214843705646e-06, "loss": 0.079, "num_input_tokens_seen": 26378288, "step": 29505 }, { "epoch": 7.788438696053847, "grad_norm": 0.23156945407390594, "learning_rate": 8.017009052400295e-06, "loss": 0.0448, "num_input_tokens_seen": 26382704, "step": 29510 }, { "epoch": 7.789758479609344, "grad_norm": 0.5436953902244568, "learning_rate": 8.00980588008557e-06, "loss": 0.0912, "num_input_tokens_seen": 26387248, "step": 29515 }, { "epoch": 7.791078263164841, "grad_norm": 0.13787122070789337, "learning_rate": 8.002605327872282e-06, "loss": 0.0483, "num_input_tokens_seen": 26391696, "step": 29520 }, { "epoch": 7.792398046720338, "grad_norm": 0.22874706983566284, "learning_rate": 7.995407396870862e-06, "loss": 0.0486, "num_input_tokens_seen": 26396176, "step": 29525 }, { "epoch": 7.793717830275835, "grad_norm": 0.11142507940530777, "learning_rate": 7.988212088191307e-06, "loss": 0.0301, "num_input_tokens_seen": 26400624, "step": 29530 }, { "epoch": 7.795037613831331, "grad_norm": 0.026693236082792282, "learning_rate": 7.98101940294324e-06, "loss": 0.0279, "num_input_tokens_seen": 26405392, "step": 29535 }, { "epoch": 7.796357397386829, "grad_norm": 0.10701094567775726, "learning_rate": 7.973829342235847e-06, "loss": 0.1023, "num_input_tokens_seen": 26410256, "step": 29540 }, { "epoch": 7.797677180942325, "grad_norm": 0.21908967196941376, "learning_rate": 7.966641907177936e-06, "loss": 0.0809, "num_input_tokens_seen": 26414832, "step": 29545 }, { "epoch": 7.798996964497823, "grad_norm": 0.038357775658369064, "learning_rate": 7.959457098877901e-06, "loss": 0.0201, "num_input_tokens_seen": 26419504, "step": 29550 }, { "epoch": 7.800316748053319, "grad_norm": 0.08825361728668213, "learning_rate": 7.952274918443719e-06, "loss": 0.063, "num_input_tokens_seen": 26424144, "step": 29555 }, { "epoch": 7.801636531608816, "grad_norm": 0.5710088014602661, "learning_rate": 7.945095366982983e-06, "loss": 0.1055, "num_input_tokens_seen": 26428496, "step": 29560 }, { "epoch": 7.802956315164313, "grad_norm": 0.4130508303642273, "learning_rate": 7.937918445602871e-06, "loss": 0.0877, "num_input_tokens_seen": 26433168, "step": 29565 }, { "epoch": 7.80427609871981, "grad_norm": 0.06366489082574844, "learning_rate": 7.930744155410145e-06, "loss": 0.0725, "num_input_tokens_seen": 26437360, "step": 29570 }, { "epoch": 7.805595882275307, "grad_norm": 0.11841519176959991, "learning_rate": 7.923572497511181e-06, "loss": 0.0857, "num_input_tokens_seen": 26441552, "step": 29575 }, { "epoch": 7.806915665830804, "grad_norm": 0.12524597346782684, "learning_rate": 7.916403473011927e-06, "loss": 0.0431, "num_input_tokens_seen": 26445712, "step": 29580 }, { "epoch": 7.8082354493863, "grad_norm": 0.388108491897583, "learning_rate": 7.909237083017953e-06, "loss": 0.0525, "num_input_tokens_seen": 26449968, "step": 29585 }, { "epoch": 7.809555232941798, "grad_norm": 0.21542654931545258, "learning_rate": 7.902073328634389e-06, "loss": 0.047, "num_input_tokens_seen": 26454640, "step": 29590 }, { "epoch": 7.810875016497294, "grad_norm": 0.05684393271803856, "learning_rate": 7.894912210965987e-06, "loss": 0.0244, "num_input_tokens_seen": 26459280, "step": 29595 }, { "epoch": 7.812194800052791, "grad_norm": 0.2622746229171753, "learning_rate": 7.887753731117075e-06, "loss": 0.0524, "num_input_tokens_seen": 26463440, "step": 29600 }, { "epoch": 7.812194800052791, "eval_loss": 0.06682220846414566, "eval_runtime": 64.7479, "eval_samples_per_second": 104.019, "eval_steps_per_second": 26.009, "num_input_tokens_seen": 26463440, "step": 29600 }, { "epoch": 7.813514583608288, "grad_norm": 0.44331231713294983, "learning_rate": 7.880597890191587e-06, "loss": 0.1185, "num_input_tokens_seen": 26467888, "step": 29605 }, { "epoch": 7.814834367163785, "grad_norm": 0.17849665880203247, "learning_rate": 7.873444689293036e-06, "loss": 0.0516, "num_input_tokens_seen": 26472144, "step": 29610 }, { "epoch": 7.8161541507192815, "grad_norm": 0.10475289076566696, "learning_rate": 7.866294129524548e-06, "loss": 0.0524, "num_input_tokens_seen": 26476592, "step": 29615 }, { "epoch": 7.817473934274779, "grad_norm": 0.44389402866363525, "learning_rate": 7.859146211988811e-06, "loss": 0.0679, "num_input_tokens_seen": 26481040, "step": 29620 }, { "epoch": 7.8187937178302755, "grad_norm": 0.21584081649780273, "learning_rate": 7.852000937788134e-06, "loss": 0.0543, "num_input_tokens_seen": 26485488, "step": 29625 }, { "epoch": 7.820113501385773, "grad_norm": 0.07895476371049881, "learning_rate": 7.844858308024416e-06, "loss": 0.0636, "num_input_tokens_seen": 26490032, "step": 29630 }, { "epoch": 7.8214332849412695, "grad_norm": 0.07522094994783401, "learning_rate": 7.837718323799122e-06, "loss": 0.0523, "num_input_tokens_seen": 26494672, "step": 29635 }, { "epoch": 7.822753068496766, "grad_norm": 0.12387192249298096, "learning_rate": 7.83058098621334e-06, "loss": 0.0413, "num_input_tokens_seen": 26498832, "step": 29640 }, { "epoch": 7.8240728520522635, "grad_norm": 0.22803911566734314, "learning_rate": 7.823446296367739e-06, "loss": 0.0385, "num_input_tokens_seen": 26503632, "step": 29645 }, { "epoch": 7.82539263560776, "grad_norm": 0.18696343898773193, "learning_rate": 7.81631425536257e-06, "loss": 0.0545, "num_input_tokens_seen": 26507792, "step": 29650 }, { "epoch": 7.8267124191632576, "grad_norm": 0.15291476249694824, "learning_rate": 7.809184864297689e-06, "loss": 0.0585, "num_input_tokens_seen": 26512304, "step": 29655 }, { "epoch": 7.828032202718754, "grad_norm": 0.032771509140729904, "learning_rate": 7.802058124272532e-06, "loss": 0.053, "num_input_tokens_seen": 26516816, "step": 29660 }, { "epoch": 7.829351986274251, "grad_norm": 0.24588407576084137, "learning_rate": 7.79493403638614e-06, "loss": 0.0417, "num_input_tokens_seen": 26521328, "step": 29665 }, { "epoch": 7.830671769829748, "grad_norm": 0.14163744449615479, "learning_rate": 7.787812601737132e-06, "loss": 0.0293, "num_input_tokens_seen": 26525776, "step": 29670 }, { "epoch": 7.831991553385245, "grad_norm": 0.09877733141183853, "learning_rate": 7.780693821423715e-06, "loss": 0.0447, "num_input_tokens_seen": 26530224, "step": 29675 }, { "epoch": 7.833311336940742, "grad_norm": 0.1099616140127182, "learning_rate": 7.773577696543705e-06, "loss": 0.0685, "num_input_tokens_seen": 26534768, "step": 29680 }, { "epoch": 7.834631120496239, "grad_norm": 0.16719500720500946, "learning_rate": 7.7664642281945e-06, "loss": 0.0274, "num_input_tokens_seen": 26539024, "step": 29685 }, { "epoch": 7.835950904051735, "grad_norm": 0.12275432795286179, "learning_rate": 7.759353417473072e-06, "loss": 0.0327, "num_input_tokens_seen": 26543696, "step": 29690 }, { "epoch": 7.837270687607233, "grad_norm": 0.29782289266586304, "learning_rate": 7.752245265476016e-06, "loss": 0.087, "num_input_tokens_seen": 26548048, "step": 29695 }, { "epoch": 7.838590471162729, "grad_norm": 0.15246348083019257, "learning_rate": 7.745139773299481e-06, "loss": 0.0337, "num_input_tokens_seen": 26552432, "step": 29700 }, { "epoch": 7.839910254718227, "grad_norm": 0.12156028300523758, "learning_rate": 7.738036942039232e-06, "loss": 0.0293, "num_input_tokens_seen": 26556752, "step": 29705 }, { "epoch": 7.841230038273723, "grad_norm": 0.7386717200279236, "learning_rate": 7.73093677279062e-06, "loss": 0.0523, "num_input_tokens_seen": 26561296, "step": 29710 }, { "epoch": 7.84254982182922, "grad_norm": 0.4415915310382843, "learning_rate": 7.72383926664857e-06, "loss": 0.0847, "num_input_tokens_seen": 26565840, "step": 29715 }, { "epoch": 7.843869605384717, "grad_norm": 0.18421350419521332, "learning_rate": 7.716744424707606e-06, "loss": 0.0517, "num_input_tokens_seen": 26570800, "step": 29720 }, { "epoch": 7.845189388940214, "grad_norm": 0.2864592969417572, "learning_rate": 7.709652248061858e-06, "loss": 0.0367, "num_input_tokens_seen": 26575408, "step": 29725 }, { "epoch": 7.84650917249571, "grad_norm": 0.04761968180537224, "learning_rate": 7.702562737805017e-06, "loss": 0.0217, "num_input_tokens_seen": 26579568, "step": 29730 }, { "epoch": 7.847828956051208, "grad_norm": 0.4514889717102051, "learning_rate": 7.695475895030365e-06, "loss": 0.1086, "num_input_tokens_seen": 26584112, "step": 29735 }, { "epoch": 7.849148739606704, "grad_norm": 0.26566755771636963, "learning_rate": 7.6883917208308e-06, "loss": 0.0692, "num_input_tokens_seen": 26588656, "step": 29740 }, { "epoch": 7.850468523162202, "grad_norm": 0.13077063858509064, "learning_rate": 7.681310216298778e-06, "loss": 0.0471, "num_input_tokens_seen": 26593072, "step": 29745 }, { "epoch": 7.851788306717698, "grad_norm": 0.15217702090740204, "learning_rate": 7.674231382526367e-06, "loss": 0.0594, "num_input_tokens_seen": 26597520, "step": 29750 }, { "epoch": 7.853108090273195, "grad_norm": 0.09122762829065323, "learning_rate": 7.667155220605198e-06, "loss": 0.0393, "num_input_tokens_seen": 26601808, "step": 29755 }, { "epoch": 7.854427873828692, "grad_norm": 0.16524642705917358, "learning_rate": 7.660081731626515e-06, "loss": 0.0278, "num_input_tokens_seen": 26606224, "step": 29760 }, { "epoch": 7.855747657384189, "grad_norm": 0.11907808482646942, "learning_rate": 7.653010916681141e-06, "loss": 0.0299, "num_input_tokens_seen": 26610576, "step": 29765 }, { "epoch": 7.8570674409396855, "grad_norm": 0.07378603518009186, "learning_rate": 7.645942776859472e-06, "loss": 0.0253, "num_input_tokens_seen": 26615120, "step": 29770 }, { "epoch": 7.858387224495183, "grad_norm": 0.29111722111701965, "learning_rate": 7.63887731325152e-06, "loss": 0.0726, "num_input_tokens_seen": 26619856, "step": 29775 }, { "epoch": 7.8597070080506795, "grad_norm": 0.4768751263618469, "learning_rate": 7.63181452694685e-06, "loss": 0.0763, "num_input_tokens_seen": 26624240, "step": 29780 }, { "epoch": 7.861026791606177, "grad_norm": 0.08021488040685654, "learning_rate": 7.624754419034644e-06, "loss": 0.0442, "num_input_tokens_seen": 26628880, "step": 29785 }, { "epoch": 7.8623465751616735, "grad_norm": 0.09418930858373642, "learning_rate": 7.6176969906036645e-06, "loss": 0.0529, "num_input_tokens_seen": 26633328, "step": 29790 }, { "epoch": 7.86366635871717, "grad_norm": 0.18523737788200378, "learning_rate": 7.610642242742242e-06, "loss": 0.03, "num_input_tokens_seen": 26638000, "step": 29795 }, { "epoch": 7.8649861422726675, "grad_norm": 0.09514988213777542, "learning_rate": 7.603590176538322e-06, "loss": 0.0428, "num_input_tokens_seen": 26642352, "step": 29800 }, { "epoch": 7.8649861422726675, "eval_loss": 0.06709372997283936, "eval_runtime": 64.8067, "eval_samples_per_second": 103.925, "eval_steps_per_second": 25.985, "num_input_tokens_seen": 26642352, "step": 29800 }, { "epoch": 7.866305925828164, "grad_norm": 0.7175394892692566, "learning_rate": 7.596540793079404e-06, "loss": 0.0701, "num_input_tokens_seen": 26646672, "step": 29805 }, { "epoch": 7.8676257093836615, "grad_norm": 0.10682795941829681, "learning_rate": 7.5894940934526125e-06, "loss": 0.0872, "num_input_tokens_seen": 26650992, "step": 29810 }, { "epoch": 7.868945492939158, "grad_norm": 0.24400553107261658, "learning_rate": 7.582450078744621e-06, "loss": 0.066, "num_input_tokens_seen": 26655280, "step": 29815 }, { "epoch": 7.870265276494655, "grad_norm": 0.15113161504268646, "learning_rate": 7.575408750041707e-06, "loss": 0.0313, "num_input_tokens_seen": 26659888, "step": 29820 }, { "epoch": 7.871585060050152, "grad_norm": 0.22402040660381317, "learning_rate": 7.568370108429732e-06, "loss": 0.0673, "num_input_tokens_seen": 26664592, "step": 29825 }, { "epoch": 7.872904843605649, "grad_norm": 0.24285975098609924, "learning_rate": 7.561334154994154e-06, "loss": 0.0553, "num_input_tokens_seen": 26669296, "step": 29830 }, { "epoch": 7.874224627161146, "grad_norm": 0.26852935552597046, "learning_rate": 7.55430089081999e-06, "loss": 0.0695, "num_input_tokens_seen": 26673584, "step": 29835 }, { "epoch": 7.875544410716643, "grad_norm": 0.18975195288658142, "learning_rate": 7.547270316991864e-06, "loss": 0.0691, "num_input_tokens_seen": 26677904, "step": 29840 }, { "epoch": 7.876864194272139, "grad_norm": 0.22726944088935852, "learning_rate": 7.5402424345939884e-06, "loss": 0.0543, "num_input_tokens_seen": 26682384, "step": 29845 }, { "epoch": 7.878183977827637, "grad_norm": 0.47393304109573364, "learning_rate": 7.533217244710133e-06, "loss": 0.0865, "num_input_tokens_seen": 26686672, "step": 29850 }, { "epoch": 7.879503761383133, "grad_norm": 0.11657862365245819, "learning_rate": 7.52619474842369e-06, "loss": 0.0682, "num_input_tokens_seen": 26690992, "step": 29855 }, { "epoch": 7.88082354493863, "grad_norm": 0.14319394528865814, "learning_rate": 7.519174946817597e-06, "loss": 0.068, "num_input_tokens_seen": 26695216, "step": 29860 }, { "epoch": 7.882143328494127, "grad_norm": 0.20124183595180511, "learning_rate": 7.512157840974407e-06, "loss": 0.0348, "num_input_tokens_seen": 26699664, "step": 29865 }, { "epoch": 7.883463112049624, "grad_norm": 0.2509338855743408, "learning_rate": 7.5051434319762496e-06, "loss": 0.0684, "num_input_tokens_seen": 26703920, "step": 29870 }, { "epoch": 7.884782895605121, "grad_norm": 0.10358088463544846, "learning_rate": 7.498131720904822e-06, "loss": 0.072, "num_input_tokens_seen": 26708464, "step": 29875 }, { "epoch": 7.886102679160618, "grad_norm": 0.4617219567298889, "learning_rate": 7.491122708841433e-06, "loss": 0.0712, "num_input_tokens_seen": 26712816, "step": 29880 }, { "epoch": 7.887422462716114, "grad_norm": 0.13039080798625946, "learning_rate": 7.4841163968669524e-06, "loss": 0.0339, "num_input_tokens_seen": 26717552, "step": 29885 }, { "epoch": 7.888742246271612, "grad_norm": 0.3037091791629791, "learning_rate": 7.4771127860618355e-06, "loss": 0.0554, "num_input_tokens_seen": 26722224, "step": 29890 }, { "epoch": 7.890062029827108, "grad_norm": 0.14825887978076935, "learning_rate": 7.470111877506139e-06, "loss": 0.0419, "num_input_tokens_seen": 26726992, "step": 29895 }, { "epoch": 7.891381813382605, "grad_norm": 0.04369297996163368, "learning_rate": 7.463113672279479e-06, "loss": 0.0543, "num_input_tokens_seen": 26731728, "step": 29900 }, { "epoch": 7.892701596938102, "grad_norm": 0.08232252299785614, "learning_rate": 7.456118171461071e-06, "loss": 0.0988, "num_input_tokens_seen": 26736304, "step": 29905 }, { "epoch": 7.894021380493599, "grad_norm": 0.276728093624115, "learning_rate": 7.449125376129721e-06, "loss": 0.0684, "num_input_tokens_seen": 26740720, "step": 29910 }, { "epoch": 7.895341164049096, "grad_norm": 0.3015488386154175, "learning_rate": 7.442135287363788e-06, "loss": 0.0565, "num_input_tokens_seen": 26745008, "step": 29915 }, { "epoch": 7.896660947604593, "grad_norm": 0.15158341825008392, "learning_rate": 7.435147906241247e-06, "loss": 0.0425, "num_input_tokens_seen": 26749296, "step": 29920 }, { "epoch": 7.897980731160089, "grad_norm": 0.3544863164424896, "learning_rate": 7.428163233839624e-06, "loss": 0.1373, "num_input_tokens_seen": 26754064, "step": 29925 }, { "epoch": 7.899300514715587, "grad_norm": 0.25933584570884705, "learning_rate": 7.4211812712360525e-06, "loss": 0.0239, "num_input_tokens_seen": 26758608, "step": 29930 }, { "epoch": 7.900620298271083, "grad_norm": 0.411872535943985, "learning_rate": 7.4142020195072464e-06, "loss": 0.043, "num_input_tokens_seen": 26763120, "step": 29935 }, { "epoch": 7.901940081826581, "grad_norm": 0.10601312667131424, "learning_rate": 7.407225479729479e-06, "loss": 0.0519, "num_input_tokens_seen": 26767888, "step": 29940 }, { "epoch": 7.903259865382077, "grad_norm": 0.19563254714012146, "learning_rate": 7.400251652978632e-06, "loss": 0.0444, "num_input_tokens_seen": 26772560, "step": 29945 }, { "epoch": 7.904579648937574, "grad_norm": 0.10898806899785995, "learning_rate": 7.393280540330147e-06, "loss": 0.0418, "num_input_tokens_seen": 26777424, "step": 29950 }, { "epoch": 7.905899432493071, "grad_norm": 0.056096311658620834, "learning_rate": 7.386312142859069e-06, "loss": 0.0206, "num_input_tokens_seen": 26781616, "step": 29955 }, { "epoch": 7.907219216048568, "grad_norm": 0.10543117672204971, "learning_rate": 7.379346461640008e-06, "loss": 0.09, "num_input_tokens_seen": 26786320, "step": 29960 }, { "epoch": 7.908538999604065, "grad_norm": 0.22770382463932037, "learning_rate": 7.372383497747149e-06, "loss": 0.0291, "num_input_tokens_seen": 26790960, "step": 29965 }, { "epoch": 7.909858783159562, "grad_norm": 0.11658193171024323, "learning_rate": 7.3654232522542775e-06, "loss": 0.0374, "num_input_tokens_seen": 26795248, "step": 29970 }, { "epoch": 7.9111785667150585, "grad_norm": 0.08234360814094543, "learning_rate": 7.358465726234756e-06, "loss": 0.059, "num_input_tokens_seen": 26799792, "step": 29975 }, { "epoch": 7.912498350270556, "grad_norm": 0.24109932780265808, "learning_rate": 7.351510920761512e-06, "loss": 0.0573, "num_input_tokens_seen": 26804208, "step": 29980 }, { "epoch": 7.9138181338260525, "grad_norm": 0.046731509268283844, "learning_rate": 7.344558836907067e-06, "loss": 0.0363, "num_input_tokens_seen": 26808752, "step": 29985 }, { "epoch": 7.915137917381549, "grad_norm": 0.06505048274993896, "learning_rate": 7.3376094757435285e-06, "loss": 0.0447, "num_input_tokens_seen": 26813264, "step": 29990 }, { "epoch": 7.9164577009370465, "grad_norm": 0.22991526126861572, "learning_rate": 7.330662838342561e-06, "loss": 0.0332, "num_input_tokens_seen": 26817712, "step": 29995 }, { "epoch": 7.917777484492543, "grad_norm": 0.06408590078353882, "learning_rate": 7.323718925775438e-06, "loss": 0.0454, "num_input_tokens_seen": 26822096, "step": 30000 }, { "epoch": 7.917777484492543, "eval_loss": 0.06719440966844559, "eval_runtime": 64.8014, "eval_samples_per_second": 103.933, "eval_steps_per_second": 25.987, "num_input_tokens_seen": 26822096, "step": 30000 }, { "epoch": 7.9190972680480405, "grad_norm": 0.10281378030776978, "learning_rate": 7.316777739112985e-06, "loss": 0.033, "num_input_tokens_seen": 26826480, "step": 30005 }, { "epoch": 7.920417051603537, "grad_norm": 0.5092872381210327, "learning_rate": 7.309839279425626e-06, "loss": 0.0853, "num_input_tokens_seen": 26831024, "step": 30010 }, { "epoch": 7.921736835159034, "grad_norm": 0.22064289450645447, "learning_rate": 7.302903547783366e-06, "loss": 0.0331, "num_input_tokens_seen": 26835312, "step": 30015 }, { "epoch": 7.923056618714531, "grad_norm": 0.3362635374069214, "learning_rate": 7.2959705452557644e-06, "loss": 0.0807, "num_input_tokens_seen": 26839632, "step": 30020 }, { "epoch": 7.924376402270028, "grad_norm": 0.12530982494354248, "learning_rate": 7.289040272911996e-06, "loss": 0.0324, "num_input_tokens_seen": 26844240, "step": 30025 }, { "epoch": 7.925696185825524, "grad_norm": 0.04797416180372238, "learning_rate": 7.282112731820789e-06, "loss": 0.099, "num_input_tokens_seen": 26848656, "step": 30030 }, { "epoch": 7.927015969381022, "grad_norm": 0.0678882822394371, "learning_rate": 7.275187923050447e-06, "loss": 0.0185, "num_input_tokens_seen": 26853168, "step": 30035 }, { "epoch": 7.928335752936518, "grad_norm": 0.029461923986673355, "learning_rate": 7.268265847668879e-06, "loss": 0.041, "num_input_tokens_seen": 26857808, "step": 30040 }, { "epoch": 7.929655536492016, "grad_norm": 0.03337513282895088, "learning_rate": 7.261346506743538e-06, "loss": 0.0455, "num_input_tokens_seen": 26861968, "step": 30045 }, { "epoch": 7.930975320047512, "grad_norm": 0.027279408648610115, "learning_rate": 7.254429901341486e-06, "loss": 0.0507, "num_input_tokens_seen": 26866128, "step": 30050 }, { "epoch": 7.932295103603009, "grad_norm": 0.21016177535057068, "learning_rate": 7.247516032529356e-06, "loss": 0.0854, "num_input_tokens_seen": 26870736, "step": 30055 }, { "epoch": 7.933614887158506, "grad_norm": 0.031663864850997925, "learning_rate": 7.240604901373338e-06, "loss": 0.0456, "num_input_tokens_seen": 26875408, "step": 30060 }, { "epoch": 7.934934670714003, "grad_norm": 0.040149152278900146, "learning_rate": 7.233696508939223e-06, "loss": 0.0195, "num_input_tokens_seen": 26879664, "step": 30065 }, { "epoch": 7.9362544542695, "grad_norm": 0.5209730267524719, "learning_rate": 7.226790856292376e-06, "loss": 0.1905, "num_input_tokens_seen": 26883728, "step": 30070 }, { "epoch": 7.937574237824997, "grad_norm": 0.028535690158605576, "learning_rate": 7.219887944497727e-06, "loss": 0.0278, "num_input_tokens_seen": 26888208, "step": 30075 }, { "epoch": 7.938894021380493, "grad_norm": 0.09450594335794449, "learning_rate": 7.2129877746198e-06, "loss": 0.0653, "num_input_tokens_seen": 26892560, "step": 30080 }, { "epoch": 7.940213804935991, "grad_norm": 0.03408260643482208, "learning_rate": 7.20609034772268e-06, "loss": 0.0684, "num_input_tokens_seen": 26897040, "step": 30085 }, { "epoch": 7.941533588491487, "grad_norm": 0.15078240633010864, "learning_rate": 7.19919566487004e-06, "loss": 0.0274, "num_input_tokens_seen": 26901712, "step": 30090 }, { "epoch": 7.942853372046985, "grad_norm": 1.5123096704483032, "learning_rate": 7.192303727125132e-06, "loss": 0.0534, "num_input_tokens_seen": 26906192, "step": 30095 }, { "epoch": 7.944173155602481, "grad_norm": 0.020771503448486328, "learning_rate": 7.185414535550777e-06, "loss": 0.0279, "num_input_tokens_seen": 26910832, "step": 30100 }, { "epoch": 7.945492939157978, "grad_norm": 0.37324821949005127, "learning_rate": 7.178528091209363e-06, "loss": 0.065, "num_input_tokens_seen": 26915184, "step": 30105 }, { "epoch": 7.946812722713475, "grad_norm": 0.08117761462926865, "learning_rate": 7.171644395162888e-06, "loss": 0.0628, "num_input_tokens_seen": 26919792, "step": 30110 }, { "epoch": 7.948132506268972, "grad_norm": 0.12705957889556885, "learning_rate": 7.164763448472881e-06, "loss": 0.0534, "num_input_tokens_seen": 26924304, "step": 30115 }, { "epoch": 7.949452289824469, "grad_norm": 0.12614706158638, "learning_rate": 7.157885252200491e-06, "loss": 0.0412, "num_input_tokens_seen": 26928784, "step": 30120 }, { "epoch": 7.950772073379966, "grad_norm": 0.06763449311256409, "learning_rate": 7.151009807406403e-06, "loss": 0.039, "num_input_tokens_seen": 26933328, "step": 30125 }, { "epoch": 7.952091856935462, "grad_norm": 0.20126107335090637, "learning_rate": 7.144137115150909e-06, "loss": 0.1024, "num_input_tokens_seen": 26938224, "step": 30130 }, { "epoch": 7.95341164049096, "grad_norm": 0.08997827023267746, "learning_rate": 7.1372671764938725e-06, "loss": 0.0517, "num_input_tokens_seen": 26942416, "step": 30135 }, { "epoch": 7.954731424046456, "grad_norm": 0.28838831186294556, "learning_rate": 7.130399992494705e-06, "loss": 0.0407, "num_input_tokens_seen": 26946960, "step": 30140 }, { "epoch": 7.956051207601953, "grad_norm": 0.07410097122192383, "learning_rate": 7.123535564212419e-06, "loss": 0.0408, "num_input_tokens_seen": 26951536, "step": 30145 }, { "epoch": 7.95737099115745, "grad_norm": 0.2847036123275757, "learning_rate": 7.116673892705611e-06, "loss": 0.0407, "num_input_tokens_seen": 26955888, "step": 30150 }, { "epoch": 7.958690774712947, "grad_norm": 0.08863674104213715, "learning_rate": 7.109814979032415e-06, "loss": 0.0389, "num_input_tokens_seen": 26960240, "step": 30155 }, { "epoch": 7.9600105582684435, "grad_norm": 0.2323164939880371, "learning_rate": 7.102958824250577e-06, "loss": 0.0421, "num_input_tokens_seen": 26964784, "step": 30160 }, { "epoch": 7.961330341823941, "grad_norm": 0.1313123106956482, "learning_rate": 7.096105429417393e-06, "loss": 0.0352, "num_input_tokens_seen": 26969072, "step": 30165 }, { "epoch": 7.9626501253794375, "grad_norm": 0.2643413841724396, "learning_rate": 7.0892547955897506e-06, "loss": 0.055, "num_input_tokens_seen": 26973488, "step": 30170 }, { "epoch": 7.963969908934935, "grad_norm": 0.14999638497829437, "learning_rate": 7.0824069238241e-06, "loss": 0.0498, "num_input_tokens_seen": 26977968, "step": 30175 }, { "epoch": 7.9652896924904315, "grad_norm": 0.04462284967303276, "learning_rate": 7.075561815176462e-06, "loss": 0.0628, "num_input_tokens_seen": 26982544, "step": 30180 }, { "epoch": 7.966609476045928, "grad_norm": 0.09972035884857178, "learning_rate": 7.068719470702445e-06, "loss": 0.0467, "num_input_tokens_seen": 26986864, "step": 30185 }, { "epoch": 7.9679292596014255, "grad_norm": 0.31688374280929565, "learning_rate": 7.061879891457229e-06, "loss": 0.0909, "num_input_tokens_seen": 26991504, "step": 30190 }, { "epoch": 7.969249043156922, "grad_norm": 0.28414109349250793, "learning_rate": 7.0550430784955515e-06, "loss": 0.0697, "num_input_tokens_seen": 26996112, "step": 30195 }, { "epoch": 7.9705688267124195, "grad_norm": 0.19878104329109192, "learning_rate": 7.048209032871752e-06, "loss": 0.0417, "num_input_tokens_seen": 27000688, "step": 30200 }, { "epoch": 7.9705688267124195, "eval_loss": 0.06734690070152283, "eval_runtime": 64.7791, "eval_samples_per_second": 103.969, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 27000688, "step": 30200 }, { "epoch": 7.971888610267916, "grad_norm": 0.16194979846477509, "learning_rate": 7.0413777556397055e-06, "loss": 0.0555, "num_input_tokens_seen": 27005488, "step": 30205 }, { "epoch": 7.973208393823413, "grad_norm": 0.12212682515382767, "learning_rate": 7.0345492478528925e-06, "loss": 0.0691, "num_input_tokens_seen": 27009872, "step": 30210 }, { "epoch": 7.97452817737891, "grad_norm": 0.33504918217658997, "learning_rate": 7.02772351056436e-06, "loss": 0.1048, "num_input_tokens_seen": 27014480, "step": 30215 }, { "epoch": 7.975847960934407, "grad_norm": 0.021263884380459785, "learning_rate": 7.020900544826709e-06, "loss": 0.0653, "num_input_tokens_seen": 27018640, "step": 30220 }, { "epoch": 7.977167744489904, "grad_norm": 0.09829231351613998, "learning_rate": 7.014080351692134e-06, "loss": 0.088, "num_input_tokens_seen": 27023088, "step": 30225 }, { "epoch": 7.978487528045401, "grad_norm": 0.23319900035858154, "learning_rate": 7.0072629322124024e-06, "loss": 0.1154, "num_input_tokens_seen": 27027504, "step": 30230 }, { "epoch": 7.979807311600897, "grad_norm": 0.6480019092559814, "learning_rate": 7.000448287438827e-06, "loss": 0.1265, "num_input_tokens_seen": 27031664, "step": 30235 }, { "epoch": 7.981127095156395, "grad_norm": 0.3714023530483246, "learning_rate": 6.993636418422331e-06, "loss": 0.0668, "num_input_tokens_seen": 27035792, "step": 30240 }, { "epoch": 7.982446878711891, "grad_norm": 0.2799889147281647, "learning_rate": 6.986827326213383e-06, "loss": 0.0461, "num_input_tokens_seen": 27040368, "step": 30245 }, { "epoch": 7.983766662267389, "grad_norm": 0.34205272793769836, "learning_rate": 6.9800210118620205e-06, "loss": 0.1031, "num_input_tokens_seen": 27044720, "step": 30250 }, { "epoch": 7.985086445822885, "grad_norm": 0.06961945444345474, "learning_rate": 6.973217476417876e-06, "loss": 0.0517, "num_input_tokens_seen": 27049232, "step": 30255 }, { "epoch": 7.986406229378382, "grad_norm": 0.18115903437137604, "learning_rate": 6.96641672093013e-06, "loss": 0.0541, "num_input_tokens_seen": 27053872, "step": 30260 }, { "epoch": 7.987726012933879, "grad_norm": 0.18713590502738953, "learning_rate": 6.95961874644755e-06, "loss": 0.062, "num_input_tokens_seen": 27058576, "step": 30265 }, { "epoch": 7.989045796489376, "grad_norm": 0.14399154484272003, "learning_rate": 6.952823554018476e-06, "loss": 0.0824, "num_input_tokens_seen": 27062928, "step": 30270 }, { "epoch": 7.990365580044872, "grad_norm": 0.05791733041405678, "learning_rate": 6.946031144690798e-06, "loss": 0.0167, "num_input_tokens_seen": 27067376, "step": 30275 }, { "epoch": 7.99168536360037, "grad_norm": 0.27518972754478455, "learning_rate": 6.939241519512005e-06, "loss": 0.0465, "num_input_tokens_seen": 27071696, "step": 30280 }, { "epoch": 7.993005147155866, "grad_norm": 0.06753062456846237, "learning_rate": 6.932454679529129e-06, "loss": 0.028, "num_input_tokens_seen": 27076080, "step": 30285 }, { "epoch": 7.994324930711363, "grad_norm": 0.1605243682861328, "learning_rate": 6.925670625788791e-06, "loss": 0.0382, "num_input_tokens_seen": 27080624, "step": 30290 }, { "epoch": 7.99564471426686, "grad_norm": 0.06993427872657776, "learning_rate": 6.918889359337186e-06, "loss": 0.0926, "num_input_tokens_seen": 27085104, "step": 30295 }, { "epoch": 7.996964497822357, "grad_norm": 0.11666660010814667, "learning_rate": 6.912110881220058e-06, "loss": 0.0557, "num_input_tokens_seen": 27089456, "step": 30300 }, { "epoch": 7.998284281377854, "grad_norm": 0.25693169236183167, "learning_rate": 6.905335192482735e-06, "loss": 0.0432, "num_input_tokens_seen": 27094224, "step": 30305 }, { "epoch": 7.999604064933351, "grad_norm": 0.46004870533943176, "learning_rate": 6.8985622941701275e-06, "loss": 0.1014, "num_input_tokens_seen": 27098448, "step": 30310 }, { "epoch": 8.000791870133298, "grad_norm": 0.6272907257080078, "learning_rate": 6.89179218732669e-06, "loss": 0.1432, "num_input_tokens_seen": 27102112, "step": 30315 }, { "epoch": 8.002111653688795, "grad_norm": 0.051905639469623566, "learning_rate": 6.8850248729964595e-06, "loss": 0.014, "num_input_tokens_seen": 27106624, "step": 30320 }, { "epoch": 8.003431437244291, "grad_norm": 0.4335690438747406, "learning_rate": 6.8782603522230314e-06, "loss": 0.0454, "num_input_tokens_seen": 27111264, "step": 30325 }, { "epoch": 8.004751220799788, "grad_norm": 0.15811732411384583, "learning_rate": 6.871498626049591e-06, "loss": 0.0638, "num_input_tokens_seen": 27115872, "step": 30330 }, { "epoch": 8.006071004355286, "grad_norm": 0.21418800950050354, "learning_rate": 6.8647396955188875e-06, "loss": 0.093, "num_input_tokens_seen": 27120096, "step": 30335 }, { "epoch": 8.007390787910783, "grad_norm": 0.0878409668803215, "learning_rate": 6.857983561673218e-06, "loss": 0.0507, "num_input_tokens_seen": 27124736, "step": 30340 }, { "epoch": 8.00871057146628, "grad_norm": 0.1614827662706375, "learning_rate": 6.851230225554467e-06, "loss": 0.0545, "num_input_tokens_seen": 27129024, "step": 30345 }, { "epoch": 8.010030355021776, "grad_norm": 0.4831705689430237, "learning_rate": 6.8444796882040946e-06, "loss": 0.0712, "num_input_tokens_seen": 27133344, "step": 30350 }, { "epoch": 8.011350138577273, "grad_norm": 0.2862592041492462, "learning_rate": 6.837731950663106e-06, "loss": 0.0678, "num_input_tokens_seen": 27137824, "step": 30355 }, { "epoch": 8.01266992213277, "grad_norm": 0.18406331539154053, "learning_rate": 6.830987013972098e-06, "loss": 0.0216, "num_input_tokens_seen": 27142624, "step": 30360 }, { "epoch": 8.013989705688267, "grad_norm": 0.07311908155679703, "learning_rate": 6.82424487917121e-06, "loss": 0.0344, "num_input_tokens_seen": 27146848, "step": 30365 }, { "epoch": 8.015309489243764, "grad_norm": 0.1424454152584076, "learning_rate": 6.8175055473001735e-06, "loss": 0.0353, "num_input_tokens_seen": 27151200, "step": 30370 }, { "epoch": 8.01662927279926, "grad_norm": 0.34248077869415283, "learning_rate": 6.8107690193982855e-06, "loss": 0.0618, "num_input_tokens_seen": 27155712, "step": 30375 }, { "epoch": 8.017949056354757, "grad_norm": 0.26333096623420715, "learning_rate": 6.804035296504385e-06, "loss": 0.1031, "num_input_tokens_seen": 27160512, "step": 30380 }, { "epoch": 8.019268839910255, "grad_norm": 0.14900581538677216, "learning_rate": 6.797304379656916e-06, "loss": 0.0403, "num_input_tokens_seen": 27164672, "step": 30385 }, { "epoch": 8.020588623465752, "grad_norm": 0.0962800458073616, "learning_rate": 6.790576269893861e-06, "loss": 0.1023, "num_input_tokens_seen": 27169152, "step": 30390 }, { "epoch": 8.021908407021249, "grad_norm": 0.07552297413349152, "learning_rate": 6.783850968252772e-06, "loss": 0.1338, "num_input_tokens_seen": 27173696, "step": 30395 }, { "epoch": 8.023228190576745, "grad_norm": 0.48807820677757263, "learning_rate": 6.777128475770789e-06, "loss": 0.0414, "num_input_tokens_seen": 27178304, "step": 30400 }, { "epoch": 8.023228190576745, "eval_loss": 0.06699827313423157, "eval_runtime": 64.7936, "eval_samples_per_second": 103.945, "eval_steps_per_second": 25.99, "num_input_tokens_seen": 27178304, "step": 30400 }, { "epoch": 8.024547974132242, "grad_norm": 0.2678922712802887, "learning_rate": 6.77040879348459e-06, "loss": 0.084, "num_input_tokens_seen": 27182560, "step": 30405 }, { "epoch": 8.02586775768774, "grad_norm": 0.12824031710624695, "learning_rate": 6.763691922430443e-06, "loss": 0.0878, "num_input_tokens_seen": 27187008, "step": 30410 }, { "epoch": 8.027187541243237, "grad_norm": 0.16855241358280182, "learning_rate": 6.756977863644178e-06, "loss": 0.0682, "num_input_tokens_seen": 27191808, "step": 30415 }, { "epoch": 8.028507324798733, "grad_norm": 0.21065367758274078, "learning_rate": 6.7502666181611804e-06, "loss": 0.1042, "num_input_tokens_seen": 27196256, "step": 30420 }, { "epoch": 8.02982710835423, "grad_norm": 0.19942709803581238, "learning_rate": 6.743558187016405e-06, "loss": 0.0439, "num_input_tokens_seen": 27200736, "step": 30425 }, { "epoch": 8.031146891909726, "grad_norm": 0.2652882933616638, "learning_rate": 6.7368525712443925e-06, "loss": 0.0616, "num_input_tokens_seen": 27204928, "step": 30430 }, { "epoch": 8.032466675465225, "grad_norm": 0.16896645724773407, "learning_rate": 6.7301497718792155e-06, "loss": 0.0407, "num_input_tokens_seen": 27209472, "step": 30435 }, { "epoch": 8.033786459020721, "grad_norm": 0.03256889432668686, "learning_rate": 6.723449789954544e-06, "loss": 0.0326, "num_input_tokens_seen": 27213952, "step": 30440 }, { "epoch": 8.035106242576218, "grad_norm": 0.11549127101898193, "learning_rate": 6.716752626503586e-06, "loss": 0.0672, "num_input_tokens_seen": 27218528, "step": 30445 }, { "epoch": 8.036426026131714, "grad_norm": 0.04922103136777878, "learning_rate": 6.710058282559131e-06, "loss": 0.0428, "num_input_tokens_seen": 27223072, "step": 30450 }, { "epoch": 8.03774580968721, "grad_norm": 0.052635084837675095, "learning_rate": 6.703366759153545e-06, "loss": 0.0545, "num_input_tokens_seen": 27227520, "step": 30455 }, { "epoch": 8.039065593242707, "grad_norm": 0.24790987372398376, "learning_rate": 6.6966780573187335e-06, "loss": 0.0327, "num_input_tokens_seen": 27231712, "step": 30460 }, { "epoch": 8.040385376798206, "grad_norm": 0.10166924446821213, "learning_rate": 6.689992178086174e-06, "loss": 0.0208, "num_input_tokens_seen": 27236224, "step": 30465 }, { "epoch": 8.041705160353702, "grad_norm": 0.35070276260375977, "learning_rate": 6.683309122486925e-06, "loss": 0.06, "num_input_tokens_seen": 27240640, "step": 30470 }, { "epoch": 8.043024943909199, "grad_norm": 0.38466334342956543, "learning_rate": 6.676628891551584e-06, "loss": 0.0814, "num_input_tokens_seen": 27244800, "step": 30475 }, { "epoch": 8.044344727464695, "grad_norm": 0.384154349565506, "learning_rate": 6.6699514863103385e-06, "loss": 0.1041, "num_input_tokens_seen": 27249344, "step": 30480 }, { "epoch": 8.045664511020192, "grad_norm": 0.15531635284423828, "learning_rate": 6.663276907792921e-06, "loss": 0.0454, "num_input_tokens_seen": 27253920, "step": 30485 }, { "epoch": 8.04698429457569, "grad_norm": 0.2224864959716797, "learning_rate": 6.656605157028634e-06, "loss": 0.0272, "num_input_tokens_seen": 27258368, "step": 30490 }, { "epoch": 8.048304078131187, "grad_norm": 0.19829757511615753, "learning_rate": 6.649936235046358e-06, "loss": 0.0604, "num_input_tokens_seen": 27262656, "step": 30495 }, { "epoch": 8.049623861686683, "grad_norm": 0.21809379756450653, "learning_rate": 6.643270142874508e-06, "loss": 0.0455, "num_input_tokens_seen": 27267040, "step": 30500 }, { "epoch": 8.05094364524218, "grad_norm": 0.1967085599899292, "learning_rate": 6.636606881541094e-06, "loss": 0.055, "num_input_tokens_seen": 27271520, "step": 30505 }, { "epoch": 8.052263428797676, "grad_norm": 0.2552953064441681, "learning_rate": 6.629946452073662e-06, "loss": 0.0781, "num_input_tokens_seen": 27276384, "step": 30510 }, { "epoch": 8.053583212353175, "grad_norm": 0.3122285008430481, "learning_rate": 6.6232888554993375e-06, "loss": 0.0699, "num_input_tokens_seen": 27280992, "step": 30515 }, { "epoch": 8.054902995908671, "grad_norm": 0.29305362701416016, "learning_rate": 6.616634092844817e-06, "loss": 0.0948, "num_input_tokens_seen": 27285568, "step": 30520 }, { "epoch": 8.056222779464168, "grad_norm": 0.32683810591697693, "learning_rate": 6.609982165136331e-06, "loss": 0.0489, "num_input_tokens_seen": 27290272, "step": 30525 }, { "epoch": 8.057542563019664, "grad_norm": 0.16808360815048218, "learning_rate": 6.603333073399706e-06, "loss": 0.0822, "num_input_tokens_seen": 27294432, "step": 30530 }, { "epoch": 8.058862346575161, "grad_norm": 0.2741433084011078, "learning_rate": 6.596686818660308e-06, "loss": 0.0691, "num_input_tokens_seen": 27298944, "step": 30535 }, { "epoch": 8.06018213013066, "grad_norm": 0.10603894293308258, "learning_rate": 6.590043401943066e-06, "loss": 0.0666, "num_input_tokens_seen": 27303104, "step": 30540 }, { "epoch": 8.061501913686156, "grad_norm": 0.4956870377063751, "learning_rate": 6.583402824272494e-06, "loss": 0.0773, "num_input_tokens_seen": 27307392, "step": 30545 }, { "epoch": 8.062821697241652, "grad_norm": 0.12128914147615433, "learning_rate": 6.576765086672634e-06, "loss": 0.0255, "num_input_tokens_seen": 27312352, "step": 30550 }, { "epoch": 8.064141480797149, "grad_norm": 0.09428085386753082, "learning_rate": 6.57013019016712e-06, "loss": 0.0335, "num_input_tokens_seen": 27316928, "step": 30555 }, { "epoch": 8.065461264352646, "grad_norm": 0.26983922719955444, "learning_rate": 6.563498135779142e-06, "loss": 0.0429, "num_input_tokens_seen": 27321568, "step": 30560 }, { "epoch": 8.066781047908144, "grad_norm": 0.014601712115108967, "learning_rate": 6.556868924531431e-06, "loss": 0.0547, "num_input_tokens_seen": 27326112, "step": 30565 }, { "epoch": 8.06810083146364, "grad_norm": 0.049448683857917786, "learning_rate": 6.550242557446304e-06, "loss": 0.0394, "num_input_tokens_seen": 27330592, "step": 30570 }, { "epoch": 8.069420615019137, "grad_norm": 0.06863787025213242, "learning_rate": 6.543619035545634e-06, "loss": 0.0195, "num_input_tokens_seen": 27335136, "step": 30575 }, { "epoch": 8.070740398574634, "grad_norm": 0.04687635600566864, "learning_rate": 6.53699835985084e-06, "loss": 0.0201, "num_input_tokens_seen": 27339456, "step": 30580 }, { "epoch": 8.07206018213013, "grad_norm": 0.1384400576353073, "learning_rate": 6.530380531382927e-06, "loss": 0.039, "num_input_tokens_seen": 27343808, "step": 30585 }, { "epoch": 8.073379965685628, "grad_norm": 0.23200586438179016, "learning_rate": 6.523765551162433e-06, "loss": 0.0717, "num_input_tokens_seen": 27348320, "step": 30590 }, { "epoch": 8.074699749241125, "grad_norm": 0.1067097932100296, "learning_rate": 6.517153420209476e-06, "loss": 0.0332, "num_input_tokens_seen": 27352800, "step": 30595 }, { "epoch": 8.076019532796622, "grad_norm": 0.08200924098491669, "learning_rate": 6.510544139543739e-06, "loss": 0.0434, "num_input_tokens_seen": 27356864, "step": 30600 }, { "epoch": 8.076019532796622, "eval_loss": 0.06709827482700348, "eval_runtime": 64.8336, "eval_samples_per_second": 103.881, "eval_steps_per_second": 25.974, "num_input_tokens_seen": 27356864, "step": 30600 }, { "epoch": 8.077339316352118, "grad_norm": 0.2393959015607834, "learning_rate": 6.503937710184452e-06, "loss": 0.1075, "num_input_tokens_seen": 27361376, "step": 30605 }, { "epoch": 8.078659099907615, "grad_norm": 0.11751607805490494, "learning_rate": 6.4973341331503954e-06, "loss": 0.045, "num_input_tokens_seen": 27366112, "step": 30610 }, { "epoch": 8.079978883463111, "grad_norm": 0.1562637835741043, "learning_rate": 6.490733409459942e-06, "loss": 0.0556, "num_input_tokens_seen": 27370752, "step": 30615 }, { "epoch": 8.08129866701861, "grad_norm": 0.4470108151435852, "learning_rate": 6.484135540130995e-06, "loss": 0.0358, "num_input_tokens_seen": 27375232, "step": 30620 }, { "epoch": 8.082618450574106, "grad_norm": 0.04428371787071228, "learning_rate": 6.4775405261810364e-06, "loss": 0.061, "num_input_tokens_seen": 27379712, "step": 30625 }, { "epoch": 8.083938234129603, "grad_norm": 0.29352280497550964, "learning_rate": 6.470948368627092e-06, "loss": 0.1114, "num_input_tokens_seen": 27384000, "step": 30630 }, { "epoch": 8.0852580176851, "grad_norm": 0.36975812911987305, "learning_rate": 6.464359068485756e-06, "loss": 0.0439, "num_input_tokens_seen": 27388576, "step": 30635 }, { "epoch": 8.086577801240596, "grad_norm": 0.24279573559761047, "learning_rate": 6.457772626773195e-06, "loss": 0.064, "num_input_tokens_seen": 27393216, "step": 30640 }, { "epoch": 8.087897584796094, "grad_norm": 0.21298812329769135, "learning_rate": 6.451189044505104e-06, "loss": 0.038, "num_input_tokens_seen": 27397664, "step": 30645 }, { "epoch": 8.08921736835159, "grad_norm": 0.3142053186893463, "learning_rate": 6.44460832269676e-06, "loss": 0.1105, "num_input_tokens_seen": 27402272, "step": 30650 }, { "epoch": 8.090537151907087, "grad_norm": 0.041229307651519775, "learning_rate": 6.438030462363001e-06, "loss": 0.0737, "num_input_tokens_seen": 27406816, "step": 30655 }, { "epoch": 8.091856935462584, "grad_norm": 0.11774046719074249, "learning_rate": 6.431455464518205e-06, "loss": 0.0323, "num_input_tokens_seen": 27411552, "step": 30660 }, { "epoch": 8.09317671901808, "grad_norm": 0.2325984090566635, "learning_rate": 6.424883330176326e-06, "loss": 0.0435, "num_input_tokens_seen": 27416128, "step": 30665 }, { "epoch": 8.094496502573579, "grad_norm": 0.3053373396396637, "learning_rate": 6.418314060350864e-06, "loss": 0.0971, "num_input_tokens_seen": 27420736, "step": 30670 }, { "epoch": 8.095816286129075, "grad_norm": 0.1313740313053131, "learning_rate": 6.4117476560548895e-06, "loss": 0.0867, "num_input_tokens_seen": 27425376, "step": 30675 }, { "epoch": 8.097136069684572, "grad_norm": 0.05126567557454109, "learning_rate": 6.405184118301016e-06, "loss": 0.0377, "num_input_tokens_seen": 27429600, "step": 30680 }, { "epoch": 8.098455853240068, "grad_norm": 0.09584731608629227, "learning_rate": 6.398623448101434e-06, "loss": 0.0257, "num_input_tokens_seen": 27433792, "step": 30685 }, { "epoch": 8.099775636795565, "grad_norm": 0.07174982130527496, "learning_rate": 6.392065646467871e-06, "loss": 0.0257, "num_input_tokens_seen": 27438432, "step": 30690 }, { "epoch": 8.101095420351063, "grad_norm": 0.16257187724113464, "learning_rate": 6.385510714411632e-06, "loss": 0.0555, "num_input_tokens_seen": 27443136, "step": 30695 }, { "epoch": 8.10241520390656, "grad_norm": 0.038926783949136734, "learning_rate": 6.378958652943559e-06, "loss": 0.0236, "num_input_tokens_seen": 27447712, "step": 30700 }, { "epoch": 8.103734987462056, "grad_norm": 0.2738514840602875, "learning_rate": 6.3724094630740776e-06, "loss": 0.0624, "num_input_tokens_seen": 27452096, "step": 30705 }, { "epoch": 8.105054771017553, "grad_norm": 0.211082324385643, "learning_rate": 6.365863145813136e-06, "loss": 0.0358, "num_input_tokens_seen": 27456672, "step": 30710 }, { "epoch": 8.10637455457305, "grad_norm": 0.10475766658782959, "learning_rate": 6.359319702170269e-06, "loss": 0.0161, "num_input_tokens_seen": 27460832, "step": 30715 }, { "epoch": 8.107694338128548, "grad_norm": 0.1974291354417801, "learning_rate": 6.352779133154566e-06, "loss": 0.0275, "num_input_tokens_seen": 27465088, "step": 30720 }, { "epoch": 8.109014121684044, "grad_norm": 0.0966809093952179, "learning_rate": 6.346241439774648e-06, "loss": 0.0805, "num_input_tokens_seen": 27469632, "step": 30725 }, { "epoch": 8.110333905239541, "grad_norm": 0.04558365046977997, "learning_rate": 6.339706623038716e-06, "loss": 0.0447, "num_input_tokens_seen": 27474080, "step": 30730 }, { "epoch": 8.111653688795037, "grad_norm": 0.09342856705188751, "learning_rate": 6.333174683954532e-06, "loss": 0.0395, "num_input_tokens_seen": 27478496, "step": 30735 }, { "epoch": 8.112973472350534, "grad_norm": 0.08678475767374039, "learning_rate": 6.326645623529387e-06, "loss": 0.0615, "num_input_tokens_seen": 27482752, "step": 30740 }, { "epoch": 8.11429325590603, "grad_norm": 0.33435139060020447, "learning_rate": 6.320119442770156e-06, "loss": 0.0706, "num_input_tokens_seen": 27487296, "step": 30745 }, { "epoch": 8.115613039461529, "grad_norm": 0.34346190094947815, "learning_rate": 6.313596142683254e-06, "loss": 0.1007, "num_input_tokens_seen": 27492288, "step": 30750 }, { "epoch": 8.116932823017025, "grad_norm": 0.2475024163722992, "learning_rate": 6.307075724274647e-06, "loss": 0.0317, "num_input_tokens_seen": 27496576, "step": 30755 }, { "epoch": 8.118252606572522, "grad_norm": 0.18166270852088928, "learning_rate": 6.300558188549882e-06, "loss": 0.0718, "num_input_tokens_seen": 27500960, "step": 30760 }, { "epoch": 8.119572390128019, "grad_norm": 0.0950164943933487, "learning_rate": 6.29404353651403e-06, "loss": 0.0962, "num_input_tokens_seen": 27505440, "step": 30765 }, { "epoch": 8.120892173683515, "grad_norm": 0.19131559133529663, "learning_rate": 6.287531769171737e-06, "loss": 0.0557, "num_input_tokens_seen": 27509920, "step": 30770 }, { "epoch": 8.122211957239013, "grad_norm": 0.5464094877243042, "learning_rate": 6.2810228875272045e-06, "loss": 0.0966, "num_input_tokens_seen": 27514464, "step": 30775 }, { "epoch": 8.12353174079451, "grad_norm": 0.5447847843170166, "learning_rate": 6.274516892584179e-06, "loss": 0.1148, "num_input_tokens_seen": 27519040, "step": 30780 }, { "epoch": 8.124851524350007, "grad_norm": 0.2564816474914551, "learning_rate": 6.268013785345969e-06, "loss": 0.0743, "num_input_tokens_seen": 27523776, "step": 30785 }, { "epoch": 8.126171307905503, "grad_norm": 0.3408893644809723, "learning_rate": 6.26151356681543e-06, "loss": 0.0328, "num_input_tokens_seen": 27527936, "step": 30790 }, { "epoch": 8.127491091461, "grad_norm": 0.2511783838272095, "learning_rate": 6.255016237994981e-06, "loss": 0.0882, "num_input_tokens_seen": 27532128, "step": 30795 }, { "epoch": 8.128810875016498, "grad_norm": 0.26270827651023865, "learning_rate": 6.248521799886603e-06, "loss": 0.0436, "num_input_tokens_seen": 27536640, "step": 30800 }, { "epoch": 8.128810875016498, "eval_loss": 0.06699128448963165, "eval_runtime": 64.7815, "eval_samples_per_second": 103.965, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 27536640, "step": 30800 }, { "epoch": 8.130130658571995, "grad_norm": 0.25550466775894165, "learning_rate": 6.242030253491798e-06, "loss": 0.0685, "num_input_tokens_seen": 27540896, "step": 30805 }, { "epoch": 8.131450442127491, "grad_norm": 0.2523503005504608, "learning_rate": 6.235541599811656e-06, "loss": 0.0806, "num_input_tokens_seen": 27545152, "step": 30810 }, { "epoch": 8.132770225682988, "grad_norm": 0.08061458170413971, "learning_rate": 6.229055839846814e-06, "loss": 0.0364, "num_input_tokens_seen": 27549984, "step": 30815 }, { "epoch": 8.134090009238484, "grad_norm": 0.07186921685934067, "learning_rate": 6.222572974597455e-06, "loss": 0.0175, "num_input_tokens_seen": 27554368, "step": 30820 }, { "epoch": 8.135409792793983, "grad_norm": 0.3117089867591858, "learning_rate": 6.216093005063306e-06, "loss": 0.0568, "num_input_tokens_seen": 27558720, "step": 30825 }, { "epoch": 8.13672957634948, "grad_norm": 0.36000946164131165, "learning_rate": 6.209615932243678e-06, "loss": 0.0854, "num_input_tokens_seen": 27562976, "step": 30830 }, { "epoch": 8.138049359904976, "grad_norm": 0.0566524975001812, "learning_rate": 6.203141757137399e-06, "loss": 0.0561, "num_input_tokens_seen": 27567520, "step": 30835 }, { "epoch": 8.139369143460472, "grad_norm": 0.13415862619876862, "learning_rate": 6.196670480742886e-06, "loss": 0.0432, "num_input_tokens_seen": 27571872, "step": 30840 }, { "epoch": 8.140688927015969, "grad_norm": 0.12202560901641846, "learning_rate": 6.190202104058074e-06, "loss": 0.0429, "num_input_tokens_seen": 27576384, "step": 30845 }, { "epoch": 8.142008710571467, "grad_norm": 0.03247178718447685, "learning_rate": 6.183736628080475e-06, "loss": 0.0431, "num_input_tokens_seen": 27580800, "step": 30850 }, { "epoch": 8.143328494126964, "grad_norm": 0.36564645171165466, "learning_rate": 6.177274053807155e-06, "loss": 0.0771, "num_input_tokens_seen": 27585568, "step": 30855 }, { "epoch": 8.14464827768246, "grad_norm": 0.06069609895348549, "learning_rate": 6.170814382234713e-06, "loss": 0.0806, "num_input_tokens_seen": 27590080, "step": 30860 }, { "epoch": 8.145968061237957, "grad_norm": 0.03083733841776848, "learning_rate": 6.16435761435932e-06, "loss": 0.0215, "num_input_tokens_seen": 27594464, "step": 30865 }, { "epoch": 8.147287844793453, "grad_norm": 0.22972054779529572, "learning_rate": 6.157903751176681e-06, "loss": 0.0818, "num_input_tokens_seen": 27598816, "step": 30870 }, { "epoch": 8.14860762834895, "grad_norm": 0.28183144330978394, "learning_rate": 6.151452793682066e-06, "loss": 0.0367, "num_input_tokens_seen": 27603200, "step": 30875 }, { "epoch": 8.149927411904448, "grad_norm": 0.1983894556760788, "learning_rate": 6.145004742870305e-06, "loss": 0.034, "num_input_tokens_seen": 27607488, "step": 30880 }, { "epoch": 8.151247195459945, "grad_norm": 0.16630153357982635, "learning_rate": 6.138559599735752e-06, "loss": 0.0595, "num_input_tokens_seen": 27611968, "step": 30885 }, { "epoch": 8.152566979015441, "grad_norm": 0.15515772998332977, "learning_rate": 6.132117365272344e-06, "loss": 0.0859, "num_input_tokens_seen": 27616608, "step": 30890 }, { "epoch": 8.153886762570938, "grad_norm": 0.72199547290802, "learning_rate": 6.125678040473545e-06, "loss": 0.0723, "num_input_tokens_seen": 27620960, "step": 30895 }, { "epoch": 8.155206546126434, "grad_norm": 0.3908475637435913, "learning_rate": 6.1192416263323755e-06, "loss": 0.1086, "num_input_tokens_seen": 27625216, "step": 30900 }, { "epoch": 8.156526329681933, "grad_norm": 0.3268996775150299, "learning_rate": 6.112808123841424e-06, "loss": 0.0424, "num_input_tokens_seen": 27629696, "step": 30905 }, { "epoch": 8.15784611323743, "grad_norm": 0.25765612721443176, "learning_rate": 6.106377533992805e-06, "loss": 0.0705, "num_input_tokens_seen": 27634144, "step": 30910 }, { "epoch": 8.159165896792926, "grad_norm": 0.12074697017669678, "learning_rate": 6.099949857778204e-06, "loss": 0.0138, "num_input_tokens_seen": 27638656, "step": 30915 }, { "epoch": 8.160485680348422, "grad_norm": 0.2204914540052414, "learning_rate": 6.093525096188852e-06, "loss": 0.0687, "num_input_tokens_seen": 27642880, "step": 30920 }, { "epoch": 8.161805463903919, "grad_norm": 0.1614728420972824, "learning_rate": 6.087103250215518e-06, "loss": 0.036, "num_input_tokens_seen": 27647168, "step": 30925 }, { "epoch": 8.163125247459417, "grad_norm": 0.2823449373245239, "learning_rate": 6.080684320848537e-06, "loss": 0.0682, "num_input_tokens_seen": 27651680, "step": 30930 }, { "epoch": 8.164445031014914, "grad_norm": 0.3879050314426422, "learning_rate": 6.074268309077794e-06, "loss": 0.0497, "num_input_tokens_seen": 27656288, "step": 30935 }, { "epoch": 8.16576481457041, "grad_norm": 0.32613587379455566, "learning_rate": 6.067855215892709e-06, "loss": 0.0651, "num_input_tokens_seen": 27660640, "step": 30940 }, { "epoch": 8.167084598125907, "grad_norm": 0.06942100077867508, "learning_rate": 6.061445042282271e-06, "loss": 0.0473, "num_input_tokens_seen": 27665024, "step": 30945 }, { "epoch": 8.168404381681404, "grad_norm": 0.05501936748623848, "learning_rate": 6.055037789234999e-06, "loss": 0.0574, "num_input_tokens_seen": 27669504, "step": 30950 }, { "epoch": 8.169724165236902, "grad_norm": 0.22663424909114838, "learning_rate": 6.048633457738975e-06, "loss": 0.054, "num_input_tokens_seen": 27674176, "step": 30955 }, { "epoch": 8.171043948792398, "grad_norm": 0.13617445528507233, "learning_rate": 6.042232048781837e-06, "loss": 0.0641, "num_input_tokens_seen": 27678624, "step": 30960 }, { "epoch": 8.172363732347895, "grad_norm": 0.34475985169410706, "learning_rate": 6.035833563350757e-06, "loss": 0.0509, "num_input_tokens_seen": 27683168, "step": 30965 }, { "epoch": 8.173683515903392, "grad_norm": 0.029613815248012543, "learning_rate": 6.0294380024324525e-06, "loss": 0.0323, "num_input_tokens_seen": 27687392, "step": 30970 }, { "epoch": 8.175003299458888, "grad_norm": 0.2132827192544937, "learning_rate": 6.023045367013213e-06, "loss": 0.09, "num_input_tokens_seen": 27691840, "step": 30975 }, { "epoch": 8.176323083014386, "grad_norm": 0.5717481374740601, "learning_rate": 6.016655658078851e-06, "loss": 0.0719, "num_input_tokens_seen": 27696096, "step": 30980 }, { "epoch": 8.177642866569883, "grad_norm": 0.04379801079630852, "learning_rate": 6.010268876614753e-06, "loss": 0.0742, "num_input_tokens_seen": 27700608, "step": 30985 }, { "epoch": 8.17896265012538, "grad_norm": 0.45642977952957153, "learning_rate": 6.0038850236058266e-06, "loss": 0.0314, "num_input_tokens_seen": 27705344, "step": 30990 }, { "epoch": 8.180282433680876, "grad_norm": 0.02169051207602024, "learning_rate": 5.997504100036549e-06, "loss": 0.0434, "num_input_tokens_seen": 27709920, "step": 30995 }, { "epoch": 8.181602217236373, "grad_norm": 0.40946316719055176, "learning_rate": 5.991126106890949e-06, "loss": 0.0449, "num_input_tokens_seen": 27714496, "step": 31000 }, { "epoch": 8.181602217236373, "eval_loss": 0.06689025461673737, "eval_runtime": 64.8118, "eval_samples_per_second": 103.916, "eval_steps_per_second": 25.983, "num_input_tokens_seen": 27714496, "step": 31000 }, { "epoch": 8.18292200079187, "grad_norm": 0.2756527066230774, "learning_rate": 5.984751045152576e-06, "loss": 0.0865, "num_input_tokens_seen": 27718976, "step": 31005 }, { "epoch": 8.184241784347368, "grad_norm": 0.11364801973104477, "learning_rate": 5.978378915804553e-06, "loss": 0.0464, "num_input_tokens_seen": 27723488, "step": 31010 }, { "epoch": 8.185561567902864, "grad_norm": 0.07122594118118286, "learning_rate": 5.972009719829547e-06, "loss": 0.0309, "num_input_tokens_seen": 27728064, "step": 31015 }, { "epoch": 8.18688135145836, "grad_norm": 0.28265488147735596, "learning_rate": 5.965643458209755e-06, "loss": 0.08, "num_input_tokens_seen": 27732800, "step": 31020 }, { "epoch": 8.188201135013857, "grad_norm": 0.044520679861307144, "learning_rate": 5.95928013192695e-06, "loss": 0.0808, "num_input_tokens_seen": 27737536, "step": 31025 }, { "epoch": 8.189520918569354, "grad_norm": 0.21377992630004883, "learning_rate": 5.952919741962423e-06, "loss": 0.0371, "num_input_tokens_seen": 27741824, "step": 31030 }, { "epoch": 8.190840702124852, "grad_norm": 0.06417445093393326, "learning_rate": 5.946562289297042e-06, "loss": 0.0422, "num_input_tokens_seen": 27746592, "step": 31035 }, { "epoch": 8.192160485680349, "grad_norm": 0.09747159481048584, "learning_rate": 5.9402077749111855e-06, "loss": 0.0418, "num_input_tokens_seen": 27751104, "step": 31040 }, { "epoch": 8.193480269235845, "grad_norm": 0.19890965521335602, "learning_rate": 5.933856199784821e-06, "loss": 0.0549, "num_input_tokens_seen": 27755520, "step": 31045 }, { "epoch": 8.194800052791342, "grad_norm": 0.629007875919342, "learning_rate": 5.927507564897419e-06, "loss": 0.1318, "num_input_tokens_seen": 27760096, "step": 31050 }, { "epoch": 8.196119836346838, "grad_norm": 0.22235006093978882, "learning_rate": 5.9211618712280395e-06, "loss": 0.0688, "num_input_tokens_seen": 27764640, "step": 31055 }, { "epoch": 8.197439619902337, "grad_norm": 0.055925942957401276, "learning_rate": 5.914819119755255e-06, "loss": 0.0288, "num_input_tokens_seen": 27769024, "step": 31060 }, { "epoch": 8.198759403457833, "grad_norm": 0.3730144500732422, "learning_rate": 5.908479311457205e-06, "loss": 0.0961, "num_input_tokens_seen": 27773504, "step": 31065 }, { "epoch": 8.20007918701333, "grad_norm": 0.16459566354751587, "learning_rate": 5.902142447311559e-06, "loss": 0.0776, "num_input_tokens_seen": 27777792, "step": 31070 }, { "epoch": 8.201398970568826, "grad_norm": 0.4478456974029541, "learning_rate": 5.895808528295546e-06, "loss": 0.0845, "num_input_tokens_seen": 27782336, "step": 31075 }, { "epoch": 8.202718754124323, "grad_norm": 0.16906821727752686, "learning_rate": 5.889477555385941e-06, "loss": 0.0613, "num_input_tokens_seen": 27786752, "step": 31080 }, { "epoch": 8.204038537679821, "grad_norm": 0.3061339557170868, "learning_rate": 5.883149529559051e-06, "loss": 0.018, "num_input_tokens_seen": 27791008, "step": 31085 }, { "epoch": 8.205358321235318, "grad_norm": 0.09723767638206482, "learning_rate": 5.876824451790738e-06, "loss": 0.0394, "num_input_tokens_seen": 27795776, "step": 31090 }, { "epoch": 8.206678104790814, "grad_norm": 0.19318608939647675, "learning_rate": 5.87050232305642e-06, "loss": 0.0533, "num_input_tokens_seen": 27800032, "step": 31095 }, { "epoch": 8.207997888346311, "grad_norm": 0.4211634397506714, "learning_rate": 5.864183144331034e-06, "loss": 0.0673, "num_input_tokens_seen": 27804320, "step": 31100 }, { "epoch": 8.209317671901808, "grad_norm": 0.23600414395332336, "learning_rate": 5.857866916589089e-06, "loss": 0.0484, "num_input_tokens_seen": 27808544, "step": 31105 }, { "epoch": 8.210637455457306, "grad_norm": 0.03233930468559265, "learning_rate": 5.8515536408046216e-06, "loss": 0.0587, "num_input_tokens_seen": 27813280, "step": 31110 }, { "epoch": 8.211957239012802, "grad_norm": 0.11944656819105148, "learning_rate": 5.845243317951208e-06, "loss": 0.0519, "num_input_tokens_seen": 27817696, "step": 31115 }, { "epoch": 8.213277022568299, "grad_norm": 0.342307984828949, "learning_rate": 5.838935949001997e-06, "loss": 0.0736, "num_input_tokens_seen": 27822400, "step": 31120 }, { "epoch": 8.214596806123796, "grad_norm": 0.3261626064777374, "learning_rate": 5.8326315349296476e-06, "loss": 0.0691, "num_input_tokens_seen": 27826944, "step": 31125 }, { "epoch": 8.215916589679292, "grad_norm": 0.4118320345878601, "learning_rate": 5.826330076706396e-06, "loss": 0.0683, "num_input_tokens_seen": 27831456, "step": 31130 }, { "epoch": 8.21723637323479, "grad_norm": 0.048163801431655884, "learning_rate": 5.820031575303988e-06, "loss": 0.019, "num_input_tokens_seen": 27835936, "step": 31135 }, { "epoch": 8.218556156790287, "grad_norm": 0.10341208428144455, "learning_rate": 5.813736031693745e-06, "loss": 0.0721, "num_input_tokens_seen": 27840320, "step": 31140 }, { "epoch": 8.219875940345784, "grad_norm": 0.08211777359247208, "learning_rate": 5.807443446846522e-06, "loss": 0.0244, "num_input_tokens_seen": 27844736, "step": 31145 }, { "epoch": 8.22119572390128, "grad_norm": 0.07241077721118927, "learning_rate": 5.801153821732699e-06, "loss": 0.0159, "num_input_tokens_seen": 27848960, "step": 31150 }, { "epoch": 8.222515507456777, "grad_norm": 0.30976414680480957, "learning_rate": 5.794867157322229e-06, "loss": 0.0588, "num_input_tokens_seen": 27853312, "step": 31155 }, { "epoch": 8.223835291012273, "grad_norm": 0.2636660039424896, "learning_rate": 5.788583454584593e-06, "loss": 0.0854, "num_input_tokens_seen": 27857920, "step": 31160 }, { "epoch": 8.225155074567772, "grad_norm": 0.13529348373413086, "learning_rate": 5.7823027144888075e-06, "loss": 0.042, "num_input_tokens_seen": 27862272, "step": 31165 }, { "epoch": 8.226474858123268, "grad_norm": 0.11944790929555893, "learning_rate": 5.776024938003455e-06, "loss": 0.0379, "num_input_tokens_seen": 27866752, "step": 31170 }, { "epoch": 8.227794641678765, "grad_norm": 0.1988510936498642, "learning_rate": 5.7697501260966345e-06, "loss": 0.0388, "num_input_tokens_seen": 27871552, "step": 31175 }, { "epoch": 8.229114425234261, "grad_norm": 0.23770549893379211, "learning_rate": 5.7634782797360145e-06, "loss": 0.0381, "num_input_tokens_seen": 27876256, "step": 31180 }, { "epoch": 8.230434208789758, "grad_norm": 0.15230992436408997, "learning_rate": 5.757209399888777e-06, "loss": 0.0194, "num_input_tokens_seen": 27880832, "step": 31185 }, { "epoch": 8.231753992345256, "grad_norm": 0.3588865399360657, "learning_rate": 5.750943487521679e-06, "loss": 0.0591, "num_input_tokens_seen": 27885120, "step": 31190 }, { "epoch": 8.233073775900753, "grad_norm": 0.2936408221721649, "learning_rate": 5.744680543600986e-06, "loss": 0.0492, "num_input_tokens_seen": 27889248, "step": 31195 }, { "epoch": 8.23439355945625, "grad_norm": 0.17947396636009216, "learning_rate": 5.738420569092537e-06, "loss": 0.0607, "num_input_tokens_seen": 27893536, "step": 31200 }, { "epoch": 8.23439355945625, "eval_loss": 0.06724120676517487, "eval_runtime": 64.772, "eval_samples_per_second": 103.98, "eval_steps_per_second": 25.999, "num_input_tokens_seen": 27893536, "step": 31200 }, { "epoch": 8.235713343011746, "grad_norm": 0.14596471190452576, "learning_rate": 5.732163564961684e-06, "loss": 0.056, "num_input_tokens_seen": 27897824, "step": 31205 }, { "epoch": 8.237033126567242, "grad_norm": 0.07952891290187836, "learning_rate": 5.725909532173354e-06, "loss": 0.0471, "num_input_tokens_seen": 27902144, "step": 31210 }, { "epoch": 8.23835291012274, "grad_norm": 0.11572608351707458, "learning_rate": 5.719658471691977e-06, "loss": 0.0672, "num_input_tokens_seen": 27906400, "step": 31215 }, { "epoch": 8.239672693678237, "grad_norm": 0.05074088275432587, "learning_rate": 5.71341038448156e-06, "loss": 0.0254, "num_input_tokens_seen": 27910752, "step": 31220 }, { "epoch": 8.240992477233734, "grad_norm": 0.47607237100601196, "learning_rate": 5.707165271505635e-06, "loss": 0.0989, "num_input_tokens_seen": 27914816, "step": 31225 }, { "epoch": 8.24231226078923, "grad_norm": 0.33451172709465027, "learning_rate": 5.700923133727271e-06, "loss": 0.0586, "num_input_tokens_seen": 27918976, "step": 31230 }, { "epoch": 8.243632044344727, "grad_norm": 0.0521954782307148, "learning_rate": 5.694683972109083e-06, "loss": 0.0394, "num_input_tokens_seen": 27923584, "step": 31235 }, { "epoch": 8.244951827900225, "grad_norm": 0.2928738594055176, "learning_rate": 5.688447787613241e-06, "loss": 0.0533, "num_input_tokens_seen": 27928192, "step": 31240 }, { "epoch": 8.246271611455722, "grad_norm": 0.0311958696693182, "learning_rate": 5.6822145812014285e-06, "loss": 0.0363, "num_input_tokens_seen": 27932576, "step": 31245 }, { "epoch": 8.247591395011218, "grad_norm": 0.19424369931221008, "learning_rate": 5.675984353834896e-06, "loss": 0.0564, "num_input_tokens_seen": 27936960, "step": 31250 }, { "epoch": 8.248911178566715, "grad_norm": 0.26832765340805054, "learning_rate": 5.66975710647441e-06, "loss": 0.0392, "num_input_tokens_seen": 27941632, "step": 31255 }, { "epoch": 8.250230962122211, "grad_norm": 0.2575373649597168, "learning_rate": 5.663532840080304e-06, "loss": 0.0912, "num_input_tokens_seen": 27946144, "step": 31260 }, { "epoch": 8.251550745677708, "grad_norm": 0.3163471221923828, "learning_rate": 5.6573115556124325e-06, "loss": 0.0708, "num_input_tokens_seen": 27950880, "step": 31265 }, { "epoch": 8.252870529233206, "grad_norm": 0.12572205066680908, "learning_rate": 5.651093254030185e-06, "loss": 0.0738, "num_input_tokens_seen": 27955136, "step": 31270 }, { "epoch": 8.254190312788703, "grad_norm": 0.06410905718803406, "learning_rate": 5.644877936292514e-06, "loss": 0.0506, "num_input_tokens_seen": 27959712, "step": 31275 }, { "epoch": 8.2555100963442, "grad_norm": 0.08642936497926712, "learning_rate": 5.638665603357901e-06, "loss": 0.0346, "num_input_tokens_seen": 27964256, "step": 31280 }, { "epoch": 8.256829879899696, "grad_norm": 0.1099819540977478, "learning_rate": 5.632456256184357e-06, "loss": 0.0262, "num_input_tokens_seen": 27968800, "step": 31285 }, { "epoch": 8.258149663455193, "grad_norm": 0.2367771416902542, "learning_rate": 5.626249895729452e-06, "loss": 0.0863, "num_input_tokens_seen": 27973312, "step": 31290 }, { "epoch": 8.25946944701069, "grad_norm": 0.18472091853618622, "learning_rate": 5.620046522950273e-06, "loss": 0.0871, "num_input_tokens_seen": 27977888, "step": 31295 }, { "epoch": 8.260789230566187, "grad_norm": 0.40643802285194397, "learning_rate": 5.613846138803464e-06, "loss": 0.0736, "num_input_tokens_seen": 27982304, "step": 31300 }, { "epoch": 8.262109014121684, "grad_norm": 0.1791028082370758, "learning_rate": 5.607648744245206e-06, "loss": 0.051, "num_input_tokens_seen": 27986656, "step": 31305 }, { "epoch": 8.26342879767718, "grad_norm": 0.17201103270053864, "learning_rate": 5.601454340231207e-06, "loss": 0.0653, "num_input_tokens_seen": 27991072, "step": 31310 }, { "epoch": 8.264748581232677, "grad_norm": 0.18725524842739105, "learning_rate": 5.595262927716724e-06, "loss": 0.0433, "num_input_tokens_seen": 27995680, "step": 31315 }, { "epoch": 8.266068364788175, "grad_norm": 0.15287287533283234, "learning_rate": 5.589074507656561e-06, "loss": 0.0735, "num_input_tokens_seen": 28000000, "step": 31320 }, { "epoch": 8.267388148343672, "grad_norm": 0.04334359988570213, "learning_rate": 5.582889081005044e-06, "loss": 0.0606, "num_input_tokens_seen": 28004416, "step": 31325 }, { "epoch": 8.268707931899169, "grad_norm": 0.37684768438339233, "learning_rate": 5.5767066487160316e-06, "loss": 0.0654, "num_input_tokens_seen": 28008864, "step": 31330 }, { "epoch": 8.270027715454665, "grad_norm": 0.648932158946991, "learning_rate": 5.570527211742949e-06, "loss": 0.0916, "num_input_tokens_seen": 28013440, "step": 31335 }, { "epoch": 8.271347499010162, "grad_norm": 0.09987188875675201, "learning_rate": 5.564350771038731e-06, "loss": 0.0686, "num_input_tokens_seen": 28017792, "step": 31340 }, { "epoch": 8.27266728256566, "grad_norm": 0.2771715819835663, "learning_rate": 5.558177327555875e-06, "loss": 0.0973, "num_input_tokens_seen": 28022144, "step": 31345 }, { "epoch": 8.273987066121157, "grad_norm": 0.12438545376062393, "learning_rate": 5.552006882246388e-06, "loss": 0.0881, "num_input_tokens_seen": 28026848, "step": 31350 }, { "epoch": 8.275306849676653, "grad_norm": 0.4493320882320404, "learning_rate": 5.545839436061839e-06, "loss": 0.0386, "num_input_tokens_seen": 28031360, "step": 31355 }, { "epoch": 8.27662663323215, "grad_norm": 0.05131245404481888, "learning_rate": 5.539674989953331e-06, "loss": 0.0572, "num_input_tokens_seen": 28036160, "step": 31360 }, { "epoch": 8.277946416787646, "grad_norm": 0.15000054240226746, "learning_rate": 5.533513544871488e-06, "loss": 0.0484, "num_input_tokens_seen": 28040448, "step": 31365 }, { "epoch": 8.279266200343145, "grad_norm": 0.3825835883617401, "learning_rate": 5.527355101766493e-06, "loss": 0.0774, "num_input_tokens_seen": 28044768, "step": 31370 }, { "epoch": 8.280585983898641, "grad_norm": 0.0758058950304985, "learning_rate": 5.521199661588044e-06, "loss": 0.0335, "num_input_tokens_seen": 28049376, "step": 31375 }, { "epoch": 8.281905767454138, "grad_norm": 0.4857470989227295, "learning_rate": 5.5150472252853944e-06, "loss": 0.0931, "num_input_tokens_seen": 28053664, "step": 31380 }, { "epoch": 8.283225551009634, "grad_norm": 0.2806839346885681, "learning_rate": 5.50889779380733e-06, "loss": 0.0701, "num_input_tokens_seen": 28058112, "step": 31385 }, { "epoch": 8.28454533456513, "grad_norm": 0.09459013491868973, "learning_rate": 5.5027513681021605e-06, "loss": 0.0499, "num_input_tokens_seen": 28062688, "step": 31390 }, { "epoch": 8.285865118120629, "grad_norm": 0.11410725861787796, "learning_rate": 5.4966079491177545e-06, "loss": 0.0678, "num_input_tokens_seen": 28067392, "step": 31395 }, { "epoch": 8.287184901676126, "grad_norm": 0.08025260269641876, "learning_rate": 5.490467537801491e-06, "loss": 0.0881, "num_input_tokens_seen": 28071808, "step": 31400 }, { "epoch": 8.287184901676126, "eval_loss": 0.06709538400173187, "eval_runtime": 64.7836, "eval_samples_per_second": 103.961, "eval_steps_per_second": 25.994, "num_input_tokens_seen": 28071808, "step": 31400 }, { "epoch": 8.288504685231622, "grad_norm": 0.2685471475124359, "learning_rate": 5.484330135100313e-06, "loss": 0.0382, "num_input_tokens_seen": 28076480, "step": 31405 }, { "epoch": 8.289824468787119, "grad_norm": 0.10487838089466095, "learning_rate": 5.4781957419606785e-06, "loss": 0.0415, "num_input_tokens_seen": 28081440, "step": 31410 }, { "epoch": 8.291144252342615, "grad_norm": 0.16225402057170868, "learning_rate": 5.472064359328577e-06, "loss": 0.0244, "num_input_tokens_seen": 28086016, "step": 31415 }, { "epoch": 8.292464035898112, "grad_norm": 0.2321801632642746, "learning_rate": 5.4659359881495565e-06, "loss": 0.0301, "num_input_tokens_seen": 28090304, "step": 31420 }, { "epoch": 8.29378381945361, "grad_norm": 0.05357467755675316, "learning_rate": 5.4598106293686916e-06, "loss": 0.0832, "num_input_tokens_seen": 28094784, "step": 31425 }, { "epoch": 8.295103603009107, "grad_norm": 0.306360125541687, "learning_rate": 5.45368828393058e-06, "loss": 0.0881, "num_input_tokens_seen": 28099424, "step": 31430 }, { "epoch": 8.296423386564603, "grad_norm": 0.2016575187444687, "learning_rate": 5.44756895277937e-06, "loss": 0.0626, "num_input_tokens_seen": 28103840, "step": 31435 }, { "epoch": 8.2977431701201, "grad_norm": 0.3366232216358185, "learning_rate": 5.441452636858746e-06, "loss": 0.078, "num_input_tokens_seen": 28108064, "step": 31440 }, { "epoch": 8.299062953675596, "grad_norm": 0.22584505379199982, "learning_rate": 5.435339337111905e-06, "loss": 0.0622, "num_input_tokens_seen": 28112768, "step": 31445 }, { "epoch": 8.300382737231095, "grad_norm": 0.3065774440765381, "learning_rate": 5.42922905448161e-06, "loss": 0.0598, "num_input_tokens_seen": 28117312, "step": 31450 }, { "epoch": 8.301702520786591, "grad_norm": 0.1937895119190216, "learning_rate": 5.423121789910129e-06, "loss": 0.0615, "num_input_tokens_seen": 28121600, "step": 31455 }, { "epoch": 8.303022304342088, "grad_norm": 0.1442263424396515, "learning_rate": 5.417017544339287e-06, "loss": 0.0336, "num_input_tokens_seen": 28126048, "step": 31460 }, { "epoch": 8.304342087897584, "grad_norm": 0.2206220179796219, "learning_rate": 5.410916318710443e-06, "loss": 0.0989, "num_input_tokens_seen": 28130656, "step": 31465 }, { "epoch": 8.305661871453081, "grad_norm": 0.22425362467765808, "learning_rate": 5.404818113964466e-06, "loss": 0.0596, "num_input_tokens_seen": 28135040, "step": 31470 }, { "epoch": 8.30698165500858, "grad_norm": 0.10591068863868713, "learning_rate": 5.398722931041792e-06, "loss": 0.0545, "num_input_tokens_seen": 28139296, "step": 31475 }, { "epoch": 8.308301438564076, "grad_norm": 0.19140157103538513, "learning_rate": 5.392630770882367e-06, "loss": 0.0508, "num_input_tokens_seen": 28143840, "step": 31480 }, { "epoch": 8.309621222119572, "grad_norm": 0.07222902774810791, "learning_rate": 5.3865416344256705e-06, "loss": 0.0274, "num_input_tokens_seen": 28148256, "step": 31485 }, { "epoch": 8.310941005675069, "grad_norm": 0.1052062138915062, "learning_rate": 5.380455522610742e-06, "loss": 0.0305, "num_input_tokens_seen": 28152672, "step": 31490 }, { "epoch": 8.312260789230566, "grad_norm": 0.1660299003124237, "learning_rate": 5.374372436376116e-06, "loss": 0.037, "num_input_tokens_seen": 28157088, "step": 31495 }, { "epoch": 8.313580572786064, "grad_norm": 0.3312130570411682, "learning_rate": 5.368292376659895e-06, "loss": 0.0784, "num_input_tokens_seen": 28161312, "step": 31500 }, { "epoch": 8.31490035634156, "grad_norm": 0.08026500791311264, "learning_rate": 5.362215344399701e-06, "loss": 0.0681, "num_input_tokens_seen": 28165536, "step": 31505 }, { "epoch": 8.316220139897057, "grad_norm": 0.4170088768005371, "learning_rate": 5.356141340532678e-06, "loss": 0.0763, "num_input_tokens_seen": 28170016, "step": 31510 }, { "epoch": 8.317539923452554, "grad_norm": 0.3503752648830414, "learning_rate": 5.350070365995522e-06, "loss": 0.0723, "num_input_tokens_seen": 28174304, "step": 31515 }, { "epoch": 8.31885970700805, "grad_norm": 0.5154256820678711, "learning_rate": 5.344002421724459e-06, "loss": 0.0453, "num_input_tokens_seen": 28178816, "step": 31520 }, { "epoch": 8.320179490563547, "grad_norm": 0.16560965776443481, "learning_rate": 5.337937508655228e-06, "loss": 0.0434, "num_input_tokens_seen": 28183456, "step": 31525 }, { "epoch": 8.321499274119045, "grad_norm": 0.41666868329048157, "learning_rate": 5.331875627723126e-06, "loss": 0.046, "num_input_tokens_seen": 28187840, "step": 31530 }, { "epoch": 8.322819057674542, "grad_norm": 0.1576455682516098, "learning_rate": 5.325816779862963e-06, "loss": 0.0472, "num_input_tokens_seen": 28192288, "step": 31535 }, { "epoch": 8.324138841230038, "grad_norm": 0.42261579632759094, "learning_rate": 5.319760966009102e-06, "loss": 0.0389, "num_input_tokens_seen": 28196864, "step": 31540 }, { "epoch": 8.325458624785535, "grad_norm": 0.21517950296401978, "learning_rate": 5.3137081870954096e-06, "loss": 0.0485, "num_input_tokens_seen": 28201568, "step": 31545 }, { "epoch": 8.326778408341031, "grad_norm": 0.25522544980049133, "learning_rate": 5.307658444055313e-06, "loss": 0.0765, "num_input_tokens_seen": 28205664, "step": 31550 }, { "epoch": 8.32809819189653, "grad_norm": 0.08197813481092453, "learning_rate": 5.301611737821749e-06, "loss": 0.0816, "num_input_tokens_seen": 28210464, "step": 31555 }, { "epoch": 8.329417975452026, "grad_norm": 0.15731588006019592, "learning_rate": 5.295568069327206e-06, "loss": 0.0304, "num_input_tokens_seen": 28214816, "step": 31560 }, { "epoch": 8.330737759007523, "grad_norm": 0.22606533765792847, "learning_rate": 5.289527439503683e-06, "loss": 0.0881, "num_input_tokens_seen": 28219296, "step": 31565 }, { "epoch": 8.33205754256302, "grad_norm": 0.2702862620353699, "learning_rate": 5.28348984928273e-06, "loss": 0.0695, "num_input_tokens_seen": 28223584, "step": 31570 }, { "epoch": 8.333377326118516, "grad_norm": 0.35594359040260315, "learning_rate": 5.27745529959541e-06, "loss": 0.0986, "num_input_tokens_seen": 28227584, "step": 31575 }, { "epoch": 8.334697109674014, "grad_norm": 0.057019129395484924, "learning_rate": 5.271423791372335e-06, "loss": 0.1018, "num_input_tokens_seen": 28232096, "step": 31580 }, { "epoch": 8.33601689322951, "grad_norm": 0.08245081454515457, "learning_rate": 5.26539532554364e-06, "loss": 0.0852, "num_input_tokens_seen": 28236640, "step": 31585 }, { "epoch": 8.337336676785007, "grad_norm": 0.3890661299228668, "learning_rate": 5.25936990303898e-06, "loss": 0.0431, "num_input_tokens_seen": 28241312, "step": 31590 }, { "epoch": 8.338656460340504, "grad_norm": 0.052959244698286057, "learning_rate": 5.253347524787555e-06, "loss": 0.0695, "num_input_tokens_seen": 28245536, "step": 31595 }, { "epoch": 8.339976243896, "grad_norm": 0.4196988046169281, "learning_rate": 5.2473281917181035e-06, "loss": 0.0963, "num_input_tokens_seen": 28250112, "step": 31600 }, { "epoch": 8.339976243896, "eval_loss": 0.06694076955318451, "eval_runtime": 64.7424, "eval_samples_per_second": 104.028, "eval_steps_per_second": 26.011, "num_input_tokens_seen": 28250112, "step": 31600 }, { "epoch": 8.341296027451499, "grad_norm": 0.22888000309467316, "learning_rate": 5.241311904758864e-06, "loss": 0.07, "num_input_tokens_seen": 28254368, "step": 31605 }, { "epoch": 8.342615811006995, "grad_norm": 0.0943085104227066, "learning_rate": 5.23529866483764e-06, "loss": 0.0412, "num_input_tokens_seen": 28258816, "step": 31610 }, { "epoch": 8.343935594562492, "grad_norm": 0.16361837089061737, "learning_rate": 5.229288472881732e-06, "loss": 0.0701, "num_input_tokens_seen": 28263392, "step": 31615 }, { "epoch": 8.345255378117988, "grad_norm": 0.2628397047519684, "learning_rate": 5.2232813298180025e-06, "loss": 0.1107, "num_input_tokens_seen": 28267968, "step": 31620 }, { "epoch": 8.346575161673485, "grad_norm": 0.13943825662136078, "learning_rate": 5.217277236572824e-06, "loss": 0.0731, "num_input_tokens_seen": 28272384, "step": 31625 }, { "epoch": 8.347894945228983, "grad_norm": 0.3473707437515259, "learning_rate": 5.211276194072093e-06, "loss": 0.098, "num_input_tokens_seen": 28276928, "step": 31630 }, { "epoch": 8.34921472878448, "grad_norm": 0.38137203454971313, "learning_rate": 5.205278203241254e-06, "loss": 0.0729, "num_input_tokens_seen": 28281280, "step": 31635 }, { "epoch": 8.350534512339976, "grad_norm": 0.33100423216819763, "learning_rate": 5.199283265005278e-06, "loss": 0.0445, "num_input_tokens_seen": 28285504, "step": 31640 }, { "epoch": 8.351854295895473, "grad_norm": 0.17923983931541443, "learning_rate": 5.193291380288648e-06, "loss": 0.0333, "num_input_tokens_seen": 28289824, "step": 31645 }, { "epoch": 8.35317407945097, "grad_norm": 0.3521307110786438, "learning_rate": 5.1873025500153995e-06, "loss": 0.0535, "num_input_tokens_seen": 28294528, "step": 31650 }, { "epoch": 8.354493863006468, "grad_norm": 0.2762030363082886, "learning_rate": 5.181316775109071e-06, "loss": 0.0708, "num_input_tokens_seen": 28299168, "step": 31655 }, { "epoch": 8.355813646561964, "grad_norm": 0.039546411484479904, "learning_rate": 5.1753340564927564e-06, "loss": 0.0341, "num_input_tokens_seen": 28303488, "step": 31660 }, { "epoch": 8.357133430117461, "grad_norm": 0.08518904447555542, "learning_rate": 5.169354395089068e-06, "loss": 0.0256, "num_input_tokens_seen": 28307968, "step": 31665 }, { "epoch": 8.358453213672957, "grad_norm": 0.056911397725343704, "learning_rate": 5.1633777918201346e-06, "loss": 0.0377, "num_input_tokens_seen": 28312160, "step": 31670 }, { "epoch": 8.359772997228454, "grad_norm": 0.11146537214517593, "learning_rate": 5.157404247607625e-06, "loss": 0.0656, "num_input_tokens_seen": 28316640, "step": 31675 }, { "epoch": 8.36109278078395, "grad_norm": 0.2967804968357086, "learning_rate": 5.1514337633727454e-06, "loss": 0.0601, "num_input_tokens_seen": 28321056, "step": 31680 }, { "epoch": 8.362412564339449, "grad_norm": 0.4036308825016022, "learning_rate": 5.145466340036206e-06, "loss": 0.1365, "num_input_tokens_seen": 28325408, "step": 31685 }, { "epoch": 8.363732347894945, "grad_norm": 0.12064474076032639, "learning_rate": 5.139501978518274e-06, "loss": 0.0488, "num_input_tokens_seen": 28329728, "step": 31690 }, { "epoch": 8.365052131450442, "grad_norm": 0.07563777267932892, "learning_rate": 5.133540679738716e-06, "loss": 0.0595, "num_input_tokens_seen": 28334208, "step": 31695 }, { "epoch": 8.366371915005939, "grad_norm": 0.10374762117862701, "learning_rate": 5.127582444616838e-06, "loss": 0.028, "num_input_tokens_seen": 28338976, "step": 31700 }, { "epoch": 8.367691698561435, "grad_norm": 0.06006516143679619, "learning_rate": 5.121627274071486e-06, "loss": 0.0583, "num_input_tokens_seen": 28343520, "step": 31705 }, { "epoch": 8.369011482116933, "grad_norm": 0.2423616498708725, "learning_rate": 5.115675169021009e-06, "loss": 0.0782, "num_input_tokens_seen": 28347936, "step": 31710 }, { "epoch": 8.37033126567243, "grad_norm": 0.12826000154018402, "learning_rate": 5.1097261303832994e-06, "loss": 0.0454, "num_input_tokens_seen": 28352288, "step": 31715 }, { "epoch": 8.371651049227927, "grad_norm": 0.12992027401924133, "learning_rate": 5.103780159075788e-06, "loss": 0.0848, "num_input_tokens_seen": 28356992, "step": 31720 }, { "epoch": 8.372970832783423, "grad_norm": 0.2809655964374542, "learning_rate": 5.0978372560154e-06, "loss": 0.0514, "num_input_tokens_seen": 28361568, "step": 31725 }, { "epoch": 8.37429061633892, "grad_norm": 0.16390928626060486, "learning_rate": 5.091897422118619e-06, "loss": 0.0482, "num_input_tokens_seen": 28366272, "step": 31730 }, { "epoch": 8.375610399894418, "grad_norm": 0.060936011373996735, "learning_rate": 5.0859606583014305e-06, "loss": 0.0202, "num_input_tokens_seen": 28370688, "step": 31735 }, { "epoch": 8.376930183449915, "grad_norm": 0.12615734338760376, "learning_rate": 5.080026965479365e-06, "loss": 0.0375, "num_input_tokens_seen": 28375488, "step": 31740 }, { "epoch": 8.378249967005411, "grad_norm": 0.12842825055122375, "learning_rate": 5.074096344567475e-06, "loss": 0.0362, "num_input_tokens_seen": 28379904, "step": 31745 }, { "epoch": 8.379569750560908, "grad_norm": 0.6115501523017883, "learning_rate": 5.0681687964803294e-06, "loss": 0.0822, "num_input_tokens_seen": 28384224, "step": 31750 }, { "epoch": 8.380889534116404, "grad_norm": 0.1151709258556366, "learning_rate": 5.06224432213204e-06, "loss": 0.027, "num_input_tokens_seen": 28388736, "step": 31755 }, { "epoch": 8.382209317671903, "grad_norm": 0.18120890855789185, "learning_rate": 5.056322922436224e-06, "loss": 0.0552, "num_input_tokens_seen": 28393120, "step": 31760 }, { "epoch": 8.3835291012274, "grad_norm": 0.2290099859237671, "learning_rate": 5.0504045983060465e-06, "loss": 0.0838, "num_input_tokens_seen": 28397504, "step": 31765 }, { "epoch": 8.384848884782896, "grad_norm": 0.06172773241996765, "learning_rate": 5.044489350654183e-06, "loss": 0.0272, "num_input_tokens_seen": 28401984, "step": 31770 }, { "epoch": 8.386168668338392, "grad_norm": 0.5357003211975098, "learning_rate": 5.038577180392831e-06, "loss": 0.1154, "num_input_tokens_seen": 28406784, "step": 31775 }, { "epoch": 8.387488451893889, "grad_norm": 0.17018455266952515, "learning_rate": 5.032668088433729e-06, "loss": 0.0372, "num_input_tokens_seen": 28411168, "step": 31780 }, { "epoch": 8.388808235449387, "grad_norm": 0.09959359467029572, "learning_rate": 5.02676207568814e-06, "loss": 0.0205, "num_input_tokens_seen": 28415616, "step": 31785 }, { "epoch": 8.390128019004884, "grad_norm": 0.09072726219892502, "learning_rate": 5.02085914306683e-06, "loss": 0.0403, "num_input_tokens_seen": 28419936, "step": 31790 }, { "epoch": 8.39144780256038, "grad_norm": 0.14661361277103424, "learning_rate": 5.014959291480123e-06, "loss": 0.067, "num_input_tokens_seen": 28424416, "step": 31795 }, { "epoch": 8.392767586115877, "grad_norm": 0.17932093143463135, "learning_rate": 5.009062521837835e-06, "loss": 0.0301, "num_input_tokens_seen": 28428832, "step": 31800 }, { "epoch": 8.392767586115877, "eval_loss": 0.06704459339380264, "eval_runtime": 64.7313, "eval_samples_per_second": 104.046, "eval_steps_per_second": 26.015, "num_input_tokens_seen": 28428832, "step": 31800 }, { "epoch": 8.394087369671373, "grad_norm": 0.18261459469795227, "learning_rate": 5.003168835049324e-06, "loss": 0.0737, "num_input_tokens_seen": 28433344, "step": 31805 }, { "epoch": 8.39540715322687, "grad_norm": 0.07657528668642044, "learning_rate": 4.997278232023483e-06, "loss": 0.0469, "num_input_tokens_seen": 28437888, "step": 31810 }, { "epoch": 8.396726936782368, "grad_norm": 0.3366856575012207, "learning_rate": 4.9913907136687036e-06, "loss": 0.0579, "num_input_tokens_seen": 28442336, "step": 31815 }, { "epoch": 8.398046720337865, "grad_norm": 0.05351114645600319, "learning_rate": 4.985506280892918e-06, "loss": 0.0499, "num_input_tokens_seen": 28447168, "step": 31820 }, { "epoch": 8.399366503893361, "grad_norm": 0.19690437614917755, "learning_rate": 4.979624934603589e-06, "loss": 0.0394, "num_input_tokens_seen": 28451712, "step": 31825 }, { "epoch": 8.400686287448858, "grad_norm": 0.2705552577972412, "learning_rate": 4.97374667570768e-06, "loss": 0.037, "num_input_tokens_seen": 28456064, "step": 31830 }, { "epoch": 8.402006071004354, "grad_norm": 0.12079799175262451, "learning_rate": 4.967871505111704e-06, "loss": 0.0399, "num_input_tokens_seen": 28460672, "step": 31835 }, { "epoch": 8.403325854559853, "grad_norm": 0.2921290695667267, "learning_rate": 4.961999423721686e-06, "loss": 0.0587, "num_input_tokens_seen": 28465024, "step": 31840 }, { "epoch": 8.40464563811535, "grad_norm": 0.06648524850606918, "learning_rate": 4.956130432443159e-06, "loss": 0.0496, "num_input_tokens_seen": 28469472, "step": 31845 }, { "epoch": 8.405965421670846, "grad_norm": 0.19861769676208496, "learning_rate": 4.950264532181215e-06, "loss": 0.0566, "num_input_tokens_seen": 28473760, "step": 31850 }, { "epoch": 8.407285205226342, "grad_norm": 0.33737319707870483, "learning_rate": 4.944401723840433e-06, "loss": 0.0685, "num_input_tokens_seen": 28478592, "step": 31855 }, { "epoch": 8.408604988781839, "grad_norm": 0.2275693416595459, "learning_rate": 4.938542008324942e-06, "loss": 0.0418, "num_input_tokens_seen": 28482688, "step": 31860 }, { "epoch": 8.409924772337337, "grad_norm": 0.055784787982702255, "learning_rate": 4.9326853865383855e-06, "loss": 0.0976, "num_input_tokens_seen": 28487136, "step": 31865 }, { "epoch": 8.411244555892834, "grad_norm": 0.3049996495246887, "learning_rate": 4.926831859383918e-06, "loss": 0.0372, "num_input_tokens_seen": 28491680, "step": 31870 }, { "epoch": 8.41256433944833, "grad_norm": 0.07680006325244904, "learning_rate": 4.92098142776424e-06, "loss": 0.0623, "num_input_tokens_seen": 28496320, "step": 31875 }, { "epoch": 8.413884123003827, "grad_norm": 0.37665557861328125, "learning_rate": 4.91513409258155e-06, "loss": 0.0488, "num_input_tokens_seen": 28500928, "step": 31880 }, { "epoch": 8.415203906559324, "grad_norm": 0.3817359507083893, "learning_rate": 4.909289854737581e-06, "loss": 0.0512, "num_input_tokens_seen": 28505152, "step": 31885 }, { "epoch": 8.416523690114822, "grad_norm": 0.3784540295600891, "learning_rate": 4.903448715133602e-06, "loss": 0.042, "num_input_tokens_seen": 28509632, "step": 31890 }, { "epoch": 8.417843473670318, "grad_norm": 0.08142757415771484, "learning_rate": 4.897610674670372e-06, "loss": 0.0429, "num_input_tokens_seen": 28514336, "step": 31895 }, { "epoch": 8.419163257225815, "grad_norm": 0.06685979664325714, "learning_rate": 4.8917757342482e-06, "loss": 0.0236, "num_input_tokens_seen": 28518880, "step": 31900 }, { "epoch": 8.420483040781312, "grad_norm": 0.29032984375953674, "learning_rate": 4.885943894766909e-06, "loss": 0.0605, "num_input_tokens_seen": 28523232, "step": 31905 }, { "epoch": 8.421802824336808, "grad_norm": 0.377621591091156, "learning_rate": 4.880115157125842e-06, "loss": 0.0486, "num_input_tokens_seen": 28527776, "step": 31910 }, { "epoch": 8.423122607892306, "grad_norm": 0.20871679484844208, "learning_rate": 4.874289522223857e-06, "loss": 0.0925, "num_input_tokens_seen": 28532384, "step": 31915 }, { "epoch": 8.424442391447803, "grad_norm": 0.05319749936461449, "learning_rate": 4.868466990959339e-06, "loss": 0.0592, "num_input_tokens_seen": 28536992, "step": 31920 }, { "epoch": 8.4257621750033, "grad_norm": 0.1837724894285202, "learning_rate": 4.8626475642301964e-06, "loss": 0.0606, "num_input_tokens_seen": 28541600, "step": 31925 }, { "epoch": 8.427081958558796, "grad_norm": 0.3020378053188324, "learning_rate": 4.856831242933871e-06, "loss": 0.0336, "num_input_tokens_seen": 28545696, "step": 31930 }, { "epoch": 8.428401742114293, "grad_norm": 0.20873194932937622, "learning_rate": 4.851018027967294e-06, "loss": 0.0799, "num_input_tokens_seen": 28550048, "step": 31935 }, { "epoch": 8.429721525669791, "grad_norm": 0.03155635669827461, "learning_rate": 4.845207920226946e-06, "loss": 0.1012, "num_input_tokens_seen": 28554560, "step": 31940 }, { "epoch": 8.431041309225288, "grad_norm": 0.3218725025653839, "learning_rate": 4.839400920608825e-06, "loss": 0.0996, "num_input_tokens_seen": 28559008, "step": 31945 }, { "epoch": 8.432361092780784, "grad_norm": 0.28791195154190063, "learning_rate": 4.83359703000843e-06, "loss": 0.0647, "num_input_tokens_seen": 28563392, "step": 31950 }, { "epoch": 8.43368087633628, "grad_norm": 0.26693063974380493, "learning_rate": 4.827796249320804e-06, "loss": 0.0689, "num_input_tokens_seen": 28567808, "step": 31955 }, { "epoch": 8.435000659891777, "grad_norm": 0.07012023776769638, "learning_rate": 4.82199857944049e-06, "loss": 0.0545, "num_input_tokens_seen": 28572000, "step": 31960 }, { "epoch": 8.436320443447274, "grad_norm": 0.1225777342915535, "learning_rate": 4.8162040212615695e-06, "loss": 0.0638, "num_input_tokens_seen": 28576448, "step": 31965 }, { "epoch": 8.437640227002772, "grad_norm": 0.3847530484199524, "learning_rate": 4.810412575677639e-06, "loss": 0.066, "num_input_tokens_seen": 28581120, "step": 31970 }, { "epoch": 8.438960010558269, "grad_norm": 0.17505361139774323, "learning_rate": 4.804624243581801e-06, "loss": 0.0857, "num_input_tokens_seen": 28585184, "step": 31975 }, { "epoch": 8.440279794113765, "grad_norm": 0.22402746975421906, "learning_rate": 4.798839025866703e-06, "loss": 0.0603, "num_input_tokens_seen": 28589600, "step": 31980 }, { "epoch": 8.441599577669262, "grad_norm": 0.2544792592525482, "learning_rate": 4.793056923424491e-06, "loss": 0.0905, "num_input_tokens_seen": 28594240, "step": 31985 }, { "epoch": 8.442919361224758, "grad_norm": 0.2715524137020111, "learning_rate": 4.78727793714683e-06, "loss": 0.1232, "num_input_tokens_seen": 28598528, "step": 31990 }, { "epoch": 8.444239144780257, "grad_norm": 0.398752361536026, "learning_rate": 4.7815020679249285e-06, "loss": 0.0967, "num_input_tokens_seen": 28602912, "step": 31995 }, { "epoch": 8.445558928335753, "grad_norm": 0.22669674456119537, "learning_rate": 4.775729316649483e-06, "loss": 0.0454, "num_input_tokens_seen": 28607296, "step": 32000 }, { "epoch": 8.445558928335753, "eval_loss": 0.06695601344108582, "eval_runtime": 64.7583, "eval_samples_per_second": 104.002, "eval_steps_per_second": 26.004, "num_input_tokens_seen": 28607296, "step": 32000 }, { "epoch": 8.44687871189125, "grad_norm": 0.19208596646785736, "learning_rate": 4.769959684210728e-06, "loss": 0.039, "num_input_tokens_seen": 28611648, "step": 32005 }, { "epoch": 8.448198495446746, "grad_norm": 0.39272093772888184, "learning_rate": 4.764193171498426e-06, "loss": 0.0582, "num_input_tokens_seen": 28616160, "step": 32010 }, { "epoch": 8.449518279002243, "grad_norm": 0.1660929173231125, "learning_rate": 4.75842977940183e-06, "loss": 0.0306, "num_input_tokens_seen": 28620352, "step": 32015 }, { "epoch": 8.450838062557741, "grad_norm": 0.15429633855819702, "learning_rate": 4.752669508809729e-06, "loss": 0.0638, "num_input_tokens_seen": 28625024, "step": 32020 }, { "epoch": 8.452157846113238, "grad_norm": 0.11691737174987793, "learning_rate": 4.746912360610445e-06, "loss": 0.059, "num_input_tokens_seen": 28629472, "step": 32025 }, { "epoch": 8.453477629668734, "grad_norm": 0.3263303339481354, "learning_rate": 4.741158335691781e-06, "loss": 0.0466, "num_input_tokens_seen": 28633728, "step": 32030 }, { "epoch": 8.454797413224231, "grad_norm": 0.17225512862205505, "learning_rate": 4.7354074349410994e-06, "loss": 0.096, "num_input_tokens_seen": 28638208, "step": 32035 }, { "epoch": 8.456117196779728, "grad_norm": 0.06650277227163315, "learning_rate": 4.729659659245245e-06, "loss": 0.0525, "num_input_tokens_seen": 28642848, "step": 32040 }, { "epoch": 8.457436980335226, "grad_norm": 0.25759896636009216, "learning_rate": 4.723915009490601e-06, "loss": 0.0431, "num_input_tokens_seen": 28647104, "step": 32045 }, { "epoch": 8.458756763890722, "grad_norm": 0.24824389815330505, "learning_rate": 4.718173486563077e-06, "loss": 0.0282, "num_input_tokens_seen": 28651488, "step": 32050 }, { "epoch": 8.460076547446219, "grad_norm": 0.08176346868276596, "learning_rate": 4.71243509134808e-06, "loss": 0.075, "num_input_tokens_seen": 28655872, "step": 32055 }, { "epoch": 8.461396331001716, "grad_norm": 0.2675381302833557, "learning_rate": 4.706699824730532e-06, "loss": 0.0472, "num_input_tokens_seen": 28660448, "step": 32060 }, { "epoch": 8.462716114557212, "grad_norm": 0.16624954342842102, "learning_rate": 4.700967687594901e-06, "loss": 0.0685, "num_input_tokens_seen": 28664896, "step": 32065 }, { "epoch": 8.464035898112709, "grad_norm": 0.3976919651031494, "learning_rate": 4.69523868082514e-06, "loss": 0.1168, "num_input_tokens_seen": 28669504, "step": 32070 }, { "epoch": 8.465355681668207, "grad_norm": 0.2948966920375824, "learning_rate": 4.689512805304747e-06, "loss": 0.0699, "num_input_tokens_seen": 28674016, "step": 32075 }, { "epoch": 8.466675465223704, "grad_norm": 0.16364853084087372, "learning_rate": 4.683790061916707e-06, "loss": 0.0632, "num_input_tokens_seen": 28678208, "step": 32080 }, { "epoch": 8.4679952487792, "grad_norm": 0.2183244228363037, "learning_rate": 4.678070451543551e-06, "loss": 0.0397, "num_input_tokens_seen": 28682752, "step": 32085 }, { "epoch": 8.469315032334697, "grad_norm": 0.28774094581604004, "learning_rate": 4.6723539750673204e-06, "loss": 0.1189, "num_input_tokens_seen": 28687200, "step": 32090 }, { "epoch": 8.470634815890193, "grad_norm": 0.08767541497945786, "learning_rate": 4.666640633369551e-06, "loss": 0.0374, "num_input_tokens_seen": 28691712, "step": 32095 }, { "epoch": 8.471954599445692, "grad_norm": 0.06807275116443634, "learning_rate": 4.660930427331323e-06, "loss": 0.0341, "num_input_tokens_seen": 28696224, "step": 32100 }, { "epoch": 8.473274383001188, "grad_norm": 0.3960869312286377, "learning_rate": 4.6552233578332244e-06, "loss": 0.069, "num_input_tokens_seen": 28700832, "step": 32105 }, { "epoch": 8.474594166556685, "grad_norm": 0.3786757290363312, "learning_rate": 4.649519425755347e-06, "loss": 0.075, "num_input_tokens_seen": 28705280, "step": 32110 }, { "epoch": 8.475913950112181, "grad_norm": 0.2847622334957123, "learning_rate": 4.64381863197732e-06, "loss": 0.0499, "num_input_tokens_seen": 28710048, "step": 32115 }, { "epoch": 8.477233733667678, "grad_norm": 0.2357923984527588, "learning_rate": 4.638120977378269e-06, "loss": 0.0758, "num_input_tokens_seen": 28714752, "step": 32120 }, { "epoch": 8.478553517223176, "grad_norm": 0.39049309492111206, "learning_rate": 4.632426462836848e-06, "loss": 0.1325, "num_input_tokens_seen": 28719296, "step": 32125 }, { "epoch": 8.479873300778673, "grad_norm": 0.24942511320114136, "learning_rate": 4.626735089231224e-06, "loss": 0.0555, "num_input_tokens_seen": 28723872, "step": 32130 }, { "epoch": 8.48119308433417, "grad_norm": 0.11647042632102966, "learning_rate": 4.621046857439068e-06, "loss": 0.0535, "num_input_tokens_seen": 28728672, "step": 32135 }, { "epoch": 8.482512867889666, "grad_norm": 0.05813154578208923, "learning_rate": 4.615361768337587e-06, "loss": 0.0607, "num_input_tokens_seen": 28733248, "step": 32140 }, { "epoch": 8.483832651445162, "grad_norm": 0.2970067262649536, "learning_rate": 4.6096798228034946e-06, "loss": 0.082, "num_input_tokens_seen": 28737504, "step": 32145 }, { "epoch": 8.48515243500066, "grad_norm": 0.04213309288024902, "learning_rate": 4.604001021713008e-06, "loss": 0.077, "num_input_tokens_seen": 28741952, "step": 32150 }, { "epoch": 8.486472218556157, "grad_norm": 0.4650309979915619, "learning_rate": 4.598325365941883e-06, "loss": 0.0628, "num_input_tokens_seen": 28746656, "step": 32155 }, { "epoch": 8.487792002111654, "grad_norm": 0.2871243953704834, "learning_rate": 4.5926528563653645e-06, "loss": 0.1086, "num_input_tokens_seen": 28750976, "step": 32160 }, { "epoch": 8.48911178566715, "grad_norm": 0.2560248374938965, "learning_rate": 4.5869834938582295e-06, "loss": 0.0926, "num_input_tokens_seen": 28755520, "step": 32165 }, { "epoch": 8.490431569222647, "grad_norm": 0.05088259279727936, "learning_rate": 4.581317279294772e-06, "loss": 0.0398, "num_input_tokens_seen": 28759936, "step": 32170 }, { "epoch": 8.491751352778145, "grad_norm": 0.0860687792301178, "learning_rate": 4.57565421354878e-06, "loss": 0.0399, "num_input_tokens_seen": 28764480, "step": 32175 }, { "epoch": 8.493071136333642, "grad_norm": 0.13699094951152802, "learning_rate": 4.569994297493579e-06, "loss": 0.0499, "num_input_tokens_seen": 28769088, "step": 32180 }, { "epoch": 8.494390919889138, "grad_norm": 0.1934330016374588, "learning_rate": 4.564337532002002e-06, "loss": 0.0447, "num_input_tokens_seen": 28773696, "step": 32185 }, { "epoch": 8.495710703444635, "grad_norm": 0.11536159366369247, "learning_rate": 4.55868391794638e-06, "loss": 0.0873, "num_input_tokens_seen": 28778272, "step": 32190 }, { "epoch": 8.497030487000131, "grad_norm": 0.2240523248910904, "learning_rate": 4.553033456198588e-06, "loss": 0.0216, "num_input_tokens_seen": 28782496, "step": 32195 }, { "epoch": 8.49835027055563, "grad_norm": 0.19347232580184937, "learning_rate": 4.54738614762999e-06, "loss": 0.0416, "num_input_tokens_seen": 28787040, "step": 32200 }, { "epoch": 8.49835027055563, "eval_loss": 0.06675978004932404, "eval_runtime": 64.7846, "eval_samples_per_second": 103.96, "eval_steps_per_second": 25.994, "num_input_tokens_seen": 28787040, "step": 32200 }, { "epoch": 8.499670054111126, "grad_norm": 0.0561528354883194, "learning_rate": 4.541741993111465e-06, "loss": 0.03, "num_input_tokens_seen": 28791424, "step": 32205 }, { "epoch": 8.500989837666623, "grad_norm": 0.11692935228347778, "learning_rate": 4.536100993513423e-06, "loss": 0.0189, "num_input_tokens_seen": 28795808, "step": 32210 }, { "epoch": 8.50230962122212, "grad_norm": 0.03888910263776779, "learning_rate": 4.530463149705768e-06, "loss": 0.0288, "num_input_tokens_seen": 28800288, "step": 32215 }, { "epoch": 8.503629404777616, "grad_norm": 0.2690803110599518, "learning_rate": 4.524828462557934e-06, "loss": 0.0787, "num_input_tokens_seen": 28804576, "step": 32220 }, { "epoch": 8.504949188333114, "grad_norm": 0.04509957507252693, "learning_rate": 4.5191969329388625e-06, "loss": 0.0657, "num_input_tokens_seen": 28809312, "step": 32225 }, { "epoch": 8.50626897188861, "grad_norm": 0.2366829365491867, "learning_rate": 4.5135685617169965e-06, "loss": 0.0544, "num_input_tokens_seen": 28813504, "step": 32230 }, { "epoch": 8.507588755444107, "grad_norm": 0.2822888195514679, "learning_rate": 4.507943349760313e-06, "loss": 0.0594, "num_input_tokens_seen": 28817920, "step": 32235 }, { "epoch": 8.508908538999604, "grad_norm": 0.2680598497390747, "learning_rate": 4.502321297936277e-06, "loss": 0.0304, "num_input_tokens_seen": 28822240, "step": 32240 }, { "epoch": 8.5102283225551, "grad_norm": 0.19376246631145477, "learning_rate": 4.496702407111888e-06, "loss": 0.0915, "num_input_tokens_seen": 28826848, "step": 32245 }, { "epoch": 8.511548106110597, "grad_norm": 0.08419783413410187, "learning_rate": 4.491086678153653e-06, "loss": 0.0405, "num_input_tokens_seen": 28831808, "step": 32250 }, { "epoch": 8.512867889666095, "grad_norm": 0.16952945291996002, "learning_rate": 4.485474111927579e-06, "loss": 0.0542, "num_input_tokens_seen": 28836224, "step": 32255 }, { "epoch": 8.514187673221592, "grad_norm": 0.5525137782096863, "learning_rate": 4.479864709299197e-06, "loss": 0.0905, "num_input_tokens_seen": 28840832, "step": 32260 }, { "epoch": 8.515507456777089, "grad_norm": 0.13719230890274048, "learning_rate": 4.474258471133555e-06, "loss": 0.0303, "num_input_tokens_seen": 28845504, "step": 32265 }, { "epoch": 8.516827240332585, "grad_norm": 0.04424746707081795, "learning_rate": 4.4686553982952014e-06, "loss": 0.0218, "num_input_tokens_seen": 28849920, "step": 32270 }, { "epoch": 8.518147023888082, "grad_norm": 0.06465354561805725, "learning_rate": 4.463055491648191e-06, "loss": 0.0309, "num_input_tokens_seen": 28854272, "step": 32275 }, { "epoch": 8.51946680744358, "grad_norm": 0.4480089843273163, "learning_rate": 4.457458752056112e-06, "loss": 0.1041, "num_input_tokens_seen": 28858784, "step": 32280 }, { "epoch": 8.520786590999077, "grad_norm": 0.05316169559955597, "learning_rate": 4.451865180382042e-06, "loss": 0.0508, "num_input_tokens_seen": 28863520, "step": 32285 }, { "epoch": 8.522106374554573, "grad_norm": 0.07662070542573929, "learning_rate": 4.4462747774885936e-06, "loss": 0.0408, "num_input_tokens_seen": 28867840, "step": 32290 }, { "epoch": 8.52342615811007, "grad_norm": 0.42241764068603516, "learning_rate": 4.440687544237859e-06, "loss": 0.0768, "num_input_tokens_seen": 28872192, "step": 32295 }, { "epoch": 8.524745941665566, "grad_norm": 0.2723045349121094, "learning_rate": 4.435103481491471e-06, "loss": 0.0362, "num_input_tokens_seen": 28876640, "step": 32300 }, { "epoch": 8.526065725221065, "grad_norm": 0.04508015513420105, "learning_rate": 4.429522590110569e-06, "loss": 0.0231, "num_input_tokens_seen": 28881632, "step": 32305 }, { "epoch": 8.527385508776561, "grad_norm": 0.12172966450452805, "learning_rate": 4.423944870955779e-06, "loss": 0.0481, "num_input_tokens_seen": 28885792, "step": 32310 }, { "epoch": 8.528705292332058, "grad_norm": 0.10600674152374268, "learning_rate": 4.418370324887272e-06, "loss": 0.0596, "num_input_tokens_seen": 28890464, "step": 32315 }, { "epoch": 8.530025075887554, "grad_norm": 0.06982612609863281, "learning_rate": 4.412798952764699e-06, "loss": 0.037, "num_input_tokens_seen": 28894816, "step": 32320 }, { "epoch": 8.53134485944305, "grad_norm": 0.4049108624458313, "learning_rate": 4.407230755447245e-06, "loss": 0.0594, "num_input_tokens_seen": 28899200, "step": 32325 }, { "epoch": 8.532664642998547, "grad_norm": 0.29336977005004883, "learning_rate": 4.401665733793598e-06, "loss": 0.0729, "num_input_tokens_seen": 28903392, "step": 32330 }, { "epoch": 8.533984426554046, "grad_norm": 0.23175837099552155, "learning_rate": 4.3961038886619425e-06, "loss": 0.0586, "num_input_tokens_seen": 28907872, "step": 32335 }, { "epoch": 8.535304210109542, "grad_norm": 0.15440090000629425, "learning_rate": 4.39054522091e-06, "loss": 0.0789, "num_input_tokens_seen": 28912480, "step": 32340 }, { "epoch": 8.536623993665039, "grad_norm": 0.24717773497104645, "learning_rate": 4.384989731394979e-06, "loss": 0.1021, "num_input_tokens_seen": 28916992, "step": 32345 }, { "epoch": 8.537943777220535, "grad_norm": 0.19682781398296356, "learning_rate": 4.379437420973598e-06, "loss": 0.0339, "num_input_tokens_seen": 28921376, "step": 32350 }, { "epoch": 8.539263560776032, "grad_norm": 0.07588683813810349, "learning_rate": 4.373888290502107e-06, "loss": 0.0332, "num_input_tokens_seen": 28926080, "step": 32355 }, { "epoch": 8.54058334433153, "grad_norm": 0.5153923034667969, "learning_rate": 4.36834234083624e-06, "loss": 0.1028, "num_input_tokens_seen": 28930624, "step": 32360 }, { "epoch": 8.541903127887027, "grad_norm": 0.2605419158935547, "learning_rate": 4.362799572831258e-06, "loss": 0.0487, "num_input_tokens_seen": 28935040, "step": 32365 }, { "epoch": 8.543222911442523, "grad_norm": 0.056142859160900116, "learning_rate": 4.35725998734193e-06, "loss": 0.0844, "num_input_tokens_seen": 28939616, "step": 32370 }, { "epoch": 8.54454269499802, "grad_norm": 0.1018192395567894, "learning_rate": 4.3517235852225195e-06, "loss": 0.0972, "num_input_tokens_seen": 28943968, "step": 32375 }, { "epoch": 8.545862478553516, "grad_norm": 0.17404715716838837, "learning_rate": 4.346190367326822e-06, "loss": 0.0716, "num_input_tokens_seen": 28948576, "step": 32380 }, { "epoch": 8.547182262109015, "grad_norm": 0.262067049741745, "learning_rate": 4.340660334508115e-06, "loss": 0.0353, "num_input_tokens_seen": 28952992, "step": 32385 }, { "epoch": 8.548502045664511, "grad_norm": 0.22257110476493835, "learning_rate": 4.335133487619206e-06, "loss": 0.1131, "num_input_tokens_seen": 28957344, "step": 32390 }, { "epoch": 8.549821829220008, "grad_norm": 0.11056710034608841, "learning_rate": 4.329609827512409e-06, "loss": 0.052, "num_input_tokens_seen": 28962080, "step": 32395 }, { "epoch": 8.551141612775504, "grad_norm": 0.09083539992570877, "learning_rate": 4.324089355039531e-06, "loss": 0.0382, "num_input_tokens_seen": 28966560, "step": 32400 }, { "epoch": 8.551141612775504, "eval_loss": 0.06670115888118744, "eval_runtime": 64.7671, "eval_samples_per_second": 103.988, "eval_steps_per_second": 26.001, "num_input_tokens_seen": 28966560, "step": 32400 }, { "epoch": 8.552461396331001, "grad_norm": 0.20540232956409454, "learning_rate": 4.3185720710519075e-06, "loss": 0.042, "num_input_tokens_seen": 28970944, "step": 32405 }, { "epoch": 8.5537811798865, "grad_norm": 0.14135994017124176, "learning_rate": 4.3130579764003724e-06, "loss": 0.0324, "num_input_tokens_seen": 28975488, "step": 32410 }, { "epoch": 8.555100963441996, "grad_norm": 0.14959384500980377, "learning_rate": 4.307547071935267e-06, "loss": 0.1375, "num_input_tokens_seen": 28980000, "step": 32415 }, { "epoch": 8.556420746997492, "grad_norm": 0.22653919458389282, "learning_rate": 4.302039358506435e-06, "loss": 0.0585, "num_input_tokens_seen": 28984224, "step": 32420 }, { "epoch": 8.557740530552989, "grad_norm": 0.10717340558767319, "learning_rate": 4.296534836963245e-06, "loss": 0.0707, "num_input_tokens_seen": 28988544, "step": 32425 }, { "epoch": 8.559060314108486, "grad_norm": 0.1399032026529312, "learning_rate": 4.291033508154555e-06, "loss": 0.0327, "num_input_tokens_seen": 28992928, "step": 32430 }, { "epoch": 8.560380097663984, "grad_norm": 0.22474636137485504, "learning_rate": 4.285535372928748e-06, "loss": 0.0903, "num_input_tokens_seen": 28997408, "step": 32435 }, { "epoch": 8.56169988121948, "grad_norm": 0.522908091545105, "learning_rate": 4.280040432133695e-06, "loss": 0.0907, "num_input_tokens_seen": 29002112, "step": 32440 }, { "epoch": 8.563019664774977, "grad_norm": 0.13207074999809265, "learning_rate": 4.274548686616789e-06, "loss": 0.0789, "num_input_tokens_seen": 29006368, "step": 32445 }, { "epoch": 8.564339448330474, "grad_norm": 0.2553294003009796, "learning_rate": 4.2690601372249364e-06, "loss": 0.0981, "num_input_tokens_seen": 29010784, "step": 32450 }, { "epoch": 8.56565923188597, "grad_norm": 0.35335758328437805, "learning_rate": 4.263574784804525e-06, "loss": 0.0477, "num_input_tokens_seen": 29014848, "step": 32455 }, { "epoch": 8.566979015441468, "grad_norm": 0.026406941935420036, "learning_rate": 4.258092630201479e-06, "loss": 0.0189, "num_input_tokens_seen": 29019392, "step": 32460 }, { "epoch": 8.568298798996965, "grad_norm": 0.06469951570034027, "learning_rate": 4.252613674261202e-06, "loss": 0.0283, "num_input_tokens_seen": 29023616, "step": 32465 }, { "epoch": 8.569618582552462, "grad_norm": 0.19222770631313324, "learning_rate": 4.2471379178286224e-06, "loss": 0.0507, "num_input_tokens_seen": 29028128, "step": 32470 }, { "epoch": 8.570938366107958, "grad_norm": 0.4107562303543091, "learning_rate": 4.241665361748181e-06, "loss": 0.0503, "num_input_tokens_seen": 29032800, "step": 32475 }, { "epoch": 8.572258149663455, "grad_norm": 0.11473802477121353, "learning_rate": 4.2361960068637994e-06, "loss": 0.0319, "num_input_tokens_seen": 29037152, "step": 32480 }, { "epoch": 8.573577933218953, "grad_norm": 0.254913866519928, "learning_rate": 4.230729854018933e-06, "loss": 0.0509, "num_input_tokens_seen": 29041568, "step": 32485 }, { "epoch": 8.57489771677445, "grad_norm": 0.1560724526643753, "learning_rate": 4.225266904056521e-06, "loss": 0.0645, "num_input_tokens_seen": 29045824, "step": 32490 }, { "epoch": 8.576217500329946, "grad_norm": 0.18119493126869202, "learning_rate": 4.21980715781903e-06, "loss": 0.0498, "num_input_tokens_seen": 29050368, "step": 32495 }, { "epoch": 8.577537283885443, "grad_norm": 0.350727915763855, "learning_rate": 4.214350616148416e-06, "loss": 0.047, "num_input_tokens_seen": 29055008, "step": 32500 }, { "epoch": 8.57885706744094, "grad_norm": 0.11176837980747223, "learning_rate": 4.20889727988614e-06, "loss": 0.1036, "num_input_tokens_seen": 29059776, "step": 32505 }, { "epoch": 8.580176850996436, "grad_norm": 0.15640810132026672, "learning_rate": 4.20344714987318e-06, "loss": 0.0342, "num_input_tokens_seen": 29064480, "step": 32510 }, { "epoch": 8.581496634551934, "grad_norm": 0.1420358568429947, "learning_rate": 4.198000226950022e-06, "loss": 0.0794, "num_input_tokens_seen": 29068864, "step": 32515 }, { "epoch": 8.58281641810743, "grad_norm": 0.5217701196670532, "learning_rate": 4.192556511956635e-06, "loss": 0.1003, "num_input_tokens_seen": 29073344, "step": 32520 }, { "epoch": 8.584136201662927, "grad_norm": 0.31086355447769165, "learning_rate": 4.18711600573252e-06, "loss": 0.0765, "num_input_tokens_seen": 29077920, "step": 32525 }, { "epoch": 8.585455985218424, "grad_norm": 0.23521286249160767, "learning_rate": 4.181678709116671e-06, "loss": 0.0699, "num_input_tokens_seen": 29082304, "step": 32530 }, { "epoch": 8.58677576877392, "grad_norm": 0.012565506622195244, "learning_rate": 4.1762446229475785e-06, "loss": 0.0463, "num_input_tokens_seen": 29087008, "step": 32535 }, { "epoch": 8.588095552329419, "grad_norm": 0.38530752062797546, "learning_rate": 4.17081374806326e-06, "loss": 0.0535, "num_input_tokens_seen": 29091584, "step": 32540 }, { "epoch": 8.589415335884915, "grad_norm": 0.3136151134967804, "learning_rate": 4.165386085301212e-06, "loss": 0.0753, "num_input_tokens_seen": 29096128, "step": 32545 }, { "epoch": 8.590735119440412, "grad_norm": 0.16378270089626312, "learning_rate": 4.1599616354984525e-06, "loss": 0.0734, "num_input_tokens_seen": 29100448, "step": 32550 }, { "epoch": 8.592054902995908, "grad_norm": 0.09566398710012436, "learning_rate": 4.154540399491508e-06, "loss": 0.086, "num_input_tokens_seen": 29104800, "step": 32555 }, { "epoch": 8.593374686551405, "grad_norm": 0.32277849316596985, "learning_rate": 4.149122378116394e-06, "loss": 0.0318, "num_input_tokens_seen": 29109088, "step": 32560 }, { "epoch": 8.594694470106903, "grad_norm": 0.1684873402118683, "learning_rate": 4.14370757220863e-06, "loss": 0.0431, "num_input_tokens_seen": 29113696, "step": 32565 }, { "epoch": 8.5960142536624, "grad_norm": 0.2711392343044281, "learning_rate": 4.138295982603263e-06, "loss": 0.0478, "num_input_tokens_seen": 29118272, "step": 32570 }, { "epoch": 8.597334037217896, "grad_norm": 0.051284972578287125, "learning_rate": 4.132887610134814e-06, "loss": 0.0337, "num_input_tokens_seen": 29122976, "step": 32575 }, { "epoch": 8.598653820773393, "grad_norm": 0.06220335513353348, "learning_rate": 4.127482455637335e-06, "loss": 0.0336, "num_input_tokens_seen": 29127104, "step": 32580 }, { "epoch": 8.59997360432889, "grad_norm": 0.13155677914619446, "learning_rate": 4.1220805199443545e-06, "loss": 0.0238, "num_input_tokens_seen": 29131424, "step": 32585 }, { "epoch": 8.601293387884386, "grad_norm": 0.1772831380367279, "learning_rate": 4.116681803888925e-06, "loss": 0.0982, "num_input_tokens_seen": 29136064, "step": 32590 }, { "epoch": 8.602613171439884, "grad_norm": 0.03344983607530594, "learning_rate": 4.111286308303605e-06, "loss": 0.0304, "num_input_tokens_seen": 29140224, "step": 32595 }, { "epoch": 8.603932954995381, "grad_norm": 0.24147240817546844, "learning_rate": 4.105894034020433e-06, "loss": 0.0619, "num_input_tokens_seen": 29144544, "step": 32600 }, { "epoch": 8.603932954995381, "eval_loss": 0.06694675236940384, "eval_runtime": 64.7223, "eval_samples_per_second": 104.06, "eval_steps_per_second": 26.019, "num_input_tokens_seen": 29144544, "step": 32600 }, { "epoch": 8.605252738550877, "grad_norm": 0.3302669823169708, "learning_rate": 4.100504981870975e-06, "loss": 0.0307, "num_input_tokens_seen": 29149056, "step": 32605 }, { "epoch": 8.606572522106374, "grad_norm": 0.31716346740722656, "learning_rate": 4.0951191526862915e-06, "loss": 0.066, "num_input_tokens_seen": 29153504, "step": 32610 }, { "epoch": 8.60789230566187, "grad_norm": 0.18275701999664307, "learning_rate": 4.089736547296938e-06, "loss": 0.026, "num_input_tokens_seen": 29157728, "step": 32615 }, { "epoch": 8.609212089217369, "grad_norm": 0.0660560131072998, "learning_rate": 4.08435716653299e-06, "loss": 0.0543, "num_input_tokens_seen": 29161984, "step": 32620 }, { "epoch": 8.610531872772865, "grad_norm": 0.2171325385570526, "learning_rate": 4.0789810112240005e-06, "loss": 0.0534, "num_input_tokens_seen": 29166144, "step": 32625 }, { "epoch": 8.611851656328362, "grad_norm": 0.20636442303657532, "learning_rate": 4.073608082199057e-06, "loss": 0.0516, "num_input_tokens_seen": 29170656, "step": 32630 }, { "epoch": 8.613171439883859, "grad_norm": 0.30510881543159485, "learning_rate": 4.068238380286718e-06, "loss": 0.0713, "num_input_tokens_seen": 29175424, "step": 32635 }, { "epoch": 8.614491223439355, "grad_norm": 0.17648358643054962, "learning_rate": 4.062871906315072e-06, "loss": 0.0574, "num_input_tokens_seen": 29179840, "step": 32640 }, { "epoch": 8.615811006994853, "grad_norm": 0.2471001297235489, "learning_rate": 4.057508661111686e-06, "loss": 0.0786, "num_input_tokens_seen": 29184320, "step": 32645 }, { "epoch": 8.61713079055035, "grad_norm": 0.07106463611125946, "learning_rate": 4.052148645503648e-06, "loss": 0.0405, "num_input_tokens_seen": 29188704, "step": 32650 }, { "epoch": 8.618450574105847, "grad_norm": 0.06340433657169342, "learning_rate": 4.046791860317531e-06, "loss": 0.0282, "num_input_tokens_seen": 29193472, "step": 32655 }, { "epoch": 8.619770357661343, "grad_norm": 0.19910138845443726, "learning_rate": 4.041438306379431e-06, "loss": 0.0515, "num_input_tokens_seen": 29197760, "step": 32660 }, { "epoch": 8.62109014121684, "grad_norm": 0.10229451954364777, "learning_rate": 4.036087984514916e-06, "loss": 0.0339, "num_input_tokens_seen": 29202272, "step": 32665 }, { "epoch": 8.622409924772338, "grad_norm": 0.07130244374275208, "learning_rate": 4.030740895549084e-06, "loss": 0.0917, "num_input_tokens_seen": 29206496, "step": 32670 }, { "epoch": 8.623729708327835, "grad_norm": 0.3523802161216736, "learning_rate": 4.025397040306531e-06, "loss": 0.0774, "num_input_tokens_seen": 29211200, "step": 32675 }, { "epoch": 8.625049491883331, "grad_norm": 0.1125662624835968, "learning_rate": 4.0200564196113285e-06, "loss": 0.1065, "num_input_tokens_seen": 29215584, "step": 32680 }, { "epoch": 8.626369275438828, "grad_norm": 0.14605417847633362, "learning_rate": 4.014719034287079e-06, "loss": 0.0402, "num_input_tokens_seen": 29219808, "step": 32685 }, { "epoch": 8.627689058994324, "grad_norm": 0.3915920555591583, "learning_rate": 4.0093848851568775e-06, "loss": 0.1012, "num_input_tokens_seen": 29224224, "step": 32690 }, { "epoch": 8.629008842549823, "grad_norm": 0.17859698832035065, "learning_rate": 4.004053973043304e-06, "loss": 0.1052, "num_input_tokens_seen": 29228576, "step": 32695 }, { "epoch": 8.63032862610532, "grad_norm": 0.19770224392414093, "learning_rate": 3.998726298768465e-06, "loss": 0.0946, "num_input_tokens_seen": 29233216, "step": 32700 }, { "epoch": 8.631648409660816, "grad_norm": 0.16487587988376617, "learning_rate": 3.99340186315395e-06, "loss": 0.095, "num_input_tokens_seen": 29237760, "step": 32705 }, { "epoch": 8.632968193216312, "grad_norm": 0.3935829997062683, "learning_rate": 3.988080667020849e-06, "loss": 0.0706, "num_input_tokens_seen": 29242144, "step": 32710 }, { "epoch": 8.634287976771809, "grad_norm": 0.3497520089149475, "learning_rate": 3.982762711189766e-06, "loss": 0.0677, "num_input_tokens_seen": 29246944, "step": 32715 }, { "epoch": 8.635607760327307, "grad_norm": 0.3890731930732727, "learning_rate": 3.977447996480785e-06, "loss": 0.067, "num_input_tokens_seen": 29251200, "step": 32720 }, { "epoch": 8.636927543882804, "grad_norm": 0.034144505858421326, "learning_rate": 3.97213652371351e-06, "loss": 0.0623, "num_input_tokens_seen": 29256000, "step": 32725 }, { "epoch": 8.6382473274383, "grad_norm": 0.17078763246536255, "learning_rate": 3.966828293707042e-06, "loss": 0.0347, "num_input_tokens_seen": 29260032, "step": 32730 }, { "epoch": 8.639567110993797, "grad_norm": 0.3603339195251465, "learning_rate": 3.961523307279963e-06, "loss": 0.1263, "num_input_tokens_seen": 29264480, "step": 32735 }, { "epoch": 8.640886894549293, "grad_norm": 0.034257672727108, "learning_rate": 3.956221565250382e-06, "loss": 0.0732, "num_input_tokens_seen": 29268864, "step": 32740 }, { "epoch": 8.642206678104792, "grad_norm": 0.09919828176498413, "learning_rate": 3.950923068435883e-06, "loss": 0.0503, "num_input_tokens_seen": 29273216, "step": 32745 }, { "epoch": 8.643526461660288, "grad_norm": 0.3247840106487274, "learning_rate": 3.945627817653566e-06, "loss": 0.0707, "num_input_tokens_seen": 29277760, "step": 32750 }, { "epoch": 8.644846245215785, "grad_norm": 0.46490582823753357, "learning_rate": 3.9403358137200335e-06, "loss": 0.058, "num_input_tokens_seen": 29281984, "step": 32755 }, { "epoch": 8.646166028771281, "grad_norm": 0.14247538149356842, "learning_rate": 3.9350470574513605e-06, "loss": 0.086, "num_input_tokens_seen": 29286496, "step": 32760 }, { "epoch": 8.647485812326778, "grad_norm": 0.34124991297721863, "learning_rate": 3.9297615496631525e-06, "loss": 0.0583, "num_input_tokens_seen": 29290944, "step": 32765 }, { "epoch": 8.648805595882274, "grad_norm": 0.09631524235010147, "learning_rate": 3.924479291170505e-06, "loss": 0.052, "num_input_tokens_seen": 29295328, "step": 32770 }, { "epoch": 8.650125379437773, "grad_norm": 0.15627343952655792, "learning_rate": 3.919200282788002e-06, "loss": 0.0516, "num_input_tokens_seen": 29299840, "step": 32775 }, { "epoch": 8.65144516299327, "grad_norm": 0.23588554561138153, "learning_rate": 3.913924525329726e-06, "loss": 0.0467, "num_input_tokens_seen": 29304896, "step": 32780 }, { "epoch": 8.652764946548766, "grad_norm": 0.25933998823165894, "learning_rate": 3.908652019609279e-06, "loss": 0.1102, "num_input_tokens_seen": 29309728, "step": 32785 }, { "epoch": 8.654084730104262, "grad_norm": 0.05708885192871094, "learning_rate": 3.9033827664397364e-06, "loss": 0.0688, "num_input_tokens_seen": 29314112, "step": 32790 }, { "epoch": 8.655404513659759, "grad_norm": 0.21943096816539764, "learning_rate": 3.898116766633694e-06, "loss": 0.052, "num_input_tokens_seen": 29318752, "step": 32795 }, { "epoch": 8.656724297215257, "grad_norm": 0.5523438453674316, "learning_rate": 3.8928540210032225e-06, "loss": 0.1309, "num_input_tokens_seen": 29323040, "step": 32800 }, { "epoch": 8.656724297215257, "eval_loss": 0.06685389578342438, "eval_runtime": 64.7446, "eval_samples_per_second": 104.024, "eval_steps_per_second": 26.01, "num_input_tokens_seen": 29323040, "step": 32800 }, { "epoch": 8.658044080770754, "grad_norm": 0.27125513553619385, "learning_rate": 3.887594530359909e-06, "loss": 0.0487, "num_input_tokens_seen": 29327680, "step": 32805 }, { "epoch": 8.65936386432625, "grad_norm": 0.13196101784706116, "learning_rate": 3.88233829551484e-06, "loss": 0.0372, "num_input_tokens_seen": 29332192, "step": 32810 }, { "epoch": 8.660683647881747, "grad_norm": 0.052428774535655975, "learning_rate": 3.877085317278581e-06, "loss": 0.0174, "num_input_tokens_seen": 29336608, "step": 32815 }, { "epoch": 8.662003431437244, "grad_norm": 0.16101226210594177, "learning_rate": 3.87183559646122e-06, "loss": 0.082, "num_input_tokens_seen": 29341248, "step": 32820 }, { "epoch": 8.663323214992742, "grad_norm": 0.19853097200393677, "learning_rate": 3.866589133872317e-06, "loss": 0.0594, "num_input_tokens_seen": 29345632, "step": 32825 }, { "epoch": 8.664642998548238, "grad_norm": 0.27099448442459106, "learning_rate": 3.861345930320948e-06, "loss": 0.0854, "num_input_tokens_seen": 29350304, "step": 32830 }, { "epoch": 8.665962782103735, "grad_norm": 0.23485668003559113, "learning_rate": 3.856105986615688e-06, "loss": 0.0644, "num_input_tokens_seen": 29354976, "step": 32835 }, { "epoch": 8.667282565659232, "grad_norm": 0.02862987481057644, "learning_rate": 3.850869303564589e-06, "loss": 0.0183, "num_input_tokens_seen": 29359456, "step": 32840 }, { "epoch": 8.668602349214728, "grad_norm": 0.058950502425432205, "learning_rate": 3.845635881975226e-06, "loss": 0.0467, "num_input_tokens_seen": 29363840, "step": 32845 }, { "epoch": 8.669922132770226, "grad_norm": 0.1413072943687439, "learning_rate": 3.840405722654647e-06, "loss": 0.0311, "num_input_tokens_seen": 29368256, "step": 32850 }, { "epoch": 8.671241916325723, "grad_norm": 0.042121268808841705, "learning_rate": 3.835178826409419e-06, "loss": 0.0672, "num_input_tokens_seen": 29372512, "step": 32855 }, { "epoch": 8.67256169988122, "grad_norm": 0.15638349950313568, "learning_rate": 3.8299551940455895e-06, "loss": 0.0486, "num_input_tokens_seen": 29377088, "step": 32860 }, { "epoch": 8.673881483436716, "grad_norm": 0.35553884506225586, "learning_rate": 3.824734826368703e-06, "loss": 0.0501, "num_input_tokens_seen": 29381536, "step": 32865 }, { "epoch": 8.675201266992213, "grad_norm": 0.5077926516532898, "learning_rate": 3.819517724183813e-06, "loss": 0.0772, "num_input_tokens_seen": 29385888, "step": 32870 }, { "epoch": 8.67652105054771, "grad_norm": 0.09993810951709747, "learning_rate": 3.8143038882954648e-06, "loss": 0.0515, "num_input_tokens_seen": 29390400, "step": 32875 }, { "epoch": 8.677840834103208, "grad_norm": 0.23732170462608337, "learning_rate": 3.8090933195076867e-06, "loss": 0.0806, "num_input_tokens_seen": 29395136, "step": 32880 }, { "epoch": 8.679160617658704, "grad_norm": 0.24257096648216248, "learning_rate": 3.8038860186240198e-06, "loss": 0.0435, "num_input_tokens_seen": 29399648, "step": 32885 }, { "epoch": 8.6804804012142, "grad_norm": 0.17076610028743744, "learning_rate": 3.7986819864475026e-06, "loss": 0.0477, "num_input_tokens_seen": 29404064, "step": 32890 }, { "epoch": 8.681800184769697, "grad_norm": 0.3292805552482605, "learning_rate": 3.793481223780651e-06, "loss": 0.0524, "num_input_tokens_seen": 29408416, "step": 32895 }, { "epoch": 8.683119968325194, "grad_norm": 0.28284069895744324, "learning_rate": 3.788283731425496e-06, "loss": 0.0927, "num_input_tokens_seen": 29412768, "step": 32900 }, { "epoch": 8.684439751880692, "grad_norm": 0.1924552172422409, "learning_rate": 3.7830895101835488e-06, "loss": 0.0582, "num_input_tokens_seen": 29417376, "step": 32905 }, { "epoch": 8.685759535436189, "grad_norm": 0.15752220153808594, "learning_rate": 3.7778985608558274e-06, "loss": 0.1213, "num_input_tokens_seen": 29421952, "step": 32910 }, { "epoch": 8.687079318991685, "grad_norm": 0.4603945314884186, "learning_rate": 3.7727108842428443e-06, "loss": 0.0703, "num_input_tokens_seen": 29426208, "step": 32915 }, { "epoch": 8.688399102547182, "grad_norm": 0.2732532024383545, "learning_rate": 3.7675264811446065e-06, "loss": 0.0487, "num_input_tokens_seen": 29430752, "step": 32920 }, { "epoch": 8.689718886102678, "grad_norm": 0.27809938788414, "learning_rate": 3.7623453523605994e-06, "loss": 0.0839, "num_input_tokens_seen": 29435168, "step": 32925 }, { "epoch": 8.691038669658177, "grad_norm": 0.17244605720043182, "learning_rate": 3.757167498689834e-06, "loss": 0.0524, "num_input_tokens_seen": 29439776, "step": 32930 }, { "epoch": 8.692358453213673, "grad_norm": 0.2921839654445648, "learning_rate": 3.7519929209307914e-06, "loss": 0.0457, "num_input_tokens_seen": 29444384, "step": 32935 }, { "epoch": 8.69367823676917, "grad_norm": 0.2505919933319092, "learning_rate": 3.746821619881463e-06, "loss": 0.0341, "num_input_tokens_seen": 29449024, "step": 32940 }, { "epoch": 8.694998020324666, "grad_norm": 0.3557567000389099, "learning_rate": 3.74165359633932e-06, "loss": 0.0422, "num_input_tokens_seen": 29453728, "step": 32945 }, { "epoch": 8.696317803880163, "grad_norm": 0.16278469562530518, "learning_rate": 3.736488851101341e-06, "loss": 0.0814, "num_input_tokens_seen": 29458208, "step": 32950 }, { "epoch": 8.697637587435661, "grad_norm": 0.07389740645885468, "learning_rate": 3.7313273849640035e-06, "loss": 0.046, "num_input_tokens_seen": 29462208, "step": 32955 }, { "epoch": 8.698957370991158, "grad_norm": 0.15894150733947754, "learning_rate": 3.7261691987232533e-06, "loss": 0.0419, "num_input_tokens_seen": 29466752, "step": 32960 }, { "epoch": 8.700277154546654, "grad_norm": 0.23662126064300537, "learning_rate": 3.7210142931745575e-06, "loss": 0.0491, "num_input_tokens_seen": 29471360, "step": 32965 }, { "epoch": 8.701596938102151, "grad_norm": 0.030453141778707504, "learning_rate": 3.7158626691128712e-06, "loss": 0.0784, "num_input_tokens_seen": 29475584, "step": 32970 }, { "epoch": 8.702916721657648, "grad_norm": 0.024369509890675545, "learning_rate": 3.710714327332629e-06, "loss": 0.028, "num_input_tokens_seen": 29479936, "step": 32975 }, { "epoch": 8.704236505213146, "grad_norm": 0.21548347175121307, "learning_rate": 3.7055692686277815e-06, "loss": 0.0989, "num_input_tokens_seen": 29484512, "step": 32980 }, { "epoch": 8.705556288768642, "grad_norm": 0.3104400038719177, "learning_rate": 3.70042749379175e-06, "loss": 0.0625, "num_input_tokens_seen": 29489056, "step": 32985 }, { "epoch": 8.706876072324139, "grad_norm": 0.042929720133543015, "learning_rate": 3.6952890036174693e-06, "loss": 0.0556, "num_input_tokens_seen": 29493472, "step": 32990 }, { "epoch": 8.708195855879636, "grad_norm": 0.08773769438266754, "learning_rate": 3.690153798897353e-06, "loss": 0.1146, "num_input_tokens_seen": 29497792, "step": 32995 }, { "epoch": 8.709515639435132, "grad_norm": 0.2040826380252838, "learning_rate": 3.6850218804233225e-06, "loss": 0.0829, "num_input_tokens_seen": 29502176, "step": 33000 }, { "epoch": 8.709515639435132, "eval_loss": 0.06704409420490265, "eval_runtime": 64.7625, "eval_samples_per_second": 103.995, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 29502176, "step": 33000 }, { "epoch": 8.71083542299063, "grad_norm": 0.2500505745410919, "learning_rate": 3.679893248986779e-06, "loss": 0.0497, "num_input_tokens_seen": 29506560, "step": 33005 }, { "epoch": 8.712155206546127, "grad_norm": 0.28358033299446106, "learning_rate": 3.6747679053786147e-06, "loss": 0.0626, "num_input_tokens_seen": 29511008, "step": 33010 }, { "epoch": 8.713474990101624, "grad_norm": 0.0624716617166996, "learning_rate": 3.669645850389228e-06, "loss": 0.0108, "num_input_tokens_seen": 29515456, "step": 33015 }, { "epoch": 8.71479477365712, "grad_norm": 0.20820358395576477, "learning_rate": 3.664527084808514e-06, "loss": 0.0793, "num_input_tokens_seen": 29520416, "step": 33020 }, { "epoch": 8.716114557212617, "grad_norm": 0.3063121736049652, "learning_rate": 3.6594116094258337e-06, "loss": 0.1164, "num_input_tokens_seen": 29525024, "step": 33025 }, { "epoch": 8.717434340768115, "grad_norm": 0.11276159435510635, "learning_rate": 3.6542994250300665e-06, "loss": 0.0824, "num_input_tokens_seen": 29529600, "step": 33030 }, { "epoch": 8.718754124323612, "grad_norm": 0.4005969762802124, "learning_rate": 3.6491905324095825e-06, "loss": 0.0337, "num_input_tokens_seen": 29534304, "step": 33035 }, { "epoch": 8.720073907879108, "grad_norm": 0.5101680755615234, "learning_rate": 3.644084932352221e-06, "loss": 0.0623, "num_input_tokens_seen": 29538848, "step": 33040 }, { "epoch": 8.721393691434605, "grad_norm": 0.49138447642326355, "learning_rate": 3.6389826256453457e-06, "loss": 0.097, "num_input_tokens_seen": 29543296, "step": 33045 }, { "epoch": 8.722713474990101, "grad_norm": 0.1468307077884674, "learning_rate": 3.633883613075781e-06, "loss": 0.032, "num_input_tokens_seen": 29547744, "step": 33050 }, { "epoch": 8.724033258545598, "grad_norm": 0.17765173316001892, "learning_rate": 3.6287878954298693e-06, "loss": 0.035, "num_input_tokens_seen": 29552384, "step": 33055 }, { "epoch": 8.725353042101096, "grad_norm": 0.11306607723236084, "learning_rate": 3.6236954734934354e-06, "loss": 0.1001, "num_input_tokens_seen": 29557248, "step": 33060 }, { "epoch": 8.726672825656593, "grad_norm": 0.21788683533668518, "learning_rate": 3.618606348051784e-06, "loss": 0.0359, "num_input_tokens_seen": 29561472, "step": 33065 }, { "epoch": 8.72799260921209, "grad_norm": 0.038152098655700684, "learning_rate": 3.6135205198897376e-06, "loss": 0.0187, "num_input_tokens_seen": 29565632, "step": 33070 }, { "epoch": 8.729312392767586, "grad_norm": 0.14211511611938477, "learning_rate": 3.6084379897915854e-06, "loss": 0.0467, "num_input_tokens_seen": 29569984, "step": 33075 }, { "epoch": 8.730632176323082, "grad_norm": 0.2154967039823532, "learning_rate": 3.6033587585411115e-06, "loss": 0.0874, "num_input_tokens_seen": 29574592, "step": 33080 }, { "epoch": 8.73195195987858, "grad_norm": 0.08838847279548645, "learning_rate": 3.5982828269216117e-06, "loss": 0.0272, "num_input_tokens_seen": 29578944, "step": 33085 }, { "epoch": 8.733271743434077, "grad_norm": 0.19743730127811432, "learning_rate": 3.593210195715843e-06, "loss": 0.0542, "num_input_tokens_seen": 29583744, "step": 33090 }, { "epoch": 8.734591526989574, "grad_norm": 0.5086268186569214, "learning_rate": 3.5881408657060773e-06, "loss": 0.0941, "num_input_tokens_seen": 29587968, "step": 33095 }, { "epoch": 8.73591131054507, "grad_norm": 0.048844076693058014, "learning_rate": 3.583074837674075e-06, "loss": 0.0334, "num_input_tokens_seen": 29592736, "step": 33100 }, { "epoch": 8.737231094100567, "grad_norm": 0.43333449959754944, "learning_rate": 3.578012112401069e-06, "loss": 0.087, "num_input_tokens_seen": 29597024, "step": 33105 }, { "epoch": 8.738550877656065, "grad_norm": 0.19507119059562683, "learning_rate": 3.5729526906677996e-06, "loss": 0.0663, "num_input_tokens_seen": 29601600, "step": 33110 }, { "epoch": 8.739870661211562, "grad_norm": 0.4819417893886566, "learning_rate": 3.5678965732545007e-06, "loss": 0.0544, "num_input_tokens_seen": 29606496, "step": 33115 }, { "epoch": 8.741190444767058, "grad_norm": 0.15235202014446259, "learning_rate": 3.562843760940876e-06, "loss": 0.0741, "num_input_tokens_seen": 29611040, "step": 33120 }, { "epoch": 8.742510228322555, "grad_norm": 0.26340052485466003, "learning_rate": 3.5577942545061473e-06, "loss": 0.047, "num_input_tokens_seen": 29615616, "step": 33125 }, { "epoch": 8.743830011878051, "grad_norm": 0.14818225800991058, "learning_rate": 3.5527480547289967e-06, "loss": 0.1005, "num_input_tokens_seen": 29620128, "step": 33130 }, { "epoch": 8.745149795433548, "grad_norm": 0.47261500358581543, "learning_rate": 3.547705162387624e-06, "loss": 0.0642, "num_input_tokens_seen": 29624608, "step": 33135 }, { "epoch": 8.746469578989046, "grad_norm": 0.11401692777872086, "learning_rate": 3.542665578259699e-06, "loss": 0.0455, "num_input_tokens_seen": 29629376, "step": 33140 }, { "epoch": 8.747789362544543, "grad_norm": 0.11041387170553207, "learning_rate": 3.5376293031223945e-06, "loss": 0.0412, "num_input_tokens_seen": 29633984, "step": 33145 }, { "epoch": 8.74910914610004, "grad_norm": 0.08082600682973862, "learning_rate": 3.5325963377523614e-06, "loss": 0.05, "num_input_tokens_seen": 29638592, "step": 33150 }, { "epoch": 8.750428929655536, "grad_norm": 0.07581301778554916, "learning_rate": 3.5275666829257536e-06, "loss": 0.0359, "num_input_tokens_seen": 29643232, "step": 33155 }, { "epoch": 8.751748713211033, "grad_norm": 0.103957399725914, "learning_rate": 3.5225403394181955e-06, "loss": 0.0357, "num_input_tokens_seen": 29647808, "step": 33160 }, { "epoch": 8.75306849676653, "grad_norm": 0.2724948227405548, "learning_rate": 3.517517308004828e-06, "loss": 0.0541, "num_input_tokens_seen": 29652192, "step": 33165 }, { "epoch": 8.754388280322027, "grad_norm": 0.1904197484254837, "learning_rate": 3.512497589460251e-06, "loss": 0.0794, "num_input_tokens_seen": 29656384, "step": 33170 }, { "epoch": 8.755708063877524, "grad_norm": 0.42544230818748474, "learning_rate": 3.5074811845585727e-06, "loss": 0.0814, "num_input_tokens_seen": 29660768, "step": 33175 }, { "epoch": 8.75702784743302, "grad_norm": 0.38817551732063293, "learning_rate": 3.5024680940733937e-06, "loss": 0.0683, "num_input_tokens_seen": 29664960, "step": 33180 }, { "epoch": 8.758347630988517, "grad_norm": 0.1555711179971695, "learning_rate": 3.4974583187777852e-06, "loss": 0.0518, "num_input_tokens_seen": 29669600, "step": 33185 }, { "epoch": 8.759667414544015, "grad_norm": 0.05048147961497307, "learning_rate": 3.4924518594443204e-06, "loss": 0.1023, "num_input_tokens_seen": 29674048, "step": 33190 }, { "epoch": 8.760987198099512, "grad_norm": 0.2714904248714447, "learning_rate": 3.4874487168450682e-06, "loss": 0.0746, "num_input_tokens_seen": 29678240, "step": 33195 }, { "epoch": 8.762306981655009, "grad_norm": 0.0266673993319273, "learning_rate": 3.482448891751558e-06, "loss": 0.0213, "num_input_tokens_seen": 29682560, "step": 33200 }, { "epoch": 8.762306981655009, "eval_loss": 0.06676588952541351, "eval_runtime": 64.819, "eval_samples_per_second": 103.905, "eval_steps_per_second": 25.98, "num_input_tokens_seen": 29682560, "step": 33200 }, { "epoch": 8.763626765210505, "grad_norm": 0.28558051586151123, "learning_rate": 3.477452384934843e-06, "loss": 0.0659, "num_input_tokens_seen": 29686880, "step": 33205 }, { "epoch": 8.764946548766002, "grad_norm": 0.2627415657043457, "learning_rate": 3.472459197165434e-06, "loss": 0.0602, "num_input_tokens_seen": 29691296, "step": 33210 }, { "epoch": 8.7662663323215, "grad_norm": 0.42825135588645935, "learning_rate": 3.4674693292133518e-06, "loss": 0.0771, "num_input_tokens_seen": 29695616, "step": 33215 }, { "epoch": 8.767586115876997, "grad_norm": 0.056689005345106125, "learning_rate": 3.4624827818480977e-06, "loss": 0.0451, "num_input_tokens_seen": 29700064, "step": 33220 }, { "epoch": 8.768905899432493, "grad_norm": 0.11603164672851562, "learning_rate": 3.4574995558386474e-06, "loss": 0.0467, "num_input_tokens_seen": 29704928, "step": 33225 }, { "epoch": 8.77022568298799, "grad_norm": 0.18743230402469635, "learning_rate": 3.452519651953487e-06, "loss": 0.0307, "num_input_tokens_seen": 29709280, "step": 33230 }, { "epoch": 8.771545466543486, "grad_norm": 0.21699371933937073, "learning_rate": 3.447543070960585e-06, "loss": 0.0649, "num_input_tokens_seen": 29713760, "step": 33235 }, { "epoch": 8.772865250098985, "grad_norm": 0.23706123232841492, "learning_rate": 3.4425698136273778e-06, "loss": 0.0986, "num_input_tokens_seen": 29718176, "step": 33240 }, { "epoch": 8.774185033654481, "grad_norm": 0.08144243061542511, "learning_rate": 3.437599880720821e-06, "loss": 0.0241, "num_input_tokens_seen": 29722720, "step": 33245 }, { "epoch": 8.775504817209978, "grad_norm": 0.05081043764948845, "learning_rate": 3.4326332730073267e-06, "loss": 0.0474, "num_input_tokens_seen": 29727360, "step": 33250 }, { "epoch": 8.776824600765474, "grad_norm": 0.08954569697380066, "learning_rate": 3.427669991252813e-06, "loss": 0.097, "num_input_tokens_seen": 29731744, "step": 33255 }, { "epoch": 8.77814438432097, "grad_norm": 0.04469580948352814, "learning_rate": 3.42271003622269e-06, "loss": 0.0417, "num_input_tokens_seen": 29736160, "step": 33260 }, { "epoch": 8.779464167876469, "grad_norm": 0.519282341003418, "learning_rate": 3.4177534086818286e-06, "loss": 0.0715, "num_input_tokens_seen": 29740512, "step": 33265 }, { "epoch": 8.780783951431966, "grad_norm": 0.2236897051334381, "learning_rate": 3.412800109394612e-06, "loss": 0.0262, "num_input_tokens_seen": 29744928, "step": 33270 }, { "epoch": 8.782103734987462, "grad_norm": 0.31672611832618713, "learning_rate": 3.4078501391249044e-06, "loss": 0.1019, "num_input_tokens_seen": 29749312, "step": 33275 }, { "epoch": 8.783423518542959, "grad_norm": 0.25734931230545044, "learning_rate": 3.4029034986360453e-06, "loss": 0.0892, "num_input_tokens_seen": 29753728, "step": 33280 }, { "epoch": 8.784743302098455, "grad_norm": 0.17762035131454468, "learning_rate": 3.397960188690877e-06, "loss": 0.08, "num_input_tokens_seen": 29758400, "step": 33285 }, { "epoch": 8.786063085653954, "grad_norm": 0.29525870084762573, "learning_rate": 3.393020210051717e-06, "loss": 0.0628, "num_input_tokens_seen": 29762848, "step": 33290 }, { "epoch": 8.78738286920945, "grad_norm": 0.34065327048301697, "learning_rate": 3.3880835634803655e-06, "loss": 0.0674, "num_input_tokens_seen": 29767200, "step": 33295 }, { "epoch": 8.788702652764947, "grad_norm": 0.05425555631518364, "learning_rate": 3.383150249738126e-06, "loss": 0.0356, "num_input_tokens_seen": 29771424, "step": 33300 }, { "epoch": 8.790022436320443, "grad_norm": 0.2203356772661209, "learning_rate": 3.3782202695857663e-06, "loss": 0.0385, "num_input_tokens_seen": 29775584, "step": 33305 }, { "epoch": 8.79134221987594, "grad_norm": 0.455761194229126, "learning_rate": 3.373293623783558e-06, "loss": 0.0725, "num_input_tokens_seen": 29780096, "step": 33310 }, { "epoch": 8.792662003431436, "grad_norm": 0.061299826949834824, "learning_rate": 3.368370313091257e-06, "loss": 0.023, "num_input_tokens_seen": 29784480, "step": 33315 }, { "epoch": 8.793981786986935, "grad_norm": 0.0508180633187294, "learning_rate": 3.363450338268087e-06, "loss": 0.0168, "num_input_tokens_seen": 29788928, "step": 33320 }, { "epoch": 8.795301570542431, "grad_norm": 0.0725937932729721, "learning_rate": 3.358533700072783e-06, "loss": 0.0492, "num_input_tokens_seen": 29793408, "step": 33325 }, { "epoch": 8.796621354097928, "grad_norm": 0.18679532408714294, "learning_rate": 3.3536203992635377e-06, "loss": 0.0697, "num_input_tokens_seen": 29797920, "step": 33330 }, { "epoch": 8.797941137653424, "grad_norm": 0.2164284884929657, "learning_rate": 3.348710436598057e-06, "loss": 0.0549, "num_input_tokens_seen": 29802144, "step": 33335 }, { "epoch": 8.799260921208921, "grad_norm": 0.1605917066335678, "learning_rate": 3.3438038128335155e-06, "loss": 0.0296, "num_input_tokens_seen": 29806496, "step": 33340 }, { "epoch": 8.80058070476442, "grad_norm": 0.14203490316867828, "learning_rate": 3.338900528726571e-06, "loss": 0.0581, "num_input_tokens_seen": 29811328, "step": 33345 }, { "epoch": 8.801900488319916, "grad_norm": 0.12017769366502762, "learning_rate": 3.3340005850333812e-06, "loss": 0.0701, "num_input_tokens_seen": 29815840, "step": 33350 }, { "epoch": 8.803220271875412, "grad_norm": 0.24080802500247955, "learning_rate": 3.329103982509568e-06, "loss": 0.0656, "num_input_tokens_seen": 29820416, "step": 33355 }, { "epoch": 8.804540055430909, "grad_norm": 0.209104523062706, "learning_rate": 3.324210721910259e-06, "loss": 0.0318, "num_input_tokens_seen": 29824736, "step": 33360 }, { "epoch": 8.805859838986406, "grad_norm": 0.2165743112564087, "learning_rate": 3.319320803990053e-06, "loss": 0.0301, "num_input_tokens_seen": 29829216, "step": 33365 }, { "epoch": 8.807179622541904, "grad_norm": 0.13622023165225983, "learning_rate": 3.3144342295030274e-06, "loss": 0.0499, "num_input_tokens_seen": 29833824, "step": 33370 }, { "epoch": 8.8084994060974, "grad_norm": 0.16241422295570374, "learning_rate": 3.309550999202765e-06, "loss": 0.1075, "num_input_tokens_seen": 29838016, "step": 33375 }, { "epoch": 8.809819189652897, "grad_norm": 0.4399169981479645, "learning_rate": 3.3046711138423197e-06, "loss": 0.0499, "num_input_tokens_seen": 29842400, "step": 33380 }, { "epoch": 8.811138973208394, "grad_norm": 0.408474326133728, "learning_rate": 3.2997945741742255e-06, "loss": 0.0399, "num_input_tokens_seen": 29847136, "step": 33385 }, { "epoch": 8.81245875676389, "grad_norm": 0.34309136867523193, "learning_rate": 3.2949213809505082e-06, "loss": 0.0719, "num_input_tokens_seen": 29851776, "step": 33390 }, { "epoch": 8.813778540319387, "grad_norm": 0.18612755835056305, "learning_rate": 3.2900515349226834e-06, "loss": 0.0319, "num_input_tokens_seen": 29856320, "step": 33395 }, { "epoch": 8.815098323874885, "grad_norm": 0.05157690867781639, "learning_rate": 3.285185036841731e-06, "loss": 0.0122, "num_input_tokens_seen": 29860768, "step": 33400 }, { "epoch": 8.815098323874885, "eval_loss": 0.06660357117652893, "eval_runtime": 64.7488, "eval_samples_per_second": 104.017, "eval_steps_per_second": 26.008, "num_input_tokens_seen": 29860768, "step": 33400 }, { "epoch": 8.816418107430382, "grad_norm": 0.5171669721603394, "learning_rate": 3.2803218874581377e-06, "loss": 0.0538, "num_input_tokens_seen": 29865056, "step": 33405 }, { "epoch": 8.817737890985878, "grad_norm": 0.05674148350954056, "learning_rate": 3.2754620875218494e-06, "loss": 0.0328, "num_input_tokens_seen": 29869344, "step": 33410 }, { "epoch": 8.819057674541375, "grad_norm": 0.03309844434261322, "learning_rate": 3.2706056377823146e-06, "loss": 0.0272, "num_input_tokens_seen": 29873824, "step": 33415 }, { "epoch": 8.820377458096871, "grad_norm": 0.618338406085968, "learning_rate": 3.2657525389884647e-06, "loss": 0.089, "num_input_tokens_seen": 29878496, "step": 33420 }, { "epoch": 8.82169724165237, "grad_norm": 0.1796351820230484, "learning_rate": 3.260902791888698e-06, "loss": 0.0346, "num_input_tokens_seen": 29882752, "step": 33425 }, { "epoch": 8.823017025207866, "grad_norm": 0.4590378701686859, "learning_rate": 3.2560563972309166e-06, "loss": 0.0577, "num_input_tokens_seen": 29887072, "step": 33430 }, { "epoch": 8.824336808763363, "grad_norm": 0.2503621578216553, "learning_rate": 3.251213355762489e-06, "loss": 0.1501, "num_input_tokens_seen": 29891680, "step": 33435 }, { "epoch": 8.82565659231886, "grad_norm": 0.24146400392055511, "learning_rate": 3.2463736682302707e-06, "loss": 0.0496, "num_input_tokens_seen": 29896320, "step": 33440 }, { "epoch": 8.826976375874356, "grad_norm": 0.18247416615486145, "learning_rate": 3.2415373353806124e-06, "loss": 0.0846, "num_input_tokens_seen": 29901152, "step": 33445 }, { "epoch": 8.828296159429854, "grad_norm": 0.1882723718881607, "learning_rate": 3.236704357959322e-06, "loss": 0.0269, "num_input_tokens_seen": 29905888, "step": 33450 }, { "epoch": 8.82961594298535, "grad_norm": 0.1875351071357727, "learning_rate": 3.2318747367117154e-06, "loss": 0.0498, "num_input_tokens_seen": 29910496, "step": 33455 }, { "epoch": 8.830935726540847, "grad_norm": 0.15903311967849731, "learning_rate": 3.227048472382585e-06, "loss": 0.0433, "num_input_tokens_seen": 29914816, "step": 33460 }, { "epoch": 8.832255510096344, "grad_norm": 0.41671276092529297, "learning_rate": 3.2222255657161915e-06, "loss": 0.09, "num_input_tokens_seen": 29919200, "step": 33465 }, { "epoch": 8.83357529365184, "grad_norm": 0.23736391961574554, "learning_rate": 3.2174060174562924e-06, "loss": 0.0314, "num_input_tokens_seen": 29923424, "step": 33470 }, { "epoch": 8.834895077207339, "grad_norm": 0.31572583317756653, "learning_rate": 3.2125898283461298e-06, "loss": 0.0588, "num_input_tokens_seen": 29927648, "step": 33475 }, { "epoch": 8.836214860762835, "grad_norm": 0.0662103220820427, "learning_rate": 3.207776999128406e-06, "loss": 0.0577, "num_input_tokens_seen": 29931712, "step": 33480 }, { "epoch": 8.837534644318332, "grad_norm": 0.12377462536096573, "learning_rate": 3.202967530545331e-06, "loss": 0.0469, "num_input_tokens_seen": 29935744, "step": 33485 }, { "epoch": 8.838854427873828, "grad_norm": 0.3124590218067169, "learning_rate": 3.1981614233385778e-06, "loss": 0.0434, "num_input_tokens_seen": 29939840, "step": 33490 }, { "epoch": 8.840174211429325, "grad_norm": 0.33301085233688354, "learning_rate": 3.1933586782493115e-06, "loss": 0.1164, "num_input_tokens_seen": 29944320, "step": 33495 }, { "epoch": 8.841493994984823, "grad_norm": 0.10111606866121292, "learning_rate": 3.188559296018184e-06, "loss": 0.0673, "num_input_tokens_seen": 29949184, "step": 33500 }, { "epoch": 8.84281377854032, "grad_norm": 0.1912315934896469, "learning_rate": 3.1837632773853098e-06, "loss": 0.0554, "num_input_tokens_seen": 29953504, "step": 33505 }, { "epoch": 8.844133562095816, "grad_norm": 0.1832091361284256, "learning_rate": 3.178970623090294e-06, "loss": 0.0216, "num_input_tokens_seen": 29958016, "step": 33510 }, { "epoch": 8.845453345651313, "grad_norm": 0.04191828519105911, "learning_rate": 3.174181333872234e-06, "loss": 0.0644, "num_input_tokens_seen": 29962432, "step": 33515 }, { "epoch": 8.84677312920681, "grad_norm": 0.10754439234733582, "learning_rate": 3.169395410469686e-06, "loss": 0.0599, "num_input_tokens_seen": 29966784, "step": 33520 }, { "epoch": 8.848092912762308, "grad_norm": 0.057430148124694824, "learning_rate": 3.164612853620713e-06, "loss": 0.0309, "num_input_tokens_seen": 29971744, "step": 33525 }, { "epoch": 8.849412696317804, "grad_norm": 0.2920302748680115, "learning_rate": 3.1598336640628333e-06, "loss": 0.0307, "num_input_tokens_seen": 29976224, "step": 33530 }, { "epoch": 8.850732479873301, "grad_norm": 0.15682421624660492, "learning_rate": 3.155057842533063e-06, "loss": 0.047, "num_input_tokens_seen": 29980384, "step": 33535 }, { "epoch": 8.852052263428797, "grad_norm": 0.08103614300489426, "learning_rate": 3.1502853897678984e-06, "loss": 0.0395, "num_input_tokens_seen": 29984768, "step": 33540 }, { "epoch": 8.853372046984294, "grad_norm": 0.20594191551208496, "learning_rate": 3.1455163065033017e-06, "loss": 0.0397, "num_input_tokens_seen": 29989472, "step": 33545 }, { "epoch": 8.854691830539792, "grad_norm": 0.050733353942632675, "learning_rate": 3.140750593474734e-06, "loss": 0.0696, "num_input_tokens_seen": 29993920, "step": 33550 }, { "epoch": 8.856011614095289, "grad_norm": 0.18081626296043396, "learning_rate": 3.1359882514171294e-06, "loss": 0.0779, "num_input_tokens_seen": 29998048, "step": 33555 }, { "epoch": 8.857331397650785, "grad_norm": 0.029113732278347015, "learning_rate": 3.1312292810648903e-06, "loss": 0.0327, "num_input_tokens_seen": 30002496, "step": 33560 }, { "epoch": 8.858651181206282, "grad_norm": 0.10736019164323807, "learning_rate": 3.1264736831519204e-06, "loss": 0.0279, "num_input_tokens_seen": 30006816, "step": 33565 }, { "epoch": 8.859970964761779, "grad_norm": 0.03093256987631321, "learning_rate": 3.1217214584115863e-06, "loss": 0.0413, "num_input_tokens_seen": 30011552, "step": 33570 }, { "epoch": 8.861290748317275, "grad_norm": 0.17004123330116272, "learning_rate": 3.116972607576746e-06, "loss": 0.0732, "num_input_tokens_seen": 30016224, "step": 33575 }, { "epoch": 8.862610531872773, "grad_norm": 0.3231600821018219, "learning_rate": 3.1122271313797303e-06, "loss": 0.0746, "num_input_tokens_seen": 30020544, "step": 33580 }, { "epoch": 8.86393031542827, "grad_norm": 0.1653924435377121, "learning_rate": 3.107485030552343e-06, "loss": 0.0851, "num_input_tokens_seen": 30025120, "step": 33585 }, { "epoch": 8.865250098983767, "grad_norm": 0.1316758692264557, "learning_rate": 3.1027463058258848e-06, "loss": 0.0542, "num_input_tokens_seen": 30029696, "step": 33590 }, { "epoch": 8.866569882539263, "grad_norm": 0.04609846696257591, "learning_rate": 3.0980109579311273e-06, "loss": 0.0324, "num_input_tokens_seen": 30034080, "step": 33595 }, { "epoch": 8.86788966609476, "grad_norm": 0.1789936125278473, "learning_rate": 3.093278987598314e-06, "loss": 0.0547, "num_input_tokens_seen": 30038624, "step": 33600 }, { "epoch": 8.86788966609476, "eval_loss": 0.06689382344484329, "eval_runtime": 64.7544, "eval_samples_per_second": 104.008, "eval_steps_per_second": 26.006, "num_input_tokens_seen": 30038624, "step": 33600 }, { "epoch": 8.869209449650258, "grad_norm": 0.2320082187652588, "learning_rate": 3.0885503955571826e-06, "loss": 0.0414, "num_input_tokens_seen": 30042944, "step": 33605 }, { "epoch": 8.870529233205755, "grad_norm": 0.35803115367889404, "learning_rate": 3.0838251825369313e-06, "loss": 0.0669, "num_input_tokens_seen": 30047328, "step": 33610 }, { "epoch": 8.871849016761251, "grad_norm": 0.19429190456867218, "learning_rate": 3.0791033492662517e-06, "loss": 0.0526, "num_input_tokens_seen": 30051680, "step": 33615 }, { "epoch": 8.873168800316748, "grad_norm": 0.24337060749530792, "learning_rate": 3.0743848964733203e-06, "loss": 0.0324, "num_input_tokens_seen": 30056064, "step": 33620 }, { "epoch": 8.874488583872244, "grad_norm": 0.2758674919605255, "learning_rate": 3.0696698248857625e-06, "loss": 0.044, "num_input_tokens_seen": 30060448, "step": 33625 }, { "epoch": 8.875808367427743, "grad_norm": 0.28566503524780273, "learning_rate": 3.0649581352307192e-06, "loss": 0.0767, "num_input_tokens_seen": 30064928, "step": 33630 }, { "epoch": 8.87712815098324, "grad_norm": 0.11672279238700867, "learning_rate": 3.060249828234776e-06, "loss": 0.0532, "num_input_tokens_seen": 30069024, "step": 33635 }, { "epoch": 8.878447934538736, "grad_norm": 0.19393907487392426, "learning_rate": 3.055544904624025e-06, "loss": 0.0339, "num_input_tokens_seen": 30073504, "step": 33640 }, { "epoch": 8.879767718094232, "grad_norm": 0.04693886265158653, "learning_rate": 3.050843365124026e-06, "loss": 0.0386, "num_input_tokens_seen": 30077664, "step": 33645 }, { "epoch": 8.881087501649729, "grad_norm": 0.14179402589797974, "learning_rate": 3.0461452104598083e-06, "loss": 0.0416, "num_input_tokens_seen": 30081888, "step": 33650 }, { "epoch": 8.882407285205227, "grad_norm": 0.32260459661483765, "learning_rate": 3.0414504413558836e-06, "loss": 0.0563, "num_input_tokens_seen": 30086144, "step": 33655 }, { "epoch": 8.883727068760724, "grad_norm": 0.2694810628890991, "learning_rate": 3.0367590585362564e-06, "loss": 0.0428, "num_input_tokens_seen": 30090496, "step": 33660 }, { "epoch": 8.88504685231622, "grad_norm": 0.2997215688228607, "learning_rate": 3.0320710627243813e-06, "loss": 0.0787, "num_input_tokens_seen": 30094976, "step": 33665 }, { "epoch": 8.886366635871717, "grad_norm": 0.16100257635116577, "learning_rate": 3.027386454643222e-06, "loss": 0.0614, "num_input_tokens_seen": 30099712, "step": 33670 }, { "epoch": 8.887686419427213, "grad_norm": 0.3621823787689209, "learning_rate": 3.0227052350151914e-06, "loss": 0.0534, "num_input_tokens_seen": 30104128, "step": 33675 }, { "epoch": 8.88900620298271, "grad_norm": 0.14674434065818787, "learning_rate": 3.0180274045621957e-06, "loss": 0.1146, "num_input_tokens_seen": 30108384, "step": 33680 }, { "epoch": 8.890325986538208, "grad_norm": 0.2379944622516632, "learning_rate": 3.013352964005625e-06, "loss": 0.0676, "num_input_tokens_seen": 30112640, "step": 33685 }, { "epoch": 8.891645770093705, "grad_norm": 0.12532909214496613, "learning_rate": 3.0086819140663218e-06, "loss": 0.0348, "num_input_tokens_seen": 30117376, "step": 33690 }, { "epoch": 8.892965553649201, "grad_norm": 0.05897604301571846, "learning_rate": 3.0040142554646265e-06, "loss": 0.0324, "num_input_tokens_seen": 30121696, "step": 33695 }, { "epoch": 8.894285337204698, "grad_norm": 0.08731305599212646, "learning_rate": 2.999349988920361e-06, "loss": 0.0693, "num_input_tokens_seen": 30125920, "step": 33700 }, { "epoch": 8.895605120760194, "grad_norm": 0.07779120653867722, "learning_rate": 2.994689115152796e-06, "loss": 0.071, "num_input_tokens_seen": 30130112, "step": 33705 }, { "epoch": 8.896924904315693, "grad_norm": 0.20681162178516388, "learning_rate": 2.9900316348807105e-06, "loss": 0.0491, "num_input_tokens_seen": 30134432, "step": 33710 }, { "epoch": 8.89824468787119, "grad_norm": 0.22345609962940216, "learning_rate": 2.985377548822338e-06, "loss": 0.0264, "num_input_tokens_seen": 30138976, "step": 33715 }, { "epoch": 8.899564471426686, "grad_norm": 0.16095314919948578, "learning_rate": 2.980726857695404e-06, "loss": 0.0403, "num_input_tokens_seen": 30143552, "step": 33720 }, { "epoch": 8.900884254982182, "grad_norm": 0.07145502418279648, "learning_rate": 2.9760795622171017e-06, "loss": 0.0493, "num_input_tokens_seen": 30148192, "step": 33725 }, { "epoch": 8.902204038537679, "grad_norm": 0.05089440196752548, "learning_rate": 2.971435663104094e-06, "loss": 0.0635, "num_input_tokens_seen": 30152960, "step": 33730 }, { "epoch": 8.903523822093177, "grad_norm": 0.11006798595190048, "learning_rate": 2.9667951610725385e-06, "loss": 0.0839, "num_input_tokens_seen": 30157568, "step": 33735 }, { "epoch": 8.904843605648674, "grad_norm": 0.39502623677253723, "learning_rate": 2.9621580568380575e-06, "loss": 0.0841, "num_input_tokens_seen": 30162144, "step": 33740 }, { "epoch": 8.90616338920417, "grad_norm": 0.40727654099464417, "learning_rate": 2.9575243511157453e-06, "loss": 0.0803, "num_input_tokens_seen": 30166720, "step": 33745 }, { "epoch": 8.907483172759667, "grad_norm": 0.3528825044631958, "learning_rate": 2.952894044620186e-06, "loss": 0.0457, "num_input_tokens_seen": 30171296, "step": 33750 }, { "epoch": 8.908802956315164, "grad_norm": 0.26231250166893005, "learning_rate": 2.948267138065419e-06, "loss": 0.0688, "num_input_tokens_seen": 30175968, "step": 33755 }, { "epoch": 8.910122739870662, "grad_norm": 0.4106236398220062, "learning_rate": 2.943643632164983e-06, "loss": 0.0422, "num_input_tokens_seen": 30180448, "step": 33760 }, { "epoch": 8.911442523426159, "grad_norm": 0.43454307317733765, "learning_rate": 2.939023527631879e-06, "loss": 0.0582, "num_input_tokens_seen": 30184832, "step": 33765 }, { "epoch": 8.912762306981655, "grad_norm": 0.20565663278102875, "learning_rate": 2.934406825178576e-06, "loss": 0.0396, "num_input_tokens_seen": 30189440, "step": 33770 }, { "epoch": 8.914082090537152, "grad_norm": 0.3171520233154297, "learning_rate": 2.9297935255170357e-06, "loss": 0.0644, "num_input_tokens_seen": 30193728, "step": 33775 }, { "epoch": 8.915401874092648, "grad_norm": 0.3234643340110779, "learning_rate": 2.925183629358691e-06, "loss": 0.0836, "num_input_tokens_seen": 30198368, "step": 33780 }, { "epoch": 8.916721657648147, "grad_norm": 0.4608983099460602, "learning_rate": 2.9205771374144346e-06, "loss": 0.1413, "num_input_tokens_seen": 30202944, "step": 33785 }, { "epoch": 8.918041441203643, "grad_norm": 0.05254804715514183, "learning_rate": 2.915974050394657e-06, "loss": 0.0167, "num_input_tokens_seen": 30207680, "step": 33790 }, { "epoch": 8.91936122475914, "grad_norm": 0.05397457256913185, "learning_rate": 2.9113743690092067e-06, "loss": 0.0533, "num_input_tokens_seen": 30211968, "step": 33795 }, { "epoch": 8.920681008314636, "grad_norm": 0.04846327379345894, "learning_rate": 2.906778093967402e-06, "loss": 0.0384, "num_input_tokens_seen": 30216384, "step": 33800 }, { "epoch": 8.920681008314636, "eval_loss": 0.06673185527324677, "eval_runtime": 64.7602, "eval_samples_per_second": 103.999, "eval_steps_per_second": 26.004, "num_input_tokens_seen": 30216384, "step": 33800 }, { "epoch": 8.922000791870133, "grad_norm": 0.03824539855122566, "learning_rate": 2.9021852259780656e-06, "loss": 0.0225, "num_input_tokens_seen": 30220800, "step": 33805 }, { "epoch": 8.923320575425631, "grad_norm": 0.13985612988471985, "learning_rate": 2.8975957657494583e-06, "loss": 0.0376, "num_input_tokens_seen": 30225472, "step": 33810 }, { "epoch": 8.924640358981128, "grad_norm": 0.22610527276992798, "learning_rate": 2.8930097139893417e-06, "loss": 0.0274, "num_input_tokens_seen": 30229984, "step": 33815 }, { "epoch": 8.925960142536624, "grad_norm": 0.12828852236270905, "learning_rate": 2.888427071404945e-06, "loss": 0.0425, "num_input_tokens_seen": 30234528, "step": 33820 }, { "epoch": 8.92727992609212, "grad_norm": 0.17715567350387573, "learning_rate": 2.8838478387029606e-06, "loss": 0.0709, "num_input_tokens_seen": 30238976, "step": 33825 }, { "epoch": 8.928599709647617, "grad_norm": 0.24659322202205658, "learning_rate": 2.8792720165895737e-06, "loss": 0.0339, "num_input_tokens_seen": 30243520, "step": 33830 }, { "epoch": 8.929919493203116, "grad_norm": 0.21981750428676605, "learning_rate": 2.874699605770423e-06, "loss": 0.064, "num_input_tokens_seen": 30248128, "step": 33835 }, { "epoch": 8.931239276758612, "grad_norm": 0.26891186833381653, "learning_rate": 2.8701306069506383e-06, "loss": 0.0541, "num_input_tokens_seen": 30252672, "step": 33840 }, { "epoch": 8.932559060314109, "grad_norm": 0.21992447972297668, "learning_rate": 2.8655650208348178e-06, "loss": 0.0446, "num_input_tokens_seen": 30257216, "step": 33845 }, { "epoch": 8.933878843869605, "grad_norm": 0.10089381784200668, "learning_rate": 2.8610028481270257e-06, "loss": 0.0392, "num_input_tokens_seen": 30261600, "step": 33850 }, { "epoch": 8.935198627425102, "grad_norm": 0.10193891078233719, "learning_rate": 2.856444089530813e-06, "loss": 0.0431, "num_input_tokens_seen": 30266112, "step": 33855 }, { "epoch": 8.936518410980598, "grad_norm": 0.22850273549556732, "learning_rate": 2.8518887457491955e-06, "loss": 0.068, "num_input_tokens_seen": 30270688, "step": 33860 }, { "epoch": 8.937838194536097, "grad_norm": 0.4412859380245209, "learning_rate": 2.8473368174846666e-06, "loss": 0.0604, "num_input_tokens_seen": 30275360, "step": 33865 }, { "epoch": 8.939157978091593, "grad_norm": 0.19657257199287415, "learning_rate": 2.842788305439184e-06, "loss": 0.0578, "num_input_tokens_seen": 30279968, "step": 33870 }, { "epoch": 8.94047776164709, "grad_norm": 0.23637180030345917, "learning_rate": 2.8382432103141925e-06, "loss": 0.0669, "num_input_tokens_seen": 30284512, "step": 33875 }, { "epoch": 8.941797545202586, "grad_norm": 0.09823627024888992, "learning_rate": 2.833701532810598e-06, "loss": 0.1378, "num_input_tokens_seen": 30288992, "step": 33880 }, { "epoch": 8.943117328758083, "grad_norm": 0.05842132866382599, "learning_rate": 2.8291632736287877e-06, "loss": 0.0516, "num_input_tokens_seen": 30293024, "step": 33885 }, { "epoch": 8.944437112313581, "grad_norm": 0.28283172845840454, "learning_rate": 2.824628433468615e-06, "loss": 0.08, "num_input_tokens_seen": 30297504, "step": 33890 }, { "epoch": 8.945756895869078, "grad_norm": 0.23988932371139526, "learning_rate": 2.8200970130294073e-06, "loss": 0.0469, "num_input_tokens_seen": 30301696, "step": 33895 }, { "epoch": 8.947076679424574, "grad_norm": 0.2807973027229309, "learning_rate": 2.8155690130099775e-06, "loss": 0.029, "num_input_tokens_seen": 30306176, "step": 33900 }, { "epoch": 8.948396462980071, "grad_norm": 0.1333424299955368, "learning_rate": 2.8110444341085895e-06, "loss": 0.0477, "num_input_tokens_seen": 30310752, "step": 33905 }, { "epoch": 8.949716246535568, "grad_norm": 0.044917792081832886, "learning_rate": 2.806523277022996e-06, "loss": 0.0716, "num_input_tokens_seen": 30315392, "step": 33910 }, { "epoch": 8.951036030091066, "grad_norm": 0.0968228131532669, "learning_rate": 2.802005542450409e-06, "loss": 0.0233, "num_input_tokens_seen": 30319680, "step": 33915 }, { "epoch": 8.952355813646562, "grad_norm": 0.09316549450159073, "learning_rate": 2.797491231087526e-06, "loss": 0.0302, "num_input_tokens_seen": 30324448, "step": 33920 }, { "epoch": 8.953675597202059, "grad_norm": 0.5087403655052185, "learning_rate": 2.7929803436305137e-06, "loss": 0.0482, "num_input_tokens_seen": 30328672, "step": 33925 }, { "epoch": 8.954995380757556, "grad_norm": 0.22161389887332916, "learning_rate": 2.788472880774998e-06, "loss": 0.0807, "num_input_tokens_seen": 30333056, "step": 33930 }, { "epoch": 8.956315164313052, "grad_norm": 0.1899174004793167, "learning_rate": 2.7839688432160977e-06, "loss": 0.0741, "num_input_tokens_seen": 30337632, "step": 33935 }, { "epoch": 8.957634947868549, "grad_norm": 0.31969016790390015, "learning_rate": 2.779468231648383e-06, "loss": 0.0551, "num_input_tokens_seen": 30342176, "step": 33940 }, { "epoch": 8.958954731424047, "grad_norm": 0.22740787267684937, "learning_rate": 2.774971046765906e-06, "loss": 0.0544, "num_input_tokens_seen": 30346464, "step": 33945 }, { "epoch": 8.960274514979544, "grad_norm": 0.2248426377773285, "learning_rate": 2.770477289262194e-06, "loss": 0.0731, "num_input_tokens_seen": 30351168, "step": 33950 }, { "epoch": 8.96159429853504, "grad_norm": 0.0767684280872345, "learning_rate": 2.765986959830233e-06, "loss": 0.0369, "num_input_tokens_seen": 30355648, "step": 33955 }, { "epoch": 8.962914082090537, "grad_norm": 0.08161349594593048, "learning_rate": 2.761500059162492e-06, "loss": 0.0844, "num_input_tokens_seen": 30360512, "step": 33960 }, { "epoch": 8.964233865646033, "grad_norm": 0.11311199516057968, "learning_rate": 2.757016587950914e-06, "loss": 0.0217, "num_input_tokens_seen": 30365024, "step": 33965 }, { "epoch": 8.965553649201532, "grad_norm": 0.06902402639389038, "learning_rate": 2.752536546886897e-06, "loss": 0.0747, "num_input_tokens_seen": 30369536, "step": 33970 }, { "epoch": 8.966873432757028, "grad_norm": 0.0739562138915062, "learning_rate": 2.7480599366613234e-06, "loss": 0.048, "num_input_tokens_seen": 30374048, "step": 33975 }, { "epoch": 8.968193216312525, "grad_norm": 0.06103876233100891, "learning_rate": 2.7435867579645473e-06, "loss": 0.0412, "num_input_tokens_seen": 30378496, "step": 33980 }, { "epoch": 8.969512999868021, "grad_norm": 0.17259518802165985, "learning_rate": 2.739117011486378e-06, "loss": 0.0707, "num_input_tokens_seen": 30382976, "step": 33985 }, { "epoch": 8.970832783423518, "grad_norm": 0.023606494069099426, "learning_rate": 2.7346506979161216e-06, "loss": 0.0382, "num_input_tokens_seen": 30387424, "step": 33990 }, { "epoch": 8.972152566979016, "grad_norm": 0.08904796093702316, "learning_rate": 2.7301878179425227e-06, "loss": 0.0537, "num_input_tokens_seen": 30391904, "step": 33995 }, { "epoch": 8.973472350534513, "grad_norm": 0.1865626722574234, "learning_rate": 2.7257283722538244e-06, "loss": 0.0605, "num_input_tokens_seen": 30395872, "step": 34000 }, { "epoch": 8.973472350534513, "eval_loss": 0.06677081435918808, "eval_runtime": 64.7564, "eval_samples_per_second": 104.005, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 30395872, "step": 34000 }, { "epoch": 8.97479213409001, "grad_norm": 0.02121318131685257, "learning_rate": 2.7212723615377326e-06, "loss": 0.0457, "num_input_tokens_seen": 30400480, "step": 34005 }, { "epoch": 8.976111917645506, "grad_norm": 0.2629278004169464, "learning_rate": 2.7168197864814145e-06, "loss": 0.095, "num_input_tokens_seen": 30405088, "step": 34010 }, { "epoch": 8.977431701201002, "grad_norm": 0.27620548009872437, "learning_rate": 2.712370647771509e-06, "loss": 0.0809, "num_input_tokens_seen": 30409600, "step": 34015 }, { "epoch": 8.9787514847565, "grad_norm": 0.4787909686565399, "learning_rate": 2.707924946094137e-06, "loss": 0.1249, "num_input_tokens_seen": 30413728, "step": 34020 }, { "epoch": 8.980071268311997, "grad_norm": 0.3036344051361084, "learning_rate": 2.7034826821348723e-06, "loss": 0.0345, "num_input_tokens_seen": 30417952, "step": 34025 }, { "epoch": 8.981391051867494, "grad_norm": 0.03586472198367119, "learning_rate": 2.6990438565787786e-06, "loss": 0.0712, "num_input_tokens_seen": 30422016, "step": 34030 }, { "epoch": 8.98271083542299, "grad_norm": 0.10260725021362305, "learning_rate": 2.6946084701103714e-06, "loss": 0.0407, "num_input_tokens_seen": 30426336, "step": 34035 }, { "epoch": 8.984030618978487, "grad_norm": 0.1300048679113388, "learning_rate": 2.6901765234136428e-06, "loss": 0.0262, "num_input_tokens_seen": 30430592, "step": 34040 }, { "epoch": 8.985350402533985, "grad_norm": 0.04834938049316406, "learning_rate": 2.685748017172063e-06, "loss": 0.0162, "num_input_tokens_seen": 30435232, "step": 34045 }, { "epoch": 8.986670186089482, "grad_norm": 0.22620095312595367, "learning_rate": 2.681322952068549e-06, "loss": 0.0661, "num_input_tokens_seen": 30439840, "step": 34050 }, { "epoch": 8.987989969644978, "grad_norm": 0.2332710325717926, "learning_rate": 2.6769013287855137e-06, "loss": 0.0471, "num_input_tokens_seen": 30443840, "step": 34055 }, { "epoch": 8.989309753200475, "grad_norm": 0.0665879175066948, "learning_rate": 2.6724831480048286e-06, "loss": 0.0481, "num_input_tokens_seen": 30448128, "step": 34060 }, { "epoch": 8.990629536755971, "grad_norm": 0.07129260897636414, "learning_rate": 2.66806841040782e-06, "loss": 0.0586, "num_input_tokens_seen": 30452544, "step": 34065 }, { "epoch": 8.99194932031147, "grad_norm": 0.019878221675753593, "learning_rate": 2.6636571166753083e-06, "loss": 0.0299, "num_input_tokens_seen": 30457088, "step": 34070 }, { "epoch": 8.993269103866966, "grad_norm": 0.03723651170730591, "learning_rate": 2.6592492674875598e-06, "loss": 0.035, "num_input_tokens_seen": 30461472, "step": 34075 }, { "epoch": 8.994588887422463, "grad_norm": 0.026776984333992004, "learning_rate": 2.6548448635243305e-06, "loss": 0.0263, "num_input_tokens_seen": 30466080, "step": 34080 }, { "epoch": 8.99590867097796, "grad_norm": 0.34618017077445984, "learning_rate": 2.650443905464828e-06, "loss": 0.059, "num_input_tokens_seen": 30470624, "step": 34085 }, { "epoch": 8.997228454533456, "grad_norm": 0.15927749872207642, "learning_rate": 2.646046393987739e-06, "loss": 0.0805, "num_input_tokens_seen": 30475488, "step": 34090 }, { "epoch": 8.998548238088954, "grad_norm": 0.1500425934791565, "learning_rate": 2.64165232977121e-06, "loss": 0.1179, "num_input_tokens_seen": 30480064, "step": 34095 }, { "epoch": 8.99986802164445, "grad_norm": 0.2765823304653168, "learning_rate": 2.6372617134928695e-06, "loss": 0.0756, "num_input_tokens_seen": 30484480, "step": 34100 }, { "epoch": 9.001055826844398, "grad_norm": 0.14811013638973236, "learning_rate": 2.6328745458297943e-06, "loss": 0.0748, "num_input_tokens_seen": 30488672, "step": 34105 }, { "epoch": 9.002375610399895, "grad_norm": 0.3726392388343811, "learning_rate": 2.6284908274585546e-06, "loss": 0.0803, "num_input_tokens_seen": 30492960, "step": 34110 }, { "epoch": 9.003695393955391, "grad_norm": 0.24268601834774017, "learning_rate": 2.6241105590551595e-06, "loss": 0.091, "num_input_tokens_seen": 30497120, "step": 34115 }, { "epoch": 9.005015177510888, "grad_norm": 0.16123487055301666, "learning_rate": 2.6197337412951105e-06, "loss": 0.0302, "num_input_tokens_seen": 30501664, "step": 34120 }, { "epoch": 9.006334961066385, "grad_norm": 0.24793678522109985, "learning_rate": 2.6153603748533705e-06, "loss": 0.0551, "num_input_tokens_seen": 30506080, "step": 34125 }, { "epoch": 9.007654744621883, "grad_norm": 0.11255044490098953, "learning_rate": 2.6109904604043585e-06, "loss": 0.0429, "num_input_tokens_seen": 30510912, "step": 34130 }, { "epoch": 9.00897452817738, "grad_norm": 0.20901846885681152, "learning_rate": 2.6066239986219765e-06, "loss": 0.0383, "num_input_tokens_seen": 30515616, "step": 34135 }, { "epoch": 9.010294311732876, "grad_norm": 0.37452232837677, "learning_rate": 2.602260990179592e-06, "loss": 0.0772, "num_input_tokens_seen": 30519744, "step": 34140 }, { "epoch": 9.011614095288373, "grad_norm": 0.7276430130004883, "learning_rate": 2.5979014357500248e-06, "loss": 0.0209, "num_input_tokens_seen": 30524160, "step": 34145 }, { "epoch": 9.012933878843869, "grad_norm": 0.11489791423082352, "learning_rate": 2.5935453360055844e-06, "loss": 0.0511, "num_input_tokens_seen": 30528768, "step": 34150 }, { "epoch": 9.014253662399366, "grad_norm": 0.3230082392692566, "learning_rate": 2.5891926916180283e-06, "loss": 0.0453, "num_input_tokens_seen": 30533408, "step": 34155 }, { "epoch": 9.015573445954864, "grad_norm": 0.04839889332652092, "learning_rate": 2.5848435032585883e-06, "loss": 0.0713, "num_input_tokens_seen": 30538336, "step": 34160 }, { "epoch": 9.01689322951036, "grad_norm": 0.16654208302497864, "learning_rate": 2.58049777159797e-06, "loss": 0.0597, "num_input_tokens_seen": 30542816, "step": 34165 }, { "epoch": 9.018213013065857, "grad_norm": 0.06050413101911545, "learning_rate": 2.576155497306332e-06, "loss": 0.0854, "num_input_tokens_seen": 30547392, "step": 34170 }, { "epoch": 9.019532796621354, "grad_norm": 0.071187324821949, "learning_rate": 2.57181668105331e-06, "loss": 0.0589, "num_input_tokens_seen": 30551712, "step": 34175 }, { "epoch": 9.02085258017685, "grad_norm": 0.16261057555675507, "learning_rate": 2.567481323508014e-06, "loss": 0.0663, "num_input_tokens_seen": 30556224, "step": 34180 }, { "epoch": 9.022172363732349, "grad_norm": 0.14340335130691528, "learning_rate": 2.5631494253389954e-06, "loss": 0.0505, "num_input_tokens_seen": 30560608, "step": 34185 }, { "epoch": 9.023492147287845, "grad_norm": 0.2412203997373581, "learning_rate": 2.5588209872142997e-06, "loss": 0.0391, "num_input_tokens_seen": 30565120, "step": 34190 }, { "epoch": 9.024811930843342, "grad_norm": 0.05711883306503296, "learning_rate": 2.5544960098014186e-06, "loss": 0.0646, "num_input_tokens_seen": 30569344, "step": 34195 }, { "epoch": 9.026131714398838, "grad_norm": 0.06810269504785538, "learning_rate": 2.550174493767318e-06, "loss": 0.029, "num_input_tokens_seen": 30573920, "step": 34200 }, { "epoch": 9.026131714398838, "eval_loss": 0.06678295135498047, "eval_runtime": 64.7567, "eval_samples_per_second": 104.005, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 30573920, "step": 34200 }, { "epoch": 9.027451497954335, "grad_norm": 0.3600969612598419, "learning_rate": 2.545856439778438e-06, "loss": 0.0885, "num_input_tokens_seen": 30578464, "step": 34205 }, { "epoch": 9.028771281509833, "grad_norm": 0.38368111848831177, "learning_rate": 2.541541848500667e-06, "loss": 0.0628, "num_input_tokens_seen": 30583072, "step": 34210 }, { "epoch": 9.03009106506533, "grad_norm": 0.21490925550460815, "learning_rate": 2.5372307205993733e-06, "loss": 0.0293, "num_input_tokens_seen": 30587520, "step": 34215 }, { "epoch": 9.031410848620826, "grad_norm": 0.063166543841362, "learning_rate": 2.5329230567393917e-06, "loss": 0.0309, "num_input_tokens_seen": 30592192, "step": 34220 }, { "epoch": 9.032730632176323, "grad_norm": 0.16628064215183258, "learning_rate": 2.5286188575850164e-06, "loss": 0.0644, "num_input_tokens_seen": 30596736, "step": 34225 }, { "epoch": 9.03405041573182, "grad_norm": 0.06292189657688141, "learning_rate": 2.5243181237999984e-06, "loss": 0.0576, "num_input_tokens_seen": 30601312, "step": 34230 }, { "epoch": 9.035370199287318, "grad_norm": 0.054911937564611435, "learning_rate": 2.520020856047578e-06, "loss": 0.0715, "num_input_tokens_seen": 30605728, "step": 34235 }, { "epoch": 9.036689982842814, "grad_norm": 0.25926274061203003, "learning_rate": 2.515727054990438e-06, "loss": 0.0753, "num_input_tokens_seen": 30610208, "step": 34240 }, { "epoch": 9.03800976639831, "grad_norm": 0.21004317700862885, "learning_rate": 2.511436721290747e-06, "loss": 0.0262, "num_input_tokens_seen": 30614560, "step": 34245 }, { "epoch": 9.039329549953807, "grad_norm": 0.31882160902023315, "learning_rate": 2.5071498556101164e-06, "loss": 0.0776, "num_input_tokens_seen": 30618944, "step": 34250 }, { "epoch": 9.040649333509304, "grad_norm": 0.16440081596374512, "learning_rate": 2.5028664586096485e-06, "loss": 0.0728, "num_input_tokens_seen": 30623456, "step": 34255 }, { "epoch": 9.041969117064802, "grad_norm": 0.06106419861316681, "learning_rate": 2.498586530949881e-06, "loss": 0.0366, "num_input_tokens_seen": 30627648, "step": 34260 }, { "epoch": 9.043288900620299, "grad_norm": 0.1860082894563675, "learning_rate": 2.4943100732908427e-06, "loss": 0.0584, "num_input_tokens_seen": 30632096, "step": 34265 }, { "epoch": 9.044608684175795, "grad_norm": 0.19812174141407013, "learning_rate": 2.4900370862920188e-06, "loss": 0.0562, "num_input_tokens_seen": 30636224, "step": 34270 }, { "epoch": 9.045928467731292, "grad_norm": 0.02721893973648548, "learning_rate": 2.4857675706123518e-06, "loss": 0.0297, "num_input_tokens_seen": 30640896, "step": 34275 }, { "epoch": 9.047248251286788, "grad_norm": 0.18956708908081055, "learning_rate": 2.4815015269102543e-06, "loss": 0.0431, "num_input_tokens_seen": 30645216, "step": 34280 }, { "epoch": 9.048568034842285, "grad_norm": 0.05388544872403145, "learning_rate": 2.477238955843611e-06, "loss": 0.066, "num_input_tokens_seen": 30649376, "step": 34285 }, { "epoch": 9.049887818397783, "grad_norm": 0.2598036825656891, "learning_rate": 2.4729798580697573e-06, "loss": 0.05, "num_input_tokens_seen": 30653888, "step": 34290 }, { "epoch": 9.05120760195328, "grad_norm": 0.19086545705795288, "learning_rate": 2.4687242342455034e-06, "loss": 0.0329, "num_input_tokens_seen": 30658592, "step": 34295 }, { "epoch": 9.052527385508776, "grad_norm": 0.12978072464466095, "learning_rate": 2.4644720850271196e-06, "loss": 0.0717, "num_input_tokens_seen": 30663008, "step": 34300 }, { "epoch": 9.053847169064273, "grad_norm": 0.3398091793060303, "learning_rate": 2.4602234110703364e-06, "loss": 0.0458, "num_input_tokens_seen": 30667456, "step": 34305 }, { "epoch": 9.05516695261977, "grad_norm": 0.09530085325241089, "learning_rate": 2.4559782130303576e-06, "loss": 0.0495, "num_input_tokens_seen": 30671808, "step": 34310 }, { "epoch": 9.056486736175268, "grad_norm": 0.08488544821739197, "learning_rate": 2.451736491561843e-06, "loss": 0.0185, "num_input_tokens_seen": 30676544, "step": 34315 }, { "epoch": 9.057806519730764, "grad_norm": 0.4272072911262512, "learning_rate": 2.4474982473189163e-06, "loss": 0.1205, "num_input_tokens_seen": 30681280, "step": 34320 }, { "epoch": 9.059126303286261, "grad_norm": 0.11024122685194016, "learning_rate": 2.4432634809551796e-06, "loss": 0.0546, "num_input_tokens_seen": 30685984, "step": 34325 }, { "epoch": 9.060446086841758, "grad_norm": 0.37305113673210144, "learning_rate": 2.439032193123675e-06, "loss": 0.0769, "num_input_tokens_seen": 30690304, "step": 34330 }, { "epoch": 9.061765870397254, "grad_norm": 0.3001219630241394, "learning_rate": 2.4348043844769297e-06, "loss": 0.0472, "num_input_tokens_seen": 30694752, "step": 34335 }, { "epoch": 9.063085653952752, "grad_norm": 0.2659684121608734, "learning_rate": 2.4305800556669146e-06, "loss": 0.0742, "num_input_tokens_seen": 30699200, "step": 34340 }, { "epoch": 9.064405437508249, "grad_norm": 0.18000783026218414, "learning_rate": 2.426359207345083e-06, "loss": 0.0558, "num_input_tokens_seen": 30703904, "step": 34345 }, { "epoch": 9.065725221063746, "grad_norm": 0.27730628848075867, "learning_rate": 2.4221418401623396e-06, "loss": 0.0556, "num_input_tokens_seen": 30708128, "step": 34350 }, { "epoch": 9.067045004619242, "grad_norm": 0.09412769228219986, "learning_rate": 2.4179279547690557e-06, "loss": 0.0716, "num_input_tokens_seen": 30712480, "step": 34355 }, { "epoch": 9.068364788174739, "grad_norm": 0.08771993219852448, "learning_rate": 2.413717551815062e-06, "loss": 0.0807, "num_input_tokens_seen": 30716928, "step": 34360 }, { "epoch": 9.069684571730237, "grad_norm": 0.16841833293437958, "learning_rate": 2.409510631949666e-06, "loss": 0.0283, "num_input_tokens_seen": 30721920, "step": 34365 }, { "epoch": 9.071004355285734, "grad_norm": 0.1209721565246582, "learning_rate": 2.405307195821618e-06, "loss": 0.0847, "num_input_tokens_seen": 30726496, "step": 34370 }, { "epoch": 9.07232413884123, "grad_norm": 0.07847556471824646, "learning_rate": 2.4011072440791372e-06, "loss": 0.0775, "num_input_tokens_seen": 30731072, "step": 34375 }, { "epoch": 9.073643922396727, "grad_norm": 0.22748929262161255, "learning_rate": 2.3969107773699233e-06, "loss": 0.1007, "num_input_tokens_seen": 30735488, "step": 34380 }, { "epoch": 9.074963705952223, "grad_norm": 0.2644062042236328, "learning_rate": 2.3927177963411096e-06, "loss": 0.0482, "num_input_tokens_seen": 30740256, "step": 34385 }, { "epoch": 9.076283489507722, "grad_norm": 0.34842315316200256, "learning_rate": 2.3885283016393144e-06, "loss": 0.1058, "num_input_tokens_seen": 30744704, "step": 34390 }, { "epoch": 9.077603273063218, "grad_norm": 0.2846302390098572, "learning_rate": 2.3843422939106076e-06, "loss": 0.0838, "num_input_tokens_seen": 30749120, "step": 34395 }, { "epoch": 9.078923056618715, "grad_norm": 0.0994100570678711, "learning_rate": 2.380159773800525e-06, "loss": 0.0564, "num_input_tokens_seen": 30753536, "step": 34400 }, { "epoch": 9.078923056618715, "eval_loss": 0.0668676346540451, "eval_runtime": 64.733, "eval_samples_per_second": 104.043, "eval_steps_per_second": 26.015, "num_input_tokens_seen": 30753536, "step": 34400 }, { "epoch": 9.080242840174211, "grad_norm": 0.2420606166124344, "learning_rate": 2.3759807419540675e-06, "loss": 0.0872, "num_input_tokens_seen": 30758016, "step": 34405 }, { "epoch": 9.081562623729708, "grad_norm": 0.045871444046497345, "learning_rate": 2.3718051990156835e-06, "loss": 0.0547, "num_input_tokens_seen": 30762432, "step": 34410 }, { "epoch": 9.082882407285204, "grad_norm": 0.3601616621017456, "learning_rate": 2.367633145629311e-06, "loss": 0.0954, "num_input_tokens_seen": 30766944, "step": 34415 }, { "epoch": 9.084202190840703, "grad_norm": 0.1162792444229126, "learning_rate": 2.363464582438316e-06, "loss": 0.0408, "num_input_tokens_seen": 30771392, "step": 34420 }, { "epoch": 9.0855219743962, "grad_norm": 0.2849809229373932, "learning_rate": 2.3592995100855526e-06, "loss": 0.0962, "num_input_tokens_seen": 30775744, "step": 34425 }, { "epoch": 9.086841757951696, "grad_norm": 0.021222257986664772, "learning_rate": 2.3551379292133273e-06, "loss": 0.089, "num_input_tokens_seen": 30780224, "step": 34430 }, { "epoch": 9.088161541507192, "grad_norm": 0.19661718606948853, "learning_rate": 2.3509798404634047e-06, "loss": 0.0506, "num_input_tokens_seen": 30785152, "step": 34435 }, { "epoch": 9.089481325062689, "grad_norm": 0.08007462322711945, "learning_rate": 2.346825244477019e-06, "loss": 0.0318, "num_input_tokens_seen": 30789760, "step": 34440 }, { "epoch": 9.090801108618187, "grad_norm": 0.12793056666851044, "learning_rate": 2.3426741418948545e-06, "loss": 0.0327, "num_input_tokens_seen": 30794336, "step": 34445 }, { "epoch": 9.092120892173684, "grad_norm": 0.2705821394920349, "learning_rate": 2.3385265333570715e-06, "loss": 0.0845, "num_input_tokens_seen": 30798944, "step": 34450 }, { "epoch": 9.09344067572918, "grad_norm": 0.240452378988266, "learning_rate": 2.334382419503278e-06, "loss": 0.0339, "num_input_tokens_seen": 30803200, "step": 34455 }, { "epoch": 9.094760459284677, "grad_norm": 0.2875104546546936, "learning_rate": 2.3302418009725465e-06, "loss": 0.0802, "num_input_tokens_seen": 30807776, "step": 34460 }, { "epoch": 9.096080242840173, "grad_norm": 0.2622779607772827, "learning_rate": 2.326104678403415e-06, "loss": 0.0503, "num_input_tokens_seen": 30812064, "step": 34465 }, { "epoch": 9.097400026395672, "grad_norm": 0.37460052967071533, "learning_rate": 2.321971052433883e-06, "loss": 0.0486, "num_input_tokens_seen": 30816384, "step": 34470 }, { "epoch": 9.098719809951168, "grad_norm": 0.4294571578502655, "learning_rate": 2.3178409237014004e-06, "loss": 0.0912, "num_input_tokens_seen": 30820800, "step": 34475 }, { "epoch": 9.100039593506665, "grad_norm": 0.09951585531234741, "learning_rate": 2.313714292842889e-06, "loss": 0.0259, "num_input_tokens_seen": 30825312, "step": 34480 }, { "epoch": 9.101359377062161, "grad_norm": 0.051830872893333435, "learning_rate": 2.309591160494734e-06, "loss": 0.0729, "num_input_tokens_seen": 30829536, "step": 34485 }, { "epoch": 9.102679160617658, "grad_norm": 0.21221770346164703, "learning_rate": 2.305471527292763e-06, "loss": 0.0387, "num_input_tokens_seen": 30833856, "step": 34490 }, { "epoch": 9.103998944173156, "grad_norm": 0.07946129888296127, "learning_rate": 2.3013553938722817e-06, "loss": 0.0703, "num_input_tokens_seen": 30838560, "step": 34495 }, { "epoch": 9.105318727728653, "grad_norm": 0.19412478804588318, "learning_rate": 2.297242760868043e-06, "loss": 0.0911, "num_input_tokens_seen": 30842848, "step": 34500 }, { "epoch": 9.10663851128415, "grad_norm": 0.2251584231853485, "learning_rate": 2.2931336289142735e-06, "loss": 0.0324, "num_input_tokens_seen": 30847328, "step": 34505 }, { "epoch": 9.107958294839646, "grad_norm": 0.0960526242852211, "learning_rate": 2.289027998644655e-06, "loss": 0.0555, "num_input_tokens_seen": 30852000, "step": 34510 }, { "epoch": 9.109278078395143, "grad_norm": 0.08653322607278824, "learning_rate": 2.2849258706923228e-06, "loss": 0.06, "num_input_tokens_seen": 30856512, "step": 34515 }, { "epoch": 9.110597861950641, "grad_norm": 0.04940854012966156, "learning_rate": 2.2808272456898705e-06, "loss": 0.0515, "num_input_tokens_seen": 30860896, "step": 34520 }, { "epoch": 9.111917645506137, "grad_norm": 0.2512252926826477, "learning_rate": 2.2767321242693707e-06, "loss": 0.0736, "num_input_tokens_seen": 30865504, "step": 34525 }, { "epoch": 9.113237429061634, "grad_norm": 0.26358452439308167, "learning_rate": 2.272640507062329e-06, "loss": 0.0381, "num_input_tokens_seen": 30870080, "step": 34530 }, { "epoch": 9.11455721261713, "grad_norm": 0.10302529484033585, "learning_rate": 2.2685523946997382e-06, "loss": 0.0461, "num_input_tokens_seen": 30874048, "step": 34535 }, { "epoch": 9.115876996172627, "grad_norm": 0.024606136605143547, "learning_rate": 2.2644677878120245e-06, "loss": 0.0802, "num_input_tokens_seen": 30878624, "step": 34540 }, { "epoch": 9.117196779728125, "grad_norm": 0.11763212829828262, "learning_rate": 2.2603866870290897e-06, "loss": 0.0399, "num_input_tokens_seen": 30883264, "step": 34545 }, { "epoch": 9.118516563283622, "grad_norm": 0.06879474967718124, "learning_rate": 2.256309092980294e-06, "loss": 0.0326, "num_input_tokens_seen": 30887712, "step": 34550 }, { "epoch": 9.119836346839119, "grad_norm": 0.26831358671188354, "learning_rate": 2.252235006294448e-06, "loss": 0.0735, "num_input_tokens_seen": 30891872, "step": 34555 }, { "epoch": 9.121156130394615, "grad_norm": 0.21659240126609802, "learning_rate": 2.2481644275998333e-06, "loss": 0.0431, "num_input_tokens_seen": 30896096, "step": 34560 }, { "epoch": 9.122475913950112, "grad_norm": 0.6196189522743225, "learning_rate": 2.2440973575241832e-06, "loss": 0.0711, "num_input_tokens_seen": 30900480, "step": 34565 }, { "epoch": 9.123795697505608, "grad_norm": 0.05162690952420235, "learning_rate": 2.240033796694685e-06, "loss": 0.0261, "num_input_tokens_seen": 30905248, "step": 34570 }, { "epoch": 9.125115481061107, "grad_norm": 0.18388321995735168, "learning_rate": 2.235973745737999e-06, "loss": 0.0894, "num_input_tokens_seen": 30909824, "step": 34575 }, { "epoch": 9.126435264616603, "grad_norm": 0.17426635324954987, "learning_rate": 2.2319172052802263e-06, "loss": 0.0544, "num_input_tokens_seen": 30914240, "step": 34580 }, { "epoch": 9.1277550481721, "grad_norm": 0.08125588297843933, "learning_rate": 2.2278641759469477e-06, "loss": 0.0804, "num_input_tokens_seen": 30918624, "step": 34585 }, { "epoch": 9.129074831727596, "grad_norm": 0.2528926134109497, "learning_rate": 2.2238146583631825e-06, "loss": 0.0466, "num_input_tokens_seen": 30922816, "step": 34590 }, { "epoch": 9.130394615283093, "grad_norm": 0.16479706764221191, "learning_rate": 2.2197686531534256e-06, "loss": 0.0484, "num_input_tokens_seen": 30927296, "step": 34595 }, { "epoch": 9.131714398838591, "grad_norm": 0.5828876495361328, "learning_rate": 2.2157261609416087e-06, "loss": 0.1248, "num_input_tokens_seen": 30931776, "step": 34600 }, { "epoch": 9.131714398838591, "eval_loss": 0.0668899342417717, "eval_runtime": 64.7287, "eval_samples_per_second": 104.05, "eval_steps_per_second": 26.016, "num_input_tokens_seen": 30931776, "step": 34600 }, { "epoch": 9.133034182394088, "grad_norm": 0.3236760199069977, "learning_rate": 2.211687182351149e-06, "loss": 0.0595, "num_input_tokens_seen": 30936160, "step": 34605 }, { "epoch": 9.134353965949584, "grad_norm": 0.027450822293758392, "learning_rate": 2.2076517180048993e-06, "loss": 0.0496, "num_input_tokens_seen": 30940416, "step": 34610 }, { "epoch": 9.13567374950508, "grad_norm": 0.22930891811847687, "learning_rate": 2.2036197685251834e-06, "loss": 0.0593, "num_input_tokens_seen": 30944448, "step": 34615 }, { "epoch": 9.136993533060577, "grad_norm": 0.22151413559913635, "learning_rate": 2.199591334533771e-06, "loss": 0.064, "num_input_tokens_seen": 30948672, "step": 34620 }, { "epoch": 9.138313316616076, "grad_norm": 0.06415620446205139, "learning_rate": 2.1955664166519036e-06, "loss": 0.0455, "num_input_tokens_seen": 30953248, "step": 34625 }, { "epoch": 9.139633100171572, "grad_norm": 0.11579170823097229, "learning_rate": 2.1915450155002793e-06, "loss": 0.0541, "num_input_tokens_seen": 30957760, "step": 34630 }, { "epoch": 9.140952883727069, "grad_norm": 0.09477139264345169, "learning_rate": 2.187527131699038e-06, "loss": 0.0963, "num_input_tokens_seen": 30962432, "step": 34635 }, { "epoch": 9.142272667282565, "grad_norm": 0.09049534797668457, "learning_rate": 2.18351276586779e-06, "loss": 0.0566, "num_input_tokens_seen": 30966624, "step": 34640 }, { "epoch": 9.143592450838062, "grad_norm": 0.08964726328849792, "learning_rate": 2.1795019186256092e-06, "loss": 0.0379, "num_input_tokens_seen": 30970816, "step": 34645 }, { "epoch": 9.14491223439356, "grad_norm": 0.048677053302526474, "learning_rate": 2.1754945905910094e-06, "loss": 0.088, "num_input_tokens_seen": 30975200, "step": 34650 }, { "epoch": 9.146232017949057, "grad_norm": 0.09771867841482162, "learning_rate": 2.171490782381977e-06, "loss": 0.0601, "num_input_tokens_seen": 30979680, "step": 34655 }, { "epoch": 9.147551801504553, "grad_norm": 0.05459333583712578, "learning_rate": 2.1674904946159425e-06, "loss": 0.0318, "num_input_tokens_seen": 30984160, "step": 34660 }, { "epoch": 9.14887158506005, "grad_norm": 0.17346391081809998, "learning_rate": 2.16349372790981e-06, "loss": 0.0767, "num_input_tokens_seen": 30988768, "step": 34665 }, { "epoch": 9.150191368615546, "grad_norm": 0.23871064186096191, "learning_rate": 2.159500482879928e-06, "loss": 0.0828, "num_input_tokens_seen": 30993536, "step": 34670 }, { "epoch": 9.151511152171043, "grad_norm": 0.07678995281457901, "learning_rate": 2.155510760142096e-06, "loss": 0.0707, "num_input_tokens_seen": 30998080, "step": 34675 }, { "epoch": 9.152830935726541, "grad_norm": 0.2861528694629669, "learning_rate": 2.151524560311588e-06, "loss": 0.041, "num_input_tokens_seen": 31002816, "step": 34680 }, { "epoch": 9.154150719282038, "grad_norm": 0.15417854487895966, "learning_rate": 2.147541884003129e-06, "loss": 0.0551, "num_input_tokens_seen": 31007232, "step": 34685 }, { "epoch": 9.155470502837534, "grad_norm": 0.155519500374794, "learning_rate": 2.1435627318308895e-06, "loss": 0.0512, "num_input_tokens_seen": 31011904, "step": 34690 }, { "epoch": 9.156790286393031, "grad_norm": 0.3594532907009125, "learning_rate": 2.139587104408511e-06, "loss": 0.0419, "num_input_tokens_seen": 31016640, "step": 34695 }, { "epoch": 9.158110069948528, "grad_norm": 0.20393440127372742, "learning_rate": 2.1356150023490783e-06, "loss": 0.0544, "num_input_tokens_seen": 31021440, "step": 34700 }, { "epoch": 9.159429853504026, "grad_norm": 0.09567544609308243, "learning_rate": 2.1316464262651464e-06, "loss": 0.0576, "num_input_tokens_seen": 31026016, "step": 34705 }, { "epoch": 9.160749637059522, "grad_norm": 0.16381627321243286, "learning_rate": 2.1276813767687224e-06, "loss": 0.0876, "num_input_tokens_seen": 31030688, "step": 34710 }, { "epoch": 9.162069420615019, "grad_norm": 0.19003786146640778, "learning_rate": 2.123719854471254e-06, "loss": 0.0295, "num_input_tokens_seen": 31035200, "step": 34715 }, { "epoch": 9.163389204170516, "grad_norm": 0.33006224036216736, "learning_rate": 2.119761859983668e-06, "loss": 0.0813, "num_input_tokens_seen": 31039616, "step": 34720 }, { "epoch": 9.164708987726012, "grad_norm": 0.10440487414598465, "learning_rate": 2.1158073939163386e-06, "loss": 0.0316, "num_input_tokens_seen": 31044000, "step": 34725 }, { "epoch": 9.16602877128151, "grad_norm": 0.19474081695079803, "learning_rate": 2.111856456879088e-06, "loss": 0.0558, "num_input_tokens_seen": 31048480, "step": 34730 }, { "epoch": 9.167348554837007, "grad_norm": 0.023104684427380562, "learning_rate": 2.1079090494811993e-06, "loss": 0.0653, "num_input_tokens_seen": 31052928, "step": 34735 }, { "epoch": 9.168668338392504, "grad_norm": 0.027050459757447243, "learning_rate": 2.103965172331418e-06, "loss": 0.0954, "num_input_tokens_seen": 31057408, "step": 34740 }, { "epoch": 9.169988121948, "grad_norm": 0.2050352841615677, "learning_rate": 2.100024826037933e-06, "loss": 0.0452, "num_input_tokens_seen": 31061952, "step": 34745 }, { "epoch": 9.171307905503497, "grad_norm": 0.09616685658693314, "learning_rate": 2.0960880112084027e-06, "loss": 0.0757, "num_input_tokens_seen": 31066496, "step": 34750 }, { "epoch": 9.172627689058995, "grad_norm": 0.30344218015670776, "learning_rate": 2.092154728449927e-06, "loss": 0.0536, "num_input_tokens_seen": 31071104, "step": 34755 }, { "epoch": 9.173947472614492, "grad_norm": 0.0866456851363182, "learning_rate": 2.0882249783690687e-06, "loss": 0.0205, "num_input_tokens_seen": 31075424, "step": 34760 }, { "epoch": 9.175267256169988, "grad_norm": 0.19021546840667725, "learning_rate": 2.084298761571851e-06, "loss": 0.0932, "num_input_tokens_seen": 31079584, "step": 34765 }, { "epoch": 9.176587039725485, "grad_norm": 0.1955973207950592, "learning_rate": 2.080376078663737e-06, "loss": 0.0566, "num_input_tokens_seen": 31084256, "step": 34770 }, { "epoch": 9.177906823280981, "grad_norm": 0.11902330070734024, "learning_rate": 2.0764569302496593e-06, "loss": 0.1028, "num_input_tokens_seen": 31088512, "step": 34775 }, { "epoch": 9.17922660683648, "grad_norm": 0.1522032767534256, "learning_rate": 2.0725413169339957e-06, "loss": 0.0561, "num_input_tokens_seen": 31092960, "step": 34780 }, { "epoch": 9.180546390391976, "grad_norm": 0.22749671339988708, "learning_rate": 2.068629239320588e-06, "loss": 0.065, "num_input_tokens_seen": 31097120, "step": 34785 }, { "epoch": 9.181866173947473, "grad_norm": 0.11153033375740051, "learning_rate": 2.064720698012726e-06, "loss": 0.0585, "num_input_tokens_seen": 31101632, "step": 34790 }, { "epoch": 9.18318595750297, "grad_norm": 0.15709128975868225, "learning_rate": 2.0608156936131522e-06, "loss": 0.0393, "num_input_tokens_seen": 31106240, "step": 34795 }, { "epoch": 9.184505741058466, "grad_norm": 0.2918480634689331, "learning_rate": 2.056914226724074e-06, "loss": 0.0469, "num_input_tokens_seen": 31110592, "step": 34800 }, { "epoch": 9.184505741058466, "eval_loss": 0.06680532544851303, "eval_runtime": 64.7679, "eval_samples_per_second": 103.987, "eval_steps_per_second": 26.001, "num_input_tokens_seen": 31110592, "step": 34800 }, { "epoch": 9.185825524613964, "grad_norm": 0.4884013533592224, "learning_rate": 2.0530162979471385e-06, "loss": 0.0736, "num_input_tokens_seen": 31115008, "step": 34805 }, { "epoch": 9.18714530816946, "grad_norm": 0.04927368834614754, "learning_rate": 2.0491219078834667e-06, "loss": 0.1058, "num_input_tokens_seen": 31119264, "step": 34810 }, { "epoch": 9.188465091724957, "grad_norm": 0.17808640003204346, "learning_rate": 2.045231057133612e-06, "loss": 0.031, "num_input_tokens_seen": 31123872, "step": 34815 }, { "epoch": 9.189784875280454, "grad_norm": 0.22612524032592773, "learning_rate": 2.0413437462975944e-06, "loss": 0.0524, "num_input_tokens_seen": 31128512, "step": 34820 }, { "epoch": 9.19110465883595, "grad_norm": 0.23320885002613068, "learning_rate": 2.0374599759748843e-06, "loss": 0.0906, "num_input_tokens_seen": 31133152, "step": 34825 }, { "epoch": 9.192424442391447, "grad_norm": 0.5458745956420898, "learning_rate": 2.033579746764419e-06, "loss": 0.046, "num_input_tokens_seen": 31137568, "step": 34830 }, { "epoch": 9.193744225946945, "grad_norm": 0.24943146109580994, "learning_rate": 2.029703059264565e-06, "loss": 0.0368, "num_input_tokens_seen": 31142112, "step": 34835 }, { "epoch": 9.195064009502442, "grad_norm": 0.2626420855522156, "learning_rate": 2.02582991407316e-06, "loss": 0.0624, "num_input_tokens_seen": 31146400, "step": 34840 }, { "epoch": 9.196383793057938, "grad_norm": 0.1136518269777298, "learning_rate": 2.0219603117874992e-06, "loss": 0.0567, "num_input_tokens_seen": 31150816, "step": 34845 }, { "epoch": 9.197703576613435, "grad_norm": 0.08701056241989136, "learning_rate": 2.0180942530043156e-06, "loss": 0.0286, "num_input_tokens_seen": 31155360, "step": 34850 }, { "epoch": 9.199023360168932, "grad_norm": 0.208920419216156, "learning_rate": 2.0142317383198107e-06, "loss": 0.0741, "num_input_tokens_seen": 31159968, "step": 34855 }, { "epoch": 9.20034314372443, "grad_norm": 0.32918015122413635, "learning_rate": 2.0103727683296243e-06, "loss": 0.1289, "num_input_tokens_seen": 31164256, "step": 34860 }, { "epoch": 9.201662927279926, "grad_norm": 0.09645234793424606, "learning_rate": 2.0065173436288636e-06, "loss": 0.0305, "num_input_tokens_seen": 31168704, "step": 34865 }, { "epoch": 9.202982710835423, "grad_norm": 0.06839535385370255, "learning_rate": 2.002665464812087e-06, "loss": 0.0244, "num_input_tokens_seen": 31173344, "step": 34870 }, { "epoch": 9.20430249439092, "grad_norm": 0.13438846170902252, "learning_rate": 1.998817132473291e-06, "loss": 0.0555, "num_input_tokens_seen": 31177408, "step": 34875 }, { "epoch": 9.205622277946416, "grad_norm": 0.11136575043201447, "learning_rate": 1.9949723472059507e-06, "loss": 0.0247, "num_input_tokens_seen": 31181888, "step": 34880 }, { "epoch": 9.206942061501914, "grad_norm": 0.19053339958190918, "learning_rate": 1.9911311096029726e-06, "loss": 0.0327, "num_input_tokens_seen": 31186560, "step": 34885 }, { "epoch": 9.208261845057411, "grad_norm": 0.36149442195892334, "learning_rate": 1.9872934202567224e-06, "loss": 0.0257, "num_input_tokens_seen": 31190816, "step": 34890 }, { "epoch": 9.209581628612908, "grad_norm": 0.24522201716899872, "learning_rate": 1.9834592797590257e-06, "loss": 0.0622, "num_input_tokens_seen": 31195040, "step": 34895 }, { "epoch": 9.210901412168404, "grad_norm": 0.11872867494821548, "learning_rate": 1.979628688701149e-06, "loss": 0.0285, "num_input_tokens_seen": 31199328, "step": 34900 }, { "epoch": 9.2122211957239, "grad_norm": 0.07938193529844284, "learning_rate": 1.9758016476738193e-06, "loss": 0.0569, "num_input_tokens_seen": 31204160, "step": 34905 }, { "epoch": 9.213540979279399, "grad_norm": 0.05569290369749069, "learning_rate": 1.971978157267221e-06, "loss": 0.0553, "num_input_tokens_seen": 31208416, "step": 34910 }, { "epoch": 9.214860762834896, "grad_norm": 0.25793763995170593, "learning_rate": 1.968158218070973e-06, "loss": 0.0869, "num_input_tokens_seen": 31212928, "step": 34915 }, { "epoch": 9.216180546390392, "grad_norm": 0.10364700108766556, "learning_rate": 1.9643418306741682e-06, "loss": 0.0424, "num_input_tokens_seen": 31217568, "step": 34920 }, { "epoch": 9.217500329945889, "grad_norm": 0.06759428977966309, "learning_rate": 1.9605289956653337e-06, "loss": 0.0613, "num_input_tokens_seen": 31221920, "step": 34925 }, { "epoch": 9.218820113501385, "grad_norm": 0.06491217017173767, "learning_rate": 1.9567197136324626e-06, "loss": 0.0174, "num_input_tokens_seen": 31226272, "step": 34930 }, { "epoch": 9.220139897056884, "grad_norm": 0.027667446061968803, "learning_rate": 1.9529139851629935e-06, "loss": 0.0696, "num_input_tokens_seen": 31230752, "step": 34935 }, { "epoch": 9.22145968061238, "grad_norm": 0.5168484449386597, "learning_rate": 1.949111810843812e-06, "loss": 0.0821, "num_input_tokens_seen": 31235040, "step": 34940 }, { "epoch": 9.222779464167877, "grad_norm": 0.19354355335235596, "learning_rate": 1.9453131912612694e-06, "loss": 0.1061, "num_input_tokens_seen": 31239488, "step": 34945 }, { "epoch": 9.224099247723373, "grad_norm": 0.14623889327049255, "learning_rate": 1.941518127001149e-06, "loss": 0.0806, "num_input_tokens_seen": 31243904, "step": 34950 }, { "epoch": 9.22541903127887, "grad_norm": 0.23593467473983765, "learning_rate": 1.9377266186487107e-06, "loss": 0.0476, "num_input_tokens_seen": 31248352, "step": 34955 }, { "epoch": 9.226738814834366, "grad_norm": 0.15503300726413727, "learning_rate": 1.9339386667886483e-06, "loss": 0.0247, "num_input_tokens_seen": 31252928, "step": 34960 }, { "epoch": 9.228058598389865, "grad_norm": 0.2180701494216919, "learning_rate": 1.9301542720051024e-06, "loss": 0.0405, "num_input_tokens_seen": 31257376, "step": 34965 }, { "epoch": 9.229378381945361, "grad_norm": 0.1182580441236496, "learning_rate": 1.926373434881684e-06, "loss": 0.0982, "num_input_tokens_seen": 31261984, "step": 34970 }, { "epoch": 9.230698165500858, "grad_norm": 0.2371140569448471, "learning_rate": 1.9225961560014468e-06, "loss": 0.026, "num_input_tokens_seen": 31266400, "step": 34975 }, { "epoch": 9.232017949056354, "grad_norm": 0.08317003399133682, "learning_rate": 1.918822435946885e-06, "loss": 0.0313, "num_input_tokens_seen": 31270880, "step": 34980 }, { "epoch": 9.23333773261185, "grad_norm": 0.08405255526304245, "learning_rate": 1.915052275299961e-06, "loss": 0.0966, "num_input_tokens_seen": 31275392, "step": 34985 }, { "epoch": 9.23465751616735, "grad_norm": 0.04278353601694107, "learning_rate": 1.9112856746420854e-06, "loss": 0.0719, "num_input_tokens_seen": 31279488, "step": 34990 }, { "epoch": 9.235977299722846, "grad_norm": 0.14153872430324554, "learning_rate": 1.907522634554104e-06, "loss": 0.0381, "num_input_tokens_seen": 31283584, "step": 34995 }, { "epoch": 9.237297083278342, "grad_norm": 0.5269731283187866, "learning_rate": 1.9037631556163337e-06, "loss": 0.1252, "num_input_tokens_seen": 31288160, "step": 35000 }, { "epoch": 9.237297083278342, "eval_loss": 0.06683162599802017, "eval_runtime": 64.7802, "eval_samples_per_second": 103.967, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 31288160, "step": 35000 }, { "epoch": 9.238616866833839, "grad_norm": 0.2745518982410431, "learning_rate": 1.9000072384085272e-06, "loss": 0.071, "num_input_tokens_seen": 31292672, "step": 35005 }, { "epoch": 9.239936650389335, "grad_norm": 0.16345472633838654, "learning_rate": 1.8962548835098987e-06, "loss": 0.0736, "num_input_tokens_seen": 31296992, "step": 35010 }, { "epoch": 9.241256433944834, "grad_norm": 0.19953536987304688, "learning_rate": 1.8925060914991077e-06, "loss": 0.0796, "num_input_tokens_seen": 31301472, "step": 35015 }, { "epoch": 9.24257621750033, "grad_norm": 0.3499477505683899, "learning_rate": 1.888760862954264e-06, "loss": 0.0761, "num_input_tokens_seen": 31306240, "step": 35020 }, { "epoch": 9.243896001055827, "grad_norm": 0.1604841947555542, "learning_rate": 1.8850191984529309e-06, "loss": 0.0579, "num_input_tokens_seen": 31310336, "step": 35025 }, { "epoch": 9.245215784611323, "grad_norm": 0.2717927098274231, "learning_rate": 1.8812810985721186e-06, "loss": 0.0336, "num_input_tokens_seen": 31314752, "step": 35030 }, { "epoch": 9.24653556816682, "grad_norm": 0.20730164647102356, "learning_rate": 1.8775465638882856e-06, "loss": 0.0707, "num_input_tokens_seen": 31319520, "step": 35035 }, { "epoch": 9.247855351722318, "grad_norm": 0.04383067414164543, "learning_rate": 1.8738155949773517e-06, "loss": 0.0927, "num_input_tokens_seen": 31323968, "step": 35040 }, { "epoch": 9.249175135277815, "grad_norm": 0.21296025812625885, "learning_rate": 1.8700881924146707e-06, "loss": 0.0663, "num_input_tokens_seen": 31328768, "step": 35045 }, { "epoch": 9.250494918833311, "grad_norm": 0.07832084596157074, "learning_rate": 1.8663643567750577e-06, "loss": 0.0651, "num_input_tokens_seen": 31332992, "step": 35050 }, { "epoch": 9.251814702388808, "grad_norm": 0.3525379002094269, "learning_rate": 1.8626440886327813e-06, "loss": 0.089, "num_input_tokens_seen": 31337696, "step": 35055 }, { "epoch": 9.253134485944305, "grad_norm": 0.11978326737880707, "learning_rate": 1.8589273885615432e-06, "loss": 0.0517, "num_input_tokens_seen": 31342304, "step": 35060 }, { "epoch": 9.254454269499803, "grad_norm": 0.6307327151298523, "learning_rate": 1.8552142571345133e-06, "loss": 0.1236, "num_input_tokens_seen": 31346752, "step": 35065 }, { "epoch": 9.2557740530553, "grad_norm": 0.26586130261421204, "learning_rate": 1.8515046949243025e-06, "loss": 0.0599, "num_input_tokens_seen": 31351328, "step": 35070 }, { "epoch": 9.257093836610796, "grad_norm": 0.19320237636566162, "learning_rate": 1.8477987025029674e-06, "loss": 0.0419, "num_input_tokens_seen": 31355776, "step": 35075 }, { "epoch": 9.258413620166293, "grad_norm": 0.1549493819475174, "learning_rate": 1.8440962804420232e-06, "loss": 0.097, "num_input_tokens_seen": 31360320, "step": 35080 }, { "epoch": 9.259733403721789, "grad_norm": 0.1618567258119583, "learning_rate": 1.8403974293124265e-06, "loss": 0.0763, "num_input_tokens_seen": 31364768, "step": 35085 }, { "epoch": 9.261053187277287, "grad_norm": 0.1198483258485794, "learning_rate": 1.8367021496845854e-06, "loss": 0.0538, "num_input_tokens_seen": 31368928, "step": 35090 }, { "epoch": 9.262372970832784, "grad_norm": 0.12135984748601913, "learning_rate": 1.8330104421283662e-06, "loss": 0.0811, "num_input_tokens_seen": 31373440, "step": 35095 }, { "epoch": 9.26369275438828, "grad_norm": 0.30855053663253784, "learning_rate": 1.8293223072130717e-06, "loss": 0.029, "num_input_tokens_seen": 31377728, "step": 35100 }, { "epoch": 9.265012537943777, "grad_norm": 0.19386233389377594, "learning_rate": 1.8256377455074525e-06, "loss": 0.0748, "num_input_tokens_seen": 31381888, "step": 35105 }, { "epoch": 9.266332321499274, "grad_norm": 0.32154449820518494, "learning_rate": 1.8219567575797263e-06, "loss": 0.0617, "num_input_tokens_seen": 31386208, "step": 35110 }, { "epoch": 9.26765210505477, "grad_norm": 0.27865394949913025, "learning_rate": 1.8182793439975365e-06, "loss": 0.0502, "num_input_tokens_seen": 31390464, "step": 35115 }, { "epoch": 9.268971888610269, "grad_norm": 0.189579576253891, "learning_rate": 1.8146055053279958e-06, "loss": 0.0775, "num_input_tokens_seen": 31394848, "step": 35120 }, { "epoch": 9.270291672165765, "grad_norm": 0.10871469974517822, "learning_rate": 1.8109352421376486e-06, "loss": 0.028, "num_input_tokens_seen": 31398944, "step": 35125 }, { "epoch": 9.271611455721262, "grad_norm": 0.3476543426513672, "learning_rate": 1.8072685549924972e-06, "loss": 0.0775, "num_input_tokens_seen": 31403264, "step": 35130 }, { "epoch": 9.272931239276758, "grad_norm": 0.21213936805725098, "learning_rate": 1.8036054444579982e-06, "loss": 0.0631, "num_input_tokens_seen": 31407552, "step": 35135 }, { "epoch": 9.274251022832255, "grad_norm": 0.47305503487586975, "learning_rate": 1.7999459110990407e-06, "loss": 0.0842, "num_input_tokens_seen": 31412384, "step": 35140 }, { "epoch": 9.275570806387753, "grad_norm": 0.06577346473932266, "learning_rate": 1.7962899554799712e-06, "loss": 0.0436, "num_input_tokens_seen": 31416640, "step": 35145 }, { "epoch": 9.27689058994325, "grad_norm": 0.22955182194709778, "learning_rate": 1.7926375781645937e-06, "loss": 0.0703, "num_input_tokens_seen": 31421408, "step": 35150 }, { "epoch": 9.278210373498746, "grad_norm": 0.036228280514478683, "learning_rate": 1.7889887797161359e-06, "loss": 0.0347, "num_input_tokens_seen": 31425952, "step": 35155 }, { "epoch": 9.279530157054243, "grad_norm": 0.1186600998044014, "learning_rate": 1.7853435606973028e-06, "loss": 0.0463, "num_input_tokens_seen": 31430368, "step": 35160 }, { "epoch": 9.28084994060974, "grad_norm": 0.054083045572042465, "learning_rate": 1.781701921670223e-06, "loss": 0.047, "num_input_tokens_seen": 31434848, "step": 35165 }, { "epoch": 9.282169724165238, "grad_norm": 0.09619026631116867, "learning_rate": 1.7780638631964886e-06, "loss": 0.0429, "num_input_tokens_seen": 31438880, "step": 35170 }, { "epoch": 9.283489507720734, "grad_norm": 0.32648539543151855, "learning_rate": 1.7744293858371314e-06, "loss": 0.0578, "num_input_tokens_seen": 31443200, "step": 35175 }, { "epoch": 9.28480929127623, "grad_norm": 0.11920803040266037, "learning_rate": 1.770798490152631e-06, "loss": 0.0496, "num_input_tokens_seen": 31447776, "step": 35180 }, { "epoch": 9.286129074831727, "grad_norm": 0.23472970724105835, "learning_rate": 1.767171176702917e-06, "loss": 0.0519, "num_input_tokens_seen": 31452000, "step": 35185 }, { "epoch": 9.287448858387224, "grad_norm": 0.1074891984462738, "learning_rate": 1.7635474460473755e-06, "loss": 0.0241, "num_input_tokens_seen": 31456544, "step": 35190 }, { "epoch": 9.288768641942722, "grad_norm": 0.130157932639122, "learning_rate": 1.7599272987448206e-06, "loss": 0.044, "num_input_tokens_seen": 31461088, "step": 35195 }, { "epoch": 9.290088425498219, "grad_norm": 0.14141687750816345, "learning_rate": 1.7563107353535362e-06, "loss": 0.0335, "num_input_tokens_seen": 31465760, "step": 35200 }, { "epoch": 9.290088425498219, "eval_loss": 0.066804438829422, "eval_runtime": 64.7256, "eval_samples_per_second": 104.055, "eval_steps_per_second": 26.018, "num_input_tokens_seen": 31465760, "step": 35200 }, { "epoch": 9.291408209053715, "grad_norm": 0.3006802797317505, "learning_rate": 1.7526977564312263e-06, "loss": 0.1077, "num_input_tokens_seen": 31470112, "step": 35205 }, { "epoch": 9.292727992609212, "grad_norm": 0.3751697540283203, "learning_rate": 1.7490883625350701e-06, "loss": 0.0585, "num_input_tokens_seen": 31474592, "step": 35210 }, { "epoch": 9.294047776164708, "grad_norm": 0.052346792072057724, "learning_rate": 1.7454825542216807e-06, "loss": 0.0653, "num_input_tokens_seen": 31478912, "step": 35215 }, { "epoch": 9.295367559720205, "grad_norm": 0.3811892867088318, "learning_rate": 1.7418803320471105e-06, "loss": 0.0471, "num_input_tokens_seen": 31483232, "step": 35220 }, { "epoch": 9.296687343275703, "grad_norm": 0.3555549085140228, "learning_rate": 1.7382816965668737e-06, "loss": 0.0602, "num_input_tokens_seen": 31488160, "step": 35225 }, { "epoch": 9.2980071268312, "grad_norm": 0.13075213134288788, "learning_rate": 1.7346866483359285e-06, "loss": 0.0828, "num_input_tokens_seen": 31492448, "step": 35230 }, { "epoch": 9.299326910386696, "grad_norm": 0.08017893880605698, "learning_rate": 1.7310951879086657e-06, "loss": 0.0503, "num_input_tokens_seen": 31497056, "step": 35235 }, { "epoch": 9.300646693942193, "grad_norm": 0.35733428597450256, "learning_rate": 1.7275073158389471e-06, "loss": 0.058, "num_input_tokens_seen": 31501408, "step": 35240 }, { "epoch": 9.30196647749769, "grad_norm": 0.04155261069536209, "learning_rate": 1.723923032680061e-06, "loss": 0.0369, "num_input_tokens_seen": 31505920, "step": 35245 }, { "epoch": 9.303286261053188, "grad_norm": 0.20637236535549164, "learning_rate": 1.7203423389847428e-06, "loss": 0.0494, "num_input_tokens_seen": 31510496, "step": 35250 }, { "epoch": 9.304606044608684, "grad_norm": 0.45904818177223206, "learning_rate": 1.7167652353051928e-06, "loss": 0.0716, "num_input_tokens_seen": 31515040, "step": 35255 }, { "epoch": 9.305925828164181, "grad_norm": 0.40912407636642456, "learning_rate": 1.7131917221930333e-06, "loss": 0.058, "num_input_tokens_seen": 31519424, "step": 35260 }, { "epoch": 9.307245611719678, "grad_norm": 0.535129964351654, "learning_rate": 1.7096218001993513e-06, "loss": 0.082, "num_input_tokens_seen": 31523808, "step": 35265 }, { "epoch": 9.308565395275174, "grad_norm": 0.39120668172836304, "learning_rate": 1.706055469874676e-06, "loss": 0.0864, "num_input_tokens_seen": 31528288, "step": 35270 }, { "epoch": 9.309885178830672, "grad_norm": 0.20317073166370392, "learning_rate": 1.702492731768976e-06, "loss": 0.0587, "num_input_tokens_seen": 31532768, "step": 35275 }, { "epoch": 9.311204962386169, "grad_norm": 0.4517119228839874, "learning_rate": 1.6989335864316724e-06, "loss": 0.0817, "num_input_tokens_seen": 31537120, "step": 35280 }, { "epoch": 9.312524745941666, "grad_norm": 0.42405879497528076, "learning_rate": 1.6953780344116265e-06, "loss": 0.0734, "num_input_tokens_seen": 31541888, "step": 35285 }, { "epoch": 9.313844529497162, "grad_norm": 0.3776664435863495, "learning_rate": 1.6918260762571497e-06, "loss": 0.0968, "num_input_tokens_seen": 31546624, "step": 35290 }, { "epoch": 9.315164313052659, "grad_norm": 0.27456656098365784, "learning_rate": 1.6882777125160093e-06, "loss": 0.0439, "num_input_tokens_seen": 31550688, "step": 35295 }, { "epoch": 9.316484096608157, "grad_norm": 0.08221334964036942, "learning_rate": 1.6847329437353899e-06, "loss": 0.0437, "num_input_tokens_seen": 31554848, "step": 35300 }, { "epoch": 9.317803880163654, "grad_norm": 0.13294507563114166, "learning_rate": 1.6811917704619511e-06, "loss": 0.0264, "num_input_tokens_seen": 31559488, "step": 35305 }, { "epoch": 9.31912366371915, "grad_norm": 0.5683545470237732, "learning_rate": 1.67765419324179e-06, "loss": 0.0929, "num_input_tokens_seen": 31564000, "step": 35310 }, { "epoch": 9.320443447274647, "grad_norm": 0.19954471290111542, "learning_rate": 1.6741202126204364e-06, "loss": 0.0251, "num_input_tokens_seen": 31568544, "step": 35315 }, { "epoch": 9.321763230830143, "grad_norm": 0.04624074697494507, "learning_rate": 1.6705898291428767e-06, "loss": 0.017, "num_input_tokens_seen": 31572992, "step": 35320 }, { "epoch": 9.323083014385642, "grad_norm": 0.10103760659694672, "learning_rate": 1.6670630433535395e-06, "loss": 0.0432, "num_input_tokens_seen": 31577248, "step": 35325 }, { "epoch": 9.324402797941138, "grad_norm": 0.2893691062927246, "learning_rate": 1.6635398557962979e-06, "loss": 0.1044, "num_input_tokens_seen": 31581664, "step": 35330 }, { "epoch": 9.325722581496635, "grad_norm": 0.1032041609287262, "learning_rate": 1.660020267014481e-06, "loss": 0.0688, "num_input_tokens_seen": 31586048, "step": 35335 }, { "epoch": 9.327042365052131, "grad_norm": 0.31559255719184875, "learning_rate": 1.6565042775508438e-06, "loss": 0.0336, "num_input_tokens_seen": 31590336, "step": 35340 }, { "epoch": 9.328362148607628, "grad_norm": 0.40643203258514404, "learning_rate": 1.6529918879475997e-06, "loss": 0.0914, "num_input_tokens_seen": 31594688, "step": 35345 }, { "epoch": 9.329681932163126, "grad_norm": 0.2483128160238266, "learning_rate": 1.6494830987464043e-06, "loss": 0.0382, "num_input_tokens_seen": 31599104, "step": 35350 }, { "epoch": 9.331001715718623, "grad_norm": 0.4615936279296875, "learning_rate": 1.6459779104883555e-06, "loss": 0.0796, "num_input_tokens_seen": 31603360, "step": 35355 }, { "epoch": 9.33232149927412, "grad_norm": 0.12768855690956116, "learning_rate": 1.6424763237140013e-06, "loss": 0.0264, "num_input_tokens_seen": 31607648, "step": 35360 }, { "epoch": 9.333641282829616, "grad_norm": 0.10510478168725967, "learning_rate": 1.6389783389633207e-06, "loss": 0.0681, "num_input_tokens_seen": 31611904, "step": 35365 }, { "epoch": 9.334961066385112, "grad_norm": 0.04251127690076828, "learning_rate": 1.6354839567757546e-06, "loss": 0.0202, "num_input_tokens_seen": 31616544, "step": 35370 }, { "epoch": 9.336280849940609, "grad_norm": 0.08850254118442535, "learning_rate": 1.6319931776901831e-06, "loss": 0.0648, "num_input_tokens_seen": 31621216, "step": 35375 }, { "epoch": 9.337600633496107, "grad_norm": 0.16239258646965027, "learning_rate": 1.6285060022449229e-06, "loss": 0.0557, "num_input_tokens_seen": 31625728, "step": 35380 }, { "epoch": 9.338920417051604, "grad_norm": 0.04191487282514572, "learning_rate": 1.6250224309777434e-06, "loss": 0.032, "num_input_tokens_seen": 31630304, "step": 35385 }, { "epoch": 9.3402402006071, "grad_norm": 0.40563708543777466, "learning_rate": 1.6215424644258515e-06, "loss": 0.0429, "num_input_tokens_seen": 31634688, "step": 35390 }, { "epoch": 9.341559984162597, "grad_norm": 0.28846850991249084, "learning_rate": 1.6180661031259036e-06, "loss": 0.0687, "num_input_tokens_seen": 31638816, "step": 35395 }, { "epoch": 9.342879767718093, "grad_norm": 0.19480672478675842, "learning_rate": 1.614593347613999e-06, "loss": 0.0174, "num_input_tokens_seen": 31643168, "step": 35400 }, { "epoch": 9.342879767718093, "eval_loss": 0.06670345366001129, "eval_runtime": 64.7249, "eval_samples_per_second": 104.056, "eval_steps_per_second": 26.018, "num_input_tokens_seen": 31643168, "step": 35400 }, { "epoch": 9.344199551273592, "grad_norm": 0.08062545955181122, "learning_rate": 1.6111241984256758e-06, "loss": 0.0333, "num_input_tokens_seen": 31647680, "step": 35405 }, { "epoch": 9.345519334829088, "grad_norm": 0.15169300138950348, "learning_rate": 1.6076586560959257e-06, "loss": 0.0848, "num_input_tokens_seen": 31652032, "step": 35410 }, { "epoch": 9.346839118384585, "grad_norm": 0.11037168651819229, "learning_rate": 1.604196721159182e-06, "loss": 0.0422, "num_input_tokens_seen": 31656640, "step": 35415 }, { "epoch": 9.348158901940081, "grad_norm": 0.438476026058197, "learning_rate": 1.6007383941493092e-06, "loss": 0.0759, "num_input_tokens_seen": 31661088, "step": 35420 }, { "epoch": 9.349478685495578, "grad_norm": 0.18475303053855896, "learning_rate": 1.5972836755996285e-06, "loss": 0.0365, "num_input_tokens_seen": 31665536, "step": 35425 }, { "epoch": 9.350798469051076, "grad_norm": 0.21896690130233765, "learning_rate": 1.5938325660429076e-06, "loss": 0.067, "num_input_tokens_seen": 31670016, "step": 35430 }, { "epoch": 9.352118252606573, "grad_norm": 0.2817040979862213, "learning_rate": 1.5903850660113378e-06, "loss": 0.0467, "num_input_tokens_seen": 31674432, "step": 35435 }, { "epoch": 9.35343803616207, "grad_norm": 0.1353224515914917, "learning_rate": 1.5869411760365826e-06, "loss": 0.0566, "num_input_tokens_seen": 31678944, "step": 35440 }, { "epoch": 9.354757819717566, "grad_norm": 0.3669642210006714, "learning_rate": 1.58350089664972e-06, "loss": 0.0716, "num_input_tokens_seen": 31683424, "step": 35445 }, { "epoch": 9.356077603273063, "grad_norm": 0.23217715322971344, "learning_rate": 1.5800642283812865e-06, "loss": 0.073, "num_input_tokens_seen": 31687552, "step": 35450 }, { "epoch": 9.357397386828561, "grad_norm": 0.12688122689723969, "learning_rate": 1.5766311717612698e-06, "loss": 0.0452, "num_input_tokens_seen": 31691712, "step": 35455 }, { "epoch": 9.358717170384057, "grad_norm": 0.1995408833026886, "learning_rate": 1.5732017273190818e-06, "loss": 0.0276, "num_input_tokens_seen": 31696032, "step": 35460 }, { "epoch": 9.360036953939554, "grad_norm": 0.27183258533477783, "learning_rate": 1.5697758955835806e-06, "loss": 0.0524, "num_input_tokens_seen": 31700736, "step": 35465 }, { "epoch": 9.36135673749505, "grad_norm": 0.202050119638443, "learning_rate": 1.566353677083085e-06, "loss": 0.0288, "num_input_tokens_seen": 31705536, "step": 35470 }, { "epoch": 9.362676521050547, "grad_norm": 0.5705959796905518, "learning_rate": 1.562935072345334e-06, "loss": 0.1186, "num_input_tokens_seen": 31710016, "step": 35475 }, { "epoch": 9.363996304606044, "grad_norm": 0.40491166710853577, "learning_rate": 1.5595200818975281e-06, "loss": 0.0879, "num_input_tokens_seen": 31714592, "step": 35480 }, { "epoch": 9.365316088161542, "grad_norm": 0.1162884309887886, "learning_rate": 1.5561087062662905e-06, "loss": 0.0647, "num_input_tokens_seen": 31719040, "step": 35485 }, { "epoch": 9.366635871717039, "grad_norm": 0.04190465062856674, "learning_rate": 1.5527009459777087e-06, "loss": 0.0537, "num_input_tokens_seen": 31723616, "step": 35490 }, { "epoch": 9.367955655272535, "grad_norm": 0.43932339549064636, "learning_rate": 1.5492968015572984e-06, "loss": 0.1108, "num_input_tokens_seen": 31727840, "step": 35495 }, { "epoch": 9.369275438828032, "grad_norm": 0.23918014764785767, "learning_rate": 1.5458962735300203e-06, "loss": 0.0906, "num_input_tokens_seen": 31732096, "step": 35500 }, { "epoch": 9.370595222383528, "grad_norm": 0.3836345374584198, "learning_rate": 1.54249936242028e-06, "loss": 0.0358, "num_input_tokens_seen": 31736512, "step": 35505 }, { "epoch": 9.371915005939027, "grad_norm": 0.16582264006137848, "learning_rate": 1.5391060687519222e-06, "loss": 0.0753, "num_input_tokens_seen": 31741152, "step": 35510 }, { "epoch": 9.373234789494523, "grad_norm": 0.21266137063503265, "learning_rate": 1.5357163930482367e-06, "loss": 0.0501, "num_input_tokens_seen": 31745600, "step": 35515 }, { "epoch": 9.37455457305002, "grad_norm": 0.02760969288647175, "learning_rate": 1.532330335831955e-06, "loss": 0.0315, "num_input_tokens_seen": 31750016, "step": 35520 }, { "epoch": 9.375874356605516, "grad_norm": 0.6111297607421875, "learning_rate": 1.5289478976252491e-06, "loss": 0.0773, "num_input_tokens_seen": 31754976, "step": 35525 }, { "epoch": 9.377194140161013, "grad_norm": 0.10423564165830612, "learning_rate": 1.5255690789497345e-06, "loss": 0.0312, "num_input_tokens_seen": 31759232, "step": 35530 }, { "epoch": 9.378513923716511, "grad_norm": 0.10101795196533203, "learning_rate": 1.5221938803264641e-06, "loss": 0.0454, "num_input_tokens_seen": 31763904, "step": 35535 }, { "epoch": 9.379833707272008, "grad_norm": 0.0902305319905281, "learning_rate": 1.518822302275938e-06, "loss": 0.0205, "num_input_tokens_seen": 31768320, "step": 35540 }, { "epoch": 9.381153490827504, "grad_norm": 0.47667238116264343, "learning_rate": 1.5154543453180958e-06, "loss": 0.0628, "num_input_tokens_seen": 31773056, "step": 35545 }, { "epoch": 9.382473274383, "grad_norm": 0.13630710542201996, "learning_rate": 1.5120900099723167e-06, "loss": 0.0187, "num_input_tokens_seen": 31777600, "step": 35550 }, { "epoch": 9.383793057938497, "grad_norm": 0.3094789385795593, "learning_rate": 1.5087292967574273e-06, "loss": 0.0429, "num_input_tokens_seen": 31782144, "step": 35555 }, { "epoch": 9.385112841493996, "grad_norm": 0.03494316712021828, "learning_rate": 1.5053722061916908e-06, "loss": 0.0375, "num_input_tokens_seen": 31786912, "step": 35560 }, { "epoch": 9.386432625049492, "grad_norm": 0.05522536113858223, "learning_rate": 1.5020187387928124e-06, "loss": 0.019, "num_input_tokens_seen": 31791296, "step": 35565 }, { "epoch": 9.387752408604989, "grad_norm": 0.07607729732990265, "learning_rate": 1.4986688950779343e-06, "loss": 0.0392, "num_input_tokens_seen": 31795808, "step": 35570 }, { "epoch": 9.389072192160485, "grad_norm": 0.09176502376794815, "learning_rate": 1.495322675563654e-06, "loss": 0.0596, "num_input_tokens_seen": 31800288, "step": 35575 }, { "epoch": 9.390391975715982, "grad_norm": 0.22456508874893188, "learning_rate": 1.4919800807659922e-06, "loss": 0.0412, "num_input_tokens_seen": 31804544, "step": 35580 }, { "epoch": 9.39171175927148, "grad_norm": 0.20803193747997284, "learning_rate": 1.4886411112004255e-06, "loss": 0.0258, "num_input_tokens_seen": 31808864, "step": 35585 }, { "epoch": 9.393031542826977, "grad_norm": 0.0972791388630867, "learning_rate": 1.4853057673818588e-06, "loss": 0.0172, "num_input_tokens_seen": 31813408, "step": 35590 }, { "epoch": 9.394351326382473, "grad_norm": 0.34281662106513977, "learning_rate": 1.481974049824647e-06, "loss": 0.0867, "num_input_tokens_seen": 31817632, "step": 35595 }, { "epoch": 9.39567110993797, "grad_norm": 0.3948444426059723, "learning_rate": 1.4786459590425849e-06, "loss": 0.0852, "num_input_tokens_seen": 31821856, "step": 35600 }, { "epoch": 9.39567110993797, "eval_loss": 0.0668850988149643, "eval_runtime": 64.717, "eval_samples_per_second": 104.068, "eval_steps_per_second": 26.021, "num_input_tokens_seen": 31821856, "step": 35600 }, { "epoch": 9.396990893493467, "grad_norm": 0.18731388449668884, "learning_rate": 1.4753214955489036e-06, "loss": 0.0533, "num_input_tokens_seen": 31826400, "step": 35605 }, { "epoch": 9.398310677048965, "grad_norm": 0.1431075781583786, "learning_rate": 1.4720006598562737e-06, "loss": 0.0953, "num_input_tokens_seen": 31830592, "step": 35610 }, { "epoch": 9.399630460604461, "grad_norm": 0.0659983903169632, "learning_rate": 1.4686834524768185e-06, "loss": 0.021, "num_input_tokens_seen": 31834880, "step": 35615 }, { "epoch": 9.400950244159958, "grad_norm": 0.3530988097190857, "learning_rate": 1.4653698739220844e-06, "loss": 0.0296, "num_input_tokens_seen": 31839296, "step": 35620 }, { "epoch": 9.402270027715455, "grad_norm": 0.21844987571239471, "learning_rate": 1.4620599247030715e-06, "loss": 0.0614, "num_input_tokens_seen": 31843840, "step": 35625 }, { "epoch": 9.403589811270951, "grad_norm": 0.2784656882286072, "learning_rate": 1.4587536053302125e-06, "loss": 0.0358, "num_input_tokens_seen": 31848128, "step": 35630 }, { "epoch": 9.404909594826448, "grad_norm": 0.5678418874740601, "learning_rate": 1.4554509163133862e-06, "loss": 0.0842, "num_input_tokens_seen": 31852544, "step": 35635 }, { "epoch": 9.406229378381946, "grad_norm": 0.25789692997932434, "learning_rate": 1.4521518581619098e-06, "loss": 0.0603, "num_input_tokens_seen": 31856768, "step": 35640 }, { "epoch": 9.407549161937443, "grad_norm": 0.07239355146884918, "learning_rate": 1.4488564313845348e-06, "loss": 0.0204, "num_input_tokens_seen": 31861376, "step": 35645 }, { "epoch": 9.408868945492939, "grad_norm": 0.25704944133758545, "learning_rate": 1.4455646364894603e-06, "loss": 0.0676, "num_input_tokens_seen": 31865728, "step": 35650 }, { "epoch": 9.410188729048436, "grad_norm": 0.14750495553016663, "learning_rate": 1.4422764739843247e-06, "loss": 0.1043, "num_input_tokens_seen": 31870048, "step": 35655 }, { "epoch": 9.411508512603932, "grad_norm": 0.3360860347747803, "learning_rate": 1.4389919443762e-06, "loss": 0.1965, "num_input_tokens_seen": 31874432, "step": 35660 }, { "epoch": 9.41282829615943, "grad_norm": 0.04006439819931984, "learning_rate": 1.4357110481716063e-06, "loss": 0.0496, "num_input_tokens_seen": 31878752, "step": 35665 }, { "epoch": 9.414148079714927, "grad_norm": 0.394654244184494, "learning_rate": 1.4324337858764941e-06, "loss": 0.0691, "num_input_tokens_seen": 31883328, "step": 35670 }, { "epoch": 9.415467863270424, "grad_norm": 0.16913703083992004, "learning_rate": 1.4291601579962622e-06, "loss": 0.0949, "num_input_tokens_seen": 31887776, "step": 35675 }, { "epoch": 9.41678764682592, "grad_norm": 0.322200745344162, "learning_rate": 1.42589016503574e-06, "loss": 0.0432, "num_input_tokens_seen": 31891904, "step": 35680 }, { "epoch": 9.418107430381417, "grad_norm": 0.17461909353733063, "learning_rate": 1.4226238074992099e-06, "loss": 0.0622, "num_input_tokens_seen": 31896128, "step": 35685 }, { "epoch": 9.419427213936915, "grad_norm": 0.16690261662006378, "learning_rate": 1.4193610858903778e-06, "loss": 0.061, "num_input_tokens_seen": 31900672, "step": 35690 }, { "epoch": 9.420746997492412, "grad_norm": 0.11034580320119858, "learning_rate": 1.416102000712402e-06, "loss": 0.0597, "num_input_tokens_seen": 31905184, "step": 35695 }, { "epoch": 9.422066781047908, "grad_norm": 0.21503998339176178, "learning_rate": 1.4128465524678668e-06, "loss": 0.0322, "num_input_tokens_seen": 31909504, "step": 35700 }, { "epoch": 9.423386564603405, "grad_norm": 0.1960384100675583, "learning_rate": 1.4095947416588124e-06, "loss": 0.078, "num_input_tokens_seen": 31914112, "step": 35705 }, { "epoch": 9.424706348158901, "grad_norm": 0.13333863019943237, "learning_rate": 1.4063465687866983e-06, "loss": 0.1114, "num_input_tokens_seen": 31918560, "step": 35710 }, { "epoch": 9.4260261317144, "grad_norm": 0.5157410502433777, "learning_rate": 1.4031020343524438e-06, "loss": 0.05, "num_input_tokens_seen": 31923520, "step": 35715 }, { "epoch": 9.427345915269896, "grad_norm": 0.2986643314361572, "learning_rate": 1.3998611388563926e-06, "loss": 0.0849, "num_input_tokens_seen": 31927840, "step": 35720 }, { "epoch": 9.428665698825393, "grad_norm": 0.11249393224716187, "learning_rate": 1.3966238827983314e-06, "loss": 0.0687, "num_input_tokens_seen": 31932416, "step": 35725 }, { "epoch": 9.42998548238089, "grad_norm": 0.2712875306606293, "learning_rate": 1.393390266677483e-06, "loss": 0.0749, "num_input_tokens_seen": 31936704, "step": 35730 }, { "epoch": 9.431305265936386, "grad_norm": 0.055935170501470566, "learning_rate": 1.3901602909925204e-06, "loss": 0.0861, "num_input_tokens_seen": 31941120, "step": 35735 }, { "epoch": 9.432625049491884, "grad_norm": 0.14575591683387756, "learning_rate": 1.3869339562415373e-06, "loss": 0.0886, "num_input_tokens_seen": 31945760, "step": 35740 }, { "epoch": 9.43394483304738, "grad_norm": 0.046151384711265564, "learning_rate": 1.38371126292208e-06, "loss": 0.0401, "num_input_tokens_seen": 31950336, "step": 35745 }, { "epoch": 9.435264616602877, "grad_norm": 0.20803791284561157, "learning_rate": 1.3804922115311286e-06, "loss": 0.031, "num_input_tokens_seen": 31954560, "step": 35750 }, { "epoch": 9.436584400158374, "grad_norm": 0.14611060917377472, "learning_rate": 1.3772768025650945e-06, "loss": 0.0424, "num_input_tokens_seen": 31958944, "step": 35755 }, { "epoch": 9.43790418371387, "grad_norm": 0.3255484104156494, "learning_rate": 1.3740650365198448e-06, "loss": 0.0568, "num_input_tokens_seen": 31963456, "step": 35760 }, { "epoch": 9.439223967269367, "grad_norm": 0.03696254640817642, "learning_rate": 1.3708569138906612e-06, "loss": 0.0741, "num_input_tokens_seen": 31967776, "step": 35765 }, { "epoch": 9.440543750824865, "grad_norm": 0.09446676075458527, "learning_rate": 1.367652435172287e-06, "loss": 0.0453, "num_input_tokens_seen": 31972256, "step": 35770 }, { "epoch": 9.441863534380362, "grad_norm": 0.3016657829284668, "learning_rate": 1.364451600858893e-06, "loss": 0.0484, "num_input_tokens_seen": 31976544, "step": 35775 }, { "epoch": 9.443183317935858, "grad_norm": 0.3186556100845337, "learning_rate": 1.3612544114440823e-06, "loss": 0.053, "num_input_tokens_seen": 31981408, "step": 35780 }, { "epoch": 9.444503101491355, "grad_norm": 0.27121850848197937, "learning_rate": 1.3580608674209072e-06, "loss": 0.0511, "num_input_tokens_seen": 31985600, "step": 35785 }, { "epoch": 9.445822885046852, "grad_norm": 0.05777192860841751, "learning_rate": 1.3548709692818434e-06, "loss": 0.0486, "num_input_tokens_seen": 31990016, "step": 35790 }, { "epoch": 9.44714266860235, "grad_norm": 0.17754770815372467, "learning_rate": 1.3516847175188223e-06, "loss": 0.0303, "num_input_tokens_seen": 31994240, "step": 35795 }, { "epoch": 9.448462452157846, "grad_norm": 0.1456814855337143, "learning_rate": 1.348502112623204e-06, "loss": 0.0511, "num_input_tokens_seen": 31998368, "step": 35800 }, { "epoch": 9.448462452157846, "eval_loss": 0.06698223948478699, "eval_runtime": 64.7626, "eval_samples_per_second": 103.995, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 31998368, "step": 35800 }, { "epoch": 9.449782235713343, "grad_norm": 0.3033802807331085, "learning_rate": 1.3453231550857787e-06, "loss": 0.0647, "num_input_tokens_seen": 32002880, "step": 35805 }, { "epoch": 9.45110201926884, "grad_norm": 0.11059413850307465, "learning_rate": 1.3421478453967878e-06, "loss": 0.0327, "num_input_tokens_seen": 32007552, "step": 35810 }, { "epoch": 9.452421802824336, "grad_norm": 0.12480597198009491, "learning_rate": 1.3389761840459065e-06, "loss": 0.0317, "num_input_tokens_seen": 32012000, "step": 35815 }, { "epoch": 9.453741586379834, "grad_norm": 0.33015137910842896, "learning_rate": 1.3358081715222376e-06, "loss": 0.0873, "num_input_tokens_seen": 32016544, "step": 35820 }, { "epoch": 9.455061369935331, "grad_norm": 0.34156304597854614, "learning_rate": 1.3326438083143295e-06, "loss": 0.057, "num_input_tokens_seen": 32021376, "step": 35825 }, { "epoch": 9.456381153490828, "grad_norm": 0.16523858904838562, "learning_rate": 1.3294830949101723e-06, "loss": 0.0347, "num_input_tokens_seen": 32025824, "step": 35830 }, { "epoch": 9.457700937046324, "grad_norm": 0.13245216012001038, "learning_rate": 1.3263260317971815e-06, "loss": 0.0603, "num_input_tokens_seen": 32030368, "step": 35835 }, { "epoch": 9.45902072060182, "grad_norm": 0.09716157615184784, "learning_rate": 1.3231726194622208e-06, "loss": 0.0405, "num_input_tokens_seen": 32034848, "step": 35840 }, { "epoch": 9.460340504157319, "grad_norm": 0.2539297342300415, "learning_rate": 1.3200228583915814e-06, "loss": 0.0466, "num_input_tokens_seen": 32039296, "step": 35845 }, { "epoch": 9.461660287712816, "grad_norm": 0.07760278135538101, "learning_rate": 1.3168767490709971e-06, "loss": 0.0486, "num_input_tokens_seen": 32043648, "step": 35850 }, { "epoch": 9.462980071268312, "grad_norm": 0.21851150691509247, "learning_rate": 1.3137342919856437e-06, "loss": 0.0218, "num_input_tokens_seen": 32048352, "step": 35855 }, { "epoch": 9.464299854823809, "grad_norm": 0.2527139484882355, "learning_rate": 1.310595487620117e-06, "loss": 0.0531, "num_input_tokens_seen": 32052736, "step": 35860 }, { "epoch": 9.465619638379305, "grad_norm": 0.3916867673397064, "learning_rate": 1.3074603364584715e-06, "loss": 0.07, "num_input_tokens_seen": 32056896, "step": 35865 }, { "epoch": 9.466939421934804, "grad_norm": 0.2598065137863159, "learning_rate": 1.3043288389841758e-06, "loss": 0.0478, "num_input_tokens_seen": 32061440, "step": 35870 }, { "epoch": 9.4682592054903, "grad_norm": 0.1841055303812027, "learning_rate": 1.3012009956801546e-06, "loss": 0.0388, "num_input_tokens_seen": 32065536, "step": 35875 }, { "epoch": 9.469578989045797, "grad_norm": 0.22664779424667358, "learning_rate": 1.2980768070287586e-06, "loss": 0.065, "num_input_tokens_seen": 32070112, "step": 35880 }, { "epoch": 9.470898772601293, "grad_norm": 0.16580571234226227, "learning_rate": 1.2949562735117716e-06, "loss": 0.07, "num_input_tokens_seen": 32075040, "step": 35885 }, { "epoch": 9.47221855615679, "grad_norm": 0.060632552951574326, "learning_rate": 1.291839395610428e-06, "loss": 0.0479, "num_input_tokens_seen": 32079520, "step": 35890 }, { "epoch": 9.473538339712288, "grad_norm": 0.06873712688684464, "learning_rate": 1.2887261738053852e-06, "loss": 0.0416, "num_input_tokens_seen": 32083808, "step": 35895 }, { "epoch": 9.474858123267785, "grad_norm": 0.43802589178085327, "learning_rate": 1.2856166085767396e-06, "loss": 0.07, "num_input_tokens_seen": 32088000, "step": 35900 }, { "epoch": 9.476177906823281, "grad_norm": 0.12196921557188034, "learning_rate": 1.2825107004040272e-06, "loss": 0.027, "num_input_tokens_seen": 32092224, "step": 35905 }, { "epoch": 9.477497690378778, "grad_norm": 0.1386118084192276, "learning_rate": 1.2794084497662146e-06, "loss": 0.0565, "num_input_tokens_seen": 32096928, "step": 35910 }, { "epoch": 9.478817473934274, "grad_norm": 0.48510780930519104, "learning_rate": 1.276309857141711e-06, "loss": 0.0555, "num_input_tokens_seen": 32101440, "step": 35915 }, { "epoch": 9.48013725748977, "grad_norm": 0.060436997562646866, "learning_rate": 1.273214923008359e-06, "loss": 0.0756, "num_input_tokens_seen": 32105760, "step": 35920 }, { "epoch": 9.48145704104527, "grad_norm": 0.6558877229690552, "learning_rate": 1.2701236478434352e-06, "loss": 0.0598, "num_input_tokens_seen": 32110048, "step": 35925 }, { "epoch": 9.482776824600766, "grad_norm": 0.039301253855228424, "learning_rate": 1.2670360321236502e-06, "loss": 0.048, "num_input_tokens_seen": 32114368, "step": 35930 }, { "epoch": 9.484096608156262, "grad_norm": 0.22750680148601532, "learning_rate": 1.2639520763251617e-06, "loss": 0.0913, "num_input_tokens_seen": 32118912, "step": 35935 }, { "epoch": 9.485416391711759, "grad_norm": 0.08800280094146729, "learning_rate": 1.2608717809235448e-06, "loss": 0.0607, "num_input_tokens_seen": 32123296, "step": 35940 }, { "epoch": 9.486736175267255, "grad_norm": 0.31629496812820435, "learning_rate": 1.2577951463938282e-06, "loss": 0.0744, "num_input_tokens_seen": 32127584, "step": 35945 }, { "epoch": 9.488055958822754, "grad_norm": 0.03660682961344719, "learning_rate": 1.2547221732104569e-06, "loss": 0.0404, "num_input_tokens_seen": 32132000, "step": 35950 }, { "epoch": 9.48937574237825, "grad_norm": 0.11558277904987335, "learning_rate": 1.25165286184733e-06, "loss": 0.0521, "num_input_tokens_seen": 32136672, "step": 35955 }, { "epoch": 9.490695525933747, "grad_norm": 0.2368285059928894, "learning_rate": 1.248587212777777e-06, "loss": 0.067, "num_input_tokens_seen": 32141024, "step": 35960 }, { "epoch": 9.492015309489243, "grad_norm": 0.0584837831556797, "learning_rate": 1.2455252264745532e-06, "loss": 0.0237, "num_input_tokens_seen": 32145568, "step": 35965 }, { "epoch": 9.49333509304474, "grad_norm": 0.2167876958847046, "learning_rate": 1.2424669034098528e-06, "loss": 0.0577, "num_input_tokens_seen": 32150176, "step": 35970 }, { "epoch": 9.494654876600238, "grad_norm": 0.2450532168149948, "learning_rate": 1.2394122440553185e-06, "loss": 0.041, "num_input_tokens_seen": 32154752, "step": 35975 }, { "epoch": 9.495974660155735, "grad_norm": 0.47548699378967285, "learning_rate": 1.2363612488820037e-06, "loss": 0.0955, "num_input_tokens_seen": 32159456, "step": 35980 }, { "epoch": 9.497294443711231, "grad_norm": 0.21347351372241974, "learning_rate": 1.2333139183604208e-06, "loss": 0.085, "num_input_tokens_seen": 32164224, "step": 35985 }, { "epoch": 9.498614227266728, "grad_norm": 0.14385399222373962, "learning_rate": 1.2302702529604998e-06, "loss": 0.0311, "num_input_tokens_seen": 32168864, "step": 35990 }, { "epoch": 9.499934010822225, "grad_norm": 0.12284520268440247, "learning_rate": 1.227230253151615e-06, "loss": 0.0398, "num_input_tokens_seen": 32173504, "step": 35995 }, { "epoch": 9.501253794377723, "grad_norm": 0.08230874687433243, "learning_rate": 1.2241939194025748e-06, "loss": 0.0541, "num_input_tokens_seen": 32178176, "step": 36000 }, { "epoch": 9.501253794377723, "eval_loss": 0.06689131259918213, "eval_runtime": 64.7572, "eval_samples_per_second": 104.004, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 32178176, "step": 36000 }, { "epoch": 9.50257357793322, "grad_norm": 0.05430793762207031, "learning_rate": 1.2211612521816156e-06, "loss": 0.0998, "num_input_tokens_seen": 32182592, "step": 36005 }, { "epoch": 9.503893361488716, "grad_norm": 0.06356596946716309, "learning_rate": 1.2181322519564137e-06, "loss": 0.0333, "num_input_tokens_seen": 32187360, "step": 36010 }, { "epoch": 9.505213145044213, "grad_norm": 0.16719351708889008, "learning_rate": 1.2151069191940839e-06, "loss": 0.0973, "num_input_tokens_seen": 32191808, "step": 36015 }, { "epoch": 9.506532928599709, "grad_norm": 0.21490101516246796, "learning_rate": 1.2120852543611644e-06, "loss": 0.0855, "num_input_tokens_seen": 32196352, "step": 36020 }, { "epoch": 9.507852712155206, "grad_norm": 0.19186356663703918, "learning_rate": 1.2090672579236379e-06, "loss": 0.0457, "num_input_tokens_seen": 32200864, "step": 36025 }, { "epoch": 9.509172495710704, "grad_norm": 0.3806540369987488, "learning_rate": 1.2060529303469126e-06, "loss": 0.0498, "num_input_tokens_seen": 32205472, "step": 36030 }, { "epoch": 9.5104922792662, "grad_norm": 0.09993086755275726, "learning_rate": 1.2030422720958445e-06, "loss": 0.0284, "num_input_tokens_seen": 32209888, "step": 36035 }, { "epoch": 9.511812062821697, "grad_norm": 0.18758989870548248, "learning_rate": 1.200035283634704e-06, "loss": 0.0331, "num_input_tokens_seen": 32214336, "step": 36040 }, { "epoch": 9.513131846377194, "grad_norm": 0.032845307141542435, "learning_rate": 1.1970319654272144e-06, "loss": 0.0421, "num_input_tokens_seen": 32218816, "step": 36045 }, { "epoch": 9.51445162993269, "grad_norm": 0.1916522979736328, "learning_rate": 1.1940323179365192e-06, "loss": 0.0951, "num_input_tokens_seen": 32223200, "step": 36050 }, { "epoch": 9.515771413488189, "grad_norm": 0.026084665209054947, "learning_rate": 1.1910363416252095e-06, "loss": 0.016, "num_input_tokens_seen": 32227488, "step": 36055 }, { "epoch": 9.517091197043685, "grad_norm": 0.22550491988658905, "learning_rate": 1.1880440369552964e-06, "loss": 0.0449, "num_input_tokens_seen": 32231872, "step": 36060 }, { "epoch": 9.518410980599182, "grad_norm": 0.2937047779560089, "learning_rate": 1.1850554043882328e-06, "loss": 0.0637, "num_input_tokens_seen": 32236160, "step": 36065 }, { "epoch": 9.519730764154678, "grad_norm": 0.23751212656497955, "learning_rate": 1.1820704443849028e-06, "loss": 0.0426, "num_input_tokens_seen": 32240704, "step": 36070 }, { "epoch": 9.521050547710175, "grad_norm": 0.08320045471191406, "learning_rate": 1.1790891574056219e-06, "loss": 0.0364, "num_input_tokens_seen": 32245088, "step": 36075 }, { "epoch": 9.522370331265673, "grad_norm": 0.33854562044143677, "learning_rate": 1.1761115439101523e-06, "loss": 0.091, "num_input_tokens_seen": 32249632, "step": 36080 }, { "epoch": 9.52369011482117, "grad_norm": 0.1391405314207077, "learning_rate": 1.1731376043576659e-06, "loss": 0.0339, "num_input_tokens_seen": 32254176, "step": 36085 }, { "epoch": 9.525009898376666, "grad_norm": 0.5929086804389954, "learning_rate": 1.1701673392067875e-06, "loss": 0.1383, "num_input_tokens_seen": 32258528, "step": 36090 }, { "epoch": 9.526329681932163, "grad_norm": 0.21276335418224335, "learning_rate": 1.1672007489155757e-06, "loss": 0.0709, "num_input_tokens_seen": 32262688, "step": 36095 }, { "epoch": 9.52764946548766, "grad_norm": 0.08716940879821777, "learning_rate": 1.164237833941506e-06, "loss": 0.0516, "num_input_tokens_seen": 32267520, "step": 36100 }, { "epoch": 9.528969249043158, "grad_norm": 0.05854455381631851, "learning_rate": 1.1612785947415022e-06, "loss": 0.0611, "num_input_tokens_seen": 32271808, "step": 36105 }, { "epoch": 9.530289032598654, "grad_norm": 0.461655855178833, "learning_rate": 1.1583230317719185e-06, "loss": 0.0717, "num_input_tokens_seen": 32276288, "step": 36110 }, { "epoch": 9.53160881615415, "grad_norm": 0.15409792959690094, "learning_rate": 1.1553711454885318e-06, "loss": 0.0509, "num_input_tokens_seen": 32280512, "step": 36115 }, { "epoch": 9.532928599709647, "grad_norm": 0.15361681580543518, "learning_rate": 1.152422936346567e-06, "loss": 0.0405, "num_input_tokens_seen": 32284832, "step": 36120 }, { "epoch": 9.534248383265144, "grad_norm": 0.1338394731283188, "learning_rate": 1.1494784048006718e-06, "loss": 0.0568, "num_input_tokens_seen": 32289440, "step": 36125 }, { "epoch": 9.535568166820642, "grad_norm": 0.1780836284160614, "learning_rate": 1.1465375513049326e-06, "loss": 0.0495, "num_input_tokens_seen": 32294112, "step": 36130 }, { "epoch": 9.536887950376139, "grad_norm": 0.3479895293712616, "learning_rate": 1.1436003763128616e-06, "loss": 0.0618, "num_input_tokens_seen": 32298528, "step": 36135 }, { "epoch": 9.538207733931635, "grad_norm": 0.24477459490299225, "learning_rate": 1.1406668802774106e-06, "loss": 0.0537, "num_input_tokens_seen": 32303072, "step": 36140 }, { "epoch": 9.539527517487132, "grad_norm": 0.19615603983402252, "learning_rate": 1.137737063650965e-06, "loss": 0.0965, "num_input_tokens_seen": 32307712, "step": 36145 }, { "epoch": 9.540847301042628, "grad_norm": 0.22162428498268127, "learning_rate": 1.1348109268853323e-06, "loss": 0.0616, "num_input_tokens_seen": 32312256, "step": 36150 }, { "epoch": 9.542167084598127, "grad_norm": 0.15427495539188385, "learning_rate": 1.1318884704317634e-06, "loss": 0.0343, "num_input_tokens_seen": 32316672, "step": 36155 }, { "epoch": 9.543486868153623, "grad_norm": 0.27769675850868225, "learning_rate": 1.1289696947409417e-06, "loss": 0.0506, "num_input_tokens_seen": 32321024, "step": 36160 }, { "epoch": 9.54480665170912, "grad_norm": 0.11984102427959442, "learning_rate": 1.126054600262974e-06, "loss": 0.0781, "num_input_tokens_seen": 32325728, "step": 36165 }, { "epoch": 9.546126435264616, "grad_norm": 0.25897517800331116, "learning_rate": 1.1231431874474064e-06, "loss": 0.0445, "num_input_tokens_seen": 32330112, "step": 36170 }, { "epoch": 9.547446218820113, "grad_norm": 0.12067486345767975, "learning_rate": 1.12023545674321e-06, "loss": 0.0695, "num_input_tokens_seen": 32334496, "step": 36175 }, { "epoch": 9.54876600237561, "grad_norm": 0.38571301102638245, "learning_rate": 1.117331408598804e-06, "loss": 0.1041, "num_input_tokens_seen": 32338848, "step": 36180 }, { "epoch": 9.550085785931108, "grad_norm": 0.23605024814605713, "learning_rate": 1.1144310434620191e-06, "loss": 0.0316, "num_input_tokens_seen": 32343296, "step": 36185 }, { "epoch": 9.551405569486604, "grad_norm": 0.20349544286727905, "learning_rate": 1.1115343617801365e-06, "loss": 0.0992, "num_input_tokens_seen": 32347904, "step": 36190 }, { "epoch": 9.552725353042101, "grad_norm": 0.4745585322380066, "learning_rate": 1.1086413639998515e-06, "loss": 0.0865, "num_input_tokens_seen": 32352320, "step": 36195 }, { "epoch": 9.554045136597598, "grad_norm": 0.1492098867893219, "learning_rate": 1.1057520505673103e-06, "loss": 0.0533, "num_input_tokens_seen": 32356768, "step": 36200 }, { "epoch": 9.554045136597598, "eval_loss": 0.06703390926122665, "eval_runtime": 64.7595, "eval_samples_per_second": 104.0, "eval_steps_per_second": 26.004, "num_input_tokens_seen": 32356768, "step": 36200 }, { "epoch": 9.555364920153094, "grad_norm": 0.1634102314710617, "learning_rate": 1.1028664219280727e-06, "loss": 0.046, "num_input_tokens_seen": 32361344, "step": 36205 }, { "epoch": 9.556684703708592, "grad_norm": 0.1981695592403412, "learning_rate": 1.0999844785271468e-06, "loss": 0.0382, "num_input_tokens_seen": 32366048, "step": 36210 }, { "epoch": 9.558004487264089, "grad_norm": 0.05717066675424576, "learning_rate": 1.097106220808955e-06, "loss": 0.0367, "num_input_tokens_seen": 32370304, "step": 36215 }, { "epoch": 9.559324270819586, "grad_norm": 0.200364887714386, "learning_rate": 1.0942316492173698e-06, "loss": 0.0428, "num_input_tokens_seen": 32374720, "step": 36220 }, { "epoch": 9.560644054375082, "grad_norm": 0.11499189585447311, "learning_rate": 1.0913607641956841e-06, "loss": 0.0232, "num_input_tokens_seen": 32378944, "step": 36225 }, { "epoch": 9.561963837930579, "grad_norm": 0.08159168809652328, "learning_rate": 1.0884935661866213e-06, "loss": 0.0511, "num_input_tokens_seen": 32383392, "step": 36230 }, { "epoch": 9.563283621486077, "grad_norm": 0.1983921229839325, "learning_rate": 1.0856300556323418e-06, "loss": 0.0822, "num_input_tokens_seen": 32387840, "step": 36235 }, { "epoch": 9.564603405041574, "grad_norm": 0.0377018116414547, "learning_rate": 1.0827702329744365e-06, "loss": 0.0575, "num_input_tokens_seen": 32392224, "step": 36240 }, { "epoch": 9.56592318859707, "grad_norm": 0.08685025572776794, "learning_rate": 1.0799140986539197e-06, "loss": 0.0431, "num_input_tokens_seen": 32396800, "step": 36245 }, { "epoch": 9.567242972152567, "grad_norm": 0.20333470404148102, "learning_rate": 1.0770616531112526e-06, "loss": 0.0577, "num_input_tokens_seen": 32401216, "step": 36250 }, { "epoch": 9.568562755708063, "grad_norm": 0.44525718688964844, "learning_rate": 1.0742128967863085e-06, "loss": 0.0638, "num_input_tokens_seen": 32405888, "step": 36255 }, { "epoch": 9.569882539263562, "grad_norm": 0.4472443163394928, "learning_rate": 1.071367830118411e-06, "loss": 0.1023, "num_input_tokens_seen": 32410816, "step": 36260 }, { "epoch": 9.571202322819058, "grad_norm": 0.2957662045955658, "learning_rate": 1.068526453546298e-06, "loss": 0.0906, "num_input_tokens_seen": 32415328, "step": 36265 }, { "epoch": 9.572522106374555, "grad_norm": 0.13187162578105927, "learning_rate": 1.0656887675081467e-06, "loss": 0.0482, "num_input_tokens_seen": 32419616, "step": 36270 }, { "epoch": 9.573841889930051, "grad_norm": 0.3729113042354584, "learning_rate": 1.0628547724415628e-06, "loss": 0.0511, "num_input_tokens_seen": 32424160, "step": 36275 }, { "epoch": 9.575161673485548, "grad_norm": 0.30321475863456726, "learning_rate": 1.0600244687835881e-06, "loss": 0.0874, "num_input_tokens_seen": 32428608, "step": 36280 }, { "epoch": 9.576481457041044, "grad_norm": 0.23957787454128265, "learning_rate": 1.0571978569706876e-06, "loss": 0.0547, "num_input_tokens_seen": 32433536, "step": 36285 }, { "epoch": 9.577801240596543, "grad_norm": 0.346936970949173, "learning_rate": 1.0543749374387652e-06, "loss": 0.0447, "num_input_tokens_seen": 32438144, "step": 36290 }, { "epoch": 9.57912102415204, "grad_norm": 0.041708122938871384, "learning_rate": 1.051555710623142e-06, "loss": 0.0743, "num_input_tokens_seen": 32442720, "step": 36295 }, { "epoch": 9.580440807707536, "grad_norm": 0.25884315371513367, "learning_rate": 1.0487401769585847e-06, "loss": 0.0285, "num_input_tokens_seen": 32447072, "step": 36300 }, { "epoch": 9.581760591263032, "grad_norm": 0.21954306960105896, "learning_rate": 1.0459283368792845e-06, "loss": 0.1061, "num_input_tokens_seen": 32451680, "step": 36305 }, { "epoch": 9.583080374818529, "grad_norm": 0.35791873931884766, "learning_rate": 1.043120190818858e-06, "loss": 0.0734, "num_input_tokens_seen": 32456256, "step": 36310 }, { "epoch": 9.584400158374027, "grad_norm": 0.6011964678764343, "learning_rate": 1.0403157392103596e-06, "loss": 0.0976, "num_input_tokens_seen": 32461120, "step": 36315 }, { "epoch": 9.585719941929524, "grad_norm": 0.3216552138328552, "learning_rate": 1.0375149824862735e-06, "loss": 0.0704, "num_input_tokens_seen": 32465728, "step": 36320 }, { "epoch": 9.58703972548502, "grad_norm": 0.10040421783924103, "learning_rate": 1.034717921078507e-06, "loss": 0.1019, "num_input_tokens_seen": 32470048, "step": 36325 }, { "epoch": 9.588359509040517, "grad_norm": 0.34894701838493347, "learning_rate": 1.0319245554184009e-06, "loss": 0.0328, "num_input_tokens_seen": 32474656, "step": 36330 }, { "epoch": 9.589679292596013, "grad_norm": 0.0894196480512619, "learning_rate": 1.0291348859367361e-06, "loss": 0.1103, "num_input_tokens_seen": 32478976, "step": 36335 }, { "epoch": 9.590999076151512, "grad_norm": 0.42518916726112366, "learning_rate": 1.0263489130637016e-06, "loss": 0.0977, "num_input_tokens_seen": 32483680, "step": 36340 }, { "epoch": 9.592318859707008, "grad_norm": 0.05858302861452103, "learning_rate": 1.0235666372289427e-06, "loss": 0.086, "num_input_tokens_seen": 32488128, "step": 36345 }, { "epoch": 9.593638643262505, "grad_norm": 0.14783096313476562, "learning_rate": 1.0207880588615076e-06, "loss": 0.046, "num_input_tokens_seen": 32492704, "step": 36350 }, { "epoch": 9.594958426818001, "grad_norm": 0.2992841303348541, "learning_rate": 1.0180131783898984e-06, "loss": 0.0853, "num_input_tokens_seen": 32497152, "step": 36355 }, { "epoch": 9.596278210373498, "grad_norm": 0.33625030517578125, "learning_rate": 1.0152419962420362e-06, "loss": 0.0695, "num_input_tokens_seen": 32501536, "step": 36360 }, { "epoch": 9.597597993928996, "grad_norm": 0.10060246288776398, "learning_rate": 1.0124745128452685e-06, "loss": 0.0573, "num_input_tokens_seen": 32505856, "step": 36365 }, { "epoch": 9.598917777484493, "grad_norm": 0.1423308402299881, "learning_rate": 1.0097107286263758e-06, "loss": 0.0287, "num_input_tokens_seen": 32510592, "step": 36370 }, { "epoch": 9.60023756103999, "grad_norm": 0.08338498324155807, "learning_rate": 1.00695064401157e-06, "loss": 0.053, "num_input_tokens_seen": 32515008, "step": 36375 }, { "epoch": 9.601557344595486, "grad_norm": 0.24650920927524567, "learning_rate": 1.0041942594264886e-06, "loss": 0.0654, "num_input_tokens_seen": 32519680, "step": 36380 }, { "epoch": 9.602877128150983, "grad_norm": 0.291219025850296, "learning_rate": 1.001441575296208e-06, "loss": 0.0513, "num_input_tokens_seen": 32524064, "step": 36385 }, { "epoch": 9.604196911706481, "grad_norm": 0.09708461165428162, "learning_rate": 9.986925920452139e-07, "loss": 0.0209, "num_input_tokens_seen": 32528864, "step": 36390 }, { "epoch": 9.605516695261977, "grad_norm": 0.40332335233688354, "learning_rate": 9.959473100974475e-07, "loss": 0.1139, "num_input_tokens_seen": 32533440, "step": 36395 }, { "epoch": 9.606836478817474, "grad_norm": 0.43742066621780396, "learning_rate": 9.932057298762564e-07, "loss": 0.1163, "num_input_tokens_seen": 32537792, "step": 36400 }, { "epoch": 9.606836478817474, "eval_loss": 0.06680143624544144, "eval_runtime": 64.7362, "eval_samples_per_second": 104.038, "eval_steps_per_second": 26.013, "num_input_tokens_seen": 32537792, "step": 36400 }, { "epoch": 9.60815626237297, "grad_norm": 0.46489986777305603, "learning_rate": 9.90467851804433e-07, "loss": 0.1172, "num_input_tokens_seen": 32542528, "step": 36405 }, { "epoch": 9.609476045928467, "grad_norm": 0.29599010944366455, "learning_rate": 9.877336763041895e-07, "loss": 0.0525, "num_input_tokens_seen": 32546784, "step": 36410 }, { "epoch": 9.610795829483965, "grad_norm": 0.05403849855065346, "learning_rate": 9.850032037971662e-07, "loss": 0.0482, "num_input_tokens_seen": 32551040, "step": 36415 }, { "epoch": 9.612115613039462, "grad_norm": 0.15851284563541412, "learning_rate": 9.822764347044406e-07, "loss": 0.0247, "num_input_tokens_seen": 32555456, "step": 36420 }, { "epoch": 9.613435396594959, "grad_norm": 0.08824492990970612, "learning_rate": 9.795533694465175e-07, "loss": 0.043, "num_input_tokens_seen": 32559840, "step": 36425 }, { "epoch": 9.614755180150455, "grad_norm": 0.24742934107780457, "learning_rate": 9.768340084433197e-07, "loss": 0.0498, "num_input_tokens_seen": 32564064, "step": 36430 }, { "epoch": 9.616074963705952, "grad_norm": 0.2141820639371872, "learning_rate": 9.741183521142143e-07, "loss": 0.1092, "num_input_tokens_seen": 32568736, "step": 36435 }, { "epoch": 9.61739474726145, "grad_norm": 0.19324392080307007, "learning_rate": 9.714064008779889e-07, "loss": 0.0383, "num_input_tokens_seen": 32572864, "step": 36440 }, { "epoch": 9.618714530816947, "grad_norm": 0.5264033675193787, "learning_rate": 9.686981551528584e-07, "loss": 0.0754, "num_input_tokens_seen": 32577216, "step": 36445 }, { "epoch": 9.620034314372443, "grad_norm": 0.09812907874584198, "learning_rate": 9.65993615356467e-07, "loss": 0.0214, "num_input_tokens_seen": 32581504, "step": 36450 }, { "epoch": 9.62135409792794, "grad_norm": 0.11764270812273026, "learning_rate": 9.632927819058917e-07, "loss": 0.0573, "num_input_tokens_seen": 32586144, "step": 36455 }, { "epoch": 9.622673881483436, "grad_norm": 0.13817362487316132, "learning_rate": 9.605956552176305e-07, "loss": 0.042, "num_input_tokens_seen": 32590688, "step": 36460 }, { "epoch": 9.623993665038933, "grad_norm": 0.5800774693489075, "learning_rate": 9.579022357076223e-07, "loss": 0.0741, "num_input_tokens_seen": 32595296, "step": 36465 }, { "epoch": 9.625313448594431, "grad_norm": 0.058949973434209824, "learning_rate": 9.552125237912158e-07, "loss": 0.0316, "num_input_tokens_seen": 32599648, "step": 36470 }, { "epoch": 9.626633232149928, "grad_norm": 0.1207602247595787, "learning_rate": 9.525265198832096e-07, "loss": 0.0809, "num_input_tokens_seen": 32604000, "step": 36475 }, { "epoch": 9.627953015705424, "grad_norm": 0.7709706425666809, "learning_rate": 9.498442243978112e-07, "loss": 0.0914, "num_input_tokens_seen": 32608416, "step": 36480 }, { "epoch": 9.62927279926092, "grad_norm": 0.13213668763637543, "learning_rate": 9.471656377486649e-07, "loss": 0.0349, "num_input_tokens_seen": 32612832, "step": 36485 }, { "epoch": 9.630592582816417, "grad_norm": 0.08719389885663986, "learning_rate": 9.444907603488456e-07, "loss": 0.0639, "num_input_tokens_seen": 32617408, "step": 36490 }, { "epoch": 9.631912366371916, "grad_norm": 0.08487800508737564, "learning_rate": 9.418195926108514e-07, "loss": 0.0546, "num_input_tokens_seen": 32621856, "step": 36495 }, { "epoch": 9.633232149927412, "grad_norm": 0.2714018225669861, "learning_rate": 9.391521349466053e-07, "loss": 0.0473, "num_input_tokens_seen": 32626176, "step": 36500 }, { "epoch": 9.634551933482909, "grad_norm": 0.30420243740081787, "learning_rate": 9.364883877674758e-07, "loss": 0.0773, "num_input_tokens_seen": 32630688, "step": 36505 }, { "epoch": 9.635871717038405, "grad_norm": 0.6089054942131042, "learning_rate": 9.33828351484231e-07, "loss": 0.0818, "num_input_tokens_seen": 32634976, "step": 36510 }, { "epoch": 9.637191500593902, "grad_norm": 0.05980704724788666, "learning_rate": 9.311720265070906e-07, "loss": 0.0575, "num_input_tokens_seen": 32639584, "step": 36515 }, { "epoch": 9.6385112841494, "grad_norm": 0.11449901759624481, "learning_rate": 9.285194132456931e-07, "loss": 0.0412, "num_input_tokens_seen": 32644128, "step": 36520 }, { "epoch": 9.639831067704897, "grad_norm": 0.36789384484291077, "learning_rate": 9.258705121091032e-07, "loss": 0.0732, "num_input_tokens_seen": 32648480, "step": 36525 }, { "epoch": 9.641150851260393, "grad_norm": 0.26520252227783203, "learning_rate": 9.232253235058136e-07, "loss": 0.0421, "num_input_tokens_seen": 32653216, "step": 36530 }, { "epoch": 9.64247063481589, "grad_norm": 0.04465131461620331, "learning_rate": 9.205838478437478e-07, "loss": 0.0296, "num_input_tokens_seen": 32657920, "step": 36535 }, { "epoch": 9.643790418371387, "grad_norm": 0.058188553899526596, "learning_rate": 9.179460855302524e-07, "loss": 0.0551, "num_input_tokens_seen": 32662432, "step": 36540 }, { "epoch": 9.645110201926883, "grad_norm": 0.12666666507720947, "learning_rate": 9.153120369721046e-07, "loss": 0.0831, "num_input_tokens_seen": 32666944, "step": 36545 }, { "epoch": 9.646429985482381, "grad_norm": 0.03768955171108246, "learning_rate": 9.126817025755103e-07, "loss": 0.0564, "num_input_tokens_seen": 32671680, "step": 36550 }, { "epoch": 9.647749769037878, "grad_norm": 0.5245255827903748, "learning_rate": 9.100550827460947e-07, "loss": 0.0873, "num_input_tokens_seen": 32676128, "step": 36555 }, { "epoch": 9.649069552593375, "grad_norm": 0.37384510040283203, "learning_rate": 9.0743217788892e-07, "loss": 0.1058, "num_input_tokens_seen": 32680544, "step": 36560 }, { "epoch": 9.650389336148871, "grad_norm": 0.1574496477842331, "learning_rate": 9.048129884084683e-07, "loss": 0.074, "num_input_tokens_seen": 32684800, "step": 36565 }, { "epoch": 9.651709119704368, "grad_norm": 0.024178965017199516, "learning_rate": 9.021975147086553e-07, "loss": 0.0335, "num_input_tokens_seen": 32689152, "step": 36570 }, { "epoch": 9.653028903259866, "grad_norm": 0.17039723694324493, "learning_rate": 8.995857571928141e-07, "loss": 0.0637, "num_input_tokens_seen": 32693440, "step": 36575 }, { "epoch": 9.654348686815363, "grad_norm": 0.23729397356510162, "learning_rate": 8.969777162637139e-07, "loss": 0.0434, "num_input_tokens_seen": 32698048, "step": 36580 }, { "epoch": 9.655668470370859, "grad_norm": 0.21668079495429993, "learning_rate": 8.943733923235525e-07, "loss": 0.0783, "num_input_tokens_seen": 32702304, "step": 36585 }, { "epoch": 9.656988253926356, "grad_norm": 0.14011089503765106, "learning_rate": 8.917727857739394e-07, "loss": 0.0562, "num_input_tokens_seen": 32706368, "step": 36590 }, { "epoch": 9.658308037481852, "grad_norm": 0.19592523574829102, "learning_rate": 8.891758970159258e-07, "loss": 0.0522, "num_input_tokens_seen": 32710528, "step": 36595 }, { "epoch": 9.65962782103735, "grad_norm": 0.15823659300804138, "learning_rate": 8.86582726449986e-07, "loss": 0.0261, "num_input_tokens_seen": 32714880, "step": 36600 }, { "epoch": 9.65962782103735, "eval_loss": 0.06666872650384903, "eval_runtime": 64.7204, "eval_samples_per_second": 104.063, "eval_steps_per_second": 26.02, "num_input_tokens_seen": 32714880, "step": 36600 }, { "epoch": 9.660947604592847, "grad_norm": 0.05643278360366821, "learning_rate": 8.839932744760165e-07, "loss": 0.0479, "num_input_tokens_seen": 32719584, "step": 36605 }, { "epoch": 9.662267388148344, "grad_norm": 0.14952188730239868, "learning_rate": 8.814075414933482e-07, "loss": 0.0433, "num_input_tokens_seen": 32723776, "step": 36610 }, { "epoch": 9.66358717170384, "grad_norm": 1.0100500583648682, "learning_rate": 8.788255279007257e-07, "loss": 0.1414, "num_input_tokens_seen": 32728544, "step": 36615 }, { "epoch": 9.664906955259337, "grad_norm": 0.26396429538726807, "learning_rate": 8.762472340963362e-07, "loss": 0.0365, "num_input_tokens_seen": 32733056, "step": 36620 }, { "epoch": 9.666226738814835, "grad_norm": 0.1437600702047348, "learning_rate": 8.736726604777811e-07, "loss": 0.0321, "num_input_tokens_seen": 32737408, "step": 36625 }, { "epoch": 9.667546522370332, "grad_norm": 0.2120397537946701, "learning_rate": 8.711018074420901e-07, "loss": 0.0727, "num_input_tokens_seen": 32741888, "step": 36630 }, { "epoch": 9.668866305925828, "grad_norm": 0.24308793246746063, "learning_rate": 8.685346753857209e-07, "loss": 0.0552, "num_input_tokens_seen": 32746432, "step": 36635 }, { "epoch": 9.670186089481325, "grad_norm": 0.37679094076156616, "learning_rate": 8.659712647045654e-07, "loss": 0.0588, "num_input_tokens_seen": 32750880, "step": 36640 }, { "epoch": 9.671505873036821, "grad_norm": 0.32582390308380127, "learning_rate": 8.634115757939209e-07, "loss": 0.0536, "num_input_tokens_seen": 32755040, "step": 36645 }, { "epoch": 9.67282565659232, "grad_norm": 0.045773304998874664, "learning_rate": 8.608556090485387e-07, "loss": 0.0664, "num_input_tokens_seen": 32759264, "step": 36650 }, { "epoch": 9.674145440147816, "grad_norm": 0.24038860201835632, "learning_rate": 8.583033648625671e-07, "loss": 0.0875, "num_input_tokens_seen": 32763840, "step": 36655 }, { "epoch": 9.675465223703313, "grad_norm": 0.10279779136180878, "learning_rate": 8.557548436295998e-07, "loss": 0.0255, "num_input_tokens_seen": 32768192, "step": 36660 }, { "epoch": 9.67678500725881, "grad_norm": 0.02744360640645027, "learning_rate": 8.532100457426556e-07, "loss": 0.0589, "num_input_tokens_seen": 32772384, "step": 36665 }, { "epoch": 9.678104790814306, "grad_norm": 0.18834277987480164, "learning_rate": 8.506689715941679e-07, "loss": 0.0456, "num_input_tokens_seen": 32776832, "step": 36670 }, { "epoch": 9.679424574369804, "grad_norm": 0.09027508646249771, "learning_rate": 8.481316215760011e-07, "loss": 0.016, "num_input_tokens_seen": 32781120, "step": 36675 }, { "epoch": 9.6807443579253, "grad_norm": 0.4615442752838135, "learning_rate": 8.455979960794558e-07, "loss": 0.0676, "num_input_tokens_seen": 32785760, "step": 36680 }, { "epoch": 9.682064141480797, "grad_norm": 0.41708117723464966, "learning_rate": 8.430680954952364e-07, "loss": 0.0846, "num_input_tokens_seen": 32790624, "step": 36685 }, { "epoch": 9.683383925036294, "grad_norm": 0.0963064432144165, "learning_rate": 8.405419202134974e-07, "loss": 0.0589, "num_input_tokens_seen": 32795296, "step": 36690 }, { "epoch": 9.68470370859179, "grad_norm": 0.031728584319353104, "learning_rate": 8.380194706237993e-07, "loss": 0.0786, "num_input_tokens_seen": 32800096, "step": 36695 }, { "epoch": 9.686023492147289, "grad_norm": 0.1846175640821457, "learning_rate": 8.355007471151366e-07, "loss": 0.0335, "num_input_tokens_seen": 32804736, "step": 36700 }, { "epoch": 9.687343275702785, "grad_norm": 0.1352980136871338, "learning_rate": 8.329857500759292e-07, "loss": 0.0557, "num_input_tokens_seen": 32809152, "step": 36705 }, { "epoch": 9.688663059258282, "grad_norm": 0.2586682438850403, "learning_rate": 8.304744798940194e-07, "loss": 0.0825, "num_input_tokens_seen": 32813152, "step": 36710 }, { "epoch": 9.689982842813778, "grad_norm": 0.3628913462162018, "learning_rate": 8.279669369566756e-07, "loss": 0.0418, "num_input_tokens_seen": 32817536, "step": 36715 }, { "epoch": 9.691302626369275, "grad_norm": 0.027791297063231468, "learning_rate": 8.254631216505993e-07, "loss": 0.058, "num_input_tokens_seen": 32822080, "step": 36720 }, { "epoch": 9.692622409924772, "grad_norm": 0.3089745044708252, "learning_rate": 8.229630343619038e-07, "loss": 0.0797, "num_input_tokens_seen": 32826496, "step": 36725 }, { "epoch": 9.69394219348027, "grad_norm": 0.11913476139307022, "learning_rate": 8.204666754761392e-07, "loss": 0.0654, "num_input_tokens_seen": 32831072, "step": 36730 }, { "epoch": 9.695261977035766, "grad_norm": 0.08050766587257385, "learning_rate": 8.179740453782669e-07, "loss": 0.0341, "num_input_tokens_seen": 32835424, "step": 36735 }, { "epoch": 9.696581760591263, "grad_norm": 0.06045312434434891, "learning_rate": 8.154851444526907e-07, "loss": 0.0319, "num_input_tokens_seen": 32839648, "step": 36740 }, { "epoch": 9.69790154414676, "grad_norm": 0.32110732793807983, "learning_rate": 8.129999730832283e-07, "loss": 0.0537, "num_input_tokens_seen": 32844544, "step": 36745 }, { "epoch": 9.699221327702256, "grad_norm": 0.23077958822250366, "learning_rate": 8.105185316531178e-07, "loss": 0.0399, "num_input_tokens_seen": 32849152, "step": 36750 }, { "epoch": 9.700541111257754, "grad_norm": 0.16372065246105194, "learning_rate": 8.08040820545039e-07, "loss": 0.046, "num_input_tokens_seen": 32853632, "step": 36755 }, { "epoch": 9.701860894813251, "grad_norm": 0.22356246411800385, "learning_rate": 8.055668401410782e-07, "loss": 0.0751, "num_input_tokens_seen": 32857952, "step": 36760 }, { "epoch": 9.703180678368748, "grad_norm": 0.4643203616142273, "learning_rate": 8.030965908227578e-07, "loss": 0.0483, "num_input_tokens_seen": 32862528, "step": 36765 }, { "epoch": 9.704500461924244, "grad_norm": 0.15463142096996307, "learning_rate": 8.006300729710203e-07, "loss": 0.034, "num_input_tokens_seen": 32866784, "step": 36770 }, { "epoch": 9.70582024547974, "grad_norm": 0.1039866954088211, "learning_rate": 7.981672869662337e-07, "loss": 0.0279, "num_input_tokens_seen": 32871072, "step": 36775 }, { "epoch": 9.707140029035239, "grad_norm": 0.41785889863967896, "learning_rate": 7.957082331881888e-07, "loss": 0.1147, "num_input_tokens_seen": 32875680, "step": 36780 }, { "epoch": 9.708459812590736, "grad_norm": 0.33697253465652466, "learning_rate": 7.932529120161069e-07, "loss": 0.0674, "num_input_tokens_seen": 32880032, "step": 36785 }, { "epoch": 9.709779596146232, "grad_norm": 0.34266966581344604, "learning_rate": 7.908013238286243e-07, "loss": 0.062, "num_input_tokens_seen": 32884608, "step": 36790 }, { "epoch": 9.711099379701729, "grad_norm": 0.10386625677347183, "learning_rate": 7.883534690038136e-07, "loss": 0.0427, "num_input_tokens_seen": 32889152, "step": 36795 }, { "epoch": 9.712419163257225, "grad_norm": 0.23419667780399323, "learning_rate": 7.859093479191559e-07, "loss": 0.0462, "num_input_tokens_seen": 32893312, "step": 36800 }, { "epoch": 9.712419163257225, "eval_loss": 0.06686647236347198, "eval_runtime": 64.7556, "eval_samples_per_second": 104.006, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 32893312, "step": 36800 }, { "epoch": 9.713738946812724, "grad_norm": 0.1796492040157318, "learning_rate": 7.834689609515722e-07, "loss": 0.0275, "num_input_tokens_seen": 32898016, "step": 36805 }, { "epoch": 9.71505873036822, "grad_norm": 0.25671592354774475, "learning_rate": 7.810323084774002e-07, "loss": 0.0467, "num_input_tokens_seen": 32902432, "step": 36810 }, { "epoch": 9.716378513923717, "grad_norm": 0.3190504014492035, "learning_rate": 7.785993908723976e-07, "loss": 0.0625, "num_input_tokens_seen": 32906816, "step": 36815 }, { "epoch": 9.717698297479213, "grad_norm": 0.1363082379102707, "learning_rate": 7.761702085117534e-07, "loss": 0.0466, "num_input_tokens_seen": 32911520, "step": 36820 }, { "epoch": 9.71901808103471, "grad_norm": 0.08445966988801956, "learning_rate": 7.737447617700844e-07, "loss": 0.0401, "num_input_tokens_seen": 32915968, "step": 36825 }, { "epoch": 9.720337864590206, "grad_norm": 0.049805037677288055, "learning_rate": 7.713230510214136e-07, "loss": 0.0825, "num_input_tokens_seen": 32920640, "step": 36830 }, { "epoch": 9.721657648145705, "grad_norm": 0.1078580841422081, "learning_rate": 7.689050766392092e-07, "loss": 0.0553, "num_input_tokens_seen": 32925088, "step": 36835 }, { "epoch": 9.722977431701201, "grad_norm": 0.01730385422706604, "learning_rate": 7.664908389963477e-07, "loss": 0.0209, "num_input_tokens_seen": 32929376, "step": 36840 }, { "epoch": 9.724297215256698, "grad_norm": 0.19974097609519958, "learning_rate": 7.64080338465134e-07, "loss": 0.0972, "num_input_tokens_seen": 32933952, "step": 36845 }, { "epoch": 9.725616998812194, "grad_norm": 0.22803504765033722, "learning_rate": 7.616735754173043e-07, "loss": 0.0371, "num_input_tokens_seen": 32938304, "step": 36850 }, { "epoch": 9.72693678236769, "grad_norm": 0.24974143505096436, "learning_rate": 7.592705502240005e-07, "loss": 0.075, "num_input_tokens_seen": 32942720, "step": 36855 }, { "epoch": 9.72825656592319, "grad_norm": 0.08736464381217957, "learning_rate": 7.568712632558095e-07, "loss": 0.042, "num_input_tokens_seen": 32947296, "step": 36860 }, { "epoch": 9.729576349478686, "grad_norm": 0.1285615861415863, "learning_rate": 7.544757148827297e-07, "loss": 0.0465, "num_input_tokens_seen": 32951648, "step": 36865 }, { "epoch": 9.730896133034182, "grad_norm": 0.35199403762817383, "learning_rate": 7.520839054741797e-07, "loss": 0.1016, "num_input_tokens_seen": 32955872, "step": 36870 }, { "epoch": 9.732215916589679, "grad_norm": 0.34824544191360474, "learning_rate": 7.496958353990113e-07, "loss": 0.0519, "num_input_tokens_seen": 32960224, "step": 36875 }, { "epoch": 9.733535700145175, "grad_norm": 0.6092756390571594, "learning_rate": 7.473115050254941e-07, "loss": 0.0563, "num_input_tokens_seen": 32964832, "step": 36880 }, { "epoch": 9.734855483700674, "grad_norm": 0.13450933992862701, "learning_rate": 7.449309147213173e-07, "loss": 0.03, "num_input_tokens_seen": 32969344, "step": 36885 }, { "epoch": 9.73617526725617, "grad_norm": 0.14798036217689514, "learning_rate": 7.425540648536067e-07, "loss": 0.0451, "num_input_tokens_seen": 32973984, "step": 36890 }, { "epoch": 9.737495050811667, "grad_norm": 0.23962444067001343, "learning_rate": 7.40180955788894e-07, "loss": 0.0529, "num_input_tokens_seen": 32978272, "step": 36895 }, { "epoch": 9.738814834367163, "grad_norm": 0.24147900938987732, "learning_rate": 7.378115878931474e-07, "loss": 0.0714, "num_input_tokens_seen": 32982976, "step": 36900 }, { "epoch": 9.74013461792266, "grad_norm": 0.5479511618614197, "learning_rate": 7.354459615317527e-07, "loss": 0.0873, "num_input_tokens_seen": 32987520, "step": 36905 }, { "epoch": 9.741454401478158, "grad_norm": 0.06193297728896141, "learning_rate": 7.33084077069518e-07, "loss": 0.0636, "num_input_tokens_seen": 32991840, "step": 36910 }, { "epoch": 9.742774185033655, "grad_norm": 0.07315385341644287, "learning_rate": 7.307259348706768e-07, "loss": 0.0553, "num_input_tokens_seen": 32996192, "step": 36915 }, { "epoch": 9.744093968589151, "grad_norm": 0.4768737554550171, "learning_rate": 7.283715352988801e-07, "loss": 0.0252, "num_input_tokens_seen": 33000576, "step": 36920 }, { "epoch": 9.745413752144648, "grad_norm": 0.36762920022010803, "learning_rate": 7.260208787172068e-07, "loss": 0.1161, "num_input_tokens_seen": 33005056, "step": 36925 }, { "epoch": 9.746733535700145, "grad_norm": 0.4105690121650696, "learning_rate": 7.23673965488167e-07, "loss": 0.0841, "num_input_tokens_seen": 33009408, "step": 36930 }, { "epoch": 9.748053319255643, "grad_norm": 0.1448710858821869, "learning_rate": 7.213307959736709e-07, "loss": 0.0401, "num_input_tokens_seen": 33013952, "step": 36935 }, { "epoch": 9.74937310281114, "grad_norm": 0.07691355794668198, "learning_rate": 7.189913705350715e-07, "loss": 0.0322, "num_input_tokens_seen": 33018272, "step": 36940 }, { "epoch": 9.750692886366636, "grad_norm": 0.1460392326116562, "learning_rate": 7.166556895331411e-07, "loss": 0.0356, "num_input_tokens_seen": 33022528, "step": 36945 }, { "epoch": 9.752012669922133, "grad_norm": 0.05647468939423561, "learning_rate": 7.143237533280639e-07, "loss": 0.0344, "num_input_tokens_seen": 33027520, "step": 36950 }, { "epoch": 9.75333245347763, "grad_norm": 0.22515201568603516, "learning_rate": 7.119955622794578e-07, "loss": 0.0754, "num_input_tokens_seen": 33032032, "step": 36955 }, { "epoch": 9.754652237033127, "grad_norm": 0.5260065197944641, "learning_rate": 7.096711167463577e-07, "loss": 0.0703, "num_input_tokens_seen": 33036128, "step": 36960 }, { "epoch": 9.755972020588624, "grad_norm": 0.2403654307126999, "learning_rate": 7.073504170872213e-07, "loss": 0.0985, "num_input_tokens_seen": 33040448, "step": 36965 }, { "epoch": 9.75729180414412, "grad_norm": 0.18120531737804413, "learning_rate": 7.05033463659932e-07, "loss": 0.0462, "num_input_tokens_seen": 33044960, "step": 36970 }, { "epoch": 9.758611587699617, "grad_norm": 0.24936379492282867, "learning_rate": 7.027202568217928e-07, "loss": 0.0378, "num_input_tokens_seen": 33049312, "step": 36975 }, { "epoch": 9.759931371255114, "grad_norm": 0.08598634600639343, "learning_rate": 7.004107969295293e-07, "loss": 0.02, "num_input_tokens_seen": 33053632, "step": 36980 }, { "epoch": 9.76125115481061, "grad_norm": 0.13366326689720154, "learning_rate": 6.9810508433929e-07, "loss": 0.0389, "num_input_tokens_seen": 33057856, "step": 36985 }, { "epoch": 9.762570938366109, "grad_norm": 0.04107854142785072, "learning_rate": 6.958031194066406e-07, "loss": 0.0464, "num_input_tokens_seen": 33062592, "step": 36990 }, { "epoch": 9.763890721921605, "grad_norm": 0.11019769310951233, "learning_rate": 6.935049024865776e-07, "loss": 0.0237, "num_input_tokens_seen": 33067104, "step": 36995 }, { "epoch": 9.765210505477102, "grad_norm": 0.07969275861978531, "learning_rate": 6.912104339335118e-07, "loss": 0.0218, "num_input_tokens_seen": 33071840, "step": 37000 }, { "epoch": 9.765210505477102, "eval_loss": 0.06681240350008011, "eval_runtime": 64.7068, "eval_samples_per_second": 104.085, "eval_steps_per_second": 26.025, "num_input_tokens_seen": 33071840, "step": 37000 }, { "epoch": 9.766530289032598, "grad_norm": 0.15063218772411346, "learning_rate": 6.889197141012799e-07, "loss": 0.0416, "num_input_tokens_seen": 33076352, "step": 37005 }, { "epoch": 9.767850072588095, "grad_norm": 0.14502611756324768, "learning_rate": 6.866327433431435e-07, "loss": 0.074, "num_input_tokens_seen": 33080992, "step": 37010 }, { "epoch": 9.769169856143593, "grad_norm": 0.03428981080651283, "learning_rate": 6.843495220117735e-07, "loss": 0.1014, "num_input_tokens_seen": 33085216, "step": 37015 }, { "epoch": 9.77048963969909, "grad_norm": 0.05172092467546463, "learning_rate": 6.820700504592798e-07, "loss": 0.0452, "num_input_tokens_seen": 33089440, "step": 37020 }, { "epoch": 9.771809423254586, "grad_norm": 0.2527451515197754, "learning_rate": 6.797943290371839e-07, "loss": 0.0225, "num_input_tokens_seen": 33093824, "step": 37025 }, { "epoch": 9.773129206810083, "grad_norm": 0.16107957065105438, "learning_rate": 6.775223580964274e-07, "loss": 0.0357, "num_input_tokens_seen": 33098272, "step": 37030 }, { "epoch": 9.77444899036558, "grad_norm": 0.2943838834762573, "learning_rate": 6.7525413798738e-07, "loss": 0.118, "num_input_tokens_seen": 33102784, "step": 37035 }, { "epoch": 9.775768773921078, "grad_norm": 0.0790591612458229, "learning_rate": 6.729896690598259e-07, "loss": 0.0633, "num_input_tokens_seen": 33106976, "step": 37040 }, { "epoch": 9.777088557476574, "grad_norm": 0.20587407052516937, "learning_rate": 6.707289516629772e-07, "loss": 0.0348, "num_input_tokens_seen": 33111264, "step": 37045 }, { "epoch": 9.77840834103207, "grad_norm": 0.16879156231880188, "learning_rate": 6.684719861454692e-07, "loss": 0.0531, "num_input_tokens_seen": 33115904, "step": 37050 }, { "epoch": 9.779728124587567, "grad_norm": 0.04438408091664314, "learning_rate": 6.662187728553481e-07, "loss": 0.0592, "num_input_tokens_seen": 33120576, "step": 37055 }, { "epoch": 9.781047908143064, "grad_norm": 0.12252353131771088, "learning_rate": 6.639693121400892e-07, "loss": 0.0638, "num_input_tokens_seen": 33124992, "step": 37060 }, { "epoch": 9.782367691698562, "grad_norm": 0.3266213536262512, "learning_rate": 6.617236043465868e-07, "loss": 0.0559, "num_input_tokens_seen": 33129408, "step": 37065 }, { "epoch": 9.783687475254059, "grad_norm": 0.1434156447649002, "learning_rate": 6.594816498211587e-07, "loss": 0.0385, "num_input_tokens_seen": 33134016, "step": 37070 }, { "epoch": 9.785007258809555, "grad_norm": 0.27777019143104553, "learning_rate": 6.572434489095447e-07, "loss": 0.0677, "num_input_tokens_seen": 33138720, "step": 37075 }, { "epoch": 9.786327042365052, "grad_norm": 0.34602275490760803, "learning_rate": 6.550090019568994e-07, "loss": 0.0871, "num_input_tokens_seen": 33143072, "step": 37080 }, { "epoch": 9.787646825920548, "grad_norm": 0.24594658613204956, "learning_rate": 6.527783093078027e-07, "loss": 0.0518, "num_input_tokens_seen": 33147616, "step": 37085 }, { "epoch": 9.788966609476045, "grad_norm": 0.07750890403985977, "learning_rate": 6.5055137130626e-07, "loss": 0.0744, "num_input_tokens_seen": 33152256, "step": 37090 }, { "epoch": 9.790286393031543, "grad_norm": 0.09458242356777191, "learning_rate": 6.483281882956854e-07, "loss": 0.0657, "num_input_tokens_seen": 33156640, "step": 37095 }, { "epoch": 9.79160617658704, "grad_norm": 0.12512488663196564, "learning_rate": 6.461087606189298e-07, "loss": 0.0524, "num_input_tokens_seen": 33161376, "step": 37100 }, { "epoch": 9.792925960142536, "grad_norm": 0.10112882405519485, "learning_rate": 6.438930886182554e-07, "loss": 0.0389, "num_input_tokens_seen": 33165568, "step": 37105 }, { "epoch": 9.794245743698033, "grad_norm": 0.05718434974551201, "learning_rate": 6.416811726353417e-07, "loss": 0.0601, "num_input_tokens_seen": 33169760, "step": 37110 }, { "epoch": 9.79556552725353, "grad_norm": 0.2776090204715729, "learning_rate": 6.394730130112991e-07, "loss": 0.0375, "num_input_tokens_seen": 33174464, "step": 37115 }, { "epoch": 9.796885310809028, "grad_norm": 0.1050463393330574, "learning_rate": 6.372686100866471e-07, "loss": 0.0588, "num_input_tokens_seen": 33179008, "step": 37120 }, { "epoch": 9.798205094364524, "grad_norm": 0.025791127234697342, "learning_rate": 6.350679642013413e-07, "loss": 0.1004, "num_input_tokens_seen": 33183360, "step": 37125 }, { "epoch": 9.799524877920021, "grad_norm": 0.42565640807151794, "learning_rate": 6.328710756947437e-07, "loss": 0.035, "num_input_tokens_seen": 33187648, "step": 37130 }, { "epoch": 9.800844661475518, "grad_norm": 0.3045409023761749, "learning_rate": 6.306779449056416e-07, "loss": 0.0555, "num_input_tokens_seen": 33192352, "step": 37135 }, { "epoch": 9.802164445031014, "grad_norm": 0.2856617271900177, "learning_rate": 6.284885721722422e-07, "loss": 0.0589, "num_input_tokens_seen": 33196864, "step": 37140 }, { "epoch": 9.803484228586512, "grad_norm": 0.0676911398768425, "learning_rate": 6.26302957832181e-07, "loss": 0.0575, "num_input_tokens_seen": 33201568, "step": 37145 }, { "epoch": 9.804804012142009, "grad_norm": 0.045551009476184845, "learning_rate": 6.241211022224997e-07, "loss": 0.0453, "num_input_tokens_seen": 33206112, "step": 37150 }, { "epoch": 9.806123795697506, "grad_norm": 0.0717528834939003, "learning_rate": 6.219430056796732e-07, "loss": 0.036, "num_input_tokens_seen": 33210496, "step": 37155 }, { "epoch": 9.807443579253002, "grad_norm": 0.173980712890625, "learning_rate": 6.19768668539586e-07, "loss": 0.0346, "num_input_tokens_seen": 33215264, "step": 37160 }, { "epoch": 9.808763362808499, "grad_norm": 0.09196390956640244, "learning_rate": 6.175980911375528e-07, "loss": 0.0225, "num_input_tokens_seen": 33219680, "step": 37165 }, { "epoch": 9.810083146363997, "grad_norm": 0.08689911663532257, "learning_rate": 6.154312738083034e-07, "loss": 0.0395, "num_input_tokens_seen": 33224352, "step": 37170 }, { "epoch": 9.811402929919494, "grad_norm": 0.09039895981550217, "learning_rate": 6.132682168859843e-07, "loss": 0.0324, "num_input_tokens_seen": 33228736, "step": 37175 }, { "epoch": 9.81272271347499, "grad_norm": 0.10126292705535889, "learning_rate": 6.111089207041704e-07, "loss": 0.0776, "num_input_tokens_seen": 33233600, "step": 37180 }, { "epoch": 9.814042497030487, "grad_norm": 0.10259318351745605, "learning_rate": 6.089533855958507e-07, "loss": 0.0372, "num_input_tokens_seen": 33238048, "step": 37185 }, { "epoch": 9.815362280585983, "grad_norm": 0.229819655418396, "learning_rate": 6.068016118934372e-07, "loss": 0.0296, "num_input_tokens_seen": 33242400, "step": 37190 }, { "epoch": 9.816682064141482, "grad_norm": 0.09456592053174973, "learning_rate": 6.04653599928759e-07, "loss": 0.0216, "num_input_tokens_seen": 33246848, "step": 37195 }, { "epoch": 9.818001847696978, "grad_norm": 0.2497648000717163, "learning_rate": 6.025093500330675e-07, "loss": 0.0752, "num_input_tokens_seen": 33251936, "step": 37200 }, { "epoch": 9.818001847696978, "eval_loss": 0.06687842309474945, "eval_runtime": 64.7714, "eval_samples_per_second": 103.981, "eval_steps_per_second": 25.999, "num_input_tokens_seen": 33251936, "step": 37200 }, { "epoch": 9.819321631252475, "grad_norm": 0.14527736604213715, "learning_rate": 6.003688625370291e-07, "loss": 0.0424, "num_input_tokens_seen": 33256288, "step": 37205 }, { "epoch": 9.820641414807971, "grad_norm": 0.02420186623930931, "learning_rate": 5.982321377707406e-07, "loss": 0.0373, "num_input_tokens_seen": 33260928, "step": 37210 }, { "epoch": 9.821961198363468, "grad_norm": 0.11463820934295654, "learning_rate": 5.96099176063708e-07, "loss": 0.105, "num_input_tokens_seen": 33265344, "step": 37215 }, { "epoch": 9.823280981918966, "grad_norm": 0.3090822398662567, "learning_rate": 5.93969977744857e-07, "loss": 0.075, "num_input_tokens_seen": 33269728, "step": 37220 }, { "epoch": 9.824600765474463, "grad_norm": 0.32196441292762756, "learning_rate": 5.918445431425445e-07, "loss": 0.0914, "num_input_tokens_seen": 33274176, "step": 37225 }, { "epoch": 9.82592054902996, "grad_norm": 0.05316760018467903, "learning_rate": 5.897228725845333e-07, "loss": 0.04, "num_input_tokens_seen": 33278592, "step": 37230 }, { "epoch": 9.827240332585456, "grad_norm": 0.18035797774791718, "learning_rate": 5.876049663980171e-07, "loss": 0.0393, "num_input_tokens_seen": 33282912, "step": 37235 }, { "epoch": 9.828560116140952, "grad_norm": 0.5084364414215088, "learning_rate": 5.854908249095959e-07, "loss": 0.0717, "num_input_tokens_seen": 33287360, "step": 37240 }, { "epoch": 9.82987989969645, "grad_norm": 0.07268796116113663, "learning_rate": 5.833804484453031e-07, "loss": 0.0609, "num_input_tokens_seen": 33291904, "step": 37245 }, { "epoch": 9.831199683251947, "grad_norm": 0.24577440321445465, "learning_rate": 5.81273837330587e-07, "loss": 0.0623, "num_input_tokens_seen": 33296256, "step": 37250 }, { "epoch": 9.832519466807444, "grad_norm": 0.08234730362892151, "learning_rate": 5.791709918903071e-07, "loss": 0.0855, "num_input_tokens_seen": 33300736, "step": 37255 }, { "epoch": 9.83383925036294, "grad_norm": 0.168877974152565, "learning_rate": 5.770719124487483e-07, "loss": 0.0362, "num_input_tokens_seen": 33305248, "step": 37260 }, { "epoch": 9.835159033918437, "grad_norm": 0.4873526096343994, "learning_rate": 5.749765993296241e-07, "loss": 0.0736, "num_input_tokens_seen": 33309760, "step": 37265 }, { "epoch": 9.836478817473933, "grad_norm": 0.21717356145381927, "learning_rate": 5.728850528560509e-07, "loss": 0.0194, "num_input_tokens_seen": 33314048, "step": 37270 }, { "epoch": 9.837798601029432, "grad_norm": 0.22532346844673157, "learning_rate": 5.707972733505707e-07, "loss": 0.1178, "num_input_tokens_seen": 33318624, "step": 37275 }, { "epoch": 9.839118384584928, "grad_norm": 0.20967541635036469, "learning_rate": 5.687132611351509e-07, "loss": 0.0622, "num_input_tokens_seen": 33322912, "step": 37280 }, { "epoch": 9.840438168140425, "grad_norm": 0.08733822405338287, "learning_rate": 5.666330165311651e-07, "loss": 0.0306, "num_input_tokens_seen": 33327648, "step": 37285 }, { "epoch": 9.841757951695921, "grad_norm": 0.27452874183654785, "learning_rate": 5.645565398594204e-07, "loss": 0.0643, "num_input_tokens_seen": 33331872, "step": 37290 }, { "epoch": 9.843077735251418, "grad_norm": 0.22627541422843933, "learning_rate": 5.624838314401304e-07, "loss": 0.048, "num_input_tokens_seen": 33336352, "step": 37295 }, { "epoch": 9.844397518806916, "grad_norm": 0.03566306084394455, "learning_rate": 5.604148915929336e-07, "loss": 0.1266, "num_input_tokens_seen": 33340576, "step": 37300 }, { "epoch": 9.845717302362413, "grad_norm": 0.08761882781982422, "learning_rate": 5.583497206368887e-07, "loss": 0.0732, "num_input_tokens_seen": 33345088, "step": 37305 }, { "epoch": 9.84703708591791, "grad_norm": 0.48690420389175415, "learning_rate": 5.562883188904688e-07, "loss": 0.1253, "num_input_tokens_seen": 33349472, "step": 37310 }, { "epoch": 9.848356869473406, "grad_norm": 0.11547759175300598, "learning_rate": 5.542306866715724e-07, "loss": 0.0714, "num_input_tokens_seen": 33354048, "step": 37315 }, { "epoch": 9.849676653028903, "grad_norm": 0.06528135389089584, "learning_rate": 5.52176824297504e-07, "loss": 0.0574, "num_input_tokens_seen": 33358848, "step": 37320 }, { "epoch": 9.850996436584401, "grad_norm": 0.18517811596393585, "learning_rate": 5.501267320850018e-07, "loss": 0.0499, "num_input_tokens_seen": 33363296, "step": 37325 }, { "epoch": 9.852316220139897, "grad_norm": 0.48187902569770813, "learning_rate": 5.480804103502157e-07, "loss": 0.0585, "num_input_tokens_seen": 33367648, "step": 37330 }, { "epoch": 9.853636003695394, "grad_norm": 0.1787024289369583, "learning_rate": 5.460378594087101e-07, "loss": 0.0695, "num_input_tokens_seen": 33372224, "step": 37335 }, { "epoch": 9.85495578725089, "grad_norm": 0.23202332854270935, "learning_rate": 5.439990795754773e-07, "loss": 0.0426, "num_input_tokens_seen": 33376416, "step": 37340 }, { "epoch": 9.856275570806387, "grad_norm": 0.25234633684158325, "learning_rate": 5.419640711649188e-07, "loss": 0.0827, "num_input_tokens_seen": 33380800, "step": 37345 }, { "epoch": 9.857595354361884, "grad_norm": 0.2695445120334625, "learning_rate": 5.399328344908583e-07, "loss": 0.0567, "num_input_tokens_seen": 33385472, "step": 37350 }, { "epoch": 9.858915137917382, "grad_norm": 0.2858934700489044, "learning_rate": 5.379053698665399e-07, "loss": 0.0308, "num_input_tokens_seen": 33389856, "step": 37355 }, { "epoch": 9.860234921472879, "grad_norm": 0.33976802229881287, "learning_rate": 5.358816776046216e-07, "loss": 0.0952, "num_input_tokens_seen": 33394528, "step": 37360 }, { "epoch": 9.861554705028375, "grad_norm": 0.03156353160738945, "learning_rate": 5.338617580171817e-07, "loss": 0.0422, "num_input_tokens_seen": 33398816, "step": 37365 }, { "epoch": 9.862874488583872, "grad_norm": 0.19284354150295258, "learning_rate": 5.318456114157239e-07, "loss": 0.0682, "num_input_tokens_seen": 33403488, "step": 37370 }, { "epoch": 9.864194272139368, "grad_norm": 0.373263418674469, "learning_rate": 5.298332381111576e-07, "loss": 0.041, "num_input_tokens_seen": 33408000, "step": 37375 }, { "epoch": 9.865514055694867, "grad_norm": 0.168899267911911, "learning_rate": 5.27824638413818e-07, "loss": 0.0353, "num_input_tokens_seen": 33412864, "step": 37380 }, { "epoch": 9.866833839250363, "grad_norm": 0.11835721135139465, "learning_rate": 5.258198126334546e-07, "loss": 0.055, "num_input_tokens_seen": 33417120, "step": 37385 }, { "epoch": 9.86815362280586, "grad_norm": 0.3403611481189728, "learning_rate": 5.238187610792367e-07, "loss": 0.0791, "num_input_tokens_seen": 33421760, "step": 37390 }, { "epoch": 9.869473406361356, "grad_norm": 0.04644545167684555, "learning_rate": 5.218214840597563e-07, "loss": 0.0263, "num_input_tokens_seen": 33426272, "step": 37395 }, { "epoch": 9.870793189916853, "grad_norm": 0.10087738931179047, "learning_rate": 5.198279818830115e-07, "loss": 0.019, "num_input_tokens_seen": 33431008, "step": 37400 }, { "epoch": 9.870793189916853, "eval_loss": 0.06666062027215958, "eval_runtime": 64.7631, "eval_samples_per_second": 103.994, "eval_steps_per_second": 26.002, "num_input_tokens_seen": 33431008, "step": 37400 }, { "epoch": 9.872112973472351, "grad_norm": 0.2766406834125519, "learning_rate": 5.178382548564287e-07, "loss": 0.0729, "num_input_tokens_seen": 33435520, "step": 37405 }, { "epoch": 9.873432757027848, "grad_norm": 0.4182945191860199, "learning_rate": 5.15852303286854e-07, "loss": 0.1098, "num_input_tokens_seen": 33439808, "step": 37410 }, { "epoch": 9.874752540583344, "grad_norm": 0.10201513022184372, "learning_rate": 5.138701274805396e-07, "loss": 0.0644, "num_input_tokens_seen": 33444128, "step": 37415 }, { "epoch": 9.87607232413884, "grad_norm": 0.2746305465698242, "learning_rate": 5.118917277431606e-07, "loss": 0.0358, "num_input_tokens_seen": 33448800, "step": 37420 }, { "epoch": 9.877392107694337, "grad_norm": 0.5421135425567627, "learning_rate": 5.099171043798145e-07, "loss": 0.0769, "num_input_tokens_seen": 33452960, "step": 37425 }, { "epoch": 9.878711891249836, "grad_norm": 0.17085377871990204, "learning_rate": 5.079462576950133e-07, "loss": 0.0386, "num_input_tokens_seen": 33457728, "step": 37430 }, { "epoch": 9.880031674805332, "grad_norm": 0.19559040665626526, "learning_rate": 5.059791879926862e-07, "loss": 0.0328, "num_input_tokens_seen": 33461984, "step": 37435 }, { "epoch": 9.881351458360829, "grad_norm": 0.07823667675256729, "learning_rate": 5.040158955761793e-07, "loss": 0.0812, "num_input_tokens_seen": 33466272, "step": 37440 }, { "epoch": 9.882671241916325, "grad_norm": 0.15807582437992096, "learning_rate": 5.020563807482559e-07, "loss": 0.0417, "num_input_tokens_seen": 33470912, "step": 37445 }, { "epoch": 9.883991025471822, "grad_norm": 0.1242532879114151, "learning_rate": 5.001006438110995e-07, "loss": 0.0218, "num_input_tokens_seen": 33475072, "step": 37450 }, { "epoch": 9.88531080902732, "grad_norm": 0.06011911854147911, "learning_rate": 4.981486850663075e-07, "loss": 0.0574, "num_input_tokens_seen": 33480064, "step": 37455 }, { "epoch": 9.886630592582817, "grad_norm": 0.21927376091480255, "learning_rate": 4.962005048149005e-07, "loss": 0.0295, "num_input_tokens_seen": 33484608, "step": 37460 }, { "epoch": 9.887950376138313, "grad_norm": 0.08316655457019806, "learning_rate": 4.942561033573073e-07, "loss": 0.0764, "num_input_tokens_seen": 33489408, "step": 37465 }, { "epoch": 9.88927015969381, "grad_norm": 0.21824263036251068, "learning_rate": 4.923154809933827e-07, "loss": 0.0311, "num_input_tokens_seen": 33494272, "step": 37470 }, { "epoch": 9.890589943249307, "grad_norm": 0.13372664153575897, "learning_rate": 4.903786380223957e-07, "loss": 0.0595, "num_input_tokens_seen": 33498720, "step": 37475 }, { "epoch": 9.891909726804805, "grad_norm": 0.12491300702095032, "learning_rate": 4.884455747430266e-07, "loss": 0.0409, "num_input_tokens_seen": 33503552, "step": 37480 }, { "epoch": 9.893229510360301, "grad_norm": 0.03278215602040291, "learning_rate": 4.865162914533816e-07, "loss": 0.0318, "num_input_tokens_seen": 33508032, "step": 37485 }, { "epoch": 9.894549293915798, "grad_norm": 0.1143200695514679, "learning_rate": 4.845907884509809e-07, "loss": 0.0284, "num_input_tokens_seen": 33512224, "step": 37490 }, { "epoch": 9.895869077471295, "grad_norm": 0.048936448991298676, "learning_rate": 4.82669066032762e-07, "loss": 0.0637, "num_input_tokens_seen": 33516896, "step": 37495 }, { "epoch": 9.897188861026791, "grad_norm": 0.17584718763828278, "learning_rate": 4.807511244950768e-07, "loss": 0.0489, "num_input_tokens_seen": 33521088, "step": 37500 }, { "epoch": 9.89850864458229, "grad_norm": 0.04286818578839302, "learning_rate": 4.788369641336943e-07, "loss": 0.0499, "num_input_tokens_seen": 33525664, "step": 37505 }, { "epoch": 9.899828428137786, "grad_norm": 0.17666275799274445, "learning_rate": 4.769265852438032e-07, "loss": 0.0461, "num_input_tokens_seen": 33530112, "step": 37510 }, { "epoch": 9.901148211693283, "grad_norm": 0.22586558759212494, "learning_rate": 4.750199881200124e-07, "loss": 0.0759, "num_input_tokens_seen": 33534496, "step": 37515 }, { "epoch": 9.902467995248779, "grad_norm": 0.340863436460495, "learning_rate": 4.7311717305633664e-07, "loss": 0.0652, "num_input_tokens_seen": 33539072, "step": 37520 }, { "epoch": 9.903787778804276, "grad_norm": 0.14083009958267212, "learning_rate": 4.7121814034621623e-07, "loss": 0.0815, "num_input_tokens_seen": 33543616, "step": 37525 }, { "epoch": 9.905107562359772, "grad_norm": 0.2481112778186798, "learning_rate": 4.693228902825114e-07, "loss": 0.0774, "num_input_tokens_seen": 33547936, "step": 37530 }, { "epoch": 9.90642734591527, "grad_norm": 0.22487303614616394, "learning_rate": 4.6743142315748277e-07, "loss": 0.0308, "num_input_tokens_seen": 33552224, "step": 37535 }, { "epoch": 9.907747129470767, "grad_norm": 0.05817550793290138, "learning_rate": 4.655437392628276e-07, "loss": 0.0632, "num_input_tokens_seen": 33556704, "step": 37540 }, { "epoch": 9.909066913026264, "grad_norm": 0.12761804461479187, "learning_rate": 4.636598388896463e-07, "loss": 0.0501, "num_input_tokens_seen": 33561632, "step": 37545 }, { "epoch": 9.91038669658176, "grad_norm": 0.09301698952913284, "learning_rate": 4.6177972232845925e-07, "loss": 0.0534, "num_input_tokens_seen": 33566208, "step": 37550 }, { "epoch": 9.911706480137257, "grad_norm": 0.28809693455696106, "learning_rate": 4.5990338986920953e-07, "loss": 0.0973, "num_input_tokens_seen": 33570880, "step": 37555 }, { "epoch": 9.913026263692755, "grad_norm": 0.2248685657978058, "learning_rate": 4.5803084180124633e-07, "loss": 0.0605, "num_input_tokens_seen": 33575936, "step": 37560 }, { "epoch": 9.914346047248252, "grad_norm": 0.15298062562942505, "learning_rate": 4.561620784133386e-07, "loss": 0.0503, "num_input_tokens_seen": 33580064, "step": 37565 }, { "epoch": 9.915665830803748, "grad_norm": 0.4139944016933441, "learning_rate": 4.5429709999367796e-07, "loss": 0.0711, "num_input_tokens_seen": 33584480, "step": 37570 }, { "epoch": 9.916985614359245, "grad_norm": 0.13759741187095642, "learning_rate": 4.5243590682986223e-07, "loss": 0.0558, "num_input_tokens_seen": 33588640, "step": 37575 }, { "epoch": 9.918305397914741, "grad_norm": 0.09520643204450607, "learning_rate": 4.5057849920891735e-07, "loss": 0.0644, "num_input_tokens_seen": 33593344, "step": 37580 }, { "epoch": 9.91962518147024, "grad_norm": 0.14969949424266815, "learning_rate": 4.487248774172698e-07, "loss": 0.0456, "num_input_tokens_seen": 33597600, "step": 37585 }, { "epoch": 9.920944965025736, "grad_norm": 0.2798534631729126, "learning_rate": 4.4687504174077965e-07, "loss": 0.0523, "num_input_tokens_seen": 33601920, "step": 37590 }, { "epoch": 9.922264748581233, "grad_norm": 0.12155798822641373, "learning_rate": 4.450289924647133e-07, "loss": 0.0552, "num_input_tokens_seen": 33606368, "step": 37595 }, { "epoch": 9.92358453213673, "grad_norm": 0.33616411685943604, "learning_rate": 4.431867298737513e-07, "loss": 0.0513, "num_input_tokens_seen": 33610816, "step": 37600 }, { "epoch": 9.92358453213673, "eval_loss": 0.06668636202812195, "eval_runtime": 64.7816, "eval_samples_per_second": 103.965, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 33610816, "step": 37600 }, { "epoch": 9.924904315692226, "grad_norm": 0.17602252960205078, "learning_rate": 4.41348254251997e-07, "loss": 0.0626, "num_input_tokens_seen": 33615104, "step": 37605 }, { "epoch": 9.926224099247724, "grad_norm": 0.03280019015073776, "learning_rate": 4.395135658829652e-07, "loss": 0.0649, "num_input_tokens_seen": 33619520, "step": 37610 }, { "epoch": 9.92754388280322, "grad_norm": 0.306650847196579, "learning_rate": 4.376826650495852e-07, "loss": 0.0497, "num_input_tokens_seen": 33624224, "step": 37615 }, { "epoch": 9.928863666358717, "grad_norm": 0.18307961523532867, "learning_rate": 4.358555520342117e-07, "loss": 0.0397, "num_input_tokens_seen": 33628512, "step": 37620 }, { "epoch": 9.930183449914214, "grad_norm": 0.026564642786979675, "learning_rate": 4.3403222711860257e-07, "loss": 0.0429, "num_input_tokens_seen": 33632896, "step": 37625 }, { "epoch": 9.93150323346971, "grad_norm": 0.159596249461174, "learning_rate": 4.3221269058394133e-07, "loss": 0.094, "num_input_tokens_seen": 33637312, "step": 37630 }, { "epoch": 9.932823017025207, "grad_norm": 0.17609955370426178, "learning_rate": 4.303969427108173e-07, "loss": 0.013, "num_input_tokens_seen": 33641824, "step": 37635 }, { "epoch": 9.934142800580705, "grad_norm": 0.3578086495399475, "learning_rate": 4.2858498377924825e-07, "loss": 0.1042, "num_input_tokens_seen": 33646528, "step": 37640 }, { "epoch": 9.935462584136202, "grad_norm": 0.2368783801794052, "learning_rate": 4.267768140686579e-07, "loss": 0.0533, "num_input_tokens_seen": 33650880, "step": 37645 }, { "epoch": 9.936782367691698, "grad_norm": 0.3872343897819519, "learning_rate": 4.2497243385788975e-07, "loss": 0.1125, "num_input_tokens_seen": 33655360, "step": 37650 }, { "epoch": 9.938102151247195, "grad_norm": 0.2644798159599304, "learning_rate": 4.231718434251991e-07, "loss": 0.0535, "num_input_tokens_seen": 33659840, "step": 37655 }, { "epoch": 9.939421934802692, "grad_norm": 0.07587499916553497, "learning_rate": 4.213750430482666e-07, "loss": 0.0241, "num_input_tokens_seen": 33664352, "step": 37660 }, { "epoch": 9.94074171835819, "grad_norm": 0.053557366132736206, "learning_rate": 4.1958203300417054e-07, "loss": 0.051, "num_input_tokens_seen": 33668864, "step": 37665 }, { "epoch": 9.942061501913686, "grad_norm": 0.14698892831802368, "learning_rate": 4.177928135694259e-07, "loss": 0.0404, "num_input_tokens_seen": 33673312, "step": 37670 }, { "epoch": 9.943381285469183, "grad_norm": 0.05708412453532219, "learning_rate": 4.1600738501994807e-07, "loss": 0.0533, "num_input_tokens_seen": 33677728, "step": 37675 }, { "epoch": 9.94470106902468, "grad_norm": 0.36210569739341736, "learning_rate": 4.1422574763107237e-07, "loss": 0.0999, "num_input_tokens_seen": 33682528, "step": 37680 }, { "epoch": 9.946020852580176, "grad_norm": 0.14235402643680573, "learning_rate": 4.124479016775512e-07, "loss": 0.0557, "num_input_tokens_seen": 33687296, "step": 37685 }, { "epoch": 9.947340636135674, "grad_norm": 0.19021719694137573, "learning_rate": 4.106738474335514e-07, "loss": 0.0711, "num_input_tokens_seen": 33691584, "step": 37690 }, { "epoch": 9.948660419691171, "grad_norm": 0.15559323132038116, "learning_rate": 4.089035851726486e-07, "loss": 0.0451, "num_input_tokens_seen": 33695808, "step": 37695 }, { "epoch": 9.949980203246668, "grad_norm": 0.11610660701990128, "learning_rate": 4.0713711516784937e-07, "loss": 0.0555, "num_input_tokens_seen": 33700352, "step": 37700 }, { "epoch": 9.951299986802164, "grad_norm": 0.19101186096668243, "learning_rate": 4.05374437691558e-07, "loss": 0.0493, "num_input_tokens_seen": 33704704, "step": 37705 }, { "epoch": 9.95261977035766, "grad_norm": 0.10566519945859909, "learning_rate": 4.036155530156044e-07, "loss": 0.0219, "num_input_tokens_seen": 33709376, "step": 37710 }, { "epoch": 9.953939553913159, "grad_norm": 0.029912447556853294, "learning_rate": 4.018604614112298e-07, "loss": 0.1215, "num_input_tokens_seen": 33713888, "step": 37715 }, { "epoch": 9.955259337468656, "grad_norm": 0.24787446856498718, "learning_rate": 4.0010916314908996e-07, "loss": 0.0858, "num_input_tokens_seen": 33718304, "step": 37720 }, { "epoch": 9.956579121024152, "grad_norm": 0.14572301506996155, "learning_rate": 3.983616584992578e-07, "loss": 0.0427, "num_input_tokens_seen": 33722912, "step": 37725 }, { "epoch": 9.957898904579649, "grad_norm": 0.20772379636764526, "learning_rate": 3.9661794773122595e-07, "loss": 0.051, "num_input_tokens_seen": 33727520, "step": 37730 }, { "epoch": 9.959218688135145, "grad_norm": 0.3043055534362793, "learning_rate": 3.9487803111388777e-07, "loss": 0.0748, "num_input_tokens_seen": 33731904, "step": 37735 }, { "epoch": 9.960538471690644, "grad_norm": 0.1559484601020813, "learning_rate": 3.9314190891556747e-07, "loss": 0.035, "num_input_tokens_seen": 33736928, "step": 37740 }, { "epoch": 9.96185825524614, "grad_norm": 0.06001834571361542, "learning_rate": 3.914095814039925e-07, "loss": 0.0274, "num_input_tokens_seen": 33741408, "step": 37745 }, { "epoch": 9.963178038801637, "grad_norm": 0.34496524930000305, "learning_rate": 3.896810488463104e-07, "loss": 0.0329, "num_input_tokens_seen": 33746144, "step": 37750 }, { "epoch": 9.964497822357133, "grad_norm": 0.05874822288751602, "learning_rate": 3.8795631150908565e-07, "loss": 0.0372, "num_input_tokens_seen": 33750688, "step": 37755 }, { "epoch": 9.96581760591263, "grad_norm": 0.34706056118011475, "learning_rate": 3.862353696582888e-07, "loss": 0.0399, "num_input_tokens_seen": 33754592, "step": 37760 }, { "epoch": 9.967137389468128, "grad_norm": 0.2014765590429306, "learning_rate": 3.8451822355931313e-07, "loss": 0.1041, "num_input_tokens_seen": 33759104, "step": 37765 }, { "epoch": 9.968457173023625, "grad_norm": 0.4670551121234894, "learning_rate": 3.82804873476969e-07, "loss": 0.0533, "num_input_tokens_seen": 33763680, "step": 37770 }, { "epoch": 9.969776956579121, "grad_norm": 0.11517295241355896, "learning_rate": 3.810953196754702e-07, "loss": 0.0653, "num_input_tokens_seen": 33768352, "step": 37775 }, { "epoch": 9.971096740134618, "grad_norm": 0.07209106534719467, "learning_rate": 3.793895624184529e-07, "loss": 0.0386, "num_input_tokens_seen": 33772576, "step": 37780 }, { "epoch": 9.972416523690114, "grad_norm": 0.11723534017801285, "learning_rate": 3.776876019689679e-07, "loss": 0.1142, "num_input_tokens_seen": 33777184, "step": 37785 }, { "epoch": 9.97373630724561, "grad_norm": 0.06859634071588516, "learning_rate": 3.7598943858947743e-07, "loss": 0.0363, "num_input_tokens_seen": 33781696, "step": 37790 }, { "epoch": 9.97505609080111, "grad_norm": 0.20572331547737122, "learning_rate": 3.742950725418637e-07, "loss": 0.0534, "num_input_tokens_seen": 33786176, "step": 37795 }, { "epoch": 9.976375874356606, "grad_norm": 0.5612384080886841, "learning_rate": 3.726045040874093e-07, "loss": 0.0667, "num_input_tokens_seen": 33790528, "step": 37800 }, { "epoch": 9.976375874356606, "eval_loss": 0.06681814044713974, "eval_runtime": 64.7803, "eval_samples_per_second": 103.967, "eval_steps_per_second": 25.996, "num_input_tokens_seen": 33790528, "step": 37800 }, { "epoch": 9.977695657912102, "grad_norm": 0.33468860387802124, "learning_rate": 3.709177334868308e-07, "loss": 0.0872, "num_input_tokens_seen": 33794752, "step": 37805 }, { "epoch": 9.979015441467599, "grad_norm": 0.22791466116905212, "learning_rate": 3.692347610002478e-07, "loss": 0.0705, "num_input_tokens_seen": 33799264, "step": 37810 }, { "epoch": 9.980335225023095, "grad_norm": 0.0702073723077774, "learning_rate": 3.675555868871916e-07, "loss": 0.03, "num_input_tokens_seen": 33803776, "step": 37815 }, { "epoch": 9.981655008578594, "grad_norm": 0.0674089640378952, "learning_rate": 3.658802114066162e-07, "loss": 0.039, "num_input_tokens_seen": 33808288, "step": 37820 }, { "epoch": 9.98297479213409, "grad_norm": 0.2751138210296631, "learning_rate": 3.6420863481688437e-07, "loss": 0.0402, "num_input_tokens_seen": 33812832, "step": 37825 }, { "epoch": 9.984294575689587, "grad_norm": 0.11854992806911469, "learning_rate": 3.625408573757705e-07, "loss": 0.061, "num_input_tokens_seen": 33817120, "step": 37830 }, { "epoch": 9.985614359245083, "grad_norm": 0.07495156675577164, "learning_rate": 3.608768793404743e-07, "loss": 0.0474, "num_input_tokens_seen": 33821504, "step": 37835 }, { "epoch": 9.98693414280058, "grad_norm": 0.2974012494087219, "learning_rate": 3.592167009675934e-07, "loss": 0.0942, "num_input_tokens_seen": 33825664, "step": 37840 }, { "epoch": 9.988253926356078, "grad_norm": 0.3808560073375702, "learning_rate": 3.575603225131563e-07, "loss": 0.0662, "num_input_tokens_seen": 33830048, "step": 37845 }, { "epoch": 9.989573709911575, "grad_norm": 0.1966271549463272, "learning_rate": 3.55907744232592e-07, "loss": 0.0554, "num_input_tokens_seen": 33834624, "step": 37850 }, { "epoch": 9.990893493467071, "grad_norm": 0.09760642051696777, "learning_rate": 3.5425896638075217e-07, "loss": 0.0304, "num_input_tokens_seen": 33838816, "step": 37855 }, { "epoch": 9.992213277022568, "grad_norm": 0.15175718069076538, "learning_rate": 3.5261398921189736e-07, "loss": 0.0711, "num_input_tokens_seen": 33843328, "step": 37860 }, { "epoch": 9.993533060578065, "grad_norm": 0.3343449831008911, "learning_rate": 3.509728129797024e-07, "loss": 0.0624, "num_input_tokens_seen": 33847680, "step": 37865 }, { "epoch": 9.994852844133563, "grad_norm": 0.0773860514163971, "learning_rate": 3.4933543793725656e-07, "loss": 0.0792, "num_input_tokens_seen": 33851968, "step": 37870 }, { "epoch": 9.99617262768906, "grad_norm": 0.2404259592294693, "learning_rate": 3.4770186433707163e-07, "loss": 0.0504, "num_input_tokens_seen": 33856576, "step": 37875 }, { "epoch": 9.997492411244556, "grad_norm": 0.194298654794693, "learning_rate": 3.4607209243105453e-07, "loss": 0.0362, "num_input_tokens_seen": 33861120, "step": 37880 }, { "epoch": 9.998812194800053, "grad_norm": 0.10631931573152542, "learning_rate": 3.444461224705431e-07, "loss": 0.0388, "num_input_tokens_seen": 33865824, "step": 37885 }, { "epoch": 10.0, "grad_norm": 0.12102352827787399, "learning_rate": 3.4282395470628116e-07, "loss": 0.0448, "num_input_tokens_seen": 33869824, "step": 37890 }, { "epoch": 10.001319783555497, "grad_norm": 0.3837428390979767, "learning_rate": 3.4120558938842417e-07, "loss": 0.0978, "num_input_tokens_seen": 33873856, "step": 37895 }, { "epoch": 10.002639567110993, "grad_norm": 0.12204327434301376, "learning_rate": 3.395910267665503e-07, "loss": 0.044, "num_input_tokens_seen": 33878528, "step": 37900 }, { "epoch": 10.003959350666491, "grad_norm": 0.19019757211208344, "learning_rate": 3.3798026708964094e-07, "loss": 0.0866, "num_input_tokens_seen": 33882944, "step": 37905 }, { "epoch": 10.005279134221988, "grad_norm": 0.15790686011314392, "learning_rate": 3.3637331060609456e-07, "loss": 0.0405, "num_input_tokens_seen": 33887360, "step": 37910 }, { "epoch": 10.006598917777485, "grad_norm": 0.03515373170375824, "learning_rate": 3.3477015756372966e-07, "loss": 0.0534, "num_input_tokens_seen": 33891840, "step": 37915 }, { "epoch": 10.007918701332981, "grad_norm": 0.08442790806293488, "learning_rate": 3.3317080820976785e-07, "loss": 0.0231, "num_input_tokens_seen": 33896384, "step": 37920 }, { "epoch": 10.009238484888478, "grad_norm": 0.1246090680360794, "learning_rate": 3.315752627908508e-07, "loss": 0.0657, "num_input_tokens_seen": 33900992, "step": 37925 }, { "epoch": 10.010558268443976, "grad_norm": 0.13627082109451294, "learning_rate": 3.299835215530317e-07, "loss": 0.0228, "num_input_tokens_seen": 33905152, "step": 37930 }, { "epoch": 10.011878051999473, "grad_norm": 0.4496801197528839, "learning_rate": 3.2839558474177245e-07, "loss": 0.0903, "num_input_tokens_seen": 33909600, "step": 37935 }, { "epoch": 10.01319783555497, "grad_norm": 0.08077823370695114, "learning_rate": 3.2681145260196056e-07, "loss": 0.0537, "num_input_tokens_seen": 33913792, "step": 37940 }, { "epoch": 10.014517619110466, "grad_norm": 0.3165336847305298, "learning_rate": 3.252311253778839e-07, "loss": 0.0262, "num_input_tokens_seen": 33918336, "step": 37945 }, { "epoch": 10.015837402665962, "grad_norm": 0.28487715125083923, "learning_rate": 3.2365460331325034e-07, "loss": 0.0646, "num_input_tokens_seen": 33922528, "step": 37950 }, { "epoch": 10.01715718622146, "grad_norm": 0.2868373990058899, "learning_rate": 3.2208188665117934e-07, "loss": 0.0934, "num_input_tokens_seen": 33927168, "step": 37955 }, { "epoch": 10.018476969776957, "grad_norm": 0.21680396795272827, "learning_rate": 3.205129756342018e-07, "loss": 0.0518, "num_input_tokens_seen": 33931744, "step": 37960 }, { "epoch": 10.019796753332454, "grad_norm": 0.11099917441606522, "learning_rate": 3.189478705042659e-07, "loss": 0.0818, "num_input_tokens_seen": 33936256, "step": 37965 }, { "epoch": 10.02111653688795, "grad_norm": 0.222688689827919, "learning_rate": 3.173865715027341e-07, "loss": 0.0909, "num_input_tokens_seen": 33940576, "step": 37970 }, { "epoch": 10.022436320443447, "grad_norm": 0.053455181419849396, "learning_rate": 3.158290788703694e-07, "loss": 0.0476, "num_input_tokens_seen": 33944896, "step": 37975 }, { "epoch": 10.023756103998943, "grad_norm": 0.0686720609664917, "learning_rate": 3.1427539284736297e-07, "loss": 0.042, "num_input_tokens_seen": 33949472, "step": 37980 }, { "epoch": 10.025075887554442, "grad_norm": 0.376705139875412, "learning_rate": 3.127255136733093e-07, "loss": 0.0444, "num_input_tokens_seen": 33954080, "step": 37985 }, { "epoch": 10.026395671109938, "grad_norm": 0.05477975681424141, "learning_rate": 3.1117944158722544e-07, "loss": 0.1289, "num_input_tokens_seen": 33958336, "step": 37990 }, { "epoch": 10.027715454665435, "grad_norm": 0.052923668175935745, "learning_rate": 3.0963717682752635e-07, "loss": 0.0183, "num_input_tokens_seen": 33962976, "step": 37995 }, { "epoch": 10.029035238220931, "grad_norm": 0.26281264424324036, "learning_rate": 3.080987196320578e-07, "loss": 0.0793, "num_input_tokens_seen": 33967520, "step": 38000 }, { "epoch": 10.029035238220931, "eval_loss": 0.06681524962186813, "eval_runtime": 64.731, "eval_samples_per_second": 104.046, "eval_steps_per_second": 26.015, "num_input_tokens_seen": 33967520, "step": 38000 }, { "epoch": 10.030355021776428, "grad_norm": 0.08204266428947449, "learning_rate": 3.065640702380607e-07, "loss": 0.089, "num_input_tokens_seen": 33971872, "step": 38005 }, { "epoch": 10.031674805331926, "grad_norm": 0.30541324615478516, "learning_rate": 3.050332288822011e-07, "loss": 0.0634, "num_input_tokens_seen": 33976320, "step": 38010 }, { "epoch": 10.032994588887423, "grad_norm": 0.36135199666023254, "learning_rate": 3.035061958005542e-07, "loss": 0.0542, "num_input_tokens_seen": 33980672, "step": 38015 }, { "epoch": 10.03431437244292, "grad_norm": 0.22293494641780853, "learning_rate": 3.019829712286093e-07, "loss": 0.1233, "num_input_tokens_seen": 33985152, "step": 38020 }, { "epoch": 10.035634155998416, "grad_norm": 0.11124400049448013, "learning_rate": 3.004635554012647e-07, "loss": 0.0371, "num_input_tokens_seen": 33989536, "step": 38025 }, { "epoch": 10.036953939553912, "grad_norm": 0.11182738095521927, "learning_rate": 2.9894794855283017e-07, "loss": 0.0667, "num_input_tokens_seen": 33993856, "step": 38030 }, { "epoch": 10.03827372310941, "grad_norm": 0.02259814366698265, "learning_rate": 2.9743615091703816e-07, "loss": 0.0361, "num_input_tokens_seen": 33998336, "step": 38035 }, { "epoch": 10.039593506664907, "grad_norm": 0.05230380967259407, "learning_rate": 2.959281627270216e-07, "loss": 0.0983, "num_input_tokens_seen": 34003136, "step": 38040 }, { "epoch": 10.040913290220404, "grad_norm": 0.14785458147525787, "learning_rate": 2.944239842153362e-07, "loss": 0.0775, "num_input_tokens_seen": 34007616, "step": 38045 }, { "epoch": 10.0422330737759, "grad_norm": 0.06608415395021439, "learning_rate": 2.929236156139381e-07, "loss": 0.0348, "num_input_tokens_seen": 34012320, "step": 38050 }, { "epoch": 10.043552857331397, "grad_norm": 0.25637203454971313, "learning_rate": 2.9142705715420883e-07, "loss": 0.0391, "num_input_tokens_seen": 34016736, "step": 38055 }, { "epoch": 10.044872640886895, "grad_norm": 0.12935493886470795, "learning_rate": 2.8993430906693595e-07, "loss": 0.0995, "num_input_tokens_seen": 34021312, "step": 38060 }, { "epoch": 10.046192424442392, "grad_norm": 0.3180655837059021, "learning_rate": 2.88445371582316e-07, "loss": 0.0555, "num_input_tokens_seen": 34025792, "step": 38065 }, { "epoch": 10.047512207997888, "grad_norm": 0.17793750762939453, "learning_rate": 2.8696024492996796e-07, "loss": 0.0722, "num_input_tokens_seen": 34029920, "step": 38070 }, { "epoch": 10.048831991553385, "grad_norm": 0.14906224608421326, "learning_rate": 2.854789293389115e-07, "loss": 0.0334, "num_input_tokens_seen": 34034368, "step": 38075 }, { "epoch": 10.050151775108882, "grad_norm": 0.2949332594871521, "learning_rate": 2.8400142503758606e-07, "loss": 0.0561, "num_input_tokens_seen": 34038784, "step": 38080 }, { "epoch": 10.05147155866438, "grad_norm": 0.5170860886573792, "learning_rate": 2.8252773225384276e-07, "loss": 0.0939, "num_input_tokens_seen": 34043040, "step": 38085 }, { "epoch": 10.052791342219876, "grad_norm": 0.20082075893878937, "learning_rate": 2.8105785121494143e-07, "loss": 0.0579, "num_input_tokens_seen": 34047840, "step": 38090 }, { "epoch": 10.054111125775373, "grad_norm": 0.3481748104095459, "learning_rate": 2.795917821475563e-07, "loss": 0.0858, "num_input_tokens_seen": 34052480, "step": 38095 }, { "epoch": 10.05543090933087, "grad_norm": 0.2764076292514801, "learning_rate": 2.78129525277776e-07, "loss": 0.1093, "num_input_tokens_seen": 34056800, "step": 38100 }, { "epoch": 10.056750692886366, "grad_norm": 0.23656082153320312, "learning_rate": 2.766710808310952e-07, "loss": 0.0506, "num_input_tokens_seen": 34060896, "step": 38105 }, { "epoch": 10.058070476441863, "grad_norm": 0.28911080956459045, "learning_rate": 2.7521644903242827e-07, "loss": 0.0785, "num_input_tokens_seen": 34065184, "step": 38110 }, { "epoch": 10.059390259997361, "grad_norm": 0.03436349332332611, "learning_rate": 2.7376563010609593e-07, "loss": 0.1204, "num_input_tokens_seen": 34069536, "step": 38115 }, { "epoch": 10.060710043552858, "grad_norm": 0.2678040564060211, "learning_rate": 2.72318624275833e-07, "loss": 0.0605, "num_input_tokens_seen": 34074016, "step": 38120 }, { "epoch": 10.062029827108354, "grad_norm": 0.08466766774654388, "learning_rate": 2.7087543176478324e-07, "loss": 0.0767, "num_input_tokens_seen": 34078560, "step": 38125 }, { "epoch": 10.06334961066385, "grad_norm": 0.1350928246974945, "learning_rate": 2.694360527955103e-07, "loss": 0.0401, "num_input_tokens_seen": 34083168, "step": 38130 }, { "epoch": 10.064669394219347, "grad_norm": 0.22209849953651428, "learning_rate": 2.680004875899811e-07, "loss": 0.057, "num_input_tokens_seen": 34087296, "step": 38135 }, { "epoch": 10.065989177774846, "grad_norm": 0.3528316020965576, "learning_rate": 2.665687363695768e-07, "loss": 0.06, "num_input_tokens_seen": 34092032, "step": 38140 }, { "epoch": 10.067308961330342, "grad_norm": 0.23281915485858917, "learning_rate": 2.6514079935509584e-07, "loss": 0.0737, "num_input_tokens_seen": 34096608, "step": 38145 }, { "epoch": 10.068628744885839, "grad_norm": 0.5810974836349487, "learning_rate": 2.6371667676673983e-07, "loss": 0.0994, "num_input_tokens_seen": 34101440, "step": 38150 }, { "epoch": 10.069948528441335, "grad_norm": 0.02259673923254013, "learning_rate": 2.6229636882412755e-07, "loss": 0.0402, "num_input_tokens_seen": 34105792, "step": 38155 }, { "epoch": 10.071268311996832, "grad_norm": 0.24426713585853577, "learning_rate": 2.6087987574628935e-07, "loss": 0.0732, "num_input_tokens_seen": 34110112, "step": 38160 }, { "epoch": 10.07258809555233, "grad_norm": 0.14291857182979584, "learning_rate": 2.5946719775166437e-07, "loss": 0.0718, "num_input_tokens_seen": 34114560, "step": 38165 }, { "epoch": 10.073907879107827, "grad_norm": 0.31020388007164, "learning_rate": 2.5805833505810616e-07, "loss": 0.0842, "num_input_tokens_seen": 34119072, "step": 38170 }, { "epoch": 10.075227662663323, "grad_norm": 0.1958678811788559, "learning_rate": 2.566532878828798e-07, "loss": 0.0877, "num_input_tokens_seen": 34123552, "step": 38175 }, { "epoch": 10.07654744621882, "grad_norm": 0.19787365198135376, "learning_rate": 2.552520564426619e-07, "loss": 0.0434, "num_input_tokens_seen": 34127840, "step": 38180 }, { "epoch": 10.077867229774316, "grad_norm": 0.09934801608324051, "learning_rate": 2.5385464095353803e-07, "loss": 0.0294, "num_input_tokens_seen": 34132128, "step": 38185 }, { "epoch": 10.079187013329815, "grad_norm": 0.06508375704288483, "learning_rate": 2.5246104163100804e-07, "loss": 0.0673, "num_input_tokens_seen": 34136576, "step": 38190 }, { "epoch": 10.080506796885311, "grad_norm": 0.16275034844875336, "learning_rate": 2.510712586899833e-07, "loss": 0.0479, "num_input_tokens_seen": 34140960, "step": 38195 }, { "epoch": 10.081826580440808, "grad_norm": 0.15083876252174377, "learning_rate": 2.4968529234478124e-07, "loss": 0.0775, "num_input_tokens_seen": 34145280, "step": 38200 }, { "epoch": 10.081826580440808, "eval_loss": 0.06662769615650177, "eval_runtime": 64.7578, "eval_samples_per_second": 104.003, "eval_steps_per_second": 26.005, "num_input_tokens_seen": 34145280, "step": 38200 }, { "epoch": 10.083146363996304, "grad_norm": 0.35546180605888367, "learning_rate": 2.483031428091448e-07, "loss": 0.0511, "num_input_tokens_seen": 34149632, "step": 38205 }, { "epoch": 10.084466147551801, "grad_norm": 0.2029927521944046, "learning_rate": 2.469248102962091e-07, "loss": 0.0465, "num_input_tokens_seen": 34154528, "step": 38210 }, { "epoch": 10.0857859311073, "grad_norm": 0.16105994582176208, "learning_rate": 2.4555029501853455e-07, "loss": 0.0907, "num_input_tokens_seen": 34158752, "step": 38215 }, { "epoch": 10.087105714662796, "grad_norm": 0.04617764428257942, "learning_rate": 2.441795971880906e-07, "loss": 0.0571, "num_input_tokens_seen": 34163584, "step": 38220 }, { "epoch": 10.088425498218292, "grad_norm": 0.17850059270858765, "learning_rate": 2.4281271701625255e-07, "loss": 0.0541, "num_input_tokens_seen": 34168000, "step": 38225 }, { "epoch": 10.089745281773789, "grad_norm": 0.37063658237457275, "learning_rate": 2.4144965471381007e-07, "loss": 0.0735, "num_input_tokens_seen": 34172512, "step": 38230 }, { "epoch": 10.091065065329285, "grad_norm": 0.477384090423584, "learning_rate": 2.400904104909674e-07, "loss": 0.0717, "num_input_tokens_seen": 34177280, "step": 38235 }, { "epoch": 10.092384848884782, "grad_norm": 0.09601424634456635, "learning_rate": 2.3873498455733725e-07, "loss": 0.0263, "num_input_tokens_seen": 34181888, "step": 38240 }, { "epoch": 10.09370463244028, "grad_norm": 0.09946175664663315, "learning_rate": 2.3738337712194137e-07, "loss": 0.0233, "num_input_tokens_seen": 34186144, "step": 38245 }, { "epoch": 10.095024415995777, "grad_norm": 0.18595488369464874, "learning_rate": 2.3603558839321305e-07, "loss": 0.046, "num_input_tokens_seen": 34190688, "step": 38250 }, { "epoch": 10.096344199551273, "grad_norm": 0.0821559727191925, "learning_rate": 2.3469161857900267e-07, "loss": 0.0178, "num_input_tokens_seen": 34195200, "step": 38255 }, { "epoch": 10.09766398310677, "grad_norm": 0.1861674189567566, "learning_rate": 2.3335146788656393e-07, "loss": 0.0685, "num_input_tokens_seen": 34199840, "step": 38260 }, { "epoch": 10.098983766662267, "grad_norm": 0.0501311793923378, "learning_rate": 2.3201513652256757e-07, "loss": 0.0383, "num_input_tokens_seen": 34204064, "step": 38265 }, { "epoch": 10.100303550217765, "grad_norm": 0.12444113194942474, "learning_rate": 2.3068262469308766e-07, "loss": 0.0694, "num_input_tokens_seen": 34208256, "step": 38270 }, { "epoch": 10.101623333773261, "grad_norm": 0.09051330387592316, "learning_rate": 2.2935393260362093e-07, "loss": 0.0419, "num_input_tokens_seen": 34212448, "step": 38275 }, { "epoch": 10.102943117328758, "grad_norm": 0.17154602706432343, "learning_rate": 2.2802906045906458e-07, "loss": 0.0908, "num_input_tokens_seen": 34216800, "step": 38280 }, { "epoch": 10.104262900884255, "grad_norm": 0.03760075941681862, "learning_rate": 2.2670800846373018e-07, "loss": 0.0664, "num_input_tokens_seen": 34221216, "step": 38285 }, { "epoch": 10.105582684439751, "grad_norm": 0.23714080452919006, "learning_rate": 2.2539077682134367e-07, "loss": 0.0582, "num_input_tokens_seen": 34225888, "step": 38290 }, { "epoch": 10.10690246799525, "grad_norm": 0.4629157483577728, "learning_rate": 2.2407736573503423e-07, "loss": 0.0697, "num_input_tokens_seen": 34230496, "step": 38295 }, { "epoch": 10.108222251550746, "grad_norm": 0.048339519649744034, "learning_rate": 2.2276777540735093e-07, "loss": 0.0518, "num_input_tokens_seen": 34235008, "step": 38300 }, { "epoch": 10.109542035106243, "grad_norm": 0.08449076116085052, "learning_rate": 2.2146200604024613e-07, "loss": 0.0222, "num_input_tokens_seen": 34239680, "step": 38305 }, { "epoch": 10.11086181866174, "grad_norm": 0.3369671404361725, "learning_rate": 2.2016005783508375e-07, "loss": 0.1095, "num_input_tokens_seen": 34244096, "step": 38310 }, { "epoch": 10.112181602217236, "grad_norm": 0.03911479189991951, "learning_rate": 2.1886193099264763e-07, "loss": 0.0919, "num_input_tokens_seen": 34248384, "step": 38315 }, { "epoch": 10.113501385772734, "grad_norm": 0.239566832780838, "learning_rate": 2.175676257131165e-07, "loss": 0.0785, "num_input_tokens_seen": 34252512, "step": 38320 }, { "epoch": 10.11482116932823, "grad_norm": 0.10681136697530746, "learning_rate": 2.162771421960974e-07, "loss": 0.1113, "num_input_tokens_seen": 34257088, "step": 38325 }, { "epoch": 10.116140952883727, "grad_norm": 0.5920694470405579, "learning_rate": 2.1499048064059224e-07, "loss": 0.0837, "num_input_tokens_seen": 34261248, "step": 38330 }, { "epoch": 10.117460736439224, "grad_norm": 0.37838441133499146, "learning_rate": 2.1370764124502285e-07, "loss": 0.0439, "num_input_tokens_seen": 34265568, "step": 38335 }, { "epoch": 10.11878051999472, "grad_norm": 0.11948937922716141, "learning_rate": 2.1242862420721988e-07, "loss": 0.0379, "num_input_tokens_seen": 34269920, "step": 38340 }, { "epoch": 10.120100303550219, "grad_norm": 0.218428835272789, "learning_rate": 2.1115342972442276e-07, "loss": 0.0619, "num_input_tokens_seen": 34274432, "step": 38345 }, { "epoch": 10.121420087105715, "grad_norm": 0.17791852355003357, "learning_rate": 2.0988205799328252e-07, "loss": 0.0434, "num_input_tokens_seen": 34278688, "step": 38350 }, { "epoch": 10.122739870661212, "grad_norm": 0.16509293019771576, "learning_rate": 2.0861450920986182e-07, "loss": 0.0375, "num_input_tokens_seen": 34283200, "step": 38355 }, { "epoch": 10.124059654216708, "grad_norm": 0.1823548674583435, "learning_rate": 2.07350783569632e-07, "loss": 0.1131, "num_input_tokens_seen": 34288160, "step": 38360 }, { "epoch": 10.125379437772205, "grad_norm": 0.07168129831552505, "learning_rate": 2.060908812674761e-07, "loss": 0.0366, "num_input_tokens_seen": 34292640, "step": 38365 }, { "epoch": 10.126699221327701, "grad_norm": 0.26080265641212463, "learning_rate": 2.0483480249768317e-07, "loss": 0.0714, "num_input_tokens_seen": 34297088, "step": 38370 }, { "epoch": 10.1280190048832, "grad_norm": 0.4193274676799774, "learning_rate": 2.035825474539621e-07, "loss": 0.0781, "num_input_tokens_seen": 34301600, "step": 38375 }, { "epoch": 10.129338788438696, "grad_norm": 0.27161845564842224, "learning_rate": 2.0233411632942235e-07, "loss": 0.0241, "num_input_tokens_seen": 34306240, "step": 38380 }, { "epoch": 10.130658571994193, "grad_norm": 0.1561221033334732, "learning_rate": 2.0108950931658764e-07, "loss": 0.0325, "num_input_tokens_seen": 34310880, "step": 38385 }, { "epoch": 10.13197835554969, "grad_norm": 0.2053440660238266, "learning_rate": 1.998487266073934e-07, "loss": 0.0852, "num_input_tokens_seen": 34315392, "step": 38390 }, { "epoch": 10.133298139105186, "grad_norm": 0.17807281017303467, "learning_rate": 1.986117683931865e-07, "loss": 0.0448, "num_input_tokens_seen": 34319968, "step": 38395 }, { "epoch": 10.134617922660684, "grad_norm": 0.2973754405975342, "learning_rate": 1.9737863486471442e-07, "loss": 0.0978, "num_input_tokens_seen": 34324448, "step": 38400 }, { "epoch": 10.134617922660684, "eval_loss": 0.06676899641752243, "eval_runtime": 64.7742, "eval_samples_per_second": 103.977, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 34324448, "step": 38400 }, { "epoch": 10.13593770621618, "grad_norm": 0.17091618478298187, "learning_rate": 1.9614932621215e-07, "loss": 0.0356, "num_input_tokens_seen": 34329152, "step": 38405 }, { "epoch": 10.137257489771677, "grad_norm": 0.16999801993370056, "learning_rate": 1.9492384262506102e-07, "loss": 0.0732, "num_input_tokens_seen": 34333408, "step": 38410 }, { "epoch": 10.138577273327174, "grad_norm": 0.1775590479373932, "learning_rate": 1.9370218429243524e-07, "loss": 0.0502, "num_input_tokens_seen": 34337760, "step": 38415 }, { "epoch": 10.13989705688267, "grad_norm": 0.4389783442020416, "learning_rate": 1.9248435140267197e-07, "loss": 0.0614, "num_input_tokens_seen": 34342272, "step": 38420 }, { "epoch": 10.141216840438169, "grad_norm": 0.14308783411979675, "learning_rate": 1.9127034414356814e-07, "loss": 0.043, "num_input_tokens_seen": 34346464, "step": 38425 }, { "epoch": 10.142536623993665, "grad_norm": 0.3823065161705017, "learning_rate": 1.9006016270234627e-07, "loss": 0.0734, "num_input_tokens_seen": 34350880, "step": 38430 }, { "epoch": 10.143856407549162, "grad_norm": 0.18779703974723816, "learning_rate": 1.888538072656293e-07, "loss": 0.0375, "num_input_tokens_seen": 34355456, "step": 38435 }, { "epoch": 10.145176191104659, "grad_norm": 0.10997649282217026, "learning_rate": 1.8765127801944893e-07, "loss": 0.0297, "num_input_tokens_seen": 34359968, "step": 38440 }, { "epoch": 10.146495974660155, "grad_norm": 0.07946807891130447, "learning_rate": 1.8645257514925406e-07, "loss": 0.1066, "num_input_tokens_seen": 34364352, "step": 38445 }, { "epoch": 10.147815758215653, "grad_norm": 0.2856985628604889, "learning_rate": 1.8525769883989685e-07, "loss": 0.0673, "num_input_tokens_seen": 34368608, "step": 38450 }, { "epoch": 10.14913554177115, "grad_norm": 0.3269865810871124, "learning_rate": 1.8406664927564654e-07, "loss": 0.0866, "num_input_tokens_seen": 34373504, "step": 38455 }, { "epoch": 10.150455325326647, "grad_norm": 0.24784094095230103, "learning_rate": 1.8287942664017566e-07, "loss": 0.0598, "num_input_tokens_seen": 34378080, "step": 38460 }, { "epoch": 10.151775108882143, "grad_norm": 0.13810604810714722, "learning_rate": 1.8169603111656552e-07, "loss": 0.0553, "num_input_tokens_seen": 34382784, "step": 38465 }, { "epoch": 10.15309489243764, "grad_norm": 0.2855387032032013, "learning_rate": 1.805164628873146e-07, "loss": 0.059, "num_input_tokens_seen": 34387424, "step": 38470 }, { "epoch": 10.154414675993138, "grad_norm": 0.26491403579711914, "learning_rate": 1.793407221343274e-07, "loss": 0.0338, "num_input_tokens_seen": 34391840, "step": 38475 }, { "epoch": 10.155734459548635, "grad_norm": 0.4147125482559204, "learning_rate": 1.781688090389172e-07, "loss": 0.036, "num_input_tokens_seen": 34396320, "step": 38480 }, { "epoch": 10.157054243104131, "grad_norm": 0.5289273262023926, "learning_rate": 1.770007237818061e-07, "loss": 0.1146, "num_input_tokens_seen": 34400640, "step": 38485 }, { "epoch": 10.158374026659628, "grad_norm": 0.3145584166049957, "learning_rate": 1.7583646654313059e-07, "loss": 0.0827, "num_input_tokens_seen": 34405280, "step": 38490 }, { "epoch": 10.159693810215124, "grad_norm": 0.111534982919693, "learning_rate": 1.7467603750242757e-07, "loss": 0.0547, "num_input_tokens_seen": 34409696, "step": 38495 }, { "epoch": 10.16101359377062, "grad_norm": 0.3558240532875061, "learning_rate": 1.7351943683865944e-07, "loss": 0.0423, "num_input_tokens_seen": 34414496, "step": 38500 }, { "epoch": 10.162333377326119, "grad_norm": 0.45740658044815063, "learning_rate": 1.723666647301808e-07, "loss": 0.0615, "num_input_tokens_seen": 34419008, "step": 38505 }, { "epoch": 10.163653160881616, "grad_norm": 0.0971350222826004, "learning_rate": 1.712177213547661e-07, "loss": 0.0435, "num_input_tokens_seen": 34423264, "step": 38510 }, { "epoch": 10.164972944437112, "grad_norm": 0.23266413807868958, "learning_rate": 1.7007260688959581e-07, "loss": 0.054, "num_input_tokens_seen": 34427872, "step": 38515 }, { "epoch": 10.166292727992609, "grad_norm": 0.12821246683597565, "learning_rate": 1.68931321511262e-07, "loss": 0.0337, "num_input_tokens_seen": 34432416, "step": 38520 }, { "epoch": 10.167612511548105, "grad_norm": 0.058961283415555954, "learning_rate": 1.6779386539576835e-07, "loss": 0.0582, "num_input_tokens_seen": 34436736, "step": 38525 }, { "epoch": 10.168932295103604, "grad_norm": 0.09676984697580338, "learning_rate": 1.666602387185162e-07, "loss": 0.043, "num_input_tokens_seen": 34441152, "step": 38530 }, { "epoch": 10.1702520786591, "grad_norm": 0.24321171641349792, "learning_rate": 1.655304416543352e-07, "loss": 0.0809, "num_input_tokens_seen": 34445472, "step": 38535 }, { "epoch": 10.171571862214597, "grad_norm": 0.3261115550994873, "learning_rate": 1.6440447437744698e-07, "loss": 0.0604, "num_input_tokens_seen": 34450016, "step": 38540 }, { "epoch": 10.172891645770093, "grad_norm": 0.3651960790157318, "learning_rate": 1.6328233706149332e-07, "loss": 0.0634, "num_input_tokens_seen": 34454208, "step": 38545 }, { "epoch": 10.17421142932559, "grad_norm": 0.48833152651786804, "learning_rate": 1.6216402987951906e-07, "loss": 0.0677, "num_input_tokens_seen": 34458784, "step": 38550 }, { "epoch": 10.175531212881088, "grad_norm": 0.2033245712518692, "learning_rate": 1.6104955300398627e-07, "loss": 0.0611, "num_input_tokens_seen": 34463360, "step": 38555 }, { "epoch": 10.176850996436585, "grad_norm": 0.22157514095306396, "learning_rate": 1.5993890660675748e-07, "loss": 0.0627, "num_input_tokens_seen": 34467872, "step": 38560 }, { "epoch": 10.178170779992081, "grad_norm": 0.4708595275878906, "learning_rate": 1.5883209085910678e-07, "loss": 0.0811, "num_input_tokens_seen": 34472672, "step": 38565 }, { "epoch": 10.179490563547578, "grad_norm": 0.20167233049869537, "learning_rate": 1.5772910593172264e-07, "loss": 0.0262, "num_input_tokens_seen": 34476896, "step": 38570 }, { "epoch": 10.180810347103074, "grad_norm": 0.10514634102582932, "learning_rate": 1.5662995199469954e-07, "loss": 0.0439, "num_input_tokens_seen": 34481472, "step": 38575 }, { "epoch": 10.182130130658573, "grad_norm": 0.3564639091491699, "learning_rate": 1.5553462921753802e-07, "loss": 0.0708, "num_input_tokens_seen": 34485920, "step": 38580 }, { "epoch": 10.18344991421407, "grad_norm": 0.1412670612335205, "learning_rate": 1.544431377691502e-07, "loss": 0.0848, "num_input_tokens_seen": 34490144, "step": 38585 }, { "epoch": 10.184769697769566, "grad_norm": 0.1433810144662857, "learning_rate": 1.5335547781785975e-07, "loss": 0.0309, "num_input_tokens_seen": 34494752, "step": 38590 }, { "epoch": 10.186089481325062, "grad_norm": 0.0487697534263134, "learning_rate": 1.5227164953139917e-07, "loss": 0.0231, "num_input_tokens_seen": 34498976, "step": 38595 }, { "epoch": 10.187409264880559, "grad_norm": 0.05024844408035278, "learning_rate": 1.511916530769042e-07, "loss": 0.0125, "num_input_tokens_seen": 34503808, "step": 38600 }, { "epoch": 10.187409264880559, "eval_loss": 0.06674957275390625, "eval_runtime": 64.7698, "eval_samples_per_second": 103.984, "eval_steps_per_second": 26.0, "num_input_tokens_seen": 34503808, "step": 38600 }, { "epoch": 10.188729048436057, "grad_norm": 0.5583072304725647, "learning_rate": 1.5011548862092773e-07, "loss": 0.0985, "num_input_tokens_seen": 34508224, "step": 38605 }, { "epoch": 10.190048831991554, "grad_norm": 0.17431506514549255, "learning_rate": 1.490431563294231e-07, "loss": 0.0278, "num_input_tokens_seen": 34512736, "step": 38610 }, { "epoch": 10.19136861554705, "grad_norm": 0.04053434729576111, "learning_rate": 1.4797465636776365e-07, "loss": 0.0419, "num_input_tokens_seen": 34517152, "step": 38615 }, { "epoch": 10.192688399102547, "grad_norm": 0.34291166067123413, "learning_rate": 1.4690998890072027e-07, "loss": 0.0899, "num_input_tokens_seen": 34521632, "step": 38620 }, { "epoch": 10.194008182658044, "grad_norm": 0.0928865298628807, "learning_rate": 1.4584915409248112e-07, "loss": 0.0161, "num_input_tokens_seen": 34525984, "step": 38625 }, { "epoch": 10.19532796621354, "grad_norm": 0.06816121935844421, "learning_rate": 1.4479215210663754e-07, "loss": 0.023, "num_input_tokens_seen": 34530624, "step": 38630 }, { "epoch": 10.196647749769038, "grad_norm": 0.5474709272384644, "learning_rate": 1.4373898310619528e-07, "loss": 0.0737, "num_input_tokens_seen": 34535232, "step": 38635 }, { "epoch": 10.197967533324535, "grad_norm": 0.2091810554265976, "learning_rate": 1.4268964725356604e-07, "loss": 0.1289, "num_input_tokens_seen": 34539712, "step": 38640 }, { "epoch": 10.199287316880032, "grad_norm": 0.024008432403206825, "learning_rate": 1.4164414471056764e-07, "loss": 0.0339, "num_input_tokens_seen": 34544384, "step": 38645 }, { "epoch": 10.200607100435528, "grad_norm": 0.18757879734039307, "learning_rate": 1.4060247563843497e-07, "loss": 0.0562, "num_input_tokens_seen": 34548928, "step": 38650 }, { "epoch": 10.201926883991025, "grad_norm": 0.2560552954673767, "learning_rate": 1.3956464019780068e-07, "loss": 0.0755, "num_input_tokens_seen": 34553344, "step": 38655 }, { "epoch": 10.203246667546523, "grad_norm": 0.21417447924613953, "learning_rate": 1.385306385487145e-07, "loss": 0.0756, "num_input_tokens_seen": 34557728, "step": 38660 }, { "epoch": 10.20456645110202, "grad_norm": 0.1422772854566574, "learning_rate": 1.3750047085063222e-07, "loss": 0.0658, "num_input_tokens_seen": 34562208, "step": 38665 }, { "epoch": 10.205886234657516, "grad_norm": 0.15437009930610657, "learning_rate": 1.3647413726242119e-07, "loss": 0.0434, "num_input_tokens_seen": 34566592, "step": 38670 }, { "epoch": 10.207206018213013, "grad_norm": 0.03055115044116974, "learning_rate": 1.3545163794235205e-07, "loss": 0.0489, "num_input_tokens_seen": 34571552, "step": 38675 }, { "epoch": 10.20852580176851, "grad_norm": 0.1291097104549408, "learning_rate": 1.3443297304810698e-07, "loss": 0.0555, "num_input_tokens_seen": 34575840, "step": 38680 }, { "epoch": 10.209845585324008, "grad_norm": 0.2938077747821808, "learning_rate": 1.3341814273677977e-07, "loss": 0.0557, "num_input_tokens_seen": 34580512, "step": 38685 }, { "epoch": 10.211165368879504, "grad_norm": 0.542168378829956, "learning_rate": 1.324071471648647e-07, "loss": 0.1073, "num_input_tokens_seen": 34585024, "step": 38690 }, { "epoch": 10.212485152435, "grad_norm": 0.561034619808197, "learning_rate": 1.3139998648827312e-07, "loss": 0.0827, "num_input_tokens_seen": 34589664, "step": 38695 }, { "epoch": 10.213804935990497, "grad_norm": 0.12570880353450775, "learning_rate": 1.3039666086232526e-07, "loss": 0.0225, "num_input_tokens_seen": 34594144, "step": 38700 }, { "epoch": 10.215124719545994, "grad_norm": 0.15776684880256653, "learning_rate": 1.2939717044174183e-07, "loss": 0.0319, "num_input_tokens_seen": 34598592, "step": 38705 }, { "epoch": 10.216444503101492, "grad_norm": 0.04353850707411766, "learning_rate": 1.284015153806578e-07, "loss": 0.0489, "num_input_tokens_seen": 34602752, "step": 38710 }, { "epoch": 10.217764286656989, "grad_norm": 0.5946187376976013, "learning_rate": 1.274096958326171e-07, "loss": 0.0749, "num_input_tokens_seen": 34607584, "step": 38715 }, { "epoch": 10.219084070212485, "grad_norm": 0.04369615763425827, "learning_rate": 1.2642171195056952e-07, "loss": 0.0368, "num_input_tokens_seen": 34612416, "step": 38720 }, { "epoch": 10.220403853767982, "grad_norm": 0.21111132204532623, "learning_rate": 1.2543756388687377e-07, "loss": 0.0795, "num_input_tokens_seen": 34616672, "step": 38725 }, { "epoch": 10.221723637323478, "grad_norm": 0.08242810517549515, "learning_rate": 1.2445725179330014e-07, "loss": 0.0335, "num_input_tokens_seen": 34620736, "step": 38730 }, { "epoch": 10.223043420878977, "grad_norm": 0.10774124413728714, "learning_rate": 1.2348077582102212e-07, "loss": 0.0522, "num_input_tokens_seen": 34625120, "step": 38735 }, { "epoch": 10.224363204434473, "grad_norm": 0.2842857241630554, "learning_rate": 1.2250813612062762e-07, "loss": 0.0874, "num_input_tokens_seen": 34629824, "step": 38740 }, { "epoch": 10.22568298798997, "grad_norm": 0.14024296402931213, "learning_rate": 1.215393328421105e-07, "loss": 0.051, "num_input_tokens_seen": 34634400, "step": 38745 }, { "epoch": 10.227002771545466, "grad_norm": 0.09930836409330368, "learning_rate": 1.2057436613486796e-07, "loss": 0.031, "num_input_tokens_seen": 34638656, "step": 38750 }, { "epoch": 10.228322555100963, "grad_norm": 0.4378281235694885, "learning_rate": 1.1961323614771424e-07, "loss": 0.0953, "num_input_tokens_seen": 34643328, "step": 38755 }, { "epoch": 10.229642338656461, "grad_norm": 0.16187579929828644, "learning_rate": 1.1865594302886418e-07, "loss": 0.0417, "num_input_tokens_seen": 34648000, "step": 38760 }, { "epoch": 10.230962122211958, "grad_norm": 0.3063742518424988, "learning_rate": 1.1770248692594687e-07, "loss": 0.0704, "num_input_tokens_seen": 34652224, "step": 38765 }, { "epoch": 10.232281905767454, "grad_norm": 0.3048715591430664, "learning_rate": 1.167528679859975e-07, "loss": 0.0729, "num_input_tokens_seen": 34656736, "step": 38770 }, { "epoch": 10.233601689322951, "grad_norm": 0.38002267479896545, "learning_rate": 1.1580708635545446e-07, "loss": 0.0546, "num_input_tokens_seen": 34661216, "step": 38775 }, { "epoch": 10.234921472878447, "grad_norm": 0.06072096899151802, "learning_rate": 1.1486514218017885e-07, "loss": 0.025, "num_input_tokens_seen": 34665888, "step": 38780 }, { "epoch": 10.236241256433944, "grad_norm": 0.42840471863746643, "learning_rate": 1.1392703560542117e-07, "loss": 0.0362, "num_input_tokens_seen": 34670400, "step": 38785 }, { "epoch": 10.237561039989442, "grad_norm": 0.06651969254016876, "learning_rate": 1.129927667758518e-07, "loss": 0.0381, "num_input_tokens_seen": 34674784, "step": 38790 }, { "epoch": 10.238880823544939, "grad_norm": 0.082082100212574, "learning_rate": 1.1206233583554992e-07, "loss": 0.0551, "num_input_tokens_seen": 34679328, "step": 38795 }, { "epoch": 10.240200607100435, "grad_norm": 0.03843453899025917, "learning_rate": 1.1113574292799523e-07, "loss": 0.0467, "num_input_tokens_seen": 34683552, "step": 38800 }, { "epoch": 10.240200607100435, "eval_loss": 0.06684853881597519, "eval_runtime": 64.7989, "eval_samples_per_second": 103.937, "eval_steps_per_second": 25.988, "num_input_tokens_seen": 34683552, "step": 38800 }, { "epoch": 10.241520390655932, "grad_norm": 0.10521683096885681, "learning_rate": 1.1021298819608449e-07, "loss": 0.0527, "num_input_tokens_seen": 34687840, "step": 38805 }, { "epoch": 10.242840174211429, "grad_norm": 0.12652160227298737, "learning_rate": 1.0929407178211226e-07, "loss": 0.077, "num_input_tokens_seen": 34692352, "step": 38810 }, { "epoch": 10.244159957766927, "grad_norm": 0.02760947123169899, "learning_rate": 1.0837899382779293e-07, "loss": 0.021, "num_input_tokens_seen": 34697152, "step": 38815 }, { "epoch": 10.245479741322423, "grad_norm": 0.7245965600013733, "learning_rate": 1.0746775447423862e-07, "loss": 0.0916, "num_input_tokens_seen": 34701472, "step": 38820 }, { "epoch": 10.24679952487792, "grad_norm": 0.08462116867303848, "learning_rate": 1.0656035386197583e-07, "loss": 0.0212, "num_input_tokens_seen": 34705888, "step": 38825 }, { "epoch": 10.248119308433417, "grad_norm": 0.12168659269809723, "learning_rate": 1.0565679213093982e-07, "loss": 0.0444, "num_input_tokens_seen": 34710208, "step": 38830 }, { "epoch": 10.249439091988913, "grad_norm": 0.05977329611778259, "learning_rate": 1.0475706942046638e-07, "loss": 0.0383, "num_input_tokens_seen": 34714336, "step": 38835 }, { "epoch": 10.250758875544411, "grad_norm": 0.40691497921943665, "learning_rate": 1.0386118586930282e-07, "loss": 0.0631, "num_input_tokens_seen": 34718816, "step": 38840 }, { "epoch": 10.252078659099908, "grad_norm": 0.4241376519203186, "learning_rate": 1.0296914161561367e-07, "loss": 0.0838, "num_input_tokens_seen": 34723360, "step": 38845 }, { "epoch": 10.253398442655405, "grad_norm": 0.20379526913166046, "learning_rate": 1.0208093679695552e-07, "loss": 0.0441, "num_input_tokens_seen": 34727616, "step": 38850 }, { "epoch": 10.254718226210901, "grad_norm": 0.06051342934370041, "learning_rate": 1.0119657155030493e-07, "loss": 0.0408, "num_input_tokens_seen": 34732032, "step": 38855 }, { "epoch": 10.256038009766398, "grad_norm": 0.13274140655994415, "learning_rate": 1.003160460120417e-07, "loss": 0.0287, "num_input_tokens_seen": 34736512, "step": 38860 }, { "epoch": 10.257357793321896, "grad_norm": 0.13771241903305054, "learning_rate": 9.943936031795165e-08, "loss": 0.0457, "num_input_tokens_seen": 34740960, "step": 38865 }, { "epoch": 10.258677576877393, "grad_norm": 0.3219413757324219, "learning_rate": 9.856651460323219e-08, "loss": 0.0745, "num_input_tokens_seen": 34745376, "step": 38870 }, { "epoch": 10.25999736043289, "grad_norm": 0.02981734462082386, "learning_rate": 9.769750900248953e-08, "loss": 0.0394, "num_input_tokens_seen": 34749600, "step": 38875 }, { "epoch": 10.261317143988386, "grad_norm": 0.18825046718120575, "learning_rate": 9.683234364973038e-08, "loss": 0.0265, "num_input_tokens_seen": 34754176, "step": 38880 }, { "epoch": 10.262636927543882, "grad_norm": 0.18589605391025543, "learning_rate": 9.597101867837854e-08, "loss": 0.0306, "num_input_tokens_seen": 34758528, "step": 38885 }, { "epoch": 10.263956711099379, "grad_norm": 0.48391279578208923, "learning_rate": 9.511353422125835e-08, "loss": 0.1284, "num_input_tokens_seen": 34762944, "step": 38890 }, { "epoch": 10.265276494654877, "grad_norm": 0.1390886753797531, "learning_rate": 9.42598904106029e-08, "loss": 0.0599, "num_input_tokens_seen": 34767232, "step": 38895 }, { "epoch": 10.266596278210374, "grad_norm": 0.282479852437973, "learning_rate": 9.341008737806245e-08, "loss": 0.0337, "num_input_tokens_seen": 34771584, "step": 38900 }, { "epoch": 10.26791606176587, "grad_norm": 0.07049477845430374, "learning_rate": 9.256412525467661e-08, "loss": 0.0635, "num_input_tokens_seen": 34776000, "step": 38905 }, { "epoch": 10.269235845321367, "grad_norm": 0.32042673230171204, "learning_rate": 9.172200417091326e-08, "loss": 0.0411, "num_input_tokens_seen": 34780384, "step": 38910 }, { "epoch": 10.270555628876863, "grad_norm": 0.051038529723882675, "learning_rate": 9.088372425663239e-08, "loss": 0.0569, "num_input_tokens_seen": 34784896, "step": 38915 }, { "epoch": 10.271875412432362, "grad_norm": 0.1753125786781311, "learning_rate": 9.004928564110837e-08, "loss": 0.0439, "num_input_tokens_seen": 34789696, "step": 38920 }, { "epoch": 10.273195195987858, "grad_norm": 0.5092338919639587, "learning_rate": 8.92186884530244e-08, "loss": 0.0841, "num_input_tokens_seen": 34794112, "step": 38925 }, { "epoch": 10.274514979543355, "grad_norm": 0.022948477417230606, "learning_rate": 8.83919328204641e-08, "loss": 0.0281, "num_input_tokens_seen": 34798432, "step": 38930 }, { "epoch": 10.275834763098851, "grad_norm": 0.22265799343585968, "learning_rate": 8.756901887093105e-08, "loss": 0.0816, "num_input_tokens_seen": 34802656, "step": 38935 }, { "epoch": 10.277154546654348, "grad_norm": 0.2962473928928375, "learning_rate": 8.674994673132098e-08, "loss": 0.0667, "num_input_tokens_seen": 34807264, "step": 38940 }, { "epoch": 10.278474330209846, "grad_norm": 0.09947988390922546, "learning_rate": 8.593471652794949e-08, "loss": 0.0867, "num_input_tokens_seen": 34811872, "step": 38945 }, { "epoch": 10.279794113765343, "grad_norm": 0.49483397603034973, "learning_rate": 8.512332838653548e-08, "loss": 0.0949, "num_input_tokens_seen": 34816608, "step": 38950 }, { "epoch": 10.28111389732084, "grad_norm": 0.19445523619651794, "learning_rate": 8.431578243220106e-08, "loss": 0.0491, "num_input_tokens_seen": 34821024, "step": 38955 }, { "epoch": 10.282433680876336, "grad_norm": 0.12636226415634155, "learning_rate": 8.351207878948552e-08, "loss": 0.0232, "num_input_tokens_seen": 34825280, "step": 38960 }, { "epoch": 10.283753464431832, "grad_norm": 0.3059556484222412, "learning_rate": 8.271221758232583e-08, "loss": 0.0819, "num_input_tokens_seen": 34830048, "step": 38965 }, { "epoch": 10.28507324798733, "grad_norm": 0.05188632383942604, "learning_rate": 8.191619893407332e-08, "loss": 0.0406, "num_input_tokens_seen": 34834464, "step": 38970 }, { "epoch": 10.286393031542827, "grad_norm": 0.032690707594156265, "learning_rate": 8.112402296748534e-08, "loss": 0.0224, "num_input_tokens_seen": 34839136, "step": 38975 }, { "epoch": 10.287712815098324, "grad_norm": 0.25292280316352844, "learning_rate": 8.033568980471973e-08, "loss": 0.1075, "num_input_tokens_seen": 34843104, "step": 38980 }, { "epoch": 10.28903259865382, "grad_norm": 0.3211628496646881, "learning_rate": 7.955119956735146e-08, "loss": 0.0731, "num_input_tokens_seen": 34847296, "step": 38985 }, { "epoch": 10.290352382209317, "grad_norm": 0.08910756558179855, "learning_rate": 7.877055237636155e-08, "loss": 0.0591, "num_input_tokens_seen": 34851744, "step": 38990 }, { "epoch": 10.291672165764815, "grad_norm": 0.07351075857877731, "learning_rate": 7.79937483521287e-08, "loss": 0.0296, "num_input_tokens_seen": 34856416, "step": 38995 }, { "epoch": 10.292991949320312, "grad_norm": 0.24101482331752777, "learning_rate": 7.722078761444873e-08, "loss": 0.0458, "num_input_tokens_seen": 34860896, "step": 39000 }, { "epoch": 10.292991949320312, "eval_loss": 0.0668007880449295, "eval_runtime": 64.7625, "eval_samples_per_second": 103.995, "eval_steps_per_second": 26.003, "num_input_tokens_seen": 34860896, "step": 39000 }, { "epoch": 10.294311732875808, "grad_norm": 0.2994737923145294, "learning_rate": 7.645167028252631e-08, "loss": 0.1298, "num_input_tokens_seen": 34865312, "step": 39005 }, { "epoch": 10.295631516431305, "grad_norm": 0.1882815957069397, "learning_rate": 7.568639647496379e-08, "loss": 0.0703, "num_input_tokens_seen": 34869856, "step": 39010 }, { "epoch": 10.296951299986802, "grad_norm": 0.07314976304769516, "learning_rate": 7.492496630977508e-08, "loss": 0.0919, "num_input_tokens_seen": 34874592, "step": 39015 }, { "epoch": 10.2982710835423, "grad_norm": 0.2699569761753082, "learning_rate": 7.416737990438571e-08, "loss": 0.0446, "num_input_tokens_seen": 34879072, "step": 39020 }, { "epoch": 10.299590867097796, "grad_norm": 0.12427210062742233, "learning_rate": 7.341363737562445e-08, "loss": 0.0769, "num_input_tokens_seen": 34883552, "step": 39025 }, { "epoch": 10.300910650653293, "grad_norm": 0.08230319619178772, "learning_rate": 7.266373883972887e-08, "loss": 0.121, "num_input_tokens_seen": 34888032, "step": 39030 }, { "epoch": 10.30223043420879, "grad_norm": 0.40962591767311096, "learning_rate": 7.191768441233981e-08, "loss": 0.0952, "num_input_tokens_seen": 34892576, "step": 39035 }, { "epoch": 10.303550217764286, "grad_norm": 0.1081714779138565, "learning_rate": 7.11754742085069e-08, "loss": 0.076, "num_input_tokens_seen": 34897056, "step": 39040 }, { "epoch": 10.304870001319783, "grad_norm": 0.14868226647377014, "learning_rate": 7.043710834269413e-08, "loss": 0.0335, "num_input_tokens_seen": 34901568, "step": 39045 }, { "epoch": 10.306189784875281, "grad_norm": 0.34856677055358887, "learning_rate": 6.970258692876319e-08, "loss": 0.0931, "num_input_tokens_seen": 34906016, "step": 39050 }, { "epoch": 10.307509568430778, "grad_norm": 0.1221441924571991, "learning_rate": 6.897191007998738e-08, "loss": 0.0569, "num_input_tokens_seen": 34910688, "step": 39055 }, { "epoch": 10.308829351986274, "grad_norm": 0.20173712074756622, "learning_rate": 6.824507790904599e-08, "loss": 0.0799, "num_input_tokens_seen": 34915424, "step": 39060 }, { "epoch": 10.31014913554177, "grad_norm": 0.22462844848632812, "learning_rate": 6.752209052802439e-08, "loss": 0.0891, "num_input_tokens_seen": 34919488, "step": 39065 }, { "epoch": 10.311468919097267, "grad_norm": 0.09147199243307114, "learning_rate": 6.680294804841946e-08, "loss": 0.0543, "num_input_tokens_seen": 34924288, "step": 39070 }, { "epoch": 10.312788702652766, "grad_norm": 0.06847822666168213, "learning_rate": 6.608765058112865e-08, "loss": 0.0138, "num_input_tokens_seen": 34928928, "step": 39075 }, { "epoch": 10.314108486208262, "grad_norm": 0.31936952471733093, "learning_rate": 6.537619823646368e-08, "loss": 0.0521, "num_input_tokens_seen": 34933216, "step": 39080 }, { "epoch": 10.315428269763759, "grad_norm": 0.06189334765076637, "learning_rate": 6.466859112413404e-08, "loss": 0.1031, "num_input_tokens_seen": 34937440, "step": 39085 }, { "epoch": 10.316748053319255, "grad_norm": 0.06286962330341339, "learning_rate": 6.39648293532663e-08, "loss": 0.0366, "num_input_tokens_seen": 34942208, "step": 39090 }, { "epoch": 10.318067836874752, "grad_norm": 0.13972745835781097, "learning_rate": 6.32649130323848e-08, "loss": 0.0666, "num_input_tokens_seen": 34946592, "step": 39095 }, { "epoch": 10.31938762043025, "grad_norm": 0.22997868061065674, "learning_rate": 6.256884226943094e-08, "loss": 0.0647, "num_input_tokens_seen": 34950912, "step": 39100 }, { "epoch": 10.320707403985747, "grad_norm": 0.13109838962554932, "learning_rate": 6.187661717174386e-08, "loss": 0.0696, "num_input_tokens_seen": 34955296, "step": 39105 }, { "epoch": 10.322027187541243, "grad_norm": 0.19327491521835327, "learning_rate": 6.118823784607708e-08, "loss": 0.0688, "num_input_tokens_seen": 34959712, "step": 39110 }, { "epoch": 10.32334697109674, "grad_norm": 0.140794038772583, "learning_rate": 6.050370439858178e-08, "loss": 0.047, "num_input_tokens_seen": 34964000, "step": 39115 }, { "epoch": 10.324666754652236, "grad_norm": 0.32161054015159607, "learning_rate": 5.98230169348235e-08, "loss": 0.0584, "num_input_tokens_seen": 34968352, "step": 39120 }, { "epoch": 10.325986538207735, "grad_norm": 0.14388413727283478, "learning_rate": 5.914617555977664e-08, "loss": 0.0315, "num_input_tokens_seen": 34972640, "step": 39125 }, { "epoch": 10.327306321763231, "grad_norm": 0.026323627680540085, "learning_rate": 5.8473180377816017e-08, "loss": 0.0579, "num_input_tokens_seen": 34977248, "step": 39130 }, { "epoch": 10.328626105318728, "grad_norm": 0.12550987303256989, "learning_rate": 5.780403149272251e-08, "loss": 0.0765, "num_input_tokens_seen": 34981760, "step": 39135 }, { "epoch": 10.329945888874224, "grad_norm": 0.20249582827091217, "learning_rate": 5.7138729007694126e-08, "loss": 0.0367, "num_input_tokens_seen": 34986048, "step": 39140 }, { "epoch": 10.331265672429721, "grad_norm": 0.411763995885849, "learning_rate": 5.64772730253238e-08, "loss": 0.0677, "num_input_tokens_seen": 34990304, "step": 39145 }, { "epoch": 10.33258545598522, "grad_norm": 0.2721242606639862, "learning_rate": 5.5819663647618814e-08, "loss": 0.047, "num_input_tokens_seen": 34994624, "step": 39150 }, { "epoch": 10.333905239540716, "grad_norm": 0.1430835723876953, "learning_rate": 5.5165900975989723e-08, "loss": 0.0421, "num_input_tokens_seen": 34999008, "step": 39155 }, { "epoch": 10.335225023096212, "grad_norm": 0.3239065408706665, "learning_rate": 5.451598511125311e-08, "loss": 0.0896, "num_input_tokens_seen": 35003552, "step": 39160 }, { "epoch": 10.336544806651709, "grad_norm": 0.42054077982902527, "learning_rate": 5.3869916153637124e-08, "loss": 0.0981, "num_input_tokens_seen": 35008064, "step": 39165 }, { "epoch": 10.337864590207205, "grad_norm": 0.43364962935447693, "learning_rate": 5.322769420277318e-08, "loss": 0.1049, "num_input_tokens_seen": 35012672, "step": 39170 }, { "epoch": 10.339184373762702, "grad_norm": 0.3862692713737488, "learning_rate": 5.258931935769873e-08, "loss": 0.1081, "num_input_tokens_seen": 35017088, "step": 39175 }, { "epoch": 10.3405041573182, "grad_norm": 0.42801719903945923, "learning_rate": 5.19547917168628e-08, "loss": 0.092, "num_input_tokens_seen": 35021568, "step": 39180 }, { "epoch": 10.341823940873697, "grad_norm": 0.05797917768359184, "learning_rate": 5.13241113781121e-08, "loss": 0.0503, "num_input_tokens_seen": 35025920, "step": 39185 }, { "epoch": 10.343143724429193, "grad_norm": 0.058744050562381744, "learning_rate": 5.0697278438707755e-08, "loss": 0.0513, "num_input_tokens_seen": 35030240, "step": 39190 }, { "epoch": 10.34446350798469, "grad_norm": 0.037497036159038544, "learning_rate": 5.0074292995316854e-08, "loss": 0.0316, "num_input_tokens_seen": 35034848, "step": 39195 }, { "epoch": 10.345783291540187, "grad_norm": 0.25086164474487305, "learning_rate": 4.945515514400978e-08, "loss": 0.0469, "num_input_tokens_seen": 35039424, "step": 39200 }, { "epoch": 10.345783291540187, "eval_loss": 0.06669439375400543, "eval_runtime": 64.7685, "eval_samples_per_second": 103.986, "eval_steps_per_second": 26.0, "num_input_tokens_seen": 35039424, "step": 39200 }, { "epoch": 10.347103075095685, "grad_norm": 0.356332391500473, "learning_rate": 4.883986498026571e-08, "loss": 0.1522, "num_input_tokens_seen": 35043936, "step": 39205 }, { "epoch": 10.348422858651181, "grad_norm": 0.6044077277183533, "learning_rate": 4.822842259896987e-08, "loss": 0.0977, "num_input_tokens_seen": 35048096, "step": 39210 }, { "epoch": 10.349742642206678, "grad_norm": 0.06820203363895416, "learning_rate": 4.762082809441626e-08, "loss": 0.0104, "num_input_tokens_seen": 35052416, "step": 39215 }, { "epoch": 10.351062425762175, "grad_norm": 0.42420727014541626, "learning_rate": 4.7017081560302156e-08, "loss": 0.0581, "num_input_tokens_seen": 35057056, "step": 39220 }, { "epoch": 10.352382209317671, "grad_norm": 0.19865663349628448, "learning_rate": 4.6417183089730866e-08, "loss": 0.0763, "num_input_tokens_seen": 35061312, "step": 39225 }, { "epoch": 10.35370199287317, "grad_norm": 0.06255370378494263, "learning_rate": 4.5821132775217265e-08, "loss": 0.0201, "num_input_tokens_seen": 35065888, "step": 39230 }, { "epoch": 10.355021776428666, "grad_norm": 0.10420253127813339, "learning_rate": 4.5228930708679504e-08, "loss": 0.0518, "num_input_tokens_seen": 35070624, "step": 39235 }, { "epoch": 10.356341559984163, "grad_norm": 0.18582554161548615, "learning_rate": 4.464057698144175e-08, "loss": 0.044, "num_input_tokens_seen": 35074784, "step": 39240 }, { "epoch": 10.35766134353966, "grad_norm": 0.3424522578716278, "learning_rate": 4.4056071684236974e-08, "loss": 0.0561, "num_input_tokens_seen": 35079296, "step": 39245 }, { "epoch": 10.358981127095156, "grad_norm": 0.2922587990760803, "learning_rate": 4.347541490719864e-08, "loss": 0.0807, "num_input_tokens_seen": 35084032, "step": 39250 }, { "epoch": 10.360300910650654, "grad_norm": 0.03479823097586632, "learning_rate": 4.2898606739877336e-08, "loss": 0.0426, "num_input_tokens_seen": 35088128, "step": 39255 }, { "epoch": 10.36162069420615, "grad_norm": 0.2777692377567291, "learning_rate": 4.232564727122135e-08, "loss": 0.0544, "num_input_tokens_seen": 35092928, "step": 39260 }, { "epoch": 10.362940477761647, "grad_norm": 0.35067906975746155, "learning_rate": 4.1756536589585004e-08, "loss": 0.063, "num_input_tokens_seen": 35097632, "step": 39265 }, { "epoch": 10.364260261317144, "grad_norm": 0.09923063963651657, "learning_rate": 4.119127478273976e-08, "loss": 0.0276, "num_input_tokens_seen": 35102080, "step": 39270 }, { "epoch": 10.36558004487264, "grad_norm": 0.031164998188614845, "learning_rate": 4.062986193784923e-08, "loss": 0.0227, "num_input_tokens_seen": 35106784, "step": 39275 }, { "epoch": 10.366899828428139, "grad_norm": 0.09088560938835144, "learning_rate": 4.007229814149416e-08, "loss": 0.0377, "num_input_tokens_seen": 35110976, "step": 39280 }, { "epoch": 10.368219611983635, "grad_norm": 0.5445138812065125, "learning_rate": 3.951858347965576e-08, "loss": 0.0727, "num_input_tokens_seen": 35115456, "step": 39285 }, { "epoch": 10.369539395539132, "grad_norm": 0.21864992380142212, "learning_rate": 3.896871803772684e-08, "loss": 0.066, "num_input_tokens_seen": 35119520, "step": 39290 }, { "epoch": 10.370859179094628, "grad_norm": 0.06065914407372475, "learning_rate": 3.842270190050068e-08, "loss": 0.0841, "num_input_tokens_seen": 35124224, "step": 39295 }, { "epoch": 10.372178962650125, "grad_norm": 0.01906857080757618, "learning_rate": 3.7880535152179376e-08, "loss": 0.0901, "num_input_tokens_seen": 35128224, "step": 39300 }, { "epoch": 10.373498746205623, "grad_norm": 0.3996250331401825, "learning_rate": 3.734221787637382e-08, "loss": 0.0873, "num_input_tokens_seen": 35132832, "step": 39305 }, { "epoch": 10.37481852976112, "grad_norm": 0.11153167486190796, "learning_rate": 3.680775015609817e-08, "loss": 0.0928, "num_input_tokens_seen": 35137120, "step": 39310 }, { "epoch": 10.376138313316616, "grad_norm": 0.2349633276462555, "learning_rate": 3.627713207377537e-08, "loss": 0.0469, "num_input_tokens_seen": 35141600, "step": 39315 }, { "epoch": 10.377458096872113, "grad_norm": 0.08981253206729889, "learning_rate": 3.575036371123164e-08, "loss": 0.0244, "num_input_tokens_seen": 35146272, "step": 39320 }, { "epoch": 10.37877788042761, "grad_norm": 0.1727363020181656, "learning_rate": 3.5227445149704776e-08, "loss": 0.0836, "num_input_tokens_seen": 35150912, "step": 39325 }, { "epoch": 10.380097663983106, "grad_norm": 0.25248101353645325, "learning_rate": 3.470837646983027e-08, "loss": 0.0477, "num_input_tokens_seen": 35155360, "step": 39330 }, { "epoch": 10.381417447538604, "grad_norm": 0.13595643639564514, "learning_rate": 3.419315775165799e-08, "loss": 0.0436, "num_input_tokens_seen": 35160032, "step": 39335 }, { "epoch": 10.3827372310941, "grad_norm": 0.24405348300933838, "learning_rate": 3.368178907464103e-08, "loss": 0.0472, "num_input_tokens_seen": 35164544, "step": 39340 }, { "epoch": 10.384057014649597, "grad_norm": 0.12438478320837021, "learning_rate": 3.317427051763855e-08, "loss": 0.0488, "num_input_tokens_seen": 35168832, "step": 39345 }, { "epoch": 10.385376798205094, "grad_norm": 0.3066204786300659, "learning_rate": 3.267060215891571e-08, "loss": 0.0826, "num_input_tokens_seen": 35173440, "step": 39350 }, { "epoch": 10.38669658176059, "grad_norm": 0.27836695313453674, "learning_rate": 3.217078407614649e-08, "loss": 0.1218, "num_input_tokens_seen": 35178080, "step": 39355 }, { "epoch": 10.388016365316089, "grad_norm": 0.21126212179660797, "learning_rate": 3.1674816346405345e-08, "loss": 0.0501, "num_input_tokens_seen": 35182496, "step": 39360 }, { "epoch": 10.389336148871585, "grad_norm": 0.2908795177936554, "learning_rate": 3.11826990461811e-08, "loss": 0.0346, "num_input_tokens_seen": 35186720, "step": 39365 }, { "epoch": 10.390655932427082, "grad_norm": 0.08373452723026276, "learning_rate": 3.069443225136304e-08, "loss": 0.0728, "num_input_tokens_seen": 35190976, "step": 39370 }, { "epoch": 10.391975715982579, "grad_norm": 0.20170536637306213, "learning_rate": 3.021001603724372e-08, "loss": 0.0595, "num_input_tokens_seen": 35195360, "step": 39375 }, { "epoch": 10.393295499538075, "grad_norm": 0.40421366691589355, "learning_rate": 2.9729450478532818e-08, "loss": 0.0944, "num_input_tokens_seen": 35199744, "step": 39380 }, { "epoch": 10.394615283093573, "grad_norm": 0.17769184708595276, "learning_rate": 2.9252735649337726e-08, "loss": 0.0431, "num_input_tokens_seen": 35204544, "step": 39385 }, { "epoch": 10.39593506664907, "grad_norm": 0.1792316734790802, "learning_rate": 2.8779871623171863e-08, "loss": 0.0263, "num_input_tokens_seen": 35209152, "step": 39390 }, { "epoch": 10.397254850204567, "grad_norm": 0.6637542843818665, "learning_rate": 2.8310858472957448e-08, "loss": 0.1141, "num_input_tokens_seen": 35213440, "step": 39395 }, { "epoch": 10.398574633760063, "grad_norm": 0.21643762290477753, "learning_rate": 2.784569627101996e-08, "loss": 0.0359, "num_input_tokens_seen": 35217792, "step": 39400 }, { "epoch": 10.398574633760063, "eval_loss": 0.06686842441558838, "eval_runtime": 64.7817, "eval_samples_per_second": 103.964, "eval_steps_per_second": 25.995, "num_input_tokens_seen": 35217792, "step": 39400 }, { "epoch": 10.39989441731556, "grad_norm": 0.060691509395837784, "learning_rate": 2.738438508909924e-08, "loss": 0.0526, "num_input_tokens_seen": 35222272, "step": 39405 }, { "epoch": 10.401214200871058, "grad_norm": 0.08169547468423843, "learning_rate": 2.692692499833005e-08, "loss": 0.03, "num_input_tokens_seen": 35226752, "step": 39410 }, { "epoch": 10.402533984426555, "grad_norm": 0.31232625246047974, "learning_rate": 2.647331606926151e-08, "loss": 0.0442, "num_input_tokens_seen": 35231168, "step": 39415 }, { "epoch": 10.403853767982051, "grad_norm": 0.017157819122076035, "learning_rate": 2.6023558371843225e-08, "loss": 0.0394, "num_input_tokens_seen": 35235712, "step": 39420 }, { "epoch": 10.405173551537548, "grad_norm": 0.06694570183753967, "learning_rate": 2.557765197543638e-08, "loss": 0.0437, "num_input_tokens_seen": 35240416, "step": 39425 }, { "epoch": 10.406493335093044, "grad_norm": 0.43080952763557434, "learning_rate": 2.513559694880263e-08, "loss": 0.0674, "num_input_tokens_seen": 35245184, "step": 39430 }, { "epoch": 10.40781311864854, "grad_norm": 0.09999874979257584, "learning_rate": 2.469739336011523e-08, "loss": 0.0813, "num_input_tokens_seen": 35249280, "step": 39435 }, { "epoch": 10.409132902204039, "grad_norm": 0.02377139776945114, "learning_rate": 2.4263041276947894e-08, "loss": 0.02, "num_input_tokens_seen": 35253728, "step": 39440 }, { "epoch": 10.410452685759536, "grad_norm": 0.4324994385242462, "learning_rate": 2.3832540766283164e-08, "loss": 0.0716, "num_input_tokens_seen": 35258368, "step": 39445 }, { "epoch": 10.411772469315032, "grad_norm": 0.17599305510520935, "learning_rate": 2.3405891894512366e-08, "loss": 0.0479, "num_input_tokens_seen": 35262560, "step": 39450 }, { "epoch": 10.413092252870529, "grad_norm": 0.5903554558753967, "learning_rate": 2.29830947274301e-08, "loss": 0.0473, "num_input_tokens_seen": 35266912, "step": 39455 }, { "epoch": 10.414412036426025, "grad_norm": 0.23585903644561768, "learning_rate": 2.2564149330231432e-08, "loss": 0.0427, "num_input_tokens_seen": 35271424, "step": 39460 }, { "epoch": 10.415731819981524, "grad_norm": 0.03456173837184906, "learning_rate": 2.2149055767528572e-08, "loss": 0.0544, "num_input_tokens_seen": 35275808, "step": 39465 }, { "epoch": 10.41705160353702, "grad_norm": 0.057969797402620316, "learning_rate": 2.1737814103334197e-08, "loss": 0.055, "num_input_tokens_seen": 35280448, "step": 39470 }, { "epoch": 10.418371387092517, "grad_norm": 0.17995183169841766, "learning_rate": 2.1330424401064253e-08, "loss": 0.04, "num_input_tokens_seen": 35284800, "step": 39475 }, { "epoch": 10.419691170648013, "grad_norm": 0.10369167476892471, "learning_rate": 2.092688672354348e-08, "loss": 0.0624, "num_input_tokens_seen": 35289248, "step": 39480 }, { "epoch": 10.42101095420351, "grad_norm": 0.14830124378204346, "learning_rate": 2.0527201133005435e-08, "loss": 0.0387, "num_input_tokens_seen": 35293760, "step": 39485 }, { "epoch": 10.422330737759008, "grad_norm": 0.021682003512978554, "learning_rate": 2.0131367691084148e-08, "loss": 0.0574, "num_input_tokens_seen": 35298176, "step": 39490 }, { "epoch": 10.423650521314505, "grad_norm": 0.19973953068256378, "learning_rate": 1.9739386458819675e-08, "loss": 0.0898, "num_input_tokens_seen": 35302656, "step": 39495 }, { "epoch": 10.424970304870001, "grad_norm": 0.19754983484745026, "learning_rate": 1.9351257496666442e-08, "loss": 0.1166, "num_input_tokens_seen": 35307168, "step": 39500 }, { "epoch": 10.426290088425498, "grad_norm": 0.5130003690719604, "learning_rate": 1.896698086447657e-08, "loss": 0.1357, "num_input_tokens_seen": 35311360, "step": 39505 }, { "epoch": 10.427609871980994, "grad_norm": 0.14373548328876495, "learning_rate": 1.8586556621505436e-08, "loss": 0.0347, "num_input_tokens_seen": 35316000, "step": 39510 }, { "epoch": 10.428929655536493, "grad_norm": 0.08314194530248642, "learning_rate": 1.820998482642833e-08, "loss": 0.0791, "num_input_tokens_seen": 35320096, "step": 39515 }, { "epoch": 10.43024943909199, "grad_norm": 0.0866573378443718, "learning_rate": 1.7837265537309912e-08, "loss": 0.0739, "num_input_tokens_seen": 35324416, "step": 39520 }, { "epoch": 10.431569222647486, "grad_norm": 0.554697573184967, "learning_rate": 1.7468398811629206e-08, "loss": 0.0651, "num_input_tokens_seen": 35329216, "step": 39525 }, { "epoch": 10.432889006202982, "grad_norm": 0.13776478171348572, "learning_rate": 1.710338470627404e-08, "loss": 0.0362, "num_input_tokens_seen": 35333856, "step": 39530 }, { "epoch": 10.434208789758479, "grad_norm": 0.09810629487037659, "learning_rate": 1.6742223277529945e-08, "loss": 0.0826, "num_input_tokens_seen": 35338272, "step": 39535 }, { "epoch": 10.435528573313977, "grad_norm": 0.053270068019628525, "learning_rate": 1.6384914581094036e-08, "loss": 0.0225, "num_input_tokens_seen": 35342848, "step": 39540 }, { "epoch": 10.436848356869474, "grad_norm": 0.5734820365905762, "learning_rate": 1.6031458672069455e-08, "loss": 0.0738, "num_input_tokens_seen": 35347392, "step": 39545 }, { "epoch": 10.43816814042497, "grad_norm": 0.04700048640370369, "learning_rate": 1.5681855604962602e-08, "loss": 0.0815, "num_input_tokens_seen": 35351744, "step": 39550 }, { "epoch": 10.439487923980467, "grad_norm": 0.192489892244339, "learning_rate": 1.5336105433683135e-08, "loss": 0.0196, "num_input_tokens_seen": 35355968, "step": 39555 }, { "epoch": 10.440807707535964, "grad_norm": 0.1723727285861969, "learning_rate": 1.499420821155506e-08, "loss": 0.0553, "num_input_tokens_seen": 35360448, "step": 39560 }, { "epoch": 10.442127491091462, "grad_norm": 0.0815168246626854, "learning_rate": 1.4656163991302874e-08, "loss": 0.0162, "num_input_tokens_seen": 35364640, "step": 39565 }, { "epoch": 10.443447274646958, "grad_norm": 0.17705528438091278, "learning_rate": 1.4321972825051544e-08, "loss": 0.0775, "num_input_tokens_seen": 35368864, "step": 39570 }, { "epoch": 10.444767058202455, "grad_norm": 0.19994628429412842, "learning_rate": 1.3991634764345951e-08, "loss": 0.0445, "num_input_tokens_seen": 35373408, "step": 39575 }, { "epoch": 10.446086841757952, "grad_norm": 0.09063997864723206, "learning_rate": 1.3665149860120352e-08, "loss": 0.0294, "num_input_tokens_seen": 35377952, "step": 39580 }, { "epoch": 10.447406625313448, "grad_norm": 0.11073332279920578, "learning_rate": 1.3342518162728912e-08, "loss": 0.0561, "num_input_tokens_seen": 35382560, "step": 39585 }, { "epoch": 10.448726408868945, "grad_norm": 0.057739559561014175, "learning_rate": 1.30237397219235e-08, "loss": 0.1048, "num_input_tokens_seen": 35387168, "step": 39590 }, { "epoch": 10.450046192424443, "grad_norm": 0.2522336542606354, "learning_rate": 1.2708814586862016e-08, "loss": 0.0405, "num_input_tokens_seen": 35391744, "step": 39595 }, { "epoch": 10.45136597597994, "grad_norm": 0.12819427251815796, "learning_rate": 1.2397742806111168e-08, "loss": 0.0202, "num_input_tokens_seen": 35396000, "step": 39600 }, { "epoch": 10.45136597597994, "eval_loss": 0.06686899065971375, "eval_runtime": 64.7708, "eval_samples_per_second": 103.982, "eval_steps_per_second": 25.999, "num_input_tokens_seen": 35396000, "step": 39600 }, { "epoch": 10.452685759535436, "grad_norm": 0.15743660926818848, "learning_rate": 1.209052442764369e-08, "loss": 0.0504, "num_input_tokens_seen": 35400480, "step": 39605 }, { "epoch": 10.454005543090933, "grad_norm": 0.18735115230083466, "learning_rate": 1.17871594988328e-08, "loss": 0.0294, "num_input_tokens_seen": 35404672, "step": 39610 }, { "epoch": 10.45532532664643, "grad_norm": 0.029971200972795486, "learning_rate": 1.1487648066466072e-08, "loss": 0.0442, "num_input_tokens_seen": 35409216, "step": 39615 }, { "epoch": 10.456645110201928, "grad_norm": 0.23768869042396545, "learning_rate": 1.1191990176728784e-08, "loss": 0.0648, "num_input_tokens_seen": 35414112, "step": 39620 }, { "epoch": 10.457964893757424, "grad_norm": 0.31516918540000916, "learning_rate": 1.0900185875215018e-08, "loss": 0.0471, "num_input_tokens_seen": 35418656, "step": 39625 }, { "epoch": 10.45928467731292, "grad_norm": 0.2805478870868683, "learning_rate": 1.0612235206924891e-08, "loss": 0.0766, "num_input_tokens_seen": 35423360, "step": 39630 }, { "epoch": 10.460604460868417, "grad_norm": 0.12993866205215454, "learning_rate": 1.0328138216264549e-08, "loss": 0.083, "num_input_tokens_seen": 35428032, "step": 39635 }, { "epoch": 10.461924244423914, "grad_norm": 0.18811824917793274, "learning_rate": 1.004789494704339e-08, "loss": 0.0578, "num_input_tokens_seen": 35432448, "step": 39640 }, { "epoch": 10.463244027979412, "grad_norm": 0.05997803062200546, "learning_rate": 9.771505442482397e-09, "loss": 0.0379, "num_input_tokens_seen": 35436736, "step": 39645 }, { "epoch": 10.464563811534909, "grad_norm": 0.1549091637134552, "learning_rate": 9.498969745200259e-09, "loss": 0.0481, "num_input_tokens_seen": 35441248, "step": 39650 }, { "epoch": 10.465883595090405, "grad_norm": 0.11711782962083817, "learning_rate": 9.230287897230017e-09, "loss": 0.0695, "num_input_tokens_seen": 35445888, "step": 39655 }, { "epoch": 10.467203378645902, "grad_norm": 0.14941294491291046, "learning_rate": 8.965459940002419e-09, "loss": 0.0865, "num_input_tokens_seen": 35450336, "step": 39660 }, { "epoch": 10.468523162201398, "grad_norm": 0.12813811004161835, "learning_rate": 8.704485914357019e-09, "loss": 0.0786, "num_input_tokens_seen": 35454592, "step": 39665 }, { "epoch": 10.469842945756897, "grad_norm": 0.19397613406181335, "learning_rate": 8.447365860539402e-09, "loss": 0.0603, "num_input_tokens_seen": 35459200, "step": 39670 }, { "epoch": 10.471162729312393, "grad_norm": 0.24467119574546814, "learning_rate": 8.194099818201184e-09, "loss": 0.0614, "num_input_tokens_seen": 35463776, "step": 39675 }, { "epoch": 10.47248251286789, "grad_norm": 0.21178002655506134, "learning_rate": 7.944687826400011e-09, "loss": 0.0624, "num_input_tokens_seen": 35468224, "step": 39680 }, { "epoch": 10.473802296423386, "grad_norm": 0.19043120741844177, "learning_rate": 7.699129923599557e-09, "loss": 0.0345, "num_input_tokens_seen": 35472832, "step": 39685 }, { "epoch": 10.475122079978883, "grad_norm": 0.08776219189167023, "learning_rate": 7.457426147663982e-09, "loss": 0.0324, "num_input_tokens_seen": 35477312, "step": 39690 }, { "epoch": 10.47644186353438, "grad_norm": 0.09339522570371628, "learning_rate": 7.219576535871797e-09, "loss": 0.0454, "num_input_tokens_seen": 35481696, "step": 39695 }, { "epoch": 10.477761647089878, "grad_norm": 0.09532131254673004, "learning_rate": 6.985581124896445e-09, "loss": 0.038, "num_input_tokens_seen": 35486112, "step": 39700 }, { "epoch": 10.479081430645374, "grad_norm": 0.11481470614671707, "learning_rate": 6.755439950828501e-09, "loss": 0.0566, "num_input_tokens_seen": 35490368, "step": 39705 }, { "epoch": 10.480401214200871, "grad_norm": 0.16774976253509521, "learning_rate": 6.5291530491562444e-09, "loss": 0.0506, "num_input_tokens_seen": 35494656, "step": 39710 }, { "epoch": 10.481720997756367, "grad_norm": 0.14974111318588257, "learning_rate": 6.3067204547739845e-09, "loss": 0.056, "num_input_tokens_seen": 35499360, "step": 39715 }, { "epoch": 10.483040781311864, "grad_norm": 0.4242684543132782, "learning_rate": 6.088142201987612e-09, "loss": 0.0302, "num_input_tokens_seen": 35503520, "step": 39720 }, { "epoch": 10.484360564867362, "grad_norm": 0.2316826730966568, "learning_rate": 5.873418324503499e-09, "loss": 0.0638, "num_input_tokens_seen": 35508032, "step": 39725 }, { "epoch": 10.485680348422859, "grad_norm": 0.3321036994457245, "learning_rate": 5.6625488554340465e-09, "loss": 0.0383, "num_input_tokens_seen": 35512608, "step": 39730 }, { "epoch": 10.487000131978355, "grad_norm": 0.23089765012264252, "learning_rate": 5.455533827297688e-09, "loss": 0.0468, "num_input_tokens_seen": 35517312, "step": 39735 }, { "epoch": 10.488319915533852, "grad_norm": 0.07289786636829376, "learning_rate": 5.252373272018885e-09, "loss": 0.0503, "num_input_tokens_seen": 35522112, "step": 39740 }, { "epoch": 10.489639699089349, "grad_norm": 0.17986682057380676, "learning_rate": 5.053067220925356e-09, "loss": 0.0714, "num_input_tokens_seen": 35526560, "step": 39745 }, { "epoch": 10.490959482644847, "grad_norm": 0.4868500232696533, "learning_rate": 4.857615704759177e-09, "loss": 0.0916, "num_input_tokens_seen": 35530752, "step": 39750 }, { "epoch": 10.492279266200343, "grad_norm": 0.464214563369751, "learning_rate": 4.666018753654577e-09, "loss": 0.1001, "num_input_tokens_seen": 35535168, "step": 39755 }, { "epoch": 10.49359904975584, "grad_norm": 0.20315559208393097, "learning_rate": 4.478276397162917e-09, "loss": 0.07, "num_input_tokens_seen": 35539712, "step": 39760 }, { "epoch": 10.494918833311337, "grad_norm": 0.15449482202529907, "learning_rate": 4.294388664233262e-09, "loss": 0.0869, "num_input_tokens_seen": 35544096, "step": 39765 }, { "epoch": 10.496238616866833, "grad_norm": 0.31987449526786804, "learning_rate": 4.114355583223484e-09, "loss": 0.0401, "num_input_tokens_seen": 35548672, "step": 39770 }, { "epoch": 10.497558400422331, "grad_norm": 0.064491406083107, "learning_rate": 3.9381771818974845e-09, "loss": 0.0222, "num_input_tokens_seen": 35553216, "step": 39775 }, { "epoch": 10.498878183977828, "grad_norm": 0.3632766604423523, "learning_rate": 3.765853487427973e-09, "loss": 0.0982, "num_input_tokens_seen": 35557696, "step": 39780 }, { "epoch": 10.500197967533325, "grad_norm": 0.23432210087776184, "learning_rate": 3.5973845263825857e-09, "loss": 0.047, "num_input_tokens_seen": 35562208, "step": 39785 }, { "epoch": 10.501517751088821, "grad_norm": 0.10265903174877167, "learning_rate": 3.4327703247488684e-09, "loss": 0.0635, "num_input_tokens_seen": 35566720, "step": 39790 }, { "epoch": 10.502837534644318, "grad_norm": 0.08026956766843796, "learning_rate": 3.2720109079037443e-09, "loss": 0.0679, "num_input_tokens_seen": 35570912, "step": 39795 }, { "epoch": 10.504157318199816, "grad_norm": 0.24375790357589722, "learning_rate": 3.1151063006468193e-09, "loss": 0.0659, "num_input_tokens_seen": 35575872, "step": 39800 }, { "epoch": 10.504157318199816, "eval_loss": 0.06686899065971375, "eval_runtime": 64.7668, "eval_samples_per_second": 103.988, "eval_steps_per_second": 26.001, "num_input_tokens_seen": 35575872, "step": 39800 }, { "epoch": 10.505477101755313, "grad_norm": 0.24508266150951385, "learning_rate": 2.962056527169854e-09, "loss": 0.0278, "num_input_tokens_seen": 35580256, "step": 39805 }, { "epoch": 10.50679688531081, "grad_norm": 0.12731514871120453, "learning_rate": 2.8128616110761898e-09, "loss": 0.0373, "num_input_tokens_seen": 35584512, "step": 39810 }, { "epoch": 10.508116668866306, "grad_norm": 0.4986494481563568, "learning_rate": 2.6675215753724223e-09, "loss": 0.1155, "num_input_tokens_seen": 35589088, "step": 39815 }, { "epoch": 10.509436452421802, "grad_norm": 0.09775055199861526, "learning_rate": 2.5260364424739557e-09, "loss": 0.0526, "num_input_tokens_seen": 35593472, "step": 39820 }, { "epoch": 10.5107562359773, "grad_norm": 0.397988885641098, "learning_rate": 2.3884062341994475e-09, "loss": 0.0641, "num_input_tokens_seen": 35598144, "step": 39825 }, { "epoch": 10.512076019532797, "grad_norm": 0.39882686734199524, "learning_rate": 2.25463097177081e-09, "loss": 0.0782, "num_input_tokens_seen": 35602784, "step": 39830 }, { "epoch": 10.513395803088294, "grad_norm": 0.07363387942314148, "learning_rate": 2.1247106758215397e-09, "loss": 0.0439, "num_input_tokens_seen": 35607296, "step": 39835 }, { "epoch": 10.51471558664379, "grad_norm": 0.07311610132455826, "learning_rate": 1.998645366382834e-09, "loss": 0.0286, "num_input_tokens_seen": 35611520, "step": 39840 }, { "epoch": 10.516035370199287, "grad_norm": 0.09407701343297958, "learning_rate": 1.876435062897475e-09, "loss": 0.0276, "num_input_tokens_seen": 35616000, "step": 39845 }, { "epoch": 10.517355153754785, "grad_norm": 0.04539132118225098, "learning_rate": 1.758079784211497e-09, "loss": 0.0653, "num_input_tokens_seen": 35620320, "step": 39850 }, { "epoch": 10.518674937310282, "grad_norm": 0.6947534680366516, "learning_rate": 1.6435795485797434e-09, "loss": 0.0629, "num_input_tokens_seen": 35624736, "step": 39855 }, { "epoch": 10.519994720865778, "grad_norm": 0.15846362709999084, "learning_rate": 1.5329343736547596e-09, "loss": 0.0618, "num_input_tokens_seen": 35629088, "step": 39860 }, { "epoch": 10.521314504421275, "grad_norm": 0.2503301501274109, "learning_rate": 1.4261442765006739e-09, "loss": 0.0372, "num_input_tokens_seen": 35633472, "step": 39865 }, { "epoch": 10.522634287976771, "grad_norm": 0.18638908863067627, "learning_rate": 1.3232092735876445e-09, "loss": 0.0949, "num_input_tokens_seen": 35638048, "step": 39870 }, { "epoch": 10.523954071532268, "grad_norm": 0.35070744156837463, "learning_rate": 1.2241293807918607e-09, "loss": 0.0731, "num_input_tokens_seen": 35642592, "step": 39875 }, { "epoch": 10.525273855087766, "grad_norm": 0.08850501477718353, "learning_rate": 1.128904613387216e-09, "loss": 0.022, "num_input_tokens_seen": 35647104, "step": 39880 }, { "epoch": 10.526593638643263, "grad_norm": 0.2380620390176773, "learning_rate": 1.0375349860591853e-09, "loss": 0.085, "num_input_tokens_seen": 35651392, "step": 39885 }, { "epoch": 10.52791342219876, "grad_norm": 0.26449471712112427, "learning_rate": 9.5002051290205e-10, "loss": 0.0659, "num_input_tokens_seen": 35656000, "step": 39890 }, { "epoch": 10.529233205754256, "grad_norm": 0.19770123064517975, "learning_rate": 8.663612074077954e-10, "loss": 0.0394, "num_input_tokens_seen": 35660480, "step": 39895 }, { "epoch": 10.530552989309752, "grad_norm": 0.04451938718557358, "learning_rate": 7.865570824799884e-10, "loss": 0.0685, "num_input_tokens_seen": 35664704, "step": 39900 }, { "epoch": 10.53187277286525, "grad_norm": 0.30129677057266235, "learning_rate": 7.106081504254514e-10, "loss": 0.091, "num_input_tokens_seen": 35669056, "step": 39905 }, { "epoch": 10.533192556420747, "grad_norm": 0.08751536905765533, "learning_rate": 6.385144229570372e-10, "loss": 0.0417, "num_input_tokens_seen": 35673632, "step": 39910 }, { "epoch": 10.534512339976244, "grad_norm": 0.286141961812973, "learning_rate": 5.70275911190854e-10, "loss": 0.0407, "num_input_tokens_seen": 35678112, "step": 39915 }, { "epoch": 10.53583212353174, "grad_norm": 0.2553674876689911, "learning_rate": 5.058926256490403e-10, "loss": 0.0595, "num_input_tokens_seen": 35682592, "step": 39920 }, { "epoch": 10.537151907087237, "grad_norm": 0.08961944282054901, "learning_rate": 4.4536457626254134e-10, "loss": 0.0481, "num_input_tokens_seen": 35687328, "step": 39925 }, { "epoch": 10.538471690642735, "grad_norm": 0.048031508922576904, "learning_rate": 3.88691772365557e-10, "loss": 0.0262, "num_input_tokens_seen": 35691904, "step": 39930 }, { "epoch": 10.539791474198232, "grad_norm": 0.10910078883171082, "learning_rate": 3.358742226955425e-10, "loss": 0.0495, "num_input_tokens_seen": 35696288, "step": 39935 }, { "epoch": 10.541111257753728, "grad_norm": 0.17059040069580078, "learning_rate": 2.8691193539875925e-10, "loss": 0.1079, "num_input_tokens_seen": 35700640, "step": 39940 }, { "epoch": 10.542431041309225, "grad_norm": 0.08324994891881943, "learning_rate": 2.418049180274995e-10, "loss": 0.0444, "num_input_tokens_seen": 35704960, "step": 39945 }, { "epoch": 10.543750824864722, "grad_norm": 0.48782598972320557, "learning_rate": 2.005531775373104e-10, "loss": 0.0892, "num_input_tokens_seen": 35709760, "step": 39950 }, { "epoch": 10.545070608420218, "grad_norm": 0.32518601417541504, "learning_rate": 1.6315672028699435e-10, "loss": 0.0411, "num_input_tokens_seen": 35714048, "step": 39955 }, { "epoch": 10.546390391975716, "grad_norm": 0.06300792098045349, "learning_rate": 1.2961555204693555e-10, "loss": 0.0377, "num_input_tokens_seen": 35719104, "step": 39960 }, { "epoch": 10.547710175531213, "grad_norm": 0.2515268325805664, "learning_rate": 9.992967798799768e-11, "loss": 0.0774, "num_input_tokens_seen": 35723744, "step": 39965 }, { "epoch": 10.54902995908671, "grad_norm": 0.0424208790063858, "learning_rate": 7.409910268707521e-11, "loss": 0.0456, "num_input_tokens_seen": 35728448, "step": 39970 }, { "epoch": 10.550349742642206, "grad_norm": 0.06364504247903824, "learning_rate": 5.212383012986877e-11, "loss": 0.0483, "num_input_tokens_seen": 35732736, "step": 39975 }, { "epoch": 10.551669526197703, "grad_norm": 0.07357128709554672, "learning_rate": 3.400386370533415e-11, "loss": 0.0287, "num_input_tokens_seen": 35737184, "step": 39980 }, { "epoch": 10.552989309753201, "grad_norm": 0.11814670264720917, "learning_rate": 1.9739206205682258e-11, "loss": 0.016, "num_input_tokens_seen": 35741472, "step": 39985 }, { "epoch": 10.554309093308698, "grad_norm": 0.23330411314964294, "learning_rate": 9.329859829154685e-12, "loss": 0.0831, "num_input_tokens_seen": 35746048, "step": 39990 }, { "epoch": 10.555628876864194, "grad_norm": 0.11445839703083038, "learning_rate": 2.7758261855748148e-12, "loss": 0.0448, "num_input_tokens_seen": 35750592, "step": 39995 }, { "epoch": 10.55694866041969, "grad_norm": 0.04129907861351967, "learning_rate": 7.710628524559838e-14, "loss": 0.0661, "num_input_tokens_seen": 35754976, "step": 40000 }, { "epoch": 10.55694866041969, "eval_loss": 0.06686899065971375, "eval_runtime": 64.7753, "eval_samples_per_second": 103.975, "eval_steps_per_second": 25.998, "num_input_tokens_seen": 35754976, "step": 40000 }, { "epoch": 10.55694866041969, "num_input_tokens_seen": 35754976, "step": 40000, "total_flos": 1.6100725874609357e+18, "train_loss": 0.06669788992283866, "train_runtime": 35858.3327, "train_samples_per_second": 17.848, "train_steps_per_second": 1.116 } ], "logging_steps": 5, "max_steps": 40000, "num_input_tokens_seen": 35754976, "num_train_epochs": 11, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6100725874609357e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }