{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 6252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023999520009599807, "grad_norm": 0.49803411960601807, "learning_rate": 4.999994949996767e-05, "loss": 0.9716, "num_input_tokens_seen": 54328, "step": 5, "train_runtime": 8.3772, "train_tokens_per_second": 6485.207 }, { "epoch": 0.004799904001919961, "grad_norm": 0.4587724804878235, "learning_rate": 4.9999744343936e-05, "loss": 0.9705, "num_input_tokens_seen": 108376, "step": 10, "train_runtime": 16.0234, "train_tokens_per_second": 6763.598 }, { "epoch": 0.007199856002879942, "grad_norm": 0.3823186159133911, "learning_rate": 4.999938137694701e-05, "loss": 0.8622, "num_input_tokens_seen": 163816, "step": 15, "train_runtime": 24.1319, "train_tokens_per_second": 6788.347 }, { "epoch": 0.009599808003839923, "grad_norm": 0.3449329733848572, "learning_rate": 4.999886060129194e-05, "loss": 0.8309, "num_input_tokens_seen": 225216, "step": 20, "train_runtime": 33.0832, "train_tokens_per_second": 6807.559 }, { "epoch": 0.011999760004799903, "grad_norm": 0.4700082242488861, "learning_rate": 4.999818202025819e-05, "loss": 0.8118, "num_input_tokens_seen": 279480, "step": 25, "train_runtime": 40.5384, "train_tokens_per_second": 6894.209 }, { "epoch": 0.014399712005759884, "grad_norm": 0.3817142844200134, "learning_rate": 4.999734563812929e-05, "loss": 0.7763, "num_input_tokens_seen": 337816, "step": 30, "train_runtime": 48.8916, "train_tokens_per_second": 6909.488 }, { "epoch": 0.016799664006719867, "grad_norm": 0.33569249510765076, "learning_rate": 4.9996351460184923e-05, "loss": 0.7919, "num_input_tokens_seen": 394952, "step": 35, "train_runtime": 57.353, "train_tokens_per_second": 6886.34 }, { "epoch": 0.019199616007679846, "grad_norm": 0.29491692781448364, "learning_rate": 4.9995199492700826e-05, "loss": 0.7095, "num_input_tokens_seen": 454608, "step": 40, "train_runtime": 65.8361, "train_tokens_per_second": 6905.143 }, { "epoch": 0.021599568008639828, "grad_norm": 0.3096805810928345, "learning_rate": 4.9993889742948806e-05, "loss": 0.7347, "num_input_tokens_seen": 510256, "step": 45, "train_runtime": 74.0444, "train_tokens_per_second": 6891.219 }, { "epoch": 0.023999520009599807, "grad_norm": 0.3358306884765625, "learning_rate": 4.9992422219196656e-05, "loss": 0.7461, "num_input_tokens_seen": 568112, "step": 50, "train_runtime": 82.4236, "train_tokens_per_second": 6892.587 }, { "epoch": 0.02639947201055979, "grad_norm": 0.3951747417449951, "learning_rate": 4.9990796930708125e-05, "loss": 0.7168, "num_input_tokens_seen": 625896, "step": 55, "train_runtime": 90.7851, "train_tokens_per_second": 6894.26 }, { "epoch": 0.02879942401151977, "grad_norm": 0.3642365038394928, "learning_rate": 4.9989013887742856e-05, "loss": 0.7117, "num_input_tokens_seen": 677856, "step": 60, "train_runtime": 98.2631, "train_tokens_per_second": 6898.377 }, { "epoch": 0.03119937601247975, "grad_norm": 0.3379388153553009, "learning_rate": 4.998707310155631e-05, "loss": 0.6441, "num_input_tokens_seen": 740792, "step": 65, "train_runtime": 107.5229, "train_tokens_per_second": 6889.624 }, { "epoch": 0.03359932801343973, "grad_norm": 0.46827253699302673, "learning_rate": 4.99849745843997e-05, "loss": 0.617, "num_input_tokens_seen": 795784, "step": 70, "train_runtime": 115.3119, "train_tokens_per_second": 6901.144 }, { "epoch": 0.03599928001439971, "grad_norm": 0.46408799290657043, "learning_rate": 4.998271834951993e-05, "loss": 0.685, "num_input_tokens_seen": 852016, "step": 75, "train_runtime": 123.2024, "train_tokens_per_second": 6915.578 }, { "epoch": 0.03839923201535969, "grad_norm": 0.4789453148841858, "learning_rate": 4.998030441115949e-05, "loss": 0.6505, "num_input_tokens_seen": 909224, "step": 80, "train_runtime": 131.6277, "train_tokens_per_second": 6907.542 }, { "epoch": 0.04079918401631968, "grad_norm": 0.40359923243522644, "learning_rate": 4.9977732784556355e-05, "loss": 0.6212, "num_input_tokens_seen": 959568, "step": 85, "train_runtime": 139.0702, "train_tokens_per_second": 6899.88 }, { "epoch": 0.043199136017279656, "grad_norm": 0.452709436416626, "learning_rate": 4.997500348594394e-05, "loss": 0.6978, "num_input_tokens_seen": 1010696, "step": 90, "train_runtime": 146.8828, "train_tokens_per_second": 6880.967 }, { "epoch": 0.045599088018239635, "grad_norm": 0.4287179112434387, "learning_rate": 4.997211653255096e-05, "loss": 0.6212, "num_input_tokens_seen": 1067912, "step": 95, "train_runtime": 155.302, "train_tokens_per_second": 6876.357 }, { "epoch": 0.047999040019199614, "grad_norm": 0.5242288112640381, "learning_rate": 4.996907194260129e-05, "loss": 0.6182, "num_input_tokens_seen": 1127264, "step": 100, "train_runtime": 164.3956, "train_tokens_per_second": 6857.02 }, { "epoch": 0.0503989920201596, "grad_norm": 0.31285974383354187, "learning_rate": 4.996586973531394e-05, "loss": 0.6254, "num_input_tokens_seen": 1183208, "step": 105, "train_runtime": 172.5905, "train_tokens_per_second": 6855.582 }, { "epoch": 0.05279894402111958, "grad_norm": 0.30165454745292664, "learning_rate": 4.9962509930902836e-05, "loss": 0.5758, "num_input_tokens_seen": 1243128, "step": 110, "train_runtime": 181.2314, "train_tokens_per_second": 6859.34 }, { "epoch": 0.05519889602207956, "grad_norm": 0.3959163725376129, "learning_rate": 4.9958992550576754e-05, "loss": 0.6427, "num_input_tokens_seen": 1294648, "step": 115, "train_runtime": 188.848, "train_tokens_per_second": 6855.503 }, { "epoch": 0.05759884802303954, "grad_norm": 0.44546279311180115, "learning_rate": 4.9955317616539174e-05, "loss": 0.6416, "num_input_tokens_seen": 1349136, "step": 120, "train_runtime": 196.8055, "train_tokens_per_second": 6855.173 }, { "epoch": 0.05999880002399952, "grad_norm": 0.5479788184165955, "learning_rate": 4.9951485151988126e-05, "loss": 0.6039, "num_input_tokens_seen": 1403304, "step": 125, "train_runtime": 204.9341, "train_tokens_per_second": 6847.585 }, { "epoch": 0.0623987520249595, "grad_norm": 0.46208852529525757, "learning_rate": 4.994749518111604e-05, "loss": 0.6365, "num_input_tokens_seen": 1460712, "step": 130, "train_runtime": 212.8501, "train_tokens_per_second": 6862.633 }, { "epoch": 0.06479870402591949, "grad_norm": 0.5154985189437866, "learning_rate": 4.9943347729109646e-05, "loss": 0.5757, "num_input_tokens_seen": 1516920, "step": 135, "train_runtime": 221.1296, "train_tokens_per_second": 6859.868 }, { "epoch": 0.06719865602687947, "grad_norm": 0.4509885311126709, "learning_rate": 4.993904282214972e-05, "loss": 0.6484, "num_input_tokens_seen": 1569296, "step": 140, "train_runtime": 228.9245, "train_tokens_per_second": 6855.081 }, { "epoch": 0.06959860802783945, "grad_norm": 0.47324448823928833, "learning_rate": 4.993458048741102e-05, "loss": 0.5967, "num_input_tokens_seen": 1627720, "step": 145, "train_runtime": 237.6306, "train_tokens_per_second": 6849.792 }, { "epoch": 0.07199856002879942, "grad_norm": 0.4491414427757263, "learning_rate": 4.992996075306203e-05, "loss": 0.6705, "num_input_tokens_seen": 1680600, "step": 150, "train_runtime": 245.5875, "train_tokens_per_second": 6843.181 }, { "epoch": 0.0743985120297594, "grad_norm": 0.5371958613395691, "learning_rate": 4.992518364826484e-05, "loss": 0.5925, "num_input_tokens_seen": 1732368, "step": 155, "train_runtime": 253.2225, "train_tokens_per_second": 6841.288 }, { "epoch": 0.07679846403071938, "grad_norm": 0.44730937480926514, "learning_rate": 4.9920249203174945e-05, "loss": 0.5695, "num_input_tokens_seen": 1794680, "step": 160, "train_runtime": 262.1391, "train_tokens_per_second": 6846.289 }, { "epoch": 0.07919841603167936, "grad_norm": 0.4398422837257385, "learning_rate": 4.9915157448941044e-05, "loss": 0.5549, "num_input_tokens_seen": 1854040, "step": 165, "train_runtime": 270.3386, "train_tokens_per_second": 6858.214 }, { "epoch": 0.08159836803263935, "grad_norm": 0.5156921148300171, "learning_rate": 4.9909908417704835e-05, "loss": 0.5701, "num_input_tokens_seen": 1908808, "step": 170, "train_runtime": 278.3256, "train_tokens_per_second": 6858.183 }, { "epoch": 0.08399832003359933, "grad_norm": 0.40140026807785034, "learning_rate": 4.990450214260086e-05, "loss": 0.5478, "num_input_tokens_seen": 1966184, "step": 175, "train_runtime": 286.7588, "train_tokens_per_second": 6856.577 }, { "epoch": 0.08639827203455931, "grad_norm": 0.5238102674484253, "learning_rate": 4.9898938657756234e-05, "loss": 0.5816, "num_input_tokens_seen": 2023280, "step": 180, "train_runtime": 295.3163, "train_tokens_per_second": 6851.231 }, { "epoch": 0.08879822403551929, "grad_norm": 0.4058316648006439, "learning_rate": 4.989321799829048e-05, "loss": 0.6243, "num_input_tokens_seen": 2079160, "step": 185, "train_runtime": 303.4477, "train_tokens_per_second": 6851.791 }, { "epoch": 0.09119817603647927, "grad_norm": 0.48315656185150146, "learning_rate": 4.988734020031527e-05, "loss": 0.5903, "num_input_tokens_seen": 2129480, "step": 190, "train_runtime": 310.6729, "train_tokens_per_second": 6854.413 }, { "epoch": 0.09359812803743925, "grad_norm": 0.49458763003349304, "learning_rate": 4.9881305300934225e-05, "loss": 0.5232, "num_input_tokens_seen": 2189160, "step": 195, "train_runtime": 319.1985, "train_tokens_per_second": 6858.302 }, { "epoch": 0.09599808003839923, "grad_norm": 0.3490532338619232, "learning_rate": 4.987511333824266e-05, "loss": 0.5846, "num_input_tokens_seen": 2247704, "step": 200, "train_runtime": 327.3152, "train_tokens_per_second": 6867.093 }, { "epoch": 0.0983980320393592, "grad_norm": 0.41308099031448364, "learning_rate": 4.986876435132736e-05, "loss": 0.589, "num_input_tokens_seen": 2307040, "step": 205, "train_runtime": 335.8335, "train_tokens_per_second": 6869.595 }, { "epoch": 0.1007979840403192, "grad_norm": 0.4715804159641266, "learning_rate": 4.9862258380266325e-05, "loss": 0.5737, "num_input_tokens_seen": 2357488, "step": 210, "train_runtime": 343.3468, "train_tokens_per_second": 6866.201 }, { "epoch": 0.10319793604127918, "grad_norm": 0.35753390192985535, "learning_rate": 4.985559546612851e-05, "loss": 0.6138, "num_input_tokens_seen": 2414064, "step": 215, "train_runtime": 351.1882, "train_tokens_per_second": 6873.99 }, { "epoch": 0.10559788804223916, "grad_norm": 0.44587111473083496, "learning_rate": 4.984877565097359e-05, "loss": 0.5923, "num_input_tokens_seen": 2472744, "step": 220, "train_runtime": 359.483, "train_tokens_per_second": 6878.611 }, { "epoch": 0.10799784004319914, "grad_norm": 0.47545069456100464, "learning_rate": 4.984179897785166e-05, "loss": 0.55, "num_input_tokens_seen": 2529024, "step": 225, "train_runtime": 367.4645, "train_tokens_per_second": 6882.364 }, { "epoch": 0.11039779204415912, "grad_norm": 0.5392165184020996, "learning_rate": 4.983466549080299e-05, "loss": 0.618, "num_input_tokens_seen": 2584864, "step": 230, "train_runtime": 375.6998, "train_tokens_per_second": 6880.132 }, { "epoch": 0.1127977440451191, "grad_norm": 0.6994487047195435, "learning_rate": 4.9827375234857735e-05, "loss": 0.6055, "num_input_tokens_seen": 2638696, "step": 235, "train_runtime": 383.3187, "train_tokens_per_second": 6883.818 }, { "epoch": 0.11519769604607907, "grad_norm": 0.5480724573135376, "learning_rate": 4.981992825603566e-05, "loss": 0.5962, "num_input_tokens_seen": 2699160, "step": 240, "train_runtime": 391.6683, "train_tokens_per_second": 6891.444 }, { "epoch": 0.11759764804703907, "grad_norm": 0.49630582332611084, "learning_rate": 4.981232460134584e-05, "loss": 0.5556, "num_input_tokens_seen": 2756440, "step": 245, "train_runtime": 400.0318, "train_tokens_per_second": 6890.551 }, { "epoch": 0.11999760004799905, "grad_norm": 0.48846226930618286, "learning_rate": 4.980456431878636e-05, "loss": 0.6064, "num_input_tokens_seen": 2811584, "step": 250, "train_runtime": 408.3053, "train_tokens_per_second": 6885.985 }, { "epoch": 0.12239755204895902, "grad_norm": 0.7514108419418335, "learning_rate": 4.9796647457344034e-05, "loss": 0.622, "num_input_tokens_seen": 2864600, "step": 255, "train_runtime": 416.1151, "train_tokens_per_second": 6884.152 }, { "epoch": 0.124797504049919, "grad_norm": 0.45766520500183105, "learning_rate": 4.9788574066994074e-05, "loss": 0.5792, "num_input_tokens_seen": 2920384, "step": 260, "train_runtime": 424.193, "train_tokens_per_second": 6884.564 }, { "epoch": 0.12719745605087898, "grad_norm": 0.40784621238708496, "learning_rate": 4.978034419869977e-05, "loss": 0.5464, "num_input_tokens_seen": 2976400, "step": 265, "train_runtime": 432.5949, "train_tokens_per_second": 6880.34 }, { "epoch": 0.12959740805183897, "grad_norm": 0.5691152811050415, "learning_rate": 4.977195790441219e-05, "loss": 0.5769, "num_input_tokens_seen": 3031640, "step": 270, "train_runtime": 440.75, "train_tokens_per_second": 6878.366 }, { "epoch": 0.13199736005279894, "grad_norm": 0.509024977684021, "learning_rate": 4.976341523706986e-05, "loss": 0.5853, "num_input_tokens_seen": 3088304, "step": 275, "train_runtime": 448.5804, "train_tokens_per_second": 6884.616 }, { "epoch": 0.13439731205375893, "grad_norm": 0.5476660132408142, "learning_rate": 4.975471625059837e-05, "loss": 0.5715, "num_input_tokens_seen": 3146984, "step": 280, "train_runtime": 456.8318, "train_tokens_per_second": 6888.716 }, { "epoch": 0.1367972640547189, "grad_norm": 0.41494348645210266, "learning_rate": 4.9745860999910093e-05, "loss": 0.5492, "num_input_tokens_seen": 3206416, "step": 285, "train_runtime": 465.4275, "train_tokens_per_second": 6889.185 }, { "epoch": 0.1391972160556789, "grad_norm": 0.4294047951698303, "learning_rate": 4.973684954090384e-05, "loss": 0.6008, "num_input_tokens_seen": 3263920, "step": 290, "train_runtime": 473.6647, "train_tokens_per_second": 6890.782 }, { "epoch": 0.14159716805663886, "grad_norm": 0.673201858997345, "learning_rate": 4.972768193046446e-05, "loss": 0.5588, "num_input_tokens_seen": 3318032, "step": 295, "train_runtime": 481.6549, "train_tokens_per_second": 6888.816 }, { "epoch": 0.14399712005759885, "grad_norm": 0.6196733117103577, "learning_rate": 4.971835822646254e-05, "loss": 0.5692, "num_input_tokens_seen": 3373136, "step": 300, "train_runtime": 489.8925, "train_tokens_per_second": 6885.461 }, { "epoch": 0.14639707205855884, "grad_norm": 0.5182610154151917, "learning_rate": 4.9708878487753976e-05, "loss": 0.5801, "num_input_tokens_seen": 3428032, "step": 305, "train_runtime": 497.9042, "train_tokens_per_second": 6884.923 }, { "epoch": 0.1487970240595188, "grad_norm": 0.5710193514823914, "learning_rate": 4.969924277417963e-05, "loss": 0.5601, "num_input_tokens_seen": 3482432, "step": 310, "train_runtime": 505.9162, "train_tokens_per_second": 6883.416 }, { "epoch": 0.1511969760604788, "grad_norm": 0.5431010127067566, "learning_rate": 4.968945114656499e-05, "loss": 0.6167, "num_input_tokens_seen": 3540200, "step": 315, "train_runtime": 513.822, "train_tokens_per_second": 6889.935 }, { "epoch": 0.15359692806143876, "grad_norm": 0.5962916016578674, "learning_rate": 4.967950366671973e-05, "loss": 0.5528, "num_input_tokens_seen": 3590376, "step": 320, "train_runtime": 521.033, "train_tokens_per_second": 6890.88 }, { "epoch": 0.15599688006239876, "grad_norm": 0.43872061371803284, "learning_rate": 4.966940039743734e-05, "loss": 0.582, "num_input_tokens_seen": 3650392, "step": 325, "train_runtime": 529.3248, "train_tokens_per_second": 6896.317 }, { "epoch": 0.15839683206335872, "grad_norm": 0.6549321413040161, "learning_rate": 4.965914140249475e-05, "loss": 0.6262, "num_input_tokens_seen": 3700960, "step": 330, "train_runtime": 537.0024, "train_tokens_per_second": 6891.887 }, { "epoch": 0.16079678406431872, "grad_norm": 0.49688732624053955, "learning_rate": 4.9648726746651875e-05, "loss": 0.555, "num_input_tokens_seen": 3757192, "step": 335, "train_runtime": 544.9852, "train_tokens_per_second": 6894.117 }, { "epoch": 0.1631967360652787, "grad_norm": 0.600683331489563, "learning_rate": 4.9638156495651265e-05, "loss": 0.5747, "num_input_tokens_seen": 3812168, "step": 340, "train_runtime": 552.5883, "train_tokens_per_second": 6898.749 }, { "epoch": 0.16559668806623867, "grad_norm": 0.506166398525238, "learning_rate": 4.9627430716217674e-05, "loss": 0.562, "num_input_tokens_seen": 3873432, "step": 345, "train_runtime": 561.3753, "train_tokens_per_second": 6899.898 }, { "epoch": 0.16799664006719867, "grad_norm": 0.5933504104614258, "learning_rate": 4.96165494760576e-05, "loss": 0.5751, "num_input_tokens_seen": 3928216, "step": 350, "train_runtime": 569.6058, "train_tokens_per_second": 6896.377 }, { "epoch": 0.17039659206815863, "grad_norm": 0.7012840509414673, "learning_rate": 4.96055128438589e-05, "loss": 0.5283, "num_input_tokens_seen": 3985672, "step": 355, "train_runtime": 578.024, "train_tokens_per_second": 6895.339 }, { "epoch": 0.17279654406911862, "grad_norm": 0.5886171460151672, "learning_rate": 4.959432088929036e-05, "loss": 0.5688, "num_input_tokens_seen": 4042336, "step": 360, "train_runtime": 586.1104, "train_tokens_per_second": 6896.885 }, { "epoch": 0.1751964960700786, "grad_norm": 0.6454927325248718, "learning_rate": 4.958297368300122e-05, "loss": 0.5236, "num_input_tokens_seen": 4097248, "step": 365, "train_runtime": 594.2204, "train_tokens_per_second": 6895.165 }, { "epoch": 0.17759644807103858, "grad_norm": 0.48636892437934875, "learning_rate": 4.957147129662074e-05, "loss": 0.5569, "num_input_tokens_seen": 4152816, "step": 370, "train_runtime": 602.2577, "train_tokens_per_second": 6895.413 }, { "epoch": 0.17999640007199855, "grad_norm": 0.5636932253837585, "learning_rate": 4.9559813802757785e-05, "loss": 0.5558, "num_input_tokens_seen": 4210824, "step": 375, "train_runtime": 610.6583, "train_tokens_per_second": 6895.549 }, { "epoch": 0.18239635207295854, "grad_norm": 0.4750101864337921, "learning_rate": 4.954800127500031e-05, "loss": 0.5055, "num_input_tokens_seen": 4263672, "step": 380, "train_runtime": 618.8445, "train_tokens_per_second": 6889.73 }, { "epoch": 0.18479630407391853, "grad_norm": 0.6123194694519043, "learning_rate": 4.953603378791493e-05, "loss": 0.5524, "num_input_tokens_seen": 4319024, "step": 385, "train_runtime": 626.7904, "train_tokens_per_second": 6890.699 }, { "epoch": 0.1871962560748785, "grad_norm": 0.49063947796821594, "learning_rate": 4.952391141704644e-05, "loss": 0.5653, "num_input_tokens_seen": 4377064, "step": 390, "train_runtime": 634.9033, "train_tokens_per_second": 6894.064 }, { "epoch": 0.1895962080758385, "grad_norm": 0.5559214949607849, "learning_rate": 4.951163423891735e-05, "loss": 0.6034, "num_input_tokens_seen": 4434984, "step": 395, "train_runtime": 643.2672, "train_tokens_per_second": 6894.466 }, { "epoch": 0.19199616007679846, "grad_norm": 0.3978354334831238, "learning_rate": 4.949920233102736e-05, "loss": 0.5667, "num_input_tokens_seen": 4492368, "step": 400, "train_runtime": 651.0435, "train_tokens_per_second": 6900.258 }, { "epoch": 0.19439611207775845, "grad_norm": 0.5354523658752441, "learning_rate": 4.948661577185295e-05, "loss": 0.5445, "num_input_tokens_seen": 4549008, "step": 405, "train_runtime": 659.5277, "train_tokens_per_second": 6897.372 }, { "epoch": 0.1967960640787184, "grad_norm": 0.4975457191467285, "learning_rate": 4.947387464084679e-05, "loss": 0.5462, "num_input_tokens_seen": 4609072, "step": 410, "train_runtime": 668.316, "train_tokens_per_second": 6896.546 }, { "epoch": 0.1991960160796784, "grad_norm": 0.5424690246582031, "learning_rate": 4.9460979018437314e-05, "loss": 0.5855, "num_input_tokens_seen": 4662560, "step": 415, "train_runtime": 676.3317, "train_tokens_per_second": 6893.895 }, { "epoch": 0.2015959680806384, "grad_norm": 0.5656135678291321, "learning_rate": 4.944792898602818e-05, "loss": 0.5909, "num_input_tokens_seen": 4719248, "step": 420, "train_runtime": 684.592, "train_tokens_per_second": 6893.519 }, { "epoch": 0.20399592008159836, "grad_norm": 0.4792700409889221, "learning_rate": 4.943472462599775e-05, "loss": 0.5211, "num_input_tokens_seen": 4774096, "step": 425, "train_runtime": 692.7133, "train_tokens_per_second": 6891.879 }, { "epoch": 0.20639587208255836, "grad_norm": 0.5212066173553467, "learning_rate": 4.942136602169858e-05, "loss": 0.5245, "num_input_tokens_seen": 4832616, "step": 430, "train_runtime": 700.7277, "train_tokens_per_second": 6896.567 }, { "epoch": 0.20879582408351832, "grad_norm": 0.5669515132904053, "learning_rate": 4.94078532574569e-05, "loss": 0.5304, "num_input_tokens_seen": 4887208, "step": 435, "train_runtime": 708.8314, "train_tokens_per_second": 6894.74 }, { "epoch": 0.21119577608447831, "grad_norm": 0.6369892358779907, "learning_rate": 4.939418641857209e-05, "loss": 0.5879, "num_input_tokens_seen": 4942504, "step": 440, "train_runtime": 716.88, "train_tokens_per_second": 6894.464 }, { "epoch": 0.21359572808543828, "grad_norm": 0.5132316946983337, "learning_rate": 4.938036559131608e-05, "loss": 0.5854, "num_input_tokens_seen": 4997880, "step": 445, "train_runtime": 724.9035, "train_tokens_per_second": 6894.546 }, { "epoch": 0.21599568008639827, "grad_norm": 0.5846990942955017, "learning_rate": 4.9366390862932896e-05, "loss": 0.5545, "num_input_tokens_seen": 5060096, "step": 450, "train_runtime": 733.2702, "train_tokens_per_second": 6900.725 }, { "epoch": 0.21839563208735827, "grad_norm": 0.5361617803573608, "learning_rate": 4.9352262321638056e-05, "loss": 0.528, "num_input_tokens_seen": 5120168, "step": 455, "train_runtime": 741.6463, "train_tokens_per_second": 6903.787 }, { "epoch": 0.22079558408831823, "grad_norm": 0.6068050265312195, "learning_rate": 4.9337980056618006e-05, "loss": 0.5462, "num_input_tokens_seen": 5175776, "step": 460, "train_runtime": 750.017, "train_tokens_per_second": 6900.878 }, { "epoch": 0.22319553608927822, "grad_norm": 0.6304349899291992, "learning_rate": 4.932354415802959e-05, "loss": 0.5399, "num_input_tokens_seen": 5232032, "step": 465, "train_runtime": 758.2013, "train_tokens_per_second": 6900.584 }, { "epoch": 0.2255954880902382, "grad_norm": 0.5615517497062683, "learning_rate": 4.9308954716999464e-05, "loss": 0.5224, "num_input_tokens_seen": 5292688, "step": 470, "train_runtime": 766.6597, "train_tokens_per_second": 6903.569 }, { "epoch": 0.22799544009119818, "grad_norm": 0.7061598896980286, "learning_rate": 4.92942118256235e-05, "loss": 0.5335, "num_input_tokens_seen": 5353096, "step": 475, "train_runtime": 775.3136, "train_tokens_per_second": 6904.427 }, { "epoch": 0.23039539209215815, "grad_norm": 0.6964676976203918, "learning_rate": 4.9279315576966265e-05, "loss": 0.4755, "num_input_tokens_seen": 5412360, "step": 480, "train_runtime": 784.1147, "train_tokens_per_second": 6902.511 }, { "epoch": 0.23279534409311814, "grad_norm": 0.6583765745162964, "learning_rate": 4.926426606506036e-05, "loss": 0.5725, "num_input_tokens_seen": 5466664, "step": 485, "train_runtime": 792.1938, "train_tokens_per_second": 6900.665 }, { "epoch": 0.23519529609407813, "grad_norm": 0.6751510500907898, "learning_rate": 4.924906338490586e-05, "loss": 0.5181, "num_input_tokens_seen": 5526480, "step": 490, "train_runtime": 800.9788, "train_tokens_per_second": 6899.658 }, { "epoch": 0.2375952480950381, "grad_norm": 0.5503116250038147, "learning_rate": 4.9233707632469746e-05, "loss": 0.5586, "num_input_tokens_seen": 5579704, "step": 495, "train_runtime": 808.8081, "train_tokens_per_second": 6898.674 }, { "epoch": 0.2399952000959981, "grad_norm": 0.5688736438751221, "learning_rate": 4.921819890468523e-05, "loss": 0.5465, "num_input_tokens_seen": 5633168, "step": 500, "train_runtime": 816.7042, "train_tokens_per_second": 6897.44 }, { "epoch": 0.24239515209695806, "grad_norm": 0.48173242807388306, "learning_rate": 4.9202537299451215e-05, "loss": 0.488, "num_input_tokens_seen": 5692232, "step": 505, "train_runtime": 825.1134, "train_tokens_per_second": 6898.727 }, { "epoch": 0.24479510409791805, "grad_norm": 0.5660738945007324, "learning_rate": 4.9186722915631626e-05, "loss": 0.5354, "num_input_tokens_seen": 5751464, "step": 510, "train_runtime": 833.8275, "train_tokens_per_second": 6897.666 }, { "epoch": 0.247195056098878, "grad_norm": 0.5903744697570801, "learning_rate": 4.9170755853054806e-05, "loss": 0.6093, "num_input_tokens_seen": 5811696, "step": 515, "train_runtime": 842.0821, "train_tokens_per_second": 6901.579 }, { "epoch": 0.249595008099838, "grad_norm": 0.6396485567092896, "learning_rate": 4.915463621251287e-05, "loss": 0.5436, "num_input_tokens_seen": 5867200, "step": 520, "train_runtime": 850.626, "train_tokens_per_second": 6897.509 }, { "epoch": 0.25199496010079797, "grad_norm": 0.5617818236351013, "learning_rate": 4.913836409576112e-05, "loss": 0.5537, "num_input_tokens_seen": 5924320, "step": 525, "train_runtime": 858.7807, "train_tokens_per_second": 6898.525 }, { "epoch": 0.25439491210175796, "grad_norm": 0.6151410937309265, "learning_rate": 4.912193960551732e-05, "loss": 0.5392, "num_input_tokens_seen": 5979680, "step": 530, "train_runtime": 866.7277, "train_tokens_per_second": 6899.145 }, { "epoch": 0.25679486410271796, "grad_norm": 0.6780862808227539, "learning_rate": 4.9105362845461114e-05, "loss": 0.5776, "num_input_tokens_seen": 6037568, "step": 535, "train_runtime": 874.7881, "train_tokens_per_second": 6901.749 }, { "epoch": 0.25919481610367795, "grad_norm": 0.6386091113090515, "learning_rate": 4.9088633920233345e-05, "loss": 0.5463, "num_input_tokens_seen": 6092712, "step": 540, "train_runtime": 883.1921, "train_tokens_per_second": 6898.513 }, { "epoch": 0.2615947681046379, "grad_norm": 0.49828580021858215, "learning_rate": 4.907175293543541e-05, "loss": 0.6055, "num_input_tokens_seen": 6147664, "step": 545, "train_runtime": 891.085, "train_tokens_per_second": 6899.077 }, { "epoch": 0.2639947201055979, "grad_norm": 0.5254030227661133, "learning_rate": 4.905471999762857e-05, "loss": 0.6124, "num_input_tokens_seen": 6199352, "step": 550, "train_runtime": 898.7767, "train_tokens_per_second": 6897.544 }, { "epoch": 0.2663946721065579, "grad_norm": 0.519650936126709, "learning_rate": 4.9037535214333287e-05, "loss": 0.5247, "num_input_tokens_seen": 6255144, "step": 555, "train_runtime": 906.8762, "train_tokens_per_second": 6897.462 }, { "epoch": 0.26879462410751787, "grad_norm": 0.568850040435791, "learning_rate": 4.9020198694028565e-05, "loss": 0.5647, "num_input_tokens_seen": 6306704, "step": 560, "train_runtime": 914.4502, "train_tokens_per_second": 6896.717 }, { "epoch": 0.2711945761084778, "grad_norm": 0.47335347533226013, "learning_rate": 4.900271054615123e-05, "loss": 0.4978, "num_input_tokens_seen": 6366360, "step": 565, "train_runtime": 923.5165, "train_tokens_per_second": 6893.607 }, { "epoch": 0.2735945281094378, "grad_norm": 0.6809021830558777, "learning_rate": 4.898507088109527e-05, "loss": 0.545, "num_input_tokens_seen": 6421288, "step": 570, "train_runtime": 931.4592, "train_tokens_per_second": 6893.794 }, { "epoch": 0.2759944801103978, "grad_norm": 0.41399407386779785, "learning_rate": 4.8967279810211114e-05, "loss": 0.5454, "num_input_tokens_seen": 6479424, "step": 575, "train_runtime": 939.8206, "train_tokens_per_second": 6894.32 }, { "epoch": 0.2783944321113578, "grad_norm": 0.6248930096626282, "learning_rate": 4.894933744580496e-05, "loss": 0.5506, "num_input_tokens_seen": 6534464, "step": 580, "train_runtime": 947.6162, "train_tokens_per_second": 6895.686 }, { "epoch": 0.2807943841123178, "grad_norm": 0.5835601687431335, "learning_rate": 4.893124390113802e-05, "loss": 0.5536, "num_input_tokens_seen": 6587088, "step": 585, "train_runtime": 955.2033, "train_tokens_per_second": 6896.006 }, { "epoch": 0.2831943361132777, "grad_norm": 0.6930661797523499, "learning_rate": 4.8912999290425854e-05, "loss": 0.5646, "num_input_tokens_seen": 6641552, "step": 590, "train_runtime": 963.1053, "train_tokens_per_second": 6895.977 }, { "epoch": 0.2855942881142377, "grad_norm": 0.6734236478805542, "learning_rate": 4.889460372883762e-05, "loss": 0.5492, "num_input_tokens_seen": 6695296, "step": 595, "train_runtime": 971.3483, "train_tokens_per_second": 6892.786 }, { "epoch": 0.2879942401151977, "grad_norm": 0.5208594799041748, "learning_rate": 4.887605733249535e-05, "loss": 0.5629, "num_input_tokens_seen": 6753000, "step": 600, "train_runtime": 979.3691, "train_tokens_per_second": 6895.255 }, { "epoch": 0.2903941921161577, "grad_norm": 0.5543494820594788, "learning_rate": 4.885736021847322e-05, "loss": 0.5165, "num_input_tokens_seen": 6808816, "step": 605, "train_runtime": 987.597, "train_tokens_per_second": 6894.326 }, { "epoch": 0.2927941441171177, "grad_norm": 0.4651249051094055, "learning_rate": 4.883851250479682e-05, "loss": 0.5292, "num_input_tokens_seen": 6866616, "step": 610, "train_runtime": 995.7307, "train_tokens_per_second": 6896.057 }, { "epoch": 0.2951940961180776, "grad_norm": 0.6964675188064575, "learning_rate": 4.881951431044241e-05, "loss": 0.5368, "num_input_tokens_seen": 6926136, "step": 615, "train_runtime": 1004.3343, "train_tokens_per_second": 6896.246 }, { "epoch": 0.2975940481190376, "grad_norm": 0.5867466330528259, "learning_rate": 4.8800365755336114e-05, "loss": 0.5104, "num_input_tokens_seen": 6982680, "step": 620, "train_runtime": 1012.6109, "train_tokens_per_second": 6895.719 }, { "epoch": 0.2999940001199976, "grad_norm": 0.7193952202796936, "learning_rate": 4.8781066960353264e-05, "loss": 0.5729, "num_input_tokens_seen": 7035152, "step": 625, "train_runtime": 1020.3607, "train_tokens_per_second": 6894.77 }, { "epoch": 0.3023939521209576, "grad_norm": 0.6436483860015869, "learning_rate": 4.876161804731756e-05, "loss": 0.5777, "num_input_tokens_seen": 7085976, "step": 630, "train_runtime": 1028.043, "train_tokens_per_second": 6892.684 }, { "epoch": 0.30479390412191754, "grad_norm": 0.8330582976341248, "learning_rate": 4.87420191390003e-05, "loss": 0.5729, "num_input_tokens_seen": 7139560, "step": 635, "train_runtime": 1035.6955, "train_tokens_per_second": 6893.493 }, { "epoch": 0.30719385612287753, "grad_norm": 0.5311642289161682, "learning_rate": 4.872227035911967e-05, "loss": 0.5212, "num_input_tokens_seen": 7194016, "step": 640, "train_runtime": 1043.4188, "train_tokens_per_second": 6894.658 }, { "epoch": 0.3095938081238375, "grad_norm": 0.5079819560050964, "learning_rate": 4.87023718323399e-05, "loss": 0.5227, "num_input_tokens_seen": 7249352, "step": 645, "train_runtime": 1051.5718, "train_tokens_per_second": 6893.825 }, { "epoch": 0.3119937601247975, "grad_norm": 0.5671476721763611, "learning_rate": 4.868232368427048e-05, "loss": 0.5057, "num_input_tokens_seen": 7312768, "step": 650, "train_runtime": 1060.8218, "train_tokens_per_second": 6893.494 }, { "epoch": 0.3143937121257575, "grad_norm": 0.5429338216781616, "learning_rate": 4.8662126041465414e-05, "loss": 0.522, "num_input_tokens_seen": 7371440, "step": 655, "train_runtime": 1068.911, "train_tokens_per_second": 6896.215 }, { "epoch": 0.31679366412671744, "grad_norm": 0.5430482625961304, "learning_rate": 4.864177903142237e-05, "loss": 0.5712, "num_input_tokens_seen": 7428856, "step": 660, "train_runtime": 1077.1698, "train_tokens_per_second": 6896.643 }, { "epoch": 0.31919361612767744, "grad_norm": 0.5577422380447388, "learning_rate": 4.862128278258191e-05, "loss": 0.5763, "num_input_tokens_seen": 7482928, "step": 665, "train_runtime": 1085.0793, "train_tokens_per_second": 6896.204 }, { "epoch": 0.32159356812863743, "grad_norm": 0.8080245852470398, "learning_rate": 4.8600637424326676e-05, "loss": 0.5921, "num_input_tokens_seen": 7537000, "step": 670, "train_runtime": 1092.9929, "train_tokens_per_second": 6895.744 }, { "epoch": 0.3239935201295974, "grad_norm": 0.5444366931915283, "learning_rate": 4.8579843086980536e-05, "loss": 0.5164, "num_input_tokens_seen": 7600512, "step": 675, "train_runtime": 1102.2798, "train_tokens_per_second": 6895.266 }, { "epoch": 0.3263934721305574, "grad_norm": 0.7307661771774292, "learning_rate": 4.855889990180781e-05, "loss": 0.4963, "num_input_tokens_seen": 7655032, "step": 680, "train_runtime": 1110.5484, "train_tokens_per_second": 6893.02 }, { "epoch": 0.32879342413151735, "grad_norm": 0.5061231851577759, "learning_rate": 4.853780800101241e-05, "loss": 0.5042, "num_input_tokens_seen": 7709432, "step": 685, "train_runtime": 1118.5898, "train_tokens_per_second": 6892.099 }, { "epoch": 0.33119337613247735, "grad_norm": 0.5457553863525391, "learning_rate": 4.851656751773702e-05, "loss": 0.5505, "num_input_tokens_seen": 7768248, "step": 690, "train_runtime": 1127.0452, "train_tokens_per_second": 6892.579 }, { "epoch": 0.33359332813343734, "grad_norm": 0.581109881401062, "learning_rate": 4.849517858606225e-05, "loss": 0.5219, "num_input_tokens_seen": 7821976, "step": 695, "train_runtime": 1135.0848, "train_tokens_per_second": 6891.094 }, { "epoch": 0.33599328013439733, "grad_norm": 0.6451846361160278, "learning_rate": 4.84736413410058e-05, "loss": 0.541, "num_input_tokens_seen": 7875264, "step": 700, "train_runtime": 1143.0269, "train_tokens_per_second": 6889.833 }, { "epoch": 0.33839323213535727, "grad_norm": 0.48146116733551025, "learning_rate": 4.8451955918521586e-05, "loss": 0.5666, "num_input_tokens_seen": 7929656, "step": 705, "train_runtime": 1150.8799, "train_tokens_per_second": 6890.081 }, { "epoch": 0.34079318413631726, "grad_norm": 0.5709965825080872, "learning_rate": 4.84301224554989e-05, "loss": 0.5295, "num_input_tokens_seen": 7980872, "step": 710, "train_runtime": 1158.4978, "train_tokens_per_second": 6888.983 }, { "epoch": 0.34319313613727725, "grad_norm": 0.6052954196929932, "learning_rate": 4.840814108976154e-05, "loss": 0.5509, "num_input_tokens_seen": 8037376, "step": 715, "train_runtime": 1166.5335, "train_tokens_per_second": 6889.966 }, { "epoch": 0.34559308813823725, "grad_norm": 0.5755806565284729, "learning_rate": 4.838601196006694e-05, "loss": 0.54, "num_input_tokens_seen": 8094024, "step": 720, "train_runtime": 1175.0556, "train_tokens_per_second": 6888.205 }, { "epoch": 0.34799304013919724, "grad_norm": 0.5676959753036499, "learning_rate": 4.8363735206105276e-05, "loss": 0.5663, "num_input_tokens_seen": 8152456, "step": 725, "train_runtime": 1183.2718, "train_tokens_per_second": 6889.758 }, { "epoch": 0.3503929921401572, "grad_norm": 0.7371501922607422, "learning_rate": 4.8341310968498656e-05, "loss": 0.5171, "num_input_tokens_seen": 8206424, "step": 730, "train_runtime": 1191.1851, "train_tokens_per_second": 6889.294 }, { "epoch": 0.35279294414111717, "grad_norm": 0.6847190260887146, "learning_rate": 4.831873938880012e-05, "loss": 0.5407, "num_input_tokens_seen": 8262160, "step": 735, "train_runtime": 1199.2457, "train_tokens_per_second": 6889.464 }, { "epoch": 0.35519289614207716, "grad_norm": 0.5282928347587585, "learning_rate": 4.829602060949282e-05, "loss": 0.5729, "num_input_tokens_seen": 8316480, "step": 740, "train_runtime": 1207.1347, "train_tokens_per_second": 6889.438 }, { "epoch": 0.35759284814303716, "grad_norm": 0.39273539185523987, "learning_rate": 4.827315477398914e-05, "loss": 0.4796, "num_input_tokens_seen": 8379024, "step": 745, "train_runtime": 1216.4818, "train_tokens_per_second": 6887.916 }, { "epoch": 0.3599928001439971, "grad_norm": 0.442878395318985, "learning_rate": 4.825014202662972e-05, "loss": 0.5178, "num_input_tokens_seen": 8436408, "step": 750, "train_runtime": 1224.6516, "train_tokens_per_second": 6888.823 }, { "epoch": 0.3623927521449571, "grad_norm": 0.5023097395896912, "learning_rate": 4.82269825126826e-05, "loss": 0.5436, "num_input_tokens_seen": 8494184, "step": 755, "train_runtime": 1233.1902, "train_tokens_per_second": 6887.975 }, { "epoch": 0.3647927041459171, "grad_norm": 0.6507300138473511, "learning_rate": 4.8203676378342263e-05, "loss": 0.5761, "num_input_tokens_seen": 8543600, "step": 760, "train_runtime": 1240.5356, "train_tokens_per_second": 6887.025 }, { "epoch": 0.36719265614687707, "grad_norm": 0.6500417590141296, "learning_rate": 4.818022377072876e-05, "loss": 0.5519, "num_input_tokens_seen": 8601672, "step": 765, "train_runtime": 1249.3942, "train_tokens_per_second": 6884.674 }, { "epoch": 0.36959260814783707, "grad_norm": 0.720543384552002, "learning_rate": 4.8156624837886744e-05, "loss": 0.5488, "num_input_tokens_seen": 8654824, "step": 770, "train_runtime": 1257.4098, "train_tokens_per_second": 6883.058 }, { "epoch": 0.371992560148797, "grad_norm": 0.5728187561035156, "learning_rate": 4.813287972878454e-05, "loss": 0.5093, "num_input_tokens_seen": 8709032, "step": 775, "train_runtime": 1265.5372, "train_tokens_per_second": 6881.688 }, { "epoch": 0.374392512149757, "grad_norm": 0.6271533966064453, "learning_rate": 4.810898859331322e-05, "loss": 0.5451, "num_input_tokens_seen": 8766264, "step": 780, "train_runtime": 1273.7019, "train_tokens_per_second": 6882.508 }, { "epoch": 0.376792464150717, "grad_norm": 0.5907756686210632, "learning_rate": 4.8084951582285634e-05, "loss": 0.4965, "num_input_tokens_seen": 8820344, "step": 785, "train_runtime": 1281.2863, "train_tokens_per_second": 6883.976 }, { "epoch": 0.379192416151677, "grad_norm": 0.5380600094795227, "learning_rate": 4.80607688474355e-05, "loss": 0.5298, "num_input_tokens_seen": 8881528, "step": 790, "train_runtime": 1289.7202, "train_tokens_per_second": 6886.399 }, { "epoch": 0.381592368152637, "grad_norm": 0.6812204718589783, "learning_rate": 4.803644054141639e-05, "loss": 0.5278, "num_input_tokens_seen": 8939712, "step": 795, "train_runtime": 1298.1298, "train_tokens_per_second": 6886.609 }, { "epoch": 0.3839923201535969, "grad_norm": 0.8065762519836426, "learning_rate": 4.8011966817800804e-05, "loss": 0.548, "num_input_tokens_seen": 8994888, "step": 800, "train_runtime": 1306.2424, "train_tokens_per_second": 6886.078 }, { "epoch": 0.3863922721545569, "grad_norm": 0.7721138596534729, "learning_rate": 4.79873478310792e-05, "loss": 0.5446, "num_input_tokens_seen": 9052200, "step": 805, "train_runtime": 1314.0422, "train_tokens_per_second": 6888.82 }, { "epoch": 0.3887922241555169, "grad_norm": 0.7508792281150818, "learning_rate": 4.796258373665899e-05, "loss": 0.5531, "num_input_tokens_seen": 9106936, "step": 810, "train_runtime": 1322.1708, "train_tokens_per_second": 6887.867 }, { "epoch": 0.3911921761564769, "grad_norm": 0.7303242087364197, "learning_rate": 4.793767469086361e-05, "loss": 0.5786, "num_input_tokens_seen": 9158400, "step": 815, "train_runtime": 1329.9099, "train_tokens_per_second": 6886.482 }, { "epoch": 0.3935921281574368, "grad_norm": 0.5493381023406982, "learning_rate": 4.791262085093147e-05, "loss": 0.5285, "num_input_tokens_seen": 9218552, "step": 820, "train_runtime": 1338.4057, "train_tokens_per_second": 6887.711 }, { "epoch": 0.3959920801583968, "grad_norm": 0.5721644163131714, "learning_rate": 4.788742237501499e-05, "loss": 0.5481, "num_input_tokens_seen": 9272768, "step": 825, "train_runtime": 1346.3952, "train_tokens_per_second": 6887.107 }, { "epoch": 0.3983920321593568, "grad_norm": 0.5689188241958618, "learning_rate": 4.786207942217965e-05, "loss": 0.5457, "num_input_tokens_seen": 9327048, "step": 830, "train_runtime": 1354.3004, "train_tokens_per_second": 6886.986 }, { "epoch": 0.4007919841603168, "grad_norm": 0.48985663056373596, "learning_rate": 4.783659215240289e-05, "loss": 0.5067, "num_input_tokens_seen": 9389344, "step": 835, "train_runtime": 1363.2987, "train_tokens_per_second": 6887.224 }, { "epoch": 0.4031919361612768, "grad_norm": 0.7661736011505127, "learning_rate": 4.78109607265732e-05, "loss": 0.5576, "num_input_tokens_seen": 9444656, "step": 840, "train_runtime": 1371.5402, "train_tokens_per_second": 6886.168 }, { "epoch": 0.40559188816223674, "grad_norm": 0.6617030501365662, "learning_rate": 4.778518530648899e-05, "loss": 0.5566, "num_input_tokens_seen": 9499464, "step": 845, "train_runtime": 1379.7517, "train_tokens_per_second": 6884.908 }, { "epoch": 0.40799184016319673, "grad_norm": 0.6450020670890808, "learning_rate": 4.77592660548577e-05, "loss": 0.5486, "num_input_tokens_seen": 9553432, "step": 850, "train_runtime": 1387.7923, "train_tokens_per_second": 6883.906 }, { "epoch": 0.4103917921641567, "grad_norm": 0.6538447737693787, "learning_rate": 4.7733203135294676e-05, "loss": 0.5289, "num_input_tokens_seen": 9608536, "step": 855, "train_runtime": 1396.0755, "train_tokens_per_second": 6882.533 }, { "epoch": 0.4127917441651167, "grad_norm": 0.5988488793373108, "learning_rate": 4.770699671232216e-05, "loss": 0.5261, "num_input_tokens_seen": 9661208, "step": 860, "train_runtime": 1403.8269, "train_tokens_per_second": 6882.051 }, { "epoch": 0.41519169616607665, "grad_norm": 0.5807068347930908, "learning_rate": 4.768064695136829e-05, "loss": 0.5306, "num_input_tokens_seen": 9721752, "step": 865, "train_runtime": 1412.2285, "train_tokens_per_second": 6883.98 }, { "epoch": 0.41759164816703664, "grad_norm": 0.48121166229248047, "learning_rate": 4.765415401876599e-05, "loss": 0.5549, "num_input_tokens_seen": 9779768, "step": 870, "train_runtime": 1420.4407, "train_tokens_per_second": 6885.024 }, { "epoch": 0.41999160016799664, "grad_norm": 0.565889835357666, "learning_rate": 4.7627518081751975e-05, "loss": 0.5355, "num_input_tokens_seen": 9835272, "step": 875, "train_runtime": 1428.7292, "train_tokens_per_second": 6883.93 }, { "epoch": 0.42239155216895663, "grad_norm": 0.7845768928527832, "learning_rate": 4.760073930846569e-05, "loss": 0.5411, "num_input_tokens_seen": 9890512, "step": 880, "train_runtime": 1436.5286, "train_tokens_per_second": 6885.009 }, { "epoch": 0.4247915041699166, "grad_norm": 0.6052142381668091, "learning_rate": 4.75738178679482e-05, "loss": 0.5432, "num_input_tokens_seen": 9944392, "step": 885, "train_runtime": 1444.2701, "train_tokens_per_second": 6885.41 }, { "epoch": 0.42719145617087656, "grad_norm": 0.6109101176261902, "learning_rate": 4.754675393014117e-05, "loss": 0.4997, "num_input_tokens_seen": 9999080, "step": 890, "train_runtime": 1452.4138, "train_tokens_per_second": 6884.457 }, { "epoch": 0.42959140817183655, "grad_norm": 0.8205054998397827, "learning_rate": 4.751954766588581e-05, "loss": 0.5276, "num_input_tokens_seen": 10053320, "step": 895, "train_runtime": 1460.9561, "train_tokens_per_second": 6881.329 }, { "epoch": 0.43199136017279655, "grad_norm": 0.6081852316856384, "learning_rate": 4.749219924692172e-05, "loss": 0.4801, "num_input_tokens_seen": 10112592, "step": 900, "train_runtime": 1469.5246, "train_tokens_per_second": 6881.54 }, { "epoch": 0.43439131217375654, "grad_norm": 0.6444746851921082, "learning_rate": 4.7464708845885877e-05, "loss": 0.4976, "num_input_tokens_seen": 10168072, "step": 905, "train_runtime": 1477.8554, "train_tokens_per_second": 6880.289 }, { "epoch": 0.43679126417471653, "grad_norm": 0.591349184513092, "learning_rate": 4.7437076636311514e-05, "loss": 0.5343, "num_input_tokens_seen": 10221648, "step": 910, "train_runtime": 1486.1153, "train_tokens_per_second": 6878.099 }, { "epoch": 0.43919121617567647, "grad_norm": 0.6491187810897827, "learning_rate": 4.7409302792627044e-05, "loss": 0.4946, "num_input_tokens_seen": 10284000, "step": 915, "train_runtime": 1494.8745, "train_tokens_per_second": 6879.507 }, { "epoch": 0.44159116817663646, "grad_norm": 0.6963967084884644, "learning_rate": 4.738138749015492e-05, "loss": 0.5109, "num_input_tokens_seen": 10340768, "step": 920, "train_runtime": 1502.9982, "train_tokens_per_second": 6880.094 }, { "epoch": 0.44399112017759645, "grad_norm": 0.4319298267364502, "learning_rate": 4.735333090511056e-05, "loss": 0.5082, "num_input_tokens_seen": 10400928, "step": 925, "train_runtime": 1511.7616, "train_tokens_per_second": 6880.006 }, { "epoch": 0.44639107217855645, "grad_norm": 0.6248960494995117, "learning_rate": 4.732513321460127e-05, "loss": 0.5612, "num_input_tokens_seen": 10456208, "step": 930, "train_runtime": 1519.9104, "train_tokens_per_second": 6879.49 }, { "epoch": 0.4487910241795164, "grad_norm": 0.7751626372337341, "learning_rate": 4.729679459662502e-05, "loss": 0.5253, "num_input_tokens_seen": 10513352, "step": 935, "train_runtime": 1528.6579, "train_tokens_per_second": 6877.505 }, { "epoch": 0.4511909761804764, "grad_norm": 0.5862913131713867, "learning_rate": 4.726831523006944e-05, "loss": 0.5403, "num_input_tokens_seen": 10568872, "step": 940, "train_runtime": 1537.6257, "train_tokens_per_second": 6873.501 }, { "epoch": 0.45359092818143637, "grad_norm": 0.7188037633895874, "learning_rate": 4.7239695294710586e-05, "loss": 0.5332, "num_input_tokens_seen": 10623984, "step": 945, "train_runtime": 1545.7364, "train_tokens_per_second": 6873.089 }, { "epoch": 0.45599088018239636, "grad_norm": 0.7903031706809998, "learning_rate": 4.7210934971211906e-05, "loss": 0.572, "num_input_tokens_seen": 10675064, "step": 950, "train_runtime": 1553.3218, "train_tokens_per_second": 6872.41 }, { "epoch": 0.45839083218335636, "grad_norm": 0.5360180139541626, "learning_rate": 4.718203444112301e-05, "loss": 0.4812, "num_input_tokens_seen": 10735624, "step": 955, "train_runtime": 1561.8181, "train_tokens_per_second": 6873.799 }, { "epoch": 0.4607907841843163, "grad_norm": 0.6711071133613586, "learning_rate": 4.7152993886878585e-05, "loss": 0.4681, "num_input_tokens_seen": 10790472, "step": 960, "train_runtime": 1569.6307, "train_tokens_per_second": 6874.529 }, { "epoch": 0.4631907361852763, "grad_norm": 0.6165657043457031, "learning_rate": 4.712381349179721e-05, "loss": 0.508, "num_input_tokens_seen": 10844896, "step": 965, "train_runtime": 1577.4118, "train_tokens_per_second": 6875.12 }, { "epoch": 0.4655906881862363, "grad_norm": 0.6834767460823059, "learning_rate": 4.709449344008021e-05, "loss": 0.4988, "num_input_tokens_seen": 10902552, "step": 970, "train_runtime": 1586.0811, "train_tokens_per_second": 6873.893 }, { "epoch": 0.46799064018719627, "grad_norm": 0.7366524338722229, "learning_rate": 4.706503391681049e-05, "loss": 0.5755, "num_input_tokens_seen": 10956224, "step": 975, "train_runtime": 1593.6535, "train_tokens_per_second": 6874.91 }, { "epoch": 0.47039059218815626, "grad_norm": 0.5903698205947876, "learning_rate": 4.7035435107951384e-05, "loss": 0.5283, "num_input_tokens_seen": 11011072, "step": 980, "train_runtime": 1601.6979, "train_tokens_per_second": 6874.625 }, { "epoch": 0.4727905441891162, "grad_norm": 0.631288468837738, "learning_rate": 4.700569720034545e-05, "loss": 0.4954, "num_input_tokens_seen": 11066344, "step": 985, "train_runtime": 1609.8085, "train_tokens_per_second": 6874.323 }, { "epoch": 0.4751904961900762, "grad_norm": 0.5448499917984009, "learning_rate": 4.697582038171332e-05, "loss": 0.5431, "num_input_tokens_seen": 11121472, "step": 990, "train_runtime": 1618.0718, "train_tokens_per_second": 6873.287 }, { "epoch": 0.4775904481910362, "grad_norm": 0.5397956967353821, "learning_rate": 4.694580484065248e-05, "loss": 0.4836, "num_input_tokens_seen": 11181736, "step": 995, "train_runtime": 1627.2301, "train_tokens_per_second": 6871.638 }, { "epoch": 0.4799904001919962, "grad_norm": 0.7059435248374939, "learning_rate": 4.6915650766636156e-05, "loss": 0.4765, "num_input_tokens_seen": 11241912, "step": 1000, "train_runtime": 1635.6606, "train_tokens_per_second": 6873.01 }, { "epoch": 0.4823903521929561, "grad_norm": 0.5551899075508118, "learning_rate": 4.6885358350011986e-05, "loss": 0.523, "num_input_tokens_seen": 11296568, "step": 1005, "train_runtime": 1644.0041, "train_tokens_per_second": 6871.375 }, { "epoch": 0.4847903041939161, "grad_norm": 0.659951388835907, "learning_rate": 4.6854927782000954e-05, "loss": 0.4891, "num_input_tokens_seen": 11351944, "step": 1010, "train_runtime": 1652.1239, "train_tokens_per_second": 6871.121 }, { "epoch": 0.4871902561948761, "grad_norm": 0.6763627529144287, "learning_rate": 4.6824359254696105e-05, "loss": 0.502, "num_input_tokens_seen": 11410584, "step": 1015, "train_runtime": 1661.21, "train_tokens_per_second": 6868.839 }, { "epoch": 0.4895902081958361, "grad_norm": 0.49618440866470337, "learning_rate": 4.6793652961061364e-05, "loss": 0.5451, "num_input_tokens_seen": 11465560, "step": 1020, "train_runtime": 1669.6454, "train_tokens_per_second": 6867.063 }, { "epoch": 0.4919901601967961, "grad_norm": 0.6427881717681885, "learning_rate": 4.676280909493028e-05, "loss": 0.5277, "num_input_tokens_seen": 11523960, "step": 1025, "train_runtime": 1678.2247, "train_tokens_per_second": 6866.756 }, { "epoch": 0.494390112197756, "grad_norm": 0.7086818218231201, "learning_rate": 4.673182785100485e-05, "loss": 0.4885, "num_input_tokens_seen": 11584904, "step": 1030, "train_runtime": 1687.3682, "train_tokens_per_second": 6865.665 }, { "epoch": 0.496790064198716, "grad_norm": 0.5998096466064453, "learning_rate": 4.6700709424854274e-05, "loss": 0.5266, "num_input_tokens_seen": 11642456, "step": 1035, "train_runtime": 1696.3396, "train_tokens_per_second": 6863.281 }, { "epoch": 0.499190016199676, "grad_norm": 0.6782186627388, "learning_rate": 4.66694540129137e-05, "loss": 0.5813, "num_input_tokens_seen": 11696912, "step": 1040, "train_runtime": 1704.0809, "train_tokens_per_second": 6864.059 }, { "epoch": 0.501589968200636, "grad_norm": 0.541053056716919, "learning_rate": 4.6638061812483005e-05, "loss": 0.4875, "num_input_tokens_seen": 11755104, "step": 1045, "train_runtime": 1712.4044, "train_tokens_per_second": 6864.677 }, { "epoch": 0.5039899202015959, "grad_norm": 0.6517828106880188, "learning_rate": 4.660653302172554e-05, "loss": 0.5367, "num_input_tokens_seen": 11810984, "step": 1050, "train_runtime": 1720.4999, "train_tokens_per_second": 6864.856 }, { "epoch": 0.5063898722025559, "grad_norm": 0.6961039900779724, "learning_rate": 4.6574867839666895e-05, "loss": 0.5314, "num_input_tokens_seen": 11862312, "step": 1055, "train_runtime": 1728.5831, "train_tokens_per_second": 6862.448 }, { "epoch": 0.5087898242035159, "grad_norm": 0.7300373911857605, "learning_rate": 4.654306646619361e-05, "loss": 0.5496, "num_input_tokens_seen": 11923072, "step": 1060, "train_runtime": 1737.4285, "train_tokens_per_second": 6862.482 }, { "epoch": 0.5111897762044759, "grad_norm": 0.7295413017272949, "learning_rate": 4.6511129102051954e-05, "loss": 0.5205, "num_input_tokens_seen": 11978568, "step": 1065, "train_runtime": 1745.4719, "train_tokens_per_second": 6862.653 }, { "epoch": 0.5135897282054359, "grad_norm": 0.5520017743110657, "learning_rate": 4.647905594884663e-05, "loss": 0.4768, "num_input_tokens_seen": 12035000, "step": 1070, "train_runtime": 1754.3574, "train_tokens_per_second": 6860.062 }, { "epoch": 0.5159896802063959, "grad_norm": 0.5629371404647827, "learning_rate": 4.6446847209039504e-05, "loss": 0.5136, "num_input_tokens_seen": 12096040, "step": 1075, "train_runtime": 1762.6751, "train_tokens_per_second": 6862.32 }, { "epoch": 0.5183896322073559, "grad_norm": 0.750357449054718, "learning_rate": 4.6414503085948334e-05, "loss": 0.5022, "num_input_tokens_seen": 12148448, "step": 1080, "train_runtime": 1770.3881, "train_tokens_per_second": 6862.025 }, { "epoch": 0.5207895842083158, "grad_norm": 0.9546124339103699, "learning_rate": 4.63820237837455e-05, "loss": 0.5196, "num_input_tokens_seen": 12207120, "step": 1085, "train_runtime": 1778.4216, "train_tokens_per_second": 6864.019 }, { "epoch": 0.5231895362092758, "grad_norm": 0.6891536712646484, "learning_rate": 4.634940950745668e-05, "loss": 0.5566, "num_input_tokens_seen": 12261136, "step": 1090, "train_runtime": 1786.6486, "train_tokens_per_second": 6862.646 }, { "epoch": 0.5255894882102358, "grad_norm": 0.7175304889678955, "learning_rate": 4.631666046295959e-05, "loss": 0.5483, "num_input_tokens_seen": 12313856, "step": 1095, "train_runtime": 1794.8084, "train_tokens_per_second": 6860.819 }, { "epoch": 0.5279894402111958, "grad_norm": 0.7148723602294922, "learning_rate": 4.628377685698268e-05, "loss": 0.5072, "num_input_tokens_seen": 12367984, "step": 1100, "train_runtime": 1802.927, "train_tokens_per_second": 6859.947 }, { "epoch": 0.5303893922121558, "grad_norm": 0.6276180148124695, "learning_rate": 4.6250758897103775e-05, "loss": 0.5316, "num_input_tokens_seen": 12422128, "step": 1105, "train_runtime": 1810.8688, "train_tokens_per_second": 6859.761 }, { "epoch": 0.5327893442131157, "grad_norm": 0.5570586919784546, "learning_rate": 4.621760679174887e-05, "loss": 0.4781, "num_input_tokens_seen": 12477576, "step": 1110, "train_runtime": 1818.8781, "train_tokens_per_second": 6860.04 }, { "epoch": 0.5351892962140757, "grad_norm": 0.46177980303764343, "learning_rate": 4.618432075019071e-05, "loss": 0.5028, "num_input_tokens_seen": 12536840, "step": 1115, "train_runtime": 1827.02, "train_tokens_per_second": 6861.906 }, { "epoch": 0.5375892482150357, "grad_norm": 0.8723595142364502, "learning_rate": 4.615090098254753e-05, "loss": 0.5637, "num_input_tokens_seen": 12592424, "step": 1120, "train_runtime": 1835.1133, "train_tokens_per_second": 6861.933 }, { "epoch": 0.5399892002159957, "grad_norm": 0.5950156450271606, "learning_rate": 4.6117347699781706e-05, "loss": 0.5276, "num_input_tokens_seen": 12650424, "step": 1125, "train_runtime": 1843.7548, "train_tokens_per_second": 6861.229 }, { "epoch": 0.5423891522169556, "grad_norm": 0.7282635569572449, "learning_rate": 4.608366111369843e-05, "loss": 0.518, "num_input_tokens_seen": 12706224, "step": 1130, "train_runtime": 1851.8221, "train_tokens_per_second": 6861.471 }, { "epoch": 0.5447891042179156, "grad_norm": 0.5508381724357605, "learning_rate": 4.6049841436944385e-05, "loss": 0.4956, "num_input_tokens_seen": 12767096, "step": 1135, "train_runtime": 1860.911, "train_tokens_per_second": 6860.67 }, { "epoch": 0.5471890562188756, "grad_norm": 0.57481849193573, "learning_rate": 4.6015888883006364e-05, "loss": 0.539, "num_input_tokens_seen": 12821808, "step": 1140, "train_runtime": 1868.849, "train_tokens_per_second": 6860.805 }, { "epoch": 0.5495890082198356, "grad_norm": 0.4912041425704956, "learning_rate": 4.598180366620996e-05, "loss": 0.5163, "num_input_tokens_seen": 12874928, "step": 1145, "train_runtime": 1876.9045, "train_tokens_per_second": 6859.661 }, { "epoch": 0.5519889602207956, "grad_norm": 0.666242778301239, "learning_rate": 4.594758600171821e-05, "loss": 0.5662, "num_input_tokens_seen": 12927848, "step": 1150, "train_runtime": 1884.5707, "train_tokens_per_second": 6859.837 }, { "epoch": 0.5543889122217556, "grad_norm": 0.6598814129829407, "learning_rate": 4.591323610553021e-05, "loss": 0.493, "num_input_tokens_seen": 12985640, "step": 1155, "train_runtime": 1892.6667, "train_tokens_per_second": 6861.028 }, { "epoch": 0.5567888642227156, "grad_norm": 0.8162060379981995, "learning_rate": 4.587875419447979e-05, "loss": 0.5289, "num_input_tokens_seen": 13041608, "step": 1160, "train_runtime": 1901.1294, "train_tokens_per_second": 6859.927 }, { "epoch": 0.5591888162236756, "grad_norm": 0.7061068415641785, "learning_rate": 4.5844140486234086e-05, "loss": 0.4997, "num_input_tokens_seen": 13094240, "step": 1165, "train_runtime": 1909.2382, "train_tokens_per_second": 6858.358 }, { "epoch": 0.5615887682246355, "grad_norm": 0.5444318056106567, "learning_rate": 4.580939519929226e-05, "loss": 0.5155, "num_input_tokens_seen": 13150544, "step": 1170, "train_runtime": 1917.2941, "train_tokens_per_second": 6858.908 }, { "epoch": 0.5639887202255955, "grad_norm": 0.5705589652061462, "learning_rate": 4.577451855298402e-05, "loss": 0.4927, "num_input_tokens_seen": 13211016, "step": 1175, "train_runtime": 1925.8239, "train_tokens_per_second": 6859.93 }, { "epoch": 0.5663886722265554, "grad_norm": 0.6715133190155029, "learning_rate": 4.5739510767468295e-05, "loss": 0.5525, "num_input_tokens_seen": 13269168, "step": 1180, "train_runtime": 1934.5386, "train_tokens_per_second": 6859.087 }, { "epoch": 0.5687886242275154, "grad_norm": 0.5893720388412476, "learning_rate": 4.570437206373183e-05, "loss": 0.5094, "num_input_tokens_seen": 13326336, "step": 1185, "train_runtime": 1942.8203, "train_tokens_per_second": 6859.274 }, { "epoch": 0.5711885762284754, "grad_norm": 0.5553702116012573, "learning_rate": 4.5669102663587795e-05, "loss": 0.5036, "num_input_tokens_seen": 13382784, "step": 1190, "train_runtime": 1950.7367, "train_tokens_per_second": 6860.374 }, { "epoch": 0.5735885282294354, "grad_norm": 0.9842544198036194, "learning_rate": 4.563370278967437e-05, "loss": 0.523, "num_input_tokens_seen": 13438016, "step": 1195, "train_runtime": 1958.7567, "train_tokens_per_second": 6860.482 }, { "epoch": 0.5759884802303954, "grad_norm": 0.7406736612319946, "learning_rate": 4.559817266545337e-05, "loss": 0.562, "num_input_tokens_seen": 13492904, "step": 1200, "train_runtime": 1966.6536, "train_tokens_per_second": 6860.844 }, { "epoch": 0.5783884322313554, "grad_norm": 0.6010822057723999, "learning_rate": 4.5562512515208816e-05, "loss": 0.5257, "num_input_tokens_seen": 13546992, "step": 1205, "train_runtime": 1974.2569, "train_tokens_per_second": 6861.818 }, { "epoch": 0.5807883842323154, "grad_norm": 0.5682114362716675, "learning_rate": 4.5526722564045486e-05, "loss": 0.5234, "num_input_tokens_seen": 13599704, "step": 1210, "train_runtime": 1982.0432, "train_tokens_per_second": 6861.457 }, { "epoch": 0.5831883362332754, "grad_norm": 0.7476803064346313, "learning_rate": 4.5490803037887556e-05, "loss": 0.4522, "num_input_tokens_seen": 13658840, "step": 1215, "train_runtime": 1990.3973, "train_tokens_per_second": 6862.369 }, { "epoch": 0.5855882882342354, "grad_norm": 0.8684011697769165, "learning_rate": 4.545475416347714e-05, "loss": 0.504, "num_input_tokens_seen": 13712920, "step": 1220, "train_runtime": 1998.5695, "train_tokens_per_second": 6861.367 }, { "epoch": 0.5879882402351952, "grad_norm": 0.6915135383605957, "learning_rate": 4.5418576168372864e-05, "loss": 0.5473, "num_input_tokens_seen": 13768056, "step": 1225, "train_runtime": 2006.2278, "train_tokens_per_second": 6862.658 }, { "epoch": 0.5903881922361552, "grad_norm": 0.6309444308280945, "learning_rate": 4.538226928094841e-05, "loss": 0.5321, "num_input_tokens_seen": 13826288, "step": 1230, "train_runtime": 2014.608, "train_tokens_per_second": 6863.016 }, { "epoch": 0.5927881442371152, "grad_norm": 0.7776080965995789, "learning_rate": 4.534583373039112e-05, "loss": 0.5578, "num_input_tokens_seen": 13880688, "step": 1235, "train_runtime": 2022.5528, "train_tokens_per_second": 6862.955 }, { "epoch": 0.5951880962380752, "grad_norm": 0.5800984501838684, "learning_rate": 4.530926974670052e-05, "loss": 0.5097, "num_input_tokens_seen": 13937072, "step": 1240, "train_runtime": 2030.7522, "train_tokens_per_second": 6863.01 }, { "epoch": 0.5975880482390352, "grad_norm": 0.6254319548606873, "learning_rate": 4.5272577560686834e-05, "loss": 0.5038, "num_input_tokens_seen": 13990528, "step": 1245, "train_runtime": 2038.6265, "train_tokens_per_second": 6862.723 }, { "epoch": 0.5999880002399952, "grad_norm": 0.7174450755119324, "learning_rate": 4.523575740396962e-05, "loss": 0.5304, "num_input_tokens_seen": 14044296, "step": 1250, "train_runtime": 2046.8343, "train_tokens_per_second": 6861.472 }, { "epoch": 0.6023879522409552, "grad_norm": 0.7481257915496826, "learning_rate": 4.5198809508976206e-05, "loss": 0.4927, "num_input_tokens_seen": 14102520, "step": 1255, "train_runtime": 2055.3394, "train_tokens_per_second": 6861.407 }, { "epoch": 0.6047879042419152, "grad_norm": 0.909005343914032, "learning_rate": 4.516173410894028e-05, "loss": 0.5067, "num_input_tokens_seen": 14153848, "step": 1260, "train_runtime": 2062.8941, "train_tokens_per_second": 6861.161 }, { "epoch": 0.6071878562428752, "grad_norm": 0.674818754196167, "learning_rate": 4.512453143790042e-05, "loss": 0.528, "num_input_tokens_seen": 14210416, "step": 1265, "train_runtime": 2071.062, "train_tokens_per_second": 6861.415 }, { "epoch": 0.6095878082438351, "grad_norm": 0.7137752771377563, "learning_rate": 4.508720173069859e-05, "loss": 0.5395, "num_input_tokens_seen": 14263360, "step": 1270, "train_runtime": 2079.097, "train_tokens_per_second": 6860.363 }, { "epoch": 0.6119877602447951, "grad_norm": 0.5564314723014832, "learning_rate": 4.5049745222978665e-05, "loss": 0.522, "num_input_tokens_seen": 14320200, "step": 1275, "train_runtime": 2087.1564, "train_tokens_per_second": 6861.105 }, { "epoch": 0.6143877122457551, "grad_norm": 0.7505349516868591, "learning_rate": 4.501216215118498e-05, "loss": 0.5303, "num_input_tokens_seen": 14376904, "step": 1280, "train_runtime": 2095.304, "train_tokens_per_second": 6861.488 }, { "epoch": 0.616787664246715, "grad_norm": 0.6077600121498108, "learning_rate": 4.497445275256076e-05, "loss": 0.5027, "num_input_tokens_seen": 14434888, "step": 1285, "train_runtime": 2103.8746, "train_tokens_per_second": 6861.097 }, { "epoch": 0.619187616247675, "grad_norm": 0.6120113730430603, "learning_rate": 4.4936617265146696e-05, "loss": 0.5192, "num_input_tokens_seen": 14489232, "step": 1290, "train_runtime": 2112.087, "train_tokens_per_second": 6860.149 }, { "epoch": 0.621587568248635, "grad_norm": 0.7720391750335693, "learning_rate": 4.489865592777941e-05, "loss": 0.5137, "num_input_tokens_seen": 14543200, "step": 1295, "train_runtime": 2119.9779, "train_tokens_per_second": 6860.072 }, { "epoch": 0.623987520249595, "grad_norm": 0.8337739706039429, "learning_rate": 4.486056898008996e-05, "loss": 0.5647, "num_input_tokens_seen": 14597160, "step": 1300, "train_runtime": 2127.8662, "train_tokens_per_second": 6859.999 }, { "epoch": 0.626387472250555, "grad_norm": 0.6936734914779663, "learning_rate": 4.48223566625023e-05, "loss": 0.5372, "num_input_tokens_seen": 14656120, "step": 1305, "train_runtime": 2136.0775, "train_tokens_per_second": 6861.23 }, { "epoch": 0.628787424251515, "grad_norm": 0.42849820852279663, "learning_rate": 4.47840192162318e-05, "loss": 0.4987, "num_input_tokens_seen": 14715168, "step": 1310, "train_runtime": 2144.4803, "train_tokens_per_second": 6861.881 }, { "epoch": 0.6311873762524749, "grad_norm": 0.6073727607727051, "learning_rate": 4.47455568832837e-05, "loss": 0.5242, "num_input_tokens_seen": 14771992, "step": 1315, "train_runtime": 2152.4662, "train_tokens_per_second": 6862.822 }, { "epoch": 0.6335873282534349, "grad_norm": 0.81267911195755, "learning_rate": 4.470696990645158e-05, "loss": 0.5488, "num_input_tokens_seen": 14827224, "step": 1320, "train_runtime": 2160.85, "train_tokens_per_second": 6861.755 }, { "epoch": 0.6359872802543949, "grad_norm": 0.9082570672035217, "learning_rate": 4.4668258529315855e-05, "loss": 0.5578, "num_input_tokens_seen": 14880216, "step": 1325, "train_runtime": 2168.5347, "train_tokens_per_second": 6861.876 }, { "epoch": 0.6383872322553549, "grad_norm": 0.4958833158016205, "learning_rate": 4.462942299624219e-05, "loss": 0.4897, "num_input_tokens_seen": 14938264, "step": 1330, "train_runtime": 2176.7759, "train_tokens_per_second": 6862.564 }, { "epoch": 0.6407871842563149, "grad_norm": 0.5597286224365234, "learning_rate": 4.459046355238e-05, "loss": 0.5071, "num_input_tokens_seen": 14996424, "step": 1335, "train_runtime": 2184.9625, "train_tokens_per_second": 6863.47 }, { "epoch": 0.6431871362572749, "grad_norm": 0.5538758635520935, "learning_rate": 4.455138044366088e-05, "loss": 0.5117, "num_input_tokens_seen": 15054880, "step": 1340, "train_runtime": 2193.2667, "train_tokens_per_second": 6864.136 }, { "epoch": 0.6455870882582349, "grad_norm": 0.6640130877494812, "learning_rate": 4.4512173916797085e-05, "loss": 0.4721, "num_input_tokens_seen": 15117888, "step": 1345, "train_runtime": 2202.1591, "train_tokens_per_second": 6865.03 }, { "epoch": 0.6479870402591948, "grad_norm": 0.8442539572715759, "learning_rate": 4.447284421927991e-05, "loss": 0.554, "num_input_tokens_seen": 15175016, "step": 1350, "train_runtime": 2211.0135, "train_tokens_per_second": 6863.376 }, { "epoch": 0.6503869922601548, "grad_norm": 0.7367165684700012, "learning_rate": 4.443339159937818e-05, "loss": 0.5125, "num_input_tokens_seen": 15230944, "step": 1355, "train_runtime": 2219.076, "train_tokens_per_second": 6863.642 }, { "epoch": 0.6527869442611148, "grad_norm": 0.6845333576202393, "learning_rate": 4.439381630613668e-05, "loss": 0.5286, "num_input_tokens_seen": 15287896, "step": 1360, "train_runtime": 2227.4192, "train_tokens_per_second": 6863.502 }, { "epoch": 0.6551868962620747, "grad_norm": 0.6416659355163574, "learning_rate": 4.435411858937456e-05, "loss": 0.6131, "num_input_tokens_seen": 15342584, "step": 1365, "train_runtime": 2235.1031, "train_tokens_per_second": 6864.374 }, { "epoch": 0.6575868482630347, "grad_norm": 0.5809879302978516, "learning_rate": 4.431429869968378e-05, "loss": 0.5062, "num_input_tokens_seen": 15404096, "step": 1370, "train_runtime": 2243.5171, "train_tokens_per_second": 6866.048 }, { "epoch": 0.6599868002639947, "grad_norm": 0.6339114308357239, "learning_rate": 4.427435688842748e-05, "loss": 0.4943, "num_input_tokens_seen": 15462616, "step": 1375, "train_runtime": 2251.8474, "train_tokens_per_second": 6866.636 }, { "epoch": 0.6623867522649547, "grad_norm": 0.4654648005962372, "learning_rate": 4.423429340773847e-05, "loss": 0.5096, "num_input_tokens_seen": 15519912, "step": 1380, "train_runtime": 2260.1318, "train_tokens_per_second": 6866.817 }, { "epoch": 0.6647867042659147, "grad_norm": 0.6752036809921265, "learning_rate": 4.41941085105176e-05, "loss": 0.5394, "num_input_tokens_seen": 15576136, "step": 1385, "train_runtime": 2268.2935, "train_tokens_per_second": 6866.896 }, { "epoch": 0.6671866562668747, "grad_norm": 0.5208489894866943, "learning_rate": 4.415380245043213e-05, "loss": 0.4537, "num_input_tokens_seen": 15633480, "step": 1390, "train_runtime": 2276.6508, "train_tokens_per_second": 6866.877 }, { "epoch": 0.6695866082678347, "grad_norm": 0.6454225778579712, "learning_rate": 4.4113375481914186e-05, "loss": 0.5155, "num_input_tokens_seen": 15688200, "step": 1395, "train_runtime": 2284.8437, "train_tokens_per_second": 6866.203 }, { "epoch": 0.6719865602687947, "grad_norm": 0.5845027565956116, "learning_rate": 4.407282786015913e-05, "loss": 0.5255, "num_input_tokens_seen": 15742392, "step": 1400, "train_runtime": 2292.7215, "train_tokens_per_second": 6866.247 }, { "epoch": 0.6743865122697547, "grad_norm": 0.9591690301895142, "learning_rate": 4.403215984112392e-05, "loss": 0.5122, "num_input_tokens_seen": 15799472, "step": 1405, "train_runtime": 2301.0926, "train_tokens_per_second": 6866.074 }, { "epoch": 0.6767864642707145, "grad_norm": 0.6333798766136169, "learning_rate": 4.3991371681525556e-05, "loss": 0.511, "num_input_tokens_seen": 15858960, "step": 1410, "train_runtime": 2309.9976, "train_tokens_per_second": 6865.358 }, { "epoch": 0.6791864162716745, "grad_norm": 0.5859664082527161, "learning_rate": 4.395046363883941e-05, "loss": 0.5375, "num_input_tokens_seen": 15915472, "step": 1415, "train_runtime": 2317.9598, "train_tokens_per_second": 6866.155 }, { "epoch": 0.6815863682726345, "grad_norm": 0.6732012629508972, "learning_rate": 4.390943597129761e-05, "loss": 0.5682, "num_input_tokens_seen": 15970752, "step": 1420, "train_runtime": 2325.6345, "train_tokens_per_second": 6867.267 }, { "epoch": 0.6839863202735945, "grad_norm": 0.7597581148147583, "learning_rate": 4.3868288937887445e-05, "loss": 0.5164, "num_input_tokens_seen": 16025456, "step": 1425, "train_runtime": 2333.8264, "train_tokens_per_second": 6866.601 }, { "epoch": 0.6863862722745545, "grad_norm": 0.7212057113647461, "learning_rate": 4.382702279834965e-05, "loss": 0.5524, "num_input_tokens_seen": 16075744, "step": 1430, "train_runtime": 2341.4051, "train_tokens_per_second": 6865.853 }, { "epoch": 0.6887862242755145, "grad_norm": 0.58528733253479, "learning_rate": 4.378563781317687e-05, "loss": 0.497, "num_input_tokens_seen": 16137672, "step": 1435, "train_runtime": 2350.3848, "train_tokens_per_second": 6865.97 }, { "epoch": 0.6911861762764745, "grad_norm": 0.570091962814331, "learning_rate": 4.374413424361195e-05, "loss": 0.4888, "num_input_tokens_seen": 16199088, "step": 1440, "train_runtime": 2358.886, "train_tokens_per_second": 6867.262 }, { "epoch": 0.6935861282774345, "grad_norm": 0.768666684627533, "learning_rate": 4.370251235164625e-05, "loss": 0.5343, "num_input_tokens_seen": 16253792, "step": 1445, "train_runtime": 2367.1689, "train_tokens_per_second": 6866.342 }, { "epoch": 0.6959860802783945, "grad_norm": 0.6287879347801208, "learning_rate": 4.366077240001813e-05, "loss": 0.4848, "num_input_tokens_seen": 16316608, "step": 1450, "train_runtime": 2376.0866, "train_tokens_per_second": 6867.009 }, { "epoch": 0.6983860322793544, "grad_norm": 0.74793541431427, "learning_rate": 4.361891465221112e-05, "loss": 0.4847, "num_input_tokens_seen": 16375648, "step": 1455, "train_runtime": 2384.4535, "train_tokens_per_second": 6867.673 }, { "epoch": 0.7007859842803144, "grad_norm": 0.6209436655044556, "learning_rate": 4.3576939372452394e-05, "loss": 0.5295, "num_input_tokens_seen": 16429360, "step": 1460, "train_runtime": 2392.6099, "train_tokens_per_second": 6866.711 }, { "epoch": 0.7031859362812743, "grad_norm": 0.7456108331680298, "learning_rate": 4.353484682571101e-05, "loss": 0.5144, "num_input_tokens_seen": 16480088, "step": 1465, "train_runtime": 2400.4701, "train_tokens_per_second": 6865.359 }, { "epoch": 0.7055858882822343, "grad_norm": 0.573098361492157, "learning_rate": 4.349263727769629e-05, "loss": 0.4636, "num_input_tokens_seen": 16538968, "step": 1470, "train_runtime": 2408.9134, "train_tokens_per_second": 6865.738 }, { "epoch": 0.7079858402831943, "grad_norm": 0.6599897146224976, "learning_rate": 4.3450310994856135e-05, "loss": 0.5415, "num_input_tokens_seen": 16595104, "step": 1475, "train_runtime": 2417.2231, "train_tokens_per_second": 6865.359 }, { "epoch": 0.7103857922841543, "grad_norm": 0.9016920328140259, "learning_rate": 4.3407868244375315e-05, "loss": 0.5367, "num_input_tokens_seen": 16650488, "step": 1480, "train_runtime": 2425.2913, "train_tokens_per_second": 6865.356 }, { "epoch": 0.7127857442851143, "grad_norm": 0.7661956548690796, "learning_rate": 4.3365309294173825e-05, "loss": 0.4729, "num_input_tokens_seen": 16701384, "step": 1485, "train_runtime": 2432.991, "train_tokens_per_second": 6864.548 }, { "epoch": 0.7151856962860743, "grad_norm": 0.8703396916389465, "learning_rate": 4.332263441290515e-05, "loss": 0.5373, "num_input_tokens_seen": 16754152, "step": 1490, "train_runtime": 2440.463, "train_tokens_per_second": 6865.153 }, { "epoch": 0.7175856482870343, "grad_norm": 0.633375883102417, "learning_rate": 4.3279843869954604e-05, "loss": 0.5037, "num_input_tokens_seen": 16809056, "step": 1495, "train_runtime": 2448.2645, "train_tokens_per_second": 6865.703 }, { "epoch": 0.7199856002879942, "grad_norm": 0.7101417779922485, "learning_rate": 4.3236937935437614e-05, "loss": 0.5324, "num_input_tokens_seen": 16859504, "step": 1500, "train_runtime": 2455.6516, "train_tokens_per_second": 6865.593 }, { "epoch": 0.7223855522889542, "grad_norm": 0.6423754692077637, "learning_rate": 4.3193916880198004e-05, "loss": 0.5109, "num_input_tokens_seen": 16919952, "step": 1505, "train_runtime": 2464.4089, "train_tokens_per_second": 6865.724 }, { "epoch": 0.7247855042899142, "grad_norm": 0.7076619863510132, "learning_rate": 4.3150780975806315e-05, "loss": 0.5425, "num_input_tokens_seen": 16976592, "step": 1510, "train_runtime": 2472.6158, "train_tokens_per_second": 6865.843 }, { "epoch": 0.7271854562908742, "grad_norm": 0.5288546085357666, "learning_rate": 4.310753049455806e-05, "loss": 0.515, "num_input_tokens_seen": 17034816, "step": 1515, "train_runtime": 2480.9341, "train_tokens_per_second": 6866.291 }, { "epoch": 0.7295854082918342, "grad_norm": 0.6262106895446777, "learning_rate": 4.3064165709472036e-05, "loss": 0.5271, "num_input_tokens_seen": 17088560, "step": 1520, "train_runtime": 2488.3235, "train_tokens_per_second": 6867.499 }, { "epoch": 0.7319853602927942, "grad_norm": 0.5250151753425598, "learning_rate": 4.3020686894288564e-05, "loss": 0.5055, "num_input_tokens_seen": 17144640, "step": 1525, "train_runtime": 2496.1311, "train_tokens_per_second": 6868.485 }, { "epoch": 0.7343853122937541, "grad_norm": 0.7805795669555664, "learning_rate": 4.2977094323467784e-05, "loss": 0.48, "num_input_tokens_seen": 17200416, "step": 1530, "train_runtime": 2504.3678, "train_tokens_per_second": 6868.167 }, { "epoch": 0.7367852642947141, "grad_norm": 0.7616066336631775, "learning_rate": 4.293338827218794e-05, "loss": 0.4972, "num_input_tokens_seen": 17256344, "step": 1535, "train_runtime": 2512.462, "train_tokens_per_second": 6868.3 }, { "epoch": 0.7391852162956741, "grad_norm": 0.7682455778121948, "learning_rate": 4.288956901634359e-05, "loss": 0.4691, "num_input_tokens_seen": 17314072, "step": 1540, "train_runtime": 2520.9232, "train_tokens_per_second": 6868.147 }, { "epoch": 0.741585168296634, "grad_norm": 0.7621558308601379, "learning_rate": 4.2845636832543914e-05, "loss": 0.4942, "num_input_tokens_seen": 17373728, "step": 1545, "train_runtime": 2529.3504, "train_tokens_per_second": 6868.85 }, { "epoch": 0.743985120297594, "grad_norm": 0.6085621118545532, "learning_rate": 4.2801591998110946e-05, "loss": 0.5119, "num_input_tokens_seen": 17425920, "step": 1550, "train_runtime": 2536.8035, "train_tokens_per_second": 6869.243 }, { "epoch": 0.746385072298554, "grad_norm": 0.6101738214492798, "learning_rate": 4.275743479107785e-05, "loss": 0.5201, "num_input_tokens_seen": 17480304, "step": 1555, "train_runtime": 2544.9492, "train_tokens_per_second": 6868.626 }, { "epoch": 0.748785024299514, "grad_norm": 0.6207472085952759, "learning_rate": 4.271316549018708e-05, "loss": 0.517, "num_input_tokens_seen": 17539776, "step": 1560, "train_runtime": 2553.0822, "train_tokens_per_second": 6870.04 }, { "epoch": 0.751184976300474, "grad_norm": 0.688941478729248, "learning_rate": 4.2668784374888756e-05, "loss": 0.4894, "num_input_tokens_seen": 17595928, "step": 1565, "train_runtime": 2561.6413, "train_tokens_per_second": 6869.005 }, { "epoch": 0.753584928301434, "grad_norm": 0.9783554673194885, "learning_rate": 4.262429172533878e-05, "loss": 0.5213, "num_input_tokens_seen": 17651664, "step": 1570, "train_runtime": 2569.4494, "train_tokens_per_second": 6869.824 }, { "epoch": 0.755984880302394, "grad_norm": 0.9513911604881287, "learning_rate": 4.257968782239714e-05, "loss": 0.506, "num_input_tokens_seen": 17703960, "step": 1575, "train_runtime": 2576.9625, "train_tokens_per_second": 6870.088 }, { "epoch": 0.758384832303354, "grad_norm": 0.7099276185035706, "learning_rate": 4.2534972947626094e-05, "loss": 0.5073, "num_input_tokens_seen": 17761448, "step": 1580, "train_runtime": 2585.427, "train_tokens_per_second": 6869.832 }, { "epoch": 0.760784784304314, "grad_norm": 0.5648279786109924, "learning_rate": 4.249014738328842e-05, "loss": 0.5265, "num_input_tokens_seen": 17817984, "step": 1585, "train_runtime": 2593.1431, "train_tokens_per_second": 6871.192 }, { "epoch": 0.763184736305274, "grad_norm": 0.6818917989730835, "learning_rate": 4.2445211412345615e-05, "loss": 0.5244, "num_input_tokens_seen": 17874768, "step": 1590, "train_runtime": 2601.224, "train_tokens_per_second": 6871.676 }, { "epoch": 0.7655846883062338, "grad_norm": 0.6163448691368103, "learning_rate": 4.240016531845612e-05, "loss": 0.5406, "num_input_tokens_seen": 17931864, "step": 1595, "train_runtime": 2609.5192, "train_tokens_per_second": 6871.712 }, { "epoch": 0.7679846403071938, "grad_norm": 0.6879476308822632, "learning_rate": 4.235500938597354e-05, "loss": 0.4871, "num_input_tokens_seen": 17985744, "step": 1600, "train_runtime": 2617.2291, "train_tokens_per_second": 6872.056 }, { "epoch": 0.7703845923081538, "grad_norm": 0.5437011122703552, "learning_rate": 4.230974389994483e-05, "loss": 0.5015, "num_input_tokens_seen": 18044152, "step": 1605, "train_runtime": 2625.4686, "train_tokens_per_second": 6872.736 }, { "epoch": 0.7727845443091138, "grad_norm": 0.5755176544189453, "learning_rate": 4.226436914610849e-05, "loss": 0.541, "num_input_tokens_seen": 18100976, "step": 1610, "train_runtime": 2633.5328, "train_tokens_per_second": 6873.268 }, { "epoch": 0.7751844963100738, "grad_norm": 0.6550777554512024, "learning_rate": 4.2218885410892785e-05, "loss": 0.5314, "num_input_tokens_seen": 18156240, "step": 1615, "train_runtime": 2641.1036, "train_tokens_per_second": 6874.49 }, { "epoch": 0.7775844483110338, "grad_norm": 0.6372175216674805, "learning_rate": 4.2173292981413914e-05, "loss": 0.4875, "num_input_tokens_seen": 18216472, "step": 1620, "train_runtime": 2649.6605, "train_tokens_per_second": 6875.021 }, { "epoch": 0.7799844003119938, "grad_norm": 0.5091462731361389, "learning_rate": 4.212759214547424e-05, "loss": 0.4954, "num_input_tokens_seen": 18271168, "step": 1625, "train_runtime": 2657.4608, "train_tokens_per_second": 6875.423 }, { "epoch": 0.7823843523129538, "grad_norm": 0.6974900960922241, "learning_rate": 4.2081783191560405e-05, "loss": 0.4939, "num_input_tokens_seen": 18326128, "step": 1630, "train_runtime": 2665.3267, "train_tokens_per_second": 6875.753 }, { "epoch": 0.7847843043139138, "grad_norm": 0.5476020574569702, "learning_rate": 4.203586640884156e-05, "loss": 0.4995, "num_input_tokens_seen": 18385280, "step": 1635, "train_runtime": 2673.6877, "train_tokens_per_second": 6876.375 }, { "epoch": 0.7871842563148737, "grad_norm": 0.5772519111633301, "learning_rate": 4.1989842087167534e-05, "loss": 0.5198, "num_input_tokens_seen": 18444000, "step": 1640, "train_runtime": 2682.4357, "train_tokens_per_second": 6875.84 }, { "epoch": 0.7895842083158336, "grad_norm": 0.6971266269683838, "learning_rate": 4.1943710517066984e-05, "loss": 0.4696, "num_input_tokens_seen": 18500344, "step": 1645, "train_runtime": 2690.5876, "train_tokens_per_second": 6875.949 }, { "epoch": 0.7919841603167936, "grad_norm": 0.7783945798873901, "learning_rate": 4.1897471989745575e-05, "loss": 0.4777, "num_input_tokens_seen": 18553136, "step": 1650, "train_runtime": 2698.2872, "train_tokens_per_second": 6875.894 }, { "epoch": 0.7943841123177536, "grad_norm": 0.7614520192146301, "learning_rate": 4.185112679708415e-05, "loss": 0.525, "num_input_tokens_seen": 18610264, "step": 1655, "train_runtime": 2706.4236, "train_tokens_per_second": 6876.331 }, { "epoch": 0.7967840643187136, "grad_norm": 0.5857712626457214, "learning_rate": 4.180467523163686e-05, "loss": 0.4906, "num_input_tokens_seen": 18670624, "step": 1660, "train_runtime": 2714.993, "train_tokens_per_second": 6876.859 }, { "epoch": 0.7991840163196736, "grad_norm": 0.5816935300827026, "learning_rate": 4.175811758662935e-05, "loss": 0.4851, "num_input_tokens_seen": 18727824, "step": 1665, "train_runtime": 2723.4951, "train_tokens_per_second": 6876.393 }, { "epoch": 0.8015839683206336, "grad_norm": 0.5751060843467712, "learning_rate": 4.1711454155956895e-05, "loss": 0.4694, "num_input_tokens_seen": 18785440, "step": 1670, "train_runtime": 2731.305, "train_tokens_per_second": 6877.826 }, { "epoch": 0.8039839203215936, "grad_norm": 0.8796506524085999, "learning_rate": 4.166468523418251e-05, "loss": 0.5254, "num_input_tokens_seen": 18839288, "step": 1675, "train_runtime": 2739.4392, "train_tokens_per_second": 6877.06 }, { "epoch": 0.8063838723225536, "grad_norm": 0.6676029562950134, "learning_rate": 4.1617811116535176e-05, "loss": 0.5521, "num_input_tokens_seen": 18893696, "step": 1680, "train_runtime": 2747.1069, "train_tokens_per_second": 6877.67 }, { "epoch": 0.8087838243235135, "grad_norm": 0.8193256258964539, "learning_rate": 4.1570832098907874e-05, "loss": 0.5444, "num_input_tokens_seen": 18946504, "step": 1685, "train_runtime": 2754.72, "train_tokens_per_second": 6877.833 }, { "epoch": 0.8111837763244735, "grad_norm": 0.5464473962783813, "learning_rate": 4.152374847785579e-05, "loss": 0.5321, "num_input_tokens_seen": 19003664, "step": 1690, "train_runtime": 2763.0844, "train_tokens_per_second": 6877.699 }, { "epoch": 0.8135837283254335, "grad_norm": 0.8191189169883728, "learning_rate": 4.1476560550594414e-05, "loss": 0.4826, "num_input_tokens_seen": 19056544, "step": 1695, "train_runtime": 2770.9361, "train_tokens_per_second": 6877.295 }, { "epoch": 0.8159836803263935, "grad_norm": 0.745058000087738, "learning_rate": 4.142926861499768e-05, "loss": 0.5543, "num_input_tokens_seen": 19107344, "step": 1700, "train_runtime": 2778.5593, "train_tokens_per_second": 6876.709 }, { "epoch": 0.8183836323273534, "grad_norm": 0.6147037744522095, "learning_rate": 4.138187296959606e-05, "loss": 0.505, "num_input_tokens_seen": 19162000, "step": 1705, "train_runtime": 2786.3906, "train_tokens_per_second": 6876.997 }, { "epoch": 0.8207835843283134, "grad_norm": 0.687018632888794, "learning_rate": 4.13343739135747e-05, "loss": 0.522, "num_input_tokens_seen": 19217512, "step": 1710, "train_runtime": 2794.2498, "train_tokens_per_second": 6877.521 }, { "epoch": 0.8231835363292734, "grad_norm": 0.6172505617141724, "learning_rate": 4.128677174677153e-05, "loss": 0.5411, "num_input_tokens_seen": 19276384, "step": 1715, "train_runtime": 2802.4832, "train_tokens_per_second": 6878.323 }, { "epoch": 0.8255834883302334, "grad_norm": 0.735072135925293, "learning_rate": 4.123906676967536e-05, "loss": 0.513, "num_input_tokens_seen": 19328432, "step": 1720, "train_runtime": 2810.1311, "train_tokens_per_second": 6878.125 }, { "epoch": 0.8279834403311934, "grad_norm": 0.9113159775733948, "learning_rate": 4.1191259283424e-05, "loss": 0.5244, "num_input_tokens_seen": 19384016, "step": 1725, "train_runtime": 2818.2045, "train_tokens_per_second": 6878.144 }, { "epoch": 0.8303833923321533, "grad_norm": 0.8989443778991699, "learning_rate": 4.1143349589802326e-05, "loss": 0.5471, "num_input_tokens_seen": 19442016, "step": 1730, "train_runtime": 2826.3519, "train_tokens_per_second": 6878.838 }, { "epoch": 0.8327833443331133, "grad_norm": 0.572564423084259, "learning_rate": 4.1095337991240436e-05, "loss": 0.5352, "num_input_tokens_seen": 19496880, "step": 1735, "train_runtime": 2834.1751, "train_tokens_per_second": 6879.208 }, { "epoch": 0.8351832963340733, "grad_norm": 0.4649478793144226, "learning_rate": 4.104722479081167e-05, "loss": 0.4709, "num_input_tokens_seen": 19555656, "step": 1740, "train_runtime": 2842.1514, "train_tokens_per_second": 6880.582 }, { "epoch": 0.8375832483350333, "grad_norm": 0.6450087428092957, "learning_rate": 4.099901029223075e-05, "loss": 0.5104, "num_input_tokens_seen": 19610352, "step": 1745, "train_runtime": 2849.9024, "train_tokens_per_second": 6881.061 }, { "epoch": 0.8399832003359933, "grad_norm": 0.7608988881111145, "learning_rate": 4.095069479985183e-05, "loss": 0.5151, "num_input_tokens_seen": 19666656, "step": 1750, "train_runtime": 2858.2857, "train_tokens_per_second": 6880.577 }, { "epoch": 0.8423831523369533, "grad_norm": 0.5766634345054626, "learning_rate": 4.090227861866659e-05, "loss": 0.5355, "num_input_tokens_seen": 19723528, "step": 1755, "train_runtime": 2866.3853, "train_tokens_per_second": 6880.976 }, { "epoch": 0.8447831043379133, "grad_norm": 0.8256959915161133, "learning_rate": 4.085376205430233e-05, "loss": 0.5475, "num_input_tokens_seen": 19775232, "step": 1760, "train_runtime": 2873.9931, "train_tokens_per_second": 6880.751 }, { "epoch": 0.8471830563388733, "grad_norm": 0.6020644903182983, "learning_rate": 4.080514541301998e-05, "loss": 0.5043, "num_input_tokens_seen": 19832592, "step": 1765, "train_runtime": 2881.8352, "train_tokens_per_second": 6881.931 }, { "epoch": 0.8495830083398332, "grad_norm": 0.6027383804321289, "learning_rate": 4.075642900171223e-05, "loss": 0.5501, "num_input_tokens_seen": 19886104, "step": 1770, "train_runtime": 2889.3788, "train_tokens_per_second": 6882.484 }, { "epoch": 0.8519829603407932, "grad_norm": 0.7463006377220154, "learning_rate": 4.070761312790157e-05, "loss": 0.5666, "num_input_tokens_seen": 19944808, "step": 1775, "train_runtime": 2897.8024, "train_tokens_per_second": 6882.736 }, { "epoch": 0.8543829123417531, "grad_norm": 0.5846840143203735, "learning_rate": 4.065869809973833e-05, "loss": 0.5026, "num_input_tokens_seen": 20000048, "step": 1780, "train_runtime": 2905.6359, "train_tokens_per_second": 6883.191 }, { "epoch": 0.8567828643427131, "grad_norm": 0.6461730599403381, "learning_rate": 4.060968422599879e-05, "loss": 0.4991, "num_input_tokens_seen": 20054800, "step": 1785, "train_runtime": 2913.7209, "train_tokens_per_second": 6882.883 }, { "epoch": 0.8591828163436731, "grad_norm": 0.7940958142280579, "learning_rate": 4.0560571816083156e-05, "loss": 0.5496, "num_input_tokens_seen": 20111120, "step": 1790, "train_runtime": 2921.8875, "train_tokens_per_second": 6882.921 }, { "epoch": 0.8615827683446331, "grad_norm": 0.6765144467353821, "learning_rate": 4.051136118001364e-05, "loss": 0.4827, "num_input_tokens_seen": 20165552, "step": 1795, "train_runtime": 2929.7258, "train_tokens_per_second": 6883.085 }, { "epoch": 0.8639827203455931, "grad_norm": 0.9223127365112305, "learning_rate": 4.046205262843254e-05, "loss": 0.4949, "num_input_tokens_seen": 20221072, "step": 1800, "train_runtime": 2938.3425, "train_tokens_per_second": 6881.796 }, { "epoch": 0.8663826723465531, "grad_norm": 0.5317054390907288, "learning_rate": 4.041264647260022e-05, "loss": 0.4844, "num_input_tokens_seen": 20277640, "step": 1805, "train_runtime": 2947.9518, "train_tokens_per_second": 6878.552 }, { "epoch": 0.8687826243475131, "grad_norm": 0.5232411623001099, "learning_rate": 4.036314302439319e-05, "loss": 0.4938, "num_input_tokens_seen": 20333328, "step": 1810, "train_runtime": 2955.884, "train_tokens_per_second": 6878.933 }, { "epoch": 0.8711825763484731, "grad_norm": 0.7968527674674988, "learning_rate": 4.031354259630209e-05, "loss": 0.5246, "num_input_tokens_seen": 20389752, "step": 1815, "train_runtime": 2963.7323, "train_tokens_per_second": 6879.755 }, { "epoch": 0.8735825283494331, "grad_norm": 0.5793075561523438, "learning_rate": 4.026384550142978e-05, "loss": 0.5467, "num_input_tokens_seen": 20447184, "step": 1820, "train_runtime": 2971.7237, "train_tokens_per_second": 6880.58 }, { "epoch": 0.875982480350393, "grad_norm": 0.6629696488380432, "learning_rate": 4.0214052053489304e-05, "loss": 0.4753, "num_input_tokens_seen": 20501512, "step": 1825, "train_runtime": 2979.5222, "train_tokens_per_second": 6880.805 }, { "epoch": 0.8783824323513529, "grad_norm": 0.6974778175354004, "learning_rate": 4.016416256680194e-05, "loss": 0.5134, "num_input_tokens_seen": 20556688, "step": 1830, "train_runtime": 2987.3905, "train_tokens_per_second": 6881.152 }, { "epoch": 0.8807823843523129, "grad_norm": 0.7780594825744629, "learning_rate": 4.011417735629522e-05, "loss": 0.4771, "num_input_tokens_seen": 20613504, "step": 1835, "train_runtime": 2995.7447, "train_tokens_per_second": 6880.928 }, { "epoch": 0.8831823363532729, "grad_norm": 0.6135735511779785, "learning_rate": 4.006409673750094e-05, "loss": 0.4904, "num_input_tokens_seen": 20670776, "step": 1840, "train_runtime": 3004.2957, "train_tokens_per_second": 6880.407 }, { "epoch": 0.8855822883542329, "grad_norm": 0.6567316651344299, "learning_rate": 4.0013921026553125e-05, "loss": 0.5172, "num_input_tokens_seen": 20726776, "step": 1845, "train_runtime": 3012.3296, "train_tokens_per_second": 6880.647 }, { "epoch": 0.8879822403551929, "grad_norm": 0.733647882938385, "learning_rate": 3.9963650540186116e-05, "loss": 0.5168, "num_input_tokens_seen": 20781792, "step": 1850, "train_runtime": 3020.8457, "train_tokens_per_second": 6879.462 }, { "epoch": 0.8903821923561529, "grad_norm": 0.7651314735412598, "learning_rate": 3.991328559573248e-05, "loss": 0.551, "num_input_tokens_seen": 20835512, "step": 1855, "train_runtime": 3028.6209, "train_tokens_per_second": 6879.538 }, { "epoch": 0.8927821443571129, "grad_norm": 0.7899940013885498, "learning_rate": 3.9862826511121085e-05, "loss": 0.5242, "num_input_tokens_seen": 20887216, "step": 1860, "train_runtime": 3036.1277, "train_tokens_per_second": 6879.558 }, { "epoch": 0.8951820963580729, "grad_norm": 0.6774663329124451, "learning_rate": 3.981227360487504e-05, "loss": 0.5273, "num_input_tokens_seen": 20943744, "step": 1865, "train_runtime": 3044.3369, "train_tokens_per_second": 6879.575 }, { "epoch": 0.8975820483590328, "grad_norm": 0.6696859002113342, "learning_rate": 3.976162719610972e-05, "loss": 0.5006, "num_input_tokens_seen": 20991568, "step": 1870, "train_runtime": 3053.2072, "train_tokens_per_second": 6875.252 }, { "epoch": 0.8999820003599928, "grad_norm": 0.7721266746520996, "learning_rate": 3.971088760453071e-05, "loss": 0.5214, "num_input_tokens_seen": 21047408, "step": 1875, "train_runtime": 3061.9813, "train_tokens_per_second": 6873.787 }, { "epoch": 0.9023819523609528, "grad_norm": 0.7528117299079895, "learning_rate": 3.966005515043183e-05, "loss": 0.5172, "num_input_tokens_seen": 21105344, "step": 1880, "train_runtime": 3070.238, "train_tokens_per_second": 6874.172 }, { "epoch": 0.9047819043619127, "grad_norm": 0.7893593311309814, "learning_rate": 3.960913015469311e-05, "loss": 0.5581, "num_input_tokens_seen": 21161704, "step": 1885, "train_runtime": 3078.4575, "train_tokens_per_second": 6874.126 }, { "epoch": 0.9071818563628727, "grad_norm": 0.6411826610565186, "learning_rate": 3.95581129387787e-05, "loss": 0.5006, "num_input_tokens_seen": 21220960, "step": 1890, "train_runtime": 3087.9925, "train_tokens_per_second": 6872.089 }, { "epoch": 0.9095818083638327, "grad_norm": 0.48201116919517517, "learning_rate": 3.950700382473494e-05, "loss": 0.5143, "num_input_tokens_seen": 21285456, "step": 1895, "train_runtime": 3097.6261, "train_tokens_per_second": 6871.538 }, { "epoch": 0.9119817603647927, "grad_norm": 0.7874345779418945, "learning_rate": 3.9455803135188265e-05, "loss": 0.5133, "num_input_tokens_seen": 21340656, "step": 1900, "train_runtime": 3105.618, "train_tokens_per_second": 6871.629 }, { "epoch": 0.9143817123657527, "grad_norm": 0.8059301972389221, "learning_rate": 3.940451119334315e-05, "loss": 0.4716, "num_input_tokens_seen": 21402256, "step": 1905, "train_runtime": 3114.7644, "train_tokens_per_second": 6871.228 }, { "epoch": 0.9167816643667127, "grad_norm": 0.5982013940811157, "learning_rate": 3.935312832298014e-05, "loss": 0.4752, "num_input_tokens_seen": 21456968, "step": 1910, "train_runtime": 3122.6252, "train_tokens_per_second": 6871.452 }, { "epoch": 0.9191816163676726, "grad_norm": 0.6114861965179443, "learning_rate": 3.9301654848453744e-05, "loss": 0.5358, "num_input_tokens_seen": 21510880, "step": 1915, "train_runtime": 3130.5306, "train_tokens_per_second": 6871.321 }, { "epoch": 0.9215815683686326, "grad_norm": 0.6739422678947449, "learning_rate": 3.9250091094690424e-05, "loss": 0.508, "num_input_tokens_seen": 21567176, "step": 1920, "train_runtime": 3139.4979, "train_tokens_per_second": 6869.626 }, { "epoch": 0.9239815203695926, "grad_norm": 0.9573784470558167, "learning_rate": 3.9198437387186514e-05, "loss": 0.4969, "num_input_tokens_seen": 21616728, "step": 1925, "train_runtime": 3147.1512, "train_tokens_per_second": 6868.665 }, { "epoch": 0.9263814723705526, "grad_norm": 0.6872597336769104, "learning_rate": 3.914669405200619e-05, "loss": 0.5231, "num_input_tokens_seen": 21669600, "step": 1930, "train_runtime": 3154.6855, "train_tokens_per_second": 6869.021 }, { "epoch": 0.9287814243715126, "grad_norm": 0.5402712225914001, "learning_rate": 3.909486141577941e-05, "loss": 0.5557, "num_input_tokens_seen": 21725144, "step": 1935, "train_runtime": 3162.9029, "train_tokens_per_second": 6868.736 }, { "epoch": 0.9311813763724726, "grad_norm": 0.5620856881141663, "learning_rate": 3.904293980569983e-05, "loss": 0.5202, "num_input_tokens_seen": 21780960, "step": 1940, "train_runtime": 3171.7075, "train_tokens_per_second": 6867.266 }, { "epoch": 0.9335813283734326, "grad_norm": 0.48633241653442383, "learning_rate": 3.899092954952276e-05, "loss": 0.4965, "num_input_tokens_seen": 21835904, "step": 1945, "train_runtime": 3180.9981, "train_tokens_per_second": 6864.482 }, { "epoch": 0.9359812803743925, "grad_norm": 0.6408486366271973, "learning_rate": 3.89388309755631e-05, "loss": 0.5271, "num_input_tokens_seen": 21890264, "step": 1950, "train_runtime": 3188.8619, "train_tokens_per_second": 6864.601 }, { "epoch": 0.9383812323753525, "grad_norm": 0.6832561492919922, "learning_rate": 3.888664441269324e-05, "loss": 0.513, "num_input_tokens_seen": 21943944, "step": 1955, "train_runtime": 3196.9004, "train_tokens_per_second": 6864.131 }, { "epoch": 0.9407811843763125, "grad_norm": 0.7224368453025818, "learning_rate": 3.8834370190341016e-05, "loss": 0.4975, "num_input_tokens_seen": 22000688, "step": 1960, "train_runtime": 3205.2356, "train_tokens_per_second": 6863.985 }, { "epoch": 0.9431811363772724, "grad_norm": 0.921877384185791, "learning_rate": 3.8782008638487585e-05, "loss": 0.5142, "num_input_tokens_seen": 22056928, "step": 1965, "train_runtime": 3213.437, "train_tokens_per_second": 6863.968 }, { "epoch": 0.9455810883782324, "grad_norm": 0.8015443682670593, "learning_rate": 3.872956008766541e-05, "loss": 0.5345, "num_input_tokens_seen": 22109984, "step": 1970, "train_runtime": 3221.3456, "train_tokens_per_second": 6863.586 }, { "epoch": 0.9479810403791924, "grad_norm": 0.60637366771698, "learning_rate": 3.867702486895611e-05, "loss": 0.519, "num_input_tokens_seen": 22167792, "step": 1975, "train_runtime": 3229.4918, "train_tokens_per_second": 6864.173 }, { "epoch": 0.9503809923801524, "grad_norm": 0.6260784268379211, "learning_rate": 3.86244033139884e-05, "loss": 0.4549, "num_input_tokens_seen": 22224944, "step": 1980, "train_runtime": 3237.4363, "train_tokens_per_second": 6864.983 }, { "epoch": 0.9527809443811124, "grad_norm": 0.7488238215446472, "learning_rate": 3.857169575493601e-05, "loss": 0.4988, "num_input_tokens_seen": 22280208, "step": 1985, "train_runtime": 3245.3144, "train_tokens_per_second": 6865.347 }, { "epoch": 0.9551808963820724, "grad_norm": 1.2673466205596924, "learning_rate": 3.851890252451553e-05, "loss": 0.5948, "num_input_tokens_seen": 22331688, "step": 1990, "train_runtime": 3252.7162, "train_tokens_per_second": 6865.551 }, { "epoch": 0.9575808483830324, "grad_norm": 0.7167654633522034, "learning_rate": 3.846602395598441e-05, "loss": 0.4765, "num_input_tokens_seen": 22391056, "step": 1995, "train_runtime": 3261.3251, "train_tokens_per_second": 6865.631 }, { "epoch": 0.9599808003839924, "grad_norm": 0.7767099142074585, "learning_rate": 3.8413060383138735e-05, "loss": 0.5067, "num_input_tokens_seen": 22442560, "step": 2000, "train_runtime": 3268.751, "train_tokens_per_second": 6865.791 }, { "epoch": 0.9623807523849524, "grad_norm": 0.6243239641189575, "learning_rate": 3.836001214031122e-05, "loss": 0.441, "num_input_tokens_seen": 22504640, "step": 2005, "train_runtime": 3277.3712, "train_tokens_per_second": 6866.674 }, { "epoch": 0.9647807043859122, "grad_norm": 0.7347325086593628, "learning_rate": 3.830687956236907e-05, "loss": 0.4923, "num_input_tokens_seen": 22565448, "step": 2010, "train_runtime": 3285.5854, "train_tokens_per_second": 6868.014 }, { "epoch": 0.9671806563868722, "grad_norm": 0.7760552167892456, "learning_rate": 3.8253662984711795e-05, "loss": 0.4971, "num_input_tokens_seen": 22618928, "step": 2015, "train_runtime": 3293.6417, "train_tokens_per_second": 6867.453 }, { "epoch": 0.9695806083878322, "grad_norm": 0.6205884218215942, "learning_rate": 3.820036274326922e-05, "loss": 0.4979, "num_input_tokens_seen": 22674720, "step": 2020, "train_runtime": 3301.4874, "train_tokens_per_second": 6868.032 }, { "epoch": 0.9719805603887922, "grad_norm": 0.7021058797836304, "learning_rate": 3.8146979174499265e-05, "loss": 0.48, "num_input_tokens_seen": 22734768, "step": 2025, "train_runtime": 3309.628, "train_tokens_per_second": 6869.282 }, { "epoch": 0.9743805123897522, "grad_norm": 0.8105769753456116, "learning_rate": 3.809351261538585e-05, "loss": 0.4802, "num_input_tokens_seen": 22792864, "step": 2030, "train_runtime": 3318.078, "train_tokens_per_second": 6869.297 }, { "epoch": 0.9767804643907122, "grad_norm": 0.7583296895027161, "learning_rate": 3.8039963403436806e-05, "loss": 0.5393, "num_input_tokens_seen": 22846392, "step": 2035, "train_runtime": 3326.837, "train_tokens_per_second": 6867.301 }, { "epoch": 0.9791804163916722, "grad_norm": 0.7417272925376892, "learning_rate": 3.798633187668166e-05, "loss": 0.5505, "num_input_tokens_seen": 22899608, "step": 2040, "train_runtime": 3337.0101, "train_tokens_per_second": 6862.313 }, { "epoch": 0.9815803683926322, "grad_norm": 0.6118446588516235, "learning_rate": 3.793261837366959e-05, "loss": 0.4829, "num_input_tokens_seen": 22960648, "step": 2045, "train_runtime": 3348.0559, "train_tokens_per_second": 6857.905 }, { "epoch": 0.9839803203935922, "grad_norm": 0.6822954416275024, "learning_rate": 3.7878823233467234e-05, "loss": 0.5252, "num_input_tokens_seen": 23017960, "step": 2050, "train_runtime": 3357.979, "train_tokens_per_second": 6854.706 }, { "epoch": 0.9863802723945521, "grad_norm": 0.8443323373794556, "learning_rate": 3.782494679565656e-05, "loss": 0.5098, "num_input_tokens_seen": 23073264, "step": 2055, "train_runtime": 3367.9787, "train_tokens_per_second": 6850.775 }, { "epoch": 0.988780224395512, "grad_norm": 0.8180744647979736, "learning_rate": 3.777098940033275e-05, "loss": 0.4722, "num_input_tokens_seen": 23130952, "step": 2060, "train_runtime": 3379.0655, "train_tokens_per_second": 6845.37 }, { "epoch": 0.991180176396472, "grad_norm": 1.0012092590332031, "learning_rate": 3.7716951388102e-05, "loss": 0.512, "num_input_tokens_seen": 23184912, "step": 2065, "train_runtime": 3390.0285, "train_tokens_per_second": 6839.15 }, { "epoch": 0.993580128397432, "grad_norm": 0.8469212651252747, "learning_rate": 3.766283310007943e-05, "loss": 0.5002, "num_input_tokens_seen": 23238656, "step": 2070, "train_runtime": 3398.1559, "train_tokens_per_second": 6838.608 }, { "epoch": 0.995980080398392, "grad_norm": 0.7020851969718933, "learning_rate": 3.7608634877886885e-05, "loss": 0.5014, "num_input_tokens_seen": 23293008, "step": 2075, "train_runtime": 3406.0069, "train_tokens_per_second": 6838.802 }, { "epoch": 0.998380032399352, "grad_norm": 0.9155061841011047, "learning_rate": 3.755435706365079e-05, "loss": 0.4932, "num_input_tokens_seen": 23349040, "step": 2080, "train_runtime": 3414.3354, "train_tokens_per_second": 6838.531 }, { "epoch": 1.000479990400192, "grad_norm": 0.7089964151382446, "learning_rate": 3.7500000000000003e-05, "loss": 0.5376, "num_input_tokens_seen": 23400800, "step": 2085, "train_runtime": 3421.5018, "train_tokens_per_second": 6839.336 }, { "epoch": 1.002879942401152, "grad_norm": 0.5927316546440125, "learning_rate": 3.7445564030063646e-05, "loss": 0.4811, "num_input_tokens_seen": 23456048, "step": 2090, "train_runtime": 3429.5202, "train_tokens_per_second": 6839.455 }, { "epoch": 1.005279894402112, "grad_norm": 0.5862952470779419, "learning_rate": 3.739104949746893e-05, "loss": 0.4931, "num_input_tokens_seen": 23511576, "step": 2095, "train_runtime": 3437.359, "train_tokens_per_second": 6840.012 }, { "epoch": 1.0076798464030718, "grad_norm": 0.8004628419876099, "learning_rate": 3.7336456746339e-05, "loss": 0.4666, "num_input_tokens_seen": 23567088, "step": 2100, "train_runtime": 3445.3003, "train_tokens_per_second": 6840.358 }, { "epoch": 1.010079798404032, "grad_norm": 0.5078383088111877, "learning_rate": 3.728178612129075e-05, "loss": 0.4806, "num_input_tokens_seen": 23626528, "step": 2105, "train_runtime": 3454.0548, "train_tokens_per_second": 6840.23 }, { "epoch": 1.0124797504049918, "grad_norm": 0.8467037081718445, "learning_rate": 3.722703796743267e-05, "loss": 0.4856, "num_input_tokens_seen": 23681288, "step": 2110, "train_runtime": 3462.258, "train_tokens_per_second": 6839.839 }, { "epoch": 1.014879702405952, "grad_norm": 0.6897312998771667, "learning_rate": 3.7172212630362627e-05, "loss": 0.5198, "num_input_tokens_seen": 23740272, "step": 2115, "train_runtime": 3470.5143, "train_tokens_per_second": 6840.563 }, { "epoch": 1.0172796544069118, "grad_norm": 0.7425886392593384, "learning_rate": 3.7117310456165696e-05, "loss": 0.5217, "num_input_tokens_seen": 23796168, "step": 2120, "train_runtime": 3478.8621, "train_tokens_per_second": 6840.216 }, { "epoch": 1.019679606407872, "grad_norm": 0.7550194263458252, "learning_rate": 3.7062331791412045e-05, "loss": 0.5463, "num_input_tokens_seen": 23852288, "step": 2125, "train_runtime": 3486.8348, "train_tokens_per_second": 6840.67 }, { "epoch": 1.0220795584088318, "grad_norm": 0.5753782391548157, "learning_rate": 3.700727698315463e-05, "loss": 0.5069, "num_input_tokens_seen": 23906400, "step": 2130, "train_runtime": 3494.7803, "train_tokens_per_second": 6840.602 }, { "epoch": 1.024479510409792, "grad_norm": 0.7684709429740906, "learning_rate": 3.6952146378927095e-05, "loss": 0.4976, "num_input_tokens_seen": 23966288, "step": 2135, "train_runtime": 3503.1065, "train_tokens_per_second": 6841.439 }, { "epoch": 1.0268794624107518, "grad_norm": 0.8290258646011353, "learning_rate": 3.689694032674153e-05, "loss": 0.4863, "num_input_tokens_seen": 24019784, "step": 2140, "train_runtime": 3511.9759, "train_tokens_per_second": 6839.393 }, { "epoch": 1.0292794144117117, "grad_norm": 0.5777615904808044, "learning_rate": 3.684165917508628e-05, "loss": 0.5026, "num_input_tokens_seen": 24075104, "step": 2145, "train_runtime": 3522.5617, "train_tokens_per_second": 6834.544 }, { "epoch": 1.0316793664126718, "grad_norm": 0.8155114650726318, "learning_rate": 3.678630327292381e-05, "loss": 0.5197, "num_input_tokens_seen": 24125896, "step": 2150, "train_runtime": 3530.4751, "train_tokens_per_second": 6833.612 }, { "epoch": 1.0340793184136317, "grad_norm": 0.5378252267837524, "learning_rate": 3.673087296968838e-05, "loss": 0.4873, "num_input_tokens_seen": 24182088, "step": 2155, "train_runtime": 3538.664, "train_tokens_per_second": 6833.677 }, { "epoch": 1.0364792704145918, "grad_norm": 0.8574205040931702, "learning_rate": 3.667536861528396e-05, "loss": 0.515, "num_input_tokens_seen": 24242048, "step": 2160, "train_runtime": 3547.103, "train_tokens_per_second": 6834.323 }, { "epoch": 1.0388792224155516, "grad_norm": 0.8171690106391907, "learning_rate": 3.661979056008191e-05, "loss": 0.486, "num_input_tokens_seen": 24294336, "step": 2165, "train_runtime": 3554.7165, "train_tokens_per_second": 6834.395 }, { "epoch": 1.0412791744165117, "grad_norm": 0.7367947101593018, "learning_rate": 3.6564139154918895e-05, "loss": 0.5121, "num_input_tokens_seen": 24348872, "step": 2170, "train_runtime": 3562.3935, "train_tokens_per_second": 6834.975 }, { "epoch": 1.0436791264174716, "grad_norm": 0.718895673751831, "learning_rate": 3.6508414751094556e-05, "loss": 0.5462, "num_input_tokens_seen": 24402136, "step": 2175, "train_runtime": 3570.1249, "train_tokens_per_second": 6835.093 }, { "epoch": 1.0460790784184317, "grad_norm": 0.7847620248794556, "learning_rate": 3.6452617700369345e-05, "loss": 0.4975, "num_input_tokens_seen": 24451792, "step": 2180, "train_runtime": 3577.4533, "train_tokens_per_second": 6834.972 }, { "epoch": 1.0484790304193916, "grad_norm": 0.7218212485313416, "learning_rate": 3.639674835496232e-05, "loss": 0.568, "num_input_tokens_seen": 24508800, "step": 2185, "train_runtime": 3585.3931, "train_tokens_per_second": 6835.736 }, { "epoch": 1.0508789824203515, "grad_norm": 0.6216446161270142, "learning_rate": 3.634080706754887e-05, "loss": 0.5024, "num_input_tokens_seen": 24567000, "step": 2190, "train_runtime": 3593.4867, "train_tokens_per_second": 6836.536 }, { "epoch": 1.0532789344213116, "grad_norm": 0.7098725438117981, "learning_rate": 3.628479419125852e-05, "loss": 0.5057, "num_input_tokens_seen": 24629752, "step": 2195, "train_runtime": 3602.2113, "train_tokens_per_second": 6837.398 }, { "epoch": 1.0556788864222715, "grad_norm": 0.7154077887535095, "learning_rate": 3.6228710079672734e-05, "loss": 0.5329, "num_input_tokens_seen": 24685968, "step": 2200, "train_runtime": 3610.3704, "train_tokens_per_second": 6837.517 }, { "epoch": 1.0580788384232316, "grad_norm": 0.6186597347259521, "learning_rate": 3.6172555086822615e-05, "loss": 0.5114, "num_input_tokens_seen": 24745552, "step": 2205, "train_runtime": 3618.4119, "train_tokens_per_second": 6838.788 }, { "epoch": 1.0604787904241915, "grad_norm": 0.7932461500167847, "learning_rate": 3.6116329567186724e-05, "loss": 0.4939, "num_input_tokens_seen": 24799856, "step": 2210, "train_runtime": 3626.1603, "train_tokens_per_second": 6839.151 }, { "epoch": 1.0628787424251516, "grad_norm": 0.7647953629493713, "learning_rate": 3.6060033875688804e-05, "loss": 0.5289, "num_input_tokens_seen": 24853952, "step": 2215, "train_runtime": 3633.6609, "train_tokens_per_second": 6839.921 }, { "epoch": 1.0652786944261114, "grad_norm": 0.722197413444519, "learning_rate": 3.600366836769557e-05, "loss": 0.5015, "num_input_tokens_seen": 24911328, "step": 2220, "train_runtime": 3641.5303, "train_tokens_per_second": 6840.895 }, { "epoch": 1.0676786464270716, "grad_norm": 0.9403772354125977, "learning_rate": 3.5947233399014444e-05, "loss": 0.4982, "num_input_tokens_seen": 24967496, "step": 2225, "train_runtime": 3649.8212, "train_tokens_per_second": 6840.745 }, { "epoch": 1.0700785984280314, "grad_norm": 0.5855931639671326, "learning_rate": 3.589072932589134e-05, "loss": 0.4706, "num_input_tokens_seen": 25028408, "step": 2230, "train_runtime": 3658.1326, "train_tokens_per_second": 6841.854 }, { "epoch": 1.0724785504289913, "grad_norm": 0.7537211179733276, "learning_rate": 3.583415650500837e-05, "loss": 0.5351, "num_input_tokens_seen": 25082672, "step": 2235, "train_runtime": 3665.8181, "train_tokens_per_second": 6842.312 }, { "epoch": 1.0748785024299514, "grad_norm": 0.7052933573722839, "learning_rate": 3.577751529348163e-05, "loss": 0.5137, "num_input_tokens_seen": 25138272, "step": 2240, "train_runtime": 3673.8839, "train_tokens_per_second": 6842.424 }, { "epoch": 1.0772784544309113, "grad_norm": 0.6160354614257812, "learning_rate": 3.572080604885894e-05, "loss": 0.4984, "num_input_tokens_seen": 25198880, "step": 2245, "train_runtime": 3682.6208, "train_tokens_per_second": 6842.649 }, { "epoch": 1.0796784064318714, "grad_norm": 0.7151322960853577, "learning_rate": 3.566402912911755e-05, "loss": 0.4745, "num_input_tokens_seen": 25255672, "step": 2250, "train_runtime": 3691.127, "train_tokens_per_second": 6842.266 }, { "epoch": 1.0820783584328313, "grad_norm": 0.6750310063362122, "learning_rate": 3.560718489266194e-05, "loss": 0.4705, "num_input_tokens_seen": 25310096, "step": 2255, "train_runtime": 3698.9218, "train_tokens_per_second": 6842.561 }, { "epoch": 1.0844783104337914, "grad_norm": 0.7280714511871338, "learning_rate": 3.555027369832151e-05, "loss": 0.529, "num_input_tokens_seen": 25365416, "step": 2260, "train_runtime": 3706.9184, "train_tokens_per_second": 6842.723 }, { "epoch": 1.0868782624347513, "grad_norm": 0.7498377561569214, "learning_rate": 3.5493295905348334e-05, "loss": 0.4974, "num_input_tokens_seen": 25421480, "step": 2265, "train_runtime": 3715.1661, "train_tokens_per_second": 6842.623 }, { "epoch": 1.0892782144357114, "grad_norm": 0.7328541874885559, "learning_rate": 3.54362518734149e-05, "loss": 0.4618, "num_input_tokens_seen": 25482160, "step": 2270, "train_runtime": 3723.7211, "train_tokens_per_second": 6843.198 }, { "epoch": 1.0916781664366713, "grad_norm": 0.6172477006912231, "learning_rate": 3.537914196261181e-05, "loss": 0.5266, "num_input_tokens_seen": 25538416, "step": 2275, "train_runtime": 3731.9378, "train_tokens_per_second": 6843.205 }, { "epoch": 1.0940781184376314, "grad_norm": 0.5969734191894531, "learning_rate": 3.5321966533445547e-05, "loss": 0.5244, "num_input_tokens_seen": 25594328, "step": 2280, "train_runtime": 3739.9474, "train_tokens_per_second": 6843.499 }, { "epoch": 1.0964780704385912, "grad_norm": 0.9102872610092163, "learning_rate": 3.526472594683617e-05, "loss": 0.5011, "num_input_tokens_seen": 25647608, "step": 2285, "train_runtime": 3747.8696, "train_tokens_per_second": 6843.25 }, { "epoch": 1.0988780224395511, "grad_norm": 0.7734837532043457, "learning_rate": 3.5207420564115045e-05, "loss": 0.5229, "num_input_tokens_seen": 25702960, "step": 2290, "train_runtime": 3755.5877, "train_tokens_per_second": 6843.925 }, { "epoch": 1.1012779744405112, "grad_norm": 0.6865848898887634, "learning_rate": 3.515005074702256e-05, "loss": 0.5035, "num_input_tokens_seen": 25758120, "step": 2295, "train_runtime": 3763.673, "train_tokens_per_second": 6843.878 }, { "epoch": 1.1036779264414711, "grad_norm": 0.6671602129936218, "learning_rate": 3.509261685770585e-05, "loss": 0.4939, "num_input_tokens_seen": 25817024, "step": 2300, "train_runtime": 3772.0902, "train_tokens_per_second": 6844.222 }, { "epoch": 1.1060778784424312, "grad_norm": 0.6217396855354309, "learning_rate": 3.5035119258716495e-05, "loss": 0.5389, "num_input_tokens_seen": 25876744, "step": 2305, "train_runtime": 3780.9145, "train_tokens_per_second": 6844.044 }, { "epoch": 1.108477830443391, "grad_norm": 0.7444595098495483, "learning_rate": 3.497755831300828e-05, "loss": 0.49, "num_input_tokens_seen": 25928600, "step": 2310, "train_runtime": 3788.9853, "train_tokens_per_second": 6843.151 }, { "epoch": 1.1108777824443512, "grad_norm": 0.6591025590896606, "learning_rate": 3.491993438393481e-05, "loss": 0.4658, "num_input_tokens_seen": 25985192, "step": 2315, "train_runtime": 3797.2779, "train_tokens_per_second": 6843.11 }, { "epoch": 1.113277734445311, "grad_norm": 0.7887580394744873, "learning_rate": 3.486224783524731e-05, "loss": 0.5464, "num_input_tokens_seen": 26040520, "step": 2320, "train_runtime": 3804.9274, "train_tokens_per_second": 6843.894 }, { "epoch": 1.1156776864462712, "grad_norm": 0.8074533939361572, "learning_rate": 3.480449903109229e-05, "loss": 0.5227, "num_input_tokens_seen": 26093336, "step": 2325, "train_runtime": 3812.5053, "train_tokens_per_second": 6844.144 }, { "epoch": 1.118077638447231, "grad_norm": 0.7056359648704529, "learning_rate": 3.474668833600923e-05, "loss": 0.4759, "num_input_tokens_seen": 26148320, "step": 2330, "train_runtime": 3820.6134, "train_tokens_per_second": 6844.011 }, { "epoch": 1.120477590448191, "grad_norm": 0.841861367225647, "learning_rate": 3.4688816114928327e-05, "loss": 0.5181, "num_input_tokens_seen": 26206080, "step": 2335, "train_runtime": 3828.5922, "train_tokens_per_second": 6844.835 }, { "epoch": 1.122877542449151, "grad_norm": 0.6521568298339844, "learning_rate": 3.4630882733168116e-05, "loss": 0.4938, "num_input_tokens_seen": 26262688, "step": 2340, "train_runtime": 3836.5264, "train_tokens_per_second": 6845.434 }, { "epoch": 1.125277494450111, "grad_norm": 0.7665443420410156, "learning_rate": 3.4572888556433246e-05, "loss": 0.4681, "num_input_tokens_seen": 26321160, "step": 2345, "train_runtime": 3844.9857, "train_tokens_per_second": 6845.581 }, { "epoch": 1.127677446451071, "grad_norm": 0.616336464881897, "learning_rate": 3.451483395081212e-05, "loss": 0.4631, "num_input_tokens_seen": 26378192, "step": 2350, "train_runtime": 3853.2119, "train_tokens_per_second": 6845.767 }, { "epoch": 1.130077398452031, "grad_norm": 0.6478726863861084, "learning_rate": 3.445671928277461e-05, "loss": 0.4676, "num_input_tokens_seen": 26430848, "step": 2355, "train_runtime": 3861.022, "train_tokens_per_second": 6845.558 }, { "epoch": 1.132477350452991, "grad_norm": 0.6371597647666931, "learning_rate": 3.4398544919169715e-05, "loss": 0.4904, "num_input_tokens_seen": 26489064, "step": 2360, "train_runtime": 3868.9291, "train_tokens_per_second": 6846.614 }, { "epoch": 1.134877302453951, "grad_norm": 0.6929451823234558, "learning_rate": 3.4340311227223273e-05, "loss": 0.5352, "num_input_tokens_seen": 26543528, "step": 2365, "train_runtime": 3877.0017, "train_tokens_per_second": 6846.406 }, { "epoch": 1.137277254454911, "grad_norm": 0.9073979258537292, "learning_rate": 3.428201857453562e-05, "loss": 0.5051, "num_input_tokens_seen": 26596928, "step": 2370, "train_runtime": 3884.7443, "train_tokens_per_second": 6846.507 }, { "epoch": 1.139677206455871, "grad_norm": 0.7150000929832458, "learning_rate": 3.422366732907931e-05, "loss": 0.4361, "num_input_tokens_seen": 26654072, "step": 2375, "train_runtime": 3893.2295, "train_tokens_per_second": 6846.263 }, { "epoch": 1.1420771584568308, "grad_norm": 0.6671944260597229, "learning_rate": 3.416525785919673e-05, "loss": 0.488, "num_input_tokens_seen": 26707464, "step": 2380, "train_runtime": 3901.0068, "train_tokens_per_second": 6846.3 }, { "epoch": 1.1444771104577909, "grad_norm": 0.585337221622467, "learning_rate": 3.410679053359784e-05, "loss": 0.4326, "num_input_tokens_seen": 26766704, "step": 2385, "train_runtime": 3909.5898, "train_tokens_per_second": 6846.423 }, { "epoch": 1.1468770624587508, "grad_norm": 0.5534717440605164, "learning_rate": 3.404826572135779e-05, "loss": 0.4831, "num_input_tokens_seen": 26826328, "step": 2390, "train_runtime": 3918.5924, "train_tokens_per_second": 6845.909 }, { "epoch": 1.1492770144597109, "grad_norm": 0.5429486632347107, "learning_rate": 3.398968379191462e-05, "loss": 0.4909, "num_input_tokens_seen": 26880888, "step": 2395, "train_runtime": 3926.453, "train_tokens_per_second": 6846.099 }, { "epoch": 1.1516769664606707, "grad_norm": 0.8771390914916992, "learning_rate": 3.393104511506694e-05, "loss": 0.4903, "num_input_tokens_seen": 26937800, "step": 2400, "train_runtime": 3934.3502, "train_tokens_per_second": 6846.823 }, { "epoch": 1.1540769184616309, "grad_norm": 0.7701951861381531, "learning_rate": 3.387235006097155e-05, "loss": 0.4994, "num_input_tokens_seen": 26993776, "step": 2405, "train_runtime": 3942.0785, "train_tokens_per_second": 6847.6 }, { "epoch": 1.1564768704625907, "grad_norm": 0.5495705008506775, "learning_rate": 3.381359900014116e-05, "loss": 0.4745, "num_input_tokens_seen": 27053440, "step": 2410, "train_runtime": 3950.7471, "train_tokens_per_second": 6847.677 }, { "epoch": 1.1588768224635508, "grad_norm": 0.7725142240524292, "learning_rate": 3.375479230344199e-05, "loss": 0.5404, "num_input_tokens_seen": 27104744, "step": 2415, "train_runtime": 3958.6488, "train_tokens_per_second": 6846.969 }, { "epoch": 1.1612767744645107, "grad_norm": 1.0459918975830078, "learning_rate": 3.369593034209149e-05, "loss": 0.5069, "num_input_tokens_seen": 27159864, "step": 2420, "train_runtime": 3967.0288, "train_tokens_per_second": 6846.4 }, { "epoch": 1.1636767264654706, "grad_norm": 0.6602296829223633, "learning_rate": 3.363701348765597e-05, "loss": 0.4541, "num_input_tokens_seen": 27219344, "step": 2425, "train_runtime": 3976.0119, "train_tokens_per_second": 6845.891 }, { "epoch": 1.1660766784664307, "grad_norm": 0.5902988910675049, "learning_rate": 3.3578042112048226e-05, "loss": 0.4447, "num_input_tokens_seen": 27279536, "step": 2430, "train_runtime": 3984.8836, "train_tokens_per_second": 6845.755 }, { "epoch": 1.1684766304673906, "grad_norm": 0.9325588941574097, "learning_rate": 3.351901658752524e-05, "loss": 0.5227, "num_input_tokens_seen": 27336160, "step": 2435, "train_runtime": 3992.8679, "train_tokens_per_second": 6846.247 }, { "epoch": 1.1708765824683507, "grad_norm": 0.6601638793945312, "learning_rate": 3.34599372866858e-05, "loss": 0.4813, "num_input_tokens_seen": 27393304, "step": 2440, "train_runtime": 4001.2293, "train_tokens_per_second": 6846.222 }, { "epoch": 1.1732765344693106, "grad_norm": 0.8339878916740417, "learning_rate": 3.3400804582468154e-05, "loss": 0.5101, "num_input_tokens_seen": 27444632, "step": 2445, "train_runtime": 4008.6642, "train_tokens_per_second": 6846.329 }, { "epoch": 1.1756764864702707, "grad_norm": 0.8969867825508118, "learning_rate": 3.334161884814769e-05, "loss": 0.4709, "num_input_tokens_seen": 27502576, "step": 2450, "train_runtime": 4016.7436, "train_tokens_per_second": 6846.983 }, { "epoch": 1.1780764384712306, "grad_norm": 0.8373593091964722, "learning_rate": 3.3282380457334505e-05, "loss": 0.5498, "num_input_tokens_seen": 27559352, "step": 2455, "train_runtime": 4024.9244, "train_tokens_per_second": 6847.173 }, { "epoch": 1.1804763904721907, "grad_norm": 0.8110735416412354, "learning_rate": 3.3223089783971114e-05, "loss": 0.507, "num_input_tokens_seen": 27615472, "step": 2460, "train_runtime": 4032.7198, "train_tokens_per_second": 6847.853 }, { "epoch": 1.1828763424731505, "grad_norm": 0.7023930549621582, "learning_rate": 3.3163747202330066e-05, "loss": 0.498, "num_input_tokens_seen": 27671096, "step": 2465, "train_runtime": 4040.3448, "train_tokens_per_second": 6848.697 }, { "epoch": 1.1852762944741104, "grad_norm": 0.783581554889679, "learning_rate": 3.310435308701156e-05, "loss": 0.5188, "num_input_tokens_seen": 27722512, "step": 2470, "train_runtime": 4048.0115, "train_tokens_per_second": 6848.427 }, { "epoch": 1.1876762464750705, "grad_norm": 0.7718804478645325, "learning_rate": 3.304490781294114e-05, "loss": 0.4861, "num_input_tokens_seen": 27778280, "step": 2475, "train_runtime": 4055.8209, "train_tokens_per_second": 6848.991 }, { "epoch": 1.1900761984760304, "grad_norm": 0.5067981481552124, "learning_rate": 3.2985411755367246e-05, "loss": 0.4792, "num_input_tokens_seen": 27839424, "step": 2480, "train_runtime": 4064.5853, "train_tokens_per_second": 6849.266 }, { "epoch": 1.1924761504769905, "grad_norm": 0.7346833348274231, "learning_rate": 3.292586528985894e-05, "loss": 0.4599, "num_input_tokens_seen": 27894440, "step": 2485, "train_runtime": 4072.5002, "train_tokens_per_second": 6849.463 }, { "epoch": 1.1948761024779504, "grad_norm": 0.5885698199272156, "learning_rate": 3.2866268792303424e-05, "loss": 0.4936, "num_input_tokens_seen": 27959096, "step": 2490, "train_runtime": 4082.1306, "train_tokens_per_second": 6849.143 }, { "epoch": 1.1972760544789105, "grad_norm": 0.5944679975509644, "learning_rate": 3.2806622638903764e-05, "loss": 0.5008, "num_input_tokens_seen": 28010352, "step": 2495, "train_runtime": 4089.5284, "train_tokens_per_second": 6849.287 }, { "epoch": 1.1996760064798704, "grad_norm": 0.7197619080543518, "learning_rate": 3.274692720617649e-05, "loss": 0.5232, "num_input_tokens_seen": 28067424, "step": 2500, "train_runtime": 4098.1617, "train_tokens_per_second": 6848.784 }, { "epoch": 1.2020759584808305, "grad_norm": 0.71132493019104, "learning_rate": 3.2687182870949185e-05, "loss": 0.4749, "num_input_tokens_seen": 28126704, "step": 2505, "train_runtime": 4106.4308, "train_tokens_per_second": 6849.428 }, { "epoch": 1.2044759104817904, "grad_norm": 0.7117146849632263, "learning_rate": 3.2627390010358133e-05, "loss": 0.4965, "num_input_tokens_seen": 28184072, "step": 2510, "train_runtime": 4114.8063, "train_tokens_per_second": 6849.429 }, { "epoch": 1.2068758624827503, "grad_norm": 0.7712971568107605, "learning_rate": 3.256754900184593e-05, "loss": 0.489, "num_input_tokens_seen": 28237608, "step": 2515, "train_runtime": 4122.4987, "train_tokens_per_second": 6849.634 }, { "epoch": 1.2092758144837104, "grad_norm": 0.843129575252533, "learning_rate": 3.2507660223159115e-05, "loss": 0.449, "num_input_tokens_seen": 28299544, "step": 2520, "train_runtime": 4131.2681, "train_tokens_per_second": 6850.086 }, { "epoch": 1.2116757664846702, "grad_norm": 0.6665219068527222, "learning_rate": 3.2447724052345786e-05, "loss": 0.4269, "num_input_tokens_seen": 28357640, "step": 2525, "train_runtime": 4139.6319, "train_tokens_per_second": 6850.281 }, { "epoch": 1.2140757184856303, "grad_norm": 0.7961658835411072, "learning_rate": 3.238774086775317e-05, "loss": 0.4937, "num_input_tokens_seen": 28411848, "step": 2530, "train_runtime": 4147.578, "train_tokens_per_second": 6850.226 }, { "epoch": 1.2164756704865902, "grad_norm": 0.7647880911827087, "learning_rate": 3.2327711048025314e-05, "loss": 0.473, "num_input_tokens_seen": 28465072, "step": 2535, "train_runtime": 4155.7446, "train_tokens_per_second": 6849.572 }, { "epoch": 1.2188756224875503, "grad_norm": 0.7645636796951294, "learning_rate": 3.226763497210061e-05, "loss": 0.5217, "num_input_tokens_seen": 28513584, "step": 2540, "train_runtime": 4162.7633, "train_tokens_per_second": 6849.677 }, { "epoch": 1.2212755744885102, "grad_norm": 0.9397866725921631, "learning_rate": 3.2207513019209455e-05, "loss": 0.5058, "num_input_tokens_seen": 28569888, "step": 2545, "train_runtime": 4170.8063, "train_tokens_per_second": 6849.968 }, { "epoch": 1.2236755264894703, "grad_norm": 0.8510188460350037, "learning_rate": 3.2147345568871874e-05, "loss": 0.4699, "num_input_tokens_seen": 28623888, "step": 2550, "train_runtime": 4178.6198, "train_tokens_per_second": 6850.082 }, { "epoch": 1.2260754784904302, "grad_norm": 0.7524721622467041, "learning_rate": 3.208713300089504e-05, "loss": 0.4585, "num_input_tokens_seen": 28680088, "step": 2555, "train_runtime": 4187.0852, "train_tokens_per_second": 6849.655 }, { "epoch": 1.22847543049139, "grad_norm": 0.6238115429878235, "learning_rate": 3.2026875695370975e-05, "loss": 0.4872, "num_input_tokens_seen": 28733184, "step": 2560, "train_runtime": 4194.4934, "train_tokens_per_second": 6850.216 }, { "epoch": 1.2308753824923502, "grad_norm": 0.8195456862449646, "learning_rate": 3.1966574032674074e-05, "loss": 0.5134, "num_input_tokens_seen": 28787400, "step": 2565, "train_runtime": 4202.0819, "train_tokens_per_second": 6850.747 }, { "epoch": 1.23327533449331, "grad_norm": 0.7062321305274963, "learning_rate": 3.190622839345878e-05, "loss": 0.4758, "num_input_tokens_seen": 28840944, "step": 2570, "train_runtime": 4209.9012, "train_tokens_per_second": 6850.741 }, { "epoch": 1.2356752864942702, "grad_norm": 0.6290914416313171, "learning_rate": 3.184583915865709e-05, "loss": 0.5343, "num_input_tokens_seen": 28893352, "step": 2575, "train_runtime": 4217.2229, "train_tokens_per_second": 6851.275 }, { "epoch": 1.23807523849523, "grad_norm": 0.6599912643432617, "learning_rate": 3.178540670947624e-05, "loss": 0.4822, "num_input_tokens_seen": 28952544, "step": 2580, "train_runtime": 4225.8796, "train_tokens_per_second": 6851.247 }, { "epoch": 1.2404751904961901, "grad_norm": 0.6899898052215576, "learning_rate": 3.172493142739622e-05, "loss": 0.4529, "num_input_tokens_seen": 29007344, "step": 2585, "train_runtime": 4233.7269, "train_tokens_per_second": 6851.492 }, { "epoch": 1.24287514249715, "grad_norm": 0.8615679144859314, "learning_rate": 3.1664413694167424e-05, "loss": 0.5018, "num_input_tokens_seen": 29065880, "step": 2590, "train_runtime": 4242.1314, "train_tokens_per_second": 6851.716 }, { "epoch": 1.2452750944981101, "grad_norm": 0.829759955406189, "learning_rate": 3.160385389180822e-05, "loss": 0.5014, "num_input_tokens_seen": 29120600, "step": 2595, "train_runtime": 4250.6385, "train_tokens_per_second": 6850.877 }, { "epoch": 1.24767504649907, "grad_norm": 1.099179744720459, "learning_rate": 3.154325240260254e-05, "loss": 0.4823, "num_input_tokens_seen": 29174832, "step": 2600, "train_runtime": 4258.7641, "train_tokens_per_second": 6850.54 }, { "epoch": 1.25007499850003, "grad_norm": 0.7731813788414001, "learning_rate": 3.148260960909745e-05, "loss": 0.4527, "num_input_tokens_seen": 29228680, "step": 2605, "train_runtime": 4266.7683, "train_tokens_per_second": 6850.309 }, { "epoch": 1.25247495050099, "grad_norm": 0.7874563336372375, "learning_rate": 3.1421925894100745e-05, "loss": 0.5152, "num_input_tokens_seen": 29282976, "step": 2610, "train_runtime": 4274.5977, "train_tokens_per_second": 6850.464 }, { "epoch": 1.2548749025019499, "grad_norm": 0.6936095952987671, "learning_rate": 3.1361201640678554e-05, "loss": 0.5055, "num_input_tokens_seen": 29337384, "step": 2615, "train_runtime": 4282.725, "train_tokens_per_second": 6850.168 }, { "epoch": 1.25727485450291, "grad_norm": 0.8180893063545227, "learning_rate": 3.130043723215291e-05, "loss": 0.4808, "num_input_tokens_seen": 29398256, "step": 2620, "train_runtime": 4291.6094, "train_tokens_per_second": 6850.17 }, { "epoch": 1.2596748065038699, "grad_norm": 0.7401306629180908, "learning_rate": 3.123963305209932e-05, "loss": 0.5101, "num_input_tokens_seen": 29455288, "step": 2625, "train_runtime": 4299.6287, "train_tokens_per_second": 6850.659 }, { "epoch": 1.26207475850483, "grad_norm": 0.7376925349235535, "learning_rate": 3.1178789484344326e-05, "loss": 0.468, "num_input_tokens_seen": 29513208, "step": 2630, "train_runtime": 4308.0487, "train_tokens_per_second": 6850.714 }, { "epoch": 1.2644747105057899, "grad_norm": 0.7442266345024109, "learning_rate": 3.1117906912963124e-05, "loss": 0.5214, "num_input_tokens_seen": 29566424, "step": 2635, "train_runtime": 4315.7814, "train_tokens_per_second": 6850.77 }, { "epoch": 1.26687466250675, "grad_norm": 0.7198356986045837, "learning_rate": 3.105698572227712e-05, "loss": 0.5059, "num_input_tokens_seen": 29621112, "step": 2640, "train_runtime": 4324.1308, "train_tokens_per_second": 6850.189 }, { "epoch": 1.2692746145077098, "grad_norm": 0.6759196519851685, "learning_rate": 3.0996026296851516e-05, "loss": 0.4705, "num_input_tokens_seen": 29672896, "step": 2645, "train_runtime": 4331.5888, "train_tokens_per_second": 6850.349 }, { "epoch": 1.2716745665086697, "grad_norm": 0.659756600856781, "learning_rate": 3.093502902149285e-05, "loss": 0.4753, "num_input_tokens_seen": 29724344, "step": 2650, "train_runtime": 4339.2532, "train_tokens_per_second": 6850.106 }, { "epoch": 1.2740745185096298, "grad_norm": 0.7627817988395691, "learning_rate": 3.087399428124659e-05, "loss": 0.5218, "num_input_tokens_seen": 29779744, "step": 2655, "train_runtime": 4347.2112, "train_tokens_per_second": 6850.31 }, { "epoch": 1.2764744705105897, "grad_norm": 0.5417824387550354, "learning_rate": 3.081292246139473e-05, "loss": 0.4784, "num_input_tokens_seen": 29834824, "step": 2660, "train_runtime": 4355.3061, "train_tokens_per_second": 6850.224 }, { "epoch": 1.2788744225115498, "grad_norm": 0.7506272792816162, "learning_rate": 3.0751813947453265e-05, "loss": 0.4886, "num_input_tokens_seen": 29890520, "step": 2665, "train_runtime": 4362.9276, "train_tokens_per_second": 6851.024 }, { "epoch": 1.2812743745125097, "grad_norm": 0.6071366667747498, "learning_rate": 3.069066912516991e-05, "loss": 0.5277, "num_input_tokens_seen": 29945288, "step": 2670, "train_runtime": 4370.6908, "train_tokens_per_second": 6851.386 }, { "epoch": 1.2836743265134698, "grad_norm": 0.7744503021240234, "learning_rate": 3.0629488380521504e-05, "loss": 0.5158, "num_input_tokens_seen": 30001032, "step": 2675, "train_runtime": 4378.8355, "train_tokens_per_second": 6851.372 }, { "epoch": 1.2860742785144297, "grad_norm": 0.4839749336242676, "learning_rate": 3.056827209971167e-05, "loss": 0.5022, "num_input_tokens_seen": 30057416, "step": 2680, "train_runtime": 4387.5074, "train_tokens_per_second": 6850.682 }, { "epoch": 1.2884742305153898, "grad_norm": 0.5500566363334656, "learning_rate": 3.0507020669168367e-05, "loss": 0.4875, "num_input_tokens_seen": 30113512, "step": 2685, "train_runtime": 4395.5794, "train_tokens_per_second": 6850.863 }, { "epoch": 1.2908741825163497, "grad_norm": 0.7816157341003418, "learning_rate": 3.044573447554141e-05, "loss": 0.4872, "num_input_tokens_seen": 30171064, "step": 2690, "train_runtime": 4404.1038, "train_tokens_per_second": 6850.671 }, { "epoch": 1.2932741345173095, "grad_norm": 0.6968929767608643, "learning_rate": 3.038441390570008e-05, "loss": 0.4715, "num_input_tokens_seen": 30226872, "step": 2695, "train_runtime": 4412.2507, "train_tokens_per_second": 6850.67 }, { "epoch": 1.2956740865182697, "grad_norm": 0.8923588395118713, "learning_rate": 3.0323059346730666e-05, "loss": 0.5249, "num_input_tokens_seen": 30281784, "step": 2700, "train_runtime": 4420.3662, "train_tokens_per_second": 6850.515 }, { "epoch": 1.2980740385192295, "grad_norm": 0.9175417423248291, "learning_rate": 3.026167118593396e-05, "loss": 0.5334, "num_input_tokens_seen": 30336824, "step": 2705, "train_runtime": 4428.4152, "train_tokens_per_second": 6850.492 }, { "epoch": 1.3004739905201896, "grad_norm": 0.5945408344268799, "learning_rate": 3.0200249810822922e-05, "loss": 0.4795, "num_input_tokens_seen": 30391968, "step": 2710, "train_runtime": 4436.7566, "train_tokens_per_second": 6850.042 }, { "epoch": 1.3028739425211495, "grad_norm": 0.6741787195205688, "learning_rate": 3.0138795609120156e-05, "loss": 0.5054, "num_input_tokens_seen": 30448056, "step": 2715, "train_runtime": 4445.0926, "train_tokens_per_second": 6849.814 }, { "epoch": 1.3052738945221096, "grad_norm": 0.7565773129463196, "learning_rate": 3.0077308968755484e-05, "loss": 0.4871, "num_input_tokens_seen": 30509528, "step": 2720, "train_runtime": 4454.1899, "train_tokens_per_second": 6849.624 }, { "epoch": 1.3076738465230695, "grad_norm": 0.7174657583236694, "learning_rate": 3.0015790277863504e-05, "loss": 0.5235, "num_input_tokens_seen": 30564064, "step": 2725, "train_runtime": 4462.4576, "train_tokens_per_second": 6849.155 }, { "epoch": 1.3100737985240296, "grad_norm": 0.808497965335846, "learning_rate": 2.9954239924781114e-05, "loss": 0.5481, "num_input_tokens_seen": 30617256, "step": 2730, "train_runtime": 4469.9742, "train_tokens_per_second": 6849.538 }, { "epoch": 1.3124737505249895, "grad_norm": 0.7192595601081848, "learning_rate": 2.9892658298045105e-05, "loss": 0.4882, "num_input_tokens_seen": 30676776, "step": 2735, "train_runtime": 4478.1351, "train_tokens_per_second": 6850.346 }, { "epoch": 1.3148737025259494, "grad_norm": 0.7198320627212524, "learning_rate": 2.983104578638966e-05, "loss": 0.5133, "num_input_tokens_seen": 30729600, "step": 2740, "train_runtime": 4486.2754, "train_tokens_per_second": 6849.691 }, { "epoch": 1.3172736545269095, "grad_norm": 0.6649105548858643, "learning_rate": 2.976940277874395e-05, "loss": 0.4772, "num_input_tokens_seen": 30786720, "step": 2745, "train_runtime": 4494.0586, "train_tokens_per_second": 6850.538 }, { "epoch": 1.3196736065278696, "grad_norm": 0.8715736269950867, "learning_rate": 2.9707729664229623e-05, "loss": 0.5323, "num_input_tokens_seen": 30844488, "step": 2750, "train_runtime": 4502.1358, "train_tokens_per_second": 6851.079 }, { "epoch": 1.3220735585288295, "grad_norm": 0.7848823666572571, "learning_rate": 2.964602683215839e-05, "loss": 0.5318, "num_input_tokens_seen": 30901200, "step": 2755, "train_runtime": 4510.5455, "train_tokens_per_second": 6850.879 }, { "epoch": 1.3244735105297893, "grad_norm": 0.5609360337257385, "learning_rate": 2.958429467202956e-05, "loss": 0.4453, "num_input_tokens_seen": 30957496, "step": 2760, "train_runtime": 4519.3334, "train_tokens_per_second": 6850.014 }, { "epoch": 1.3268734625307494, "grad_norm": 0.8397387266159058, "learning_rate": 2.9522533573527568e-05, "loss": 0.4547, "num_input_tokens_seen": 31014440, "step": 2765, "train_runtime": 4527.735, "train_tokens_per_second": 6849.88 }, { "epoch": 1.3292734145317093, "grad_norm": 0.883388340473175, "learning_rate": 2.9460743926519524e-05, "loss": 0.4866, "num_input_tokens_seen": 31069232, "step": 2770, "train_runtime": 4535.7952, "train_tokens_per_second": 6849.787 }, { "epoch": 1.3316733665326694, "grad_norm": 0.6454315185546875, "learning_rate": 2.9398926121052757e-05, "loss": 0.4363, "num_input_tokens_seen": 31124192, "step": 2775, "train_runtime": 4543.7024, "train_tokens_per_second": 6849.963 }, { "epoch": 1.3340733185336293, "grad_norm": 0.8647413849830627, "learning_rate": 2.933708054735232e-05, "loss": 0.5387, "num_input_tokens_seen": 31181208, "step": 2780, "train_runtime": 4551.829, "train_tokens_per_second": 6850.259 }, { "epoch": 1.3364732705345892, "grad_norm": 0.8238906860351562, "learning_rate": 2.9275207595818587e-05, "loss": 0.4733, "num_input_tokens_seen": 31238792, "step": 2785, "train_runtime": 4560.1671, "train_tokens_per_second": 6850.361 }, { "epoch": 1.3388732225355493, "grad_norm": 0.8096624612808228, "learning_rate": 2.9213307657024747e-05, "loss": 0.4498, "num_input_tokens_seen": 31293408, "step": 2790, "train_runtime": 4568.3465, "train_tokens_per_second": 6850.051 }, { "epoch": 1.3412731745365094, "grad_norm": 0.6373225450515747, "learning_rate": 2.9151381121714326e-05, "loss": 0.4626, "num_input_tokens_seen": 31351360, "step": 2795, "train_runtime": 4576.4713, "train_tokens_per_second": 6850.553 }, { "epoch": 1.3436731265374693, "grad_norm": 0.9298360347747803, "learning_rate": 2.9089428380798765e-05, "loss": 0.5147, "num_input_tokens_seen": 31408064, "step": 2800, "train_runtime": 4584.763, "train_tokens_per_second": 6850.532 }, { "epoch": 1.3460730785384292, "grad_norm": 0.7824495434761047, "learning_rate": 2.9027449825354914e-05, "loss": 0.5005, "num_input_tokens_seen": 31465944, "step": 2805, "train_runtime": 4593.7143, "train_tokens_per_second": 6849.783 }, { "epoch": 1.3484730305393893, "grad_norm": 0.8347817063331604, "learning_rate": 2.8965445846622575e-05, "loss": 0.5212, "num_input_tokens_seen": 31519296, "step": 2810, "train_runtime": 4601.6577, "train_tokens_per_second": 6849.552 }, { "epoch": 1.3508729825403492, "grad_norm": 0.7829338312149048, "learning_rate": 2.8903416836002046e-05, "loss": 0.4881, "num_input_tokens_seen": 31575040, "step": 2815, "train_runtime": 4609.5566, "train_tokens_per_second": 6849.908 }, { "epoch": 1.3532729345413093, "grad_norm": 0.7527592182159424, "learning_rate": 2.8841363185051627e-05, "loss": 0.5284, "num_input_tokens_seen": 31627864, "step": 2820, "train_runtime": 4617.3734, "train_tokens_per_second": 6849.752 }, { "epoch": 1.3556728865422691, "grad_norm": 0.5921339988708496, "learning_rate": 2.877928528548518e-05, "loss": 0.5337, "num_input_tokens_seen": 31681448, "step": 2825, "train_runtime": 4625.135, "train_tokens_per_second": 6849.843 }, { "epoch": 1.358072838543229, "grad_norm": 0.8095146417617798, "learning_rate": 2.871718352916961e-05, "loss": 0.4355, "num_input_tokens_seen": 31734720, "step": 2830, "train_runtime": 4632.6583, "train_tokens_per_second": 6850.218 }, { "epoch": 1.3604727905441891, "grad_norm": 0.863218367099762, "learning_rate": 2.8655058308122435e-05, "loss": 0.522, "num_input_tokens_seen": 31786472, "step": 2835, "train_runtime": 4640.2065, "train_tokens_per_second": 6850.228 }, { "epoch": 1.3628727425451492, "grad_norm": 0.6763318181037903, "learning_rate": 2.8592910014509284e-05, "loss": 0.4825, "num_input_tokens_seen": 31842040, "step": 2840, "train_runtime": 4648.7432, "train_tokens_per_second": 6849.602 }, { "epoch": 1.3652726945461091, "grad_norm": 0.9902337789535522, "learning_rate": 2.853073904064144e-05, "loss": 0.4791, "num_input_tokens_seen": 31901936, "step": 2845, "train_runtime": 4657.7444, "train_tokens_per_second": 6849.224 }, { "epoch": 1.367672646547069, "grad_norm": 0.607513427734375, "learning_rate": 2.8468545778973365e-05, "loss": 0.4962, "num_input_tokens_seen": 31955760, "step": 2850, "train_runtime": 4665.9209, "train_tokens_per_second": 6848.757 }, { "epoch": 1.370072598548029, "grad_norm": 0.7585775256156921, "learning_rate": 2.8406330622100185e-05, "loss": 0.5193, "num_input_tokens_seen": 32012936, "step": 2855, "train_runtime": 4674.1143, "train_tokens_per_second": 6848.984 }, { "epoch": 1.372472550548989, "grad_norm": 0.6520575284957886, "learning_rate": 2.834409396275526e-05, "loss": 0.4838, "num_input_tokens_seen": 32075400, "step": 2860, "train_runtime": 4683.1148, "train_tokens_per_second": 6849.159 }, { "epoch": 1.374872502549949, "grad_norm": 0.7430661916732788, "learning_rate": 2.8281836193807677e-05, "loss": 0.5193, "num_input_tokens_seen": 32127560, "step": 2865, "train_runtime": 4690.6625, "train_tokens_per_second": 6849.258 }, { "epoch": 1.377272454550909, "grad_norm": 0.6538442373275757, "learning_rate": 2.821955770825978e-05, "loss": 0.563, "num_input_tokens_seen": 32182368, "step": 2870, "train_runtime": 4698.3261, "train_tokens_per_second": 6849.752 }, { "epoch": 1.3796724065518688, "grad_norm": 0.6958315968513489, "learning_rate": 2.81572588992447e-05, "loss": 0.4983, "num_input_tokens_seen": 32238704, "step": 2875, "train_runtime": 4706.8956, "train_tokens_per_second": 6849.25 }, { "epoch": 1.382072358552829, "grad_norm": 0.5171172618865967, "learning_rate": 2.809494016002382e-05, "loss": 0.4887, "num_input_tokens_seen": 32299312, "step": 2880, "train_runtime": 4717.4351, "train_tokens_per_second": 6846.795 }, { "epoch": 1.384472310553789, "grad_norm": 0.7386242151260376, "learning_rate": 2.8032601883984373e-05, "loss": 0.4676, "num_input_tokens_seen": 32353968, "step": 2885, "train_runtime": 4727.1468, "train_tokens_per_second": 6844.291 }, { "epoch": 1.386872262554749, "grad_norm": 0.6488030552864075, "learning_rate": 2.7970244464636907e-05, "loss": 0.5187, "num_input_tokens_seen": 32408248, "step": 2890, "train_runtime": 4737.0735, "train_tokens_per_second": 6841.407 }, { "epoch": 1.3892722145557088, "grad_norm": 0.7091050744056702, "learning_rate": 2.7907868295612805e-05, "loss": 0.5009, "num_input_tokens_seen": 32461008, "step": 2895, "train_runtime": 4746.6232, "train_tokens_per_second": 6838.758 }, { "epoch": 1.391672166556669, "grad_norm": 0.735463559627533, "learning_rate": 2.7845473770661816e-05, "loss": 0.4448, "num_input_tokens_seen": 32519744, "step": 2900, "train_runtime": 4756.731, "train_tokens_per_second": 6836.574 }, { "epoch": 1.3940721185576288, "grad_norm": 0.8551938533782959, "learning_rate": 2.7783061283649547e-05, "loss": 0.4562, "num_input_tokens_seen": 32575104, "step": 2905, "train_runtime": 4767.5045, "train_tokens_per_second": 6832.737 }, { "epoch": 1.396472070558589, "grad_norm": 0.8265554904937744, "learning_rate": 2.7720631228555003e-05, "loss": 0.4771, "num_input_tokens_seen": 32633880, "step": 2910, "train_runtime": 4778.118, "train_tokens_per_second": 6829.861 }, { "epoch": 1.3988720225595488, "grad_norm": 0.7008459568023682, "learning_rate": 2.7658183999468096e-05, "loss": 0.5213, "num_input_tokens_seen": 32687728, "step": 2915, "train_runtime": 4787.6745, "train_tokens_per_second": 6827.475 }, { "epoch": 1.4012719745605087, "grad_norm": 0.714462399482727, "learning_rate": 2.759571999058712e-05, "loss": 0.4879, "num_input_tokens_seen": 32744776, "step": 2920, "train_runtime": 4798.5825, "train_tokens_per_second": 6823.843 }, { "epoch": 1.4036719265614688, "grad_norm": 0.7445899248123169, "learning_rate": 2.7533239596216326e-05, "loss": 0.4801, "num_input_tokens_seen": 32802640, "step": 2925, "train_runtime": 4809.0391, "train_tokens_per_second": 6821.038 }, { "epoch": 1.4060718785624289, "grad_norm": 0.7316624522209167, "learning_rate": 2.747074321076336e-05, "loss": 0.4811, "num_input_tokens_seen": 32858848, "step": 2930, "train_runtime": 4819.753, "train_tokens_per_second": 6817.538 }, { "epoch": 1.4084718305633888, "grad_norm": 0.8229737877845764, "learning_rate": 2.7408231228736854e-05, "loss": 0.4749, "num_input_tokens_seen": 32915328, "step": 2935, "train_runtime": 4829.6875, "train_tokens_per_second": 6815.209 }, { "epoch": 1.4108717825643486, "grad_norm": 0.6625364422798157, "learning_rate": 2.7345704044743857e-05, "loss": 0.5214, "num_input_tokens_seen": 32970256, "step": 2940, "train_runtime": 4839.5418, "train_tokens_per_second": 6812.681 }, { "epoch": 1.4132717345653087, "grad_norm": 0.7320582270622253, "learning_rate": 2.7283162053487406e-05, "loss": 0.5137, "num_input_tokens_seen": 33024728, "step": 2945, "train_runtime": 4849.3505, "train_tokens_per_second": 6810.134 }, { "epoch": 1.4156716865662686, "grad_norm": 0.8458564281463623, "learning_rate": 2.7220605649763997e-05, "loss": 0.4864, "num_input_tokens_seen": 33083776, "step": 2950, "train_runtime": 4859.7251, "train_tokens_per_second": 6807.746 }, { "epoch": 1.4180716385672287, "grad_norm": 0.6681801676750183, "learning_rate": 2.71580352284611e-05, "loss": 0.4656, "num_input_tokens_seen": 33141792, "step": 2955, "train_runtime": 4870.089, "train_tokens_per_second": 6805.172 }, { "epoch": 1.4204715905681886, "grad_norm": 0.5828260779380798, "learning_rate": 2.7095451184554684e-05, "loss": 0.4626, "num_input_tokens_seen": 33200320, "step": 2960, "train_runtime": 4879.7888, "train_tokens_per_second": 6803.639 }, { "epoch": 1.4228715425691485, "grad_norm": 0.6321309208869934, "learning_rate": 2.7032853913106702e-05, "loss": 0.5166, "num_input_tokens_seen": 33258192, "step": 2965, "train_runtime": 4889.401, "train_tokens_per_second": 6802.1 }, { "epoch": 1.4252714945701086, "grad_norm": 0.5766092538833618, "learning_rate": 2.697024380926261e-05, "loss": 0.4709, "num_input_tokens_seen": 33315416, "step": 2970, "train_runtime": 4899.761, "train_tokens_per_second": 6799.396 }, { "epoch": 1.4276714465710687, "grad_norm": 0.5863097906112671, "learning_rate": 2.6907621268248867e-05, "loss": 0.4682, "num_input_tokens_seen": 33374248, "step": 2975, "train_runtime": 4910.9171, "train_tokens_per_second": 6795.93 }, { "epoch": 1.4300713985720286, "grad_norm": 0.6625893115997314, "learning_rate": 2.6844986685370438e-05, "loss": 0.4795, "num_input_tokens_seen": 33430576, "step": 2980, "train_runtime": 4920.8367, "train_tokens_per_second": 6793.677 }, { "epoch": 1.4324713505729885, "grad_norm": 0.889992356300354, "learning_rate": 2.6782340456008304e-05, "loss": 0.5081, "num_input_tokens_seen": 33481872, "step": 2985, "train_runtime": 4930.6268, "train_tokens_per_second": 6790.591 }, { "epoch": 1.4348713025739486, "grad_norm": 0.8572867512702942, "learning_rate": 2.6719682975616972e-05, "loss": 0.5238, "num_input_tokens_seen": 33535608, "step": 2990, "train_runtime": 4940.3628, "train_tokens_per_second": 6788.086 }, { "epoch": 1.4372712545749085, "grad_norm": 0.7185449600219727, "learning_rate": 2.6657014639721963e-05, "loss": 0.4628, "num_input_tokens_seen": 33595176, "step": 2995, "train_runtime": 4950.583, "train_tokens_per_second": 6786.105 }, { "epoch": 1.4396712065758686, "grad_norm": 0.6952937245368958, "learning_rate": 2.659433584391733e-05, "loss": 0.4726, "num_input_tokens_seen": 33655192, "step": 3000, "train_runtime": 4960.7955, "train_tokens_per_second": 6784.233 }, { "epoch": 1.4420711585768284, "grad_norm": 0.5073747634887695, "learning_rate": 2.6531646983863135e-05, "loss": 0.5086, "num_input_tokens_seen": 33710344, "step": 3005, "train_runtime": 4971.2496, "train_tokens_per_second": 6781.06 }, { "epoch": 1.4444711105777883, "grad_norm": 0.5523395538330078, "learning_rate": 2.6468948455283006e-05, "loss": 0.4855, "num_input_tokens_seen": 33762880, "step": 3010, "train_runtime": 4981.002, "train_tokens_per_second": 6778.331 }, { "epoch": 1.4468710625787484, "grad_norm": 0.7493255138397217, "learning_rate": 2.6406240653961562e-05, "loss": 0.5121, "num_input_tokens_seen": 33814912, "step": 3015, "train_runtime": 4990.9252, "train_tokens_per_second": 6775.279 }, { "epoch": 1.4492710145797085, "grad_norm": 0.7933918833732605, "learning_rate": 2.6343523975741995e-05, "loss": 0.4822, "num_input_tokens_seen": 33869336, "step": 3020, "train_runtime": 5000.7837, "train_tokens_per_second": 6772.806 }, { "epoch": 1.4516709665806684, "grad_norm": 0.827980101108551, "learning_rate": 2.628079881652351e-05, "loss": 0.5094, "num_input_tokens_seen": 33921376, "step": 3025, "train_runtime": 5010.3271, "train_tokens_per_second": 6770.292 }, { "epoch": 1.4540709185816283, "grad_norm": 0.7234380841255188, "learning_rate": 2.6218065572258847e-05, "loss": 0.4494, "num_input_tokens_seen": 33979216, "step": 3030, "train_runtime": 5021.1603, "train_tokens_per_second": 6767.204 }, { "epoch": 1.4564708705825884, "grad_norm": 0.6564066410064697, "learning_rate": 2.6155324638951795e-05, "loss": 0.5281, "num_input_tokens_seen": 34036320, "step": 3035, "train_runtime": 5032.1108, "train_tokens_per_second": 6763.826 }, { "epoch": 1.4588708225835483, "grad_norm": 0.9267168045043945, "learning_rate": 2.6092576412654668e-05, "loss": 0.5001, "num_input_tokens_seen": 34090128, "step": 3040, "train_runtime": 5042.1218, "train_tokens_per_second": 6761.068 }, { "epoch": 1.4612707745845084, "grad_norm": 0.6622974276542664, "learning_rate": 2.602982128946583e-05, "loss": 0.4876, "num_input_tokens_seen": 34148400, "step": 3045, "train_runtime": 5052.2931, "train_tokens_per_second": 6758.99 }, { "epoch": 1.4636707265854683, "grad_norm": 0.6938877105712891, "learning_rate": 2.596705966552718e-05, "loss": 0.4316, "num_input_tokens_seen": 34205656, "step": 3050, "train_runtime": 5063.4654, "train_tokens_per_second": 6755.385 }, { "epoch": 1.4660706785864281, "grad_norm": 1.1527178287506104, "learning_rate": 2.5904291937021623e-05, "loss": 0.5168, "num_input_tokens_seen": 34256136, "step": 3055, "train_runtime": 5073.3962, "train_tokens_per_second": 6752.111 }, { "epoch": 1.4684706305873882, "grad_norm": 0.8553231358528137, "learning_rate": 2.5841518500170647e-05, "loss": 0.4756, "num_input_tokens_seen": 34311976, "step": 3060, "train_runtime": 5083.9773, "train_tokens_per_second": 6749.042 }, { "epoch": 1.4708705825883484, "grad_norm": 0.6087079644203186, "learning_rate": 2.5778739751231747e-05, "loss": 0.4665, "num_input_tokens_seen": 34370640, "step": 3065, "train_runtime": 5094.5141, "train_tokens_per_second": 6746.598 }, { "epoch": 1.4732705345893082, "grad_norm": 0.7348918318748474, "learning_rate": 2.5715956086495947e-05, "loss": 0.4652, "num_input_tokens_seen": 34421432, "step": 3070, "train_runtime": 5103.6348, "train_tokens_per_second": 6744.494 }, { "epoch": 1.4756704865902681, "grad_norm": 1.1253235340118408, "learning_rate": 2.565316790228532e-05, "loss": 0.4909, "num_input_tokens_seen": 34478304, "step": 3075, "train_runtime": 5113.6496, "train_tokens_per_second": 6742.406 }, { "epoch": 1.4780704385912282, "grad_norm": 0.7545915842056274, "learning_rate": 2.5590375594950443e-05, "loss": 0.4865, "num_input_tokens_seen": 34532640, "step": 3080, "train_runtime": 5123.1565, "train_tokens_per_second": 6740.501 }, { "epoch": 1.480470390592188, "grad_norm": 0.8254991769790649, "learning_rate": 2.5527579560867947e-05, "loss": 0.503, "num_input_tokens_seen": 34597280, "step": 3085, "train_runtime": 5135.0435, "train_tokens_per_second": 6737.485 }, { "epoch": 1.4828703425931482, "grad_norm": 0.7427690625190735, "learning_rate": 2.546478019643797e-05, "loss": 0.4799, "num_input_tokens_seen": 34654488, "step": 3090, "train_runtime": 5145.9423, "train_tokens_per_second": 6734.333 }, { "epoch": 1.485270294594108, "grad_norm": 0.6483776569366455, "learning_rate": 2.540197789808168e-05, "loss": 0.4463, "num_input_tokens_seen": 34716120, "step": 3095, "train_runtime": 5158.485, "train_tokens_per_second": 6729.906 }, { "epoch": 1.487670246595068, "grad_norm": 0.5190485715866089, "learning_rate": 2.5339173062238774e-05, "loss": 0.4597, "num_input_tokens_seen": 34777640, "step": 3100, "train_runtime": 5171.0585, "train_tokens_per_second": 6725.439 }, { "epoch": 1.490070198596028, "grad_norm": 0.5749461054801941, "learning_rate": 2.5276366085364937e-05, "loss": 0.5084, "num_input_tokens_seen": 34831992, "step": 3105, "train_runtime": 5181.3994, "train_tokens_per_second": 6722.507 }, { "epoch": 1.4924701505969882, "grad_norm": 0.7715994119644165, "learning_rate": 2.52135573639294e-05, "loss": 0.4786, "num_input_tokens_seen": 34894736, "step": 3110, "train_runtime": 5191.5337, "train_tokens_per_second": 6721.47 }, { "epoch": 1.494870102597948, "grad_norm": 0.9101441502571106, "learning_rate": 2.5150747294412398e-05, "loss": 0.5175, "num_input_tokens_seen": 34951296, "step": 3115, "train_runtime": 5201.6456, "train_tokens_per_second": 6719.277 }, { "epoch": 1.497270054598908, "grad_norm": 0.7418543696403503, "learning_rate": 2.508793627330267e-05, "loss": 0.451, "num_input_tokens_seen": 35006168, "step": 3120, "train_runtime": 5211.4651, "train_tokens_per_second": 6717.145 }, { "epoch": 1.499670006599868, "grad_norm": 0.7147541642189026, "learning_rate": 2.502512469709497e-05, "loss": 0.5077, "num_input_tokens_seen": 35059176, "step": 3125, "train_runtime": 5221.3263, "train_tokens_per_second": 6714.611 }, { "epoch": 1.5020699586008281, "grad_norm": 0.5535465478897095, "learning_rate": 2.4962312962287544e-05, "loss": 0.4924, "num_input_tokens_seen": 35114264, "step": 3130, "train_runtime": 5230.6201, "train_tokens_per_second": 6713.212 }, { "epoch": 1.504469910601788, "grad_norm": 0.7213118672370911, "learning_rate": 2.4899501465379644e-05, "loss": 0.5004, "num_input_tokens_seen": 35168424, "step": 3135, "train_runtime": 5241.0072, "train_tokens_per_second": 6710.242 }, { "epoch": 1.506869862602748, "grad_norm": 0.7794874310493469, "learning_rate": 2.4836690602869044e-05, "loss": 0.5145, "num_input_tokens_seen": 35224296, "step": 3140, "train_runtime": 5250.7072, "train_tokens_per_second": 6708.486 }, { "epoch": 1.5092698146037078, "grad_norm": 0.9129291772842407, "learning_rate": 2.4773880771249477e-05, "loss": 0.4889, "num_input_tokens_seen": 35280088, "step": 3145, "train_runtime": 5261.3252, "train_tokens_per_second": 6705.552 }, { "epoch": 1.511669766604668, "grad_norm": 0.7600094079971313, "learning_rate": 2.4711072367008176e-05, "loss": 0.4967, "num_input_tokens_seen": 35340720, "step": 3150, "train_runtime": 5271.563, "train_tokens_per_second": 6704.031 }, { "epoch": 1.514069718605628, "grad_norm": 0.5989595055580139, "learning_rate": 2.4648265786623388e-05, "loss": 0.4843, "num_input_tokens_seen": 35397240, "step": 3155, "train_runtime": 5282.0778, "train_tokens_per_second": 6701.386 }, { "epoch": 1.5164696706065879, "grad_norm": 0.6885458827018738, "learning_rate": 2.4585461426561818e-05, "loss": 0.5011, "num_input_tokens_seen": 35460504, "step": 3160, "train_runtime": 5293.3254, "train_tokens_per_second": 6699.098 }, { "epoch": 1.5188696226075478, "grad_norm": 0.5150988698005676, "learning_rate": 2.452265968327618e-05, "loss": 0.512, "num_input_tokens_seen": 35517032, "step": 3165, "train_runtime": 5303.2586, "train_tokens_per_second": 6697.209 }, { "epoch": 1.5212695746085079, "grad_norm": 0.7029662132263184, "learning_rate": 2.4459860953202635e-05, "loss": 0.4807, "num_input_tokens_seen": 35567328, "step": 3170, "train_runtime": 5312.0452, "train_tokens_per_second": 6695.6 }, { "epoch": 1.523669526609468, "grad_norm": 0.6837257742881775, "learning_rate": 2.4397065632758374e-05, "loss": 0.4578, "num_input_tokens_seen": 35622032, "step": 3175, "train_runtime": 5321.4999, "train_tokens_per_second": 6693.983 }, { "epoch": 1.5260694786104279, "grad_norm": 0.7105430364608765, "learning_rate": 2.4334274118339014e-05, "loss": 0.512, "num_input_tokens_seen": 35684184, "step": 3180, "train_runtime": 5331.4522, "train_tokens_per_second": 6693.145 }, { "epoch": 1.5284694306113877, "grad_norm": 0.788021445274353, "learning_rate": 2.4271486806316173e-05, "loss": 0.5011, "num_input_tokens_seen": 35741544, "step": 3185, "train_runtime": 5341.311, "train_tokens_per_second": 6691.53 }, { "epoch": 1.5308693826123476, "grad_norm": 0.8190677165985107, "learning_rate": 2.420870409303495e-05, "loss": 0.4627, "num_input_tokens_seen": 35797096, "step": 3190, "train_runtime": 5350.6319, "train_tokens_per_second": 6690.256 }, { "epoch": 1.5332693346133077, "grad_norm": 0.9217768907546997, "learning_rate": 2.4145926374811395e-05, "loss": 0.4672, "num_input_tokens_seen": 35849520, "step": 3195, "train_runtime": 5360.1483, "train_tokens_per_second": 6688.158 }, { "epoch": 1.5356692866142678, "grad_norm": 0.729516327381134, "learning_rate": 2.4083154047930014e-05, "loss": 0.4645, "num_input_tokens_seen": 35908672, "step": 3200, "train_runtime": 5371.5509, "train_tokens_per_second": 6684.973 }, { "epoch": 1.5380692386152277, "grad_norm": 0.7882852554321289, "learning_rate": 2.4020387508641322e-05, "loss": 0.4833, "num_input_tokens_seen": 35963328, "step": 3205, "train_runtime": 5382.0522, "train_tokens_per_second": 6682.085 }, { "epoch": 1.5404691906161876, "grad_norm": 0.6502909660339355, "learning_rate": 2.3957627153159277e-05, "loss": 0.4763, "num_input_tokens_seen": 36021192, "step": 3210, "train_runtime": 5392.0941, "train_tokens_per_second": 6680.372 }, { "epoch": 1.5428691426171477, "grad_norm": 0.8590161204338074, "learning_rate": 2.3894873377658788e-05, "loss": 0.4768, "num_input_tokens_seen": 36078448, "step": 3215, "train_runtime": 5402.6273, "train_tokens_per_second": 6677.945 }, { "epoch": 1.5452690946181078, "grad_norm": 1.034970760345459, "learning_rate": 2.383212657827324e-05, "loss": 0.502, "num_input_tokens_seen": 36132656, "step": 3220, "train_runtime": 5412.5546, "train_tokens_per_second": 6675.712 }, { "epoch": 1.5476690466190677, "grad_norm": 0.5326734185218811, "learning_rate": 2.3769387151092e-05, "loss": 0.4883, "num_input_tokens_seen": 36191712, "step": 3225, "train_runtime": 5422.7637, "train_tokens_per_second": 6674.034 }, { "epoch": 1.5500689986200276, "grad_norm": 0.9736510515213013, "learning_rate": 2.370665549215787e-05, "loss": 0.5341, "num_input_tokens_seen": 36245160, "step": 3230, "train_runtime": 5432.9922, "train_tokens_per_second": 6671.307 }, { "epoch": 1.5524689506209874, "grad_norm": 0.6917448043823242, "learning_rate": 2.3643931997464617e-05, "loss": 0.4849, "num_input_tokens_seen": 36303576, "step": 3235, "train_runtime": 5443.3631, "train_tokens_per_second": 6669.328 }, { "epoch": 1.5548689026219475, "grad_norm": 0.9082401394844055, "learning_rate": 2.35812170629545e-05, "loss": 0.4583, "num_input_tokens_seen": 36360840, "step": 3240, "train_runtime": 5453.245, "train_tokens_per_second": 6667.744 }, { "epoch": 1.5572688546229077, "grad_norm": 0.6470857262611389, "learning_rate": 2.351851108451571e-05, "loss": 0.4604, "num_input_tokens_seen": 36422200, "step": 3245, "train_runtime": 5463.4424, "train_tokens_per_second": 6666.529 }, { "epoch": 1.5596688066238675, "grad_norm": 0.8061736822128296, "learning_rate": 2.34558144579799e-05, "loss": 0.5048, "num_input_tokens_seen": 36476632, "step": 3250, "train_runtime": 5473.1542, "train_tokens_per_second": 6664.645 }, { "epoch": 1.5620687586248274, "grad_norm": 0.7560340762138367, "learning_rate": 2.339312757911973e-05, "loss": 0.5113, "num_input_tokens_seen": 36529792, "step": 3255, "train_runtime": 5482.3009, "train_tokens_per_second": 6663.223 }, { "epoch": 1.5644687106257875, "grad_norm": 0.7179074883460999, "learning_rate": 2.3330450843646296e-05, "loss": 0.5005, "num_input_tokens_seen": 36586016, "step": 3260, "train_runtime": 5492.5745, "train_tokens_per_second": 6660.996 }, { "epoch": 1.5668686626267476, "grad_norm": 0.5973109602928162, "learning_rate": 2.3267784647206658e-05, "loss": 0.4804, "num_input_tokens_seen": 36641112, "step": 3265, "train_runtime": 5502.2894, "train_tokens_per_second": 6659.248 }, { "epoch": 1.5692686146277075, "grad_norm": 0.9687879681587219, "learning_rate": 2.3205129385381355e-05, "loss": 0.4928, "num_input_tokens_seen": 36697088, "step": 3270, "train_runtime": 5512.5707, "train_tokens_per_second": 6656.983 }, { "epoch": 1.5716685666286674, "grad_norm": 0.6984615325927734, "learning_rate": 2.3142485453681925e-05, "loss": 0.4872, "num_input_tokens_seen": 36755920, "step": 3275, "train_runtime": 5523.731, "train_tokens_per_second": 6654.184 }, { "epoch": 1.5740685186296273, "grad_norm": 0.7793405652046204, "learning_rate": 2.307985324754835e-05, "loss": 0.5391, "num_input_tokens_seen": 36811304, "step": 3280, "train_runtime": 5534.0048, "train_tokens_per_second": 6651.838 }, { "epoch": 1.5764684706305874, "grad_norm": 0.7121679782867432, "learning_rate": 2.3017233162346608e-05, "loss": 0.4955, "num_input_tokens_seen": 36868680, "step": 3285, "train_runtime": 5543.499, "train_tokens_per_second": 6650.796 }, { "epoch": 1.5788684226315475, "grad_norm": 0.9568763375282288, "learning_rate": 2.295462559336618e-05, "loss": 0.4775, "num_input_tokens_seen": 36925400, "step": 3290, "train_runtime": 5553.1982, "train_tokens_per_second": 6649.394 }, { "epoch": 1.5812683746325074, "grad_norm": 0.5952507257461548, "learning_rate": 2.2892030935817517e-05, "loss": 0.457, "num_input_tokens_seen": 36984032, "step": 3295, "train_runtime": 5563.7199, "train_tokens_per_second": 6647.357 }, { "epoch": 1.5836683266334672, "grad_norm": 0.8516509532928467, "learning_rate": 2.2829449584829558e-05, "loss": 0.5231, "num_input_tokens_seen": 37038928, "step": 3300, "train_runtime": 5573.6606, "train_tokens_per_second": 6645.35 }, { "epoch": 1.5860682786344273, "grad_norm": 0.569814920425415, "learning_rate": 2.2766881935447275e-05, "loss": 0.5044, "num_input_tokens_seen": 37092208, "step": 3305, "train_runtime": 5583.51, "train_tokens_per_second": 6643.17 }, { "epoch": 1.5884682306353874, "grad_norm": 0.8386396169662476, "learning_rate": 2.2704328382629138e-05, "loss": 0.4753, "num_input_tokens_seen": 37147680, "step": 3310, "train_runtime": 5592.6848, "train_tokens_per_second": 6642.191 }, { "epoch": 1.5908681826363473, "grad_norm": 0.7655364871025085, "learning_rate": 2.264178932124462e-05, "loss": 0.4796, "num_input_tokens_seen": 37203656, "step": 3315, "train_runtime": 5601.9649, "train_tokens_per_second": 6641.18 }, { "epoch": 1.5932681346373072, "grad_norm": 0.8739466071128845, "learning_rate": 2.257926514607171e-05, "loss": 0.4852, "num_input_tokens_seen": 37263520, "step": 3320, "train_runtime": 5612.1576, "train_tokens_per_second": 6639.785 }, { "epoch": 1.595668086638267, "grad_norm": 0.6632476449012756, "learning_rate": 2.2516756251794463e-05, "loss": 0.5121, "num_input_tokens_seen": 37318192, "step": 3325, "train_runtime": 5621.7888, "train_tokens_per_second": 6638.135 }, { "epoch": 1.5980680386392272, "grad_norm": 0.7768703699111938, "learning_rate": 2.245426303300044e-05, "loss": 0.5128, "num_input_tokens_seen": 37374224, "step": 3330, "train_runtime": 5631.8308, "train_tokens_per_second": 6636.248 }, { "epoch": 1.6004679906401873, "grad_norm": 0.7217375636100769, "learning_rate": 2.2391785884178256e-05, "loss": 0.4835, "num_input_tokens_seen": 37435240, "step": 3335, "train_runtime": 5642.3272, "train_tokens_per_second": 6634.716 }, { "epoch": 1.6028679426411472, "grad_norm": 0.5615156888961792, "learning_rate": 2.2329325199715114e-05, "loss": 0.4575, "num_input_tokens_seen": 37492120, "step": 3340, "train_runtime": 5652.3686, "train_tokens_per_second": 6632.993 }, { "epoch": 1.605267894642107, "grad_norm": 0.826392650604248, "learning_rate": 2.226688137389425e-05, "loss": 0.4922, "num_input_tokens_seen": 37548408, "step": 3345, "train_runtime": 5662.4517, "train_tokens_per_second": 6631.122 }, { "epoch": 1.6076678466430672, "grad_norm": 0.589180052280426, "learning_rate": 2.220445480089248e-05, "loss": 0.4807, "num_input_tokens_seen": 37610280, "step": 3350, "train_runtime": 5674.3947, "train_tokens_per_second": 6628.069 }, { "epoch": 1.6100677986440273, "grad_norm": 0.8704653978347778, "learning_rate": 2.214204587477774e-05, "loss": 0.5322, "num_input_tokens_seen": 37668512, "step": 3355, "train_runtime": 5684.5435, "train_tokens_per_second": 6626.48 }, { "epoch": 1.6124677506449872, "grad_norm": 0.7563439607620239, "learning_rate": 2.207965498950655e-05, "loss": 0.4843, "num_input_tokens_seen": 37727112, "step": 3360, "train_runtime": 5694.2908, "train_tokens_per_second": 6625.428 }, { "epoch": 1.614867702645947, "grad_norm": 0.7133488059043884, "learning_rate": 2.2017282538921556e-05, "loss": 0.4732, "num_input_tokens_seen": 37780192, "step": 3365, "train_runtime": 5703.5817, "train_tokens_per_second": 6623.942 }, { "epoch": 1.617267654646907, "grad_norm": 0.8156766295433044, "learning_rate": 2.1954928916749006e-05, "loss": 0.5115, "num_input_tokens_seen": 37839376, "step": 3370, "train_runtime": 5713.8648, "train_tokens_per_second": 6622.379 }, { "epoch": 1.619667606647867, "grad_norm": 0.7063591480255127, "learning_rate": 2.1892594516596343e-05, "loss": 0.5177, "num_input_tokens_seen": 37894296, "step": 3375, "train_runtime": 5723.4986, "train_tokens_per_second": 6620.827 }, { "epoch": 1.6220675586488271, "grad_norm": 0.8170085549354553, "learning_rate": 2.183027973194964e-05, "loss": 0.4848, "num_input_tokens_seen": 37951552, "step": 3380, "train_runtime": 5733.3985, "train_tokens_per_second": 6619.382 }, { "epoch": 1.624467510649787, "grad_norm": 0.6729702353477478, "learning_rate": 2.176798495617114e-05, "loss": 0.4927, "num_input_tokens_seen": 38011968, "step": 3385, "train_runtime": 5743.0143, "train_tokens_per_second": 6618.818 }, { "epoch": 1.6268674626507469, "grad_norm": 0.7593095898628235, "learning_rate": 2.1705710582496815e-05, "loss": 0.4888, "num_input_tokens_seen": 38067280, "step": 3390, "train_runtime": 5752.7516, "train_tokens_per_second": 6617.23 }, { "epoch": 1.629267414651707, "grad_norm": 1.1748439073562622, "learning_rate": 2.1643457004033807e-05, "loss": 0.5178, "num_input_tokens_seen": 38124912, "step": 3395, "train_runtime": 5763.3474, "train_tokens_per_second": 6615.064 }, { "epoch": 1.631667366652667, "grad_norm": 0.8947390913963318, "learning_rate": 2.1581224613758005e-05, "loss": 0.5112, "num_input_tokens_seen": 38178808, "step": 3400, "train_runtime": 5772.7591, "train_tokens_per_second": 6613.615 }, { "epoch": 1.634067318653627, "grad_norm": 0.702033519744873, "learning_rate": 2.1519013804511562e-05, "loss": 0.5106, "num_input_tokens_seen": 38233976, "step": 3405, "train_runtime": 5782.5071, "train_tokens_per_second": 6612.007 }, { "epoch": 1.6364672706545869, "grad_norm": 0.9868459105491638, "learning_rate": 2.145682496900039e-05, "loss": 0.501, "num_input_tokens_seen": 38291736, "step": 3410, "train_runtime": 5792.3708, "train_tokens_per_second": 6610.719 }, { "epoch": 1.6388672226555467, "grad_norm": 1.0660921335220337, "learning_rate": 2.1394658499791684e-05, "loss": 0.4836, "num_input_tokens_seen": 38347056, "step": 3415, "train_runtime": 5800.6961, "train_tokens_per_second": 6610.768 }, { "epoch": 1.6412671746565068, "grad_norm": 0.809270441532135, "learning_rate": 2.1332514789311448e-05, "loss": 0.5138, "num_input_tokens_seen": 38399184, "step": 3420, "train_runtime": 5808.2869, "train_tokens_per_second": 6611.103 }, { "epoch": 1.643667126657467, "grad_norm": 0.7200763821601868, "learning_rate": 2.1270394229842044e-05, "loss": 0.4522, "num_input_tokens_seen": 38456896, "step": 3425, "train_runtime": 5816.1423, "train_tokens_per_second": 6612.097 }, { "epoch": 1.6460670786584268, "grad_norm": 0.8460598587989807, "learning_rate": 2.1208297213519686e-05, "loss": 0.4847, "num_input_tokens_seen": 38512168, "step": 3430, "train_runtime": 5823.8311, "train_tokens_per_second": 6612.858 }, { "epoch": 1.6484670306593867, "grad_norm": 0.7235488891601562, "learning_rate": 2.1146224132331944e-05, "loss": 0.4733, "num_input_tokens_seen": 38573240, "step": 3435, "train_runtime": 5832.0444, "train_tokens_per_second": 6614.017 }, { "epoch": 1.6508669826603468, "grad_norm": 0.8452171087265015, "learning_rate": 2.1084175378115344e-05, "loss": 0.5236, "num_input_tokens_seen": 38624080, "step": 3440, "train_runtime": 5839.2065, "train_tokens_per_second": 6614.611 }, { "epoch": 1.653266934661307, "grad_norm": 0.7488996982574463, "learning_rate": 2.1022151342552815e-05, "loss": 0.5226, "num_input_tokens_seen": 38679488, "step": 3445, "train_runtime": 5846.9076, "train_tokens_per_second": 6615.375 }, { "epoch": 1.6556668866622668, "grad_norm": 0.7845451235771179, "learning_rate": 2.0960152417171243e-05, "loss": 0.4533, "num_input_tokens_seen": 38736136, "step": 3450, "train_runtime": 5855.1703, "train_tokens_per_second": 6615.715 }, { "epoch": 1.6580668386632267, "grad_norm": 0.9303568005561829, "learning_rate": 2.089817899333904e-05, "loss": 0.483, "num_input_tokens_seen": 38788592, "step": 3455, "train_runtime": 5862.705, "train_tokens_per_second": 6616.16 }, { "epoch": 1.6604667906641866, "grad_norm": 0.7032025456428528, "learning_rate": 2.083623146226362e-05, "loss": 0.4556, "num_input_tokens_seen": 38846528, "step": 3460, "train_runtime": 5870.8119, "train_tokens_per_second": 6616.892 }, { "epoch": 1.6628667426651467, "grad_norm": 1.0094935894012451, "learning_rate": 2.0774310214988942e-05, "loss": 0.545, "num_input_tokens_seen": 38896768, "step": 3465, "train_runtime": 5879.1312, "train_tokens_per_second": 6616.074 }, { "epoch": 1.6652666946661068, "grad_norm": 0.8336009979248047, "learning_rate": 2.071241564239305e-05, "loss": 0.4741, "num_input_tokens_seen": 38952672, "step": 3470, "train_runtime": 5888.8317, "train_tokens_per_second": 6614.669 }, { "epoch": 1.6676666466670667, "grad_norm": 0.6727505326271057, "learning_rate": 2.0650548135185618e-05, "loss": 0.4831, "num_input_tokens_seen": 39007376, "step": 3475, "train_runtime": 5898.9169, "train_tokens_per_second": 6612.634 }, { "epoch": 1.6700665986680265, "grad_norm": 0.7282326221466064, "learning_rate": 2.0588708083905468e-05, "loss": 0.5174, "num_input_tokens_seen": 39064568, "step": 3480, "train_runtime": 5909.1279, "train_tokens_per_second": 6610.886 }, { "epoch": 1.6724665506689866, "grad_norm": 0.6648644208908081, "learning_rate": 2.0526895878918077e-05, "loss": 0.5055, "num_input_tokens_seen": 39117320, "step": 3485, "train_runtime": 5918.494, "train_tokens_per_second": 6609.337 }, { "epoch": 1.6748665026699467, "grad_norm": 0.8427759408950806, "learning_rate": 2.0465111910413192e-05, "loss": 0.5316, "num_input_tokens_seen": 39171840, "step": 3490, "train_runtime": 5927.2143, "train_tokens_per_second": 6608.811 }, { "epoch": 1.6772664546709066, "grad_norm": 0.6149888634681702, "learning_rate": 2.040335656840228e-05, "loss": 0.4517, "num_input_tokens_seen": 39226624, "step": 3495, "train_runtime": 5935.062, "train_tokens_per_second": 6609.303 }, { "epoch": 1.6796664066718665, "grad_norm": 0.9388527870178223, "learning_rate": 2.03416302427161e-05, "loss": 0.5067, "num_input_tokens_seen": 39284168, "step": 3500, "train_runtime": 5942.9844, "train_tokens_per_second": 6610.175 }, { "epoch": 1.6820663586728264, "grad_norm": 0.8548518419265747, "learning_rate": 2.027993332300227e-05, "loss": 0.5064, "num_input_tokens_seen": 39340120, "step": 3505, "train_runtime": 5951.1485, "train_tokens_per_second": 6610.509 }, { "epoch": 1.6844663106737865, "grad_norm": 0.6581935882568359, "learning_rate": 2.021826619872278e-05, "loss": 0.4523, "num_input_tokens_seen": 39399136, "step": 3510, "train_runtime": 5959.3451, "train_tokens_per_second": 6611.32 }, { "epoch": 1.6868662626747466, "grad_norm": 0.6218190789222717, "learning_rate": 2.0156629259151515e-05, "loss": 0.4804, "num_input_tokens_seen": 39456808, "step": 3515, "train_runtime": 5967.3525, "train_tokens_per_second": 6612.113 }, { "epoch": 1.6892662146757065, "grad_norm": 0.8073654174804688, "learning_rate": 2.0095022893371826e-05, "loss": 0.4838, "num_input_tokens_seen": 39516000, "step": 3520, "train_runtime": 5975.9682, "train_tokens_per_second": 6612.485 }, { "epoch": 1.6916661666766664, "grad_norm": 0.7715812921524048, "learning_rate": 2.0033447490274083e-05, "loss": 0.4669, "num_input_tokens_seen": 39569280, "step": 3525, "train_runtime": 5983.6596, "train_tokens_per_second": 6612.89 }, { "epoch": 1.6940661186776265, "grad_norm": 0.8139777183532715, "learning_rate": 1.99719034385532e-05, "loss": 0.5031, "num_input_tokens_seen": 39625464, "step": 3530, "train_runtime": 5991.822, "train_tokens_per_second": 6613.258 }, { "epoch": 1.6964660706785866, "grad_norm": 0.7577908635139465, "learning_rate": 1.9910391126706158e-05, "loss": 0.4991, "num_input_tokens_seen": 39676928, "step": 3535, "train_runtime": 5999.8126, "train_tokens_per_second": 6613.028 }, { "epoch": 1.6988660226795465, "grad_norm": 0.5273564457893372, "learning_rate": 1.9848910943029624e-05, "loss": 0.4548, "num_input_tokens_seen": 39734168, "step": 3540, "train_runtime": 6008.5552, "train_tokens_per_second": 6612.932 }, { "epoch": 1.7012659746805063, "grad_norm": 0.8542927503585815, "learning_rate": 1.978746327561741e-05, "loss": 0.4886, "num_input_tokens_seen": 39795520, "step": 3545, "train_runtime": 6017.0289, "train_tokens_per_second": 6613.816 }, { "epoch": 1.7036659266814662, "grad_norm": 0.6213528513908386, "learning_rate": 1.972604851235811e-05, "loss": 0.4737, "num_input_tokens_seen": 39851264, "step": 3550, "train_runtime": 6025.5762, "train_tokens_per_second": 6613.685 }, { "epoch": 1.7060658786824263, "grad_norm": 0.7265267372131348, "learning_rate": 1.9664667040932577e-05, "loss": 0.5013, "num_input_tokens_seen": 39904120, "step": 3555, "train_runtime": 6033.0567, "train_tokens_per_second": 6614.246 }, { "epoch": 1.7084658306833864, "grad_norm": 0.8746877312660217, "learning_rate": 1.9603319248811542e-05, "loss": 0.4541, "num_input_tokens_seen": 39957104, "step": 3560, "train_runtime": 6040.7403, "train_tokens_per_second": 6614.604 }, { "epoch": 1.7108657826843463, "grad_norm": 0.690990686416626, "learning_rate": 1.9542005523253103e-05, "loss": 0.5057, "num_input_tokens_seen": 40014640, "step": 3565, "train_runtime": 6048.7384, "train_tokens_per_second": 6615.37 }, { "epoch": 1.7132657346853062, "grad_norm": 0.5996572375297546, "learning_rate": 1.948072625130032e-05, "loss": 0.5071, "num_input_tokens_seen": 40071928, "step": 3570, "train_runtime": 6056.481, "train_tokens_per_second": 6616.371 }, { "epoch": 1.7156656866862663, "grad_norm": 1.0447416305541992, "learning_rate": 1.9419481819778785e-05, "loss": 0.5099, "num_input_tokens_seen": 40125856, "step": 3575, "train_runtime": 6063.7113, "train_tokens_per_second": 6617.376 }, { "epoch": 1.7180656386872264, "grad_norm": 1.0107308626174927, "learning_rate": 1.9358272615294153e-05, "loss": 0.4823, "num_input_tokens_seen": 40181760, "step": 3580, "train_runtime": 6071.812, "train_tokens_per_second": 6617.754 }, { "epoch": 1.7204655906881863, "grad_norm": 0.7742976546287537, "learning_rate": 1.9297099024229675e-05, "loss": 0.5261, "num_input_tokens_seen": 40236472, "step": 3585, "train_runtime": 6079.4422, "train_tokens_per_second": 6618.448 }, { "epoch": 1.7228655426891462, "grad_norm": 0.7820068597793579, "learning_rate": 1.923596143274385e-05, "loss": 0.4674, "num_input_tokens_seen": 40295104, "step": 3590, "train_runtime": 6087.6682, "train_tokens_per_second": 6619.136 }, { "epoch": 1.725265494690106, "grad_norm": 0.6710221171379089, "learning_rate": 1.9174860226767876e-05, "loss": 0.5175, "num_input_tokens_seen": 40345800, "step": 3595, "train_runtime": 6095.1949, "train_tokens_per_second": 6619.28 }, { "epoch": 1.7276654466910661, "grad_norm": 0.7176735401153564, "learning_rate": 1.91137957920033e-05, "loss": 0.5171, "num_input_tokens_seen": 40402256, "step": 3600, "train_runtime": 6103.2553, "train_tokens_per_second": 6619.788 }, { "epoch": 1.7300653986920262, "grad_norm": 0.9111002087593079, "learning_rate": 1.905276851391954e-05, "loss": 0.4883, "num_input_tokens_seen": 40458888, "step": 3605, "train_runtime": 6111.7843, "train_tokens_per_second": 6619.816 }, { "epoch": 1.7324653506929861, "grad_norm": 0.7179924845695496, "learning_rate": 1.899177877775146e-05, "loss": 0.4852, "num_input_tokens_seen": 40516112, "step": 3610, "train_runtime": 6120.0523, "train_tokens_per_second": 6620.223 }, { "epoch": 1.734865302693946, "grad_norm": 0.7747234106063843, "learning_rate": 1.8930826968496943e-05, "loss": 0.5067, "num_input_tokens_seen": 40572824, "step": 3615, "train_runtime": 6128.2202, "train_tokens_per_second": 6620.654 }, { "epoch": 1.7372652546949061, "grad_norm": 0.7451600432395935, "learning_rate": 1.8869913470914448e-05, "loss": 0.4881, "num_input_tokens_seen": 40631656, "step": 3620, "train_runtime": 6136.6832, "train_tokens_per_second": 6621.11 }, { "epoch": 1.7396652066958662, "grad_norm": 0.9544029235839844, "learning_rate": 1.880903866952062e-05, "loss": 0.5206, "num_input_tokens_seen": 40687064, "step": 3625, "train_runtime": 6144.9437, "train_tokens_per_second": 6621.227 }, { "epoch": 1.742065158696826, "grad_norm": 0.7754983901977539, "learning_rate": 1.8748202948587813e-05, "loss": 0.4979, "num_input_tokens_seen": 40743400, "step": 3630, "train_runtime": 6153.1589, "train_tokens_per_second": 6621.542 }, { "epoch": 1.744465110697786, "grad_norm": 0.7278411388397217, "learning_rate": 1.8687406692141673e-05, "loss": 0.4632, "num_input_tokens_seen": 40802376, "step": 3635, "train_runtime": 6161.8706, "train_tokens_per_second": 6621.751 }, { "epoch": 1.7468650626987459, "grad_norm": 0.6943597793579102, "learning_rate": 1.8626650283958762e-05, "loss": 0.4851, "num_input_tokens_seen": 40854616, "step": 3640, "train_runtime": 6169.6683, "train_tokens_per_second": 6621.85 }, { "epoch": 1.749265014699706, "grad_norm": 0.8194776177406311, "learning_rate": 1.8565934107564068e-05, "loss": 0.4573, "num_input_tokens_seen": 40911032, "step": 3645, "train_runtime": 6178.2227, "train_tokens_per_second": 6621.812 }, { "epoch": 1.751664966700666, "grad_norm": 0.8596030473709106, "learning_rate": 1.8505258546228623e-05, "loss": 0.4862, "num_input_tokens_seen": 40970312, "step": 3650, "train_runtime": 6186.5562, "train_tokens_per_second": 6622.475 }, { "epoch": 1.754064918701626, "grad_norm": 0.6645076274871826, "learning_rate": 1.8444623982967098e-05, "loss": 0.4606, "num_input_tokens_seen": 41028576, "step": 3655, "train_runtime": 6195.0286, "train_tokens_per_second": 6622.823 }, { "epoch": 1.7564648707025858, "grad_norm": 0.668375551700592, "learning_rate": 1.8384030800535332e-05, "loss": 0.4504, "num_input_tokens_seen": 41088352, "step": 3660, "train_runtime": 6203.7002, "train_tokens_per_second": 6623.201 }, { "epoch": 1.758864822703546, "grad_norm": 0.6859973669052124, "learning_rate": 1.832347938142796e-05, "loss": 0.5408, "num_input_tokens_seen": 41144096, "step": 3665, "train_runtime": 6211.4168, "train_tokens_per_second": 6623.947 }, { "epoch": 1.761264774704506, "grad_norm": 0.8838623762130737, "learning_rate": 1.8262970107875994e-05, "loss": 0.4798, "num_input_tokens_seen": 41199488, "step": 3670, "train_runtime": 6219.0044, "train_tokens_per_second": 6624.772 }, { "epoch": 1.763664726705466, "grad_norm": 0.8268917202949524, "learning_rate": 1.8202503361844393e-05, "loss": 0.5226, "num_input_tokens_seen": 41254392, "step": 3675, "train_runtime": 6226.8544, "train_tokens_per_second": 6625.238 }, { "epoch": 1.7660646787064258, "grad_norm": 0.9109818339347839, "learning_rate": 1.8142079525029672e-05, "loss": 0.5196, "num_input_tokens_seen": 41310952, "step": 3680, "train_runtime": 6234.9064, "train_tokens_per_second": 6625.753 }, { "epoch": 1.7684646307073857, "grad_norm": 0.8743447661399841, "learning_rate": 1.808169897885745e-05, "loss": 0.4813, "num_input_tokens_seen": 41363784, "step": 3685, "train_runtime": 6242.8579, "train_tokens_per_second": 6625.777 }, { "epoch": 1.7708645827083458, "grad_norm": 0.8028547763824463, "learning_rate": 1.802136210448012e-05, "loss": 0.4864, "num_input_tokens_seen": 41418736, "step": 3690, "train_runtime": 6250.665, "train_tokens_per_second": 6626.293 }, { "epoch": 1.773264534709306, "grad_norm": 0.8359841108322144, "learning_rate": 1.796106928277437e-05, "loss": 0.451, "num_input_tokens_seen": 41480096, "step": 3695, "train_runtime": 6259.4151, "train_tokens_per_second": 6626.833 }, { "epoch": 1.7756644867102658, "grad_norm": 0.6087771654129028, "learning_rate": 1.7900820894338786e-05, "loss": 0.4405, "num_input_tokens_seen": 41535640, "step": 3700, "train_runtime": 6267.1679, "train_tokens_per_second": 6627.498 }, { "epoch": 1.7780644387112257, "grad_norm": 0.7156651020050049, "learning_rate": 1.7840617319491527e-05, "loss": 0.51, "num_input_tokens_seen": 41592104, "step": 3705, "train_runtime": 6275.4346, "train_tokens_per_second": 6627.765 }, { "epoch": 1.7804643907121858, "grad_norm": 0.7992216348648071, "learning_rate": 1.7780458938267807e-05, "loss": 0.4488, "num_input_tokens_seen": 41649776, "step": 3710, "train_runtime": 6283.7454, "train_tokens_per_second": 6628.177 }, { "epoch": 1.7828643427131459, "grad_norm": 0.7933105230331421, "learning_rate": 1.772034613041758e-05, "loss": 0.4581, "num_input_tokens_seen": 41707280, "step": 3715, "train_runtime": 6291.9245, "train_tokens_per_second": 6628.7 }, { "epoch": 1.7852642947141057, "grad_norm": 0.8297272324562073, "learning_rate": 1.7660279275403124e-05, "loss": 0.4598, "num_input_tokens_seen": 41765768, "step": 3720, "train_runtime": 6300.2081, "train_tokens_per_second": 6629.268 }, { "epoch": 1.7876642467150656, "grad_norm": 0.6287772059440613, "learning_rate": 1.7600258752396626e-05, "loss": 0.4783, "num_input_tokens_seen": 41819576, "step": 3725, "train_runtime": 6308.2419, "train_tokens_per_second": 6629.355 }, { "epoch": 1.7900641987160257, "grad_norm": 0.7246582508087158, "learning_rate": 1.754028494027782e-05, "loss": 0.4821, "num_input_tokens_seen": 41876528, "step": 3730, "train_runtime": 6316.3849, "train_tokens_per_second": 6629.825 }, { "epoch": 1.7924641507169856, "grad_norm": 0.752740204334259, "learning_rate": 1.748035821763154e-05, "loss": 0.4984, "num_input_tokens_seen": 41933488, "step": 3735, "train_runtime": 6324.4895, "train_tokens_per_second": 6630.336 }, { "epoch": 1.7948641027179457, "grad_norm": 0.7370868921279907, "learning_rate": 1.7420478962745424e-05, "loss": 0.4707, "num_input_tokens_seen": 41989264, "step": 3740, "train_runtime": 6332.3923, "train_tokens_per_second": 6630.869 }, { "epoch": 1.7972640547189056, "grad_norm": 0.5607179999351501, "learning_rate": 1.736064755360742e-05, "loss": 0.5113, "num_input_tokens_seen": 42045264, "step": 3745, "train_runtime": 6340.5688, "train_tokens_per_second": 6631.15 }, { "epoch": 1.7996640067198655, "grad_norm": 0.851588785648346, "learning_rate": 1.7300864367903462e-05, "loss": 0.4807, "num_input_tokens_seen": 42103712, "step": 3750, "train_runtime": 6348.5367, "train_tokens_per_second": 6632.034 }, { "epoch": 1.8020639587208256, "grad_norm": 0.6969419717788696, "learning_rate": 1.7241129783015108e-05, "loss": 0.5129, "num_input_tokens_seen": 42156568, "step": 3755, "train_runtime": 6356.2935, "train_tokens_per_second": 6632.256 }, { "epoch": 1.8044639107217857, "grad_norm": 0.705589771270752, "learning_rate": 1.7181444176017077e-05, "loss": 0.4709, "num_input_tokens_seen": 42214056, "step": 3760, "train_runtime": 6364.5049, "train_tokens_per_second": 6632.732 }, { "epoch": 1.8068638627227456, "grad_norm": 0.9332826733589172, "learning_rate": 1.7121807923674926e-05, "loss": 0.4609, "num_input_tokens_seen": 42270872, "step": 3765, "train_runtime": 6372.8289, "train_tokens_per_second": 6632.984 }, { "epoch": 1.8092638147237055, "grad_norm": 0.6459842324256897, "learning_rate": 1.7062221402442678e-05, "loss": 0.5136, "num_input_tokens_seen": 42324392, "step": 3770, "train_runtime": 6380.6203, "train_tokens_per_second": 6633.272 }, { "epoch": 1.8116637667246656, "grad_norm": 0.8273303508758545, "learning_rate": 1.7002684988460417e-05, "loss": 0.465, "num_input_tokens_seen": 42381736, "step": 3775, "train_runtime": 6388.9298, "train_tokens_per_second": 6633.621 }, { "epoch": 1.8140637187256254, "grad_norm": 0.6155418157577515, "learning_rate": 1.694319905755193e-05, "loss": 0.4924, "num_input_tokens_seen": 42442312, "step": 3780, "train_runtime": 6399.8787, "train_tokens_per_second": 6631.737 }, { "epoch": 1.8164636707265855, "grad_norm": 1.0188329219818115, "learning_rate": 1.6883763985222305e-05, "loss": 0.468, "num_input_tokens_seen": 42496896, "step": 3785, "train_runtime": 6409.4045, "train_tokens_per_second": 6630.397 }, { "epoch": 1.8188636227275454, "grad_norm": 0.604070782661438, "learning_rate": 1.6824380146655633e-05, "loss": 0.5271, "num_input_tokens_seen": 42554600, "step": 3790, "train_runtime": 6419.249, "train_tokens_per_second": 6629.218 }, { "epoch": 1.8212635747285053, "grad_norm": 0.7463460564613342, "learning_rate": 1.6765047916712545e-05, "loss": 0.5052, "num_input_tokens_seen": 42611168, "step": 3795, "train_runtime": 6429.2745, "train_tokens_per_second": 6627.679 }, { "epoch": 1.8236635267294654, "grad_norm": 0.6504276990890503, "learning_rate": 1.6705767669927914e-05, "loss": 0.4572, "num_input_tokens_seen": 42668344, "step": 3800, "train_runtime": 6440.1221, "train_tokens_per_second": 6625.394 }, { "epoch": 1.8260634787304255, "grad_norm": 0.8336795568466187, "learning_rate": 1.6646539780508478e-05, "loss": 0.4514, "num_input_tokens_seen": 42725880, "step": 3805, "train_runtime": 6450.437, "train_tokens_per_second": 6623.719 }, { "epoch": 1.8284634307313854, "grad_norm": 0.6106321215629578, "learning_rate": 1.658736462233045e-05, "loss": 0.4553, "num_input_tokens_seen": 42785824, "step": 3810, "train_runtime": 6460.6963, "train_tokens_per_second": 6622.479 }, { "epoch": 1.8308633827323453, "grad_norm": 0.9887316823005676, "learning_rate": 1.6528242568937174e-05, "loss": 0.5347, "num_input_tokens_seen": 42840440, "step": 3815, "train_runtime": 6470.5401, "train_tokens_per_second": 6620.845 }, { "epoch": 1.8332633347333054, "grad_norm": 0.6800510287284851, "learning_rate": 1.6469173993536787e-05, "loss": 0.5028, "num_input_tokens_seen": 42893576, "step": 3820, "train_runtime": 6480.2024, "train_tokens_per_second": 6619.172 }, { "epoch": 1.8356632867342653, "grad_norm": 0.5527476668357849, "learning_rate": 1.641015926899985e-05, "loss": 0.4997, "num_input_tokens_seen": 42952744, "step": 3825, "train_runtime": 6490.1332, "train_tokens_per_second": 6618.161 }, { "epoch": 1.8380632387352254, "grad_norm": 0.833662211894989, "learning_rate": 1.6351198767856978e-05, "loss": 0.5076, "num_input_tokens_seen": 43010768, "step": 3830, "train_runtime": 6498.8469, "train_tokens_per_second": 6618.215 }, { "epoch": 1.8404631907361853, "grad_norm": 0.8122771978378296, "learning_rate": 1.6292292862296482e-05, "loss": 0.4789, "num_input_tokens_seen": 43067120, "step": 3835, "train_runtime": 6506.9502, "train_tokens_per_second": 6618.634 }, { "epoch": 1.8428631427371451, "grad_norm": 0.7453281283378601, "learning_rate": 1.6233441924162085e-05, "loss": 0.472, "num_input_tokens_seen": 43124944, "step": 3840, "train_runtime": 6514.9238, "train_tokens_per_second": 6619.409 }, { "epoch": 1.8452630947381052, "grad_norm": 0.7798519730567932, "learning_rate": 1.617464632495048e-05, "loss": 0.4968, "num_input_tokens_seen": 43181496, "step": 3845, "train_runtime": 6522.7215, "train_tokens_per_second": 6620.165 }, { "epoch": 1.8476630467390653, "grad_norm": 0.770413339138031, "learning_rate": 1.611590643580906e-05, "loss": 0.4799, "num_input_tokens_seen": 43236224, "step": 3850, "train_runtime": 6531.17, "train_tokens_per_second": 6619.981 }, { "epoch": 1.8500629987400252, "grad_norm": 0.7712330222129822, "learning_rate": 1.6057222627533554e-05, "loss": 0.4825, "num_input_tokens_seen": 43291464, "step": 3855, "train_runtime": 6539.3507, "train_tokens_per_second": 6620.147 }, { "epoch": 1.852462950740985, "grad_norm": 0.667767345905304, "learning_rate": 1.599859527056566e-05, "loss": 0.4525, "num_input_tokens_seen": 43349520, "step": 3860, "train_runtime": 6547.7333, "train_tokens_per_second": 6620.538 }, { "epoch": 1.8548629027419452, "grad_norm": 0.8143635988235474, "learning_rate": 1.594002473499073e-05, "loss": 0.4601, "num_input_tokens_seen": 43410208, "step": 3865, "train_runtime": 6556.4635, "train_tokens_per_second": 6620.979 }, { "epoch": 1.857262854742905, "grad_norm": 0.6884592771530151, "learning_rate": 1.588151139053544e-05, "loss": 0.4458, "num_input_tokens_seen": 43469344, "step": 3870, "train_runtime": 6565.0785, "train_tokens_per_second": 6621.298 }, { "epoch": 1.8596628067438652, "grad_norm": 0.8038159608840942, "learning_rate": 1.5823055606565458e-05, "loss": 0.4859, "num_input_tokens_seen": 43526440, "step": 3875, "train_runtime": 6573.964, "train_tokens_per_second": 6621.034 }, { "epoch": 1.862062758744825, "grad_norm": 0.6315177083015442, "learning_rate": 1.5764657752083072e-05, "loss": 0.4795, "num_input_tokens_seen": 43583936, "step": 3880, "train_runtime": 6582.4382, "train_tokens_per_second": 6621.245 }, { "epoch": 1.864462710745785, "grad_norm": 0.7281184792518616, "learning_rate": 1.5706318195724894e-05, "loss": 0.4707, "num_input_tokens_seen": 43639480, "step": 3885, "train_runtime": 6590.3977, "train_tokens_per_second": 6621.676 }, { "epoch": 1.866862662746745, "grad_norm": 0.8681549429893494, "learning_rate": 1.5648037305759566e-05, "loss": 0.4557, "num_input_tokens_seen": 43690520, "step": 3890, "train_runtime": 6598.0076, "train_tokens_per_second": 6621.775 }, { "epoch": 1.8692626147477052, "grad_norm": 0.9573807120323181, "learning_rate": 1.5589815450085355e-05, "loss": 0.4621, "num_input_tokens_seen": 43749480, "step": 3895, "train_runtime": 6606.515, "train_tokens_per_second": 6622.172 }, { "epoch": 1.871662566748665, "grad_norm": 0.9825738072395325, "learning_rate": 1.5531652996227885e-05, "loss": 0.4627, "num_input_tokens_seen": 43799824, "step": 3900, "train_runtime": 6614.0046, "train_tokens_per_second": 6622.285 }, { "epoch": 1.874062518749625, "grad_norm": 0.8160600662231445, "learning_rate": 1.5473550311337833e-05, "loss": 0.4806, "num_input_tokens_seen": 43858032, "step": 3905, "train_runtime": 6622.3127, "train_tokens_per_second": 6622.767 }, { "epoch": 1.876462470750585, "grad_norm": 0.8037713766098022, "learning_rate": 1.541550776218855e-05, "loss": 0.4767, "num_input_tokens_seen": 43914232, "step": 3910, "train_runtime": 6630.3703, "train_tokens_per_second": 6623.194 }, { "epoch": 1.878862422751545, "grad_norm": 0.8697477579116821, "learning_rate": 1.535752571517379e-05, "loss": 0.4582, "num_input_tokens_seen": 43970744, "step": 3915, "train_runtime": 6638.2775, "train_tokens_per_second": 6623.818 }, { "epoch": 1.881262374752505, "grad_norm": 0.6897442936897278, "learning_rate": 1.529960453630538e-05, "loss": 0.4725, "num_input_tokens_seen": 44028408, "step": 3920, "train_runtime": 6646.2538, "train_tokens_per_second": 6624.545 }, { "epoch": 1.883662326753465, "grad_norm": 0.7267577052116394, "learning_rate": 1.5241744591210954e-05, "loss": 0.4661, "num_input_tokens_seen": 44085968, "step": 3925, "train_runtime": 6654.4818, "train_tokens_per_second": 6625.004 }, { "epoch": 1.8860622787544248, "grad_norm": 0.6550572514533997, "learning_rate": 1.5183946245131563e-05, "loss": 0.5171, "num_input_tokens_seen": 44143360, "step": 3930, "train_runtime": 6662.7155, "train_tokens_per_second": 6625.431 }, { "epoch": 1.8884622307553849, "grad_norm": 0.8330610394477844, "learning_rate": 1.5126209862919427e-05, "loss": 0.4935, "num_input_tokens_seen": 44193864, "step": 3935, "train_runtime": 6669.9997, "train_tokens_per_second": 6625.767 }, { "epoch": 1.890862182756345, "grad_norm": 0.8436587452888489, "learning_rate": 1.506853580903564e-05, "loss": 0.5181, "num_input_tokens_seen": 44249464, "step": 3940, "train_runtime": 6677.7583, "train_tokens_per_second": 6626.395 }, { "epoch": 1.8932621347573049, "grad_norm": 0.8945364356040955, "learning_rate": 1.5010924447547808e-05, "loss": 0.445, "num_input_tokens_seen": 44306480, "step": 3945, "train_runtime": 6685.9167, "train_tokens_per_second": 6626.837 }, { "epoch": 1.8956620867582648, "grad_norm": 0.7293525338172913, "learning_rate": 1.4953376142127828e-05, "loss": 0.4933, "num_input_tokens_seen": 44363776, "step": 3950, "train_runtime": 6693.7184, "train_tokens_per_second": 6627.673 }, { "epoch": 1.8980620387592249, "grad_norm": 0.8093637228012085, "learning_rate": 1.4895891256049548e-05, "loss": 0.4952, "num_input_tokens_seen": 44419016, "step": 3955, "train_runtime": 6701.9236, "train_tokens_per_second": 6627.801 }, { "epoch": 1.900461990760185, "grad_norm": 0.8808810710906982, "learning_rate": 1.483847015218647e-05, "loss": 0.5036, "num_input_tokens_seen": 44473296, "step": 3960, "train_runtime": 6709.8008, "train_tokens_per_second": 6628.11 }, { "epoch": 1.9028619427611448, "grad_norm": 0.606708288192749, "learning_rate": 1.4781113193009466e-05, "loss": 0.4709, "num_input_tokens_seen": 44533064, "step": 3965, "train_runtime": 6718.435, "train_tokens_per_second": 6628.488 }, { "epoch": 1.9052618947621047, "grad_norm": 0.7501396536827087, "learning_rate": 1.472382074058451e-05, "loss": 0.487, "num_input_tokens_seen": 44591088, "step": 3970, "train_runtime": 6726.4717, "train_tokens_per_second": 6629.194 }, { "epoch": 1.9076618467630646, "grad_norm": 0.7472719550132751, "learning_rate": 1.4666593156570376e-05, "loss": 0.4822, "num_input_tokens_seen": 44639864, "step": 3975, "train_runtime": 6733.726, "train_tokens_per_second": 6629.296 }, { "epoch": 1.9100617987640247, "grad_norm": 0.9028266668319702, "learning_rate": 1.460943080221635e-05, "loss": 0.4792, "num_input_tokens_seen": 44697568, "step": 3980, "train_runtime": 6742.2116, "train_tokens_per_second": 6629.511 }, { "epoch": 1.9124617507649848, "grad_norm": 0.6775950193405151, "learning_rate": 1.4552334038359938e-05, "loss": 0.4861, "num_input_tokens_seen": 44750848, "step": 3985, "train_runtime": 6749.8459, "train_tokens_per_second": 6629.907 }, { "epoch": 1.9148617027659447, "grad_norm": 0.7115968465805054, "learning_rate": 1.4495303225424656e-05, "loss": 0.4546, "num_input_tokens_seen": 44804648, "step": 3990, "train_runtime": 6758.0597, "train_tokens_per_second": 6629.809 }, { "epoch": 1.9172616547669046, "grad_norm": 0.8527563214302063, "learning_rate": 1.4438338723417654e-05, "loss": 0.5007, "num_input_tokens_seen": 44860632, "step": 3995, "train_runtime": 6766.2816, "train_tokens_per_second": 6630.027 }, { "epoch": 1.9196616067678647, "grad_norm": 0.8954775333404541, "learning_rate": 1.4381440891927512e-05, "loss": 0.5301, "num_input_tokens_seen": 44913712, "step": 4000, "train_runtime": 6774.2353, "train_tokens_per_second": 6630.078 }, { "epoch": 1.9220615587688248, "grad_norm": 0.7284995317459106, "learning_rate": 1.432461009012196e-05, "loss": 0.5028, "num_input_tokens_seen": 44970992, "step": 4005, "train_runtime": 6782.3775, "train_tokens_per_second": 6630.565 }, { "epoch": 1.9244615107697847, "grad_norm": 1.017869472503662, "learning_rate": 1.4267846676745598e-05, "loss": 0.4618, "num_input_tokens_seen": 45024328, "step": 4010, "train_runtime": 6790.5882, "train_tokens_per_second": 6630.402 }, { "epoch": 1.9268614627707445, "grad_norm": 0.7588083148002625, "learning_rate": 1.4211151010117627e-05, "loss": 0.5078, "num_input_tokens_seen": 45082296, "step": 4015, "train_runtime": 6798.9435, "train_tokens_per_second": 6630.78 }, { "epoch": 1.9292614147717044, "grad_norm": 0.66818767786026, "learning_rate": 1.4154523448129597e-05, "loss": 0.4823, "num_input_tokens_seen": 45137992, "step": 4020, "train_runtime": 6806.9385, "train_tokens_per_second": 6631.174 }, { "epoch": 1.9316613667726645, "grad_norm": 0.700678825378418, "learning_rate": 1.4097964348243172e-05, "loss": 0.4639, "num_input_tokens_seen": 45197208, "step": 4025, "train_runtime": 6815.6104, "train_tokens_per_second": 6631.425 }, { "epoch": 1.9340613187736246, "grad_norm": 0.8906050324440002, "learning_rate": 1.4041474067487814e-05, "loss": 0.4599, "num_input_tokens_seen": 45256040, "step": 4030, "train_runtime": 6824.0323, "train_tokens_per_second": 6631.862 }, { "epoch": 1.9364612707745845, "grad_norm": 0.8205930590629578, "learning_rate": 1.3985052962458593e-05, "loss": 0.4903, "num_input_tokens_seen": 45311968, "step": 4035, "train_runtime": 6831.8772, "train_tokens_per_second": 6632.433 }, { "epoch": 1.9388612227755444, "grad_norm": 0.9148489832878113, "learning_rate": 1.3928701389313897e-05, "loss": 0.4939, "num_input_tokens_seen": 45361584, "step": 4040, "train_runtime": 6839.5045, "train_tokens_per_second": 6632.291 }, { "epoch": 1.9412611747765045, "grad_norm": 1.021208643913269, "learning_rate": 1.3872419703773187e-05, "loss": 0.4876, "num_input_tokens_seen": 45421616, "step": 4045, "train_runtime": 6848.0389, "train_tokens_per_second": 6632.792 }, { "epoch": 1.9436611267774646, "grad_norm": 0.8669795393943787, "learning_rate": 1.3816208261114755e-05, "loss": 0.5142, "num_input_tokens_seen": 45475784, "step": 4050, "train_runtime": 6855.5247, "train_tokens_per_second": 6633.451 }, { "epoch": 1.9460610787784245, "grad_norm": 1.084006428718567, "learning_rate": 1.3760067416173511e-05, "loss": 0.4949, "num_input_tokens_seen": 45529816, "step": 4055, "train_runtime": 6863.458, "train_tokens_per_second": 6633.655 }, { "epoch": 1.9484610307793844, "grad_norm": 0.639717161655426, "learning_rate": 1.3703997523338688e-05, "loss": 0.4917, "num_input_tokens_seen": 45585432, "step": 4060, "train_runtime": 6870.9893, "train_tokens_per_second": 6634.479 }, { "epoch": 1.9508609827803443, "grad_norm": 0.7942274808883667, "learning_rate": 1.3647998936551643e-05, "loss": 0.4542, "num_input_tokens_seen": 45642256, "step": 4065, "train_runtime": 6879.1089, "train_tokens_per_second": 6634.908 }, { "epoch": 1.9532609347813044, "grad_norm": 0.7706002593040466, "learning_rate": 1.3592072009303603e-05, "loss": 0.4767, "num_input_tokens_seen": 45700704, "step": 4070, "train_runtime": 6887.1919, "train_tokens_per_second": 6635.608 }, { "epoch": 1.9556608867822645, "grad_norm": 0.6891798377037048, "learning_rate": 1.3536217094633471e-05, "loss": 0.4649, "num_input_tokens_seen": 45754672, "step": 4075, "train_runtime": 6895.2959, "train_tokens_per_second": 6635.636 }, { "epoch": 1.9580608387832243, "grad_norm": 0.6927337646484375, "learning_rate": 1.3480434545125562e-05, "loss": 0.4794, "num_input_tokens_seen": 45805360, "step": 4080, "train_runtime": 6902.6999, "train_tokens_per_second": 6635.861 }, { "epoch": 1.9604607907841842, "grad_norm": 0.7922900319099426, "learning_rate": 1.3424724712907355e-05, "loss": 0.5073, "num_input_tokens_seen": 45859408, "step": 4085, "train_runtime": 6910.3792, "train_tokens_per_second": 6636.309 }, { "epoch": 1.9628607427851443, "grad_norm": 0.5073052048683167, "learning_rate": 1.3369087949647352e-05, "loss": 0.4844, "num_input_tokens_seen": 45915912, "step": 4090, "train_runtime": 6918.4066, "train_tokens_per_second": 6636.776 }, { "epoch": 1.9652606947861044, "grad_norm": 0.805068850517273, "learning_rate": 1.3313524606552763e-05, "loss": 0.4683, "num_input_tokens_seen": 45972424, "step": 4095, "train_runtime": 6926.7284, "train_tokens_per_second": 6636.961 }, { "epoch": 1.9676606467870643, "grad_norm": 0.7410593628883362, "learning_rate": 1.3258035034367338e-05, "loss": 0.4847, "num_input_tokens_seen": 46029616, "step": 4100, "train_runtime": 6934.891, "train_tokens_per_second": 6637.396 }, { "epoch": 1.9700605987880242, "grad_norm": 0.9381468296051025, "learning_rate": 1.3202619583369189e-05, "loss": 0.5131, "num_input_tokens_seen": 46087816, "step": 4105, "train_runtime": 6943.9707, "train_tokens_per_second": 6637.098 }, { "epoch": 1.972460550788984, "grad_norm": 0.7725812792778015, "learning_rate": 1.3147278603368487e-05, "loss": 0.496, "num_input_tokens_seen": 46141504, "step": 4110, "train_runtime": 6951.6679, "train_tokens_per_second": 6637.472 }, { "epoch": 1.9748605027899442, "grad_norm": 0.9349031448364258, "learning_rate": 1.3092012443705332e-05, "loss": 0.4513, "num_input_tokens_seen": 46202072, "step": 4115, "train_runtime": 6960.2643, "train_tokens_per_second": 6637.977 }, { "epoch": 1.9772604547909043, "grad_norm": 0.5486748218536377, "learning_rate": 1.3036821453247506e-05, "loss": 0.4997, "num_input_tokens_seen": 46258400, "step": 4120, "train_runtime": 6968.5186, "train_tokens_per_second": 6638.197 }, { "epoch": 1.9796604067918642, "grad_norm": 0.8410947322845459, "learning_rate": 1.2981705980388295e-05, "loss": 0.5062, "num_input_tokens_seen": 46309656, "step": 4125, "train_runtime": 6975.9975, "train_tokens_per_second": 6638.428 }, { "epoch": 1.982060358792824, "grad_norm": 0.6465336680412292, "learning_rate": 1.2926666373044294e-05, "loss": 0.4891, "num_input_tokens_seen": 46366888, "step": 4130, "train_runtime": 6984.2364, "train_tokens_per_second": 6638.791 }, { "epoch": 1.9844603107937842, "grad_norm": 0.6658479571342468, "learning_rate": 1.2871702978653163e-05, "loss": 0.5002, "num_input_tokens_seen": 46419304, "step": 4135, "train_runtime": 6991.7902, "train_tokens_per_second": 6639.116 }, { "epoch": 1.9868602627947443, "grad_norm": 0.8227950930595398, "learning_rate": 1.28168161441715e-05, "loss": 0.5105, "num_input_tokens_seen": 46469520, "step": 4140, "train_runtime": 6999.0924, "train_tokens_per_second": 6639.364 }, { "epoch": 1.9892602147957041, "grad_norm": 1.1198500394821167, "learning_rate": 1.27620062160726e-05, "loss": 0.5154, "num_input_tokens_seen": 46523240, "step": 4145, "train_runtime": 7007.0718, "train_tokens_per_second": 6639.47 }, { "epoch": 1.991660166796664, "grad_norm": 0.8290591835975647, "learning_rate": 1.2707273540344274e-05, "loss": 0.5361, "num_input_tokens_seen": 46577712, "step": 4150, "train_runtime": 7015.0992, "train_tokens_per_second": 6639.637 }, { "epoch": 1.994060118797624, "grad_norm": 0.6306242346763611, "learning_rate": 1.265261846248672e-05, "loss": 0.4873, "num_input_tokens_seen": 46629984, "step": 4155, "train_runtime": 7022.7592, "train_tokens_per_second": 6639.838 }, { "epoch": 1.996460070798584, "grad_norm": 0.8492105007171631, "learning_rate": 1.2598041327510254e-05, "loss": 0.4779, "num_input_tokens_seen": 46689664, "step": 4160, "train_runtime": 7031.9599, "train_tokens_per_second": 6639.637 }, { "epoch": 1.9988600227995441, "grad_norm": 0.8231053352355957, "learning_rate": 1.25435424799332e-05, "loss": 0.4451, "num_input_tokens_seen": 46752192, "step": 4165, "train_runtime": 7041.1099, "train_tokens_per_second": 6639.89 }, { "epoch": 2.000959980800384, "grad_norm": 0.6937538385391235, "learning_rate": 1.2489122263779684e-05, "loss": 0.4431, "num_input_tokens_seen": 46800120, "step": 4170, "train_runtime": 7048.3249, "train_tokens_per_second": 6639.893 }, { "epoch": 2.003359932801344, "grad_norm": 0.5429336428642273, "learning_rate": 1.2434781022577476e-05, "loss": 0.4561, "num_input_tokens_seen": 46859352, "step": 4175, "train_runtime": 7056.8347, "train_tokens_per_second": 6640.279 }, { "epoch": 2.005759884802304, "grad_norm": 0.7788823843002319, "learning_rate": 1.2380519099355831e-05, "loss": 0.4531, "num_input_tokens_seen": 46918656, "step": 4180, "train_runtime": 7065.0446, "train_tokens_per_second": 6640.957 }, { "epoch": 2.008159836803264, "grad_norm": 0.7995026111602783, "learning_rate": 1.2326336836643274e-05, "loss": 0.5048, "num_input_tokens_seen": 46976896, "step": 4185, "train_runtime": 7073.7177, "train_tokens_per_second": 6641.048 }, { "epoch": 2.010559788804224, "grad_norm": 0.7401773929595947, "learning_rate": 1.227223457646551e-05, "loss": 0.4846, "num_input_tokens_seen": 47033584, "step": 4190, "train_runtime": 7081.7376, "train_tokens_per_second": 6641.532 }, { "epoch": 2.012959740805184, "grad_norm": 1.0051988363265991, "learning_rate": 1.22182126603432e-05, "loss": 0.497, "num_input_tokens_seen": 47084560, "step": 4195, "train_runtime": 7089.243, "train_tokens_per_second": 6641.691 }, { "epoch": 2.0153596928061437, "grad_norm": 0.7586055994033813, "learning_rate": 1.2164271429289837e-05, "loss": 0.4671, "num_input_tokens_seen": 47141040, "step": 4200, "train_runtime": 7097.5162, "train_tokens_per_second": 6641.907 }, { "epoch": 2.017759644807104, "grad_norm": 0.6509086489677429, "learning_rate": 1.2110411223809612e-05, "loss": 0.4329, "num_input_tokens_seen": 47198656, "step": 4205, "train_runtime": 7107.6834, "train_tokens_per_second": 6640.512 }, { "epoch": 2.020159596808064, "grad_norm": 0.7223982810974121, "learning_rate": 1.2056632383895217e-05, "loss": 0.4903, "num_input_tokens_seen": 47255504, "step": 4210, "train_runtime": 7117.033, "train_tokens_per_second": 6639.776 }, { "epoch": 2.0225595488090238, "grad_norm": 0.9436632990837097, "learning_rate": 1.2002935249025732e-05, "loss": 0.4788, "num_input_tokens_seen": 47307728, "step": 4215, "train_runtime": 7126.5999, "train_tokens_per_second": 6638.191 }, { "epoch": 2.0249595008099837, "grad_norm": 0.7383816838264465, "learning_rate": 1.1949320158164466e-05, "loss": 0.4692, "num_input_tokens_seen": 47365504, "step": 4220, "train_runtime": 7136.5388, "train_tokens_per_second": 6637.041 }, { "epoch": 2.027359452810944, "grad_norm": 0.8641635775566101, "learning_rate": 1.1895787449756834e-05, "loss": 0.4565, "num_input_tokens_seen": 47424664, "step": 4225, "train_runtime": 7147.349, "train_tokens_per_second": 6635.28 }, { "epoch": 2.029759404811904, "grad_norm": 0.8401957750320435, "learning_rate": 1.1842337461728232e-05, "loss": 0.5177, "num_input_tokens_seen": 47482624, "step": 4230, "train_runtime": 7158.1241, "train_tokens_per_second": 6633.389 }, { "epoch": 2.0321593568128637, "grad_norm": 0.7083563208580017, "learning_rate": 1.1788970531481832e-05, "loss": 0.4509, "num_input_tokens_seen": 47541264, "step": 4235, "train_runtime": 7168.9418, "train_tokens_per_second": 6631.559 }, { "epoch": 2.0345593088138236, "grad_norm": 0.7770140171051025, "learning_rate": 1.1735686995896559e-05, "loss": 0.5111, "num_input_tokens_seen": 47596256, "step": 4240, "train_runtime": 7178.4941, "train_tokens_per_second": 6630.396 }, { "epoch": 2.0369592608147835, "grad_norm": 0.8754630088806152, "learning_rate": 1.1682487191324868e-05, "loss": 0.5576, "num_input_tokens_seen": 47649808, "step": 4245, "train_runtime": 7188.2139, "train_tokens_per_second": 6628.88 }, { "epoch": 2.039359212815744, "grad_norm": 0.6423441767692566, "learning_rate": 1.1629371453590671e-05, "loss": 0.4836, "num_input_tokens_seen": 47709328, "step": 4250, "train_runtime": 7198.5845, "train_tokens_per_second": 6627.599 }, { "epoch": 2.0417591648167037, "grad_norm": 0.7070155143737793, "learning_rate": 1.1576340117987233e-05, "loss": 0.5057, "num_input_tokens_seen": 47765800, "step": 4255, "train_runtime": 7209.1424, "train_tokens_per_second": 6625.726 }, { "epoch": 2.0441591168176636, "grad_norm": 0.8831612467765808, "learning_rate": 1.1523393519274996e-05, "loss": 0.4447, "num_input_tokens_seen": 47820320, "step": 4260, "train_runtime": 7218.3344, "train_tokens_per_second": 6624.841 }, { "epoch": 2.0465590688186235, "grad_norm": 0.6510924100875854, "learning_rate": 1.1470531991679523e-05, "loss": 0.5101, "num_input_tokens_seen": 47876928, "step": 4265, "train_runtime": 7228.3009, "train_tokens_per_second": 6623.538 }, { "epoch": 2.048959020819584, "grad_norm": 0.6335709691047668, "learning_rate": 1.1417755868889343e-05, "loss": 0.4432, "num_input_tokens_seen": 47933280, "step": 4270, "train_runtime": 7237.6205, "train_tokens_per_second": 6622.795 }, { "epoch": 2.0513589728205437, "grad_norm": 0.7883151769638062, "learning_rate": 1.1365065484053895e-05, "loss": 0.4606, "num_input_tokens_seen": 47991280, "step": 4275, "train_runtime": 7247.4539, "train_tokens_per_second": 6621.812 }, { "epoch": 2.0537589248215036, "grad_norm": 0.8296838998794556, "learning_rate": 1.1312461169781383e-05, "loss": 0.4669, "num_input_tokens_seen": 48045896, "step": 4280, "train_runtime": 7257.2601, "train_tokens_per_second": 6620.391 }, { "epoch": 2.0561588768224635, "grad_norm": 0.8068815469741821, "learning_rate": 1.1259943258136682e-05, "loss": 0.4849, "num_input_tokens_seen": 48105824, "step": 4285, "train_runtime": 7268.0346, "train_tokens_per_second": 6618.822 }, { "epoch": 2.0585588288234233, "grad_norm": 0.977588415145874, "learning_rate": 1.1207512080639273e-05, "loss": 0.4956, "num_input_tokens_seen": 48160632, "step": 4290, "train_runtime": 7277.9569, "train_tokens_per_second": 6617.329 }, { "epoch": 2.0609587808243837, "grad_norm": 0.7364087700843811, "learning_rate": 1.1155167968261105e-05, "loss": 0.4357, "num_input_tokens_seen": 48217992, "step": 4295, "train_runtime": 7288.3331, "train_tokens_per_second": 6615.778 }, { "epoch": 2.0633587328253435, "grad_norm": 0.757265031337738, "learning_rate": 1.1102911251424526e-05, "loss": 0.4907, "num_input_tokens_seen": 48276216, "step": 4300, "train_runtime": 7298.7103, "train_tokens_per_second": 6614.349 }, { "epoch": 2.0657586848263034, "grad_norm": 0.773041844367981, "learning_rate": 1.1050742260000226e-05, "loss": 0.4687, "num_input_tokens_seen": 48331296, "step": 4305, "train_runtime": 7308.4104, "train_tokens_per_second": 6613.106 }, { "epoch": 2.0681586368272633, "grad_norm": 1.1142570972442627, "learning_rate": 1.0998661323305107e-05, "loss": 0.4574, "num_input_tokens_seen": 48387368, "step": 4310, "train_runtime": 7317.8081, "train_tokens_per_second": 6612.276 }, { "epoch": 2.0705585888282236, "grad_norm": 1.0279673337936401, "learning_rate": 1.094666877010023e-05, "loss": 0.5004, "num_input_tokens_seen": 48440296, "step": 4315, "train_runtime": 7327.8587, "train_tokens_per_second": 6610.43 }, { "epoch": 2.0729585408291835, "grad_norm": 0.9261734485626221, "learning_rate": 1.0894764928588721e-05, "loss": 0.4747, "num_input_tokens_seen": 48492496, "step": 4320, "train_runtime": 7336.9344, "train_tokens_per_second": 6609.368 }, { "epoch": 2.0753584928301434, "grad_norm": 1.1111286878585815, "learning_rate": 1.0842950126413742e-05, "loss": 0.5137, "num_input_tokens_seen": 48549184, "step": 4325, "train_runtime": 7346.6107, "train_tokens_per_second": 6608.378 }, { "epoch": 2.0777584448311033, "grad_norm": 0.8526914119720459, "learning_rate": 1.0791224690656384e-05, "loss": 0.4573, "num_input_tokens_seen": 48601016, "step": 4330, "train_runtime": 7354.8806, "train_tokens_per_second": 6607.995 }, { "epoch": 2.080158396832063, "grad_norm": 0.5850500464439392, "learning_rate": 1.0739588947833593e-05, "loss": 0.4814, "num_input_tokens_seen": 48655504, "step": 4335, "train_runtime": 7363.7381, "train_tokens_per_second": 6607.446 }, { "epoch": 2.0825583488330235, "grad_norm": 1.0572696924209595, "learning_rate": 1.068804322389616e-05, "loss": 0.4997, "num_input_tokens_seen": 48708616, "step": 4340, "train_runtime": 7372.6454, "train_tokens_per_second": 6606.667 }, { "epoch": 2.0849583008339834, "grad_norm": 0.5862051844596863, "learning_rate": 1.06365878442266e-05, "loss": 0.4459, "num_input_tokens_seen": 48769440, "step": 4345, "train_runtime": 7382.0777, "train_tokens_per_second": 6606.465 }, { "epoch": 2.0873582528349433, "grad_norm": 0.7404434680938721, "learning_rate": 1.0585223133637143e-05, "loss": 0.4882, "num_input_tokens_seen": 48827720, "step": 4350, "train_runtime": 7391.0584, "train_tokens_per_second": 6606.323 }, { "epoch": 2.089758204835903, "grad_norm": 0.7802624106407166, "learning_rate": 1.053394941636768e-05, "loss": 0.5322, "num_input_tokens_seen": 48879552, "step": 4355, "train_runtime": 7398.7138, "train_tokens_per_second": 6606.493 }, { "epoch": 2.0921581568368635, "grad_norm": 0.7315226197242737, "learning_rate": 1.0482767016083694e-05, "loss": 0.4515, "num_input_tokens_seen": 48932848, "step": 4360, "train_runtime": 7406.1993, "train_tokens_per_second": 6607.012 }, { "epoch": 2.0945581088378233, "grad_norm": 0.967128574848175, "learning_rate": 1.0431676255874232e-05, "loss": 0.5213, "num_input_tokens_seen": 48989744, "step": 4365, "train_runtime": 7414.1239, "train_tokens_per_second": 6607.624 }, { "epoch": 2.0969580608387832, "grad_norm": 0.731792151927948, "learning_rate": 1.0380677458249852e-05, "loss": 0.4821, "num_input_tokens_seen": 49043888, "step": 4370, "train_runtime": 7421.75, "train_tokens_per_second": 6608.13 }, { "epoch": 2.099358012839743, "grad_norm": 0.8551647067070007, "learning_rate": 1.0329770945140618e-05, "loss": 0.5018, "num_input_tokens_seen": 49099976, "step": 4375, "train_runtime": 7429.6538, "train_tokens_per_second": 6608.649 }, { "epoch": 2.101757964840703, "grad_norm": 0.8482736945152283, "learning_rate": 1.0278957037894048e-05, "loss": 0.5266, "num_input_tokens_seen": 49158168, "step": 4380, "train_runtime": 7437.7108, "train_tokens_per_second": 6609.314 }, { "epoch": 2.1041579168416633, "grad_norm": 0.8070186376571655, "learning_rate": 1.0228236057273063e-05, "loss": 0.4906, "num_input_tokens_seen": 49209920, "step": 4385, "train_runtime": 7445.1797, "train_tokens_per_second": 6609.635 }, { "epoch": 2.106557868842623, "grad_norm": 0.7493661046028137, "learning_rate": 1.0177608323454008e-05, "loss": 0.5067, "num_input_tokens_seen": 49262384, "step": 4390, "train_runtime": 7452.9186, "train_tokens_per_second": 6609.811 }, { "epoch": 2.108957820843583, "grad_norm": 0.7874744534492493, "learning_rate": 1.0127074156024594e-05, "loss": 0.4642, "num_input_tokens_seen": 49315632, "step": 4395, "train_runtime": 7460.8462, "train_tokens_per_second": 6609.925 }, { "epoch": 2.111357772844543, "grad_norm": 0.9224854707717896, "learning_rate": 1.0076633873981883e-05, "loss": 0.4984, "num_input_tokens_seen": 49371384, "step": 4400, "train_runtime": 7468.9769, "train_tokens_per_second": 6610.194 }, { "epoch": 2.1137577248455033, "grad_norm": 0.8540477156639099, "learning_rate": 1.0026287795730319e-05, "loss": 0.4767, "num_input_tokens_seen": 49426056, "step": 4405, "train_runtime": 7477.3027, "train_tokens_per_second": 6610.145 }, { "epoch": 2.116157676846463, "grad_norm": 1.0904680490493774, "learning_rate": 9.976036239079656e-06, "loss": 0.491, "num_input_tokens_seen": 49483160, "step": 4410, "train_runtime": 7485.9905, "train_tokens_per_second": 6610.102 }, { "epoch": 2.118557628847423, "grad_norm": 0.5771769881248474, "learning_rate": 9.925879521242978e-06, "loss": 0.4566, "num_input_tokens_seen": 49537568, "step": 4415, "train_runtime": 7494.7254, "train_tokens_per_second": 6609.657 }, { "epoch": 2.120957580848383, "grad_norm": 0.765743613243103, "learning_rate": 9.87581795883473e-06, "loss": 0.4878, "num_input_tokens_seen": 49594120, "step": 4420, "train_runtime": 7503.2956, "train_tokens_per_second": 6609.645 }, { "epoch": 2.123357532849343, "grad_norm": 0.8731431365013123, "learning_rate": 9.825851867868646e-06, "loss": 0.4871, "num_input_tokens_seen": 49647944, "step": 4425, "train_runtime": 7511.5023, "train_tokens_per_second": 6609.589 }, { "epoch": 2.125757484850303, "grad_norm": 0.9633266925811768, "learning_rate": 9.775981563755835e-06, "loss": 0.4747, "num_input_tokens_seen": 49702848, "step": 4430, "train_runtime": 7520.7688, "train_tokens_per_second": 6608.746 }, { "epoch": 2.128157436851263, "grad_norm": 0.8484842777252197, "learning_rate": 9.726207361302716e-06, "loss": 0.4871, "num_input_tokens_seen": 49754336, "step": 4435, "train_runtime": 7530.7578, "train_tokens_per_second": 6606.817 }, { "epoch": 2.130557388852223, "grad_norm": 0.6933907270431519, "learning_rate": 9.676529574709104e-06, "loss": 0.4813, "num_input_tokens_seen": 49818104, "step": 4440, "train_runtime": 7543.2161, "train_tokens_per_second": 6604.359 }, { "epoch": 2.132957340853183, "grad_norm": 0.8864620327949524, "learning_rate": 9.62694851756616e-06, "loss": 0.5196, "num_input_tokens_seen": 49872640, "step": 4445, "train_runtime": 7553.5538, "train_tokens_per_second": 6602.54 }, { "epoch": 2.135357292854143, "grad_norm": 0.6627900004386902, "learning_rate": 9.577464502854432e-06, "loss": 0.441, "num_input_tokens_seen": 49929176, "step": 4450, "train_runtime": 7563.6578, "train_tokens_per_second": 6601.194 }, { "epoch": 2.137757244855103, "grad_norm": 0.8925694823265076, "learning_rate": 9.528077842941929e-06, "loss": 0.4755, "num_input_tokens_seen": 49984040, "step": 4455, "train_runtime": 7574.4434, "train_tokens_per_second": 6599.038 }, { "epoch": 2.140157196856063, "grad_norm": 0.7881972789764404, "learning_rate": 9.478788849582071e-06, "loss": 0.4841, "num_input_tokens_seen": 50036368, "step": 4460, "train_runtime": 7584.774, "train_tokens_per_second": 6596.949 }, { "epoch": 2.1425571488570228, "grad_norm": 0.7480626106262207, "learning_rate": 9.42959783391176e-06, "loss": 0.4813, "num_input_tokens_seen": 50091376, "step": 4465, "train_runtime": 7594.7525, "train_tokens_per_second": 6595.524 }, { "epoch": 2.1449571008579826, "grad_norm": 0.8503336310386658, "learning_rate": 9.38050510644944e-06, "loss": 0.4844, "num_input_tokens_seen": 50148472, "step": 4470, "train_runtime": 7604.4881, "train_tokens_per_second": 6594.589 }, { "epoch": 2.147357052858943, "grad_norm": 0.896701991558075, "learning_rate": 9.331510977093077e-06, "loss": 0.4784, "num_input_tokens_seen": 50202392, "step": 4475, "train_runtime": 7614.8511, "train_tokens_per_second": 6592.695 }, { "epoch": 2.149757004859903, "grad_norm": 0.7483791708946228, "learning_rate": 9.282615755118266e-06, "loss": 0.4473, "num_input_tokens_seen": 50262048, "step": 4480, "train_runtime": 7625.4864, "train_tokens_per_second": 6591.324 }, { "epoch": 2.1521569568608627, "grad_norm": 0.8028972148895264, "learning_rate": 9.23381974917622e-06, "loss": 0.4611, "num_input_tokens_seen": 50318512, "step": 4485, "train_runtime": 7635.9972, "train_tokens_per_second": 6589.645 }, { "epoch": 2.1545569088618226, "grad_norm": 0.7019287347793579, "learning_rate": 9.185123267291881e-06, "loss": 0.4622, "num_input_tokens_seen": 50371472, "step": 4490, "train_runtime": 7645.9049, "train_tokens_per_second": 6588.033 }, { "epoch": 2.156956860862783, "grad_norm": 0.849296510219574, "learning_rate": 9.136526616861921e-06, "loss": 0.501, "num_input_tokens_seen": 50425888, "step": 4495, "train_runtime": 7656.303, "train_tokens_per_second": 6586.193 }, { "epoch": 2.159356812863743, "grad_norm": 0.5608788728713989, "learning_rate": 9.088030104652829e-06, "loss": 0.4828, "num_input_tokens_seen": 50484136, "step": 4500, "train_runtime": 7666.3853, "train_tokens_per_second": 6585.129 }, { "epoch": 2.1617567648647027, "grad_norm": 0.7533180713653564, "learning_rate": 9.03963403679899e-06, "loss": 0.463, "num_input_tokens_seen": 50540376, "step": 4505, "train_runtime": 7676.9709, "train_tokens_per_second": 6583.375 }, { "epoch": 2.1641567168656626, "grad_norm": 0.8343721032142639, "learning_rate": 8.99133871880071e-06, "loss": 0.4948, "num_input_tokens_seen": 50594968, "step": 4510, "train_runtime": 7687.0369, "train_tokens_per_second": 6581.856 }, { "epoch": 2.1665566688666225, "grad_norm": 1.0494121313095093, "learning_rate": 8.943144455522314e-06, "loss": 0.4919, "num_input_tokens_seen": 50649296, "step": 4515, "train_runtime": 7697.4192, "train_tokens_per_second": 6580.036 }, { "epoch": 2.168956620867583, "grad_norm": 0.8824997544288635, "learning_rate": 8.895051551190248e-06, "loss": 0.4279, "num_input_tokens_seen": 50706696, "step": 4520, "train_runtime": 7707.9222, "train_tokens_per_second": 6578.517 }, { "epoch": 2.1713565728685427, "grad_norm": 0.8693490028381348, "learning_rate": 8.847060309391084e-06, "loss": 0.4776, "num_input_tokens_seen": 50758984, "step": 4525, "train_runtime": 7717.5559, "train_tokens_per_second": 6577.08 }, { "epoch": 2.1737565248695025, "grad_norm": 0.6775808334350586, "learning_rate": 8.799171033069695e-06, "loss": 0.4821, "num_input_tokens_seen": 50812536, "step": 4530, "train_runtime": 7727.2348, "train_tokens_per_second": 6575.772 }, { "epoch": 2.1761564768704624, "grad_norm": 0.7019457817077637, "learning_rate": 8.75138402452725e-06, "loss": 0.4698, "num_input_tokens_seen": 50867192, "step": 4535, "train_runtime": 7737.0022, "train_tokens_per_second": 6574.535 }, { "epoch": 2.1785564288714228, "grad_norm": 0.6866047978401184, "learning_rate": 8.7036995854194e-06, "loss": 0.4612, "num_input_tokens_seen": 50925384, "step": 4540, "train_runtime": 7746.4582, "train_tokens_per_second": 6574.022 }, { "epoch": 2.1809563808723826, "grad_norm": 0.605133593082428, "learning_rate": 8.656118016754292e-06, "loss": 0.4939, "num_input_tokens_seen": 50983216, "step": 4545, "train_runtime": 7757.2379, "train_tokens_per_second": 6572.341 }, { "epoch": 2.1833563328733425, "grad_norm": 0.6981828212738037, "learning_rate": 8.608639618890702e-06, "loss": 0.5204, "num_input_tokens_seen": 51038664, "step": 4550, "train_runtime": 7767.547, "train_tokens_per_second": 6570.757 }, { "epoch": 2.1857562848743024, "grad_norm": 0.8705071806907654, "learning_rate": 8.561264691536172e-06, "loss": 0.4907, "num_input_tokens_seen": 51096648, "step": 4555, "train_runtime": 7777.3381, "train_tokens_per_second": 6569.94 }, { "epoch": 2.1881562368752627, "grad_norm": 0.7312107682228088, "learning_rate": 8.51399353374506e-06, "loss": 0.5114, "num_input_tokens_seen": 51152456, "step": 4560, "train_runtime": 7787.2126, "train_tokens_per_second": 6568.776 }, { "epoch": 2.1905561888762226, "grad_norm": 0.8138951063156128, "learning_rate": 8.466826443916667e-06, "loss": 0.4822, "num_input_tokens_seen": 51207840, "step": 4565, "train_runtime": 7796.4469, "train_tokens_per_second": 6568.1 }, { "epoch": 2.1929561408771825, "grad_norm": 0.6703912019729614, "learning_rate": 8.4197637197934e-06, "loss": 0.4849, "num_input_tokens_seen": 51261448, "step": 4570, "train_runtime": 7806.184, "train_tokens_per_second": 6566.774 }, { "epoch": 2.1953560928781424, "grad_norm": 0.9687227010726929, "learning_rate": 8.37280565845884e-06, "loss": 0.467, "num_input_tokens_seen": 51317720, "step": 4575, "train_runtime": 7816.3502, "train_tokens_per_second": 6565.433 }, { "epoch": 2.1977560448791023, "grad_norm": 0.8064000606536865, "learning_rate": 8.325952556335878e-06, "loss": 0.4851, "num_input_tokens_seen": 51372576, "step": 4580, "train_runtime": 7825.9422, "train_tokens_per_second": 6564.395 }, { "epoch": 2.2001559968800626, "grad_norm": 0.8729395866394043, "learning_rate": 8.279204709184843e-06, "loss": 0.5434, "num_input_tokens_seen": 51422552, "step": 4585, "train_runtime": 7835.905, "train_tokens_per_second": 6562.427 }, { "epoch": 2.2025559488810225, "grad_norm": 0.898769199848175, "learning_rate": 8.232562412101674e-06, "loss": 0.5217, "num_input_tokens_seen": 51477960, "step": 4590, "train_runtime": 7846.3182, "train_tokens_per_second": 6560.779 }, { "epoch": 2.2049559008819823, "grad_norm": 0.9951900243759155, "learning_rate": 8.186025959515995e-06, "loss": 0.4839, "num_input_tokens_seen": 51537952, "step": 4595, "train_runtime": 7856.7191, "train_tokens_per_second": 6559.73 }, { "epoch": 2.2073558528829422, "grad_norm": 0.8248569965362549, "learning_rate": 8.139595645189282e-06, "loss": 0.4497, "num_input_tokens_seen": 51592688, "step": 4600, "train_runtime": 7866.4031, "train_tokens_per_second": 6558.612 }, { "epoch": 2.209755804883902, "grad_norm": 0.8907241821289062, "learning_rate": 8.09327176221305e-06, "loss": 0.4774, "num_input_tokens_seen": 51645280, "step": 4605, "train_runtime": 7876.1364, "train_tokens_per_second": 6557.185 }, { "epoch": 2.2121557568848624, "grad_norm": 0.6718706488609314, "learning_rate": 8.047054603006931e-06, "loss": 0.5308, "num_input_tokens_seen": 51698536, "step": 4610, "train_runtime": 7886.3852, "train_tokens_per_second": 6555.416 }, { "epoch": 2.2145557088858223, "grad_norm": 0.6906898617744446, "learning_rate": 8.000944459316864e-06, "loss": 0.4422, "num_input_tokens_seen": 51756256, "step": 4615, "train_runtime": 7897.4196, "train_tokens_per_second": 6553.565 }, { "epoch": 2.216955660886782, "grad_norm": 0.7952353954315186, "learning_rate": 7.954941622213272e-06, "loss": 0.5049, "num_input_tokens_seen": 51813256, "step": 4620, "train_runtime": 7907.2916, "train_tokens_per_second": 6552.592 }, { "epoch": 2.219355612887742, "grad_norm": 0.7251629829406738, "learning_rate": 7.909046382089203e-06, "loss": 0.4541, "num_input_tokens_seen": 51867560, "step": 4625, "train_runtime": 7917.3897, "train_tokens_per_second": 6551.093 }, { "epoch": 2.2217555648887024, "grad_norm": 0.7001914978027344, "learning_rate": 7.863259028658485e-06, "loss": 0.4918, "num_input_tokens_seen": 51920280, "step": 4630, "train_runtime": 7927.2271, "train_tokens_per_second": 6549.614 }, { "epoch": 2.2241555168896623, "grad_norm": 0.722760021686554, "learning_rate": 7.817579850953904e-06, "loss": 0.4356, "num_input_tokens_seen": 51975984, "step": 4635, "train_runtime": 7939.1498, "train_tokens_per_second": 6546.795 }, { "epoch": 2.226555468890622, "grad_norm": 0.8394641876220703, "learning_rate": 7.77200913732542e-06, "loss": 0.5007, "num_input_tokens_seen": 52031784, "step": 4640, "train_runtime": 7948.6393, "train_tokens_per_second": 6545.999 }, { "epoch": 2.228955420891582, "grad_norm": 0.8581427335739136, "learning_rate": 7.72654717543828e-06, "loss": 0.4482, "num_input_tokens_seen": 52086728, "step": 4645, "train_runtime": 7958.8142, "train_tokens_per_second": 6544.534 }, { "epoch": 2.2313553728925424, "grad_norm": 0.8242650032043457, "learning_rate": 7.681194252271242e-06, "loss": 0.4219, "num_input_tokens_seen": 52143544, "step": 4650, "train_runtime": 7968.3907, "train_tokens_per_second": 6543.799 }, { "epoch": 2.2337553248935023, "grad_norm": 0.7680621147155762, "learning_rate": 7.635950654114782e-06, "loss": 0.4771, "num_input_tokens_seen": 52203016, "step": 4655, "train_runtime": 7978.6952, "train_tokens_per_second": 6542.801 }, { "epoch": 2.236155276894462, "grad_norm": 0.6597278118133545, "learning_rate": 7.5908166665692285e-06, "loss": 0.4791, "num_input_tokens_seen": 52258320, "step": 4660, "train_runtime": 7988.4947, "train_tokens_per_second": 6541.698 }, { "epoch": 2.238555228895422, "grad_norm": 0.8721866011619568, "learning_rate": 7.545792574543003e-06, "loss": 0.4895, "num_input_tokens_seen": 52313336, "step": 4665, "train_runtime": 7998.6775, "train_tokens_per_second": 6540.248 }, { "epoch": 2.240955180896382, "grad_norm": 1.1070098876953125, "learning_rate": 7.500878662250818e-06, "loss": 0.5019, "num_input_tokens_seen": 52366728, "step": 4670, "train_runtime": 8008.484, "train_tokens_per_second": 6538.906 }, { "epoch": 2.2433551328973422, "grad_norm": 0.6862952709197998, "learning_rate": 7.456075213211883e-06, "loss": 0.4622, "num_input_tokens_seen": 52423136, "step": 4675, "train_runtime": 8018.9005, "train_tokens_per_second": 6537.447 }, { "epoch": 2.245755084898302, "grad_norm": 0.7063257098197937, "learning_rate": 7.411382510248091e-06, "loss": 0.4422, "num_input_tokens_seen": 52480088, "step": 4680, "train_runtime": 8028.8285, "train_tokens_per_second": 6536.456 }, { "epoch": 2.248155036899262, "grad_norm": 0.7958875894546509, "learning_rate": 7.366800835482246e-06, "loss": 0.4774, "num_input_tokens_seen": 52538696, "step": 4685, "train_runtime": 8038.5124, "train_tokens_per_second": 6535.873 }, { "epoch": 2.250554988900222, "grad_norm": 0.7092862725257874, "learning_rate": 7.3223304703363135e-06, "loss": 0.4537, "num_input_tokens_seen": 52598800, "step": 4690, "train_runtime": 8049.8427, "train_tokens_per_second": 6534.14 }, { "epoch": 2.2529549409011818, "grad_norm": 0.6956859230995178, "learning_rate": 7.277971695529592e-06, "loss": 0.4435, "num_input_tokens_seen": 52657280, "step": 4695, "train_runtime": 8060.2392, "train_tokens_per_second": 6532.967 }, { "epoch": 2.255354892902142, "grad_norm": 0.6482681632041931, "learning_rate": 7.233724791076968e-06, "loss": 0.455, "num_input_tokens_seen": 52713952, "step": 4700, "train_runtime": 8070.1937, "train_tokens_per_second": 6531.931 }, { "epoch": 2.257754844903102, "grad_norm": 0.7593861222267151, "learning_rate": 7.189590036287167e-06, "loss": 0.4506, "num_input_tokens_seen": 52772688, "step": 4705, "train_runtime": 8080.8866, "train_tokens_per_second": 6530.557 }, { "epoch": 2.260154796904062, "grad_norm": 0.8229504823684692, "learning_rate": 7.145567709760942e-06, "loss": 0.4944, "num_input_tokens_seen": 52829984, "step": 4710, "train_runtime": 8091.297, "train_tokens_per_second": 6529.236 }, { "epoch": 2.2625547489050217, "grad_norm": 0.7563186287879944, "learning_rate": 7.1016580893893514e-06, "loss": 0.485, "num_input_tokens_seen": 52888368, "step": 4715, "train_runtime": 8102.4796, "train_tokens_per_second": 6527.43 }, { "epoch": 2.264954700905982, "grad_norm": 0.8408580422401428, "learning_rate": 7.057861452352005e-06, "loss": 0.4722, "num_input_tokens_seen": 52945664, "step": 4720, "train_runtime": 8112.5815, "train_tokens_per_second": 6526.364 }, { "epoch": 2.267354652906942, "grad_norm": 0.791147768497467, "learning_rate": 7.014178075115305e-06, "loss": 0.5043, "num_input_tokens_seen": 53001096, "step": 4725, "train_runtime": 8122.5542, "train_tokens_per_second": 6525.176 }, { "epoch": 2.269754604907902, "grad_norm": 0.8713123798370361, "learning_rate": 6.9706082334306895e-06, "loss": 0.4978, "num_input_tokens_seen": 53054936, "step": 4730, "train_runtime": 8132.2978, "train_tokens_per_second": 6523.979 }, { "epoch": 2.2721545569088617, "grad_norm": 0.9158002734184265, "learning_rate": 6.927152202332898e-06, "loss": 0.4493, "num_input_tokens_seen": 53115032, "step": 4735, "train_runtime": 8142.6092, "train_tokens_per_second": 6523.097 }, { "epoch": 2.274554508909822, "grad_norm": 0.8470547795295715, "learning_rate": 6.883810256138268e-06, "loss": 0.5082, "num_input_tokens_seen": 53168048, "step": 4740, "train_runtime": 8152.4189, "train_tokens_per_second": 6521.751 }, { "epoch": 2.276954460910782, "grad_norm": 0.8152704834938049, "learning_rate": 6.8405826684429495e-06, "loss": 0.4622, "num_input_tokens_seen": 53228112, "step": 4745, "train_runtime": 8163.4113, "train_tokens_per_second": 6520.327 }, { "epoch": 2.279354412911742, "grad_norm": 1.1918436288833618, "learning_rate": 6.7974697121212044e-06, "loss": 0.475, "num_input_tokens_seen": 53282056, "step": 4750, "train_runtime": 8172.6885, "train_tokens_per_second": 6519.526 }, { "epoch": 2.2817543649127017, "grad_norm": 0.8063285946846008, "learning_rate": 6.754471659323708e-06, "loss": 0.4444, "num_input_tokens_seen": 53342728, "step": 4755, "train_runtime": 8181.7917, "train_tokens_per_second": 6519.688 }, { "epoch": 2.2841543169136616, "grad_norm": 0.8364700078964233, "learning_rate": 6.711588781475786e-06, "loss": 0.4833, "num_input_tokens_seen": 53397656, "step": 4760, "train_runtime": 8189.9068, "train_tokens_per_second": 6519.935 }, { "epoch": 2.286554268914622, "grad_norm": 0.8302350640296936, "learning_rate": 6.668821349275714e-06, "loss": 0.4532, "num_input_tokens_seen": 53452736, "step": 4765, "train_runtime": 8198.4472, "train_tokens_per_second": 6519.861 }, { "epoch": 2.2889542209155818, "grad_norm": 0.7638778686523438, "learning_rate": 6.626169632693041e-06, "loss": 0.4679, "num_input_tokens_seen": 53510640, "step": 4770, "train_runtime": 8207.0649, "train_tokens_per_second": 6520.07 }, { "epoch": 2.2913541729165416, "grad_norm": 0.6307675242424011, "learning_rate": 6.5836339009668564e-06, "loss": 0.4336, "num_input_tokens_seen": 53568536, "step": 4775, "train_runtime": 8216.2863, "train_tokens_per_second": 6519.799 }, { "epoch": 2.2937541249175015, "grad_norm": 0.7008303999900818, "learning_rate": 6.541214422604078e-06, "loss": 0.4903, "num_input_tokens_seen": 53623272, "step": 4780, "train_runtime": 8224.518, "train_tokens_per_second": 6519.929 }, { "epoch": 2.2961540769184614, "grad_norm": 0.7568659782409668, "learning_rate": 6.49891146537778e-06, "loss": 0.4665, "num_input_tokens_seen": 53680840, "step": 4785, "train_runtime": 8233.1619, "train_tokens_per_second": 6520.076 }, { "epoch": 2.2985540289194217, "grad_norm": 0.7729014158248901, "learning_rate": 6.456725296325511e-06, "loss": 0.4648, "num_input_tokens_seen": 53736888, "step": 4790, "train_runtime": 8241.812, "train_tokens_per_second": 6520.033 }, { "epoch": 2.3009539809203816, "grad_norm": 0.8767671585083008, "learning_rate": 6.414656181747578e-06, "loss": 0.4426, "num_input_tokens_seen": 53793888, "step": 4795, "train_runtime": 8250.382, "train_tokens_per_second": 6520.169 }, { "epoch": 2.3033539329213415, "grad_norm": 0.5542830228805542, "learning_rate": 6.3727043872053775e-06, "loss": 0.4942, "num_input_tokens_seen": 53853120, "step": 4800, "train_runtime": 8259.364, "train_tokens_per_second": 6520.25 }, { "epoch": 2.3057538849223014, "grad_norm": 0.677183985710144, "learning_rate": 6.330870177519749e-06, "loss": 0.4601, "num_input_tokens_seen": 53911008, "step": 4805, "train_runtime": 8268.0332, "train_tokens_per_second": 6520.415 }, { "epoch": 2.3081538369232617, "grad_norm": 0.6295929551124573, "learning_rate": 6.2891538167692525e-06, "loss": 0.4975, "num_input_tokens_seen": 53970856, "step": 4810, "train_runtime": 8276.673, "train_tokens_per_second": 6520.839 }, { "epoch": 2.3105537889242216, "grad_norm": 0.6823136806488037, "learning_rate": 6.247555568288524e-06, "loss": 0.5108, "num_input_tokens_seen": 54024760, "step": 4815, "train_runtime": 8284.8494, "train_tokens_per_second": 6520.91 }, { "epoch": 2.3129537409251815, "grad_norm": 1.1955187320709229, "learning_rate": 6.2060756946666385e-06, "loss": 0.4972, "num_input_tokens_seen": 54079992, "step": 4820, "train_runtime": 8293.4716, "train_tokens_per_second": 6520.791 }, { "epoch": 2.3153536929261413, "grad_norm": 0.5726960301399231, "learning_rate": 6.164714457745416e-06, "loss": 0.4765, "num_input_tokens_seen": 54137056, "step": 4825, "train_runtime": 8302.0452, "train_tokens_per_second": 6520.93 }, { "epoch": 2.3177536449271017, "grad_norm": 0.8014964461326599, "learning_rate": 6.123472118617779e-06, "loss": 0.502, "num_input_tokens_seen": 54187216, "step": 4830, "train_runtime": 8309.8904, "train_tokens_per_second": 6520.81 }, { "epoch": 2.3201535969280616, "grad_norm": 0.6722724437713623, "learning_rate": 6.082348937626103e-06, "loss": 0.5223, "num_input_tokens_seen": 54243408, "step": 4835, "train_runtime": 8318.422, "train_tokens_per_second": 6520.877 }, { "epoch": 2.3225535489290214, "grad_norm": 0.7219895720481873, "learning_rate": 6.041345174360602e-06, "loss": 0.4379, "num_input_tokens_seen": 54300888, "step": 4840, "train_runtime": 8327.0808, "train_tokens_per_second": 6520.999 }, { "epoch": 2.3249535009299813, "grad_norm": 0.5452620983123779, "learning_rate": 6.0004610876576385e-06, "loss": 0.425, "num_input_tokens_seen": 54359080, "step": 4845, "train_runtime": 8335.9015, "train_tokens_per_second": 6521.08 }, { "epoch": 2.327353452930941, "grad_norm": 0.7828608751296997, "learning_rate": 5.9596969355981165e-06, "loss": 0.4783, "num_input_tokens_seen": 54414784, "step": 4850, "train_runtime": 8343.8457, "train_tokens_per_second": 6521.547 }, { "epoch": 2.3297534049319015, "grad_norm": 0.7745143175125122, "learning_rate": 5.9190529755058786e-06, "loss": 0.4625, "num_input_tokens_seen": 54469544, "step": 4855, "train_runtime": 8352.9742, "train_tokens_per_second": 6520.976 }, { "epoch": 2.3321533569328614, "grad_norm": 0.7965600490570068, "learning_rate": 5.878529463946028e-06, "loss": 0.4517, "num_input_tokens_seen": 54525088, "step": 4860, "train_runtime": 8362.2759, "train_tokens_per_second": 6520.365 }, { "epoch": 2.3345533089338213, "grad_norm": 0.7234916090965271, "learning_rate": 5.838126656723353e-06, "loss": 0.4848, "num_input_tokens_seen": 54581656, "step": 4865, "train_runtime": 8372.4358, "train_tokens_per_second": 6519.209 }, { "epoch": 2.336953260934781, "grad_norm": 0.8496655225753784, "learning_rate": 5.797844808880681e-06, "loss": 0.4535, "num_input_tokens_seen": 54633656, "step": 4870, "train_runtime": 8381.8667, "train_tokens_per_second": 6518.077 }, { "epoch": 2.339353212935741, "grad_norm": 0.8986937999725342, "learning_rate": 5.757684174697306e-06, "loss": 0.5149, "num_input_tokens_seen": 54688552, "step": 4875, "train_runtime": 8392.2449, "train_tokens_per_second": 6516.558 }, { "epoch": 2.3417531649367014, "grad_norm": 0.8993620276451111, "learning_rate": 5.717645007687333e-06, "loss": 0.4811, "num_input_tokens_seen": 54745736, "step": 4880, "train_runtime": 8401.6978, "train_tokens_per_second": 6516.032 }, { "epoch": 2.3441531169376613, "grad_norm": 0.8470688462257385, "learning_rate": 5.677727560598117e-06, "loss": 0.4531, "num_input_tokens_seen": 54801056, "step": 4885, "train_runtime": 8411.9299, "train_tokens_per_second": 6514.683 }, { "epoch": 2.346553068938621, "grad_norm": 0.7177883982658386, "learning_rate": 5.637932085408665e-06, "loss": 0.428, "num_input_tokens_seen": 54862792, "step": 4890, "train_runtime": 8422.5464, "train_tokens_per_second": 6513.801 }, { "epoch": 2.348953020939581, "grad_norm": 0.9984344840049744, "learning_rate": 5.598258833328024e-06, "loss": 0.5082, "num_input_tokens_seen": 54917120, "step": 4895, "train_runtime": 8432.5181, "train_tokens_per_second": 6512.541 }, { "epoch": 2.3513529729405414, "grad_norm": 0.7532204985618591, "learning_rate": 5.558708054793702e-06, "loss": 0.4747, "num_input_tokens_seen": 54970952, "step": 4900, "train_runtime": 8442.5045, "train_tokens_per_second": 6511.214 }, { "epoch": 2.3537529249415012, "grad_norm": 0.9301844835281372, "learning_rate": 5.519279999470114e-06, "loss": 0.4653, "num_input_tokens_seen": 55030344, "step": 4905, "train_runtime": 8453.3379, "train_tokens_per_second": 6509.895 }, { "epoch": 2.356152876942461, "grad_norm": 0.7001831531524658, "learning_rate": 5.47997491624696e-06, "loss": 0.4505, "num_input_tokens_seen": 55089240, "step": 4910, "train_runtime": 8463.8354, "train_tokens_per_second": 6508.78 }, { "epoch": 2.358552828943421, "grad_norm": 1.0007083415985107, "learning_rate": 5.440793053237703e-06, "loss": 0.4951, "num_input_tokens_seen": 55145288, "step": 4915, "train_runtime": 8472.6075, "train_tokens_per_second": 6508.656 }, { "epoch": 2.3609527809443813, "grad_norm": 0.807292103767395, "learning_rate": 5.401734657777949e-06, "loss": 0.4555, "num_input_tokens_seen": 55202104, "step": 4920, "train_runtime": 8481.8958, "train_tokens_per_second": 6508.227 }, { "epoch": 2.363352732945341, "grad_norm": 0.8415015339851379, "learning_rate": 5.362799976423946e-06, "loss": 0.4936, "num_input_tokens_seen": 55259704, "step": 4925, "train_runtime": 8490.9011, "train_tokens_per_second": 6508.108 }, { "epoch": 2.365752684946301, "grad_norm": 0.6624288558959961, "learning_rate": 5.323989254950973e-06, "loss": 0.4645, "num_input_tokens_seen": 55317744, "step": 4930, "train_runtime": 8500.053, "train_tokens_per_second": 6507.929 }, { "epoch": 2.368152636947261, "grad_norm": 0.8374559283256531, "learning_rate": 5.285302738351813e-06, "loss": 0.4797, "num_input_tokens_seen": 55372296, "step": 4935, "train_runtime": 8507.8541, "train_tokens_per_second": 6508.374 }, { "epoch": 2.370552588948221, "grad_norm": 0.5884356498718262, "learning_rate": 5.246740670835227e-06, "loss": 0.4606, "num_input_tokens_seen": 55433904, "step": 4940, "train_runtime": 8517.3387, "train_tokens_per_second": 6508.36 }, { "epoch": 2.372952540949181, "grad_norm": 0.7946999669075012, "learning_rate": 5.208303295824368e-06, "loss": 0.4901, "num_input_tokens_seen": 55489480, "step": 4945, "train_runtime": 8525.7706, "train_tokens_per_second": 6508.442 }, { "epoch": 2.375352492950141, "grad_norm": 0.8008665442466736, "learning_rate": 5.16999085595527e-06, "loss": 0.4489, "num_input_tokens_seen": 55548432, "step": 4950, "train_runtime": 8534.5861, "train_tokens_per_second": 6508.626 }, { "epoch": 2.377752444951101, "grad_norm": 0.6131346225738525, "learning_rate": 5.1318035930753295e-06, "loss": 0.4751, "num_input_tokens_seen": 55606952, "step": 4955, "train_runtime": 8544.472, "train_tokens_per_second": 6507.945 }, { "epoch": 2.380152396952061, "grad_norm": 0.6987022757530212, "learning_rate": 5.09374174824174e-06, "loss": 0.4716, "num_input_tokens_seen": 55665912, "step": 4960, "train_runtime": 8553.5875, "train_tokens_per_second": 6507.902 }, { "epoch": 2.3825523489530207, "grad_norm": 0.9554920792579651, "learning_rate": 5.0558055617200205e-06, "loss": 0.4208, "num_input_tokens_seen": 55719624, "step": 4965, "train_runtime": 8561.4317, "train_tokens_per_second": 6508.213 }, { "epoch": 2.384952300953981, "grad_norm": 0.7300603985786438, "learning_rate": 5.0179952729824395e-06, "loss": 0.4832, "num_input_tokens_seen": 55774472, "step": 4970, "train_runtime": 8570.1123, "train_tokens_per_second": 6508.021 }, { "epoch": 2.387352252954941, "grad_norm": 0.8243890404701233, "learning_rate": 4.980311120706569e-06, "loss": 0.5135, "num_input_tokens_seen": 55826392, "step": 4975, "train_runtime": 8578.1037, "train_tokens_per_second": 6508.011 }, { "epoch": 2.389752204955901, "grad_norm": 0.7249002456665039, "learning_rate": 4.942753342773718e-06, "loss": 0.5443, "num_input_tokens_seen": 55880968, "step": 4980, "train_runtime": 8586.3873, "train_tokens_per_second": 6508.088 }, { "epoch": 2.3921521569568607, "grad_norm": 0.883586585521698, "learning_rate": 4.90532217626746e-06, "loss": 0.4719, "num_input_tokens_seen": 55933504, "step": 4985, "train_runtime": 8594.9139, "train_tokens_per_second": 6507.745 }, { "epoch": 2.394552108957821, "grad_norm": 0.9183365702629089, "learning_rate": 4.868017857472157e-06, "loss": 0.4971, "num_input_tokens_seen": 55986736, "step": 4990, "train_runtime": 8603.0537, "train_tokens_per_second": 6507.775 }, { "epoch": 2.396952060958781, "grad_norm": 0.9093974232673645, "learning_rate": 4.830840621871416e-06, "loss": 0.471, "num_input_tokens_seen": 56042472, "step": 4995, "train_runtime": 8612.3964, "train_tokens_per_second": 6507.187 }, { "epoch": 2.3993520129597408, "grad_norm": 0.8658146858215332, "learning_rate": 4.793790704146639e-06, "loss": 0.5096, "num_input_tokens_seen": 56094608, "step": 5000, "train_runtime": 8620.6801, "train_tokens_per_second": 6506.982 }, { "epoch": 2.4017519649607006, "grad_norm": 0.881760835647583, "learning_rate": 4.756868338175552e-06, "loss": 0.4545, "num_input_tokens_seen": 56152192, "step": 5005, "train_runtime": 8628.9033, "train_tokens_per_second": 6507.454 }, { "epoch": 2.404151916961661, "grad_norm": 0.6396927833557129, "learning_rate": 4.7200737570306765e-06, "loss": 0.482, "num_input_tokens_seen": 56209072, "step": 5010, "train_runtime": 8637.3318, "train_tokens_per_second": 6507.689 }, { "epoch": 2.406551868962621, "grad_norm": 0.7207968831062317, "learning_rate": 4.683407192977923e-06, "loss": 0.4701, "num_input_tokens_seen": 56265496, "step": 5015, "train_runtime": 8645.9013, "train_tokens_per_second": 6507.765 }, { "epoch": 2.4089518209635807, "grad_norm": 0.6970353126525879, "learning_rate": 4.646868877475083e-06, "loss": 0.4906, "num_input_tokens_seen": 56324336, "step": 5020, "train_runtime": 8654.8609, "train_tokens_per_second": 6507.827 }, { "epoch": 2.4113517729645406, "grad_norm": 0.6664267182350159, "learning_rate": 4.610459041170376e-06, "loss": 0.4497, "num_input_tokens_seen": 56387160, "step": 5025, "train_runtime": 8664.2456, "train_tokens_per_second": 6508.029 }, { "epoch": 2.4137517249655005, "grad_norm": 0.6361657977104187, "learning_rate": 4.574177913900992e-06, "loss": 0.4473, "num_input_tokens_seen": 56450040, "step": 5030, "train_runtime": 8672.7824, "train_tokens_per_second": 6508.873 }, { "epoch": 2.416151676966461, "grad_norm": 0.9782693386077881, "learning_rate": 4.538025724691647e-06, "loss": 0.5403, "num_input_tokens_seen": 56509192, "step": 5035, "train_runtime": 8680.8979, "train_tokens_per_second": 6509.602 }, { "epoch": 2.4185516289674207, "grad_norm": 1.0109143257141113, "learning_rate": 4.502002701753149e-06, "loss": 0.4535, "num_input_tokens_seen": 56564168, "step": 5040, "train_runtime": 8689.3056, "train_tokens_per_second": 6509.63 }, { "epoch": 2.4209515809683806, "grad_norm": 0.8760951161384583, "learning_rate": 4.4661090724809286e-06, "loss": 0.4666, "num_input_tokens_seen": 56619720, "step": 5045, "train_runtime": 8698.0152, "train_tokens_per_second": 6509.499 }, { "epoch": 2.4233515329693405, "grad_norm": 0.879936933517456, "learning_rate": 4.430345063453614e-06, "loss": 0.4685, "num_input_tokens_seen": 56674064, "step": 5050, "train_runtime": 8707.0335, "train_tokens_per_second": 6508.998 }, { "epoch": 2.4257514849703004, "grad_norm": 0.5749469995498657, "learning_rate": 4.394710900431628e-06, "loss": 0.5077, "num_input_tokens_seen": 56730176, "step": 5055, "train_runtime": 8715.7157, "train_tokens_per_second": 6508.952 }, { "epoch": 2.4281514369712607, "grad_norm": 0.670002818107605, "learning_rate": 4.359206808355715e-06, "loss": 0.4711, "num_input_tokens_seen": 56786912, "step": 5060, "train_runtime": 8724.2214, "train_tokens_per_second": 6509.109 }, { "epoch": 2.4305513889722206, "grad_norm": 0.8267392516136169, "learning_rate": 4.32383301134556e-06, "loss": 0.468, "num_input_tokens_seen": 56846864, "step": 5065, "train_runtime": 8733.1875, "train_tokens_per_second": 6509.292 }, { "epoch": 2.4329513409731804, "grad_norm": 0.9042259454727173, "learning_rate": 4.288589732698365e-06, "loss": 0.4722, "num_input_tokens_seen": 56903624, "step": 5070, "train_runtime": 8741.802, "train_tokens_per_second": 6509.37 }, { "epoch": 2.4353512929741403, "grad_norm": 0.9303114414215088, "learning_rate": 4.253477194887423e-06, "loss": 0.4879, "num_input_tokens_seen": 56961168, "step": 5075, "train_runtime": 8750.9039, "train_tokens_per_second": 6509.175 }, { "epoch": 2.4377512449751007, "grad_norm": 0.8733497858047485, "learning_rate": 4.218495619560725e-06, "loss": 0.4762, "num_input_tokens_seen": 57017760, "step": 5080, "train_runtime": 8759.4851, "train_tokens_per_second": 6509.259 }, { "epoch": 2.4401511969760605, "grad_norm": 0.8203326463699341, "learning_rate": 4.1836452275395624e-06, "loss": 0.4934, "num_input_tokens_seen": 57072760, "step": 5085, "train_runtime": 8768.1106, "train_tokens_per_second": 6509.129 }, { "epoch": 2.4425511489770204, "grad_norm": 1.0363794565200806, "learning_rate": 4.148926238817141e-06, "loss": 0.4518, "num_input_tokens_seen": 57128592, "step": 5090, "train_runtime": 8776.3031, "train_tokens_per_second": 6509.414 }, { "epoch": 2.4449511009779803, "grad_norm": 0.9167368412017822, "learning_rate": 4.114338872557175e-06, "loss": 0.4542, "num_input_tokens_seen": 57184720, "step": 5095, "train_runtime": 8784.8429, "train_tokens_per_second": 6509.476 }, { "epoch": 2.4473510529789406, "grad_norm": 0.662429928779602, "learning_rate": 4.079883347092506e-06, "loss": 0.4811, "num_input_tokens_seen": 57248888, "step": 5100, "train_runtime": 8794.2311, "train_tokens_per_second": 6509.823 }, { "epoch": 2.4497510049799005, "grad_norm": 0.6756502389907837, "learning_rate": 4.045559879923747e-06, "loss": 0.454, "num_input_tokens_seen": 57307744, "step": 5105, "train_runtime": 8803.0414, "train_tokens_per_second": 6509.994 }, { "epoch": 2.4521509569808604, "grad_norm": 0.7121127843856812, "learning_rate": 4.011368687717867e-06, "loss": 0.4506, "num_input_tokens_seen": 57363824, "step": 5110, "train_runtime": 8811.1922, "train_tokens_per_second": 6510.336 }, { "epoch": 2.4545509089818203, "grad_norm": 0.764569878578186, "learning_rate": 3.977309986306874e-06, "loss": 0.4614, "num_input_tokens_seen": 57422952, "step": 5115, "train_runtime": 8819.6634, "train_tokens_per_second": 6510.787 }, { "epoch": 2.45695086098278, "grad_norm": 0.9439240097999573, "learning_rate": 3.943383990686425e-06, "loss": 0.5036, "num_input_tokens_seen": 57475568, "step": 5120, "train_runtime": 8827.7896, "train_tokens_per_second": 6510.754 }, { "epoch": 2.4593508129837405, "grad_norm": 0.7676842212677002, "learning_rate": 3.909590915014455e-06, "loss": 0.4741, "num_input_tokens_seen": 57533000, "step": 5125, "train_runtime": 8836.2004, "train_tokens_per_second": 6511.056 }, { "epoch": 2.4617507649847004, "grad_norm": 0.7224127054214478, "learning_rate": 3.875930972609851e-06, "loss": 0.4555, "num_input_tokens_seen": 57591416, "step": 5130, "train_runtime": 8844.7508, "train_tokens_per_second": 6511.367 }, { "epoch": 2.4641507169856602, "grad_norm": 0.8699045777320862, "learning_rate": 3.842404375951089e-06, "loss": 0.4948, "num_input_tokens_seen": 57648120, "step": 5135, "train_runtime": 8853.2169, "train_tokens_per_second": 6511.545 }, { "epoch": 2.46655066898662, "grad_norm": 0.8307254910469055, "learning_rate": 3.809011336674917e-06, "loss": 0.4747, "num_input_tokens_seen": 57705096, "step": 5140, "train_runtime": 8861.9212, "train_tokens_per_second": 6511.579 }, { "epoch": 2.46895062098758, "grad_norm": 1.0947297811508179, "learning_rate": 3.7757520655749863e-06, "loss": 0.4711, "num_input_tokens_seen": 57760000, "step": 5145, "train_runtime": 8870.5168, "train_tokens_per_second": 6511.458 }, { "epoch": 2.4713505729885403, "grad_norm": 0.6444729566574097, "learning_rate": 3.7426267726005354e-06, "loss": 0.4566, "num_input_tokens_seen": 57814992, "step": 5150, "train_runtime": 8879.2323, "train_tokens_per_second": 6511.26 }, { "epoch": 2.4737505249895, "grad_norm": 0.7921139001846313, "learning_rate": 3.709635666855077e-06, "loss": 0.4552, "num_input_tokens_seen": 57870400, "step": 5155, "train_runtime": 8888.1359, "train_tokens_per_second": 6510.972 }, { "epoch": 2.47615047699046, "grad_norm": 0.6223105192184448, "learning_rate": 3.6767789565950563e-06, "loss": 0.425, "num_input_tokens_seen": 57932208, "step": 5160, "train_runtime": 8896.7689, "train_tokens_per_second": 6511.601 }, { "epoch": 2.4785504289914204, "grad_norm": 0.7725955843925476, "learning_rate": 3.64405684922855e-06, "loss": 0.4413, "num_input_tokens_seen": 57989280, "step": 5165, "train_runtime": 8905.0042, "train_tokens_per_second": 6511.988 }, { "epoch": 2.4809503809923803, "grad_norm": 0.7563416361808777, "learning_rate": 3.611469551313959e-06, "loss": 0.521, "num_input_tokens_seen": 58045968, "step": 5170, "train_runtime": 8913.261, "train_tokens_per_second": 6512.316 }, { "epoch": 2.48335033299334, "grad_norm": 0.7822843790054321, "learning_rate": 3.579017268558693e-06, "loss": 0.4989, "num_input_tokens_seen": 58098536, "step": 5175, "train_runtime": 8920.913, "train_tokens_per_second": 6512.622 }, { "epoch": 2.4857502849943, "grad_norm": 0.80488520860672, "learning_rate": 3.5467002058178764e-06, "loss": 0.498, "num_input_tokens_seen": 58153656, "step": 5180, "train_runtime": 8929.6199, "train_tokens_per_second": 6512.445 }, { "epoch": 2.48815023699526, "grad_norm": 0.7986950278282166, "learning_rate": 3.514518567093056e-06, "loss": 0.4513, "num_input_tokens_seen": 58208960, "step": 5185, "train_runtime": 8938.3362, "train_tokens_per_second": 6512.281 }, { "epoch": 2.4905501889962203, "grad_norm": 0.7876197695732117, "learning_rate": 3.4824725555309272e-06, "loss": 0.4757, "num_input_tokens_seen": 58268880, "step": 5190, "train_runtime": 8946.6352, "train_tokens_per_second": 6512.938 }, { "epoch": 2.49295014099718, "grad_norm": 0.8735581040382385, "learning_rate": 3.4505623734220226e-06, "loss": 0.4926, "num_input_tokens_seen": 58323184, "step": 5195, "train_runtime": 8954.4183, "train_tokens_per_second": 6513.341 }, { "epoch": 2.49535009299814, "grad_norm": 0.8230021595954895, "learning_rate": 3.4187882221994564e-06, "loss": 0.5169, "num_input_tokens_seen": 58379592, "step": 5200, "train_runtime": 8962.9041, "train_tokens_per_second": 6513.468 }, { "epoch": 2.4977500449991, "grad_norm": 0.9317114353179932, "learning_rate": 3.3871503024376554e-06, "loss": 0.4625, "num_input_tokens_seen": 58439472, "step": 5205, "train_runtime": 8971.3456, "train_tokens_per_second": 6514.014 }, { "epoch": 2.50014999700006, "grad_norm": 0.889101505279541, "learning_rate": 3.3556488138510674e-06, "loss": 0.4478, "num_input_tokens_seen": 58498776, "step": 5210, "train_runtime": 8980.13, "train_tokens_per_second": 6514.246 }, { "epoch": 2.50254994900102, "grad_norm": 0.5332804322242737, "learning_rate": 3.3242839552929366e-06, "loss": 0.4552, "num_input_tokens_seen": 58559344, "step": 5215, "train_runtime": 8988.8739, "train_tokens_per_second": 6514.647 }, { "epoch": 2.50494990100198, "grad_norm": 0.9555898308753967, "learning_rate": 3.2930559247540267e-06, "loss": 0.4537, "num_input_tokens_seen": 58614416, "step": 5220, "train_runtime": 8997.7825, "train_tokens_per_second": 6514.318 }, { "epoch": 2.50734985300294, "grad_norm": 1.1382311582565308, "learning_rate": 3.2619649193613626e-06, "loss": 0.5041, "num_input_tokens_seen": 58667216, "step": 5225, "train_runtime": 9006.2657, "train_tokens_per_second": 6514.045 }, { "epoch": 2.5097498050038998, "grad_norm": 1.1261781454086304, "learning_rate": 3.2310111353770045e-06, "loss": 0.5123, "num_input_tokens_seen": 58722648, "step": 5230, "train_runtime": 9014.5273, "train_tokens_per_second": 6514.224 }, { "epoch": 2.5121497570048597, "grad_norm": 0.6339508295059204, "learning_rate": 3.2001947681967987e-06, "loss": 0.466, "num_input_tokens_seen": 58780640, "step": 5235, "train_runtime": 9023.7118, "train_tokens_per_second": 6514.02 }, { "epoch": 2.51454970900582, "grad_norm": 0.8819341659545898, "learning_rate": 3.169516012349161e-06, "loss": 0.4855, "num_input_tokens_seen": 58839080, "step": 5240, "train_runtime": 9032.9027, "train_tokens_per_second": 6513.862 }, { "epoch": 2.51694966100678, "grad_norm": 0.8198482394218445, "learning_rate": 3.138975061493815e-06, "loss": 0.5462, "num_input_tokens_seen": 58888056, "step": 5245, "train_runtime": 9041.1086, "train_tokens_per_second": 6513.367 }, { "epoch": 2.5193496130077397, "grad_norm": 0.7308799028396606, "learning_rate": 3.1085721084205987e-06, "loss": 0.4879, "num_input_tokens_seen": 58948912, "step": 5250, "train_runtime": 9049.9278, "train_tokens_per_second": 6513.744 }, { "epoch": 2.5217495650087, "grad_norm": 0.7503857612609863, "learning_rate": 3.078307345048251e-06, "loss": 0.434, "num_input_tokens_seen": 59005656, "step": 5255, "train_runtime": 9058.4522, "train_tokens_per_second": 6513.878 }, { "epoch": 2.52414951700966, "grad_norm": 0.7755120992660522, "learning_rate": 3.0481809624231667e-06, "loss": 0.4226, "num_input_tokens_seen": 59064880, "step": 5260, "train_runtime": 9067.2632, "train_tokens_per_second": 6514.08 }, { "epoch": 2.52654946901062, "grad_norm": 0.7984574437141418, "learning_rate": 3.018193150718224e-06, "loss": 0.4881, "num_input_tokens_seen": 59122920, "step": 5265, "train_runtime": 9075.8636, "train_tokens_per_second": 6514.302 }, { "epoch": 2.5289494210115797, "grad_norm": 0.7857392430305481, "learning_rate": 2.9883440992315744e-06, "loss": 0.4949, "num_input_tokens_seen": 59180768, "step": 5270, "train_runtime": 9084.2259, "train_tokens_per_second": 6514.674 }, { "epoch": 2.5313493730125396, "grad_norm": 0.7636000514030457, "learning_rate": 2.9586339963854402e-06, "loss": 0.4584, "num_input_tokens_seen": 59236392, "step": 5275, "train_runtime": 9093.425, "train_tokens_per_second": 6514.2 }, { "epoch": 2.5337493250135, "grad_norm": 0.7404913306236267, "learning_rate": 2.929063029724924e-06, "loss": 0.5001, "num_input_tokens_seen": 59288152, "step": 5280, "train_runtime": 9101.2939, "train_tokens_per_second": 6514.255 }, { "epoch": 2.53614927701446, "grad_norm": 0.8310667872428894, "learning_rate": 2.8996313859168373e-06, "loss": 0.4752, "num_input_tokens_seen": 59350448, "step": 5285, "train_runtime": 9109.697, "train_tokens_per_second": 6515.085 }, { "epoch": 2.5385492290154197, "grad_norm": 0.7058178782463074, "learning_rate": 2.8703392507485244e-06, "loss": 0.5058, "num_input_tokens_seen": 59405224, "step": 5290, "train_runtime": 9118.1859, "train_tokens_per_second": 6515.027 }, { "epoch": 2.5409491810163796, "grad_norm": 0.9837594628334045, "learning_rate": 2.8411868091266614e-06, "loss": 0.5101, "num_input_tokens_seen": 59459408, "step": 5295, "train_runtime": 9125.7939, "train_tokens_per_second": 6515.533 }, { "epoch": 2.5433491330173394, "grad_norm": 0.749136745929718, "learning_rate": 2.812174245076121e-06, "loss": 0.4509, "num_input_tokens_seen": 59519864, "step": 5300, "train_runtime": 9134.5564, "train_tokens_per_second": 6515.901 }, { "epoch": 2.5457490850182998, "grad_norm": 0.8679369688034058, "learning_rate": 2.783301741738803e-06, "loss": 0.5337, "num_input_tokens_seen": 59575648, "step": 5305, "train_runtime": 9142.5914, "train_tokens_per_second": 6516.276 }, { "epoch": 2.5481490370192597, "grad_norm": 0.7311270833015442, "learning_rate": 2.75456948137246e-06, "loss": 0.4446, "num_input_tokens_seen": 59631568, "step": 5310, "train_runtime": 9150.8949, "train_tokens_per_second": 6516.474 }, { "epoch": 2.5505489890202195, "grad_norm": 0.9072261452674866, "learning_rate": 2.725977645349567e-06, "loss": 0.4515, "num_input_tokens_seen": 59688168, "step": 5315, "train_runtime": 9158.8503, "train_tokens_per_second": 6516.993 }, { "epoch": 2.5529489410211794, "grad_norm": 0.7925878763198853, "learning_rate": 2.6975264141561792e-06, "loss": 0.4743, "num_input_tokens_seen": 59750784, "step": 5320, "train_runtime": 9167.7914, "train_tokens_per_second": 6517.468 }, { "epoch": 2.5553488930221393, "grad_norm": 0.7712064981460571, "learning_rate": 2.6692159673907674e-06, "loss": 0.4835, "num_input_tokens_seen": 59804776, "step": 5325, "train_runtime": 9176.5665, "train_tokens_per_second": 6517.119 }, { "epoch": 2.5577488450230996, "grad_norm": 0.9932171106338501, "learning_rate": 2.641046483763107e-06, "loss": 0.4954, "num_input_tokens_seen": 59862336, "step": 5330, "train_runtime": 9184.9522, "train_tokens_per_second": 6517.436 }, { "epoch": 2.5601487970240595, "grad_norm": 0.8807353377342224, "learning_rate": 2.613018141093143e-06, "loss": 0.5017, "num_input_tokens_seen": 59920072, "step": 5335, "train_runtime": 9193.9014, "train_tokens_per_second": 6517.372 }, { "epoch": 2.5625487490250194, "grad_norm": 0.7849051356315613, "learning_rate": 2.585131116309872e-06, "loss": 0.4951, "num_input_tokens_seen": 59975568, "step": 5340, "train_runtime": 9202.2095, "train_tokens_per_second": 6517.518 }, { "epoch": 2.5649487010259797, "grad_norm": 0.5779772400856018, "learning_rate": 2.557385585450217e-06, "loss": 0.4706, "num_input_tokens_seen": 60036392, "step": 5345, "train_runtime": 9211.2288, "train_tokens_per_second": 6517.74 }, { "epoch": 2.5673486530269396, "grad_norm": 0.9567521810531616, "learning_rate": 2.529781723657915e-06, "loss": 0.4893, "num_input_tokens_seen": 60093024, "step": 5350, "train_runtime": 9220.1795, "train_tokens_per_second": 6517.555 }, { "epoch": 2.5697486050278995, "grad_norm": 0.7940301299095154, "learning_rate": 2.5023197051824267e-06, "loss": 0.5055, "num_input_tokens_seen": 60144920, "step": 5355, "train_runtime": 9228.2311, "train_tokens_per_second": 6517.492 }, { "epoch": 2.5721485570288594, "grad_norm": 0.9344842433929443, "learning_rate": 2.4749997033778228e-06, "loss": 0.5167, "num_input_tokens_seen": 60203224, "step": 5360, "train_runtime": 9236.6101, "train_tokens_per_second": 6517.892 }, { "epoch": 2.5745485090298192, "grad_norm": 0.9174864888191223, "learning_rate": 2.4478218907016877e-06, "loss": 0.4896, "num_input_tokens_seen": 60259032, "step": 5365, "train_runtime": 9245.2879, "train_tokens_per_second": 6517.81 }, { "epoch": 2.5769484610307796, "grad_norm": 0.9624903798103333, "learning_rate": 2.4207864387140512e-06, "loss": 0.5132, "num_input_tokens_seen": 60308024, "step": 5370, "train_runtime": 9253.8315, "train_tokens_per_second": 6517.087 }, { "epoch": 2.5793484130317395, "grad_norm": 0.6800229549407959, "learning_rate": 2.3938935180762707e-06, "loss": 0.5086, "num_input_tokens_seen": 60362552, "step": 5375, "train_runtime": 9261.5584, "train_tokens_per_second": 6517.537 }, { "epoch": 2.5817483650326993, "grad_norm": 0.9939396977424622, "learning_rate": 2.36714329854999e-06, "loss": 0.5001, "num_input_tokens_seen": 60415520, "step": 5380, "train_runtime": 9269.7261, "train_tokens_per_second": 6517.509 }, { "epoch": 2.584148317033659, "grad_norm": 0.7869457602500916, "learning_rate": 2.3405359489960365e-06, "loss": 0.493, "num_input_tokens_seen": 60469016, "step": 5385, "train_runtime": 9277.4328, "train_tokens_per_second": 6517.861 }, { "epoch": 2.586548269034619, "grad_norm": 0.8779625296592712, "learning_rate": 2.314071637373394e-06, "loss": 0.537, "num_input_tokens_seen": 60528736, "step": 5390, "train_runtime": 9286.7608, "train_tokens_per_second": 6517.745 }, { "epoch": 2.5889482210355794, "grad_norm": 0.9168468713760376, "learning_rate": 2.2877505307380976e-06, "loss": 0.5101, "num_input_tokens_seen": 60585352, "step": 5395, "train_runtime": 9294.8068, "train_tokens_per_second": 6518.194 }, { "epoch": 2.5913481730365393, "grad_norm": 0.7564955353736877, "learning_rate": 2.2615727952422033e-06, "loss": 0.4426, "num_input_tokens_seen": 60645192, "step": 5400, "train_runtime": 9303.554, "train_tokens_per_second": 6518.497 }, { "epoch": 2.593748125037499, "grad_norm": 0.823637843132019, "learning_rate": 2.235538596132747e-06, "loss": 0.4401, "num_input_tokens_seen": 60705872, "step": 5405, "train_runtime": 9314.3874, "train_tokens_per_second": 6517.43 }, { "epoch": 2.596148077038459, "grad_norm": 0.5428220629692078, "learning_rate": 2.2096480977506883e-06, "loss": 0.466, "num_input_tokens_seen": 60766448, "step": 5410, "train_runtime": 9324.731, "train_tokens_per_second": 6516.697 }, { "epoch": 2.598548029039419, "grad_norm": 1.0644038915634155, "learning_rate": 2.183901463529861e-06, "loss": 0.4647, "num_input_tokens_seen": 60820832, "step": 5415, "train_runtime": 9335.3113, "train_tokens_per_second": 6515.137 }, { "epoch": 2.6009479810403793, "grad_norm": 0.7919825315475464, "learning_rate": 2.1582988559959773e-06, "loss": 0.4435, "num_input_tokens_seen": 60879048, "step": 5420, "train_runtime": 9346.1879, "train_tokens_per_second": 6513.784 }, { "epoch": 2.603347933041339, "grad_norm": 1.047285556793213, "learning_rate": 2.132840436765568e-06, "loss": 0.4641, "num_input_tokens_seen": 60927720, "step": 5425, "train_runtime": 9355.613, "train_tokens_per_second": 6512.424 }, { "epoch": 2.605747885042299, "grad_norm": 0.9616097211837769, "learning_rate": 2.1075263665449737e-06, "loss": 0.4677, "num_input_tokens_seen": 60981576, "step": 5430, "train_runtime": 9365.6809, "train_tokens_per_second": 6511.174 }, { "epoch": 2.6081478370432594, "grad_norm": 0.9964049458503723, "learning_rate": 2.082356805129332e-06, "loss": 0.4929, "num_input_tokens_seen": 61039448, "step": 5435, "train_runtime": 9376.1343, "train_tokens_per_second": 6510.087 }, { "epoch": 2.6105477890442192, "grad_norm": 0.8985645174980164, "learning_rate": 2.0573319114015775e-06, "loss": 0.4886, "num_input_tokens_seen": 61093640, "step": 5440, "train_runtime": 9386.3154, "train_tokens_per_second": 6508.799 }, { "epoch": 2.612947741045179, "grad_norm": 0.7488046884536743, "learning_rate": 2.0324518433314206e-06, "loss": 0.4697, "num_input_tokens_seen": 61149808, "step": 5445, "train_runtime": 9396.5128, "train_tokens_per_second": 6507.713 }, { "epoch": 2.615347693046139, "grad_norm": 0.7769824862480164, "learning_rate": 2.0077167579743593e-06, "loss": 0.4645, "num_input_tokens_seen": 61206176, "step": 5450, "train_runtime": 9406.5758, "train_tokens_per_second": 6506.743 }, { "epoch": 2.617747645047099, "grad_norm": 0.7720673084259033, "learning_rate": 1.9831268114706925e-06, "loss": 0.4667, "num_input_tokens_seen": 61266712, "step": 5455, "train_runtime": 9417.2442, "train_tokens_per_second": 6505.8 }, { "epoch": 2.620147597048059, "grad_norm": 0.7182523012161255, "learning_rate": 1.958682159044531e-06, "loss": 0.4644, "num_input_tokens_seen": 61319856, "step": 5460, "train_runtime": 9426.6437, "train_tokens_per_second": 6504.951 }, { "epoch": 2.622547549049019, "grad_norm": 0.8977944850921631, "learning_rate": 1.934382955002803e-06, "loss": 0.5007, "num_input_tokens_seen": 61377048, "step": 5465, "train_runtime": 9437.2729, "train_tokens_per_second": 6503.685 }, { "epoch": 2.624947501049979, "grad_norm": 0.7803311347961426, "learning_rate": 1.9102293527343163e-06, "loss": 0.4658, "num_input_tokens_seen": 61434248, "step": 5470, "train_runtime": 9448.0138, "train_tokens_per_second": 6502.345 }, { "epoch": 2.627347453050939, "grad_norm": 0.72231125831604, "learning_rate": 1.886221504708746e-06, "loss": 0.4968, "num_input_tokens_seen": 61494600, "step": 5475, "train_runtime": 9459.0534, "train_tokens_per_second": 6501.137 }, { "epoch": 2.6297474050518987, "grad_norm": 0.5621334314346313, "learning_rate": 1.8623595624757045e-06, "loss": 0.4606, "num_input_tokens_seen": 61555232, "step": 5480, "train_runtime": 9469.3682, "train_tokens_per_second": 6500.458 }, { "epoch": 2.632147357052859, "grad_norm": 0.6386857628822327, "learning_rate": 1.8386436766637593e-06, "loss": 0.4647, "num_input_tokens_seen": 61610480, "step": 5485, "train_runtime": 9479.3329, "train_tokens_per_second": 6499.453 }, { "epoch": 2.634547309053819, "grad_norm": 0.6079943776130676, "learning_rate": 1.8150739969795245e-06, "loss": 0.4742, "num_input_tokens_seen": 61666936, "step": 5490, "train_runtime": 9489.1199, "train_tokens_per_second": 6498.699 }, { "epoch": 2.636947261054779, "grad_norm": 0.6471970677375793, "learning_rate": 1.7916506722066573e-06, "loss": 0.5121, "num_input_tokens_seen": 61723152, "step": 5495, "train_runtime": 9498.3327, "train_tokens_per_second": 6498.314 }, { "epoch": 2.639347213055739, "grad_norm": 0.8927129507064819, "learning_rate": 1.7683738502049658e-06, "loss": 0.5282, "num_input_tokens_seen": 61779792, "step": 5500, "train_runtime": 9508.4194, "train_tokens_per_second": 6497.378 }, { "epoch": 2.6417471650566986, "grad_norm": 0.9175587296485901, "learning_rate": 1.7452436779094527e-06, "loss": 0.5226, "num_input_tokens_seen": 61837696, "step": 5505, "train_runtime": 9518.2144, "train_tokens_per_second": 6496.775 }, { "epoch": 2.644147117057659, "grad_norm": 0.6489665508270264, "learning_rate": 1.7222603013294036e-06, "loss": 0.4645, "num_input_tokens_seen": 61896032, "step": 5510, "train_runtime": 9528.6748, "train_tokens_per_second": 6495.765 }, { "epoch": 2.646547069058619, "grad_norm": 0.8270627856254578, "learning_rate": 1.6994238655474394e-06, "loss": 0.4943, "num_input_tokens_seen": 61949384, "step": 5515, "train_runtime": 9538.4414, "train_tokens_per_second": 6494.707 }, { "epoch": 2.6489470210595787, "grad_norm": 0.7798356413841248, "learning_rate": 1.6767345147186336e-06, "loss": 0.5109, "num_input_tokens_seen": 62002592, "step": 5520, "train_runtime": 9548.0079, "train_tokens_per_second": 6493.773 }, { "epoch": 2.651346973060539, "grad_norm": 0.8514456748962402, "learning_rate": 1.6541923920695756e-06, "loss": 0.4477, "num_input_tokens_seen": 62055040, "step": 5525, "train_runtime": 9558.322, "train_tokens_per_second": 6492.253 }, { "epoch": 2.653746925061499, "grad_norm": 1.0111453533172607, "learning_rate": 1.6317976398974782e-06, "loss": 0.5174, "num_input_tokens_seen": 62109976, "step": 5530, "train_runtime": 9567.8838, "train_tokens_per_second": 6491.506 }, { "epoch": 2.6561468770624588, "grad_norm": 0.702575147151947, "learning_rate": 1.6095503995692762e-06, "loss": 0.4668, "num_input_tokens_seen": 62167376, "step": 5535, "train_runtime": 9577.5036, "train_tokens_per_second": 6490.979 }, { "epoch": 2.6585468290634187, "grad_norm": 0.8962842226028442, "learning_rate": 1.5874508115207408e-06, "loss": 0.4676, "num_input_tokens_seen": 62221488, "step": 5540, "train_runtime": 9587.9271, "train_tokens_per_second": 6489.566 }, { "epoch": 2.6609467810643785, "grad_norm": 0.7158124446868896, "learning_rate": 1.5654990152555837e-06, "loss": 0.4947, "num_input_tokens_seen": 62277176, "step": 5545, "train_runtime": 9597.969, "train_tokens_per_second": 6488.579 }, { "epoch": 2.663346733065339, "grad_norm": 1.1132010221481323, "learning_rate": 1.5436951493445762e-06, "loss": 0.4875, "num_input_tokens_seen": 62330544, "step": 5550, "train_runtime": 9607.4993, "train_tokens_per_second": 6487.697 }, { "epoch": 2.6657466850662987, "grad_norm": 0.8258331418037415, "learning_rate": 1.5220393514246895e-06, "loss": 0.5035, "num_input_tokens_seen": 62381768, "step": 5555, "train_runtime": 9616.8354, "train_tokens_per_second": 6486.725 }, { "epoch": 2.6681466370672586, "grad_norm": 0.8152797818183899, "learning_rate": 1.5005317581982092e-06, "loss": 0.4839, "num_input_tokens_seen": 62436944, "step": 5560, "train_runtime": 9626.6187, "train_tokens_per_second": 6485.864 }, { "epoch": 2.6705465890682185, "grad_norm": 0.8248258233070374, "learning_rate": 1.479172505431875e-06, "loss": 0.4973, "num_input_tokens_seen": 62491352, "step": 5565, "train_runtime": 9636.4281, "train_tokens_per_second": 6484.908 }, { "epoch": 2.6729465410691784, "grad_norm": 1.0632202625274658, "learning_rate": 1.4579617279560393e-06, "loss": 0.486, "num_input_tokens_seen": 62546464, "step": 5570, "train_runtime": 9646.0848, "train_tokens_per_second": 6484.13 }, { "epoch": 2.6753464930701387, "grad_norm": 1.1524382829666138, "learning_rate": 1.4368995596637902e-06, "loss": 0.4729, "num_input_tokens_seen": 62602496, "step": 5575, "train_runtime": 9656.9657, "train_tokens_per_second": 6482.626 }, { "epoch": 2.6777464450710986, "grad_norm": 0.66849684715271, "learning_rate": 1.415986133510122e-06, "loss": 0.4894, "num_input_tokens_seen": 62664360, "step": 5580, "train_runtime": 9668.3929, "train_tokens_per_second": 6481.363 }, { "epoch": 2.6801463970720585, "grad_norm": 0.7072093486785889, "learning_rate": 1.395221581511097e-06, "loss": 0.4524, "num_input_tokens_seen": 62721848, "step": 5585, "train_runtime": 9678.2677, "train_tokens_per_second": 6480.69 }, { "epoch": 2.682546349073019, "grad_norm": 0.8476486802101135, "learning_rate": 1.3746060347430118e-06, "loss": 0.4765, "num_input_tokens_seen": 62776544, "step": 5590, "train_runtime": 9687.8596, "train_tokens_per_second": 6479.919 }, { "epoch": 2.6849463010739782, "grad_norm": 0.807366132736206, "learning_rate": 1.354139623341566e-06, "loss": 0.4656, "num_input_tokens_seen": 62834048, "step": 5595, "train_runtime": 9698.4717, "train_tokens_per_second": 6478.758 }, { "epoch": 2.6873462530749386, "grad_norm": 0.6468657851219177, "learning_rate": 1.3338224765010315e-06, "loss": 0.4573, "num_input_tokens_seen": 62894360, "step": 5600, "train_runtime": 9709.7847, "train_tokens_per_second": 6477.421 }, { "epoch": 2.6897462050758985, "grad_norm": 0.9837515354156494, "learning_rate": 1.3136547224734646e-06, "loss": 0.4944, "num_input_tokens_seen": 62952560, "step": 5605, "train_runtime": 9720.79, "train_tokens_per_second": 6476.074 }, { "epoch": 2.6921461570768583, "grad_norm": 0.7956768274307251, "learning_rate": 1.2936364885678676e-06, "loss": 0.4829, "num_input_tokens_seen": 63006360, "step": 5610, "train_runtime": 9729.8891, "train_tokens_per_second": 6475.548 }, { "epoch": 2.6945461090778187, "grad_norm": 0.7825217247009277, "learning_rate": 1.2737679011493947e-06, "loss": 0.4819, "num_input_tokens_seen": 63065920, "step": 5615, "train_runtime": 9740.3812, "train_tokens_per_second": 6474.687 }, { "epoch": 2.6969460610787785, "grad_norm": 0.8457074761390686, "learning_rate": 1.2540490856385672e-06, "loss": 0.4717, "num_input_tokens_seen": 63121320, "step": 5620, "train_runtime": 9751.1742, "train_tokens_per_second": 6473.202 }, { "epoch": 2.6993460130797384, "grad_norm": 0.8086642026901245, "learning_rate": 1.23448016651046e-06, "loss": 0.462, "num_input_tokens_seen": 63176440, "step": 5625, "train_runtime": 9760.6545, "train_tokens_per_second": 6472.562 }, { "epoch": 2.7017459650806983, "grad_norm": 0.6313350796699524, "learning_rate": 1.215061267293932e-06, "loss": 0.4332, "num_input_tokens_seen": 63242712, "step": 5630, "train_runtime": 9772.2646, "train_tokens_per_second": 6471.654 }, { "epoch": 2.704145917081658, "grad_norm": 1.2930268049240112, "learning_rate": 1.195792510570834e-06, "loss": 0.4613, "num_input_tokens_seen": 63294640, "step": 5635, "train_runtime": 9782.3592, "train_tokens_per_second": 6470.284 }, { "epoch": 2.7065458690826185, "grad_norm": 0.6524819731712341, "learning_rate": 1.1766740179752572e-06, "loss": 0.4588, "num_input_tokens_seen": 63353040, "step": 5640, "train_runtime": 9793.201, "train_tokens_per_second": 6469.084 }, { "epoch": 2.7089458210835784, "grad_norm": 0.9691641330718994, "learning_rate": 1.1577059101927385e-06, "loss": 0.5275, "num_input_tokens_seen": 63408480, "step": 5645, "train_runtime": 9803.1346, "train_tokens_per_second": 6468.184 }, { "epoch": 2.7113457730845383, "grad_norm": 0.7839572429656982, "learning_rate": 1.138888306959504e-06, "loss": 0.4728, "num_input_tokens_seen": 63465824, "step": 5650, "train_runtime": 9814.8407, "train_tokens_per_second": 6466.312 }, { "epoch": 2.713745725085498, "grad_norm": 0.9171317219734192, "learning_rate": 1.1202213270617322e-06, "loss": 0.4897, "num_input_tokens_seen": 63518744, "step": 5655, "train_runtime": 9824.5678, "train_tokens_per_second": 6465.297 }, { "epoch": 2.716145677086458, "grad_norm": 1.0188878774642944, "learning_rate": 1.101705088334795e-06, "loss": 0.4849, "num_input_tokens_seen": 63573232, "step": 5660, "train_runtime": 9833.9406, "train_tokens_per_second": 6464.675 }, { "epoch": 2.7185456290874184, "grad_norm": 0.811906099319458, "learning_rate": 1.0833397076624897e-06, "loss": 0.4778, "num_input_tokens_seen": 63626872, "step": 5665, "train_runtime": 9843.8939, "train_tokens_per_second": 6463.588 }, { "epoch": 2.7209455810883783, "grad_norm": 0.9648638367652893, "learning_rate": 1.065125300976344e-06, "loss": 0.5255, "num_input_tokens_seen": 63680184, "step": 5670, "train_runtime": 9852.7656, "train_tokens_per_second": 6463.179 }, { "epoch": 2.723345533089338, "grad_norm": 0.8658723831176758, "learning_rate": 1.0470619832548461e-06, "loss": 0.5119, "num_input_tokens_seen": 63732752, "step": 5675, "train_runtime": 9861.8742, "train_tokens_per_second": 6462.54 }, { "epoch": 2.7257454850902985, "grad_norm": 0.6413763761520386, "learning_rate": 1.0291498685227441e-06, "loss": 0.4683, "num_input_tokens_seen": 63790384, "step": 5680, "train_runtime": 9873.128, "train_tokens_per_second": 6461.011 }, { "epoch": 2.7281454370912583, "grad_norm": 0.9176835417747498, "learning_rate": 1.0113890698503076e-06, "loss": 0.4943, "num_input_tokens_seen": 63845528, "step": 5685, "train_runtime": 9883.5777, "train_tokens_per_second": 6459.759 }, { "epoch": 2.7305453890922182, "grad_norm": 0.8102623224258423, "learning_rate": 9.937796993526343e-07, "loss": 0.4989, "num_input_tokens_seen": 63898616, "step": 5690, "train_runtime": 9893.716, "train_tokens_per_second": 6458.505 }, { "epoch": 2.732945341093178, "grad_norm": 0.7839487195014954, "learning_rate": 9.763218681889203e-07, "loss": 0.4506, "num_input_tokens_seen": 63953600, "step": 5695, "train_runtime": 9903.0294, "train_tokens_per_second": 6457.983 }, { "epoch": 2.735345293094138, "grad_norm": 0.8236997723579407, "learning_rate": 9.59015686561779e-07, "loss": 0.4606, "num_input_tokens_seen": 64012184, "step": 5700, "train_runtime": 9913.4852, "train_tokens_per_second": 6457.082 }, { "epoch": 2.7377452450950983, "grad_norm": 0.7789479494094849, "learning_rate": 9.418612637165286e-07, "loss": 0.4545, "num_input_tokens_seen": 64065248, "step": 5705, "train_runtime": 9924.2434, "train_tokens_per_second": 6455.429 }, { "epoch": 2.740145197096058, "grad_norm": 0.890102744102478, "learning_rate": 9.24858707940518e-07, "loss": 0.5299, "num_input_tokens_seen": 64120216, "step": 5710, "train_runtime": 9934.9595, "train_tokens_per_second": 6453.999 }, { "epoch": 2.742545149097018, "grad_norm": 0.9005339741706848, "learning_rate": 9.08008126562418e-07, "loss": 0.4609, "num_input_tokens_seen": 64181128, "step": 5715, "train_runtime": 9946.018, "train_tokens_per_second": 6452.947 }, { "epoch": 2.744945101097978, "grad_norm": 0.9289687275886536, "learning_rate": 8.913096259515835e-07, "loss": 0.464, "num_input_tokens_seen": 64234984, "step": 5720, "train_runtime": 9954.6483, "train_tokens_per_second": 6452.763 }, { "epoch": 2.747345053098938, "grad_norm": 1.0818783044815063, "learning_rate": 8.747633115173404e-07, "loss": 0.4932, "num_input_tokens_seen": 64290040, "step": 5725, "train_runtime": 9963.0154, "train_tokens_per_second": 6452.87 }, { "epoch": 2.749745005099898, "grad_norm": 0.7084750533103943, "learning_rate": 8.583692877083465e-07, "loss": 0.4344, "num_input_tokens_seen": 64347256, "step": 5730, "train_runtime": 9971.5711, "train_tokens_per_second": 6453.071 }, { "epoch": 2.752144957100858, "grad_norm": 0.8155821561813354, "learning_rate": 8.421276580119236e-07, "loss": 0.4921, "num_input_tokens_seen": 64401448, "step": 5735, "train_runtime": 9980.1585, "train_tokens_per_second": 6452.948 }, { "epoch": 2.754544909101818, "grad_norm": 0.7858007550239563, "learning_rate": 8.260385249534042e-07, "loss": 0.4953, "num_input_tokens_seen": 64457576, "step": 5740, "train_runtime": 9988.8703, "train_tokens_per_second": 6452.94 }, { "epoch": 2.756944861102778, "grad_norm": 0.8042717576026917, "learning_rate": 8.101019900954881e-07, "loss": 0.4595, "num_input_tokens_seen": 64515152, "step": 5745, "train_runtime": 9998.2113, "train_tokens_per_second": 6452.669 }, { "epoch": 2.7593448131037377, "grad_norm": 0.61765056848526, "learning_rate": 7.943181540375988e-07, "loss": 0.4843, "num_input_tokens_seen": 64573768, "step": 5750, "train_runtime": 10006.8604, "train_tokens_per_second": 6452.95 }, { "epoch": 2.761744765104698, "grad_norm": 0.8006062507629395, "learning_rate": 7.786871164152415e-07, "loss": 0.4595, "num_input_tokens_seen": 64626520, "step": 5755, "train_runtime": 10014.3267, "train_tokens_per_second": 6453.406 }, { "epoch": 2.764144717105658, "grad_norm": 0.7694302797317505, "learning_rate": 7.632089758993932e-07, "loss": 0.4565, "num_input_tokens_seen": 64683224, "step": 5760, "train_runtime": 10022.5457, "train_tokens_per_second": 6453.772 }, { "epoch": 2.766544669106618, "grad_norm": 0.7269204258918762, "learning_rate": 7.478838301958502e-07, "loss": 0.4728, "num_input_tokens_seen": 64738056, "step": 5765, "train_runtime": 10030.9759, "train_tokens_per_second": 6453.814 }, { "epoch": 2.768944621107578, "grad_norm": 0.8213253021240234, "learning_rate": 7.327117760446478e-07, "loss": 0.4835, "num_input_tokens_seen": 64790592, "step": 5770, "train_runtime": 10039.0056, "train_tokens_per_second": 6453.885 }, { "epoch": 2.771344573108538, "grad_norm": 0.6208813190460205, "learning_rate": 7.17692909219414e-07, "loss": 0.4922, "num_input_tokens_seen": 64844640, "step": 5775, "train_runtime": 10047.9962, "train_tokens_per_second": 6453.49 }, { "epoch": 2.773744525109498, "grad_norm": 0.7945714592933655, "learning_rate": 7.028273245267947e-07, "loss": 0.4473, "num_input_tokens_seen": 64903320, "step": 5780, "train_runtime": 10056.9037, "train_tokens_per_second": 6453.609 }, { "epoch": 2.7761444771104578, "grad_norm": 0.6964590549468994, "learning_rate": 6.881151158058263e-07, "loss": 0.5196, "num_input_tokens_seen": 64963432, "step": 5785, "train_runtime": 10066.3751, "train_tokens_per_second": 6453.508 }, { "epoch": 2.7785444291114176, "grad_norm": 0.7940050959587097, "learning_rate": 6.735563759273783e-07, "loss": 0.4862, "num_input_tokens_seen": 65020920, "step": 5790, "train_runtime": 10074.8972, "train_tokens_per_second": 6453.755 }, { "epoch": 2.780944381112378, "grad_norm": 0.7207697033882141, "learning_rate": 6.591511967935282e-07, "loss": 0.4557, "num_input_tokens_seen": 65077720, "step": 5795, "train_runtime": 10083.2022, "train_tokens_per_second": 6454.073 }, { "epoch": 2.783344333113338, "grad_norm": 0.9495781064033508, "learning_rate": 6.448996693370179e-07, "loss": 0.4682, "num_input_tokens_seen": 65133616, "step": 5800, "train_runtime": 10092.261, "train_tokens_per_second": 6453.818 }, { "epoch": 2.7857442851142977, "grad_norm": 0.8136801719665527, "learning_rate": 6.308018835206541e-07, "loss": 0.4646, "num_input_tokens_seen": 65187840, "step": 5805, "train_runtime": 10100.5435, "train_tokens_per_second": 6453.894 }, { "epoch": 2.7881442371152576, "grad_norm": 0.6333021521568298, "learning_rate": 6.168579283367476e-07, "loss": 0.472, "num_input_tokens_seen": 65240368, "step": 5810, "train_runtime": 10108.7592, "train_tokens_per_second": 6453.845 }, { "epoch": 2.7905441891162175, "grad_norm": 1.0317847728729248, "learning_rate": 6.030678918065552e-07, "loss": 0.4831, "num_input_tokens_seen": 65295184, "step": 5815, "train_runtime": 10117.6223, "train_tokens_per_second": 6453.61 }, { "epoch": 2.792944141117178, "grad_norm": 1.2926782369613647, "learning_rate": 5.894318609797222e-07, "loss": 0.4951, "num_input_tokens_seen": 65351248, "step": 5820, "train_runtime": 10125.5866, "train_tokens_per_second": 6454.07 }, { "epoch": 2.7953440931181377, "grad_norm": 0.8632203936576843, "learning_rate": 5.759499219337328e-07, "loss": 0.4852, "num_input_tokens_seen": 65405976, "step": 5825, "train_runtime": 10133.4185, "train_tokens_per_second": 6454.483 }, { "epoch": 2.7977440451190976, "grad_norm": 0.8666356801986694, "learning_rate": 5.626221597733655e-07, "loss": 0.4505, "num_input_tokens_seen": 65466136, "step": 5830, "train_runtime": 10141.883, "train_tokens_per_second": 6455.028 }, { "epoch": 2.8001439971200575, "grad_norm": 0.894623875617981, "learning_rate": 5.494486586301528e-07, "loss": 0.5448, "num_input_tokens_seen": 65518496, "step": 5835, "train_runtime": 10149.8014, "train_tokens_per_second": 6455.151 }, { "epoch": 2.8025439491210173, "grad_norm": 0.8759870529174805, "learning_rate": 5.364295016618643e-07, "loss": 0.4865, "num_input_tokens_seen": 65577616, "step": 5840, "train_runtime": 10157.9244, "train_tokens_per_second": 6455.809 }, { "epoch": 2.8049439011219777, "grad_norm": 0.7551533579826355, "learning_rate": 5.235647710519626e-07, "loss": 0.4664, "num_input_tokens_seen": 65634592, "step": 5845, "train_runtime": 10166.5957, "train_tokens_per_second": 6455.907 }, { "epoch": 2.8073438531229375, "grad_norm": 0.7756850719451904, "learning_rate": 5.108545480090931e-07, "loss": 0.4649, "num_input_tokens_seen": 65691480, "step": 5850, "train_runtime": 10174.9677, "train_tokens_per_second": 6456.186 }, { "epoch": 2.8097438051238974, "grad_norm": 0.6903165578842163, "learning_rate": 4.982989127665816e-07, "loss": 0.4969, "num_input_tokens_seen": 65745568, "step": 5855, "train_runtime": 10183.3283, "train_tokens_per_second": 6456.196 }, { "epoch": 2.8121437571248578, "grad_norm": 0.7350341081619263, "learning_rate": 4.858979445819089e-07, "loss": 0.4742, "num_input_tokens_seen": 65799784, "step": 5860, "train_runtime": 10190.9666, "train_tokens_per_second": 6456.677 }, { "epoch": 2.8145437091258176, "grad_norm": 0.7910242676734924, "learning_rate": 4.7365172173621796e-07, "loss": 0.4561, "num_input_tokens_seen": 65856528, "step": 5865, "train_runtime": 10199.5186, "train_tokens_per_second": 6456.827 }, { "epoch": 2.8169436611267775, "grad_norm": 0.8002808094024658, "learning_rate": 4.615603215338299e-07, "loss": 0.4425, "num_input_tokens_seen": 65911144, "step": 5870, "train_runtime": 10208.0985, "train_tokens_per_second": 6456.75 }, { "epoch": 2.8193436131277374, "grad_norm": 0.6876586079597473, "learning_rate": 4.496238203017422e-07, "loss": 0.4873, "num_input_tokens_seen": 65971080, "step": 5875, "train_runtime": 10216.3273, "train_tokens_per_second": 6457.416 }, { "epoch": 2.8217435651286973, "grad_norm": 0.65282142162323, "learning_rate": 4.3784229338915406e-07, "loss": 0.4867, "num_input_tokens_seen": 66026344, "step": 5880, "train_runtime": 10224.7475, "train_tokens_per_second": 6457.504 }, { "epoch": 2.8241435171296576, "grad_norm": 0.6614166498184204, "learning_rate": 4.262158151669804e-07, "loss": 0.4813, "num_input_tokens_seen": 66082360, "step": 5885, "train_runtime": 10233.2091, "train_tokens_per_second": 6457.638 }, { "epoch": 2.8265434691306175, "grad_norm": 0.7193440794944763, "learning_rate": 4.147444590274052e-07, "loss": 0.4968, "num_input_tokens_seen": 66134928, "step": 5890, "train_runtime": 10241.3234, "train_tokens_per_second": 6457.654 }, { "epoch": 2.8289434211315774, "grad_norm": 0.7374788522720337, "learning_rate": 4.0342829738339583e-07, "loss": 0.4744, "num_input_tokens_seen": 66190032, "step": 5895, "train_runtime": 10249.2265, "train_tokens_per_second": 6458.051 }, { "epoch": 2.8313433731325373, "grad_norm": 0.9320788979530334, "learning_rate": 3.922674016682504e-07, "loss": 0.4819, "num_input_tokens_seen": 66244312, "step": 5900, "train_runtime": 10256.9977, "train_tokens_per_second": 6458.45 }, { "epoch": 2.833743325133497, "grad_norm": 0.526983916759491, "learning_rate": 3.812618423351622e-07, "loss": 0.4424, "num_input_tokens_seen": 66305552, "step": 5905, "train_runtime": 10265.6243, "train_tokens_per_second": 6458.989 }, { "epoch": 2.8361432771344575, "grad_norm": 0.9565876722335815, "learning_rate": 3.704116888567505e-07, "loss": 0.4926, "num_input_tokens_seen": 66358648, "step": 5910, "train_runtime": 10273.7771, "train_tokens_per_second": 6459.031 }, { "epoch": 2.8385432291354173, "grad_norm": 0.9867433905601501, "learning_rate": 3.597170097246416e-07, "loss": 0.4706, "num_input_tokens_seen": 66417384, "step": 5915, "train_runtime": 10283.2277, "train_tokens_per_second": 6458.807 }, { "epoch": 2.8409431811363772, "grad_norm": 0.6663256883621216, "learning_rate": 3.4917787244902743e-07, "loss": 0.4945, "num_input_tokens_seen": 66477648, "step": 5920, "train_runtime": 10293.4798, "train_tokens_per_second": 6458.229 }, { "epoch": 2.843343133137337, "grad_norm": 0.621631920337677, "learning_rate": 3.387943435582436e-07, "loss": 0.495, "num_input_tokens_seen": 66532464, "step": 5925, "train_runtime": 10302.8802, "train_tokens_per_second": 6457.657 }, { "epoch": 2.845743085138297, "grad_norm": 0.638155460357666, "learning_rate": 3.285664885983447e-07, "loss": 0.4263, "num_input_tokens_seen": 66589296, "step": 5930, "train_runtime": 10312.6945, "train_tokens_per_second": 6457.022 }, { "epoch": 2.8481430371392573, "grad_norm": 0.7790648341178894, "learning_rate": 3.184943721326938e-07, "loss": 0.4473, "num_input_tokens_seen": 66648144, "step": 5935, "train_runtime": 10322.4204, "train_tokens_per_second": 6456.639 }, { "epoch": 2.850542989140217, "grad_norm": 0.9435281753540039, "learning_rate": 3.0857805774155423e-07, "loss": 0.4773, "num_input_tokens_seen": 66702560, "step": 5940, "train_runtime": 10331.5732, "train_tokens_per_second": 6456.186 }, { "epoch": 2.852942941141177, "grad_norm": 0.7527910470962524, "learning_rate": 2.988176080216898e-07, "loss": 0.5113, "num_input_tokens_seen": 66757360, "step": 5945, "train_runtime": 10341.338, "train_tokens_per_second": 6455.389 }, { "epoch": 2.8553428931421374, "grad_norm": 0.949381411075592, "learning_rate": 2.892130845859653e-07, "loss": 0.5225, "num_input_tokens_seen": 66813080, "step": 5950, "train_runtime": 10351.5482, "train_tokens_per_second": 6454.405 }, { "epoch": 2.8577428451430973, "grad_norm": 0.682515561580658, "learning_rate": 2.7976454806296906e-07, "loss": 0.4474, "num_input_tokens_seen": 66870744, "step": 5955, "train_runtime": 10361.7884, "train_tokens_per_second": 6453.591 }, { "epoch": 2.860142797144057, "grad_norm": 0.8949669599533081, "learning_rate": 2.7047205809660746e-07, "loss": 0.4552, "num_input_tokens_seen": 66926176, "step": 5960, "train_runtime": 10372.0384, "train_tokens_per_second": 6452.558 }, { "epoch": 2.862542749145017, "grad_norm": 0.672732949256897, "learning_rate": 2.6133567334575e-07, "loss": 0.461, "num_input_tokens_seen": 66982736, "step": 5965, "train_runtime": 10381.6755, "train_tokens_per_second": 6452.016 }, { "epoch": 2.864942701145977, "grad_norm": 0.7349382638931274, "learning_rate": 2.523554514838544e-07, "loss": 0.4649, "num_input_tokens_seen": 67040256, "step": 5970, "train_runtime": 10391.7883, "train_tokens_per_second": 6451.272 }, { "epoch": 2.8673426531469373, "grad_norm": 0.7584925293922424, "learning_rate": 2.435314491985974e-07, "loss": 0.5227, "num_input_tokens_seen": 67098776, "step": 5975, "train_runtime": 10401.6032, "train_tokens_per_second": 6450.811 }, { "epoch": 2.869742605147897, "grad_norm": 0.8414415717124939, "learning_rate": 2.3486372219151675e-07, "loss": 0.4989, "num_input_tokens_seen": 67151768, "step": 5980, "train_runtime": 10411.5952, "train_tokens_per_second": 6449.71 }, { "epoch": 2.872142557148857, "grad_norm": 0.6477630734443665, "learning_rate": 2.263523251776617e-07, "loss": 0.4962, "num_input_tokens_seen": 67210600, "step": 5985, "train_runtime": 10422.1011, "train_tokens_per_second": 6448.853 }, { "epoch": 2.874542509149817, "grad_norm": 1.1014198064804077, "learning_rate": 2.1799731188525407e-07, "loss": 0.5162, "num_input_tokens_seen": 67263744, "step": 5990, "train_runtime": 10431.8385, "train_tokens_per_second": 6447.928 }, { "epoch": 2.876942461150777, "grad_norm": 0.9391694664955139, "learning_rate": 2.0979873505533876e-07, "loss": 0.449, "num_input_tokens_seen": 67316560, "step": 5995, "train_runtime": 10441.9194, "train_tokens_per_second": 6446.761 }, { "epoch": 2.879342413151737, "grad_norm": 0.8007956147193909, "learning_rate": 2.0175664644145053e-07, "loss": 0.4849, "num_input_tokens_seen": 67373408, "step": 6000, "train_runtime": 10452.6728, "train_tokens_per_second": 6445.568 }, { "epoch": 2.881742365152697, "grad_norm": 0.7711721658706665, "learning_rate": 1.9387109680930327e-07, "loss": 0.4332, "num_input_tokens_seen": 67428800, "step": 6005, "train_runtime": 10463.242, "train_tokens_per_second": 6444.351 }, { "epoch": 2.884142317153657, "grad_norm": 0.8150792121887207, "learning_rate": 1.8614213593644846e-07, "loss": 0.4459, "num_input_tokens_seen": 67490440, "step": 6010, "train_runtime": 10473.7424, "train_tokens_per_second": 6443.775 }, { "epoch": 2.8865422691546168, "grad_norm": 0.7124377489089966, "learning_rate": 1.7856981261197002e-07, "loss": 0.4779, "num_input_tokens_seen": 67545608, "step": 6015, "train_runtime": 10483.2085, "train_tokens_per_second": 6443.219 }, { "epoch": 2.8889422211555766, "grad_norm": 0.8673171997070312, "learning_rate": 1.7115417463618722e-07, "loss": 0.4598, "num_input_tokens_seen": 67595400, "step": 6020, "train_runtime": 10492.2481, "train_tokens_per_second": 6442.413 }, { "epoch": 2.891342173156537, "grad_norm": 0.7837307453155518, "learning_rate": 1.638952688203327e-07, "loss": 0.4797, "num_input_tokens_seen": 67646720, "step": 6025, "train_runtime": 10501.2034, "train_tokens_per_second": 6441.806 }, { "epoch": 2.893742125157497, "grad_norm": 0.6940703392028809, "learning_rate": 1.567931409862694e-07, "loss": 0.4915, "num_input_tokens_seen": 67700752, "step": 6030, "train_runtime": 10511.0778, "train_tokens_per_second": 6440.895 }, { "epoch": 2.8961420771584567, "grad_norm": 0.8700549602508545, "learning_rate": 1.4984783596619922e-07, "loss": 0.4946, "num_input_tokens_seen": 67755144, "step": 6035, "train_runtime": 10520.7321, "train_tokens_per_second": 6440.155 }, { "epoch": 2.898542029159417, "grad_norm": 0.7011561989784241, "learning_rate": 1.430593976023825e-07, "loss": 0.4919, "num_input_tokens_seen": 67814680, "step": 6040, "train_runtime": 10531.5769, "train_tokens_per_second": 6439.176 }, { "epoch": 2.900941981160377, "grad_norm": 0.893417477607727, "learning_rate": 1.3642786874685233e-07, "loss": 0.5055, "num_input_tokens_seen": 67867648, "step": 6045, "train_runtime": 10541.6146, "train_tokens_per_second": 6438.07 }, { "epoch": 2.903341933161337, "grad_norm": 0.7926166653633118, "learning_rate": 1.299532912611534e-07, "loss": 0.459, "num_input_tokens_seen": 67922728, "step": 6050, "train_runtime": 10550.8628, "train_tokens_per_second": 6437.647 }, { "epoch": 2.9057418851622967, "grad_norm": 0.7883651852607727, "learning_rate": 1.2363570601608143e-07, "loss": 0.4636, "num_input_tokens_seen": 67975200, "step": 6055, "train_runtime": 10560.1447, "train_tokens_per_second": 6436.957 }, { "epoch": 2.9081418371632566, "grad_norm": 0.9356446266174316, "learning_rate": 1.1747515289140254e-07, "loss": 0.4612, "num_input_tokens_seen": 68029864, "step": 6060, "train_runtime": 10570.9284, "train_tokens_per_second": 6435.562 }, { "epoch": 2.910541789164217, "grad_norm": 1.2164058685302734, "learning_rate": 1.1147167077562859e-07, "loss": 0.5042, "num_input_tokens_seen": 68079824, "step": 6065, "train_runtime": 10580.6679, "train_tokens_per_second": 6434.36 }, { "epoch": 2.912941741165177, "grad_norm": 0.9457964301109314, "learning_rate": 1.0562529756576179e-07, "loss": 0.4287, "num_input_tokens_seen": 68136632, "step": 6070, "train_runtime": 10591.0019, "train_tokens_per_second": 6433.445 }, { "epoch": 2.9153416931661367, "grad_norm": 0.7782816290855408, "learning_rate": 9.993607016704209e-08, "loss": 0.4994, "num_input_tokens_seen": 68192816, "step": 6075, "train_runtime": 10601.2725, "train_tokens_per_second": 6432.512 }, { "epoch": 2.9177416451670966, "grad_norm": 0.7655016183853149, "learning_rate": 9.440402449274188e-08, "loss": 0.5164, "num_input_tokens_seen": 68244208, "step": 6080, "train_runtime": 10610.674, "train_tokens_per_second": 6431.656 }, { "epoch": 2.9201415971680564, "grad_norm": 0.8917096257209778, "learning_rate": 8.902919546390776e-08, "loss": 0.4609, "num_input_tokens_seen": 68300352, "step": 6085, "train_runtime": 10620.9066, "train_tokens_per_second": 6430.746 }, { "epoch": 2.9225415491690168, "grad_norm": 0.940250039100647, "learning_rate": 8.381161700916906e-08, "loss": 0.5296, "num_input_tokens_seen": 68350392, "step": 6090, "train_runtime": 10630.1557, "train_tokens_per_second": 6429.858 }, { "epoch": 2.9249415011699766, "grad_norm": 0.8829488158226013, "learning_rate": 7.87513220644992e-08, "loss": 0.5012, "num_input_tokens_seen": 68405152, "step": 6095, "train_runtime": 10639.9288, "train_tokens_per_second": 6429.099 }, { "epoch": 2.9273414531709365, "grad_norm": 0.9745586514472961, "learning_rate": 7.384834257302687e-08, "loss": 0.5022, "num_input_tokens_seen": 68461336, "step": 6100, "train_runtime": 10650.3632, "train_tokens_per_second": 6428.075 }, { "epoch": 2.9297414051718964, "grad_norm": 0.9082819819450378, "learning_rate": 6.910270948482789e-08, "loss": 0.477, "num_input_tokens_seen": 68512936, "step": 6105, "train_runtime": 10660.0699, "train_tokens_per_second": 6427.063 }, { "epoch": 2.9321413571728563, "grad_norm": 0.831038773059845, "learning_rate": 6.451445275671986e-08, "loss": 0.4894, "num_input_tokens_seen": 68569728, "step": 6110, "train_runtime": 10670.2152, "train_tokens_per_second": 6426.274 }, { "epoch": 2.9345413091738166, "grad_norm": 0.7757657170295715, "learning_rate": 6.008360135208724e-08, "loss": 0.4685, "num_input_tokens_seen": 68623976, "step": 6115, "train_runtime": 10680.1954, "train_tokens_per_second": 6425.348 }, { "epoch": 2.9369412611747765, "grad_norm": 0.8630353212356567, "learning_rate": 5.581018324069543e-08, "loss": 0.4904, "num_input_tokens_seen": 68679096, "step": 6120, "train_runtime": 10691.1399, "train_tokens_per_second": 6423.926 }, { "epoch": 2.9393412131757364, "grad_norm": 0.881776750087738, "learning_rate": 5.169422539850477e-08, "loss": 0.4671, "num_input_tokens_seen": 68734576, "step": 6125, "train_runtime": 10700.7437, "train_tokens_per_second": 6423.346 }, { "epoch": 2.9417411651766967, "grad_norm": 0.8964380025863647, "learning_rate": 4.773575380750961e-08, "loss": 0.469, "num_input_tokens_seen": 68793128, "step": 6130, "train_runtime": 10711.0036, "train_tokens_per_second": 6422.659 }, { "epoch": 2.9441411171776566, "grad_norm": 0.8133379220962524, "learning_rate": 4.393479345557727e-08, "loss": 0.5031, "num_input_tokens_seen": 68847592, "step": 6135, "train_runtime": 10721.1224, "train_tokens_per_second": 6421.678 }, { "epoch": 2.9465410691786165, "grad_norm": 0.6794693470001221, "learning_rate": 4.0291368336276e-08, "loss": 0.4709, "num_input_tokens_seen": 68905096, "step": 6140, "train_runtime": 10731.8838, "train_tokens_per_second": 6420.597 }, { "epoch": 2.9489410211795763, "grad_norm": 0.8234326839447021, "learning_rate": 3.6805501448744505e-08, "loss": 0.4638, "num_input_tokens_seen": 68960224, "step": 6145, "train_runtime": 10741.5942, "train_tokens_per_second": 6419.924 }, { "epoch": 2.9513409731805362, "grad_norm": 0.8420405387878418, "learning_rate": 3.347721479751986e-08, "loss": 0.5143, "num_input_tokens_seen": 69014200, "step": 6150, "train_runtime": 10751.3552, "train_tokens_per_second": 6419.116 }, { "epoch": 2.9537409251814966, "grad_norm": 0.876466691493988, "learning_rate": 3.0306529392426507e-08, "loss": 0.4258, "num_input_tokens_seen": 69071584, "step": 6155, "train_runtime": 10761.6029, "train_tokens_per_second": 6418.336 }, { "epoch": 2.9561408771824564, "grad_norm": 0.8103510737419128, "learning_rate": 2.72934652484208e-08, "loss": 0.4785, "num_input_tokens_seen": 69125824, "step": 6160, "train_runtime": 10771.3537, "train_tokens_per_second": 6417.561 }, { "epoch": 2.9585408291834163, "grad_norm": 0.9023430347442627, "learning_rate": 2.4438041385480003e-08, "loss": 0.5019, "num_input_tokens_seen": 69183992, "step": 6165, "train_runtime": 10782.2651, "train_tokens_per_second": 6416.462 }, { "epoch": 2.960940781184376, "grad_norm": 0.9007648825645447, "learning_rate": 2.174027582848015e-08, "loss": 0.4764, "num_input_tokens_seen": 69243264, "step": 6170, "train_runtime": 10792.8565, "train_tokens_per_second": 6415.657 }, { "epoch": 2.963340733185336, "grad_norm": 0.9024353623390198, "learning_rate": 1.92001856070656e-08, "loss": 0.499, "num_input_tokens_seen": 69299200, "step": 6175, "train_runtime": 10803.5555, "train_tokens_per_second": 6414.481 }, { "epoch": 2.9657406851862964, "grad_norm": 0.7554855942726135, "learning_rate": 1.6817786755568553e-08, "loss": 0.4397, "num_input_tokens_seen": 69352824, "step": 6180, "train_runtime": 10812.9366, "train_tokens_per_second": 6413.875 }, { "epoch": 2.9681406371872563, "grad_norm": 0.7788093686103821, "learning_rate": 1.4593094312889688e-08, "loss": 0.452, "num_input_tokens_seen": 69415024, "step": 6185, "train_runtime": 10823.0536, "train_tokens_per_second": 6413.627 }, { "epoch": 2.970540589188216, "grad_norm": 0.7968340516090393, "learning_rate": 1.2526122322401024e-08, "loss": 0.4915, "num_input_tokens_seen": 69471512, "step": 6190, "train_runtime": 10832.747, "train_tokens_per_second": 6413.102 }, { "epoch": 2.972940541189176, "grad_norm": 0.7601198554039001, "learning_rate": 1.0616883831873758e-08, "loss": 0.4443, "num_input_tokens_seen": 69527768, "step": 6195, "train_runtime": 10842.6627, "train_tokens_per_second": 6412.426 }, { "epoch": 2.975340493190136, "grad_norm": 0.8078719973564148, "learning_rate": 8.86539089338112e-09, "loss": 0.4387, "num_input_tokens_seen": 69583024, "step": 6200, "train_runtime": 10852.4744, "train_tokens_per_second": 6411.72 }, { "epoch": 2.9777404451910963, "grad_norm": 1.0166022777557373, "learning_rate": 7.271654563223429e-09, "loss": 0.4519, "num_input_tokens_seen": 69639080, "step": 6205, "train_runtime": 10863.2159, "train_tokens_per_second": 6410.54 }, { "epoch": 2.980140397192056, "grad_norm": 1.051282286643982, "learning_rate": 5.835684901869809e-09, "loss": 0.5355, "num_input_tokens_seen": 69695440, "step": 6210, "train_runtime": 10873.6609, "train_tokens_per_second": 6409.565 }, { "epoch": 2.982540349193016, "grad_norm": 0.9155645966529846, "learning_rate": 4.5574909738804735e-09, "loss": 0.4775, "num_input_tokens_seen": 69752488, "step": 6215, "train_runtime": 10884.1415, "train_tokens_per_second": 6408.635 }, { "epoch": 2.9849403011939764, "grad_norm": 0.8648121356964111, "learning_rate": 3.4370808478595417e-09, "loss": 0.4861, "num_input_tokens_seen": 69804712, "step": 6220, "train_runtime": 10894.0291, "train_tokens_per_second": 6407.612 }, { "epoch": 2.9873402531949362, "grad_norm": 0.9490159153938293, "learning_rate": 2.474461596396749e-09, "loss": 0.4641, "num_input_tokens_seen": 69863384, "step": 6225, "train_runtime": 10903.095, "train_tokens_per_second": 6407.665 }, { "epoch": 2.989740205195896, "grad_norm": 0.823014497756958, "learning_rate": 1.6696392960341423e-09, "loss": 0.4785, "num_input_tokens_seen": 69920712, "step": 6230, "train_runtime": 10911.9258, "train_tokens_per_second": 6407.733 }, { "epoch": 2.992140157196856, "grad_norm": 0.9870671629905701, "learning_rate": 1.022619027207794e-09, "loss": 0.4529, "num_input_tokens_seen": 69978976, "step": 6235, "train_runtime": 10920.4005, "train_tokens_per_second": 6408.096 }, { "epoch": 2.994540109197816, "grad_norm": 0.8132453560829163, "learning_rate": 5.334048742394737e-10, "loss": 0.4621, "num_input_tokens_seen": 70037816, "step": 6240, "train_runtime": 10929.3119, "train_tokens_per_second": 6408.255 }, { "epoch": 2.996940061198776, "grad_norm": 0.9090087413787842, "learning_rate": 2.0199992529501554e-10, "loss": 0.4757, "num_input_tokens_seen": 70098000, "step": 6245, "train_runtime": 10938.0514, "train_tokens_per_second": 6408.637 }, { "epoch": 2.999340013199736, "grad_norm": 0.8769118189811707, "learning_rate": 2.8406272370440357e-11, "loss": 0.463, "num_input_tokens_seen": 70153968, "step": 6250, "train_runtime": 10946.6798, "train_tokens_per_second": 6408.698 }, { "epoch": 3.0, "num_input_tokens_seen": 70167528, "step": 6252, "total_flos": 3.161046812140241e+18, "train_loss": 0.5038315440246255, "train_runtime": 10949.1572, "train_samples_per_second": 27.399, "train_steps_per_second": 0.571 } ], "logging_steps": 5, "max_steps": 6252, "num_input_tokens_seen": 70167528, "num_train_epochs": 3, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.161046812140241e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }