{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7839559871158865, "eval_steps": 500, "global_step": 50016, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005015713289289101, "grad_norm": 2.6373515129089355, "learning_rate": 1.875e-05, "loss": 39.2137, "step": 32, "throughput": 3031.484591979932 }, { "epoch": 0.0010031426578578201, "grad_norm": 1.6154510974884033, "learning_rate": 3.75e-05, "loss": 30.9048, "step": 64, "throughput": 4533.5848217950215 }, { "epoch": 0.0015047139867867302, "grad_norm": 2.1769161224365234, "learning_rate": 5.625e-05, "loss": 27.3343, "step": 96, "throughput": 5442.436560864993 }, { "epoch": 0.0020062853157156403, "grad_norm": 1.8507866859436035, "learning_rate": 7.5e-05, "loss": 24.8344, "step": 128, "throughput": 6048.333849188605 }, { "epoch": 0.0025078566446445506, "grad_norm": 1.6263043880462646, "learning_rate": 9.374999999999999e-05, "loss": 23.0449, "step": 160, "throughput": 6383.579958146075 }, { "epoch": 0.0030094279735734604, "grad_norm": 1.5903853178024292, "learning_rate": 0.0001125, "loss": 21.6192, "step": 192, "throughput": 6716.015359285796 }, { "epoch": 0.0035109993025023707, "grad_norm": 1.3139172792434692, "learning_rate": 0.00013125, "loss": 20.2952, "step": 224, "throughput": 6973.364827539361 }, { "epoch": 0.0040125706314312806, "grad_norm": 1.414817214012146, "learning_rate": 0.00015, "loss": 19.0528, "step": 256, "throughput": 7182.123779353329 }, { "epoch": 0.004514141960360191, "grad_norm": 0.9963351488113403, "learning_rate": 0.00016874999999999998, "loss": 17.8896, "step": 288, "throughput": 7289.056846050263 }, { "epoch": 0.005015713289289101, "grad_norm": 0.9738964438438416, "learning_rate": 0.00018749999999999998, "loss": 16.8125, "step": 320, "throughput": 7435.817597231107 }, { "epoch": 0.005517284618218011, "grad_norm": 0.8014869689941406, "learning_rate": 0.00020624999999999997, "loss": 15.9875, "step": 352, "throughput": 7558.026515468645 }, { "epoch": 0.006018855947146921, "grad_norm": 0.6334019899368286, "learning_rate": 0.000225, "loss": 15.2491, "step": 384, "throughput": 7665.0553985229435 }, { "epoch": 0.006520427276075831, "grad_norm": 0.691703736782074, "learning_rate": 0.00024375, "loss": 14.7027, "step": 416, "throughput": 7706.13283153309 }, { "epoch": 0.007021998605004741, "grad_norm": 0.6111452579498291, "learning_rate": 0.0002625, "loss": 14.2629, "step": 448, "throughput": 7788.969234107833 }, { "epoch": 0.007523569933933652, "grad_norm": 0.453713983297348, "learning_rate": 0.00028125, "loss": 13.8273, "step": 480, "throughput": 7861.532758662085 }, { "epoch": 0.008025141262862561, "grad_norm": 0.6347829103469849, "learning_rate": 0.0003, "loss": 13.5631, "step": 512, "throughput": 7927.929938236498 }, { "epoch": 0.008526712591791472, "grad_norm": 0.36244717240333557, "learning_rate": 0.00029999972162979993, "loss": 13.2913, "step": 544, "throughput": 7950.168975118933 }, { "epoch": 0.009028283920720382, "grad_norm": 0.3927552402019501, "learning_rate": 0.00029999888652034774, "loss": 13.0637, "step": 576, "throughput": 8002.683417615384 }, { "epoch": 0.009529855249649291, "grad_norm": 0.34958720207214355, "learning_rate": 0.00029999749467508744, "loss": 12.8461, "step": 608, "throughput": 8051.561816817223 }, { "epoch": 0.010031426578578202, "grad_norm": 0.39206886291503906, "learning_rate": 0.0002999955460997589, "loss": 12.6701, "step": 640, "throughput": 8097.459941385641 }, { "epoch": 0.010532997907507112, "grad_norm": 0.32412078976631165, "learning_rate": 0.0002999930408023982, "loss": 12.5112, "step": 672, "throughput": 8110.693951807436 }, { "epoch": 0.011034569236436023, "grad_norm": 0.28792303800582886, "learning_rate": 0.00029998997879333714, "loss": 12.3516, "step": 704, "throughput": 8146.659534494293 }, { "epoch": 0.011536140565364932, "grad_norm": 0.27620336413383484, "learning_rate": 0.0002999863600852034, "loss": 12.2134, "step": 736, "throughput": 8181.849297422524 }, { "epoch": 0.012037711894293842, "grad_norm": 0.298735648393631, "learning_rate": 0.0002999821846929206, "loss": 12.1216, "step": 768, "throughput": 8215.768342687106 }, { "epoch": 0.012539283223222753, "grad_norm": 0.2960178256034851, "learning_rate": 0.000299977452633708, "loss": 12.0131, "step": 800, "throughput": 8224.10883755775 }, { "epoch": 0.013040854552151662, "grad_norm": 0.2541326582431793, "learning_rate": 0.00029997216392708075, "loss": 11.9168, "step": 832, "throughput": 8250.757327240604 }, { "epoch": 0.013542425881080573, "grad_norm": 0.31256556510925293, "learning_rate": 0.00029996631859484943, "loss": 11.8253, "step": 864, "throughput": 8277.59484237759 }, { "epoch": 0.014043997210009483, "grad_norm": 0.2150149643421173, "learning_rate": 0.00029995991666112014, "loss": 11.7395, "step": 896, "throughput": 8303.820467973554 }, { "epoch": 0.014545568538938392, "grad_norm": 0.20100632309913635, "learning_rate": 0.0002999529581522946, "loss": 11.6624, "step": 928, "throughput": 8307.53804985821 }, { "epoch": 0.015047139867867303, "grad_norm": 0.23360204696655273, "learning_rate": 0.0002999454430970696, "loss": 11.6254, "step": 960, "throughput": 8328.20061532659 }, { "epoch": 0.015548711196796213, "grad_norm": 0.19596298038959503, "learning_rate": 0.0002999373715264373, "loss": 11.5469, "step": 992, "throughput": 8349.693540163496 }, { "epoch": 0.016050282525725122, "grad_norm": 0.23281055688858032, "learning_rate": 0.0002999287434736849, "loss": 11.4881, "step": 1024, "throughput": 8370.74705624476 }, { "epoch": 0.016551853854654033, "grad_norm": 0.178004190325737, "learning_rate": 0.0002999195589743945, "loss": 11.4228, "step": 1056, "throughput": 8371.927394946062 }, { "epoch": 0.017053425183582945, "grad_norm": 0.17390933632850647, "learning_rate": 0.000299909818066443, "loss": 11.3634, "step": 1088, "throughput": 8388.5787076317 }, { "epoch": 0.017554996512511852, "grad_norm": 0.20782887935638428, "learning_rate": 0.00029989952079000195, "loss": 11.3356, "step": 1120, "throughput": 8406.042588855575 }, { "epoch": 0.018056567841440763, "grad_norm": 0.17906507849693298, "learning_rate": 0.0002998886671875373, "loss": 11.268, "step": 1152, "throughput": 8423.431878637526 }, { "epoch": 0.018558139170369675, "grad_norm": 0.2125791758298874, "learning_rate": 0.0002998772573038094, "loss": 11.2191, "step": 1184, "throughput": 8423.562959023457 }, { "epoch": 0.019059710499298582, "grad_norm": 0.1875011920928955, "learning_rate": 0.0002998652911858726, "loss": 11.166, "step": 1216, "throughput": 8437.670481903486 }, { "epoch": 0.019561281828227493, "grad_norm": 0.18936695158481598, "learning_rate": 0.00029985276888307524, "loss": 11.1251, "step": 1248, "throughput": 8452.193237807254 }, { "epoch": 0.020062853157156404, "grad_norm": 0.177974134683609, "learning_rate": 0.00029983969044705927, "loss": 11.1019, "step": 1280, "throughput": 8466.85665649742 }, { "epoch": 0.020564424486085316, "grad_norm": 0.16055719554424286, "learning_rate": 0.0002998260559317603, "loss": 11.0582, "step": 1312, "throughput": 8466.517167981028 }, { "epoch": 0.021065995815014223, "grad_norm": 0.16099485754966736, "learning_rate": 0.00029981186539340703, "loss": 11.0095, "step": 1344, "throughput": 8478.566566937574 }, { "epoch": 0.021567567143943134, "grad_norm": 0.1568784862756729, "learning_rate": 0.0002997971188905213, "loss": 10.9878, "step": 1376, "throughput": 8491.080428955667 }, { "epoch": 0.022069138472872046, "grad_norm": 0.15591877698898315, "learning_rate": 0.0002997818164839178, "loss": 10.95, "step": 1408, "throughput": 8503.772964081749 }, { "epoch": 0.022570709801800953, "grad_norm": 0.1584455668926239, "learning_rate": 0.00029976595823670354, "loss": 10.9177, "step": 1440, "throughput": 8502.49272473002 }, { "epoch": 0.023072281130729864, "grad_norm": 0.1538284718990326, "learning_rate": 0.0002997495442142781, "loss": 10.9034, "step": 1472, "throughput": 8512.850418983315 }, { "epoch": 0.023573852459658776, "grad_norm": 0.16589871048927307, "learning_rate": 0.000299732574484333, "loss": 10.862, "step": 1504, "throughput": 8523.613773945295 }, { "epoch": 0.024075423788587683, "grad_norm": 0.1705697625875473, "learning_rate": 0.0002997150491168514, "loss": 10.8344, "step": 1536, "throughput": 8534.577834014373 }, { "epoch": 0.024576995117516594, "grad_norm": 0.15539616346359253, "learning_rate": 0.0002996969681841079, "loss": 10.7975, "step": 1568, "throughput": 8532.849110741738 }, { "epoch": 0.025078566446445506, "grad_norm": 0.13422518968582153, "learning_rate": 0.0002996783317606684, "loss": 10.7751, "step": 1600, "throughput": 8540.6999977051 }, { "epoch": 0.025580137775374417, "grad_norm": 0.14098893105983734, "learning_rate": 0.0002996591399233895, "loss": 10.742, "step": 1632, "throughput": 8550.000233087334 }, { "epoch": 0.026081709104303324, "grad_norm": 0.13725745677947998, "learning_rate": 0.00029963939275141855, "loss": 10.7043, "step": 1664, "throughput": 8559.634576291046 }, { "epoch": 0.026583280433232236, "grad_norm": 0.15282359719276428, "learning_rate": 0.00029961909032619275, "loss": 10.6942, "step": 1696, "throughput": 8557.213588623437 }, { "epoch": 0.027084851762161147, "grad_norm": 0.15533699095249176, "learning_rate": 0.00029959823273143947, "loss": 10.6708, "step": 1728, "throughput": 8563.929337010282 }, { "epoch": 0.027586423091090054, "grad_norm": 0.1476975679397583, "learning_rate": 0.0002995768200531755, "loss": 10.6754, "step": 1760, "throughput": 8572.364628252106 }, { "epoch": 0.028087994420018966, "grad_norm": 0.13373318314552307, "learning_rate": 0.00029955485237970675, "loss": 10.633, "step": 1792, "throughput": 8580.991220450485 }, { "epoch": 0.028589565748947877, "grad_norm": 0.13255077600479126, "learning_rate": 0.00029953232980162793, "loss": 10.612, "step": 1824, "throughput": 8579.270567797352 }, { "epoch": 0.029091137077876784, "grad_norm": 0.1498062014579773, "learning_rate": 0.0002995092524118223, "loss": 10.57, "step": 1856, "throughput": 8584.96901712662 }, { "epoch": 0.029592708406805696, "grad_norm": 0.1309068202972412, "learning_rate": 0.00029948562030546107, "loss": 10.5787, "step": 1888, "throughput": 8592.560589507188 }, { "epoch": 0.030094279735734607, "grad_norm": 0.13275641202926636, "learning_rate": 0.00029946143358000306, "loss": 10.5466, "step": 1920, "throughput": 8600.325823833216 }, { "epoch": 0.030595851064663518, "grad_norm": 0.13608723878860474, "learning_rate": 0.0002994366923351945, "loss": 10.531, "step": 1952, "throughput": 8598.203774351727 }, { "epoch": 0.031097422393592426, "grad_norm": 0.12380360066890717, "learning_rate": 0.00029941139667306817, "loss": 10.5066, "step": 1984, "throughput": 8603.133954877543 }, { "epoch": 0.03159899372252133, "grad_norm": 0.145261749625206, "learning_rate": 0.00029938554669794364, "loss": 10.4803, "step": 2016, "throughput": 8609.89138038132 }, { "epoch": 0.032100565051450244, "grad_norm": 0.13648687303066254, "learning_rate": 0.00029935914251642625, "loss": 10.4657, "step": 2048, "throughput": 8616.946997737774 }, { "epoch": 0.032602136380379156, "grad_norm": 0.14016900956630707, "learning_rate": 0.0002993321842374069, "loss": 10.4494, "step": 2080, "throughput": 8604.296026764225 }, { "epoch": 0.03310370770930807, "grad_norm": 0.14942410588264465, "learning_rate": 0.00029930467197206156, "loss": 10.4193, "step": 2112, "throughput": 8608.41214757555 }, { "epoch": 0.03360527903823698, "grad_norm": 0.14700458943843842, "learning_rate": 0.000299276605833851, "loss": 10.3977, "step": 2144, "throughput": 8614.543771421539 }, { "epoch": 0.03410685036716589, "grad_norm": 0.12929627299308777, "learning_rate": 0.00029924798593851994, "loss": 10.3986, "step": 2176, "throughput": 8621.065638504637 }, { "epoch": 0.0346084216960948, "grad_norm": 0.14943945407867432, "learning_rate": 0.00029921881240409703, "loss": 10.3857, "step": 2208, "throughput": 8618.170501024351 }, { "epoch": 0.035109993025023704, "grad_norm": 0.1490052491426468, "learning_rate": 0.00029918908535089394, "loss": 10.3782, "step": 2240, "throughput": 8621.5559803982 }, { "epoch": 0.035611564353952616, "grad_norm": 0.1199193075299263, "learning_rate": 0.00029915880490150515, "loss": 10.3513, "step": 2272, "throughput": 8627.148534803291 }, { "epoch": 0.03611313568288153, "grad_norm": 0.12391626089811325, "learning_rate": 0.0002991279711808072, "loss": 10.3492, "step": 2304, "throughput": 8633.14469700088 }, { "epoch": 0.03661470701181044, "grad_norm": 0.1489352434873581, "learning_rate": 0.0002990965843159587, "loss": 10.3098, "step": 2336, "throughput": 8631.454564886973 }, { "epoch": 0.03711627834073935, "grad_norm": 0.12652361392974854, "learning_rate": 0.000299064644436399, "loss": 10.3038, "step": 2368, "throughput": 8634.488781208483 }, { "epoch": 0.03761784966966826, "grad_norm": 0.13568635284900665, "learning_rate": 0.0002990321516738482, "loss": 10.2685, "step": 2400, "throughput": 8640.191384361931 }, { "epoch": 0.038119420998597164, "grad_norm": 0.1313263326883316, "learning_rate": 0.00029899910616230674, "loss": 10.2806, "step": 2432, "throughput": 8645.23888018826 }, { "epoch": 0.038620992327526076, "grad_norm": 0.12250286340713501, "learning_rate": 0.0002989655080380543, "loss": 10.2797, "step": 2464, "throughput": 8642.636898666538 }, { "epoch": 0.03912256365645499, "grad_norm": 0.11763161420822144, "learning_rate": 0.0002989313574396496, "loss": 10.2454, "step": 2496, "throughput": 8645.384442994879 }, { "epoch": 0.0396241349853839, "grad_norm": 0.13227710127830505, "learning_rate": 0.00029889665450792983, "loss": 10.2309, "step": 2528, "throughput": 8650.663248170937 }, { "epoch": 0.04012570631431281, "grad_norm": 0.12891387939453125, "learning_rate": 0.0002988613993860101, "loss": 10.2252, "step": 2560, "throughput": 8655.331219016643 }, { "epoch": 0.04062727764324172, "grad_norm": 0.13591551780700684, "learning_rate": 0.0002988255922192825, "loss": 10.2132, "step": 2592, "throughput": 8652.438713629675 }, { "epoch": 0.04112884897217063, "grad_norm": 0.13800325989723206, "learning_rate": 0.000298789233155416, "loss": 10.1996, "step": 2624, "throughput": 8655.184002646449 }, { "epoch": 0.041630420301099536, "grad_norm": 0.13499251008033752, "learning_rate": 0.0002987523223443554, "loss": 10.1903, "step": 2656, "throughput": 8659.739091286676 }, { "epoch": 0.04213199163002845, "grad_norm": 0.12725511193275452, "learning_rate": 0.000298714859938321, "loss": 10.1742, "step": 2688, "throughput": 8664.175771510912 }, { "epoch": 0.04263356295895736, "grad_norm": 0.13918425142765045, "learning_rate": 0.0002986768460918079, "loss": 10.1607, "step": 2720, "throughput": 8661.906185594662 }, { "epoch": 0.04313513428788627, "grad_norm": 0.12420102953910828, "learning_rate": 0.0002986382809615853, "loss": 10.1532, "step": 2752, "throughput": 8664.108344168615 }, { "epoch": 0.04363670561681518, "grad_norm": 0.13506534695625305, "learning_rate": 0.00029859916470669596, "loss": 10.1531, "step": 2784, "throughput": 8668.340072989078 }, { "epoch": 0.04413827694574409, "grad_norm": 0.11485131084918976, "learning_rate": 0.0002985594974884554, "loss": 10.1036, "step": 2816, "throughput": 8672.514168355223 }, { "epoch": 0.044639848274673, "grad_norm": 0.1429208666086197, "learning_rate": 0.00029851927947045136, "loss": 10.1181, "step": 2848, "throughput": 8670.511688031254 }, { "epoch": 0.04514141960360191, "grad_norm": 0.11533421277999878, "learning_rate": 0.000298478510818543, "loss": 10.0922, "step": 2880, "throughput": 8672.608976050276 }, { "epoch": 0.04564299093253082, "grad_norm": 0.12951959669589996, "learning_rate": 0.0002984371917008604, "loss": 10.0784, "step": 2912, "throughput": 8676.554281193294 }, { "epoch": 0.04614456226145973, "grad_norm": 0.1254311501979828, "learning_rate": 0.0002983953222878037, "loss": 10.096, "step": 2944, "throughput": 8680.355487940858 }, { "epoch": 0.04664613359038864, "grad_norm": 0.15013115108013153, "learning_rate": 0.0002983529027520426, "loss": 10.0558, "step": 2976, "throughput": 8678.506209881712 }, { "epoch": 0.04714770491931755, "grad_norm": 0.12344110757112503, "learning_rate": 0.0002983099332685153, "loss": 10.065, "step": 3008, "throughput": 8680.447547364309 }, { "epoch": 0.04764927624824646, "grad_norm": 0.12854412198066711, "learning_rate": 0.000298266414014428, "loss": 10.0563, "step": 3040, "throughput": 8684.174220917293 }, { "epoch": 0.04815084757717537, "grad_norm": 0.12837445735931396, "learning_rate": 0.0002982223451692544, "loss": 10.0484, "step": 3072, "throughput": 8687.846008837005 }, { "epoch": 0.04865241890610428, "grad_norm": 0.13306200504302979, "learning_rate": 0.0002981777269147344, "loss": 10.0373, "step": 3104, "throughput": 8686.269605240876 }, { "epoch": 0.04915399023503319, "grad_norm": 0.1327691376209259, "learning_rate": 0.0002981325594348739, "loss": 10.0474, "step": 3136, "throughput": 8687.881093275713 }, { "epoch": 0.0496555615639621, "grad_norm": 0.11801597476005554, "learning_rate": 0.00029808684291594373, "loss": 10.0057, "step": 3168, "throughput": 8691.366927033056 }, { "epoch": 0.05015713289289101, "grad_norm": 0.114040307700634, "learning_rate": 0.0002980405775464789, "loss": 9.9989, "step": 3200, "throughput": 8694.78405139367 }, { "epoch": 0.05065870422181992, "grad_norm": 0.11511870473623276, "learning_rate": 0.00029799376351727797, "loss": 9.9831, "step": 3232, "throughput": 8692.988671152094 }, { "epoch": 0.051160275550748834, "grad_norm": 0.13341563940048218, "learning_rate": 0.00029794640102140206, "loss": 9.9744, "step": 3264, "throughput": 8694.304613157901 }, { "epoch": 0.05166184687967774, "grad_norm": 0.14193041622638702, "learning_rate": 0.00029789849025417433, "loss": 9.9716, "step": 3296, "throughput": 8697.71995228885 }, { "epoch": 0.05216341820860665, "grad_norm": 0.11138767004013062, "learning_rate": 0.0002978500314131789, "loss": 10.0049, "step": 3328, "throughput": 8700.995142735857 }, { "epoch": 0.05266498953753556, "grad_norm": 0.13181596994400024, "learning_rate": 0.00029780102469826014, "loss": 9.9559, "step": 3360, "throughput": 8699.412860118453 }, { "epoch": 0.05316656086646447, "grad_norm": 0.11928830295801163, "learning_rate": 0.00029775147031152195, "loss": 9.9436, "step": 3392, "throughput": 8700.922524938454 }, { "epoch": 0.05366813219539338, "grad_norm": 0.12960603833198547, "learning_rate": 0.0002977013684573267, "loss": 9.9464, "step": 3424, "throughput": 8704.083537777888 }, { "epoch": 0.054169703524322294, "grad_norm": 0.12624278664588928, "learning_rate": 0.0002976507193422946, "loss": 9.939, "step": 3456, "throughput": 8707.186611773384 }, { "epoch": 0.0546712748532512, "grad_norm": 0.12646783888339996, "learning_rate": 0.00029759952317530284, "loss": 9.9485, "step": 3488, "throughput": 8705.7946206228 }, { "epoch": 0.05517284618218011, "grad_norm": 0.1135874092578888, "learning_rate": 0.0002975477801674845, "loss": 9.8956, "step": 3520, "throughput": 8706.898540237127 }, { "epoch": 0.05567441751110902, "grad_norm": 0.11660734564065933, "learning_rate": 0.00029749549053222784, "loss": 9.9178, "step": 3552, "throughput": 8709.896727910347 }, { "epoch": 0.05617598884003793, "grad_norm": 0.11348798871040344, "learning_rate": 0.0002974426544851755, "loss": 9.8913, "step": 3584, "throughput": 8712.842678586643 }, { "epoch": 0.05667756016896684, "grad_norm": 0.11837482452392578, "learning_rate": 0.00029738927224422354, "loss": 9.8937, "step": 3616, "throughput": 8711.675362629981 }, { "epoch": 0.057179131497895753, "grad_norm": 0.1253383755683899, "learning_rate": 0.0002973353440295205, "loss": 9.8689, "step": 3648, "throughput": 8712.58627493939 }, { "epoch": 0.057680702826824665, "grad_norm": 0.12361543625593185, "learning_rate": 0.0002972808700634664, "loss": 9.8713, "step": 3680, "throughput": 8715.511098762165 }, { "epoch": 0.05818227415575357, "grad_norm": 0.1185847818851471, "learning_rate": 0.0002972258505707121, "loss": 9.8642, "step": 3712, "throughput": 8718.259214378111 }, { "epoch": 0.05868384548468248, "grad_norm": 0.11482404917478561, "learning_rate": 0.00029717028577815817, "loss": 9.8517, "step": 3744, "throughput": 8716.86528152881 }, { "epoch": 0.05918541681361139, "grad_norm": 0.12030131369829178, "learning_rate": 0.0002971141759149539, "loss": 9.8704, "step": 3776, "throughput": 8717.99026040238 }, { "epoch": 0.0596869881425403, "grad_norm": 0.11121921986341476, "learning_rate": 0.00029705752121249665, "loss": 9.846, "step": 3808, "throughput": 8720.784126088147 }, { "epoch": 0.060188559471469213, "grad_norm": 0.11496740579605103, "learning_rate": 0.0002970003219044305, "loss": 9.833, "step": 3840, "throughput": 8723.510819994406 }, { "epoch": 0.060690130800398125, "grad_norm": 0.11950097233057022, "learning_rate": 0.0002969425782266455, "loss": 9.847, "step": 3872, "throughput": 8722.220559962287 }, { "epoch": 0.061191702129327036, "grad_norm": 0.11807090044021606, "learning_rate": 0.0002968842904172769, "loss": 9.837, "step": 3904, "throughput": 8722.88462889842 }, { "epoch": 0.06169327345825594, "grad_norm": 0.11705534160137177, "learning_rate": 0.00029682545871670375, "loss": 9.8289, "step": 3936, "throughput": 8725.452473969277 }, { "epoch": 0.06219484478718485, "grad_norm": 0.11206234246492386, "learning_rate": 0.0002967660833675481, "loss": 9.814, "step": 3968, "throughput": 8728.053752565047 }, { "epoch": 0.06269641611611376, "grad_norm": 0.10935520380735397, "learning_rate": 0.0002967061646146741, "loss": 9.8, "step": 4000, "throughput": 8726.928482257368 }, { "epoch": 0.06319798744504267, "grad_norm": 0.12385314702987671, "learning_rate": 0.00029664570270518685, "loss": 9.7849, "step": 4032, "throughput": 8727.5696353496 }, { "epoch": 0.06369955877397158, "grad_norm": 0.11679193377494812, "learning_rate": 0.00029658469788843147, "loss": 9.7898, "step": 4064, "throughput": 8730.008881642094 }, { "epoch": 0.06420113010290049, "grad_norm": 0.11991138756275177, "learning_rate": 0.00029652315041599203, "loss": 9.7781, "step": 4096, "throughput": 8732.473879711068 }, { "epoch": 0.0647027014318294, "grad_norm": 0.13184259831905365, "learning_rate": 0.00029646106054169046, "loss": 9.7812, "step": 4128, "throughput": 8726.75567838056 }, { "epoch": 0.06520427276075831, "grad_norm": 0.11864672601222992, "learning_rate": 0.00029639842852158553, "loss": 9.7764, "step": 4160, "throughput": 8727.531637802169 }, { "epoch": 0.06570584408968723, "grad_norm": 0.10902588814496994, "learning_rate": 0.00029633525461397194, "loss": 9.7707, "step": 4192, "throughput": 8729.976476059877 }, { "epoch": 0.06620741541861613, "grad_norm": 0.10904784500598907, "learning_rate": 0.00029627153907937903, "loss": 9.7731, "step": 4224, "throughput": 8732.42270047259 }, { "epoch": 0.06670898674754504, "grad_norm": 0.1083018109202385, "learning_rate": 0.0002962072821805699, "loss": 9.7378, "step": 4256, "throughput": 8731.099681392934 }, { "epoch": 0.06721055807647396, "grad_norm": 0.1126420721411705, "learning_rate": 0.0002961424841825402, "loss": 9.7451, "step": 4288, "throughput": 8731.601959832633 }, { "epoch": 0.06771212940540286, "grad_norm": 0.11158367246389389, "learning_rate": 0.00029607714535251703, "loss": 9.7397, "step": 4320, "throughput": 8733.870236599323 }, { "epoch": 0.06821370073433178, "grad_norm": 0.10763044655323029, "learning_rate": 0.00029601126595995794, "loss": 9.7427, "step": 4352, "throughput": 8736.180002336438 }, { "epoch": 0.06871527206326068, "grad_norm": 0.11694357544183731, "learning_rate": 0.0002959448462765497, "loss": 9.7229, "step": 4384, "throughput": 8734.995267095126 }, { "epoch": 0.0692168433921896, "grad_norm": 0.1064562126994133, "learning_rate": 0.0002958778865762072, "loss": 9.7388, "step": 4416, "throughput": 8735.564838968481 }, { "epoch": 0.0697184147211185, "grad_norm": 0.10848239809274673, "learning_rate": 0.0002958103871350727, "loss": 9.7179, "step": 4448, "throughput": 8737.615081448372 }, { "epoch": 0.07021998605004741, "grad_norm": 0.10709454864263535, "learning_rate": 0.0002957423482315139, "loss": 9.7198, "step": 4480, "throughput": 8739.853794654147 }, { "epoch": 0.07072155737897633, "grad_norm": 0.12926509976387024, "learning_rate": 0.0002956737701461235, "loss": 9.7057, "step": 4512, "throughput": 8739.03085797585 }, { "epoch": 0.07122312870790523, "grad_norm": 0.1091700941324234, "learning_rate": 0.00029560465316171773, "loss": 9.6923, "step": 4544, "throughput": 8739.543393337932 }, { "epoch": 0.07172470003683415, "grad_norm": 0.11444082856178284, "learning_rate": 0.0002955349975633352, "loss": 9.7073, "step": 4576, "throughput": 8741.510902754704 }, { "epoch": 0.07222627136576305, "grad_norm": 0.1193537712097168, "learning_rate": 0.00029546480363823577, "loss": 9.7022, "step": 4608, "throughput": 8743.724679614512 }, { "epoch": 0.07272784269469197, "grad_norm": 0.11008062213659286, "learning_rate": 0.0002953940716758995, "loss": 9.6769, "step": 4640, "throughput": 8742.567475185164 }, { "epoch": 0.07322941402362088, "grad_norm": 0.11342762410640717, "learning_rate": 0.0002953228019680252, "loss": 9.6866, "step": 4672, "throughput": 8743.06418128608 }, { "epoch": 0.07373098535254978, "grad_norm": 0.10978548973798752, "learning_rate": 0.0002952509948085293, "loss": 9.6647, "step": 4704, "throughput": 8744.934773526928 }, { "epoch": 0.0742325566814787, "grad_norm": 0.10810253769159317, "learning_rate": 0.00029517865049354477, "loss": 9.6861, "step": 4736, "throughput": 8747.043262691786 }, { "epoch": 0.0747341280104076, "grad_norm": 0.12077952176332474, "learning_rate": 0.0002951057693214197, "loss": 9.6609, "step": 4768, "throughput": 8745.863832630565 }, { "epoch": 0.07523569933933652, "grad_norm": 0.12620708346366882, "learning_rate": 0.0002950323515927164, "loss": 9.6417, "step": 4800, "throughput": 8746.33913872354 }, { "epoch": 0.07573727066826542, "grad_norm": 0.10805953294038773, "learning_rate": 0.0002949583976102097, "loss": 9.6571, "step": 4832, "throughput": 8748.119129389326 }, { "epoch": 0.07623884199719433, "grad_norm": 0.10737334191799164, "learning_rate": 0.00029488390767888606, "loss": 9.6458, "step": 4864, "throughput": 8750.2710186966 }, { "epoch": 0.07674041332612325, "grad_norm": 0.10387426614761353, "learning_rate": 0.0002948088821059422, "loss": 9.6404, "step": 4896, "throughput": 8749.044764652741 }, { "epoch": 0.07724198465505215, "grad_norm": 0.10420756787061691, "learning_rate": 0.0002947333212007838, "loss": 9.6436, "step": 4928, "throughput": 8749.480158538867 }, { "epoch": 0.07774355598398107, "grad_norm": 0.11674252152442932, "learning_rate": 0.0002946572252750242, "loss": 9.6466, "step": 4960, "throughput": 8751.365321390707 }, { "epoch": 0.07824512731290997, "grad_norm": 0.11858739703893661, "learning_rate": 0.0002945805946424834, "loss": 9.6272, "step": 4992, "throughput": 8753.230851191058 }, { "epoch": 0.07874669864183889, "grad_norm": 0.12305985391139984, "learning_rate": 0.0002945034296191861, "loss": 9.636, "step": 5024, "throughput": 8751.780277949114 }, { "epoch": 0.0792482699707678, "grad_norm": 0.11536737531423569, "learning_rate": 0.00029442573052336127, "loss": 9.6316, "step": 5056, "throughput": 8752.300934613511 }, { "epoch": 0.0797498412996967, "grad_norm": 0.10255605727434158, "learning_rate": 0.0002943474976754401, "loss": 9.5882, "step": 5088, "throughput": 8754.111959971262 }, { "epoch": 0.08025141262862562, "grad_norm": 0.1140480637550354, "learning_rate": 0.0002942687313980552, "loss": 9.6156, "step": 5120, "throughput": 8755.960772925051 }, { "epoch": 0.08075298395755452, "grad_norm": 0.10552536696195602, "learning_rate": 0.0002941894320160389, "loss": 9.6169, "step": 5152, "throughput": 8754.902346274297 }, { "epoch": 0.08125455528648344, "grad_norm": 0.10546742379665375, "learning_rate": 0.00029410959985642205, "loss": 9.5985, "step": 5184, "throughput": 8755.319127740058 }, { "epoch": 0.08175612661541234, "grad_norm": 0.10557737201452255, "learning_rate": 0.0002940292352484327, "loss": 9.5846, "step": 5216, "throughput": 8757.070036568484 }, { "epoch": 0.08225769794434126, "grad_norm": 0.10859539359807968, "learning_rate": 0.0002939483385234948, "loss": 9.5832, "step": 5248, "throughput": 8758.805477156033 }, { "epoch": 0.08275926927327017, "grad_norm": 0.1116238459944725, "learning_rate": 0.0002938669100152266, "loss": 9.605, "step": 5280, "throughput": 8758.001263340446 }, { "epoch": 0.08326084060219907, "grad_norm": 0.10401325672864914, "learning_rate": 0.00029378495005943954, "loss": 9.5738, "step": 5312, "throughput": 8758.390589772524 }, { "epoch": 0.08376241193112799, "grad_norm": 0.11249975115060806, "learning_rate": 0.00029370245899413677, "loss": 9.5761, "step": 5344, "throughput": 8759.943487108056 }, { "epoch": 0.0842639832600569, "grad_norm": 0.10400120913982391, "learning_rate": 0.0002936194371595116, "loss": 9.5772, "step": 5376, "throughput": 8761.705736556785 }, { "epoch": 0.08476555458898581, "grad_norm": 0.11678726971149445, "learning_rate": 0.00029353588489794636, "loss": 9.5678, "step": 5408, "throughput": 8760.785024398701 }, { "epoch": 0.08526712591791472, "grad_norm": 0.10207447409629822, "learning_rate": 0.0002934518025540109, "loss": 9.5648, "step": 5440, "throughput": 8761.207794467886 }, { "epoch": 0.08576869724684363, "grad_norm": 0.10971741378307343, "learning_rate": 0.00029336719047446096, "loss": 9.5842, "step": 5472, "throughput": 8762.6991526964 }, { "epoch": 0.08627026857577254, "grad_norm": 0.10587865859270096, "learning_rate": 0.000293282049008237, "loss": 9.5558, "step": 5504, "throughput": 8764.382572374267 }, { "epoch": 0.08677183990470144, "grad_norm": 0.10832160711288452, "learning_rate": 0.00029319637850646273, "loss": 9.5593, "step": 5536, "throughput": 8763.682026107219 }, { "epoch": 0.08727341123363036, "grad_norm": 0.1197381466627121, "learning_rate": 0.0002931101793224435, "loss": 9.5612, "step": 5568, "throughput": 8764.26764075061 }, { "epoch": 0.08777498256255926, "grad_norm": 0.10048508644104004, "learning_rate": 0.0002930234518116651, "loss": 9.5599, "step": 5600, "throughput": 8765.526614902168 }, { "epoch": 0.08827655389148818, "grad_norm": 0.11333134025335312, "learning_rate": 0.000292936196331792, "loss": 9.5267, "step": 5632, "throughput": 8767.121347324892 }, { "epoch": 0.08877812522041709, "grad_norm": 0.10634933412075043, "learning_rate": 0.000292848413242666, "loss": 9.5516, "step": 5664, "throughput": 8766.28425052529 }, { "epoch": 0.089279696549346, "grad_norm": 0.1022300124168396, "learning_rate": 0.0002927601029063049, "loss": 9.5298, "step": 5696, "throughput": 8766.822494400065 }, { "epoch": 0.08978126787827491, "grad_norm": 0.11001124978065491, "learning_rate": 0.0002926712656869007, "loss": 9.5238, "step": 5728, "throughput": 8768.014349855302 }, { "epoch": 0.09028283920720381, "grad_norm": 0.10148325562477112, "learning_rate": 0.0002925819019508184, "loss": 9.5295, "step": 5760, "throughput": 8769.51986607424 }, { "epoch": 0.09078441053613273, "grad_norm": 0.11986260861158371, "learning_rate": 0.0002924920120665943, "loss": 9.5362, "step": 5792, "throughput": 8768.542280819205 }, { "epoch": 0.09128598186506164, "grad_norm": 0.11802539229393005, "learning_rate": 0.00029240159640493463, "loss": 9.5297, "step": 5824, "throughput": 8769.037029142866 }, { "epoch": 0.09178755319399055, "grad_norm": 0.10226628929376602, "learning_rate": 0.00029231065533871374, "loss": 9.5186, "step": 5856, "throughput": 8770.189301563876 }, { "epoch": 0.09228912452291946, "grad_norm": 0.09981340169906616, "learning_rate": 0.0002922191892429729, "loss": 9.4993, "step": 5888, "throughput": 8771.715936979588 }, { "epoch": 0.09279069585184836, "grad_norm": 0.11034058779478073, "learning_rate": 0.0002921271984949185, "loss": 9.5075, "step": 5920, "throughput": 8771.140273309473 }, { "epoch": 0.09329226718077728, "grad_norm": 0.09680221229791641, "learning_rate": 0.0002920346834739208, "loss": 9.4944, "step": 5952, "throughput": 8771.453342478308 }, { "epoch": 0.09379383850970618, "grad_norm": 0.10602926462888718, "learning_rate": 0.0002919416445615119, "loss": 9.4971, "step": 5984, "throughput": 8772.612423388544 }, { "epoch": 0.0942954098386351, "grad_norm": 0.09529194980859756, "learning_rate": 0.0002918480821413846, "loss": 9.4783, "step": 6016, "throughput": 8774.060226804862 }, { "epoch": 0.094796981167564, "grad_norm": 0.11012410372495651, "learning_rate": 0.0002917539965993906, "loss": 9.4814, "step": 6048, "throughput": 8773.541619500458 }, { "epoch": 0.09529855249649292, "grad_norm": 0.11098303645849228, "learning_rate": 0.00029165938832353885, "loss": 9.486, "step": 6080, "throughput": 8773.753906713271 }, { "epoch": 0.09580012382542183, "grad_norm": 0.10786409676074982, "learning_rate": 0.00029156425770399434, "loss": 9.4732, "step": 6112, "throughput": 8774.859206813278 }, { "epoch": 0.09630169515435073, "grad_norm": 0.10273081809282303, "learning_rate": 0.0002914686051330759, "loss": 9.4749, "step": 6144, "throughput": 8776.312027347161 }, { "epoch": 0.09680326648327965, "grad_norm": 0.10742319375276566, "learning_rate": 0.00029137243100525506, "loss": 9.4978, "step": 6176, "throughput": 8772.277513786172 }, { "epoch": 0.09730483781220856, "grad_norm": 0.10680707544088364, "learning_rate": 0.00029127573571715416, "loss": 9.4679, "step": 6208, "throughput": 8772.23422342053 }, { "epoch": 0.09780640914113747, "grad_norm": 0.10717196762561798, "learning_rate": 0.00029117851966754495, "loss": 9.4652, "step": 6240, "throughput": 8773.386896101358 }, { "epoch": 0.09830798047006638, "grad_norm": 0.09716420620679855, "learning_rate": 0.00029108078325734666, "loss": 9.4707, "step": 6272, "throughput": 8774.817011223837 }, { "epoch": 0.0988095517989953, "grad_norm": 0.10774450749158859, "learning_rate": 0.0002909825268896245, "loss": 9.4607, "step": 6304, "throughput": 8774.22740411009 }, { "epoch": 0.0993111231279242, "grad_norm": 0.09601946175098419, "learning_rate": 0.000290883750969588, "loss": 9.4527, "step": 6336, "throughput": 8774.561502422925 }, { "epoch": 0.0998126944568531, "grad_norm": 0.10894615203142166, "learning_rate": 0.00029078445590458946, "loss": 9.442, "step": 6368, "throughput": 8775.561618802984 }, { "epoch": 0.10031426578578202, "grad_norm": 0.09985128045082092, "learning_rate": 0.0002906846421041219, "loss": 9.479, "step": 6400, "throughput": 8776.897730433244 }, { "epoch": 0.10081583711471093, "grad_norm": 0.09858572483062744, "learning_rate": 0.00029058430997981784, "loss": 9.4263, "step": 6432, "throughput": 8776.367922921861 }, { "epoch": 0.10131740844363984, "grad_norm": 0.10523559898138046, "learning_rate": 0.0002904834599454472, "loss": 9.4307, "step": 6464, "throughput": 8776.785085883334 }, { "epoch": 0.10181897977256875, "grad_norm": 0.0995122492313385, "learning_rate": 0.00029038209241691575, "loss": 9.4566, "step": 6496, "throughput": 8777.531086805837 }, { "epoch": 0.10232055110149767, "grad_norm": 0.10437292605638504, "learning_rate": 0.0002902802078122636, "loss": 9.4127, "step": 6528, "throughput": 8778.849186284346 }, { "epoch": 0.10282212243042657, "grad_norm": 0.10644105076789856, "learning_rate": 0.00029017780655166315, "loss": 9.4328, "step": 6560, "throughput": 8778.169575081867 }, { "epoch": 0.10332369375935548, "grad_norm": 0.10638494044542313, "learning_rate": 0.0002900748890574175, "loss": 9.4391, "step": 6592, "throughput": 8778.733712323921 }, { "epoch": 0.1038252650882844, "grad_norm": 0.09863536059856415, "learning_rate": 0.0002899714557539586, "loss": 9.4357, "step": 6624, "throughput": 8779.33307552037 }, { "epoch": 0.1043268364172133, "grad_norm": 0.11023005098104477, "learning_rate": 0.00028986750706784574, "loss": 9.4383, "step": 6656, "throughput": 8780.65893379113 }, { "epoch": 0.10482840774614222, "grad_norm": 0.10076352208852768, "learning_rate": 0.0002897630434277637, "loss": 9.4148, "step": 6688, "throughput": 8780.240505214717 }, { "epoch": 0.10532997907507112, "grad_norm": 0.09036028385162354, "learning_rate": 0.0002896580652645207, "loss": 9.4093, "step": 6720, "throughput": 8780.66309519438 }, { "epoch": 0.10583155040400004, "grad_norm": 0.09312684834003448, "learning_rate": 0.00028955257301104714, "loss": 9.3995, "step": 6752, "throughput": 8781.240044175389 }, { "epoch": 0.10633312173292894, "grad_norm": 0.114221952855587, "learning_rate": 0.00028944656710239337, "loss": 9.3911, "step": 6784, "throughput": 8782.48113812559 }, { "epoch": 0.10683469306185785, "grad_norm": 0.0931376963853836, "learning_rate": 0.00028934004797572795, "loss": 9.427, "step": 6816, "throughput": 8782.077289376892 }, { "epoch": 0.10733626439078676, "grad_norm": 0.10255315154790878, "learning_rate": 0.00028923301607033616, "loss": 9.3771, "step": 6848, "throughput": 8782.590252463471 }, { "epoch": 0.10783783571971567, "grad_norm": 0.10455886274576187, "learning_rate": 0.0002891254718276178, "loss": 9.4268, "step": 6880, "throughput": 8783.169961134565 }, { "epoch": 0.10833940704864459, "grad_norm": 0.09569968283176422, "learning_rate": 0.00028901741569108586, "loss": 9.3963, "step": 6912, "throughput": 8784.373364351874 }, { "epoch": 0.10884097837757349, "grad_norm": 0.10171248763799667, "learning_rate": 0.00028890884810636394, "loss": 9.4016, "step": 6944, "throughput": 8783.993265016647 }, { "epoch": 0.1093425497065024, "grad_norm": 0.0940864086151123, "learning_rate": 0.00028879976952118523, "loss": 9.3953, "step": 6976, "throughput": 8784.41693816875 }, { "epoch": 0.10984412103543131, "grad_norm": 0.09311022609472275, "learning_rate": 0.0002886901803853901, "loss": 9.4155, "step": 7008, "throughput": 8784.969029180918 }, { "epoch": 0.11034569236436022, "grad_norm": 0.10037797689437866, "learning_rate": 0.00028858008115092445, "loss": 9.3822, "step": 7040, "throughput": 8786.190972002953 }, { "epoch": 0.11084726369328914, "grad_norm": 0.09330254048109055, "learning_rate": 0.0002884694722718378, "loss": 9.3832, "step": 7072, "throughput": 8785.815264238152 }, { "epoch": 0.11134883502221804, "grad_norm": 0.10277887433767319, "learning_rate": 0.00028835835420428163, "loss": 9.3735, "step": 7104, "throughput": 8786.055555264302 }, { "epoch": 0.11185040635114696, "grad_norm": 0.09488580375909805, "learning_rate": 0.000288246727406507, "loss": 9.3748, "step": 7136, "throughput": 8786.588604106677 }, { "epoch": 0.11235197768007586, "grad_norm": 0.10583002865314484, "learning_rate": 0.00028813459233886335, "loss": 9.3646, "step": 7168, "throughput": 8787.867037853663 }, { "epoch": 0.11285354900900477, "grad_norm": 0.10686413943767548, "learning_rate": 0.00028802194946379585, "loss": 9.3436, "step": 7200, "throughput": 8787.70036951069 }, { "epoch": 0.11335512033793368, "grad_norm": 0.09465917944908142, "learning_rate": 0.0002879087992458442, "loss": 9.3593, "step": 7232, "throughput": 8787.736361138324 }, { "epoch": 0.11385669166686259, "grad_norm": 0.09683835506439209, "learning_rate": 0.00028779514215164015, "loss": 9.3462, "step": 7264, "throughput": 8788.262568915294 }, { "epoch": 0.11435826299579151, "grad_norm": 0.09295953065156937, "learning_rate": 0.0002876809786499059, "loss": 9.3604, "step": 7296, "throughput": 8789.55067224747 }, { "epoch": 0.11485983432472041, "grad_norm": 0.08935536444187164, "learning_rate": 0.0002875663092114521, "loss": 9.3685, "step": 7328, "throughput": 8789.450310964028 }, { "epoch": 0.11536140565364933, "grad_norm": 0.1067778691649437, "learning_rate": 0.0002874511343091758, "loss": 9.3559, "step": 7360, "throughput": 8789.533171952675 }, { "epoch": 0.11586297698257823, "grad_norm": 0.1031869426369667, "learning_rate": 0.00028733545441805874, "loss": 9.359, "step": 7392, "throughput": 8790.117743777615 }, { "epoch": 0.11636454831150714, "grad_norm": 0.09314560145139694, "learning_rate": 0.00028721927001516503, "loss": 9.3671, "step": 7424, "throughput": 8791.3475967336 }, { "epoch": 0.11686611964043606, "grad_norm": 0.10189881175756454, "learning_rate": 0.00028710258157963955, "loss": 9.356, "step": 7456, "throughput": 8791.08301501178 }, { "epoch": 0.11736769096936496, "grad_norm": 0.10202765464782715, "learning_rate": 0.00028698538959270577, "loss": 9.3532, "step": 7488, "throughput": 8791.172960360063 }, { "epoch": 0.11786926229829388, "grad_norm": 0.11254678666591644, "learning_rate": 0.00028686769453766366, "loss": 9.3508, "step": 7520, "throughput": 8791.70103738155 }, { "epoch": 0.11837083362722278, "grad_norm": 0.09686455875635147, "learning_rate": 0.00028674949689988814, "loss": 9.3269, "step": 7552, "throughput": 8792.904480615553 }, { "epoch": 0.1188724049561517, "grad_norm": 0.0967174768447876, "learning_rate": 0.00028663079716682654, "loss": 9.3219, "step": 7584, "throughput": 8792.668287370536 }, { "epoch": 0.1193739762850806, "grad_norm": 0.09918548911809921, "learning_rate": 0.00028651159582799695, "loss": 9.3291, "step": 7616, "throughput": 8792.746124169227 }, { "epoch": 0.11987554761400951, "grad_norm": 0.09100424498319626, "learning_rate": 0.000286391893374986, "loss": 9.3354, "step": 7648, "throughput": 8793.249523980605 }, { "epoch": 0.12037711894293843, "grad_norm": 0.10109537839889526, "learning_rate": 0.0002862716903014469, "loss": 9.3325, "step": 7680, "throughput": 8794.43398400681 }, { "epoch": 0.12087869027186733, "grad_norm": 0.09545495361089706, "learning_rate": 0.0002861509871030977, "loss": 9.3165, "step": 7712, "throughput": 8794.374266231913 }, { "epoch": 0.12138026160079625, "grad_norm": 0.09386946260929108, "learning_rate": 0.0002860297842777185, "loss": 9.2992, "step": 7744, "throughput": 8794.424310030801 }, { "epoch": 0.12188183292972515, "grad_norm": 0.09427899122238159, "learning_rate": 0.00028590808232515025, "loss": 9.3118, "step": 7776, "throughput": 8794.856080279087 }, { "epoch": 0.12238340425865407, "grad_norm": 0.09586696326732635, "learning_rate": 0.00028578588174729214, "loss": 9.3064, "step": 7808, "throughput": 8796.0053929776 }, { "epoch": 0.12288497558758298, "grad_norm": 0.10378427803516388, "learning_rate": 0.0002856631830480997, "loss": 9.3089, "step": 7840, "throughput": 8795.877169587182 }, { "epoch": 0.12338654691651188, "grad_norm": 0.09206092357635498, "learning_rate": 0.0002855399867335827, "loss": 9.303, "step": 7872, "throughput": 8795.95365876067 }, { "epoch": 0.1238881182454408, "grad_norm": 0.1028638631105423, "learning_rate": 0.0002854162933118032, "loss": 9.3107, "step": 7904, "throughput": 8796.369941584839 }, { "epoch": 0.1243896895743697, "grad_norm": 0.09411998093128204, "learning_rate": 0.0002852921032928732, "loss": 9.2964, "step": 7936, "throughput": 8797.50387293777 }, { "epoch": 0.12489126090329862, "grad_norm": 0.08698836714029312, "learning_rate": 0.0002851674171889526, "loss": 9.2972, "step": 7968, "throughput": 8797.140561187089 }, { "epoch": 0.12539283223222752, "grad_norm": 0.09941025078296661, "learning_rate": 0.0002850422355142474, "loss": 9.2937, "step": 8000, "throughput": 8797.22027781598 }, { "epoch": 0.12589440356115644, "grad_norm": 0.0902884230017662, "learning_rate": 0.00028491655878500716, "loss": 9.2986, "step": 8032, "throughput": 8797.70882325548 }, { "epoch": 0.12639597489008533, "grad_norm": 0.09279583394527435, "learning_rate": 0.0002847903875195231, "loss": 9.2844, "step": 8064, "throughput": 8798.813097833674 }, { "epoch": 0.12689754621901425, "grad_norm": 0.08902335166931152, "learning_rate": 0.00028466372223812575, "loss": 9.2554, "step": 8096, "throughput": 8798.64995728701 }, { "epoch": 0.12739911754794317, "grad_norm": 0.09631629288196564, "learning_rate": 0.0002845365634631833, "loss": 9.3199, "step": 8128, "throughput": 8798.723076061977 }, { "epoch": 0.1279006888768721, "grad_norm": 0.09583239257335663, "learning_rate": 0.0002844089117190988, "loss": 9.281, "step": 8160, "throughput": 8799.30489082074 }, { "epoch": 0.12840226020580098, "grad_norm": 0.1015479788184166, "learning_rate": 0.0002842807675323085, "loss": 9.2922, "step": 8192, "throughput": 8800.297249952806 }, { "epoch": 0.1289038315347299, "grad_norm": 0.08587797731161118, "learning_rate": 0.00028415213143127935, "loss": 9.2946, "step": 8224, "throughput": 8797.422031741422 }, { "epoch": 0.1294054028636588, "grad_norm": 0.08758696168661118, "learning_rate": 0.00028402300394650697, "loss": 9.2858, "step": 8256, "throughput": 8797.608642785463 }, { "epoch": 0.1299069741925877, "grad_norm": 0.09429500997066498, "learning_rate": 0.0002838933856105136, "loss": 9.2681, "step": 8288, "throughput": 8798.190467266051 }, { "epoch": 0.13040854552151662, "grad_norm": 0.09859396517276764, "learning_rate": 0.0002837632769578455, "loss": 9.284, "step": 8320, "throughput": 8798.986546394839 }, { "epoch": 0.13091011685044554, "grad_norm": 0.0930311530828476, "learning_rate": 0.00028363267852507133, "loss": 9.2665, "step": 8352, "throughput": 8798.865374036568 }, { "epoch": 0.13141168817937446, "grad_norm": 0.09189429879188538, "learning_rate": 0.0002835015908507793, "loss": 9.2858, "step": 8384, "throughput": 8799.037145915567 }, { "epoch": 0.13191325950830335, "grad_norm": 0.09416882693767548, "learning_rate": 0.0002833700144755753, "loss": 9.2591, "step": 8416, "throughput": 8799.647256916047 }, { "epoch": 0.13241483083723227, "grad_norm": 0.09771085530519485, "learning_rate": 0.0002832379499420808, "loss": 9.2772, "step": 8448, "throughput": 8800.302025027004 }, { "epoch": 0.13291640216616118, "grad_norm": 0.09605992585420609, "learning_rate": 0.0002831053977949303, "loss": 9.2571, "step": 8480, "throughput": 8800.269159153138 }, { "epoch": 0.13341797349509008, "grad_norm": 0.09242798388004303, "learning_rate": 0.00028297235858076923, "loss": 9.265, "step": 8512, "throughput": 8800.330415544971 }, { "epoch": 0.133919544824019, "grad_norm": 0.0932585746049881, "learning_rate": 0.0002828388328482517, "loss": 9.2515, "step": 8544, "throughput": 8800.918277345787 }, { "epoch": 0.1344211161529479, "grad_norm": 0.09092196822166443, "learning_rate": 0.0002827048211480383, "loss": 9.2499, "step": 8576, "throughput": 8801.608592729954 }, { "epoch": 0.13492268748187683, "grad_norm": 0.09770791232585907, "learning_rate": 0.00028257032403279354, "loss": 9.2567, "step": 8608, "throughput": 8801.558844237528 }, { "epoch": 0.13542425881080572, "grad_norm": 0.10055698454380035, "learning_rate": 0.00028243534205718405, "loss": 9.2512, "step": 8640, "throughput": 8801.763149454433 }, { "epoch": 0.13592583013973464, "grad_norm": 0.09112855792045593, "learning_rate": 0.00028229987577787585, "loss": 9.2453, "step": 8672, "throughput": 8802.337256009518 }, { "epoch": 0.13642740146866356, "grad_norm": 0.09379726648330688, "learning_rate": 0.00028216392575353225, "loss": 9.2256, "step": 8704, "throughput": 8803.026187051975 }, { "epoch": 0.13692897279759245, "grad_norm": 0.09850838780403137, "learning_rate": 0.00028202749254481165, "loss": 9.2331, "step": 8736, "throughput": 8803.125678082812 }, { "epoch": 0.13743054412652136, "grad_norm": 0.09180594235658646, "learning_rate": 0.0002818905767143649, "loss": 9.2448, "step": 8768, "throughput": 8803.326981019853 }, { "epoch": 0.13793211545545028, "grad_norm": 0.0894036740064621, "learning_rate": 0.0002817531788268333, "loss": 9.2408, "step": 8800, "throughput": 8803.89868187386 }, { "epoch": 0.1384336867843792, "grad_norm": 0.10017931461334229, "learning_rate": 0.0002816152994488462, "loss": 9.2397, "step": 8832, "throughput": 8804.577001273188 }, { "epoch": 0.1389352581133081, "grad_norm": 0.08872395753860474, "learning_rate": 0.0002814769391490185, "loss": 9.2626, "step": 8864, "throughput": 8804.650554415306 }, { "epoch": 0.139436829442237, "grad_norm": 0.08980926871299744, "learning_rate": 0.0002813380984979486, "loss": 9.2282, "step": 8896, "throughput": 8804.558633045863 }, { "epoch": 0.13993840077116593, "grad_norm": 0.08992139995098114, "learning_rate": 0.00028119877806821557, "loss": 9.2294, "step": 8928, "throughput": 8805.109496685498 }, { "epoch": 0.14043997210009482, "grad_norm": 0.09270741790533066, "learning_rate": 0.00028105897843437746, "loss": 9.2416, "step": 8960, "throughput": 8805.72966199901 }, { "epoch": 0.14094154342902374, "grad_norm": 0.09832657873630524, "learning_rate": 0.0002809187001729683, "loss": 9.2475, "step": 8992, "throughput": 8805.730741485399 }, { "epoch": 0.14144311475795265, "grad_norm": 0.09440915286540985, "learning_rate": 0.00028077794386249604, "loss": 9.2224, "step": 9024, "throughput": 8805.796344473592 }, { "epoch": 0.14194468608688157, "grad_norm": 0.09356938302516937, "learning_rate": 0.0002806367100834401, "loss": 9.2184, "step": 9056, "throughput": 8806.347742433114 }, { "epoch": 0.14244625741581046, "grad_norm": 0.08971024304628372, "learning_rate": 0.00028049499941824906, "loss": 9.225, "step": 9088, "throughput": 8806.948924234961 }, { "epoch": 0.14294782874473938, "grad_norm": 0.09602756053209305, "learning_rate": 0.0002803528124513382, "loss": 9.2023, "step": 9120, "throughput": 8807.236542892864 }, { "epoch": 0.1434494000736683, "grad_norm": 0.08790814876556396, "learning_rate": 0.00028021014976908676, "loss": 9.2285, "step": 9152, "throughput": 8807.245979642503 }, { "epoch": 0.1439509714025972, "grad_norm": 0.09404074400663376, "learning_rate": 0.0002800670119598363, "loss": 9.1934, "step": 9184, "throughput": 8807.76363103956 }, { "epoch": 0.1444525427315261, "grad_norm": 0.09263089299201965, "learning_rate": 0.0002799233996138874, "loss": 9.2266, "step": 9216, "throughput": 8808.38551889481 }, { "epoch": 0.14495411406045502, "grad_norm": 0.10047102719545364, "learning_rate": 0.00027977931332349786, "loss": 9.2069, "step": 9248, "throughput": 8808.654971911983 }, { "epoch": 0.14545568538938394, "grad_norm": 0.08788640052080154, "learning_rate": 0.00027963475368288006, "loss": 9.2235, "step": 9280, "throughput": 8808.410717545134 }, { "epoch": 0.14595725671831283, "grad_norm": 0.09000838547945023, "learning_rate": 0.00027948972128819823, "loss": 9.2016, "step": 9312, "throughput": 8808.989347922286 }, { "epoch": 0.14645882804724175, "grad_norm": 0.08718439191579819, "learning_rate": 0.0002793442167375665, "loss": 9.1963, "step": 9344, "throughput": 8809.608424197246 }, { "epoch": 0.14696039937617067, "grad_norm": 0.09730090200901031, "learning_rate": 0.0002791982406310461, "loss": 9.2075, "step": 9376, "throughput": 8809.664159961862 }, { "epoch": 0.14746197070509956, "grad_norm": 0.08973324298858643, "learning_rate": 0.0002790517935706428, "loss": 9.2052, "step": 9408, "throughput": 8809.583264650293 }, { "epoch": 0.14796354203402848, "grad_norm": 0.0930982455611229, "learning_rate": 0.00027890487616030475, "loss": 9.2064, "step": 9440, "throughput": 8810.203150029063 }, { "epoch": 0.1484651133629574, "grad_norm": 0.0945277065038681, "learning_rate": 0.0002787574890059199, "loss": 9.1756, "step": 9472, "throughput": 8810.752868527594 }, { "epoch": 0.1489666846918863, "grad_norm": 0.09098446369171143, "learning_rate": 0.0002786096327153131, "loss": 9.2186, "step": 9504, "throughput": 8810.953288912602 }, { "epoch": 0.1494682560208152, "grad_norm": 0.09192486852407455, "learning_rate": 0.00027846130789824437, "loss": 9.1797, "step": 9536, "throughput": 8811.010016247727 }, { "epoch": 0.14996982734974412, "grad_norm": 0.09026999771595001, "learning_rate": 0.00027831251516640553, "loss": 9.2007, "step": 9568, "throughput": 8811.586749223503 }, { "epoch": 0.15047139867867304, "grad_norm": 0.08644837141036987, "learning_rate": 0.00027816325513341835, "loss": 9.1898, "step": 9600, "throughput": 8812.119277191767 }, { "epoch": 0.15097297000760193, "grad_norm": 0.09695939719676971, "learning_rate": 0.0002780135284148315, "loss": 9.1974, "step": 9632, "throughput": 8812.412326591517 }, { "epoch": 0.15147454133653085, "grad_norm": 0.09113366156816483, "learning_rate": 0.00027786333562811855, "loss": 9.1829, "step": 9664, "throughput": 8812.383500445432 }, { "epoch": 0.15197611266545977, "grad_norm": 0.08756538480520248, "learning_rate": 0.00027771267739267494, "loss": 9.1776, "step": 9696, "throughput": 8813.011200739571 }, { "epoch": 0.15247768399438866, "grad_norm": 0.10144542157649994, "learning_rate": 0.0002775615543298157, "loss": 9.1674, "step": 9728, "throughput": 8813.579825339031 }, { "epoch": 0.15297925532331758, "grad_norm": 0.0889199823141098, "learning_rate": 0.0002774099670627728, "loss": 9.185, "step": 9760, "throughput": 8813.850669007317 }, { "epoch": 0.1534808266522465, "grad_norm": 0.08933806419372559, "learning_rate": 0.00027725791621669257, "loss": 9.1948, "step": 9792, "throughput": 8813.74460647051 }, { "epoch": 0.1539823979811754, "grad_norm": 0.08577853441238403, "learning_rate": 0.0002771054024186331, "loss": 9.1949, "step": 9824, "throughput": 8814.22588833097 }, { "epoch": 0.1544839693101043, "grad_norm": 0.08825614303350449, "learning_rate": 0.0002769524262975618, "loss": 9.1621, "step": 9856, "throughput": 8814.707823396035 }, { "epoch": 0.15498554063903322, "grad_norm": 0.08949641138315201, "learning_rate": 0.0002767989884843527, "loss": 9.1642, "step": 9888, "throughput": 8814.907522626361 }, { "epoch": 0.15548711196796214, "grad_norm": 0.09024640172719955, "learning_rate": 0.0002766450896117837, "loss": 9.1762, "step": 9920, "throughput": 8814.85194309132 }, { "epoch": 0.15598868329689103, "grad_norm": 0.09068801254034042, "learning_rate": 0.0002764907303145342, "loss": 9.1875, "step": 9952, "throughput": 8815.33405831657 }, { "epoch": 0.15649025462581995, "grad_norm": 0.08363509178161621, "learning_rate": 0.00027633591122918244, "loss": 9.159, "step": 9984, "throughput": 8815.8473612818 }, { "epoch": 0.15699182595474886, "grad_norm": 0.09737946093082428, "learning_rate": 0.0002761806329942028, "loss": 9.1766, "step": 10016, "throughput": 8815.942484982741 }, { "epoch": 0.15749339728367778, "grad_norm": 0.0928775891661644, "learning_rate": 0.0002760248962499632, "loss": 9.1529, "step": 10048, "throughput": 8815.921196414467 }, { "epoch": 0.15799496861260667, "grad_norm": 0.08588163554668427, "learning_rate": 0.0002758687016387223, "loss": 9.1796, "step": 10080, "throughput": 8816.397065306171 }, { "epoch": 0.1584965399415356, "grad_norm": 0.09236335009336472, "learning_rate": 0.0002757120498046273, "loss": 9.1775, "step": 10112, "throughput": 8816.860867169062 }, { "epoch": 0.1589981112704645, "grad_norm": 0.09708454459905624, "learning_rate": 0.00027555494139371077, "loss": 9.1648, "step": 10144, "throughput": 8816.976222054867 }, { "epoch": 0.1594996825993934, "grad_norm": 0.09289638698101044, "learning_rate": 0.0002753973770538882, "loss": 9.1374, "step": 10176, "throughput": 8816.974242370972 }, { "epoch": 0.16000125392832232, "grad_norm": 0.08685285598039627, "learning_rate": 0.00027523935743495553, "loss": 9.1247, "step": 10208, "throughput": 8817.429767353415 }, { "epoch": 0.16050282525725124, "grad_norm": 0.09010959416627884, "learning_rate": 0.00027508088318858604, "loss": 9.1647, "step": 10240, "throughput": 8817.910904094293 }, { "epoch": 0.16100439658618015, "grad_norm": 0.08543059229850769, "learning_rate": 0.000274921954968328, "loss": 9.1521, "step": 10272, "throughput": 8816.138866181464 }, { "epoch": 0.16150596791510904, "grad_norm": 0.08699269592761993, "learning_rate": 0.0002747625734296019, "loss": 9.1511, "step": 10304, "throughput": 8816.057816004723 }, { "epoch": 0.16200753924403796, "grad_norm": 0.08303535729646683, "learning_rate": 0.00027460273922969757, "loss": 9.1778, "step": 10336, "throughput": 8816.583570439654 }, { "epoch": 0.16250911057296688, "grad_norm": 0.08931925892829895, "learning_rate": 0.0002744424530277719, "loss": 9.1468, "step": 10368, "throughput": 8817.054724984973 }, { "epoch": 0.16301068190189577, "grad_norm": 0.08841574937105179, "learning_rate": 0.0002742817154848455, "loss": 9.1337, "step": 10400, "throughput": 8817.296466945818 }, { "epoch": 0.1635122532308247, "grad_norm": 0.08364134281873703, "learning_rate": 0.00027412052726380053, "loss": 9.1558, "step": 10432, "throughput": 8817.129164380318 }, { "epoch": 0.1640138245597536, "grad_norm": 0.0879436731338501, "learning_rate": 0.00027395888902937777, "loss": 9.1385, "step": 10464, "throughput": 8817.69940152977 }, { "epoch": 0.16451539588868253, "grad_norm": 0.08633450418710709, "learning_rate": 0.0002737968014481737, "loss": 9.1373, "step": 10496, "throughput": 8818.078687782507 }, { "epoch": 0.16501696721761142, "grad_norm": 0.08205156028270721, "learning_rate": 0.000273634265188638, "loss": 9.132, "step": 10528, "throughput": 8818.33960536414 }, { "epoch": 0.16551853854654033, "grad_norm": 0.0833672285079956, "learning_rate": 0.0002734712809210706, "loss": 9.1376, "step": 10560, "throughput": 8818.221824229719 }, { "epoch": 0.16602010987546925, "grad_norm": 0.0895625501871109, "learning_rate": 0.00027330784931761925, "loss": 9.1149, "step": 10592, "throughput": 8818.894932316287 }, { "epoch": 0.16652168120439814, "grad_norm": 0.0861629992723465, "learning_rate": 0.0002731439710522763, "loss": 9.1102, "step": 10624, "throughput": 8819.223744484703 }, { "epoch": 0.16702325253332706, "grad_norm": 0.0922650396823883, "learning_rate": 0.00027297964680087617, "loss": 9.1304, "step": 10656, "throughput": 8819.425250401511 }, { "epoch": 0.16752482386225598, "grad_norm": 0.08803316205739975, "learning_rate": 0.0002728148772410926, "loss": 9.1387, "step": 10688, "throughput": 8819.313232610373 }, { "epoch": 0.1680263951911849, "grad_norm": 0.08782891929149628, "learning_rate": 0.0002726496630524358, "loss": 9.1549, "step": 10720, "throughput": 8820.06652653723 }, { "epoch": 0.1685279665201138, "grad_norm": 0.08598551899194717, "learning_rate": 0.00027248400491624946, "loss": 9.1005, "step": 10752, "throughput": 8820.29782491265 }, { "epoch": 0.1690295378490427, "grad_norm": 0.09307502955198288, "learning_rate": 0.00027231790351570827, "loss": 9.1229, "step": 10784, "throughput": 8820.491573445923 }, { "epoch": 0.16953110917797162, "grad_norm": 0.08863917738199234, "learning_rate": 0.00027215135953581485, "loss": 9.1274, "step": 10816, "throughput": 8820.347253560274 }, { "epoch": 0.1700326805069005, "grad_norm": 0.08433914929628372, "learning_rate": 0.00027198437366339717, "loss": 9.1096, "step": 10848, "throughput": 8821.061915668972 }, { "epoch": 0.17053425183582943, "grad_norm": 0.08366360515356064, "learning_rate": 0.00027181694658710544, "loss": 9.1036, "step": 10880, "throughput": 8821.259467346279 }, { "epoch": 0.17103582316475835, "grad_norm": 0.09207843244075775, "learning_rate": 0.00027164907899740936, "loss": 9.1121, "step": 10912, "throughput": 8821.515016943495 }, { "epoch": 0.17153739449368727, "grad_norm": 0.08775728940963745, "learning_rate": 0.0002714807715865954, "loss": 9.1289, "step": 10944, "throughput": 8821.438910822044 }, { "epoch": 0.17203896582261616, "grad_norm": 0.08356799185276031, "learning_rate": 0.0002713120250487638, "loss": 9.1053, "step": 10976, "throughput": 8822.131257270616 }, { "epoch": 0.17254053715154508, "grad_norm": 0.09217944741249084, "learning_rate": 0.0002711428400798258, "loss": 9.0969, "step": 11008, "throughput": 8822.297615116977 }, { "epoch": 0.173042108480474, "grad_norm": 0.0949367806315422, "learning_rate": 0.00027097321737750075, "loss": 9.1049, "step": 11040, "throughput": 8822.518257508536 }, { "epoch": 0.17354367980940288, "grad_norm": 0.08590497821569443, "learning_rate": 0.00027080315764131316, "loss": 9.0914, "step": 11072, "throughput": 8822.459541071326 }, { "epoch": 0.1740452511383318, "grad_norm": 0.08156461268663406, "learning_rate": 0.0002706326615725898, "loss": 9.1036, "step": 11104, "throughput": 8823.149445206382 }, { "epoch": 0.17454682246726072, "grad_norm": 0.08349745720624924, "learning_rate": 0.0002704617298744571, "loss": 9.087, "step": 11136, "throughput": 8823.319571727785 }, { "epoch": 0.17504839379618964, "grad_norm": 0.08500142395496368, "learning_rate": 0.00027029036325183775, "loss": 9.0931, "step": 11168, "throughput": 8823.57281056996 }, { "epoch": 0.17554996512511853, "grad_norm": 0.07930979877710342, "learning_rate": 0.0002701185624114483, "loss": 9.1172, "step": 11200, "throughput": 8823.489514513913 }, { "epoch": 0.17605153645404745, "grad_norm": 0.08875308930873871, "learning_rate": 0.0002699463280617959, "loss": 9.1167, "step": 11232, "throughput": 8824.160138599997 }, { "epoch": 0.17655310778297637, "grad_norm": 0.09016279131174088, "learning_rate": 0.00026977366091317554, "loss": 9.0826, "step": 11264, "throughput": 8824.325303855396 }, { "epoch": 0.17705467911190526, "grad_norm": 0.08394233882427216, "learning_rate": 0.00026960056167766704, "loss": 9.0927, "step": 11296, "throughput": 8824.564024855992 }, { "epoch": 0.17755625044083417, "grad_norm": 0.08371694386005402, "learning_rate": 0.0002694270310691321, "loss": 9.0813, "step": 11328, "throughput": 8824.520854810047 }, { "epoch": 0.1780578217697631, "grad_norm": 0.08177390694618225, "learning_rate": 0.0002692530698032116, "loss": 9.0806, "step": 11360, "throughput": 8825.18968171211 }, { "epoch": 0.178559393098692, "grad_norm": 0.08288709819316864, "learning_rate": 0.00026907867859732223, "loss": 9.0814, "step": 11392, "throughput": 8825.334697789634 }, { "epoch": 0.1790609644276209, "grad_norm": 0.08150959759950638, "learning_rate": 0.0002689038581706538, "loss": 9.0865, "step": 11424, "throughput": 8825.619281128274 }, { "epoch": 0.17956253575654982, "grad_norm": 0.0917833000421524, "learning_rate": 0.0002687286092441664, "loss": 9.0634, "step": 11456, "throughput": 8825.46108093822 }, { "epoch": 0.18006410708547874, "grad_norm": 0.08228582888841629, "learning_rate": 0.00026855293254058693, "loss": 9.078, "step": 11488, "throughput": 8826.122405390453 }, { "epoch": 0.18056567841440763, "grad_norm": 0.09273815155029297, "learning_rate": 0.0002683768287844068, "loss": 9.0745, "step": 11520, "throughput": 8826.264264292311 }, { "epoch": 0.18106724974333654, "grad_norm": 0.08710981905460358, "learning_rate": 0.0002682002987018783, "loss": 9.0937, "step": 11552, "throughput": 8826.403465179152 }, { "epoch": 0.18156882107226546, "grad_norm": 0.08702866733074188, "learning_rate": 0.00026802334302101214, "loss": 9.0829, "step": 11584, "throughput": 8826.163922195796 }, { "epoch": 0.18207039240119435, "grad_norm": 0.08332011848688126, "learning_rate": 0.000267845962471574, "loss": 9.0764, "step": 11616, "throughput": 8826.810681241455 }, { "epoch": 0.18257196373012327, "grad_norm": 0.08716616779565811, "learning_rate": 0.0002676681577850818, "loss": 9.0667, "step": 11648, "throughput": 8827.000958585257 }, { "epoch": 0.1830735350590522, "grad_norm": 0.09485767781734467, "learning_rate": 0.0002674899296948026, "loss": 9.0654, "step": 11680, "throughput": 8827.257014002527 }, { "epoch": 0.1835751063879811, "grad_norm": 0.08811061829328537, "learning_rate": 0.00026731127893574955, "loss": 9.0763, "step": 11712, "throughput": 8827.252253691711 }, { "epoch": 0.18407667771691, "grad_norm": 0.0871390700340271, "learning_rate": 0.00026713220624467894, "loss": 9.0888, "step": 11744, "throughput": 8827.888916068738 }, { "epoch": 0.18457824904583892, "grad_norm": 0.09058859199285507, "learning_rate": 0.00026695271236008703, "loss": 9.0656, "step": 11776, "throughput": 8828.18880745889 }, { "epoch": 0.18507982037476783, "grad_norm": 0.09240787476301193, "learning_rate": 0.00026677279802220726, "loss": 9.0753, "step": 11808, "throughput": 8828.28059056515 }, { "epoch": 0.18558139170369672, "grad_norm": 0.08470363169908524, "learning_rate": 0.00026659246397300673, "loss": 9.0631, "step": 11840, "throughput": 8828.482871548256 }, { "epoch": 0.18608296303262564, "grad_norm": 0.08286084234714508, "learning_rate": 0.00026641171095618366, "loss": 9.0549, "step": 11872, "throughput": 8829.017412682424 }, { "epoch": 0.18658453436155456, "grad_norm": 0.08882605284452438, "learning_rate": 0.0002662305397171641, "loss": 9.067, "step": 11904, "throughput": 8829.305455070786 }, { "epoch": 0.18708610569048348, "grad_norm": 0.08985249698162079, "learning_rate": 0.0002660489510030986, "loss": 9.0736, "step": 11936, "throughput": 8829.388474243517 }, { "epoch": 0.18758767701941237, "grad_norm": 0.08082137256860733, "learning_rate": 0.00026586694556285975, "loss": 9.0721, "step": 11968, "throughput": 8829.531639227724 }, { "epoch": 0.1880892483483413, "grad_norm": 0.0825573056936264, "learning_rate": 0.0002656845241470384, "loss": 9.0508, "step": 12000, "throughput": 8830.087587061433 }, { "epoch": 0.1885908196772702, "grad_norm": 0.08357170969247818, "learning_rate": 0.0002655016875079411, "loss": 9.0589, "step": 12032, "throughput": 8830.444982736646 }, { "epoch": 0.1890923910061991, "grad_norm": 0.08905555307865143, "learning_rate": 0.00026531843639958656, "loss": 9.048, "step": 12064, "throughput": 8830.452768499454 }, { "epoch": 0.189593962335128, "grad_norm": 0.09543339163064957, "learning_rate": 0.00026513477157770303, "loss": 9.0543, "step": 12096, "throughput": 8830.603060387728 }, { "epoch": 0.19009553366405693, "grad_norm": 0.08120942860841751, "learning_rate": 0.0002649506937997248, "loss": 9.0557, "step": 12128, "throughput": 8831.166615808197 }, { "epoch": 0.19059710499298585, "grad_norm": 0.0824967548251152, "learning_rate": 0.00026476620382478896, "loss": 9.065, "step": 12160, "throughput": 8831.508908054715 }, { "epoch": 0.19109867632191474, "grad_norm": 0.08354393392801285, "learning_rate": 0.0002645813024137329, "loss": 9.0662, "step": 12192, "throughput": 8831.518942983914 }, { "epoch": 0.19160024765084366, "grad_norm": 0.08198045194149017, "learning_rate": 0.00026439599032909055, "loss": 9.0654, "step": 12224, "throughput": 8831.612563602785 }, { "epoch": 0.19210181897977258, "grad_norm": 0.08542396128177643, "learning_rate": 0.0002642102683350894, "loss": 9.0627, "step": 12256, "throughput": 8832.15725508659 }, { "epoch": 0.19260339030870147, "grad_norm": 0.0902579054236412, "learning_rate": 0.00026402413719764774, "loss": 9.0462, "step": 12288, "throughput": 8832.467440454431 }, { "epoch": 0.19310496163763038, "grad_norm": 0.08987751603126526, "learning_rate": 0.0002638375976843707, "loss": 9.0471, "step": 12320, "throughput": 8830.962024657469 }, { "epoch": 0.1936065329665593, "grad_norm": 0.08961810916662216, "learning_rate": 0.0002636506505645478, "loss": 9.0339, "step": 12352, "throughput": 8830.759731801822 }, { "epoch": 0.19410810429548822, "grad_norm": 0.08138346672058105, "learning_rate": 0.00026346329660914964, "loss": 9.0483, "step": 12384, "throughput": 8831.283922611019 }, { "epoch": 0.1946096756244171, "grad_norm": 0.07983388006687164, "learning_rate": 0.00026327553659082444, "loss": 9.0541, "step": 12416, "throughput": 8831.629960002096 }, { "epoch": 0.19511124695334603, "grad_norm": 0.07903102785348892, "learning_rate": 0.00026308737128389513, "loss": 9.0251, "step": 12448, "throughput": 8831.828895401724 }, { "epoch": 0.19561281828227495, "grad_norm": 0.08130428940057755, "learning_rate": 0.0002628988014643558, "loss": 9.0535, "step": 12480, "throughput": 8831.631966129466 }, { "epoch": 0.19611438961120384, "grad_norm": 0.09019070118665695, "learning_rate": 0.00026270982790986916, "loss": 9.0521, "step": 12512, "throughput": 8832.183929204004 }, { "epoch": 0.19661596094013276, "grad_norm": 0.08807606995105743, "learning_rate": 0.00026252045139976254, "loss": 9.034, "step": 12544, "throughput": 8832.506918598896 }, { "epoch": 0.19711753226906167, "grad_norm": 0.08415339887142181, "learning_rate": 0.00026233067271502536, "loss": 9.0125, "step": 12576, "throughput": 8832.614296363512 }, { "epoch": 0.1976191035979906, "grad_norm": 0.07860168814659119, "learning_rate": 0.0002621404926383054, "loss": 9.0416, "step": 12608, "throughput": 8832.509196753348 }, { "epoch": 0.19812067492691948, "grad_norm": 0.08424630761146545, "learning_rate": 0.0002619499119539059, "loss": 9.0189, "step": 12640, "throughput": 8833.027633810203 }, { "epoch": 0.1986222462558484, "grad_norm": 0.08129549771547318, "learning_rate": 0.0002617589314477821, "loss": 9.0077, "step": 12672, "throughput": 8833.345538127445 }, { "epoch": 0.19912381758477732, "grad_norm": 0.08588721603155136, "learning_rate": 0.0002615675519075383, "loss": 9.0287, "step": 12704, "throughput": 8833.38708333183 }, { "epoch": 0.1996253889137062, "grad_norm": 0.08470887690782547, "learning_rate": 0.00026137577412242415, "loss": 9.0132, "step": 12736, "throughput": 8833.346991276674 }, { "epoch": 0.20012696024263513, "grad_norm": 0.0842299535870552, "learning_rate": 0.00026118359888333193, "loss": 8.9919, "step": 12768, "throughput": 8833.900822243395 }, { "epoch": 0.20062853157156404, "grad_norm": 0.09161315113306046, "learning_rate": 0.00026099102698279276, "loss": 9.0182, "step": 12800, "throughput": 8834.136623124325 }, { "epoch": 0.20113010290049296, "grad_norm": 0.0936942771077156, "learning_rate": 0.0002607980592149739, "loss": 9.0176, "step": 12832, "throughput": 8834.322026360836 }, { "epoch": 0.20163167422942185, "grad_norm": 0.08174356073141098, "learning_rate": 0.00026060469637567484, "loss": 9.0272, "step": 12864, "throughput": 8834.21636981973 }, { "epoch": 0.20213324555835077, "grad_norm": 0.0919305831193924, "learning_rate": 0.0002604109392623246, "loss": 9.0392, "step": 12896, "throughput": 8834.751212132414 }, { "epoch": 0.2026348168872797, "grad_norm": 0.0808793231844902, "learning_rate": 0.00026021678867397803, "loss": 9.022, "step": 12928, "throughput": 8835.025467652535 }, { "epoch": 0.20313638821620858, "grad_norm": 0.07867848128080368, "learning_rate": 0.00026002224541131274, "loss": 9.005, "step": 12960, "throughput": 8835.151417184361 }, { "epoch": 0.2036379595451375, "grad_norm": 0.08789924532175064, "learning_rate": 0.00025982731027662575, "loss": 9.0143, "step": 12992, "throughput": 8835.07073633031 }, { "epoch": 0.20413953087406642, "grad_norm": 0.08775952458381653, "learning_rate": 0.00025963198407383015, "loss": 9.0259, "step": 13024, "throughput": 8835.597813814413 }, { "epoch": 0.20464110220299533, "grad_norm": 0.07885041832923889, "learning_rate": 0.0002594362676084517, "loss": 9.0086, "step": 13056, "throughput": 8835.860231370563 }, { "epoch": 0.20514267353192422, "grad_norm": 0.08015837520360947, "learning_rate": 0.0002592401616876258, "loss": 9.0122, "step": 13088, "throughput": 8835.986147542233 }, { "epoch": 0.20564424486085314, "grad_norm": 0.08334717154502869, "learning_rate": 0.00025904366712009374, "loss": 9.0219, "step": 13120, "throughput": 8835.944234812112 }, { "epoch": 0.20614581618978206, "grad_norm": 0.08041632175445557, "learning_rate": 0.00025884678471619976, "loss": 9.0075, "step": 13152, "throughput": 8836.489174584438 }, { "epoch": 0.20664738751871095, "grad_norm": 0.08151724934577942, "learning_rate": 0.0002586495152878874, "loss": 9.0009, "step": 13184, "throughput": 8836.720480927283 }, { "epoch": 0.20714895884763987, "grad_norm": 0.08173686265945435, "learning_rate": 0.0002584518596486965, "loss": 9.0035, "step": 13216, "throughput": 8836.863723253064 }, { "epoch": 0.2076505301765688, "grad_norm": 0.08243660628795624, "learning_rate": 0.00025825381861375936, "loss": 9.0124, "step": 13248, "throughput": 8836.739052834788 }, { "epoch": 0.2081521015054977, "grad_norm": 0.08186870068311691, "learning_rate": 0.00025805539299979794, "loss": 9.0141, "step": 13280, "throughput": 8837.27394297609 }, { "epoch": 0.2086536728344266, "grad_norm": 0.08113693445920944, "learning_rate": 0.0002578565836251199, "loss": 9.0042, "step": 13312, "throughput": 8837.517877764802 }, { "epoch": 0.2091552441633555, "grad_norm": 0.08530589938163757, "learning_rate": 0.0002576573913096158, "loss": 9.0082, "step": 13344, "throughput": 8837.662081132275 }, { "epoch": 0.20965681549228443, "grad_norm": 0.0843268632888794, "learning_rate": 0.00025745781687475534, "loss": 9.0011, "step": 13376, "throughput": 8837.451410811751 }, { "epoch": 0.21015838682121332, "grad_norm": 0.08069983124732971, "learning_rate": 0.000257257861143584, "loss": 8.9965, "step": 13408, "throughput": 8837.967265466526 }, { "epoch": 0.21065995815014224, "grad_norm": 0.08797997981309891, "learning_rate": 0.00025705752494071995, "loss": 8.9972, "step": 13440, "throughput": 8838.164365564864 }, { "epoch": 0.21116152947907116, "grad_norm": 0.07875420898199081, "learning_rate": 0.0002568568090923501, "loss": 8.9905, "step": 13472, "throughput": 8838.29210044764 }, { "epoch": 0.21166310080800008, "grad_norm": 0.07931997627019882, "learning_rate": 0.0002566557144262273, "loss": 8.9995, "step": 13504, "throughput": 8838.121292638185 }, { "epoch": 0.21216467213692897, "grad_norm": 0.0929732397198677, "learning_rate": 0.00025645424177166663, "loss": 9.0055, "step": 13536, "throughput": 8838.629544739428 }, { "epoch": 0.21266624346585788, "grad_norm": 0.08669450134038925, "learning_rate": 0.0002562523919595418, "loss": 8.9944, "step": 13568, "throughput": 8838.865832350326 }, { "epoch": 0.2131678147947868, "grad_norm": 0.07837585359811783, "learning_rate": 0.0002560501658222821, "loss": 8.9782, "step": 13600, "throughput": 8839.040956514082 }, { "epoch": 0.2136693861237157, "grad_norm": 0.08947234600782394, "learning_rate": 0.0002558475641938686, "loss": 8.9823, "step": 13632, "throughput": 8838.921683629353 }, { "epoch": 0.2141709574526446, "grad_norm": 0.08249003440141678, "learning_rate": 0.00025564458790983114, "loss": 9.0016, "step": 13664, "throughput": 8839.415178957333 }, { "epoch": 0.21467252878157353, "grad_norm": 0.08924390375614166, "learning_rate": 0.0002554412378072445, "loss": 8.985, "step": 13696, "throughput": 8839.758654670273 }, { "epoch": 0.21517410011050242, "grad_norm": 0.08233807235956192, "learning_rate": 0.0002552375147247251, "loss": 8.9774, "step": 13728, "throughput": 8839.705210842236 }, { "epoch": 0.21567567143943134, "grad_norm": 0.0813618078827858, "learning_rate": 0.0002550334195024275, "loss": 8.9801, "step": 13760, "throughput": 8839.59999048092 }, { "epoch": 0.21617724276836026, "grad_norm": 0.07620636373758316, "learning_rate": 0.00025482895298204096, "loss": 8.9757, "step": 13792, "throughput": 8840.157377923262 }, { "epoch": 0.21667881409728917, "grad_norm": 0.08532450348138809, "learning_rate": 0.0002546241160067861, "loss": 8.9766, "step": 13824, "throughput": 8840.425867617763 }, { "epoch": 0.21718038542621806, "grad_norm": 0.08044885098934174, "learning_rate": 0.00025441890942141124, "loss": 8.994, "step": 13856, "throughput": 8840.400033321612 }, { "epoch": 0.21768195675514698, "grad_norm": 0.08269073814153671, "learning_rate": 0.00025421333407218884, "loss": 8.9888, "step": 13888, "throughput": 8840.231364870235 }, { "epoch": 0.2181835280840759, "grad_norm": 0.08159191161394119, "learning_rate": 0.0002540073908069124, "loss": 8.978, "step": 13920, "throughput": 8840.79464592254 }, { "epoch": 0.2186850994130048, "grad_norm": 0.08631081134080887, "learning_rate": 0.0002538010804748924, "loss": 8.9384, "step": 13952, "throughput": 8841.080385514952 }, { "epoch": 0.2191866707419337, "grad_norm": 0.07901880890130997, "learning_rate": 0.0002535944039269533, "loss": 8.986, "step": 13984, "throughput": 8841.048159496497 }, { "epoch": 0.21968824207086263, "grad_norm": 0.07891877740621567, "learning_rate": 0.0002533873620154299, "loss": 8.9743, "step": 14016, "throughput": 8840.959387551367 }, { "epoch": 0.22018981339979155, "grad_norm": 0.07669687271118164, "learning_rate": 0.0002531799555941635, "loss": 8.9712, "step": 14048, "throughput": 8841.50220321759 }, { "epoch": 0.22069138472872044, "grad_norm": 0.08152962476015091, "learning_rate": 0.00025297218551849885, "loss": 8.9487, "step": 14080, "throughput": 8841.774650330875 }, { "epoch": 0.22119295605764935, "grad_norm": 0.09327496588230133, "learning_rate": 0.00025276405264528044, "loss": 8.9746, "step": 14112, "throughput": 8841.79885493629 }, { "epoch": 0.22169452738657827, "grad_norm": 0.07541276514530182, "learning_rate": 0.00025255555783284877, "loss": 8.9697, "step": 14144, "throughput": 8841.727072238291 }, { "epoch": 0.22219609871550716, "grad_norm": 0.08202675729990005, "learning_rate": 0.0002523467019410371, "loss": 8.9714, "step": 14176, "throughput": 8842.26026429414 }, { "epoch": 0.22269767004443608, "grad_norm": 0.08090902864933014, "learning_rate": 0.00025213748583116776, "loss": 8.9849, "step": 14208, "throughput": 8842.548919521794 }, { "epoch": 0.223199241373365, "grad_norm": 0.080088309943676, "learning_rate": 0.0002519279103660486, "loss": 8.9407, "step": 14240, "throughput": 8842.441244363596 }, { "epoch": 0.22370081270229392, "grad_norm": 0.09118451923131943, "learning_rate": 0.0002517179764099694, "loss": 8.937, "step": 14272, "throughput": 8842.283163338127 }, { "epoch": 0.2242023840312228, "grad_norm": 0.0921279564499855, "learning_rate": 0.00025150768482869846, "loss": 8.9623, "step": 14304, "throughput": 8842.817465690732 }, { "epoch": 0.22470395536015172, "grad_norm": 0.07893610000610352, "learning_rate": 0.0002512970364894789, "loss": 8.9598, "step": 14336, "throughput": 8843.119331329157 }, { "epoch": 0.22520552668908064, "grad_norm": 0.08488957583904266, "learning_rate": 0.00025108603226102515, "loss": 8.9652, "step": 14368, "throughput": 8841.600361125842 }, { "epoch": 0.22570709801800953, "grad_norm": 0.07883718609809875, "learning_rate": 0.0002508746730135191, "loss": 8.9593, "step": 14400, "throughput": 8841.445944335217 }, { "epoch": 0.22620866934693845, "grad_norm": 0.08114926517009735, "learning_rate": 0.00025066295961860704, "loss": 8.9453, "step": 14432, "throughput": 8841.984887459164 }, { "epoch": 0.22671024067586737, "grad_norm": 0.0800662562251091, "learning_rate": 0.0002504508929493957, "loss": 8.9527, "step": 14464, "throughput": 8842.348536699352 }, { "epoch": 0.2272118120047963, "grad_norm": 0.0862034410238266, "learning_rate": 0.00025023847388044846, "loss": 8.9404, "step": 14496, "throughput": 8842.28608146023 }, { "epoch": 0.22771338333372518, "grad_norm": 0.09161315113306046, "learning_rate": 0.0002500257032877823, "loss": 8.9541, "step": 14528, "throughput": 8842.1214286272 }, { "epoch": 0.2282149546626541, "grad_norm": 0.08155594021081924, "learning_rate": 0.0002498125820488639, "loss": 8.9548, "step": 14560, "throughput": 8842.656952860514 }, { "epoch": 0.22871652599158301, "grad_norm": 0.09579102694988251, "learning_rate": 0.00024959911104260565, "loss": 8.9496, "step": 14592, "throughput": 8843.03001049117 }, { "epoch": 0.2292180973205119, "grad_norm": 0.0857420563697815, "learning_rate": 0.00024938529114936273, "loss": 8.963, "step": 14624, "throughput": 8842.954005956697 }, { "epoch": 0.22971966864944082, "grad_norm": 0.08455734699964523, "learning_rate": 0.000249171123250929, "loss": 8.9558, "step": 14656, "throughput": 8842.871556219898 }, { "epoch": 0.23022123997836974, "grad_norm": 0.08038202673196793, "learning_rate": 0.00024895660823053353, "loss": 8.944, "step": 14688, "throughput": 8843.389256958373 }, { "epoch": 0.23072281130729866, "grad_norm": 0.07368004322052002, "learning_rate": 0.00024874174697283685, "loss": 8.966, "step": 14720, "throughput": 8843.724505638962 }, { "epoch": 0.23122438263622755, "grad_norm": 0.07424993813037872, "learning_rate": 0.0002485265403639275, "loss": 8.9445, "step": 14752, "throughput": 8843.614720535039 }, { "epoch": 0.23172595396515647, "grad_norm": 0.07684794813394547, "learning_rate": 0.0002483109892913181, "loss": 8.9629, "step": 14784, "throughput": 8843.554306116497 }, { "epoch": 0.23222752529408539, "grad_norm": 0.08115751296281815, "learning_rate": 0.0002480950946439419, "loss": 8.9452, "step": 14816, "throughput": 8844.063542567186 }, { "epoch": 0.23272909662301428, "grad_norm": 0.07495855540037155, "learning_rate": 0.0002478788573121491, "loss": 8.9245, "step": 14848, "throughput": 8844.431926105013 }, { "epoch": 0.2332306679519432, "grad_norm": 0.08068421483039856, "learning_rate": 0.0002476622781877031, "loss": 8.925, "step": 14880, "throughput": 8844.360587283169 }, { "epoch": 0.2337322392808721, "grad_norm": 0.07673174142837524, "learning_rate": 0.0002474453581637769, "loss": 8.94, "step": 14912, "throughput": 8844.243253170607 }, { "epoch": 0.23423381060980103, "grad_norm": 0.08002306520938873, "learning_rate": 0.00024722809813494933, "loss": 8.9419, "step": 14944, "throughput": 8844.749124458962 }, { "epoch": 0.23473538193872992, "grad_norm": 0.08444210141897202, "learning_rate": 0.00024701049899720123, "loss": 8.9346, "step": 14976, "throughput": 8845.12059524771 }, { "epoch": 0.23523695326765884, "grad_norm": 0.07796745747327805, "learning_rate": 0.0002467925616479122, "loss": 8.9592, "step": 15008, "throughput": 8845.064143363543 }, { "epoch": 0.23573852459658776, "grad_norm": 0.0815286934375763, "learning_rate": 0.0002465742869858566, "loss": 8.9434, "step": 15040, "throughput": 8845.028768403501 }, { "epoch": 0.23624009592551665, "grad_norm": 0.07568243145942688, "learning_rate": 0.0002463556759111996, "loss": 8.9199, "step": 15072, "throughput": 8845.53829814696 }, { "epoch": 0.23674166725444556, "grad_norm": 0.08475536853075027, "learning_rate": 0.00024613672932549403, "loss": 8.9335, "step": 15104, "throughput": 8845.914556859072 }, { "epoch": 0.23724323858337448, "grad_norm": 0.08863034099340439, "learning_rate": 0.00024591744813167625, "loss": 8.9221, "step": 15136, "throughput": 8845.680743425255 }, { "epoch": 0.2377448099123034, "grad_norm": 0.09319442510604858, "learning_rate": 0.00024569783323406255, "loss": 8.9204, "step": 15168, "throughput": 8845.717517430303 }, { "epoch": 0.2382463812412323, "grad_norm": 0.0772862657904625, "learning_rate": 0.00024547788553834536, "loss": 8.9287, "step": 15200, "throughput": 8846.219617937282 }, { "epoch": 0.2387479525701612, "grad_norm": 0.09362047165632248, "learning_rate": 0.00024525760595158977, "loss": 8.9426, "step": 15232, "throughput": 8846.58514079462 }, { "epoch": 0.23924952389909013, "grad_norm": 0.07392299920320511, "learning_rate": 0.0002450369953822293, "loss": 8.93, "step": 15264, "throughput": 8846.413348386604 }, { "epoch": 0.23975109522801902, "grad_norm": 0.07865717262029648, "learning_rate": 0.0002448160547400627, "loss": 8.9214, "step": 15296, "throughput": 8846.330000239195 }, { "epoch": 0.24025266655694794, "grad_norm": 0.0818992331624031, "learning_rate": 0.00024459478493624973, "loss": 8.9222, "step": 15328, "throughput": 8846.830431784812 }, { "epoch": 0.24075423788587685, "grad_norm": 0.08360916376113892, "learning_rate": 0.0002443731868833078, "loss": 8.9249, "step": 15360, "throughput": 8847.183830926322 }, { "epoch": 0.24125580921480577, "grad_norm": 0.08100838959217072, "learning_rate": 0.0002441512614951079, "loss": 8.8944, "step": 15392, "throughput": 8847.052000253154 }, { "epoch": 0.24175738054373466, "grad_norm": 0.07800208777189255, "learning_rate": 0.00024392900968687103, "loss": 8.9333, "step": 15424, "throughput": 8846.88050180888 }, { "epoch": 0.24225895187266358, "grad_norm": 0.08001139760017395, "learning_rate": 0.00024370643237516426, "loss": 8.9166, "step": 15456, "throughput": 8847.353385177725 }, { "epoch": 0.2427605232015925, "grad_norm": 0.07767579704523087, "learning_rate": 0.00024348353047789708, "loss": 8.9362, "step": 15488, "throughput": 8847.707394107481 }, { "epoch": 0.2432620945305214, "grad_norm": 0.08075279742479324, "learning_rate": 0.0002432603049143176, "loss": 8.9314, "step": 15520, "throughput": 8847.557036979117 }, { "epoch": 0.2437636658594503, "grad_norm": 0.08244162052869797, "learning_rate": 0.0002430367566050087, "loss": 8.9232, "step": 15552, "throughput": 8847.42904210372 }, { "epoch": 0.24426523718837923, "grad_norm": 0.08184251189231873, "learning_rate": 0.00024281288647188425, "loss": 8.9084, "step": 15584, "throughput": 8847.911019319828 }, { "epoch": 0.24476680851730814, "grad_norm": 0.07141350954771042, "learning_rate": 0.00024258869543818535, "loss": 8.8898, "step": 15616, "throughput": 8848.260056243274 }, { "epoch": 0.24526837984623703, "grad_norm": 0.09621914476156235, "learning_rate": 0.00024236418442847652, "loss": 8.9365, "step": 15648, "throughput": 8848.086390788978 }, { "epoch": 0.24576995117516595, "grad_norm": 0.08253178000450134, "learning_rate": 0.0002421393543686418, "loss": 8.9261, "step": 15680, "throughput": 8847.997217683149 }, { "epoch": 0.24627152250409487, "grad_norm": 0.08006506413221359, "learning_rate": 0.00024191420618588103, "loss": 8.9107, "step": 15712, "throughput": 8848.472194979533 }, { "epoch": 0.24677309383302376, "grad_norm": 0.07812228798866272, "learning_rate": 0.000241688740808706, "loss": 8.9108, "step": 15744, "throughput": 8848.844196536425 }, { "epoch": 0.24727466516195268, "grad_norm": 0.0766172781586647, "learning_rate": 0.0002414629591669366, "loss": 8.9101, "step": 15776, "throughput": 8848.763152880012 }, { "epoch": 0.2477762364908816, "grad_norm": 0.07769637554883957, "learning_rate": 0.0002412368621916969, "loss": 8.9097, "step": 15808, "throughput": 8848.663138376132 }, { "epoch": 0.2482778078198105, "grad_norm": 0.07883188128471375, "learning_rate": 0.0002410104508154116, "loss": 8.913, "step": 15840, "throughput": 8849.133796430357 }, { "epoch": 0.2487793791487394, "grad_norm": 0.08420856297016144, "learning_rate": 0.00024078372597180183, "loss": 8.907, "step": 15872, "throughput": 8849.475555093464 }, { "epoch": 0.24928095047766832, "grad_norm": 0.08431058377027512, "learning_rate": 0.00024055668859588157, "loss": 8.8894, "step": 15904, "throughput": 8849.408986160002 }, { "epoch": 0.24978252180659724, "grad_norm": 0.07932168245315552, "learning_rate": 0.0002403293396239536, "loss": 8.9009, "step": 15936, "throughput": 8849.197175271445 }, { "epoch": 0.25028409313552613, "grad_norm": 0.08460178226232529, "learning_rate": 0.00024010167999360575, "loss": 8.9212, "step": 15968, "throughput": 8849.662358621626 }, { "epoch": 0.25078566446445505, "grad_norm": 0.08003672957420349, "learning_rate": 0.00023987371064370698, "loss": 8.8948, "step": 16000, "throughput": 8850.04432114083 }, { "epoch": 0.25128723579338397, "grad_norm": 0.08153244853019714, "learning_rate": 0.00023964543251440363, "loss": 8.8742, "step": 16032, "throughput": 8849.954581929667 }, { "epoch": 0.2517888071223129, "grad_norm": 0.08619936555624008, "learning_rate": 0.00023941684654711534, "loss": 8.9046, "step": 16064, "throughput": 8849.672440477376 }, { "epoch": 0.2522903784512418, "grad_norm": 0.07380315661430359, "learning_rate": 0.0002391879536845313, "loss": 8.9018, "step": 16096, "throughput": 8850.142241698979 }, { "epoch": 0.25279194978017067, "grad_norm": 0.08683207631111145, "learning_rate": 0.0002389587548706064, "loss": 8.8875, "step": 16128, "throughput": 8850.546625557658 }, { "epoch": 0.2532935211090996, "grad_norm": 0.07719036191701889, "learning_rate": 0.0002387292510505572, "loss": 8.8818, "step": 16160, "throughput": 8850.427029930328 }, { "epoch": 0.2537950924380285, "grad_norm": 0.07802041620016098, "learning_rate": 0.00023849944317085812, "loss": 8.9115, "step": 16192, "throughput": 8850.281862064956 }, { "epoch": 0.2542966637669574, "grad_norm": 0.07789981365203857, "learning_rate": 0.0002382693321792376, "loss": 8.898, "step": 16224, "throughput": 8850.741104033605 }, { "epoch": 0.25479823509588634, "grad_norm": 0.07908471673727036, "learning_rate": 0.00023803891902467406, "loss": 8.9094, "step": 16256, "throughput": 8851.116113257427 }, { "epoch": 0.25529980642481526, "grad_norm": 0.08057553321123123, "learning_rate": 0.0002378082046573919, "loss": 8.8815, "step": 16288, "throughput": 8851.002029177316 }, { "epoch": 0.2558013777537442, "grad_norm": 0.07993035018444061, "learning_rate": 0.00023757719002885793, "loss": 8.8784, "step": 16320, "throughput": 8850.852525962286 }, { "epoch": 0.25630294908267304, "grad_norm": 0.08261415362358093, "learning_rate": 0.00023734587609177725, "loss": 8.8955, "step": 16352, "throughput": 8851.312677135089 }, { "epoch": 0.25680452041160196, "grad_norm": 0.08019208908081055, "learning_rate": 0.000237114263800089, "loss": 8.8989, "step": 16384, "throughput": 8851.708561726255 }, { "epoch": 0.2573060917405309, "grad_norm": 0.08069220185279846, "learning_rate": 0.0002368823541089632, "loss": 8.8979, "step": 16416, "throughput": 8850.402276688073 }, { "epoch": 0.2578076630694598, "grad_norm": 0.08015625178813934, "learning_rate": 0.00023665014797479602, "loss": 8.8898, "step": 16448, "throughput": 8850.273118023713 }, { "epoch": 0.2583092343983887, "grad_norm": 0.08302200585603714, "learning_rate": 0.00023641764635520617, "loss": 8.879, "step": 16480, "throughput": 8850.730562782745 }, { "epoch": 0.2588108057273176, "grad_norm": 0.07490982115268707, "learning_rate": 0.0002361848502090311, "loss": 8.8805, "step": 16512, "throughput": 8851.111261480031 }, { "epoch": 0.25931237705624655, "grad_norm": 0.0764228031039238, "learning_rate": 0.0002359517604963228, "loss": 8.8969, "step": 16544, "throughput": 8851.001378740231 }, { "epoch": 0.2598139483851754, "grad_norm": 0.07773542404174805, "learning_rate": 0.0002357183781783439, "loss": 8.8794, "step": 16576, "throughput": 8850.781342052993 }, { "epoch": 0.2603155197141043, "grad_norm": 0.09659688174724579, "learning_rate": 0.0002354847042175638, "loss": 8.8671, "step": 16608, "throughput": 8851.225868462627 }, { "epoch": 0.26081709104303324, "grad_norm": 0.0778978168964386, "learning_rate": 0.0002352507395776546, "loss": 8.8743, "step": 16640, "throughput": 8851.597799348436 }, { "epoch": 0.26131866237196216, "grad_norm": 0.0769830197095871, "learning_rate": 0.00023501648522348715, "loss": 8.8877, "step": 16672, "throughput": 8851.56506268054 }, { "epoch": 0.2618202337008911, "grad_norm": 0.08127515017986298, "learning_rate": 0.0002347819421211271, "loss": 8.8921, "step": 16704, "throughput": 8851.318214920258 }, { "epoch": 0.26232180502982, "grad_norm": 0.08417029678821564, "learning_rate": 0.00023454711123783092, "loss": 8.8755, "step": 16736, "throughput": 8851.757418120564 }, { "epoch": 0.2628233763587489, "grad_norm": 0.07545661926269531, "learning_rate": 0.00023431199354204192, "loss": 8.8772, "step": 16768, "throughput": 8852.122258397247 }, { "epoch": 0.2633249476876778, "grad_norm": 0.07834376394748688, "learning_rate": 0.00023407659000338607, "loss": 8.8865, "step": 16800, "throughput": 8852.051755423026 }, { "epoch": 0.2638265190166067, "grad_norm": 0.07725591957569122, "learning_rate": 0.00023384090159266833, "loss": 8.8804, "step": 16832, "throughput": 8851.755112202223 }, { "epoch": 0.2643280903455356, "grad_norm": 0.07641211152076721, "learning_rate": 0.00023360492928186838, "loss": 8.8721, "step": 16864, "throughput": 8852.189476055588 }, { "epoch": 0.26482966167446453, "grad_norm": 0.07766464352607727, "learning_rate": 0.00023336867404413674, "loss": 8.8797, "step": 16896, "throughput": 8852.561389159348 }, { "epoch": 0.26533123300339345, "grad_norm": 0.08463136851787567, "learning_rate": 0.0002331321368537907, "loss": 8.8606, "step": 16928, "throughput": 8852.384328747887 }, { "epoch": 0.26583280433232237, "grad_norm": 0.08805921673774719, "learning_rate": 0.0002328953186863103, "loss": 8.8684, "step": 16960, "throughput": 8852.249964482584 }, { "epoch": 0.2663343756612513, "grad_norm": 0.07233244925737381, "learning_rate": 0.00023265822051833442, "loss": 8.8793, "step": 16992, "throughput": 8852.687555471475 }, { "epoch": 0.26683594699018015, "grad_norm": 0.0757635310292244, "learning_rate": 0.00023242084332765662, "loss": 8.8467, "step": 17024, "throughput": 8853.059822154499 }, { "epoch": 0.26733751831910907, "grad_norm": 0.07592292875051498, "learning_rate": 0.0002321831880932211, "loss": 8.8471, "step": 17056, "throughput": 8852.917810217152 }, { "epoch": 0.267839089648038, "grad_norm": 0.08107979595661163, "learning_rate": 0.00023194525579511876, "loss": 8.8732, "step": 17088, "throughput": 8852.652403313208 }, { "epoch": 0.2683406609769669, "grad_norm": 0.07222038507461548, "learning_rate": 0.00023170704741458308, "loss": 8.8802, "step": 17120, "throughput": 8853.078328328778 }, { "epoch": 0.2688422323058958, "grad_norm": 0.07747363299131393, "learning_rate": 0.00023146856393398615, "loss": 8.8865, "step": 17152, "throughput": 8853.434671798588 }, { "epoch": 0.26934380363482474, "grad_norm": 0.08264749497175217, "learning_rate": 0.0002312298063368346, "loss": 8.874, "step": 17184, "throughput": 8853.327615268681 }, { "epoch": 0.26984537496375366, "grad_norm": 0.07909268140792847, "learning_rate": 0.00023099077560776536, "loss": 8.8685, "step": 17216, "throughput": 8853.066307208373 }, { "epoch": 0.2703469462926825, "grad_norm": 0.07460527122020721, "learning_rate": 0.00023075147273254195, "loss": 8.8518, "step": 17248, "throughput": 8853.494637244397 }, { "epoch": 0.27084851762161144, "grad_norm": 0.0753212720155716, "learning_rate": 0.0002305118986980501, "loss": 8.8774, "step": 17280, "throughput": 8853.857709105841 }, { "epoch": 0.27135008895054036, "grad_norm": 0.0831172987818718, "learning_rate": 0.00023027205449229388, "loss": 8.8699, "step": 17312, "throughput": 8853.796085376076 }, { "epoch": 0.2718516602794693, "grad_norm": 0.08725563436746597, "learning_rate": 0.00023003194110439145, "loss": 8.8637, "step": 17344, "throughput": 8853.553669857007 }, { "epoch": 0.2723532316083982, "grad_norm": 0.075401172041893, "learning_rate": 0.00022979155952457118, "loss": 8.8637, "step": 17376, "throughput": 8853.973439032789 }, { "epoch": 0.2728548029373271, "grad_norm": 0.07143397629261017, "learning_rate": 0.00022955091074416733, "loss": 8.8577, "step": 17408, "throughput": 8854.336355288591 }, { "epoch": 0.27335637426625603, "grad_norm": 0.07553057372570038, "learning_rate": 0.0002293099957556163, "loss": 8.8733, "step": 17440, "throughput": 8854.379570846144 }, { "epoch": 0.2738579455951849, "grad_norm": 0.08055870234966278, "learning_rate": 0.00022906881555245212, "loss": 8.8247, "step": 17472, "throughput": 8854.08924615641 }, { "epoch": 0.2743595169241138, "grad_norm": 0.0758543387055397, "learning_rate": 0.0002288273711293028, "loss": 8.8353, "step": 17504, "throughput": 8854.501014264812 }, { "epoch": 0.27486108825304273, "grad_norm": 0.07926324754953384, "learning_rate": 0.00022858566348188568, "loss": 8.8772, "step": 17536, "throughput": 8854.84422521045 }, { "epoch": 0.27536265958197165, "grad_norm": 0.07983296364545822, "learning_rate": 0.00022834369360700394, "loss": 8.8558, "step": 17568, "throughput": 8854.904033354123 }, { "epoch": 0.27586423091090057, "grad_norm": 0.07504521310329437, "learning_rate": 0.00022810146250254196, "loss": 8.8663, "step": 17600, "throughput": 8854.581894360635 }, { "epoch": 0.2763658022398295, "grad_norm": 0.0762917697429657, "learning_rate": 0.00022785897116746166, "loss": 8.836, "step": 17632, "throughput": 8854.990553638641 }, { "epoch": 0.2768673735687584, "grad_norm": 0.07859618216753006, "learning_rate": 0.00022761622060179793, "loss": 8.8458, "step": 17664, "throughput": 8855.338111904854 }, { "epoch": 0.27736894489768726, "grad_norm": 0.07929467409849167, "learning_rate": 0.00022737321180665488, "loss": 8.8529, "step": 17696, "throughput": 8855.424227586012 }, { "epoch": 0.2778705162266162, "grad_norm": 0.08522498607635498, "learning_rate": 0.00022712994578420143, "loss": 8.879, "step": 17728, "throughput": 8855.043453434968 }, { "epoch": 0.2783720875555451, "grad_norm": 0.08172594010829926, "learning_rate": 0.00022688642353766746, "loss": 8.8265, "step": 17760, "throughput": 8855.448267391399 }, { "epoch": 0.278873658884474, "grad_norm": 0.07379541546106339, "learning_rate": 0.00022664264607133937, "loss": 8.8516, "step": 17792, "throughput": 8855.78500977693 }, { "epoch": 0.27937523021340294, "grad_norm": 0.08074238151311874, "learning_rate": 0.00022639861439055617, "loss": 8.8508, "step": 17824, "throughput": 8855.85539062389 }, { "epoch": 0.27987680154233185, "grad_norm": 0.07020998746156693, "learning_rate": 0.00022615432950170528, "loss": 8.8343, "step": 17856, "throughput": 8855.62056538393 }, { "epoch": 0.2803783728712608, "grad_norm": 0.07520820200443268, "learning_rate": 0.00022590979241221825, "loss": 8.8356, "step": 17888, "throughput": 8855.967147418854 }, { "epoch": 0.28087994420018964, "grad_norm": 0.07711929082870483, "learning_rate": 0.00022566500413056677, "loss": 8.8239, "step": 17920, "throughput": 8856.296819007126 }, { "epoch": 0.28138151552911855, "grad_norm": 0.082347571849823, "learning_rate": 0.00022541996566625841, "loss": 8.8508, "step": 17952, "throughput": 8856.407864410561 }, { "epoch": 0.28188308685804747, "grad_norm": 0.07675802707672119, "learning_rate": 0.00022517467802983266, "loss": 8.8465, "step": 17984, "throughput": 8855.985756478674 }, { "epoch": 0.2823846581869764, "grad_norm": 0.07771243155002594, "learning_rate": 0.0002249291422328563, "loss": 8.8411, "step": 18016, "throughput": 8856.335225590043 }, { "epoch": 0.2828862295159053, "grad_norm": 0.07669182121753693, "learning_rate": 0.00022468335928791977, "loss": 8.8348, "step": 18048, "throughput": 8856.675590260706 }, { "epoch": 0.2833878008448342, "grad_norm": 0.07644347846508026, "learning_rate": 0.00022443733020863262, "loss": 8.8249, "step": 18080, "throughput": 8856.790422663576 }, { "epoch": 0.28388937217376314, "grad_norm": 0.07570263743400574, "learning_rate": 0.00022419105600961955, "loss": 8.8312, "step": 18112, "throughput": 8856.386037032886 }, { "epoch": 0.284390943502692, "grad_norm": 0.07852896302938461, "learning_rate": 0.00022394453770651607, "loss": 8.8398, "step": 18144, "throughput": 8856.740806178015 }, { "epoch": 0.2848925148316209, "grad_norm": 0.07880748063325882, "learning_rate": 0.00022369777631596436, "loss": 8.8073, "step": 18176, "throughput": 8857.067865452158 }, { "epoch": 0.28539408616054984, "grad_norm": 0.08232726156711578, "learning_rate": 0.00022345077285560914, "loss": 8.8411, "step": 18208, "throughput": 8857.218494518329 }, { "epoch": 0.28589565748947876, "grad_norm": 0.07877877354621887, "learning_rate": 0.00022320352834409343, "loss": 8.8383, "step": 18240, "throughput": 8856.812395985982 }, { "epoch": 0.2863972288184077, "grad_norm": 0.08451380580663681, "learning_rate": 0.0002229560438010543, "loss": 8.8075, "step": 18272, "throughput": 8857.160473869671 }, { "epoch": 0.2868988001473366, "grad_norm": 0.0770159438252449, "learning_rate": 0.00022270832024711882, "loss": 8.8476, "step": 18304, "throughput": 8857.499612175583 }, { "epoch": 0.2874003714762655, "grad_norm": 0.07113203406333923, "learning_rate": 0.00022246035870389952, "loss": 8.8182, "step": 18336, "throughput": 8857.645947663099 }, { "epoch": 0.2879019428051944, "grad_norm": 0.07758703827857971, "learning_rate": 0.00022221216019399067, "loss": 8.8354, "step": 18368, "throughput": 8857.217151383928 }, { "epoch": 0.2884035141341233, "grad_norm": 0.07979318499565125, "learning_rate": 0.00022196372574096357, "loss": 8.8236, "step": 18400, "throughput": 8857.567966195786 }, { "epoch": 0.2889050854630522, "grad_norm": 0.08021023869514465, "learning_rate": 0.00022171505636936272, "loss": 8.8404, "step": 18432, "throughput": 8857.892187582856 }, { "epoch": 0.28940665679198113, "grad_norm": 0.07425079494714737, "learning_rate": 0.00022146615310470125, "loss": 8.8425, "step": 18464, "throughput": 8856.808418432038 }, { "epoch": 0.28990822812091005, "grad_norm": 0.07755149155855179, "learning_rate": 0.0002212170169734571, "loss": 8.8284, "step": 18496, "throughput": 8856.443410239268 }, { "epoch": 0.29040979944983897, "grad_norm": 0.07777094095945358, "learning_rate": 0.0002209676490030683, "loss": 8.8119, "step": 18528, "throughput": 8856.778676716774 }, { "epoch": 0.2909113707787679, "grad_norm": 0.08375611156225204, "learning_rate": 0.0002207180502219291, "loss": 8.8353, "step": 18560, "throughput": 8857.11272009533 }, { "epoch": 0.29141294210769675, "grad_norm": 0.08027869462966919, "learning_rate": 0.00022046822165938565, "loss": 8.8138, "step": 18592, "throughput": 8857.243246798447 }, { "epoch": 0.29191451343662567, "grad_norm": 0.07487241923809052, "learning_rate": 0.00022021816434573168, "loss": 8.8214, "step": 18624, "throughput": 8856.791305351167 }, { "epoch": 0.2924160847655546, "grad_norm": 0.07839448004961014, "learning_rate": 0.0002199678793122043, "loss": 8.828, "step": 18656, "throughput": 8857.088713588137 }, { "epoch": 0.2929176560944835, "grad_norm": 0.07757352292537689, "learning_rate": 0.0002197173675909797, "loss": 8.8034, "step": 18688, "throughput": 8857.407527915473 }, { "epoch": 0.2934192274234124, "grad_norm": 0.0852331817150116, "learning_rate": 0.00021946663021516895, "loss": 8.8277, "step": 18720, "throughput": 8857.545830289808 }, { "epoch": 0.29392079875234134, "grad_norm": 0.07976188510656357, "learning_rate": 0.0002192156682188138, "loss": 8.8383, "step": 18752, "throughput": 8857.14916930161 }, { "epoch": 0.29442237008127026, "grad_norm": 0.08353574573993683, "learning_rate": 0.00021896448263688224, "loss": 8.8016, "step": 18784, "throughput": 8857.460271507598 }, { "epoch": 0.2949239414101991, "grad_norm": 0.07478364557027817, "learning_rate": 0.00021871307450526428, "loss": 8.819, "step": 18816, "throughput": 8857.789192633714 }, { "epoch": 0.29542551273912804, "grad_norm": 0.08134711533784866, "learning_rate": 0.00021846144486076794, "loss": 8.7982, "step": 18848, "throughput": 8858.006440622918 }, { "epoch": 0.29592708406805696, "grad_norm": 0.07802871614694595, "learning_rate": 0.00021820959474111448, "loss": 8.8167, "step": 18880, "throughput": 8857.656923278422 }, { "epoch": 0.2964286553969859, "grad_norm": 0.07292959839105606, "learning_rate": 0.00021795752518493462, "loss": 8.8177, "step": 18912, "throughput": 8857.952320622437 }, { "epoch": 0.2969302267259148, "grad_norm": 0.07705529034137726, "learning_rate": 0.0002177052372317639, "loss": 8.809, "step": 18944, "throughput": 8858.283256023147 }, { "epoch": 0.2974317980548437, "grad_norm": 0.07219377160072327, "learning_rate": 0.00021745273192203871, "loss": 8.7992, "step": 18976, "throughput": 8858.485744890328 }, { "epoch": 0.2979333693837726, "grad_norm": 0.07287011295557022, "learning_rate": 0.00021720001029709152, "loss": 8.8159, "step": 19008, "throughput": 8858.189442588646 }, { "epoch": 0.2984349407127015, "grad_norm": 0.08055272698402405, "learning_rate": 0.00021694707339914722, "loss": 8.8072, "step": 19040, "throughput": 8858.477380933875 }, { "epoch": 0.2989365120416304, "grad_norm": 0.07618486136198044, "learning_rate": 0.00021669392227131816, "loss": 8.8157, "step": 19072, "throughput": 8858.800639897141 }, { "epoch": 0.2994380833705593, "grad_norm": 0.08146921545267105, "learning_rate": 0.0002164405579576005, "loss": 8.8208, "step": 19104, "throughput": 8859.004658024229 }, { "epoch": 0.29993965469948825, "grad_norm": 0.07948251813650131, "learning_rate": 0.0002161869815028694, "loss": 8.806, "step": 19136, "throughput": 8858.673377763416 }, { "epoch": 0.30044122602841716, "grad_norm": 0.07705122232437134, "learning_rate": 0.00021593319395287483, "loss": 8.8067, "step": 19168, "throughput": 8858.923571471372 }, { "epoch": 0.3009427973573461, "grad_norm": 0.07735167443752289, "learning_rate": 0.0002156791963542374, "loss": 8.8013, "step": 19200, "throughput": 8859.241962821041 }, { "epoch": 0.30144436868627494, "grad_norm": 0.07171089202165604, "learning_rate": 0.00021542498975444404, "loss": 8.7911, "step": 19232, "throughput": 8859.40818699571 }, { "epoch": 0.30194594001520386, "grad_norm": 0.07420752197504044, "learning_rate": 0.0002151705752018435, "loss": 8.7965, "step": 19264, "throughput": 8859.045418316844 }, { "epoch": 0.3024475113441328, "grad_norm": 0.07215207815170288, "learning_rate": 0.0002149159537456421, "loss": 8.8041, "step": 19296, "throughput": 8859.322351742645 }, { "epoch": 0.3029490826730617, "grad_norm": 0.0752219557762146, "learning_rate": 0.00021466112643589948, "loss": 8.7525, "step": 19328, "throughput": 8859.64663023597 }, { "epoch": 0.3034506540019906, "grad_norm": 0.07393915206193924, "learning_rate": 0.00021440609432352427, "loss": 8.8176, "step": 19360, "throughput": 8859.80417616545 }, { "epoch": 0.30395222533091953, "grad_norm": 0.071009062230587, "learning_rate": 0.00021415085846026961, "loss": 8.8123, "step": 19392, "throughput": 8859.451551701533 }, { "epoch": 0.30445379665984845, "grad_norm": 0.07098925113677979, "learning_rate": 0.00021389541989872904, "loss": 8.7806, "step": 19424, "throughput": 8859.725222106339 }, { "epoch": 0.3049553679887773, "grad_norm": 0.07782450318336487, "learning_rate": 0.00021363977969233186, "loss": 8.806, "step": 19456, "throughput": 8860.046117312131 }, { "epoch": 0.30545693931770623, "grad_norm": 0.07848142087459564, "learning_rate": 0.000213383938895339, "loss": 8.8004, "step": 19488, "throughput": 8860.240900857605 }, { "epoch": 0.30595851064663515, "grad_norm": 0.07593582570552826, "learning_rate": 0.00021312789856283885, "loss": 8.7832, "step": 19520, "throughput": 8859.933631407544 }, { "epoch": 0.30646008197556407, "grad_norm": 0.07840114831924438, "learning_rate": 0.0002128716597507423, "loss": 8.8136, "step": 19552, "throughput": 8860.162457203185 }, { "epoch": 0.306961653304493, "grad_norm": 0.07489600032567978, "learning_rate": 0.00021261522351577906, "loss": 8.8106, "step": 19584, "throughput": 8860.481862699764 }, { "epoch": 0.3074632246334219, "grad_norm": 0.07132317125797272, "learning_rate": 0.00021235859091549294, "loss": 8.8066, "step": 19616, "throughput": 8860.70040022265 }, { "epoch": 0.3079647959623508, "grad_norm": 0.07415090501308441, "learning_rate": 0.0002121017630082375, "loss": 8.7978, "step": 19648, "throughput": 8860.37478055252 }, { "epoch": 0.3084663672912797, "grad_norm": 0.07169399410486221, "learning_rate": 0.0002118447408531718, "loss": 8.7662, "step": 19680, "throughput": 8860.571476568459 }, { "epoch": 0.3089679386202086, "grad_norm": 0.07761473208665848, "learning_rate": 0.00021158752551025603, "loss": 8.784, "step": 19712, "throughput": 8860.875005279453 }, { "epoch": 0.3094695099491375, "grad_norm": 0.06970509141683578, "learning_rate": 0.0002113301180402469, "loss": 8.8111, "step": 19744, "throughput": 8861.079905140443 }, { "epoch": 0.30997108127806644, "grad_norm": 0.08040442317724228, "learning_rate": 0.0002110725195046937, "loss": 8.8005, "step": 19776, "throughput": 8860.789572291105 }, { "epoch": 0.31047265260699536, "grad_norm": 0.0767935961484909, "learning_rate": 0.00021081473096593348, "loss": 8.7964, "step": 19808, "throughput": 8861.053776804249 }, { "epoch": 0.3109742239359243, "grad_norm": 0.08461003750562668, "learning_rate": 0.000210556753487087, "loss": 8.788, "step": 19840, "throughput": 8861.359350502951 }, { "epoch": 0.3114757952648532, "grad_norm": 0.07828964293003082, "learning_rate": 0.00021029858813205408, "loss": 8.7645, "step": 19872, "throughput": 8861.55610888214 }, { "epoch": 0.31197736659378206, "grad_norm": 0.07688968628644943, "learning_rate": 0.00021004023596550946, "loss": 8.7912, "step": 19904, "throughput": 8861.254054605806 }, { "epoch": 0.312478937922711, "grad_norm": 0.07793660461902618, "learning_rate": 0.00020978169805289823, "loss": 8.7965, "step": 19936, "throughput": 8861.43085922013 }, { "epoch": 0.3129805092516399, "grad_norm": 0.08023947477340698, "learning_rate": 0.0002095229754604315, "loss": 8.7836, "step": 19968, "throughput": 8861.73919396938 }, { "epoch": 0.3134820805805688, "grad_norm": 0.06911280006170273, "learning_rate": 0.00020926406925508202, "loss": 8.7903, "step": 20000, "throughput": 8861.956996963781 }, { "epoch": 0.31398365190949773, "grad_norm": 0.07631926983594894, "learning_rate": 0.00020900498050457973, "loss": 8.8002, "step": 20032, "throughput": 8861.663725932234 }, { "epoch": 0.31448522323842665, "grad_norm": 0.07329485565423965, "learning_rate": 0.0002087457102774074, "loss": 8.7874, "step": 20064, "throughput": 8861.784496418903 }, { "epoch": 0.31498679456735557, "grad_norm": 0.0776711255311966, "learning_rate": 0.00020848625964279622, "loss": 8.7753, "step": 20096, "throughput": 8862.072151779965 }, { "epoch": 0.31548836589628443, "grad_norm": 0.07343296706676483, "learning_rate": 0.0002082266296707214, "loss": 8.7819, "step": 20128, "throughput": 8862.30263300387 }, { "epoch": 0.31598993722521335, "grad_norm": 0.07486861944198608, "learning_rate": 0.0002079668214318977, "loss": 8.7772, "step": 20160, "throughput": 8862.023287445149 }, { "epoch": 0.31649150855414226, "grad_norm": 0.07053958624601364, "learning_rate": 0.00020770683599777507, "loss": 8.7748, "step": 20192, "throughput": 8862.130224170123 }, { "epoch": 0.3169930798830712, "grad_norm": 0.07576991617679596, "learning_rate": 0.0002074466744405342, "loss": 8.7726, "step": 20224, "throughput": 8862.436196256274 }, { "epoch": 0.3174946512120001, "grad_norm": 0.07820528000593185, "learning_rate": 0.00020718633783308214, "loss": 8.764, "step": 20256, "throughput": 8862.643837958087 }, { "epoch": 0.317996222540929, "grad_norm": 0.07988286018371582, "learning_rate": 0.00020692582724904778, "loss": 8.7765, "step": 20288, "throughput": 8862.373019248727 }, { "epoch": 0.31849779386985794, "grad_norm": 0.07375448942184448, "learning_rate": 0.00020666514376277762, "loss": 8.7767, "step": 20320, "throughput": 8862.507849895823 }, { "epoch": 0.3189993651987868, "grad_norm": 0.07463452219963074, "learning_rate": 0.00020640428844933108, "loss": 8.7799, "step": 20352, "throughput": 8862.80857270762 }, { "epoch": 0.3195009365277157, "grad_norm": 0.0773720070719719, "learning_rate": 0.00020614326238447623, "loss": 8.7773, "step": 20384, "throughput": 8863.007704410107 }, { "epoch": 0.32000250785664464, "grad_norm": 0.07711444795131683, "learning_rate": 0.0002058820666446854, "loss": 8.7749, "step": 20416, "throughput": 8862.702690685419 }, { "epoch": 0.32050407918557355, "grad_norm": 0.07666298002004623, "learning_rate": 0.00020562070230713058, "loss": 8.7838, "step": 20448, "throughput": 8862.813511884808 }, { "epoch": 0.32100565051450247, "grad_norm": 0.08265794813632965, "learning_rate": 0.00020535917044967899, "loss": 8.7659, "step": 20480, "throughput": 8863.104435326033 }, { "epoch": 0.3215072218434314, "grad_norm": 0.07821661978960037, "learning_rate": 0.00020509747215088887, "loss": 8.7716, "step": 20512, "throughput": 8862.248408416432 }, { "epoch": 0.3220087931723603, "grad_norm": 0.07673295587301254, "learning_rate": 0.00020483560849000475, "loss": 8.7724, "step": 20544, "throughput": 8861.950632851338 }, { "epoch": 0.32251036450128917, "grad_norm": 0.07887155562639236, "learning_rate": 0.00020457358054695317, "loss": 8.7633, "step": 20576, "throughput": 8862.104116942202 }, { "epoch": 0.3230119358302181, "grad_norm": 0.0740322545170784, "learning_rate": 0.00020431138940233808, "loss": 8.7607, "step": 20608, "throughput": 8862.405699218747 }, { "epoch": 0.323513507159147, "grad_norm": 0.07571806013584137, "learning_rate": 0.00020404903613743664, "loss": 8.7607, "step": 20640, "throughput": 8862.580925632892 }, { "epoch": 0.3240150784880759, "grad_norm": 0.07117355614900589, "learning_rate": 0.0002037865218341944, "loss": 8.7698, "step": 20672, "throughput": 8862.345848437657 }, { "epoch": 0.32451664981700484, "grad_norm": 0.07989663630723953, "learning_rate": 0.00020352384757522113, "loss": 8.7533, "step": 20704, "throughput": 8862.46471189943 }, { "epoch": 0.32501822114593376, "grad_norm": 0.07916408777236938, "learning_rate": 0.00020326101444378633, "loss": 8.7779, "step": 20736, "throughput": 8862.762289104425 }, { "epoch": 0.3255197924748627, "grad_norm": 0.07352910190820694, "learning_rate": 0.0002029980235238145, "loss": 8.7761, "step": 20768, "throughput": 8862.900620640583 }, { "epoch": 0.32602136380379154, "grad_norm": 0.0776766985654831, "learning_rate": 0.0002027348758998811, "loss": 8.7684, "step": 20800, "throughput": 8862.731017971897 }, { "epoch": 0.32652293513272046, "grad_norm": 0.07036112248897552, "learning_rate": 0.0002024715726572076, "loss": 8.781, "step": 20832, "throughput": 8862.792925614236 }, { "epoch": 0.3270245064616494, "grad_norm": 0.07312353700399399, "learning_rate": 0.0002022081148816574, "loss": 8.7549, "step": 20864, "throughput": 8863.089293067325 }, { "epoch": 0.3275260777905783, "grad_norm": 0.07475027441978455, "learning_rate": 0.0002019445036597312, "loss": 8.7529, "step": 20896, "throughput": 8863.261879082387 }, { "epoch": 0.3280276491195072, "grad_norm": 0.0749066025018692, "learning_rate": 0.00020168074007856232, "loss": 8.7625, "step": 20928, "throughput": 8863.12822142747 }, { "epoch": 0.32852922044843613, "grad_norm": 0.07506958395242691, "learning_rate": 0.00020141682522591272, "loss": 8.7479, "step": 20960, "throughput": 8863.186783284209 }, { "epoch": 0.32903079177736505, "grad_norm": 0.07391706109046936, "learning_rate": 0.0002011527601901679, "loss": 8.767, "step": 20992, "throughput": 8863.46507968349 }, { "epoch": 0.3295323631062939, "grad_norm": 0.08027796447277069, "learning_rate": 0.00020088854606033292, "loss": 8.7727, "step": 21024, "throughput": 8863.629659452417 }, { "epoch": 0.33003393443522283, "grad_norm": 0.07504010945558548, "learning_rate": 0.00020062418392602767, "loss": 8.7545, "step": 21056, "throughput": 8863.450811395789 }, { "epoch": 0.33053550576415175, "grad_norm": 0.07410600036382675, "learning_rate": 0.00020035967487748226, "loss": 8.7675, "step": 21088, "throughput": 8863.513552246986 }, { "epoch": 0.33103707709308067, "grad_norm": 0.07737985253334045, "learning_rate": 0.00020009502000553286, "loss": 8.7567, "step": 21120, "throughput": 8863.794578497278 }, { "epoch": 0.3315386484220096, "grad_norm": 0.07133954018354416, "learning_rate": 0.00019983022040161692, "loss": 8.7443, "step": 21152, "throughput": 8863.957209664966 }, { "epoch": 0.3320402197509385, "grad_norm": 0.07269904017448425, "learning_rate": 0.00019956527715776887, "loss": 8.7645, "step": 21184, "throughput": 8863.802577653145 }, { "epoch": 0.3325417910798674, "grad_norm": 0.07274427264928818, "learning_rate": 0.0001993001913666153, "loss": 8.7624, "step": 21216, "throughput": 8863.817185842325 }, { "epoch": 0.3330433624087963, "grad_norm": 0.09224282950162888, "learning_rate": 0.00019903496412137093, "loss": 8.7587, "step": 21248, "throughput": 8864.10281088239 }, { "epoch": 0.3335449337377252, "grad_norm": 0.08311135321855545, "learning_rate": 0.00019876959651583362, "loss": 8.7614, "step": 21280, "throughput": 8864.230523668733 }, { "epoch": 0.3340465050666541, "grad_norm": 0.07675494253635406, "learning_rate": 0.00019850408964438023, "loss": 8.769, "step": 21312, "throughput": 8864.095079543797 }, { "epoch": 0.33454807639558304, "grad_norm": 0.08694007992744446, "learning_rate": 0.00019823844460196177, "loss": 8.763, "step": 21344, "throughput": 8864.0769825026 }, { "epoch": 0.33504964772451196, "grad_norm": 0.07510685920715332, "learning_rate": 0.00019797266248409932, "loss": 8.7363, "step": 21376, "throughput": 8864.363601091953 }, { "epoch": 0.3355512190534409, "grad_norm": 0.07192866504192352, "learning_rate": 0.000197706744386879, "loss": 8.7521, "step": 21408, "throughput": 8864.56809397559 }, { "epoch": 0.3360527903823698, "grad_norm": 0.07726701349020004, "learning_rate": 0.00019744069140694795, "loss": 8.7686, "step": 21440, "throughput": 8864.399078833221 }, { "epoch": 0.33655436171129866, "grad_norm": 0.07312531024217606, "learning_rate": 0.00019717450464150935, "loss": 8.7255, "step": 21472, "throughput": 8864.395924493345 }, { "epoch": 0.3370559330402276, "grad_norm": 0.07701530307531357, "learning_rate": 0.00019690818518831827, "loss": 8.7591, "step": 21504, "throughput": 8864.67034622098 }, { "epoch": 0.3375575043691565, "grad_norm": 0.0789533257484436, "learning_rate": 0.0001966417341456769, "loss": 8.7412, "step": 21536, "throughput": 8864.858013910027 }, { "epoch": 0.3380590756980854, "grad_norm": 0.07506786286830902, "learning_rate": 0.0001963751526124301, "loss": 8.7517, "step": 21568, "throughput": 8864.705015491547 }, { "epoch": 0.3385606470270143, "grad_norm": 0.08154984563589096, "learning_rate": 0.00019610844168796096, "loss": 8.766, "step": 21600, "throughput": 8864.701365582961 }, { "epoch": 0.33906221835594325, "grad_norm": 0.07672927528619766, "learning_rate": 0.0001958416024721861, "loss": 8.74, "step": 21632, "throughput": 8864.980959781919 }, { "epoch": 0.33956378968487216, "grad_norm": 0.07608082890510559, "learning_rate": 0.00019557463606555118, "loss": 8.7345, "step": 21664, "throughput": 8865.14812470399 }, { "epoch": 0.340065361013801, "grad_norm": 0.07585503160953522, "learning_rate": 0.0001953075435690266, "loss": 8.7663, "step": 21696, "throughput": 8865.01654954425 }, { "epoch": 0.34056693234272994, "grad_norm": 0.07558625936508179, "learning_rate": 0.0001950403260841024, "loss": 8.7208, "step": 21728, "throughput": 8865.01576011177 }, { "epoch": 0.34106850367165886, "grad_norm": 0.07169059664011002, "learning_rate": 0.0001947729847127845, "loss": 8.7642, "step": 21760, "throughput": 8865.297653033553 }, { "epoch": 0.3415700750005878, "grad_norm": 0.07630694657564163, "learning_rate": 0.00019450552055758934, "loss": 8.7271, "step": 21792, "throughput": 8865.502595810825 }, { "epoch": 0.3420716463295167, "grad_norm": 0.07808644324541092, "learning_rate": 0.00019423793472153996, "loss": 8.7277, "step": 21824, "throughput": 8865.337814499435 }, { "epoch": 0.3425732176584456, "grad_norm": 0.07151120156049728, "learning_rate": 0.0001939702283081611, "loss": 8.7362, "step": 21856, "throughput": 8865.367864029218 }, { "epoch": 0.34307478898737453, "grad_norm": 0.07521945238113403, "learning_rate": 0.00019370240242147488, "loss": 8.748, "step": 21888, "throughput": 8865.610709442726 }, { "epoch": 0.3435763603163034, "grad_norm": 0.0807991772890091, "learning_rate": 0.000193434458165996, "loss": 8.7549, "step": 21920, "throughput": 8865.805759537128 }, { "epoch": 0.3440779316452323, "grad_norm": 0.0741187185049057, "learning_rate": 0.00019316639664672733, "loss": 8.7242, "step": 21952, "throughput": 8865.651708866617 }, { "epoch": 0.34457950297416123, "grad_norm": 0.07587863504886627, "learning_rate": 0.00019289821896915544, "loss": 8.7291, "step": 21984, "throughput": 8865.681779431436 }, { "epoch": 0.34508107430309015, "grad_norm": 0.07360213249921799, "learning_rate": 0.00019262992623924585, "loss": 8.7376, "step": 22016, "throughput": 8865.925156901621 }, { "epoch": 0.34558264563201907, "grad_norm": 0.07608392089605331, "learning_rate": 0.00019236151956343852, "loss": 8.7118, "step": 22048, "throughput": 8866.107911752862 }, { "epoch": 0.346084216960948, "grad_norm": 0.07788034528493881, "learning_rate": 0.00019209300004864341, "loss": 8.7388, "step": 22080, "throughput": 8866.039446435672 }, { "epoch": 0.3465857882898769, "grad_norm": 0.07935154438018799, "learning_rate": 0.00019182436880223585, "loss": 8.7374, "step": 22112, "throughput": 8865.998122149582 }, { "epoch": 0.34708735961880577, "grad_norm": 0.07901440560817719, "learning_rate": 0.00019155562693205178, "loss": 8.7314, "step": 22144, "throughput": 8866.230615305081 }, { "epoch": 0.3475889309477347, "grad_norm": 0.07936616241931915, "learning_rate": 0.00019128677554638355, "loss": 8.7069, "step": 22176, "throughput": 8866.37955423216 }, { "epoch": 0.3480905022766636, "grad_norm": 0.08413133025169373, "learning_rate": 0.0001910178157539751, "loss": 8.7277, "step": 22208, "throughput": 8866.289248236735 }, { "epoch": 0.3485920736055925, "grad_norm": 0.07251808792352676, "learning_rate": 0.00019074874866401733, "loss": 8.7532, "step": 22240, "throughput": 8866.223294338466 }, { "epoch": 0.34909364493452144, "grad_norm": 0.07380632311105728, "learning_rate": 0.00019047957538614375, "loss": 8.7284, "step": 22272, "throughput": 8866.443971829462 }, { "epoch": 0.34959521626345036, "grad_norm": 0.08033832162618637, "learning_rate": 0.00019021029703042576, "loss": 8.7286, "step": 22304, "throughput": 8866.646670464452 }, { "epoch": 0.3500967875923793, "grad_norm": 0.07755397260189056, "learning_rate": 0.0001899409147073681, "loss": 8.7335, "step": 22336, "throughput": 8866.487407628047 }, { "epoch": 0.35059835892130814, "grad_norm": 0.0786450207233429, "learning_rate": 0.0001896714295279043, "loss": 8.7219, "step": 22368, "throughput": 8866.459515548839 }, { "epoch": 0.35109993025023706, "grad_norm": 0.07564543187618256, "learning_rate": 0.00018940184260339194, "loss": 8.7401, "step": 22400, "throughput": 8866.67087849643 }, { "epoch": 0.351601501579166, "grad_norm": 0.07476343214511871, "learning_rate": 0.00018913215504560838, "loss": 8.7529, "step": 22432, "throughput": 8866.867201927287 }, { "epoch": 0.3521030729080949, "grad_norm": 0.07755191624164581, "learning_rate": 0.0001888623679667459, "loss": 8.7243, "step": 22464, "throughput": 8866.767721777669 }, { "epoch": 0.3526046442370238, "grad_norm": 0.07201996445655823, "learning_rate": 0.00018859248247940722, "loss": 8.731, "step": 22496, "throughput": 8866.769220643666 }, { "epoch": 0.35310621556595273, "grad_norm": 0.07190986722707748, "learning_rate": 0.0001883224996966008, "loss": 8.7428, "step": 22528, "throughput": 8866.943729486306 }, { "epoch": 0.35360778689488165, "grad_norm": 0.07117670029401779, "learning_rate": 0.00018805242073173653, "loss": 8.7311, "step": 22560, "throughput": 8866.144862009913 }, { "epoch": 0.3541093582238105, "grad_norm": 0.07088455557823181, "learning_rate": 0.00018778224669862087, "loss": 8.7338, "step": 22592, "throughput": 8866.042481283248 }, { "epoch": 0.35461092955273943, "grad_norm": 0.07562088221311569, "learning_rate": 0.0001875119787114523, "loss": 8.7135, "step": 22624, "throughput": 8866.069381839432 }, { "epoch": 0.35511250088166835, "grad_norm": 0.07029681652784348, "learning_rate": 0.00018724161788481676, "loss": 8.7263, "step": 22656, "throughput": 8866.261751059588 }, { "epoch": 0.35561407221059727, "grad_norm": 0.07344888150691986, "learning_rate": 0.00018697116533368316, "loss": 8.7098, "step": 22688, "throughput": 8866.397560376685 }, { "epoch": 0.3561156435395262, "grad_norm": 0.07342462986707687, "learning_rate": 0.00018670062217339867, "loss": 8.7426, "step": 22720, "throughput": 8866.329069189647 }, { "epoch": 0.3566172148684551, "grad_norm": 0.07215454429388046, "learning_rate": 0.0001864299895196839, "loss": 8.7513, "step": 22752, "throughput": 8866.362397261639 }, { "epoch": 0.357118786197384, "grad_norm": 0.07690445333719254, "learning_rate": 0.00018615926848862893, "loss": 8.7467, "step": 22784, "throughput": 8866.53666383422 }, { "epoch": 0.3576203575263129, "grad_norm": 0.07840703427791595, "learning_rate": 0.00018588846019668793, "loss": 8.7088, "step": 22816, "throughput": 8866.691489926814 }, { "epoch": 0.3581219288552418, "grad_norm": 0.0753103718161583, "learning_rate": 0.00018561756576067524, "loss": 8.7098, "step": 22848, "throughput": 8866.619402757455 }, { "epoch": 0.3586235001841707, "grad_norm": 0.07334914058446884, "learning_rate": 0.0001853465862977602, "loss": 8.7045, "step": 22880, "throughput": 8866.65715732974 }, { "epoch": 0.35912507151309964, "grad_norm": 0.07309671491384506, "learning_rate": 0.00018507552292546295, "loss": 8.6958, "step": 22912, "throughput": 8866.789061564716 }, { "epoch": 0.35962664284202855, "grad_norm": 0.07427306473255157, "learning_rate": 0.00018480437676164968, "loss": 8.6913, "step": 22944, "throughput": 8866.966795257267 }, { "epoch": 0.3601282141709575, "grad_norm": 0.07504558563232422, "learning_rate": 0.00018453314892452795, "loss": 8.7261, "step": 22976, "throughput": 8866.94648140496 }, { "epoch": 0.36062978549988634, "grad_norm": 0.07671528309583664, "learning_rate": 0.00018426184053264215, "loss": 8.7222, "step": 23008, "throughput": 8866.981521554353 }, { "epoch": 0.36113135682881525, "grad_norm": 0.07723363488912582, "learning_rate": 0.0001839904527048689, "loss": 8.7267, "step": 23040, "throughput": 8867.111959507398 }, { "epoch": 0.36163292815774417, "grad_norm": 0.07163956016302109, "learning_rate": 0.0001837189865604124, "loss": 8.698, "step": 23072, "throughput": 8867.259393831302 }, { "epoch": 0.3621344994866731, "grad_norm": 0.07297907024621964, "learning_rate": 0.00018344744321879987, "loss": 8.7214, "step": 23104, "throughput": 8867.24415376364 }, { "epoch": 0.362636070815602, "grad_norm": 0.07717998325824738, "learning_rate": 0.0001831758237998768, "loss": 8.6994, "step": 23136, "throughput": 8867.269568652184 }, { "epoch": 0.3631376421445309, "grad_norm": 0.06853833794593811, "learning_rate": 0.00018290412942380252, "loss": 8.6805, "step": 23168, "throughput": 8867.423731234427 }, { "epoch": 0.36363921347345984, "grad_norm": 0.08529718220233917, "learning_rate": 0.00018263236121104543, "loss": 8.7047, "step": 23200, "throughput": 8867.61257424937 }, { "epoch": 0.3641407848023887, "grad_norm": 0.07281111925840378, "learning_rate": 0.00018236052028237847, "loss": 8.7055, "step": 23232, "throughput": 8867.513943135138 }, { "epoch": 0.3646423561313176, "grad_norm": 0.07215279340744019, "learning_rate": 0.0001820886077588744, "loss": 8.7075, "step": 23264, "throughput": 8867.534972717942 }, { "epoch": 0.36514392746024654, "grad_norm": 0.07825150340795517, "learning_rate": 0.00018181662476190127, "loss": 8.7221, "step": 23296, "throughput": 8867.653692389635 }, { "epoch": 0.36564549878917546, "grad_norm": 0.06967805325984955, "learning_rate": 0.00018154457241311773, "loss": 8.7052, "step": 23328, "throughput": 8867.854170707304 }, { "epoch": 0.3661470701181044, "grad_norm": 0.07603953033685684, "learning_rate": 0.00018127245183446858, "loss": 8.6969, "step": 23360, "throughput": 8867.761518804655 }, { "epoch": 0.3666486414470333, "grad_norm": 0.07054081559181213, "learning_rate": 0.00018100026414817987, "loss": 8.715, "step": 23392, "throughput": 8867.792561097536 }, { "epoch": 0.3671502127759622, "grad_norm": 0.0735711082816124, "learning_rate": 0.00018072801047675432, "loss": 8.7058, "step": 23424, "throughput": 8867.919506011427 }, { "epoch": 0.3676517841048911, "grad_norm": 0.07133655995130539, "learning_rate": 0.00018045569194296697, "loss": 8.7024, "step": 23456, "throughput": 8868.07518636643 }, { "epoch": 0.36815335543382, "grad_norm": 0.07236292213201523, "learning_rate": 0.00018018330966986022, "loss": 8.6952, "step": 23488, "throughput": 8868.049974110492 }, { "epoch": 0.3686549267627489, "grad_norm": 0.07108893990516663, "learning_rate": 0.00017991086478073943, "loss": 8.7161, "step": 23520, "throughput": 8868.052097936794 }, { "epoch": 0.36915649809167783, "grad_norm": 0.08245235681533813, "learning_rate": 0.0001796383583991681, "loss": 8.7005, "step": 23552, "throughput": 8868.188362773484 }, { "epoch": 0.36965806942060675, "grad_norm": 0.07327444851398468, "learning_rate": 0.00017936579164896333, "loss": 8.7022, "step": 23584, "throughput": 8868.340869115167 }, { "epoch": 0.37015964074953567, "grad_norm": 0.0701148584485054, "learning_rate": 0.0001790931656541912, "loss": 8.7133, "step": 23616, "throughput": 8868.29974006014 }, { "epoch": 0.3706612120784646, "grad_norm": 0.07192771881818771, "learning_rate": 0.00017882048153916214, "loss": 8.7273, "step": 23648, "throughput": 8868.346506488127 }, { "epoch": 0.37116278340739345, "grad_norm": 0.07537954300642014, "learning_rate": 0.00017854774042842626, "loss": 8.7159, "step": 23680, "throughput": 8868.482967951622 }, { "epoch": 0.37166435473632237, "grad_norm": 0.07714162021875381, "learning_rate": 0.00017827494344676873, "loss": 8.7134, "step": 23712, "throughput": 8868.679327198255 }, { "epoch": 0.3721659260652513, "grad_norm": 0.0708460807800293, "learning_rate": 0.000178002091719205, "loss": 8.6908, "step": 23744, "throughput": 8868.578581239224 }, { "epoch": 0.3726674973941802, "grad_norm": 0.070463165640831, "learning_rate": 0.00017772918637097657, "loss": 8.7001, "step": 23776, "throughput": 8868.63710866715 }, { "epoch": 0.3731690687231091, "grad_norm": 0.07860637456178665, "learning_rate": 0.00017745622852754575, "loss": 8.7026, "step": 23808, "throughput": 8868.75675413827 }, { "epoch": 0.37367064005203804, "grad_norm": 0.07869873940944672, "learning_rate": 0.00017718321931459163, "loss": 8.6933, "step": 23840, "throughput": 8868.94579659867 }, { "epoch": 0.37417221138096696, "grad_norm": 0.08232009410858154, "learning_rate": 0.00017691015985800488, "loss": 8.7128, "step": 23872, "throughput": 8868.846136915936 }, { "epoch": 0.3746737827098958, "grad_norm": 0.07898972928524017, "learning_rate": 0.0001766370512838836, "loss": 8.693, "step": 23904, "throughput": 8868.847818264216 }, { "epoch": 0.37517535403882474, "grad_norm": 0.0719093382358551, "learning_rate": 0.00017636389471852834, "loss": 8.6928, "step": 23936, "throughput": 8869.011717480415 }, { "epoch": 0.37567692536775366, "grad_norm": 0.07355839759111404, "learning_rate": 0.0001760906912884376, "loss": 8.6936, "step": 23968, "throughput": 8869.146767865981 }, { "epoch": 0.3761784966966826, "grad_norm": 0.07554195076227188, "learning_rate": 0.00017581744212030308, "loss": 8.6896, "step": 24000, "throughput": 8869.047406839176 }, { "epoch": 0.3766800680256115, "grad_norm": 0.07405807077884674, "learning_rate": 0.00017554414834100525, "loss": 8.681, "step": 24032, "throughput": 8869.049468682455 }, { "epoch": 0.3771816393545404, "grad_norm": 0.07666285336017609, "learning_rate": 0.00017527081107760834, "loss": 8.6829, "step": 24064, "throughput": 8869.210629828673 }, { "epoch": 0.37768321068346933, "grad_norm": 0.08341163396835327, "learning_rate": 0.00017499743145735615, "loss": 8.695, "step": 24096, "throughput": 8869.362543407795 }, { "epoch": 0.3781847820123982, "grad_norm": 0.07192159444093704, "learning_rate": 0.00017472401060766697, "loss": 8.6939, "step": 24128, "throughput": 8869.239088716518 }, { "epoch": 0.3786863533413271, "grad_norm": 0.07966649532318115, "learning_rate": 0.0001744505496561292, "loss": 8.698, "step": 24160, "throughput": 8869.33513356526 }, { "epoch": 0.379187924670256, "grad_norm": 0.07046504318714142, "learning_rate": 0.00017417704973049668, "loss": 8.7069, "step": 24192, "throughput": 8869.457953968098 }, { "epoch": 0.37968949599918494, "grad_norm": 0.07870694249868393, "learning_rate": 0.00017390351195868385, "loss": 8.6969, "step": 24224, "throughput": 8869.603073077134 }, { "epoch": 0.38019106732811386, "grad_norm": 0.07509059458971024, "learning_rate": 0.00017362993746876135, "loss": 8.695, "step": 24256, "throughput": 8869.509359643629 }, { "epoch": 0.3806926386570428, "grad_norm": 0.0709197148680687, "learning_rate": 0.00017335632738895113, "loss": 8.694, "step": 24288, "throughput": 8869.57627443901 }, { "epoch": 0.3811942099859717, "grad_norm": 0.07527151703834534, "learning_rate": 0.000173082682847622, "loss": 8.6916, "step": 24320, "throughput": 8869.696624010772 }, { "epoch": 0.38169578131490056, "grad_norm": 0.07428012043237686, "learning_rate": 0.0001728090049732848, "loss": 8.684, "step": 24352, "throughput": 8869.84031541236 }, { "epoch": 0.3821973526438295, "grad_norm": 0.07557906955480576, "learning_rate": 0.00017253529489458802, "loss": 8.6978, "step": 24384, "throughput": 8869.77798243609 }, { "epoch": 0.3826989239727584, "grad_norm": 0.07178075611591339, "learning_rate": 0.00017226155374031271, "loss": 8.6802, "step": 24416, "throughput": 8869.83705986506 }, { "epoch": 0.3832004953016873, "grad_norm": 0.0776342898607254, "learning_rate": 0.0001719877826393683, "loss": 8.682, "step": 24448, "throughput": 8869.958717232506 }, { "epoch": 0.38370206663061623, "grad_norm": 0.07733438163995743, "learning_rate": 0.00017171398272078752, "loss": 8.67, "step": 24480, "throughput": 8870.12149243361 }, { "epoch": 0.38420363795954515, "grad_norm": 0.07164875417947769, "learning_rate": 0.00017144015511372208, "loss": 8.6772, "step": 24512, "throughput": 8870.061453331795 }, { "epoch": 0.38470520928847407, "grad_norm": 0.06802382320165634, "learning_rate": 0.00017116630094743792, "loss": 8.6741, "step": 24544, "throughput": 8870.136585820119 }, { "epoch": 0.38520678061740293, "grad_norm": 0.08131495863199234, "learning_rate": 0.00017089242135131036, "loss": 8.6745, "step": 24576, "throughput": 8870.257355447251 }, { "epoch": 0.38570835194633185, "grad_norm": 0.07555441558361053, "learning_rate": 0.0001706185174548197, "loss": 8.6685, "step": 24608, "throughput": 8869.510047014594 }, { "epoch": 0.38620992327526077, "grad_norm": 0.07308689504861832, "learning_rate": 0.0001703445903875464, "loss": 8.6748, "step": 24640, "throughput": 8869.508137207475 }, { "epoch": 0.3867114946041897, "grad_norm": 0.07100782543420792, "learning_rate": 0.00017007064127916644, "loss": 8.6804, "step": 24672, "throughput": 8869.535923334248 }, { "epoch": 0.3872130659331186, "grad_norm": 0.07744865119457245, "learning_rate": 0.0001697966712594469, "loss": 8.6914, "step": 24704, "throughput": 8869.636771237965 }, { "epoch": 0.3877146372620475, "grad_norm": 0.0693967267870903, "learning_rate": 0.00016952268145824082, "loss": 8.6752, "step": 24736, "throughput": 8869.735649269436 }, { "epoch": 0.38821620859097644, "grad_norm": 0.07560817897319794, "learning_rate": 0.00016924867300548304, "loss": 8.6659, "step": 24768, "throughput": 8869.670423971971 }, { "epoch": 0.3887177799199053, "grad_norm": 0.07274651527404785, "learning_rate": 0.00016897464703118515, "loss": 8.6905, "step": 24800, "throughput": 8869.720716639269 }, { "epoch": 0.3892193512488342, "grad_norm": 0.07410851866006851, "learning_rate": 0.00016870060466543112, "loss": 8.65, "step": 24832, "throughput": 8869.822123904682 }, { "epoch": 0.38972092257776314, "grad_norm": 0.08210521936416626, "learning_rate": 0.0001684265470383725, "loss": 8.6903, "step": 24864, "throughput": 8869.90443907778 }, { "epoch": 0.39022249390669206, "grad_norm": 0.07470977306365967, "learning_rate": 0.0001681524752802237, "loss": 8.6972, "step": 24896, "throughput": 8869.853368054952 }, { "epoch": 0.390724065235621, "grad_norm": 0.07381650805473328, "learning_rate": 0.00016787839052125758, "loss": 8.6787, "step": 24928, "throughput": 8869.890685049933 }, { "epoch": 0.3912256365645499, "grad_norm": 0.06976639479398727, "learning_rate": 0.00016760429389180037, "loss": 8.6935, "step": 24960, "throughput": 8870.016580322115 }, { "epoch": 0.3917272078934788, "grad_norm": 0.07536919414997101, "learning_rate": 0.00016733018652222744, "loss": 8.6541, "step": 24992, "throughput": 8870.083976126552 }, { "epoch": 0.3922287792224077, "grad_norm": 0.06791981309652328, "learning_rate": 0.0001670560695429584, "loss": 8.6712, "step": 25024, "throughput": 8870.029229798514 }, { "epoch": 0.3927303505513366, "grad_norm": 0.07341574877500534, "learning_rate": 0.00016678194408445245, "loss": 8.6457, "step": 25056, "throughput": 8870.067860416464 }, { "epoch": 0.3932319218802655, "grad_norm": 0.07675306499004364, "learning_rate": 0.00016650781127720382, "loss": 8.6806, "step": 25088, "throughput": 8870.186841482271 }, { "epoch": 0.39373349320919443, "grad_norm": 0.07608195394277573, "learning_rate": 0.00016623367225173703, "loss": 8.6829, "step": 25120, "throughput": 8870.274246204594 }, { "epoch": 0.39423506453812335, "grad_norm": 0.07675729691982269, "learning_rate": 0.00016595952813860216, "loss": 8.6735, "step": 25152, "throughput": 8870.254386805169 }, { "epoch": 0.39473663586705227, "grad_norm": 0.07194288820028305, "learning_rate": 0.00016568538006837046, "loss": 8.6817, "step": 25184, "throughput": 8870.327053030003 }, { "epoch": 0.3952382071959812, "grad_norm": 0.07365325093269348, "learning_rate": 0.00016541122917162934, "loss": 8.6625, "step": 25216, "throughput": 8870.425180152624 }, { "epoch": 0.39573977852491005, "grad_norm": 0.07837095856666565, "learning_rate": 0.00016513707657897785, "loss": 8.6911, "step": 25248, "throughput": 8870.557387619592 }, { "epoch": 0.39624134985383896, "grad_norm": 0.07529828697443008, "learning_rate": 0.00016486292342102215, "loss": 8.6722, "step": 25280, "throughput": 8870.488229558243 }, { "epoch": 0.3967429211827679, "grad_norm": 0.07008527964353561, "learning_rate": 0.0001645887708283707, "loss": 8.6661, "step": 25312, "throughput": 8870.529715633977 }, { "epoch": 0.3972444925116968, "grad_norm": 0.07274020463228226, "learning_rate": 0.00016431461993162954, "loss": 8.6695, "step": 25344, "throughput": 8870.610755685926 }, { "epoch": 0.3977460638406257, "grad_norm": 0.07809685915708542, "learning_rate": 0.00016404047186139784, "loss": 8.6645, "step": 25376, "throughput": 8870.67675068345 }, { "epoch": 0.39824763516955464, "grad_norm": 0.08027014881372452, "learning_rate": 0.00016376632774826297, "loss": 8.6604, "step": 25408, "throughput": 8870.62851229032 }, { "epoch": 0.39874920649848355, "grad_norm": 0.07037220895290375, "learning_rate": 0.0001634921887227962, "loss": 8.6662, "step": 25440, "throughput": 8870.68671648454 }, { "epoch": 0.3992507778274124, "grad_norm": 0.07897590100765228, "learning_rate": 0.00016321805591554755, "loss": 8.6595, "step": 25472, "throughput": 8870.774777575869 }, { "epoch": 0.39975234915634134, "grad_norm": 0.07503047585487366, "learning_rate": 0.00016294393045704163, "loss": 8.6695, "step": 25504, "throughput": 8870.866991168901 }, { "epoch": 0.40025392048527025, "grad_norm": 0.07193060964345932, "learning_rate": 0.00016266981347777255, "loss": 8.6627, "step": 25536, "throughput": 8870.829377769813 }, { "epoch": 0.40075549181419917, "grad_norm": 0.07066937536001205, "learning_rate": 0.00016239570610819963, "loss": 8.6515, "step": 25568, "throughput": 8870.865608844539 }, { "epoch": 0.4012570631431281, "grad_norm": 0.06965267658233643, "learning_rate": 0.00016212160947874242, "loss": 8.6543, "step": 25600, "throughput": 8870.946621920393 }, { "epoch": 0.401758634472057, "grad_norm": 0.07811598479747772, "learning_rate": 0.00016184752471977627, "loss": 8.6526, "step": 25632, "throughput": 8871.096432293107 }, { "epoch": 0.4022602058009859, "grad_norm": 0.08140557259321213, "learning_rate": 0.0001615734529616275, "loss": 8.6447, "step": 25664, "throughput": 8871.087218631283 }, { "epoch": 0.4027617771299148, "grad_norm": 0.08000985532999039, "learning_rate": 0.00016129939533456888, "loss": 8.6645, "step": 25696, "throughput": 8871.150626139613 }, { "epoch": 0.4032633484588437, "grad_norm": 0.07303277403116226, "learning_rate": 0.00016102535296881485, "loss": 8.6534, "step": 25728, "throughput": 8871.24360844604 }, { "epoch": 0.4037649197877726, "grad_norm": 0.07288201153278351, "learning_rate": 0.00016075132699451701, "loss": 8.6671, "step": 25760, "throughput": 8871.357493273781 }, { "epoch": 0.40426649111670154, "grad_norm": 0.07564268261194229, "learning_rate": 0.00016047731854175917, "loss": 8.6561, "step": 25792, "throughput": 8871.299038597108 }, { "epoch": 0.40476806244563046, "grad_norm": 0.07370468974113464, "learning_rate": 0.00016020332874055313, "loss": 8.6658, "step": 25824, "throughput": 8871.375135691831 }, { "epoch": 0.4052696337745594, "grad_norm": 0.0826638862490654, "learning_rate": 0.00015992935872083356, "loss": 8.672, "step": 25856, "throughput": 8871.475268392245 }, { "epoch": 0.4057712051034883, "grad_norm": 0.07711603492498398, "learning_rate": 0.00015965540961245363, "loss": 8.6426, "step": 25888, "throughput": 8871.610726872705 }, { "epoch": 0.40627277643241716, "grad_norm": 0.07239612936973572, "learning_rate": 0.0001593814825451803, "loss": 8.6563, "step": 25920, "throughput": 8871.537753317134 }, { "epoch": 0.4067743477613461, "grad_norm": 0.07643844932317734, "learning_rate": 0.00015910757864868967, "loss": 8.6636, "step": 25952, "throughput": 8871.602785647328 }, { "epoch": 0.407275919090275, "grad_norm": 0.06919372826814651, "learning_rate": 0.0001588336990525621, "loss": 8.6657, "step": 25984, "throughput": 8871.725906269707 }, { "epoch": 0.4077774904192039, "grad_norm": 0.0749737024307251, "learning_rate": 0.00015855984488627792, "loss": 8.6709, "step": 26016, "throughput": 8871.885830919566 }, { "epoch": 0.40827906174813283, "grad_norm": 0.07118481397628784, "learning_rate": 0.00015828601727921248, "loss": 8.6471, "step": 26048, "throughput": 8871.78394377422 }, { "epoch": 0.40878063307706175, "grad_norm": 0.07816265523433685, "learning_rate": 0.0001580122173606317, "loss": 8.6644, "step": 26080, "throughput": 8871.82914920052 }, { "epoch": 0.40928220440599067, "grad_norm": 0.07230495661497116, "learning_rate": 0.00015773844625968726, "loss": 8.654, "step": 26112, "throughput": 8871.933818994134 }, { "epoch": 0.40978377573491953, "grad_norm": 0.06881450116634369, "learning_rate": 0.00015746470510541197, "loss": 8.6284, "step": 26144, "throughput": 8872.083173340814 }, { "epoch": 0.41028534706384845, "grad_norm": 0.07161400467157364, "learning_rate": 0.00015719099502671516, "loss": 8.6412, "step": 26176, "throughput": 8872.000838936652 }, { "epoch": 0.41078691839277737, "grad_norm": 0.07901474833488464, "learning_rate": 0.00015691731715237802, "loss": 8.633, "step": 26208, "throughput": 8872.06924059419 }, { "epoch": 0.4112884897217063, "grad_norm": 0.07110141217708588, "learning_rate": 0.00015664367261104887, "loss": 8.6585, "step": 26240, "throughput": 8872.149282164934 }, { "epoch": 0.4117900610506352, "grad_norm": 0.07459357380867004, "learning_rate": 0.00015637006253123865, "loss": 8.6473, "step": 26272, "throughput": 8872.275020695792 }, { "epoch": 0.4122916323795641, "grad_norm": 0.0729188397526741, "learning_rate": 0.00015609648804131612, "loss": 8.6276, "step": 26304, "throughput": 8872.212686359257 }, { "epoch": 0.41279320370849304, "grad_norm": 0.0700279101729393, "learning_rate": 0.00015582295026950332, "loss": 8.6507, "step": 26336, "throughput": 8872.283869781426 }, { "epoch": 0.4132947750374219, "grad_norm": 0.07937642931938171, "learning_rate": 0.00015554945034387075, "loss": 8.6427, "step": 26368, "throughput": 8872.336145608379 }, { "epoch": 0.4137963463663508, "grad_norm": 0.07519602030515671, "learning_rate": 0.00015527598939233303, "loss": 8.6468, "step": 26400, "throughput": 8872.465062210009 }, { "epoch": 0.41429791769527974, "grad_norm": 0.07092050462961197, "learning_rate": 0.00015500256854264385, "loss": 8.6378, "step": 26432, "throughput": 8872.406151953877 }, { "epoch": 0.41479948902420866, "grad_norm": 0.07096114754676819, "learning_rate": 0.00015472918892239166, "loss": 8.6505, "step": 26464, "throughput": 8872.47318466504 }, { "epoch": 0.4153010603531376, "grad_norm": 0.07255687564611435, "learning_rate": 0.00015445585165899475, "loss": 8.6462, "step": 26496, "throughput": 8872.526006080514 }, { "epoch": 0.4158026316820665, "grad_norm": 0.07390446960926056, "learning_rate": 0.00015418255787969692, "loss": 8.6444, "step": 26528, "throughput": 8872.666706172875 }, { "epoch": 0.4163042030109954, "grad_norm": 0.07466679066419601, "learning_rate": 0.0001539093087115624, "loss": 8.6638, "step": 26560, "throughput": 8872.605439396339 }, { "epoch": 0.4168057743399243, "grad_norm": 0.07198496162891388, "learning_rate": 0.00015363610528147163, "loss": 8.6529, "step": 26592, "throughput": 8872.687260092538 }, { "epoch": 0.4173073456688532, "grad_norm": 0.07361474633216858, "learning_rate": 0.00015336294871611637, "loss": 8.6325, "step": 26624, "throughput": 8872.760453910214 }, { "epoch": 0.4178089169977821, "grad_norm": 0.07440745830535889, "learning_rate": 0.00015308984014199511, "loss": 8.6457, "step": 26656, "throughput": 8872.094436600915 }, { "epoch": 0.418310488326711, "grad_norm": 0.0752161368727684, "learning_rate": 0.00015281678068540836, "loss": 8.6277, "step": 26688, "throughput": 8872.026746895188 }, { "epoch": 0.41881205965563995, "grad_norm": 0.07112942636013031, "learning_rate": 0.00015254377147245424, "loss": 8.6257, "step": 26720, "throughput": 8872.054171423386 }, { "epoch": 0.41931363098456886, "grad_norm": 0.09089133888483047, "learning_rate": 0.00015227081362902343, "loss": 8.6544, "step": 26752, "throughput": 8872.150418236 }, { "epoch": 0.4198152023134978, "grad_norm": 0.07310041040182114, "learning_rate": 0.000151997908280795, "loss": 8.6263, "step": 26784, "throughput": 8872.261633703853 }, { "epoch": 0.42031677364242664, "grad_norm": 0.07701604068279266, "learning_rate": 0.0001517250565532313, "loss": 8.6376, "step": 26816, "throughput": 8872.21203272543 }, { "epoch": 0.42081834497135556, "grad_norm": 0.07558988779783249, "learning_rate": 0.00015145225957157373, "loss": 8.6273, "step": 26848, "throughput": 8872.26083379703 }, { "epoch": 0.4213199163002845, "grad_norm": 0.07615737617015839, "learning_rate": 0.00015117951846083786, "loss": 8.6408, "step": 26880, "throughput": 8872.354404114056 }, { "epoch": 0.4218214876292134, "grad_norm": 0.07847557216882706, "learning_rate": 0.0001509068343458088, "loss": 8.625, "step": 26912, "throughput": 8872.468278857184 }, { "epoch": 0.4223230589581423, "grad_norm": 0.0712590292096138, "learning_rate": 0.00015063420835103667, "loss": 8.6498, "step": 26944, "throughput": 8872.410536215344 }, { "epoch": 0.42282463028707123, "grad_norm": 0.07340807467699051, "learning_rate": 0.0001503616416008319, "loss": 8.6503, "step": 26976, "throughput": 8872.482341042714 }, { "epoch": 0.42332620161600015, "grad_norm": 0.07415439188480377, "learning_rate": 0.00015008913521926052, "loss": 8.6437, "step": 27008, "throughput": 8872.581708434158 }, { "epoch": 0.423827772944929, "grad_norm": 0.07077255845069885, "learning_rate": 0.00014981669033013972, "loss": 8.6333, "step": 27040, "throughput": 8872.676276991508 }, { "epoch": 0.42432934427385793, "grad_norm": 0.07701191306114197, "learning_rate": 0.00014954430805703302, "loss": 8.6152, "step": 27072, "throughput": 8872.600067118728 }, { "epoch": 0.42483091560278685, "grad_norm": 0.07171820104122162, "learning_rate": 0.00014927198952324568, "loss": 8.6111, "step": 27104, "throughput": 8872.64366520588 }, { "epoch": 0.42533248693171577, "grad_norm": 0.0763024315237999, "learning_rate": 0.00014899973585182012, "loss": 8.6408, "step": 27136, "throughput": 8872.751917567166 }, { "epoch": 0.4258340582606447, "grad_norm": 0.07294578105211258, "learning_rate": 0.00014872754816553141, "loss": 8.6184, "step": 27168, "throughput": 8872.822795246944 }, { "epoch": 0.4263356295895736, "grad_norm": 0.07896049320697784, "learning_rate": 0.00014845542758688222, "loss": 8.6557, "step": 27200, "throughput": 8872.714598463881 }, { "epoch": 0.42683720091850247, "grad_norm": 0.07082242518663406, "learning_rate": 0.00014818337523809876, "loss": 8.6371, "step": 27232, "throughput": 8872.772047112663 }, { "epoch": 0.4273387722474314, "grad_norm": 0.07124398648738861, "learning_rate": 0.0001479113922411256, "loss": 8.6304, "step": 27264, "throughput": 8872.881022358144 }, { "epoch": 0.4278403435763603, "grad_norm": 0.07090860605239868, "learning_rate": 0.00014763947971762153, "loss": 8.6245, "step": 27296, "throughput": 8872.950858100683 }, { "epoch": 0.4283419149052892, "grad_norm": 0.07200966030359268, "learning_rate": 0.00014736763878895457, "loss": 8.623, "step": 27328, "throughput": 8872.846323918813 }, { "epoch": 0.42884348623421814, "grad_norm": 0.07737179845571518, "learning_rate": 0.00014709587057619748, "loss": 8.6463, "step": 27360, "throughput": 8872.900561966622 }, { "epoch": 0.42934505756314706, "grad_norm": 0.0813121348619461, "learning_rate": 0.0001468241762001232, "loss": 8.6273, "step": 27392, "throughput": 8873.021353610933 }, { "epoch": 0.429846628892076, "grad_norm": 0.07603728026151657, "learning_rate": 0.00014655255678120015, "loss": 8.6264, "step": 27424, "throughput": 8873.088690250943 }, { "epoch": 0.43034820022100484, "grad_norm": 0.07271303981542587, "learning_rate": 0.0001462810134395876, "loss": 8.6131, "step": 27456, "throughput": 8873.018806837712 }, { "epoch": 0.43084977154993376, "grad_norm": 0.07247339189052582, "learning_rate": 0.0001460095472951311, "loss": 8.6458, "step": 27488, "throughput": 8873.06833676307 }, { "epoch": 0.4313513428788627, "grad_norm": 0.07254812121391296, "learning_rate": 0.0001457381594673579, "loss": 8.6332, "step": 27520, "throughput": 8873.193427807744 }, { "epoch": 0.4318529142077916, "grad_norm": 0.07892932742834091, "learning_rate": 0.00014546685107547205, "loss": 8.6261, "step": 27552, "throughput": 8873.262743001163 }, { "epoch": 0.4323544855367205, "grad_norm": 0.07944980263710022, "learning_rate": 0.00014519562323835034, "loss": 8.6376, "step": 27584, "throughput": 8873.179062347474 }, { "epoch": 0.43285605686564943, "grad_norm": 0.07589094340801239, "learning_rate": 0.000144924477074537, "loss": 8.6238, "step": 27616, "throughput": 8873.226907929838 }, { "epoch": 0.43335762819457835, "grad_norm": 0.06933537870645523, "learning_rate": 0.00014465341370223977, "loss": 8.6294, "step": 27648, "throughput": 8873.345501280693 }, { "epoch": 0.4338591995235072, "grad_norm": 0.07746944576501846, "learning_rate": 0.00014438243423932476, "loss": 8.6122, "step": 27680, "throughput": 8873.411787429883 }, { "epoch": 0.43436077085243613, "grad_norm": 0.07950767129659653, "learning_rate": 0.00014411153980331198, "loss": 8.6242, "step": 27712, "throughput": 8873.34829689749 }, { "epoch": 0.43486234218136505, "grad_norm": 0.0721384733915329, "learning_rate": 0.00014384073151137104, "loss": 8.6003, "step": 27744, "throughput": 8873.384867423294 }, { "epoch": 0.43536391351029397, "grad_norm": 0.07308559864759445, "learning_rate": 0.00014357001048031603, "loss": 8.6236, "step": 27776, "throughput": 8873.492587486146 }, { "epoch": 0.4358654848392229, "grad_norm": 0.07080280035734177, "learning_rate": 0.00014329937782660136, "loss": 8.6146, "step": 27808, "throughput": 8873.573468115323 }, { "epoch": 0.4363670561681518, "grad_norm": 0.07138007879257202, "learning_rate": 0.00014302883466631676, "loss": 8.6406, "step": 27840, "throughput": 8873.479798005088 }, { "epoch": 0.4368686274970807, "grad_norm": 0.07479699701070786, "learning_rate": 0.0001427583821151832, "loss": 8.624, "step": 27872, "throughput": 8873.554725941001 }, { "epoch": 0.4373701988260096, "grad_norm": 0.07214164733886719, "learning_rate": 0.0001424880212885477, "loss": 8.6228, "step": 27904, "throughput": 8873.70765646058 }, { "epoch": 0.4378717701549385, "grad_norm": 0.08458901941776276, "learning_rate": 0.0001422177533013791, "loss": 8.6466, "step": 27936, "throughput": 8873.799614476173 }, { "epoch": 0.4383733414838674, "grad_norm": 0.07676039636135101, "learning_rate": 0.00014194757926826342, "loss": 8.6201, "step": 27968, "throughput": 8873.73437624315 }, { "epoch": 0.43887491281279634, "grad_norm": 0.07848938554525375, "learning_rate": 0.00014167750030339915, "loss": 8.6172, "step": 28000, "throughput": 8873.781156642763 }, { "epoch": 0.43937648414172525, "grad_norm": 0.07305929809808731, "learning_rate": 0.00014140751752059278, "loss": 8.6037, "step": 28032, "throughput": 8873.913417033487 }, { "epoch": 0.4398780554706542, "grad_norm": 0.08314230293035507, "learning_rate": 0.0001411376320332541, "loss": 8.6151, "step": 28064, "throughput": 8873.997696387973 }, { "epoch": 0.4403796267995831, "grad_norm": 0.07182671874761581, "learning_rate": 0.0001408678449543916, "loss": 8.6344, "step": 28096, "throughput": 8873.914407981782 }, { "epoch": 0.44088119812851195, "grad_norm": 0.07496217638254166, "learning_rate": 0.00014059815739660806, "loss": 8.6093, "step": 28128, "throughput": 8873.966478015534 }, { "epoch": 0.44138276945744087, "grad_norm": 0.07898411899805069, "learning_rate": 0.00014032857047209573, "loss": 8.6135, "step": 28160, "throughput": 8874.110777954276 }, { "epoch": 0.4418843407863698, "grad_norm": 0.07386748492717743, "learning_rate": 0.0001400590852926319, "loss": 8.6112, "step": 28192, "throughput": 8874.210112978559 }, { "epoch": 0.4423859121152987, "grad_norm": 0.07431478798389435, "learning_rate": 0.00013978970296957423, "loss": 8.606, "step": 28224, "throughput": 8874.171101333972 }, { "epoch": 0.4428874834442276, "grad_norm": 0.06942659616470337, "learning_rate": 0.00013952042461385625, "loss": 8.6087, "step": 28256, "throughput": 8874.187968964894 }, { "epoch": 0.44338905477315654, "grad_norm": 0.07364679872989655, "learning_rate": 0.00013925125133598266, "loss": 8.6124, "step": 28288, "throughput": 8874.330495103006 }, { "epoch": 0.44389062610208546, "grad_norm": 0.07738090306520462, "learning_rate": 0.0001389821842460249, "loss": 8.614, "step": 28320, "throughput": 8874.412446854609 }, { "epoch": 0.4443921974310143, "grad_norm": 0.07922062277793884, "learning_rate": 0.00013871322445361642, "loss": 8.6208, "step": 28352, "throughput": 8874.382217319006 }, { "epoch": 0.44489376875994324, "grad_norm": 0.0814685970544815, "learning_rate": 0.00013844437306794822, "loss": 8.6136, "step": 28384, "throughput": 8874.428691964775 }, { "epoch": 0.44539534008887216, "grad_norm": 0.07049067318439484, "learning_rate": 0.00013817563119776415, "loss": 8.5931, "step": 28416, "throughput": 8874.54071049907 }, { "epoch": 0.4458969114178011, "grad_norm": 0.08059202134609222, "learning_rate": 0.00013790699995135658, "loss": 8.6005, "step": 28448, "throughput": 8874.625059568178 }, { "epoch": 0.44639848274673, "grad_norm": 0.0694175511598587, "learning_rate": 0.00013763848043656148, "loss": 8.6154, "step": 28480, "throughput": 8874.587918082316 }, { "epoch": 0.4469000540756589, "grad_norm": 0.07339881360530853, "learning_rate": 0.00013737007376075414, "loss": 8.5956, "step": 28512, "throughput": 8874.633311785028 }, { "epoch": 0.44740162540458783, "grad_norm": 0.07454710453748703, "learning_rate": 0.0001371017810308445, "loss": 8.5953, "step": 28544, "throughput": 8874.732500278362 }, { "epoch": 0.4479031967335167, "grad_norm": 0.07213406264781952, "learning_rate": 0.00013683360335327264, "loss": 8.6271, "step": 28576, "throughput": 8874.838207728804 }, { "epoch": 0.4484047680624456, "grad_norm": 0.07962165027856827, "learning_rate": 0.000136565541834004, "loss": 8.6044, "step": 28608, "throughput": 8874.793838753636 }, { "epoch": 0.44890633939137453, "grad_norm": 0.07944195717573166, "learning_rate": 0.00013629759757852512, "loss": 8.61, "step": 28640, "throughput": 8874.835978417446 }, { "epoch": 0.44940791072030345, "grad_norm": 0.07339708507061005, "learning_rate": 0.00013602977169183884, "loss": 8.5739, "step": 28672, "throughput": 8874.939632681488 }, { "epoch": 0.44990948204923237, "grad_norm": 0.07606098800897598, "learning_rate": 0.00013576206527846004, "loss": 8.596, "step": 28704, "throughput": 8874.311135577633 }, { "epoch": 0.4504110533781613, "grad_norm": 0.07325044274330139, "learning_rate": 0.00013549447944241066, "loss": 8.6102, "step": 28736, "throughput": 8874.26637586415 }, { "epoch": 0.4509126247070902, "grad_norm": 0.07296261191368103, "learning_rate": 0.00013522701528721553, "loss": 8.6007, "step": 28768, "throughput": 8874.278863558644 }, { "epoch": 0.45141419603601907, "grad_norm": 0.0751008540391922, "learning_rate": 0.00013495967391589757, "loss": 8.5945, "step": 28800, "throughput": 8874.41343208501 }, { "epoch": 0.451915767364948, "grad_norm": 0.0770215392112732, "learning_rate": 0.00013469245643097345, "loss": 8.5957, "step": 28832, "throughput": 8874.474691285905 }, { "epoch": 0.4524173386938769, "grad_norm": 0.07606975734233856, "learning_rate": 0.0001344253639344488, "loss": 8.6028, "step": 28864, "throughput": 8874.454438965466 }, { "epoch": 0.4529189100228058, "grad_norm": 0.08218943327665329, "learning_rate": 0.00013415839752781392, "loss": 8.605, "step": 28896, "throughput": 8874.496925533944 }, { "epoch": 0.45342048135173474, "grad_norm": 0.07142216712236404, "learning_rate": 0.00013389155831203904, "loss": 8.6072, "step": 28928, "throughput": 8874.60667201992 }, { "epoch": 0.45392205268066366, "grad_norm": 0.08242444694042206, "learning_rate": 0.0001336248473875699, "loss": 8.602, "step": 28960, "throughput": 8874.707204975388 }, { "epoch": 0.4544236240095926, "grad_norm": 0.07306291908025742, "learning_rate": 0.00013335826585432313, "loss": 8.5926, "step": 28992, "throughput": 8874.691354885646 }, { "epoch": 0.45492519533852144, "grad_norm": 0.07619068026542664, "learning_rate": 0.00013309181481168173, "loss": 8.5913, "step": 29024, "throughput": 8874.700598118392 }, { "epoch": 0.45542676666745036, "grad_norm": 0.07132818549871445, "learning_rate": 0.00013282549535849065, "loss": 8.5916, "step": 29056, "throughput": 8874.825296268311 }, { "epoch": 0.4559283379963793, "grad_norm": 0.0796361193060875, "learning_rate": 0.00013255930859305205, "loss": 8.5836, "step": 29088, "throughput": 8874.928128288897 }, { "epoch": 0.4564299093253082, "grad_norm": 0.07567333430051804, "learning_rate": 0.000132293255613121, "loss": 8.6048, "step": 29120, "throughput": 8874.894230182781 }, { "epoch": 0.4569314806542371, "grad_norm": 0.07186120748519897, "learning_rate": 0.00013202733751590067, "loss": 8.587, "step": 29152, "throughput": 8874.943752287812 }, { "epoch": 0.45743305198316603, "grad_norm": 0.07044616341590881, "learning_rate": 0.00013176155539803818, "loss": 8.5969, "step": 29184, "throughput": 8875.022302255356 }, { "epoch": 0.45793462331209495, "grad_norm": 0.07153672724962234, "learning_rate": 0.00013149591035561977, "loss": 8.588, "step": 29216, "throughput": 8875.121942073476 }, { "epoch": 0.4584361946410238, "grad_norm": 0.0733792632818222, "learning_rate": 0.00013123040348416633, "loss": 8.5845, "step": 29248, "throughput": 8875.102025766064 }, { "epoch": 0.4589377659699527, "grad_norm": 0.07488974928855896, "learning_rate": 0.00013096503587862906, "loss": 8.6104, "step": 29280, "throughput": 8875.120380254424 }, { "epoch": 0.45943933729888164, "grad_norm": 0.07074250280857086, "learning_rate": 0.00013069980863338466, "loss": 8.5874, "step": 29312, "throughput": 8875.230425116066 }, { "epoch": 0.45994090862781056, "grad_norm": 0.07670116424560547, "learning_rate": 0.00013043472284223113, "loss": 8.5993, "step": 29344, "throughput": 8875.329479619582 }, { "epoch": 0.4604424799567395, "grad_norm": 0.07585009187459946, "learning_rate": 0.00013016977959838305, "loss": 8.5996, "step": 29376, "throughput": 8875.337858363735 }, { "epoch": 0.4609440512856684, "grad_norm": 0.0730140432715416, "learning_rate": 0.00012990497999446714, "loss": 8.5962, "step": 29408, "throughput": 8875.394291925266 }, { "epoch": 0.4614456226145973, "grad_norm": 0.0731014758348465, "learning_rate": 0.00012964032512251773, "loss": 8.595, "step": 29440, "throughput": 8875.505643899913 }, { "epoch": 0.4619471939435262, "grad_norm": 0.07306238263845444, "learning_rate": 0.00012937581607397236, "loss": 8.5911, "step": 29472, "throughput": 8875.564190178333 }, { "epoch": 0.4624487652724551, "grad_norm": 0.07264445722103119, "learning_rate": 0.00012911145393966703, "loss": 8.6046, "step": 29504, "throughput": 8875.600704726261 }, { "epoch": 0.462950336601384, "grad_norm": 0.07225610315799713, "learning_rate": 0.00012884723980983206, "loss": 8.5972, "step": 29536, "throughput": 8875.625193209586 }, { "epoch": 0.46345190793031293, "grad_norm": 0.07001107931137085, "learning_rate": 0.00012858317477408728, "loss": 8.6009, "step": 29568, "throughput": 8875.749078339293 }, { "epoch": 0.46395347925924185, "grad_norm": 0.07203416526317596, "learning_rate": 0.00012831925992143765, "loss": 8.6036, "step": 29600, "throughput": 8875.786375865675 }, { "epoch": 0.46445505058817077, "grad_norm": 0.06918448954820633, "learning_rate": 0.00012805549634026882, "loss": 8.588, "step": 29632, "throughput": 8875.833505349525 }, { "epoch": 0.4649566219170997, "grad_norm": 0.07581827789545059, "learning_rate": 0.00012779188511834256, "loss": 8.5998, "step": 29664, "throughput": 8875.849645993643 }, { "epoch": 0.46545819324602855, "grad_norm": 0.07366356998682022, "learning_rate": 0.00012752842734279238, "loss": 8.5807, "step": 29696, "throughput": 8875.961924137613 }, { "epoch": 0.46595976457495747, "grad_norm": 0.09135285019874573, "learning_rate": 0.0001272651241001189, "loss": 8.5932, "step": 29728, "throughput": 8876.071536647383 }, { "epoch": 0.4664613359038864, "grad_norm": 0.07081723213195801, "learning_rate": 0.00012700197647618549, "loss": 8.6031, "step": 29760, "throughput": 8876.095025167422 }, { "epoch": 0.4669629072328153, "grad_norm": 0.07992235571146011, "learning_rate": 0.00012673898555621373, "loss": 8.5904, "step": 29792, "throughput": 8876.10402483122 }, { "epoch": 0.4674644785617442, "grad_norm": 0.0734575167298317, "learning_rate": 0.00012647615242477887, "loss": 8.5568, "step": 29824, "throughput": 8876.20873167529 }, { "epoch": 0.46796604989067314, "grad_norm": 0.07131079584360123, "learning_rate": 0.0001262134781658056, "loss": 8.578, "step": 29856, "throughput": 8876.300822006067 }, { "epoch": 0.46846762121960206, "grad_norm": 0.07791458070278168, "learning_rate": 0.00012595096386256336, "loss": 8.5786, "step": 29888, "throughput": 8876.313810453454 }, { "epoch": 0.4689691925485309, "grad_norm": 0.07824485749006271, "learning_rate": 0.0001256886105976619, "loss": 8.6037, "step": 29920, "throughput": 8876.316123304772 }, { "epoch": 0.46947076387745984, "grad_norm": 0.07415210455656052, "learning_rate": 0.0001254264194530468, "loss": 8.5928, "step": 29952, "throughput": 8876.423316323264 }, { "epoch": 0.46997233520638876, "grad_norm": 0.07395078986883163, "learning_rate": 0.00012516439150999525, "loss": 8.5904, "step": 29984, "throughput": 8876.508654661935 }, { "epoch": 0.4704739065353177, "grad_norm": 0.07381439954042435, "learning_rate": 0.00012490252784911113, "loss": 8.5586, "step": 30016, "throughput": 8876.508811455968 }, { "epoch": 0.4709754778642466, "grad_norm": 0.07365623861551285, "learning_rate": 0.000124640829550321, "loss": 8.578, "step": 30048, "throughput": 8876.557384132608 }, { "epoch": 0.4714770491931755, "grad_norm": 0.06920890510082245, "learning_rate": 0.00012437929769286942, "loss": 8.5793, "step": 30080, "throughput": 8876.677506162016 }, { "epoch": 0.47197862052210443, "grad_norm": 0.07251748442649841, "learning_rate": 0.0001241179333553146, "loss": 8.5943, "step": 30112, "throughput": 8876.767853579231 }, { "epoch": 0.4724801918510333, "grad_norm": 0.08146440237760544, "learning_rate": 0.00012385673761552374, "loss": 8.5751, "step": 30144, "throughput": 8876.77526841676 }, { "epoch": 0.4729817631799622, "grad_norm": 0.06725624948740005, "learning_rate": 0.00012359571155066894, "loss": 8.5845, "step": 30176, "throughput": 8876.796895522772 }, { "epoch": 0.47348333450889113, "grad_norm": 0.07321012765169144, "learning_rate": 0.00012333485623722238, "loss": 8.5994, "step": 30208, "throughput": 8876.894242975139 }, { "epoch": 0.47398490583782005, "grad_norm": 0.06948108971118927, "learning_rate": 0.00012307417275095222, "loss": 8.5785, "step": 30240, "throughput": 8876.978857239008 }, { "epoch": 0.47448647716674897, "grad_norm": 0.07041703164577484, "learning_rate": 0.00012281366216691786, "loss": 8.5649, "step": 30272, "throughput": 8876.976901438517 }, { "epoch": 0.4749880484956779, "grad_norm": 0.07631143927574158, "learning_rate": 0.00012255332555946582, "loss": 8.5625, "step": 30304, "throughput": 8877.043764416963 }, { "epoch": 0.4754896198246068, "grad_norm": 0.0704835057258606, "learning_rate": 0.00012229316400222493, "loss": 8.59, "step": 30336, "throughput": 8877.134415497327 }, { "epoch": 0.47599119115353566, "grad_norm": 0.06994818150997162, "learning_rate": 0.00012203317856810232, "loss": 8.5859, "step": 30368, "throughput": 8877.18800100491 }, { "epoch": 0.4764927624824646, "grad_norm": 0.07705579698085785, "learning_rate": 0.0001217733703292786, "loss": 8.563, "step": 30400, "throughput": 8877.194226844798 }, { "epoch": 0.4769943338113935, "grad_norm": 0.07564400136470795, "learning_rate": 0.0001215137403572038, "loss": 8.5769, "step": 30432, "throughput": 8877.238367015549 }, { "epoch": 0.4774959051403224, "grad_norm": 0.07389659434556961, "learning_rate": 0.00012125428972259264, "loss": 8.5711, "step": 30464, "throughput": 8877.356088149087 }, { "epoch": 0.47799747646925134, "grad_norm": 0.07688334584236145, "learning_rate": 0.0001209950194954203, "loss": 8.5819, "step": 30496, "throughput": 8877.41932988071 }, { "epoch": 0.47849904779818025, "grad_norm": 0.07335702329874039, "learning_rate": 0.00012073593074491802, "loss": 8.5979, "step": 30528, "throughput": 8877.403826067108 }, { "epoch": 0.4790006191271092, "grad_norm": 0.07970409095287323, "learning_rate": 0.0001204770245395685, "loss": 8.5868, "step": 30560, "throughput": 8877.435590379278 }, { "epoch": 0.47950219045603804, "grad_norm": 0.073320172727108, "learning_rate": 0.00012021830194710178, "loss": 8.5728, "step": 30592, "throughput": 8877.552825749823 }, { "epoch": 0.48000376178496695, "grad_norm": 0.07845813781023026, "learning_rate": 0.00011995976403449054, "loss": 8.5728, "step": 30624, "throughput": 8877.614290495207 }, { "epoch": 0.48050533311389587, "grad_norm": 0.07357929646968842, "learning_rate": 0.00011970141186794592, "loss": 8.5839, "step": 30656, "throughput": 8877.611112615054 }, { "epoch": 0.4810069044428248, "grad_norm": 0.08569731563329697, "learning_rate": 0.00011944324651291299, "loss": 8.5576, "step": 30688, "throughput": 8877.662194945713 }, { "epoch": 0.4815084757717537, "grad_norm": 0.07027926295995712, "learning_rate": 0.00011918526903406647, "loss": 8.5569, "step": 30720, "throughput": 8877.783592572207 }, { "epoch": 0.4820100471006826, "grad_norm": 0.0785306990146637, "learning_rate": 0.0001189274804953063, "loss": 8.5765, "step": 30752, "throughput": 8877.205577707166 }, { "epoch": 0.48251161842961154, "grad_norm": 0.08035396784543991, "learning_rate": 0.00011866988195975307, "loss": 8.5716, "step": 30784, "throughput": 8877.162571351546 }, { "epoch": 0.4830131897585404, "grad_norm": 0.07607050985097885, "learning_rate": 0.00011841247448974398, "loss": 8.5749, "step": 30816, "throughput": 8877.200508662241 }, { "epoch": 0.4835147610874693, "grad_norm": 0.07426037639379501, "learning_rate": 0.00011815525914682817, "loss": 8.5535, "step": 30848, "throughput": 8877.338847754632 }, { "epoch": 0.48401633241639824, "grad_norm": 0.07136990875005722, "learning_rate": 0.00011789823699176249, "loss": 8.5748, "step": 30880, "throughput": 8877.429074529899 }, { "epoch": 0.48451790374532716, "grad_norm": 0.0776275247335434, "learning_rate": 0.00011764140908450703, "loss": 8.5631, "step": 30912, "throughput": 8877.401479776045 }, { "epoch": 0.4850194750742561, "grad_norm": 0.07546462118625641, "learning_rate": 0.0001173847764842209, "loss": 8.5765, "step": 30944, "throughput": 8877.458627369684 }, { "epoch": 0.485521046403185, "grad_norm": 0.07272691279649734, "learning_rate": 0.00011712834024925766, "loss": 8.5726, "step": 30976, "throughput": 8877.599290194024 }, { "epoch": 0.4860226177321139, "grad_norm": 0.07284701615571976, "learning_rate": 0.00011687210143716116, "loss": 8.558, "step": 31008, "throughput": 8877.681189949853 }, { "epoch": 0.4865241890610428, "grad_norm": 0.08194044232368469, "learning_rate": 0.00011661606110466095, "loss": 8.5643, "step": 31040, "throughput": 8877.64047607755 }, { "epoch": 0.4870257603899717, "grad_norm": 0.07076172530651093, "learning_rate": 0.00011636022030766818, "loss": 8.5746, "step": 31072, "throughput": 8877.68333698756 }, { "epoch": 0.4875273317189006, "grad_norm": 0.07631973922252655, "learning_rate": 0.00011610458010127093, "loss": 8.5635, "step": 31104, "throughput": 8877.804494843896 }, { "epoch": 0.48802890304782953, "grad_norm": 0.0787317231297493, "learning_rate": 0.00011584914153973036, "loss": 8.584, "step": 31136, "throughput": 8877.88577758543 }, { "epoch": 0.48853047437675845, "grad_norm": 0.07193674147129059, "learning_rate": 0.00011559390567647571, "loss": 8.5611, "step": 31168, "throughput": 8877.853278013274 }, { "epoch": 0.48903204570568737, "grad_norm": 0.06967335939407349, "learning_rate": 0.00011533887356410052, "loss": 8.5708, "step": 31200, "throughput": 8877.882846063356 }, { "epoch": 0.4895336170346163, "grad_norm": 0.07708664983510971, "learning_rate": 0.00011508404625435791, "loss": 8.5709, "step": 31232, "throughput": 8878.004876389365 }, { "epoch": 0.49003518836354515, "grad_norm": 0.07327646017074585, "learning_rate": 0.00011482942479815651, "loss": 8.5505, "step": 31264, "throughput": 8878.096059697875 }, { "epoch": 0.49053675969247407, "grad_norm": 0.07120782136917114, "learning_rate": 0.00011457501024555593, "loss": 8.5701, "step": 31296, "throughput": 8878.083957310526 }, { "epoch": 0.491038331021403, "grad_norm": 0.06819991022348404, "learning_rate": 0.00011432080364576256, "loss": 8.5491, "step": 31328, "throughput": 8878.177801032847 }, { "epoch": 0.4915399023503319, "grad_norm": 0.0801183432340622, "learning_rate": 0.00011406680604712517, "loss": 8.5686, "step": 31360, "throughput": 8878.30045967166 }, { "epoch": 0.4920414736792608, "grad_norm": 0.06938584893941879, "learning_rate": 0.00011381301849713059, "loss": 8.5674, "step": 31392, "throughput": 8878.38557907876 }, { "epoch": 0.49254304500818974, "grad_norm": 0.1152784451842308, "learning_rate": 0.00011355944204239944, "loss": 8.5672, "step": 31424, "throughput": 8878.390782361848 }, { "epoch": 0.4930446163371186, "grad_norm": 0.07254608720541, "learning_rate": 0.0001133060777286818, "loss": 8.5559, "step": 31456, "throughput": 8878.445844506625 }, { "epoch": 0.4935461876660475, "grad_norm": 0.07228664308786392, "learning_rate": 0.00011305292660085278, "loss": 8.5488, "step": 31488, "throughput": 8878.577357518385 }, { "epoch": 0.49404775899497644, "grad_norm": 0.07752840220928192, "learning_rate": 0.00011279998970290844, "loss": 8.5768, "step": 31520, "throughput": 8878.657580171179 }, { "epoch": 0.49454933032390536, "grad_norm": 0.08368710428476334, "learning_rate": 0.0001125472680779613, "loss": 8.5621, "step": 31552, "throughput": 8878.657276550583 }, { "epoch": 0.4950509016528343, "grad_norm": 0.07768121361732483, "learning_rate": 0.00011229476276823608, "loss": 8.5495, "step": 31584, "throughput": 8878.720184959087 }, { "epoch": 0.4955524729817632, "grad_norm": 0.07697410136461258, "learning_rate": 0.00011204247481506535, "loss": 8.5502, "step": 31616, "throughput": 8878.852409219156 }, { "epoch": 0.4960540443106921, "grad_norm": 0.07087410986423492, "learning_rate": 0.00011179040525888552, "loss": 8.5554, "step": 31648, "throughput": 8878.956401158952 }, { "epoch": 0.496555615639621, "grad_norm": 0.07429654896259308, "learning_rate": 0.00011153855513923207, "loss": 8.544, "step": 31680, "throughput": 8878.967627668655 }, { "epoch": 0.4970571869685499, "grad_norm": 0.09350360184907913, "learning_rate": 0.00011128692549473568, "loss": 8.5657, "step": 31712, "throughput": 8879.020572731342 }, { "epoch": 0.4975587582974788, "grad_norm": 0.07406873255968094, "learning_rate": 0.00011103551736311777, "loss": 8.5473, "step": 31744, "throughput": 8879.15050775114 }, { "epoch": 0.4980603296264077, "grad_norm": 0.07790439575910568, "learning_rate": 0.0001107843317811862, "loss": 8.5429, "step": 31776, "throughput": 8879.233401099962 }, { "epoch": 0.49856190095533665, "grad_norm": 0.07620224356651306, "learning_rate": 0.00011053336978483102, "loss": 8.5706, "step": 31808, "throughput": 8879.19222412429 }, { "epoch": 0.49906347228426556, "grad_norm": 0.08028724044561386, "learning_rate": 0.00011028263240902033, "loss": 8.539, "step": 31840, "throughput": 8879.253677757002 }, { "epoch": 0.4995650436131945, "grad_norm": 0.07354696840047836, "learning_rate": 0.0001100321206877957, "loss": 8.5431, "step": 31872, "throughput": 8879.351637433128 }, { "epoch": 0.5000666149421233, "grad_norm": 0.073768250644207, "learning_rate": 0.00010978183565426832, "loss": 8.556, "step": 31904, "throughput": 8879.423566520085 }, { "epoch": 0.5005681862710523, "grad_norm": 0.08214636147022247, "learning_rate": 0.00010953177834061435, "loss": 8.5693, "step": 31936, "throughput": 8879.384955887333 }, { "epoch": 0.5010697575999812, "grad_norm": 0.07873082906007767, "learning_rate": 0.00010928194977807091, "loss": 8.5494, "step": 31968, "throughput": 8879.47590132547 }, { "epoch": 0.5015713289289101, "grad_norm": 0.0739561915397644, "learning_rate": 0.00010903235099693174, "loss": 8.5376, "step": 32000, "throughput": 8879.5924262509 }, { "epoch": 0.502072900257839, "grad_norm": 0.08155156672000885, "learning_rate": 0.00010878298302654294, "loss": 8.569, "step": 32032, "throughput": 8879.651295267644 }, { "epoch": 0.5025744715867679, "grad_norm": 0.07305172085762024, "learning_rate": 0.00010853384689529873, "loss": 8.5545, "step": 32064, "throughput": 8879.635714989827 }, { "epoch": 0.5030760429156969, "grad_norm": 0.07556242495775223, "learning_rate": 0.00010828494363063732, "loss": 8.5534, "step": 32096, "throughput": 8879.706163985868 }, { "epoch": 0.5035776142446258, "grad_norm": 0.07298897951841354, "learning_rate": 0.0001080362742590364, "loss": 8.5631, "step": 32128, "throughput": 8879.817239222819 }, { "epoch": 0.5040791855735547, "grad_norm": 0.08162616193294525, "learning_rate": 0.00010778783980600939, "loss": 8.569, "step": 32160, "throughput": 8879.905393904175 }, { "epoch": 0.5045807569024836, "grad_norm": 0.07203565537929535, "learning_rate": 0.00010753964129610052, "loss": 8.5517, "step": 32192, "throughput": 8879.852917096618 }, { "epoch": 0.5050823282314125, "grad_norm": 0.07308918237686157, "learning_rate": 0.00010729167975288122, "loss": 8.5551, "step": 32224, "throughput": 8879.898546137045 }, { "epoch": 0.5055838995603413, "grad_norm": 0.07474292814731598, "learning_rate": 0.0001070439561989457, "loss": 8.5609, "step": 32256, "throughput": 8880.034979977114 }, { "epoch": 0.5060854708892703, "grad_norm": 0.07707252353429794, "learning_rate": 0.00010679647165590659, "loss": 8.5408, "step": 32288, "throughput": 8880.079002533506 }, { "epoch": 0.5065870422181992, "grad_norm": 0.07359939813613892, "learning_rate": 0.00010654922714439083, "loss": 8.5409, "step": 32320, "throughput": 8880.03827720708 }, { "epoch": 0.5070886135471281, "grad_norm": 0.08229360729455948, "learning_rate": 0.00010630222368403561, "loss": 8.5313, "step": 32352, "throughput": 8880.097772317216 }, { "epoch": 0.507590184876057, "grad_norm": 0.07442633807659149, "learning_rate": 0.00010605546229348396, "loss": 8.5595, "step": 32384, "throughput": 8880.231914183245 }, { "epoch": 0.5080917562049859, "grad_norm": 0.07239026576280594, "learning_rate": 0.00010580894399038044, "loss": 8.5563, "step": 32416, "throughput": 8880.331477725678 }, { "epoch": 0.5085933275339148, "grad_norm": 0.07314996421337128, "learning_rate": 0.00010556266979136734, "loss": 8.5389, "step": 32448, "throughput": 8880.30725576251 }, { "epoch": 0.5090948988628438, "grad_norm": 0.07866779714822769, "learning_rate": 0.00010531664071208019, "loss": 8.5422, "step": 32480, "throughput": 8880.404943660236 }, { "epoch": 0.5095964701917727, "grad_norm": 0.07595469057559967, "learning_rate": 0.00010507085776714369, "loss": 8.5319, "step": 32512, "throughput": 8880.517738723465 }, { "epoch": 0.5100980415207016, "grad_norm": 0.07140462100505829, "learning_rate": 0.00010482532197016732, "loss": 8.5504, "step": 32544, "throughput": 8880.613775883952 }, { "epoch": 0.5105996128496305, "grad_norm": 0.07571039348840714, "learning_rate": 0.00010458003433374152, "loss": 8.5415, "step": 32576, "throughput": 8880.57318962394 }, { "epoch": 0.5111011841785594, "grad_norm": 0.08015038073062897, "learning_rate": 0.00010433499586943319, "loss": 8.5512, "step": 32608, "throughput": 8880.661175602947 }, { "epoch": 0.5116027555074883, "grad_norm": 0.0716933086514473, "learning_rate": 0.00010409020758778178, "loss": 8.5497, "step": 32640, "throughput": 8880.779949426185 }, { "epoch": 0.5121043268364173, "grad_norm": 0.07379721105098724, "learning_rate": 0.00010384567049829474, "loss": 8.5389, "step": 32672, "throughput": 8880.86449327483 }, { "epoch": 0.5126058981653461, "grad_norm": 0.06897587329149246, "learning_rate": 0.00010360138560944379, "loss": 8.536, "step": 32704, "throughput": 8880.796608475323 }, { "epoch": 0.513107469494275, "grad_norm": 0.07183413207530975, "learning_rate": 0.00010335735392866061, "loss": 8.5316, "step": 32736, "throughput": 8880.86476432713 }, { "epoch": 0.5136090408232039, "grad_norm": 0.07627425342798233, "learning_rate": 0.00010311357646233255, "loss": 8.5474, "step": 32768, "throughput": 8880.967565750585 }, { "epoch": 0.5141106121521328, "grad_norm": 0.08256326615810394, "learning_rate": 0.00010287005421579854, "loss": 8.5603, "step": 32800, "throughput": 8880.427350610618 }, { "epoch": 0.5146121834810617, "grad_norm": 0.08104515075683594, "learning_rate": 0.00010262678819334511, "loss": 8.5393, "step": 32832, "throughput": 8880.38917977172 }, { "epoch": 0.5151137548099907, "grad_norm": 0.08197243511676788, "learning_rate": 0.00010238377939820202, "loss": 8.5454, "step": 32864, "throughput": 8880.469958966982 }, { "epoch": 0.5156153261389196, "grad_norm": 0.08547014743089676, "learning_rate": 0.00010214102883253832, "loss": 8.5422, "step": 32896, "throughput": 8880.562745151854 }, { "epoch": 0.5161168974678485, "grad_norm": 0.0771087184548378, "learning_rate": 0.00010189853749745799, "loss": 8.528, "step": 32928, "throughput": 8880.67678790059 }, { "epoch": 0.5166184687967774, "grad_norm": 0.07997187227010727, "learning_rate": 0.00010165630639299606, "loss": 8.5308, "step": 32960, "throughput": 8880.615246084193 }, { "epoch": 0.5171200401257063, "grad_norm": 0.08800628036260605, "learning_rate": 0.00010141433651811429, "loss": 8.5355, "step": 32992, "throughput": 8880.689982268765 }, { "epoch": 0.5176216114546353, "grad_norm": 0.09052596986293793, "learning_rate": 0.00010117262887069724, "loss": 8.5431, "step": 33024, "throughput": 8880.793214106987 }, { "epoch": 0.5181231827835642, "grad_norm": 0.07647205144166946, "learning_rate": 0.00010093118444754784, "loss": 8.5479, "step": 33056, "throughput": 8880.903723461652 }, { "epoch": 0.5186247541124931, "grad_norm": 0.07544301450252533, "learning_rate": 0.0001006900042443837, "loss": 8.5154, "step": 33088, "throughput": 8880.808497164393 }, { "epoch": 0.519126325441422, "grad_norm": 0.08619405329227448, "learning_rate": 0.00010044908925583264, "loss": 8.5522, "step": 33120, "throughput": 8880.884355850601 }, { "epoch": 0.5196278967703508, "grad_norm": 0.07676394283771515, "learning_rate": 0.00010020844047542886, "loss": 8.5266, "step": 33152, "throughput": 8880.995931369265 }, { "epoch": 0.5201294680992797, "grad_norm": 0.07478612661361694, "learning_rate": 9.996805889560857e-05, "loss": 8.5333, "step": 33184, "throughput": 8881.09647931044 }, { "epoch": 0.5206310394282087, "grad_norm": 0.07415000349283218, "learning_rate": 9.972794550770612e-05, "loss": 8.5272, "step": 33216, "throughput": 8881.035190751842 }, { "epoch": 0.5211326107571376, "grad_norm": 0.07461415231227875, "learning_rate": 9.948810130194984e-05, "loss": 8.5506, "step": 33248, "throughput": 8881.106414600572 }, { "epoch": 0.5216341820860665, "grad_norm": 0.07567655295133591, "learning_rate": 9.924852726745807e-05, "loss": 8.5222, "step": 33280, "throughput": 8881.204834663215 }, { "epoch": 0.5221357534149954, "grad_norm": 0.07916685938835144, "learning_rate": 9.900922439223464e-05, "loss": 8.5652, "step": 33312, "throughput": 8881.3064814247 }, { "epoch": 0.5226373247439243, "grad_norm": 0.06847506016492844, "learning_rate": 9.877019366316541e-05, "loss": 8.5146, "step": 33344, "throughput": 8881.272806522671 }, { "epoch": 0.5231388960728532, "grad_norm": 0.07268689572811127, "learning_rate": 9.85314360660138e-05, "loss": 8.5187, "step": 33376, "throughput": 8881.33976370783 }, { "epoch": 0.5236404674017822, "grad_norm": 0.0702684298157692, "learning_rate": 9.829295258541692e-05, "loss": 8.5289, "step": 33408, "throughput": 8881.454946050295 }, { "epoch": 0.5241420387307111, "grad_norm": 0.07464347779750824, "learning_rate": 9.805474420488123e-05, "loss": 8.5505, "step": 33440, "throughput": 8881.55141287771 }, { "epoch": 0.52464361005964, "grad_norm": 0.07443258166313171, "learning_rate": 9.78168119067789e-05, "loss": 8.5471, "step": 33472, "throughput": 8881.522813382444 }, { "epoch": 0.5251451813885689, "grad_norm": 0.07743299752473831, "learning_rate": 9.757915667234339e-05, "loss": 8.5459, "step": 33504, "throughput": 8881.584108867713 }, { "epoch": 0.5256467527174978, "grad_norm": 0.07928189635276794, "learning_rate": 9.734177948166558e-05, "loss": 8.5381, "step": 33536, "throughput": 8881.690984580047 }, { "epoch": 0.5261483240464266, "grad_norm": 0.07928431779146194, "learning_rate": 9.710468131368968e-05, "loss": 8.5025, "step": 33568, "throughput": 8881.775171208094 }, { "epoch": 0.5266498953753556, "grad_norm": 0.07774075865745544, "learning_rate": 9.68678631462093e-05, "loss": 8.5461, "step": 33600, "throughput": 8881.787401343072 }, { "epoch": 0.5271514667042845, "grad_norm": 0.07617669552564621, "learning_rate": 9.66313259558633e-05, "loss": 8.5331, "step": 33632, "throughput": 8881.813049327631 }, { "epoch": 0.5276530380332134, "grad_norm": 0.08349672704935074, "learning_rate": 9.639507071813166e-05, "loss": 8.5034, "step": 33664, "throughput": 8881.91225326137 }, { "epoch": 0.5281546093621423, "grad_norm": 0.07760000973939896, "learning_rate": 9.615909840733167e-05, "loss": 8.5311, "step": 33696, "throughput": 8882.02504331997 }, { "epoch": 0.5286561806910712, "grad_norm": 0.07814379036426544, "learning_rate": 9.592340999661393e-05, "loss": 8.5289, "step": 33728, "throughput": 8882.026454517742 }, { "epoch": 0.5291577520200001, "grad_norm": 0.07366339862346649, "learning_rate": 9.568800645795812e-05, "loss": 8.5552, "step": 33760, "throughput": 8882.043253472537 }, { "epoch": 0.5296593233489291, "grad_norm": 0.07660157978534698, "learning_rate": 9.545288876216901e-05, "loss": 8.5227, "step": 33792, "throughput": 8882.149626940249 }, { "epoch": 0.530160894677858, "grad_norm": 0.07678160816431046, "learning_rate": 9.521805787887285e-05, "loss": 8.518, "step": 33824, "throughput": 8882.220820754601 }, { "epoch": 0.5306624660067869, "grad_norm": 0.07604127377271652, "learning_rate": 9.498351477651286e-05, "loss": 8.5325, "step": 33856, "throughput": 8882.205366979448 }, { "epoch": 0.5311640373357158, "grad_norm": 0.07513943314552307, "learning_rate": 9.47492604223454e-05, "loss": 8.5155, "step": 33888, "throughput": 8882.214404156059 }, { "epoch": 0.5316656086646447, "grad_norm": 0.08751774579286575, "learning_rate": 9.451529578243618e-05, "loss": 8.5269, "step": 33920, "throughput": 8882.309489996225 }, { "epoch": 0.5321671799935737, "grad_norm": 0.07623326778411865, "learning_rate": 9.428162182165607e-05, "loss": 8.5138, "step": 33952, "throughput": 8882.40839640058 }, { "epoch": 0.5326687513225026, "grad_norm": 0.06985815614461899, "learning_rate": 9.40482395036772e-05, "loss": 8.5227, "step": 33984, "throughput": 8882.401640771297 }, { "epoch": 0.5331703226514314, "grad_norm": 0.07082241773605347, "learning_rate": 9.381514979096888e-05, "loss": 8.5024, "step": 34016, "throughput": 8882.42714656863 }, { "epoch": 0.5336718939803603, "grad_norm": 0.07866815477609634, "learning_rate": 9.35823536447938e-05, "loss": 8.5337, "step": 34048, "throughput": 8882.52549045798 }, { "epoch": 0.5341734653092892, "grad_norm": 0.07148824632167816, "learning_rate": 9.334985202520395e-05, "loss": 8.5005, "step": 34080, "throughput": 8882.623048916446 }, { "epoch": 0.5346750366382181, "grad_norm": 0.06872270256280899, "learning_rate": 9.311764589103679e-05, "loss": 8.5324, "step": 34112, "throughput": 8882.655170274242 }, { "epoch": 0.5351766079671471, "grad_norm": 0.07169391959905624, "learning_rate": 9.288573619991096e-05, "loss": 8.532, "step": 34144, "throughput": 8882.656679601589 }, { "epoch": 0.535678179296076, "grad_norm": 0.07784446328878403, "learning_rate": 9.265412390822278e-05, "loss": 8.5363, "step": 34176, "throughput": 8882.771757879587 }, { "epoch": 0.5361797506250049, "grad_norm": 0.07849624007940292, "learning_rate": 9.242280997114204e-05, "loss": 8.5078, "step": 34208, "throughput": 8882.852619223795 }, { "epoch": 0.5366813219539338, "grad_norm": 0.0793418139219284, "learning_rate": 9.219179534260811e-05, "loss": 8.5131, "step": 34240, "throughput": 8882.884299489215 }, { "epoch": 0.5371828932828627, "grad_norm": 0.07151144742965698, "learning_rate": 9.196108097532597e-05, "loss": 8.5116, "step": 34272, "throughput": 8882.89297451379 }, { "epoch": 0.5376844646117916, "grad_norm": 0.07799839228391647, "learning_rate": 9.173066782076236e-05, "loss": 8.5191, "step": 34304, "throughput": 8883.013899560407 }, { "epoch": 0.5381860359407206, "grad_norm": 0.06935375928878784, "learning_rate": 9.15005568291418e-05, "loss": 8.4986, "step": 34336, "throughput": 8883.143128934125 }, { "epoch": 0.5386876072696495, "grad_norm": 0.07601752877235413, "learning_rate": 9.12707489494428e-05, "loss": 8.4912, "step": 34368, "throughput": 8883.12263471243 }, { "epoch": 0.5391891785985784, "grad_norm": 0.07418935745954514, "learning_rate": 9.104124512939357e-05, "loss": 8.5373, "step": 34400, "throughput": 8883.126118539001 }, { "epoch": 0.5396907499275073, "grad_norm": 0.07352650910615921, "learning_rate": 9.081204631546867e-05, "loss": 8.5107, "step": 34432, "throughput": 8883.241844806826 }, { "epoch": 0.5401923212564361, "grad_norm": 0.07347220927476883, "learning_rate": 9.058315345288465e-05, "loss": 8.4956, "step": 34464, "throughput": 8883.34991937916 }, { "epoch": 0.540693892585365, "grad_norm": 0.07640855014324188, "learning_rate": 9.035456748559639e-05, "loss": 8.5371, "step": 34496, "throughput": 8883.324710473114 }, { "epoch": 0.541195463914294, "grad_norm": 0.10376151651144028, "learning_rate": 9.012628935629299e-05, "loss": 8.5067, "step": 34528, "throughput": 8883.330183627477 }, { "epoch": 0.5416970352432229, "grad_norm": 0.07584423571825027, "learning_rate": 8.989832000639424e-05, "loss": 8.4991, "step": 34560, "throughput": 8883.445808840017 }, { "epoch": 0.5421986065721518, "grad_norm": 0.06925126165151596, "learning_rate": 8.967066037604637e-05, "loss": 8.5198, "step": 34592, "throughput": 8883.562907732476 }, { "epoch": 0.5427001779010807, "grad_norm": 0.07632188498973846, "learning_rate": 8.944331140411841e-05, "loss": 8.525, "step": 34624, "throughput": 8883.549628252418 }, { "epoch": 0.5432017492300096, "grad_norm": 0.07406525313854218, "learning_rate": 8.921627402819813e-05, "loss": 8.5115, "step": 34656, "throughput": 8883.528618607796 }, { "epoch": 0.5437033205589386, "grad_norm": 0.07298589497804642, "learning_rate": 8.898954918458835e-05, "loss": 8.5207, "step": 34688, "throughput": 8883.647321597313 }, { "epoch": 0.5442048918878675, "grad_norm": 0.07894790172576904, "learning_rate": 8.876313780830305e-05, "loss": 8.5316, "step": 34720, "throughput": 8883.781552914974 }, { "epoch": 0.5447064632167964, "grad_norm": 0.0751766636967659, "learning_rate": 8.853704083306341e-05, "loss": 8.5235, "step": 34752, "throughput": 8883.768226796805 }, { "epoch": 0.5452080345457253, "grad_norm": 0.08003483712673187, "learning_rate": 8.831125919129397e-05, "loss": 8.5187, "step": 34784, "throughput": 8883.746416303493 }, { "epoch": 0.5457096058746542, "grad_norm": 0.07891687750816345, "learning_rate": 8.808579381411892e-05, "loss": 8.5174, "step": 34816, "throughput": 8883.877403875109 }, { "epoch": 0.5462111772035831, "grad_norm": 0.07358083128929138, "learning_rate": 8.786064563135815e-05, "loss": 8.5205, "step": 34848, "throughput": 8883.383841534293 }, { "epoch": 0.5467127485325121, "grad_norm": 0.0873100757598877, "learning_rate": 8.763581557152348e-05, "loss": 8.5185, "step": 34880, "throughput": 8883.393249342576 }, { "epoch": 0.5472143198614409, "grad_norm": 0.07261183112859726, "learning_rate": 8.741130456181463e-05, "loss": 8.5077, "step": 34912, "throughput": 8883.376127779211 }, { "epoch": 0.5477158911903698, "grad_norm": 0.0730830505490303, "learning_rate": 8.718711352811573e-05, "loss": 8.5139, "step": 34944, "throughput": 8883.498276316366 }, { "epoch": 0.5482174625192987, "grad_norm": 0.07217606902122498, "learning_rate": 8.696324339499135e-05, "loss": 8.5159, "step": 34976, "throughput": 8883.597327656475 }, { "epoch": 0.5487190338482276, "grad_norm": 0.08003465086221695, "learning_rate": 8.673969508568242e-05, "loss": 8.4899, "step": 35008, "throughput": 8883.574050139705 }, { "epoch": 0.5492206051771565, "grad_norm": 0.07030902057886124, "learning_rate": 8.651646952210293e-05, "loss": 8.5107, "step": 35040, "throughput": 8883.581419458513 }, { "epoch": 0.5497221765060855, "grad_norm": 0.07257349044084549, "learning_rate": 8.629356762483573e-05, "loss": 8.5134, "step": 35072, "throughput": 8883.695128678115 }, { "epoch": 0.5502237478350144, "grad_norm": 0.08085116744041443, "learning_rate": 8.607099031312901e-05, "loss": 8.5062, "step": 35104, "throughput": 8883.821992382886 }, { "epoch": 0.5507253191639433, "grad_norm": 0.08801715821027756, "learning_rate": 8.58487385048921e-05, "loss": 8.53, "step": 35136, "throughput": 8883.797216991694 }, { "epoch": 0.5512268904928722, "grad_norm": 0.07201190292835236, "learning_rate": 8.562681311669218e-05, "loss": 8.5199, "step": 35168, "throughput": 8883.765672377716 }, { "epoch": 0.5517284618218011, "grad_norm": 0.07491514086723328, "learning_rate": 8.540521506375026e-05, "loss": 8.512, "step": 35200, "throughput": 8883.881929345513 }, { "epoch": 0.55223003315073, "grad_norm": 0.07810447365045547, "learning_rate": 8.518394525993734e-05, "loss": 8.5059, "step": 35232, "throughput": 8883.995820434235 }, { "epoch": 0.552731604479659, "grad_norm": 0.07982519268989563, "learning_rate": 8.496300461777068e-05, "loss": 8.5158, "step": 35264, "throughput": 8883.975344967028 }, { "epoch": 0.5532331758085879, "grad_norm": 0.08146160840988159, "learning_rate": 8.474239404841023e-05, "loss": 8.5038, "step": 35296, "throughput": 8883.938716291977 }, { "epoch": 0.5537347471375168, "grad_norm": 0.0752020925283432, "learning_rate": 8.452211446165458e-05, "loss": 8.5064, "step": 35328, "throughput": 8884.057691728514 }, { "epoch": 0.5542363184664456, "grad_norm": 0.0742310956120491, "learning_rate": 8.430216676593744e-05, "loss": 8.5308, "step": 35360, "throughput": 8884.159179951805 }, { "epoch": 0.5547378897953745, "grad_norm": 0.07772351056337357, "learning_rate": 8.408255186832372e-05, "loss": 8.5202, "step": 35392, "throughput": 8884.131018453878 }, { "epoch": 0.5552394611243034, "grad_norm": 0.07519405335187912, "learning_rate": 8.386327067450593e-05, "loss": 8.4915, "step": 35424, "throughput": 8884.115394230439 }, { "epoch": 0.5557410324532324, "grad_norm": 0.07935325801372528, "learning_rate": 8.36443240888004e-05, "loss": 8.5052, "step": 35456, "throughput": 8884.23882340746 }, { "epoch": 0.5562426037821613, "grad_norm": 0.07572668790817261, "learning_rate": 8.342571301414342e-05, "loss": 8.5201, "step": 35488, "throughput": 8884.340679096225 }, { "epoch": 0.5567441751110902, "grad_norm": 0.08227578550577164, "learning_rate": 8.320743835208775e-05, "loss": 8.522, "step": 35520, "throughput": 8884.341695444704 }, { "epoch": 0.5572457464400191, "grad_norm": 0.07324469834566116, "learning_rate": 8.298950100279872e-05, "loss": 8.5111, "step": 35552, "throughput": 8884.332478967055 }, { "epoch": 0.557747317768948, "grad_norm": 0.08193643391132355, "learning_rate": 8.27719018650507e-05, "loss": 8.5238, "step": 35584, "throughput": 8884.452878142642 }, { "epoch": 0.558248889097877, "grad_norm": 0.10601537674665451, "learning_rate": 8.255464183622304e-05, "loss": 8.5241, "step": 35616, "throughput": 8884.556082756277 }, { "epoch": 0.5587504604268059, "grad_norm": 0.0735933780670166, "learning_rate": 8.23377218122968e-05, "loss": 8.5082, "step": 35648, "throughput": 8884.528473292614 }, { "epoch": 0.5592520317557348, "grad_norm": 0.06966464966535568, "learning_rate": 8.212114268785083e-05, "loss": 8.4966, "step": 35680, "throughput": 8884.53071339156 }, { "epoch": 0.5597536030846637, "grad_norm": 0.0795019343495369, "learning_rate": 8.190490535605809e-05, "loss": 8.4768, "step": 35712, "throughput": 8884.643559526203 }, { "epoch": 0.5602551744135926, "grad_norm": 0.07111742347478867, "learning_rate": 8.16890107086819e-05, "loss": 8.494, "step": 35744, "throughput": 8884.744796587207 }, { "epoch": 0.5607567457425215, "grad_norm": 0.07843293994665146, "learning_rate": 8.14734596360725e-05, "loss": 8.5043, "step": 35776, "throughput": 8884.730667694086 }, { "epoch": 0.5612583170714504, "grad_norm": 0.08138687163591385, "learning_rate": 8.12582530271631e-05, "loss": 8.5186, "step": 35808, "throughput": 8884.735111118021 }, { "epoch": 0.5617598884003793, "grad_norm": 0.07690315693616867, "learning_rate": 8.104339176946648e-05, "loss": 8.477, "step": 35840, "throughput": 8884.85119216147 }, { "epoch": 0.5622614597293082, "grad_norm": 0.0783621296286583, "learning_rate": 8.082887674907099e-05, "loss": 8.4963, "step": 35872, "throughput": 8884.941615967653 }, { "epoch": 0.5627630310582371, "grad_norm": 0.07450364530086517, "learning_rate": 8.061470885063726e-05, "loss": 8.5131, "step": 35904, "throughput": 8884.948427501336 }, { "epoch": 0.563264602387166, "grad_norm": 0.08089811354875565, "learning_rate": 8.040088895739433e-05, "loss": 8.5175, "step": 35936, "throughput": 8884.96577940189 }, { "epoch": 0.5637661737160949, "grad_norm": 0.07151561975479126, "learning_rate": 8.018741795113614e-05, "loss": 8.5038, "step": 35968, "throughput": 8885.047550193507 }, { "epoch": 0.5642677450450239, "grad_norm": 0.07573382556438446, "learning_rate": 7.997429671221764e-05, "loss": 8.5013, "step": 36000, "throughput": 8885.163958594867 }, { "epoch": 0.5647693163739528, "grad_norm": 0.08782751113176346, "learning_rate": 7.97615261195515e-05, "loss": 8.5074, "step": 36032, "throughput": 8885.143885689009 }, { "epoch": 0.5652708877028817, "grad_norm": 0.07787059992551804, "learning_rate": 7.95491070506043e-05, "loss": 8.5219, "step": 36064, "throughput": 8885.151103577797 }, { "epoch": 0.5657724590318106, "grad_norm": 0.07755567133426666, "learning_rate": 7.933704038139292e-05, "loss": 8.4863, "step": 36096, "throughput": 8885.227813447 }, { "epoch": 0.5662740303607395, "grad_norm": 0.0700237974524498, "learning_rate": 7.912532698648089e-05, "loss": 8.4916, "step": 36128, "throughput": 8885.333697137732 }, { "epoch": 0.5667756016896685, "grad_norm": 0.071931391954422, "learning_rate": 7.891396773897487e-05, "loss": 8.4837, "step": 36160, "throughput": 8885.319873634005 }, { "epoch": 0.5672771730185974, "grad_norm": 0.09684596210718155, "learning_rate": 7.870296351052104e-05, "loss": 8.479, "step": 36192, "throughput": 8885.345592323565 }, { "epoch": 0.5677787443475263, "grad_norm": 0.07962139695882797, "learning_rate": 7.849231517130151e-05, "loss": 8.493, "step": 36224, "throughput": 8885.403833978344 }, { "epoch": 0.5682803156764551, "grad_norm": 0.12747646868228912, "learning_rate": 7.828202359003058e-05, "loss": 8.4928, "step": 36256, "throughput": 8885.521391567749 }, { "epoch": 0.568781887005384, "grad_norm": 0.07479926198720932, "learning_rate": 7.807208963395139e-05, "loss": 8.4844, "step": 36288, "throughput": 8885.518655107615 }, { "epoch": 0.5692834583343129, "grad_norm": 0.07229211181402206, "learning_rate": 7.786251416883218e-05, "loss": 8.4924, "step": 36320, "throughput": 8885.550054180885 }, { "epoch": 0.5697850296632418, "grad_norm": 0.07093744724988937, "learning_rate": 7.765329805896287e-05, "loss": 8.5047, "step": 36352, "throughput": 8885.623178166292 }, { "epoch": 0.5702866009921708, "grad_norm": 0.07500205934047699, "learning_rate": 7.744444216715117e-05, "loss": 8.5033, "step": 36384, "throughput": 8885.710608474703 }, { "epoch": 0.5707881723210997, "grad_norm": 0.08139976114034653, "learning_rate": 7.723594735471952e-05, "loss": 8.5042, "step": 36416, "throughput": 8885.703183113117 }, { "epoch": 0.5712897436500286, "grad_norm": 0.07652562111616135, "learning_rate": 7.702781448150109e-05, "loss": 8.4971, "step": 36448, "throughput": 8885.708370268474 }, { "epoch": 0.5717913149789575, "grad_norm": 0.07908913493156433, "learning_rate": 7.682004440583654e-05, "loss": 8.4896, "step": 36480, "throughput": 8885.771023556996 }, { "epoch": 0.5722928863078864, "grad_norm": 0.0737721249461174, "learning_rate": 7.661263798457014e-05, "loss": 8.4904, "step": 36512, "throughput": 8885.893284596656 }, { "epoch": 0.5727944576368154, "grad_norm": 0.07399065047502518, "learning_rate": 7.64055960730467e-05, "loss": 8.4715, "step": 36544, "throughput": 8885.89665086395 }, { "epoch": 0.5732960289657443, "grad_norm": 0.07493384927511215, "learning_rate": 7.619891952510763e-05, "loss": 8.5003, "step": 36576, "throughput": 8885.881908155181 }, { "epoch": 0.5737976002946732, "grad_norm": 0.08065405488014221, "learning_rate": 7.599260919308764e-05, "loss": 8.4902, "step": 36608, "throughput": 8885.931565297993 }, { "epoch": 0.5742991716236021, "grad_norm": 0.07617328315973282, "learning_rate": 7.578666592781114e-05, "loss": 8.4989, "step": 36640, "throughput": 8886.04701726584 }, { "epoch": 0.574800742952531, "grad_norm": 0.07760798931121826, "learning_rate": 7.558109057858874e-05, "loss": 8.478, "step": 36672, "throughput": 8886.097106253352 }, { "epoch": 0.5753023142814598, "grad_norm": 0.07901562750339508, "learning_rate": 7.53758839932139e-05, "loss": 8.4934, "step": 36704, "throughput": 8886.058945302642 }, { "epoch": 0.5758038856103888, "grad_norm": 0.07319517433643341, "learning_rate": 7.517104701795905e-05, "loss": 8.4736, "step": 36736, "throughput": 8886.096675370176 }, { "epoch": 0.5763054569393177, "grad_norm": 0.07419974356889725, "learning_rate": 7.496658049757255e-05, "loss": 8.4854, "step": 36768, "throughput": 8886.215806089569 }, { "epoch": 0.5768070282682466, "grad_norm": 0.08911358565092087, "learning_rate": 7.476248527527492e-05, "loss": 8.4836, "step": 36800, "throughput": 8886.26092789472 }, { "epoch": 0.5773085995971755, "grad_norm": 0.07458078861236572, "learning_rate": 7.455876219275552e-05, "loss": 8.4766, "step": 36832, "throughput": 8886.227500957024 }, { "epoch": 0.5778101709261044, "grad_norm": 0.07872738689184189, "learning_rate": 7.435541209016885e-05, "loss": 8.5004, "step": 36864, "throughput": 8886.277084685154 }, { "epoch": 0.5783117422550333, "grad_norm": 0.07543252408504486, "learning_rate": 7.415243580613134e-05, "loss": 8.4784, "step": 36896, "throughput": 8885.820176886395 }, { "epoch": 0.5788133135839623, "grad_norm": 0.06923045217990875, "learning_rate": 7.394983417771791e-05, "loss": 8.4983, "step": 36928, "throughput": 8885.825564564371 }, { "epoch": 0.5793148849128912, "grad_norm": 0.08396708220243454, "learning_rate": 7.374760804045815e-05, "loss": 8.5045, "step": 36960, "throughput": 8885.798962338915 }, { "epoch": 0.5798164562418201, "grad_norm": 0.07503468543291092, "learning_rate": 7.354575822833331e-05, "loss": 8.4812, "step": 36992, "throughput": 8885.86703467302 }, { "epoch": 0.580318027570749, "grad_norm": 0.08438971638679504, "learning_rate": 7.334428557377258e-05, "loss": 8.5011, "step": 37024, "throughput": 8885.979098155063 }, { "epoch": 0.5808195988996779, "grad_norm": 0.07898327708244324, "learning_rate": 7.314319090764985e-05, "loss": 8.4963, "step": 37056, "throughput": 8886.007212733173 }, { "epoch": 0.5813211702286069, "grad_norm": 0.07942887395620346, "learning_rate": 7.294247505928003e-05, "loss": 8.4917, "step": 37088, "throughput": 8885.973764510194 }, { "epoch": 0.5818227415575358, "grad_norm": 0.08021794259548187, "learning_rate": 7.274213885641592e-05, "loss": 8.482, "step": 37120, "throughput": 8886.027117021331 }, { "epoch": 0.5823243128864646, "grad_norm": 0.07906454056501389, "learning_rate": 7.254218312524461e-05, "loss": 8.4884, "step": 37152, "throughput": 8886.159000057156 }, { "epoch": 0.5828258842153935, "grad_norm": 0.071269690990448, "learning_rate": 7.234260869038417e-05, "loss": 8.5017, "step": 37184, "throughput": 8886.220635883392 }, { "epoch": 0.5833274555443224, "grad_norm": 0.07759478688240051, "learning_rate": 7.214341637488007e-05, "loss": 8.5069, "step": 37216, "throughput": 8886.22728503575 }, { "epoch": 0.5838290268732513, "grad_norm": 0.07550026476383209, "learning_rate": 7.194460700020206e-05, "loss": 8.4778, "step": 37248, "throughput": 8886.241364373975 }, { "epoch": 0.5843305982021803, "grad_norm": 0.07170677930116653, "learning_rate": 7.174618138624058e-05, "loss": 8.4978, "step": 37280, "throughput": 8886.359489058588 }, { "epoch": 0.5848321695311092, "grad_norm": 0.08358849585056305, "learning_rate": 7.154814035130351e-05, "loss": 8.4932, "step": 37312, "throughput": 8886.402678647795 }, { "epoch": 0.5853337408600381, "grad_norm": 0.0761912614107132, "learning_rate": 7.135048471211257e-05, "loss": 8.4841, "step": 37344, "throughput": 8886.419195436345 }, { "epoch": 0.585835312188967, "grad_norm": 0.07179554551839828, "learning_rate": 7.115321528380024e-05, "loss": 8.5062, "step": 37376, "throughput": 8886.407493711495 }, { "epoch": 0.5863368835178959, "grad_norm": 0.10507599264383316, "learning_rate": 7.095633287990622e-05, "loss": 8.4877, "step": 37408, "throughput": 8886.557576944077 }, { "epoch": 0.5868384548468248, "grad_norm": 0.08583709597587585, "learning_rate": 7.075983831237421e-05, "loss": 8.4672, "step": 37440, "throughput": 8886.628687314525 }, { "epoch": 0.5873400261757538, "grad_norm": 0.07975829392671585, "learning_rate": 7.056373239154826e-05, "loss": 8.4819, "step": 37472, "throughput": 8886.650496913231 }, { "epoch": 0.5878415975046827, "grad_norm": 0.07645593583583832, "learning_rate": 7.036801592616982e-05, "loss": 8.4568, "step": 37504, "throughput": 8886.638514566923 }, { "epoch": 0.5883431688336116, "grad_norm": 0.07267199456691742, "learning_rate": 7.017268972337419e-05, "loss": 8.458, "step": 37536, "throughput": 8886.790797617663 }, { "epoch": 0.5888447401625405, "grad_norm": 0.07526517659425735, "learning_rate": 6.997775458868724e-05, "loss": 8.4878, "step": 37568, "throughput": 8886.852398981955 }, { "epoch": 0.5893463114914693, "grad_norm": 0.07897040247917175, "learning_rate": 6.978321132602197e-05, "loss": 8.4842, "step": 37600, "throughput": 8886.843188088478 }, { "epoch": 0.5898478828203982, "grad_norm": 0.08200077712535858, "learning_rate": 6.95890607376754e-05, "loss": 8.4688, "step": 37632, "throughput": 8886.852773489969 }, { "epoch": 0.5903494541493272, "grad_norm": 0.07454876601696014, "learning_rate": 6.939530362432513e-05, "loss": 8.4862, "step": 37664, "throughput": 8887.001803979198 }, { "epoch": 0.5908510254782561, "grad_norm": 0.08084976673126221, "learning_rate": 6.920194078502611e-05, "loss": 8.4805, "step": 37696, "throughput": 8887.072771566227 }, { "epoch": 0.591352596807185, "grad_norm": 0.07187534868717194, "learning_rate": 6.900897301720721e-05, "loss": 8.4855, "step": 37728, "throughput": 8887.058230712222 }, { "epoch": 0.5918541681361139, "grad_norm": 0.06797279417514801, "learning_rate": 6.881640111666807e-05, "loss": 8.5012, "step": 37760, "throughput": 8887.052989800553 }, { "epoch": 0.5923557394650428, "grad_norm": 0.07955506443977356, "learning_rate": 6.862422587757581e-05, "loss": 8.4873, "step": 37792, "throughput": 8887.19590824548 }, { "epoch": 0.5928573107939717, "grad_norm": 0.07935384660959244, "learning_rate": 6.843244809246173e-05, "loss": 8.5038, "step": 37824, "throughput": 8887.268535404251 }, { "epoch": 0.5933588821229007, "grad_norm": 0.07026589661836624, "learning_rate": 6.824106855221788e-05, "loss": 8.485, "step": 37856, "throughput": 8887.246180262757 }, { "epoch": 0.5938604534518296, "grad_norm": 0.07547298073768616, "learning_rate": 6.805008804609411e-05, "loss": 8.48, "step": 37888, "throughput": 8887.253621811145 }, { "epoch": 0.5943620247807585, "grad_norm": 0.08095958828926086, "learning_rate": 6.78595073616946e-05, "loss": 8.476, "step": 37920, "throughput": 8887.366914831318 }, { "epoch": 0.5948635961096874, "grad_norm": 0.08343052119016647, "learning_rate": 6.766932728497468e-05, "loss": 8.501, "step": 37952, "throughput": 8887.442337101993 }, { "epoch": 0.5953651674386163, "grad_norm": 0.07000665366649628, "learning_rate": 6.747954860023746e-05, "loss": 8.5003, "step": 37984, "throughput": 8887.4245648121 }, { "epoch": 0.5958667387675451, "grad_norm": 0.07254104316234589, "learning_rate": 6.729017209013086e-05, "loss": 8.4894, "step": 38016, "throughput": 8887.438706603321 }, { "epoch": 0.5963683100964741, "grad_norm": 0.07622817903757095, "learning_rate": 6.710119853564422e-05, "loss": 8.495, "step": 38048, "throughput": 8887.558373040401 }, { "epoch": 0.596869881425403, "grad_norm": 0.09238285571336746, "learning_rate": 6.69126287161049e-05, "loss": 8.5041, "step": 38080, "throughput": 8887.620834143852 }, { "epoch": 0.5973714527543319, "grad_norm": 0.08314980566501617, "learning_rate": 6.672446340917553e-05, "loss": 8.4884, "step": 38112, "throughput": 8887.602805809758 }, { "epoch": 0.5978730240832608, "grad_norm": 0.08117211610078812, "learning_rate": 6.653670339085031e-05, "loss": 8.4792, "step": 38144, "throughput": 8887.628870528435 }, { "epoch": 0.5983745954121897, "grad_norm": 0.0752682313323021, "learning_rate": 6.634934943545217e-05, "loss": 8.4863, "step": 38176, "throughput": 8887.739052589919 }, { "epoch": 0.5988761667411187, "grad_norm": 0.07365549355745316, "learning_rate": 6.616240231562933e-05, "loss": 8.4621, "step": 38208, "throughput": 8887.788441557968 }, { "epoch": 0.5993777380700476, "grad_norm": 0.0774432122707367, "learning_rate": 6.597586280235227e-05, "loss": 8.491, "step": 38240, "throughput": 8887.777448927698 }, { "epoch": 0.5998793093989765, "grad_norm": 0.08132058382034302, "learning_rate": 6.578973166491053e-05, "loss": 8.475, "step": 38272, "throughput": 8887.796764760067 }, { "epoch": 0.6003808807279054, "grad_norm": 0.07279162108898163, "learning_rate": 6.560400967090948e-05, "loss": 8.4748, "step": 38304, "throughput": 8887.91410213565 }, { "epoch": 0.6008824520568343, "grad_norm": 0.07235167175531387, "learning_rate": 6.54186975862671e-05, "loss": 8.4689, "step": 38336, "throughput": 8887.977423903028 }, { "epoch": 0.6013840233857632, "grad_norm": 0.07213466614484787, "learning_rate": 6.523379617521104e-05, "loss": 8.4697, "step": 38368, "throughput": 8887.960699058545 }, { "epoch": 0.6018855947146922, "grad_norm": 0.0761331170797348, "learning_rate": 6.504930620027524e-05, "loss": 8.471, "step": 38400, "throughput": 8887.9758169686 }, { "epoch": 0.6023871660436211, "grad_norm": 0.07972035557031631, "learning_rate": 6.486522842229692e-05, "loss": 8.4779, "step": 38432, "throughput": 8888.093491518159 }, { "epoch": 0.6028887373725499, "grad_norm": 0.07006843388080597, "learning_rate": 6.468156360041337e-05, "loss": 8.4802, "step": 38464, "throughput": 8888.155714870378 }, { "epoch": 0.6033903087014788, "grad_norm": 0.07934440672397614, "learning_rate": 6.449831249205887e-05, "loss": 8.4662, "step": 38496, "throughput": 8888.122112787783 }, { "epoch": 0.6038918800304077, "grad_norm": 0.07978641241788864, "learning_rate": 6.431547585296156e-05, "loss": 8.4644, "step": 38528, "throughput": 8888.136088222396 }, { "epoch": 0.6043934513593366, "grad_norm": 0.08016802370548248, "learning_rate": 6.413305443714022e-05, "loss": 8.4664, "step": 38560, "throughput": 8888.255943168042 }, { "epoch": 0.6048950226882656, "grad_norm": 0.0753985047340393, "learning_rate": 6.395104899690134e-05, "loss": 8.4689, "step": 38592, "throughput": 8888.334236582781 }, { "epoch": 0.6053965940171945, "grad_norm": 0.08145113289356232, "learning_rate": 6.37694602828359e-05, "loss": 8.4775, "step": 38624, "throughput": 8888.327950078981 }, { "epoch": 0.6058981653461234, "grad_norm": 0.07525403797626495, "learning_rate": 6.358828904381632e-05, "loss": 8.4654, "step": 38656, "throughput": 8888.313105008327 }, { "epoch": 0.6063997366750523, "grad_norm": 0.08340601623058319, "learning_rate": 6.340753602699327e-05, "loss": 8.4894, "step": 38688, "throughput": 8888.417408381963 }, { "epoch": 0.6069013080039812, "grad_norm": 0.09112949669361115, "learning_rate": 6.322720197779275e-05, "loss": 8.4926, "step": 38720, "throughput": 8888.502338303459 }, { "epoch": 0.6074028793329102, "grad_norm": 0.07264062762260437, "learning_rate": 6.304728763991291e-05, "loss": 8.4727, "step": 38752, "throughput": 8888.501994446655 }, { "epoch": 0.6079044506618391, "grad_norm": 0.07016344368457794, "learning_rate": 6.286779375532107e-05, "loss": 8.4899, "step": 38784, "throughput": 8888.494497272626 }, { "epoch": 0.608406021990768, "grad_norm": 0.0778326466679573, "learning_rate": 6.268872106425044e-05, "loss": 8.4491, "step": 38816, "throughput": 8888.587600647677 }, { "epoch": 0.6089075933196969, "grad_norm": 0.06960074603557587, "learning_rate": 6.25100703051974e-05, "loss": 8.4596, "step": 38848, "throughput": 8888.669944429052 }, { "epoch": 0.6094091646486258, "grad_norm": 0.07572120428085327, "learning_rate": 6.233184221491818e-05, "loss": 8.4787, "step": 38880, "throughput": 8888.687934410838 }, { "epoch": 0.6099107359775546, "grad_norm": 0.0762973502278328, "learning_rate": 6.2154037528426e-05, "loss": 8.4591, "step": 38912, "throughput": 8888.677327994048 }, { "epoch": 0.6104123073064835, "grad_norm": 0.08137981593608856, "learning_rate": 6.197665697898784e-05, "loss": 8.4705, "step": 38944, "throughput": 8888.243099346064 }, { "epoch": 0.6109138786354125, "grad_norm": 0.08791758865118027, "learning_rate": 6.179970129812166e-05, "loss": 8.4666, "step": 38976, "throughput": 8888.328760115613 }, { "epoch": 0.6114154499643414, "grad_norm": 0.07459740340709686, "learning_rate": 6.16231712155932e-05, "loss": 8.4809, "step": 39008, "throughput": 8888.300833774185 }, { "epoch": 0.6119170212932703, "grad_norm": 0.08468503504991531, "learning_rate": 6.144706745941308e-05, "loss": 8.4617, "step": 39040, "throughput": 8888.288480514897 }, { "epoch": 0.6124185926221992, "grad_norm": 0.0737047791481018, "learning_rate": 6.127139075583363e-05, "loss": 8.4671, "step": 39072, "throughput": 8888.391755677107 }, { "epoch": 0.6129201639511281, "grad_norm": 0.07463378459215164, "learning_rate": 6.109614182934616e-05, "loss": 8.4668, "step": 39104, "throughput": 8888.47370055962 }, { "epoch": 0.6134217352800571, "grad_norm": 0.07298613339662552, "learning_rate": 6.092132140267775e-05, "loss": 8.4549, "step": 39136, "throughput": 8888.460213971857 }, { "epoch": 0.613923306608986, "grad_norm": 0.09070611000061035, "learning_rate": 6.074693019678839e-05, "loss": 8.4759, "step": 39168, "throughput": 8888.464936901131 }, { "epoch": 0.6144248779379149, "grad_norm": 0.07279788702726364, "learning_rate": 6.0572968930867827e-05, "loss": 8.4575, "step": 39200, "throughput": 8888.570133881885 }, { "epoch": 0.6149264492668438, "grad_norm": 0.08355917036533356, "learning_rate": 6.039943832233293e-05, "loss": 8.4659, "step": 39232, "throughput": 8888.641314332495 }, { "epoch": 0.6154280205957727, "grad_norm": 0.07693200558423996, "learning_rate": 6.022633908682442e-05, "loss": 8.4661, "step": 39264, "throughput": 8888.600508933388 }, { "epoch": 0.6159295919247016, "grad_norm": 0.08254023641347885, "learning_rate": 6.005367193820408e-05, "loss": 8.4609, "step": 39296, "throughput": 8888.611166639346 }, { "epoch": 0.6164311632536306, "grad_norm": 0.07168231159448624, "learning_rate": 5.9881437588551675e-05, "loss": 8.4665, "step": 39328, "throughput": 8888.713888915134 }, { "epoch": 0.6169327345825594, "grad_norm": 0.08645154535770416, "learning_rate": 5.970963674816224e-05, "loss": 8.4573, "step": 39360, "throughput": 8888.808883024485 }, { "epoch": 0.6174343059114883, "grad_norm": 0.0723043829202652, "learning_rate": 5.953827012554291e-05, "loss": 8.4695, "step": 39392, "throughput": 8888.755871735146 }, { "epoch": 0.6179358772404172, "grad_norm": 0.08425875008106232, "learning_rate": 5.9367338427410197e-05, "loss": 8.4644, "step": 39424, "throughput": 8888.740088349667 }, { "epoch": 0.6184374485693461, "grad_norm": 0.10152871906757355, "learning_rate": 5.9196842358686866e-05, "loss": 8.4703, "step": 39456, "throughput": 8888.853319781223 }, { "epoch": 0.618939019898275, "grad_norm": 0.07223829627037048, "learning_rate": 5.902678262249923e-05, "loss": 8.4706, "step": 39488, "throughput": 8888.957989404042 }, { "epoch": 0.619440591227204, "grad_norm": 0.0781422033905983, "learning_rate": 5.885715992017419e-05, "loss": 8.4563, "step": 39520, "throughput": 8888.928558727486 }, { "epoch": 0.6199421625561329, "grad_norm": 0.07951901853084564, "learning_rate": 5.86879749512362e-05, "loss": 8.4608, "step": 39552, "throughput": 8888.908106562007 }, { "epoch": 0.6204437338850618, "grad_norm": 0.07194496691226959, "learning_rate": 5.851922841340461e-05, "loss": 8.4655, "step": 39584, "throughput": 8889.00558454673 }, { "epoch": 0.6209453052139907, "grad_norm": 0.07164505869150162, "learning_rate": 5.835092100259063e-05, "loss": 8.4528, "step": 39616, "throughput": 8889.105516908532 }, { "epoch": 0.6214468765429196, "grad_norm": 0.07270708680152893, "learning_rate": 5.818305341289458e-05, "loss": 8.4718, "step": 39648, "throughput": 8889.086944760817 }, { "epoch": 0.6219484478718486, "grad_norm": 0.08275506645441055, "learning_rate": 5.8015626336602814e-05, "loss": 8.4544, "step": 39680, "throughput": 8889.103020213392 }, { "epoch": 0.6224500192007775, "grad_norm": 0.07898057997226715, "learning_rate": 5.7848640464185124e-05, "loss": 8.4759, "step": 39712, "throughput": 8889.179327476135 }, { "epoch": 0.6229515905297064, "grad_norm": 0.07599367201328278, "learning_rate": 5.768209648429174e-05, "loss": 8.4706, "step": 39744, "throughput": 8889.26222452118 }, { "epoch": 0.6234531618586353, "grad_norm": 0.07903989404439926, "learning_rate": 5.751599508375059e-05, "loss": 8.4711, "step": 39776, "throughput": 8889.247412514856 }, { "epoch": 0.6239547331875641, "grad_norm": 0.07619236409664154, "learning_rate": 5.735033694756423e-05, "loss": 8.4552, "step": 39808, "throughput": 8889.229682914522 }, { "epoch": 0.624456304516493, "grad_norm": 0.07185118645429611, "learning_rate": 5.718512275890737e-05, "loss": 8.4543, "step": 39840, "throughput": 8889.34296320931 }, { "epoch": 0.624957875845422, "grad_norm": 0.07389679551124573, "learning_rate": 5.70203531991238e-05, "loss": 8.4598, "step": 39872, "throughput": 8889.437729760273 }, { "epoch": 0.6254594471743509, "grad_norm": 0.08193694055080414, "learning_rate": 5.6856028947723734e-05, "loss": 8.4619, "step": 39904, "throughput": 8889.412655528009 }, { "epoch": 0.6259610185032798, "grad_norm": 0.0743367001414299, "learning_rate": 5.669215068238075e-05, "loss": 8.4472, "step": 39936, "throughput": 8889.431497923688 }, { "epoch": 0.6264625898322087, "grad_norm": 0.07348310202360153, "learning_rate": 5.652871907892934e-05, "loss": 8.4661, "step": 39968, "throughput": 8889.521056001866 }, { "epoch": 0.6269641611611376, "grad_norm": 0.07830830663442612, "learning_rate": 5.6365734811362026e-05, "loss": 8.4629, "step": 40000, "throughput": 8889.62522923375 }, { "epoch": 0.6274657324900665, "grad_norm": 0.08075550198554993, "learning_rate": 5.620319855182629e-05, "loss": 8.4526, "step": 40032, "throughput": 8889.585173448973 }, { "epoch": 0.6279673038189955, "grad_norm": 0.0775223821401596, "learning_rate": 5.60411109706222e-05, "loss": 8.4293, "step": 40064, "throughput": 8889.569407062347 }, { "epoch": 0.6284688751479244, "grad_norm": 0.07300705462694168, "learning_rate": 5.587947273619938e-05, "loss": 8.4419, "step": 40096, "throughput": 8889.694101400937 }, { "epoch": 0.6289704464768533, "grad_norm": 0.07574678957462311, "learning_rate": 5.5718284515154476e-05, "loss": 8.4488, "step": 40128, "throughput": 8889.792966352983 }, { "epoch": 0.6294720178057822, "grad_norm": 0.07535702735185623, "learning_rate": 5.5557546972228114e-05, "loss": 8.4759, "step": 40160, "throughput": 8889.779889465628 }, { "epoch": 0.6299735891347111, "grad_norm": 0.08066742867231369, "learning_rate": 5.539726077030239e-05, "loss": 8.4689, "step": 40192, "throughput": 8889.748342331195 }, { "epoch": 0.63047516046364, "grad_norm": 0.07345551997423172, "learning_rate": 5.523742657039809e-05, "loss": 8.4555, "step": 40224, "throughput": 8889.869626650914 }, { "epoch": 0.6309767317925689, "grad_norm": 0.07321788370609283, "learning_rate": 5.5078045031672005e-05, "loss": 8.4845, "step": 40256, "throughput": 8889.95961121878 }, { "epoch": 0.6314783031214978, "grad_norm": 0.06991337984800339, "learning_rate": 5.491911681141394e-05, "loss": 8.4488, "step": 40288, "throughput": 8889.946034390981 }, { "epoch": 0.6319798744504267, "grad_norm": 0.07396293431520462, "learning_rate": 5.476064256504443e-05, "loss": 8.487, "step": 40320, "throughput": 8889.93125361081 }, { "epoch": 0.6324814457793556, "grad_norm": 0.07787778973579407, "learning_rate": 5.460262294611172e-05, "loss": 8.471, "step": 40352, "throughput": 8890.03715925217 }, { "epoch": 0.6329830171082845, "grad_norm": 0.0852857232093811, "learning_rate": 5.444505860628923e-05, "loss": 8.4354, "step": 40384, "throughput": 8890.129414989433 }, { "epoch": 0.6334845884372134, "grad_norm": 0.07648869603872299, "learning_rate": 5.428795019537268e-05, "loss": 8.4452, "step": 40416, "throughput": 8890.106046954319 }, { "epoch": 0.6339861597661424, "grad_norm": 0.07862301170825958, "learning_rate": 5.413129836127766e-05, "loss": 8.4576, "step": 40448, "throughput": 8890.111296393472 }, { "epoch": 0.6344877310950713, "grad_norm": 0.07297796756029129, "learning_rate": 5.3975103750036805e-05, "loss": 8.4432, "step": 40480, "throughput": 8890.172831193517 }, { "epoch": 0.6349893024240002, "grad_norm": 0.08166255801916122, "learning_rate": 5.3819367005797186e-05, "loss": 8.4398, "step": 40512, "throughput": 8890.25577196834 }, { "epoch": 0.6354908737529291, "grad_norm": 0.07448875904083252, "learning_rate": 5.366408877081752e-05, "loss": 8.4707, "step": 40544, "throughput": 8890.241049888195 }, { "epoch": 0.635992445081858, "grad_norm": 0.07577415555715561, "learning_rate": 5.3509269685465764e-05, "loss": 8.4439, "step": 40576, "throughput": 8890.287344628367 }, { "epoch": 0.636494016410787, "grad_norm": 0.07667967677116394, "learning_rate": 5.3354910388216274e-05, "loss": 8.4611, "step": 40608, "throughput": 8890.303070975244 }, { "epoch": 0.6369955877397159, "grad_norm": 0.07609532028436661, "learning_rate": 5.3201011515647276e-05, "loss": 8.4667, "step": 40640, "throughput": 8890.394540219058 }, { "epoch": 0.6374971590686448, "grad_norm": 0.07594712823629379, "learning_rate": 5.304757370243811e-05, "loss": 8.4443, "step": 40672, "throughput": 8890.380144757337 }, { "epoch": 0.6379987303975736, "grad_norm": 0.07783151417970657, "learning_rate": 5.2894597581366835e-05, "loss": 8.4683, "step": 40704, "throughput": 8890.426500048565 }, { "epoch": 0.6385003017265025, "grad_norm": 0.08565002679824829, "learning_rate": 5.274208378330737e-05, "loss": 8.467, "step": 40736, "throughput": 8890.456915042812 }, { "epoch": 0.6390018730554314, "grad_norm": 0.07598944753408432, "learning_rate": 5.2590032937227154e-05, "loss": 8.4597, "step": 40768, "throughput": 8890.551799466513 }, { "epoch": 0.6395034443843604, "grad_norm": 0.08324527740478516, "learning_rate": 5.2438445670184244e-05, "loss": 8.4601, "step": 40800, "throughput": 8890.510488344746 }, { "epoch": 0.6400050157132893, "grad_norm": 0.0747150406241417, "learning_rate": 5.2287322607325e-05, "loss": 8.4416, "step": 40832, "throughput": 8890.561222276989 }, { "epoch": 0.6405065870422182, "grad_norm": 0.07257223129272461, "learning_rate": 5.213666437188141e-05, "loss": 8.4731, "step": 40864, "throughput": 8890.597776004659 }, { "epoch": 0.6410081583711471, "grad_norm": 0.07314898818731308, "learning_rate": 5.1986471585168485e-05, "loss": 8.4614, "step": 40896, "throughput": 8890.689564843153 }, { "epoch": 0.641509729700076, "grad_norm": 0.08105824142694473, "learning_rate": 5.183674486658167e-05, "loss": 8.4763, "step": 40928, "throughput": 8890.654920890887 }, { "epoch": 0.6420113010290049, "grad_norm": 0.07510533183813095, "learning_rate": 5.168748483359445e-05, "loss": 8.4648, "step": 40960, "throughput": 8890.6647160896 }, { "epoch": 0.6425128723579339, "grad_norm": 0.07071134448051453, "learning_rate": 5.153869210175563e-05, "loss": 8.4359, "step": 40992, "throughput": 8890.230400694345 }, { "epoch": 0.6430144436868628, "grad_norm": 0.07490675896406174, "learning_rate": 5.139036728468686e-05, "loss": 8.4446, "step": 41024, "throughput": 8890.323327806726 }, { "epoch": 0.6435160150157917, "grad_norm": 0.07903076708316803, "learning_rate": 5.124251099408012e-05, "loss": 8.434, "step": 41056, "throughput": 8890.298497593038 }, { "epoch": 0.6440175863447206, "grad_norm": 0.07559023797512054, "learning_rate": 5.1095123839695224e-05, "loss": 8.4566, "step": 41088, "throughput": 8890.331839698267 }, { "epoch": 0.6445191576736495, "grad_norm": 0.0739353820681572, "learning_rate": 5.0948206429357224e-05, "loss": 8.4676, "step": 41120, "throughput": 8890.38437114305 }, { "epoch": 0.6450207290025783, "grad_norm": 0.08174904435873032, "learning_rate": 5.080175936895392e-05, "loss": 8.4735, "step": 41152, "throughput": 8890.48658112442 }, { "epoch": 0.6455223003315073, "grad_norm": 0.0728902593255043, "learning_rate": 5.065578326243348e-05, "loss": 8.4228, "step": 41184, "throughput": 8890.48142922228 }, { "epoch": 0.6460238716604362, "grad_norm": 0.08640056103467941, "learning_rate": 5.0510278711801735e-05, "loss": 8.4599, "step": 41216, "throughput": 8890.523424199167 }, { "epoch": 0.6465254429893651, "grad_norm": 0.07627113163471222, "learning_rate": 5.036524631711996e-05, "loss": 8.4511, "step": 41248, "throughput": 8890.57459525624 }, { "epoch": 0.647027014318294, "grad_norm": 0.07577764242887497, "learning_rate": 5.02206866765021e-05, "loss": 8.4497, "step": 41280, "throughput": 8890.680407253796 }, { "epoch": 0.6475285856472229, "grad_norm": 0.07532326877117157, "learning_rate": 5.007660038611259e-05, "loss": 8.4625, "step": 41312, "throughput": 8890.682211363383 }, { "epoch": 0.6480301569761518, "grad_norm": 0.08512426912784576, "learning_rate": 4.9932988040163726e-05, "loss": 8.4481, "step": 41344, "throughput": 8890.730346071183 }, { "epoch": 0.6485317283050808, "grad_norm": 0.08049551397562027, "learning_rate": 4.978985023091324e-05, "loss": 8.4542, "step": 41376, "throughput": 8890.755837575718 }, { "epoch": 0.6490332996340097, "grad_norm": 0.0796920582652092, "learning_rate": 4.964718754866186e-05, "loss": 8.445, "step": 41408, "throughput": 8890.876073098967 }, { "epoch": 0.6495348709629386, "grad_norm": 0.07723706215620041, "learning_rate": 4.95050005817509e-05, "loss": 8.4584, "step": 41440, "throughput": 8890.840355837523 }, { "epoch": 0.6500364422918675, "grad_norm": 0.07665670663118362, "learning_rate": 4.936328991655988e-05, "loss": 8.4199, "step": 41472, "throughput": 8890.8515222639 }, { "epoch": 0.6505380136207964, "grad_norm": 0.07526352256536484, "learning_rate": 4.9222056137504e-05, "loss": 8.4412, "step": 41504, "throughput": 8890.912446640825 }, { "epoch": 0.6510395849497254, "grad_norm": 0.07671099901199341, "learning_rate": 4.908129982703169e-05, "loss": 8.4451, "step": 41536, "throughput": 8891.01823466129 }, { "epoch": 0.6515411562786543, "grad_norm": 0.07746397703886032, "learning_rate": 4.8941021565622516e-05, "loss": 8.4567, "step": 41568, "throughput": 8891.00199828533 }, { "epoch": 0.6520427276075831, "grad_norm": 0.07340981811285019, "learning_rate": 4.880122193178441e-05, "loss": 8.4557, "step": 41600, "throughput": 8890.99166793191 }, { "epoch": 0.652544298936512, "grad_norm": 0.07931915670633316, "learning_rate": 4.866190150205143e-05, "loss": 8.449, "step": 41632, "throughput": 8891.04096212233 }, { "epoch": 0.6530458702654409, "grad_norm": 0.07507026195526123, "learning_rate": 4.8523060850981476e-05, "loss": 8.4285, "step": 41664, "throughput": 8891.154000433016 }, { "epoch": 0.6535474415943698, "grad_norm": 0.0907602310180664, "learning_rate": 4.838470055115379e-05, "loss": 8.4494, "step": 41696, "throughput": 8891.133463958888 }, { "epoch": 0.6540490129232988, "grad_norm": 0.07654330134391785, "learning_rate": 4.82468211731667e-05, "loss": 8.4551, "step": 41728, "throughput": 8891.12942557528 }, { "epoch": 0.6545505842522277, "grad_norm": 0.07788608968257904, "learning_rate": 4.8109423285635116e-05, "loss": 8.453, "step": 41760, "throughput": 8891.176412367118 }, { "epoch": 0.6550521555811566, "grad_norm": 0.07579880207777023, "learning_rate": 4.797250745518833e-05, "loss": 8.4164, "step": 41792, "throughput": 8891.289322325178 }, { "epoch": 0.6555537269100855, "grad_norm": 0.07648367434740067, "learning_rate": 4.7836074246467685e-05, "loss": 8.4408, "step": 41824, "throughput": 8891.292242880736 }, { "epoch": 0.6560552982390144, "grad_norm": 0.0801176205277443, "learning_rate": 4.770012422212412e-05, "loss": 8.4428, "step": 41856, "throughput": 8891.281171123885 }, { "epoch": 0.6565568695679433, "grad_norm": 0.07864414155483246, "learning_rate": 4.756465794281592e-05, "loss": 8.4415, "step": 41888, "throughput": 8891.340641747001 }, { "epoch": 0.6570584408968723, "grad_norm": 0.0740278884768486, "learning_rate": 4.742967596720641e-05, "loss": 8.4544, "step": 41920, "throughput": 8891.435292238171 }, { "epoch": 0.6575600122258012, "grad_norm": 0.07727956771850586, "learning_rate": 4.729517885196169e-05, "loss": 8.4639, "step": 41952, "throughput": 8891.43753280489 }, { "epoch": 0.6580615835547301, "grad_norm": 0.08166259527206421, "learning_rate": 4.716116715174827e-05, "loss": 8.4382, "step": 41984, "throughput": 8891.458781093757 }, { "epoch": 0.6585631548836589, "grad_norm": 0.07395589351654053, "learning_rate": 4.702764141923075e-05, "loss": 8.4559, "step": 42016, "throughput": 8891.479298197422 }, { "epoch": 0.6590647262125878, "grad_norm": 0.0727098360657692, "learning_rate": 4.6894602205069674e-05, "loss": 8.4285, "step": 42048, "throughput": 8891.577574987778 }, { "epoch": 0.6595662975415167, "grad_norm": 0.08630625158548355, "learning_rate": 4.6762050057919165e-05, "loss": 8.4324, "step": 42080, "throughput": 8891.56434519639 }, { "epoch": 0.6600678688704457, "grad_norm": 0.07782085984945297, "learning_rate": 4.6629985524424686e-05, "loss": 8.4326, "step": 42112, "throughput": 8891.53842517495 }, { "epoch": 0.6605694401993746, "grad_norm": 0.07772604376077652, "learning_rate": 4.649840914922071e-05, "loss": 8.4428, "step": 42144, "throughput": 8891.597460590583 }, { "epoch": 0.6610710115283035, "grad_norm": 0.08418555557727814, "learning_rate": 4.636732147492863e-05, "loss": 8.4317, "step": 42176, "throughput": 8891.697936976874 }, { "epoch": 0.6615725828572324, "grad_norm": 0.08168166130781174, "learning_rate": 4.6236723042154424e-05, "loss": 8.4574, "step": 42208, "throughput": 8891.667655818983 }, { "epoch": 0.6620741541861613, "grad_norm": 0.07932674884796143, "learning_rate": 4.61066143894864e-05, "loss": 8.4471, "step": 42240, "throughput": 8891.666142559854 }, { "epoch": 0.6625757255150903, "grad_norm": 0.07798092812299728, "learning_rate": 4.5976996053492996e-05, "loss": 8.4351, "step": 42272, "throughput": 8891.69451428347 }, { "epoch": 0.6630772968440192, "grad_norm": 0.09541884064674377, "learning_rate": 4.5847868568720646e-05, "loss": 8.4264, "step": 42304, "throughput": 8891.79013699881 }, { "epoch": 0.6635788681729481, "grad_norm": 0.07872533798217773, "learning_rate": 4.571923246769147e-05, "loss": 8.45, "step": 42336, "throughput": 8891.764855345577 }, { "epoch": 0.664080439501877, "grad_norm": 0.07670172303915024, "learning_rate": 4.559108828090115e-05, "loss": 8.4414, "step": 42368, "throughput": 8891.80559471926 }, { "epoch": 0.6645820108308059, "grad_norm": 0.0734407901763916, "learning_rate": 4.546343653681667e-05, "loss": 8.4511, "step": 42400, "throughput": 8891.832799174283 }, { "epoch": 0.6650835821597348, "grad_norm": 0.10266581922769547, "learning_rate": 4.53362777618742e-05, "loss": 8.437, "step": 42432, "throughput": 8891.934778631321 }, { "epoch": 0.6655851534886637, "grad_norm": 0.07219371199607849, "learning_rate": 4.52096124804769e-05, "loss": 8.4346, "step": 42464, "throughput": 8891.932544586749 }, { "epoch": 0.6660867248175926, "grad_norm": 0.07945689558982849, "learning_rate": 4.508344121499281e-05, "loss": 8.4346, "step": 42496, "throughput": 8891.953683702157 }, { "epoch": 0.6665882961465215, "grad_norm": 0.08158594369888306, "learning_rate": 4.495776448575255e-05, "loss": 8.4336, "step": 42528, "throughput": 8891.976572119378 }, { "epoch": 0.6670898674754504, "grad_norm": 0.07030566781759262, "learning_rate": 4.483258281104734e-05, "loss": 8.4158, "step": 42560, "throughput": 8892.083303595728 }, { "epoch": 0.6675914388043793, "grad_norm": 0.07251957803964615, "learning_rate": 4.470789670712681e-05, "loss": 8.4341, "step": 42592, "throughput": 8892.073672250981 }, { "epoch": 0.6680930101333082, "grad_norm": 0.08236385881900787, "learning_rate": 4.458370668819676e-05, "loss": 8.4496, "step": 42624, "throughput": 8892.09051768536 }, { "epoch": 0.6685945814622372, "grad_norm": 0.0733921006321907, "learning_rate": 4.4460013266417226e-05, "loss": 8.424, "step": 42656, "throughput": 8892.135135286919 }, { "epoch": 0.6690961527911661, "grad_norm": 0.08383151143789291, "learning_rate": 4.433681695190027e-05, "loss": 8.4315, "step": 42688, "throughput": 8892.220307742966 }, { "epoch": 0.669597724120095, "grad_norm": 0.08101122826337814, "learning_rate": 4.421411825270785e-05, "loss": 8.4227, "step": 42720, "throughput": 8892.227313835552 }, { "epoch": 0.6700992954490239, "grad_norm": 0.07665781676769257, "learning_rate": 4.4091917674849727e-05, "loss": 8.4411, "step": 42752, "throughput": 8892.223930766586 }, { "epoch": 0.6706008667779528, "grad_norm": 0.07866701483726501, "learning_rate": 4.397021572228147e-05, "loss": 8.4472, "step": 42784, "throughput": 8892.283116147706 }, { "epoch": 0.6711024381068817, "grad_norm": 0.07769527286291122, "learning_rate": 4.38490128969023e-05, "loss": 8.4492, "step": 42816, "throughput": 8892.359819898718 }, { "epoch": 0.6716040094358107, "grad_norm": 0.08288507908582687, "learning_rate": 4.3728309698553056e-05, "loss": 8.4514, "step": 42848, "throughput": 8892.378210684165 }, { "epoch": 0.6721055807647396, "grad_norm": 0.07571935653686523, "learning_rate": 4.3608106625014014e-05, "loss": 8.4338, "step": 42880, "throughput": 8892.355894409146 }, { "epoch": 0.6726071520936684, "grad_norm": 0.0770522728562355, "learning_rate": 4.348840417200306e-05, "loss": 8.4417, "step": 42912, "throughput": 8892.42663160015 }, { "epoch": 0.6731087234225973, "grad_norm": 0.08030234277248383, "learning_rate": 4.336920283317343e-05, "loss": 8.4531, "step": 42944, "throughput": 8892.508469753493 }, { "epoch": 0.6736102947515262, "grad_norm": 0.07389728724956512, "learning_rate": 4.325050310011183e-05, "loss": 8.4575, "step": 42976, "throughput": 8892.518853790318 }, { "epoch": 0.6741118660804551, "grad_norm": 0.08421860635280609, "learning_rate": 4.3132305462336306e-05, "loss": 8.4433, "step": 43008, "throughput": 8892.490140747977 }, { "epoch": 0.6746134374093841, "grad_norm": 0.07845675945281982, "learning_rate": 4.301461040729424e-05, "loss": 8.4634, "step": 43040, "throughput": 8892.038946444562 }, { "epoch": 0.675115008738313, "grad_norm": 0.08324997872114182, "learning_rate": 4.289741842036042e-05, "loss": 8.4343, "step": 43072, "throughput": 8892.10840247032 }, { "epoch": 0.6756165800672419, "grad_norm": 0.07667215168476105, "learning_rate": 4.2780729984834916e-05, "loss": 8.4246, "step": 43104, "throughput": 8892.101825268575 }, { "epoch": 0.6761181513961708, "grad_norm": 0.07493610680103302, "learning_rate": 4.266454558194122e-05, "loss": 8.4348, "step": 43136, "throughput": 8892.085081520685 }, { "epoch": 0.6766197227250997, "grad_norm": 0.08308251947164536, "learning_rate": 4.254886569082413e-05, "loss": 8.4182, "step": 43168, "throughput": 8892.149902078248 }, { "epoch": 0.6771212940540287, "grad_norm": 0.08159197866916656, "learning_rate": 4.243369078854788e-05, "loss": 8.4305, "step": 43200, "throughput": 8892.234243039149 }, { "epoch": 0.6776228653829576, "grad_norm": 0.08277013152837753, "learning_rate": 4.231902135009407e-05, "loss": 8.4528, "step": 43232, "throughput": 8892.228431107438 }, { "epoch": 0.6781244367118865, "grad_norm": 0.08104237169027328, "learning_rate": 4.220485784835984e-05, "loss": 8.4421, "step": 43264, "throughput": 8892.238283717748 }, { "epoch": 0.6786260080408154, "grad_norm": 0.07368003576993942, "learning_rate": 4.209120075415577e-05, "loss": 8.4157, "step": 43296, "throughput": 8892.29604650612 }, { "epoch": 0.6791275793697443, "grad_norm": 0.0873587504029274, "learning_rate": 4.197805053620411e-05, "loss": 8.4212, "step": 43328, "throughput": 8892.373143548699 }, { "epoch": 0.6796291506986731, "grad_norm": 0.08577149361371994, "learning_rate": 4.186540766113665e-05, "loss": 8.426, "step": 43360, "throughput": 8892.386226062543 }, { "epoch": 0.680130722027602, "grad_norm": 0.08065624535083771, "learning_rate": 4.1753272593492956e-05, "loss": 8.4365, "step": 43392, "throughput": 8892.378610413167 }, { "epoch": 0.680632293356531, "grad_norm": 0.07502314448356628, "learning_rate": 4.1641645795718364e-05, "loss": 8.4459, "step": 43424, "throughput": 8892.440894507208 }, { "epoch": 0.6811338646854599, "grad_norm": 0.09220045059919357, "learning_rate": 4.153052772816217e-05, "loss": 8.4139, "step": 43456, "throughput": 8892.508903647942 }, { "epoch": 0.6816354360143888, "grad_norm": 0.07826574891805649, "learning_rate": 4.141991884907555e-05, "loss": 8.4193, "step": 43488, "throughput": 8892.53030710533 }, { "epoch": 0.6821370073433177, "grad_norm": 0.07697897404432297, "learning_rate": 4.1309819614609865e-05, "loss": 8.4213, "step": 43520, "throughput": 8892.526230805855 }, { "epoch": 0.6826385786722466, "grad_norm": 0.07445234060287476, "learning_rate": 4.1200230478814695e-05, "loss": 8.4411, "step": 43552, "throughput": 8892.590827408416 }, { "epoch": 0.6831401500011756, "grad_norm": 0.07667402923107147, "learning_rate": 4.109115189363601e-05, "loss": 8.4357, "step": 43584, "throughput": 8892.659610304425 }, { "epoch": 0.6836417213301045, "grad_norm": 0.07593469321727753, "learning_rate": 4.0982584308914114e-05, "loss": 8.4079, "step": 43616, "throughput": 8892.660647639474 }, { "epoch": 0.6841432926590334, "grad_norm": 0.07302851229906082, "learning_rate": 4.0874528172382114e-05, "loss": 8.4365, "step": 43648, "throughput": 8892.622449682587 }, { "epoch": 0.6846448639879623, "grad_norm": 0.07740382850170135, "learning_rate": 4.0766983929663835e-05, "loss": 8.4103, "step": 43680, "throughput": 8892.69376033167 }, { "epoch": 0.6851464353168912, "grad_norm": 0.07721805572509766, "learning_rate": 4.065995202427206e-05, "loss": 8.4223, "step": 43712, "throughput": 8892.755478850933 }, { "epoch": 0.6856480066458202, "grad_norm": 0.07630830258131027, "learning_rate": 4.055343289760664e-05, "loss": 8.4292, "step": 43744, "throughput": 8892.764671392262 }, { "epoch": 0.6861495779747491, "grad_norm": 0.07535416632890701, "learning_rate": 4.0447426988952816e-05, "loss": 8.4037, "step": 43776, "throughput": 8892.720405218828 }, { "epoch": 0.6866511493036779, "grad_norm": 0.07637656480073929, "learning_rate": 4.0341934735479224e-05, "loss": 8.4254, "step": 43808, "throughput": 8892.78404823713 }, { "epoch": 0.6871527206326068, "grad_norm": 0.07621389627456665, "learning_rate": 4.02369565722363e-05, "loss": 8.4427, "step": 43840, "throughput": 8892.854481523502 }, { "epoch": 0.6876542919615357, "grad_norm": 0.07737398892641068, "learning_rate": 4.013249293215422e-05, "loss": 8.4014, "step": 43872, "throughput": 8892.86718784577 }, { "epoch": 0.6881558632904646, "grad_norm": 0.07027604430913925, "learning_rate": 4.0028544246041406e-05, "loss": 8.4149, "step": 43904, "throughput": 8892.83076939561 }, { "epoch": 0.6886574346193935, "grad_norm": 0.09703302383422852, "learning_rate": 3.99251109425825e-05, "loss": 8.4426, "step": 43936, "throughput": 8892.896175152631 }, { "epoch": 0.6891590059483225, "grad_norm": 0.07484853267669678, "learning_rate": 3.982219344833681e-05, "loss": 8.4334, "step": 43968, "throughput": 8892.970712885226 }, { "epoch": 0.6896605772772514, "grad_norm": 0.07486287504434586, "learning_rate": 3.971979218773634e-05, "loss": 8.4072, "step": 44000, "throughput": 8892.976506505798 }, { "epoch": 0.6901621486061803, "grad_norm": 0.08308301866054535, "learning_rate": 3.961790758308418e-05, "loss": 8.4238, "step": 44032, "throughput": 8892.957701269286 }, { "epoch": 0.6906637199351092, "grad_norm": 0.07927940040826797, "learning_rate": 3.951654005455281e-05, "loss": 8.4175, "step": 44064, "throughput": 8893.056101654616 }, { "epoch": 0.6911652912640381, "grad_norm": 0.07408682256937027, "learning_rate": 3.9415690020182154e-05, "loss": 8.4284, "step": 44096, "throughput": 8893.10282842892 }, { "epoch": 0.6916668625929671, "grad_norm": 0.07429683953523636, "learning_rate": 3.9315357895878066e-05, "loss": 8.42, "step": 44128, "throughput": 8893.090918747575 }, { "epoch": 0.692168433921896, "grad_norm": 0.07516713440418243, "learning_rate": 3.921554409541053e-05, "loss": 8.4188, "step": 44160, "throughput": 8893.090593852326 }, { "epoch": 0.6926700052508249, "grad_norm": 0.08680712431669235, "learning_rate": 3.911624903041198e-05, "loss": 8.4395, "step": 44192, "throughput": 8893.163611643442 }, { "epoch": 0.6931715765797538, "grad_norm": 0.07508208602666855, "learning_rate": 3.9017473110375525e-05, "loss": 8.4294, "step": 44224, "throughput": 8893.215364050175 }, { "epoch": 0.6936731479086826, "grad_norm": 0.07337779551744461, "learning_rate": 3.891921674265336e-05, "loss": 8.4095, "step": 44256, "throughput": 8893.217250426818 }, { "epoch": 0.6941747192376115, "grad_norm": 0.08711987733840942, "learning_rate": 3.8821480332455024e-05, "loss": 8.4182, "step": 44288, "throughput": 8893.222381878295 }, { "epoch": 0.6946762905665405, "grad_norm": 0.07527220249176025, "learning_rate": 3.87242642828458e-05, "loss": 8.4389, "step": 44320, "throughput": 8893.30610869026 }, { "epoch": 0.6951778618954694, "grad_norm": 0.0782904177904129, "learning_rate": 3.862756899474493e-05, "loss": 8.4264, "step": 44352, "throughput": 8893.35663963662 }, { "epoch": 0.6956794332243983, "grad_norm": 0.07315529882907867, "learning_rate": 3.853139486692408e-05, "loss": 8.3995, "step": 44384, "throughput": 8893.36900993904 }, { "epoch": 0.6961810045533272, "grad_norm": 0.07299330830574036, "learning_rate": 3.843574229600565e-05, "loss": 8.4172, "step": 44416, "throughput": 8893.364377678341 }, { "epoch": 0.6966825758822561, "grad_norm": 0.07816275954246521, "learning_rate": 3.834061167646112e-05, "loss": 8.4272, "step": 44448, "throughput": 8893.45437929874 }, { "epoch": 0.697184147211185, "grad_norm": 0.07753538340330124, "learning_rate": 3.8246003400609424e-05, "loss": 8.4189, "step": 44480, "throughput": 8893.522440354585 }, { "epoch": 0.697685718540114, "grad_norm": 0.0742424800992012, "learning_rate": 3.81519178586154e-05, "loss": 8.4166, "step": 44512, "throughput": 8893.504933907088 }, { "epoch": 0.6981872898690429, "grad_norm": 0.0793553963303566, "learning_rate": 3.805835543848809e-05, "loss": 8.4453, "step": 44544, "throughput": 8893.485443527716 }, { "epoch": 0.6986888611979718, "grad_norm": 0.07728642970323563, "learning_rate": 3.796531652607919e-05, "loss": 8.4371, "step": 44576, "throughput": 8893.568888989736 }, { "epoch": 0.6991904325269007, "grad_norm": 0.07519408315420151, "learning_rate": 3.7872801505081434e-05, "loss": 8.4439, "step": 44608, "throughput": 8893.645694995372 }, { "epoch": 0.6996920038558296, "grad_norm": 0.08204668760299683, "learning_rate": 3.778081075702709e-05, "loss": 8.414, "step": 44640, "throughput": 8893.618307605391 }, { "epoch": 0.7001935751847586, "grad_norm": 0.07789931446313858, "learning_rate": 3.7689344661286264e-05, "loss": 8.4418, "step": 44672, "throughput": 8893.589302884588 }, { "epoch": 0.7006951465136874, "grad_norm": 0.07528722286224365, "learning_rate": 3.759840359506536e-05, "loss": 8.4128, "step": 44704, "throughput": 8893.668691472745 }, { "epoch": 0.7011967178426163, "grad_norm": 0.07807424664497375, "learning_rate": 3.750798793340565e-05, "loss": 8.4278, "step": 44736, "throughput": 8893.748497852805 }, { "epoch": 0.7016982891715452, "grad_norm": 0.0780852809548378, "learning_rate": 3.7418098049181573e-05, "loss": 8.4391, "step": 44768, "throughput": 8893.740000518525 }, { "epoch": 0.7021998605004741, "grad_norm": 0.0890655443072319, "learning_rate": 3.732873431309929e-05, "loss": 8.416, "step": 44800, "throughput": 8893.753083047186 }, { "epoch": 0.702701431829403, "grad_norm": 0.07461878657341003, "learning_rate": 3.7239897093695106e-05, "loss": 8.4273, "step": 44832, "throughput": 8893.805946569892 }, { "epoch": 0.703203003158332, "grad_norm": 0.07440055161714554, "learning_rate": 3.715158675733396e-05, "loss": 8.427, "step": 44864, "throughput": 8893.89045700433 }, { "epoch": 0.7037045744872609, "grad_norm": 0.0728885680437088, "learning_rate": 3.706380366820796e-05, "loss": 8.4221, "step": 44896, "throughput": 8893.882995999113 }, { "epoch": 0.7042061458161898, "grad_norm": 0.09329033643007278, "learning_rate": 3.6976548188334834e-05, "loss": 8.4037, "step": 44928, "throughput": 8893.894693010705 }, { "epoch": 0.7047077171451187, "grad_norm": 0.07572057843208313, "learning_rate": 3.688982067755642e-05, "loss": 8.4042, "step": 44960, "throughput": 8893.955631621076 }, { "epoch": 0.7052092884740476, "grad_norm": 0.08463934808969498, "learning_rate": 3.680362149353724e-05, "loss": 8.4324, "step": 44992, "throughput": 8894.0305370298 }, { "epoch": 0.7057108598029765, "grad_norm": 0.0743819996714592, "learning_rate": 3.671795099176297e-05, "loss": 8.4116, "step": 45024, "throughput": 8894.01516265958 }, { "epoch": 0.7062124311319055, "grad_norm": 0.07929202914237976, "learning_rate": 3.6632809525539055e-05, "loss": 8.4251, "step": 45056, "throughput": 8894.018330030944 }, { "epoch": 0.7067140024608344, "grad_norm": 0.0749729722738266, "learning_rate": 3.6548197445989086e-05, "loss": 8.4302, "step": 45088, "throughput": 8893.599986004909 }, { "epoch": 0.7072155737897633, "grad_norm": 0.07228686660528183, "learning_rate": 3.6464115102053596e-05, "loss": 8.4133, "step": 45120, "throughput": 8893.675785171614 }, { "epoch": 0.7077171451186921, "grad_norm": 0.0786343514919281, "learning_rate": 3.6380562840488376e-05, "loss": 8.4365, "step": 45152, "throughput": 8893.641740279048 }, { "epoch": 0.708218716447621, "grad_norm": 0.0737488642334938, "learning_rate": 3.629754100586323e-05, "loss": 8.414, "step": 45184, "throughput": 8893.642569637646 }, { "epoch": 0.7087202877765499, "grad_norm": 0.08403093367815018, "learning_rate": 3.6215049940560433e-05, "loss": 8.4351, "step": 45216, "throughput": 8893.732444783913 }, { "epoch": 0.7092218591054789, "grad_norm": 0.08579878509044647, "learning_rate": 3.613308998477339e-05, "loss": 8.3874, "step": 45248, "throughput": 8893.810930803864 }, { "epoch": 0.7097234304344078, "grad_norm": 0.07760308682918549, "learning_rate": 3.605166147650517e-05, "loss": 8.4148, "step": 45280, "throughput": 8893.777941937868 }, { "epoch": 0.7102250017633367, "grad_norm": 0.0734131783246994, "learning_rate": 3.597076475156726e-05, "loss": 8.4353, "step": 45312, "throughput": 8893.765879205092 }, { "epoch": 0.7107265730922656, "grad_norm": 0.08017542213201523, "learning_rate": 3.589040014357791e-05, "loss": 8.4379, "step": 45344, "throughput": 8893.84987470408 }, { "epoch": 0.7112281444211945, "grad_norm": 0.08208679407835007, "learning_rate": 3.581056798396105e-05, "loss": 8.4242, "step": 45376, "throughput": 8893.941186582775 }, { "epoch": 0.7117297157501234, "grad_norm": 0.10507772862911224, "learning_rate": 3.57312686019447e-05, "loss": 8.4098, "step": 45408, "throughput": 8893.914640901407 }, { "epoch": 0.7122312870790524, "grad_norm": 0.07428514212369919, "learning_rate": 3.565250232455983e-05, "loss": 8.4264, "step": 45440, "throughput": 8893.898159260922 }, { "epoch": 0.7127328584079813, "grad_norm": 0.07359248399734497, "learning_rate": 3.55742694766387e-05, "loss": 8.4139, "step": 45472, "throughput": 8893.988018737435 }, { "epoch": 0.7132344297369102, "grad_norm": 0.07536429166793823, "learning_rate": 3.549657038081386e-05, "loss": 8.4189, "step": 45504, "throughput": 8894.083007954694 }, { "epoch": 0.7137360010658391, "grad_norm": 0.07599938660860062, "learning_rate": 3.5419405357516624e-05, "loss": 8.4067, "step": 45536, "throughput": 8894.044541761768 }, { "epoch": 0.714237572394768, "grad_norm": 0.07829944044351578, "learning_rate": 3.534277472497574e-05, "loss": 8.4119, "step": 45568, "throughput": 8894.038960063219 }, { "epoch": 0.7147391437236968, "grad_norm": 0.10090679675340652, "learning_rate": 3.52666787992162e-05, "loss": 8.432, "step": 45600, "throughput": 8894.131161951538 }, { "epoch": 0.7152407150526258, "grad_norm": 0.07359547913074493, "learning_rate": 3.519111789405779e-05, "loss": 8.4454, "step": 45632, "throughput": 8894.225585803943 }, { "epoch": 0.7157422863815547, "grad_norm": 0.07896874845027924, "learning_rate": 3.5116092321113936e-05, "loss": 8.4216, "step": 45664, "throughput": 8894.173687919589 }, { "epoch": 0.7162438577104836, "grad_norm": 0.08137936145067215, "learning_rate": 3.504160238979032e-05, "loss": 8.3998, "step": 45696, "throughput": 8894.15914533694 }, { "epoch": 0.7167454290394125, "grad_norm": 0.07841339707374573, "learning_rate": 3.496764840728361e-05, "loss": 8.3995, "step": 45728, "throughput": 8894.242613880711 }, { "epoch": 0.7172470003683414, "grad_norm": 0.09461364895105362, "learning_rate": 3.489423067858027e-05, "loss": 8.4117, "step": 45760, "throughput": 8894.331621507126 }, { "epoch": 0.7177485716972704, "grad_norm": 0.07874492555856705, "learning_rate": 3.4821349506455255e-05, "loss": 8.4268, "step": 45792, "throughput": 8894.298459594376 }, { "epoch": 0.7182501430261993, "grad_norm": 0.07716283202171326, "learning_rate": 3.47490051914707e-05, "loss": 8.3923, "step": 45824, "throughput": 8894.276196431043 }, { "epoch": 0.7187517143551282, "grad_norm": 0.08003325760364532, "learning_rate": 3.4677198031974784e-05, "loss": 8.4112, "step": 45856, "throughput": 8894.377815461386 }, { "epoch": 0.7192532856840571, "grad_norm": 0.08741440623998642, "learning_rate": 3.4605928324100444e-05, "loss": 8.4322, "step": 45888, "throughput": 8894.454749304441 }, { "epoch": 0.719754857012986, "grad_norm": 0.07573242485523224, "learning_rate": 3.45351963617642e-05, "loss": 8.4023, "step": 45920, "throughput": 8894.404858200647 }, { "epoch": 0.720256428341915, "grad_norm": 0.08988625556230545, "learning_rate": 3.446500243666481e-05, "loss": 8.432, "step": 45952, "throughput": 8894.370859969931 }, { "epoch": 0.7207579996708439, "grad_norm": 0.07446542382240295, "learning_rate": 3.439534683828228e-05, "loss": 8.4157, "step": 45984, "throughput": 8894.465354263342 }, { "epoch": 0.7212595709997727, "grad_norm": 0.07130029797554016, "learning_rate": 3.4326229853876475e-05, "loss": 8.4267, "step": 46016, "throughput": 8894.547165939282 }, { "epoch": 0.7217611423287016, "grad_norm": 0.09455982595682144, "learning_rate": 3.425765176848607e-05, "loss": 8.4167, "step": 46048, "throughput": 8894.505506216065 }, { "epoch": 0.7222627136576305, "grad_norm": 0.07977471500635147, "learning_rate": 3.418961286492728e-05, "loss": 8.4246, "step": 46080, "throughput": 8894.48630621797 }, { "epoch": 0.7227642849865594, "grad_norm": 0.08261588215827942, "learning_rate": 3.412211342379273e-05, "loss": 8.4299, "step": 46112, "throughput": 8894.578353839574 }, { "epoch": 0.7232658563154883, "grad_norm": 0.07853943109512329, "learning_rate": 3.405515372345033e-05, "loss": 8.415, "step": 46144, "throughput": 8894.650451220767 }, { "epoch": 0.7237674276444173, "grad_norm": 0.08436351269483566, "learning_rate": 3.398873404004209e-05, "loss": 8.4269, "step": 46176, "throughput": 8894.625718834395 }, { "epoch": 0.7242689989733462, "grad_norm": 0.0723554790019989, "learning_rate": 3.392285464748298e-05, "loss": 8.4267, "step": 46208, "throughput": 8894.640924683054 }, { "epoch": 0.7247705703022751, "grad_norm": 0.0779571607708931, "learning_rate": 3.385751581745979e-05, "loss": 8.4206, "step": 46240, "throughput": 8894.715659554004 }, { "epoch": 0.725272141631204, "grad_norm": 0.07471830397844315, "learning_rate": 3.379271781943007e-05, "loss": 8.4034, "step": 46272, "throughput": 8894.789629584317 }, { "epoch": 0.7257737129601329, "grad_norm": 0.08449774235486984, "learning_rate": 3.372846092062095e-05, "loss": 8.416, "step": 46304, "throughput": 8894.76510619109 }, { "epoch": 0.7262752842890619, "grad_norm": 0.07693791389465332, "learning_rate": 3.366474538602806e-05, "loss": 8.4207, "step": 46336, "throughput": 8894.779872623403 }, { "epoch": 0.7267768556179908, "grad_norm": 0.07645770162343979, "learning_rate": 3.3601571478414455e-05, "loss": 8.4001, "step": 46368, "throughput": 8894.855687529682 }, { "epoch": 0.7272784269469197, "grad_norm": 0.07502170652151108, "learning_rate": 3.3538939458309556e-05, "loss": 8.4142, "step": 46400, "throughput": 8894.934405652899 }, { "epoch": 0.7277799982758486, "grad_norm": 0.07404825091362, "learning_rate": 3.347684958400795e-05, "loss": 8.4029, "step": 46432, "throughput": 8894.906361495016 }, { "epoch": 0.7282815696047774, "grad_norm": 0.07349622249603271, "learning_rate": 3.341530211156847e-05, "loss": 8.4086, "step": 46464, "throughput": 8894.905702225637 }, { "epoch": 0.7287831409337063, "grad_norm": 0.07417155802249908, "learning_rate": 3.33542972948131e-05, "loss": 8.4119, "step": 46496, "throughput": 8894.986632206708 }, { "epoch": 0.7292847122626352, "grad_norm": 0.07474679499864578, "learning_rate": 3.329383538532587e-05, "loss": 8.417, "step": 46528, "throughput": 8895.063273953068 }, { "epoch": 0.7297862835915642, "grad_norm": 0.07456853985786438, "learning_rate": 3.323391663245188e-05, "loss": 8.4095, "step": 46560, "throughput": 8895.038880258573 }, { "epoch": 0.7302878549204931, "grad_norm": 0.08269164711236954, "learning_rate": 3.3174541283296225e-05, "loss": 8.4066, "step": 46592, "throughput": 8895.007096485651 }, { "epoch": 0.730789426249422, "grad_norm": 0.07989947497844696, "learning_rate": 3.311570958272303e-05, "loss": 8.3936, "step": 46624, "throughput": 8895.086686128505 }, { "epoch": 0.7312909975783509, "grad_norm": 0.0955742597579956, "learning_rate": 3.305742177335444e-05, "loss": 8.4006, "step": 46656, "throughput": 8895.1647328995 }, { "epoch": 0.7317925689072798, "grad_norm": 0.0774775892496109, "learning_rate": 3.29996780955695e-05, "loss": 8.3994, "step": 46688, "throughput": 8895.15862533771 }, { "epoch": 0.7322941402362088, "grad_norm": 0.07914526760578156, "learning_rate": 3.294247878750333e-05, "loss": 8.4166, "step": 46720, "throughput": 8895.125977648093 }, { "epoch": 0.7327957115651377, "grad_norm": 0.0831366628408432, "learning_rate": 3.288582408504603e-05, "loss": 8.4, "step": 46752, "throughput": 8895.192527081916 }, { "epoch": 0.7332972828940666, "grad_norm": 0.07906366139650345, "learning_rate": 3.2829714221841805e-05, "loss": 8.4372, "step": 46784, "throughput": 8895.26399886747 }, { "epoch": 0.7337988542229955, "grad_norm": 0.08686784654855728, "learning_rate": 3.2774149429287854e-05, "loss": 8.4136, "step": 46816, "throughput": 8895.276284252073 }, { "epoch": 0.7343004255519244, "grad_norm": 0.07456189393997192, "learning_rate": 3.271912993653357e-05, "loss": 8.4237, "step": 46848, "throughput": 8895.240545857285 }, { "epoch": 0.7348019968808533, "grad_norm": 0.0784822478890419, "learning_rate": 3.266465597047948e-05, "loss": 8.4144, "step": 46880, "throughput": 8895.311687503458 }, { "epoch": 0.7353035682097822, "grad_norm": 0.07475121319293976, "learning_rate": 3.261072775577641e-05, "loss": 8.4172, "step": 46912, "throughput": 8895.390936161253 }, { "epoch": 0.7358051395387111, "grad_norm": 0.07888434082269669, "learning_rate": 3.255734551482446e-05, "loss": 8.4018, "step": 46944, "throughput": 8895.40673442008 }, { "epoch": 0.73630671086764, "grad_norm": 0.08462213724851608, "learning_rate": 3.2504509467772154e-05, "loss": 8.4053, "step": 46976, "throughput": 8895.382993414712 }, { "epoch": 0.7368082821965689, "grad_norm": 0.08841930329799652, "learning_rate": 3.24522198325155e-05, "loss": 8.4, "step": 47008, "throughput": 8895.447764250572 }, { "epoch": 0.7373098535254978, "grad_norm": 0.07977207005023956, "learning_rate": 3.2400476824697126e-05, "loss": 8.4007, "step": 47040, "throughput": 8895.516781410292 }, { "epoch": 0.7378114248544267, "grad_norm": 0.0798354521393776, "learning_rate": 3.234928065770532e-05, "loss": 8.4296, "step": 47072, "throughput": 8895.507091154892 }, { "epoch": 0.7383129961833557, "grad_norm": 0.08863840252161026, "learning_rate": 3.2298631542673254e-05, "loss": 8.4139, "step": 47104, "throughput": 8895.49570712789 }, { "epoch": 0.7388145675122846, "grad_norm": 0.08590971678495407, "learning_rate": 3.2248529688478036e-05, "loss": 8.4385, "step": 47136, "throughput": 8895.124791778228 }, { "epoch": 0.7393161388412135, "grad_norm": 0.07869445532560349, "learning_rate": 3.2198975301739834e-05, "loss": 8.4068, "step": 47168, "throughput": 8895.203100537423 }, { "epoch": 0.7398177101701424, "grad_norm": 0.07871249318122864, "learning_rate": 3.214996858682109e-05, "loss": 8.4091, "step": 47200, "throughput": 8895.19382455048 }, { "epoch": 0.7403192814990713, "grad_norm": 0.08750054985284805, "learning_rate": 3.210150974582565e-05, "loss": 8.4275, "step": 47232, "throughput": 8895.161067308942 }, { "epoch": 0.7408208528280003, "grad_norm": 0.08193980902433395, "learning_rate": 3.205359897859793e-05, "loss": 8.4034, "step": 47264, "throughput": 8895.249539504743 }, { "epoch": 0.7413224241569292, "grad_norm": 0.0736575797200203, "learning_rate": 3.2006236482722034e-05, "loss": 8.3818, "step": 47296, "throughput": 8895.33768076271 }, { "epoch": 0.7418239954858581, "grad_norm": 0.0773068219423294, "learning_rate": 3.195942245352108e-05, "loss": 8.4243, "step": 47328, "throughput": 8895.324131426094 }, { "epoch": 0.7423255668147869, "grad_norm": 0.0826229453086853, "learning_rate": 3.191315708405626e-05, "loss": 8.4079, "step": 47360, "throughput": 8895.304989026181 }, { "epoch": 0.7428271381437158, "grad_norm": 0.07899387180805206, "learning_rate": 3.1867440565126066e-05, "loss": 8.4325, "step": 47392, "throughput": 8895.374665493979 }, { "epoch": 0.7433287094726447, "grad_norm": 0.16860762238502502, "learning_rate": 3.182227308526557e-05, "loss": 8.4022, "step": 47424, "throughput": 8895.458982203403 }, { "epoch": 0.7438302808015737, "grad_norm": 0.07298004627227783, "learning_rate": 3.17776548307456e-05, "loss": 8.4252, "step": 47456, "throughput": 8895.449998748658 }, { "epoch": 0.7443318521305026, "grad_norm": 0.07910618185997009, "learning_rate": 3.173358598557196e-05, "loss": 8.4017, "step": 47488, "throughput": 8895.42143796254 }, { "epoch": 0.7448334234594315, "grad_norm": 0.07871539145708084, "learning_rate": 3.169006673148473e-05, "loss": 8.3927, "step": 47520, "throughput": 8895.500163381023 }, { "epoch": 0.7453349947883604, "grad_norm": 0.09069735556840897, "learning_rate": 3.1647097247957385e-05, "loss": 8.4048, "step": 47552, "throughput": 8895.603389232176 }, { "epoch": 0.7458365661172893, "grad_norm": 0.08377867192029953, "learning_rate": 3.160467771219624e-05, "loss": 8.4123, "step": 47584, "throughput": 8895.581162530754 }, { "epoch": 0.7463381374462182, "grad_norm": 0.07731027156114578, "learning_rate": 3.1562808299139596e-05, "loss": 8.4229, "step": 47616, "throughput": 8895.540017225196 }, { "epoch": 0.7468397087751472, "grad_norm": 0.0828324630856514, "learning_rate": 3.1521489181457005e-05, "loss": 8.4149, "step": 47648, "throughput": 8895.616793753607 }, { "epoch": 0.7473412801040761, "grad_norm": 0.10275556892156601, "learning_rate": 3.1480720529548654e-05, "loss": 8.4065, "step": 47680, "throughput": 8895.722701961871 }, { "epoch": 0.747842851433005, "grad_norm": 0.09654593467712402, "learning_rate": 3.1440502511544566e-05, "loss": 8.4057, "step": 47712, "throughput": 8895.714717567691 }, { "epoch": 0.7483444227619339, "grad_norm": 0.08919548988342285, "learning_rate": 3.1400835293303984e-05, "loss": 8.4201, "step": 47744, "throughput": 8895.67539681683 }, { "epoch": 0.7488459940908628, "grad_norm": 0.08011159300804138, "learning_rate": 3.136171903841463e-05, "loss": 8.4234, "step": 47776, "throughput": 8895.754041534876 }, { "epoch": 0.7493475654197916, "grad_norm": 0.08086296170949936, "learning_rate": 3.1323153908192057e-05, "loss": 8.4123, "step": 47808, "throughput": 8895.867884774138 }, { "epoch": 0.7498491367487206, "grad_norm": 0.07986344397068024, "learning_rate": 3.128514006167897e-05, "loss": 8.4253, "step": 47840, "throughput": 8895.852017272211 }, { "epoch": 0.7503507080776495, "grad_norm": 0.07865214347839355, "learning_rate": 3.124767765564459e-05, "loss": 8.404, "step": 47872, "throughput": 8895.824903835235 }, { "epoch": 0.7508522794065784, "grad_norm": 0.07204648107290268, "learning_rate": 3.121076684458398e-05, "loss": 8.4139, "step": 47904, "throughput": 8895.893625485049 }, { "epoch": 0.7513538507355073, "grad_norm": 0.08589160442352295, "learning_rate": 3.1174407780717433e-05, "loss": 8.4103, "step": 47936, "throughput": 8895.9942421036 }, { "epoch": 0.7518554220644362, "grad_norm": 0.09325224161148071, "learning_rate": 3.113860061398985e-05, "loss": 8.3908, "step": 47968, "throughput": 8895.961989508723 }, { "epoch": 0.7523569933933651, "grad_norm": 0.08093949407339096, "learning_rate": 3.110334549207009e-05, "loss": 8.4049, "step": 48000, "throughput": 8895.953046044224 }, { "epoch": 0.7528585647222941, "grad_norm": 0.09794154763221741, "learning_rate": 3.1068642560350375e-05, "loss": 8.3908, "step": 48032, "throughput": 8896.009533534052 }, { "epoch": 0.753360136051223, "grad_norm": 0.07886497676372528, "learning_rate": 3.103449196194569e-05, "loss": 8.4077, "step": 48064, "throughput": 8896.106538320832 }, { "epoch": 0.7538617073801519, "grad_norm": 0.08433572947978973, "learning_rate": 3.1000893837693234e-05, "loss": 8.4283, "step": 48096, "throughput": 8896.095852382228 }, { "epoch": 0.7543632787090808, "grad_norm": 0.07943733781576157, "learning_rate": 3.096784832615175e-05, "loss": 8.3774, "step": 48128, "throughput": 8896.080020295462 }, { "epoch": 0.7548648500380097, "grad_norm": 0.07670002430677414, "learning_rate": 3.093535556360101e-05, "loss": 8.4291, "step": 48160, "throughput": 8896.140985773469 }, { "epoch": 0.7553664213669387, "grad_norm": 0.08377315104007721, "learning_rate": 3.0903415684041285e-05, "loss": 8.4105, "step": 48192, "throughput": 8896.234860974488 }, { "epoch": 0.7558679926958676, "grad_norm": 0.08657362312078476, "learning_rate": 3.087202881919273e-05, "loss": 8.4106, "step": 48224, "throughput": 8896.232594710647 }, { "epoch": 0.7563695640247964, "grad_norm": 0.08452863991260529, "learning_rate": 3.084119509849488e-05, "loss": 8.4097, "step": 48256, "throughput": 8896.233640640303 }, { "epoch": 0.7568711353537253, "grad_norm": 0.08716598898172379, "learning_rate": 3.081091464910606e-05, "loss": 8.4183, "step": 48288, "throughput": 8896.302885903542 }, { "epoch": 0.7573727066826542, "grad_norm": 0.0744166448712349, "learning_rate": 3.078118759590295e-05, "loss": 8.4048, "step": 48320, "throughput": 8896.408020396842 }, { "epoch": 0.7578742780115831, "grad_norm": 0.08003176748752594, "learning_rate": 3.075201406148001e-05, "loss": 8.4005, "step": 48352, "throughput": 8896.377259493374 }, { "epoch": 0.758375849340512, "grad_norm": 0.07641202211380005, "learning_rate": 3.072339416614899e-05, "loss": 8.4061, "step": 48384, "throughput": 8896.355147113938 }, { "epoch": 0.758877420669441, "grad_norm": 0.08283281326293945, "learning_rate": 3.069532802793839e-05, "loss": 8.3926, "step": 48416, "throughput": 8896.417785882708 }, { "epoch": 0.7593789919983699, "grad_norm": 0.08166106045246124, "learning_rate": 3.066781576259309e-05, "loss": 8.4236, "step": 48448, "throughput": 8896.52052368501 }, { "epoch": 0.7598805633272988, "grad_norm": 0.08659474551677704, "learning_rate": 3.0640857483573714e-05, "loss": 8.4095, "step": 48480, "throughput": 8896.489488514619 }, { "epoch": 0.7603821346562277, "grad_norm": 0.07853197306394577, "learning_rate": 3.061445330205631e-05, "loss": 8.3986, "step": 48512, "throughput": 8896.482951834703 }, { "epoch": 0.7608837059851566, "grad_norm": 0.0776255801320076, "learning_rate": 3.0588603326931796e-05, "loss": 8.4111, "step": 48544, "throughput": 8896.515826418297 }, { "epoch": 0.7613852773140856, "grad_norm": 0.07786612212657928, "learning_rate": 3.056330766480554e-05, "loss": 8.4013, "step": 48576, "throughput": 8896.621289813445 }, { "epoch": 0.7618868486430145, "grad_norm": 0.07415422052145004, "learning_rate": 3.053856641999694e-05, "loss": 8.4016, "step": 48608, "throughput": 8896.604208429937 }, { "epoch": 0.7623884199719434, "grad_norm": 0.07848580926656723, "learning_rate": 3.0514379694538932e-05, "loss": 8.3971, "step": 48640, "throughput": 8896.604218805502 }, { "epoch": 0.7628899913008723, "grad_norm": 0.07402683794498444, "learning_rate": 3.0490747588177684e-05, "loss": 8.4195, "step": 48672, "throughput": 8896.648336095484 }, { "epoch": 0.7633915626298011, "grad_norm": 0.07746639102697372, "learning_rate": 3.0467670198372044e-05, "loss": 8.4097, "step": 48704, "throughput": 8896.755910382974 }, { "epoch": 0.76389313395873, "grad_norm": 0.07922644168138504, "learning_rate": 3.044514762029326e-05, "loss": 8.3952, "step": 48736, "throughput": 8896.714345888688 }, { "epoch": 0.764394705287659, "grad_norm": 0.0855683907866478, "learning_rate": 3.0423179946824494e-05, "loss": 8.4142, "step": 48768, "throughput": 8896.72705439139 }, { "epoch": 0.7648962766165879, "grad_norm": 0.07957347482442856, "learning_rate": 3.040176726856049e-05, "loss": 8.4067, "step": 48800, "throughput": 8896.773556908885 }, { "epoch": 0.7653978479455168, "grad_norm": 0.07848001271486282, "learning_rate": 3.0380909673807205e-05, "loss": 8.3885, "step": 48832, "throughput": 8896.879164687118 }, { "epoch": 0.7658994192744457, "grad_norm": 0.08187400549650192, "learning_rate": 3.0360607248581437e-05, "loss": 8.4094, "step": 48864, "throughput": 8896.820305927478 }, { "epoch": 0.7664009906033746, "grad_norm": 0.08541760593652725, "learning_rate": 3.0340860076610427e-05, "loss": 8.3964, "step": 48896, "throughput": 8896.814791286453 }, { "epoch": 0.7669025619323036, "grad_norm": 0.0846327468752861, "learning_rate": 3.0321668239331582e-05, "loss": 8.4064, "step": 48928, "throughput": 8896.849370458765 }, { "epoch": 0.7674041332612325, "grad_norm": 0.11470185965299606, "learning_rate": 3.030303181589207e-05, "loss": 8.3863, "step": 48960, "throughput": 8896.954688708582 }, { "epoch": 0.7679057045901614, "grad_norm": 0.08249527961015701, "learning_rate": 3.0284950883148598e-05, "loss": 8.3919, "step": 48992, "throughput": 8896.907999028363 }, { "epoch": 0.7684072759190903, "grad_norm": 0.08515627682209015, "learning_rate": 3.026742551566696e-05, "loss": 8.3981, "step": 49024, "throughput": 8896.916454662103 }, { "epoch": 0.7689088472480192, "grad_norm": 0.07765959948301315, "learning_rate": 3.0250455785721827e-05, "loss": 8.4031, "step": 49056, "throughput": 8896.958168425917 }, { "epoch": 0.7694104185769481, "grad_norm": 0.08357566595077515, "learning_rate": 3.023404176329643e-05, "loss": 8.4077, "step": 49088, "throughput": 8897.05878428246 }, { "epoch": 0.7699119899058771, "grad_norm": 0.0747281089425087, "learning_rate": 3.021818351608223e-05, "loss": 8.4002, "step": 49120, "throughput": 8897.018858526288 }, { "epoch": 0.7704135612348059, "grad_norm": 0.07242199033498764, "learning_rate": 3.0202881109478676e-05, "loss": 8.4119, "step": 49152, "throughput": 8897.003698841101 }, { "epoch": 0.7709151325637348, "grad_norm": 0.09169508516788483, "learning_rate": 3.0188134606592958e-05, "loss": 8.4123, "step": 49184, "throughput": 8896.6229254484 }, { "epoch": 0.7714167038926637, "grad_norm": 0.07344987988471985, "learning_rate": 3.017394406823969e-05, "loss": 8.4033, "step": 49216, "throughput": 8896.726633691835 }, { "epoch": 0.7719182752215926, "grad_norm": 0.0744016170501709, "learning_rate": 3.0160309552940704e-05, "loss": 8.427, "step": 49248, "throughput": 8896.704982883732 }, { "epoch": 0.7724198465505215, "grad_norm": 0.07636785507202148, "learning_rate": 3.014723111692476e-05, "loss": 8.4045, "step": 49280, "throughput": 8896.70186019132 }, { "epoch": 0.7729214178794505, "grad_norm": 0.07975538820028305, "learning_rate": 3.013470881412739e-05, "loss": 8.3832, "step": 49312, "throughput": 8896.750642487581 }, { "epoch": 0.7734229892083794, "grad_norm": 0.07941026985645294, "learning_rate": 3.0122742696190606e-05, "loss": 8.4185, "step": 49344, "throughput": 8896.850750910937 }, { "epoch": 0.7739245605373083, "grad_norm": 0.07521425932645798, "learning_rate": 3.0111332812462692e-05, "loss": 8.4036, "step": 49376, "throughput": 8896.813329451572 }, { "epoch": 0.7744261318662372, "grad_norm": 0.07578073441982269, "learning_rate": 3.0100479209998055e-05, "loss": 8.3934, "step": 49408, "throughput": 8896.831284182328 }, { "epoch": 0.7749277031951661, "grad_norm": 0.09989731013774872, "learning_rate": 3.0090181933556994e-05, "loss": 8.4085, "step": 49440, "throughput": 8896.879084852144 }, { "epoch": 0.775429274524095, "grad_norm": 0.08798599988222122, "learning_rate": 3.0080441025605494e-05, "loss": 8.3887, "step": 49472, "throughput": 8896.984651790684 }, { "epoch": 0.775930845853024, "grad_norm": 0.08368100225925446, "learning_rate": 3.007125652631508e-05, "loss": 8.3882, "step": 49504, "throughput": 8896.924194215266 }, { "epoch": 0.7764324171819529, "grad_norm": 0.07710754871368408, "learning_rate": 3.006262847356269e-05, "loss": 8.3931, "step": 49536, "throughput": 8896.92686921276 }, { "epoch": 0.7769339885108818, "grad_norm": 0.08023960143327713, "learning_rate": 3.0054556902930394e-05, "loss": 8.4064, "step": 49568, "throughput": 8896.966642033312 }, { "epoch": 0.7774355598398106, "grad_norm": 0.07401903718709946, "learning_rate": 3.0047041847705404e-05, "loss": 8.41, "step": 49600, "throughput": 8897.071490538188 }, { "epoch": 0.7779371311687395, "grad_norm": 0.08108972012996674, "learning_rate": 3.0040083338879834e-05, "loss": 8.3832, "step": 49632, "throughput": 8897.013010592347 }, { "epoch": 0.7784387024976684, "grad_norm": 0.08791600167751312, "learning_rate": 3.0033681405150554e-05, "loss": 8.4159, "step": 49664, "throughput": 8897.0419185872 }, { "epoch": 0.7789402738265974, "grad_norm": 0.07926676422357559, "learning_rate": 3.0027836072919202e-05, "loss": 8.3874, "step": 49696, "throughput": 8897.077031031937 }, { "epoch": 0.7794418451555263, "grad_norm": 0.14427538216114044, "learning_rate": 3.002254736629194e-05, "loss": 8.4154, "step": 49728, "throughput": 8897.182181287917 }, { "epoch": 0.7799434164844552, "grad_norm": 0.09725314378738403, "learning_rate": 3.001781530707938e-05, "loss": 8.3869, "step": 49760, "throughput": 8897.149475721768 }, { "epoch": 0.7804449878133841, "grad_norm": 0.07859361916780472, "learning_rate": 3.0013639914796586e-05, "loss": 8.404, "step": 49792, "throughput": 8897.178265734985 }, { "epoch": 0.780946559142313, "grad_norm": 0.07424914091825485, "learning_rate": 3.001002120666285e-05, "loss": 8.3968, "step": 49824, "throughput": 8897.209883269028 }, { "epoch": 0.781448130471242, "grad_norm": 0.07746188342571259, "learning_rate": 3.0006959197601765e-05, "loss": 8.4069, "step": 49856, "throughput": 8897.308706740672 }, { "epoch": 0.7819497018001709, "grad_norm": 0.09235438704490662, "learning_rate": 3.000445390024106e-05, "loss": 8.4015, "step": 49888, "throughput": 8897.259296904602 }, { "epoch": 0.7824512731290998, "grad_norm": 0.07753193378448486, "learning_rate": 3.0002505324912582e-05, "loss": 8.3881, "step": 49920, "throughput": 8897.287677847626 }, { "epoch": 0.7829528444580287, "grad_norm": 0.09510205686092377, "learning_rate": 3.0001113479652246e-05, "loss": 8.398, "step": 49952, "throughput": 8897.326425338933 }, { "epoch": 0.7834544157869576, "grad_norm": 0.07465380430221558, "learning_rate": 3.0000278370200057e-05, "loss": 8.4051, "step": 49984, "throughput": 8897.427443018712 }, { "epoch": 0.7839559871158865, "grad_norm": 0.0882812961935997, "learning_rate": 2.9999999999999997e-05, "loss": 8.3954, "step": 50016, "throughput": 8897.361097831126 }, { "epoch": 0.7839559871158865, "step": 50016, "throughput": 8896.939390152644, "total_flos": 1.1998395573363655e+21, "train_loss": 8.993753637019747, "train_runtime": 368424.292, "train_samples_per_second": 139.015, "train_steps_per_second": 0.136 } ], "logging_steps": 32, "max_steps": 50016, "num_input_tokens_seen": 104891154432, "num_train_epochs": 1, "save_steps": 2048, "stateful_callbacks": { "LogCallback": { "elapsed_time": 368424.2897763252, "start_time": 1766739748.8421957 }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1998395573363655e+21, "train_batch_size": 8, "trial_name": null, "trial_params": null }