{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 936, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0106951871657754, "grad_norm": 5.869461536407471, "learning_rate": 9.30232558139535e-07, "loss": 0.3054, "step": 5 }, { "epoch": 0.0213903743315508, "grad_norm": 4.7690653800964355, "learning_rate": 2.0930232558139536e-06, "loss": 0.2661, "step": 10 }, { "epoch": 0.03208556149732621, "grad_norm": 4.248436450958252, "learning_rate": 3.2558139534883724e-06, "loss": 0.2409, "step": 15 }, { "epoch": 0.0427807486631016, "grad_norm": 4.08921480178833, "learning_rate": 4.418604651162791e-06, "loss": 0.2228, "step": 20 }, { "epoch": 0.053475935828877004, "grad_norm": 4.846882343292236, "learning_rate": 5.58139534883721e-06, "loss": 0.2258, "step": 25 }, { "epoch": 0.06417112299465241, "grad_norm": 4.230620384216309, "learning_rate": 6.744186046511628e-06, "loss": 0.2103, "step": 30 }, { "epoch": 0.0748663101604278, "grad_norm": 4.4064130783081055, "learning_rate": 7.906976744186048e-06, "loss": 0.2122, "step": 35 }, { "epoch": 0.0855614973262032, "grad_norm": 4.86531400680542, "learning_rate": 9.069767441860465e-06, "loss": 0.1996, "step": 40 }, { "epoch": 0.0962566844919786, "grad_norm": 5.53223991394043, "learning_rate": 9.999986679414613e-06, "loss": 0.2311, "step": 45 }, { "epoch": 0.10695187165775401, "grad_norm": 4.2916178703308105, "learning_rate": 9.999520466378376e-06, "loss": 0.2319, "step": 50 }, { "epoch": 0.11764705882352941, "grad_norm": 4.200897216796875, "learning_rate": 9.998388295046227e-06, "loss": 0.2368, "step": 55 }, { "epoch": 0.12834224598930483, "grad_norm": 4.808861255645752, "learning_rate": 9.996590316228402e-06, "loss": 0.2354, "step": 60 }, { "epoch": 0.13903743315508021, "grad_norm": 4.5134077072143555, "learning_rate": 9.994126769423656e-06, "loss": 0.234, "step": 65 }, { "epoch": 0.1497326203208556, "grad_norm": 4.348900318145752, "learning_rate": 9.990997982787348e-06, "loss": 0.2541, "step": 70 }, { "epoch": 0.16042780748663102, "grad_norm": 4.4039306640625, "learning_rate": 9.98720437308773e-06, "loss": 0.2496, "step": 75 }, { "epoch": 0.1711229946524064, "grad_norm": 4.726532936096191, "learning_rate": 9.982746445650437e-06, "loss": 0.2797, "step": 80 }, { "epoch": 0.18181818181818182, "grad_norm": 4.256592273712158, "learning_rate": 9.977624794291172e-06, "loss": 0.2518, "step": 85 }, { "epoch": 0.1925133689839572, "grad_norm": 4.330168724060059, "learning_rate": 9.97184010123661e-06, "loss": 0.261, "step": 90 }, { "epoch": 0.20320855614973263, "grad_norm": 4.281119346618652, "learning_rate": 9.965393137033512e-06, "loss": 0.2564, "step": 95 }, { "epoch": 0.21390374331550802, "grad_norm": 4.247673511505127, "learning_rate": 9.958284760446104e-06, "loss": 0.2501, "step": 100 }, { "epoch": 0.22459893048128343, "grad_norm": 3.930504322052002, "learning_rate": 9.950515918341666e-06, "loss": 0.2504, "step": 105 }, { "epoch": 0.23529411764705882, "grad_norm": 4.084280967712402, "learning_rate": 9.942087645564415e-06, "loss": 0.2706, "step": 110 }, { "epoch": 0.24598930481283424, "grad_norm": 3.869400978088379, "learning_rate": 9.93300106479766e-06, "loss": 0.2499, "step": 115 }, { "epoch": 0.25668449197860965, "grad_norm": 4.0887675285339355, "learning_rate": 9.923257386414253e-06, "loss": 0.2628, "step": 120 }, { "epoch": 0.26737967914438504, "grad_norm": 4.140570163726807, "learning_rate": 9.912857908315363e-06, "loss": 0.2642, "step": 125 }, { "epoch": 0.27807486631016043, "grad_norm": 4.731294631958008, "learning_rate": 9.901804015757588e-06, "loss": 0.2669, "step": 130 }, { "epoch": 0.2887700534759358, "grad_norm": 4.113650321960449, "learning_rate": 9.89009718116843e-06, "loss": 0.2809, "step": 135 }, { "epoch": 0.2994652406417112, "grad_norm": 3.8770499229431152, "learning_rate": 9.877738963950175e-06, "loss": 0.2705, "step": 140 }, { "epoch": 0.31016042780748665, "grad_norm": 4.077112197875977, "learning_rate": 9.864731010272152e-06, "loss": 0.2548, "step": 145 }, { "epoch": 0.32085561497326204, "grad_norm": 4.771427631378174, "learning_rate": 9.851075052851476e-06, "loss": 0.2534, "step": 150 }, { "epoch": 0.3315508021390374, "grad_norm": 3.84387469291687, "learning_rate": 9.83677291072223e-06, "loss": 0.2438, "step": 155 }, { "epoch": 0.3422459893048128, "grad_norm": 4.427935600280762, "learning_rate": 9.821826488993168e-06, "loss": 0.2791, "step": 160 }, { "epoch": 0.35294117647058826, "grad_norm": 4.1868767738342285, "learning_rate": 9.806237778593941e-06, "loss": 0.2734, "step": 165 }, { "epoch": 0.36363636363636365, "grad_norm": 3.8055779933929443, "learning_rate": 9.790008856009902e-06, "loss": 0.2463, "step": 170 }, { "epoch": 0.37433155080213903, "grad_norm": 4.041962623596191, "learning_rate": 9.773141883005507e-06, "loss": 0.2813, "step": 175 }, { "epoch": 0.3850267379679144, "grad_norm": 3.6363329887390137, "learning_rate": 9.755639106336347e-06, "loss": 0.2724, "step": 180 }, { "epoch": 0.39572192513368987, "grad_norm": 3.50486421585083, "learning_rate": 9.737502857449894e-06, "loss": 0.271, "step": 185 }, { "epoch": 0.40641711229946526, "grad_norm": 3.4967920780181885, "learning_rate": 9.718735552174923e-06, "loss": 0.2467, "step": 190 }, { "epoch": 0.41711229946524064, "grad_norm": 3.3797831535339355, "learning_rate": 9.699339690399717e-06, "loss": 0.2621, "step": 195 }, { "epoch": 0.42780748663101603, "grad_norm": 15.109771728515625, "learning_rate": 9.679317855739073e-06, "loss": 0.2791, "step": 200 }, { "epoch": 0.4385026737967914, "grad_norm": 3.828474760055542, "learning_rate": 9.658672715190151e-06, "loss": 0.2829, "step": 205 }, { "epoch": 0.44919786096256686, "grad_norm": 3.2750980854034424, "learning_rate": 9.637407018777224e-06, "loss": 0.2866, "step": 210 }, { "epoch": 0.45989304812834225, "grad_norm": 3.615290880203247, "learning_rate": 9.615523599185353e-06, "loss": 0.2422, "step": 215 }, { "epoch": 0.47058823529411764, "grad_norm": 3.5919339656829834, "learning_rate": 9.593025371383064e-06, "loss": 0.269, "step": 220 }, { "epoch": 0.48128342245989303, "grad_norm": 4.708985805511475, "learning_rate": 9.569915332234068e-06, "loss": 0.2686, "step": 225 }, { "epoch": 0.4919786096256685, "grad_norm": 3.5988545417785645, "learning_rate": 9.546196560098062e-06, "loss": 0.273, "step": 230 }, { "epoch": 0.5026737967914439, "grad_norm": 3.4772067070007324, "learning_rate": 9.521872214420668e-06, "loss": 0.2395, "step": 235 }, { "epoch": 0.5133689839572193, "grad_norm": 3.7974278926849365, "learning_rate": 9.496945535312597e-06, "loss": 0.2682, "step": 240 }, { "epoch": 0.5240641711229946, "grad_norm": 3.69970440864563, "learning_rate": 9.471419843118036e-06, "loss": 0.2557, "step": 245 }, { "epoch": 0.5347593582887701, "grad_norm": 3.470737934112549, "learning_rate": 9.44529853797238e-06, "loss": 0.266, "step": 250 }, { "epoch": 0.5454545454545454, "grad_norm": 3.733236312866211, "learning_rate": 9.418585099349306e-06, "loss": 0.2779, "step": 255 }, { "epoch": 0.5561497326203209, "grad_norm": 3.434349298477173, "learning_rate": 9.391283085597299e-06, "loss": 0.2508, "step": 260 }, { "epoch": 0.5668449197860963, "grad_norm": 3.5163183212280273, "learning_rate": 9.36339613346565e-06, "loss": 0.2727, "step": 265 }, { "epoch": 0.5775401069518716, "grad_norm": 4.07936429977417, "learning_rate": 9.33492795762005e-06, "loss": 0.2714, "step": 270 }, { "epoch": 0.5882352941176471, "grad_norm": 3.171149492263794, "learning_rate": 9.305882350147763e-06, "loss": 0.2585, "step": 275 }, { "epoch": 0.5989304812834224, "grad_norm": 3.424302101135254, "learning_rate": 9.276263180052498e-06, "loss": 0.2584, "step": 280 }, { "epoch": 0.6096256684491979, "grad_norm": 3.800421714782715, "learning_rate": 9.246074392739057e-06, "loss": 0.2771, "step": 285 }, { "epoch": 0.6203208556149733, "grad_norm": 3.4711737632751465, "learning_rate": 9.21532000948778e-06, "loss": 0.265, "step": 290 }, { "epoch": 0.6310160427807486, "grad_norm": 3.5598926544189453, "learning_rate": 9.184004126918891e-06, "loss": 0.2653, "step": 295 }, { "epoch": 0.6417112299465241, "grad_norm": 3.320875644683838, "learning_rate": 9.152130916446817e-06, "loss": 0.2373, "step": 300 }, { "epoch": 0.6524064171122995, "grad_norm": 3.606067419052124, "learning_rate": 9.119704623724528e-06, "loss": 0.2779, "step": 305 }, { "epoch": 0.6631016042780749, "grad_norm": 3.189316511154175, "learning_rate": 9.086729568078006e-06, "loss": 0.2813, "step": 310 }, { "epoch": 0.6737967914438503, "grad_norm": 3.725550651550293, "learning_rate": 9.05321014193089e-06, "loss": 0.284, "step": 315 }, { "epoch": 0.6844919786096256, "grad_norm": 3.6915953159332275, "learning_rate": 9.019150810219376e-06, "loss": 0.2646, "step": 320 }, { "epoch": 0.6951871657754011, "grad_norm": 3.499095916748047, "learning_rate": 8.984556109797484e-06, "loss": 0.2693, "step": 325 }, { "epoch": 0.7058823529411765, "grad_norm": 3.671689510345459, "learning_rate": 8.949430648832716e-06, "loss": 0.2662, "step": 330 }, { "epoch": 0.7165775401069518, "grad_norm": 3.549403429031372, "learning_rate": 8.91377910619223e-06, "loss": 0.2745, "step": 335 }, { "epoch": 0.7272727272727273, "grad_norm": 3.2107975482940674, "learning_rate": 8.8776062308196e-06, "loss": 0.2462, "step": 340 }, { "epoch": 0.7379679144385026, "grad_norm": 3.040982246398926, "learning_rate": 8.84091684110223e-06, "loss": 0.2663, "step": 345 }, { "epoch": 0.7486631016042781, "grad_norm": 3.460141658782959, "learning_rate": 8.803715824229525e-06, "loss": 0.2595, "step": 350 }, { "epoch": 0.7593582887700535, "grad_norm": 3.5115394592285156, "learning_rate": 8.766008135541896e-06, "loss": 0.2476, "step": 355 }, { "epoch": 0.7700534759358288, "grad_norm": 3.2345032691955566, "learning_rate": 8.727798797870688e-06, "loss": 0.2399, "step": 360 }, { "epoch": 0.7807486631016043, "grad_norm": 3.489104986190796, "learning_rate": 8.689092900869112e-06, "loss": 0.2568, "step": 365 }, { "epoch": 0.7914438502673797, "grad_norm": 3.4822070598602295, "learning_rate": 8.649895600334284e-06, "loss": 0.2779, "step": 370 }, { "epoch": 0.8021390374331551, "grad_norm": 3.6014904975891113, "learning_rate": 8.610212117520453e-06, "loss": 0.2738, "step": 375 }, { "epoch": 0.8128342245989305, "grad_norm": 3.3353493213653564, "learning_rate": 8.570047738443502e-06, "loss": 0.2564, "step": 380 }, { "epoch": 0.8235294117647058, "grad_norm": 3.1294198036193848, "learning_rate": 8.52940781317683e-06, "loss": 0.2459, "step": 385 }, { "epoch": 0.8342245989304813, "grad_norm": 3.2122206687927246, "learning_rate": 8.48829775513869e-06, "loss": 0.2639, "step": 390 }, { "epoch": 0.8449197860962567, "grad_norm": 3.2475640773773193, "learning_rate": 8.446723040371114e-06, "loss": 0.2545, "step": 395 }, { "epoch": 0.8556149732620321, "grad_norm": 3.113609552383423, "learning_rate": 8.40468920681047e-06, "loss": 0.2712, "step": 400 }, { "epoch": 0.8663101604278075, "grad_norm": 3.231935501098633, "learning_rate": 8.362201853549777e-06, "loss": 0.27, "step": 405 }, { "epoch": 0.8770053475935828, "grad_norm": 3.3826744556427, "learning_rate": 8.319266640092899e-06, "loss": 0.2823, "step": 410 }, { "epoch": 0.8877005347593583, "grad_norm": 3.3691983222961426, "learning_rate": 8.275889285600656e-06, "loss": 0.2555, "step": 415 }, { "epoch": 0.8983957219251337, "grad_norm": 3.4460556507110596, "learning_rate": 8.23207556812902e-06, "loss": 0.2615, "step": 420 }, { "epoch": 0.9090909090909091, "grad_norm": 3.0713820457458496, "learning_rate": 8.187831323859445e-06, "loss": 0.2512, "step": 425 }, { "epoch": 0.9197860962566845, "grad_norm": 2.781736373901367, "learning_rate": 8.143162446321465e-06, "loss": 0.2568, "step": 430 }, { "epoch": 0.93048128342246, "grad_norm": 3.0802292823791504, "learning_rate": 8.098074885607646e-06, "loss": 0.2506, "step": 435 }, { "epoch": 0.9411764705882353, "grad_norm": 3.0498411655426025, "learning_rate": 8.052574647581009e-06, "loss": 0.2586, "step": 440 }, { "epoch": 0.9518716577540107, "grad_norm": 3.0598437786102295, "learning_rate": 8.006667793075026e-06, "loss": 0.2576, "step": 445 }, { "epoch": 0.9625668449197861, "grad_norm": 2.935920238494873, "learning_rate": 7.960360437086287e-06, "loss": 0.2363, "step": 450 }, { "epoch": 0.9732620320855615, "grad_norm": 3.2348110675811768, "learning_rate": 7.91365874795995e-06, "loss": 0.2677, "step": 455 }, { "epoch": 0.983957219251337, "grad_norm": 3.4138996601104736, "learning_rate": 7.866568946568107e-06, "loss": 0.2702, "step": 460 }, { "epoch": 0.9946524064171123, "grad_norm": 2.949201822280884, "learning_rate": 7.819097305481112e-06, "loss": 0.2487, "step": 465 }, { "epoch": 1.0042780748663103, "grad_norm": 2.0031890869140625, "learning_rate": 7.771250148132067e-06, "loss": 0.1638, "step": 470 }, { "epoch": 1.0149732620320855, "grad_norm": 2.1339635848999023, "learning_rate": 7.723033847974503e-06, "loss": 0.1013, "step": 475 }, { "epoch": 1.025668449197861, "grad_norm": 2.822927474975586, "learning_rate": 7.674454827633413e-06, "loss": 0.0924, "step": 480 }, { "epoch": 1.0363636363636364, "grad_norm": 3.7076971530914307, "learning_rate": 7.625519558049722e-06, "loss": 0.1075, "step": 485 }, { "epoch": 1.0470588235294118, "grad_norm": 2.7152915000915527, "learning_rate": 7.576234557618336e-06, "loss": 0.0926, "step": 490 }, { "epoch": 1.0577540106951873, "grad_norm": 3.0349481105804443, "learning_rate": 7.526606391319862e-06, "loss": 0.1054, "step": 495 }, { "epoch": 1.0684491978609625, "grad_norm": 2.7987284660339355, "learning_rate": 7.476641669846121e-06, "loss": 0.0999, "step": 500 }, { "epoch": 1.079144385026738, "grad_norm": 2.2019879817962646, "learning_rate": 7.426347048719577e-06, "loss": 0.1082, "step": 505 }, { "epoch": 1.0898395721925134, "grad_norm": 2.8885598182678223, "learning_rate": 7.375729227406789e-06, "loss": 0.1227, "step": 510 }, { "epoch": 1.1005347593582888, "grad_norm": 2.8309147357940674, "learning_rate": 7.324794948426015e-06, "loss": 0.0943, "step": 515 }, { "epoch": 1.1112299465240643, "grad_norm": 2.788823366165161, "learning_rate": 7.273550996449077e-06, "loss": 0.1038, "step": 520 }, { "epoch": 1.1219251336898395, "grad_norm": 2.1967334747314453, "learning_rate": 7.222004197397613e-06, "loss": 0.0956, "step": 525 }, { "epoch": 1.132620320855615, "grad_norm": 2.6466832160949707, "learning_rate": 7.170161417533836e-06, "loss": 0.0946, "step": 530 }, { "epoch": 1.1433155080213904, "grad_norm": 2.3864052295684814, "learning_rate": 7.118029562545915e-06, "loss": 0.0868, "step": 535 }, { "epoch": 1.1540106951871658, "grad_norm": 2.6868040561676025, "learning_rate": 7.065615576628107e-06, "loss": 0.1009, "step": 540 }, { "epoch": 1.1647058823529413, "grad_norm": 2.748037815093994, "learning_rate": 7.0129264415557585e-06, "loss": 0.1067, "step": 545 }, { "epoch": 1.1754010695187165, "grad_norm": 3.183288097381592, "learning_rate": 6.959969175755306e-06, "loss": 0.1009, "step": 550 }, { "epoch": 1.186096256684492, "grad_norm": 2.472684383392334, "learning_rate": 6.906750833369386e-06, "loss": 0.0912, "step": 555 }, { "epoch": 1.1967914438502674, "grad_norm": 3.301072359085083, "learning_rate": 6.8532785033171975e-06, "loss": 0.1026, "step": 560 }, { "epoch": 1.2074866310160428, "grad_norm": 2.607532262802124, "learning_rate": 6.799559308350219e-06, "loss": 0.0985, "step": 565 }, { "epoch": 1.2181818181818183, "grad_norm": 2.714355230331421, "learning_rate": 6.745600404103431e-06, "loss": 0.0982, "step": 570 }, { "epoch": 1.2288770053475937, "grad_norm": 3.1020829677581787, "learning_rate": 6.6914089781421535e-06, "loss": 0.1084, "step": 575 }, { "epoch": 1.239572192513369, "grad_norm": 3.3140673637390137, "learning_rate": 6.636992249004629e-06, "loss": 0.1054, "step": 580 }, { "epoch": 1.2502673796791444, "grad_norm": 2.7101094722747803, "learning_rate": 6.582357465240488e-06, "loss": 0.1045, "step": 585 }, { "epoch": 1.2609625668449198, "grad_norm": 2.9788272380828857, "learning_rate": 6.527511904445194e-06, "loss": 0.0883, "step": 590 }, { "epoch": 1.2716577540106953, "grad_norm": 3.2309136390686035, "learning_rate": 6.472462872290654e-06, "loss": 0.0946, "step": 595 }, { "epoch": 1.2823529411764705, "grad_norm": 2.4168930053710938, "learning_rate": 6.417217701552059e-06, "loss": 0.0996, "step": 600 }, { "epoch": 1.293048128342246, "grad_norm": 2.6719398498535156, "learning_rate": 6.36178375113113e-06, "loss": 0.0963, "step": 605 }, { "epoch": 1.3037433155080214, "grad_norm": 2.4402902126312256, "learning_rate": 6.3061684050758776e-06, "loss": 0.0986, "step": 610 }, { "epoch": 1.3144385026737968, "grad_norm": 3.2404541969299316, "learning_rate": 6.250379071597018e-06, "loss": 0.0925, "step": 615 }, { "epoch": 1.3251336898395722, "grad_norm": 2.904754161834717, "learning_rate": 6.194423182081161e-06, "loss": 0.1125, "step": 620 }, { "epoch": 1.3358288770053477, "grad_norm": 3.329714059829712, "learning_rate": 6.138308190100918e-06, "loss": 0.1038, "step": 625 }, { "epoch": 1.346524064171123, "grad_norm": 3.2199363708496094, "learning_rate": 6.082041570422059e-06, "loss": 0.1097, "step": 630 }, { "epoch": 1.3572192513368984, "grad_norm": 2.6863086223602295, "learning_rate": 6.025630818007833e-06, "loss": 0.1001, "step": 635 }, { "epoch": 1.3679144385026738, "grad_norm": 2.825577735900879, "learning_rate": 5.969083447020606e-06, "loss": 0.1018, "step": 640 }, { "epoch": 1.3786096256684492, "grad_norm": 2.606800079345703, "learning_rate": 5.912406989820948e-06, "loss": 0.1043, "step": 645 }, { "epoch": 1.3893048128342245, "grad_norm": 2.4628851413726807, "learning_rate": 5.855608995964283e-06, "loss": 0.0961, "step": 650 }, { "epoch": 1.4, "grad_norm": 2.584479331970215, "learning_rate": 5.798697031195257e-06, "loss": 0.1025, "step": 655 }, { "epoch": 1.4106951871657754, "grad_norm": 2.4871041774749756, "learning_rate": 5.741678676439946e-06, "loss": 0.0993, "step": 660 }, { "epoch": 1.4213903743315508, "grad_norm": 2.876145601272583, "learning_rate": 5.684561526796045e-06, "loss": 0.0968, "step": 665 }, { "epoch": 1.4320855614973262, "grad_norm": 3.0748062133789062, "learning_rate": 5.627353190521168e-06, "loss": 0.0824, "step": 670 }, { "epoch": 1.4427807486631017, "grad_norm": 2.5074338912963867, "learning_rate": 5.570061288019385e-06, "loss": 0.0994, "step": 675 }, { "epoch": 1.4534759358288771, "grad_norm": 2.8043177127838135, "learning_rate": 5.51269345082617e-06, "loss": 0.1007, "step": 680 }, { "epoch": 1.4641711229946524, "grad_norm": 2.6914560794830322, "learning_rate": 5.455257320591825e-06, "loss": 0.1021, "step": 685 }, { "epoch": 1.4748663101604278, "grad_norm": 2.208425521850586, "learning_rate": 5.397760548063591e-06, "loss": 0.0918, "step": 690 }, { "epoch": 1.4855614973262032, "grad_norm": 2.6503446102142334, "learning_rate": 5.340210792066531e-06, "loss": 0.0967, "step": 695 }, { "epoch": 1.4962566844919787, "grad_norm": 2.4122114181518555, "learning_rate": 5.282615718483344e-06, "loss": 0.0859, "step": 700 }, { "epoch": 1.506951871657754, "grad_norm": 2.084665060043335, "learning_rate": 5.224982999233228e-06, "loss": 0.0858, "step": 705 }, { "epoch": 1.5176470588235293, "grad_norm": 3.171128034591675, "learning_rate": 5.167320311249951e-06, "loss": 0.1037, "step": 710 }, { "epoch": 1.5283422459893048, "grad_norm": 3.304917573928833, "learning_rate": 5.109635335459256e-06, "loss": 0.0845, "step": 715 }, { "epoch": 1.5390374331550802, "grad_norm": 2.538336992263794, "learning_rate": 5.051935755755713e-06, "loss": 0.0875, "step": 720 }, { "epoch": 1.5497326203208557, "grad_norm": 3.2048041820526123, "learning_rate": 4.9942292579791965e-06, "loss": 0.0957, "step": 725 }, { "epoch": 1.5604278074866311, "grad_norm": 2.32966685295105, "learning_rate": 4.936523528891111e-06, "loss": 0.08, "step": 730 }, { "epoch": 1.5711229946524066, "grad_norm": 2.5276477336883545, "learning_rate": 4.878826255150453e-06, "loss": 0.1025, "step": 735 }, { "epoch": 1.5818181818181818, "grad_norm": 2.489506244659424, "learning_rate": 4.821145122289941e-06, "loss": 0.0905, "step": 740 }, { "epoch": 1.5925133689839572, "grad_norm": 2.0386905670166016, "learning_rate": 4.763487813692252e-06, "loss": 0.0863, "step": 745 }, { "epoch": 1.6032085561497325, "grad_norm": 3.1131086349487305, "learning_rate": 4.705862009566564e-06, "loss": 0.0884, "step": 750 }, { "epoch": 1.613903743315508, "grad_norm": 2.19130277633667, "learning_rate": 4.648275385925522e-06, "loss": 0.0919, "step": 755 }, { "epoch": 1.6245989304812833, "grad_norm": 2.7279891967773438, "learning_rate": 4.5907356135627605e-06, "loss": 0.0956, "step": 760 }, { "epoch": 1.6352941176470588, "grad_norm": 2.4997522830963135, "learning_rate": 4.533250357031104e-06, "loss": 0.1017, "step": 765 }, { "epoch": 1.6459893048128342, "grad_norm": 2.488523006439209, "learning_rate": 4.475827273621639e-06, "loss": 0.0968, "step": 770 }, { "epoch": 1.6566844919786097, "grad_norm": 2.9442784786224365, "learning_rate": 4.418474012343711e-06, "loss": 0.0973, "step": 775 }, { "epoch": 1.6673796791443851, "grad_norm": 2.7047040462493896, "learning_rate": 4.361198212906048e-06, "loss": 0.0884, "step": 780 }, { "epoch": 1.6780748663101606, "grad_norm": 2.4104325771331787, "learning_rate": 4.304007504699118e-06, "loss": 0.087, "step": 785 }, { "epoch": 1.6887700534759358, "grad_norm": 2.9229371547698975, "learning_rate": 4.246909505778862e-06, "loss": 0.0772, "step": 790 }, { "epoch": 1.6994652406417112, "grad_norm": 2.7569501399993896, "learning_rate": 4.189911821851928e-06, "loss": 0.0973, "step": 795 }, { "epoch": 1.7101604278074867, "grad_norm": 2.2701845169067383, "learning_rate": 4.1330220452625644e-06, "loss": 0.0869, "step": 800 }, { "epoch": 1.720855614973262, "grad_norm": 2.91239857673645, "learning_rate": 4.076247753981285e-06, "loss": 0.0985, "step": 805 }, { "epoch": 1.7315508021390373, "grad_norm": 2.3714444637298584, "learning_rate": 4.019596510595447e-06, "loss": 0.0862, "step": 810 }, { "epoch": 1.7422459893048128, "grad_norm": 2.617003917694092, "learning_rate": 3.963075861301886e-06, "loss": 0.0933, "step": 815 }, { "epoch": 1.7529411764705882, "grad_norm": 2.6759443283081055, "learning_rate": 3.9066933349017165e-06, "loss": 0.0883, "step": 820 }, { "epoch": 1.7636363636363637, "grad_norm": 3.302290439605713, "learning_rate": 3.8504564417974795e-06, "loss": 0.0988, "step": 825 }, { "epoch": 1.7743315508021391, "grad_norm": 2.1597912311553955, "learning_rate": 3.7943726729927154e-06, "loss": 0.0922, "step": 830 }, { "epoch": 1.7850267379679146, "grad_norm": 2.2167508602142334, "learning_rate": 3.738449499094121e-06, "loss": 0.0807, "step": 835 }, { "epoch": 1.79572192513369, "grad_norm": 2.180342197418213, "learning_rate": 3.682694369316446e-06, "loss": 0.0741, "step": 840 }, { "epoch": 1.8064171122994652, "grad_norm": 3.1924726963043213, "learning_rate": 3.6271147104902192e-06, "loss": 0.0836, "step": 845 }, { "epoch": 1.8171122994652407, "grad_norm": 3.1651058197021484, "learning_rate": 3.5717179260724544e-06, "loss": 0.0911, "step": 850 }, { "epoch": 1.8278074866310159, "grad_norm": 3.11163330078125, "learning_rate": 3.5165113951604874e-06, "loss": 0.094, "step": 855 }, { "epoch": 1.8385026737967913, "grad_norm": 2.2718892097473145, "learning_rate": 3.461502471509045e-06, "loss": 0.0818, "step": 860 }, { "epoch": 1.8491978609625668, "grad_norm": 2.625342607498169, "learning_rate": 3.4066984825506855e-06, "loss": 0.0891, "step": 865 }, { "epoch": 1.8598930481283422, "grad_norm": 2.286201000213623, "learning_rate": 3.35210672841976e-06, "loss": 0.0906, "step": 870 }, { "epoch": 1.8705882352941177, "grad_norm": 2.516383171081543, "learning_rate": 3.297734480980002e-06, "loss": 0.0856, "step": 875 }, { "epoch": 1.881283422459893, "grad_norm": 3.1512904167175293, "learning_rate": 3.2435889828558753e-06, "loss": 0.0845, "step": 880 }, { "epoch": 1.8919786096256686, "grad_norm": 3.0289194583892822, "learning_rate": 3.1896774464678327e-06, "loss": 0.0956, "step": 885 }, { "epoch": 1.902673796791444, "grad_norm": 2.4393064975738525, "learning_rate": 3.1360070530715885e-06, "loss": 0.0824, "step": 890 }, { "epoch": 1.9133689839572192, "grad_norm": 2.774142026901245, "learning_rate": 3.0825849518015334e-06, "loss": 0.0873, "step": 895 }, { "epoch": 1.9240641711229947, "grad_norm": 2.386697292327881, "learning_rate": 3.029418258718454e-06, "loss": 0.0836, "step": 900 }, { "epoch": 1.93475935828877, "grad_norm": 2.7990691661834717, "learning_rate": 2.9765140558616287e-06, "loss": 0.0875, "step": 905 }, { "epoch": 1.9454545454545453, "grad_norm": 2.4611012935638428, "learning_rate": 2.9238793903054757e-06, "loss": 0.0791, "step": 910 }, { "epoch": 1.9561497326203208, "grad_norm": 2.292494058609009, "learning_rate": 2.8715212732208523e-06, "loss": 0.0777, "step": 915 }, { "epoch": 1.9668449197860962, "grad_norm": 2.5736472606658936, "learning_rate": 2.819446678941126e-06, "loss": 0.0853, "step": 920 }, { "epoch": 1.9775401069518717, "grad_norm": 2.792259693145752, "learning_rate": 2.7676625440331756e-06, "loss": 0.08, "step": 925 }, { "epoch": 1.988235294117647, "grad_norm": 2.7590394020080566, "learning_rate": 2.7161757663734012e-06, "loss": 0.0876, "step": 930 }, { "epoch": 1.9989304812834225, "grad_norm": 3.141918420791626, "learning_rate": 2.6649932042288994e-06, "loss": 0.0874, "step": 935 } ], "logging_steps": 5, "max_steps": 1404, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.244589888375357e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }