| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9745042492917846, | |
| "eval_steps": 500, | |
| "global_step": 264, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0113314447592068, | |
| "grad_norm": 24.700892966997085, | |
| "learning_rate": 1.8518518518518518e-07, | |
| "loss": 1.6228, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0226628895184136, | |
| "grad_norm": 25.319873190341358, | |
| "learning_rate": 3.7037037037037036e-07, | |
| "loss": 1.6327, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0339943342776204, | |
| "grad_norm": 24.785410230478583, | |
| "learning_rate": 5.555555555555555e-07, | |
| "loss": 1.6085, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0453257790368272, | |
| "grad_norm": 24.530949684513566, | |
| "learning_rate": 7.407407407407407e-07, | |
| "loss": 1.5921, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.056657223796033995, | |
| "grad_norm": 21.437475590002645, | |
| "learning_rate": 9.259259259259259e-07, | |
| "loss": 1.4876, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0679886685552408, | |
| "grad_norm": 23.598738552751918, | |
| "learning_rate": 1.111111111111111e-06, | |
| "loss": 1.5618, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.07932011331444759, | |
| "grad_norm": 24.355821351281282, | |
| "learning_rate": 1.2962962962962962e-06, | |
| "loss": 1.5728, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0906515580736544, | |
| "grad_norm": 17.949944552531893, | |
| "learning_rate": 1.4814814814814815e-06, | |
| "loss": 1.3796, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.10198300283286119, | |
| "grad_norm": 15.608410231061514, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 1.3663, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.11331444759206799, | |
| "grad_norm": 12.967898302916174, | |
| "learning_rate": 1.8518518518518519e-06, | |
| "loss": 1.2852, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12464589235127478, | |
| "grad_norm": 5.5745605303959005, | |
| "learning_rate": 2.037037037037037e-06, | |
| "loss": 1.1179, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.1359773371104816, | |
| "grad_norm": 5.003567410730206, | |
| "learning_rate": 2.222222222222222e-06, | |
| "loss": 1.1428, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.14730878186968838, | |
| "grad_norm": 4.580788434319785, | |
| "learning_rate": 2.4074074074074075e-06, | |
| "loss": 1.1386, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.15864022662889518, | |
| "grad_norm": 4.208213518050668, | |
| "learning_rate": 2.5925925925925925e-06, | |
| "loss": 1.1171, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.16997167138810199, | |
| "grad_norm": 3.6924260397238076, | |
| "learning_rate": 2.7777777777777783e-06, | |
| "loss": 0.9945, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1813031161473088, | |
| "grad_norm": 4.282401825239783, | |
| "learning_rate": 2.962962962962963e-06, | |
| "loss": 1.0298, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.19263456090651557, | |
| "grad_norm": 3.998142705803431, | |
| "learning_rate": 3.1481481481481483e-06, | |
| "loss": 0.998, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.20396600566572237, | |
| "grad_norm": 3.652954638326853, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 1.0252, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.21529745042492918, | |
| "grad_norm": 3.1106976788005833, | |
| "learning_rate": 3.5185185185185187e-06, | |
| "loss": 0.948, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.22662889518413598, | |
| "grad_norm": 2.3583842636458874, | |
| "learning_rate": 3.7037037037037037e-06, | |
| "loss": 0.9141, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.23796033994334279, | |
| "grad_norm": 2.331383902586234, | |
| "learning_rate": 3.88888888888889e-06, | |
| "loss": 0.91, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.24929178470254956, | |
| "grad_norm": 2.3290851154332155, | |
| "learning_rate": 4.074074074074074e-06, | |
| "loss": 0.92, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.26062322946175637, | |
| "grad_norm": 2.088852025013323, | |
| "learning_rate": 4.2592592592592596e-06, | |
| "loss": 0.8521, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2719546742209632, | |
| "grad_norm": 1.8360508671725202, | |
| "learning_rate": 4.444444444444444e-06, | |
| "loss": 0.8143, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.28328611898017, | |
| "grad_norm": 1.7885693902522186, | |
| "learning_rate": 4.62962962962963e-06, | |
| "loss": 0.8486, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.29461756373937675, | |
| "grad_norm": 1.9031889309290633, | |
| "learning_rate": 4.814814814814815e-06, | |
| "loss": 0.834, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.3059490084985836, | |
| "grad_norm": 1.6679534815550068, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8224, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.31728045325779036, | |
| "grad_norm": 1.6886754763902796, | |
| "learning_rate": 4.999780362391087e-06, | |
| "loss": 0.8133, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.3286118980169972, | |
| "grad_norm": 1.8004804802995344, | |
| "learning_rate": 4.9991214881568884e-06, | |
| "loss": 0.8145, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.33994334277620397, | |
| "grad_norm": 1.7444546855121879, | |
| "learning_rate": 4.998023493068255e-06, | |
| "loss": 0.8028, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.35127478753541075, | |
| "grad_norm": 1.6882783810092117, | |
| "learning_rate": 4.996486570053999e-06, | |
| "loss": 0.784, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.3626062322946176, | |
| "grad_norm": 1.7424306321156553, | |
| "learning_rate": 4.994510989166998e-06, | |
| "loss": 0.802, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.37393767705382436, | |
| "grad_norm": 1.5326860366155606, | |
| "learning_rate": 4.99209709753674e-06, | |
| "loss": 0.7578, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.38526912181303113, | |
| "grad_norm": 1.645466819050736, | |
| "learning_rate": 4.9892453193083354e-06, | |
| "loss": 0.7715, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.39660056657223797, | |
| "grad_norm": 1.8351805495790603, | |
| "learning_rate": 4.9859561555679835e-06, | |
| "loss": 0.7516, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.40793201133144474, | |
| "grad_norm": 1.5892721538246122, | |
| "learning_rate": 4.982230184254934e-06, | |
| "loss": 0.7658, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.4192634560906516, | |
| "grad_norm": 1.493020596244931, | |
| "learning_rate": 4.978068060059929e-06, | |
| "loss": 0.7676, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.43059490084985835, | |
| "grad_norm": 1.502680841198379, | |
| "learning_rate": 4.9734705143101744e-06, | |
| "loss": 0.7674, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.44192634560906513, | |
| "grad_norm": 1.526315536964418, | |
| "learning_rate": 4.968438354840834e-06, | |
| "loss": 0.747, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.45325779036827196, | |
| "grad_norm": 1.771193783955554, | |
| "learning_rate": 4.962972465853087e-06, | |
| "loss": 0.8251, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.46458923512747874, | |
| "grad_norm": 1.4751654383068609, | |
| "learning_rate": 4.9570738077587635e-06, | |
| "loss": 0.7587, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.47592067988668557, | |
| "grad_norm": 1.498540884039298, | |
| "learning_rate": 4.950743417011591e-06, | |
| "loss": 0.8311, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.48725212464589235, | |
| "grad_norm": 1.4059209239064798, | |
| "learning_rate": 4.9439824059250794e-06, | |
| "loss": 0.7655, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.4985835694050991, | |
| "grad_norm": 1.4871399277100446, | |
| "learning_rate": 4.936791962477076e-06, | |
| "loss": 0.7358, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.509915014164306, | |
| "grad_norm": 1.4257000290167645, | |
| "learning_rate": 4.929173350101025e-06, | |
| "loss": 0.7163, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5212464589235127, | |
| "grad_norm": 1.4659192682033193, | |
| "learning_rate": 4.921127907463972e-06, | |
| "loss": 0.7061, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.5325779036827195, | |
| "grad_norm": 1.484618377703729, | |
| "learning_rate": 4.912657048231343e-06, | |
| "loss": 0.7651, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.5439093484419264, | |
| "grad_norm": 1.4947896835576073, | |
| "learning_rate": 4.903762260818552e-06, | |
| "loss": 0.7311, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.5552407932011332, | |
| "grad_norm": 1.3805335802092924, | |
| "learning_rate": 4.89444510812947e-06, | |
| "loss": 0.7327, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.56657223796034, | |
| "grad_norm": 1.5190953752467466, | |
| "learning_rate": 4.884707227281807e-06, | |
| "loss": 0.772, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5779036827195467, | |
| "grad_norm": 1.5965731476318468, | |
| "learning_rate": 4.874550329319457e-06, | |
| "loss": 0.698, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.5892351274787535, | |
| "grad_norm": 1.4556721950898377, | |
| "learning_rate": 4.863976198911845e-06, | |
| "loss": 0.7267, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.6005665722379604, | |
| "grad_norm": 1.548206103166572, | |
| "learning_rate": 4.852986694040347e-06, | |
| "loss": 0.7188, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.6118980169971672, | |
| "grad_norm": 1.5029231129419343, | |
| "learning_rate": 4.84158374567182e-06, | |
| "loss": 0.7452, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.623229461756374, | |
| "grad_norm": 1.3962918545527763, | |
| "learning_rate": 4.829769357419317e-06, | |
| "loss": 0.7117, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.6345609065155807, | |
| "grad_norm": 1.3990946918305756, | |
| "learning_rate": 4.817545605190026e-06, | |
| "loss": 0.6797, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.6458923512747875, | |
| "grad_norm": 1.4406458676366833, | |
| "learning_rate": 4.804914636820517e-06, | |
| "loss": 0.7229, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.6572237960339944, | |
| "grad_norm": 1.4774647700554635, | |
| "learning_rate": 4.791878671699343e-06, | |
| "loss": 0.7117, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.6685552407932012, | |
| "grad_norm": 1.5167039884678175, | |
| "learning_rate": 4.77844000037707e-06, | |
| "loss": 0.7401, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.6798866855524079, | |
| "grad_norm": 1.427117682796941, | |
| "learning_rate": 4.764600984163809e-06, | |
| "loss": 0.7299, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.6912181303116147, | |
| "grad_norm": 1.4111273048659723, | |
| "learning_rate": 4.750364054714302e-06, | |
| "loss": 0.6947, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.7025495750708215, | |
| "grad_norm": 1.4031005513050043, | |
| "learning_rate": 4.735731713600665e-06, | |
| "loss": 0.7104, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.7138810198300283, | |
| "grad_norm": 1.3908217678623624, | |
| "learning_rate": 4.72070653187283e-06, | |
| "loss": 0.7215, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.7252124645892352, | |
| "grad_norm": 1.4016377075250317, | |
| "learning_rate": 4.705291149606787e-06, | |
| "loss": 0.6801, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.7365439093484419, | |
| "grad_norm": 1.373368498937934, | |
| "learning_rate": 4.6894882754406965e-06, | |
| "loss": 0.7115, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.7478753541076487, | |
| "grad_norm": 1.3083514867883594, | |
| "learning_rate": 4.673300686098957e-06, | |
| "loss": 0.6944, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.7592067988668555, | |
| "grad_norm": 1.408923716819617, | |
| "learning_rate": 4.6567312259043e-06, | |
| "loss": 0.7166, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.7705382436260623, | |
| "grad_norm": 1.4446149604718292, | |
| "learning_rate": 4.639782806278021e-06, | |
| "loss": 0.7643, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.7818696883852692, | |
| "grad_norm": 1.472987719405972, | |
| "learning_rate": 4.622458405228411e-06, | |
| "loss": 0.6748, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.7932011331444759, | |
| "grad_norm": 1.3910737638781718, | |
| "learning_rate": 4.604761066827485e-06, | |
| "loss": 0.6599, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8045325779036827, | |
| "grad_norm": 1.4486804488805007, | |
| "learning_rate": 4.586693900676116e-06, | |
| "loss": 0.6844, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.8158640226628895, | |
| "grad_norm": 1.4339732853429255, | |
| "learning_rate": 4.568260081357644e-06, | |
| "loss": 0.6934, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.8271954674220963, | |
| "grad_norm": 1.4071241403434436, | |
| "learning_rate": 4.549462847880066e-06, | |
| "loss": 0.7042, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.8385269121813032, | |
| "grad_norm": 1.381067168557849, | |
| "learning_rate": 4.5303055031069165e-06, | |
| "loss": 0.6594, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.8498583569405099, | |
| "grad_norm": 1.370636257051823, | |
| "learning_rate": 4.510791413176912e-06, | |
| "loss": 0.7339, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.8611898016997167, | |
| "grad_norm": 1.3933507828709049, | |
| "learning_rate": 4.490924006912497e-06, | |
| "loss": 0.7319, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.8725212464589235, | |
| "grad_norm": 1.4434701376432475, | |
| "learning_rate": 4.470706775217355e-06, | |
| "loss": 0.7235, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.8838526912181303, | |
| "grad_norm": 1.3230684123602419, | |
| "learning_rate": 4.450143270463031e-06, | |
| "loss": 0.6653, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.8951841359773371, | |
| "grad_norm": 1.377425868827391, | |
| "learning_rate": 4.429237105864735e-06, | |
| "loss": 0.6929, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.9065155807365439, | |
| "grad_norm": 1.4118910710643473, | |
| "learning_rate": 4.407991954846471e-06, | |
| "loss": 0.6713, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9178470254957507, | |
| "grad_norm": 1.2911698771129907, | |
| "learning_rate": 4.386411550395576e-06, | |
| "loss": 0.6828, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.9291784702549575, | |
| "grad_norm": 1.3062416329678588, | |
| "learning_rate": 4.364499684406796e-06, | |
| "loss": 0.6902, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.9405099150141643, | |
| "grad_norm": 1.4212108714610014, | |
| "learning_rate": 4.3422602070160116e-06, | |
| "loss": 0.7139, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.9518413597733711, | |
| "grad_norm": 1.4022201237779546, | |
| "learning_rate": 4.319697025923736e-06, | |
| "loss": 0.696, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.9631728045325779, | |
| "grad_norm": 1.4000079484179047, | |
| "learning_rate": 4.296814105708482e-06, | |
| "loss": 0.6978, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.9745042492917847, | |
| "grad_norm": 1.3135869027950182, | |
| "learning_rate": 4.273615467130156e-06, | |
| "loss": 0.7094, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.9858356940509915, | |
| "grad_norm": 1.5103342340296206, | |
| "learning_rate": 4.250105186423564e-06, | |
| "loss": 0.6864, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.9971671388101983, | |
| "grad_norm": 1.5395763391294515, | |
| "learning_rate": 4.226287394582176e-06, | |
| "loss": 0.6997, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.5395763391294515, | |
| "learning_rate": 4.202166276632274e-06, | |
| "loss": 0.7015, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.0113314447592068, | |
| "grad_norm": 2.9632436977275973, | |
| "learning_rate": 4.177746070897593e-06, | |
| "loss": 0.6146, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0226628895184136, | |
| "grad_norm": 1.3701820521458077, | |
| "learning_rate": 4.15303106825461e-06, | |
| "loss": 0.5952, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.0339943342776203, | |
| "grad_norm": 1.4232251740156472, | |
| "learning_rate": 4.128025611378594e-06, | |
| "loss": 0.6013, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.045325779036827, | |
| "grad_norm": 1.4243450179098962, | |
| "learning_rate": 4.10273409398055e-06, | |
| "loss": 0.5838, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.056657223796034, | |
| "grad_norm": 1.3024426134525557, | |
| "learning_rate": 4.077160960035207e-06, | |
| "loss": 0.5719, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.0679886685552409, | |
| "grad_norm": 1.346350514168427, | |
| "learning_rate": 4.051310703000155e-06, | |
| "loss": 0.5969, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.0793201133144477, | |
| "grad_norm": 1.370884384362504, | |
| "learning_rate": 4.025187865026311e-06, | |
| "loss": 0.6079, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.0906515580736544, | |
| "grad_norm": 1.462001933003737, | |
| "learning_rate": 3.998797036159813e-06, | |
| "loss": 0.6286, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.1019830028328612, | |
| "grad_norm": 1.332600598608553, | |
| "learning_rate": 3.972142853535499e-06, | |
| "loss": 0.606, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.113314447592068, | |
| "grad_norm": 1.426061501851815, | |
| "learning_rate": 3.945230000562121e-06, | |
| "loss": 0.6109, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.1246458923512748, | |
| "grad_norm": 1.4004325571146066, | |
| "learning_rate": 3.918063206099421e-06, | |
| "loss": 0.62, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.1359773371104815, | |
| "grad_norm": 1.3396518626669338, | |
| "learning_rate": 3.890647243627218e-06, | |
| "loss": 0.5934, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.1473087818696883, | |
| "grad_norm": 1.380208956021839, | |
| "learning_rate": 3.862986930406669e-06, | |
| "loss": 0.5968, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.158640226628895, | |
| "grad_norm": 1.3961154205163853, | |
| "learning_rate": 3.83508712663382e-06, | |
| "loss": 0.6032, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.1699716713881019, | |
| "grad_norm": 1.3448453940648393, | |
| "learning_rate": 3.8069527345856233e-06, | |
| "loss": 0.5915, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.1813031161473089, | |
| "grad_norm": 1.3902990971135147, | |
| "learning_rate": 3.7785886977585562e-06, | |
| "loss": 0.5918, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.1926345609065157, | |
| "grad_norm": 1.3820749771088052, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.5969, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.2039660056657224, | |
| "grad_norm": 1.4716970228552981, | |
| "learning_rate": 3.7211916646325315e-06, | |
| "loss": 0.5941, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.2152974504249292, | |
| "grad_norm": 1.34235642577707, | |
| "learning_rate": 3.6921687535712657e-06, | |
| "loss": 0.5803, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.226628895184136, | |
| "grad_norm": 1.360958822094796, | |
| "learning_rate": 3.662936366434435e-06, | |
| "loss": 0.5882, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.2379603399433428, | |
| "grad_norm": 1.3865643140417971, | |
| "learning_rate": 3.6334996396473298e-06, | |
| "loss": 0.6127, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.2492917847025495, | |
| "grad_norm": 1.3096840589094985, | |
| "learning_rate": 3.6038637455397802e-06, | |
| "loss": 0.5703, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.2606232294617563, | |
| "grad_norm": 1.4346753864435622, | |
| "learning_rate": 3.57403389143732e-06, | |
| "loss": 0.5997, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.271954674220963, | |
| "grad_norm": 1.4377459031956494, | |
| "learning_rate": 3.5440153187462146e-06, | |
| "loss": 0.6251, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.28328611898017, | |
| "grad_norm": 1.4225200803648703, | |
| "learning_rate": 3.513813302032485e-06, | |
| "loss": 0.6202, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.2946175637393766, | |
| "grad_norm": 1.3836498692303638, | |
| "learning_rate": 3.4834331480951213e-06, | |
| "loss": 0.5944, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.3059490084985836, | |
| "grad_norm": 1.281401506633823, | |
| "learning_rate": 3.4528801950336177e-06, | |
| "loss": 0.551, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.3172804532577904, | |
| "grad_norm": 1.4771037032630314, | |
| "learning_rate": 3.4221598113100196e-06, | |
| "loss": 0.6072, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.3286118980169972, | |
| "grad_norm": 1.4652916781647747, | |
| "learning_rate": 3.391277394805628e-06, | |
| "loss": 0.6166, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.339943342776204, | |
| "grad_norm": 1.3590014582336747, | |
| "learning_rate": 3.3602383718725363e-06, | |
| "loss": 0.5753, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.3512747875354107, | |
| "grad_norm": 1.3644484003880029, | |
| "learning_rate": 3.32904819638017e-06, | |
| "loss": 0.5892, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.3626062322946175, | |
| "grad_norm": 1.3191831153419338, | |
| "learning_rate": 3.2977123487569816e-06, | |
| "loss": 0.5624, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.3739376770538243, | |
| "grad_norm": 1.3319312452343077, | |
| "learning_rate": 3.2662363350274874e-06, | |
| "loss": 0.5851, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.385269121813031, | |
| "grad_norm": 1.4257384239422966, | |
| "learning_rate": 3.234625685844803e-06, | |
| "loss": 0.5893, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.3966005665722379, | |
| "grad_norm": 1.3953828272396132, | |
| "learning_rate": 3.202885955518849e-06, | |
| "loss": 0.5973, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.4079320113314449, | |
| "grad_norm": 1.395900408790709, | |
| "learning_rate": 3.171022721040409e-06, | |
| "loss": 0.588, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.4192634560906516, | |
| "grad_norm": 1.4082770595189713, | |
| "learning_rate": 3.139041581101187e-06, | |
| "loss": 0.5955, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.4305949008498584, | |
| "grad_norm": 1.4111017631505742, | |
| "learning_rate": 3.10694815511007e-06, | |
| "loss": 0.6304, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.4419263456090652, | |
| "grad_norm": 1.3684496285444403, | |
| "learning_rate": 3.0747480822057342e-06, | |
| "loss": 0.5895, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.453257790368272, | |
| "grad_norm": 1.3150808865823653, | |
| "learning_rate": 3.0424470202657953e-06, | |
| "loss": 0.577, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.4645892351274787, | |
| "grad_norm": 1.4075517143230738, | |
| "learning_rate": 3.0100506449126622e-06, | |
| "loss": 0.5939, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.4759206798866855, | |
| "grad_norm": 1.3153882543446243, | |
| "learning_rate": 2.9775646485162697e-06, | |
| "loss": 0.5735, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.4872521246458923, | |
| "grad_norm": 1.3348680664842318, | |
| "learning_rate": 2.9449947391938768e-06, | |
| "loss": 0.625, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.498583569405099, | |
| "grad_norm": 1.3489224682673129, | |
| "learning_rate": 2.9123466398070855e-06, | |
| "loss": 0.5981, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.509915014164306, | |
| "grad_norm": 1.3429701772744527, | |
| "learning_rate": 2.8796260869562865e-06, | |
| "loss": 0.5887, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.5212464589235126, | |
| "grad_norm": 1.4722400862474931, | |
| "learning_rate": 2.8468388299726714e-06, | |
| "loss": 0.5831, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.5325779036827196, | |
| "grad_norm": 1.2672960818674879, | |
| "learning_rate": 2.8139906299080205e-06, | |
| "loss": 0.5825, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.5439093484419264, | |
| "grad_norm": 1.3627343997731545, | |
| "learning_rate": 2.781087258522431e-06, | |
| "loss": 0.5832, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.5552407932011332, | |
| "grad_norm": 1.2876772239163903, | |
| "learning_rate": 2.7481344972701545e-06, | |
| "loss": 0.5531, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.56657223796034, | |
| "grad_norm": 1.3472454325008387, | |
| "learning_rate": 2.7151381362837424e-06, | |
| "loss": 0.5842, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.5779036827195467, | |
| "grad_norm": 1.3762098724750873, | |
| "learning_rate": 2.682103973356659e-06, | |
| "loss": 0.5712, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.5892351274787535, | |
| "grad_norm": 1.3492795625450165, | |
| "learning_rate": 2.64903781292455e-06, | |
| "loss": 0.5782, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.6005665722379603, | |
| "grad_norm": 1.3499587695758297, | |
| "learning_rate": 2.615945465045346e-06, | |
| "loss": 0.5669, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.6118980169971673, | |
| "grad_norm": 1.3511311195079772, | |
| "learning_rate": 2.5828327443783775e-06, | |
| "loss": 0.551, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.6232294617563738, | |
| "grad_norm": 1.33090787217115, | |
| "learning_rate": 2.5497054691626754e-06, | |
| "loss": 0.579, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.6345609065155808, | |
| "grad_norm": 1.3670590438796546, | |
| "learning_rate": 2.5165694601946566e-06, | |
| "loss": 0.5959, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.6458923512747874, | |
| "grad_norm": 1.3299516191015563, | |
| "learning_rate": 2.483430539805344e-06, | |
| "loss": 0.5979, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.6572237960339944, | |
| "grad_norm": 1.4129085459444395, | |
| "learning_rate": 2.4502945308373246e-06, | |
| "loss": 0.585, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.6685552407932012, | |
| "grad_norm": 1.4275247309590513, | |
| "learning_rate": 2.4171672556216237e-06, | |
| "loss": 0.576, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.679886685552408, | |
| "grad_norm": 1.3619200458990444, | |
| "learning_rate": 2.3840545349546538e-06, | |
| "loss": 0.5841, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.6912181303116147, | |
| "grad_norm": 1.3819435365787816, | |
| "learning_rate": 2.3509621870754505e-06, | |
| "loss": 0.5685, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.7025495750708215, | |
| "grad_norm": 1.3294738919196907, | |
| "learning_rate": 2.317896026643341e-06, | |
| "loss": 0.5871, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.7138810198300283, | |
| "grad_norm": 1.2730007734790987, | |
| "learning_rate": 2.2848618637162584e-06, | |
| "loss": 0.5592, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.725212464589235, | |
| "grad_norm": 1.3785771168476042, | |
| "learning_rate": 2.2518655027298468e-06, | |
| "loss": 0.577, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.736543909348442, | |
| "grad_norm": 1.3530333929384266, | |
| "learning_rate": 2.21891274147757e-06, | |
| "loss": 0.5503, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.7478753541076486, | |
| "grad_norm": 1.4267187662110872, | |
| "learning_rate": 2.1860093700919804e-06, | |
| "loss": 0.6071, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.7592067988668556, | |
| "grad_norm": 1.3951110903420234, | |
| "learning_rate": 2.15316117002733e-06, | |
| "loss": 0.5629, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.7705382436260622, | |
| "grad_norm": 1.3033237896515593, | |
| "learning_rate": 2.1203739130437147e-06, | |
| "loss": 0.5452, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.7818696883852692, | |
| "grad_norm": 1.380166287245991, | |
| "learning_rate": 2.0876533601929153e-06, | |
| "loss": 0.5811, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.793201133144476, | |
| "grad_norm": 1.2839541967552655, | |
| "learning_rate": 2.055005260806125e-06, | |
| "loss": 0.5672, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.8045325779036827, | |
| "grad_norm": 1.3067862009338267, | |
| "learning_rate": 2.0224353514837307e-06, | |
| "loss": 0.5683, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.8158640226628895, | |
| "grad_norm": 1.3243283509277737, | |
| "learning_rate": 1.989949355087339e-06, | |
| "loss": 0.5689, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.8271954674220963, | |
| "grad_norm": 1.3356790286830134, | |
| "learning_rate": 1.957552979734205e-06, | |
| "loss": 0.5802, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.8385269121813033, | |
| "grad_norm": 1.2892603884545701, | |
| "learning_rate": 1.9252519177942666e-06, | |
| "loss": 0.5692, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.8498583569405098, | |
| "grad_norm": 1.4440754700865919, | |
| "learning_rate": 1.8930518448899304e-06, | |
| "loss": 0.5965, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.8611898016997168, | |
| "grad_norm": 1.387136746836695, | |
| "learning_rate": 1.8609584188988135e-06, | |
| "loss": 0.5736, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.8725212464589234, | |
| "grad_norm": 1.1917095159893407, | |
| "learning_rate": 1.8289772789595917e-06, | |
| "loss": 0.6144, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.8838526912181304, | |
| "grad_norm": 1.375742481693197, | |
| "learning_rate": 1.7971140444811514e-06, | |
| "loss": 0.5763, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.8951841359773371, | |
| "grad_norm": 1.2404940123916632, | |
| "learning_rate": 1.7653743141551983e-06, | |
| "loss": 0.6063, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.906515580736544, | |
| "grad_norm": 1.3989557605487426, | |
| "learning_rate": 1.7337636649725132e-06, | |
| "loss": 0.5892, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.9178470254957507, | |
| "grad_norm": 1.2710248336513368, | |
| "learning_rate": 1.7022876512430197e-06, | |
| "loss": 0.5813, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.9291784702549575, | |
| "grad_norm": 1.2716638071587159, | |
| "learning_rate": 1.6709518036198307e-06, | |
| "loss": 0.5565, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.9405099150141643, | |
| "grad_norm": 1.277667152903096, | |
| "learning_rate": 1.6397616281274648e-06, | |
| "loss": 0.5727, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.951841359773371, | |
| "grad_norm": 1.296382463166991, | |
| "learning_rate": 1.6087226051943728e-06, | |
| "loss": 0.593, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.963172804532578, | |
| "grad_norm": 1.2844540181786357, | |
| "learning_rate": 1.5778401886899808e-06, | |
| "loss": 0.5841, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.9745042492917846, | |
| "grad_norm": 1.3053433701687789, | |
| "learning_rate": 1.5471198049663822e-06, | |
| "loss": 0.575, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.9858356940509916, | |
| "grad_norm": 1.268811845033466, | |
| "learning_rate": 1.51656685190488e-06, | |
| "loss": 0.588, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.9971671388101981, | |
| "grad_norm": 1.247792131322831, | |
| "learning_rate": 1.4861866979675155e-06, | |
| "loss": 0.5534, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.247792131322831, | |
| "learning_rate": 1.455984681253787e-06, | |
| "loss": 0.5438, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 2.011331444759207, | |
| "grad_norm": 2.7694057916801307, | |
| "learning_rate": 1.4259661085626802e-06, | |
| "loss": 0.5062, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 2.0226628895184136, | |
| "grad_norm": 1.2786953899807016, | |
| "learning_rate": 1.3961362544602215e-06, | |
| "loss": 0.4878, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0339943342776206, | |
| "grad_norm": 1.3338942724237564, | |
| "learning_rate": 1.3665003603526706e-06, | |
| "loss": 0.5131, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 2.045325779036827, | |
| "grad_norm": 1.3381680372557467, | |
| "learning_rate": 1.3370636335655656e-06, | |
| "loss": 0.4976, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 2.056657223796034, | |
| "grad_norm": 1.2513390840039769, | |
| "learning_rate": 1.3078312464287355e-06, | |
| "loss": 0.5211, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.0679886685552407, | |
| "grad_norm": 1.2770532289534442, | |
| "learning_rate": 1.2788083353674694e-06, | |
| "loss": 0.5007, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 2.0793201133144477, | |
| "grad_norm": 1.2927230496024624, | |
| "learning_rate": 1.2500000000000007e-06, | |
| "loss": 0.4622, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.090651558073654, | |
| "grad_norm": 1.2508343832786988, | |
| "learning_rate": 1.2214113022414448e-06, | |
| "loss": 0.4844, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.101983002832861, | |
| "grad_norm": 1.207057675657945, | |
| "learning_rate": 1.1930472654143777e-06, | |
| "loss": 0.4948, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 2.113314447592068, | |
| "grad_norm": 1.2515484350526327, | |
| "learning_rate": 1.1649128733661802e-06, | |
| "loss": 0.4975, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 2.1246458923512748, | |
| "grad_norm": 1.2820103087615313, | |
| "learning_rate": 1.1370130695933317e-06, | |
| "loss": 0.5033, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.1359773371104818, | |
| "grad_norm": 1.3883649461446737, | |
| "learning_rate": 1.1093527563727827e-06, | |
| "loss": 0.4959, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.1473087818696883, | |
| "grad_norm": 1.2985788114536188, | |
| "learning_rate": 1.0819367939005802e-06, | |
| "loss": 0.5109, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 2.1586402266288953, | |
| "grad_norm": 1.3965326476598117, | |
| "learning_rate": 1.0547699994378787e-06, | |
| "loss": 0.4812, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.169971671388102, | |
| "grad_norm": 1.3349013266585352, | |
| "learning_rate": 1.0278571464645013e-06, | |
| "loss": 0.4926, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 2.181303116147309, | |
| "grad_norm": 1.3212505511291743, | |
| "learning_rate": 1.0012029638401871e-06, | |
| "loss": 0.4882, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 2.1926345609065154, | |
| "grad_norm": 1.30500171720855, | |
| "learning_rate": 9.74812134973689e-07, | |
| "loss": 0.5173, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.2039660056657224, | |
| "grad_norm": 1.2488171277934157, | |
| "learning_rate": 9.486892969998465e-07, | |
| "loss": 0.482, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 2.215297450424929, | |
| "grad_norm": 1.2854653346406788, | |
| "learning_rate": 9.228390399647944e-07, | |
| "loss": 0.5015, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 2.226628895184136, | |
| "grad_norm": 1.323794953427552, | |
| "learning_rate": 8.972659060194505e-07, | |
| "loss": 0.4735, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.237960339943343, | |
| "grad_norm": 1.3376265040553337, | |
| "learning_rate": 8.719743886214071e-07, | |
| "loss": 0.4875, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 2.2492917847025495, | |
| "grad_norm": 1.3285002033736215, | |
| "learning_rate": 8.469689317453907e-07, | |
| "loss": 0.4962, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.2606232294617565, | |
| "grad_norm": 1.2555976735704084, | |
| "learning_rate": 8.222539291024079e-07, | |
| "loss": 0.5005, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.271954674220963, | |
| "grad_norm": 1.2410255920694668, | |
| "learning_rate": 7.978337233677269e-07, | |
| "loss": 0.4882, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 2.28328611898017, | |
| "grad_norm": 1.2713376284380098, | |
| "learning_rate": 7.737126054178238e-07, | |
| "loss": 0.4739, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 2.2946175637393766, | |
| "grad_norm": 1.2772097228177208, | |
| "learning_rate": 7.49894813576437e-07, | |
| "loss": 0.4652, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.3059490084985836, | |
| "grad_norm": 1.3985331094997697, | |
| "learning_rate": 7.26384532869844e-07, | |
| "loss": 0.4983, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.31728045325779, | |
| "grad_norm": 1.2385757104958386, | |
| "learning_rate": 7.031858942915187e-07, | |
| "loss": 0.4848, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.328611898016997, | |
| "grad_norm": 1.298621814561205, | |
| "learning_rate": 6.803029740762648e-07, | |
| "loss": 0.499, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.3399433427762037, | |
| "grad_norm": 1.1982592548193622, | |
| "learning_rate": 6.577397929839891e-07, | |
| "loss": 0.5074, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.3512747875354107, | |
| "grad_norm": 1.3230369234558008, | |
| "learning_rate": 6.355003155932052e-07, | |
| "loss": 0.5082, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.3626062322946177, | |
| "grad_norm": 1.2353352817463088, | |
| "learning_rate": 6.135884496044245e-07, | |
| "loss": 0.5024, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.3739376770538243, | |
| "grad_norm": 1.2866413763490911, | |
| "learning_rate": 5.920080451535296e-07, | |
| "loss": 0.5158, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.3852691218130313, | |
| "grad_norm": 1.366484849292798, | |
| "learning_rate": 5.707628941352655e-07, | |
| "loss": 0.5068, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.396600566572238, | |
| "grad_norm": 1.287762702589202, | |
| "learning_rate": 5.4985672953697e-07, | |
| "loss": 0.4563, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.407932011331445, | |
| "grad_norm": 1.2310856033033788, | |
| "learning_rate": 5.292932247826449e-07, | |
| "loss": 0.5104, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.4192634560906514, | |
| "grad_norm": 1.2722457695575675, | |
| "learning_rate": 5.090759930875039e-07, | |
| "loss": 0.4745, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.4305949008498584, | |
| "grad_norm": 1.1881380406354796, | |
| "learning_rate": 4.892085868230881e-07, | |
| "loss": 0.4684, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.441926345609065, | |
| "grad_norm": 1.2279337924506066, | |
| "learning_rate": 4.696944968930847e-07, | |
| "loss": 0.4766, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.453257790368272, | |
| "grad_norm": 1.2965879507443776, | |
| "learning_rate": 4.505371521199342e-07, | |
| "loss": 0.4887, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.4645892351274785, | |
| "grad_norm": 1.2726278849407309, | |
| "learning_rate": 4.317399186423574e-07, | |
| "loss": 0.49, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.4759206798866855, | |
| "grad_norm": 1.2083763664947262, | |
| "learning_rate": 4.1330609932388493e-07, | |
| "loss": 0.4714, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.4872521246458925, | |
| "grad_norm": 1.2790819302434515, | |
| "learning_rate": 3.9523893317251624e-07, | |
| "loss": 0.4924, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.498583569405099, | |
| "grad_norm": 1.2824144839819291, | |
| "learning_rate": 3.7754159477158994e-07, | |
| "loss": 0.4969, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.509915014164306, | |
| "grad_norm": 1.2322829650550764, | |
| "learning_rate": 3.602171937219789e-07, | |
| "loss": 0.4922, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.5212464589235126, | |
| "grad_norm": 1.2932932363502108, | |
| "learning_rate": 3.4326877409570083e-07, | |
| "loss": 0.5135, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.5325779036827196, | |
| "grad_norm": 1.3265821035298209, | |
| "learning_rate": 3.266993139010438e-07, | |
| "loss": 0.4824, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.543909348441926, | |
| "grad_norm": 1.2634627344428153, | |
| "learning_rate": 3.1051172455930395e-07, | |
| "loss": 0.4756, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.555240793201133, | |
| "grad_norm": 1.2955666908690064, | |
| "learning_rate": 2.947088503932136e-07, | |
| "loss": 0.49, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.56657223796034, | |
| "grad_norm": 1.2966827797243252, | |
| "learning_rate": 2.792934681271708e-07, | |
| "loss": 0.5022, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.5779036827195467, | |
| "grad_norm": 1.294488037545825, | |
| "learning_rate": 2.642682863993354e-07, | |
| "loss": 0.4995, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.5892351274787533, | |
| "grad_norm": 1.2899319442218364, | |
| "learning_rate": 2.4963594528569835e-07, | |
| "loss": 0.5022, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.6005665722379603, | |
| "grad_norm": 1.2411992299049828, | |
| "learning_rate": 2.3539901583619186e-07, | |
| "loss": 0.4815, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.6118980169971673, | |
| "grad_norm": 1.290057331367847, | |
| "learning_rate": 2.2155999962293035e-07, | |
| "loss": 0.4777, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.623229461756374, | |
| "grad_norm": 1.336160271366626, | |
| "learning_rate": 2.081213283006575e-07, | |
| "loss": 0.4814, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.634560906515581, | |
| "grad_norm": 1.2513839029803793, | |
| "learning_rate": 1.9508536317948358e-07, | |
| "loss": 0.4871, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.6458923512747874, | |
| "grad_norm": 1.212015408532025, | |
| "learning_rate": 1.824543948099744e-07, | |
| "loss": 0.4726, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.6572237960339944, | |
| "grad_norm": 1.2427046825799253, | |
| "learning_rate": 1.702306425806838e-07, | |
| "loss": 0.4807, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.668555240793201, | |
| "grad_norm": 1.2778407829783978, | |
| "learning_rate": 1.584162543281806e-07, | |
| "loss": 0.4957, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.679886685552408, | |
| "grad_norm": 1.226886722658913, | |
| "learning_rate": 1.4701330595965401e-07, | |
| "loss": 0.4898, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.691218130311615, | |
| "grad_norm": 1.300360670595622, | |
| "learning_rate": 1.3602380108815537e-07, | |
| "loss": 0.4841, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.7025495750708215, | |
| "grad_norm": 1.313490684903286, | |
| "learning_rate": 1.2544967068054332e-07, | |
| "loss": 0.4954, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.713881019830028, | |
| "grad_norm": 1.2920263932419636, | |
| "learning_rate": 1.152927727181935e-07, | |
| "loss": 0.5249, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.725212464589235, | |
| "grad_norm": 1.3447396451038305, | |
| "learning_rate": 1.0555489187053097e-07, | |
| "loss": 0.5207, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.736543909348442, | |
| "grad_norm": 1.3527682966828949, | |
| "learning_rate": 9.623773918144896e-08, | |
| "loss": 0.5077, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.7478753541076486, | |
| "grad_norm": 1.368471140884032, | |
| "learning_rate": 8.734295176865748e-08, | |
| "loss": 0.5081, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.7592067988668556, | |
| "grad_norm": 1.278186747537926, | |
| "learning_rate": 7.88720925360284e-08, | |
| "loss": 0.4944, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.770538243626062, | |
| "grad_norm": 1.337416906476136, | |
| "learning_rate": 7.082664989897486e-08, | |
| "loss": 0.4764, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.781869688385269, | |
| "grad_norm": 1.280766158552745, | |
| "learning_rate": 6.320803752292465e-08, | |
| "loss": 0.4567, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.7932011331444757, | |
| "grad_norm": 1.1973951885363563, | |
| "learning_rate": 5.601759407492108e-08, | |
| "loss": 0.4896, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.8045325779036827, | |
| "grad_norm": 1.3298617699728477, | |
| "learning_rate": 4.9256582988409795e-08, | |
| "loss": 0.5015, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.8158640226628897, | |
| "grad_norm": 1.2840493283766243, | |
| "learning_rate": 4.292619224123717e-08, | |
| "loss": 0.4702, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.8271954674220963, | |
| "grad_norm": 1.2753413052551887, | |
| "learning_rate": 3.702753414691368e-08, | |
| "loss": 0.4677, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.8385269121813033, | |
| "grad_norm": 1.2669874412423119, | |
| "learning_rate": 3.15616451591666e-08, | |
| "loss": 0.485, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.84985835694051, | |
| "grad_norm": 1.2670916015981173, | |
| "learning_rate": 2.6529485689825996e-08, | |
| "loss": 0.4979, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.861189801699717, | |
| "grad_norm": 1.3102429493654797, | |
| "learning_rate": 2.1931939940071368e-08, | |
| "loss": 0.4719, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.8725212464589234, | |
| "grad_norm": 1.2716849784011235, | |
| "learning_rate": 1.7769815745066476e-08, | |
| "loss": 0.5018, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.8838526912181304, | |
| "grad_norm": 1.2711203429937163, | |
| "learning_rate": 1.4043844432016507e-08, | |
| "loss": 0.5098, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.8951841359773374, | |
| "grad_norm": 1.3331489651068749, | |
| "learning_rate": 1.0754680691665299e-08, | |
| "loss": 0.4731, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.906515580736544, | |
| "grad_norm": 1.3288134823029176, | |
| "learning_rate": 7.90290246326042e-09, | |
| "loss": 0.5416, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.9178470254957505, | |
| "grad_norm": 1.2953779393999099, | |
| "learning_rate": 5.489010833002739e-09, | |
| "loss": 0.4851, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.9291784702549575, | |
| "grad_norm": 1.3104504966732855, | |
| "learning_rate": 3.51342994600129e-09, | |
| "loss": 0.5082, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.9405099150141645, | |
| "grad_norm": 1.2404192298354901, | |
| "learning_rate": 1.976506931745392e-09, | |
| "loss": 0.4797, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.951841359773371, | |
| "grad_norm": 1.25894417590066, | |
| "learning_rate": 8.78511843112051e-10, | |
| "loss": 0.5091, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.963172804532578, | |
| "grad_norm": 1.3062783600910168, | |
| "learning_rate": 2.1963760891391406e-10, | |
| "loss": 0.495, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.9745042492917846, | |
| "grad_norm": 1.2953858272939929, | |
| "learning_rate": 0.0, | |
| "loss": 0.4696, | |
| "step": 264 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 264, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 28634663264256.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |