{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9983579638752053, "eval_steps": 500, "global_step": 171, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0058383506659368724, "grad_norm": 23.346039635432966, "learning_rate": 0.0, "loss": 1.0216, "step": 1 }, { "epoch": 0.011676701331873745, "grad_norm": 18.69440800650612, "learning_rate": 5.555555555555555e-07, "loss": 1.0159, "step": 2 }, { "epoch": 0.01751505199781062, "grad_norm": 14.923328604146274, "learning_rate": 1.111111111111111e-06, "loss": 1.0972, "step": 3 }, { "epoch": 0.02335340266374749, "grad_norm": 25.385980327121796, "learning_rate": 1.6666666666666667e-06, "loss": 1.295, "step": 4 }, { "epoch": 0.029191753329684364, "grad_norm": 18.16503547733233, "learning_rate": 2.222222222222222e-06, "loss": 0.9012, "step": 5 }, { "epoch": 0.03503010399562124, "grad_norm": 53.35453317315207, "learning_rate": 2.7777777777777783e-06, "loss": 1.0288, "step": 6 }, { "epoch": 0.04086845466155811, "grad_norm": 3.9479906393452455, "learning_rate": 3.3333333333333333e-06, "loss": 0.8768, "step": 7 }, { "epoch": 0.04670680532749498, "grad_norm": 3.526815627243875, "learning_rate": 3.88888888888889e-06, "loss": 1.0274, "step": 8 }, { "epoch": 0.052545155993431854, "grad_norm": 2.9003505340702893, "learning_rate": 4.444444444444444e-06, "loss": 0.9328, "step": 9 }, { "epoch": 0.05838350665936873, "grad_norm": 3.6872295301025293, "learning_rate": 5e-06, "loss": 1.1684, "step": 10 }, { "epoch": 0.0642218573253056, "grad_norm": 4.385584615681111, "learning_rate": 5.555555555555557e-06, "loss": 1.0925, "step": 11 }, { "epoch": 0.07006020799124248, "grad_norm": 3.013525965990989, "learning_rate": 6.111111111111112e-06, "loss": 0.8918, "step": 12 }, { "epoch": 0.07589855865717934, "grad_norm": 2.7201676256210483, "learning_rate": 6.666666666666667e-06, "loss": 0.9572, "step": 13 }, { "epoch": 0.08173690932311622, "grad_norm": 2.823242289265648, "learning_rate": 7.222222222222223e-06, "loss": 0.8408, "step": 14 }, { "epoch": 0.08757525998905309, "grad_norm": 2.0560673624265435, "learning_rate": 7.77777777777778e-06, "loss": 0.7294, "step": 15 }, { "epoch": 0.09341361065498996, "grad_norm": 2.827884964059297, "learning_rate": 8.333333333333334e-06, "loss": 0.9351, "step": 16 }, { "epoch": 0.09925196132092684, "grad_norm": 2.353407743397884, "learning_rate": 8.888888888888888e-06, "loss": 0.8883, "step": 17 }, { "epoch": 0.10509031198686371, "grad_norm": 2.5476305018155885, "learning_rate": 9.444444444444445e-06, "loss": 0.8612, "step": 18 }, { "epoch": 0.11092866265280059, "grad_norm": 2.5549072265785018, "learning_rate": 1e-05, "loss": 0.8514, "step": 19 }, { "epoch": 0.11676701331873746, "grad_norm": 3.374243976367514, "learning_rate": 9.998945997517957e-06, "loss": 1.0582, "step": 20 }, { "epoch": 0.12260536398467432, "grad_norm": 3.1687760719813345, "learning_rate": 9.99578443444032e-06, "loss": 0.8147, "step": 21 }, { "epoch": 0.1284437146506112, "grad_norm": 1.917729903158416, "learning_rate": 9.990516643685222e-06, "loss": 0.7036, "step": 22 }, { "epoch": 0.13428206531654807, "grad_norm": 3.080654926506411, "learning_rate": 9.983144846158472e-06, "loss": 1.1692, "step": 23 }, { "epoch": 0.14012041598248495, "grad_norm": 2.1635641035874635, "learning_rate": 9.973672149817232e-06, "loss": 0.7105, "step": 24 }, { "epoch": 0.14595876664842183, "grad_norm": 1.8290404225487313, "learning_rate": 9.96210254835968e-06, "loss": 0.7225, "step": 25 }, { "epoch": 0.1517971173143587, "grad_norm": 2.3343564258608533, "learning_rate": 9.948440919541277e-06, "loss": 0.938, "step": 26 }, { "epoch": 0.15763546798029557, "grad_norm": 2.2111438984922045, "learning_rate": 9.932693023118299e-06, "loss": 0.9262, "step": 27 }, { "epoch": 0.16347381864623245, "grad_norm": 2.1134981775332373, "learning_rate": 9.91486549841951e-06, "loss": 0.8067, "step": 28 }, { "epoch": 0.1693121693121693, "grad_norm": 1.8305245997733364, "learning_rate": 9.894965861547023e-06, "loss": 0.8317, "step": 29 }, { "epoch": 0.17515051997810618, "grad_norm": 2.236853620521483, "learning_rate": 9.873002502207502e-06, "loss": 0.7304, "step": 30 }, { "epoch": 0.18098887064404307, "grad_norm": 2.3611233354670347, "learning_rate": 9.848984680175049e-06, "loss": 0.8728, "step": 31 }, { "epoch": 0.18682722130997992, "grad_norm": 4.000809003005407, "learning_rate": 9.822922521387277e-06, "loss": 0.7416, "step": 32 }, { "epoch": 0.1926655719759168, "grad_norm": 1.8812162065176985, "learning_rate": 9.794827013676206e-06, "loss": 0.7018, "step": 33 }, { "epoch": 0.19850392264185368, "grad_norm": 1.6923974873335865, "learning_rate": 9.764710002135784e-06, "loss": 0.6455, "step": 34 }, { "epoch": 0.20434227330779056, "grad_norm": 2.8035506463841533, "learning_rate": 9.732584184127973e-06, "loss": 1.0517, "step": 35 }, { "epoch": 0.21018062397372742, "grad_norm": 1.5840802349672924, "learning_rate": 9.698463103929542e-06, "loss": 0.6771, "step": 36 }, { "epoch": 0.2160189746396643, "grad_norm": 2.3024892626964544, "learning_rate": 9.66236114702178e-06, "loss": 0.8554, "step": 37 }, { "epoch": 0.22185732530560118, "grad_norm": 2.0003910939871803, "learning_rate": 9.62429353402556e-06, "loss": 0.8013, "step": 38 }, { "epoch": 0.22769567597153803, "grad_norm": 1.9517674635934668, "learning_rate": 9.584276314284316e-06, "loss": 0.8374, "step": 39 }, { "epoch": 0.2335340266374749, "grad_norm": 2.131535143692049, "learning_rate": 9.542326359097619e-06, "loss": 0.8691, "step": 40 }, { "epoch": 0.2393723773034118, "grad_norm": 1.791753421500237, "learning_rate": 9.498461354608228e-06, "loss": 0.6558, "step": 41 }, { "epoch": 0.24521072796934865, "grad_norm": 1.7285954496576958, "learning_rate": 9.452699794345583e-06, "loss": 0.7824, "step": 42 }, { "epoch": 0.2510490786352855, "grad_norm": 1.5567965071977108, "learning_rate": 9.405060971428924e-06, "loss": 0.639, "step": 43 }, { "epoch": 0.2568874293012224, "grad_norm": 1.4375142273921275, "learning_rate": 9.355564970433288e-06, "loss": 0.6775, "step": 44 }, { "epoch": 0.2627257799671593, "grad_norm": 1.67716239348873, "learning_rate": 9.30423265892184e-06, "loss": 0.7687, "step": 45 }, { "epoch": 0.26856413063309614, "grad_norm": 2.1560741267757697, "learning_rate": 9.251085678648072e-06, "loss": 0.9605, "step": 46 }, { "epoch": 0.274402481299033, "grad_norm": 2.4945601087320424, "learning_rate": 9.196146436431635e-06, "loss": 1.0254, "step": 47 }, { "epoch": 0.2802408319649699, "grad_norm": 1.9131551064591947, "learning_rate": 9.13943809471159e-06, "loss": 0.8508, "step": 48 }, { "epoch": 0.28607918263090676, "grad_norm": 1.4757864890468595, "learning_rate": 9.08098456178111e-06, "loss": 0.6386, "step": 49 }, { "epoch": 0.29191753329684367, "grad_norm": 2.0882336434748994, "learning_rate": 9.020810481707709e-06, "loss": 0.8752, "step": 50 }, { "epoch": 0.2977558839627805, "grad_norm": 1.4257729762626692, "learning_rate": 8.958941223943292e-06, "loss": 0.5883, "step": 51 }, { "epoch": 0.3035942346287174, "grad_norm": 2.0617578833829646, "learning_rate": 8.895402872628352e-06, "loss": 0.8631, "step": 52 }, { "epoch": 0.3094325852946543, "grad_norm": 2.0387352545759696, "learning_rate": 8.83022221559489e-06, "loss": 0.7665, "step": 53 }, { "epoch": 0.31527093596059114, "grad_norm": 1.6825440700596306, "learning_rate": 8.763426733072624e-06, "loss": 0.7121, "step": 54 }, { "epoch": 0.321109286626528, "grad_norm": 1.7039787938294297, "learning_rate": 8.695044586103297e-06, "loss": 0.7544, "step": 55 }, { "epoch": 0.3269476372924649, "grad_norm": 1.747764255109329, "learning_rate": 8.625104604667965e-06, "loss": 0.7664, "step": 56 }, { "epoch": 0.33278598795840175, "grad_norm": 1.4189105312955703, "learning_rate": 8.553636275532236e-06, "loss": 0.5921, "step": 57 }, { "epoch": 0.3386243386243386, "grad_norm": 1.7728898990705366, "learning_rate": 8.480669729814635e-06, "loss": 0.6721, "step": 58 }, { "epoch": 0.3444626892902755, "grad_norm": 1.6614414969326399, "learning_rate": 8.40623573028327e-06, "loss": 0.7133, "step": 59 }, { "epoch": 0.35030103995621237, "grad_norm": 1.8451715614266795, "learning_rate": 8.330365658386252e-06, "loss": 0.8457, "step": 60 }, { "epoch": 0.3561393906221492, "grad_norm": 1.569948888332489, "learning_rate": 8.25309150102121e-06, "loss": 0.7208, "step": 61 }, { "epoch": 0.36197774128808613, "grad_norm": 1.556596504048036, "learning_rate": 8.174445837049614e-06, "loss": 0.7143, "step": 62 }, { "epoch": 0.367816091954023, "grad_norm": 1.792684008076691, "learning_rate": 8.094461823561473e-06, "loss": 0.7932, "step": 63 }, { "epoch": 0.37365444261995984, "grad_norm": 1.8602705054299433, "learning_rate": 8.013173181896283e-06, "loss": 0.805, "step": 64 }, { "epoch": 0.37949279328589675, "grad_norm": 1.971836720979613, "learning_rate": 7.930614183426074e-06, "loss": 0.7688, "step": 65 }, { "epoch": 0.3853311439518336, "grad_norm": 1.4036541372691438, "learning_rate": 7.846819635106569e-06, "loss": 0.701, "step": 66 }, { "epoch": 0.39116949461777045, "grad_norm": 1.3144866978901222, "learning_rate": 7.76182486480253e-06, "loss": 0.5913, "step": 67 }, { "epoch": 0.39700784528370736, "grad_norm": 1.7802596312579286, "learning_rate": 7.675665706393502e-06, "loss": 0.8198, "step": 68 }, { "epoch": 0.4028461959496442, "grad_norm": 2.4200518553212413, "learning_rate": 7.588378484666214e-06, "loss": 0.9622, "step": 69 }, { "epoch": 0.4086845466155811, "grad_norm": 2.0689999772599594, "learning_rate": 7.500000000000001e-06, "loss": 0.8649, "step": 70 }, { "epoch": 0.414522897281518, "grad_norm": 1.454146433084145, "learning_rate": 7.4105675128517456e-06, "loss": 0.6499, "step": 71 }, { "epoch": 0.42036124794745483, "grad_norm": 1.9735117012184902, "learning_rate": 7.320118728046818e-06, "loss": 0.8629, "step": 72 }, { "epoch": 0.42619959861339174, "grad_norm": 1.3269967533943117, "learning_rate": 7.2286917788826926e-06, "loss": 0.6302, "step": 73 }, { "epoch": 0.4320379492793286, "grad_norm": 2.1156499352177467, "learning_rate": 7.136325211051905e-06, "loss": 0.953, "step": 74 }, { "epoch": 0.43787629994526545, "grad_norm": 1.6661986473128974, "learning_rate": 7.043057966391158e-06, "loss": 0.7642, "step": 75 }, { "epoch": 0.44371465061120235, "grad_norm": 1.3168655506535973, "learning_rate": 6.948929366463397e-06, "loss": 0.5953, "step": 76 }, { "epoch": 0.4495530012771392, "grad_norm": 1.542487982540137, "learning_rate": 6.8539790959798045e-06, "loss": 0.6802, "step": 77 }, { "epoch": 0.45539135194307606, "grad_norm": 1.9826739527814456, "learning_rate": 6.758247186068684e-06, "loss": 0.87, "step": 78 }, { "epoch": 0.46122970260901297, "grad_norm": 1.6743878429099177, "learning_rate": 6.6617739973982985e-06, "loss": 0.7126, "step": 79 }, { "epoch": 0.4670680532749498, "grad_norm": 1.6060875322085453, "learning_rate": 6.5646002031607726e-06, "loss": 0.7116, "step": 80 }, { "epoch": 0.4729064039408867, "grad_norm": 1.4970094698253724, "learning_rate": 6.466766771924231e-06, "loss": 0.7887, "step": 81 }, { "epoch": 0.4787447546068236, "grad_norm": 1.4205978407297999, "learning_rate": 6.368314950360416e-06, "loss": 0.6496, "step": 82 }, { "epoch": 0.48458310527276044, "grad_norm": 1.8875915736340494, "learning_rate": 6.269286245855039e-06, "loss": 0.9423, "step": 83 }, { "epoch": 0.4904214559386973, "grad_norm": 1.551811827102774, "learning_rate": 6.169722409008244e-06, "loss": 0.7458, "step": 84 }, { "epoch": 0.4962598066046342, "grad_norm": 1.5216093240427255, "learning_rate": 6.0696654160324875e-06, "loss": 0.7234, "step": 85 }, { "epoch": 0.502098157270571, "grad_norm": 2.0664851747739608, "learning_rate": 5.9691574510553505e-06, "loss": 0.8706, "step": 86 }, { "epoch": 0.5079365079365079, "grad_norm": 1.3755000837322797, "learning_rate": 5.8682408883346535e-06, "loss": 0.6907, "step": 87 }, { "epoch": 0.5137748586024448, "grad_norm": 2.0452056254353668, "learning_rate": 5.766958274393428e-06, "loss": 0.9291, "step": 88 }, { "epoch": 0.5196132092683817, "grad_norm": 2.0322847397035653, "learning_rate": 5.66535231008227e-06, "loss": 0.9572, "step": 89 }, { "epoch": 0.5254515599343186, "grad_norm": 1.4742200753435941, "learning_rate": 5.5634658325766066e-06, "loss": 0.6378, "step": 90 }, { "epoch": 0.5312899106002554, "grad_norm": 1.4173205118801524, "learning_rate": 5.46134179731651e-06, "loss": 0.7323, "step": 91 }, { "epoch": 0.5371282612661923, "grad_norm": 2.1798164249882626, "learning_rate": 5.359023259896638e-06, "loss": 1.0747, "step": 92 }, { "epoch": 0.5429666119321291, "grad_norm": 1.1947784867302391, "learning_rate": 5.2565533579139484e-06, "loss": 0.596, "step": 93 }, { "epoch": 0.548804962598066, "grad_norm": 1.351691353184327, "learning_rate": 5.153975292780852e-06, "loss": 0.7037, "step": 94 }, { "epoch": 0.554643313264003, "grad_norm": 1.821832384011939, "learning_rate": 5.05133231151145e-06, "loss": 0.8998, "step": 95 }, { "epoch": 0.5604816639299398, "grad_norm": 1.5639613554205625, "learning_rate": 4.948667688488552e-06, "loss": 0.7998, "step": 96 }, { "epoch": 0.5663200145958767, "grad_norm": 1.4103317767531058, "learning_rate": 4.846024707219149e-06, "loss": 0.6575, "step": 97 }, { "epoch": 0.5721583652618135, "grad_norm": 1.7200282821490869, "learning_rate": 4.7434466420860515e-06, "loss": 0.8301, "step": 98 }, { "epoch": 0.5779967159277504, "grad_norm": 1.9713093143708447, "learning_rate": 4.640976740103363e-06, "loss": 0.9142, "step": 99 }, { "epoch": 0.5838350665936873, "grad_norm": 1.7128758244015903, "learning_rate": 4.53865820268349e-06, "loss": 0.7895, "step": 100 }, { "epoch": 0.5896734172596242, "grad_norm": 1.799486256542215, "learning_rate": 4.436534167423395e-06, "loss": 0.8077, "step": 101 }, { "epoch": 0.595511767925561, "grad_norm": 1.4390066868776004, "learning_rate": 4.334647689917734e-06, "loss": 0.7538, "step": 102 }, { "epoch": 0.6013501185914979, "grad_norm": 1.562773521063493, "learning_rate": 4.233041725606573e-06, "loss": 0.7073, "step": 103 }, { "epoch": 0.6071884692574347, "grad_norm": 1.3659053235335727, "learning_rate": 4.131759111665349e-06, "loss": 0.6003, "step": 104 }, { "epoch": 0.6130268199233716, "grad_norm": 1.8759691152378297, "learning_rate": 4.03084254894465e-06, "loss": 0.8311, "step": 105 }, { "epoch": 0.6188651705893086, "grad_norm": 2.0594639991765367, "learning_rate": 3.930334583967514e-06, "loss": 1.1216, "step": 106 }, { "epoch": 0.6247035212552454, "grad_norm": 1.317441590353052, "learning_rate": 3.8302775909917585e-06, "loss": 0.7016, "step": 107 }, { "epoch": 0.6305418719211823, "grad_norm": 1.604492101282993, "learning_rate": 3.730713754144961e-06, "loss": 0.7752, "step": 108 }, { "epoch": 0.6363802225871191, "grad_norm": 1.2737569729242313, "learning_rate": 3.6316850496395863e-06, "loss": 0.7015, "step": 109 }, { "epoch": 0.642218573253056, "grad_norm": 2.1188338693809214, "learning_rate": 3.5332332280757706e-06, "loss": 0.8263, "step": 110 }, { "epoch": 0.6480569239189928, "grad_norm": 1.5348448926667313, "learning_rate": 3.4353997968392295e-06, "loss": 0.7388, "step": 111 }, { "epoch": 0.6538952745849298, "grad_norm": 1.3745471248589747, "learning_rate": 3.3382260026017027e-06, "loss": 0.6576, "step": 112 }, { "epoch": 0.6597336252508667, "grad_norm": 1.5116712690331373, "learning_rate": 3.241752813931316e-06, "loss": 0.8259, "step": 113 }, { "epoch": 0.6655719759168035, "grad_norm": 1.420711782360687, "learning_rate": 3.1460209040201967e-06, "loss": 0.7073, "step": 114 }, { "epoch": 0.6714103265827404, "grad_norm": 1.1843506681593685, "learning_rate": 3.0510706335366034e-06, "loss": 0.5862, "step": 115 }, { "epoch": 0.6772486772486772, "grad_norm": 1.4452829978036497, "learning_rate": 2.956942033608843e-06, "loss": 0.7392, "step": 116 }, { "epoch": 0.6830870279146142, "grad_norm": 1.6148556395277947, "learning_rate": 2.863674788948097e-06, "loss": 0.7822, "step": 117 }, { "epoch": 0.688925378580551, "grad_norm": 1.6171370218368475, "learning_rate": 2.771308221117309e-06, "loss": 0.774, "step": 118 }, { "epoch": 0.6947637292464879, "grad_norm": 1.3721101508521718, "learning_rate": 2.6798812719531843e-06, "loss": 0.6439, "step": 119 }, { "epoch": 0.7006020799124247, "grad_norm": 1.5513076884934305, "learning_rate": 2.5894324871482557e-06, "loss": 0.7712, "step": 120 }, { "epoch": 0.7064404305783616, "grad_norm": 1.3608399248886607, "learning_rate": 2.5000000000000015e-06, "loss": 0.6666, "step": 121 }, { "epoch": 0.7122787812442984, "grad_norm": 1.4546065360955336, "learning_rate": 2.411621515333788e-06, "loss": 0.7305, "step": 122 }, { "epoch": 0.7181171319102354, "grad_norm": 1.50756294259041, "learning_rate": 2.324334293606499e-06, "loss": 0.8454, "step": 123 }, { "epoch": 0.7239554825761723, "grad_norm": 1.430596947679965, "learning_rate": 2.238175135197471e-06, "loss": 0.6996, "step": 124 }, { "epoch": 0.7297938332421091, "grad_norm": 1.336144492021404, "learning_rate": 2.1531803648934333e-06, "loss": 0.692, "step": 125 }, { "epoch": 0.735632183908046, "grad_norm": 1.7715157306940374, "learning_rate": 2.069385816573928e-06, "loss": 0.8395, "step": 126 }, { "epoch": 0.7414705345739828, "grad_norm": 1.5456925788355012, "learning_rate": 1.9868268181037186e-06, "loss": 0.7025, "step": 127 }, { "epoch": 0.7473088852399197, "grad_norm": 1.3679386356577312, "learning_rate": 1.9055381764385272e-06, "loss": 0.6619, "step": 128 }, { "epoch": 0.7531472359058566, "grad_norm": 1.5918300174062097, "learning_rate": 1.8255541629503865e-06, "loss": 0.8029, "step": 129 }, { "epoch": 0.7589855865717935, "grad_norm": 1.469146037966741, "learning_rate": 1.746908498978791e-06, "loss": 0.6717, "step": 130 }, { "epoch": 0.7648239372377303, "grad_norm": 1.328289890211526, "learning_rate": 1.6696343416137495e-06, "loss": 0.6729, "step": 131 }, { "epoch": 0.7706622879036672, "grad_norm": 1.6152433920164602, "learning_rate": 1.5937642697167288e-06, "loss": 0.8082, "step": 132 }, { "epoch": 0.776500638569604, "grad_norm": 1.6972098871026293, "learning_rate": 1.5193302701853674e-06, "loss": 0.8128, "step": 133 }, { "epoch": 0.7823389892355409, "grad_norm": 1.8868147982818937, "learning_rate": 1.4463637244677648e-06, "loss": 1.0077, "step": 134 }, { "epoch": 0.7881773399014779, "grad_norm": 1.533794614028789, "learning_rate": 1.374895395332037e-06, "loss": 0.7204, "step": 135 }, { "epoch": 0.7940156905674147, "grad_norm": 1.1485993201785478, "learning_rate": 1.3049554138967052e-06, "loss": 0.5837, "step": 136 }, { "epoch": 0.7998540412333516, "grad_norm": 1.9049649848237078, "learning_rate": 1.2365732669273778e-06, "loss": 0.7, "step": 137 }, { "epoch": 0.8056923918992884, "grad_norm": 1.5878122929966123, "learning_rate": 1.1697777844051105e-06, "loss": 0.7382, "step": 138 }, { "epoch": 0.8115307425652253, "grad_norm": 1.4709273722338752, "learning_rate": 1.1045971273716476e-06, "loss": 0.7135, "step": 139 }, { "epoch": 0.8173690932311622, "grad_norm": 1.433559790573233, "learning_rate": 1.0410587760567104e-06, "loss": 0.7693, "step": 140 }, { "epoch": 0.8232074438970991, "grad_norm": 1.52758209016128, "learning_rate": 9.791895182922911e-07, "loss": 0.7034, "step": 141 }, { "epoch": 0.829045794563036, "grad_norm": 1.9400739394519702, "learning_rate": 9.190154382188921e-07, "loss": 0.8852, "step": 142 }, { "epoch": 0.8348841452289728, "grad_norm": 1.958208722988338, "learning_rate": 8.605619052884106e-07, "loss": 0.9601, "step": 143 }, { "epoch": 0.8407224958949097, "grad_norm": 1.4807510002946493, "learning_rate": 8.03853563568367e-07, "loss": 0.7411, "step": 144 }, { "epoch": 0.8465608465608465, "grad_norm": 1.3793677468152326, "learning_rate": 7.489143213519301e-07, "loss": 0.7921, "step": 145 }, { "epoch": 0.8523991972267835, "grad_norm": 1.4895621691570022, "learning_rate": 6.957673410781617e-07, "loss": 0.7503, "step": 146 }, { "epoch": 0.8582375478927203, "grad_norm": 1.7116219153256789, "learning_rate": 6.444350295667112e-07, "loss": 0.7971, "step": 147 }, { "epoch": 0.8640758985586572, "grad_norm": 1.5797223164404173, "learning_rate": 5.949390285710777e-07, "loss": 0.8277, "step": 148 }, { "epoch": 0.869914249224594, "grad_norm": 1.3148568307618735, "learning_rate": 5.473002056544191e-07, "loss": 0.6685, "step": 149 }, { "epoch": 0.8757525998905309, "grad_norm": 1.4107080603490212, "learning_rate": 5.015386453917742e-07, "loss": 0.6838, "step": 150 }, { "epoch": 0.8815909505564677, "grad_norm": 1.5629187051898064, "learning_rate": 4.576736409023813e-07, "loss": 0.7233, "step": 151 }, { "epoch": 0.8874293012224047, "grad_norm": 1.4669164529598444, "learning_rate": 4.15723685715686e-07, "loss": 0.836, "step": 152 }, { "epoch": 0.8932676518883416, "grad_norm": 1.454872607521009, "learning_rate": 3.7570646597444196e-07, "loss": 0.7642, "step": 153 }, { "epoch": 0.8991060025542784, "grad_norm": 1.6759220531326142, "learning_rate": 3.3763885297822153e-07, "loss": 0.8544, "step": 154 }, { "epoch": 0.9049443532202153, "grad_norm": 1.7087176753461515, "learning_rate": 3.015368960704584e-07, "loss": 0.7322, "step": 155 }, { "epoch": 0.9107827038861521, "grad_norm": 1.6388339629990647, "learning_rate": 2.6741581587202747e-07, "loss": 0.8009, "step": 156 }, { "epoch": 0.9166210545520891, "grad_norm": 1.4635363517587032, "learning_rate": 2.3528999786421758e-07, "loss": 0.7706, "step": 157 }, { "epoch": 0.9224594052180259, "grad_norm": 1.6021455462502143, "learning_rate": 2.0517298632379445e-07, "loss": 0.7973, "step": 158 }, { "epoch": 0.9282977558839628, "grad_norm": 1.5367883855881703, "learning_rate": 1.770774786127244e-07, "loss": 0.7287, "step": 159 }, { "epoch": 0.9341361065498996, "grad_norm": 1.2425259572983562, "learning_rate": 1.510153198249531e-07, "loss": 0.6761, "step": 160 }, { "epoch": 0.9399744572158365, "grad_norm": 1.359745226364094, "learning_rate": 1.2699749779249926e-07, "loss": 0.6421, "step": 161 }, { "epoch": 0.9458128078817734, "grad_norm": 1.4246913880107532, "learning_rate": 1.0503413845297739e-07, "loss": 0.8001, "step": 162 }, { "epoch": 0.9516511585477103, "grad_norm": 1.2590352425153586, "learning_rate": 8.513450158049109e-08, "loss": 0.723, "step": 163 }, { "epoch": 0.9574895092136472, "grad_norm": 1.694689146111032, "learning_rate": 6.730697688170251e-08, "loss": 0.8005, "step": 164 }, { "epoch": 0.963327859879584, "grad_norm": 1.3191005186230464, "learning_rate": 5.155908045872349e-08, "loss": 0.667, "step": 165 }, { "epoch": 0.9691662105455209, "grad_norm": 1.4117265002070536, "learning_rate": 3.7897451640321326e-08, "loss": 0.7146, "step": 166 }, { "epoch": 0.9750045612114577, "grad_norm": 1.2794859494757669, "learning_rate": 2.6327850182769065e-08, "loss": 0.6562, "step": 167 }, { "epoch": 0.9808429118773946, "grad_norm": 1.4089880813633835, "learning_rate": 1.6855153841527915e-08, "loss": 0.6582, "step": 168 }, { "epoch": 0.9866812625433315, "grad_norm": 1.4165079720131926, "learning_rate": 9.48335631477948e-09, "loss": 0.7843, "step": 169 }, { "epoch": 0.9925196132092684, "grad_norm": 1.2128283609191615, "learning_rate": 4.2155655596809455e-09, "loss": 0.6164, "step": 170 }, { "epoch": 0.9983579638752053, "grad_norm": 1.2569650088824407, "learning_rate": 1.054002482043237e-09, "loss": 0.6706, "step": 171 } ], "logging_steps": 1, "max_steps": 171, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4050, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 14533949767680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }