{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9983579638752053,
  "eval_steps": 500,
  "global_step": 171,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0058383506659368724,
      "grad_norm": 23.346039635432966,
      "learning_rate": 0.0,
      "loss": 1.0216,
      "step": 1
    },
    {
      "epoch": 0.011676701331873745,
      "grad_norm": 18.69440800650612,
      "learning_rate": 5.555555555555555e-07,
      "loss": 1.0159,
      "step": 2
    },
    {
      "epoch": 0.01751505199781062,
      "grad_norm": 14.923328604146274,
      "learning_rate": 1.111111111111111e-06,
      "loss": 1.0972,
      "step": 3
    },
    {
      "epoch": 0.02335340266374749,
      "grad_norm": 25.385980327121796,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.295,
      "step": 4
    },
    {
      "epoch": 0.029191753329684364,
      "grad_norm": 18.16503547733233,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.9012,
      "step": 5
    },
    {
      "epoch": 0.03503010399562124,
      "grad_norm": 53.35453317315207,
      "learning_rate": 2.7777777777777783e-06,
      "loss": 1.0288,
      "step": 6
    },
    {
      "epoch": 0.04086845466155811,
      "grad_norm": 3.9479906393452455,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.8768,
      "step": 7
    },
    {
      "epoch": 0.04670680532749498,
      "grad_norm": 3.526815627243875,
      "learning_rate": 3.88888888888889e-06,
      "loss": 1.0274,
      "step": 8
    },
    {
      "epoch": 0.052545155993431854,
      "grad_norm": 2.9003505340702893,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.9328,
      "step": 9
    },
    {
      "epoch": 0.05838350665936873,
      "grad_norm": 3.6872295301025293,
      "learning_rate": 5e-06,
      "loss": 1.1684,
      "step": 10
    },
    {
      "epoch": 0.0642218573253056,
      "grad_norm": 4.385584615681111,
      "learning_rate": 5.555555555555557e-06,
      "loss": 1.0925,
      "step": 11
    },
    {
      "epoch": 0.07006020799124248,
      "grad_norm": 3.013525965990989,
      "learning_rate": 6.111111111111112e-06,
      "loss": 0.8918,
      "step": 12
    },
    {
      "epoch": 0.07589855865717934,
      "grad_norm": 2.7201676256210483,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.9572,
      "step": 13
    },
    {
      "epoch": 0.08173690932311622,
      "grad_norm": 2.823242289265648,
      "learning_rate": 7.222222222222223e-06,
      "loss": 0.8408,
      "step": 14
    },
    {
      "epoch": 0.08757525998905309,
      "grad_norm": 2.0560673624265435,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.7294,
      "step": 15
    },
    {
      "epoch": 0.09341361065498996,
      "grad_norm": 2.827884964059297,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.9351,
      "step": 16
    },
    {
      "epoch": 0.09925196132092684,
      "grad_norm": 2.353407743397884,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.8883,
      "step": 17
    },
    {
      "epoch": 0.10509031198686371,
      "grad_norm": 2.5476305018155885,
      "learning_rate": 9.444444444444445e-06,
      "loss": 0.8612,
      "step": 18
    },
    {
      "epoch": 0.11092866265280059,
      "grad_norm": 2.5549072265785018,
      "learning_rate": 1e-05,
      "loss": 0.8514,
      "step": 19
    },
    {
      "epoch": 0.11676701331873746,
      "grad_norm": 3.374243976367514,
      "learning_rate": 9.998945997517957e-06,
      "loss": 1.0582,
      "step": 20
    },
    {
      "epoch": 0.12260536398467432,
      "grad_norm": 3.1687760719813345,
      "learning_rate": 9.99578443444032e-06,
      "loss": 0.8147,
      "step": 21
    },
    {
      "epoch": 0.1284437146506112,
      "grad_norm": 1.917729903158416,
      "learning_rate": 9.990516643685222e-06,
      "loss": 0.7036,
      "step": 22
    },
    {
      "epoch": 0.13428206531654807,
      "grad_norm": 3.080654926506411,
      "learning_rate": 9.983144846158472e-06,
      "loss": 1.1692,
      "step": 23
    },
    {
      "epoch": 0.14012041598248495,
      "grad_norm": 2.1635641035874635,
      "learning_rate": 9.973672149817232e-06,
      "loss": 0.7105,
      "step": 24
    },
    {
      "epoch": 0.14595876664842183,
      "grad_norm": 1.8290404225487313,
      "learning_rate": 9.96210254835968e-06,
      "loss": 0.7225,
      "step": 25
    },
    {
      "epoch": 0.1517971173143587,
      "grad_norm": 2.3343564258608533,
      "learning_rate": 9.948440919541277e-06,
      "loss": 0.938,
      "step": 26
    },
    {
      "epoch": 0.15763546798029557,
      "grad_norm": 2.2111438984922045,
      "learning_rate": 9.932693023118299e-06,
      "loss": 0.9262,
      "step": 27
    },
    {
      "epoch": 0.16347381864623245,
      "grad_norm": 2.1134981775332373,
      "learning_rate": 9.91486549841951e-06,
      "loss": 0.8067,
      "step": 28
    },
    {
      "epoch": 0.1693121693121693,
      "grad_norm": 1.8305245997733364,
      "learning_rate": 9.894965861547023e-06,
      "loss": 0.8317,
      "step": 29
    },
    {
      "epoch": 0.17515051997810618,
      "grad_norm": 2.236853620521483,
      "learning_rate": 9.873002502207502e-06,
      "loss": 0.7304,
      "step": 30
    },
    {
      "epoch": 0.18098887064404307,
      "grad_norm": 2.3611233354670347,
      "learning_rate": 9.848984680175049e-06,
      "loss": 0.8728,
      "step": 31
    },
    {
      "epoch": 0.18682722130997992,
      "grad_norm": 4.000809003005407,
      "learning_rate": 9.822922521387277e-06,
      "loss": 0.7416,
      "step": 32
    },
    {
      "epoch": 0.1926655719759168,
      "grad_norm": 1.8812162065176985,
      "learning_rate": 9.794827013676206e-06,
      "loss": 0.7018,
      "step": 33
    },
    {
      "epoch": 0.19850392264185368,
      "grad_norm": 1.6923974873335865,
      "learning_rate": 9.764710002135784e-06,
      "loss": 0.6455,
      "step": 34
    },
    {
      "epoch": 0.20434227330779056,
      "grad_norm": 2.8035506463841533,
      "learning_rate": 9.732584184127973e-06,
      "loss": 1.0517,
      "step": 35
    },
    {
      "epoch": 0.21018062397372742,
      "grad_norm": 1.5840802349672924,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.6771,
      "step": 36
    },
    {
      "epoch": 0.2160189746396643,
      "grad_norm": 2.3024892626964544,
      "learning_rate": 9.66236114702178e-06,
      "loss": 0.8554,
      "step": 37
    },
    {
      "epoch": 0.22185732530560118,
      "grad_norm": 2.0003910939871803,
      "learning_rate": 9.62429353402556e-06,
      "loss": 0.8013,
      "step": 38
    },
    {
      "epoch": 0.22769567597153803,
      "grad_norm": 1.9517674635934668,
      "learning_rate": 9.584276314284316e-06,
      "loss": 0.8374,
      "step": 39
    },
    {
      "epoch": 0.2335340266374749,
      "grad_norm": 2.131535143692049,
      "learning_rate": 9.542326359097619e-06,
      "loss": 0.8691,
      "step": 40
    },
    {
      "epoch": 0.2393723773034118,
      "grad_norm": 1.791753421500237,
      "learning_rate": 9.498461354608228e-06,
      "loss": 0.6558,
      "step": 41
    },
    {
      "epoch": 0.24521072796934865,
      "grad_norm": 1.7285954496576958,
      "learning_rate": 9.452699794345583e-06,
      "loss": 0.7824,
      "step": 42
    },
    {
      "epoch": 0.2510490786352855,
      "grad_norm": 1.5567965071977108,
      "learning_rate": 9.405060971428924e-06,
      "loss": 0.639,
      "step": 43
    },
    {
      "epoch": 0.2568874293012224,
      "grad_norm": 1.4375142273921275,
      "learning_rate": 9.355564970433288e-06,
      "loss": 0.6775,
      "step": 44
    },
    {
      "epoch": 0.2627257799671593,
      "grad_norm": 1.67716239348873,
      "learning_rate": 9.30423265892184e-06,
      "loss": 0.7687,
      "step": 45
    },
    {
      "epoch": 0.26856413063309614,
      "grad_norm": 2.1560741267757697,
      "learning_rate": 9.251085678648072e-06,
      "loss": 0.9605,
      "step": 46
    },
    {
      "epoch": 0.274402481299033,
      "grad_norm": 2.4945601087320424,
      "learning_rate": 9.196146436431635e-06,
      "loss": 1.0254,
      "step": 47
    },
    {
      "epoch": 0.2802408319649699,
      "grad_norm": 1.9131551064591947,
      "learning_rate": 9.13943809471159e-06,
      "loss": 0.8508,
      "step": 48
    },
    {
      "epoch": 0.28607918263090676,
      "grad_norm": 1.4757864890468595,
      "learning_rate": 9.08098456178111e-06,
      "loss": 0.6386,
      "step": 49
    },
    {
      "epoch": 0.29191753329684367,
      "grad_norm": 2.0882336434748994,
      "learning_rate": 9.020810481707709e-06,
      "loss": 0.8752,
      "step": 50
    },
    {
      "epoch": 0.2977558839627805,
      "grad_norm": 1.4257729762626692,
      "learning_rate": 8.958941223943292e-06,
      "loss": 0.5883,
      "step": 51
    },
    {
      "epoch": 0.3035942346287174,
      "grad_norm": 2.0617578833829646,
      "learning_rate": 8.895402872628352e-06,
      "loss": 0.8631,
      "step": 52
    },
    {
      "epoch": 0.3094325852946543,
      "grad_norm": 2.0387352545759696,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.7665,
      "step": 53
    },
    {
      "epoch": 0.31527093596059114,
      "grad_norm": 1.6825440700596306,
      "learning_rate": 8.763426733072624e-06,
      "loss": 0.7121,
      "step": 54
    },
    {
      "epoch": 0.321109286626528,
      "grad_norm": 1.7039787938294297,
      "learning_rate": 8.695044586103297e-06,
      "loss": 0.7544,
      "step": 55
    },
    {
      "epoch": 0.3269476372924649,
      "grad_norm": 1.747764255109329,
      "learning_rate": 8.625104604667965e-06,
      "loss": 0.7664,
      "step": 56
    },
    {
      "epoch": 0.33278598795840175,
      "grad_norm": 1.4189105312955703,
      "learning_rate": 8.553636275532236e-06,
      "loss": 0.5921,
      "step": 57
    },
    {
      "epoch": 0.3386243386243386,
      "grad_norm": 1.7728898990705366,
      "learning_rate": 8.480669729814635e-06,
      "loss": 0.6721,
      "step": 58
    },
    {
      "epoch": 0.3444626892902755,
      "grad_norm": 1.6614414969326399,
      "learning_rate": 8.40623573028327e-06,
      "loss": 0.7133,
      "step": 59
    },
    {
      "epoch": 0.35030103995621237,
      "grad_norm": 1.8451715614266795,
      "learning_rate": 8.330365658386252e-06,
      "loss": 0.8457,
      "step": 60
    },
    {
      "epoch": 0.3561393906221492,
      "grad_norm": 1.569948888332489,
      "learning_rate": 8.25309150102121e-06,
      "loss": 0.7208,
      "step": 61
    },
    {
      "epoch": 0.36197774128808613,
      "grad_norm": 1.556596504048036,
      "learning_rate": 8.174445837049614e-06,
      "loss": 0.7143,
      "step": 62
    },
    {
      "epoch": 0.367816091954023,
      "grad_norm": 1.792684008076691,
      "learning_rate": 8.094461823561473e-06,
      "loss": 0.7932,
      "step": 63
    },
    {
      "epoch": 0.37365444261995984,
      "grad_norm": 1.8602705054299433,
      "learning_rate": 8.013173181896283e-06,
      "loss": 0.805,
      "step": 64
    },
    {
      "epoch": 0.37949279328589675,
      "grad_norm": 1.971836720979613,
      "learning_rate": 7.930614183426074e-06,
      "loss": 0.7688,
      "step": 65
    },
    {
      "epoch": 0.3853311439518336,
      "grad_norm": 1.4036541372691438,
      "learning_rate": 7.846819635106569e-06,
      "loss": 0.701,
      "step": 66
    },
    {
      "epoch": 0.39116949461777045,
      "grad_norm": 1.3144866978901222,
      "learning_rate": 7.76182486480253e-06,
      "loss": 0.5913,
      "step": 67
    },
    {
      "epoch": 0.39700784528370736,
      "grad_norm": 1.7802596312579286,
      "learning_rate": 7.675665706393502e-06,
      "loss": 0.8198,
      "step": 68
    },
    {
      "epoch": 0.4028461959496442,
      "grad_norm": 2.4200518553212413,
      "learning_rate": 7.588378484666214e-06,
      "loss": 0.9622,
      "step": 69
    },
    {
      "epoch": 0.4086845466155811,
      "grad_norm": 2.0689999772599594,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.8649,
      "step": 70
    },
    {
      "epoch": 0.414522897281518,
      "grad_norm": 1.454146433084145,
      "learning_rate": 7.4105675128517456e-06,
      "loss": 0.6499,
      "step": 71
    },
    {
      "epoch": 0.42036124794745483,
      "grad_norm": 1.9735117012184902,
      "learning_rate": 7.320118728046818e-06,
      "loss": 0.8629,
      "step": 72
    },
    {
      "epoch": 0.42619959861339174,
      "grad_norm": 1.3269967533943117,
      "learning_rate": 7.2286917788826926e-06,
      "loss": 0.6302,
      "step": 73
    },
    {
      "epoch": 0.4320379492793286,
      "grad_norm": 2.1156499352177467,
      "learning_rate": 7.136325211051905e-06,
      "loss": 0.953,
      "step": 74
    },
    {
      "epoch": 0.43787629994526545,
      "grad_norm": 1.6661986473128974,
      "learning_rate": 7.043057966391158e-06,
      "loss": 0.7642,
      "step": 75
    },
    {
      "epoch": 0.44371465061120235,
      "grad_norm": 1.3168655506535973,
      "learning_rate": 6.948929366463397e-06,
      "loss": 0.5953,
      "step": 76
    },
    {
      "epoch": 0.4495530012771392,
      "grad_norm": 1.542487982540137,
      "learning_rate": 6.8539790959798045e-06,
      "loss": 0.6802,
      "step": 77
    },
    {
      "epoch": 0.45539135194307606,
      "grad_norm": 1.9826739527814456,
      "learning_rate": 6.758247186068684e-06,
      "loss": 0.87,
      "step": 78
    },
    {
      "epoch": 0.46122970260901297,
      "grad_norm": 1.6743878429099177,
      "learning_rate": 6.6617739973982985e-06,
      "loss": 0.7126,
      "step": 79
    },
    {
      "epoch": 0.4670680532749498,
      "grad_norm": 1.6060875322085453,
      "learning_rate": 6.5646002031607726e-06,
      "loss": 0.7116,
      "step": 80
    },
    {
      "epoch": 0.4729064039408867,
      "grad_norm": 1.4970094698253724,
      "learning_rate": 6.466766771924231e-06,
      "loss": 0.7887,
      "step": 81
    },
    {
      "epoch": 0.4787447546068236,
      "grad_norm": 1.4205978407297999,
      "learning_rate": 6.368314950360416e-06,
      "loss": 0.6496,
      "step": 82
    },
    {
      "epoch": 0.48458310527276044,
      "grad_norm": 1.8875915736340494,
      "learning_rate": 6.269286245855039e-06,
      "loss": 0.9423,
      "step": 83
    },
    {
      "epoch": 0.4904214559386973,
      "grad_norm": 1.551811827102774,
      "learning_rate": 6.169722409008244e-06,
      "loss": 0.7458,
      "step": 84
    },
    {
      "epoch": 0.4962598066046342,
      "grad_norm": 1.5216093240427255,
      "learning_rate": 6.0696654160324875e-06,
      "loss": 0.7234,
      "step": 85
    },
    {
      "epoch": 0.502098157270571,
      "grad_norm": 2.0664851747739608,
      "learning_rate": 5.9691574510553505e-06,
      "loss": 0.8706,
      "step": 86
    },
    {
      "epoch": 0.5079365079365079,
      "grad_norm": 1.3755000837322797,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.6907,
      "step": 87
    },
    {
      "epoch": 0.5137748586024448,
      "grad_norm": 2.0452056254353668,
      "learning_rate": 5.766958274393428e-06,
      "loss": 0.9291,
      "step": 88
    },
    {
      "epoch": 0.5196132092683817,
      "grad_norm": 2.0322847397035653,
      "learning_rate": 5.66535231008227e-06,
      "loss": 0.9572,
      "step": 89
    },
    {
      "epoch": 0.5254515599343186,
      "grad_norm": 1.4742200753435941,
      "learning_rate": 5.5634658325766066e-06,
      "loss": 0.6378,
      "step": 90
    },
    {
      "epoch": 0.5312899106002554,
      "grad_norm": 1.4173205118801524,
      "learning_rate": 5.46134179731651e-06,
      "loss": 0.7323,
      "step": 91
    },
    {
      "epoch": 0.5371282612661923,
      "grad_norm": 2.1798164249882626,
      "learning_rate": 5.359023259896638e-06,
      "loss": 1.0747,
      "step": 92
    },
    {
      "epoch": 0.5429666119321291,
      "grad_norm": 1.1947784867302391,
      "learning_rate": 5.2565533579139484e-06,
      "loss": 0.596,
      "step": 93
    },
    {
      "epoch": 0.548804962598066,
      "grad_norm": 1.351691353184327,
      "learning_rate": 5.153975292780852e-06,
      "loss": 0.7037,
      "step": 94
    },
    {
      "epoch": 0.554643313264003,
      "grad_norm": 1.821832384011939,
      "learning_rate": 5.05133231151145e-06,
      "loss": 0.8998,
      "step": 95
    },
    {
      "epoch": 0.5604816639299398,
      "grad_norm": 1.5639613554205625,
      "learning_rate": 4.948667688488552e-06,
      "loss": 0.7998,
      "step": 96
    },
    {
      "epoch": 0.5663200145958767,
      "grad_norm": 1.4103317767531058,
      "learning_rate": 4.846024707219149e-06,
      "loss": 0.6575,
      "step": 97
    },
    {
      "epoch": 0.5721583652618135,
      "grad_norm": 1.7200282821490869,
      "learning_rate": 4.7434466420860515e-06,
      "loss": 0.8301,
      "step": 98
    },
    {
      "epoch": 0.5779967159277504,
      "grad_norm": 1.9713093143708447,
      "learning_rate": 4.640976740103363e-06,
      "loss": 0.9142,
      "step": 99
    },
    {
      "epoch": 0.5838350665936873,
      "grad_norm": 1.7128758244015903,
      "learning_rate": 4.53865820268349e-06,
      "loss": 0.7895,
      "step": 100
    },
    {
      "epoch": 0.5896734172596242,
      "grad_norm": 1.799486256542215,
      "learning_rate": 4.436534167423395e-06,
      "loss": 0.8077,
      "step": 101
    },
    {
      "epoch": 0.595511767925561,
      "grad_norm": 1.4390066868776004,
      "learning_rate": 4.334647689917734e-06,
      "loss": 0.7538,
      "step": 102
    },
    {
      "epoch": 0.6013501185914979,
      "grad_norm": 1.562773521063493,
      "learning_rate": 4.233041725606573e-06,
      "loss": 0.7073,
      "step": 103
    },
    {
      "epoch": 0.6071884692574347,
      "grad_norm": 1.3659053235335727,
      "learning_rate": 4.131759111665349e-06,
      "loss": 0.6003,
      "step": 104
    },
    {
      "epoch": 0.6130268199233716,
      "grad_norm": 1.8759691152378297,
      "learning_rate": 4.03084254894465e-06,
      "loss": 0.8311,
      "step": 105
    },
    {
      "epoch": 0.6188651705893086,
      "grad_norm": 2.0594639991765367,
      "learning_rate": 3.930334583967514e-06,
      "loss": 1.1216,
      "step": 106
    },
    {
      "epoch": 0.6247035212552454,
      "grad_norm": 1.317441590353052,
      "learning_rate": 3.8302775909917585e-06,
      "loss": 0.7016,
      "step": 107
    },
    {
      "epoch": 0.6305418719211823,
      "grad_norm": 1.604492101282993,
      "learning_rate": 3.730713754144961e-06,
      "loss": 0.7752,
      "step": 108
    },
    {
      "epoch": 0.6363802225871191,
      "grad_norm": 1.2737569729242313,
      "learning_rate": 3.6316850496395863e-06,
      "loss": 0.7015,
      "step": 109
    },
    {
      "epoch": 0.642218573253056,
      "grad_norm": 2.1188338693809214,
      "learning_rate": 3.5332332280757706e-06,
      "loss": 0.8263,
      "step": 110
    },
    {
      "epoch": 0.6480569239189928,
      "grad_norm": 1.5348448926667313,
      "learning_rate": 3.4353997968392295e-06,
      "loss": 0.7388,
      "step": 111
    },
    {
      "epoch": 0.6538952745849298,
      "grad_norm": 1.3745471248589747,
      "learning_rate": 3.3382260026017027e-06,
      "loss": 0.6576,
      "step": 112
    },
    {
      "epoch": 0.6597336252508667,
      "grad_norm": 1.5116712690331373,
      "learning_rate": 3.241752813931316e-06,
      "loss": 0.8259,
      "step": 113
    },
    {
      "epoch": 0.6655719759168035,
      "grad_norm": 1.420711782360687,
      "learning_rate": 3.1460209040201967e-06,
      "loss": 0.7073,
      "step": 114
    },
    {
      "epoch": 0.6714103265827404,
      "grad_norm": 1.1843506681593685,
      "learning_rate": 3.0510706335366034e-06,
      "loss": 0.5862,
      "step": 115
    },
    {
      "epoch": 0.6772486772486772,
      "grad_norm": 1.4452829978036497,
      "learning_rate": 2.956942033608843e-06,
      "loss": 0.7392,
      "step": 116
    },
    {
      "epoch": 0.6830870279146142,
      "grad_norm": 1.6148556395277947,
      "learning_rate": 2.863674788948097e-06,
      "loss": 0.7822,
      "step": 117
    },
    {
      "epoch": 0.688925378580551,
      "grad_norm": 1.6171370218368475,
      "learning_rate": 2.771308221117309e-06,
      "loss": 0.774,
      "step": 118
    },
    {
      "epoch": 0.6947637292464879,
      "grad_norm": 1.3721101508521718,
      "learning_rate": 2.6798812719531843e-06,
      "loss": 0.6439,
      "step": 119
    },
    {
      "epoch": 0.7006020799124247,
      "grad_norm": 1.5513076884934305,
      "learning_rate": 2.5894324871482557e-06,
      "loss": 0.7712,
      "step": 120
    },
    {
      "epoch": 0.7064404305783616,
      "grad_norm": 1.3608399248886607,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.6666,
      "step": 121
    },
    {
      "epoch": 0.7122787812442984,
      "grad_norm": 1.4546065360955336,
      "learning_rate": 2.411621515333788e-06,
      "loss": 0.7305,
      "step": 122
    },
    {
      "epoch": 0.7181171319102354,
      "grad_norm": 1.50756294259041,
      "learning_rate": 2.324334293606499e-06,
      "loss": 0.8454,
      "step": 123
    },
    {
      "epoch": 0.7239554825761723,
      "grad_norm": 1.430596947679965,
      "learning_rate": 2.238175135197471e-06,
      "loss": 0.6996,
      "step": 124
    },
    {
      "epoch": 0.7297938332421091,
      "grad_norm": 1.336144492021404,
      "learning_rate": 2.1531803648934333e-06,
      "loss": 0.692,
      "step": 125
    },
    {
      "epoch": 0.735632183908046,
      "grad_norm": 1.7715157306940374,
      "learning_rate": 2.069385816573928e-06,
      "loss": 0.8395,
      "step": 126
    },
    {
      "epoch": 0.7414705345739828,
      "grad_norm": 1.5456925788355012,
      "learning_rate": 1.9868268181037186e-06,
      "loss": 0.7025,
      "step": 127
    },
    {
      "epoch": 0.7473088852399197,
      "grad_norm": 1.3679386356577312,
      "learning_rate": 1.9055381764385272e-06,
      "loss": 0.6619,
      "step": 128
    },
    {
      "epoch": 0.7531472359058566,
      "grad_norm": 1.5918300174062097,
      "learning_rate": 1.8255541629503865e-06,
      "loss": 0.8029,
      "step": 129
    },
    {
      "epoch": 0.7589855865717935,
      "grad_norm": 1.469146037966741,
      "learning_rate": 1.746908498978791e-06,
      "loss": 0.6717,
      "step": 130
    },
    {
      "epoch": 0.7648239372377303,
      "grad_norm": 1.328289890211526,
      "learning_rate": 1.6696343416137495e-06,
      "loss": 0.6729,
      "step": 131
    },
    {
      "epoch": 0.7706622879036672,
      "grad_norm": 1.6152433920164602,
      "learning_rate": 1.5937642697167288e-06,
      "loss": 0.8082,
      "step": 132
    },
    {
      "epoch": 0.776500638569604,
      "grad_norm": 1.6972098871026293,
      "learning_rate": 1.5193302701853674e-06,
      "loss": 0.8128,
      "step": 133
    },
    {
      "epoch": 0.7823389892355409,
      "grad_norm": 1.8868147982818937,
      "learning_rate": 1.4463637244677648e-06,
      "loss": 1.0077,
      "step": 134
    },
    {
      "epoch": 0.7881773399014779,
      "grad_norm": 1.533794614028789,
      "learning_rate": 1.374895395332037e-06,
      "loss": 0.7204,
      "step": 135
    },
    {
      "epoch": 0.7940156905674147,
      "grad_norm": 1.1485993201785478,
      "learning_rate": 1.3049554138967052e-06,
      "loss": 0.5837,
      "step": 136
    },
    {
      "epoch": 0.7998540412333516,
      "grad_norm": 1.9049649848237078,
      "learning_rate": 1.2365732669273778e-06,
      "loss": 0.7,
      "step": 137
    },
    {
      "epoch": 0.8056923918992884,
      "grad_norm": 1.5878122929966123,
      "learning_rate": 1.1697777844051105e-06,
      "loss": 0.7382,
      "step": 138
    },
    {
      "epoch": 0.8115307425652253,
      "grad_norm": 1.4709273722338752,
      "learning_rate": 1.1045971273716476e-06,
      "loss": 0.7135,
      "step": 139
    },
    {
      "epoch": 0.8173690932311622,
      "grad_norm": 1.433559790573233,
      "learning_rate": 1.0410587760567104e-06,
      "loss": 0.7693,
      "step": 140
    },
    {
      "epoch": 0.8232074438970991,
      "grad_norm": 1.52758209016128,
      "learning_rate": 9.791895182922911e-07,
      "loss": 0.7034,
      "step": 141
    },
    {
      "epoch": 0.829045794563036,
      "grad_norm": 1.9400739394519702,
      "learning_rate": 9.190154382188921e-07,
      "loss": 0.8852,
      "step": 142
    },
    {
      "epoch": 0.8348841452289728,
      "grad_norm": 1.958208722988338,
      "learning_rate": 8.605619052884106e-07,
      "loss": 0.9601,
      "step": 143
    },
    {
      "epoch": 0.8407224958949097,
      "grad_norm": 1.4807510002946493,
      "learning_rate": 8.03853563568367e-07,
      "loss": 0.7411,
      "step": 144
    },
    {
      "epoch": 0.8465608465608465,
      "grad_norm": 1.3793677468152326,
      "learning_rate": 7.489143213519301e-07,
      "loss": 0.7921,
      "step": 145
    },
    {
      "epoch": 0.8523991972267835,
      "grad_norm": 1.4895621691570022,
      "learning_rate": 6.957673410781617e-07,
      "loss": 0.7503,
      "step": 146
    },
    {
      "epoch": 0.8582375478927203,
      "grad_norm": 1.7116219153256789,
      "learning_rate": 6.444350295667112e-07,
      "loss": 0.7971,
      "step": 147
    },
    {
      "epoch": 0.8640758985586572,
      "grad_norm": 1.5797223164404173,
      "learning_rate": 5.949390285710777e-07,
      "loss": 0.8277,
      "step": 148
    },
    {
      "epoch": 0.869914249224594,
      "grad_norm": 1.3148568307618735,
      "learning_rate": 5.473002056544191e-07,
      "loss": 0.6685,
      "step": 149
    },
    {
      "epoch": 0.8757525998905309,
      "grad_norm": 1.4107080603490212,
      "learning_rate": 5.015386453917742e-07,
      "loss": 0.6838,
      "step": 150
    },
    {
      "epoch": 0.8815909505564677,
      "grad_norm": 1.5629187051898064,
      "learning_rate": 4.576736409023813e-07,
      "loss": 0.7233,
      "step": 151
    },
    {
      "epoch": 0.8874293012224047,
      "grad_norm": 1.4669164529598444,
      "learning_rate": 4.15723685715686e-07,
      "loss": 0.836,
      "step": 152
    },
    {
      "epoch": 0.8932676518883416,
      "grad_norm": 1.454872607521009,
      "learning_rate": 3.7570646597444196e-07,
      "loss": 0.7642,
      "step": 153
    },
    {
      "epoch": 0.8991060025542784,
      "grad_norm": 1.6759220531326142,
      "learning_rate": 3.3763885297822153e-07,
      "loss": 0.8544,
      "step": 154
    },
    {
      "epoch": 0.9049443532202153,
      "grad_norm": 1.7087176753461515,
      "learning_rate": 3.015368960704584e-07,
      "loss": 0.7322,
      "step": 155
    },
    {
      "epoch": 0.9107827038861521,
      "grad_norm": 1.6388339629990647,
      "learning_rate": 2.6741581587202747e-07,
      "loss": 0.8009,
      "step": 156
    },
    {
      "epoch": 0.9166210545520891,
      "grad_norm": 1.4635363517587032,
      "learning_rate": 2.3528999786421758e-07,
      "loss": 0.7706,
      "step": 157
    },
    {
      "epoch": 0.9224594052180259,
      "grad_norm": 1.6021455462502143,
      "learning_rate": 2.0517298632379445e-07,
      "loss": 0.7973,
      "step": 158
    },
    {
      "epoch": 0.9282977558839628,
      "grad_norm": 1.5367883855881703,
      "learning_rate": 1.770774786127244e-07,
      "loss": 0.7287,
      "step": 159
    },
    {
      "epoch": 0.9341361065498996,
      "grad_norm": 1.2425259572983562,
      "learning_rate": 1.510153198249531e-07,
      "loss": 0.6761,
      "step": 160
    },
    {
      "epoch": 0.9399744572158365,
      "grad_norm": 1.359745226364094,
      "learning_rate": 1.2699749779249926e-07,
      "loss": 0.6421,
      "step": 161
    },
    {
      "epoch": 0.9458128078817734,
      "grad_norm": 1.4246913880107532,
      "learning_rate": 1.0503413845297739e-07,
      "loss": 0.8001,
      "step": 162
    },
    {
      "epoch": 0.9516511585477103,
      "grad_norm": 1.2590352425153586,
      "learning_rate": 8.513450158049109e-08,
      "loss": 0.723,
      "step": 163
    },
    {
      "epoch": 0.9574895092136472,
      "grad_norm": 1.694689146111032,
      "learning_rate": 6.730697688170251e-08,
      "loss": 0.8005,
      "step": 164
    },
    {
      "epoch": 0.963327859879584,
      "grad_norm": 1.3191005186230464,
      "learning_rate": 5.155908045872349e-08,
      "loss": 0.667,
      "step": 165
    },
    {
      "epoch": 0.9691662105455209,
      "grad_norm": 1.4117265002070536,
      "learning_rate": 3.7897451640321326e-08,
      "loss": 0.7146,
      "step": 166
    },
    {
      "epoch": 0.9750045612114577,
      "grad_norm": 1.2794859494757669,
      "learning_rate": 2.6327850182769065e-08,
      "loss": 0.6562,
      "step": 167
    },
    {
      "epoch": 0.9808429118773946,
      "grad_norm": 1.4089880813633835,
      "learning_rate": 1.6855153841527915e-08,
      "loss": 0.6582,
      "step": 168
    },
    {
      "epoch": 0.9866812625433315,
      "grad_norm": 1.4165079720131926,
      "learning_rate": 9.48335631477948e-09,
      "loss": 0.7843,
      "step": 169
    },
    {
      "epoch": 0.9925196132092684,
      "grad_norm": 1.2128283609191615,
      "learning_rate": 4.2155655596809455e-09,
      "loss": 0.6164,
      "step": 170
    },
    {
      "epoch": 0.9983579638752053,
      "grad_norm": 1.2569650088824407,
      "learning_rate": 1.054002482043237e-09,
      "loss": 0.6706,
      "step": 171
    }
  ],
  "logging_steps": 1,
  "max_steps": 171,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 4050,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 14533949767680.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}