{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011363636363636364, "grad_norm": 0.04888428873089485, "learning_rate": 2.777777777777778e-06, "loss": 0.9533, "step": 1 }, { "epoch": 0.022727272727272728, "grad_norm": 0.04118674609403888, "learning_rate": 5.555555555555556e-06, "loss": 0.8187, "step": 2 }, { "epoch": 0.03409090909090909, "grad_norm": 0.036867673437168666, "learning_rate": 8.333333333333334e-06, "loss": 0.7078, "step": 3 }, { "epoch": 0.045454545454545456, "grad_norm": 0.04847692066919062, "learning_rate": 1.1111111111111112e-05, "loss": 0.987, "step": 4 }, { "epoch": 0.056818181818181816, "grad_norm": 0.05003862080856868, "learning_rate": 1.388888888888889e-05, "loss": 0.9577, "step": 5 }, { "epoch": 0.06818181818181818, "grad_norm": 0.04493399553279938, "learning_rate": 1.6666666666666667e-05, "loss": 0.9205, "step": 6 }, { "epoch": 0.07954545454545454, "grad_norm": 0.044174232851205655, "learning_rate": 1.9444444444444445e-05, "loss": 0.7971, "step": 7 }, { "epoch": 0.09090909090909091, "grad_norm": 0.0537270325545959, "learning_rate": 2.2222222222222223e-05, "loss": 0.875, "step": 8 }, { "epoch": 0.10227272727272728, "grad_norm": 0.047274651714941, "learning_rate": 2.5e-05, "loss": 0.7865, "step": 9 }, { "epoch": 0.11363636363636363, "grad_norm": 0.047438139846740904, "learning_rate": 2.777777777777778e-05, "loss": 0.7949, "step": 10 }, { "epoch": 0.125, "grad_norm": 0.0486064821581454, "learning_rate": 3.055555555555556e-05, "loss": 0.9187, "step": 11 }, { "epoch": 0.13636363636363635, "grad_norm": 0.049768338512046005, "learning_rate": 3.3333333333333335e-05, "loss": 0.927, "step": 12 }, { "epoch": 0.14772727272727273, "grad_norm": 0.050752786859426545, "learning_rate": 3.611111111111111e-05, "loss": 0.882, "step": 13 }, { "epoch": 0.1590909090909091, "grad_norm": 0.04238163918748248, "learning_rate": 3.888888888888889e-05, "loss": 0.8273, "step": 14 }, { "epoch": 0.17045454545454544, "grad_norm": 0.03389553366668896, "learning_rate": 4.166666666666667e-05, "loss": 0.801, "step": 15 }, { "epoch": 0.18181818181818182, "grad_norm": 0.03219935695711762, "learning_rate": 4.4444444444444447e-05, "loss": 0.8313, "step": 16 }, { "epoch": 0.19318181818181818, "grad_norm": 0.027821411081551388, "learning_rate": 4.722222222222222e-05, "loss": 0.6832, "step": 17 }, { "epoch": 0.20454545454545456, "grad_norm": 0.028125631584449494, "learning_rate": 5e-05, "loss": 0.724, "step": 18 }, { "epoch": 0.2159090909090909, "grad_norm": 0.03372930491801512, "learning_rate": 4.9995058244251644e-05, "loss": 0.765, "step": 19 }, { "epoch": 0.22727272727272727, "grad_norm": 0.03207593160425088, "learning_rate": 4.9980234930682546e-05, "loss": 0.8319, "step": 20 }, { "epoch": 0.23863636363636365, "grad_norm": 0.04392253595539216, "learning_rate": 4.995553591954832e-05, "loss": 0.8572, "step": 21 }, { "epoch": 0.25, "grad_norm": 0.043837015398686424, "learning_rate": 4.99209709753674e-05, "loss": 0.7994, "step": 22 }, { "epoch": 0.26136363636363635, "grad_norm": 0.038660838245880964, "learning_rate": 4.9876553763060684e-05, "loss": 0.7595, "step": 23 }, { "epoch": 0.2727272727272727, "grad_norm": 0.03958181928823714, "learning_rate": 4.982230184254933e-05, "loss": 0.7524, "step": 24 }, { "epoch": 0.2840909090909091, "grad_norm": 0.04692096023674341, "learning_rate": 4.975823666181255e-05, "loss": 0.7565, "step": 25 }, { "epoch": 0.29545454545454547, "grad_norm": 0.039034963022689224, "learning_rate": 4.968438354840834e-05, "loss": 0.8968, "step": 26 }, { "epoch": 0.3068181818181818, "grad_norm": 0.03472677387974209, "learning_rate": 4.960077169946052e-05, "loss": 0.7522, "step": 27 }, { "epoch": 0.3181818181818182, "grad_norm": 0.031244118470989204, "learning_rate": 4.950743417011591e-05, "loss": 0.7937, "step": 28 }, { "epoch": 0.32954545454545453, "grad_norm": 0.029092500515729473, "learning_rate": 4.940440786047628e-05, "loss": 0.667, "step": 29 }, { "epoch": 0.3409090909090909, "grad_norm": 0.028474671621308404, "learning_rate": 4.929173350101025e-05, "loss": 0.7255, "step": 30 }, { "epoch": 0.3522727272727273, "grad_norm": 0.021514453472108053, "learning_rate": 4.9169455636450935e-05, "loss": 0.7683, "step": 31 }, { "epoch": 0.36363636363636365, "grad_norm": 0.01979063211912252, "learning_rate": 4.903762260818551e-05, "loss": 0.7236, "step": 32 }, { "epoch": 0.375, "grad_norm": 0.019373274784309575, "learning_rate": 4.889628653514402e-05, "loss": 0.6894, "step": 33 }, { "epoch": 0.38636363636363635, "grad_norm": 0.01807357066294673, "learning_rate": 4.874550329319457e-05, "loss": 0.7558, "step": 34 }, { "epoch": 0.3977272727272727, "grad_norm": 0.01770418827474799, "learning_rate": 4.8585332493053364e-05, "loss": 0.6813, "step": 35 }, { "epoch": 0.4090909090909091, "grad_norm": 0.01675836399147525, "learning_rate": 4.84158374567182e-05, "loss": 0.7559, "step": 36 }, { "epoch": 0.42045454545454547, "grad_norm": 0.021175097986321438, "learning_rate": 4.8237085192434676e-05, "loss": 0.8176, "step": 37 }, { "epoch": 0.4318181818181818, "grad_norm": 0.017100608642967, "learning_rate": 4.804914636820517e-05, "loss": 0.7112, "step": 38 }, { "epoch": 0.4431818181818182, "grad_norm": 0.01683117973292665, "learning_rate": 4.7852095283850866e-05, "loss": 0.7345, "step": 39 }, { "epoch": 0.45454545454545453, "grad_norm": 0.019570606734413423, "learning_rate": 4.764600984163808e-05, "loss": 0.8125, "step": 40 }, { "epoch": 0.4659090909090909, "grad_norm": 0.015675238814520476, "learning_rate": 4.743097151548031e-05, "loss": 0.6994, "step": 41 }, { "epoch": 0.4772727272727273, "grad_norm": 0.019792988850059114, "learning_rate": 4.72070653187283e-05, "loss": 0.7837, "step": 42 }, { "epoch": 0.48863636363636365, "grad_norm": 0.015785852432543244, "learning_rate": 4.6974379770560846e-05, "loss": 0.6608, "step": 43 }, { "epoch": 0.5, "grad_norm": 0.015533678745760247, "learning_rate": 4.673300686098957e-05, "loss": 0.7031, "step": 44 }, { "epoch": 0.5113636363636364, "grad_norm": 0.01921291960352695, "learning_rate": 4.648304201449153e-05, "loss": 0.8267, "step": 45 }, { "epoch": 0.5227272727272727, "grad_norm": 0.01614254754588724, "learning_rate": 4.6224584052284106e-05, "loss": 0.7058, "step": 46 }, { "epoch": 0.5340909090909091, "grad_norm": 0.019677063872610357, "learning_rate": 4.5957735153256915e-05, "loss": 0.6621, "step": 47 }, { "epoch": 0.5454545454545454, "grad_norm": 0.013519303189431392, "learning_rate": 4.5682600813576435e-05, "loss": 0.7478, "step": 48 }, { "epoch": 0.5568181818181818, "grad_norm": 0.013215293773761478, "learning_rate": 4.539928980497903e-05, "loss": 0.7034, "step": 49 }, { "epoch": 0.5681818181818182, "grad_norm": 0.013694552146445563, "learning_rate": 4.510791413176912e-05, "loss": 0.7448, "step": 50 }, { "epoch": 0.5795454545454546, "grad_norm": 0.016162877179262552, "learning_rate": 4.480858898653936e-05, "loss": 0.6728, "step": 51 }, { "epoch": 0.5909090909090909, "grad_norm": 0.017056245721837348, "learning_rate": 4.4501432704630305e-05, "loss": 0.6714, "step": 52 }, { "epoch": 0.6022727272727273, "grad_norm": 0.01602874080265432, "learning_rate": 4.41865667173477e-05, "loss": 0.7667, "step": 53 }, { "epoch": 0.6136363636363636, "grad_norm": 0.01556595840862479, "learning_rate": 4.386411550395576e-05, "loss": 0.6934, "step": 54 }, { "epoch": 0.625, "grad_norm": 0.014885950964239066, "learning_rate": 4.353420654246546e-05, "loss": 0.6909, "step": 55 }, { "epoch": 0.6363636363636364, "grad_norm": 0.01399086114921709, "learning_rate": 4.319697025923736e-05, "loss": 0.6817, "step": 56 }, { "epoch": 0.6477272727272727, "grad_norm": 0.016980372482516305, "learning_rate": 4.285253997741875e-05, "loss": 0.7485, "step": 57 }, { "epoch": 0.6590909090909091, "grad_norm": 0.01370669813419946, "learning_rate": 4.2501051864235636e-05, "loss": 0.5524, "step": 58 }, { "epoch": 0.6704545454545454, "grad_norm": 0.0161962614922305, "learning_rate": 4.214264487716033e-05, "loss": 0.7145, "step": 59 }, { "epoch": 0.6818181818181818, "grad_norm": 0.01432616251653496, "learning_rate": 4.177746070897592e-05, "loss": 0.7263, "step": 60 }, { "epoch": 0.6931818181818182, "grad_norm": 0.01515485647301396, "learning_rate": 4.140564373175939e-05, "loss": 0.6531, "step": 61 }, { "epoch": 0.7045454545454546, "grad_norm": 0.014400704458474281, "learning_rate": 4.10273409398055e-05, "loss": 0.7696, "step": 62 }, { "epoch": 0.7159090909090909, "grad_norm": 0.015458672360864132, "learning_rate": 4.0642701891514e-05, "loss": 0.7081, "step": 63 }, { "epoch": 0.7272727272727273, "grad_norm": 0.015263330526513203, "learning_rate": 4.025187865026311e-05, "loss": 0.725, "step": 64 }, { "epoch": 0.7386363636363636, "grad_norm": 0.015125337784791405, "learning_rate": 3.985502572429276e-05, "loss": 0.7111, "step": 65 }, { "epoch": 0.75, "grad_norm": 0.013985325014602094, "learning_rate": 3.945230000562121e-05, "loss": 0.7249, "step": 66 }, { "epoch": 0.7613636363636364, "grad_norm": 0.0155589237978013, "learning_rate": 3.9043860708019273e-05, "loss": 0.6881, "step": 67 }, { "epoch": 0.7727272727272727, "grad_norm": 0.01882918883166355, "learning_rate": 3.862986930406669e-05, "loss": 0.8102, "step": 68 }, { "epoch": 0.7840909090909091, "grad_norm": 0.015308078133970605, "learning_rate": 3.821048946131549e-05, "loss": 0.6919, "step": 69 }, { "epoch": 0.7954545454545454, "grad_norm": 0.01630125314346408, "learning_rate": 3.778588697758556e-05, "loss": 0.6464, "step": 70 }, { "epoch": 0.8068181818181818, "grad_norm": 0.015050157672318546, "learning_rate": 3.7356229715418074e-05, "loss": 0.7114, "step": 71 }, { "epoch": 0.8181818181818182, "grad_norm": 0.015186905856860966, "learning_rate": 3.6921687535712656e-05, "loss": 0.719, "step": 72 }, { "epoch": 0.8295454545454546, "grad_norm": 0.01935797435359767, "learning_rate": 3.6482432230574446e-05, "loss": 0.7255, "step": 73 }, { "epoch": 0.8409090909090909, "grad_norm": 0.016661093988049422, "learning_rate": 3.60386374553978e-05, "loss": 0.6642, "step": 74 }, { "epoch": 0.8522727272727273, "grad_norm": 0.017167771916012636, "learning_rate": 3.5590478660213214e-05, "loss": 0.677, "step": 75 }, { "epoch": 0.8636363636363636, "grad_norm": 0.015022481447368145, "learning_rate": 3.5138133020324845e-05, "loss": 0.7307, "step": 76 }, { "epoch": 0.875, "grad_norm": 0.01785502395194187, "learning_rate": 3.468177936626603e-05, "loss": 0.789, "step": 77 }, { "epoch": 0.8863636363636364, "grad_norm": 0.01864810858161923, "learning_rate": 3.4221598113100195e-05, "loss": 0.7059, "step": 78 }, { "epoch": 0.8977272727272727, "grad_norm": 0.016575657529251383, "learning_rate": 3.375777118909561e-05, "loss": 0.7456, "step": 79 }, { "epoch": 0.9090909090909091, "grad_norm": 0.016188984904551812, "learning_rate": 3.32904819638017e-05, "loss": 0.683, "step": 80 }, { "epoch": 0.9204545454545454, "grad_norm": 0.016436588560373562, "learning_rate": 3.2819915175555684e-05, "loss": 0.6475, "step": 81 }, { "epoch": 0.9318181818181818, "grad_norm": 0.017564054129248746, "learning_rate": 3.234625685844803e-05, "loss": 0.622, "step": 82 }, { "epoch": 0.9431818181818182, "grad_norm": 0.019239130214828547, "learning_rate": 3.186969426877563e-05, "loss": 0.638, "step": 83 }, { "epoch": 0.9545454545454546, "grad_norm": 0.01778558528784305, "learning_rate": 3.139041581101187e-05, "loss": 0.5941, "step": 84 }, { "epoch": 0.9659090909090909, "grad_norm": 0.03326719961885704, "learning_rate": 3.090861096332263e-05, "loss": 0.6721, "step": 85 }, { "epoch": 0.9772727272727273, "grad_norm": 0.017697778364759945, "learning_rate": 3.042447020265795e-05, "loss": 0.663, "step": 86 }, { "epoch": 0.9886363636363636, "grad_norm": 0.015032572277755535, "learning_rate": 2.993818492944882e-05, "loss": 0.6462, "step": 87 }, { "epoch": 1.0, "grad_norm": 0.01697573905444168, "learning_rate": 2.9449947391938766e-05, "loss": 0.6957, "step": 88 }, { "epoch": 1.0113636363636365, "grad_norm": 0.01629821245832795, "learning_rate": 2.8959950610180374e-05, "loss": 0.6629, "step": 89 }, { "epoch": 1.0227272727272727, "grad_norm": 0.015172985524910379, "learning_rate": 2.8468388299726712e-05, "loss": 0.6602, "step": 90 }, { "epoch": 1.0340909090909092, "grad_norm": 0.015577909780648352, "learning_rate": 2.7975454795047622e-05, "loss": 0.796, "step": 91 }, { "epoch": 1.0454545454545454, "grad_norm": 0.017309986409190056, "learning_rate": 2.7481344972701545e-05, "loss": 0.6618, "step": 92 }, { "epoch": 1.0568181818181819, "grad_norm": 0.01661621611638882, "learning_rate": 2.6986254174292862e-05, "loss": 0.7065, "step": 93 }, { "epoch": 1.0681818181818181, "grad_norm": 0.016737287474769654, "learning_rate": 2.6490378129245498e-05, "loss": 0.6457, "step": 94 }, { "epoch": 1.0795454545454546, "grad_norm": 0.016921442525318752, "learning_rate": 2.599391287742315e-05, "loss": 0.7422, "step": 95 }, { "epoch": 1.0909090909090908, "grad_norm": 0.01644141496345415, "learning_rate": 2.5497054691626753e-05, "loss": 0.7125, "step": 96 }, { "epoch": 1.1022727272727273, "grad_norm": 0.01566715623525306, "learning_rate": 2.5e-05, "loss": 0.7015, "step": 97 }, { "epoch": 1.1136363636363635, "grad_norm": 0.016042962886209804, "learning_rate": 2.4502945308373246e-05, "loss": 0.6911, "step": 98 }, { "epoch": 1.125, "grad_norm": 0.014451142623462782, "learning_rate": 2.4006087122576863e-05, "loss": 0.6499, "step": 99 }, { "epoch": 1.1363636363636362, "grad_norm": 0.016365537745248963, "learning_rate": 2.3509621870754505e-05, "loss": 0.6564, "step": 100 }, { "epoch": 1.1477272727272727, "grad_norm": 0.015223080653847816, "learning_rate": 2.301374582570714e-05, "loss": 0.6767, "step": 101 }, { "epoch": 1.1590909090909092, "grad_norm": 0.017013122218294784, "learning_rate": 2.2518655027298464e-05, "loss": 0.6712, "step": 102 }, { "epoch": 1.1704545454545454, "grad_norm": 0.017431829042961085, "learning_rate": 2.2024545204952383e-05, "loss": 0.6363, "step": 103 }, { "epoch": 1.1818181818181819, "grad_norm": 0.01638659813387577, "learning_rate": 2.1531611700273297e-05, "loss": 0.7341, "step": 104 }, { "epoch": 1.1931818181818181, "grad_norm": 0.015883920910595706, "learning_rate": 2.104004938981963e-05, "loss": 0.7503, "step": 105 }, { "epoch": 1.2045454545454546, "grad_norm": 0.016308759795444855, "learning_rate": 2.055005260806125e-05, "loss": 0.7494, "step": 106 }, { "epoch": 1.2159090909090908, "grad_norm": 0.01653825796554113, "learning_rate": 2.0061815070551186e-05, "loss": 0.584, "step": 107 }, { "epoch": 1.2272727272727273, "grad_norm": 0.015995947052341933, "learning_rate": 1.957552979734205e-05, "loss": 0.7943, "step": 108 }, { "epoch": 1.2386363636363638, "grad_norm": 0.018158719978620477, "learning_rate": 1.9091389036677382e-05, "loss": 0.7001, "step": 109 }, { "epoch": 1.25, "grad_norm": 0.015286756025554763, "learning_rate": 1.8609584188988136e-05, "loss": 0.6132, "step": 110 }, { "epoch": 1.2613636363636362, "grad_norm": 0.014766283231039113, "learning_rate": 1.813030573122437e-05, "loss": 0.6789, "step": 111 }, { "epoch": 1.2727272727272727, "grad_norm": 0.018163850190524557, "learning_rate": 1.7653743141551983e-05, "loss": 0.6641, "step": 112 }, { "epoch": 1.2840909090909092, "grad_norm": 0.015499094565043982, "learning_rate": 1.7180084824444325e-05, "loss": 0.7034, "step": 113 }, { "epoch": 1.2954545454545454, "grad_norm": 0.01666315009242263, "learning_rate": 1.6709518036198308e-05, "loss": 0.6644, "step": 114 }, { "epoch": 1.3068181818181819, "grad_norm": 0.018550049211631166, "learning_rate": 1.6242228810904392e-05, "loss": 0.6314, "step": 115 }, { "epoch": 1.3181818181818181, "grad_norm": 0.016414228273475284, "learning_rate": 1.5778401886899807e-05, "loss": 0.7085, "step": 116 }, { "epoch": 1.3295454545454546, "grad_norm": 0.016629484072509887, "learning_rate": 1.5318220633733978e-05, "loss": 0.6328, "step": 117 }, { "epoch": 1.3409090909090908, "grad_norm": 0.01735441033712776, "learning_rate": 1.4861866979675154e-05, "loss": 0.6987, "step": 118 }, { "epoch": 1.3522727272727273, "grad_norm": 0.01653379311481811, "learning_rate": 1.4409521339786808e-05, "loss": 0.7689, "step": 119 }, { "epoch": 1.3636363636363638, "grad_norm": 0.017109513908993915, "learning_rate": 1.3961362544602213e-05, "loss": 0.852, "step": 120 }, { "epoch": 1.375, "grad_norm": 0.015606168185551323, "learning_rate": 1.3517567769425548e-05, "loss": 0.6131, "step": 121 }, { "epoch": 1.3863636363636362, "grad_norm": 0.016954213866876906, "learning_rate": 1.3078312464287353e-05, "loss": 0.7262, "step": 122 }, { "epoch": 1.3977272727272727, "grad_norm": 0.019722191608232728, "learning_rate": 1.2643770284581929e-05, "loss": 0.6411, "step": 123 }, { "epoch": 1.4090909090909092, "grad_norm": 0.016754592315893345, "learning_rate": 1.2214113022414448e-05, "loss": 0.7406, "step": 124 }, { "epoch": 1.4204545454545454, "grad_norm": 0.01811366422363491, "learning_rate": 1.1789510538684523e-05, "loss": 0.6915, "step": 125 }, { "epoch": 1.4318181818181819, "grad_norm": 0.015117191146405742, "learning_rate": 1.1370130695933318e-05, "loss": 0.7914, "step": 126 }, { "epoch": 1.4431818181818181, "grad_norm": 0.019162871723577116, "learning_rate": 1.0956139291980727e-05, "loss": 0.7924, "step": 127 }, { "epoch": 1.4545454545454546, "grad_norm": 0.017257641671438298, "learning_rate": 1.0547699994378787e-05, "loss": 0.6671, "step": 128 }, { "epoch": 1.4659090909090908, "grad_norm": 0.018952048140386035, "learning_rate": 1.0144974275707241e-05, "loss": 0.7046, "step": 129 }, { "epoch": 1.4772727272727273, "grad_norm": 0.02016982804986924, "learning_rate": 9.748121349736892e-06, "loss": 0.7542, "step": 130 }, { "epoch": 1.4886363636363638, "grad_norm": 0.015861465051683968, "learning_rate": 9.357298108486003e-06, "loss": 0.6563, "step": 131 }, { "epoch": 1.5, "grad_norm": 0.016677424892624696, "learning_rate": 8.972659060194506e-06, "loss": 0.6743, "step": 132 }, { "epoch": 1.5113636363636362, "grad_norm": 0.017220590202846452, "learning_rate": 8.594356268240616e-06, "loss": 0.5913, "step": 133 }, { "epoch": 1.5227272727272727, "grad_norm": 0.017476835977041916, "learning_rate": 8.222539291024078e-06, "loss": 0.6184, "step": 134 }, { "epoch": 1.5340909090909092, "grad_norm": 0.01852744989540818, "learning_rate": 7.857355122839675e-06, "loss": 0.8003, "step": 135 }, { "epoch": 1.5454545454545454, "grad_norm": 0.017724119693348375, "learning_rate": 7.4989481357643694e-06, "loss": 0.7319, "step": 136 }, { "epoch": 1.5568181818181817, "grad_norm": 0.01827897532047109, "learning_rate": 7.147460022581257e-06, "loss": 0.7496, "step": 137 }, { "epoch": 1.5681818181818183, "grad_norm": 0.017356487062778, "learning_rate": 6.803029740762648e-06, "loss": 0.6998, "step": 138 }, { "epoch": 1.5795454545454546, "grad_norm": 0.01850857557468877, "learning_rate": 6.465793457534553e-06, "loss": 0.6103, "step": 139 }, { "epoch": 1.5909090909090908, "grad_norm": 0.018299658199403747, "learning_rate": 6.135884496044244e-06, "loss": 0.6448, "step": 140 }, { "epoch": 1.6022727272727273, "grad_norm": 0.018388053763702464, "learning_rate": 5.813433282652298e-06, "loss": 0.6399, "step": 141 }, { "epoch": 1.6136363636363638, "grad_norm": 0.02080237717937431, "learning_rate": 5.4985672953697e-06, "loss": 0.6313, "step": 142 }, { "epoch": 1.625, "grad_norm": 0.01796260664782129, "learning_rate": 5.191411013460645e-06, "loss": 0.6843, "step": 143 }, { "epoch": 1.6363636363636362, "grad_norm": 0.02056859413635753, "learning_rate": 4.892085868230881e-06, "loss": 0.7157, "step": 144 }, { "epoch": 1.6477272727272727, "grad_norm": 0.016274442923538548, "learning_rate": 4.600710195020982e-06, "loss": 0.7276, "step": 145 }, { "epoch": 1.6590909090909092, "grad_norm": 0.01815637927898507, "learning_rate": 4.317399186423574e-06, "loss": 0.6305, "step": 146 }, { "epoch": 1.6704545454545454, "grad_norm": 0.01897651128017473, "learning_rate": 4.042264846743085e-06, "loss": 0.675, "step": 147 }, { "epoch": 1.6818181818181817, "grad_norm": 0.018136754953765325, "learning_rate": 3.775415947715899e-06, "loss": 0.7152, "step": 148 }, { "epoch": 1.6931818181818183, "grad_norm": 0.018312167884684892, "learning_rate": 3.516957985508476e-06, "loss": 0.6128, "step": 149 }, { "epoch": 1.7045454545454546, "grad_norm": 0.01872198284953679, "learning_rate": 3.266993139010438e-06, "loss": 0.7636, "step": 150 }, { "epoch": 1.7159090909090908, "grad_norm": 0.019247219050936033, "learning_rate": 3.0256202294391577e-06, "loss": 0.7763, "step": 151 }, { "epoch": 1.7272727272727273, "grad_norm": 0.01747541840805843, "learning_rate": 2.792934681271708e-06, "loss": 0.718, "step": 152 }, { "epoch": 1.7386363636363638, "grad_norm": 0.016543357188300384, "learning_rate": 2.5690284845196923e-06, "loss": 0.7503, "step": 153 }, { "epoch": 1.75, "grad_norm": 0.017777133602865168, "learning_rate": 2.3539901583619185e-06, "loss": 0.6796, "step": 154 }, { "epoch": 1.7613636363636362, "grad_norm": 0.017730228650558268, "learning_rate": 2.147904716149135e-06, "loss": 0.7887, "step": 155 }, { "epoch": 1.7727272727272727, "grad_norm": 0.016904387380041325, "learning_rate": 1.9508536317948357e-06, "loss": 0.7028, "step": 156 }, { "epoch": 1.7840909090909092, "grad_norm": 0.017048389518616498, "learning_rate": 1.7629148075653245e-06, "loss": 0.6987, "step": 157 }, { "epoch": 1.7954545454545454, "grad_norm": 0.019231086359691812, "learning_rate": 1.5841625432818057e-06, "loss": 0.7262, "step": 158 }, { "epoch": 1.8068181818181817, "grad_norm": 0.018406406662889105, "learning_rate": 1.4146675069466403e-06, "loss": 0.6232, "step": 159 }, { "epoch": 1.8181818181818183, "grad_norm": 0.01859904836356649, "learning_rate": 1.2544967068054332e-06, "loss": 0.7216, "step": 160 }, { "epoch": 1.8295454545454546, "grad_norm": 0.017748264732114775, "learning_rate": 1.1037134648559794e-06, "loss": 0.66, "step": 161 }, { "epoch": 1.8409090909090908, "grad_norm": 0.019172670841775385, "learning_rate": 9.623773918144897e-07, "loss": 0.6719, "step": 162 }, { "epoch": 1.8522727272727273, "grad_norm": 0.017201032284762996, "learning_rate": 8.305443635490711e-07, "loss": 0.6841, "step": 163 }, { "epoch": 1.8636363636363638, "grad_norm": 0.017602592399586683, "learning_rate": 7.082664989897487e-07, "loss": 0.759, "step": 164 }, { "epoch": 1.875, "grad_norm": 0.01737274402951971, "learning_rate": 5.955921395237318e-07, "loss": 0.6251, "step": 165 }, { "epoch": 1.8863636363636362, "grad_norm": 0.018381873767541995, "learning_rate": 4.925658298840979e-07, "loss": 0.7006, "step": 166 }, { "epoch": 1.8977272727272727, "grad_norm": 0.017002639935913035, "learning_rate": 3.992283005394837e-07, "loss": 0.5917, "step": 167 }, { "epoch": 1.9090909090909092, "grad_norm": 0.017463979654969486, "learning_rate": 3.1561645159166597e-07, "loss": 0.6757, "step": 168 }, { "epoch": 1.9204545454545454, "grad_norm": 0.018406237369712176, "learning_rate": 2.417633381874534e-07, "loss": 0.7824, "step": 169 }, { "epoch": 1.9318181818181817, "grad_norm": 0.017967868873337563, "learning_rate": 1.7769815745066475e-07, "loss": 0.7394, "step": 170 }, { "epoch": 1.9431818181818183, "grad_norm": 0.01623825702144127, "learning_rate": 1.234462369393147e-07, "loss": 0.7684, "step": 171 }, { "epoch": 1.9545454545454546, "grad_norm": 0.017608033119901965, "learning_rate": 7.90290246326042e-08, "loss": 0.7831, "step": 172 }, { "epoch": 1.9659090909090908, "grad_norm": 0.017319359934865306, "learning_rate": 4.4464080451675494e-08, "loss": 0.6974, "step": 173 }, { "epoch": 1.9772727272727273, "grad_norm": 0.017475580158119938, "learning_rate": 1.976506931745392e-08, "loss": 0.7574, "step": 174 }, { "epoch": 1.9886363636363638, "grad_norm": 0.017048411256850495, "learning_rate": 4.941755748361088e-09, "loss": 0.6594, "step": 175 }, { "epoch": 2.0, "grad_norm": 0.018638627651477082, "learning_rate": 0.0, "loss": 0.7388, "step": 176 }, { "epoch": 2.0, "step": 176, "total_flos": 3054747482324992.0, "train_loss": 0.7215218588032506, "train_runtime": 3666.1508, "train_samples_per_second": 1.53, "train_steps_per_second": 0.048 } ], "logging_steps": 1, "max_steps": 176, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3054747482324992.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }