{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9724310776942353, "eval_steps": 500, "global_step": 147, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020050125313283207, "grad_norm": 6.100090649437154, "learning_rate": 5.333333333333334e-06, "loss": 1.0088, "step": 1 }, { "epoch": 0.040100250626566414, "grad_norm": 6.165069480911681, "learning_rate": 1.0666666666666667e-05, "loss": 1.0202, "step": 2 }, { "epoch": 0.06015037593984962, "grad_norm": 4.472412116195948, "learning_rate": 1.6000000000000003e-05, "loss": 0.9496, "step": 3 }, { "epoch": 0.08020050125313283, "grad_norm": 4.963164130948997, "learning_rate": 2.1333333333333335e-05, "loss": 0.9406, "step": 4 }, { "epoch": 0.10025062656641603, "grad_norm": 4.041819143127636, "learning_rate": 2.6666666666666667e-05, "loss": 0.8794, "step": 5 }, { "epoch": 0.12030075187969924, "grad_norm": 3.719380540370943, "learning_rate": 3.2000000000000005e-05, "loss": 0.9034, "step": 6 }, { "epoch": 0.14035087719298245, "grad_norm": 2.3309985890149765, "learning_rate": 3.733333333333334e-05, "loss": 0.8102, "step": 7 }, { "epoch": 0.16040100250626566, "grad_norm": 2.2467101719324454, "learning_rate": 4.266666666666667e-05, "loss": 0.786, "step": 8 }, { "epoch": 0.18045112781954886, "grad_norm": 2.8539093369835227, "learning_rate": 4.8e-05, "loss": 0.7531, "step": 9 }, { "epoch": 0.20050125313283207, "grad_norm": 2.0083048527276404, "learning_rate": 5.333333333333333e-05, "loss": 0.7468, "step": 10 }, { "epoch": 0.22055137844611528, "grad_norm": 2.165125600420533, "learning_rate": 5.8666666666666665e-05, "loss": 0.7391, "step": 11 }, { "epoch": 0.24060150375939848, "grad_norm": 2.122858303676939, "learning_rate": 6.400000000000001e-05, "loss": 0.7233, "step": 12 }, { "epoch": 0.2606516290726817, "grad_norm": 2.2789276233273843, "learning_rate": 6.933333333333334e-05, "loss": 0.7286, "step": 13 }, { "epoch": 0.2807017543859649, "grad_norm": 2.1616002933600393, "learning_rate": 7.466666666666667e-05, "loss": 0.7199, "step": 14 }, { "epoch": 0.3007518796992481, "grad_norm": 3.2259560404881986, "learning_rate": 8e-05, "loss": 0.7223, "step": 15 }, { "epoch": 0.3208020050125313, "grad_norm": 1.5369895434247212, "learning_rate": 7.998867178772517e-05, "loss": 0.7063, "step": 16 }, { "epoch": 0.3408521303258145, "grad_norm": 3.28272311228189, "learning_rate": 7.995469356732033e-05, "loss": 0.7174, "step": 17 }, { "epoch": 0.3609022556390977, "grad_norm": 2.1611599200246974, "learning_rate": 7.989808458441014e-05, "loss": 0.6981, "step": 18 }, { "epoch": 0.38095238095238093, "grad_norm": 1.9324451852274291, "learning_rate": 7.981887690292339e-05, "loss": 0.6943, "step": 19 }, { "epoch": 0.40100250626566414, "grad_norm": 18.248869277300834, "learning_rate": 7.971711538693153e-05, "loss": 0.6998, "step": 20 }, { "epoch": 0.42105263157894735, "grad_norm": 2.181882821404686, "learning_rate": 7.959285767523732e-05, "loss": 0.7193, "step": 21 }, { "epoch": 0.44110275689223055, "grad_norm": 1.2480300021039636, "learning_rate": 7.944617414872747e-05, "loss": 0.6843, "step": 22 }, { "epoch": 0.46115288220551376, "grad_norm": 2.3412336103636693, "learning_rate": 7.927714789050826e-05, "loss": 0.7089, "step": 23 }, { "epoch": 0.48120300751879697, "grad_norm": 1.9642689883848843, "learning_rate": 7.908587463884638e-05, "loss": 0.6787, "step": 24 }, { "epoch": 0.5012531328320802, "grad_norm": 1.3434579333848315, "learning_rate": 7.887246273294167e-05, "loss": 0.6773, "step": 25 }, { "epoch": 0.5213032581453634, "grad_norm": 1.440602627293297, "learning_rate": 7.863703305156273e-05, "loss": 0.6792, "step": 26 }, { "epoch": 0.5413533834586466, "grad_norm": 1.3175913095028617, "learning_rate": 7.837971894457991e-05, "loss": 0.6654, "step": 27 }, { "epoch": 0.5614035087719298, "grad_norm": 1.3198703652450279, "learning_rate": 7.810066615743443e-05, "loss": 0.6524, "step": 28 }, { "epoch": 0.581453634085213, "grad_norm": 0.781557000061513, "learning_rate": 7.780003274858674e-05, "loss": 0.6573, "step": 29 }, { "epoch": 0.6015037593984962, "grad_norm": 1.1674794127969026, "learning_rate": 7.747798899999048e-05, "loss": 0.6664, "step": 30 }, { "epoch": 0.6215538847117794, "grad_norm": 1.623871556077205, "learning_rate": 7.71347173206429e-05, "loss": 0.6722, "step": 31 }, { "epoch": 0.6416040100250626, "grad_norm": 1.1713900589893373, "learning_rate": 7.677041214326663e-05, "loss": 0.6403, "step": 32 }, { "epoch": 0.6616541353383458, "grad_norm": 1.1207108511375592, "learning_rate": 7.638527981418075e-05, "loss": 0.6427, "step": 33 }, { "epoch": 0.681704260651629, "grad_norm": 1.6390101502728163, "learning_rate": 7.597953847642413e-05, "loss": 0.6451, "step": 34 }, { "epoch": 0.7017543859649122, "grad_norm": 0.9703477640239104, "learning_rate": 7.555341794619695e-05, "loss": 0.6371, "step": 35 }, { "epoch": 0.7218045112781954, "grad_norm": 1.888557130236824, "learning_rate": 7.510715958269023e-05, "loss": 0.6385, "step": 36 }, { "epoch": 0.7418546365914787, "grad_norm": 1.4709265925596289, "learning_rate": 7.464101615137756e-05, "loss": 0.6468, "step": 37 }, { "epoch": 0.7619047619047619, "grad_norm": 1.3340842858015245, "learning_rate": 7.415525168084593e-05, "loss": 0.636, "step": 38 }, { "epoch": 0.7819548872180451, "grad_norm": 1.2331491459930695, "learning_rate": 7.365014131324725e-05, "loss": 0.6423, "step": 39 }, { "epoch": 0.8020050125313283, "grad_norm": 1.0459942770103978, "learning_rate": 7.312597114845483e-05, "loss": 0.6405, "step": 40 }, { "epoch": 0.8220551378446115, "grad_norm": 0.9742677496779505, "learning_rate": 7.258303808201343e-05, "loss": 0.619, "step": 41 }, { "epoch": 0.8421052631578947, "grad_norm": 0.9126761857775636, "learning_rate": 7.202164963697442e-05, "loss": 0.6237, "step": 42 }, { "epoch": 0.8621553884711779, "grad_norm": 1.1034224988906396, "learning_rate": 7.144212378971151e-05, "loss": 0.6126, "step": 43 }, { "epoch": 0.8822055137844611, "grad_norm": 0.9050299139188852, "learning_rate": 7.084478878981552e-05, "loss": 0.6199, "step": 44 }, { "epoch": 0.9022556390977443, "grad_norm": 0.9484346415934313, "learning_rate": 7.022998297417034e-05, "loss": 0.6242, "step": 45 }, { "epoch": 0.9223057644110275, "grad_norm": 0.6761984153684619, "learning_rate": 6.959805457531536e-05, "loss": 0.6271, "step": 46 }, { "epoch": 0.9423558897243107, "grad_norm": 0.7734163089859195, "learning_rate": 6.89493615242028e-05, "loss": 0.6057, "step": 47 }, { "epoch": 0.9624060150375939, "grad_norm": 0.7126127374462361, "learning_rate": 6.828427124746191e-05, "loss": 0.6143, "step": 48 }, { "epoch": 0.9824561403508771, "grad_norm": 0.7040138292374798, "learning_rate": 6.760316045928449e-05, "loss": 0.5971, "step": 49 }, { "epoch": 1.0150375939849625, "grad_norm": 1.0358340840041869, "learning_rate": 6.690641494805011e-05, "loss": 1.0623, "step": 50 }, { "epoch": 1.0350877192982457, "grad_norm": 0.975942175036343, "learning_rate": 6.619442935781141e-05, "loss": 0.5949, "step": 51 }, { "epoch": 1.055137844611529, "grad_norm": 1.3817333278949258, "learning_rate": 6.546760696476354e-05, "loss": 0.5965, "step": 52 }, { "epoch": 1.0751879699248121, "grad_norm": 0.5594198623207998, "learning_rate": 6.472635944882421e-05, "loss": 0.5817, "step": 53 }, { "epoch": 1.0952380952380953, "grad_norm": 1.3502604906380835, "learning_rate": 6.397110666045388e-05, "loss": 0.5936, "step": 54 }, { "epoch": 1.1152882205513786, "grad_norm": 0.662382371248644, "learning_rate": 6.320227638284793e-05, "loss": 0.597, "step": 55 }, { "epoch": 1.1353383458646618, "grad_norm": 1.0092828128239282, "learning_rate": 6.242030408963576e-05, "loss": 0.5895, "step": 56 }, { "epoch": 1.155388471177945, "grad_norm": 0.8148251028605993, "learning_rate": 6.162563269822391e-05, "loss": 0.5796, "step": 57 }, { "epoch": 1.1754385964912282, "grad_norm": 0.7504641683568708, "learning_rate": 6.0818712318922894e-05, "loss": 0.5756, "step": 58 }, { "epoch": 1.1954887218045114, "grad_norm": 0.7075215261142646, "learning_rate": 6.000000000000001e-05, "loss": 0.5899, "step": 59 }, { "epoch": 1.2155388471177946, "grad_norm": 0.616627011160271, "learning_rate": 5.916995946880228e-05, "loss": 0.5756, "step": 60 }, { "epoch": 1.2355889724310778, "grad_norm": 0.6224665456566449, "learning_rate": 5.832906086909642e-05, "loss": 0.5717, "step": 61 }, { "epoch": 1.255639097744361, "grad_norm": 0.5410026994188678, "learning_rate": 5.747778049477438e-05, "loss": 0.5719, "step": 62 }, { "epoch": 1.2756892230576442, "grad_norm": 0.5138816389771951, "learning_rate": 5.661660052007547e-05, "loss": 0.5767, "step": 63 }, { "epoch": 1.2957393483709274, "grad_norm": 0.538651466379072, "learning_rate": 5.574600872647766e-05, "loss": 0.5754, "step": 64 }, { "epoch": 1.3157894736842106, "grad_norm": 0.39832101327162095, "learning_rate": 5.48664982264131e-05, "loss": 0.5806, "step": 65 }, { "epoch": 1.3358395989974938, "grad_norm": 0.46665463498566045, "learning_rate": 5.397856718396394e-05, "loss": 0.5622, "step": 66 }, { "epoch": 1.355889724310777, "grad_norm": 0.33596753910393834, "learning_rate": 5.3082718532696874e-05, "loss": 0.5635, "step": 67 }, { "epoch": 1.3759398496240602, "grad_norm": 0.36330721064185245, "learning_rate": 5.217945969079629e-05, "loss": 0.5728, "step": 68 }, { "epoch": 1.3959899749373434, "grad_norm": 0.31668248946183414, "learning_rate": 5.1269302273657195e-05, "loss": 0.5829, "step": 69 }, { "epoch": 1.4160401002506267, "grad_norm": 0.3108858835369522, "learning_rate": 5.0352761804100835e-05, "loss": 0.5797, "step": 70 }, { "epoch": 1.4360902255639099, "grad_norm": 0.37076038809913947, "learning_rate": 4.94303574203771e-05, "loss": 0.5678, "step": 71 }, { "epoch": 1.456140350877193, "grad_norm": 0.27669921803024705, "learning_rate": 4.8502611582119065e-05, "loss": 0.5644, "step": 72 }, { "epoch": 1.4761904761904763, "grad_norm": 0.3206960394723747, "learning_rate": 4.7570049774416414e-05, "loss": 0.5696, "step": 73 }, { "epoch": 1.4962406015037595, "grad_norm": 0.35836009606291436, "learning_rate": 4.663320021017497e-05, "loss": 0.5574, "step": 74 }, { "epoch": 1.5162907268170427, "grad_norm": 0.26025491223433994, "learning_rate": 4.5692593530931416e-05, "loss": 0.5683, "step": 75 }, { "epoch": 1.536340852130326, "grad_norm": 0.25457026883587025, "learning_rate": 4.474876250629221e-05, "loss": 0.565, "step": 76 }, { "epoch": 1.556390977443609, "grad_norm": 0.32769122481932667, "learning_rate": 4.38022417321673e-05, "loss": 0.5641, "step": 77 }, { "epoch": 1.5764411027568923, "grad_norm": 0.22460593199796938, "learning_rate": 4.2853567327969296e-05, "loss": 0.557, "step": 78 }, { "epoch": 1.5964912280701755, "grad_norm": 0.21753199007833202, "learning_rate": 4.19032766329497e-05, "loss": 0.5578, "step": 79 }, { "epoch": 1.6165413533834587, "grad_norm": 0.23083331300054297, "learning_rate": 4.0951907901844296e-05, "loss": 0.5622, "step": 80 }, { "epoch": 1.636591478696742, "grad_norm": 0.2564157643733938, "learning_rate": 4e-05, "loss": 0.5657, "step": 81 }, { "epoch": 1.6566416040100251, "grad_norm": 0.27813133557097336, "learning_rate": 3.904809209815571e-05, "loss": 0.5603, "step": 82 }, { "epoch": 1.6766917293233083, "grad_norm": 0.2038258932695245, "learning_rate": 3.809672336705031e-05, "loss": 0.5572, "step": 83 }, { "epoch": 1.6967418546365916, "grad_norm": 0.29155489774909854, "learning_rate": 3.714643267203071e-05, "loss": 0.5544, "step": 84 }, { "epoch": 1.7167919799498748, "grad_norm": 0.16516627488221486, "learning_rate": 3.6197758267832705e-05, "loss": 0.5584, "step": 85 }, { "epoch": 1.736842105263158, "grad_norm": 0.29249607068570543, "learning_rate": 3.5251237493707804e-05, "loss": 0.5677, "step": 86 }, { "epoch": 1.7568922305764412, "grad_norm": 0.19265408478071472, "learning_rate": 3.4307406469068604e-05, "loss": 0.5632, "step": 87 }, { "epoch": 1.7769423558897244, "grad_norm": 0.21254433897953356, "learning_rate": 3.3366799789825044e-05, "loss": 0.5512, "step": 88 }, { "epoch": 1.7969924812030076, "grad_norm": 0.2170294921675785, "learning_rate": 3.2429950225583606e-05, "loss": 0.5493, "step": 89 }, { "epoch": 1.8170426065162908, "grad_norm": 0.17517457106296594, "learning_rate": 3.1497388417880935e-05, "loss": 0.5522, "step": 90 }, { "epoch": 1.837092731829574, "grad_norm": 0.21184436677480925, "learning_rate": 3.0569642579622905e-05, "loss": 0.5533, "step": 91 }, { "epoch": 1.8571428571428572, "grad_norm": 0.1566054507779003, "learning_rate": 2.9647238195899168e-05, "loss": 0.538, "step": 92 }, { "epoch": 1.8771929824561404, "grad_norm": 0.19291518917014827, "learning_rate": 2.873069772634281e-05, "loss": 0.5613, "step": 93 }, { "epoch": 1.8972431077694236, "grad_norm": 0.14403155629811726, "learning_rate": 2.7820540309203728e-05, "loss": 0.5561, "step": 94 }, { "epoch": 1.9172932330827068, "grad_norm": 0.18577775134211605, "learning_rate": 2.691728146730314e-05, "loss": 0.5619, "step": 95 }, { "epoch": 1.93734335839599, "grad_norm": 0.17655395819783837, "learning_rate": 2.6021432816036073e-05, "loss": 0.557, "step": 96 }, { "epoch": 1.9573934837092732, "grad_norm": 0.15678742149650277, "learning_rate": 2.5133501773586905e-05, "loss": 0.55, "step": 97 }, { "epoch": 1.9774436090225564, "grad_norm": 0.147011080540491, "learning_rate": 2.425399127352235e-05, "loss": 0.5615, "step": 98 }, { "epoch": 2.0100250626566414, "grad_norm": 0.2896327588272941, "learning_rate": 2.338339947992455e-05, "loss": 0.984, "step": 99 }, { "epoch": 2.030075187969925, "grad_norm": 0.17749955956373084, "learning_rate": 2.2522219505225627e-05, "loss": 0.5472, "step": 100 }, { "epoch": 2.050125313283208, "grad_norm": 0.18960883016778624, "learning_rate": 2.1670939130903585e-05, "loss": 0.5246, "step": 101 }, { "epoch": 2.0701754385964914, "grad_norm": 0.19580310466277323, "learning_rate": 2.0830040531197744e-05, "loss": 0.5333, "step": 102 }, { "epoch": 2.090225563909774, "grad_norm": 0.17002438913970971, "learning_rate": 2.0000000000000012e-05, "loss": 0.5232, "step": 103 }, { "epoch": 2.110275689223058, "grad_norm": 0.23635746246402964, "learning_rate": 1.9181287681077116e-05, "loss": 0.5299, "step": 104 }, { "epoch": 2.1303258145363406, "grad_norm": 0.15533319903966208, "learning_rate": 1.8374367301776112e-05, "loss": 0.5193, "step": 105 }, { "epoch": 2.1503759398496243, "grad_norm": 0.2082307249817901, "learning_rate": 1.7579695910364235e-05, "loss": 0.5342, "step": 106 }, { "epoch": 2.170426065162907, "grad_norm": 0.14435259557657618, "learning_rate": 1.679772361715208e-05, "loss": 0.5361, "step": 107 }, { "epoch": 2.1904761904761907, "grad_norm": 0.20712238757176837, "learning_rate": 1.6028893339546122e-05, "loss": 0.5331, "step": 108 }, { "epoch": 2.2105263157894735, "grad_norm": 0.13522587328433971, "learning_rate": 1.527364055117579e-05, "loss": 0.5329, "step": 109 }, { "epoch": 2.230576441102757, "grad_norm": 0.1593070945975799, "learning_rate": 1.4532393035236477e-05, "loss": 0.5323, "step": 110 }, { "epoch": 2.25062656641604, "grad_norm": 0.1461967123983933, "learning_rate": 1.3805570642188602e-05, "loss": 0.5162, "step": 111 }, { "epoch": 2.2706766917293235, "grad_norm": 0.11139864548024211, "learning_rate": 1.30935850519499e-05, "loss": 0.5258, "step": 112 }, { "epoch": 2.2907268170426063, "grad_norm": 0.1459949648258527, "learning_rate": 1.2396839540715528e-05, "loss": 0.5249, "step": 113 }, { "epoch": 2.31077694235589, "grad_norm": 0.1134072731915256, "learning_rate": 1.1715728752538103e-05, "loss": 0.5283, "step": 114 }, { "epoch": 2.3308270676691727, "grad_norm": 0.10061797227153219, "learning_rate": 1.1050638475797193e-05, "loss": 0.5264, "step": 115 }, { "epoch": 2.3508771929824563, "grad_norm": 0.11967704006720019, "learning_rate": 1.0401945424684653e-05, "loss": 0.5258, "step": 116 }, { "epoch": 2.370927318295739, "grad_norm": 0.10081258865949869, "learning_rate": 9.770017025829675e-06, "loss": 0.5125, "step": 117 }, { "epoch": 2.3909774436090228, "grad_norm": 0.0910736424605034, "learning_rate": 9.155211210184495e-06, "loss": 0.5215, "step": 118 }, { "epoch": 2.4110275689223055, "grad_norm": 0.10320752546874508, "learning_rate": 8.55787621028851e-06, "loss": 0.5162, "step": 119 }, { "epoch": 2.431077694235589, "grad_norm": 0.09217800293365436, "learning_rate": 7.978350363025588e-06, "loss": 0.5343, "step": 120 }, { "epoch": 2.451127819548872, "grad_norm": 0.08490382451555073, "learning_rate": 7.416961917986572e-06, "loss": 0.5219, "step": 121 }, { "epoch": 2.4711779448621556, "grad_norm": 0.08656662969137564, "learning_rate": 6.874028851545174e-06, "loss": 0.5212, "step": 122 }, { "epoch": 2.4912280701754383, "grad_norm": 0.09050314726798651, "learning_rate": 6.349858686752748e-06, "loss": 0.5328, "step": 123 }, { "epoch": 2.511278195488722, "grad_norm": 0.0824254065764698, "learning_rate": 5.8447483191540784e-06, "loss": 0.5282, "step": 124 }, { "epoch": 2.5313283208020048, "grad_norm": 0.09004753737634522, "learning_rate": 5.358983848622452e-06, "loss": 0.5291, "step": 125 }, { "epoch": 2.5513784461152884, "grad_norm": 0.08491851685766005, "learning_rate": 4.892840417309775e-06, "loss": 0.5174, "step": 126 }, { "epoch": 2.571428571428571, "grad_norm": 0.08667374099676277, "learning_rate": 4.446582053803066e-06, "loss": 0.5269, "step": 127 }, { "epoch": 2.591478696741855, "grad_norm": 0.08634185611814611, "learning_rate": 4.020461523575873e-06, "loss": 0.5404, "step": 128 }, { "epoch": 2.6115288220551376, "grad_norm": 0.08142061930820212, "learning_rate": 3.6147201858192627e-06, "loss": 0.5297, "step": 129 }, { "epoch": 2.6315789473684212, "grad_norm": 0.08234802409772482, "learning_rate": 3.2295878567333784e-06, "loss": 0.5347, "step": 130 }, { "epoch": 2.651629072681704, "grad_norm": 0.0850700303547024, "learning_rate": 2.8652826793570975e-06, "loss": 0.5309, "step": 131 }, { "epoch": 2.6716791979949877, "grad_norm": 0.08576766617100663, "learning_rate": 2.5220110000095366e-06, "loss": 0.529, "step": 132 }, { "epoch": 2.6917293233082704, "grad_norm": 0.08621611962612956, "learning_rate": 2.199967251413262e-06, "loss": 0.526, "step": 133 }, { "epoch": 2.711779448621554, "grad_norm": 0.07973648331165978, "learning_rate": 1.8993338425655805e-06, "loss": 0.5291, "step": 134 }, { "epoch": 2.731829573934837, "grad_norm": 0.07607751613838545, "learning_rate": 1.6202810554201099e-06, "loss": 0.5287, "step": 135 }, { "epoch": 2.7518796992481205, "grad_norm": 0.0764140719381241, "learning_rate": 1.3629669484372722e-06, "loss": 0.519, "step": 136 }, { "epoch": 2.7719298245614032, "grad_norm": 0.07473144353146483, "learning_rate": 1.127537267058334e-06, "loss": 0.5299, "step": 137 }, { "epoch": 2.791979949874687, "grad_norm": 0.07476164790510335, "learning_rate": 9.141253611536238e-07, "loss": 0.5335, "step": 138 }, { "epoch": 2.8120300751879697, "grad_norm": 0.07472989997236351, "learning_rate": 7.228521094917318e-07, "loss": 0.5156, "step": 139 }, { "epoch": 2.8320802005012533, "grad_norm": 0.07763611917758945, "learning_rate": 5.538258512725403e-07, "loss": 0.528, "step": 140 }, { "epoch": 2.852130325814536, "grad_norm": 0.07864532159225761, "learning_rate": 4.0714232476269265e-07, "loss": 0.5205, "step": 141 }, { "epoch": 2.8721804511278197, "grad_norm": 0.07157416039875991, "learning_rate": 2.8288461306846817e-07, "loss": 0.5251, "step": 142 }, { "epoch": 2.8922305764411025, "grad_norm": 0.07874362826484563, "learning_rate": 1.8112309707661647e-07, "loss": 0.5326, "step": 143 }, { "epoch": 2.912280701754386, "grad_norm": 0.07678251046065998, "learning_rate": 1.019154155898594e-07, "loss": 0.5325, "step": 144 }, { "epoch": 2.932330827067669, "grad_norm": 0.07928853233638587, "learning_rate": 4.530643267968149e-08, "loss": 0.5283, "step": 145 }, { "epoch": 2.9523809523809526, "grad_norm": 0.07049261679058248, "learning_rate": 1.1328212274839267e-08, "loss": 0.5307, "step": 146 }, { "epoch": 2.9724310776942353, "grad_norm": 0.07518275904349948, "learning_rate": 0.0, "loss": 0.5333, "step": 147 }, { "epoch": 2.9724310776942353, "step": 147, "total_flos": 3.782746824809382e+18, "train_loss": 0.6079183743924511, "train_runtime": 22677.177, "train_samples_per_second": 3.374, "train_steps_per_second": 0.006 } ], "logging_steps": 1.0, "max_steps": 147, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.782746824809382e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }