{ "best_global_step": 300, "best_metric": 0.11345648020505905, "best_model_checkpoint": "./qwen-math-lora/checkpoint-300", "epoch": 1.5800922874093606, "eval_steps": 50, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005273566249176005, "grad_norm": 1.5119972229003906, "learning_rate": 0.0, "loss": 1.6366, "step": 1 }, { "epoch": 0.01054713249835201, "grad_norm": 1.4765245914459229, "learning_rate": 1.0000000000000002e-06, "loss": 1.6667, "step": 2 }, { "epoch": 0.015820698747528016, "grad_norm": 1.4236232042312622, "learning_rate": 2.0000000000000003e-06, "loss": 1.6576, "step": 3 }, { "epoch": 0.02109426499670402, "grad_norm": 1.5235203504562378, "learning_rate": 3e-06, "loss": 1.6572, "step": 4 }, { "epoch": 0.026367831245880026, "grad_norm": 1.4115797281265259, "learning_rate": 4.000000000000001e-06, "loss": 1.5819, "step": 5 }, { "epoch": 0.03164139749505603, "grad_norm": 1.4249836206436157, "learning_rate": 5e-06, "loss": 1.6502, "step": 6 }, { "epoch": 0.03691496374423204, "grad_norm": 1.3071649074554443, "learning_rate": 6e-06, "loss": 1.6807, "step": 7 }, { "epoch": 0.04218852999340804, "grad_norm": 1.2610119581222534, "learning_rate": 7.000000000000001e-06, "loss": 1.5825, "step": 8 }, { "epoch": 0.047462096242584045, "grad_norm": 1.235508918762207, "learning_rate": 8.000000000000001e-06, "loss": 1.5464, "step": 9 }, { "epoch": 0.05273566249176005, "grad_norm": 1.151235580444336, "learning_rate": 9e-06, "loss": 1.5856, "step": 10 }, { "epoch": 0.05800922874093606, "grad_norm": 1.058812141418457, "learning_rate": 1e-05, "loss": 1.542, "step": 11 }, { "epoch": 0.06328279499011207, "grad_norm": 0.935869038105011, "learning_rate": 1.1000000000000001e-05, "loss": 1.5216, "step": 12 }, { "epoch": 0.06855636123928807, "grad_norm": 0.8530864715576172, "learning_rate": 1.2e-05, "loss": 1.4271, "step": 13 }, { "epoch": 0.07382992748846408, "grad_norm": 0.836365282535553, "learning_rate": 1.3000000000000001e-05, "loss": 1.4095, "step": 14 }, { "epoch": 0.07910349373764008, "grad_norm": 0.7369374632835388, "learning_rate": 1.4000000000000001e-05, "loss": 1.3664, "step": 15 }, { "epoch": 0.08437705998681608, "grad_norm": 0.741736888885498, "learning_rate": 1.5e-05, "loss": 1.4031, "step": 16 }, { "epoch": 0.08965062623599208, "grad_norm": 0.6538688540458679, "learning_rate": 1.6000000000000003e-05, "loss": 1.3768, "step": 17 }, { "epoch": 0.09492419248516809, "grad_norm": 0.6126262545585632, "learning_rate": 1.7000000000000003e-05, "loss": 1.3554, "step": 18 }, { "epoch": 0.1001977587343441, "grad_norm": 0.5822679996490479, "learning_rate": 1.8e-05, "loss": 1.2992, "step": 19 }, { "epoch": 0.1054713249835201, "grad_norm": 0.5410017967224121, "learning_rate": 1.9e-05, "loss": 1.2494, "step": 20 }, { "epoch": 0.11074489123269611, "grad_norm": 0.5416837334632874, "learning_rate": 2e-05, "loss": 1.2604, "step": 21 }, { "epoch": 0.11601845748187212, "grad_norm": 0.5807645320892334, "learning_rate": 2.1e-05, "loss": 1.178, "step": 22 }, { "epoch": 0.12129202373104812, "grad_norm": 0.5549229383468628, "learning_rate": 2.2000000000000003e-05, "loss": 1.1861, "step": 23 }, { "epoch": 0.12656558998022413, "grad_norm": 0.5763499736785889, "learning_rate": 2.3000000000000003e-05, "loss": 1.2242, "step": 24 }, { "epoch": 0.13183915622940012, "grad_norm": 0.5674681663513184, "learning_rate": 2.4e-05, "loss": 1.1442, "step": 25 }, { "epoch": 0.13711272247857614, "grad_norm": 0.5441560745239258, "learning_rate": 2.5e-05, "loss": 1.0821, "step": 26 }, { "epoch": 0.14238628872775214, "grad_norm": 0.58034348487854, "learning_rate": 2.6000000000000002e-05, "loss": 1.1068, "step": 27 }, { "epoch": 0.14765985497692816, "grad_norm": 0.563574492931366, "learning_rate": 2.7000000000000002e-05, "loss": 1.0316, "step": 28 }, { "epoch": 0.15293342122610415, "grad_norm": 0.5922898054122925, "learning_rate": 2.8000000000000003e-05, "loss": 1.0707, "step": 29 }, { "epoch": 0.15820698747528017, "grad_norm": 0.46859392523765564, "learning_rate": 2.9e-05, "loss": 0.97, "step": 30 }, { "epoch": 0.16348055372445616, "grad_norm": 0.7508406639099121, "learning_rate": 3e-05, "loss": 0.9857, "step": 31 }, { "epoch": 0.16875411997363216, "grad_norm": 0.6806529760360718, "learning_rate": 3.1e-05, "loss": 0.9589, "step": 32 }, { "epoch": 0.17402768622280818, "grad_norm": 0.35177281498908997, "learning_rate": 3.2000000000000005e-05, "loss": 0.9319, "step": 33 }, { "epoch": 0.17930125247198417, "grad_norm": 0.35340362787246704, "learning_rate": 3.3e-05, "loss": 0.9878, "step": 34 }, { "epoch": 0.1845748187211602, "grad_norm": 0.3041383624076843, "learning_rate": 3.4000000000000007e-05, "loss": 0.9501, "step": 35 }, { "epoch": 0.18984838497033618, "grad_norm": 0.29335305094718933, "learning_rate": 3.5e-05, "loss": 0.8826, "step": 36 }, { "epoch": 0.1951219512195122, "grad_norm": 0.2781873345375061, "learning_rate": 3.6e-05, "loss": 0.9757, "step": 37 }, { "epoch": 0.2003955174686882, "grad_norm": 0.3608724772930145, "learning_rate": 3.7e-05, "loss": 0.9229, "step": 38 }, { "epoch": 0.20566908371786422, "grad_norm": 0.2756713032722473, "learning_rate": 3.8e-05, "loss": 0.8868, "step": 39 }, { "epoch": 0.2109426499670402, "grad_norm": 0.3764660060405731, "learning_rate": 3.9000000000000006e-05, "loss": 0.9301, "step": 40 }, { "epoch": 0.21621621621621623, "grad_norm": 0.27100852131843567, "learning_rate": 4e-05, "loss": 0.9014, "step": 41 }, { "epoch": 0.22148978246539222, "grad_norm": 0.27153897285461426, "learning_rate": 4.1e-05, "loss": 0.8569, "step": 42 }, { "epoch": 0.2267633487145682, "grad_norm": 0.2656016945838928, "learning_rate": 4.2e-05, "loss": 0.8353, "step": 43 }, { "epoch": 0.23203691496374423, "grad_norm": 0.30224132537841797, "learning_rate": 4.3e-05, "loss": 0.8531, "step": 44 }, { "epoch": 0.23731048121292023, "grad_norm": 0.2992110252380371, "learning_rate": 4.4000000000000006e-05, "loss": 0.9029, "step": 45 }, { "epoch": 0.24258404746209625, "grad_norm": 0.2795073091983795, "learning_rate": 4.5e-05, "loss": 0.8582, "step": 46 }, { "epoch": 0.24785761371127224, "grad_norm": 0.27543389797210693, "learning_rate": 4.600000000000001e-05, "loss": 0.7899, "step": 47 }, { "epoch": 0.25313117996044826, "grad_norm": 0.26102226972579956, "learning_rate": 4.7e-05, "loss": 0.7705, "step": 48 }, { "epoch": 0.2584047462096243, "grad_norm": 0.32240045070648193, "learning_rate": 4.8e-05, "loss": 0.7833, "step": 49 }, { "epoch": 0.26367831245880025, "grad_norm": 0.2760595679283142, "learning_rate": 4.9e-05, "loss": 0.8035, "step": 50 }, { "epoch": 0.26367831245880025, "eval_loss": 0.8153137564659119, "eval_runtime": 133.8438, "eval_samples_per_second": 8.002, "eval_steps_per_second": 2.002, "step": 50 }, { "epoch": 0.26895187870797627, "grad_norm": 0.29733768105506897, "learning_rate": 5e-05, "loss": 0.8767, "step": 51 }, { "epoch": 0.2742254449571523, "grad_norm": 0.4476633667945862, "learning_rate": 5.1000000000000006e-05, "loss": 0.7695, "step": 52 }, { "epoch": 0.2794990112063283, "grad_norm": 0.3744952380657196, "learning_rate": 5.2000000000000004e-05, "loss": 0.8464, "step": 53 }, { "epoch": 0.28477257745550427, "grad_norm": 0.2564408779144287, "learning_rate": 5.300000000000001e-05, "loss": 0.8191, "step": 54 }, { "epoch": 0.2900461437046803, "grad_norm": 0.2613051235675812, "learning_rate": 5.4000000000000005e-05, "loss": 0.7771, "step": 55 }, { "epoch": 0.2953197099538563, "grad_norm": 0.4838894307613373, "learning_rate": 5.500000000000001e-05, "loss": 0.7751, "step": 56 }, { "epoch": 0.3005932762030323, "grad_norm": 0.28951677680015564, "learning_rate": 5.6000000000000006e-05, "loss": 0.7704, "step": 57 }, { "epoch": 0.3058668424522083, "grad_norm": 0.2760978043079376, "learning_rate": 5.6999999999999996e-05, "loss": 0.8058, "step": 58 }, { "epoch": 0.3111404087013843, "grad_norm": 0.2781215310096741, "learning_rate": 5.8e-05, "loss": 0.7634, "step": 59 }, { "epoch": 0.31641397495056034, "grad_norm": 0.25308936834335327, "learning_rate": 5.9e-05, "loss": 0.7516, "step": 60 }, { "epoch": 0.3216875411997363, "grad_norm": 0.3314322531223297, "learning_rate": 6e-05, "loss": 0.7492, "step": 61 }, { "epoch": 0.3269611074489123, "grad_norm": 0.26924365758895874, "learning_rate": 6.1e-05, "loss": 0.7459, "step": 62 }, { "epoch": 0.33223467369808835, "grad_norm": 0.26491013169288635, "learning_rate": 6.2e-05, "loss": 0.7338, "step": 63 }, { "epoch": 0.3375082399472643, "grad_norm": 0.28656676411628723, "learning_rate": 6.3e-05, "loss": 0.7454, "step": 64 }, { "epoch": 0.34278180619644033, "grad_norm": 0.3129251301288605, "learning_rate": 6.400000000000001e-05, "loss": 0.751, "step": 65 }, { "epoch": 0.34805537244561635, "grad_norm": 0.3116537928581238, "learning_rate": 6.500000000000001e-05, "loss": 0.6941, "step": 66 }, { "epoch": 0.35332893869479237, "grad_norm": 0.3021077513694763, "learning_rate": 6.6e-05, "loss": 0.7172, "step": 67 }, { "epoch": 0.35860250494396834, "grad_norm": 0.2933245599269867, "learning_rate": 6.7e-05, "loss": 0.7293, "step": 68 }, { "epoch": 0.36387607119314436, "grad_norm": 0.32778868079185486, "learning_rate": 6.800000000000001e-05, "loss": 0.6935, "step": 69 }, { "epoch": 0.3691496374423204, "grad_norm": 0.286576509475708, "learning_rate": 6.9e-05, "loss": 0.6441, "step": 70 }, { "epoch": 0.3744232036914964, "grad_norm": 0.27806833386421204, "learning_rate": 7e-05, "loss": 0.7246, "step": 71 }, { "epoch": 0.37969676994067236, "grad_norm": 0.31078678369522095, "learning_rate": 7.1e-05, "loss": 0.6322, "step": 72 }, { "epoch": 0.3849703361898484, "grad_norm": 0.3146444261074066, "learning_rate": 7.2e-05, "loss": 0.6872, "step": 73 }, { "epoch": 0.3902439024390244, "grad_norm": 0.3151572346687317, "learning_rate": 7.3e-05, "loss": 0.693, "step": 74 }, { "epoch": 0.39551746868820037, "grad_norm": 0.33185523748397827, "learning_rate": 7.4e-05, "loss": 0.6937, "step": 75 }, { "epoch": 0.4007910349373764, "grad_norm": 0.3287936747074127, "learning_rate": 7.500000000000001e-05, "loss": 0.7058, "step": 76 }, { "epoch": 0.4060646011865524, "grad_norm": 0.34535712003707886, "learning_rate": 7.6e-05, "loss": 0.6538, "step": 77 }, { "epoch": 0.41133816743572843, "grad_norm": 0.34255126118659973, "learning_rate": 7.7e-05, "loss": 0.674, "step": 78 }, { "epoch": 0.4166117336849044, "grad_norm": 0.7276009321212769, "learning_rate": 7.800000000000001e-05, "loss": 0.6221, "step": 79 }, { "epoch": 0.4218852999340804, "grad_norm": 0.41575613617897034, "learning_rate": 7.900000000000001e-05, "loss": 0.5828, "step": 80 }, { "epoch": 0.42715886618325644, "grad_norm": 0.33262866735458374, "learning_rate": 8e-05, "loss": 0.5782, "step": 81 }, { "epoch": 0.43243243243243246, "grad_norm": 0.33510202169418335, "learning_rate": 8.1e-05, "loss": 0.5731, "step": 82 }, { "epoch": 0.4377059986816084, "grad_norm": 0.3654046058654785, "learning_rate": 8.2e-05, "loss": 0.5739, "step": 83 }, { "epoch": 0.44297956493078444, "grad_norm": 0.3834691643714905, "learning_rate": 8.3e-05, "loss": 0.5629, "step": 84 }, { "epoch": 0.44825313117996046, "grad_norm": 0.3804622292518616, "learning_rate": 8.4e-05, "loss": 0.5433, "step": 85 }, { "epoch": 0.4535266974291364, "grad_norm": 0.3488738238811493, "learning_rate": 8.5e-05, "loss": 0.5517, "step": 86 }, { "epoch": 0.45880026367831245, "grad_norm": 0.38344502449035645, "learning_rate": 8.6e-05, "loss": 0.576, "step": 87 }, { "epoch": 0.46407382992748847, "grad_norm": 0.3855077624320984, "learning_rate": 8.7e-05, "loss": 0.527, "step": 88 }, { "epoch": 0.4693473961766645, "grad_norm": 0.3912067711353302, "learning_rate": 8.800000000000001e-05, "loss": 0.5165, "step": 89 }, { "epoch": 0.47462096242584045, "grad_norm": 0.4480763077735901, "learning_rate": 8.900000000000001e-05, "loss": 0.5103, "step": 90 }, { "epoch": 0.4798945286750165, "grad_norm": 0.4126266539096832, "learning_rate": 9e-05, "loss": 0.5246, "step": 91 }, { "epoch": 0.4851680949241925, "grad_norm": 0.41678905487060547, "learning_rate": 9.1e-05, "loss": 0.55, "step": 92 }, { "epoch": 0.4904416611733685, "grad_norm": 0.42350953817367554, "learning_rate": 9.200000000000001e-05, "loss": 0.493, "step": 93 }, { "epoch": 0.4957152274225445, "grad_norm": 0.44608232378959656, "learning_rate": 9.300000000000001e-05, "loss": 0.5233, "step": 94 }, { "epoch": 0.5009887936717206, "grad_norm": 0.44571366906166077, "learning_rate": 9.4e-05, "loss": 0.56, "step": 95 }, { "epoch": 0.5062623599208965, "grad_norm": 0.44927364587783813, "learning_rate": 9.5e-05, "loss": 0.5191, "step": 96 }, { "epoch": 0.5115359261700725, "grad_norm": 0.5781615376472473, "learning_rate": 9.6e-05, "loss": 0.5259, "step": 97 }, { "epoch": 0.5168094924192486, "grad_norm": 0.4781758785247803, "learning_rate": 9.7e-05, "loss": 0.5061, "step": 98 }, { "epoch": 0.5220830586684245, "grad_norm": 0.46505609154701233, "learning_rate": 9.8e-05, "loss": 0.4804, "step": 99 }, { "epoch": 0.5273566249176005, "grad_norm": 0.7176192998886108, "learning_rate": 9.900000000000001e-05, "loss": 0.4287, "step": 100 }, { "epoch": 0.5273566249176005, "eval_loss": 0.4786125123500824, "eval_runtime": 133.8519, "eval_samples_per_second": 8.001, "eval_steps_per_second": 2.002, "step": 100 }, { "epoch": 0.5326301911667766, "grad_norm": 0.4295816421508789, "learning_rate": 0.0001, "loss": 0.4786, "step": 101 }, { "epoch": 0.5379037574159525, "grad_norm": 0.47132614254951477, "learning_rate": 0.000101, "loss": 0.4703, "step": 102 }, { "epoch": 0.5431773236651285, "grad_norm": 0.5543473958969116, "learning_rate": 0.00010200000000000001, "loss": 0.4401, "step": 103 }, { "epoch": 0.5484508899143046, "grad_norm": 0.498334139585495, "learning_rate": 0.00010300000000000001, "loss": 0.4324, "step": 104 }, { "epoch": 0.5537244561634805, "grad_norm": 0.49423035979270935, "learning_rate": 0.00010400000000000001, "loss": 0.4316, "step": 105 }, { "epoch": 0.5589980224126566, "grad_norm": 5.340365409851074, "learning_rate": 0.000105, "loss": 0.4713, "step": 106 }, { "epoch": 0.5642715886618326, "grad_norm": 0.5593706965446472, "learning_rate": 0.00010600000000000002, "loss": 0.4143, "step": 107 }, { "epoch": 0.5695451549110085, "grad_norm": 0.4752410054206848, "learning_rate": 0.00010700000000000001, "loss": 0.419, "step": 108 }, { "epoch": 0.5748187211601846, "grad_norm": 0.6359984278678894, "learning_rate": 0.00010800000000000001, "loss": 0.4151, "step": 109 }, { "epoch": 0.5800922874093606, "grad_norm": 0.5052346587181091, "learning_rate": 0.000109, "loss": 0.3952, "step": 110 }, { "epoch": 0.5853658536585366, "grad_norm": 0.49212637543678284, "learning_rate": 0.00011000000000000002, "loss": 0.4617, "step": 111 }, { "epoch": 0.5906394199077126, "grad_norm": 0.5236564874649048, "learning_rate": 0.00011100000000000001, "loss": 0.4325, "step": 112 }, { "epoch": 0.5959129861568886, "grad_norm": 0.6041468381881714, "learning_rate": 0.00011200000000000001, "loss": 0.4009, "step": 113 }, { "epoch": 0.6011865524060646, "grad_norm": 0.5389513969421387, "learning_rate": 0.000113, "loss": 0.4263, "step": 114 }, { "epoch": 0.6064601186552406, "grad_norm": 0.5749898552894592, "learning_rate": 0.00011399999999999999, "loss": 0.4105, "step": 115 }, { "epoch": 0.6117336849044166, "grad_norm": 0.5574321150779724, "learning_rate": 0.00011499999999999999, "loss": 0.3967, "step": 116 }, { "epoch": 0.6170072511535926, "grad_norm": 0.5891500115394592, "learning_rate": 0.000116, "loss": 0.3991, "step": 117 }, { "epoch": 0.6222808174027686, "grad_norm": 0.5306826829910278, "learning_rate": 0.000117, "loss": 0.3726, "step": 118 }, { "epoch": 0.6275543836519446, "grad_norm": 0.4786357581615448, "learning_rate": 0.000118, "loss": 0.3541, "step": 119 }, { "epoch": 0.6328279499011207, "grad_norm": 0.47434163093566895, "learning_rate": 0.000119, "loss": 0.3471, "step": 120 }, { "epoch": 0.6381015161502966, "grad_norm": 0.5113804340362549, "learning_rate": 0.00012, "loss": 0.3519, "step": 121 }, { "epoch": 0.6433750823994726, "grad_norm": 0.5574295520782471, "learning_rate": 0.000121, "loss": 0.3591, "step": 122 }, { "epoch": 0.6486486486486487, "grad_norm": 1.5176341533660889, "learning_rate": 0.000122, "loss": 0.331, "step": 123 }, { "epoch": 0.6539222148978246, "grad_norm": 0.5883108377456665, "learning_rate": 0.000123, "loss": 0.3621, "step": 124 }, { "epoch": 0.6591957811470006, "grad_norm": 0.5086923837661743, "learning_rate": 0.000124, "loss": 0.3719, "step": 125 }, { "epoch": 0.6644693473961767, "grad_norm": 0.5057904124259949, "learning_rate": 0.000125, "loss": 0.3418, "step": 126 }, { "epoch": 0.6697429136453527, "grad_norm": 0.5942703485488892, "learning_rate": 0.000126, "loss": 0.305, "step": 127 }, { "epoch": 0.6750164798945286, "grad_norm": 0.4942289888858795, "learning_rate": 0.000127, "loss": 0.3509, "step": 128 }, { "epoch": 0.6802900461437047, "grad_norm": 0.6494962573051453, "learning_rate": 0.00012800000000000002, "loss": 0.3425, "step": 129 }, { "epoch": 0.6855636123928807, "grad_norm": 1.0529124736785889, "learning_rate": 0.00012900000000000002, "loss": 0.294, "step": 130 }, { "epoch": 0.6908371786420567, "grad_norm": 0.6346781253814697, "learning_rate": 0.00013000000000000002, "loss": 0.325, "step": 131 }, { "epoch": 0.6961107448912327, "grad_norm": 0.5200821161270142, "learning_rate": 0.000131, "loss": 0.3484, "step": 132 }, { "epoch": 0.7013843111404087, "grad_norm": 0.49618640542030334, "learning_rate": 0.000132, "loss": 0.301, "step": 133 }, { "epoch": 0.7066578773895847, "grad_norm": 0.4997330904006958, "learning_rate": 0.000133, "loss": 0.2953, "step": 134 }, { "epoch": 0.7119314436387607, "grad_norm": 0.5263347625732422, "learning_rate": 0.000134, "loss": 0.2767, "step": 135 }, { "epoch": 0.7172050098879367, "grad_norm": 0.560567319393158, "learning_rate": 0.00013500000000000003, "loss": 0.3286, "step": 136 }, { "epoch": 0.7224785761371127, "grad_norm": 0.4766915738582611, "learning_rate": 0.00013600000000000003, "loss": 0.3108, "step": 137 }, { "epoch": 0.7277521423862887, "grad_norm": 0.47753745317459106, "learning_rate": 0.00013700000000000002, "loss": 0.2282, "step": 138 }, { "epoch": 0.7330257086354647, "grad_norm": 0.5010929107666016, "learning_rate": 0.000138, "loss": 0.2731, "step": 139 }, { "epoch": 0.7382992748846408, "grad_norm": 0.5264869928359985, "learning_rate": 0.000139, "loss": 0.2598, "step": 140 }, { "epoch": 0.7435728411338167, "grad_norm": 0.47988757491111755, "learning_rate": 0.00014, "loss": 0.2637, "step": 141 }, { "epoch": 0.7488464073829928, "grad_norm": 0.48291894793510437, "learning_rate": 0.000141, "loss": 0.2739, "step": 142 }, { "epoch": 0.7541199736321688, "grad_norm": 0.5980640649795532, "learning_rate": 0.000142, "loss": 0.3233, "step": 143 }, { "epoch": 0.7593935398813447, "grad_norm": 0.46733126044273376, "learning_rate": 0.000143, "loss": 0.2315, "step": 144 }, { "epoch": 0.7646671061305208, "grad_norm": 0.4654427766799927, "learning_rate": 0.000144, "loss": 0.2479, "step": 145 }, { "epoch": 0.7699406723796968, "grad_norm": 0.46202385425567627, "learning_rate": 0.000145, "loss": 0.3064, "step": 146 }, { "epoch": 0.7752142386288727, "grad_norm": 0.47191861271858215, "learning_rate": 0.000146, "loss": 0.2139, "step": 147 }, { "epoch": 0.7804878048780488, "grad_norm": 0.5178374648094177, "learning_rate": 0.000147, "loss": 0.2304, "step": 148 }, { "epoch": 0.7857613711272248, "grad_norm": 0.3869185149669647, "learning_rate": 0.000148, "loss": 0.2772, "step": 149 }, { "epoch": 0.7910349373764007, "grad_norm": 0.4422077238559723, "learning_rate": 0.00014900000000000002, "loss": 0.2469, "step": 150 }, { "epoch": 0.7910349373764007, "eval_loss": 0.2504952847957611, "eval_runtime": 133.8524, "eval_samples_per_second": 8.001, "eval_steps_per_second": 2.002, "step": 150 }, { "epoch": 0.7963085036255768, "grad_norm": 0.4492229223251343, "learning_rate": 0.00015000000000000001, "loss": 0.2298, "step": 151 }, { "epoch": 0.8015820698747528, "grad_norm": 0.5070360898971558, "learning_rate": 0.000151, "loss": 0.2389, "step": 152 }, { "epoch": 0.8068556361239289, "grad_norm": 0.39493462443351746, "learning_rate": 0.000152, "loss": 0.206, "step": 153 }, { "epoch": 0.8121292023731048, "grad_norm": 0.44301116466522217, "learning_rate": 0.000153, "loss": 0.2592, "step": 154 }, { "epoch": 0.8174027686222808, "grad_norm": 0.4067859351634979, "learning_rate": 0.000154, "loss": 0.2242, "step": 155 }, { "epoch": 0.8226763348714569, "grad_norm": 0.43918946385383606, "learning_rate": 0.000155, "loss": 0.2127, "step": 156 }, { "epoch": 0.8279499011206328, "grad_norm": 0.5059219598770142, "learning_rate": 0.00015600000000000002, "loss": 0.2561, "step": 157 }, { "epoch": 0.8332234673698088, "grad_norm": 0.4179636836051941, "learning_rate": 0.00015700000000000002, "loss": 0.201, "step": 158 }, { "epoch": 0.8384970336189849, "grad_norm": 0.4800855219364166, "learning_rate": 0.00015800000000000002, "loss": 0.2486, "step": 159 }, { "epoch": 0.8437705998681608, "grad_norm": 0.4267498254776001, "learning_rate": 0.00015900000000000002, "loss": 0.2054, "step": 160 }, { "epoch": 0.8490441661173368, "grad_norm": 0.4868602156639099, "learning_rate": 0.00016, "loss": 0.2206, "step": 161 }, { "epoch": 0.8543177323665129, "grad_norm": 0.4100910723209381, "learning_rate": 0.000161, "loss": 0.2076, "step": 162 }, { "epoch": 0.8595912986156888, "grad_norm": 0.3785172402858734, "learning_rate": 0.000162, "loss": 0.2175, "step": 163 }, { "epoch": 0.8648648648648649, "grad_norm": 0.4334642291069031, "learning_rate": 0.000163, "loss": 0.1801, "step": 164 }, { "epoch": 0.8701384311140409, "grad_norm": 0.3873803913593292, "learning_rate": 0.000164, "loss": 0.203, "step": 165 }, { "epoch": 0.8754119973632168, "grad_norm": 0.43101224303245544, "learning_rate": 0.000165, "loss": 0.2021, "step": 166 }, { "epoch": 0.8806855636123929, "grad_norm": 0.43550118803977966, "learning_rate": 0.000166, "loss": 0.2096, "step": 167 }, { "epoch": 0.8859591298615689, "grad_norm": 0.44657325744628906, "learning_rate": 0.000167, "loss": 0.2052, "step": 168 }, { "epoch": 0.8912326961107448, "grad_norm": 0.4124061167240143, "learning_rate": 0.000168, "loss": 0.2199, "step": 169 }, { "epoch": 0.8965062623599209, "grad_norm": 0.5452592372894287, "learning_rate": 0.00016900000000000002, "loss": 0.2295, "step": 170 }, { "epoch": 0.9017798286090969, "grad_norm": 0.41437071561813354, "learning_rate": 0.00017, "loss": 0.1891, "step": 171 }, { "epoch": 0.9070533948582729, "grad_norm": 0.3778395354747772, "learning_rate": 0.000171, "loss": 0.2194, "step": 172 }, { "epoch": 0.9123269611074489, "grad_norm": 0.37173032760620117, "learning_rate": 0.000172, "loss": 0.1594, "step": 173 }, { "epoch": 0.9176005273566249, "grad_norm": 0.38124048709869385, "learning_rate": 0.000173, "loss": 0.1975, "step": 174 }, { "epoch": 0.922874093605801, "grad_norm": 0.48111554980278015, "learning_rate": 0.000174, "loss": 0.2017, "step": 175 }, { "epoch": 0.9281476598549769, "grad_norm": 0.44690003991127014, "learning_rate": 0.000175, "loss": 0.1859, "step": 176 }, { "epoch": 0.9334212261041529, "grad_norm": 0.3716354966163635, "learning_rate": 0.00017600000000000002, "loss": 0.1964, "step": 177 }, { "epoch": 0.938694792353329, "grad_norm": 0.36687999963760376, "learning_rate": 0.00017700000000000002, "loss": 0.1982, "step": 178 }, { "epoch": 0.9439683586025049, "grad_norm": 0.4883500337600708, "learning_rate": 0.00017800000000000002, "loss": 0.2219, "step": 179 }, { "epoch": 0.9492419248516809, "grad_norm": 0.33809033036231995, "learning_rate": 0.00017900000000000001, "loss": 0.1812, "step": 180 }, { "epoch": 0.954515491100857, "grad_norm": 0.3546331524848938, "learning_rate": 0.00018, "loss": 0.1767, "step": 181 }, { "epoch": 0.959789057350033, "grad_norm": 0.357530802488327, "learning_rate": 0.000181, "loss": 0.1823, "step": 182 }, { "epoch": 0.9650626235992089, "grad_norm": 0.34756705164909363, "learning_rate": 0.000182, "loss": 0.2015, "step": 183 }, { "epoch": 0.970336189848385, "grad_norm": 0.36489251255989075, "learning_rate": 0.000183, "loss": 0.1799, "step": 184 }, { "epoch": 0.975609756097561, "grad_norm": 0.3720036745071411, "learning_rate": 0.00018400000000000003, "loss": 0.1852, "step": 185 }, { "epoch": 0.980883322346737, "grad_norm": 0.3317737877368927, "learning_rate": 0.00018500000000000002, "loss": 0.1868, "step": 186 }, { "epoch": 0.986156888595913, "grad_norm": 0.35604041814804077, "learning_rate": 0.00018600000000000002, "loss": 0.1787, "step": 187 }, { "epoch": 0.991430454845089, "grad_norm": 0.37222427129745483, "learning_rate": 0.00018700000000000002, "loss": 0.1757, "step": 188 }, { "epoch": 0.996704021094265, "grad_norm": 0.40133216977119446, "learning_rate": 0.000188, "loss": 0.1767, "step": 189 }, { "epoch": 1.0, "grad_norm": 0.36636754870414734, "learning_rate": 0.00018899999999999999, "loss": 0.1186, "step": 190 }, { "epoch": 1.005273566249176, "grad_norm": 0.3590473234653473, "learning_rate": 0.00019, "loss": 0.1585, "step": 191 }, { "epoch": 1.010547132498352, "grad_norm": 0.3549407124519348, "learning_rate": 0.000191, "loss": 0.1512, "step": 192 }, { "epoch": 1.015820698747528, "grad_norm": 0.3402779698371887, "learning_rate": 0.000192, "loss": 0.1317, "step": 193 }, { "epoch": 1.021094264996704, "grad_norm": 0.27974751591682434, "learning_rate": 0.000193, "loss": 0.1492, "step": 194 }, { "epoch": 1.02636783124588, "grad_norm": 0.2740594446659088, "learning_rate": 0.000194, "loss": 0.1614, "step": 195 }, { "epoch": 1.031641397495056, "grad_norm": 0.3438091576099396, "learning_rate": 0.000195, "loss": 0.1639, "step": 196 }, { "epoch": 1.036914963744232, "grad_norm": 0.35183268785476685, "learning_rate": 0.000196, "loss": 0.137, "step": 197 }, { "epoch": 1.042188529993408, "grad_norm": 0.3638111352920532, "learning_rate": 0.00019700000000000002, "loss": 0.1896, "step": 198 }, { "epoch": 1.047462096242584, "grad_norm": 0.3941810131072998, "learning_rate": 0.00019800000000000002, "loss": 0.1905, "step": 199 }, { "epoch": 1.05273566249176, "grad_norm": 0.2802982032299042, "learning_rate": 0.000199, "loss": 0.1355, "step": 200 }, { "epoch": 1.05273566249176, "eval_loss": 0.16099050641059875, "eval_runtime": 133.9383, "eval_samples_per_second": 7.996, "eval_steps_per_second": 2.001, "step": 200 }, { "epoch": 1.0580092287409362, "grad_norm": 0.33312973380088806, "learning_rate": 0.0002, "loss": 0.1296, "step": 201 }, { "epoch": 1.063282794990112, "grad_norm": 0.291446328163147, "learning_rate": 0.00019999842640648654, "loss": 0.1354, "step": 202 }, { "epoch": 1.068556361239288, "grad_norm": 0.3492049276828766, "learning_rate": 0.00019999370567547008, "loss": 0.1609, "step": 203 }, { "epoch": 1.0738299274884642, "grad_norm": 0.36389562487602234, "learning_rate": 0.00019998583795552083, "loss": 0.1234, "step": 204 }, { "epoch": 1.07910349373764, "grad_norm": 0.3225807845592499, "learning_rate": 0.00019997482349425066, "loss": 0.142, "step": 205 }, { "epoch": 1.084377059986816, "grad_norm": 0.3185547888278961, "learning_rate": 0.00019996066263830531, "loss": 0.1493, "step": 206 }, { "epoch": 1.0896506262359922, "grad_norm": 0.3187515139579773, "learning_rate": 0.00019994335583335335, "loss": 0.1595, "step": 207 }, { "epoch": 1.094924192485168, "grad_norm": 0.3453561067581177, "learning_rate": 0.0001999229036240723, "loss": 0.1547, "step": 208 }, { "epoch": 1.1001977587343441, "grad_norm": 0.3260701894760132, "learning_rate": 0.00019989930665413147, "loss": 0.1426, "step": 209 }, { "epoch": 1.1054713249835202, "grad_norm": 0.3505662679672241, "learning_rate": 0.00019987256566617162, "loss": 0.1619, "step": 210 }, { "epoch": 1.110744891232696, "grad_norm": 0.32154926657676697, "learning_rate": 0.00019984268150178167, "loss": 0.1474, "step": 211 }, { "epoch": 1.1160184574818721, "grad_norm": 0.2730904817581177, "learning_rate": 0.00019980965510147213, "loss": 0.1307, "step": 212 }, { "epoch": 1.1212920237310482, "grad_norm": 0.3337661921977997, "learning_rate": 0.0001997734875046456, "loss": 0.1584, "step": 213 }, { "epoch": 1.126565589980224, "grad_norm": 0.3607318103313446, "learning_rate": 0.00019973417984956403, "loss": 0.1223, "step": 214 }, { "epoch": 1.1318391562294001, "grad_norm": 0.27768680453300476, "learning_rate": 0.0001996917333733128, "loss": 0.1209, "step": 215 }, { "epoch": 1.1371127224785762, "grad_norm": 0.2751491665840149, "learning_rate": 0.00019964614941176195, "loss": 0.1168, "step": 216 }, { "epoch": 1.142386288727752, "grad_norm": 0.3006565570831299, "learning_rate": 0.00019959742939952392, "loss": 0.1295, "step": 217 }, { "epoch": 1.1476598549769281, "grad_norm": 0.2547905743122101, "learning_rate": 0.00019954557486990868, "loss": 0.1247, "step": 218 }, { "epoch": 1.1529334212261042, "grad_norm": 0.25938180088996887, "learning_rate": 0.00019949058745487522, "loss": 0.1247, "step": 219 }, { "epoch": 1.15820698747528, "grad_norm": 0.3042941391468048, "learning_rate": 0.00019943246888498041, "loss": 0.1645, "step": 220 }, { "epoch": 1.1634805537244561, "grad_norm": 0.25871893763542175, "learning_rate": 0.00019937122098932428, "loss": 0.1431, "step": 221 }, { "epoch": 1.1687541199736322, "grad_norm": 0.24148327112197876, "learning_rate": 0.00019930684569549264, "loss": 0.1375, "step": 222 }, { "epoch": 1.174027686222808, "grad_norm": 0.25406157970428467, "learning_rate": 0.00019923934502949644, "loss": 0.1524, "step": 223 }, { "epoch": 1.1793012524719841, "grad_norm": 0.3008594512939453, "learning_rate": 0.00019916872111570784, "loss": 0.1353, "step": 224 }, { "epoch": 1.1845748187211602, "grad_norm": 0.2584022879600525, "learning_rate": 0.00019909497617679348, "loss": 0.1147, "step": 225 }, { "epoch": 1.189848384970336, "grad_norm": 0.2885512709617615, "learning_rate": 0.00019901811253364456, "loss": 0.1388, "step": 226 }, { "epoch": 1.1951219512195121, "grad_norm": 0.3085253834724426, "learning_rate": 0.00019893813260530368, "loss": 0.1278, "step": 227 }, { "epoch": 1.2003955174686882, "grad_norm": 0.23244811594486237, "learning_rate": 0.00019885503890888876, "loss": 0.1299, "step": 228 }, { "epoch": 1.2056690837178643, "grad_norm": 0.21688468754291534, "learning_rate": 0.00019876883405951377, "loss": 0.1145, "step": 229 }, { "epoch": 1.2109426499670402, "grad_norm": 0.2418506145477295, "learning_rate": 0.00019867952077020666, "loss": 0.1351, "step": 230 }, { "epoch": 1.2162162162162162, "grad_norm": 0.27453094720840454, "learning_rate": 0.0001985871018518236, "loss": 0.1222, "step": 231 }, { "epoch": 1.2214897824653923, "grad_norm": 0.20536746084690094, "learning_rate": 0.00019849158021296081, "loss": 0.1157, "step": 232 }, { "epoch": 1.2267633487145682, "grad_norm": 0.2276519536972046, "learning_rate": 0.00019839295885986296, "loss": 0.1266, "step": 233 }, { "epoch": 1.2320369149637442, "grad_norm": 0.2710774838924408, "learning_rate": 0.00019829124089632845, "loss": 0.1257, "step": 234 }, { "epoch": 1.2373104812129203, "grad_norm": 0.2697718143463135, "learning_rate": 0.00019818642952361187, "loss": 0.1304, "step": 235 }, { "epoch": 1.2425840474620962, "grad_norm": 0.21641883254051208, "learning_rate": 0.00019807852804032305, "loss": 0.1149, "step": 236 }, { "epoch": 1.2478576137112722, "grad_norm": 0.23116011917591095, "learning_rate": 0.00019796753984232358, "loss": 0.1115, "step": 237 }, { "epoch": 1.2531311799604483, "grad_norm": 0.23961959779262543, "learning_rate": 0.00019785346842261957, "loss": 0.1046, "step": 238 }, { "epoch": 1.2584047462096244, "grad_norm": 0.2854941487312317, "learning_rate": 0.00019773631737125192, "loss": 0.1289, "step": 239 }, { "epoch": 1.2636783124588002, "grad_norm": 0.2735542058944702, "learning_rate": 0.0001976160903751834, "loss": 0.1243, "step": 240 }, { "epoch": 1.2689518787079763, "grad_norm": 0.2876754105091095, "learning_rate": 0.00019749279121818235, "loss": 0.1712, "step": 241 }, { "epoch": 1.2742254449571524, "grad_norm": 0.21064290404319763, "learning_rate": 0.00019736642378070392, "loss": 0.1026, "step": 242 }, { "epoch": 1.2794990112063283, "grad_norm": 0.2385692000389099, "learning_rate": 0.00019723699203976766, "loss": 0.1132, "step": 243 }, { "epoch": 1.2847725774555043, "grad_norm": 0.2054402083158493, "learning_rate": 0.00019710450006883256, "loss": 0.1366, "step": 244 }, { "epoch": 1.2900461437046804, "grad_norm": 0.25641337037086487, "learning_rate": 0.0001969689520376687, "loss": 0.1401, "step": 245 }, { "epoch": 1.2953197099538563, "grad_norm": 0.21759799122810364, "learning_rate": 0.00019683035221222618, "loss": 0.1186, "step": 246 }, { "epoch": 1.3005932762030323, "grad_norm": 0.20061059296131134, "learning_rate": 0.00019668870495450066, "loss": 0.1008, "step": 247 }, { "epoch": 1.3058668424522084, "grad_norm": 0.22263573110103607, "learning_rate": 0.0001965440147223963, "loss": 0.1201, "step": 248 }, { "epoch": 1.3111404087013843, "grad_norm": 0.22843922674655914, "learning_rate": 0.00019639628606958533, "loss": 0.1115, "step": 249 }, { "epoch": 1.3164139749505603, "grad_norm": 0.20508253574371338, "learning_rate": 0.00019624552364536473, "loss": 0.1088, "step": 250 }, { "epoch": 1.3164139749505603, "eval_loss": 0.12800532579421997, "eval_runtime": 133.8769, "eval_samples_per_second": 8.0, "eval_steps_per_second": 2.002, "step": 250 }, { "epoch": 1.3216875411997364, "grad_norm": 0.2247203290462494, "learning_rate": 0.00019609173219450998, "loss": 0.1406, "step": 251 }, { "epoch": 1.3269611074489123, "grad_norm": 0.28306570649147034, "learning_rate": 0.0001959349165571256, "loss": 0.137, "step": 252 }, { "epoch": 1.3322346736980883, "grad_norm": 0.21649472415447235, "learning_rate": 0.00019577508166849304, "loss": 0.1043, "step": 253 }, { "epoch": 1.3375082399472644, "grad_norm": 0.23190827667713165, "learning_rate": 0.0001956122325589152, "loss": 0.1043, "step": 254 }, { "epoch": 1.3427818061964403, "grad_norm": 0.21395829319953918, "learning_rate": 0.00019544637435355808, "loss": 0.1118, "step": 255 }, { "epoch": 1.3480553724456164, "grad_norm": 0.20570361614227295, "learning_rate": 0.00019527751227228963, "loss": 0.1059, "step": 256 }, { "epoch": 1.3533289386947924, "grad_norm": 0.22916211187839508, "learning_rate": 0.00019510565162951537, "loss": 0.109, "step": 257 }, { "epoch": 1.3586025049439683, "grad_norm": 0.2180647999048233, "learning_rate": 0.00019493079783401113, "loss": 0.1272, "step": 258 }, { "epoch": 1.3638760711931444, "grad_norm": 0.19418495893478394, "learning_rate": 0.0001947529563887529, "loss": 0.1288, "step": 259 }, { "epoch": 1.3691496374423204, "grad_norm": 0.2715223431587219, "learning_rate": 0.00019457213289074355, "loss": 0.098, "step": 260 }, { "epoch": 1.3744232036914963, "grad_norm": 0.19249342381954193, "learning_rate": 0.00019438833303083678, "loss": 0.1185, "step": 261 }, { "epoch": 1.3796967699406724, "grad_norm": 0.1977251172065735, "learning_rate": 0.00019420156259355791, "loss": 0.1283, "step": 262 }, { "epoch": 1.3849703361898484, "grad_norm": 0.19867144525051117, "learning_rate": 0.0001940118274569219, "loss": 0.1132, "step": 263 }, { "epoch": 1.3902439024390243, "grad_norm": 0.17431101202964783, "learning_rate": 0.00019381913359224842, "loss": 0.0956, "step": 264 }, { "epoch": 1.3955174686882004, "grad_norm": 0.2786570191383362, "learning_rate": 0.00019362348706397373, "loss": 0.1041, "step": 265 }, { "epoch": 1.4007910349373764, "grad_norm": 0.2277083843946457, "learning_rate": 0.00019342489402945998, "loss": 0.1034, "step": 266 }, { "epoch": 1.4060646011865523, "grad_norm": 0.18951818346977234, "learning_rate": 0.00019322336073880142, "loss": 0.1243, "step": 267 }, { "epoch": 1.4113381674357284, "grad_norm": 0.18908710777759552, "learning_rate": 0.00019301889353462762, "loss": 0.1019, "step": 268 }, { "epoch": 1.4166117336849045, "grad_norm": 0.24964019656181335, "learning_rate": 0.0001928114988519039, "loss": 0.1315, "step": 269 }, { "epoch": 1.4218852999340803, "grad_norm": 0.22528688609600067, "learning_rate": 0.0001926011832177288, "loss": 0.0979, "step": 270 }, { "epoch": 1.4271588661832564, "grad_norm": 0.21379578113555908, "learning_rate": 0.0001923879532511287, "loss": 0.1064, "step": 271 }, { "epoch": 1.4324324324324325, "grad_norm": 0.21753202378749847, "learning_rate": 0.0001921718156628494, "loss": 0.0954, "step": 272 }, { "epoch": 1.4377059986816083, "grad_norm": 0.20682744681835175, "learning_rate": 0.0001919527772551451, "loss": 0.1055, "step": 273 }, { "epoch": 1.4429795649307844, "grad_norm": 0.19650743901729584, "learning_rate": 0.00019173084492156407, "loss": 0.1229, "step": 274 }, { "epoch": 1.4482531311799605, "grad_norm": 0.19758552312850952, "learning_rate": 0.00019150602564673198, "loss": 0.1019, "step": 275 }, { "epoch": 1.4535266974291363, "grad_norm": 0.20337599515914917, "learning_rate": 0.00019127832650613189, "loss": 0.0997, "step": 276 }, { "epoch": 1.4588002636783124, "grad_norm": 0.23217317461967468, "learning_rate": 0.00019104775466588161, "loss": 0.1211, "step": 277 }, { "epoch": 1.4640738299274885, "grad_norm": 0.20149654150009155, "learning_rate": 0.00019081431738250814, "loss": 0.0889, "step": 278 }, { "epoch": 1.4693473961766645, "grad_norm": 0.19859851896762848, "learning_rate": 0.00019057802200271942, "loss": 0.1133, "step": 279 }, { "epoch": 1.4746209624258404, "grad_norm": 0.2119692862033844, "learning_rate": 0.00019033887596317298, "loss": 0.1264, "step": 280 }, { "epoch": 1.4798945286750165, "grad_norm": 0.1985294073820114, "learning_rate": 0.0001900968867902419, "loss": 0.0941, "step": 281 }, { "epoch": 1.4851680949241926, "grad_norm": 0.22264046967029572, "learning_rate": 0.00018985206209977813, "loss": 0.1265, "step": 282 }, { "epoch": 1.4904416611733686, "grad_norm": 0.17052385210990906, "learning_rate": 0.00018960440959687254, "loss": 0.0947, "step": 283 }, { "epoch": 1.4957152274225445, "grad_norm": 0.17365668714046478, "learning_rate": 0.00018935393707561251, "loss": 0.1199, "step": 284 }, { "epoch": 1.5009887936717206, "grad_norm": 0.23060303926467896, "learning_rate": 0.0001891006524188368, "loss": 0.0909, "step": 285 }, { "epoch": 1.5062623599208966, "grad_norm": 0.18114161491394043, "learning_rate": 0.00018884456359788724, "loss": 0.1117, "step": 286 }, { "epoch": 1.5115359261700725, "grad_norm": 0.22013631463050842, "learning_rate": 0.000188585678672358, "loss": 0.1164, "step": 287 }, { "epoch": 1.5168094924192486, "grad_norm": 0.24089427292346954, "learning_rate": 0.00018832400578984183, "loss": 0.1177, "step": 288 }, { "epoch": 1.5220830586684246, "grad_norm": 0.17679591476917267, "learning_rate": 0.0001880595531856738, "loss": 0.107, "step": 289 }, { "epoch": 1.5273566249176005, "grad_norm": 0.15667003393173218, "learning_rate": 0.00018779232918267195, "loss": 0.1008, "step": 290 }, { "epoch": 1.5326301911667766, "grad_norm": 0.21019265055656433, "learning_rate": 0.00018752234219087538, "loss": 0.1291, "step": 291 }, { "epoch": 1.5379037574159526, "grad_norm": 0.1911863535642624, "learning_rate": 0.00018724960070727972, "loss": 0.1246, "step": 292 }, { "epoch": 1.5431773236651285, "grad_norm": 0.16309945285320282, "learning_rate": 0.00018697411331556956, "loss": 0.1063, "step": 293 }, { "epoch": 1.5484508899143046, "grad_norm": 0.15654757618904114, "learning_rate": 0.0001866958886858483, "loss": 0.1043, "step": 294 }, { "epoch": 1.5537244561634806, "grad_norm": 0.17349812388420105, "learning_rate": 0.0001864149355743655, "loss": 0.0799, "step": 295 }, { "epoch": 1.5589980224126565, "grad_norm": 0.19882531464099884, "learning_rate": 0.00018613126282324092, "loss": 0.0983, "step": 296 }, { "epoch": 1.5642715886618326, "grad_norm": 0.1695946753025055, "learning_rate": 0.00018584487936018661, "loss": 0.0947, "step": 297 }, { "epoch": 1.5695451549110087, "grad_norm": 0.2050606608390808, "learning_rate": 0.00018555579419822583, "loss": 0.1108, "step": 298 }, { "epoch": 1.5748187211601845, "grad_norm": 0.18069462478160858, "learning_rate": 0.00018526401643540922, "loss": 0.1137, "step": 299 }, { "epoch": 1.5800922874093606, "grad_norm": 0.2282589226961136, "learning_rate": 0.00018496955525452874, "loss": 0.1134, "step": 300 }, { "epoch": 1.5800922874093606, "eval_loss": 0.11345648020505905, "eval_runtime": 133.8392, "eval_samples_per_second": 8.002, "eval_steps_per_second": 2.002, "step": 300 } ], "logging_steps": 1, "max_steps": 760, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.968706055036672e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }