| { | |
| "best_global_step": 300, | |
| "best_metric": 0.11345648020505905, | |
| "best_model_checkpoint": "./qwen-math-lora/checkpoint-300", | |
| "epoch": 1.5800922874093606, | |
| "eval_steps": 50, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005273566249176005, | |
| "grad_norm": 1.5119972229003906, | |
| "learning_rate": 0.0, | |
| "loss": 1.6366, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.01054713249835201, | |
| "grad_norm": 1.4765245914459229, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.6667, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.015820698747528016, | |
| "grad_norm": 1.4236232042312622, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.6576, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.02109426499670402, | |
| "grad_norm": 1.5235203504562378, | |
| "learning_rate": 3e-06, | |
| "loss": 1.6572, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.026367831245880026, | |
| "grad_norm": 1.4115797281265259, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 1.5819, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.03164139749505603, | |
| "grad_norm": 1.4249836206436157, | |
| "learning_rate": 5e-06, | |
| "loss": 1.6502, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.03691496374423204, | |
| "grad_norm": 1.3071649074554443, | |
| "learning_rate": 6e-06, | |
| "loss": 1.6807, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.04218852999340804, | |
| "grad_norm": 1.2610119581222534, | |
| "learning_rate": 7.000000000000001e-06, | |
| "loss": 1.5825, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.047462096242584045, | |
| "grad_norm": 1.235508918762207, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.5464, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.05273566249176005, | |
| "grad_norm": 1.151235580444336, | |
| "learning_rate": 9e-06, | |
| "loss": 1.5856, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05800922874093606, | |
| "grad_norm": 1.058812141418457, | |
| "learning_rate": 1e-05, | |
| "loss": 1.542, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.06328279499011207, | |
| "grad_norm": 0.935869038105011, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 1.5216, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.06855636123928807, | |
| "grad_norm": 0.8530864715576172, | |
| "learning_rate": 1.2e-05, | |
| "loss": 1.4271, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.07382992748846408, | |
| "grad_norm": 0.836365282535553, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 1.4095, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.07910349373764008, | |
| "grad_norm": 0.7369374632835388, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 1.3664, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.08437705998681608, | |
| "grad_norm": 0.741736888885498, | |
| "learning_rate": 1.5e-05, | |
| "loss": 1.4031, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.08965062623599208, | |
| "grad_norm": 0.6538688540458679, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 1.3768, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.09492419248516809, | |
| "grad_norm": 0.6126262545585632, | |
| "learning_rate": 1.7000000000000003e-05, | |
| "loss": 1.3554, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.1001977587343441, | |
| "grad_norm": 0.5822679996490479, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.2992, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.1054713249835201, | |
| "grad_norm": 0.5410017967224121, | |
| "learning_rate": 1.9e-05, | |
| "loss": 1.2494, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11074489123269611, | |
| "grad_norm": 0.5416837334632874, | |
| "learning_rate": 2e-05, | |
| "loss": 1.2604, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.11601845748187212, | |
| "grad_norm": 0.5807645320892334, | |
| "learning_rate": 2.1e-05, | |
| "loss": 1.178, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.12129202373104812, | |
| "grad_norm": 0.5549229383468628, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 1.1861, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.12656558998022413, | |
| "grad_norm": 0.5763499736785889, | |
| "learning_rate": 2.3000000000000003e-05, | |
| "loss": 1.2242, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.13183915622940012, | |
| "grad_norm": 0.5674681663513184, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.1442, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.13711272247857614, | |
| "grad_norm": 0.5441560745239258, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.0821, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.14238628872775214, | |
| "grad_norm": 0.58034348487854, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 1.1068, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.14765985497692816, | |
| "grad_norm": 0.563574492931366, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 1.0316, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.15293342122610415, | |
| "grad_norm": 0.5922898054122925, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 1.0707, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.15820698747528017, | |
| "grad_norm": 0.46859392523765564, | |
| "learning_rate": 2.9e-05, | |
| "loss": 0.97, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.16348055372445616, | |
| "grad_norm": 0.7508406639099121, | |
| "learning_rate": 3e-05, | |
| "loss": 0.9857, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.16875411997363216, | |
| "grad_norm": 0.6806529760360718, | |
| "learning_rate": 3.1e-05, | |
| "loss": 0.9589, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.17402768622280818, | |
| "grad_norm": 0.35177281498908997, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.9319, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.17930125247198417, | |
| "grad_norm": 0.35340362787246704, | |
| "learning_rate": 3.3e-05, | |
| "loss": 0.9878, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.1845748187211602, | |
| "grad_norm": 0.3041383624076843, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 0.9501, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.18984838497033618, | |
| "grad_norm": 0.29335305094718933, | |
| "learning_rate": 3.5e-05, | |
| "loss": 0.8826, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.1951219512195122, | |
| "grad_norm": 0.2781873345375061, | |
| "learning_rate": 3.6e-05, | |
| "loss": 0.9757, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.2003955174686882, | |
| "grad_norm": 0.3608724772930145, | |
| "learning_rate": 3.7e-05, | |
| "loss": 0.9229, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.20566908371786422, | |
| "grad_norm": 0.2756713032722473, | |
| "learning_rate": 3.8e-05, | |
| "loss": 0.8868, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.2109426499670402, | |
| "grad_norm": 0.3764660060405731, | |
| "learning_rate": 3.9000000000000006e-05, | |
| "loss": 0.9301, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.21621621621621623, | |
| "grad_norm": 0.27100852131843567, | |
| "learning_rate": 4e-05, | |
| "loss": 0.9014, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.22148978246539222, | |
| "grad_norm": 0.27153897285461426, | |
| "learning_rate": 4.1e-05, | |
| "loss": 0.8569, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.2267633487145682, | |
| "grad_norm": 0.2656016945838928, | |
| "learning_rate": 4.2e-05, | |
| "loss": 0.8353, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.23203691496374423, | |
| "grad_norm": 0.30224132537841797, | |
| "learning_rate": 4.3e-05, | |
| "loss": 0.8531, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.23731048121292023, | |
| "grad_norm": 0.2992110252380371, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 0.9029, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.24258404746209625, | |
| "grad_norm": 0.2795073091983795, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.8582, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.24785761371127224, | |
| "grad_norm": 0.27543389797210693, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 0.7899, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.25313117996044826, | |
| "grad_norm": 0.26102226972579956, | |
| "learning_rate": 4.7e-05, | |
| "loss": 0.7705, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2584047462096243, | |
| "grad_norm": 0.32240045070648193, | |
| "learning_rate": 4.8e-05, | |
| "loss": 0.7833, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.26367831245880025, | |
| "grad_norm": 0.2760595679283142, | |
| "learning_rate": 4.9e-05, | |
| "loss": 0.8035, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.26367831245880025, | |
| "eval_loss": 0.8153137564659119, | |
| "eval_runtime": 133.8438, | |
| "eval_samples_per_second": 8.002, | |
| "eval_steps_per_second": 2.002, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.26895187870797627, | |
| "grad_norm": 0.29733768105506897, | |
| "learning_rate": 5e-05, | |
| "loss": 0.8767, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.2742254449571523, | |
| "grad_norm": 0.4476633667945862, | |
| "learning_rate": 5.1000000000000006e-05, | |
| "loss": 0.7695, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.2794990112063283, | |
| "grad_norm": 0.3744952380657196, | |
| "learning_rate": 5.2000000000000004e-05, | |
| "loss": 0.8464, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.28477257745550427, | |
| "grad_norm": 0.2564408779144287, | |
| "learning_rate": 5.300000000000001e-05, | |
| "loss": 0.8191, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.2900461437046803, | |
| "grad_norm": 0.2613051235675812, | |
| "learning_rate": 5.4000000000000005e-05, | |
| "loss": 0.7771, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.2953197099538563, | |
| "grad_norm": 0.4838894307613373, | |
| "learning_rate": 5.500000000000001e-05, | |
| "loss": 0.7751, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.3005932762030323, | |
| "grad_norm": 0.28951677680015564, | |
| "learning_rate": 5.6000000000000006e-05, | |
| "loss": 0.7704, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3058668424522083, | |
| "grad_norm": 0.2760978043079376, | |
| "learning_rate": 5.6999999999999996e-05, | |
| "loss": 0.8058, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.3111404087013843, | |
| "grad_norm": 0.2781215310096741, | |
| "learning_rate": 5.8e-05, | |
| "loss": 0.7634, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.31641397495056034, | |
| "grad_norm": 0.25308936834335327, | |
| "learning_rate": 5.9e-05, | |
| "loss": 0.7516, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.3216875411997363, | |
| "grad_norm": 0.3314322531223297, | |
| "learning_rate": 6e-05, | |
| "loss": 0.7492, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.3269611074489123, | |
| "grad_norm": 0.26924365758895874, | |
| "learning_rate": 6.1e-05, | |
| "loss": 0.7459, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.33223467369808835, | |
| "grad_norm": 0.26491013169288635, | |
| "learning_rate": 6.2e-05, | |
| "loss": 0.7338, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.3375082399472643, | |
| "grad_norm": 0.28656676411628723, | |
| "learning_rate": 6.3e-05, | |
| "loss": 0.7454, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.34278180619644033, | |
| "grad_norm": 0.3129251301288605, | |
| "learning_rate": 6.400000000000001e-05, | |
| "loss": 0.751, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.34805537244561635, | |
| "grad_norm": 0.3116537928581238, | |
| "learning_rate": 6.500000000000001e-05, | |
| "loss": 0.6941, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.35332893869479237, | |
| "grad_norm": 0.3021077513694763, | |
| "learning_rate": 6.6e-05, | |
| "loss": 0.7172, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.35860250494396834, | |
| "grad_norm": 0.2933245599269867, | |
| "learning_rate": 6.7e-05, | |
| "loss": 0.7293, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.36387607119314436, | |
| "grad_norm": 0.32778868079185486, | |
| "learning_rate": 6.800000000000001e-05, | |
| "loss": 0.6935, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.3691496374423204, | |
| "grad_norm": 0.286576509475708, | |
| "learning_rate": 6.9e-05, | |
| "loss": 0.6441, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3744232036914964, | |
| "grad_norm": 0.27806833386421204, | |
| "learning_rate": 7e-05, | |
| "loss": 0.7246, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.37969676994067236, | |
| "grad_norm": 0.31078678369522095, | |
| "learning_rate": 7.1e-05, | |
| "loss": 0.6322, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.3849703361898484, | |
| "grad_norm": 0.3146444261074066, | |
| "learning_rate": 7.2e-05, | |
| "loss": 0.6872, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.3902439024390244, | |
| "grad_norm": 0.3151572346687317, | |
| "learning_rate": 7.3e-05, | |
| "loss": 0.693, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.39551746868820037, | |
| "grad_norm": 0.33185523748397827, | |
| "learning_rate": 7.4e-05, | |
| "loss": 0.6937, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.4007910349373764, | |
| "grad_norm": 0.3287936747074127, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.7058, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.4060646011865524, | |
| "grad_norm": 0.34535712003707886, | |
| "learning_rate": 7.6e-05, | |
| "loss": 0.6538, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.41133816743572843, | |
| "grad_norm": 0.34255126118659973, | |
| "learning_rate": 7.7e-05, | |
| "loss": 0.674, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.4166117336849044, | |
| "grad_norm": 0.7276009321212769, | |
| "learning_rate": 7.800000000000001e-05, | |
| "loss": 0.6221, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.4218852999340804, | |
| "grad_norm": 0.41575613617897034, | |
| "learning_rate": 7.900000000000001e-05, | |
| "loss": 0.5828, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.42715886618325644, | |
| "grad_norm": 0.33262866735458374, | |
| "learning_rate": 8e-05, | |
| "loss": 0.5782, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.43243243243243246, | |
| "grad_norm": 0.33510202169418335, | |
| "learning_rate": 8.1e-05, | |
| "loss": 0.5731, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.4377059986816084, | |
| "grad_norm": 0.3654046058654785, | |
| "learning_rate": 8.2e-05, | |
| "loss": 0.5739, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.44297956493078444, | |
| "grad_norm": 0.3834691643714905, | |
| "learning_rate": 8.3e-05, | |
| "loss": 0.5629, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.44825313117996046, | |
| "grad_norm": 0.3804622292518616, | |
| "learning_rate": 8.4e-05, | |
| "loss": 0.5433, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.4535266974291364, | |
| "grad_norm": 0.3488738238811493, | |
| "learning_rate": 8.5e-05, | |
| "loss": 0.5517, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.45880026367831245, | |
| "grad_norm": 0.38344502449035645, | |
| "learning_rate": 8.6e-05, | |
| "loss": 0.576, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.46407382992748847, | |
| "grad_norm": 0.3855077624320984, | |
| "learning_rate": 8.7e-05, | |
| "loss": 0.527, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.4693473961766645, | |
| "grad_norm": 0.3912067711353302, | |
| "learning_rate": 8.800000000000001e-05, | |
| "loss": 0.5165, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.47462096242584045, | |
| "grad_norm": 0.4480763077735901, | |
| "learning_rate": 8.900000000000001e-05, | |
| "loss": 0.5103, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4798945286750165, | |
| "grad_norm": 0.4126266539096832, | |
| "learning_rate": 9e-05, | |
| "loss": 0.5246, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.4851680949241925, | |
| "grad_norm": 0.41678905487060547, | |
| "learning_rate": 9.1e-05, | |
| "loss": 0.55, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.4904416611733685, | |
| "grad_norm": 0.42350953817367554, | |
| "learning_rate": 9.200000000000001e-05, | |
| "loss": 0.493, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.4957152274225445, | |
| "grad_norm": 0.44608232378959656, | |
| "learning_rate": 9.300000000000001e-05, | |
| "loss": 0.5233, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.5009887936717206, | |
| "grad_norm": 0.44571366906166077, | |
| "learning_rate": 9.4e-05, | |
| "loss": 0.56, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.5062623599208965, | |
| "grad_norm": 0.44927364587783813, | |
| "learning_rate": 9.5e-05, | |
| "loss": 0.5191, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.5115359261700725, | |
| "grad_norm": 0.5781615376472473, | |
| "learning_rate": 9.6e-05, | |
| "loss": 0.5259, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.5168094924192486, | |
| "grad_norm": 0.4781758785247803, | |
| "learning_rate": 9.7e-05, | |
| "loss": 0.5061, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.5220830586684245, | |
| "grad_norm": 0.46505609154701233, | |
| "learning_rate": 9.8e-05, | |
| "loss": 0.4804, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.5273566249176005, | |
| "grad_norm": 0.7176192998886108, | |
| "learning_rate": 9.900000000000001e-05, | |
| "loss": 0.4287, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5273566249176005, | |
| "eval_loss": 0.4786125123500824, | |
| "eval_runtime": 133.8519, | |
| "eval_samples_per_second": 8.001, | |
| "eval_steps_per_second": 2.002, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5326301911667766, | |
| "grad_norm": 0.4295816421508789, | |
| "learning_rate": 0.0001, | |
| "loss": 0.4786, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.5379037574159525, | |
| "grad_norm": 0.47132614254951477, | |
| "learning_rate": 0.000101, | |
| "loss": 0.4703, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.5431773236651285, | |
| "grad_norm": 0.5543473958969116, | |
| "learning_rate": 0.00010200000000000001, | |
| "loss": 0.4401, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.5484508899143046, | |
| "grad_norm": 0.498334139585495, | |
| "learning_rate": 0.00010300000000000001, | |
| "loss": 0.4324, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.5537244561634805, | |
| "grad_norm": 0.49423035979270935, | |
| "learning_rate": 0.00010400000000000001, | |
| "loss": 0.4316, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.5589980224126566, | |
| "grad_norm": 5.340365409851074, | |
| "learning_rate": 0.000105, | |
| "loss": 0.4713, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.5642715886618326, | |
| "grad_norm": 0.5593706965446472, | |
| "learning_rate": 0.00010600000000000002, | |
| "loss": 0.4143, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.5695451549110085, | |
| "grad_norm": 0.4752410054206848, | |
| "learning_rate": 0.00010700000000000001, | |
| "loss": 0.419, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.5748187211601846, | |
| "grad_norm": 0.6359984278678894, | |
| "learning_rate": 0.00010800000000000001, | |
| "loss": 0.4151, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.5800922874093606, | |
| "grad_norm": 0.5052346587181091, | |
| "learning_rate": 0.000109, | |
| "loss": 0.3952, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.5853658536585366, | |
| "grad_norm": 0.49212637543678284, | |
| "learning_rate": 0.00011000000000000002, | |
| "loss": 0.4617, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.5906394199077126, | |
| "grad_norm": 0.5236564874649048, | |
| "learning_rate": 0.00011100000000000001, | |
| "loss": 0.4325, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.5959129861568886, | |
| "grad_norm": 0.6041468381881714, | |
| "learning_rate": 0.00011200000000000001, | |
| "loss": 0.4009, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.6011865524060646, | |
| "grad_norm": 0.5389513969421387, | |
| "learning_rate": 0.000113, | |
| "loss": 0.4263, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.6064601186552406, | |
| "grad_norm": 0.5749898552894592, | |
| "learning_rate": 0.00011399999999999999, | |
| "loss": 0.4105, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.6117336849044166, | |
| "grad_norm": 0.5574321150779724, | |
| "learning_rate": 0.00011499999999999999, | |
| "loss": 0.3967, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.6170072511535926, | |
| "grad_norm": 0.5891500115394592, | |
| "learning_rate": 0.000116, | |
| "loss": 0.3991, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.6222808174027686, | |
| "grad_norm": 0.5306826829910278, | |
| "learning_rate": 0.000117, | |
| "loss": 0.3726, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.6275543836519446, | |
| "grad_norm": 0.4786357581615448, | |
| "learning_rate": 0.000118, | |
| "loss": 0.3541, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.6328279499011207, | |
| "grad_norm": 0.47434163093566895, | |
| "learning_rate": 0.000119, | |
| "loss": 0.3471, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.6381015161502966, | |
| "grad_norm": 0.5113804340362549, | |
| "learning_rate": 0.00012, | |
| "loss": 0.3519, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.6433750823994726, | |
| "grad_norm": 0.5574295520782471, | |
| "learning_rate": 0.000121, | |
| "loss": 0.3591, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.6486486486486487, | |
| "grad_norm": 1.5176341533660889, | |
| "learning_rate": 0.000122, | |
| "loss": 0.331, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.6539222148978246, | |
| "grad_norm": 0.5883108377456665, | |
| "learning_rate": 0.000123, | |
| "loss": 0.3621, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.6591957811470006, | |
| "grad_norm": 0.5086923837661743, | |
| "learning_rate": 0.000124, | |
| "loss": 0.3719, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.6644693473961767, | |
| "grad_norm": 0.5057904124259949, | |
| "learning_rate": 0.000125, | |
| "loss": 0.3418, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.6697429136453527, | |
| "grad_norm": 0.5942703485488892, | |
| "learning_rate": 0.000126, | |
| "loss": 0.305, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.6750164798945286, | |
| "grad_norm": 0.4942289888858795, | |
| "learning_rate": 0.000127, | |
| "loss": 0.3509, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.6802900461437047, | |
| "grad_norm": 0.6494962573051453, | |
| "learning_rate": 0.00012800000000000002, | |
| "loss": 0.3425, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.6855636123928807, | |
| "grad_norm": 1.0529124736785889, | |
| "learning_rate": 0.00012900000000000002, | |
| "loss": 0.294, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.6908371786420567, | |
| "grad_norm": 0.6346781253814697, | |
| "learning_rate": 0.00013000000000000002, | |
| "loss": 0.325, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.6961107448912327, | |
| "grad_norm": 0.5200821161270142, | |
| "learning_rate": 0.000131, | |
| "loss": 0.3484, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.7013843111404087, | |
| "grad_norm": 0.49618640542030334, | |
| "learning_rate": 0.000132, | |
| "loss": 0.301, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.7066578773895847, | |
| "grad_norm": 0.4997330904006958, | |
| "learning_rate": 0.000133, | |
| "loss": 0.2953, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.7119314436387607, | |
| "grad_norm": 0.5263347625732422, | |
| "learning_rate": 0.000134, | |
| "loss": 0.2767, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.7172050098879367, | |
| "grad_norm": 0.560567319393158, | |
| "learning_rate": 0.00013500000000000003, | |
| "loss": 0.3286, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.7224785761371127, | |
| "grad_norm": 0.4766915738582611, | |
| "learning_rate": 0.00013600000000000003, | |
| "loss": 0.3108, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.7277521423862887, | |
| "grad_norm": 0.47753745317459106, | |
| "learning_rate": 0.00013700000000000002, | |
| "loss": 0.2282, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.7330257086354647, | |
| "grad_norm": 0.5010929107666016, | |
| "learning_rate": 0.000138, | |
| "loss": 0.2731, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.7382992748846408, | |
| "grad_norm": 0.5264869928359985, | |
| "learning_rate": 0.000139, | |
| "loss": 0.2598, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7435728411338167, | |
| "grad_norm": 0.47988757491111755, | |
| "learning_rate": 0.00014, | |
| "loss": 0.2637, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.7488464073829928, | |
| "grad_norm": 0.48291894793510437, | |
| "learning_rate": 0.000141, | |
| "loss": 0.2739, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.7541199736321688, | |
| "grad_norm": 0.5980640649795532, | |
| "learning_rate": 0.000142, | |
| "loss": 0.3233, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.7593935398813447, | |
| "grad_norm": 0.46733126044273376, | |
| "learning_rate": 0.000143, | |
| "loss": 0.2315, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.7646671061305208, | |
| "grad_norm": 0.4654427766799927, | |
| "learning_rate": 0.000144, | |
| "loss": 0.2479, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.7699406723796968, | |
| "grad_norm": 0.46202385425567627, | |
| "learning_rate": 0.000145, | |
| "loss": 0.3064, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.7752142386288727, | |
| "grad_norm": 0.47191861271858215, | |
| "learning_rate": 0.000146, | |
| "loss": 0.2139, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.7804878048780488, | |
| "grad_norm": 0.5178374648094177, | |
| "learning_rate": 0.000147, | |
| "loss": 0.2304, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.7857613711272248, | |
| "grad_norm": 0.3869185149669647, | |
| "learning_rate": 0.000148, | |
| "loss": 0.2772, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.7910349373764007, | |
| "grad_norm": 0.4422077238559723, | |
| "learning_rate": 0.00014900000000000002, | |
| "loss": 0.2469, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.7910349373764007, | |
| "eval_loss": 0.2504952847957611, | |
| "eval_runtime": 133.8524, | |
| "eval_samples_per_second": 8.001, | |
| "eval_steps_per_second": 2.002, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.7963085036255768, | |
| "grad_norm": 0.4492229223251343, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.2298, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.8015820698747528, | |
| "grad_norm": 0.5070360898971558, | |
| "learning_rate": 0.000151, | |
| "loss": 0.2389, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.8068556361239289, | |
| "grad_norm": 0.39493462443351746, | |
| "learning_rate": 0.000152, | |
| "loss": 0.206, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.8121292023731048, | |
| "grad_norm": 0.44301116466522217, | |
| "learning_rate": 0.000153, | |
| "loss": 0.2592, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.8174027686222808, | |
| "grad_norm": 0.4067859351634979, | |
| "learning_rate": 0.000154, | |
| "loss": 0.2242, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.8226763348714569, | |
| "grad_norm": 0.43918946385383606, | |
| "learning_rate": 0.000155, | |
| "loss": 0.2127, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.8279499011206328, | |
| "grad_norm": 0.5059219598770142, | |
| "learning_rate": 0.00015600000000000002, | |
| "loss": 0.2561, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.8332234673698088, | |
| "grad_norm": 0.4179636836051941, | |
| "learning_rate": 0.00015700000000000002, | |
| "loss": 0.201, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.8384970336189849, | |
| "grad_norm": 0.4800855219364166, | |
| "learning_rate": 0.00015800000000000002, | |
| "loss": 0.2486, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.8437705998681608, | |
| "grad_norm": 0.4267498254776001, | |
| "learning_rate": 0.00015900000000000002, | |
| "loss": 0.2054, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8490441661173368, | |
| "grad_norm": 0.4868602156639099, | |
| "learning_rate": 0.00016, | |
| "loss": 0.2206, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.8543177323665129, | |
| "grad_norm": 0.4100910723209381, | |
| "learning_rate": 0.000161, | |
| "loss": 0.2076, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.8595912986156888, | |
| "grad_norm": 0.3785172402858734, | |
| "learning_rate": 0.000162, | |
| "loss": 0.2175, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.8648648648648649, | |
| "grad_norm": 0.4334642291069031, | |
| "learning_rate": 0.000163, | |
| "loss": 0.1801, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.8701384311140409, | |
| "grad_norm": 0.3873803913593292, | |
| "learning_rate": 0.000164, | |
| "loss": 0.203, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.8754119973632168, | |
| "grad_norm": 0.43101224303245544, | |
| "learning_rate": 0.000165, | |
| "loss": 0.2021, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.8806855636123929, | |
| "grad_norm": 0.43550118803977966, | |
| "learning_rate": 0.000166, | |
| "loss": 0.2096, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.8859591298615689, | |
| "grad_norm": 0.44657325744628906, | |
| "learning_rate": 0.000167, | |
| "loss": 0.2052, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.8912326961107448, | |
| "grad_norm": 0.4124061167240143, | |
| "learning_rate": 0.000168, | |
| "loss": 0.2199, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.8965062623599209, | |
| "grad_norm": 0.5452592372894287, | |
| "learning_rate": 0.00016900000000000002, | |
| "loss": 0.2295, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.9017798286090969, | |
| "grad_norm": 0.41437071561813354, | |
| "learning_rate": 0.00017, | |
| "loss": 0.1891, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.9070533948582729, | |
| "grad_norm": 0.3778395354747772, | |
| "learning_rate": 0.000171, | |
| "loss": 0.2194, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.9123269611074489, | |
| "grad_norm": 0.37173032760620117, | |
| "learning_rate": 0.000172, | |
| "loss": 0.1594, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.9176005273566249, | |
| "grad_norm": 0.38124048709869385, | |
| "learning_rate": 0.000173, | |
| "loss": 0.1975, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.922874093605801, | |
| "grad_norm": 0.48111554980278015, | |
| "learning_rate": 0.000174, | |
| "loss": 0.2017, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.9281476598549769, | |
| "grad_norm": 0.44690003991127014, | |
| "learning_rate": 0.000175, | |
| "loss": 0.1859, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.9334212261041529, | |
| "grad_norm": 0.3716354966163635, | |
| "learning_rate": 0.00017600000000000002, | |
| "loss": 0.1964, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.938694792353329, | |
| "grad_norm": 0.36687999963760376, | |
| "learning_rate": 0.00017700000000000002, | |
| "loss": 0.1982, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.9439683586025049, | |
| "grad_norm": 0.4883500337600708, | |
| "learning_rate": 0.00017800000000000002, | |
| "loss": 0.2219, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.9492419248516809, | |
| "grad_norm": 0.33809033036231995, | |
| "learning_rate": 0.00017900000000000001, | |
| "loss": 0.1812, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.954515491100857, | |
| "grad_norm": 0.3546331524848938, | |
| "learning_rate": 0.00018, | |
| "loss": 0.1767, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.959789057350033, | |
| "grad_norm": 0.357530802488327, | |
| "learning_rate": 0.000181, | |
| "loss": 0.1823, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.9650626235992089, | |
| "grad_norm": 0.34756705164909363, | |
| "learning_rate": 0.000182, | |
| "loss": 0.2015, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.970336189848385, | |
| "grad_norm": 0.36489251255989075, | |
| "learning_rate": 0.000183, | |
| "loss": 0.1799, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.975609756097561, | |
| "grad_norm": 0.3720036745071411, | |
| "learning_rate": 0.00018400000000000003, | |
| "loss": 0.1852, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.980883322346737, | |
| "grad_norm": 0.3317737877368927, | |
| "learning_rate": 0.00018500000000000002, | |
| "loss": 0.1868, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.986156888595913, | |
| "grad_norm": 0.35604041814804077, | |
| "learning_rate": 0.00018600000000000002, | |
| "loss": 0.1787, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.991430454845089, | |
| "grad_norm": 0.37222427129745483, | |
| "learning_rate": 0.00018700000000000002, | |
| "loss": 0.1757, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.996704021094265, | |
| "grad_norm": 0.40133216977119446, | |
| "learning_rate": 0.000188, | |
| "loss": 0.1767, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.36636754870414734, | |
| "learning_rate": 0.00018899999999999999, | |
| "loss": 0.1186, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.005273566249176, | |
| "grad_norm": 0.3590473234653473, | |
| "learning_rate": 0.00019, | |
| "loss": 0.1585, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.010547132498352, | |
| "grad_norm": 0.3549407124519348, | |
| "learning_rate": 0.000191, | |
| "loss": 0.1512, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.015820698747528, | |
| "grad_norm": 0.3402779698371887, | |
| "learning_rate": 0.000192, | |
| "loss": 0.1317, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.021094264996704, | |
| "grad_norm": 0.27974751591682434, | |
| "learning_rate": 0.000193, | |
| "loss": 0.1492, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.02636783124588, | |
| "grad_norm": 0.2740594446659088, | |
| "learning_rate": 0.000194, | |
| "loss": 0.1614, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.031641397495056, | |
| "grad_norm": 0.3438091576099396, | |
| "learning_rate": 0.000195, | |
| "loss": 0.1639, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.036914963744232, | |
| "grad_norm": 0.35183268785476685, | |
| "learning_rate": 0.000196, | |
| "loss": 0.137, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.042188529993408, | |
| "grad_norm": 0.3638111352920532, | |
| "learning_rate": 0.00019700000000000002, | |
| "loss": 0.1896, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.047462096242584, | |
| "grad_norm": 0.3941810131072998, | |
| "learning_rate": 0.00019800000000000002, | |
| "loss": 0.1905, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.05273566249176, | |
| "grad_norm": 0.2802982032299042, | |
| "learning_rate": 0.000199, | |
| "loss": 0.1355, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.05273566249176, | |
| "eval_loss": 0.16099050641059875, | |
| "eval_runtime": 133.9383, | |
| "eval_samples_per_second": 7.996, | |
| "eval_steps_per_second": 2.001, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0580092287409362, | |
| "grad_norm": 0.33312973380088806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.1296, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.063282794990112, | |
| "grad_norm": 0.291446328163147, | |
| "learning_rate": 0.00019999842640648654, | |
| "loss": 0.1354, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.068556361239288, | |
| "grad_norm": 0.3492049276828766, | |
| "learning_rate": 0.00019999370567547008, | |
| "loss": 0.1609, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.0738299274884642, | |
| "grad_norm": 0.36389562487602234, | |
| "learning_rate": 0.00019998583795552083, | |
| "loss": 0.1234, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.07910349373764, | |
| "grad_norm": 0.3225807845592499, | |
| "learning_rate": 0.00019997482349425066, | |
| "loss": 0.142, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.084377059986816, | |
| "grad_norm": 0.3185547888278961, | |
| "learning_rate": 0.00019996066263830531, | |
| "loss": 0.1493, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.0896506262359922, | |
| "grad_norm": 0.3187515139579773, | |
| "learning_rate": 0.00019994335583335335, | |
| "loss": 0.1595, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.094924192485168, | |
| "grad_norm": 0.3453561067581177, | |
| "learning_rate": 0.0001999229036240723, | |
| "loss": 0.1547, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.1001977587343441, | |
| "grad_norm": 0.3260701894760132, | |
| "learning_rate": 0.00019989930665413147, | |
| "loss": 0.1426, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.1054713249835202, | |
| "grad_norm": 0.3505662679672241, | |
| "learning_rate": 0.00019987256566617162, | |
| "loss": 0.1619, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.110744891232696, | |
| "grad_norm": 0.32154926657676697, | |
| "learning_rate": 0.00019984268150178167, | |
| "loss": 0.1474, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.1160184574818721, | |
| "grad_norm": 0.2730904817581177, | |
| "learning_rate": 0.00019980965510147213, | |
| "loss": 0.1307, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.1212920237310482, | |
| "grad_norm": 0.3337661921977997, | |
| "learning_rate": 0.0001997734875046456, | |
| "loss": 0.1584, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.126565589980224, | |
| "grad_norm": 0.3607318103313446, | |
| "learning_rate": 0.00019973417984956403, | |
| "loss": 0.1223, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.1318391562294001, | |
| "grad_norm": 0.27768680453300476, | |
| "learning_rate": 0.0001996917333733128, | |
| "loss": 0.1209, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.1371127224785762, | |
| "grad_norm": 0.2751491665840149, | |
| "learning_rate": 0.00019964614941176195, | |
| "loss": 0.1168, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.142386288727752, | |
| "grad_norm": 0.3006565570831299, | |
| "learning_rate": 0.00019959742939952392, | |
| "loss": 0.1295, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.1476598549769281, | |
| "grad_norm": 0.2547905743122101, | |
| "learning_rate": 0.00019954557486990868, | |
| "loss": 0.1247, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.1529334212261042, | |
| "grad_norm": 0.25938180088996887, | |
| "learning_rate": 0.00019949058745487522, | |
| "loss": 0.1247, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.15820698747528, | |
| "grad_norm": 0.3042941391468048, | |
| "learning_rate": 0.00019943246888498041, | |
| "loss": 0.1645, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.1634805537244561, | |
| "grad_norm": 0.25871893763542175, | |
| "learning_rate": 0.00019937122098932428, | |
| "loss": 0.1431, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.1687541199736322, | |
| "grad_norm": 0.24148327112197876, | |
| "learning_rate": 0.00019930684569549264, | |
| "loss": 0.1375, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.174027686222808, | |
| "grad_norm": 0.25406157970428467, | |
| "learning_rate": 0.00019923934502949644, | |
| "loss": 0.1524, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.1793012524719841, | |
| "grad_norm": 0.3008594512939453, | |
| "learning_rate": 0.00019916872111570784, | |
| "loss": 0.1353, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.1845748187211602, | |
| "grad_norm": 0.2584022879600525, | |
| "learning_rate": 0.00019909497617679348, | |
| "loss": 0.1147, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.189848384970336, | |
| "grad_norm": 0.2885512709617615, | |
| "learning_rate": 0.00019901811253364456, | |
| "loss": 0.1388, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.1951219512195121, | |
| "grad_norm": 0.3085253834724426, | |
| "learning_rate": 0.00019893813260530368, | |
| "loss": 0.1278, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.2003955174686882, | |
| "grad_norm": 0.23244811594486237, | |
| "learning_rate": 0.00019885503890888876, | |
| "loss": 0.1299, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.2056690837178643, | |
| "grad_norm": 0.21688468754291534, | |
| "learning_rate": 0.00019876883405951377, | |
| "loss": 0.1145, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.2109426499670402, | |
| "grad_norm": 0.2418506145477295, | |
| "learning_rate": 0.00019867952077020666, | |
| "loss": 0.1351, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.2162162162162162, | |
| "grad_norm": 0.27453094720840454, | |
| "learning_rate": 0.0001985871018518236, | |
| "loss": 0.1222, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.2214897824653923, | |
| "grad_norm": 0.20536746084690094, | |
| "learning_rate": 0.00019849158021296081, | |
| "loss": 0.1157, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.2267633487145682, | |
| "grad_norm": 0.2276519536972046, | |
| "learning_rate": 0.00019839295885986296, | |
| "loss": 0.1266, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.2320369149637442, | |
| "grad_norm": 0.2710774838924408, | |
| "learning_rate": 0.00019829124089632845, | |
| "loss": 0.1257, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.2373104812129203, | |
| "grad_norm": 0.2697718143463135, | |
| "learning_rate": 0.00019818642952361187, | |
| "loss": 0.1304, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.2425840474620962, | |
| "grad_norm": 0.21641883254051208, | |
| "learning_rate": 0.00019807852804032305, | |
| "loss": 0.1149, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.2478576137112722, | |
| "grad_norm": 0.23116011917591095, | |
| "learning_rate": 0.00019796753984232358, | |
| "loss": 0.1115, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.2531311799604483, | |
| "grad_norm": 0.23961959779262543, | |
| "learning_rate": 0.00019785346842261957, | |
| "loss": 0.1046, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.2584047462096244, | |
| "grad_norm": 0.2854941487312317, | |
| "learning_rate": 0.00019773631737125192, | |
| "loss": 0.1289, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.2636783124588002, | |
| "grad_norm": 0.2735542058944702, | |
| "learning_rate": 0.0001976160903751834, | |
| "loss": 0.1243, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.2689518787079763, | |
| "grad_norm": 0.2876754105091095, | |
| "learning_rate": 0.00019749279121818235, | |
| "loss": 0.1712, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.2742254449571524, | |
| "grad_norm": 0.21064290404319763, | |
| "learning_rate": 0.00019736642378070392, | |
| "loss": 0.1026, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.2794990112063283, | |
| "grad_norm": 0.2385692000389099, | |
| "learning_rate": 0.00019723699203976766, | |
| "loss": 0.1132, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.2847725774555043, | |
| "grad_norm": 0.2054402083158493, | |
| "learning_rate": 0.00019710450006883256, | |
| "loss": 0.1366, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.2900461437046804, | |
| "grad_norm": 0.25641337037086487, | |
| "learning_rate": 0.0001969689520376687, | |
| "loss": 0.1401, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.2953197099538563, | |
| "grad_norm": 0.21759799122810364, | |
| "learning_rate": 0.00019683035221222618, | |
| "loss": 0.1186, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.3005932762030323, | |
| "grad_norm": 0.20061059296131134, | |
| "learning_rate": 0.00019668870495450066, | |
| "loss": 0.1008, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.3058668424522084, | |
| "grad_norm": 0.22263573110103607, | |
| "learning_rate": 0.0001965440147223963, | |
| "loss": 0.1201, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.3111404087013843, | |
| "grad_norm": 0.22843922674655914, | |
| "learning_rate": 0.00019639628606958533, | |
| "loss": 0.1115, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.3164139749505603, | |
| "grad_norm": 0.20508253574371338, | |
| "learning_rate": 0.00019624552364536473, | |
| "loss": 0.1088, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3164139749505603, | |
| "eval_loss": 0.12800532579421997, | |
| "eval_runtime": 133.8769, | |
| "eval_samples_per_second": 8.0, | |
| "eval_steps_per_second": 2.002, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3216875411997364, | |
| "grad_norm": 0.2247203290462494, | |
| "learning_rate": 0.00019609173219450998, | |
| "loss": 0.1406, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.3269611074489123, | |
| "grad_norm": 0.28306570649147034, | |
| "learning_rate": 0.0001959349165571256, | |
| "loss": 0.137, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.3322346736980883, | |
| "grad_norm": 0.21649472415447235, | |
| "learning_rate": 0.00019577508166849304, | |
| "loss": 0.1043, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.3375082399472644, | |
| "grad_norm": 0.23190827667713165, | |
| "learning_rate": 0.0001956122325589152, | |
| "loss": 0.1043, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.3427818061964403, | |
| "grad_norm": 0.21395829319953918, | |
| "learning_rate": 0.00019544637435355808, | |
| "loss": 0.1118, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.3480553724456164, | |
| "grad_norm": 0.20570361614227295, | |
| "learning_rate": 0.00019527751227228963, | |
| "loss": 0.1059, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.3533289386947924, | |
| "grad_norm": 0.22916211187839508, | |
| "learning_rate": 0.00019510565162951537, | |
| "loss": 0.109, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.3586025049439683, | |
| "grad_norm": 0.2180647999048233, | |
| "learning_rate": 0.00019493079783401113, | |
| "loss": 0.1272, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.3638760711931444, | |
| "grad_norm": 0.19418495893478394, | |
| "learning_rate": 0.0001947529563887529, | |
| "loss": 0.1288, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.3691496374423204, | |
| "grad_norm": 0.2715223431587219, | |
| "learning_rate": 0.00019457213289074355, | |
| "loss": 0.098, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.3744232036914963, | |
| "grad_norm": 0.19249342381954193, | |
| "learning_rate": 0.00019438833303083678, | |
| "loss": 0.1185, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.3796967699406724, | |
| "grad_norm": 0.1977251172065735, | |
| "learning_rate": 0.00019420156259355791, | |
| "loss": 0.1283, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.3849703361898484, | |
| "grad_norm": 0.19867144525051117, | |
| "learning_rate": 0.0001940118274569219, | |
| "loss": 0.1132, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.3902439024390243, | |
| "grad_norm": 0.17431101202964783, | |
| "learning_rate": 0.00019381913359224842, | |
| "loss": 0.0956, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.3955174686882004, | |
| "grad_norm": 0.2786570191383362, | |
| "learning_rate": 0.00019362348706397373, | |
| "loss": 0.1041, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.4007910349373764, | |
| "grad_norm": 0.2277083843946457, | |
| "learning_rate": 0.00019342489402945998, | |
| "loss": 0.1034, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.4060646011865523, | |
| "grad_norm": 0.18951818346977234, | |
| "learning_rate": 0.00019322336073880142, | |
| "loss": 0.1243, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.4113381674357284, | |
| "grad_norm": 0.18908710777759552, | |
| "learning_rate": 0.00019301889353462762, | |
| "loss": 0.1019, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.4166117336849045, | |
| "grad_norm": 0.24964019656181335, | |
| "learning_rate": 0.0001928114988519039, | |
| "loss": 0.1315, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.4218852999340803, | |
| "grad_norm": 0.22528688609600067, | |
| "learning_rate": 0.0001926011832177288, | |
| "loss": 0.0979, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.4271588661832564, | |
| "grad_norm": 0.21379578113555908, | |
| "learning_rate": 0.0001923879532511287, | |
| "loss": 0.1064, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.4324324324324325, | |
| "grad_norm": 0.21753202378749847, | |
| "learning_rate": 0.0001921718156628494, | |
| "loss": 0.0954, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.4377059986816083, | |
| "grad_norm": 0.20682744681835175, | |
| "learning_rate": 0.0001919527772551451, | |
| "loss": 0.1055, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.4429795649307844, | |
| "grad_norm": 0.19650743901729584, | |
| "learning_rate": 0.00019173084492156407, | |
| "loss": 0.1229, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.4482531311799605, | |
| "grad_norm": 0.19758552312850952, | |
| "learning_rate": 0.00019150602564673198, | |
| "loss": 0.1019, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.4535266974291363, | |
| "grad_norm": 0.20337599515914917, | |
| "learning_rate": 0.00019127832650613189, | |
| "loss": 0.0997, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.4588002636783124, | |
| "grad_norm": 0.23217317461967468, | |
| "learning_rate": 0.00019104775466588161, | |
| "loss": 0.1211, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.4640738299274885, | |
| "grad_norm": 0.20149654150009155, | |
| "learning_rate": 0.00019081431738250814, | |
| "loss": 0.0889, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.4693473961766645, | |
| "grad_norm": 0.19859851896762848, | |
| "learning_rate": 0.00019057802200271942, | |
| "loss": 0.1133, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.4746209624258404, | |
| "grad_norm": 0.2119692862033844, | |
| "learning_rate": 0.00019033887596317298, | |
| "loss": 0.1264, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.4798945286750165, | |
| "grad_norm": 0.1985294073820114, | |
| "learning_rate": 0.0001900968867902419, | |
| "loss": 0.0941, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.4851680949241926, | |
| "grad_norm": 0.22264046967029572, | |
| "learning_rate": 0.00018985206209977813, | |
| "loss": 0.1265, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.4904416611733686, | |
| "grad_norm": 0.17052385210990906, | |
| "learning_rate": 0.00018960440959687254, | |
| "loss": 0.0947, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.4957152274225445, | |
| "grad_norm": 0.17365668714046478, | |
| "learning_rate": 0.00018935393707561251, | |
| "loss": 0.1199, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.5009887936717206, | |
| "grad_norm": 0.23060303926467896, | |
| "learning_rate": 0.0001891006524188368, | |
| "loss": 0.0909, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.5062623599208966, | |
| "grad_norm": 0.18114161491394043, | |
| "learning_rate": 0.00018884456359788724, | |
| "loss": 0.1117, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.5115359261700725, | |
| "grad_norm": 0.22013631463050842, | |
| "learning_rate": 0.000188585678672358, | |
| "loss": 0.1164, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.5168094924192486, | |
| "grad_norm": 0.24089427292346954, | |
| "learning_rate": 0.00018832400578984183, | |
| "loss": 0.1177, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.5220830586684246, | |
| "grad_norm": 0.17679591476917267, | |
| "learning_rate": 0.0001880595531856738, | |
| "loss": 0.107, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.5273566249176005, | |
| "grad_norm": 0.15667003393173218, | |
| "learning_rate": 0.00018779232918267195, | |
| "loss": 0.1008, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.5326301911667766, | |
| "grad_norm": 0.21019265055656433, | |
| "learning_rate": 0.00018752234219087538, | |
| "loss": 0.1291, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.5379037574159526, | |
| "grad_norm": 0.1911863535642624, | |
| "learning_rate": 0.00018724960070727972, | |
| "loss": 0.1246, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.5431773236651285, | |
| "grad_norm": 0.16309945285320282, | |
| "learning_rate": 0.00018697411331556956, | |
| "loss": 0.1063, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.5484508899143046, | |
| "grad_norm": 0.15654757618904114, | |
| "learning_rate": 0.0001866958886858483, | |
| "loss": 0.1043, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.5537244561634806, | |
| "grad_norm": 0.17349812388420105, | |
| "learning_rate": 0.0001864149355743655, | |
| "loss": 0.0799, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.5589980224126565, | |
| "grad_norm": 0.19882531464099884, | |
| "learning_rate": 0.00018613126282324092, | |
| "loss": 0.0983, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.5642715886618326, | |
| "grad_norm": 0.1695946753025055, | |
| "learning_rate": 0.00018584487936018661, | |
| "loss": 0.0947, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.5695451549110087, | |
| "grad_norm": 0.2050606608390808, | |
| "learning_rate": 0.00018555579419822583, | |
| "loss": 0.1108, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.5748187211601845, | |
| "grad_norm": 0.18069462478160858, | |
| "learning_rate": 0.00018526401643540922, | |
| "loss": 0.1137, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.5800922874093606, | |
| "grad_norm": 0.2282589226961136, | |
| "learning_rate": 0.00018496955525452874, | |
| "loss": 0.1134, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.5800922874093606, | |
| "eval_loss": 0.11345648020505905, | |
| "eval_runtime": 133.8392, | |
| "eval_samples_per_second": 8.002, | |
| "eval_steps_per_second": 2.002, | |
| "step": 300 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 760, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.968706055036672e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |