| { |
| "best_global_step": 7434, |
| "best_metric": 0.1926165670156479, |
| "best_model_checkpoint": "saves/prompt-tuning/llama-3-8b-instruct/train_mrpc_1752826679/checkpoint-7434", |
| "epoch": 10.0, |
| "eval_steps": 413, |
| "global_step": 8260, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006053268765133172, |
| "grad_norm": 0.9306966662406921, |
| "learning_rate": 2.421307506053269e-07, |
| "loss": 9.4178, |
| "num_input_tokens_seen": 1920, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.012106537530266344, |
| "grad_norm": 0.9190129637718201, |
| "learning_rate": 5.447941888619855e-07, |
| "loss": 9.0931, |
| "num_input_tokens_seen": 3808, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.018159806295399514, |
| "grad_norm": 0.9833366870880127, |
| "learning_rate": 8.474576271186441e-07, |
| "loss": 8.9513, |
| "num_input_tokens_seen": 5856, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.024213075060532687, |
| "grad_norm": 0.872596800327301, |
| "learning_rate": 1.1501210653753028e-06, |
| "loss": 9.0331, |
| "num_input_tokens_seen": 7840, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.03026634382566586, |
| "grad_norm": 1.0374343395233154, |
| "learning_rate": 1.4527845036319614e-06, |
| "loss": 9.2719, |
| "num_input_tokens_seen": 10080, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.03631961259079903, |
| "grad_norm": 1.032057762145996, |
| "learning_rate": 1.7554479418886198e-06, |
| "loss": 8.9447, |
| "num_input_tokens_seen": 12064, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0423728813559322, |
| "grad_norm": 0.8499752879142761, |
| "learning_rate": 2.0581113801452785e-06, |
| "loss": 9.2155, |
| "num_input_tokens_seen": 14144, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.048426150121065374, |
| "grad_norm": 1.0143111944198608, |
| "learning_rate": 2.3607748184019373e-06, |
| "loss": 9.1714, |
| "num_input_tokens_seen": 16128, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.05447941888619855, |
| "grad_norm": 0.8975183367729187, |
| "learning_rate": 2.6634382566585957e-06, |
| "loss": 9.0606, |
| "num_input_tokens_seen": 18112, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.06053268765133172, |
| "grad_norm": 0.9107595682144165, |
| "learning_rate": 2.9661016949152545e-06, |
| "loss": 8.9885, |
| "num_input_tokens_seen": 20288, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06658595641646489, |
| "grad_norm": 0.8479609489440918, |
| "learning_rate": 3.268765133171913e-06, |
| "loss": 9.1961, |
| "num_input_tokens_seen": 22368, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.07263922518159806, |
| "grad_norm": 1.1509559154510498, |
| "learning_rate": 3.5714285714285714e-06, |
| "loss": 9.0768, |
| "num_input_tokens_seen": 24256, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.07869249394673124, |
| "grad_norm": 1.1028001308441162, |
| "learning_rate": 3.87409200968523e-06, |
| "loss": 9.0916, |
| "num_input_tokens_seen": 26464, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.0847457627118644, |
| "grad_norm": 0.898245632648468, |
| "learning_rate": 4.176755447941889e-06, |
| "loss": 9.4615, |
| "num_input_tokens_seen": 28736, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.09079903147699758, |
| "grad_norm": 0.8542805314064026, |
| "learning_rate": 4.479418886198548e-06, |
| "loss": 9.1008, |
| "num_input_tokens_seen": 30848, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.09685230024213075, |
| "grad_norm": 0.8152970671653748, |
| "learning_rate": 4.782082324455206e-06, |
| "loss": 8.9621, |
| "num_input_tokens_seen": 32992, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.10290556900726393, |
| "grad_norm": 0.875648021697998, |
| "learning_rate": 5.084745762711865e-06, |
| "loss": 9.1143, |
| "num_input_tokens_seen": 34976, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.1089588377723971, |
| "grad_norm": 1.0231714248657227, |
| "learning_rate": 5.3874092009685235e-06, |
| "loss": 9.1063, |
| "num_input_tokens_seen": 36896, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.11501210653753027, |
| "grad_norm": 0.9986609816551208, |
| "learning_rate": 5.6900726392251815e-06, |
| "loss": 8.7926, |
| "num_input_tokens_seen": 38848, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.12106537530266344, |
| "grad_norm": 1.7170308828353882, |
| "learning_rate": 5.99273607748184e-06, |
| "loss": 9.388, |
| "num_input_tokens_seen": 40960, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1271186440677966, |
| "grad_norm": 0.9397395849227905, |
| "learning_rate": 6.295399515738499e-06, |
| "loss": 9.0194, |
| "num_input_tokens_seen": 43008, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.13317191283292978, |
| "grad_norm": 0.8648118376731873, |
| "learning_rate": 6.598062953995157e-06, |
| "loss": 9.3715, |
| "num_input_tokens_seen": 44992, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.13922518159806296, |
| "grad_norm": 0.9399133920669556, |
| "learning_rate": 6.900726392251816e-06, |
| "loss": 9.0017, |
| "num_input_tokens_seen": 47072, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.14527845036319612, |
| "grad_norm": 1.1335257291793823, |
| "learning_rate": 7.203389830508475e-06, |
| "loss": 9.175, |
| "num_input_tokens_seen": 49152, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1513317191283293, |
| "grad_norm": 0.99947589635849, |
| "learning_rate": 7.5060532687651345e-06, |
| "loss": 9.1888, |
| "num_input_tokens_seen": 51232, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.15738498789346247, |
| "grad_norm": 1.170379877090454, |
| "learning_rate": 7.808716707021792e-06, |
| "loss": 8.8822, |
| "num_input_tokens_seen": 53312, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.16343825665859565, |
| "grad_norm": 0.7438183426856995, |
| "learning_rate": 8.111380145278451e-06, |
| "loss": 9.206, |
| "num_input_tokens_seen": 55520, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.1694915254237288, |
| "grad_norm": 1.0866670608520508, |
| "learning_rate": 8.41404358353511e-06, |
| "loss": 8.9543, |
| "num_input_tokens_seen": 57440, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.17554479418886199, |
| "grad_norm": 1.0499428510665894, |
| "learning_rate": 8.716707021791767e-06, |
| "loss": 9.0318, |
| "num_input_tokens_seen": 59584, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.18159806295399517, |
| "grad_norm": 1.2830836772918701, |
| "learning_rate": 9.019370460048427e-06, |
| "loss": 8.9027, |
| "num_input_tokens_seen": 61760, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.18765133171912832, |
| "grad_norm": 0.9168046116828918, |
| "learning_rate": 9.322033898305085e-06, |
| "loss": 8.9873, |
| "num_input_tokens_seen": 63744, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.1937046004842615, |
| "grad_norm": 0.9422778487205505, |
| "learning_rate": 9.624697336561745e-06, |
| "loss": 8.6118, |
| "num_input_tokens_seen": 65856, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.19975786924939468, |
| "grad_norm": 0.9154998064041138, |
| "learning_rate": 9.927360774818403e-06, |
| "loss": 9.1733, |
| "num_input_tokens_seen": 67840, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.20581113801452786, |
| "grad_norm": 0.8919447660446167, |
| "learning_rate": 1.023002421307506e-05, |
| "loss": 8.8098, |
| "num_input_tokens_seen": 69920, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.211864406779661, |
| "grad_norm": 0.9792394638061523, |
| "learning_rate": 1.053268765133172e-05, |
| "loss": 8.8662, |
| "num_input_tokens_seen": 72032, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.2179176755447942, |
| "grad_norm": 0.8393341898918152, |
| "learning_rate": 1.0835351089588378e-05, |
| "loss": 8.6951, |
| "num_input_tokens_seen": 74112, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.22397094430992737, |
| "grad_norm": 1.0700838565826416, |
| "learning_rate": 1.1138014527845036e-05, |
| "loss": 8.8696, |
| "num_input_tokens_seen": 76288, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.23002421307506055, |
| "grad_norm": 0.9679522514343262, |
| "learning_rate": 1.1440677966101696e-05, |
| "loss": 8.539, |
| "num_input_tokens_seen": 78496, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.2360774818401937, |
| "grad_norm": 0.9779477119445801, |
| "learning_rate": 1.1743341404358354e-05, |
| "loss": 8.6739, |
| "num_input_tokens_seen": 80416, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.24213075060532688, |
| "grad_norm": 1.2384376525878906, |
| "learning_rate": 1.2046004842615012e-05, |
| "loss": 8.4287, |
| "num_input_tokens_seen": 82432, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.24818401937046006, |
| "grad_norm": 1.2574758529663086, |
| "learning_rate": 1.2348668280871672e-05, |
| "loss": 8.535, |
| "num_input_tokens_seen": 84416, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.2542372881355932, |
| "grad_norm": 1.1402815580368042, |
| "learning_rate": 1.2651331719128328e-05, |
| "loss": 8.5997, |
| "num_input_tokens_seen": 86336, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2602905569007264, |
| "grad_norm": 0.937260627746582, |
| "learning_rate": 1.2953995157384988e-05, |
| "loss": 8.9429, |
| "num_input_tokens_seen": 88352, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.26634382566585957, |
| "grad_norm": 1.2747514247894287, |
| "learning_rate": 1.3256658595641647e-05, |
| "loss": 8.6613, |
| "num_input_tokens_seen": 90432, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.27239709443099275, |
| "grad_norm": 1.070163369178772, |
| "learning_rate": 1.3559322033898305e-05, |
| "loss": 8.5556, |
| "num_input_tokens_seen": 92512, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.2784503631961259, |
| "grad_norm": 1.2982138395309448, |
| "learning_rate": 1.3861985472154965e-05, |
| "loss": 8.4637, |
| "num_input_tokens_seen": 94528, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.2845036319612591, |
| "grad_norm": 0.9148821234703064, |
| "learning_rate": 1.4164648910411623e-05, |
| "loss": 8.5035, |
| "num_input_tokens_seen": 96576, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.29055690072639223, |
| "grad_norm": 1.0620417594909668, |
| "learning_rate": 1.4467312348668283e-05, |
| "loss": 8.5325, |
| "num_input_tokens_seen": 98624, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2966101694915254, |
| "grad_norm": 1.2041453123092651, |
| "learning_rate": 1.4769975786924939e-05, |
| "loss": 8.6525, |
| "num_input_tokens_seen": 100832, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.3026634382566586, |
| "grad_norm": 1.011583685874939, |
| "learning_rate": 1.5072639225181599e-05, |
| "loss": 8.7208, |
| "num_input_tokens_seen": 102784, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.30871670702179177, |
| "grad_norm": 1.1622391939163208, |
| "learning_rate": 1.5375302663438258e-05, |
| "loss": 8.0941, |
| "num_input_tokens_seen": 104800, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.31476997578692495, |
| "grad_norm": 0.8897132277488708, |
| "learning_rate": 1.5677966101694916e-05, |
| "loss": 8.5926, |
| "num_input_tokens_seen": 106880, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.32082324455205813, |
| "grad_norm": 1.2517374753952026, |
| "learning_rate": 1.5980629539951574e-05, |
| "loss": 8.7554, |
| "num_input_tokens_seen": 108960, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.3268765133171913, |
| "grad_norm": 1.0453944206237793, |
| "learning_rate": 1.6283292978208232e-05, |
| "loss": 8.0182, |
| "num_input_tokens_seen": 110944, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.33292978208232443, |
| "grad_norm": 1.1679898500442505, |
| "learning_rate": 1.6585956416464894e-05, |
| "loss": 8.2963, |
| "num_input_tokens_seen": 113088, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.3389830508474576, |
| "grad_norm": 2.013692617416382, |
| "learning_rate": 1.6888619854721548e-05, |
| "loss": 8.4073, |
| "num_input_tokens_seen": 115072, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.3450363196125908, |
| "grad_norm": 0.9799766540527344, |
| "learning_rate": 1.719128329297821e-05, |
| "loss": 8.3549, |
| "num_input_tokens_seen": 117280, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.35108958837772397, |
| "grad_norm": 1.1790446043014526, |
| "learning_rate": 1.7493946731234868e-05, |
| "loss": 8.2065, |
| "num_input_tokens_seen": 119456, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.35714285714285715, |
| "grad_norm": 1.1667110919952393, |
| "learning_rate": 1.7796610169491526e-05, |
| "loss": 8.1662, |
| "num_input_tokens_seen": 121472, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.36319612590799033, |
| "grad_norm": 1.7007219791412354, |
| "learning_rate": 1.8099273607748184e-05, |
| "loss": 7.9676, |
| "num_input_tokens_seen": 123648, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3692493946731235, |
| "grad_norm": 0.9762911200523376, |
| "learning_rate": 1.8401937046004845e-05, |
| "loss": 8.2834, |
| "num_input_tokens_seen": 125632, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.37530266343825663, |
| "grad_norm": 1.1433628797531128, |
| "learning_rate": 1.8704600484261503e-05, |
| "loss": 8.2186, |
| "num_input_tokens_seen": 127680, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3813559322033898, |
| "grad_norm": 1.6947895288467407, |
| "learning_rate": 1.900726392251816e-05, |
| "loss": 8.0444, |
| "num_input_tokens_seen": 129856, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.387409200968523, |
| "grad_norm": 1.220507025718689, |
| "learning_rate": 1.930992736077482e-05, |
| "loss": 7.8954, |
| "num_input_tokens_seen": 131872, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.3934624697336562, |
| "grad_norm": 1.1875652074813843, |
| "learning_rate": 1.9612590799031477e-05, |
| "loss": 8.475, |
| "num_input_tokens_seen": 133984, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.39951573849878935, |
| "grad_norm": 0.9924948811531067, |
| "learning_rate": 1.9915254237288135e-05, |
| "loss": 7.9364, |
| "num_input_tokens_seen": 136128, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.40556900726392253, |
| "grad_norm": 1.0750553607940674, |
| "learning_rate": 2.0217917675544796e-05, |
| "loss": 7.9083, |
| "num_input_tokens_seen": 138208, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.4116222760290557, |
| "grad_norm": 1.6533371210098267, |
| "learning_rate": 2.0520581113801454e-05, |
| "loss": 7.8747, |
| "num_input_tokens_seen": 140288, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.41767554479418884, |
| "grad_norm": 1.074575662612915, |
| "learning_rate": 2.0823244552058112e-05, |
| "loss": 7.8869, |
| "num_input_tokens_seen": 142336, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.423728813559322, |
| "grad_norm": 1.265809178352356, |
| "learning_rate": 2.1125907990314774e-05, |
| "loss": 7.6201, |
| "num_input_tokens_seen": 144448, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.4297820823244552, |
| "grad_norm": 1.2359554767608643, |
| "learning_rate": 2.1428571428571428e-05, |
| "loss": 8.0956, |
| "num_input_tokens_seen": 146464, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.4358353510895884, |
| "grad_norm": 1.045859456062317, |
| "learning_rate": 2.1731234866828086e-05, |
| "loss": 7.6519, |
| "num_input_tokens_seen": 148512, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.44188861985472155, |
| "grad_norm": 0.9650263786315918, |
| "learning_rate": 2.2033898305084748e-05, |
| "loss": 7.7015, |
| "num_input_tokens_seen": 150496, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.44794188861985473, |
| "grad_norm": 1.254760980606079, |
| "learning_rate": 2.2336561743341405e-05, |
| "loss": 7.6647, |
| "num_input_tokens_seen": 152608, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.4539951573849879, |
| "grad_norm": 1.2025991678237915, |
| "learning_rate": 2.2639225181598063e-05, |
| "loss": 7.5153, |
| "num_input_tokens_seen": 154624, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.4600484261501211, |
| "grad_norm": 1.1741917133331299, |
| "learning_rate": 2.2941888619854725e-05, |
| "loss": 7.7002, |
| "num_input_tokens_seen": 156608, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.4661016949152542, |
| "grad_norm": 1.2749474048614502, |
| "learning_rate": 2.3244552058111383e-05, |
| "loss": 7.6395, |
| "num_input_tokens_seen": 158560, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.4721549636803874, |
| "grad_norm": 1.0503923892974854, |
| "learning_rate": 2.3547215496368037e-05, |
| "loss": 7.5312, |
| "num_input_tokens_seen": 160640, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.4782082324455206, |
| "grad_norm": 1.223946452140808, |
| "learning_rate": 2.38498789346247e-05, |
| "loss": 7.4305, |
| "num_input_tokens_seen": 162784, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.48426150121065376, |
| "grad_norm": 1.1325708627700806, |
| "learning_rate": 2.4152542372881357e-05, |
| "loss": 7.6007, |
| "num_input_tokens_seen": 164832, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.49031476997578693, |
| "grad_norm": 1.2484787702560425, |
| "learning_rate": 2.4455205811138015e-05, |
| "loss": 7.2238, |
| "num_input_tokens_seen": 166880, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.4963680387409201, |
| "grad_norm": 1.2033966779708862, |
| "learning_rate": 2.4757869249394676e-05, |
| "loss": 7.0586, |
| "num_input_tokens_seen": 169088, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 6.989433765411377, |
| "eval_runtime": 7.6115, |
| "eval_samples_per_second": 48.217, |
| "eval_steps_per_second": 12.087, |
| "num_input_tokens_seen": 170336, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.5024213075060533, |
| "grad_norm": 1.1991325616836548, |
| "learning_rate": 2.5060532687651334e-05, |
| "loss": 7.0567, |
| "num_input_tokens_seen": 171232, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.5084745762711864, |
| "grad_norm": 1.1006559133529663, |
| "learning_rate": 2.536319612590799e-05, |
| "loss": 7.4312, |
| "num_input_tokens_seen": 173376, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.5145278450363197, |
| "grad_norm": 1.2216989994049072, |
| "learning_rate": 2.566585956416465e-05, |
| "loss": 6.3815, |
| "num_input_tokens_seen": 175424, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.5205811138014528, |
| "grad_norm": 1.2695162296295166, |
| "learning_rate": 2.5968523002421308e-05, |
| "loss": 7.0186, |
| "num_input_tokens_seen": 177664, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.5266343825665859, |
| "grad_norm": 1.5784332752227783, |
| "learning_rate": 2.627118644067797e-05, |
| "loss": 6.4012, |
| "num_input_tokens_seen": 179648, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.5326876513317191, |
| "grad_norm": 1.2797585725784302, |
| "learning_rate": 2.6573849878934624e-05, |
| "loss": 6.9028, |
| "num_input_tokens_seen": 181664, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.5387409200968523, |
| "grad_norm": 1.2088321447372437, |
| "learning_rate": 2.6876513317191282e-05, |
| "loss": 7.0006, |
| "num_input_tokens_seen": 183840, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.5447941888619855, |
| "grad_norm": 1.2397788763046265, |
| "learning_rate": 2.7179176755447943e-05, |
| "loss": 6.7666, |
| "num_input_tokens_seen": 185696, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5508474576271186, |
| "grad_norm": 1.3759512901306152, |
| "learning_rate": 2.74818401937046e-05, |
| "loss": 6.6979, |
| "num_input_tokens_seen": 187712, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.5569007263922519, |
| "grad_norm": 1.2864426374435425, |
| "learning_rate": 2.7784503631961263e-05, |
| "loss": 6.9788, |
| "num_input_tokens_seen": 189728, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.562953995157385, |
| "grad_norm": 1.124521017074585, |
| "learning_rate": 2.8087167070217917e-05, |
| "loss": 6.0966, |
| "num_input_tokens_seen": 191744, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.5690072639225182, |
| "grad_norm": 1.215418815612793, |
| "learning_rate": 2.838983050847458e-05, |
| "loss": 6.1199, |
| "num_input_tokens_seen": 193888, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5750605326876513, |
| "grad_norm": 1.4469285011291504, |
| "learning_rate": 2.8692493946731237e-05, |
| "loss": 6.437, |
| "num_input_tokens_seen": 195904, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.5811138014527845, |
| "grad_norm": 1.1506513357162476, |
| "learning_rate": 2.899515738498789e-05, |
| "loss": 6.211, |
| "num_input_tokens_seen": 197952, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5871670702179177, |
| "grad_norm": 1.3723676204681396, |
| "learning_rate": 2.9297820823244553e-05, |
| "loss": 6.1162, |
| "num_input_tokens_seen": 199968, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.5932203389830508, |
| "grad_norm": 1.255276083946228, |
| "learning_rate": 2.960048426150121e-05, |
| "loss": 5.9268, |
| "num_input_tokens_seen": 201984, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5992736077481841, |
| "grad_norm": 1.320577621459961, |
| "learning_rate": 2.9903147699757872e-05, |
| "loss": 5.7367, |
| "num_input_tokens_seen": 204000, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.6053268765133172, |
| "grad_norm": 1.3590911626815796, |
| "learning_rate": 3.0205811138014527e-05, |
| "loss": 5.9233, |
| "num_input_tokens_seen": 206080, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.6113801452784504, |
| "grad_norm": 1.168292760848999, |
| "learning_rate": 3.050847457627119e-05, |
| "loss": 6.1837, |
| "num_input_tokens_seen": 208000, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.6174334140435835, |
| "grad_norm": 1.3557924032211304, |
| "learning_rate": 3.0811138014527846e-05, |
| "loss": 6.3513, |
| "num_input_tokens_seen": 210208, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.6234866828087167, |
| "grad_norm": 1.2354793548583984, |
| "learning_rate": 3.111380145278451e-05, |
| "loss": 5.8031, |
| "num_input_tokens_seen": 212288, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.6295399515738499, |
| "grad_norm": 1.4400395154953003, |
| "learning_rate": 3.141646489104116e-05, |
| "loss": 5.6595, |
| "num_input_tokens_seen": 214400, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.635593220338983, |
| "grad_norm": 1.2808489799499512, |
| "learning_rate": 3.1719128329297823e-05, |
| "loss": 4.9563, |
| "num_input_tokens_seen": 216512, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.6416464891041163, |
| "grad_norm": 1.2228987216949463, |
| "learning_rate": 3.2021791767554485e-05, |
| "loss": 6.0747, |
| "num_input_tokens_seen": 218496, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.6476997578692494, |
| "grad_norm": 1.248143196105957, |
| "learning_rate": 3.232445520581114e-05, |
| "loss": 5.2172, |
| "num_input_tokens_seen": 220512, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.6537530266343826, |
| "grad_norm": 1.1083874702453613, |
| "learning_rate": 3.26271186440678e-05, |
| "loss": 5.5222, |
| "num_input_tokens_seen": 222560, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.6598062953995157, |
| "grad_norm": 1.1539931297302246, |
| "learning_rate": 3.2929782082324455e-05, |
| "loss": 5.2704, |
| "num_input_tokens_seen": 224480, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.6658595641646489, |
| "grad_norm": 1.0974218845367432, |
| "learning_rate": 3.323244552058112e-05, |
| "loss": 5.5403, |
| "num_input_tokens_seen": 226560, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.6719128329297821, |
| "grad_norm": 1.1053953170776367, |
| "learning_rate": 3.353510895883777e-05, |
| "loss": 5.2455, |
| "num_input_tokens_seen": 228512, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.6779661016949152, |
| "grad_norm": 1.1519495248794556, |
| "learning_rate": 3.383777239709443e-05, |
| "loss": 5.0985, |
| "num_input_tokens_seen": 230528, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.6840193704600485, |
| "grad_norm": 1.394205093383789, |
| "learning_rate": 3.4140435835351094e-05, |
| "loss": 5.0764, |
| "num_input_tokens_seen": 232768, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.6900726392251816, |
| "grad_norm": 1.074160099029541, |
| "learning_rate": 3.444309927360775e-05, |
| "loss": 4.8116, |
| "num_input_tokens_seen": 234784, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6961259079903148, |
| "grad_norm": 0.8987542390823364, |
| "learning_rate": 3.474576271186441e-05, |
| "loss": 4.9556, |
| "num_input_tokens_seen": 236832, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.7021791767554479, |
| "grad_norm": 0.9763084053993225, |
| "learning_rate": 3.5048426150121065e-05, |
| "loss": 4.9298, |
| "num_input_tokens_seen": 238784, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.7082324455205811, |
| "grad_norm": 1.0321810245513916, |
| "learning_rate": 3.5351089588377726e-05, |
| "loss": 5.1537, |
| "num_input_tokens_seen": 240768, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 1.2082122564315796, |
| "learning_rate": 3.565375302663439e-05, |
| "loss": 4.6048, |
| "num_input_tokens_seen": 242848, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.7203389830508474, |
| "grad_norm": 1.3089044094085693, |
| "learning_rate": 3.595641646489104e-05, |
| "loss": 4.1533, |
| "num_input_tokens_seen": 244992, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.7263922518159807, |
| "grad_norm": 0.9767692685127258, |
| "learning_rate": 3.62590799031477e-05, |
| "loss": 4.5364, |
| "num_input_tokens_seen": 247040, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.7324455205811138, |
| "grad_norm": 1.4216803312301636, |
| "learning_rate": 3.656174334140436e-05, |
| "loss": 4.4169, |
| "num_input_tokens_seen": 249184, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.738498789346247, |
| "grad_norm": 1.5430711507797241, |
| "learning_rate": 3.686440677966102e-05, |
| "loss": 4.6772, |
| "num_input_tokens_seen": 251104, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.7445520581113801, |
| "grad_norm": 1.3631322383880615, |
| "learning_rate": 3.7167070217917674e-05, |
| "loss": 4.414, |
| "num_input_tokens_seen": 253056, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.7506053268765133, |
| "grad_norm": 0.8792169690132141, |
| "learning_rate": 3.7469733656174335e-05, |
| "loss": 4.6464, |
| "num_input_tokens_seen": 255008, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.7566585956416465, |
| "grad_norm": 1.0657660961151123, |
| "learning_rate": 3.7772397094431e-05, |
| "loss": 4.353, |
| "num_input_tokens_seen": 257120, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.7627118644067796, |
| "grad_norm": 0.9193087220191956, |
| "learning_rate": 3.807506053268765e-05, |
| "loss": 3.9445, |
| "num_input_tokens_seen": 259168, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.7687651331719129, |
| "grad_norm": 0.841438889503479, |
| "learning_rate": 3.837772397094431e-05, |
| "loss": 4.2762, |
| "num_input_tokens_seen": 261216, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.774818401937046, |
| "grad_norm": 0.747069239616394, |
| "learning_rate": 3.868038740920097e-05, |
| "loss": 4.005, |
| "num_input_tokens_seen": 263136, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.7808716707021792, |
| "grad_norm": 0.7466037273406982, |
| "learning_rate": 3.898305084745763e-05, |
| "loss": 4.0149, |
| "num_input_tokens_seen": 265184, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.7869249394673123, |
| "grad_norm": 1.3515541553497314, |
| "learning_rate": 3.928571428571429e-05, |
| "loss": 4.3661, |
| "num_input_tokens_seen": 267264, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.7929782082324455, |
| "grad_norm": 0.8749038577079773, |
| "learning_rate": 3.958837772397095e-05, |
| "loss": 4.4243, |
| "num_input_tokens_seen": 269216, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.7990314769975787, |
| "grad_norm": 0.8567478656768799, |
| "learning_rate": 3.9891041162227606e-05, |
| "loss": 3.915, |
| "num_input_tokens_seen": 271264, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.8050847457627118, |
| "grad_norm": 0.7087087631225586, |
| "learning_rate": 4.019370460048426e-05, |
| "loss": 4.0109, |
| "num_input_tokens_seen": 273280, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.8111380145278451, |
| "grad_norm": 0.7404273152351379, |
| "learning_rate": 4.049636803874092e-05, |
| "loss": 3.4617, |
| "num_input_tokens_seen": 275392, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.8171912832929782, |
| "grad_norm": 1.2327122688293457, |
| "learning_rate": 4.0799031476997577e-05, |
| "loss": 4.2449, |
| "num_input_tokens_seen": 277440, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.8232445520581114, |
| "grad_norm": 0.707499623298645, |
| "learning_rate": 4.110169491525424e-05, |
| "loss": 4.2136, |
| "num_input_tokens_seen": 279456, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.8292978208232445, |
| "grad_norm": 0.7081287503242493, |
| "learning_rate": 4.14043583535109e-05, |
| "loss": 3.7978, |
| "num_input_tokens_seen": 281632, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.8353510895883777, |
| "grad_norm": 0.6375666260719299, |
| "learning_rate": 4.170702179176756e-05, |
| "loss": 4.1373, |
| "num_input_tokens_seen": 283680, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.8414043583535109, |
| "grad_norm": 0.7673113346099854, |
| "learning_rate": 4.2009685230024215e-05, |
| "loss": 3.3347, |
| "num_input_tokens_seen": 285760, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.847457627118644, |
| "grad_norm": 1.0868170261383057, |
| "learning_rate": 4.231234866828087e-05, |
| "loss": 3.5471, |
| "num_input_tokens_seen": 287904, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.8535108958837773, |
| "grad_norm": 0.8338655829429626, |
| "learning_rate": 4.261501210653753e-05, |
| "loss": 4.0102, |
| "num_input_tokens_seen": 289952, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.8595641646489104, |
| "grad_norm": 0.6879520416259766, |
| "learning_rate": 4.2917675544794186e-05, |
| "loss": 3.5516, |
| "num_input_tokens_seen": 291968, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.8656174334140436, |
| "grad_norm": 0.6293056011199951, |
| "learning_rate": 4.3220338983050854e-05, |
| "loss": 3.6021, |
| "num_input_tokens_seen": 293952, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.8716707021791767, |
| "grad_norm": 0.6270057559013367, |
| "learning_rate": 4.352300242130751e-05, |
| "loss": 3.9037, |
| "num_input_tokens_seen": 296192, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.8777239709443099, |
| "grad_norm": 0.6205592155456543, |
| "learning_rate": 4.382566585956417e-05, |
| "loss": 3.4889, |
| "num_input_tokens_seen": 298336, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.8837772397094431, |
| "grad_norm": 0.764479398727417, |
| "learning_rate": 4.4128329297820825e-05, |
| "loss": 3.401, |
| "num_input_tokens_seen": 300352, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.8898305084745762, |
| "grad_norm": 0.5813034772872925, |
| "learning_rate": 4.443099273607748e-05, |
| "loss": 3.0854, |
| "num_input_tokens_seen": 302432, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.8958837772397095, |
| "grad_norm": 0.787390410900116, |
| "learning_rate": 4.473365617433414e-05, |
| "loss": 3.5745, |
| "num_input_tokens_seen": 304544, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.9019370460048426, |
| "grad_norm": 0.7502769827842712, |
| "learning_rate": 4.50363196125908e-05, |
| "loss": 3.2791, |
| "num_input_tokens_seen": 306592, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.9079903147699758, |
| "grad_norm": 0.6328691244125366, |
| "learning_rate": 4.533898305084746e-05, |
| "loss": 2.5843, |
| "num_input_tokens_seen": 308672, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.914043583535109, |
| "grad_norm": 0.6782240271568298, |
| "learning_rate": 4.564164648910412e-05, |
| "loss": 2.5833, |
| "num_input_tokens_seen": 310656, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.9200968523002422, |
| "grad_norm": 0.7579760551452637, |
| "learning_rate": 4.594430992736078e-05, |
| "loss": 3.0647, |
| "num_input_tokens_seen": 312768, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.9261501210653753, |
| "grad_norm": 0.5894309878349304, |
| "learning_rate": 4.6246973365617434e-05, |
| "loss": 2.6112, |
| "num_input_tokens_seen": 314880, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.9322033898305084, |
| "grad_norm": 0.6657602190971375, |
| "learning_rate": 4.654963680387409e-05, |
| "loss": 2.9165, |
| "num_input_tokens_seen": 316960, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.9382566585956417, |
| "grad_norm": 0.5649463534355164, |
| "learning_rate": 4.685230024213076e-05, |
| "loss": 2.5276, |
| "num_input_tokens_seen": 319072, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.9443099273607748, |
| "grad_norm": 0.6244813799858093, |
| "learning_rate": 4.715496368038741e-05, |
| "loss": 2.9745, |
| "num_input_tokens_seen": 321248, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.950363196125908, |
| "grad_norm": 0.5977820754051208, |
| "learning_rate": 4.745762711864407e-05, |
| "loss": 2.799, |
| "num_input_tokens_seen": 323168, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.9564164648910412, |
| "grad_norm": 0.6189103722572327, |
| "learning_rate": 4.776029055690073e-05, |
| "loss": 3.3312, |
| "num_input_tokens_seen": 325280, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.9624697336561744, |
| "grad_norm": 0.5569372773170471, |
| "learning_rate": 4.806295399515739e-05, |
| "loss": 2.7812, |
| "num_input_tokens_seen": 327264, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.9685230024213075, |
| "grad_norm": 0.8147594332695007, |
| "learning_rate": 4.836561743341404e-05, |
| "loss": 3.1029, |
| "num_input_tokens_seen": 329312, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.9745762711864406, |
| "grad_norm": 0.5704480409622192, |
| "learning_rate": 4.8668280871670705e-05, |
| "loss": 2.7304, |
| "num_input_tokens_seen": 331296, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.9806295399515739, |
| "grad_norm": 0.700729489326477, |
| "learning_rate": 4.8970944309927366e-05, |
| "loss": 3.1512, |
| "num_input_tokens_seen": 333280, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.986682808716707, |
| "grad_norm": 0.99073326587677, |
| "learning_rate": 4.927360774818402e-05, |
| "loss": 2.8073, |
| "num_input_tokens_seen": 335392, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.9927360774818402, |
| "grad_norm": 0.6502202749252319, |
| "learning_rate": 4.957627118644068e-05, |
| "loss": 2.7864, |
| "num_input_tokens_seen": 337504, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.9987893462469734, |
| "grad_norm": 0.5440614819526672, |
| "learning_rate": 4.9878934624697336e-05, |
| "loss": 2.6606, |
| "num_input_tokens_seen": 339488, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 2.582019090652466, |
| "eval_runtime": 7.6268, |
| "eval_samples_per_second": 48.119, |
| "eval_steps_per_second": 12.063, |
| "num_input_tokens_seen": 339568, |
| "step": 826 |
| }, |
| { |
| "epoch": 1.0048426150121066, |
| "grad_norm": 0.7248643040657043, |
| "learning_rate": 4.99999799087436e-05, |
| "loss": 3.3943, |
| "num_input_tokens_seen": 341168, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.0108958837772397, |
| "grad_norm": 0.465541809797287, |
| "learning_rate": 4.999985712896029e-05, |
| "loss": 2.7291, |
| "num_input_tokens_seen": 343216, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.0169491525423728, |
| "grad_norm": 0.5930444598197937, |
| "learning_rate": 4.9999622731750315e-05, |
| "loss": 2.1777, |
| "num_input_tokens_seen": 345200, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.023002421307506, |
| "grad_norm": 0.5876166224479675, |
| "learning_rate": 4.999927671816018e-05, |
| "loss": 2.2214, |
| "num_input_tokens_seen": 347408, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.0290556900726393, |
| "grad_norm": 0.5622490048408508, |
| "learning_rate": 4.999881908973474e-05, |
| "loss": 3.1609, |
| "num_input_tokens_seen": 349488, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.0351089588377724, |
| "grad_norm": 0.6987181901931763, |
| "learning_rate": 4.9998249848517185e-05, |
| "loss": 2.0788, |
| "num_input_tokens_seen": 351536, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.0411622276029056, |
| "grad_norm": 0.5235864520072937, |
| "learning_rate": 4.999756899704902e-05, |
| "loss": 2.4917, |
| "num_input_tokens_seen": 353488, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.0472154963680387, |
| "grad_norm": 0.6432493329048157, |
| "learning_rate": 4.999677653837004e-05, |
| "loss": 2.5089, |
| "num_input_tokens_seen": 355504, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.053268765133172, |
| "grad_norm": 0.49857422709465027, |
| "learning_rate": 4.999587247601837e-05, |
| "loss": 2.8026, |
| "num_input_tokens_seen": 357456, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.0593220338983051, |
| "grad_norm": 0.5426047444343567, |
| "learning_rate": 4.99948568140304e-05, |
| "loss": 2.3627, |
| "num_input_tokens_seen": 359568, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.0653753026634383, |
| "grad_norm": 0.5244787335395813, |
| "learning_rate": 4.999372955694077e-05, |
| "loss": 2.5294, |
| "num_input_tokens_seen": 361616, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.0714285714285714, |
| "grad_norm": 0.474320650100708, |
| "learning_rate": 4.999249070978237e-05, |
| "loss": 2.1705, |
| "num_input_tokens_seen": 363696, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.0774818401937045, |
| "grad_norm": 0.5307512879371643, |
| "learning_rate": 4.9991140278086316e-05, |
| "loss": 2.2412, |
| "num_input_tokens_seen": 365712, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.0835351089588379, |
| "grad_norm": 0.5476939082145691, |
| "learning_rate": 4.998967826788191e-05, |
| "loss": 2.5056, |
| "num_input_tokens_seen": 367824, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.089588377723971, |
| "grad_norm": 0.6253821849822998, |
| "learning_rate": 4.998810468569661e-05, |
| "loss": 2.2057, |
| "num_input_tokens_seen": 369840, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.0956416464891041, |
| "grad_norm": 0.5360320806503296, |
| "learning_rate": 4.998641953855604e-05, |
| "loss": 2.9306, |
| "num_input_tokens_seen": 371952, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.1016949152542372, |
| "grad_norm": 0.4829923212528229, |
| "learning_rate": 4.9984622833983906e-05, |
| "loss": 2.5923, |
| "num_input_tokens_seen": 374032, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.1077481840193704, |
| "grad_norm": 0.6471547484397888, |
| "learning_rate": 4.9982714580002e-05, |
| "loss": 1.7457, |
| "num_input_tokens_seen": 376176, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.1138014527845037, |
| "grad_norm": 0.5247544646263123, |
| "learning_rate": 4.998069478513013e-05, |
| "loss": 2.5289, |
| "num_input_tokens_seen": 378224, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.1198547215496368, |
| "grad_norm": 0.741806149482727, |
| "learning_rate": 4.997856345838615e-05, |
| "loss": 2.0697, |
| "num_input_tokens_seen": 380304, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.12590799031477, |
| "grad_norm": 0.5769193172454834, |
| "learning_rate": 4.997632060928582e-05, |
| "loss": 2.1848, |
| "num_input_tokens_seen": 382416, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.131961259079903, |
| "grad_norm": 0.6430028676986694, |
| "learning_rate": 4.997396624784284e-05, |
| "loss": 2.2152, |
| "num_input_tokens_seen": 384592, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.1380145278450362, |
| "grad_norm": 0.537960946559906, |
| "learning_rate": 4.9971500384568795e-05, |
| "loss": 1.7291, |
| "num_input_tokens_seen": 386640, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.1440677966101696, |
| "grad_norm": 0.41977596282958984, |
| "learning_rate": 4.996892303047306e-05, |
| "loss": 2.1337, |
| "num_input_tokens_seen": 388752, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.1501210653753027, |
| "grad_norm": 0.775093138217926, |
| "learning_rate": 4.996623419706282e-05, |
| "loss": 2.413, |
| "num_input_tokens_seen": 390928, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.1561743341404358, |
| "grad_norm": 0.6247055530548096, |
| "learning_rate": 4.996343389634298e-05, |
| "loss": 1.9488, |
| "num_input_tokens_seen": 393104, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.162227602905569, |
| "grad_norm": 0.8783226013183594, |
| "learning_rate": 4.996052214081608e-05, |
| "loss": 1.8272, |
| "num_input_tokens_seen": 395184, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.1682808716707023, |
| "grad_norm": 0.594253659248352, |
| "learning_rate": 4.995749894348232e-05, |
| "loss": 1.8625, |
| "num_input_tokens_seen": 397264, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.1743341404358354, |
| "grad_norm": 0.5288816094398499, |
| "learning_rate": 4.995436431783942e-05, |
| "loss": 1.4127, |
| "num_input_tokens_seen": 399376, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.1803874092009685, |
| "grad_norm": 0.6039729118347168, |
| "learning_rate": 4.9951118277882636e-05, |
| "loss": 2.2276, |
| "num_input_tokens_seen": 401296, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.1864406779661016, |
| "grad_norm": 0.7827556729316711, |
| "learning_rate": 4.994776083810463e-05, |
| "loss": 1.9446, |
| "num_input_tokens_seen": 403344, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.192493946731235, |
| "grad_norm": 0.46428972482681274, |
| "learning_rate": 4.994429201349542e-05, |
| "loss": 2.2557, |
| "num_input_tokens_seen": 405328, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.1985472154963681, |
| "grad_norm": 0.4292607307434082, |
| "learning_rate": 4.994071181954237e-05, |
| "loss": 1.9263, |
| "num_input_tokens_seen": 407344, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.2046004842615012, |
| "grad_norm": 0.5664454698562622, |
| "learning_rate": 4.993702027223004e-05, |
| "loss": 2.0525, |
| "num_input_tokens_seen": 409296, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.2106537530266344, |
| "grad_norm": 0.4367380440235138, |
| "learning_rate": 4.9933217388040164e-05, |
| "loss": 2.1169, |
| "num_input_tokens_seen": 411536, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.2167070217917675, |
| "grad_norm": 0.5089828968048096, |
| "learning_rate": 4.992930318395157e-05, |
| "loss": 1.2446, |
| "num_input_tokens_seen": 413712, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.2227602905569008, |
| "grad_norm": 0.557706356048584, |
| "learning_rate": 4.99252776774401e-05, |
| "loss": 1.3772, |
| "num_input_tokens_seen": 415856, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.228813559322034, |
| "grad_norm": 0.473179429769516, |
| "learning_rate": 4.99211408864785e-05, |
| "loss": 1.7906, |
| "num_input_tokens_seen": 417808, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.234866828087167, |
| "grad_norm": 0.49337419867515564, |
| "learning_rate": 4.991689282953642e-05, |
| "loss": 1.3523, |
| "num_input_tokens_seen": 419760, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.2409200968523002, |
| "grad_norm": 0.36889055371284485, |
| "learning_rate": 4.991253352558025e-05, |
| "loss": 1.8714, |
| "num_input_tokens_seen": 421840, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.2469733656174333, |
| "grad_norm": 0.48412299156188965, |
| "learning_rate": 4.9908062994073056e-05, |
| "loss": 1.7757, |
| "num_input_tokens_seen": 424112, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.2530266343825667, |
| "grad_norm": 0.5914756655693054, |
| "learning_rate": 4.990348125497454e-05, |
| "loss": 1.223, |
| "num_input_tokens_seen": 426128, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.2590799031476998, |
| "grad_norm": 0.4885713458061218, |
| "learning_rate": 4.9898788328740884e-05, |
| "loss": 1.5508, |
| "num_input_tokens_seen": 428272, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.265133171912833, |
| "grad_norm": 0.42730310559272766, |
| "learning_rate": 4.989398423632471e-05, |
| "loss": 1.9498, |
| "num_input_tokens_seen": 430320, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.271186440677966, |
| "grad_norm": 0.5261942148208618, |
| "learning_rate": 4.988906899917496e-05, |
| "loss": 1.6678, |
| "num_input_tokens_seen": 432272, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.2772397094430992, |
| "grad_norm": 0.4744739830493927, |
| "learning_rate": 4.988404263923679e-05, |
| "loss": 1.4057, |
| "num_input_tokens_seen": 434384, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.2832929782082325, |
| "grad_norm": 0.5921871066093445, |
| "learning_rate": 4.987890517895152e-05, |
| "loss": 1.2233, |
| "num_input_tokens_seen": 436336, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.2893462469733656, |
| "grad_norm": 0.47654107213020325, |
| "learning_rate": 4.987365664125647e-05, |
| "loss": 1.7395, |
| "num_input_tokens_seen": 438352, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.2953995157384988, |
| "grad_norm": 0.44112324714660645, |
| "learning_rate": 4.986829704958491e-05, |
| "loss": 1.597, |
| "num_input_tokens_seen": 440336, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.3014527845036319, |
| "grad_norm": 0.3871079981327057, |
| "learning_rate": 4.986282642786594e-05, |
| "loss": 0.9274, |
| "num_input_tokens_seen": 442352, |
| "step": 1075 |
| }, |
| { |
| "epoch": 1.307506053268765, |
| "grad_norm": 0.41969189047813416, |
| "learning_rate": 4.985724480052435e-05, |
| "loss": 1.3723, |
| "num_input_tokens_seen": 444496, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.3135593220338984, |
| "grad_norm": 0.5627338886260986, |
| "learning_rate": 4.985155219248057e-05, |
| "loss": 1.7963, |
| "num_input_tokens_seen": 446576, |
| "step": 1085 |
| }, |
| { |
| "epoch": 1.3196125907990315, |
| "grad_norm": 0.5490552186965942, |
| "learning_rate": 4.9845748629150514e-05, |
| "loss": 1.6921, |
| "num_input_tokens_seen": 448656, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.3256658595641646, |
| "grad_norm": 0.3618658185005188, |
| "learning_rate": 4.9839834136445485e-05, |
| "loss": 1.7861, |
| "num_input_tokens_seen": 450672, |
| "step": 1095 |
| }, |
| { |
| "epoch": 1.331719128329298, |
| "grad_norm": 0.6138935089111328, |
| "learning_rate": 4.983380874077204e-05, |
| "loss": 1.8834, |
| "num_input_tokens_seen": 452784, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.3377723970944309, |
| "grad_norm": 0.46193164587020874, |
| "learning_rate": 4.982767246903192e-05, |
| "loss": 1.5385, |
| "num_input_tokens_seen": 454864, |
| "step": 1105 |
| }, |
| { |
| "epoch": 1.3438256658595642, |
| "grad_norm": 0.5908941626548767, |
| "learning_rate": 4.982142534862185e-05, |
| "loss": 1.5447, |
| "num_input_tokens_seen": 457072, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.3498789346246973, |
| "grad_norm": 0.4145558774471283, |
| "learning_rate": 4.981506740743351e-05, |
| "loss": 1.4449, |
| "num_input_tokens_seen": 459184, |
| "step": 1115 |
| }, |
| { |
| "epoch": 1.3559322033898304, |
| "grad_norm": 0.4188134968280792, |
| "learning_rate": 4.980859867385334e-05, |
| "loss": 1.3915, |
| "num_input_tokens_seen": 461136, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.3619854721549638, |
| "grad_norm": 0.43662577867507935, |
| "learning_rate": 4.9802019176762434e-05, |
| "loss": 1.3782, |
| "num_input_tokens_seen": 463152, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.368038740920097, |
| "grad_norm": 0.3851994276046753, |
| "learning_rate": 4.9795328945536425e-05, |
| "loss": 1.6662, |
| "num_input_tokens_seen": 465232, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.37409200968523, |
| "grad_norm": 0.4279670715332031, |
| "learning_rate": 4.978852801004534e-05, |
| "loss": 1.6636, |
| "num_input_tokens_seen": 467248, |
| "step": 1135 |
| }, |
| { |
| "epoch": 1.3801452784503632, |
| "grad_norm": 0.35590389370918274, |
| "learning_rate": 4.9781616400653464e-05, |
| "loss": 1.2952, |
| "num_input_tokens_seen": 469424, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.3861985472154963, |
| "grad_norm": 0.3899083435535431, |
| "learning_rate": 4.9774594148219225e-05, |
| "loss": 1.6788, |
| "num_input_tokens_seen": 471344, |
| "step": 1145 |
| }, |
| { |
| "epoch": 1.3922518159806296, |
| "grad_norm": 0.4239688813686371, |
| "learning_rate": 4.976746128409502e-05, |
| "loss": 1.5144, |
| "num_input_tokens_seen": 473296, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.3983050847457628, |
| "grad_norm": 0.5207110047340393, |
| "learning_rate": 4.9760217840127126e-05, |
| "loss": 1.1491, |
| "num_input_tokens_seen": 475312, |
| "step": 1155 |
| }, |
| { |
| "epoch": 1.4043583535108959, |
| "grad_norm": 0.40052133798599243, |
| "learning_rate": 4.97528638486555e-05, |
| "loss": 1.0316, |
| "num_input_tokens_seen": 477360, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.410411622276029, |
| "grad_norm": 0.33712899684906006, |
| "learning_rate": 4.9745399342513666e-05, |
| "loss": 1.3778, |
| "num_input_tokens_seen": 479376, |
| "step": 1165 |
| }, |
| { |
| "epoch": 1.4164648910411621, |
| "grad_norm": 0.36989855766296387, |
| "learning_rate": 4.9737824355028584e-05, |
| "loss": 1.1389, |
| "num_input_tokens_seen": 481456, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.4225181598062955, |
| "grad_norm": 0.3517148494720459, |
| "learning_rate": 4.973013892002047e-05, |
| "loss": 1.2113, |
| "num_input_tokens_seen": 483632, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.4277711510658264, |
| "learning_rate": 4.9722343071802665e-05, |
| "loss": 1.4132, |
| "num_input_tokens_seen": 485680, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.4346246973365617, |
| "grad_norm": 0.47057533264160156, |
| "learning_rate": 4.971443684518145e-05, |
| "loss": 1.6084, |
| "num_input_tokens_seen": 487792, |
| "step": 1185 |
| }, |
| { |
| "epoch": 1.4406779661016949, |
| "grad_norm": 0.3864545226097107, |
| "learning_rate": 4.970642027545596e-05, |
| "loss": 1.2336, |
| "num_input_tokens_seen": 489840, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.446731234866828, |
| "grad_norm": 0.45822954177856445, |
| "learning_rate": 4.969829339841793e-05, |
| "loss": 0.8102, |
| "num_input_tokens_seen": 491824, |
| "step": 1195 |
| }, |
| { |
| "epoch": 1.4527845036319613, |
| "grad_norm": 0.2616255283355713, |
| "learning_rate": 4.9690056250351626e-05, |
| "loss": 1.3447, |
| "num_input_tokens_seen": 493872, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.4588377723970944, |
| "grad_norm": 0.559545636177063, |
| "learning_rate": 4.9681708868033616e-05, |
| "loss": 1.1485, |
| "num_input_tokens_seen": 495952, |
| "step": 1205 |
| }, |
| { |
| "epoch": 1.4648910411622276, |
| "grad_norm": 0.5678349137306213, |
| "learning_rate": 4.967325128873266e-05, |
| "loss": 1.3425, |
| "num_input_tokens_seen": 497936, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.4709443099273607, |
| "grad_norm": 0.3209479749202728, |
| "learning_rate": 4.96646835502095e-05, |
| "loss": 1.2659, |
| "num_input_tokens_seen": 499856, |
| "step": 1215 |
| }, |
| { |
| "epoch": 1.4769975786924938, |
| "grad_norm": 0.3425200283527374, |
| "learning_rate": 4.965600569071671e-05, |
| "loss": 1.2493, |
| "num_input_tokens_seen": 501840, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.4830508474576272, |
| "grad_norm": 0.3827836215496063, |
| "learning_rate": 4.9647217748998534e-05, |
| "loss": 1.1635, |
| "num_input_tokens_seen": 503952, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.4891041162227603, |
| "grad_norm": 0.43089863657951355, |
| "learning_rate": 4.963831976429067e-05, |
| "loss": 1.2059, |
| "num_input_tokens_seen": 505936, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.4951573849878934, |
| "grad_norm": 0.33711570501327515, |
| "learning_rate": 4.9629311776320176e-05, |
| "loss": 1.3525, |
| "num_input_tokens_seen": 507984, |
| "step": 1235 |
| }, |
| { |
| "epoch": 1.5, |
| "eval_loss": 1.0737199783325195, |
| "eval_runtime": 7.6374, |
| "eval_samples_per_second": 48.053, |
| "eval_steps_per_second": 12.046, |
| "num_input_tokens_seen": 509456, |
| "step": 1239 |
| }, |
| { |
| "epoch": 1.5012106537530268, |
| "grad_norm": 0.5355046391487122, |
| "learning_rate": 4.962019382530521e-05, |
| "loss": 1.655, |
| "num_input_tokens_seen": 509840, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.5072639225181597, |
| "grad_norm": 0.28325432538986206, |
| "learning_rate": 4.9610965951954885e-05, |
| "loss": 1.0433, |
| "num_input_tokens_seen": 511856, |
| "step": 1245 |
| }, |
| { |
| "epoch": 1.513317191283293, |
| "grad_norm": 0.4197021722793579, |
| "learning_rate": 4.960162819746911e-05, |
| "loss": 0.9547, |
| "num_input_tokens_seen": 513904, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.5193704600484261, |
| "grad_norm": 0.7074657082557678, |
| "learning_rate": 4.9592180603538366e-05, |
| "loss": 1.4886, |
| "num_input_tokens_seen": 515824, |
| "step": 1255 |
| }, |
| { |
| "epoch": 1.5254237288135593, |
| "grad_norm": 0.29705020785331726, |
| "learning_rate": 4.958262321234353e-05, |
| "loss": 0.9793, |
| "num_input_tokens_seen": 517744, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.5314769975786926, |
| "grad_norm": 0.2653312087059021, |
| "learning_rate": 4.95729560665557e-05, |
| "loss": 0.576, |
| "num_input_tokens_seen": 519824, |
| "step": 1265 |
| }, |
| { |
| "epoch": 1.5375302663438255, |
| "grad_norm": 0.30260780453681946, |
| "learning_rate": 4.956317920933602e-05, |
| "loss": 1.253, |
| "num_input_tokens_seen": 521936, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.5435835351089588, |
| "grad_norm": 0.3453550636768341, |
| "learning_rate": 4.955329268433543e-05, |
| "loss": 1.0863, |
| "num_input_tokens_seen": 524176, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.549636803874092, |
| "grad_norm": 0.2838487923145294, |
| "learning_rate": 4.954329653569452e-05, |
| "loss": 1.0201, |
| "num_input_tokens_seen": 526256, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.555690072639225, |
| "grad_norm": 0.4418029487133026, |
| "learning_rate": 4.953319080804333e-05, |
| "loss": 1.1724, |
| "num_input_tokens_seen": 528304, |
| "step": 1285 |
| }, |
| { |
| "epoch": 1.5617433414043584, |
| "grad_norm": 0.34448719024658203, |
| "learning_rate": 4.952297554650113e-05, |
| "loss": 1.1168, |
| "num_input_tokens_seen": 530320, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.5677966101694916, |
| "grad_norm": 0.33033546805381775, |
| "learning_rate": 4.9512650796676216e-05, |
| "loss": 1.0487, |
| "num_input_tokens_seen": 532400, |
| "step": 1295 |
| }, |
| { |
| "epoch": 1.5738498789346247, |
| "grad_norm": 0.525662362575531, |
| "learning_rate": 4.9502216604665744e-05, |
| "loss": 1.0373, |
| "num_input_tokens_seen": 534416, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.5799031476997578, |
| "grad_norm": 0.31474101543426514, |
| "learning_rate": 4.949167301705548e-05, |
| "loss": 0.9942, |
| "num_input_tokens_seen": 536592, |
| "step": 1305 |
| }, |
| { |
| "epoch": 1.585956416464891, |
| "grad_norm": 0.3259902596473694, |
| "learning_rate": 4.948102008091962e-05, |
| "loss": 0.5542, |
| "num_input_tokens_seen": 538704, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.5920096852300243, |
| "grad_norm": 0.27523431181907654, |
| "learning_rate": 4.947025784382057e-05, |
| "loss": 1.0819, |
| "num_input_tokens_seen": 540912, |
| "step": 1315 |
| }, |
| { |
| "epoch": 1.5980629539951574, |
| "grad_norm": 0.29886868596076965, |
| "learning_rate": 4.9459386353808736e-05, |
| "loss": 0.9661, |
| "num_input_tokens_seen": 542864, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.6041162227602905, |
| "grad_norm": 0.7062246799468994, |
| "learning_rate": 4.944840565942229e-05, |
| "loss": 1.2079, |
| "num_input_tokens_seen": 544816, |
| "step": 1325 |
| }, |
| { |
| "epoch": 1.6101694915254239, |
| "grad_norm": 0.5258677005767822, |
| "learning_rate": 4.943731580968701e-05, |
| "loss": 1.0958, |
| "num_input_tokens_seen": 546864, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.6162227602905568, |
| "grad_norm": 0.45779022574424744, |
| "learning_rate": 4.942611685411598e-05, |
| "loss": 0.9574, |
| "num_input_tokens_seen": 548912, |
| "step": 1335 |
| }, |
| { |
| "epoch": 1.6222760290556901, |
| "grad_norm": 0.7074362635612488, |
| "learning_rate": 4.9414808842709435e-05, |
| "loss": 1.1306, |
| "num_input_tokens_seen": 551056, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.6283292978208233, |
| "grad_norm": 0.34025338292121887, |
| "learning_rate": 4.940339182595451e-05, |
| "loss": 0.5357, |
| "num_input_tokens_seen": 553072, |
| "step": 1345 |
| }, |
| { |
| "epoch": 1.6343825665859564, |
| "grad_norm": 0.32569214701652527, |
| "learning_rate": 4.9391865854825015e-05, |
| "loss": 0.9225, |
| "num_input_tokens_seen": 555248, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.6404358353510897, |
| "grad_norm": 0.3453126549720764, |
| "learning_rate": 4.938023098078122e-05, |
| "loss": 0.8826, |
| "num_input_tokens_seen": 557360, |
| "step": 1355 |
| }, |
| { |
| "epoch": 1.6464891041162226, |
| "grad_norm": 0.315043568611145, |
| "learning_rate": 4.93684872557696e-05, |
| "loss": 0.9092, |
| "num_input_tokens_seen": 559376, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.652542372881356, |
| "grad_norm": 0.3264979422092438, |
| "learning_rate": 4.935663473222264e-05, |
| "loss": 0.584, |
| "num_input_tokens_seen": 561296, |
| "step": 1365 |
| }, |
| { |
| "epoch": 1.658595641646489, |
| "grad_norm": 0.28142881393432617, |
| "learning_rate": 4.934467346305856e-05, |
| "loss": 0.9263, |
| "num_input_tokens_seen": 563376, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.6646489104116222, |
| "grad_norm": 0.22819487750530243, |
| "learning_rate": 4.933260350168112e-05, |
| "loss": 0.5478, |
| "num_input_tokens_seen": 565424, |
| "step": 1375 |
| }, |
| { |
| "epoch": 1.6707021791767556, |
| "grad_norm": 0.28255125880241394, |
| "learning_rate": 4.932042490197933e-05, |
| "loss": 0.8529, |
| "num_input_tokens_seen": 567504, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.6767554479418885, |
| "grad_norm": 0.325191855430603, |
| "learning_rate": 4.930813771832728e-05, |
| "loss": 1.2288, |
| "num_input_tokens_seen": 569744, |
| "step": 1385 |
| }, |
| { |
| "epoch": 1.6828087167070218, |
| "grad_norm": 0.35881543159484863, |
| "learning_rate": 4.929574200558382e-05, |
| "loss": 0.629, |
| "num_input_tokens_seen": 571888, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.688861985472155, |
| "grad_norm": 0.24505288898944855, |
| "learning_rate": 4.928323781909239e-05, |
| "loss": 0.6763, |
| "num_input_tokens_seen": 574000, |
| "step": 1395 |
| }, |
| { |
| "epoch": 1.694915254237288, |
| "grad_norm": 0.2823852300643921, |
| "learning_rate": 4.927062521468068e-05, |
| "loss": 0.8719, |
| "num_input_tokens_seen": 575984, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.7009685230024214, |
| "grad_norm": 0.4133392870426178, |
| "learning_rate": 4.92579042486605e-05, |
| "loss": 0.9651, |
| "num_input_tokens_seen": 577968, |
| "step": 1405 |
| }, |
| { |
| "epoch": 1.7070217917675545, |
| "grad_norm": 0.6881241202354431, |
| "learning_rate": 4.924507497782743e-05, |
| "loss": 1.171, |
| "num_input_tokens_seen": 580112, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.7130750605326877, |
| "grad_norm": 0.5143849849700928, |
| "learning_rate": 4.923213745946059e-05, |
| "loss": 1.1722, |
| "num_input_tokens_seen": 582160, |
| "step": 1415 |
| }, |
| { |
| "epoch": 1.7191283292978208, |
| "grad_norm": 0.4631313979625702, |
| "learning_rate": 4.921909175132242e-05, |
| "loss": 0.9948, |
| "num_input_tokens_seen": 584304, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.725181598062954, |
| "grad_norm": 0.40659844875335693, |
| "learning_rate": 4.920593791165839e-05, |
| "loss": 0.9927, |
| "num_input_tokens_seen": 586096, |
| "step": 1425 |
| }, |
| { |
| "epoch": 1.7312348668280872, |
| "grad_norm": 0.8530993461608887, |
| "learning_rate": 4.919267599919674e-05, |
| "loss": 0.9462, |
| "num_input_tokens_seen": 588240, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.7372881355932204, |
| "grad_norm": 0.279767245054245, |
| "learning_rate": 4.917930607314823e-05, |
| "loss": 0.8155, |
| "num_input_tokens_seen": 590480, |
| "step": 1435 |
| }, |
| { |
| "epoch": 1.7433414043583535, |
| "grad_norm": 0.520563542842865, |
| "learning_rate": 4.916582819320588e-05, |
| "loss": 0.8034, |
| "num_input_tokens_seen": 592528, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.7493946731234868, |
| "grad_norm": 0.3725779056549072, |
| "learning_rate": 4.915224241954467e-05, |
| "loss": 0.9614, |
| "num_input_tokens_seen": 594416, |
| "step": 1445 |
| }, |
| { |
| "epoch": 1.7554479418886197, |
| "grad_norm": 0.27733102440834045, |
| "learning_rate": 4.9138548812821316e-05, |
| "loss": 0.5638, |
| "num_input_tokens_seen": 596432, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.761501210653753, |
| "grad_norm": 0.36456015706062317, |
| "learning_rate": 4.912474743417399e-05, |
| "loss": 0.738, |
| "num_input_tokens_seen": 598352, |
| "step": 1455 |
| }, |
| { |
| "epoch": 1.7675544794188862, |
| "grad_norm": 0.38345053791999817, |
| "learning_rate": 4.911083834522199e-05, |
| "loss": 0.7135, |
| "num_input_tokens_seen": 600432, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.7736077481840193, |
| "grad_norm": 0.5697410702705383, |
| "learning_rate": 4.909682160806556e-05, |
| "loss": 0.8639, |
| "num_input_tokens_seen": 602480, |
| "step": 1465 |
| }, |
| { |
| "epoch": 1.7796610169491527, |
| "grad_norm": 0.35034531354904175, |
| "learning_rate": 4.908269728528553e-05, |
| "loss": 0.6209, |
| "num_input_tokens_seen": 604496, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.7857142857142856, |
| "grad_norm": 0.34798485040664673, |
| "learning_rate": 4.90684654399431e-05, |
| "loss": 0.6588, |
| "num_input_tokens_seen": 606672, |
| "step": 1475 |
| }, |
| { |
| "epoch": 1.791767554479419, |
| "grad_norm": 0.3376762866973877, |
| "learning_rate": 4.9054126135579495e-05, |
| "loss": 0.5111, |
| "num_input_tokens_seen": 608752, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.797820823244552, |
| "grad_norm": 0.3494233787059784, |
| "learning_rate": 4.9039679436215734e-05, |
| "loss": 0.7603, |
| "num_input_tokens_seen": 610608, |
| "step": 1485 |
| }, |
| { |
| "epoch": 1.8038740920096852, |
| "grad_norm": 0.34808069467544556, |
| "learning_rate": 4.9025125406352335e-05, |
| "loss": 0.7663, |
| "num_input_tokens_seen": 612688, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.8099273607748185, |
| "grad_norm": 0.25778406858444214, |
| "learning_rate": 4.9010464110968976e-05, |
| "loss": 0.5657, |
| "num_input_tokens_seen": 614704, |
| "step": 1495 |
| }, |
| { |
| "epoch": 1.8159806295399514, |
| "grad_norm": 0.39974820613861084, |
| "learning_rate": 4.89956956155243e-05, |
| "loss": 0.739, |
| "num_input_tokens_seen": 616784, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.8220338983050848, |
| "grad_norm": 0.38527342677116394, |
| "learning_rate": 4.898081998595555e-05, |
| "loss": 0.8289, |
| "num_input_tokens_seen": 618736, |
| "step": 1505 |
| }, |
| { |
| "epoch": 1.828087167070218, |
| "grad_norm": 0.3349371552467346, |
| "learning_rate": 4.8965837288678253e-05, |
| "loss": 0.505, |
| "num_input_tokens_seen": 620944, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.834140435835351, |
| "grad_norm": 0.41494446992874146, |
| "learning_rate": 4.895074759058601e-05, |
| "loss": 0.7171, |
| "num_input_tokens_seen": 623120, |
| "step": 1515 |
| }, |
| { |
| "epoch": 1.8401937046004844, |
| "grad_norm": 0.281497985124588, |
| "learning_rate": 4.893555095905014e-05, |
| "loss": 0.5712, |
| "num_input_tokens_seen": 625136, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.8462469733656173, |
| "grad_norm": 0.2609713673591614, |
| "learning_rate": 4.892024746191939e-05, |
| "loss": 0.7263, |
| "num_input_tokens_seen": 627216, |
| "step": 1525 |
| }, |
| { |
| "epoch": 1.8523002421307506, |
| "grad_norm": 0.30866149067878723, |
| "learning_rate": 4.890483716751961e-05, |
| "loss": 0.5653, |
| "num_input_tokens_seen": 629136, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.8583535108958837, |
| "grad_norm": 0.3215227425098419, |
| "learning_rate": 4.888932014465352e-05, |
| "loss": 0.6417, |
| "num_input_tokens_seen": 631216, |
| "step": 1535 |
| }, |
| { |
| "epoch": 1.8644067796610169, |
| "grad_norm": 0.344390869140625, |
| "learning_rate": 4.8873696462600303e-05, |
| "loss": 0.7692, |
| "num_input_tokens_seen": 633328, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.8704600484261502, |
| "grad_norm": 0.5009181499481201, |
| "learning_rate": 4.8857966191115365e-05, |
| "loss": 0.6689, |
| "num_input_tokens_seen": 635440, |
| "step": 1545 |
| }, |
| { |
| "epoch": 1.8765133171912833, |
| "grad_norm": 0.4212459921836853, |
| "learning_rate": 4.884212940043001e-05, |
| "loss": 0.615, |
| "num_input_tokens_seen": 637360, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.8825665859564165, |
| "grad_norm": 0.6118276119232178, |
| "learning_rate": 4.882618616125111e-05, |
| "loss": 0.8355, |
| "num_input_tokens_seen": 639280, |
| "step": 1555 |
| }, |
| { |
| "epoch": 1.8886198547215496, |
| "grad_norm": 0.712222158908844, |
| "learning_rate": 4.881013654476081e-05, |
| "loss": 0.6072, |
| "num_input_tokens_seen": 641328, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.8946731234866827, |
| "grad_norm": 0.6113438606262207, |
| "learning_rate": 4.8793980622616195e-05, |
| "loss": 0.6023, |
| "num_input_tokens_seen": 643376, |
| "step": 1565 |
| }, |
| { |
| "epoch": 1.900726392251816, |
| "grad_norm": 0.25265204906463623, |
| "learning_rate": 4.877771846694897e-05, |
| "loss": 0.7591, |
| "num_input_tokens_seen": 645488, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.9067796610169492, |
| "grad_norm": 0.4864021837711334, |
| "learning_rate": 4.876135015036515e-05, |
| "loss": 0.66, |
| "num_input_tokens_seen": 647600, |
| "step": 1575 |
| }, |
| { |
| "epoch": 1.9128329297820823, |
| "grad_norm": 0.29899653792381287, |
| "learning_rate": 4.874487574594473e-05, |
| "loss": 0.5094, |
| "num_input_tokens_seen": 649648, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.9188861985472156, |
| "grad_norm": 0.40387359261512756, |
| "learning_rate": 4.872829532724136e-05, |
| "loss": 0.7514, |
| "num_input_tokens_seen": 651760, |
| "step": 1585 |
| }, |
| { |
| "epoch": 1.9249394673123486, |
| "grad_norm": 0.3166121244430542, |
| "learning_rate": 4.8711608968282e-05, |
| "loss": 0.4008, |
| "num_input_tokens_seen": 653840, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.930992736077482, |
| "grad_norm": 0.4781055450439453, |
| "learning_rate": 4.8694816743566616e-05, |
| "loss": 0.5978, |
| "num_input_tokens_seen": 655888, |
| "step": 1595 |
| }, |
| { |
| "epoch": 1.937046004842615, |
| "grad_norm": 0.369778573513031, |
| "learning_rate": 4.867791872806785e-05, |
| "loss": 0.5324, |
| "num_input_tokens_seen": 657776, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.9430992736077481, |
| "grad_norm": 0.4247503876686096, |
| "learning_rate": 4.8660914997230624e-05, |
| "loss": 0.4761, |
| "num_input_tokens_seen": 659952, |
| "step": 1605 |
| }, |
| { |
| "epoch": 1.9491525423728815, |
| "grad_norm": 0.4868212938308716, |
| "learning_rate": 4.8643805626971894e-05, |
| "loss": 0.5615, |
| "num_input_tokens_seen": 661968, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.9552058111380144, |
| "grad_norm": 0.41385340690612793, |
| "learning_rate": 4.862659069368026e-05, |
| "loss": 0.5552, |
| "num_input_tokens_seen": 664048, |
| "step": 1615 |
| }, |
| { |
| "epoch": 1.9612590799031477, |
| "grad_norm": 0.3044254183769226, |
| "learning_rate": 4.8609270274215614e-05, |
| "loss": 0.5219, |
| "num_input_tokens_seen": 665968, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.9673123486682809, |
| "grad_norm": 0.303305059671402, |
| "learning_rate": 4.859184444590882e-05, |
| "loss": 0.5789, |
| "num_input_tokens_seen": 667920, |
| "step": 1625 |
| }, |
| { |
| "epoch": 1.973365617433414, |
| "grad_norm": 0.3417951166629791, |
| "learning_rate": 4.857431328656137e-05, |
| "loss": 0.4571, |
| "num_input_tokens_seen": 669968, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.9794188861985473, |
| "grad_norm": 0.3936750590801239, |
| "learning_rate": 4.855667687444504e-05, |
| "loss": 0.4423, |
| "num_input_tokens_seen": 672016, |
| "step": 1635 |
| }, |
| { |
| "epoch": 1.9854721549636802, |
| "grad_norm": 0.5645264983177185, |
| "learning_rate": 4.85389352883015e-05, |
| "loss": 0.5947, |
| "num_input_tokens_seen": 674096, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.9915254237288136, |
| "grad_norm": 0.32442790269851685, |
| "learning_rate": 4.8521088607342016e-05, |
| "loss": 0.3428, |
| "num_input_tokens_seen": 676208, |
| "step": 1645 |
| }, |
| { |
| "epoch": 1.9975786924939467, |
| "grad_norm": 0.309976726770401, |
| "learning_rate": 4.850313691124707e-05, |
| "loss": 0.5717, |
| "num_input_tokens_seen": 678256, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.4883747100830078, |
| "eval_runtime": 7.6471, |
| "eval_samples_per_second": 47.992, |
| "eval_steps_per_second": 12.031, |
| "num_input_tokens_seen": 678688, |
| "step": 1652 |
| }, |
| { |
| "epoch": 2.00363196125908, |
| "grad_norm": 0.2366669774055481, |
| "learning_rate": 4.8485080280166006e-05, |
| "loss": 0.5427, |
| "num_input_tokens_seen": 679936, |
| "step": 1655 |
| }, |
| { |
| "epoch": 2.009685230024213, |
| "grad_norm": 0.3256445527076721, |
| "learning_rate": 4.8466918794716666e-05, |
| "loss": 0.5143, |
| "num_input_tokens_seen": 681984, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.015738498789346, |
| "grad_norm": 0.33861058950424194, |
| "learning_rate": 4.8448652535985045e-05, |
| "loss": 0.5045, |
| "num_input_tokens_seen": 684096, |
| "step": 1665 |
| }, |
| { |
| "epoch": 2.0217917675544794, |
| "grad_norm": 0.7048678994178772, |
| "learning_rate": 4.8430281585524926e-05, |
| "loss": 0.4777, |
| "num_input_tokens_seen": 686016, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.0278450363196128, |
| "grad_norm": 0.3535924553871155, |
| "learning_rate": 4.841180602535751e-05, |
| "loss": 0.5098, |
| "num_input_tokens_seen": 687904, |
| "step": 1675 |
| }, |
| { |
| "epoch": 2.0338983050847457, |
| "grad_norm": 0.28195756673812866, |
| "learning_rate": 4.839322593797104e-05, |
| "loss": 0.2498, |
| "num_input_tokens_seen": 689888, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.039951573849879, |
| "grad_norm": 0.3816757798194885, |
| "learning_rate": 4.837454140632045e-05, |
| "loss": 0.5178, |
| "num_input_tokens_seen": 691808, |
| "step": 1685 |
| }, |
| { |
| "epoch": 2.046004842615012, |
| "grad_norm": 0.23576921224594116, |
| "learning_rate": 4.8355752513826995e-05, |
| "loss": 0.5619, |
| "num_input_tokens_seen": 693856, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.0520581113801453, |
| "grad_norm": 0.254920095205307, |
| "learning_rate": 4.833685934437787e-05, |
| "loss": 0.5845, |
| "num_input_tokens_seen": 695680, |
| "step": 1695 |
| }, |
| { |
| "epoch": 2.0581113801452786, |
| "grad_norm": 0.21005745232105255, |
| "learning_rate": 4.831786198232583e-05, |
| "loss": 0.4312, |
| "num_input_tokens_seen": 697792, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.0641646489104115, |
| "grad_norm": 0.611488401889801, |
| "learning_rate": 4.8298760512488824e-05, |
| "loss": 0.4216, |
| "num_input_tokens_seen": 699712, |
| "step": 1705 |
| }, |
| { |
| "epoch": 2.070217917675545, |
| "grad_norm": 0.2882086932659149, |
| "learning_rate": 4.827955502014963e-05, |
| "loss": 0.3905, |
| "num_input_tokens_seen": 701792, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.0762711864406778, |
| "grad_norm": 0.2670579254627228, |
| "learning_rate": 4.826024559105542e-05, |
| "loss": 0.4422, |
| "num_input_tokens_seen": 703936, |
| "step": 1715 |
| }, |
| { |
| "epoch": 2.082324455205811, |
| "grad_norm": 0.43527278304100037, |
| "learning_rate": 4.8240832311417465e-05, |
| "loss": 0.3583, |
| "num_input_tokens_seen": 705952, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.0883777239709445, |
| "grad_norm": 0.46741268038749695, |
| "learning_rate": 4.822131526791065e-05, |
| "loss": 0.4249, |
| "num_input_tokens_seen": 708064, |
| "step": 1725 |
| }, |
| { |
| "epoch": 2.0944309927360774, |
| "grad_norm": 0.7110249400138855, |
| "learning_rate": 4.820169454767318e-05, |
| "loss": 0.5408, |
| "num_input_tokens_seen": 709952, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.1004842615012107, |
| "grad_norm": 0.35839521884918213, |
| "learning_rate": 4.8181970238306104e-05, |
| "loss": 0.4818, |
| "num_input_tokens_seen": 712032, |
| "step": 1735 |
| }, |
| { |
| "epoch": 2.106537530266344, |
| "grad_norm": 0.3425065577030182, |
| "learning_rate": 4.816214242787302e-05, |
| "loss": 0.3356, |
| "num_input_tokens_seen": 714048, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.112590799031477, |
| "grad_norm": 0.34282979369163513, |
| "learning_rate": 4.814221120489958e-05, |
| "loss": 0.4707, |
| "num_input_tokens_seen": 716224, |
| "step": 1745 |
| }, |
| { |
| "epoch": 2.1186440677966103, |
| "grad_norm": 0.4210341274738312, |
| "learning_rate": 4.812217665837316e-05, |
| "loss": 0.4559, |
| "num_input_tokens_seen": 718144, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.124697336561743, |
| "grad_norm": 0.47861868143081665, |
| "learning_rate": 4.810203887774247e-05, |
| "loss": 0.3916, |
| "num_input_tokens_seen": 720288, |
| "step": 1755 |
| }, |
| { |
| "epoch": 2.1307506053268765, |
| "grad_norm": 0.4552263915538788, |
| "learning_rate": 4.808179795291712e-05, |
| "loss": 0.5108, |
| "num_input_tokens_seen": 722368, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.13680387409201, |
| "grad_norm": 0.31580081582069397, |
| "learning_rate": 4.8061453974267195e-05, |
| "loss": 0.3638, |
| "num_input_tokens_seen": 724480, |
| "step": 1765 |
| }, |
| { |
| "epoch": 2.142857142857143, |
| "grad_norm": 0.5044140219688416, |
| "learning_rate": 4.804100703262294e-05, |
| "loss": 0.4291, |
| "num_input_tokens_seen": 726784, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.148910411622276, |
| "grad_norm": 0.3842848241329193, |
| "learning_rate": 4.8020457219274266e-05, |
| "loss": 0.4259, |
| "num_input_tokens_seen": 728768, |
| "step": 1775 |
| }, |
| { |
| "epoch": 2.154963680387409, |
| "grad_norm": 0.20223240554332733, |
| "learning_rate": 4.799980462597039e-05, |
| "loss": 0.408, |
| "num_input_tokens_seen": 730816, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.1610169491525424, |
| "grad_norm": 0.25541648268699646, |
| "learning_rate": 4.7979049344919416e-05, |
| "loss": 0.365, |
| "num_input_tokens_seen": 732896, |
| "step": 1785 |
| }, |
| { |
| "epoch": 2.1670702179176757, |
| "grad_norm": 0.1965058445930481, |
| "learning_rate": 4.795819146878792e-05, |
| "loss": 0.403, |
| "num_input_tokens_seen": 734816, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.1731234866828086, |
| "grad_norm": 0.5697582364082336, |
| "learning_rate": 4.7937231090700516e-05, |
| "loss": 0.3318, |
| "num_input_tokens_seen": 736800, |
| "step": 1795 |
| }, |
| { |
| "epoch": 2.179176755447942, |
| "grad_norm": 0.288495808839798, |
| "learning_rate": 4.7916168304239496e-05, |
| "loss": 0.3209, |
| "num_input_tokens_seen": 738944, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.185230024213075, |
| "grad_norm": 0.2042865753173828, |
| "learning_rate": 4.789500320344435e-05, |
| "loss": 0.4533, |
| "num_input_tokens_seen": 741184, |
| "step": 1805 |
| }, |
| { |
| "epoch": 2.1912832929782082, |
| "grad_norm": 0.2230595350265503, |
| "learning_rate": 4.787373588281138e-05, |
| "loss": 0.3429, |
| "num_input_tokens_seen": 743136, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.1973365617433416, |
| "grad_norm": 0.49934640526771545, |
| "learning_rate": 4.785236643729327e-05, |
| "loss": 0.3595, |
| "num_input_tokens_seen": 745152, |
| "step": 1815 |
| }, |
| { |
| "epoch": 2.2033898305084745, |
| "grad_norm": 0.22397416830062866, |
| "learning_rate": 4.7830894962298675e-05, |
| "loss": 0.4056, |
| "num_input_tokens_seen": 747200, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.209443099273608, |
| "grad_norm": 0.44224563241004944, |
| "learning_rate": 4.7809321553691764e-05, |
| "loss": 0.4471, |
| "num_input_tokens_seen": 749344, |
| "step": 1825 |
| }, |
| { |
| "epoch": 2.2154963680387407, |
| "grad_norm": 0.4338313341140747, |
| "learning_rate": 4.778764630779183e-05, |
| "loss": 0.4444, |
| "num_input_tokens_seen": 751360, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.221549636803874, |
| "grad_norm": 0.30568885803222656, |
| "learning_rate": 4.7765869321372836e-05, |
| "loss": 0.4138, |
| "num_input_tokens_seen": 753408, |
| "step": 1835 |
| }, |
| { |
| "epoch": 2.2276029055690074, |
| "grad_norm": 0.42496392130851746, |
| "learning_rate": 4.774399069166296e-05, |
| "loss": 0.3409, |
| "num_input_tokens_seen": 755456, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.2336561743341403, |
| "grad_norm": 0.7065222859382629, |
| "learning_rate": 4.772201051634426e-05, |
| "loss": 0.5082, |
| "num_input_tokens_seen": 757504, |
| "step": 1845 |
| }, |
| { |
| "epoch": 2.2397094430992737, |
| "grad_norm": 0.5929968357086182, |
| "learning_rate": 4.769992889355208e-05, |
| "loss": 0.4972, |
| "num_input_tokens_seen": 759584, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.2457627118644066, |
| "grad_norm": 0.30461207032203674, |
| "learning_rate": 4.767774592187475e-05, |
| "loss": 0.3155, |
| "num_input_tokens_seen": 761632, |
| "step": 1855 |
| }, |
| { |
| "epoch": 2.25181598062954, |
| "grad_norm": 0.366491436958313, |
| "learning_rate": 4.76554617003531e-05, |
| "loss": 0.4084, |
| "num_input_tokens_seen": 763648, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.2578692493946733, |
| "grad_norm": 0.4429326355457306, |
| "learning_rate": 4.763307632847998e-05, |
| "loss": 0.3694, |
| "num_input_tokens_seen": 765696, |
| "step": 1865 |
| }, |
| { |
| "epoch": 2.263922518159806, |
| "grad_norm": 0.5864928364753723, |
| "learning_rate": 4.761058990619986e-05, |
| "loss": 0.4321, |
| "num_input_tokens_seen": 767776, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.2699757869249395, |
| "grad_norm": 0.5651528239250183, |
| "learning_rate": 4.7588002533908405e-05, |
| "loss": 0.3283, |
| "num_input_tokens_seen": 769856, |
| "step": 1875 |
| }, |
| { |
| "epoch": 2.2760290556900724, |
| "grad_norm": 0.42014461755752563, |
| "learning_rate": 4.756531431245195e-05, |
| "loss": 0.3513, |
| "num_input_tokens_seen": 771904, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.2820823244552058, |
| "grad_norm": 0.3946358859539032, |
| "learning_rate": 4.75425253431271e-05, |
| "loss": 0.323, |
| "num_input_tokens_seen": 774016, |
| "step": 1885 |
| }, |
| { |
| "epoch": 2.288135593220339, |
| "grad_norm": 0.5422669649124146, |
| "learning_rate": 4.7519635727680286e-05, |
| "loss": 0.4076, |
| "num_input_tokens_seen": 776000, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.294188861985472, |
| "grad_norm": 0.4705069959163666, |
| "learning_rate": 4.749664556830731e-05, |
| "loss": 0.3228, |
| "num_input_tokens_seen": 778208, |
| "step": 1895 |
| }, |
| { |
| "epoch": 2.3002421307506054, |
| "grad_norm": 0.270033597946167, |
| "learning_rate": 4.747355496765283e-05, |
| "loss": 0.364, |
| "num_input_tokens_seen": 780128, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.3062953995157383, |
| "grad_norm": 0.3493155837059021, |
| "learning_rate": 4.745036402880999e-05, |
| "loss": 0.2773, |
| "num_input_tokens_seen": 782112, |
| "step": 1905 |
| }, |
| { |
| "epoch": 2.3123486682808716, |
| "grad_norm": 0.44850051403045654, |
| "learning_rate": 4.7427072855319886e-05, |
| "loss": 0.4591, |
| "num_input_tokens_seen": 784256, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.318401937046005, |
| "grad_norm": 0.34571966528892517, |
| "learning_rate": 4.740368155117116e-05, |
| "loss": 0.3282, |
| "num_input_tokens_seen": 786272, |
| "step": 1915 |
| }, |
| { |
| "epoch": 2.324455205811138, |
| "grad_norm": 0.27352625131607056, |
| "learning_rate": 4.7380190220799484e-05, |
| "loss": 0.2587, |
| "num_input_tokens_seen": 788352, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.330508474576271, |
| "grad_norm": 0.3315185308456421, |
| "learning_rate": 4.735659896908713e-05, |
| "loss": 0.3131, |
| "num_input_tokens_seen": 790368, |
| "step": 1925 |
| }, |
| { |
| "epoch": 2.3365617433414045, |
| "grad_norm": 0.1994757354259491, |
| "learning_rate": 4.73329079013625e-05, |
| "loss": 0.3382, |
| "num_input_tokens_seen": 792320, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.3426150121065374, |
| "grad_norm": 0.2385847270488739, |
| "learning_rate": 4.730911712339964e-05, |
| "loss": 0.321, |
| "num_input_tokens_seen": 794272, |
| "step": 1935 |
| }, |
| { |
| "epoch": 2.348668280871671, |
| "grad_norm": 0.41050824522972107, |
| "learning_rate": 4.728522674141776e-05, |
| "loss": 0.4058, |
| "num_input_tokens_seen": 796416, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.3547215496368037, |
| "grad_norm": 0.3930312693119049, |
| "learning_rate": 4.7261236862080805e-05, |
| "loss": 0.372, |
| "num_input_tokens_seen": 798432, |
| "step": 1945 |
| }, |
| { |
| "epoch": 2.360774818401937, |
| "grad_norm": 0.42457321286201477, |
| "learning_rate": 4.723714759249692e-05, |
| "loss": 0.3736, |
| "num_input_tokens_seen": 800480, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.3668280871670704, |
| "grad_norm": 0.3120960593223572, |
| "learning_rate": 4.721295904021802e-05, |
| "loss": 0.3412, |
| "num_input_tokens_seen": 802432, |
| "step": 1955 |
| }, |
| { |
| "epoch": 2.3728813559322033, |
| "grad_norm": 0.28398585319519043, |
| "learning_rate": 4.718867131323927e-05, |
| "loss": 0.373, |
| "num_input_tokens_seen": 804352, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.3789346246973366, |
| "grad_norm": 0.5278405547142029, |
| "learning_rate": 4.7164284519998644e-05, |
| "loss": 0.4249, |
| "num_input_tokens_seen": 806272, |
| "step": 1965 |
| }, |
| { |
| "epoch": 2.38498789346247, |
| "grad_norm": 0.417784184217453, |
| "learning_rate": 4.71397987693764e-05, |
| "loss": 0.309, |
| "num_input_tokens_seen": 808352, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.391041162227603, |
| "grad_norm": 0.4515986442565918, |
| "learning_rate": 4.711521417069462e-05, |
| "loss": 0.267, |
| "num_input_tokens_seen": 810304, |
| "step": 1975 |
| }, |
| { |
| "epoch": 2.3970944309927362, |
| "grad_norm": 0.3050227463245392, |
| "learning_rate": 4.709053083371672e-05, |
| "loss": 0.309, |
| "num_input_tokens_seen": 812384, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.403147699757869, |
| "grad_norm": 0.2810879051685333, |
| "learning_rate": 4.706574886864696e-05, |
| "loss": 0.3117, |
| "num_input_tokens_seen": 814400, |
| "step": 1985 |
| }, |
| { |
| "epoch": 2.4092009685230025, |
| "grad_norm": 0.38138630986213684, |
| "learning_rate": 4.7040868386129935e-05, |
| "loss": 0.2702, |
| "num_input_tokens_seen": 816448, |
| "step": 1990 |
| }, |
| { |
| "epoch": 2.415254237288136, |
| "grad_norm": 0.2792954444885254, |
| "learning_rate": 4.701588949725009e-05, |
| "loss": 0.3046, |
| "num_input_tokens_seen": 818624, |
| "step": 1995 |
| }, |
| { |
| "epoch": 2.4213075060532687, |
| "grad_norm": 0.2553127110004425, |
| "learning_rate": 4.699081231353124e-05, |
| "loss": 0.3271, |
| "num_input_tokens_seen": 820640, |
| "step": 2000 |
| }, |
| { |
| "epoch": 2.427360774818402, |
| "grad_norm": 0.33620360493659973, |
| "learning_rate": 4.696563694693605e-05, |
| "loss": 0.3885, |
| "num_input_tokens_seen": 822752, |
| "step": 2005 |
| }, |
| { |
| "epoch": 2.433414043583535, |
| "grad_norm": 0.14532940089702606, |
| "learning_rate": 4.694036350986556e-05, |
| "loss": 0.3864, |
| "num_input_tokens_seen": 824832, |
| "step": 2010 |
| }, |
| { |
| "epoch": 2.4394673123486683, |
| "grad_norm": 0.32560446858406067, |
| "learning_rate": 4.6914992115158634e-05, |
| "loss": 0.3715, |
| "num_input_tokens_seen": 826816, |
| "step": 2015 |
| }, |
| { |
| "epoch": 2.4455205811138017, |
| "grad_norm": 0.4040687084197998, |
| "learning_rate": 4.688952287609152e-05, |
| "loss": 0.346, |
| "num_input_tokens_seen": 828672, |
| "step": 2020 |
| }, |
| { |
| "epoch": 2.4515738498789346, |
| "grad_norm": 0.29258260130882263, |
| "learning_rate": 4.686395590637732e-05, |
| "loss": 0.2396, |
| "num_input_tokens_seen": 830752, |
| "step": 2025 |
| }, |
| { |
| "epoch": 2.457627118644068, |
| "grad_norm": 0.4037116467952728, |
| "learning_rate": 4.683829132016544e-05, |
| "loss": 0.3272, |
| "num_input_tokens_seen": 832832, |
| "step": 2030 |
| }, |
| { |
| "epoch": 2.463680387409201, |
| "grad_norm": 0.27900296449661255, |
| "learning_rate": 4.6812529232041144e-05, |
| "loss": 0.3394, |
| "num_input_tokens_seen": 834848, |
| "step": 2035 |
| }, |
| { |
| "epoch": 2.469733656174334, |
| "grad_norm": 0.31596145033836365, |
| "learning_rate": 4.6786669757025016e-05, |
| "loss": 0.3296, |
| "num_input_tokens_seen": 836864, |
| "step": 2040 |
| }, |
| { |
| "epoch": 2.4757869249394675, |
| "grad_norm": 0.3748748004436493, |
| "learning_rate": 4.676071301057243e-05, |
| "loss": 0.3517, |
| "num_input_tokens_seen": 838784, |
| "step": 2045 |
| }, |
| { |
| "epoch": 2.4818401937046004, |
| "grad_norm": 0.30301108956336975, |
| "learning_rate": 4.673465910857306e-05, |
| "loss": 0.2382, |
| "num_input_tokens_seen": 840832, |
| "step": 2050 |
| }, |
| { |
| "epoch": 2.4878934624697338, |
| "grad_norm": 0.29738783836364746, |
| "learning_rate": 4.670850816735035e-05, |
| "loss": 0.2785, |
| "num_input_tokens_seen": 842752, |
| "step": 2055 |
| }, |
| { |
| "epoch": 2.4939467312348667, |
| "grad_norm": 0.41478708386421204, |
| "learning_rate": 4.668226030366101e-05, |
| "loss": 0.3304, |
| "num_input_tokens_seen": 844928, |
| "step": 2060 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.34825223684310913, |
| "learning_rate": 4.665591563469445e-05, |
| "loss": 0.3105, |
| "num_input_tokens_seen": 847104, |
| "step": 2065 |
| }, |
| { |
| "epoch": 2.5, |
| "eval_loss": 0.30181387066841125, |
| "eval_runtime": 7.655, |
| "eval_samples_per_second": 47.943, |
| "eval_steps_per_second": 12.018, |
| "num_input_tokens_seen": 847104, |
| "step": 2065 |
| }, |
| { |
| "epoch": 2.5060532687651333, |
| "grad_norm": 0.31229767203330994, |
| "learning_rate": 4.662947427807231e-05, |
| "loss": 0.3219, |
| "num_input_tokens_seen": 849120, |
| "step": 2070 |
| }, |
| { |
| "epoch": 2.5121065375302662, |
| "grad_norm": 0.5778428912162781, |
| "learning_rate": 4.6602936351847924e-05, |
| "loss": 0.2441, |
| "num_input_tokens_seen": 851296, |
| "step": 2075 |
| }, |
| { |
| "epoch": 2.5181598062953996, |
| "grad_norm": 0.32353171706199646, |
| "learning_rate": 4.657630197450577e-05, |
| "loss": 0.2938, |
| "num_input_tokens_seen": 853344, |
| "step": 2080 |
| }, |
| { |
| "epoch": 2.5242130750605325, |
| "grad_norm": 0.23973071575164795, |
| "learning_rate": 4.6549571264960945e-05, |
| "loss": 0.3105, |
| "num_input_tokens_seen": 855264, |
| "step": 2085 |
| }, |
| { |
| "epoch": 2.530266343825666, |
| "grad_norm": 0.27126574516296387, |
| "learning_rate": 4.652274434255866e-05, |
| "loss": 0.3286, |
| "num_input_tokens_seen": 857152, |
| "step": 2090 |
| }, |
| { |
| "epoch": 2.536319612590799, |
| "grad_norm": 0.3393484652042389, |
| "learning_rate": 4.6495821327073675e-05, |
| "loss": 0.3151, |
| "num_input_tokens_seen": 859232, |
| "step": 2095 |
| }, |
| { |
| "epoch": 2.542372881355932, |
| "grad_norm": 0.2748543620109558, |
| "learning_rate": 4.6468802338709783e-05, |
| "loss": 0.219, |
| "num_input_tokens_seen": 861312, |
| "step": 2100 |
| }, |
| { |
| "epoch": 2.5484261501210654, |
| "grad_norm": 0.5479225516319275, |
| "learning_rate": 4.644168749809929e-05, |
| "loss": 0.4622, |
| "num_input_tokens_seen": 863552, |
| "step": 2105 |
| }, |
| { |
| "epoch": 2.5544794188861983, |
| "grad_norm": 0.22317218780517578, |
| "learning_rate": 4.6414476926302406e-05, |
| "loss": 0.3443, |
| "num_input_tokens_seen": 865568, |
| "step": 2110 |
| }, |
| { |
| "epoch": 2.5605326876513317, |
| "grad_norm": 0.34723904728889465, |
| "learning_rate": 4.638717074480682e-05, |
| "loss": 0.3002, |
| "num_input_tokens_seen": 867488, |
| "step": 2115 |
| }, |
| { |
| "epoch": 2.566585956416465, |
| "grad_norm": 0.37053370475769043, |
| "learning_rate": 4.6359769075527026e-05, |
| "loss": 0.2777, |
| "num_input_tokens_seen": 869504, |
| "step": 2120 |
| }, |
| { |
| "epoch": 2.572639225181598, |
| "grad_norm": 0.34994322061538696, |
| "learning_rate": 4.6332272040803895e-05, |
| "loss": 0.3066, |
| "num_input_tokens_seen": 871680, |
| "step": 2125 |
| }, |
| { |
| "epoch": 2.5786924939467313, |
| "grad_norm": 0.38928884267807007, |
| "learning_rate": 4.630467976340405e-05, |
| "loss": 0.2616, |
| "num_input_tokens_seen": 873856, |
| "step": 2130 |
| }, |
| { |
| "epoch": 2.584745762711864, |
| "grad_norm": 0.24148716032505035, |
| "learning_rate": 4.6276992366519365e-05, |
| "loss": 0.2632, |
| "num_input_tokens_seen": 875968, |
| "step": 2135 |
| }, |
| { |
| "epoch": 2.5907990314769975, |
| "grad_norm": 0.3962620496749878, |
| "learning_rate": 4.624920997376637e-05, |
| "loss": 0.2491, |
| "num_input_tokens_seen": 878144, |
| "step": 2140 |
| }, |
| { |
| "epoch": 2.596852300242131, |
| "grad_norm": 0.37537842988967896, |
| "learning_rate": 4.622133270918576e-05, |
| "loss": 0.3718, |
| "num_input_tokens_seen": 880320, |
| "step": 2145 |
| }, |
| { |
| "epoch": 2.6029055690072638, |
| "grad_norm": 0.39540189504623413, |
| "learning_rate": 4.619336069724177e-05, |
| "loss": 0.3046, |
| "num_input_tokens_seen": 882400, |
| "step": 2150 |
| }, |
| { |
| "epoch": 2.608958837772397, |
| "grad_norm": 0.2502098083496094, |
| "learning_rate": 4.6165294062821696e-05, |
| "loss": 0.3243, |
| "num_input_tokens_seen": 884512, |
| "step": 2155 |
| }, |
| { |
| "epoch": 2.61501210653753, |
| "grad_norm": 0.4678116738796234, |
| "learning_rate": 4.613713293123525e-05, |
| "loss": 0.3253, |
| "num_input_tokens_seen": 886592, |
| "step": 2160 |
| }, |
| { |
| "epoch": 2.6210653753026634, |
| "grad_norm": 0.2680087387561798, |
| "learning_rate": 4.610887742821408e-05, |
| "loss": 0.2398, |
| "num_input_tokens_seen": 888640, |
| "step": 2165 |
| }, |
| { |
| "epoch": 2.6271186440677967, |
| "grad_norm": 0.2689138650894165, |
| "learning_rate": 4.608052767991118e-05, |
| "loss": 0.2988, |
| "num_input_tokens_seen": 890880, |
| "step": 2170 |
| }, |
| { |
| "epoch": 2.6331719128329296, |
| "grad_norm": 0.39841464161872864, |
| "learning_rate": 4.605208381290029e-05, |
| "loss": 0.3057, |
| "num_input_tokens_seen": 892832, |
| "step": 2175 |
| }, |
| { |
| "epoch": 2.639225181598063, |
| "grad_norm": 0.6183250546455383, |
| "learning_rate": 4.6023545954175406e-05, |
| "loss": 0.2699, |
| "num_input_tokens_seen": 894912, |
| "step": 2180 |
| }, |
| { |
| "epoch": 2.645278450363196, |
| "grad_norm": 0.9258324503898621, |
| "learning_rate": 4.599491423115014e-05, |
| "loss": 0.3599, |
| "num_input_tokens_seen": 896960, |
| "step": 2185 |
| }, |
| { |
| "epoch": 2.651331719128329, |
| "grad_norm": 0.26727059483528137, |
| "learning_rate": 4.59661887716572e-05, |
| "loss": 0.3234, |
| "num_input_tokens_seen": 898816, |
| "step": 2190 |
| }, |
| { |
| "epoch": 2.6573849878934626, |
| "grad_norm": 0.37612417340278625, |
| "learning_rate": 4.5937369703947785e-05, |
| "loss": 0.252, |
| "num_input_tokens_seen": 900832, |
| "step": 2195 |
| }, |
| { |
| "epoch": 2.663438256658596, |
| "grad_norm": 0.46908697485923767, |
| "learning_rate": 4.590845715669104e-05, |
| "loss": 0.2772, |
| "num_input_tokens_seen": 902944, |
| "step": 2200 |
| }, |
| { |
| "epoch": 2.669491525423729, |
| "grad_norm": 0.4039635956287384, |
| "learning_rate": 4.5879451258973465e-05, |
| "loss": 0.3309, |
| "num_input_tokens_seen": 905056, |
| "step": 2205 |
| }, |
| { |
| "epoch": 2.6755447941888617, |
| "grad_norm": 0.24156828224658966, |
| "learning_rate": 4.5850352140298356e-05, |
| "loss": 0.2873, |
| "num_input_tokens_seen": 907232, |
| "step": 2210 |
| }, |
| { |
| "epoch": 2.681598062953995, |
| "grad_norm": 0.29607707262039185, |
| "learning_rate": 4.582115993058519e-05, |
| "loss": 0.2946, |
| "num_input_tokens_seen": 909248, |
| "step": 2215 |
| }, |
| { |
| "epoch": 2.6876513317191284, |
| "grad_norm": 0.2865666449069977, |
| "learning_rate": 4.5791874760169095e-05, |
| "loss": 0.2863, |
| "num_input_tokens_seen": 911264, |
| "step": 2220 |
| }, |
| { |
| "epoch": 2.6937046004842617, |
| "grad_norm": 0.2770346701145172, |
| "learning_rate": 4.5762496759800246e-05, |
| "loss": 0.2555, |
| "num_input_tokens_seen": 913216, |
| "step": 2225 |
| }, |
| { |
| "epoch": 2.6997578692493946, |
| "grad_norm": 0.2658018469810486, |
| "learning_rate": 4.573302606064324e-05, |
| "loss": 0.235, |
| "num_input_tokens_seen": 915328, |
| "step": 2230 |
| }, |
| { |
| "epoch": 2.705811138014528, |
| "grad_norm": 0.3834080100059509, |
| "learning_rate": 4.5703462794276574e-05, |
| "loss": 0.2784, |
| "num_input_tokens_seen": 917568, |
| "step": 2235 |
| }, |
| { |
| "epoch": 2.711864406779661, |
| "grad_norm": 0.7092241048812866, |
| "learning_rate": 4.567380709269205e-05, |
| "loss": 0.3431, |
| "num_input_tokens_seen": 919424, |
| "step": 2240 |
| }, |
| { |
| "epoch": 2.7179176755447942, |
| "grad_norm": 0.3505384922027588, |
| "learning_rate": 4.5644059088294145e-05, |
| "loss": 0.2031, |
| "num_input_tokens_seen": 921376, |
| "step": 2245 |
| }, |
| { |
| "epoch": 2.7239709443099276, |
| "grad_norm": 0.3387749195098877, |
| "learning_rate": 4.561421891389943e-05, |
| "loss": 0.3005, |
| "num_input_tokens_seen": 923456, |
| "step": 2250 |
| }, |
| { |
| "epoch": 2.7300242130750605, |
| "grad_norm": 0.6630310416221619, |
| "learning_rate": 4.558428670273601e-05, |
| "loss": 0.3047, |
| "num_input_tokens_seen": 925504, |
| "step": 2255 |
| }, |
| { |
| "epoch": 2.736077481840194, |
| "grad_norm": 0.5666266679763794, |
| "learning_rate": 4.555426258844292e-05, |
| "loss": 0.2098, |
| "num_input_tokens_seen": 927456, |
| "step": 2260 |
| }, |
| { |
| "epoch": 2.7421307506053267, |
| "grad_norm": 0.31670263409614563, |
| "learning_rate": 4.552414670506949e-05, |
| "loss": 0.2569, |
| "num_input_tokens_seen": 929440, |
| "step": 2265 |
| }, |
| { |
| "epoch": 2.74818401937046, |
| "grad_norm": 0.37908151745796204, |
| "learning_rate": 4.5493939187074784e-05, |
| "loss": 0.2991, |
| "num_input_tokens_seen": 931712, |
| "step": 2270 |
| }, |
| { |
| "epoch": 2.7542372881355934, |
| "grad_norm": 0.49768969416618347, |
| "learning_rate": 4.5463640169326994e-05, |
| "loss": 0.2594, |
| "num_input_tokens_seen": 933664, |
| "step": 2275 |
| }, |
| { |
| "epoch": 2.7602905569007263, |
| "grad_norm": 0.3185713589191437, |
| "learning_rate": 4.5433249787102816e-05, |
| "loss": 0.2381, |
| "num_input_tokens_seen": 935840, |
| "step": 2280 |
| }, |
| { |
| "epoch": 2.7663438256658597, |
| "grad_norm": 0.2714245915412903, |
| "learning_rate": 4.54027681760869e-05, |
| "loss": 0.2479, |
| "num_input_tokens_seen": 937920, |
| "step": 2285 |
| }, |
| { |
| "epoch": 2.7723970944309926, |
| "grad_norm": 0.3374306857585907, |
| "learning_rate": 4.537219547237115e-05, |
| "loss": 0.3209, |
| "num_input_tokens_seen": 940160, |
| "step": 2290 |
| }, |
| { |
| "epoch": 2.778450363196126, |
| "grad_norm": 0.14487412571907043, |
| "learning_rate": 4.5341531812454234e-05, |
| "loss": 0.3606, |
| "num_input_tokens_seen": 942304, |
| "step": 2295 |
| }, |
| { |
| "epoch": 2.7845036319612593, |
| "grad_norm": 0.287188321352005, |
| "learning_rate": 4.5310777333240885e-05, |
| "loss": 0.2898, |
| "num_input_tokens_seen": 944288, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.790556900726392, |
| "grad_norm": 0.4892873466014862, |
| "learning_rate": 4.52799321720413e-05, |
| "loss": 0.2616, |
| "num_input_tokens_seen": 946368, |
| "step": 2305 |
| }, |
| { |
| "epoch": 2.7966101694915255, |
| "grad_norm": 0.14393118023872375, |
| "learning_rate": 4.524899646657059e-05, |
| "loss": 0.2497, |
| "num_input_tokens_seen": 948512, |
| "step": 2310 |
| }, |
| { |
| "epoch": 2.8026634382566584, |
| "grad_norm": 0.24992553889751434, |
| "learning_rate": 4.521797035494809e-05, |
| "loss": 0.2415, |
| "num_input_tokens_seen": 950624, |
| "step": 2315 |
| }, |
| { |
| "epoch": 2.8087167070217918, |
| "grad_norm": 0.2596166431903839, |
| "learning_rate": 4.5186853975696775e-05, |
| "loss": 0.2845, |
| "num_input_tokens_seen": 952640, |
| "step": 2320 |
| }, |
| { |
| "epoch": 2.814769975786925, |
| "grad_norm": 0.5178759694099426, |
| "learning_rate": 4.515564746774265e-05, |
| "loss": 0.2581, |
| "num_input_tokens_seen": 954656, |
| "step": 2325 |
| }, |
| { |
| "epoch": 2.820823244552058, |
| "grad_norm": 0.29620280861854553, |
| "learning_rate": 4.512435097041412e-05, |
| "loss": 0.2705, |
| "num_input_tokens_seen": 956736, |
| "step": 2330 |
| }, |
| { |
| "epoch": 2.8268765133171914, |
| "grad_norm": 0.23814593255519867, |
| "learning_rate": 4.509296462344136e-05, |
| "loss": 0.2684, |
| "num_input_tokens_seen": 958816, |
| "step": 2335 |
| }, |
| { |
| "epoch": 2.8329297820823243, |
| "grad_norm": 0.20183499157428741, |
| "learning_rate": 4.50614885669557e-05, |
| "loss": 0.3398, |
| "num_input_tokens_seen": 960800, |
| "step": 2340 |
| }, |
| { |
| "epoch": 2.8389830508474576, |
| "grad_norm": 0.516444981098175, |
| "learning_rate": 4.5029922941489e-05, |
| "loss": 0.2652, |
| "num_input_tokens_seen": 962976, |
| "step": 2345 |
| }, |
| { |
| "epoch": 2.845036319612591, |
| "grad_norm": 0.2504444420337677, |
| "learning_rate": 4.499826788797302e-05, |
| "loss": 0.2801, |
| "num_input_tokens_seen": 965088, |
| "step": 2350 |
| }, |
| { |
| "epoch": 2.851089588377724, |
| "grad_norm": 0.3967917561531067, |
| "learning_rate": 4.49665235477388e-05, |
| "loss": 0.244, |
| "num_input_tokens_seen": 967136, |
| "step": 2355 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.3460484445095062, |
| "learning_rate": 4.493469006251601e-05, |
| "loss": 0.2949, |
| "num_input_tokens_seen": 969248, |
| "step": 2360 |
| }, |
| { |
| "epoch": 2.86319612590799, |
| "grad_norm": 0.308718740940094, |
| "learning_rate": 4.490276757443233e-05, |
| "loss": 0.3074, |
| "num_input_tokens_seen": 971360, |
| "step": 2365 |
| }, |
| { |
| "epoch": 2.8692493946731235, |
| "grad_norm": 0.5150527954101562, |
| "learning_rate": 4.487075622601281e-05, |
| "loss": 0.2457, |
| "num_input_tokens_seen": 973408, |
| "step": 2370 |
| }, |
| { |
| "epoch": 2.875302663438257, |
| "grad_norm": 0.27302834391593933, |
| "learning_rate": 4.483865616017924e-05, |
| "loss": 0.2176, |
| "num_input_tokens_seen": 975392, |
| "step": 2375 |
| }, |
| { |
| "epoch": 2.8813559322033897, |
| "grad_norm": 0.16442711651325226, |
| "learning_rate": 4.480646752024951e-05, |
| "loss": 0.2374, |
| "num_input_tokens_seen": 977376, |
| "step": 2380 |
| }, |
| { |
| "epoch": 2.887409200968523, |
| "grad_norm": 0.20484280586242676, |
| "learning_rate": 4.477419044993697e-05, |
| "loss": 0.2465, |
| "num_input_tokens_seen": 979328, |
| "step": 2385 |
| }, |
| { |
| "epoch": 2.893462469733656, |
| "grad_norm": 0.31378448009490967, |
| "learning_rate": 4.474182509334978e-05, |
| "loss": 0.2673, |
| "num_input_tokens_seen": 981376, |
| "step": 2390 |
| }, |
| { |
| "epoch": 2.8995157384987893, |
| "grad_norm": 0.2574687600135803, |
| "learning_rate": 4.470937159499029e-05, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 983360, |
| "step": 2395 |
| }, |
| { |
| "epoch": 2.9055690072639226, |
| "grad_norm": 0.36439743638038635, |
| "learning_rate": 4.467683009975435e-05, |
| "loss": 0.2959, |
| "num_input_tokens_seen": 985408, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.9116222760290555, |
| "grad_norm": 0.30939945578575134, |
| "learning_rate": 4.464420075293072e-05, |
| "loss": 0.2785, |
| "num_input_tokens_seen": 987584, |
| "step": 2405 |
| }, |
| { |
| "epoch": 2.917675544794189, |
| "grad_norm": 0.5510197877883911, |
| "learning_rate": 4.4611483700200374e-05, |
| "loss": 0.3106, |
| "num_input_tokens_seen": 989632, |
| "step": 2410 |
| }, |
| { |
| "epoch": 2.923728813559322, |
| "grad_norm": 0.4339779019355774, |
| "learning_rate": 4.457867908763589e-05, |
| "loss": 0.1908, |
| "num_input_tokens_seen": 991680, |
| "step": 2415 |
| }, |
| { |
| "epoch": 2.929782082324455, |
| "grad_norm": 0.23311731219291687, |
| "learning_rate": 4.454578706170075e-05, |
| "loss": 0.1806, |
| "num_input_tokens_seen": 993824, |
| "step": 2420 |
| }, |
| { |
| "epoch": 2.9358353510895885, |
| "grad_norm": 0.17986497282981873, |
| "learning_rate": 4.4512807769248723e-05, |
| "loss": 0.2348, |
| "num_input_tokens_seen": 995904, |
| "step": 2425 |
| }, |
| { |
| "epoch": 2.9418886198547214, |
| "grad_norm": 0.16040824353694916, |
| "learning_rate": 4.447974135752321e-05, |
| "loss": 0.2326, |
| "num_input_tokens_seen": 998080, |
| "step": 2430 |
| }, |
| { |
| "epoch": 2.9479418886198547, |
| "grad_norm": 0.39264827966690063, |
| "learning_rate": 4.444658797415656e-05, |
| "loss": 0.252, |
| "num_input_tokens_seen": 1000160, |
| "step": 2435 |
| }, |
| { |
| "epoch": 2.9539951573849876, |
| "grad_norm": 0.22295121848583221, |
| "learning_rate": 4.441334776716944e-05, |
| "loss": 0.2739, |
| "num_input_tokens_seen": 1002368, |
| "step": 2440 |
| }, |
| { |
| "epoch": 2.960048426150121, |
| "grad_norm": 0.264324814081192, |
| "learning_rate": 4.438002088497015e-05, |
| "loss": 0.2678, |
| "num_input_tokens_seen": 1004448, |
| "step": 2445 |
| }, |
| { |
| "epoch": 2.9661016949152543, |
| "grad_norm": 0.3659127652645111, |
| "learning_rate": 4.434660747635396e-05, |
| "loss": 0.2666, |
| "num_input_tokens_seen": 1006336, |
| "step": 2450 |
| }, |
| { |
| "epoch": 2.9721549636803877, |
| "grad_norm": 0.26207435131073, |
| "learning_rate": 4.4313107690502485e-05, |
| "loss": 0.2269, |
| "num_input_tokens_seen": 1008416, |
| "step": 2455 |
| }, |
| { |
| "epoch": 2.9782082324455206, |
| "grad_norm": 0.3960884213447571, |
| "learning_rate": 4.427952167698298e-05, |
| "loss": 0.2154, |
| "num_input_tokens_seen": 1010400, |
| "step": 2460 |
| }, |
| { |
| "epoch": 2.9842615012106535, |
| "grad_norm": 0.2359500676393509, |
| "learning_rate": 4.4245849585747654e-05, |
| "loss": 0.2584, |
| "num_input_tokens_seen": 1012320, |
| "step": 2465 |
| }, |
| { |
| "epoch": 2.990314769975787, |
| "grad_norm": 0.2086065411567688, |
| "learning_rate": 4.4212091567133083e-05, |
| "loss": 0.3244, |
| "num_input_tokens_seen": 1014400, |
| "step": 2470 |
| }, |
| { |
| "epoch": 2.99636803874092, |
| "grad_norm": 0.40261396765708923, |
| "learning_rate": 4.417824777185943e-05, |
| "loss": 0.1992, |
| "num_input_tokens_seen": 1016480, |
| "step": 2475 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.24942238628864288, |
| "eval_runtime": 7.6488, |
| "eval_samples_per_second": 47.981, |
| "eval_steps_per_second": 12.028, |
| "num_input_tokens_seen": 1017368, |
| "step": 2478 |
| }, |
| { |
| "epoch": 3.002421307506053, |
| "grad_norm": 0.46520891785621643, |
| "learning_rate": 4.414431835102987e-05, |
| "loss": 0.1979, |
| "num_input_tokens_seen": 1018232, |
| "step": 2480 |
| }, |
| { |
| "epoch": 3.0084745762711864, |
| "grad_norm": 0.4228960871696472, |
| "learning_rate": 4.411030345612984e-05, |
| "loss": 0.3344, |
| "num_input_tokens_seen": 1020280, |
| "step": 2485 |
| }, |
| { |
| "epoch": 3.0145278450363198, |
| "grad_norm": 0.2000986933708191, |
| "learning_rate": 4.407620323902643e-05, |
| "loss": 0.2287, |
| "num_input_tokens_seen": 1022328, |
| "step": 2490 |
| }, |
| { |
| "epoch": 3.0205811138014527, |
| "grad_norm": 0.2613808214664459, |
| "learning_rate": 4.404201785196762e-05, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 1024312, |
| "step": 2495 |
| }, |
| { |
| "epoch": 3.026634382566586, |
| "grad_norm": 0.28690946102142334, |
| "learning_rate": 4.400774744758171e-05, |
| "loss": 0.2868, |
| "num_input_tokens_seen": 1026392, |
| "step": 2500 |
| }, |
| { |
| "epoch": 3.032687651331719, |
| "grad_norm": 0.32995086908340454, |
| "learning_rate": 4.397339217887652e-05, |
| "loss": 0.1902, |
| "num_input_tokens_seen": 1028344, |
| "step": 2505 |
| }, |
| { |
| "epoch": 3.0387409200968523, |
| "grad_norm": 0.28165653347969055, |
| "learning_rate": 4.393895219923881e-05, |
| "loss": 0.2976, |
| "num_input_tokens_seen": 1030520, |
| "step": 2510 |
| }, |
| { |
| "epoch": 3.0447941888619856, |
| "grad_norm": 0.3806597590446472, |
| "learning_rate": 4.3904427662433534e-05, |
| "loss": 0.2382, |
| "num_input_tokens_seen": 1032728, |
| "step": 2515 |
| }, |
| { |
| "epoch": 3.0508474576271185, |
| "grad_norm": 0.2645051181316376, |
| "learning_rate": 4.386981872260317e-05, |
| "loss": 0.293, |
| "num_input_tokens_seen": 1034808, |
| "step": 2520 |
| }, |
| { |
| "epoch": 3.056900726392252, |
| "grad_norm": 0.40518951416015625, |
| "learning_rate": 4.383512553426703e-05, |
| "loss": 0.2884, |
| "num_input_tokens_seen": 1036888, |
| "step": 2525 |
| }, |
| { |
| "epoch": 3.062953995157385, |
| "grad_norm": 0.40520647168159485, |
| "learning_rate": 4.380034825232059e-05, |
| "loss": 0.2538, |
| "num_input_tokens_seen": 1039000, |
| "step": 2530 |
| }, |
| { |
| "epoch": 3.069007263922518, |
| "grad_norm": 0.3130396008491516, |
| "learning_rate": 4.376548703203474e-05, |
| "loss": 0.2188, |
| "num_input_tokens_seen": 1040984, |
| "step": 2535 |
| }, |
| { |
| "epoch": 3.0750605326876514, |
| "grad_norm": 0.40585508942604065, |
| "learning_rate": 4.3730542029055174e-05, |
| "loss": 0.304, |
| "num_input_tokens_seen": 1043000, |
| "step": 2540 |
| }, |
| { |
| "epoch": 3.0811138014527844, |
| "grad_norm": 0.5657253265380859, |
| "learning_rate": 4.3695513399401646e-05, |
| "loss": 0.2721, |
| "num_input_tokens_seen": 1044984, |
| "step": 2545 |
| }, |
| { |
| "epoch": 3.0871670702179177, |
| "grad_norm": 0.4377957284450531, |
| "learning_rate": 4.366040129946725e-05, |
| "loss": 0.2414, |
| "num_input_tokens_seen": 1047096, |
| "step": 2550 |
| }, |
| { |
| "epoch": 3.093220338983051, |
| "grad_norm": 0.457617849111557, |
| "learning_rate": 4.362520588601777e-05, |
| "loss": 0.2374, |
| "num_input_tokens_seen": 1049208, |
| "step": 2555 |
| }, |
| { |
| "epoch": 3.099273607748184, |
| "grad_norm": 0.2504066824913025, |
| "learning_rate": 4.3589927316190983e-05, |
| "loss": 0.2801, |
| "num_input_tokens_seen": 1051192, |
| "step": 2560 |
| }, |
| { |
| "epoch": 3.1053268765133173, |
| "grad_norm": 0.2995288074016571, |
| "learning_rate": 4.35545657474959e-05, |
| "loss": 0.2118, |
| "num_input_tokens_seen": 1053240, |
| "step": 2565 |
| }, |
| { |
| "epoch": 3.11138014527845, |
| "grad_norm": 0.42375728487968445, |
| "learning_rate": 4.351912133781213e-05, |
| "loss": 0.2382, |
| "num_input_tokens_seen": 1055384, |
| "step": 2570 |
| }, |
| { |
| "epoch": 3.1174334140435835, |
| "grad_norm": 0.20157060027122498, |
| "learning_rate": 4.3483594245389106e-05, |
| "loss": 0.2179, |
| "num_input_tokens_seen": 1057464, |
| "step": 2575 |
| }, |
| { |
| "epoch": 3.123486682808717, |
| "grad_norm": 0.15554273128509521, |
| "learning_rate": 4.3447984628845464e-05, |
| "loss": 0.236, |
| "num_input_tokens_seen": 1059736, |
| "step": 2580 |
| }, |
| { |
| "epoch": 3.12953995157385, |
| "grad_norm": 0.30764302611351013, |
| "learning_rate": 4.341229264716825e-05, |
| "loss": 0.2008, |
| "num_input_tokens_seen": 1061656, |
| "step": 2585 |
| }, |
| { |
| "epoch": 3.135593220338983, |
| "grad_norm": 0.2842766344547272, |
| "learning_rate": 4.337651845971227e-05, |
| "loss": 0.2826, |
| "num_input_tokens_seen": 1063736, |
| "step": 2590 |
| }, |
| { |
| "epoch": 3.141646489104116, |
| "grad_norm": 0.4298422932624817, |
| "learning_rate": 4.334066222619933e-05, |
| "loss": 0.2163, |
| "num_input_tokens_seen": 1065720, |
| "step": 2595 |
| }, |
| { |
| "epoch": 3.1476997578692494, |
| "grad_norm": 0.37042006850242615, |
| "learning_rate": 4.3304724106717584e-05, |
| "loss": 0.2486, |
| "num_input_tokens_seen": 1067736, |
| "step": 2600 |
| }, |
| { |
| "epoch": 3.1537530266343827, |
| "grad_norm": 0.2581944465637207, |
| "learning_rate": 4.326870426172075e-05, |
| "loss": 0.2596, |
| "num_input_tokens_seen": 1069848, |
| "step": 2605 |
| }, |
| { |
| "epoch": 3.1598062953995156, |
| "grad_norm": 0.5771628618240356, |
| "learning_rate": 4.323260285202746e-05, |
| "loss": 0.2379, |
| "num_input_tokens_seen": 1071960, |
| "step": 2610 |
| }, |
| { |
| "epoch": 3.165859564164649, |
| "grad_norm": 0.20025081932544708, |
| "learning_rate": 4.3196420038820475e-05, |
| "loss": 0.2452, |
| "num_input_tokens_seen": 1074072, |
| "step": 2615 |
| }, |
| { |
| "epoch": 3.171912832929782, |
| "grad_norm": 0.235732302069664, |
| "learning_rate": 4.316015598364603e-05, |
| "loss": 0.2758, |
| "num_input_tokens_seen": 1076184, |
| "step": 2620 |
| }, |
| { |
| "epoch": 3.1779661016949152, |
| "grad_norm": 0.31593799591064453, |
| "learning_rate": 4.312381084841307e-05, |
| "loss": 0.2636, |
| "num_input_tokens_seen": 1078360, |
| "step": 2625 |
| }, |
| { |
| "epoch": 3.1840193704600486, |
| "grad_norm": 0.20142661035060883, |
| "learning_rate": 4.308738479539254e-05, |
| "loss": 0.2691, |
| "num_input_tokens_seen": 1080440, |
| "step": 2630 |
| }, |
| { |
| "epoch": 3.1900726392251815, |
| "grad_norm": 0.31453707814216614, |
| "learning_rate": 4.305087798721665e-05, |
| "loss": 0.2879, |
| "num_input_tokens_seen": 1082520, |
| "step": 2635 |
| }, |
| { |
| "epoch": 3.196125907990315, |
| "grad_norm": 0.47332870960235596, |
| "learning_rate": 4.30142905868782e-05, |
| "loss": 0.1986, |
| "num_input_tokens_seen": 1084408, |
| "step": 2640 |
| }, |
| { |
| "epoch": 3.2021791767554477, |
| "grad_norm": 0.22829139232635498, |
| "learning_rate": 4.297762275772975e-05, |
| "loss": 0.2329, |
| "num_input_tokens_seen": 1086424, |
| "step": 2645 |
| }, |
| { |
| "epoch": 3.208232445520581, |
| "grad_norm": 0.6322407722473145, |
| "learning_rate": 4.2940874663483005e-05, |
| "loss": 0.2979, |
| "num_input_tokens_seen": 1088440, |
| "step": 2650 |
| }, |
| { |
| "epoch": 3.2142857142857144, |
| "grad_norm": 0.480846643447876, |
| "learning_rate": 4.2904046468208006e-05, |
| "loss": 0.2561, |
| "num_input_tokens_seen": 1090648, |
| "step": 2655 |
| }, |
| { |
| "epoch": 3.2203389830508473, |
| "grad_norm": 0.3780890107154846, |
| "learning_rate": 4.286713833633242e-05, |
| "loss": 0.1881, |
| "num_input_tokens_seen": 1092632, |
| "step": 2660 |
| }, |
| { |
| "epoch": 3.2263922518159807, |
| "grad_norm": 0.2670578360557556, |
| "learning_rate": 4.283015043264084e-05, |
| "loss": 0.2409, |
| "num_input_tokens_seen": 1094648, |
| "step": 2665 |
| }, |
| { |
| "epoch": 3.232445520581114, |
| "grad_norm": 0.3466191291809082, |
| "learning_rate": 4.279308292227396e-05, |
| "loss": 0.2745, |
| "num_input_tokens_seen": 1096728, |
| "step": 2670 |
| }, |
| { |
| "epoch": 3.238498789346247, |
| "grad_norm": 0.4749204218387604, |
| "learning_rate": 4.275593597072796e-05, |
| "loss": 0.1985, |
| "num_input_tokens_seen": 1098872, |
| "step": 2675 |
| }, |
| { |
| "epoch": 3.2445520581113803, |
| "grad_norm": 0.3466539978981018, |
| "learning_rate": 4.2718709743853654e-05, |
| "loss": 0.251, |
| "num_input_tokens_seen": 1101048, |
| "step": 2680 |
| }, |
| { |
| "epoch": 3.250605326876513, |
| "grad_norm": 0.47215142846107483, |
| "learning_rate": 4.268140440785584e-05, |
| "loss": 0.276, |
| "num_input_tokens_seen": 1103320, |
| "step": 2685 |
| }, |
| { |
| "epoch": 3.2566585956416465, |
| "grad_norm": 0.20738205313682556, |
| "learning_rate": 4.264402012929247e-05, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 1105336, |
| "step": 2690 |
| }, |
| { |
| "epoch": 3.26271186440678, |
| "grad_norm": 0.5281394720077515, |
| "learning_rate": 4.2606557075073996e-05, |
| "loss": 0.238, |
| "num_input_tokens_seen": 1107288, |
| "step": 2695 |
| }, |
| { |
| "epoch": 3.2687651331719128, |
| "grad_norm": 0.2263878583908081, |
| "learning_rate": 4.256901541246255e-05, |
| "loss": 0.2733, |
| "num_input_tokens_seen": 1109368, |
| "step": 2700 |
| }, |
| { |
| "epoch": 3.274818401937046, |
| "grad_norm": 0.4091036021709442, |
| "learning_rate": 4.253139530907124e-05, |
| "loss": 0.2031, |
| "num_input_tokens_seen": 1111576, |
| "step": 2705 |
| }, |
| { |
| "epoch": 3.280871670702179, |
| "grad_norm": 0.22964748740196228, |
| "learning_rate": 4.249369693286341e-05, |
| "loss": 0.2798, |
| "num_input_tokens_seen": 1113624, |
| "step": 2710 |
| }, |
| { |
| "epoch": 3.2869249394673123, |
| "grad_norm": 0.36693185567855835, |
| "learning_rate": 4.245592045215182e-05, |
| "loss": 0.2267, |
| "num_input_tokens_seen": 1115512, |
| "step": 2715 |
| }, |
| { |
| "epoch": 3.2929782082324457, |
| "grad_norm": 0.3728233277797699, |
| "learning_rate": 4.2418066035598e-05, |
| "loss": 0.2075, |
| "num_input_tokens_seen": 1117464, |
| "step": 2720 |
| }, |
| { |
| "epoch": 3.2990314769975786, |
| "grad_norm": 0.23038436472415924, |
| "learning_rate": 4.238013385221142e-05, |
| "loss": 0.2456, |
| "num_input_tokens_seen": 1119480, |
| "step": 2725 |
| }, |
| { |
| "epoch": 3.305084745762712, |
| "grad_norm": 0.8414861559867859, |
| "learning_rate": 4.2342124071348744e-05, |
| "loss": 0.1987, |
| "num_input_tokens_seen": 1121496, |
| "step": 2730 |
| }, |
| { |
| "epoch": 3.3111380145278453, |
| "grad_norm": 0.26781532168388367, |
| "learning_rate": 4.230403686271309e-05, |
| "loss": 0.2279, |
| "num_input_tokens_seen": 1123480, |
| "step": 2735 |
| }, |
| { |
| "epoch": 3.317191283292978, |
| "grad_norm": 0.3459651470184326, |
| "learning_rate": 4.2265872396353314e-05, |
| "loss": 0.259, |
| "num_input_tokens_seen": 1125528, |
| "step": 2740 |
| }, |
| { |
| "epoch": 3.3232445520581115, |
| "grad_norm": 0.34573832154273987, |
| "learning_rate": 4.2227630842663136e-05, |
| "loss": 0.2559, |
| "num_input_tokens_seen": 1127512, |
| "step": 2745 |
| }, |
| { |
| "epoch": 3.3292978208232444, |
| "grad_norm": 0.38154226541519165, |
| "learning_rate": 4.21893123723805e-05, |
| "loss": 0.2461, |
| "num_input_tokens_seen": 1129592, |
| "step": 2750 |
| }, |
| { |
| "epoch": 3.335351089588378, |
| "grad_norm": 0.41822800040245056, |
| "learning_rate": 4.2150917156586735e-05, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 1131576, |
| "step": 2755 |
| }, |
| { |
| "epoch": 3.341404358353511, |
| "grad_norm": 0.2876574397087097, |
| "learning_rate": 4.211244536670584e-05, |
| "loss": 0.2166, |
| "num_input_tokens_seen": 1133784, |
| "step": 2760 |
| }, |
| { |
| "epoch": 3.347457627118644, |
| "grad_norm": 0.19482114911079407, |
| "learning_rate": 4.207389717450368e-05, |
| "loss": 0.2443, |
| "num_input_tokens_seen": 1135800, |
| "step": 2765 |
| }, |
| { |
| "epoch": 3.3535108958837774, |
| "grad_norm": 0.4246940314769745, |
| "learning_rate": 4.203527275208723e-05, |
| "loss": 0.2363, |
| "num_input_tokens_seen": 1137784, |
| "step": 2770 |
| }, |
| { |
| "epoch": 3.3595641646489103, |
| "grad_norm": 0.23316825926303864, |
| "learning_rate": 4.199657227190384e-05, |
| "loss": 0.2813, |
| "num_input_tokens_seen": 1139896, |
| "step": 2775 |
| }, |
| { |
| "epoch": 3.3656174334140436, |
| "grad_norm": 0.5419919490814209, |
| "learning_rate": 4.195779590674041e-05, |
| "loss": 0.2422, |
| "num_input_tokens_seen": 1142040, |
| "step": 2780 |
| }, |
| { |
| "epoch": 3.371670702179177, |
| "grad_norm": 0.33602726459503174, |
| "learning_rate": 4.191894382972264e-05, |
| "loss": 0.1776, |
| "num_input_tokens_seen": 1144088, |
| "step": 2785 |
| }, |
| { |
| "epoch": 3.37772397094431, |
| "grad_norm": 0.31846752762794495, |
| "learning_rate": 4.188001621431429e-05, |
| "loss": 0.3204, |
| "num_input_tokens_seen": 1146232, |
| "step": 2790 |
| }, |
| { |
| "epoch": 3.383777239709443, |
| "grad_norm": 0.34241190552711487, |
| "learning_rate": 4.184101323431636e-05, |
| "loss": 0.2267, |
| "num_input_tokens_seen": 1148344, |
| "step": 2795 |
| }, |
| { |
| "epoch": 3.389830508474576, |
| "grad_norm": 0.4527415931224823, |
| "learning_rate": 4.180193506386634e-05, |
| "loss": 0.2485, |
| "num_input_tokens_seen": 1150360, |
| "step": 2800 |
| }, |
| { |
| "epoch": 3.3958837772397095, |
| "grad_norm": 0.23502758145332336, |
| "learning_rate": 4.1762781877437406e-05, |
| "loss": 0.2533, |
| "num_input_tokens_seen": 1152376, |
| "step": 2805 |
| }, |
| { |
| "epoch": 3.401937046004843, |
| "grad_norm": 0.3534802496433258, |
| "learning_rate": 4.172355384983769e-05, |
| "loss": 0.2588, |
| "num_input_tokens_seen": 1154424, |
| "step": 2810 |
| }, |
| { |
| "epoch": 3.4079903147699757, |
| "grad_norm": 0.20341426134109497, |
| "learning_rate": 4.168425115620944e-05, |
| "loss": 0.2147, |
| "num_input_tokens_seen": 1156472, |
| "step": 2815 |
| }, |
| { |
| "epoch": 3.414043583535109, |
| "grad_norm": 0.16827057301998138, |
| "learning_rate": 4.164487397202829e-05, |
| "loss": 0.2492, |
| "num_input_tokens_seen": 1158552, |
| "step": 2820 |
| }, |
| { |
| "epoch": 3.420096852300242, |
| "grad_norm": 0.29268819093704224, |
| "learning_rate": 4.160542247310244e-05, |
| "loss": 0.237, |
| "num_input_tokens_seen": 1160696, |
| "step": 2825 |
| }, |
| { |
| "epoch": 3.4261501210653753, |
| "grad_norm": 0.29612547159194946, |
| "learning_rate": 4.156589683557189e-05, |
| "loss": 0.2695, |
| "num_input_tokens_seen": 1162808, |
| "step": 2830 |
| }, |
| { |
| "epoch": 3.4322033898305087, |
| "grad_norm": 0.2688729166984558, |
| "learning_rate": 4.1526297235907635e-05, |
| "loss": 0.2316, |
| "num_input_tokens_seen": 1164728, |
| "step": 2835 |
| }, |
| { |
| "epoch": 3.4382566585956416, |
| "grad_norm": 0.3905165195465088, |
| "learning_rate": 4.148662385091091e-05, |
| "loss": 0.2335, |
| "num_input_tokens_seen": 1166808, |
| "step": 2840 |
| }, |
| { |
| "epoch": 3.444309927360775, |
| "grad_norm": 0.20383936166763306, |
| "learning_rate": 4.144687685771238e-05, |
| "loss": 0.2363, |
| "num_input_tokens_seen": 1168888, |
| "step": 2845 |
| }, |
| { |
| "epoch": 3.450363196125908, |
| "grad_norm": 0.723222017288208, |
| "learning_rate": 4.140705643377133e-05, |
| "loss": 0.2416, |
| "num_input_tokens_seen": 1170872, |
| "step": 2850 |
| }, |
| { |
| "epoch": 3.456416464891041, |
| "grad_norm": 0.14048586785793304, |
| "learning_rate": 4.1367162756874925e-05, |
| "loss": 0.2231, |
| "num_input_tokens_seen": 1172984, |
| "step": 2855 |
| }, |
| { |
| "epoch": 3.4624697336561745, |
| "grad_norm": 0.46648848056793213, |
| "learning_rate": 4.132719600513734e-05, |
| "loss": 0.2515, |
| "num_input_tokens_seen": 1175032, |
| "step": 2860 |
| }, |
| { |
| "epoch": 3.4685230024213074, |
| "grad_norm": 0.3065139949321747, |
| "learning_rate": 4.128715635699905e-05, |
| "loss": 0.233, |
| "num_input_tokens_seen": 1177240, |
| "step": 2865 |
| }, |
| { |
| "epoch": 3.4745762711864407, |
| "grad_norm": 0.41870254278182983, |
| "learning_rate": 4.124704399122597e-05, |
| "loss": 0.2413, |
| "num_input_tokens_seen": 1179192, |
| "step": 2870 |
| }, |
| { |
| "epoch": 3.4806295399515736, |
| "grad_norm": 0.20896819233894348, |
| "learning_rate": 4.120685908690869e-05, |
| "loss": 0.233, |
| "num_input_tokens_seen": 1181112, |
| "step": 2875 |
| }, |
| { |
| "epoch": 3.486682808716707, |
| "grad_norm": 0.9305875301361084, |
| "learning_rate": 4.1166601823461656e-05, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 1183128, |
| "step": 2880 |
| }, |
| { |
| "epoch": 3.4927360774818403, |
| "grad_norm": 0.18621258437633514, |
| "learning_rate": 4.112627238062239e-05, |
| "loss": 0.2076, |
| "num_input_tokens_seen": 1185240, |
| "step": 2885 |
| }, |
| { |
| "epoch": 3.4987893462469732, |
| "grad_norm": 0.2047727108001709, |
| "learning_rate": 4.1085870938450656e-05, |
| "loss": 0.2427, |
| "num_input_tokens_seen": 1187320, |
| "step": 2890 |
| }, |
| { |
| "epoch": 3.5, |
| "eval_loss": 0.22800089418888092, |
| "eval_runtime": 7.6582, |
| "eval_samples_per_second": 47.923, |
| "eval_steps_per_second": 12.013, |
| "num_input_tokens_seen": 1187704, |
| "step": 2891 |
| }, |
| { |
| "epoch": 3.5048426150121066, |
| "grad_norm": 0.15814676880836487, |
| "learning_rate": 4.1045397677327684e-05, |
| "loss": 0.2445, |
| "num_input_tokens_seen": 1189400, |
| "step": 2895 |
| }, |
| { |
| "epoch": 3.5108958837772395, |
| "grad_norm": 0.28475645184516907, |
| "learning_rate": 4.1004852777955364e-05, |
| "loss": 0.225, |
| "num_input_tokens_seen": 1191384, |
| "step": 2900 |
| }, |
| { |
| "epoch": 3.516949152542373, |
| "grad_norm": 0.593259334564209, |
| "learning_rate": 4.096423642135543e-05, |
| "loss": 0.2577, |
| "num_input_tokens_seen": 1193368, |
| "step": 2905 |
| }, |
| { |
| "epoch": 3.523002421307506, |
| "grad_norm": 0.38055557012557983, |
| "learning_rate": 4.0923548788868625e-05, |
| "loss": 0.1564, |
| "num_input_tokens_seen": 1195512, |
| "step": 2910 |
| }, |
| { |
| "epoch": 3.529055690072639, |
| "grad_norm": 0.21010547876358032, |
| "learning_rate": 4.0882790062153957e-05, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 1197560, |
| "step": 2915 |
| }, |
| { |
| "epoch": 3.5351089588377724, |
| "grad_norm": 0.12626993656158447, |
| "learning_rate": 4.084196042318783e-05, |
| "loss": 0.2022, |
| "num_input_tokens_seen": 1199768, |
| "step": 2920 |
| }, |
| { |
| "epoch": 3.5411622276029053, |
| "grad_norm": 0.445035845041275, |
| "learning_rate": 4.080106005426326e-05, |
| "loss": 0.2392, |
| "num_input_tokens_seen": 1201848, |
| "step": 2925 |
| }, |
| { |
| "epoch": 3.5472154963680387, |
| "grad_norm": 0.34557801485061646, |
| "learning_rate": 4.076008913798903e-05, |
| "loss": 0.2011, |
| "num_input_tokens_seen": 1203896, |
| "step": 2930 |
| }, |
| { |
| "epoch": 3.553268765133172, |
| "grad_norm": 0.2371188998222351, |
| "learning_rate": 4.071904785728894e-05, |
| "loss": 0.2198, |
| "num_input_tokens_seen": 1205880, |
| "step": 2935 |
| }, |
| { |
| "epoch": 3.559322033898305, |
| "grad_norm": 0.6092042922973633, |
| "learning_rate": 4.0677936395400906e-05, |
| "loss": 0.2326, |
| "num_input_tokens_seen": 1207896, |
| "step": 2940 |
| }, |
| { |
| "epoch": 3.5653753026634383, |
| "grad_norm": 0.22718770802021027, |
| "learning_rate": 4.063675493587621e-05, |
| "loss": 0.2946, |
| "num_input_tokens_seen": 1210008, |
| "step": 2945 |
| }, |
| { |
| "epoch": 3.571428571428571, |
| "grad_norm": 0.3553445339202881, |
| "learning_rate": 4.059550366257864e-05, |
| "loss": 0.2138, |
| "num_input_tokens_seen": 1212024, |
| "step": 2950 |
| }, |
| { |
| "epoch": 3.5774818401937045, |
| "grad_norm": 0.2177080363035202, |
| "learning_rate": 4.055418275968368e-05, |
| "loss": 0.2023, |
| "num_input_tokens_seen": 1214040, |
| "step": 2955 |
| }, |
| { |
| "epoch": 3.583535108958838, |
| "grad_norm": 0.37577736377716064, |
| "learning_rate": 4.0512792411677705e-05, |
| "loss": 0.226, |
| "num_input_tokens_seen": 1216088, |
| "step": 2960 |
| }, |
| { |
| "epoch": 3.589588377723971, |
| "grad_norm": 0.38355129957199097, |
| "learning_rate": 4.047133280335713e-05, |
| "loss": 0.2245, |
| "num_input_tokens_seen": 1218136, |
| "step": 2965 |
| }, |
| { |
| "epoch": 3.595641646489104, |
| "grad_norm": 0.44311100244522095, |
| "learning_rate": 4.042980411982762e-05, |
| "loss": 0.2063, |
| "num_input_tokens_seen": 1220248, |
| "step": 2970 |
| }, |
| { |
| "epoch": 3.601694915254237, |
| "grad_norm": 0.17676201462745667, |
| "learning_rate": 4.0388206546503215e-05, |
| "loss": 0.233, |
| "num_input_tokens_seen": 1222360, |
| "step": 2975 |
| }, |
| { |
| "epoch": 3.6077481840193704, |
| "grad_norm": 0.17263735830783844, |
| "learning_rate": 4.0346540269105546e-05, |
| "loss": 0.2547, |
| "num_input_tokens_seen": 1224568, |
| "step": 2980 |
| }, |
| { |
| "epoch": 3.6138014527845037, |
| "grad_norm": 0.4356435239315033, |
| "learning_rate": 4.030480547366297e-05, |
| "loss": 0.2218, |
| "num_input_tokens_seen": 1226648, |
| "step": 2985 |
| }, |
| { |
| "epoch": 3.619854721549637, |
| "grad_norm": 0.4267004728317261, |
| "learning_rate": 4.026300234650979e-05, |
| "loss": 0.25, |
| "num_input_tokens_seen": 1228600, |
| "step": 2990 |
| }, |
| { |
| "epoch": 3.62590799031477, |
| "grad_norm": 0.3298884630203247, |
| "learning_rate": 4.022113107428536e-05, |
| "loss": 0.1997, |
| "num_input_tokens_seen": 1230616, |
| "step": 2995 |
| }, |
| { |
| "epoch": 3.6319612590799033, |
| "grad_norm": 0.41261032223701477, |
| "learning_rate": 4.0179191843933286e-05, |
| "loss": 0.2059, |
| "num_input_tokens_seen": 1232632, |
| "step": 3000 |
| }, |
| { |
| "epoch": 3.638014527845036, |
| "grad_norm": 0.3201148808002472, |
| "learning_rate": 4.013718484270061e-05, |
| "loss": 0.2073, |
| "num_input_tokens_seen": 1234552, |
| "step": 3005 |
| }, |
| { |
| "epoch": 3.6440677966101696, |
| "grad_norm": 0.13799035549163818, |
| "learning_rate": 4.009511025813694e-05, |
| "loss": 0.2202, |
| "num_input_tokens_seen": 1236728, |
| "step": 3010 |
| }, |
| { |
| "epoch": 3.650121065375303, |
| "grad_norm": 0.6029819250106812, |
| "learning_rate": 4.005296827809362e-05, |
| "loss": 0.258, |
| "num_input_tokens_seen": 1238776, |
| "step": 3015 |
| }, |
| { |
| "epoch": 3.656174334140436, |
| "grad_norm": 0.40506264567375183, |
| "learning_rate": 4.001075909072289e-05, |
| "loss": 0.2388, |
| "num_input_tokens_seen": 1240856, |
| "step": 3020 |
| }, |
| { |
| "epoch": 3.662227602905569, |
| "grad_norm": 0.41822484135627747, |
| "learning_rate": 3.9968482884477075e-05, |
| "loss": 0.1788, |
| "num_input_tokens_seen": 1242936, |
| "step": 3025 |
| }, |
| { |
| "epoch": 3.668280871670702, |
| "grad_norm": 0.13896431028842926, |
| "learning_rate": 3.992613984810771e-05, |
| "loss": 0.2036, |
| "num_input_tokens_seen": 1245080, |
| "step": 3030 |
| }, |
| { |
| "epoch": 3.6743341404358354, |
| "grad_norm": 0.41950222849845886, |
| "learning_rate": 3.988373017066469e-05, |
| "loss": 0.1901, |
| "num_input_tokens_seen": 1247192, |
| "step": 3035 |
| }, |
| { |
| "epoch": 3.6803874092009687, |
| "grad_norm": 0.36087945103645325, |
| "learning_rate": 3.984125404149548e-05, |
| "loss": 0.175, |
| "num_input_tokens_seen": 1249240, |
| "step": 3040 |
| }, |
| { |
| "epoch": 3.6864406779661016, |
| "grad_norm": 0.18386933207511902, |
| "learning_rate": 3.9798711650244194e-05, |
| "loss": 0.2577, |
| "num_input_tokens_seen": 1251320, |
| "step": 3045 |
| }, |
| { |
| "epoch": 3.692493946731235, |
| "grad_norm": 0.5813478231430054, |
| "learning_rate": 3.9756103186850825e-05, |
| "loss": 0.2656, |
| "num_input_tokens_seen": 1253336, |
| "step": 3050 |
| }, |
| { |
| "epoch": 3.698547215496368, |
| "grad_norm": 0.25909698009490967, |
| "learning_rate": 3.971342884155033e-05, |
| "loss": 0.2039, |
| "num_input_tokens_seen": 1255352, |
| "step": 3055 |
| }, |
| { |
| "epoch": 3.7046004842615012, |
| "grad_norm": 0.42619478702545166, |
| "learning_rate": 3.9670688804871815e-05, |
| "loss": 0.2081, |
| "num_input_tokens_seen": 1257272, |
| "step": 3060 |
| }, |
| { |
| "epoch": 3.7106537530266346, |
| "grad_norm": 0.2290722280740738, |
| "learning_rate": 3.96278832676377e-05, |
| "loss": 0.1983, |
| "num_input_tokens_seen": 1259416, |
| "step": 3065 |
| }, |
| { |
| "epoch": 3.7167070217917675, |
| "grad_norm": 0.4949394166469574, |
| "learning_rate": 3.958501242096283e-05, |
| "loss": 0.1779, |
| "num_input_tokens_seen": 1261496, |
| "step": 3070 |
| }, |
| { |
| "epoch": 3.722760290556901, |
| "grad_norm": 0.1661137193441391, |
| "learning_rate": 3.954207645625365e-05, |
| "loss": 0.1849, |
| "num_input_tokens_seen": 1263480, |
| "step": 3075 |
| }, |
| { |
| "epoch": 3.7288135593220337, |
| "grad_norm": 0.586648166179657, |
| "learning_rate": 3.949907556520731e-05, |
| "loss": 0.2301, |
| "num_input_tokens_seen": 1265528, |
| "step": 3080 |
| }, |
| { |
| "epoch": 3.734866828087167, |
| "grad_norm": 0.49252766370773315, |
| "learning_rate": 3.9456009939810886e-05, |
| "loss": 0.2263, |
| "num_input_tokens_seen": 1267512, |
| "step": 3085 |
| }, |
| { |
| "epoch": 3.7409200968523004, |
| "grad_norm": 0.3811275362968445, |
| "learning_rate": 3.941287977234043e-05, |
| "loss": 0.1951, |
| "num_input_tokens_seen": 1269560, |
| "step": 3090 |
| }, |
| { |
| "epoch": 3.7469733656174333, |
| "grad_norm": 0.6618674993515015, |
| "learning_rate": 3.9369685255360175e-05, |
| "loss": 0.2639, |
| "num_input_tokens_seen": 1271640, |
| "step": 3095 |
| }, |
| { |
| "epoch": 3.7530266343825667, |
| "grad_norm": 0.2644120156764984, |
| "learning_rate": 3.9326426581721663e-05, |
| "loss": 0.2003, |
| "num_input_tokens_seen": 1273688, |
| "step": 3100 |
| }, |
| { |
| "epoch": 3.7590799031476996, |
| "grad_norm": 0.5133991837501526, |
| "learning_rate": 3.9283103944562874e-05, |
| "loss": 0.21, |
| "num_input_tokens_seen": 1275768, |
| "step": 3105 |
| }, |
| { |
| "epoch": 3.765133171912833, |
| "grad_norm": 0.3209037184715271, |
| "learning_rate": 3.923971753730735e-05, |
| "loss": 0.193, |
| "num_input_tokens_seen": 1277752, |
| "step": 3110 |
| }, |
| { |
| "epoch": 3.7711864406779663, |
| "grad_norm": 0.478312611579895, |
| "learning_rate": 3.919626755366338e-05, |
| "loss": 0.2513, |
| "num_input_tokens_seen": 1279864, |
| "step": 3115 |
| }, |
| { |
| "epoch": 3.777239709443099, |
| "grad_norm": 0.5169130563735962, |
| "learning_rate": 3.9152754187623086e-05, |
| "loss": 0.2899, |
| "num_input_tokens_seen": 1281880, |
| "step": 3120 |
| }, |
| { |
| "epoch": 3.7832929782082325, |
| "grad_norm": 0.7324628829956055, |
| "learning_rate": 3.910917763346156e-05, |
| "loss": 0.2468, |
| "num_input_tokens_seen": 1283928, |
| "step": 3125 |
| }, |
| { |
| "epoch": 3.7893462469733654, |
| "grad_norm": 0.4140474200248718, |
| "learning_rate": 3.906553808573604e-05, |
| "loss": 0.2402, |
| "num_input_tokens_seen": 1285944, |
| "step": 3130 |
| }, |
| { |
| "epoch": 3.7953995157384988, |
| "grad_norm": 0.4780600965023041, |
| "learning_rate": 3.9021835739285e-05, |
| "loss": 0.1766, |
| "num_input_tokens_seen": 1287928, |
| "step": 3135 |
| }, |
| { |
| "epoch": 3.801452784503632, |
| "grad_norm": 0.17176905274391174, |
| "learning_rate": 3.897807078922728e-05, |
| "loss": 0.1831, |
| "num_input_tokens_seen": 1290008, |
| "step": 3140 |
| }, |
| { |
| "epoch": 3.807506053268765, |
| "grad_norm": 0.280134916305542, |
| "learning_rate": 3.8934243430961265e-05, |
| "loss": 0.227, |
| "num_input_tokens_seen": 1292120, |
| "step": 3145 |
| }, |
| { |
| "epoch": 3.8135593220338984, |
| "grad_norm": 0.37576010823249817, |
| "learning_rate": 3.889035386016393e-05, |
| "loss": 0.1984, |
| "num_input_tokens_seen": 1294040, |
| "step": 3150 |
| }, |
| { |
| "epoch": 3.8196125907990313, |
| "grad_norm": 0.3381688892841339, |
| "learning_rate": 3.8846402272790044e-05, |
| "loss": 0.2482, |
| "num_input_tokens_seen": 1296024, |
| "step": 3155 |
| }, |
| { |
| "epoch": 3.8256658595641646, |
| "grad_norm": 0.27905845642089844, |
| "learning_rate": 3.8802388865071246e-05, |
| "loss": 0.2775, |
| "num_input_tokens_seen": 1298104, |
| "step": 3160 |
| }, |
| { |
| "epoch": 3.831719128329298, |
| "grad_norm": 0.5494442582130432, |
| "learning_rate": 3.875831383351519e-05, |
| "loss": 0.2043, |
| "num_input_tokens_seen": 1300248, |
| "step": 3165 |
| }, |
| { |
| "epoch": 3.837772397094431, |
| "grad_norm": 0.2875140607357025, |
| "learning_rate": 3.8714177374904683e-05, |
| "loss": 0.2415, |
| "num_input_tokens_seen": 1302104, |
| "step": 3170 |
| }, |
| { |
| "epoch": 3.843825665859564, |
| "grad_norm": 0.42942824959754944, |
| "learning_rate": 3.866997968629674e-05, |
| "loss": 0.231, |
| "num_input_tokens_seen": 1304056, |
| "step": 3175 |
| }, |
| { |
| "epoch": 3.849878934624697, |
| "grad_norm": 0.2650049924850464, |
| "learning_rate": 3.86257209650218e-05, |
| "loss": 0.2288, |
| "num_input_tokens_seen": 1306104, |
| "step": 3180 |
| }, |
| { |
| "epoch": 3.8559322033898304, |
| "grad_norm": 0.27059897780418396, |
| "learning_rate": 3.858140140868276e-05, |
| "loss": 0.2286, |
| "num_input_tokens_seen": 1308056, |
| "step": 3185 |
| }, |
| { |
| "epoch": 3.861985472154964, |
| "grad_norm": 0.45076775550842285, |
| "learning_rate": 3.853702121515416e-05, |
| "loss": 0.2228, |
| "num_input_tokens_seen": 1310104, |
| "step": 3190 |
| }, |
| { |
| "epoch": 3.8680387409200967, |
| "grad_norm": 0.3580031394958496, |
| "learning_rate": 3.849258058258124e-05, |
| "loss": 0.1614, |
| "num_input_tokens_seen": 1312152, |
| "step": 3195 |
| }, |
| { |
| "epoch": 3.87409200968523, |
| "grad_norm": 0.546181321144104, |
| "learning_rate": 3.84480797093791e-05, |
| "loss": 0.2287, |
| "num_input_tokens_seen": 1314328, |
| "step": 3200 |
| }, |
| { |
| "epoch": 3.880145278450363, |
| "grad_norm": 0.2203136831521988, |
| "learning_rate": 3.8403518794231795e-05, |
| "loss": 0.2066, |
| "num_input_tokens_seen": 1316344, |
| "step": 3205 |
| }, |
| { |
| "epoch": 3.8861985472154963, |
| "grad_norm": 0.34884241223335266, |
| "learning_rate": 3.835889803609145e-05, |
| "loss": 0.1961, |
| "num_input_tokens_seen": 1318360, |
| "step": 3210 |
| }, |
| { |
| "epoch": 3.8922518159806296, |
| "grad_norm": 0.2783122956752777, |
| "learning_rate": 3.8314217634177376e-05, |
| "loss": 0.1879, |
| "num_input_tokens_seen": 1320376, |
| "step": 3215 |
| }, |
| { |
| "epoch": 3.898305084745763, |
| "grad_norm": 0.35985687375068665, |
| "learning_rate": 3.826947778797516e-05, |
| "loss": 0.1994, |
| "num_input_tokens_seen": 1322616, |
| "step": 3220 |
| }, |
| { |
| "epoch": 3.904358353510896, |
| "grad_norm": 0.5949720740318298, |
| "learning_rate": 3.822467869723581e-05, |
| "loss": 0.2264, |
| "num_input_tokens_seen": 1324664, |
| "step": 3225 |
| }, |
| { |
| "epoch": 3.910411622276029, |
| "grad_norm": 0.48355862498283386, |
| "learning_rate": 3.8179820561974835e-05, |
| "loss": 0.199, |
| "num_input_tokens_seen": 1326616, |
| "step": 3230 |
| }, |
| { |
| "epoch": 3.916464891041162, |
| "grad_norm": 0.31632059812545776, |
| "learning_rate": 3.813490358247137e-05, |
| "loss": 0.215, |
| "num_input_tokens_seen": 1328760, |
| "step": 3235 |
| }, |
| { |
| "epoch": 3.9225181598062955, |
| "grad_norm": 0.2386462688446045, |
| "learning_rate": 3.8089927959267255e-05, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 1330968, |
| "step": 3240 |
| }, |
| { |
| "epoch": 3.928571428571429, |
| "grad_norm": 0.7619367241859436, |
| "learning_rate": 3.8044893893166203e-05, |
| "loss": 0.2507, |
| "num_input_tokens_seen": 1332952, |
| "step": 3245 |
| }, |
| { |
| "epoch": 3.9346246973365617, |
| "grad_norm": 0.19365699589252472, |
| "learning_rate": 3.799980158523279e-05, |
| "loss": 0.2405, |
| "num_input_tokens_seen": 1335064, |
| "step": 3250 |
| }, |
| { |
| "epoch": 3.940677966101695, |
| "grad_norm": 0.2004794329404831, |
| "learning_rate": 3.795465123679167e-05, |
| "loss": 0.2155, |
| "num_input_tokens_seen": 1337080, |
| "step": 3255 |
| }, |
| { |
| "epoch": 3.946731234866828, |
| "grad_norm": 0.34971562027931213, |
| "learning_rate": 3.790944304942664e-05, |
| "loss": 0.2313, |
| "num_input_tokens_seen": 1339096, |
| "step": 3260 |
| }, |
| { |
| "epoch": 3.9527845036319613, |
| "grad_norm": 0.5549309253692627, |
| "learning_rate": 3.7864177224979696e-05, |
| "loss": 0.2106, |
| "num_input_tokens_seen": 1341048, |
| "step": 3265 |
| }, |
| { |
| "epoch": 3.9588377723970947, |
| "grad_norm": 0.27585166692733765, |
| "learning_rate": 3.781885396555019e-05, |
| "loss": 0.1912, |
| "num_input_tokens_seen": 1343224, |
| "step": 3270 |
| }, |
| { |
| "epoch": 3.9648910411622276, |
| "grad_norm": 0.6372965574264526, |
| "learning_rate": 3.777347347349392e-05, |
| "loss": 0.2522, |
| "num_input_tokens_seen": 1345272, |
| "step": 3275 |
| }, |
| { |
| "epoch": 3.970944309927361, |
| "grad_norm": 0.5122102499008179, |
| "learning_rate": 3.7728035951422166e-05, |
| "loss": 0.1883, |
| "num_input_tokens_seen": 1347416, |
| "step": 3280 |
| }, |
| { |
| "epoch": 3.976997578692494, |
| "grad_norm": 0.19257624447345734, |
| "learning_rate": 3.7682541602200875e-05, |
| "loss": 0.2209, |
| "num_input_tokens_seen": 1349464, |
| "step": 3285 |
| }, |
| { |
| "epoch": 3.983050847457627, |
| "grad_norm": 0.24967695772647858, |
| "learning_rate": 3.76369906289497e-05, |
| "loss": 0.2161, |
| "num_input_tokens_seen": 1351352, |
| "step": 3290 |
| }, |
| { |
| "epoch": 3.9891041162227605, |
| "grad_norm": 0.8242369294166565, |
| "learning_rate": 3.7591383235041086e-05, |
| "loss": 0.2098, |
| "num_input_tokens_seen": 1353368, |
| "step": 3295 |
| }, |
| { |
| "epoch": 3.9951573849878934, |
| "grad_norm": 0.39511680603027344, |
| "learning_rate": 3.75457196240994e-05, |
| "loss": 0.2324, |
| "num_input_tokens_seen": 1355416, |
| "step": 3300 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.21625132858753204, |
| "eval_runtime": 7.6483, |
| "eval_samples_per_second": 47.984, |
| "eval_steps_per_second": 12.029, |
| "num_input_tokens_seen": 1356744, |
| "step": 3304 |
| }, |
| { |
| "epoch": 4.001210653753026, |
| "grad_norm": 0.2912016212940216, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.1943, |
| "num_input_tokens_seen": 1357192, |
| "step": 3305 |
| }, |
| { |
| "epoch": 4.00726392251816, |
| "grad_norm": 0.25249361991882324, |
| "learning_rate": 3.7454224566868327e-05, |
| "loss": 0.1848, |
| "num_input_tokens_seen": 1359272, |
| "step": 3310 |
| }, |
| { |
| "epoch": 4.013317191283293, |
| "grad_norm": 0.26317933201789856, |
| "learning_rate": 3.7408393529078985e-05, |
| "loss": 0.1638, |
| "num_input_tokens_seen": 1361480, |
| "step": 3315 |
| }, |
| { |
| "epoch": 4.019370460048426, |
| "grad_norm": 0.644443690776825, |
| "learning_rate": 3.7362507091254836e-05, |
| "loss": 0.2148, |
| "num_input_tokens_seen": 1363560, |
| "step": 3320 |
| }, |
| { |
| "epoch": 4.02542372881356, |
| "grad_norm": 0.32187148928642273, |
| "learning_rate": 3.7316565458266114e-05, |
| "loss": 0.246, |
| "num_input_tokens_seen": 1365640, |
| "step": 3325 |
| }, |
| { |
| "epoch": 4.031476997578692, |
| "grad_norm": 0.6679955720901489, |
| "learning_rate": 3.727056883522945e-05, |
| "loss": 0.1812, |
| "num_input_tokens_seen": 1367720, |
| "step": 3330 |
| }, |
| { |
| "epoch": 4.0375302663438255, |
| "grad_norm": 0.476051926612854, |
| "learning_rate": 3.722451742750701e-05, |
| "loss": 0.2184, |
| "num_input_tokens_seen": 1369704, |
| "step": 3335 |
| }, |
| { |
| "epoch": 4.043583535108959, |
| "grad_norm": 0.2979244887828827, |
| "learning_rate": 3.717841144070556e-05, |
| "loss": 0.1884, |
| "num_input_tokens_seen": 1371816, |
| "step": 3340 |
| }, |
| { |
| "epoch": 4.049636803874092, |
| "grad_norm": 0.3987829089164734, |
| "learning_rate": 3.713225108067553e-05, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 1373800, |
| "step": 3345 |
| }, |
| { |
| "epoch": 4.0556900726392255, |
| "grad_norm": 0.3506500720977783, |
| "learning_rate": 3.708603655351012e-05, |
| "loss": 0.2037, |
| "num_input_tokens_seen": 1375752, |
| "step": 3350 |
| }, |
| { |
| "epoch": 4.061743341404358, |
| "grad_norm": 0.2431643158197403, |
| "learning_rate": 3.7039768065544395e-05, |
| "loss": 0.2281, |
| "num_input_tokens_seen": 1377896, |
| "step": 3355 |
| }, |
| { |
| "epoch": 4.067796610169491, |
| "grad_norm": 0.4559185206890106, |
| "learning_rate": 3.69934458233543e-05, |
| "loss": 0.2157, |
| "num_input_tokens_seen": 1379912, |
| "step": 3360 |
| }, |
| { |
| "epoch": 4.073849878934625, |
| "grad_norm": 0.4202342629432678, |
| "learning_rate": 3.694707003375579e-05, |
| "loss": 0.2652, |
| "num_input_tokens_seen": 1381960, |
| "step": 3365 |
| }, |
| { |
| "epoch": 4.079903147699758, |
| "grad_norm": 0.25236576795578003, |
| "learning_rate": 3.690064090380392e-05, |
| "loss": 0.1905, |
| "num_input_tokens_seen": 1384072, |
| "step": 3370 |
| }, |
| { |
| "epoch": 4.085956416464891, |
| "grad_norm": 0.37686240673065186, |
| "learning_rate": 3.685415864079185e-05, |
| "loss": 0.2558, |
| "num_input_tokens_seen": 1386152, |
| "step": 3375 |
| }, |
| { |
| "epoch": 4.092009685230024, |
| "grad_norm": 0.17573748528957367, |
| "learning_rate": 3.680762345225001e-05, |
| "loss": 0.2213, |
| "num_input_tokens_seen": 1388168, |
| "step": 3380 |
| }, |
| { |
| "epoch": 4.098062953995157, |
| "grad_norm": 0.2726494371891022, |
| "learning_rate": 3.676103554594511e-05, |
| "loss": 0.2293, |
| "num_input_tokens_seen": 1390152, |
| "step": 3385 |
| }, |
| { |
| "epoch": 4.1041162227602905, |
| "grad_norm": 0.5579743981361389, |
| "learning_rate": 3.671439512987921e-05, |
| "loss": 0.1803, |
| "num_input_tokens_seen": 1392168, |
| "step": 3390 |
| }, |
| { |
| "epoch": 4.110169491525424, |
| "grad_norm": 0.5857843160629272, |
| "learning_rate": 3.666770241228883e-05, |
| "loss": 0.2598, |
| "num_input_tokens_seen": 1394376, |
| "step": 3395 |
| }, |
| { |
| "epoch": 4.116222760290557, |
| "grad_norm": 0.23575536906719208, |
| "learning_rate": 3.6620957601644016e-05, |
| "loss": 0.19, |
| "num_input_tokens_seen": 1396520, |
| "step": 3400 |
| }, |
| { |
| "epoch": 4.12227602905569, |
| "grad_norm": 0.22089159488677979, |
| "learning_rate": 3.657416090664737e-05, |
| "loss": 0.213, |
| "num_input_tokens_seen": 1398600, |
| "step": 3405 |
| }, |
| { |
| "epoch": 4.128329297820823, |
| "grad_norm": 0.4912413954734802, |
| "learning_rate": 3.652731253623315e-05, |
| "loss": 0.2162, |
| "num_input_tokens_seen": 1400584, |
| "step": 3410 |
| }, |
| { |
| "epoch": 4.134382566585956, |
| "grad_norm": 0.37442249059677124, |
| "learning_rate": 3.648041269956634e-05, |
| "loss": 0.2423, |
| "num_input_tokens_seen": 1402760, |
| "step": 3415 |
| }, |
| { |
| "epoch": 4.14043583535109, |
| "grad_norm": 0.6094481945037842, |
| "learning_rate": 3.6433461606041695e-05, |
| "loss": 0.2129, |
| "num_input_tokens_seen": 1404936, |
| "step": 3420 |
| }, |
| { |
| "epoch": 4.146489104116223, |
| "grad_norm": 0.4806797206401825, |
| "learning_rate": 3.6386459465282824e-05, |
| "loss": 0.1782, |
| "num_input_tokens_seen": 1406920, |
| "step": 3425 |
| }, |
| { |
| "epoch": 4.1525423728813555, |
| "grad_norm": 0.31108349561691284, |
| "learning_rate": 3.6339406487141255e-05, |
| "loss": 0.2099, |
| "num_input_tokens_seen": 1408840, |
| "step": 3430 |
| }, |
| { |
| "epoch": 4.158595641646489, |
| "grad_norm": 0.4414166510105133, |
| "learning_rate": 3.6292302881695464e-05, |
| "loss": 0.2565, |
| "num_input_tokens_seen": 1410696, |
| "step": 3435 |
| }, |
| { |
| "epoch": 4.164648910411622, |
| "grad_norm": 0.24182790517807007, |
| "learning_rate": 3.6245148859249996e-05, |
| "loss": 0.1817, |
| "num_input_tokens_seen": 1412680, |
| "step": 3440 |
| }, |
| { |
| "epoch": 4.170702179176756, |
| "grad_norm": 0.4077731668949127, |
| "learning_rate": 3.619794463033447e-05, |
| "loss": 0.2213, |
| "num_input_tokens_seen": 1414728, |
| "step": 3445 |
| }, |
| { |
| "epoch": 4.176755447941889, |
| "grad_norm": 0.40043166279792786, |
| "learning_rate": 3.6150690405702685e-05, |
| "loss": 0.1822, |
| "num_input_tokens_seen": 1416712, |
| "step": 3450 |
| }, |
| { |
| "epoch": 4.182808716707021, |
| "grad_norm": 0.5376760959625244, |
| "learning_rate": 3.6103386396331635e-05, |
| "loss": 0.2822, |
| "num_input_tokens_seen": 1418952, |
| "step": 3455 |
| }, |
| { |
| "epoch": 4.188861985472155, |
| "grad_norm": 0.3132179081439972, |
| "learning_rate": 3.605603281342061e-05, |
| "loss": 0.1866, |
| "num_input_tokens_seen": 1421032, |
| "step": 3460 |
| }, |
| { |
| "epoch": 4.194915254237288, |
| "grad_norm": 0.21826693415641785, |
| "learning_rate": 3.6008629868390204e-05, |
| "loss": 0.1688, |
| "num_input_tokens_seen": 1423048, |
| "step": 3465 |
| }, |
| { |
| "epoch": 4.200968523002421, |
| "grad_norm": 0.4169551432132721, |
| "learning_rate": 3.5961177772881434e-05, |
| "loss": 0.2681, |
| "num_input_tokens_seen": 1425192, |
| "step": 3470 |
| }, |
| { |
| "epoch": 4.207021791767555, |
| "grad_norm": 0.23512586951255798, |
| "learning_rate": 3.591367673875472e-05, |
| "loss": 0.2293, |
| "num_input_tokens_seen": 1427304, |
| "step": 3475 |
| }, |
| { |
| "epoch": 4.213075060532688, |
| "grad_norm": 0.4705086350440979, |
| "learning_rate": 3.5866126978089025e-05, |
| "loss": 0.2288, |
| "num_input_tokens_seen": 1429288, |
| "step": 3480 |
| }, |
| { |
| "epoch": 4.219128329297821, |
| "grad_norm": 0.19527237117290497, |
| "learning_rate": 3.5818528703180826e-05, |
| "loss": 0.1969, |
| "num_input_tokens_seen": 1431400, |
| "step": 3485 |
| }, |
| { |
| "epoch": 4.225181598062954, |
| "grad_norm": 0.2775489091873169, |
| "learning_rate": 3.577088212654322e-05, |
| "loss": 0.2218, |
| "num_input_tokens_seen": 1433576, |
| "step": 3490 |
| }, |
| { |
| "epoch": 4.231234866828087, |
| "grad_norm": 0.34490349888801575, |
| "learning_rate": 3.572318746090496e-05, |
| "loss": 0.2514, |
| "num_input_tokens_seen": 1435560, |
| "step": 3495 |
| }, |
| { |
| "epoch": 4.237288135593221, |
| "grad_norm": 0.34515276551246643, |
| "learning_rate": 3.5675444919209486e-05, |
| "loss": 0.232, |
| "num_input_tokens_seen": 1437672, |
| "step": 3500 |
| }, |
| { |
| "epoch": 4.243341404358354, |
| "grad_norm": 0.3978605270385742, |
| "learning_rate": 3.5627654714614e-05, |
| "loss": 0.232, |
| "num_input_tokens_seen": 1439656, |
| "step": 3505 |
| }, |
| { |
| "epoch": 4.249394673123486, |
| "grad_norm": 0.3605253994464874, |
| "learning_rate": 3.557981706048852e-05, |
| "loss": 0.1853, |
| "num_input_tokens_seen": 1441608, |
| "step": 3510 |
| }, |
| { |
| "epoch": 4.25544794188862, |
| "grad_norm": 0.6109866499900818, |
| "learning_rate": 3.5531932170414896e-05, |
| "loss": 0.1796, |
| "num_input_tokens_seen": 1443624, |
| "step": 3515 |
| }, |
| { |
| "epoch": 4.261501210653753, |
| "grad_norm": 0.18549123406410217, |
| "learning_rate": 3.5484000258185876e-05, |
| "loss": 0.2322, |
| "num_input_tokens_seen": 1445736, |
| "step": 3520 |
| }, |
| { |
| "epoch": 4.267554479418886, |
| "grad_norm": 0.30822399258613586, |
| "learning_rate": 3.5436021537804144e-05, |
| "loss": 0.2381, |
| "num_input_tokens_seen": 1447880, |
| "step": 3525 |
| }, |
| { |
| "epoch": 4.27360774818402, |
| "grad_norm": 0.6384687423706055, |
| "learning_rate": 3.538799622348139e-05, |
| "loss": 0.2094, |
| "num_input_tokens_seen": 1449896, |
| "step": 3530 |
| }, |
| { |
| "epoch": 4.279661016949152, |
| "grad_norm": 0.32620224356651306, |
| "learning_rate": 3.5339924529637304e-05, |
| "loss": 0.2166, |
| "num_input_tokens_seen": 1451880, |
| "step": 3535 |
| }, |
| { |
| "epoch": 4.285714285714286, |
| "grad_norm": 0.7632647156715393, |
| "learning_rate": 3.529180667089868e-05, |
| "loss": 0.2139, |
| "num_input_tokens_seen": 1453992, |
| "step": 3540 |
| }, |
| { |
| "epoch": 4.291767554479419, |
| "grad_norm": 0.42984357476234436, |
| "learning_rate": 3.52436428620984e-05, |
| "loss": 0.2418, |
| "num_input_tokens_seen": 1455912, |
| "step": 3545 |
| }, |
| { |
| "epoch": 4.297820823244552, |
| "grad_norm": 0.2978380024433136, |
| "learning_rate": 3.5195433318274516e-05, |
| "loss": 0.1875, |
| "num_input_tokens_seen": 1458024, |
| "step": 3550 |
| }, |
| { |
| "epoch": 4.303874092009686, |
| "grad_norm": 0.313346266746521, |
| "learning_rate": 3.514717825466925e-05, |
| "loss": 0.2418, |
| "num_input_tokens_seen": 1459976, |
| "step": 3555 |
| }, |
| { |
| "epoch": 4.309927360774818, |
| "grad_norm": 0.3105279803276062, |
| "learning_rate": 3.509887788672809e-05, |
| "loss": 0.2009, |
| "num_input_tokens_seen": 1462120, |
| "step": 3560 |
| }, |
| { |
| "epoch": 4.315980629539951, |
| "grad_norm": 0.30705446004867554, |
| "learning_rate": 3.5050532430098774e-05, |
| "loss": 0.1903, |
| "num_input_tokens_seen": 1464104, |
| "step": 3565 |
| }, |
| { |
| "epoch": 4.322033898305085, |
| "grad_norm": 0.3329887390136719, |
| "learning_rate": 3.500214210063035e-05, |
| "loss": 0.1929, |
| "num_input_tokens_seen": 1466216, |
| "step": 3570 |
| }, |
| { |
| "epoch": 4.328087167070218, |
| "grad_norm": 0.5141538381576538, |
| "learning_rate": 3.495370711437221e-05, |
| "loss": 0.1594, |
| "num_input_tokens_seen": 1468264, |
| "step": 3575 |
| }, |
| { |
| "epoch": 4.3341404358353515, |
| "grad_norm": 0.24372707307338715, |
| "learning_rate": 3.490522768757316e-05, |
| "loss": 0.2276, |
| "num_input_tokens_seen": 1470408, |
| "step": 3580 |
| }, |
| { |
| "epoch": 4.340193704600484, |
| "grad_norm": 0.27404314279556274, |
| "learning_rate": 3.485670403668036e-05, |
| "loss": 0.1823, |
| "num_input_tokens_seen": 1472392, |
| "step": 3585 |
| }, |
| { |
| "epoch": 4.346246973365617, |
| "grad_norm": 0.3949719965457916, |
| "learning_rate": 3.480813637833846e-05, |
| "loss": 0.2276, |
| "num_input_tokens_seen": 1474504, |
| "step": 3590 |
| }, |
| { |
| "epoch": 4.352300242130751, |
| "grad_norm": 0.5173758864402771, |
| "learning_rate": 3.475952492938859e-05, |
| "loss": 0.2222, |
| "num_input_tokens_seen": 1476616, |
| "step": 3595 |
| }, |
| { |
| "epoch": 4.358353510895884, |
| "grad_norm": 0.28251126408576965, |
| "learning_rate": 3.471086990686737e-05, |
| "loss": 0.2226, |
| "num_input_tokens_seen": 1478664, |
| "step": 3600 |
| }, |
| { |
| "epoch": 4.364406779661017, |
| "grad_norm": 0.19757187366485596, |
| "learning_rate": 3.466217152800598e-05, |
| "loss": 0.2274, |
| "num_input_tokens_seen": 1480648, |
| "step": 3605 |
| }, |
| { |
| "epoch": 4.37046004842615, |
| "grad_norm": 0.7404996156692505, |
| "learning_rate": 3.461343001022919e-05, |
| "loss": 0.1638, |
| "num_input_tokens_seen": 1482760, |
| "step": 3610 |
| }, |
| { |
| "epoch": 4.376513317191283, |
| "grad_norm": 0.4144309163093567, |
| "learning_rate": 3.456464557115433e-05, |
| "loss": 0.2315, |
| "num_input_tokens_seen": 1484744, |
| "step": 3615 |
| }, |
| { |
| "epoch": 4.3825665859564165, |
| "grad_norm": 0.3570268154144287, |
| "learning_rate": 3.45158184285904e-05, |
| "loss": 0.1846, |
| "num_input_tokens_seen": 1486728, |
| "step": 3620 |
| }, |
| { |
| "epoch": 4.38861985472155, |
| "grad_norm": 0.5335553884506226, |
| "learning_rate": 3.446694880053704e-05, |
| "loss": 0.2198, |
| "num_input_tokens_seen": 1488808, |
| "step": 3625 |
| }, |
| { |
| "epoch": 4.394673123486683, |
| "grad_norm": 0.8102007508277893, |
| "learning_rate": 3.441803690518359e-05, |
| "loss": 0.1719, |
| "num_input_tokens_seen": 1490984, |
| "step": 3630 |
| }, |
| { |
| "epoch": 4.400726392251816, |
| "grad_norm": 0.47367629408836365, |
| "learning_rate": 3.4369082960908084e-05, |
| "loss": 0.2525, |
| "num_input_tokens_seen": 1493000, |
| "step": 3635 |
| }, |
| { |
| "epoch": 4.406779661016949, |
| "grad_norm": 0.4657382071018219, |
| "learning_rate": 3.432008718627631e-05, |
| "loss": 0.2557, |
| "num_input_tokens_seen": 1494920, |
| "step": 3640 |
| }, |
| { |
| "epoch": 4.412832929782082, |
| "grad_norm": 0.3918997645378113, |
| "learning_rate": 3.4271049800040805e-05, |
| "loss": 0.2012, |
| "num_input_tokens_seen": 1496904, |
| "step": 3645 |
| }, |
| { |
| "epoch": 4.418886198547216, |
| "grad_norm": 0.27507132291793823, |
| "learning_rate": 3.42219710211399e-05, |
| "loss": 0.2064, |
| "num_input_tokens_seen": 1498792, |
| "step": 3650 |
| }, |
| { |
| "epoch": 4.424939467312349, |
| "grad_norm": 0.17722375690937042, |
| "learning_rate": 3.417285106869673e-05, |
| "loss": 0.2538, |
| "num_input_tokens_seen": 1500840, |
| "step": 3655 |
| }, |
| { |
| "epoch": 4.4309927360774815, |
| "grad_norm": 0.346029132604599, |
| "learning_rate": 3.4123690162018246e-05, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 1502888, |
| "step": 3660 |
| }, |
| { |
| "epoch": 4.437046004842615, |
| "grad_norm": 0.580996036529541, |
| "learning_rate": 3.407448852059426e-05, |
| "loss": 0.176, |
| "num_input_tokens_seen": 1504904, |
| "step": 3665 |
| }, |
| { |
| "epoch": 4.443099273607748, |
| "grad_norm": 0.47829699516296387, |
| "learning_rate": 3.4025246364096455e-05, |
| "loss": 0.2409, |
| "num_input_tokens_seen": 1506824, |
| "step": 3670 |
| }, |
| { |
| "epoch": 4.4491525423728815, |
| "grad_norm": 0.4041799008846283, |
| "learning_rate": 3.397596391237739e-05, |
| "loss": 0.217, |
| "num_input_tokens_seen": 1508872, |
| "step": 3675 |
| }, |
| { |
| "epoch": 4.455205811138015, |
| "grad_norm": 0.370988667011261, |
| "learning_rate": 3.3926641385469556e-05, |
| "loss": 0.227, |
| "num_input_tokens_seen": 1510824, |
| "step": 3680 |
| }, |
| { |
| "epoch": 4.461259079903147, |
| "grad_norm": 0.22420404851436615, |
| "learning_rate": 3.387727900358435e-05, |
| "loss": 0.1959, |
| "num_input_tokens_seen": 1512968, |
| "step": 3685 |
| }, |
| { |
| "epoch": 4.467312348668281, |
| "grad_norm": 0.3692123591899872, |
| "learning_rate": 3.38278769871111e-05, |
| "loss": 0.2212, |
| "num_input_tokens_seen": 1515144, |
| "step": 3690 |
| }, |
| { |
| "epoch": 4.473365617433414, |
| "grad_norm": 0.25859665870666504, |
| "learning_rate": 3.377843555661612e-05, |
| "loss": 0.1944, |
| "num_input_tokens_seen": 1517192, |
| "step": 3695 |
| }, |
| { |
| "epoch": 4.479418886198547, |
| "grad_norm": 0.3631698787212372, |
| "learning_rate": 3.372895493284167e-05, |
| "loss": 0.2207, |
| "num_input_tokens_seen": 1519400, |
| "step": 3700 |
| }, |
| { |
| "epoch": 4.485472154963681, |
| "grad_norm": 0.6008762717247009, |
| "learning_rate": 3.367943533670501e-05, |
| "loss": 0.1803, |
| "num_input_tokens_seen": 1521416, |
| "step": 3705 |
| }, |
| { |
| "epoch": 4.491525423728813, |
| "grad_norm": 0.3014872968196869, |
| "learning_rate": 3.3629876989297405e-05, |
| "loss": 0.2239, |
| "num_input_tokens_seen": 1523240, |
| "step": 3710 |
| }, |
| { |
| "epoch": 4.4975786924939465, |
| "grad_norm": 0.2603475749492645, |
| "learning_rate": 3.3580280111883125e-05, |
| "loss": 0.1545, |
| "num_input_tokens_seen": 1525288, |
| "step": 3715 |
| }, |
| { |
| "epoch": 4.5, |
| "eval_loss": 0.20897646248340607, |
| "eval_runtime": 7.6764, |
| "eval_samples_per_second": 47.809, |
| "eval_steps_per_second": 11.985, |
| "num_input_tokens_seen": 1526088, |
| "step": 3717 |
| }, |
| { |
| "epoch": 4.50363196125908, |
| "grad_norm": 0.3308848440647125, |
| "learning_rate": 3.3530644925898465e-05, |
| "loss": 0.1761, |
| "num_input_tokens_seen": 1527304, |
| "step": 3720 |
| }, |
| { |
| "epoch": 4.509685230024213, |
| "grad_norm": 0.3628905117511749, |
| "learning_rate": 3.348097165295076e-05, |
| "loss": 0.2372, |
| "num_input_tokens_seen": 1529384, |
| "step": 3725 |
| }, |
| { |
| "epoch": 4.5157384987893465, |
| "grad_norm": 0.1671944260597229, |
| "learning_rate": 3.34312605148174e-05, |
| "loss": 0.2064, |
| "num_input_tokens_seen": 1531464, |
| "step": 3730 |
| }, |
| { |
| "epoch": 4.521791767554479, |
| "grad_norm": 0.3956878185272217, |
| "learning_rate": 3.338151173344483e-05, |
| "loss": 0.1946, |
| "num_input_tokens_seen": 1533608, |
| "step": 3735 |
| }, |
| { |
| "epoch": 4.527845036319612, |
| "grad_norm": 0.19678837060928345, |
| "learning_rate": 3.333172553094754e-05, |
| "loss": 0.221, |
| "num_input_tokens_seen": 1535656, |
| "step": 3740 |
| }, |
| { |
| "epoch": 4.533898305084746, |
| "grad_norm": 0.1590597778558731, |
| "learning_rate": 3.328190212960712e-05, |
| "loss": 0.213, |
| "num_input_tokens_seen": 1537640, |
| "step": 3745 |
| }, |
| { |
| "epoch": 4.539951573849879, |
| "grad_norm": 0.5338016748428345, |
| "learning_rate": 3.323204175187125e-05, |
| "loss": 0.2367, |
| "num_input_tokens_seen": 1539592, |
| "step": 3750 |
| }, |
| { |
| "epoch": 4.546004842615012, |
| "grad_norm": 0.17397870123386383, |
| "learning_rate": 3.318214462035266e-05, |
| "loss": 0.1874, |
| "num_input_tokens_seen": 1541576, |
| "step": 3755 |
| }, |
| { |
| "epoch": 4.552058111380145, |
| "grad_norm": 0.24137651920318604, |
| "learning_rate": 3.3132210957828226e-05, |
| "loss": 0.2471, |
| "num_input_tokens_seen": 1543464, |
| "step": 3760 |
| }, |
| { |
| "epoch": 4.558111380145278, |
| "grad_norm": 0.3203127384185791, |
| "learning_rate": 3.3082240987237875e-05, |
| "loss": 0.2038, |
| "num_input_tokens_seen": 1545416, |
| "step": 3765 |
| }, |
| { |
| "epoch": 4.5641646489104115, |
| "grad_norm": 0.5310072302818298, |
| "learning_rate": 3.3032234931683684e-05, |
| "loss": 0.2339, |
| "num_input_tokens_seen": 1547432, |
| "step": 3770 |
| }, |
| { |
| "epoch": 4.570217917675545, |
| "grad_norm": 0.23544982075691223, |
| "learning_rate": 3.2982193014428805e-05, |
| "loss": 0.1628, |
| "num_input_tokens_seen": 1549576, |
| "step": 3775 |
| }, |
| { |
| "epoch": 4.576271186440678, |
| "grad_norm": 0.2020537257194519, |
| "learning_rate": 3.2932115458896515e-05, |
| "loss": 0.185, |
| "num_input_tokens_seen": 1551688, |
| "step": 3780 |
| }, |
| { |
| "epoch": 4.582324455205811, |
| "grad_norm": 0.3683018386363983, |
| "learning_rate": 3.2882002488669204e-05, |
| "loss": 0.204, |
| "num_input_tokens_seen": 1553672, |
| "step": 3785 |
| }, |
| { |
| "epoch": 4.588377723970944, |
| "grad_norm": 0.7094999551773071, |
| "learning_rate": 3.28318543274874e-05, |
| "loss": 0.2383, |
| "num_input_tokens_seen": 1555720, |
| "step": 3790 |
| }, |
| { |
| "epoch": 4.594430992736077, |
| "grad_norm": 0.21470731496810913, |
| "learning_rate": 3.278167119924872e-05, |
| "loss": 0.1616, |
| "num_input_tokens_seen": 1557672, |
| "step": 3795 |
| }, |
| { |
| "epoch": 4.600484261501211, |
| "grad_norm": 0.49693506956100464, |
| "learning_rate": 3.27314533280069e-05, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 1559880, |
| "step": 3800 |
| }, |
| { |
| "epoch": 4.606537530266344, |
| "grad_norm": 0.23775242269039154, |
| "learning_rate": 3.268120093797082e-05, |
| "loss": 0.2189, |
| "num_input_tokens_seen": 1561960, |
| "step": 3805 |
| }, |
| { |
| "epoch": 4.6125907990314765, |
| "grad_norm": 0.38969048857688904, |
| "learning_rate": 3.263091425350345e-05, |
| "loss": 0.2471, |
| "num_input_tokens_seen": 1563880, |
| "step": 3810 |
| }, |
| { |
| "epoch": 4.61864406779661, |
| "grad_norm": 0.2580578029155731, |
| "learning_rate": 3.258059349912089e-05, |
| "loss": 0.185, |
| "num_input_tokens_seen": 1565896, |
| "step": 3815 |
| }, |
| { |
| "epoch": 4.624697336561743, |
| "grad_norm": 0.9072468280792236, |
| "learning_rate": 3.253023889949135e-05, |
| "loss": 0.2373, |
| "num_input_tokens_seen": 1568040, |
| "step": 3820 |
| }, |
| { |
| "epoch": 4.6307506053268765, |
| "grad_norm": 0.2106904834508896, |
| "learning_rate": 3.247985067943414e-05, |
| "loss": 0.2335, |
| "num_input_tokens_seen": 1570056, |
| "step": 3825 |
| }, |
| { |
| "epoch": 4.63680387409201, |
| "grad_norm": 0.1807175725698471, |
| "learning_rate": 3.2429429063918696e-05, |
| "loss": 0.181, |
| "num_input_tokens_seen": 1572168, |
| "step": 3830 |
| }, |
| { |
| "epoch": 4.642857142857143, |
| "grad_norm": 0.21980927884578705, |
| "learning_rate": 3.2378974278063534e-05, |
| "loss": 0.2634, |
| "num_input_tokens_seen": 1574216, |
| "step": 3835 |
| }, |
| { |
| "epoch": 4.648910411622276, |
| "grad_norm": 0.40217307209968567, |
| "learning_rate": 3.232848654713528e-05, |
| "loss": 0.1687, |
| "num_input_tokens_seen": 1576168, |
| "step": 3840 |
| }, |
| { |
| "epoch": 4.654963680387409, |
| "grad_norm": 0.6263716220855713, |
| "learning_rate": 3.227796609654765e-05, |
| "loss": 0.2034, |
| "num_input_tokens_seen": 1578152, |
| "step": 3845 |
| }, |
| { |
| "epoch": 4.661016949152542, |
| "grad_norm": 0.24075907468795776, |
| "learning_rate": 3.222741315186043e-05, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 1580104, |
| "step": 3850 |
| }, |
| { |
| "epoch": 4.667070217917676, |
| "grad_norm": 0.5776565074920654, |
| "learning_rate": 3.217682793877851e-05, |
| "loss": 0.2559, |
| "num_input_tokens_seen": 1582056, |
| "step": 3855 |
| }, |
| { |
| "epoch": 4.673123486682809, |
| "grad_norm": 0.5002198815345764, |
| "learning_rate": 3.212621068315081e-05, |
| "loss": 0.1741, |
| "num_input_tokens_seen": 1584136, |
| "step": 3860 |
| }, |
| { |
| "epoch": 4.6791767554479415, |
| "grad_norm": 0.3583095669746399, |
| "learning_rate": 3.207556161096935e-05, |
| "loss": 0.1901, |
| "num_input_tokens_seen": 1586184, |
| "step": 3865 |
| }, |
| { |
| "epoch": 4.685230024213075, |
| "grad_norm": 0.6010826230049133, |
| "learning_rate": 3.202488094836819e-05, |
| "loss": 0.1933, |
| "num_input_tokens_seen": 1588296, |
| "step": 3870 |
| }, |
| { |
| "epoch": 4.691283292978208, |
| "grad_norm": 0.14531512558460236, |
| "learning_rate": 3.197416892162242e-05, |
| "loss": 0.1686, |
| "num_input_tokens_seen": 1590504, |
| "step": 3875 |
| }, |
| { |
| "epoch": 4.697336561743342, |
| "grad_norm": 0.2853067219257355, |
| "learning_rate": 3.1923425757147175e-05, |
| "loss": 0.1855, |
| "num_input_tokens_seen": 1592584, |
| "step": 3880 |
| }, |
| { |
| "epoch": 4.703389830508475, |
| "grad_norm": 0.3593171238899231, |
| "learning_rate": 3.1872651681496604e-05, |
| "loss": 0.3084, |
| "num_input_tokens_seen": 1594728, |
| "step": 3885 |
| }, |
| { |
| "epoch": 4.709443099273607, |
| "grad_norm": 0.2465737909078598, |
| "learning_rate": 3.182184692136287e-05, |
| "loss": 0.214, |
| "num_input_tokens_seen": 1596776, |
| "step": 3890 |
| }, |
| { |
| "epoch": 4.715496368038741, |
| "grad_norm": 0.34766706824302673, |
| "learning_rate": 3.177101170357513e-05, |
| "loss": 0.2025, |
| "num_input_tokens_seen": 1598984, |
| "step": 3895 |
| }, |
| { |
| "epoch": 4.721549636803874, |
| "grad_norm": 0.21941545605659485, |
| "learning_rate": 3.1720146255098535e-05, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 1601096, |
| "step": 3900 |
| }, |
| { |
| "epoch": 4.727602905569007, |
| "grad_norm": 0.22825156152248383, |
| "learning_rate": 3.16692508030332e-05, |
| "loss": 0.1929, |
| "num_input_tokens_seen": 1603336, |
| "step": 3905 |
| }, |
| { |
| "epoch": 4.733656174334141, |
| "grad_norm": 0.19262927770614624, |
| "learning_rate": 3.16183255746132e-05, |
| "loss": 0.1449, |
| "num_input_tokens_seen": 1605320, |
| "step": 3910 |
| }, |
| { |
| "epoch": 4.739709443099273, |
| "grad_norm": 0.33510878682136536, |
| "learning_rate": 3.156737079720555e-05, |
| "loss": 0.1931, |
| "num_input_tokens_seen": 1607304, |
| "step": 3915 |
| }, |
| { |
| "epoch": 4.745762711864407, |
| "grad_norm": 0.5965043902397156, |
| "learning_rate": 3.151638669830919e-05, |
| "loss": 0.273, |
| "num_input_tokens_seen": 1609384, |
| "step": 3920 |
| }, |
| { |
| "epoch": 4.75181598062954, |
| "grad_norm": 0.45328766107559204, |
| "learning_rate": 3.1465373505554e-05, |
| "loss": 0.2301, |
| "num_input_tokens_seen": 1611304, |
| "step": 3925 |
| }, |
| { |
| "epoch": 4.757869249394673, |
| "grad_norm": 0.4171653985977173, |
| "learning_rate": 3.14143314466997e-05, |
| "loss": 0.2207, |
| "num_input_tokens_seen": 1613192, |
| "step": 3930 |
| }, |
| { |
| "epoch": 4.763922518159807, |
| "grad_norm": 0.3365018665790558, |
| "learning_rate": 3.136326074963494e-05, |
| "loss": 0.233, |
| "num_input_tokens_seen": 1615304, |
| "step": 3935 |
| }, |
| { |
| "epoch": 4.76997578692494, |
| "grad_norm": 0.25079458951950073, |
| "learning_rate": 3.131216164237622e-05, |
| "loss": 0.2067, |
| "num_input_tokens_seen": 1617288, |
| "step": 3940 |
| }, |
| { |
| "epoch": 4.776029055690072, |
| "grad_norm": 0.24499015510082245, |
| "learning_rate": 3.1261034353066884e-05, |
| "loss": 0.1732, |
| "num_input_tokens_seen": 1619336, |
| "step": 3945 |
| }, |
| { |
| "epoch": 4.782082324455206, |
| "grad_norm": 0.3134455382823944, |
| "learning_rate": 3.1209879109976064e-05, |
| "loss": 0.2142, |
| "num_input_tokens_seen": 1621416, |
| "step": 3950 |
| }, |
| { |
| "epoch": 4.788135593220339, |
| "grad_norm": 0.1736644208431244, |
| "learning_rate": 3.115869614149776e-05, |
| "loss": 0.1606, |
| "num_input_tokens_seen": 1623432, |
| "step": 3955 |
| }, |
| { |
| "epoch": 4.7941888619854724, |
| "grad_norm": 0.23227068781852722, |
| "learning_rate": 3.1107485676149714e-05, |
| "loss": 0.298, |
| "num_input_tokens_seen": 1625448, |
| "step": 3960 |
| }, |
| { |
| "epoch": 4.800242130750606, |
| "grad_norm": 0.38661643862724304, |
| "learning_rate": 3.105624794257245e-05, |
| "loss": 0.2285, |
| "num_input_tokens_seen": 1627624, |
| "step": 3965 |
| }, |
| { |
| "epoch": 4.806295399515738, |
| "grad_norm": 0.5172131657600403, |
| "learning_rate": 3.100498316952823e-05, |
| "loss": 0.2128, |
| "num_input_tokens_seen": 1629800, |
| "step": 3970 |
| }, |
| { |
| "epoch": 4.812348668280872, |
| "grad_norm": 0.22766058146953583, |
| "learning_rate": 3.095369158590006e-05, |
| "loss": 0.1662, |
| "num_input_tokens_seen": 1631720, |
| "step": 3975 |
| }, |
| { |
| "epoch": 4.818401937046005, |
| "grad_norm": 0.2815072238445282, |
| "learning_rate": 3.09023734206906e-05, |
| "loss": 0.1829, |
| "num_input_tokens_seen": 1633704, |
| "step": 3980 |
| }, |
| { |
| "epoch": 4.824455205811138, |
| "grad_norm": 0.21688483655452728, |
| "learning_rate": 3.085102890302125e-05, |
| "loss": 0.1778, |
| "num_input_tokens_seen": 1635656, |
| "step": 3985 |
| }, |
| { |
| "epoch": 4.830508474576272, |
| "grad_norm": 0.5362370014190674, |
| "learning_rate": 3.079965826213102e-05, |
| "loss": 0.1966, |
| "num_input_tokens_seen": 1637736, |
| "step": 3990 |
| }, |
| { |
| "epoch": 4.836561743341404, |
| "grad_norm": 0.4161891043186188, |
| "learning_rate": 3.074826172737559e-05, |
| "loss": 0.2171, |
| "num_input_tokens_seen": 1639816, |
| "step": 3995 |
| }, |
| { |
| "epoch": 4.842615012106537, |
| "grad_norm": 0.4591980576515198, |
| "learning_rate": 3.0696839528226206e-05, |
| "loss": 0.1543, |
| "num_input_tokens_seen": 1641736, |
| "step": 4000 |
| }, |
| { |
| "epoch": 4.848668280871671, |
| "grad_norm": 0.2013375163078308, |
| "learning_rate": 3.064539189426874e-05, |
| "loss": 0.2386, |
| "num_input_tokens_seen": 1643656, |
| "step": 4005 |
| }, |
| { |
| "epoch": 4.854721549636804, |
| "grad_norm": 0.29858848452568054, |
| "learning_rate": 3.059391905520259e-05, |
| "loss": 0.1984, |
| "num_input_tokens_seen": 1645736, |
| "step": 4010 |
| }, |
| { |
| "epoch": 4.8607748184019375, |
| "grad_norm": 0.2506105303764343, |
| "learning_rate": 3.054242124083972e-05, |
| "loss": 0.2283, |
| "num_input_tokens_seen": 1647688, |
| "step": 4015 |
| }, |
| { |
| "epoch": 4.86682808716707, |
| "grad_norm": 0.3207820653915405, |
| "learning_rate": 3.0490898681103575e-05, |
| "loss": 0.1705, |
| "num_input_tokens_seen": 1649768, |
| "step": 4020 |
| }, |
| { |
| "epoch": 4.872881355932203, |
| "grad_norm": 0.6677879691123962, |
| "learning_rate": 3.0439351606028094e-05, |
| "loss": 0.2255, |
| "num_input_tokens_seen": 1651688, |
| "step": 4025 |
| }, |
| { |
| "epoch": 4.878934624697337, |
| "grad_norm": 0.2945452034473419, |
| "learning_rate": 3.0387780245756655e-05, |
| "loss": 0.1643, |
| "num_input_tokens_seen": 1653896, |
| "step": 4030 |
| }, |
| { |
| "epoch": 4.88498789346247, |
| "grad_norm": 0.7847174406051636, |
| "learning_rate": 3.0336184830541093e-05, |
| "loss": 0.1758, |
| "num_input_tokens_seen": 1656008, |
| "step": 4035 |
| }, |
| { |
| "epoch": 4.891041162227603, |
| "grad_norm": 0.43117037415504456, |
| "learning_rate": 3.028456559074061e-05, |
| "loss": 0.2417, |
| "num_input_tokens_seen": 1658088, |
| "step": 4040 |
| }, |
| { |
| "epoch": 4.897094430992736, |
| "grad_norm": 0.4288983941078186, |
| "learning_rate": 3.0232922756820804e-05, |
| "loss": 0.1886, |
| "num_input_tokens_seen": 1660200, |
| "step": 4045 |
| }, |
| { |
| "epoch": 4.903147699757869, |
| "grad_norm": 0.22557829320430756, |
| "learning_rate": 3.0181256559352587e-05, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 1662440, |
| "step": 4050 |
| }, |
| { |
| "epoch": 4.9092009685230025, |
| "grad_norm": 0.21760496497154236, |
| "learning_rate": 3.0129567229011214e-05, |
| "loss": 0.2214, |
| "num_input_tokens_seen": 1664552, |
| "step": 4055 |
| }, |
| { |
| "epoch": 4.915254237288136, |
| "grad_norm": 0.270313560962677, |
| "learning_rate": 3.0077854996575184e-05, |
| "loss": 0.198, |
| "num_input_tokens_seen": 1666600, |
| "step": 4060 |
| }, |
| { |
| "epoch": 4.921307506053269, |
| "grad_norm": 0.27527645230293274, |
| "learning_rate": 3.0026120092925293e-05, |
| "loss": 0.1852, |
| "num_input_tokens_seen": 1668776, |
| "step": 4065 |
| }, |
| { |
| "epoch": 4.927360774818402, |
| "grad_norm": 0.42429453134536743, |
| "learning_rate": 2.9974362749043512e-05, |
| "loss": 0.2108, |
| "num_input_tokens_seen": 1670952, |
| "step": 4070 |
| }, |
| { |
| "epoch": 4.933414043583535, |
| "grad_norm": 0.25142019987106323, |
| "learning_rate": 2.9922583196012037e-05, |
| "loss": 0.211, |
| "num_input_tokens_seen": 1673128, |
| "step": 4075 |
| }, |
| { |
| "epoch": 4.939467312348668, |
| "grad_norm": 0.2894810140132904, |
| "learning_rate": 2.9870781665012204e-05, |
| "loss": 0.2428, |
| "num_input_tokens_seen": 1675112, |
| "step": 4080 |
| }, |
| { |
| "epoch": 4.945520581113802, |
| "grad_norm": 0.5641507506370544, |
| "learning_rate": 2.981895838732348e-05, |
| "loss": 0.2332, |
| "num_input_tokens_seen": 1677096, |
| "step": 4085 |
| }, |
| { |
| "epoch": 4.951573849878935, |
| "grad_norm": 0.5573171973228455, |
| "learning_rate": 2.9767113594322426e-05, |
| "loss": 0.1913, |
| "num_input_tokens_seen": 1679080, |
| "step": 4090 |
| }, |
| { |
| "epoch": 4.9576271186440675, |
| "grad_norm": 0.39944949746131897, |
| "learning_rate": 2.9715247517481655e-05, |
| "loss": 0.1562, |
| "num_input_tokens_seen": 1681000, |
| "step": 4095 |
| }, |
| { |
| "epoch": 4.963680387409201, |
| "grad_norm": 0.34067437052726746, |
| "learning_rate": 2.96633603883688e-05, |
| "loss": 0.2139, |
| "num_input_tokens_seen": 1683048, |
| "step": 4100 |
| }, |
| { |
| "epoch": 4.969733656174334, |
| "grad_norm": 0.3487281799316406, |
| "learning_rate": 2.961145243864552e-05, |
| "loss": 0.2071, |
| "num_input_tokens_seen": 1685160, |
| "step": 4105 |
| }, |
| { |
| "epoch": 4.9757869249394675, |
| "grad_norm": 0.5525677800178528, |
| "learning_rate": 2.9559523900066395e-05, |
| "loss": 0.2234, |
| "num_input_tokens_seen": 1687048, |
| "step": 4110 |
| }, |
| { |
| "epoch": 4.981840193704601, |
| "grad_norm": 0.23773516714572906, |
| "learning_rate": 2.9507575004477955e-05, |
| "loss": 0.2243, |
| "num_input_tokens_seen": 1689000, |
| "step": 4115 |
| }, |
| { |
| "epoch": 4.987893462469733, |
| "grad_norm": 0.36967313289642334, |
| "learning_rate": 2.9455605983817598e-05, |
| "loss": 0.2064, |
| "num_input_tokens_seen": 1691112, |
| "step": 4120 |
| }, |
| { |
| "epoch": 4.993946731234867, |
| "grad_norm": 0.6218871474266052, |
| "learning_rate": 2.9403617070112587e-05, |
| "loss": 0.209, |
| "num_input_tokens_seen": 1693160, |
| "step": 4125 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.287613183259964, |
| "learning_rate": 2.9351608495479004e-05, |
| "loss": 0.1736, |
| "num_input_tokens_seen": 1694912, |
| "step": 4130 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.20516888797283173, |
| "eval_runtime": 7.6695, |
| "eval_samples_per_second": 47.852, |
| "eval_steps_per_second": 11.996, |
| "num_input_tokens_seen": 1694912, |
| "step": 4130 |
| }, |
| { |
| "epoch": 5.006053268765133, |
| "grad_norm": 0.5180403590202332, |
| "learning_rate": 2.92995804921207e-05, |
| "loss": 0.2037, |
| "num_input_tokens_seen": 1697056, |
| "step": 4135 |
| }, |
| { |
| "epoch": 5.012106537530267, |
| "grad_norm": 0.3011190593242645, |
| "learning_rate": 2.9247533292328273e-05, |
| "loss": 0.1396, |
| "num_input_tokens_seen": 1699168, |
| "step": 4140 |
| }, |
| { |
| "epoch": 5.018159806295399, |
| "grad_norm": 0.2943519055843353, |
| "learning_rate": 2.9195467128478044e-05, |
| "loss": 0.2345, |
| "num_input_tokens_seen": 1701152, |
| "step": 4145 |
| }, |
| { |
| "epoch": 5.0242130750605325, |
| "grad_norm": 0.32699859142303467, |
| "learning_rate": 2.914338223303098e-05, |
| "loss": 0.2692, |
| "num_input_tokens_seen": 1703168, |
| "step": 4150 |
| }, |
| { |
| "epoch": 5.030266343825666, |
| "grad_norm": 0.41539907455444336, |
| "learning_rate": 2.9091278838531695e-05, |
| "loss": 0.1607, |
| "num_input_tokens_seen": 1705120, |
| "step": 4155 |
| }, |
| { |
| "epoch": 5.036319612590799, |
| "grad_norm": 0.5979402661323547, |
| "learning_rate": 2.9039157177607383e-05, |
| "loss": 0.2667, |
| "num_input_tokens_seen": 1707200, |
| "step": 4160 |
| }, |
| { |
| "epoch": 5.0423728813559325, |
| "grad_norm": 0.19587047398090363, |
| "learning_rate": 2.8987017482966815e-05, |
| "loss": 0.247, |
| "num_input_tokens_seen": 1709120, |
| "step": 4165 |
| }, |
| { |
| "epoch": 5.048426150121065, |
| "grad_norm": 0.3168056309223175, |
| "learning_rate": 2.893485998739926e-05, |
| "loss": 0.1605, |
| "num_input_tokens_seen": 1711200, |
| "step": 4170 |
| }, |
| { |
| "epoch": 5.054479418886198, |
| "grad_norm": 0.5250975489616394, |
| "learning_rate": 2.8882684923773458e-05, |
| "loss": 0.237, |
| "num_input_tokens_seen": 1713248, |
| "step": 4175 |
| }, |
| { |
| "epoch": 5.060532687651332, |
| "grad_norm": 0.35053110122680664, |
| "learning_rate": 2.883049252503659e-05, |
| "loss": 0.215, |
| "num_input_tokens_seen": 1715296, |
| "step": 4180 |
| }, |
| { |
| "epoch": 5.066585956416465, |
| "grad_norm": 0.3158166706562042, |
| "learning_rate": 2.877828302421325e-05, |
| "loss": 0.186, |
| "num_input_tokens_seen": 1717280, |
| "step": 4185 |
| }, |
| { |
| "epoch": 5.072639225181598, |
| "grad_norm": 0.4719834327697754, |
| "learning_rate": 2.872605665440436e-05, |
| "loss": 0.1786, |
| "num_input_tokens_seen": 1719136, |
| "step": 4190 |
| }, |
| { |
| "epoch": 5.078692493946731, |
| "grad_norm": 0.18149235844612122, |
| "learning_rate": 2.8673813648786196e-05, |
| "loss": 0.1909, |
| "num_input_tokens_seen": 1721152, |
| "step": 4195 |
| }, |
| { |
| "epoch": 5.084745762711864, |
| "grad_norm": 0.3479498028755188, |
| "learning_rate": 2.862155424060926e-05, |
| "loss": 0.1581, |
| "num_input_tokens_seen": 1723328, |
| "step": 4200 |
| }, |
| { |
| "epoch": 5.0907990314769975, |
| "grad_norm": 0.39899131655693054, |
| "learning_rate": 2.856927866319733e-05, |
| "loss": 0.1815, |
| "num_input_tokens_seen": 1725280, |
| "step": 4205 |
| }, |
| { |
| "epoch": 5.096852300242131, |
| "grad_norm": 0.4906311631202698, |
| "learning_rate": 2.851698714994635e-05, |
| "loss": 0.1775, |
| "num_input_tokens_seen": 1727328, |
| "step": 4210 |
| }, |
| { |
| "epoch": 5.102905569007264, |
| "grad_norm": 0.5108490586280823, |
| "learning_rate": 2.8464679934323424e-05, |
| "loss": 0.1915, |
| "num_input_tokens_seen": 1729472, |
| "step": 4215 |
| }, |
| { |
| "epoch": 5.108958837772397, |
| "grad_norm": 0.17742492258548737, |
| "learning_rate": 2.841235724986575e-05, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 1731392, |
| "step": 4220 |
| }, |
| { |
| "epoch": 5.11501210653753, |
| "grad_norm": 0.45351698994636536, |
| "learning_rate": 2.8360019330179604e-05, |
| "loss": 0.2097, |
| "num_input_tokens_seen": 1733472, |
| "step": 4225 |
| }, |
| { |
| "epoch": 5.121065375302663, |
| "grad_norm": 0.306892991065979, |
| "learning_rate": 2.8307666408939278e-05, |
| "loss": 0.2384, |
| "num_input_tokens_seen": 1735520, |
| "step": 4230 |
| }, |
| { |
| "epoch": 5.127118644067797, |
| "grad_norm": 0.15731698274612427, |
| "learning_rate": 2.8255298719886043e-05, |
| "loss": 0.2091, |
| "num_input_tokens_seen": 1737536, |
| "step": 4235 |
| }, |
| { |
| "epoch": 5.13317191283293, |
| "grad_norm": 0.30763283371925354, |
| "learning_rate": 2.820291649682709e-05, |
| "loss": 0.2208, |
| "num_input_tokens_seen": 1739424, |
| "step": 4240 |
| }, |
| { |
| "epoch": 5.1392251815980625, |
| "grad_norm": 0.3806794285774231, |
| "learning_rate": 2.8150519973634543e-05, |
| "loss": 0.2194, |
| "num_input_tokens_seen": 1741536, |
| "step": 4245 |
| }, |
| { |
| "epoch": 5.145278450363196, |
| "grad_norm": 0.303024560213089, |
| "learning_rate": 2.809810938424432e-05, |
| "loss": 0.1936, |
| "num_input_tokens_seen": 1743488, |
| "step": 4250 |
| }, |
| { |
| "epoch": 5.151331719128329, |
| "grad_norm": 0.4599146842956543, |
| "learning_rate": 2.804568496265516e-05, |
| "loss": 0.1732, |
| "num_input_tokens_seen": 1745728, |
| "step": 4255 |
| }, |
| { |
| "epoch": 5.157384987893463, |
| "grad_norm": 0.5426612496376038, |
| "learning_rate": 2.799324694292757e-05, |
| "loss": 0.1742, |
| "num_input_tokens_seen": 1747808, |
| "step": 4260 |
| }, |
| { |
| "epoch": 5.163438256658596, |
| "grad_norm": 0.33161652088165283, |
| "learning_rate": 2.7940795559182764e-05, |
| "loss": 0.137, |
| "num_input_tokens_seen": 1749856, |
| "step": 4265 |
| }, |
| { |
| "epoch": 5.169491525423728, |
| "grad_norm": 0.36280059814453125, |
| "learning_rate": 2.788833104560161e-05, |
| "loss": 0.207, |
| "num_input_tokens_seen": 1751904, |
| "step": 4270 |
| }, |
| { |
| "epoch": 5.175544794188862, |
| "grad_norm": 0.21575656533241272, |
| "learning_rate": 2.7835853636423616e-05, |
| "loss": 0.2459, |
| "num_input_tokens_seen": 1753984, |
| "step": 4275 |
| }, |
| { |
| "epoch": 5.181598062953995, |
| "grad_norm": 0.19078552722930908, |
| "learning_rate": 2.7783363565945847e-05, |
| "loss": 0.1951, |
| "num_input_tokens_seen": 1756000, |
| "step": 4280 |
| }, |
| { |
| "epoch": 5.187651331719128, |
| "grad_norm": 0.3999118208885193, |
| "learning_rate": 2.773086106852192e-05, |
| "loss": 0.1999, |
| "num_input_tokens_seen": 1758080, |
| "step": 4285 |
| }, |
| { |
| "epoch": 5.193704600484262, |
| "grad_norm": 0.5344605445861816, |
| "learning_rate": 2.7678346378560903e-05, |
| "loss": 0.2717, |
| "num_input_tokens_seen": 1760224, |
| "step": 4290 |
| }, |
| { |
| "epoch": 5.199757869249395, |
| "grad_norm": 0.5719698071479797, |
| "learning_rate": 2.762581973052633e-05, |
| "loss": 0.2219, |
| "num_input_tokens_seen": 1762176, |
| "step": 4295 |
| }, |
| { |
| "epoch": 5.2058111380145276, |
| "grad_norm": 0.30912867188453674, |
| "learning_rate": 2.7573281358935104e-05, |
| "loss": 0.1739, |
| "num_input_tokens_seen": 1764352, |
| "step": 4300 |
| }, |
| { |
| "epoch": 5.211864406779661, |
| "grad_norm": 0.30914685130119324, |
| "learning_rate": 2.7520731498356494e-05, |
| "loss": 0.2146, |
| "num_input_tokens_seen": 1766432, |
| "step": 4305 |
| }, |
| { |
| "epoch": 5.217917675544794, |
| "grad_norm": 0.34490132331848145, |
| "learning_rate": 2.746817038341103e-05, |
| "loss": 0.21, |
| "num_input_tokens_seen": 1768608, |
| "step": 4310 |
| }, |
| { |
| "epoch": 5.223970944309928, |
| "grad_norm": 0.279784619808197, |
| "learning_rate": 2.7415598248769524e-05, |
| "loss": 0.165, |
| "num_input_tokens_seen": 1770752, |
| "step": 4315 |
| }, |
| { |
| "epoch": 5.230024213075061, |
| "grad_norm": 0.5204907059669495, |
| "learning_rate": 2.7363015329151965e-05, |
| "loss": 0.1473, |
| "num_input_tokens_seen": 1772832, |
| "step": 4320 |
| }, |
| { |
| "epoch": 5.236077481840193, |
| "grad_norm": 0.25638312101364136, |
| "learning_rate": 2.73104218593265e-05, |
| "loss": 0.1849, |
| "num_input_tokens_seen": 1774912, |
| "step": 4325 |
| }, |
| { |
| "epoch": 5.242130750605327, |
| "grad_norm": 0.24555715918540955, |
| "learning_rate": 2.7257818074108394e-05, |
| "loss": 0.2021, |
| "num_input_tokens_seen": 1777024, |
| "step": 4330 |
| }, |
| { |
| "epoch": 5.24818401937046, |
| "grad_norm": 0.47604724764823914, |
| "learning_rate": 2.7205204208358947e-05, |
| "loss": 0.1737, |
| "num_input_tokens_seen": 1779008, |
| "step": 4335 |
| }, |
| { |
| "epoch": 5.254237288135593, |
| "grad_norm": 0.7559889554977417, |
| "learning_rate": 2.715258049698446e-05, |
| "loss": 0.1734, |
| "num_input_tokens_seen": 1780896, |
| "step": 4340 |
| }, |
| { |
| "epoch": 5.260290556900727, |
| "grad_norm": 0.17673127353191376, |
| "learning_rate": 2.709994717493523e-05, |
| "loss": 0.1974, |
| "num_input_tokens_seen": 1783008, |
| "step": 4345 |
| }, |
| { |
| "epoch": 5.266343825665859, |
| "grad_norm": 0.3296299874782562, |
| "learning_rate": 2.7047304477204416e-05, |
| "loss": 0.1688, |
| "num_input_tokens_seen": 1784992, |
| "step": 4350 |
| }, |
| { |
| "epoch": 5.272397094430993, |
| "grad_norm": 0.7079688906669617, |
| "learning_rate": 2.6994652638827078e-05, |
| "loss": 0.2545, |
| "num_input_tokens_seen": 1787008, |
| "step": 4355 |
| }, |
| { |
| "epoch": 5.278450363196126, |
| "grad_norm": 0.3917844593524933, |
| "learning_rate": 2.694199189487906e-05, |
| "loss": 0.1699, |
| "num_input_tokens_seen": 1789120, |
| "step": 4360 |
| }, |
| { |
| "epoch": 5.284503631961259, |
| "grad_norm": 0.5629355311393738, |
| "learning_rate": 2.688932248047597e-05, |
| "loss": 0.2087, |
| "num_input_tokens_seen": 1791232, |
| "step": 4365 |
| }, |
| { |
| "epoch": 5.290556900726393, |
| "grad_norm": 0.20843997597694397, |
| "learning_rate": 2.683664463077214e-05, |
| "loss": 0.2279, |
| "num_input_tokens_seen": 1793440, |
| "step": 4370 |
| }, |
| { |
| "epoch": 5.296610169491525, |
| "grad_norm": 0.1895889788866043, |
| "learning_rate": 2.678395858095955e-05, |
| "loss": 0.2205, |
| "num_input_tokens_seen": 1795488, |
| "step": 4375 |
| }, |
| { |
| "epoch": 5.302663438256658, |
| "grad_norm": 0.3004699647426605, |
| "learning_rate": 2.6731264566266795e-05, |
| "loss": 0.2444, |
| "num_input_tokens_seen": 1797600, |
| "step": 4380 |
| }, |
| { |
| "epoch": 5.308716707021792, |
| "grad_norm": 0.26090365648269653, |
| "learning_rate": 2.6678562821958043e-05, |
| "loss": 0.1892, |
| "num_input_tokens_seen": 1799584, |
| "step": 4385 |
| }, |
| { |
| "epoch": 5.314769975786925, |
| "grad_norm": 0.7535932064056396, |
| "learning_rate": 2.6625853583331943e-05, |
| "loss": 0.2302, |
| "num_input_tokens_seen": 1801440, |
| "step": 4390 |
| }, |
| { |
| "epoch": 5.3208232445520585, |
| "grad_norm": 0.19294387102127075, |
| "learning_rate": 2.6573137085720638e-05, |
| "loss": 0.2219, |
| "num_input_tokens_seen": 1803456, |
| "step": 4395 |
| }, |
| { |
| "epoch": 5.326876513317191, |
| "grad_norm": 0.24952974915504456, |
| "learning_rate": 2.6520413564488672e-05, |
| "loss": 0.2026, |
| "num_input_tokens_seen": 1805440, |
| "step": 4400 |
| }, |
| { |
| "epoch": 5.332929782082324, |
| "grad_norm": 0.8084157109260559, |
| "learning_rate": 2.6467683255031918e-05, |
| "loss": 0.1894, |
| "num_input_tokens_seen": 1807360, |
| "step": 4405 |
| }, |
| { |
| "epoch": 5.338983050847458, |
| "grad_norm": 0.6292164921760559, |
| "learning_rate": 2.6414946392776597e-05, |
| "loss": 0.1677, |
| "num_input_tokens_seen": 1809344, |
| "step": 4410 |
| }, |
| { |
| "epoch": 5.345036319612591, |
| "grad_norm": 0.2978937029838562, |
| "learning_rate": 2.636220321317816e-05, |
| "loss": 0.2417, |
| "num_input_tokens_seen": 1811456, |
| "step": 4415 |
| }, |
| { |
| "epoch": 5.351089588377724, |
| "grad_norm": 0.8585381507873535, |
| "learning_rate": 2.6309453951720274e-05, |
| "loss": 0.2038, |
| "num_input_tokens_seen": 1813600, |
| "step": 4420 |
| }, |
| { |
| "epoch": 5.357142857142857, |
| "grad_norm": 0.8026732206344604, |
| "learning_rate": 2.625669884391377e-05, |
| "loss": 0.1899, |
| "num_input_tokens_seen": 1815424, |
| "step": 4425 |
| }, |
| { |
| "epoch": 5.36319612590799, |
| "grad_norm": 0.273573637008667, |
| "learning_rate": 2.6203938125295552e-05, |
| "loss": 0.2159, |
| "num_input_tokens_seen": 1817600, |
| "step": 4430 |
| }, |
| { |
| "epoch": 5.3692493946731235, |
| "grad_norm": 0.31223616003990173, |
| "learning_rate": 2.6151172031427597e-05, |
| "loss": 0.1901, |
| "num_input_tokens_seen": 1819648, |
| "step": 4435 |
| }, |
| { |
| "epoch": 5.375302663438257, |
| "grad_norm": 0.482793390750885, |
| "learning_rate": 2.609840079789588e-05, |
| "loss": 0.2084, |
| "num_input_tokens_seen": 1821728, |
| "step": 4440 |
| }, |
| { |
| "epoch": 5.38135593220339, |
| "grad_norm": 0.4068007469177246, |
| "learning_rate": 2.604562466030931e-05, |
| "loss": 0.2242, |
| "num_input_tokens_seen": 1823776, |
| "step": 4445 |
| }, |
| { |
| "epoch": 5.387409200968523, |
| "grad_norm": 0.4406266510486603, |
| "learning_rate": 2.599284385429871e-05, |
| "loss": 0.2179, |
| "num_input_tokens_seen": 1825856, |
| "step": 4450 |
| }, |
| { |
| "epoch": 5.393462469733656, |
| "grad_norm": 0.34258127212524414, |
| "learning_rate": 2.594005861551574e-05, |
| "loss": 0.2282, |
| "num_input_tokens_seen": 1827936, |
| "step": 4455 |
| }, |
| { |
| "epoch": 5.399515738498789, |
| "grad_norm": 0.37745401263237, |
| "learning_rate": 2.588726917963183e-05, |
| "loss": 0.2292, |
| "num_input_tokens_seen": 1829824, |
| "step": 4460 |
| }, |
| { |
| "epoch": 5.405569007263923, |
| "grad_norm": 0.29385197162628174, |
| "learning_rate": 2.5834475782337187e-05, |
| "loss": 0.1741, |
| "num_input_tokens_seen": 1831936, |
| "step": 4465 |
| }, |
| { |
| "epoch": 5.411622276029056, |
| "grad_norm": 0.4422626495361328, |
| "learning_rate": 2.578167865933967e-05, |
| "loss": 0.1931, |
| "num_input_tokens_seen": 1834048, |
| "step": 4470 |
| }, |
| { |
| "epoch": 5.4176755447941884, |
| "grad_norm": 0.4614945352077484, |
| "learning_rate": 2.5728878046363785e-05, |
| "loss": 0.2001, |
| "num_input_tokens_seen": 1836192, |
| "step": 4475 |
| }, |
| { |
| "epoch": 5.423728813559322, |
| "grad_norm": 0.14360632002353668, |
| "learning_rate": 2.5676074179149635e-05, |
| "loss": 0.1719, |
| "num_input_tokens_seen": 1838432, |
| "step": 4480 |
| }, |
| { |
| "epoch": 5.429782082324455, |
| "grad_norm": 0.23715360462665558, |
| "learning_rate": 2.5623267293451826e-05, |
| "loss": 0.2123, |
| "num_input_tokens_seen": 1840576, |
| "step": 4485 |
| }, |
| { |
| "epoch": 5.4358353510895885, |
| "grad_norm": 0.19056779146194458, |
| "learning_rate": 2.5570457625038457e-05, |
| "loss": 0.1604, |
| "num_input_tokens_seen": 1842624, |
| "step": 4490 |
| }, |
| { |
| "epoch": 5.441888619854722, |
| "grad_norm": 0.3184802532196045, |
| "learning_rate": 2.551764540969005e-05, |
| "loss": 0.1944, |
| "num_input_tokens_seen": 1844576, |
| "step": 4495 |
| }, |
| { |
| "epoch": 5.447941888619855, |
| "grad_norm": 0.32326728105545044, |
| "learning_rate": 2.5464830883198492e-05, |
| "loss": 0.2239, |
| "num_input_tokens_seen": 1846560, |
| "step": 4500 |
| }, |
| { |
| "epoch": 5.453995157384988, |
| "grad_norm": 0.25731295347213745, |
| "learning_rate": 2.5412014281365986e-05, |
| "loss": 0.2062, |
| "num_input_tokens_seen": 1848576, |
| "step": 4505 |
| }, |
| { |
| "epoch": 5.460048426150121, |
| "grad_norm": 0.22198982536792755, |
| "learning_rate": 2.5359195840004023e-05, |
| "loss": 0.1922, |
| "num_input_tokens_seen": 1850688, |
| "step": 4510 |
| }, |
| { |
| "epoch": 5.466101694915254, |
| "grad_norm": 0.3998708426952362, |
| "learning_rate": 2.5306375794932273e-05, |
| "loss": 0.209, |
| "num_input_tokens_seen": 1852800, |
| "step": 4515 |
| }, |
| { |
| "epoch": 5.472154963680388, |
| "grad_norm": 0.2438637763261795, |
| "learning_rate": 2.52535543819776e-05, |
| "loss": 0.1988, |
| "num_input_tokens_seen": 1854720, |
| "step": 4520 |
| }, |
| { |
| "epoch": 5.478208232445521, |
| "grad_norm": 0.268893301486969, |
| "learning_rate": 2.5200731836972956e-05, |
| "loss": 0.1946, |
| "num_input_tokens_seen": 1856768, |
| "step": 4525 |
| }, |
| { |
| "epoch": 5.4842615012106535, |
| "grad_norm": 0.18818461894989014, |
| "learning_rate": 2.5147908395756343e-05, |
| "loss": 0.1624, |
| "num_input_tokens_seen": 1858848, |
| "step": 4530 |
| }, |
| { |
| "epoch": 5.490314769975787, |
| "grad_norm": 0.29403194785118103, |
| "learning_rate": 2.5095084294169768e-05, |
| "loss": 0.1598, |
| "num_input_tokens_seen": 1860896, |
| "step": 4535 |
| }, |
| { |
| "epoch": 5.49636803874092, |
| "grad_norm": 0.40155667066574097, |
| "learning_rate": 2.5042259768058208e-05, |
| "loss": 0.2032, |
| "num_input_tokens_seen": 1862816, |
| "step": 4540 |
| }, |
| { |
| "epoch": 5.5, |
| "eval_loss": 0.19803602993488312, |
| "eval_runtime": 7.6757, |
| "eval_samples_per_second": 47.813, |
| "eval_steps_per_second": 11.986, |
| "num_input_tokens_seen": 1864000, |
| "step": 4543 |
| }, |
| { |
| "epoch": 5.5024213075060535, |
| "grad_norm": 0.5182916522026062, |
| "learning_rate": 2.4989435053268497e-05, |
| "loss": 0.1842, |
| "num_input_tokens_seen": 1864832, |
| "step": 4545 |
| }, |
| { |
| "epoch": 5.508474576271187, |
| "grad_norm": 0.2231263369321823, |
| "learning_rate": 2.493661038564835e-05, |
| "loss": 0.2327, |
| "num_input_tokens_seen": 1866752, |
| "step": 4550 |
| }, |
| { |
| "epoch": 5.514527845036319, |
| "grad_norm": 0.2348439246416092, |
| "learning_rate": 2.4883786001045238e-05, |
| "loss": 0.1917, |
| "num_input_tokens_seen": 1868928, |
| "step": 4555 |
| }, |
| { |
| "epoch": 5.520581113801453, |
| "grad_norm": 0.2559913992881775, |
| "learning_rate": 2.4830962135305398e-05, |
| "loss": 0.2102, |
| "num_input_tokens_seen": 1871072, |
| "step": 4560 |
| }, |
| { |
| "epoch": 5.526634382566586, |
| "grad_norm": 0.2784074544906616, |
| "learning_rate": 2.4778139024272724e-05, |
| "loss": 0.1326, |
| "num_input_tokens_seen": 1873152, |
| "step": 4565 |
| }, |
| { |
| "epoch": 5.532687651331719, |
| "grad_norm": 0.5361984968185425, |
| "learning_rate": 2.4725316903787765e-05, |
| "loss": 0.222, |
| "num_input_tokens_seen": 1875040, |
| "step": 4570 |
| }, |
| { |
| "epoch": 5.538740920096853, |
| "grad_norm": 0.27152353525161743, |
| "learning_rate": 2.4672496009686622e-05, |
| "loss": 0.1456, |
| "num_input_tokens_seen": 1877184, |
| "step": 4575 |
| }, |
| { |
| "epoch": 5.544794188861985, |
| "grad_norm": 0.2514403760433197, |
| "learning_rate": 2.4619676577799946e-05, |
| "loss": 0.1693, |
| "num_input_tokens_seen": 1879232, |
| "step": 4580 |
| }, |
| { |
| "epoch": 5.5508474576271185, |
| "grad_norm": 0.43311169743537903, |
| "learning_rate": 2.4566858843951847e-05, |
| "loss": 0.178, |
| "num_input_tokens_seen": 1881312, |
| "step": 4585 |
| }, |
| { |
| "epoch": 5.556900726392252, |
| "grad_norm": 0.25901293754577637, |
| "learning_rate": 2.451404304395884e-05, |
| "loss": 0.1962, |
| "num_input_tokens_seen": 1883328, |
| "step": 4590 |
| }, |
| { |
| "epoch": 5.562953995157385, |
| "grad_norm": 0.8058931231498718, |
| "learning_rate": 2.446122941362883e-05, |
| "loss": 0.214, |
| "num_input_tokens_seen": 1885248, |
| "step": 4595 |
| }, |
| { |
| "epoch": 5.5690072639225185, |
| "grad_norm": 0.45033499598503113, |
| "learning_rate": 2.4408418188760026e-05, |
| "loss": 0.2066, |
| "num_input_tokens_seen": 1887328, |
| "step": 4600 |
| }, |
| { |
| "epoch": 5.575060532687651, |
| "grad_norm": 0.44099339842796326, |
| "learning_rate": 2.435560960513989e-05, |
| "loss": 0.1666, |
| "num_input_tokens_seen": 1889408, |
| "step": 4605 |
| }, |
| { |
| "epoch": 5.581113801452784, |
| "grad_norm": 0.49961069226264954, |
| "learning_rate": 2.4302803898544106e-05, |
| "loss": 0.2623, |
| "num_input_tokens_seen": 1891456, |
| "step": 4610 |
| }, |
| { |
| "epoch": 5.587167070217918, |
| "grad_norm": 0.35013407468795776, |
| "learning_rate": 2.425000130473549e-05, |
| "loss": 0.1608, |
| "num_input_tokens_seen": 1893696, |
| "step": 4615 |
| }, |
| { |
| "epoch": 5.593220338983051, |
| "grad_norm": 0.2398826777935028, |
| "learning_rate": 2.4197202059463e-05, |
| "loss": 0.1292, |
| "num_input_tokens_seen": 1895616, |
| "step": 4620 |
| }, |
| { |
| "epoch": 5.599273607748184, |
| "grad_norm": 0.5173197984695435, |
| "learning_rate": 2.4144406398460594e-05, |
| "loss": 0.172, |
| "num_input_tokens_seen": 1897600, |
| "step": 4625 |
| }, |
| { |
| "epoch": 5.605326876513317, |
| "grad_norm": 0.3305251896381378, |
| "learning_rate": 2.4091614557446267e-05, |
| "loss": 0.29, |
| "num_input_tokens_seen": 1899616, |
| "step": 4630 |
| }, |
| { |
| "epoch": 5.61138014527845, |
| "grad_norm": 0.2020539939403534, |
| "learning_rate": 2.4038826772120932e-05, |
| "loss": 0.1916, |
| "num_input_tokens_seen": 1901568, |
| "step": 4635 |
| }, |
| { |
| "epoch": 5.6174334140435835, |
| "grad_norm": 0.5813433527946472, |
| "learning_rate": 2.398604327816742e-05, |
| "loss": 0.1752, |
| "num_input_tokens_seen": 1903616, |
| "step": 4640 |
| }, |
| { |
| "epoch": 5.623486682808717, |
| "grad_norm": 0.2911786139011383, |
| "learning_rate": 2.3933264311249377e-05, |
| "loss": 0.291, |
| "num_input_tokens_seen": 1905504, |
| "step": 4645 |
| }, |
| { |
| "epoch": 5.62953995157385, |
| "grad_norm": 0.3432205617427826, |
| "learning_rate": 2.3880490107010255e-05, |
| "loss": 0.2202, |
| "num_input_tokens_seen": 1907456, |
| "step": 4650 |
| }, |
| { |
| "epoch": 5.635593220338983, |
| "grad_norm": 0.630300760269165, |
| "learning_rate": 2.382772090107223e-05, |
| "loss": 0.2532, |
| "num_input_tokens_seen": 1909568, |
| "step": 4655 |
| }, |
| { |
| "epoch": 5.641646489104116, |
| "grad_norm": 0.5793043375015259, |
| "learning_rate": 2.3774956929035177e-05, |
| "loss": 0.1747, |
| "num_input_tokens_seen": 1911840, |
| "step": 4660 |
| }, |
| { |
| "epoch": 5.647699757869249, |
| "grad_norm": 0.29113325476646423, |
| "learning_rate": 2.3722198426475593e-05, |
| "loss": 0.2324, |
| "num_input_tokens_seen": 1913952, |
| "step": 4665 |
| }, |
| { |
| "epoch": 5.653753026634383, |
| "grad_norm": 0.49811410903930664, |
| "learning_rate": 2.3669445628945542e-05, |
| "loss": 0.2511, |
| "num_input_tokens_seen": 1916160, |
| "step": 4670 |
| }, |
| { |
| "epoch": 5.659806295399516, |
| "grad_norm": 0.5889513492584229, |
| "learning_rate": 2.3616698771971633e-05, |
| "loss": 0.2265, |
| "num_input_tokens_seen": 1918144, |
| "step": 4675 |
| }, |
| { |
| "epoch": 5.6658595641646485, |
| "grad_norm": 0.4038580358028412, |
| "learning_rate": 2.356395809105396e-05, |
| "loss": 0.2026, |
| "num_input_tokens_seen": 1920160, |
| "step": 4680 |
| }, |
| { |
| "epoch": 5.671912832929782, |
| "grad_norm": 0.2696045935153961, |
| "learning_rate": 2.3511223821665028e-05, |
| "loss": 0.2205, |
| "num_input_tokens_seen": 1922368, |
| "step": 4685 |
| }, |
| { |
| "epoch": 5.677966101694915, |
| "grad_norm": 0.47910162806510925, |
| "learning_rate": 2.3458496199248717e-05, |
| "loss": 0.2148, |
| "num_input_tokens_seen": 1924448, |
| "step": 4690 |
| }, |
| { |
| "epoch": 5.684019370460049, |
| "grad_norm": 0.4316138029098511, |
| "learning_rate": 2.340577545921923e-05, |
| "loss": 0.1914, |
| "num_input_tokens_seen": 1926592, |
| "step": 4695 |
| }, |
| { |
| "epoch": 5.690072639225182, |
| "grad_norm": 0.22598016262054443, |
| "learning_rate": 2.335306183696006e-05, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 1928672, |
| "step": 4700 |
| }, |
| { |
| "epoch": 5.696125907990314, |
| "grad_norm": 0.287159264087677, |
| "learning_rate": 2.3300355567822897e-05, |
| "loss": 0.1917, |
| "num_input_tokens_seen": 1930816, |
| "step": 4705 |
| }, |
| { |
| "epoch": 5.702179176755448, |
| "grad_norm": 0.42663267254829407, |
| "learning_rate": 2.324765688712661e-05, |
| "loss": 0.2083, |
| "num_input_tokens_seen": 1932896, |
| "step": 4710 |
| }, |
| { |
| "epoch": 5.708232445520581, |
| "grad_norm": 0.44988030195236206, |
| "learning_rate": 2.3194966030156187e-05, |
| "loss": 0.2052, |
| "num_input_tokens_seen": 1934944, |
| "step": 4715 |
| }, |
| { |
| "epoch": 5.714285714285714, |
| "grad_norm": 0.19450457394123077, |
| "learning_rate": 2.31422832321617e-05, |
| "loss": 0.1821, |
| "num_input_tokens_seen": 1937152, |
| "step": 4720 |
| }, |
| { |
| "epoch": 5.720338983050848, |
| "grad_norm": 0.31133726239204407, |
| "learning_rate": 2.308960872835721e-05, |
| "loss": 0.1653, |
| "num_input_tokens_seen": 1939328, |
| "step": 4725 |
| }, |
| { |
| "epoch": 5.72639225181598, |
| "grad_norm": 0.23220045864582062, |
| "learning_rate": 2.3036942753919775e-05, |
| "loss": 0.2198, |
| "num_input_tokens_seen": 1941312, |
| "step": 4730 |
| }, |
| { |
| "epoch": 5.732445520581114, |
| "grad_norm": 0.31443890929222107, |
| "learning_rate": 2.2984285543988352e-05, |
| "loss": 0.2039, |
| "num_input_tokens_seen": 1943264, |
| "step": 4735 |
| }, |
| { |
| "epoch": 5.738498789346247, |
| "grad_norm": 0.40928277373313904, |
| "learning_rate": 2.2931637333662785e-05, |
| "loss": 0.2785, |
| "num_input_tokens_seen": 1945312, |
| "step": 4740 |
| }, |
| { |
| "epoch": 5.74455205811138, |
| "grad_norm": 0.3616982400417328, |
| "learning_rate": 2.287899835800273e-05, |
| "loss": 0.2155, |
| "num_input_tokens_seen": 1947328, |
| "step": 4745 |
| }, |
| { |
| "epoch": 5.750605326876514, |
| "grad_norm": 0.30421537160873413, |
| "learning_rate": 2.2826368852026597e-05, |
| "loss": 0.1956, |
| "num_input_tokens_seen": 1949376, |
| "step": 4750 |
| }, |
| { |
| "epoch": 5.756658595641646, |
| "grad_norm": 0.20220312476158142, |
| "learning_rate": 2.277374905071053e-05, |
| "loss": 0.2156, |
| "num_input_tokens_seen": 1951456, |
| "step": 4755 |
| }, |
| { |
| "epoch": 5.762711864406779, |
| "grad_norm": 0.15912894904613495, |
| "learning_rate": 2.2721139188987357e-05, |
| "loss": 0.1431, |
| "num_input_tokens_seen": 1953536, |
| "step": 4760 |
| }, |
| { |
| "epoch": 5.768765133171913, |
| "grad_norm": 0.48402050137519836, |
| "learning_rate": 2.26685395017455e-05, |
| "loss": 0.195, |
| "num_input_tokens_seen": 1955744, |
| "step": 4765 |
| }, |
| { |
| "epoch": 5.774818401937046, |
| "grad_norm": 0.16937394440174103, |
| "learning_rate": 2.261595022382799e-05, |
| "loss": 0.1489, |
| "num_input_tokens_seen": 1957824, |
| "step": 4770 |
| }, |
| { |
| "epoch": 5.780871670702179, |
| "grad_norm": 0.39521676301956177, |
| "learning_rate": 2.256337159003134e-05, |
| "loss": 0.2381, |
| "num_input_tokens_seen": 1960000, |
| "step": 4775 |
| }, |
| { |
| "epoch": 5.786924939467312, |
| "grad_norm": 0.5848916172981262, |
| "learning_rate": 2.251080383510459e-05, |
| "loss": 0.1842, |
| "num_input_tokens_seen": 1962144, |
| "step": 4780 |
| }, |
| { |
| "epoch": 5.792978208232445, |
| "grad_norm": 0.6398055553436279, |
| "learning_rate": 2.2458247193748155e-05, |
| "loss": 0.194, |
| "num_input_tokens_seen": 1964192, |
| "step": 4785 |
| }, |
| { |
| "epoch": 5.799031476997579, |
| "grad_norm": 0.43140700459480286, |
| "learning_rate": 2.240570190061288e-05, |
| "loss": 0.1563, |
| "num_input_tokens_seen": 1966240, |
| "step": 4790 |
| }, |
| { |
| "epoch": 5.805084745762712, |
| "grad_norm": 0.20955638587474823, |
| "learning_rate": 2.2353168190298915e-05, |
| "loss": 0.184, |
| "num_input_tokens_seen": 1968320, |
| "step": 4795 |
| }, |
| { |
| "epoch": 5.811138014527845, |
| "grad_norm": 0.19946417212486267, |
| "learning_rate": 2.2300646297354704e-05, |
| "loss": 0.205, |
| "num_input_tokens_seen": 1970304, |
| "step": 4800 |
| }, |
| { |
| "epoch": 5.817191283292978, |
| "grad_norm": 0.43935373425483704, |
| "learning_rate": 2.224813645627592e-05, |
| "loss": 0.2019, |
| "num_input_tokens_seen": 1972320, |
| "step": 4805 |
| }, |
| { |
| "epoch": 5.823244552058111, |
| "grad_norm": 0.5332738757133484, |
| "learning_rate": 2.2195638901504452e-05, |
| "loss": 0.1506, |
| "num_input_tokens_seen": 1974432, |
| "step": 4810 |
| }, |
| { |
| "epoch": 5.829297820823244, |
| "grad_norm": 0.1991523951292038, |
| "learning_rate": 2.2143153867427305e-05, |
| "loss": 0.1763, |
| "num_input_tokens_seen": 1976480, |
| "step": 4815 |
| }, |
| { |
| "epoch": 5.835351089588378, |
| "grad_norm": 0.3126385509967804, |
| "learning_rate": 2.2090681588375594e-05, |
| "loss": 0.161, |
| "num_input_tokens_seen": 1978624, |
| "step": 4820 |
| }, |
| { |
| "epoch": 5.841404358353511, |
| "grad_norm": 0.2539404630661011, |
| "learning_rate": 2.2038222298623507e-05, |
| "loss": 0.1658, |
| "num_input_tokens_seen": 1980736, |
| "step": 4825 |
| }, |
| { |
| "epoch": 5.847457627118644, |
| "grad_norm": 0.21135450899600983, |
| "learning_rate": 2.1985776232387202e-05, |
| "loss": 0.1837, |
| "num_input_tokens_seen": 1982656, |
| "step": 4830 |
| }, |
| { |
| "epoch": 5.853510895883777, |
| "grad_norm": 0.4475288689136505, |
| "learning_rate": 2.1933343623823814e-05, |
| "loss": 0.2242, |
| "num_input_tokens_seen": 1984832, |
| "step": 4835 |
| }, |
| { |
| "epoch": 5.85956416464891, |
| "grad_norm": 0.42206016182899475, |
| "learning_rate": 2.1880924707030407e-05, |
| "loss": 0.2168, |
| "num_input_tokens_seen": 1986848, |
| "step": 4840 |
| }, |
| { |
| "epoch": 5.865617433414044, |
| "grad_norm": 0.282394140958786, |
| "learning_rate": 2.1828519716042888e-05, |
| "loss": 0.2081, |
| "num_input_tokens_seen": 1988992, |
| "step": 4845 |
| }, |
| { |
| "epoch": 5.871670702179177, |
| "grad_norm": 0.17639875411987305, |
| "learning_rate": 2.177612888483502e-05, |
| "loss": 0.2092, |
| "num_input_tokens_seen": 1990976, |
| "step": 4850 |
| }, |
| { |
| "epoch": 5.877723970944309, |
| "grad_norm": 0.19221574068069458, |
| "learning_rate": 2.1723752447317312e-05, |
| "loss": 0.2042, |
| "num_input_tokens_seen": 1992928, |
| "step": 4855 |
| }, |
| { |
| "epoch": 5.883777239709443, |
| "grad_norm": 0.3984043002128601, |
| "learning_rate": 2.167139063733605e-05, |
| "loss": 0.182, |
| "num_input_tokens_seen": 1995104, |
| "step": 4860 |
| }, |
| { |
| "epoch": 5.889830508474576, |
| "grad_norm": 0.5806388854980469, |
| "learning_rate": 2.161904368867217e-05, |
| "loss": 0.1596, |
| "num_input_tokens_seen": 1997312, |
| "step": 4865 |
| }, |
| { |
| "epoch": 5.8958837772397095, |
| "grad_norm": 0.45268091559410095, |
| "learning_rate": 2.1566711835040284e-05, |
| "loss": 0.1881, |
| "num_input_tokens_seen": 1999232, |
| "step": 4870 |
| }, |
| { |
| "epoch": 5.901937046004843, |
| "grad_norm": 0.3012757897377014, |
| "learning_rate": 2.1514395310087596e-05, |
| "loss": 0.2373, |
| "num_input_tokens_seen": 2001088, |
| "step": 4875 |
| }, |
| { |
| "epoch": 5.907990314769976, |
| "grad_norm": 0.45305120944976807, |
| "learning_rate": 2.1462094347392887e-05, |
| "loss": 0.2568, |
| "num_input_tokens_seen": 2003232, |
| "step": 4880 |
| }, |
| { |
| "epoch": 5.914043583535109, |
| "grad_norm": 0.18014615774154663, |
| "learning_rate": 2.1409809180465436e-05, |
| "loss": 0.2056, |
| "num_input_tokens_seen": 2005312, |
| "step": 4885 |
| }, |
| { |
| "epoch": 5.920096852300242, |
| "grad_norm": 0.3118605613708496, |
| "learning_rate": 2.1357540042744006e-05, |
| "loss": 0.18, |
| "num_input_tokens_seen": 2007392, |
| "step": 4890 |
| }, |
| { |
| "epoch": 5.926150121065375, |
| "grad_norm": 0.32448166608810425, |
| "learning_rate": 2.1305287167595808e-05, |
| "loss": 0.1899, |
| "num_input_tokens_seen": 2009568, |
| "step": 4895 |
| }, |
| { |
| "epoch": 5.932203389830509, |
| "grad_norm": 0.5477737784385681, |
| "learning_rate": 2.1253050788315436e-05, |
| "loss": 0.1707, |
| "num_input_tokens_seen": 2011424, |
| "step": 4900 |
| }, |
| { |
| "epoch": 5.938256658595642, |
| "grad_norm": 0.27477216720581055, |
| "learning_rate": 2.120083113812381e-05, |
| "loss": 0.2064, |
| "num_input_tokens_seen": 2013504, |
| "step": 4905 |
| }, |
| { |
| "epoch": 5.9443099273607745, |
| "grad_norm": 0.42188262939453125, |
| "learning_rate": 2.1148628450167203e-05, |
| "loss": 0.1808, |
| "num_input_tokens_seen": 2015872, |
| "step": 4910 |
| }, |
| { |
| "epoch": 5.950363196125908, |
| "grad_norm": 0.735011100769043, |
| "learning_rate": 2.109644295751612e-05, |
| "loss": 0.2526, |
| "num_input_tokens_seen": 2017888, |
| "step": 4915 |
| }, |
| { |
| "epoch": 5.956416464891041, |
| "grad_norm": 0.20138289034366608, |
| "learning_rate": 2.1044274893164316e-05, |
| "loss": 0.1827, |
| "num_input_tokens_seen": 2019904, |
| "step": 4920 |
| }, |
| { |
| "epoch": 5.9624697336561745, |
| "grad_norm": 0.19165141880512238, |
| "learning_rate": 2.0992124490027727e-05, |
| "loss": 0.1505, |
| "num_input_tokens_seen": 2022016, |
| "step": 4925 |
| }, |
| { |
| "epoch": 5.968523002421308, |
| "grad_norm": 0.5342246890068054, |
| "learning_rate": 2.0939991980943437e-05, |
| "loss": 0.266, |
| "num_input_tokens_seen": 2023936, |
| "step": 4930 |
| }, |
| { |
| "epoch": 5.97457627118644, |
| "grad_norm": 0.33259841799736023, |
| "learning_rate": 2.088787759866863e-05, |
| "loss": 0.2171, |
| "num_input_tokens_seen": 2026080, |
| "step": 4935 |
| }, |
| { |
| "epoch": 5.980629539951574, |
| "grad_norm": 0.4620624780654907, |
| "learning_rate": 2.0835781575879574e-05, |
| "loss": 0.1882, |
| "num_input_tokens_seen": 2028064, |
| "step": 4940 |
| }, |
| { |
| "epoch": 5.986682808716707, |
| "grad_norm": 0.32478955388069153, |
| "learning_rate": 2.0783704145170547e-05, |
| "loss": 0.1876, |
| "num_input_tokens_seen": 2029920, |
| "step": 4945 |
| }, |
| { |
| "epoch": 5.99273607748184, |
| "grad_norm": 0.27669787406921387, |
| "learning_rate": 2.0731645539052845e-05, |
| "loss": 0.1983, |
| "num_input_tokens_seen": 2031776, |
| "step": 4950 |
| }, |
| { |
| "epoch": 5.998789346246974, |
| "grad_norm": 0.3508172631263733, |
| "learning_rate": 2.067960598995369e-05, |
| "loss": 0.1573, |
| "num_input_tokens_seen": 2033888, |
| "step": 4955 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.19914162158966064, |
| "eval_runtime": 7.6639, |
| "eval_samples_per_second": 47.887, |
| "eval_steps_per_second": 12.004, |
| "num_input_tokens_seen": 2033992, |
| "step": 4956 |
| }, |
| { |
| "epoch": 6.004842615012106, |
| "grad_norm": 0.3270215392112732, |
| "learning_rate": 2.062758573021523e-05, |
| "loss": 0.2291, |
| "num_input_tokens_seen": 2035752, |
| "step": 4960 |
| }, |
| { |
| "epoch": 6.0108958837772395, |
| "grad_norm": 0.5372700095176697, |
| "learning_rate": 2.0575584992093505e-05, |
| "loss": 0.2248, |
| "num_input_tokens_seen": 2037896, |
| "step": 4965 |
| }, |
| { |
| "epoch": 6.016949152542373, |
| "grad_norm": 0.27195754647254944, |
| "learning_rate": 2.0523604007757374e-05, |
| "loss": 0.1674, |
| "num_input_tokens_seen": 2040008, |
| "step": 4970 |
| }, |
| { |
| "epoch": 6.023002421307506, |
| "grad_norm": 0.43325644731521606, |
| "learning_rate": 2.0471643009287523e-05, |
| "loss": 0.2058, |
| "num_input_tokens_seen": 2042056, |
| "step": 4975 |
| }, |
| { |
| "epoch": 6.0290556900726395, |
| "grad_norm": 0.3632386326789856, |
| "learning_rate": 2.0419702228675395e-05, |
| "loss": 0.1823, |
| "num_input_tokens_seen": 2044040, |
| "step": 4980 |
| }, |
| { |
| "epoch": 6.035108958837772, |
| "grad_norm": 0.21754832565784454, |
| "learning_rate": 2.0367781897822147e-05, |
| "loss": 0.1702, |
| "num_input_tokens_seen": 2046056, |
| "step": 4985 |
| }, |
| { |
| "epoch": 6.041162227602905, |
| "grad_norm": 0.4620909094810486, |
| "learning_rate": 2.031588224853767e-05, |
| "loss": 0.2043, |
| "num_input_tokens_seen": 2047944, |
| "step": 4990 |
| }, |
| { |
| "epoch": 6.047215496368039, |
| "grad_norm": 0.4430719017982483, |
| "learning_rate": 2.0264003512539493e-05, |
| "loss": 0.1664, |
| "num_input_tokens_seen": 2050024, |
| "step": 4995 |
| }, |
| { |
| "epoch": 6.053268765133172, |
| "grad_norm": 0.08858021348714828, |
| "learning_rate": 2.0212145921451787e-05, |
| "loss": 0.1751, |
| "num_input_tokens_seen": 2052040, |
| "step": 5000 |
| }, |
| { |
| "epoch": 6.059322033898305, |
| "grad_norm": 0.3103668987751007, |
| "learning_rate": 2.01603097068043e-05, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 2054120, |
| "step": 5005 |
| }, |
| { |
| "epoch": 6.065375302663438, |
| "grad_norm": 0.5079805254936218, |
| "learning_rate": 2.0108495100031364e-05, |
| "loss": 0.197, |
| "num_input_tokens_seen": 2056296, |
| "step": 5010 |
| }, |
| { |
| "epoch": 6.071428571428571, |
| "grad_norm": 0.32969242334365845, |
| "learning_rate": 2.0056702332470806e-05, |
| "loss": 0.1821, |
| "num_input_tokens_seen": 2058408, |
| "step": 5015 |
| }, |
| { |
| "epoch": 6.0774818401937045, |
| "grad_norm": 0.6401872634887695, |
| "learning_rate": 2.0004931635362982e-05, |
| "loss": 0.1669, |
| "num_input_tokens_seen": 2060456, |
| "step": 5020 |
| }, |
| { |
| "epoch": 6.083535108958838, |
| "grad_norm": 0.39123037457466125, |
| "learning_rate": 1.995318323984969e-05, |
| "loss": 0.1602, |
| "num_input_tokens_seen": 2062408, |
| "step": 5025 |
| }, |
| { |
| "epoch": 6.089588377723971, |
| "grad_norm": 0.8995078206062317, |
| "learning_rate": 1.9901457376973143e-05, |
| "loss": 0.2636, |
| "num_input_tokens_seen": 2064392, |
| "step": 5030 |
| }, |
| { |
| "epoch": 6.095641646489105, |
| "grad_norm": 0.4956354796886444, |
| "learning_rate": 1.9849754277674993e-05, |
| "loss": 0.2069, |
| "num_input_tokens_seen": 2066504, |
| "step": 5035 |
| }, |
| { |
| "epoch": 6.101694915254237, |
| "grad_norm": 0.4333103597164154, |
| "learning_rate": 1.979807417279521e-05, |
| "loss": 0.1664, |
| "num_input_tokens_seen": 2068584, |
| "step": 5040 |
| }, |
| { |
| "epoch": 6.10774818401937, |
| "grad_norm": 0.31508636474609375, |
| "learning_rate": 1.974641729307115e-05, |
| "loss": 0.1259, |
| "num_input_tokens_seen": 2070696, |
| "step": 5045 |
| }, |
| { |
| "epoch": 6.113801452784504, |
| "grad_norm": 0.22805508971214294, |
| "learning_rate": 1.9694783869136435e-05, |
| "loss": 0.2087, |
| "num_input_tokens_seen": 2072872, |
| "step": 5050 |
| }, |
| { |
| "epoch": 6.119854721549637, |
| "grad_norm": 0.6083996891975403, |
| "learning_rate": 1.9643174131519986e-05, |
| "loss": 0.2082, |
| "num_input_tokens_seen": 2074824, |
| "step": 5055 |
| }, |
| { |
| "epoch": 6.12590799031477, |
| "grad_norm": 0.4532402753829956, |
| "learning_rate": 1.9591588310644967e-05, |
| "loss": 0.1813, |
| "num_input_tokens_seen": 2076808, |
| "step": 5060 |
| }, |
| { |
| "epoch": 6.131961259079903, |
| "grad_norm": 0.20531706511974335, |
| "learning_rate": 1.9540026636827742e-05, |
| "loss": 0.2425, |
| "num_input_tokens_seen": 2078856, |
| "step": 5065 |
| }, |
| { |
| "epoch": 6.138014527845036, |
| "grad_norm": 0.5168257355690002, |
| "learning_rate": 1.948848934027689e-05, |
| "loss": 0.2144, |
| "num_input_tokens_seen": 2081096, |
| "step": 5070 |
| }, |
| { |
| "epoch": 6.1440677966101696, |
| "grad_norm": 0.2479662299156189, |
| "learning_rate": 1.9436976651092144e-05, |
| "loss": 0.1903, |
| "num_input_tokens_seen": 2083144, |
| "step": 5075 |
| }, |
| { |
| "epoch": 6.150121065375303, |
| "grad_norm": 0.6428490281105042, |
| "learning_rate": 1.9385488799263372e-05, |
| "loss": 0.1957, |
| "num_input_tokens_seen": 2085096, |
| "step": 5080 |
| }, |
| { |
| "epoch": 6.156174334140436, |
| "grad_norm": 0.19707000255584717, |
| "learning_rate": 1.9334026014669543e-05, |
| "loss": 0.2212, |
| "num_input_tokens_seen": 2087272, |
| "step": 5085 |
| }, |
| { |
| "epoch": 6.162227602905569, |
| "grad_norm": 0.20903225243091583, |
| "learning_rate": 1.9282588527077715e-05, |
| "loss": 0.242, |
| "num_input_tokens_seen": 2089224, |
| "step": 5090 |
| }, |
| { |
| "epoch": 6.168280871670702, |
| "grad_norm": 0.422122985124588, |
| "learning_rate": 1.9231176566142006e-05, |
| "loss": 0.1921, |
| "num_input_tokens_seen": 2091432, |
| "step": 5095 |
| }, |
| { |
| "epoch": 6.174334140435835, |
| "grad_norm": 0.21547742187976837, |
| "learning_rate": 1.917979036140255e-05, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 2093384, |
| "step": 5100 |
| }, |
| { |
| "epoch": 6.180387409200969, |
| "grad_norm": 0.2356327474117279, |
| "learning_rate": 1.9128430142284503e-05, |
| "loss": 0.2193, |
| "num_input_tokens_seen": 2095528, |
| "step": 5105 |
| }, |
| { |
| "epoch": 6.186440677966102, |
| "grad_norm": 0.24147894978523254, |
| "learning_rate": 1.9077096138096992e-05, |
| "loss": 0.1707, |
| "num_input_tokens_seen": 2097544, |
| "step": 5110 |
| }, |
| { |
| "epoch": 6.1924939467312345, |
| "grad_norm": 0.5662580728530884, |
| "learning_rate": 1.9025788578032113e-05, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 2099592, |
| "step": 5115 |
| }, |
| { |
| "epoch": 6.198547215496368, |
| "grad_norm": 0.438174307346344, |
| "learning_rate": 1.8974507691163867e-05, |
| "loss": 0.2244, |
| "num_input_tokens_seen": 2101448, |
| "step": 5120 |
| }, |
| { |
| "epoch": 6.204600484261501, |
| "grad_norm": 0.25707370042800903, |
| "learning_rate": 1.892325370644721e-05, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 2103528, |
| "step": 5125 |
| }, |
| { |
| "epoch": 6.210653753026635, |
| "grad_norm": 0.9178801774978638, |
| "learning_rate": 1.8872026852716954e-05, |
| "loss": 0.1827, |
| "num_input_tokens_seen": 2105640, |
| "step": 5130 |
| }, |
| { |
| "epoch": 6.216707021791768, |
| "grad_norm": 0.3757168650627136, |
| "learning_rate": 1.8820827358686793e-05, |
| "loss": 0.2044, |
| "num_input_tokens_seen": 2107688, |
| "step": 5135 |
| }, |
| { |
| "epoch": 6.2227602905569, |
| "grad_norm": 0.3537003993988037, |
| "learning_rate": 1.8769655452948274e-05, |
| "loss": 0.1993, |
| "num_input_tokens_seen": 2109608, |
| "step": 5140 |
| }, |
| { |
| "epoch": 6.228813559322034, |
| "grad_norm": 0.5321641564369202, |
| "learning_rate": 1.8718511363969733e-05, |
| "loss": 0.1781, |
| "num_input_tokens_seen": 2111752, |
| "step": 5145 |
| }, |
| { |
| "epoch": 6.234866828087167, |
| "grad_norm": 0.3141115605831146, |
| "learning_rate": 1.8667395320095367e-05, |
| "loss": 0.2158, |
| "num_input_tokens_seen": 2113928, |
| "step": 5150 |
| }, |
| { |
| "epoch": 6.2409200968523, |
| "grad_norm": 0.41241806745529175, |
| "learning_rate": 1.8616307549544113e-05, |
| "loss": 0.1971, |
| "num_input_tokens_seen": 2115944, |
| "step": 5155 |
| }, |
| { |
| "epoch": 6.246973365617434, |
| "grad_norm": 0.3469935655593872, |
| "learning_rate": 1.85652482804087e-05, |
| "loss": 0.1891, |
| "num_input_tokens_seen": 2118120, |
| "step": 5160 |
| }, |
| { |
| "epoch": 6.253026634382566, |
| "grad_norm": 0.41373348236083984, |
| "learning_rate": 1.85142177406546e-05, |
| "loss": 0.2418, |
| "num_input_tokens_seen": 2120136, |
| "step": 5165 |
| }, |
| { |
| "epoch": 6.2590799031477, |
| "grad_norm": 0.4460534155368805, |
| "learning_rate": 1.8463216158119015e-05, |
| "loss": 0.1782, |
| "num_input_tokens_seen": 2122184, |
| "step": 5170 |
| }, |
| { |
| "epoch": 6.265133171912833, |
| "grad_norm": 0.622747004032135, |
| "learning_rate": 1.8412243760509867e-05, |
| "loss": 0.2638, |
| "num_input_tokens_seen": 2124200, |
| "step": 5175 |
| }, |
| { |
| "epoch": 6.271186440677966, |
| "grad_norm": 0.25548455119132996, |
| "learning_rate": 1.8361300775404765e-05, |
| "loss": 0.2087, |
| "num_input_tokens_seen": 2126280, |
| "step": 5180 |
| }, |
| { |
| "epoch": 6.2772397094431, |
| "grad_norm": 0.3224746882915497, |
| "learning_rate": 1.8310387430250014e-05, |
| "loss": 0.1626, |
| "num_input_tokens_seen": 2128360, |
| "step": 5185 |
| }, |
| { |
| "epoch": 6.283292978208232, |
| "grad_norm": 0.8245432376861572, |
| "learning_rate": 1.825950395235956e-05, |
| "loss": 0.2176, |
| "num_input_tokens_seen": 2130248, |
| "step": 5190 |
| }, |
| { |
| "epoch": 6.289346246973365, |
| "grad_norm": 0.22725388407707214, |
| "learning_rate": 1.8208650568914033e-05, |
| "loss": 0.1755, |
| "num_input_tokens_seen": 2132200, |
| "step": 5195 |
| }, |
| { |
| "epoch": 6.295399515738499, |
| "grad_norm": 0.20645160973072052, |
| "learning_rate": 1.815782750695967e-05, |
| "loss": 0.2066, |
| "num_input_tokens_seen": 2134216, |
| "step": 5200 |
| }, |
| { |
| "epoch": 6.301452784503632, |
| "grad_norm": 0.25996553897857666, |
| "learning_rate": 1.810703499340735e-05, |
| "loss": 0.1658, |
| "num_input_tokens_seen": 2136392, |
| "step": 5205 |
| }, |
| { |
| "epoch": 6.3075060532687655, |
| "grad_norm": 0.6341909170150757, |
| "learning_rate": 1.8056273255031552e-05, |
| "loss": 0.1992, |
| "num_input_tokens_seen": 2138504, |
| "step": 5210 |
| }, |
| { |
| "epoch": 6.313559322033898, |
| "grad_norm": 0.7847995758056641, |
| "learning_rate": 1.8005542518469366e-05, |
| "loss": 0.2247, |
| "num_input_tokens_seen": 2140616, |
| "step": 5215 |
| }, |
| { |
| "epoch": 6.319612590799031, |
| "grad_norm": 0.14493349194526672, |
| "learning_rate": 1.7954843010219446e-05, |
| "loss": 0.1328, |
| "num_input_tokens_seen": 2142664, |
| "step": 5220 |
| }, |
| { |
| "epoch": 6.325665859564165, |
| "grad_norm": 0.3631038963794708, |
| "learning_rate": 1.790417495664103e-05, |
| "loss": 0.1609, |
| "num_input_tokens_seen": 2144744, |
| "step": 5225 |
| }, |
| { |
| "epoch": 6.331719128329298, |
| "grad_norm": 0.22148382663726807, |
| "learning_rate": 1.785353858395292e-05, |
| "loss": 0.2085, |
| "num_input_tokens_seen": 2146760, |
| "step": 5230 |
| }, |
| { |
| "epoch": 6.337772397094431, |
| "grad_norm": 0.5689935684204102, |
| "learning_rate": 1.7802934118232482e-05, |
| "loss": 0.1547, |
| "num_input_tokens_seen": 2148904, |
| "step": 5235 |
| }, |
| { |
| "epoch": 6.343825665859564, |
| "grad_norm": 0.4348227381706238, |
| "learning_rate": 1.775236178541461e-05, |
| "loss": 0.2075, |
| "num_input_tokens_seen": 2151080, |
| "step": 5240 |
| }, |
| { |
| "epoch": 6.349878934624697, |
| "grad_norm": 0.46762341260910034, |
| "learning_rate": 1.7701821811290743e-05, |
| "loss": 0.1998, |
| "num_input_tokens_seen": 2153032, |
| "step": 5245 |
| }, |
| { |
| "epoch": 6.3559322033898304, |
| "grad_norm": 0.42551153898239136, |
| "learning_rate": 1.7651314421507843e-05, |
| "loss": 0.2698, |
| "num_input_tokens_seen": 2155080, |
| "step": 5250 |
| }, |
| { |
| "epoch": 6.361985472154964, |
| "grad_norm": 0.8728198409080505, |
| "learning_rate": 1.7600839841567395e-05, |
| "loss": 0.2016, |
| "num_input_tokens_seen": 2157064, |
| "step": 5255 |
| }, |
| { |
| "epoch": 6.368038740920097, |
| "grad_norm": 0.19018343091011047, |
| "learning_rate": 1.7550398296824395e-05, |
| "loss": 0.2008, |
| "num_input_tokens_seen": 2159208, |
| "step": 5260 |
| }, |
| { |
| "epoch": 6.37409200968523, |
| "grad_norm": 0.40862205624580383, |
| "learning_rate": 1.749999001248635e-05, |
| "loss": 0.1524, |
| "num_input_tokens_seen": 2161384, |
| "step": 5265 |
| }, |
| { |
| "epoch": 6.380145278450363, |
| "grad_norm": 0.22967901825904846, |
| "learning_rate": 1.7449615213612264e-05, |
| "loss": 0.2346, |
| "num_input_tokens_seen": 2163432, |
| "step": 5270 |
| }, |
| { |
| "epoch": 6.386198547215496, |
| "grad_norm": 0.40333184599876404, |
| "learning_rate": 1.7399274125111635e-05, |
| "loss": 0.2333, |
| "num_input_tokens_seen": 2165640, |
| "step": 5275 |
| }, |
| { |
| "epoch": 6.39225181598063, |
| "grad_norm": 0.5382645726203918, |
| "learning_rate": 1.7348966971743465e-05, |
| "loss": 0.2162, |
| "num_input_tokens_seen": 2167656, |
| "step": 5280 |
| }, |
| { |
| "epoch": 6.398305084745763, |
| "grad_norm": 0.24603554606437683, |
| "learning_rate": 1.729869397811523e-05, |
| "loss": 0.1456, |
| "num_input_tokens_seen": 2169672, |
| "step": 5285 |
| }, |
| { |
| "epoch": 6.404358353510895, |
| "grad_norm": 0.25845322012901306, |
| "learning_rate": 1.72484553686819e-05, |
| "loss": 0.1402, |
| "num_input_tokens_seen": 2171592, |
| "step": 5290 |
| }, |
| { |
| "epoch": 6.410411622276029, |
| "grad_norm": 0.46098440885543823, |
| "learning_rate": 1.719825136774494e-05, |
| "loss": 0.2029, |
| "num_input_tokens_seen": 2173576, |
| "step": 5295 |
| }, |
| { |
| "epoch": 6.416464891041162, |
| "grad_norm": 0.41342851519584656, |
| "learning_rate": 1.714808219945129e-05, |
| "loss": 0.197, |
| "num_input_tokens_seen": 2175592, |
| "step": 5300 |
| }, |
| { |
| "epoch": 6.4225181598062955, |
| "grad_norm": 0.2243460714817047, |
| "learning_rate": 1.709794808779234e-05, |
| "loss": 0.1834, |
| "num_input_tokens_seen": 2177512, |
| "step": 5305 |
| }, |
| { |
| "epoch": 6.428571428571429, |
| "grad_norm": 0.23830708861351013, |
| "learning_rate": 1.704784925660301e-05, |
| "loss": 0.2058, |
| "num_input_tokens_seen": 2179560, |
| "step": 5310 |
| }, |
| { |
| "epoch": 6.434624697336561, |
| "grad_norm": 0.6135181784629822, |
| "learning_rate": 1.699778592956069e-05, |
| "loss": 0.1569, |
| "num_input_tokens_seen": 2181608, |
| "step": 5315 |
| }, |
| { |
| "epoch": 6.440677966101695, |
| "grad_norm": 0.35326752066612244, |
| "learning_rate": 1.6947758330184226e-05, |
| "loss": 0.1707, |
| "num_input_tokens_seen": 2183656, |
| "step": 5320 |
| }, |
| { |
| "epoch": 6.446731234866828, |
| "grad_norm": 0.46248602867126465, |
| "learning_rate": 1.689776668183299e-05, |
| "loss": 0.2167, |
| "num_input_tokens_seen": 2185576, |
| "step": 5325 |
| }, |
| { |
| "epoch": 6.452784503631961, |
| "grad_norm": 0.6065472960472107, |
| "learning_rate": 1.6847811207705813e-05, |
| "loss": 0.1469, |
| "num_input_tokens_seen": 2187592, |
| "step": 5330 |
| }, |
| { |
| "epoch": 6.458837772397095, |
| "grad_norm": 0.33997687697410583, |
| "learning_rate": 1.6797892130840036e-05, |
| "loss": 0.1863, |
| "num_input_tokens_seen": 2189544, |
| "step": 5335 |
| }, |
| { |
| "epoch": 6.464891041162228, |
| "grad_norm": 0.23591935634613037, |
| "learning_rate": 1.6748009674110477e-05, |
| "loss": 0.1666, |
| "num_input_tokens_seen": 2191496, |
| "step": 5340 |
| }, |
| { |
| "epoch": 6.4709443099273605, |
| "grad_norm": 0.17576082050800323, |
| "learning_rate": 1.669816406022848e-05, |
| "loss": 0.2128, |
| "num_input_tokens_seen": 2193416, |
| "step": 5345 |
| }, |
| { |
| "epoch": 6.476997578692494, |
| "grad_norm": 0.502656102180481, |
| "learning_rate": 1.6648355511740876e-05, |
| "loss": 0.2245, |
| "num_input_tokens_seen": 2195432, |
| "step": 5350 |
| }, |
| { |
| "epoch": 6.483050847457627, |
| "grad_norm": 0.5052187442779541, |
| "learning_rate": 1.659858425102902e-05, |
| "loss": 0.1798, |
| "num_input_tokens_seen": 2197288, |
| "step": 5355 |
| }, |
| { |
| "epoch": 6.4891041162227605, |
| "grad_norm": 0.3692941665649414, |
| "learning_rate": 1.6548850500307772e-05, |
| "loss": 0.1916, |
| "num_input_tokens_seen": 2199368, |
| "step": 5360 |
| }, |
| { |
| "epoch": 6.495157384987894, |
| "grad_norm": 0.19063787162303925, |
| "learning_rate": 1.649915448162455e-05, |
| "loss": 0.2011, |
| "num_input_tokens_seen": 2201640, |
| "step": 5365 |
| }, |
| { |
| "epoch": 6.5, |
| "eval_loss": 0.1942528784275055, |
| "eval_runtime": 7.6676, |
| "eval_samples_per_second": 47.864, |
| "eval_steps_per_second": 11.999, |
| "num_input_tokens_seen": 2203208, |
| "step": 5369 |
| }, |
| { |
| "epoch": 6.501210653753026, |
| "grad_norm": 0.5192694067955017, |
| "learning_rate": 1.6449496416858284e-05, |
| "loss": 0.1779, |
| "num_input_tokens_seen": 2203592, |
| "step": 5370 |
| }, |
| { |
| "epoch": 6.50726392251816, |
| "grad_norm": 0.1905914694070816, |
| "learning_rate": 1.6399876527718456e-05, |
| "loss": 0.1898, |
| "num_input_tokens_seen": 2205640, |
| "step": 5375 |
| }, |
| { |
| "epoch": 6.513317191283293, |
| "grad_norm": 0.31801435351371765, |
| "learning_rate": 1.6350295035744094e-05, |
| "loss": 0.1873, |
| "num_input_tokens_seen": 2207752, |
| "step": 5380 |
| }, |
| { |
| "epoch": 6.519370460048426, |
| "grad_norm": 0.2036231905221939, |
| "learning_rate": 1.6300752162302822e-05, |
| "loss": 0.1816, |
| "num_input_tokens_seen": 2209864, |
| "step": 5385 |
| }, |
| { |
| "epoch": 6.52542372881356, |
| "grad_norm": 0.5242741107940674, |
| "learning_rate": 1.625124812858982e-05, |
| "loss": 0.1966, |
| "num_input_tokens_seen": 2211944, |
| "step": 5390 |
| }, |
| { |
| "epoch": 6.531476997578692, |
| "grad_norm": 0.25197815895080566, |
| "learning_rate": 1.6201783155626862e-05, |
| "loss": 0.1954, |
| "num_input_tokens_seen": 2213928, |
| "step": 5395 |
| }, |
| { |
| "epoch": 6.5375302663438255, |
| "grad_norm": 0.31938010454177856, |
| "learning_rate": 1.615235746426133e-05, |
| "loss": 0.1158, |
| "num_input_tokens_seen": 2215944, |
| "step": 5400 |
| }, |
| { |
| "epoch": 6.543583535108959, |
| "grad_norm": 0.8137195110321045, |
| "learning_rate": 1.6102971275165228e-05, |
| "loss": 0.2767, |
| "num_input_tokens_seen": 2217960, |
| "step": 5405 |
| }, |
| { |
| "epoch": 6.549636803874092, |
| "grad_norm": 0.5164667963981628, |
| "learning_rate": 1.6053624808834188e-05, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 2219976, |
| "step": 5410 |
| }, |
| { |
| "epoch": 6.5556900726392255, |
| "grad_norm": 0.4208105802536011, |
| "learning_rate": 1.6004318285586497e-05, |
| "loss": 0.2047, |
| "num_input_tokens_seen": 2221992, |
| "step": 5415 |
| }, |
| { |
| "epoch": 6.561743341404358, |
| "grad_norm": 0.27663904428482056, |
| "learning_rate": 1.5955051925562092e-05, |
| "loss": 0.1983, |
| "num_input_tokens_seen": 2223976, |
| "step": 5420 |
| }, |
| { |
| "epoch": 6.567796610169491, |
| "grad_norm": 0.4385055601596832, |
| "learning_rate": 1.590582594872162e-05, |
| "loss": 0.1701, |
| "num_input_tokens_seen": 2226088, |
| "step": 5425 |
| }, |
| { |
| "epoch": 6.573849878934625, |
| "grad_norm": 0.370122492313385, |
| "learning_rate": 1.585664057484539e-05, |
| "loss": 0.1739, |
| "num_input_tokens_seen": 2228008, |
| "step": 5430 |
| }, |
| { |
| "epoch": 6.579903147699758, |
| "grad_norm": 0.21666517853736877, |
| "learning_rate": 1.5807496023532472e-05, |
| "loss": 0.259, |
| "num_input_tokens_seen": 2229928, |
| "step": 5435 |
| }, |
| { |
| "epoch": 6.585956416464891, |
| "grad_norm": 0.116627536714077, |
| "learning_rate": 1.5758392514199644e-05, |
| "loss": 0.148, |
| "num_input_tokens_seen": 2231912, |
| "step": 5440 |
| }, |
| { |
| "epoch": 6.592009685230024, |
| "grad_norm": 0.6437224745750427, |
| "learning_rate": 1.5709330266080446e-05, |
| "loss": 0.1809, |
| "num_input_tokens_seen": 2233992, |
| "step": 5445 |
| }, |
| { |
| "epoch": 6.598062953995157, |
| "grad_norm": 0.26287803053855896, |
| "learning_rate": 1.5660309498224225e-05, |
| "loss": 0.2281, |
| "num_input_tokens_seen": 2235976, |
| "step": 5450 |
| }, |
| { |
| "epoch": 6.6041162227602905, |
| "grad_norm": 0.3082634210586548, |
| "learning_rate": 1.5611330429495096e-05, |
| "loss": 0.2018, |
| "num_input_tokens_seen": 2238088, |
| "step": 5455 |
| }, |
| { |
| "epoch": 6.610169491525424, |
| "grad_norm": 0.4467432498931885, |
| "learning_rate": 1.556239327857101e-05, |
| "loss": 0.2263, |
| "num_input_tokens_seen": 2240136, |
| "step": 5460 |
| }, |
| { |
| "epoch": 6.616222760290557, |
| "grad_norm": 0.4696289598941803, |
| "learning_rate": 1.551349826394278e-05, |
| "loss": 0.2286, |
| "num_input_tokens_seen": 2242280, |
| "step": 5465 |
| }, |
| { |
| "epoch": 6.622276029055691, |
| "grad_norm": 0.40213438868522644, |
| "learning_rate": 1.5464645603913066e-05, |
| "loss": 0.2321, |
| "num_input_tokens_seen": 2244360, |
| "step": 5470 |
| }, |
| { |
| "epoch": 6.628329297820823, |
| "grad_norm": 0.5149974822998047, |
| "learning_rate": 1.5415835516595465e-05, |
| "loss": 0.2331, |
| "num_input_tokens_seen": 2246536, |
| "step": 5475 |
| }, |
| { |
| "epoch": 6.634382566585956, |
| "grad_norm": 0.2991121709346771, |
| "learning_rate": 1.5367068219913456e-05, |
| "loss": 0.2169, |
| "num_input_tokens_seen": 2248616, |
| "step": 5480 |
| }, |
| { |
| "epoch": 6.64043583535109, |
| "grad_norm": 0.35001322627067566, |
| "learning_rate": 1.5318343931599503e-05, |
| "loss": 0.1699, |
| "num_input_tokens_seen": 2250664, |
| "step": 5485 |
| }, |
| { |
| "epoch": 6.646489104116223, |
| "grad_norm": 0.32206714153289795, |
| "learning_rate": 1.5269662869194036e-05, |
| "loss": 0.1692, |
| "num_input_tokens_seen": 2252808, |
| "step": 5490 |
| }, |
| { |
| "epoch": 6.652542372881356, |
| "grad_norm": 0.18183429539203644, |
| "learning_rate": 1.5221025250044486e-05, |
| "loss": 0.1729, |
| "num_input_tokens_seen": 2254984, |
| "step": 5495 |
| }, |
| { |
| "epoch": 6.658595641646489, |
| "grad_norm": 0.29498735070228577, |
| "learning_rate": 1.517243129130433e-05, |
| "loss": 0.1736, |
| "num_input_tokens_seen": 2257064, |
| "step": 5500 |
| }, |
| { |
| "epoch": 6.664648910411622, |
| "grad_norm": 0.25461438298225403, |
| "learning_rate": 1.512388120993212e-05, |
| "loss": 0.2131, |
| "num_input_tokens_seen": 2259112, |
| "step": 5505 |
| }, |
| { |
| "epoch": 6.670702179176756, |
| "grad_norm": 0.2526037096977234, |
| "learning_rate": 1.5075375222690496e-05, |
| "loss": 0.1998, |
| "num_input_tokens_seen": 2261224, |
| "step": 5510 |
| }, |
| { |
| "epoch": 6.676755447941889, |
| "grad_norm": 0.2894221842288971, |
| "learning_rate": 1.5026913546145232e-05, |
| "loss": 0.195, |
| "num_input_tokens_seen": 2263400, |
| "step": 5515 |
| }, |
| { |
| "epoch": 6.682808716707022, |
| "grad_norm": 0.5050965547561646, |
| "learning_rate": 1.4978496396664279e-05, |
| "loss": 0.1876, |
| "num_input_tokens_seen": 2265416, |
| "step": 5520 |
| }, |
| { |
| "epoch": 6.688861985472155, |
| "grad_norm": 0.28199806809425354, |
| "learning_rate": 1.4930123990416766e-05, |
| "loss": 0.17, |
| "num_input_tokens_seen": 2267560, |
| "step": 5525 |
| }, |
| { |
| "epoch": 6.694915254237288, |
| "grad_norm": 0.1659095585346222, |
| "learning_rate": 1.4881796543372079e-05, |
| "loss": 0.2046, |
| "num_input_tokens_seen": 2269544, |
| "step": 5530 |
| }, |
| { |
| "epoch": 6.700968523002421, |
| "grad_norm": 0.266445130109787, |
| "learning_rate": 1.4833514271298859e-05, |
| "loss": 0.2144, |
| "num_input_tokens_seen": 2271720, |
| "step": 5535 |
| }, |
| { |
| "epoch": 6.707021791767555, |
| "grad_norm": 0.4423094689846039, |
| "learning_rate": 1.4785277389764046e-05, |
| "loss": 0.1755, |
| "num_input_tokens_seen": 2273928, |
| "step": 5540 |
| }, |
| { |
| "epoch": 6.713075060532688, |
| "grad_norm": 0.6484824419021606, |
| "learning_rate": 1.4737086114131943e-05, |
| "loss": 0.1762, |
| "num_input_tokens_seen": 2275912, |
| "step": 5545 |
| }, |
| { |
| "epoch": 6.719128329297821, |
| "grad_norm": 0.5835667848587036, |
| "learning_rate": 1.4688940659563225e-05, |
| "loss": 0.2265, |
| "num_input_tokens_seen": 2278024, |
| "step": 5550 |
| }, |
| { |
| "epoch": 6.725181598062954, |
| "grad_norm": 0.34741050004959106, |
| "learning_rate": 1.4640841241013995e-05, |
| "loss": 0.2256, |
| "num_input_tokens_seen": 2280136, |
| "step": 5555 |
| }, |
| { |
| "epoch": 6.731234866828087, |
| "grad_norm": 0.4059230089187622, |
| "learning_rate": 1.4592788073234803e-05, |
| "loss": 0.2226, |
| "num_input_tokens_seen": 2282248, |
| "step": 5560 |
| }, |
| { |
| "epoch": 6.737288135593221, |
| "grad_norm": 0.30088284611701965, |
| "learning_rate": 1.4544781370769723e-05, |
| "loss": 0.1943, |
| "num_input_tokens_seen": 2284424, |
| "step": 5565 |
| }, |
| { |
| "epoch": 6.743341404358354, |
| "grad_norm": 0.4113398492336273, |
| "learning_rate": 1.4496821347955359e-05, |
| "loss": 0.2255, |
| "num_input_tokens_seen": 2286344, |
| "step": 5570 |
| }, |
| { |
| "epoch": 6.749394673123486, |
| "grad_norm": 0.4694417119026184, |
| "learning_rate": 1.444890821891991e-05, |
| "loss": 0.1479, |
| "num_input_tokens_seen": 2288456, |
| "step": 5575 |
| }, |
| { |
| "epoch": 6.75544794188862, |
| "grad_norm": 0.4086979329586029, |
| "learning_rate": 1.4401042197582193e-05, |
| "loss": 0.185, |
| "num_input_tokens_seen": 2290312, |
| "step": 5580 |
| }, |
| { |
| "epoch": 6.761501210653753, |
| "grad_norm": 0.33632540702819824, |
| "learning_rate": 1.4353223497650731e-05, |
| "loss": 0.2046, |
| "num_input_tokens_seen": 2292392, |
| "step": 5585 |
| }, |
| { |
| "epoch": 6.767554479418886, |
| "grad_norm": 0.3914327323436737, |
| "learning_rate": 1.4305452332622748e-05, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 2294472, |
| "step": 5590 |
| }, |
| { |
| "epoch": 6.77360774818402, |
| "grad_norm": 0.25511395931243896, |
| "learning_rate": 1.4257728915783244e-05, |
| "loss": 0.1711, |
| "num_input_tokens_seen": 2296616, |
| "step": 5595 |
| }, |
| { |
| "epoch": 6.779661016949152, |
| "grad_norm": 0.20177119970321655, |
| "learning_rate": 1.4210053460204023e-05, |
| "loss": 0.1648, |
| "num_input_tokens_seen": 2298856, |
| "step": 5600 |
| }, |
| { |
| "epoch": 6.785714285714286, |
| "grad_norm": 0.6001597046852112, |
| "learning_rate": 1.4162426178742788e-05, |
| "loss": 0.1752, |
| "num_input_tokens_seen": 2301064, |
| "step": 5605 |
| }, |
| { |
| "epoch": 6.791767554479419, |
| "grad_norm": 0.6545194983482361, |
| "learning_rate": 1.4114847284042132e-05, |
| "loss": 0.2128, |
| "num_input_tokens_seen": 2303208, |
| "step": 5610 |
| }, |
| { |
| "epoch": 6.797820823244552, |
| "grad_norm": 0.7270299196243286, |
| "learning_rate": 1.4067316988528617e-05, |
| "loss": 0.1954, |
| "num_input_tokens_seen": 2305288, |
| "step": 5615 |
| }, |
| { |
| "epoch": 6.803874092009686, |
| "grad_norm": 0.2439718395471573, |
| "learning_rate": 1.4019835504411827e-05, |
| "loss": 0.1862, |
| "num_input_tokens_seen": 2307304, |
| "step": 5620 |
| }, |
| { |
| "epoch": 6.809927360774818, |
| "grad_norm": 0.2995070219039917, |
| "learning_rate": 1.3972403043683419e-05, |
| "loss": 0.1388, |
| "num_input_tokens_seen": 2309448, |
| "step": 5625 |
| }, |
| { |
| "epoch": 6.815980629539951, |
| "grad_norm": 0.1656215786933899, |
| "learning_rate": 1.3925019818116164e-05, |
| "loss": 0.2252, |
| "num_input_tokens_seen": 2311368, |
| "step": 5630 |
| }, |
| { |
| "epoch": 6.822033898305085, |
| "grad_norm": 0.23687121272087097, |
| "learning_rate": 1.387768603926302e-05, |
| "loss": 0.2442, |
| "num_input_tokens_seen": 2313448, |
| "step": 5635 |
| }, |
| { |
| "epoch": 6.828087167070218, |
| "grad_norm": 0.5961008667945862, |
| "learning_rate": 1.383040191845619e-05, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 2315464, |
| "step": 5640 |
| }, |
| { |
| "epoch": 6.8341404358353515, |
| "grad_norm": 0.5766225457191467, |
| "learning_rate": 1.378316766680615e-05, |
| "loss": 0.1924, |
| "num_input_tokens_seen": 2317480, |
| "step": 5645 |
| }, |
| { |
| "epoch": 6.840193704600484, |
| "grad_norm": 0.49906712770462036, |
| "learning_rate": 1.373598349520073e-05, |
| "loss": 0.2317, |
| "num_input_tokens_seen": 2319432, |
| "step": 5650 |
| }, |
| { |
| "epoch": 6.846246973365617, |
| "grad_norm": 0.3310575485229492, |
| "learning_rate": 1.3688849614304164e-05, |
| "loss": 0.1679, |
| "num_input_tokens_seen": 2321512, |
| "step": 5655 |
| }, |
| { |
| "epoch": 6.852300242130751, |
| "grad_norm": 0.43674537539482117, |
| "learning_rate": 1.3641766234556146e-05, |
| "loss": 0.2433, |
| "num_input_tokens_seen": 2323560, |
| "step": 5660 |
| }, |
| { |
| "epoch": 6.858353510895884, |
| "grad_norm": 0.23639240860939026, |
| "learning_rate": 1.3594733566170926e-05, |
| "loss": 0.1914, |
| "num_input_tokens_seen": 2325576, |
| "step": 5665 |
| }, |
| { |
| "epoch": 6.864406779661017, |
| "grad_norm": 0.42987579107284546, |
| "learning_rate": 1.3547751819136309e-05, |
| "loss": 0.211, |
| "num_input_tokens_seen": 2327496, |
| "step": 5670 |
| }, |
| { |
| "epoch": 6.87046004842615, |
| "grad_norm": 0.4337780177593231, |
| "learning_rate": 1.350082120321276e-05, |
| "loss": 0.1777, |
| "num_input_tokens_seen": 2329576, |
| "step": 5675 |
| }, |
| { |
| "epoch": 6.876513317191283, |
| "grad_norm": 0.23105379939079285, |
| "learning_rate": 1.3453941927932456e-05, |
| "loss": 0.169, |
| "num_input_tokens_seen": 2331656, |
| "step": 5680 |
| }, |
| { |
| "epoch": 6.8825665859564165, |
| "grad_norm": 0.2618115544319153, |
| "learning_rate": 1.3407114202598369e-05, |
| "loss": 0.1588, |
| "num_input_tokens_seen": 2333800, |
| "step": 5685 |
| }, |
| { |
| "epoch": 6.88861985472155, |
| "grad_norm": 0.33183878660202026, |
| "learning_rate": 1.3360338236283295e-05, |
| "loss": 0.1325, |
| "num_input_tokens_seen": 2336008, |
| "step": 5690 |
| }, |
| { |
| "epoch": 6.894673123486683, |
| "grad_norm": 0.1858716458082199, |
| "learning_rate": 1.3313614237828948e-05, |
| "loss": 0.1979, |
| "num_input_tokens_seen": 2337928, |
| "step": 5695 |
| }, |
| { |
| "epoch": 6.900726392251816, |
| "grad_norm": 0.3457566797733307, |
| "learning_rate": 1.3266942415845018e-05, |
| "loss": 0.226, |
| "num_input_tokens_seen": 2340008, |
| "step": 5700 |
| }, |
| { |
| "epoch": 6.906779661016949, |
| "grad_norm": 0.4498755633831024, |
| "learning_rate": 1.3220322978708242e-05, |
| "loss": 0.2436, |
| "num_input_tokens_seen": 2342024, |
| "step": 5705 |
| }, |
| { |
| "epoch": 6.912832929782082, |
| "grad_norm": 0.4708850383758545, |
| "learning_rate": 1.317375613456147e-05, |
| "loss": 0.1851, |
| "num_input_tokens_seen": 2343976, |
| "step": 5710 |
| }, |
| { |
| "epoch": 6.918886198547216, |
| "grad_norm": 0.29391157627105713, |
| "learning_rate": 1.3127242091312752e-05, |
| "loss": 0.1956, |
| "num_input_tokens_seen": 2345928, |
| "step": 5715 |
| }, |
| { |
| "epoch": 6.924939467312349, |
| "grad_norm": 0.4773377478122711, |
| "learning_rate": 1.3080781056634373e-05, |
| "loss": 0.1706, |
| "num_input_tokens_seen": 2347976, |
| "step": 5720 |
| }, |
| { |
| "epoch": 6.9309927360774815, |
| "grad_norm": 0.7161561250686646, |
| "learning_rate": 1.3034373237961983e-05, |
| "loss": 0.182, |
| "num_input_tokens_seen": 2349992, |
| "step": 5725 |
| }, |
| { |
| "epoch": 6.937046004842615, |
| "grad_norm": 0.42767396569252014, |
| "learning_rate": 1.2988018842493604e-05, |
| "loss": 0.2041, |
| "num_input_tokens_seen": 2351976, |
| "step": 5730 |
| }, |
| { |
| "epoch": 6.943099273607748, |
| "grad_norm": 0.4129497706890106, |
| "learning_rate": 1.2941718077188758e-05, |
| "loss": 0.2342, |
| "num_input_tokens_seen": 2353992, |
| "step": 5735 |
| }, |
| { |
| "epoch": 6.9491525423728815, |
| "grad_norm": 0.21269860863685608, |
| "learning_rate": 1.2895471148767508e-05, |
| "loss": 0.2384, |
| "num_input_tokens_seen": 2355976, |
| "step": 5740 |
| }, |
| { |
| "epoch": 6.955205811138015, |
| "grad_norm": 0.15844842791557312, |
| "learning_rate": 1.2849278263709572e-05, |
| "loss": 0.2002, |
| "num_input_tokens_seen": 2358152, |
| "step": 5745 |
| }, |
| { |
| "epoch": 6.961259079903147, |
| "grad_norm": 0.48844364285469055, |
| "learning_rate": 1.2803139628253364e-05, |
| "loss": 0.2342, |
| "num_input_tokens_seen": 2360040, |
| "step": 5750 |
| }, |
| { |
| "epoch": 6.967312348668281, |
| "grad_norm": 0.18955184519290924, |
| "learning_rate": 1.2757055448395092e-05, |
| "loss": 0.161, |
| "num_input_tokens_seen": 2362056, |
| "step": 5755 |
| }, |
| { |
| "epoch": 6.973365617433414, |
| "grad_norm": 0.3704724907875061, |
| "learning_rate": 1.271102592988782e-05, |
| "loss": 0.1991, |
| "num_input_tokens_seen": 2364008, |
| "step": 5760 |
| }, |
| { |
| "epoch": 6.979418886198547, |
| "grad_norm": 0.37986063957214355, |
| "learning_rate": 1.2665051278240602e-05, |
| "loss": 0.1412, |
| "num_input_tokens_seen": 2366056, |
| "step": 5765 |
| }, |
| { |
| "epoch": 6.985472154963681, |
| "grad_norm": 0.5057016611099243, |
| "learning_rate": 1.2619131698717504e-05, |
| "loss": 0.1442, |
| "num_input_tokens_seen": 2368072, |
| "step": 5770 |
| }, |
| { |
| "epoch": 6.991525423728813, |
| "grad_norm": 0.38172945380210876, |
| "learning_rate": 1.2573267396336686e-05, |
| "loss": 0.2447, |
| "num_input_tokens_seen": 2370024, |
| "step": 5775 |
| }, |
| { |
| "epoch": 6.9975786924939465, |
| "grad_norm": 0.22704635560512543, |
| "learning_rate": 1.2527458575869539e-05, |
| "loss": 0.1563, |
| "num_input_tokens_seen": 2372008, |
| "step": 5780 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.19412890076637268, |
| "eval_runtime": 7.6654, |
| "eval_samples_per_second": 47.877, |
| "eval_steps_per_second": 12.002, |
| "num_input_tokens_seen": 2372464, |
| "step": 5782 |
| }, |
| { |
| "epoch": 7.00363196125908, |
| "grad_norm": 0.4464855492115021, |
| "learning_rate": 1.2481705441839756e-05, |
| "loss": 0.1698, |
| "num_input_tokens_seen": 2373648, |
| "step": 5785 |
| }, |
| { |
| "epoch": 7.009685230024213, |
| "grad_norm": 0.454283744096756, |
| "learning_rate": 1.2436008198522376e-05, |
| "loss": 0.1951, |
| "num_input_tokens_seen": 2375696, |
| "step": 5790 |
| }, |
| { |
| "epoch": 7.0157384987893465, |
| "grad_norm": 0.33035552501678467, |
| "learning_rate": 1.2390367049942916e-05, |
| "loss": 0.2008, |
| "num_input_tokens_seen": 2377872, |
| "step": 5795 |
| }, |
| { |
| "epoch": 7.021791767554479, |
| "grad_norm": 0.488673597574234, |
| "learning_rate": 1.2344782199876431e-05, |
| "loss": 0.1777, |
| "num_input_tokens_seen": 2379792, |
| "step": 5800 |
| }, |
| { |
| "epoch": 7.027845036319612, |
| "grad_norm": 0.40355005860328674, |
| "learning_rate": 1.2299253851846651e-05, |
| "loss": 0.1888, |
| "num_input_tokens_seen": 2381744, |
| "step": 5805 |
| }, |
| { |
| "epoch": 7.033898305084746, |
| "grad_norm": 0.44384610652923584, |
| "learning_rate": 1.2253782209125012e-05, |
| "loss": 0.1806, |
| "num_input_tokens_seen": 2383664, |
| "step": 5810 |
| }, |
| { |
| "epoch": 7.039951573849879, |
| "grad_norm": 0.5420294404029846, |
| "learning_rate": 1.220836747472978e-05, |
| "loss": 0.1907, |
| "num_input_tokens_seen": 2385744, |
| "step": 5815 |
| }, |
| { |
| "epoch": 7.046004842615012, |
| "grad_norm": 0.32249927520751953, |
| "learning_rate": 1.2163009851425137e-05, |
| "loss": 0.2015, |
| "num_input_tokens_seen": 2387920, |
| "step": 5820 |
| }, |
| { |
| "epoch": 7.052058111380146, |
| "grad_norm": 0.16516876220703125, |
| "learning_rate": 1.2117709541720306e-05, |
| "loss": 0.1807, |
| "num_input_tokens_seen": 2390000, |
| "step": 5825 |
| }, |
| { |
| "epoch": 7.058111380145278, |
| "grad_norm": 0.48648953437805176, |
| "learning_rate": 1.2072466747868597e-05, |
| "loss": 0.2129, |
| "num_input_tokens_seen": 2392016, |
| "step": 5830 |
| }, |
| { |
| "epoch": 7.0641646489104115, |
| "grad_norm": 0.41197946667671204, |
| "learning_rate": 1.2027281671866531e-05, |
| "loss": 0.1885, |
| "num_input_tokens_seen": 2394032, |
| "step": 5835 |
| }, |
| { |
| "epoch": 7.070217917675545, |
| "grad_norm": 0.4627268314361572, |
| "learning_rate": 1.198215451545293e-05, |
| "loss": 0.2085, |
| "num_input_tokens_seen": 2395888, |
| "step": 5840 |
| }, |
| { |
| "epoch": 7.076271186440678, |
| "grad_norm": 0.26652011275291443, |
| "learning_rate": 1.193708548010804e-05, |
| "loss": 0.1696, |
| "num_input_tokens_seen": 2398032, |
| "step": 5845 |
| }, |
| { |
| "epoch": 7.0823244552058116, |
| "grad_norm": 0.25916314125061035, |
| "learning_rate": 1.1892074767052611e-05, |
| "loss": 0.1557, |
| "num_input_tokens_seen": 2400016, |
| "step": 5850 |
| }, |
| { |
| "epoch": 7.088377723970944, |
| "grad_norm": 0.16255183517932892, |
| "learning_rate": 1.1847122577246964e-05, |
| "loss": 0.1613, |
| "num_input_tokens_seen": 2402032, |
| "step": 5855 |
| }, |
| { |
| "epoch": 7.094430992736077, |
| "grad_norm": 0.34801170229911804, |
| "learning_rate": 1.1802229111390157e-05, |
| "loss": 0.1624, |
| "num_input_tokens_seen": 2404048, |
| "step": 5860 |
| }, |
| { |
| "epoch": 7.100484261501211, |
| "grad_norm": 0.3824155330657959, |
| "learning_rate": 1.175739456991908e-05, |
| "loss": 0.1967, |
| "num_input_tokens_seen": 2406128, |
| "step": 5865 |
| }, |
| { |
| "epoch": 7.106537530266344, |
| "grad_norm": 0.3208667039871216, |
| "learning_rate": 1.1712619153007517e-05, |
| "loss": 0.1975, |
| "num_input_tokens_seen": 2408112, |
| "step": 5870 |
| }, |
| { |
| "epoch": 7.112590799031477, |
| "grad_norm": 0.5324909090995789, |
| "learning_rate": 1.166790306056528e-05, |
| "loss": 0.197, |
| "num_input_tokens_seen": 2410160, |
| "step": 5875 |
| }, |
| { |
| "epoch": 7.11864406779661, |
| "grad_norm": 0.3845199942588806, |
| "learning_rate": 1.1623246492237305e-05, |
| "loss": 0.2374, |
| "num_input_tokens_seen": 2412112, |
| "step": 5880 |
| }, |
| { |
| "epoch": 7.124697336561743, |
| "grad_norm": 0.361144095659256, |
| "learning_rate": 1.1578649647402806e-05, |
| "loss": 0.2234, |
| "num_input_tokens_seen": 2414224, |
| "step": 5885 |
| }, |
| { |
| "epoch": 7.1307506053268765, |
| "grad_norm": 0.2833709418773651, |
| "learning_rate": 1.1534112725174306e-05, |
| "loss": 0.1777, |
| "num_input_tokens_seen": 2416272, |
| "step": 5890 |
| }, |
| { |
| "epoch": 7.13680387409201, |
| "grad_norm": 0.3276798129081726, |
| "learning_rate": 1.1489635924396817e-05, |
| "loss": 0.199, |
| "num_input_tokens_seen": 2418224, |
| "step": 5895 |
| }, |
| { |
| "epoch": 7.142857142857143, |
| "grad_norm": 0.24078544974327087, |
| "learning_rate": 1.1445219443646896e-05, |
| "loss": 0.2053, |
| "num_input_tokens_seen": 2420336, |
| "step": 5900 |
| }, |
| { |
| "epoch": 7.148910411622276, |
| "grad_norm": 0.38029778003692627, |
| "learning_rate": 1.1400863481231833e-05, |
| "loss": 0.2084, |
| "num_input_tokens_seen": 2422288, |
| "step": 5905 |
| }, |
| { |
| "epoch": 7.154963680387409, |
| "grad_norm": 0.518143355846405, |
| "learning_rate": 1.1356568235188682e-05, |
| "loss": 0.2198, |
| "num_input_tokens_seen": 2424400, |
| "step": 5910 |
| }, |
| { |
| "epoch": 7.161016949152542, |
| "grad_norm": 0.3390759229660034, |
| "learning_rate": 1.1312333903283435e-05, |
| "loss": 0.1782, |
| "num_input_tokens_seen": 2426384, |
| "step": 5915 |
| }, |
| { |
| "epoch": 7.167070217917676, |
| "grad_norm": 0.2414584904909134, |
| "learning_rate": 1.1268160683010096e-05, |
| "loss": 0.2306, |
| "num_input_tokens_seen": 2428400, |
| "step": 5920 |
| }, |
| { |
| "epoch": 7.173123486682809, |
| "grad_norm": 0.40217629075050354, |
| "learning_rate": 1.122404877158986e-05, |
| "loss": 0.1684, |
| "num_input_tokens_seen": 2430416, |
| "step": 5925 |
| }, |
| { |
| "epoch": 7.1791767554479415, |
| "grad_norm": 0.2742639482021332, |
| "learning_rate": 1.1179998365970174e-05, |
| "loss": 0.2193, |
| "num_input_tokens_seen": 2432464, |
| "step": 5930 |
| }, |
| { |
| "epoch": 7.185230024213075, |
| "grad_norm": 0.6828016042709351, |
| "learning_rate": 1.113600966282386e-05, |
| "loss": 0.1524, |
| "num_input_tokens_seen": 2434480, |
| "step": 5935 |
| }, |
| { |
| "epoch": 7.191283292978208, |
| "grad_norm": 0.26155775785446167, |
| "learning_rate": 1.1092082858548275e-05, |
| "loss": 0.2492, |
| "num_input_tokens_seen": 2436496, |
| "step": 5940 |
| }, |
| { |
| "epoch": 7.197336561743342, |
| "grad_norm": 0.4543386697769165, |
| "learning_rate": 1.1048218149264434e-05, |
| "loss": 0.2051, |
| "num_input_tokens_seen": 2438544, |
| "step": 5945 |
| }, |
| { |
| "epoch": 7.203389830508475, |
| "grad_norm": 0.18633881211280823, |
| "learning_rate": 1.1004415730816083e-05, |
| "loss": 0.1676, |
| "num_input_tokens_seen": 2440528, |
| "step": 5950 |
| }, |
| { |
| "epoch": 7.209443099273607, |
| "grad_norm": 0.2519204318523407, |
| "learning_rate": 1.0960675798768871e-05, |
| "loss": 0.2131, |
| "num_input_tokens_seen": 2442576, |
| "step": 5955 |
| }, |
| { |
| "epoch": 7.215496368038741, |
| "grad_norm": 0.30878016352653503, |
| "learning_rate": 1.0916998548409449e-05, |
| "loss": 0.1859, |
| "num_input_tokens_seen": 2444560, |
| "step": 5960 |
| }, |
| { |
| "epoch": 7.221549636803874, |
| "grad_norm": 0.2655375301837921, |
| "learning_rate": 1.0873384174744641e-05, |
| "loss": 0.1829, |
| "num_input_tokens_seen": 2446704, |
| "step": 5965 |
| }, |
| { |
| "epoch": 7.227602905569007, |
| "grad_norm": 0.32266807556152344, |
| "learning_rate": 1.0829832872500523e-05, |
| "loss": 0.2161, |
| "num_input_tokens_seen": 2448720, |
| "step": 5970 |
| }, |
| { |
| "epoch": 7.233656174334141, |
| "grad_norm": 0.30273690819740295, |
| "learning_rate": 1.078634483612157e-05, |
| "loss": 0.2075, |
| "num_input_tokens_seen": 2450832, |
| "step": 5975 |
| }, |
| { |
| "epoch": 7.239709443099273, |
| "grad_norm": 0.2916242778301239, |
| "learning_rate": 1.0742920259769792e-05, |
| "loss": 0.2097, |
| "num_input_tokens_seen": 2452752, |
| "step": 5980 |
| }, |
| { |
| "epoch": 7.245762711864407, |
| "grad_norm": 0.5907586216926575, |
| "learning_rate": 1.06995593373239e-05, |
| "loss": 0.2263, |
| "num_input_tokens_seen": 2454832, |
| "step": 5985 |
| }, |
| { |
| "epoch": 7.25181598062954, |
| "grad_norm": 0.28651994466781616, |
| "learning_rate": 1.0656262262378367e-05, |
| "loss": 0.1945, |
| "num_input_tokens_seen": 2456848, |
| "step": 5990 |
| }, |
| { |
| "epoch": 7.257869249394673, |
| "grad_norm": 0.32831430435180664, |
| "learning_rate": 1.0613029228242627e-05, |
| "loss": 0.2183, |
| "num_input_tokens_seen": 2459024, |
| "step": 5995 |
| }, |
| { |
| "epoch": 7.263922518159807, |
| "grad_norm": 0.4951678514480591, |
| "learning_rate": 1.0569860427940179e-05, |
| "loss": 0.1769, |
| "num_input_tokens_seen": 2461168, |
| "step": 6000 |
| }, |
| { |
| "epoch": 7.269975786924939, |
| "grad_norm": 0.3258248567581177, |
| "learning_rate": 1.0526756054207737e-05, |
| "loss": 0.2026, |
| "num_input_tokens_seen": 2463120, |
| "step": 6005 |
| }, |
| { |
| "epoch": 7.276029055690072, |
| "grad_norm": 0.3931015431880951, |
| "learning_rate": 1.0483716299494392e-05, |
| "loss": 0.2463, |
| "num_input_tokens_seen": 2465168, |
| "step": 6010 |
| }, |
| { |
| "epoch": 7.282082324455206, |
| "grad_norm": 0.32765695452690125, |
| "learning_rate": 1.044074135596069e-05, |
| "loss": 0.2419, |
| "num_input_tokens_seen": 2467248, |
| "step": 6015 |
| }, |
| { |
| "epoch": 7.288135593220339, |
| "grad_norm": 0.4035666584968567, |
| "learning_rate": 1.0397831415477823e-05, |
| "loss": 0.1898, |
| "num_input_tokens_seen": 2469200, |
| "step": 6020 |
| }, |
| { |
| "epoch": 7.2941888619854724, |
| "grad_norm": 0.4926074147224426, |
| "learning_rate": 1.0354986669626796e-05, |
| "loss": 0.1693, |
| "num_input_tokens_seen": 2471312, |
| "step": 6025 |
| }, |
| { |
| "epoch": 7.300242130750606, |
| "grad_norm": 0.27269190549850464, |
| "learning_rate": 1.0312207309697502e-05, |
| "loss": 0.1727, |
| "num_input_tokens_seen": 2473424, |
| "step": 6030 |
| }, |
| { |
| "epoch": 7.306295399515738, |
| "grad_norm": 0.35158050060272217, |
| "learning_rate": 1.0269493526687915e-05, |
| "loss": 0.2355, |
| "num_input_tokens_seen": 2475504, |
| "step": 6035 |
| }, |
| { |
| "epoch": 7.312348668280872, |
| "grad_norm": 0.3150555193424225, |
| "learning_rate": 1.0226845511303219e-05, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 2477392, |
| "step": 6040 |
| }, |
| { |
| "epoch": 7.318401937046005, |
| "grad_norm": 0.3554048240184784, |
| "learning_rate": 1.0184263453954988e-05, |
| "loss": 0.1633, |
| "num_input_tokens_seen": 2479504, |
| "step": 6045 |
| }, |
| { |
| "epoch": 7.324455205811138, |
| "grad_norm": 0.27928879857063293, |
| "learning_rate": 1.0141747544760285e-05, |
| "loss": 0.1727, |
| "num_input_tokens_seen": 2481584, |
| "step": 6050 |
| }, |
| { |
| "epoch": 7.330508474576272, |
| "grad_norm": 0.4214881658554077, |
| "learning_rate": 1.0099297973540852e-05, |
| "loss": 0.2341, |
| "num_input_tokens_seen": 2483632, |
| "step": 6055 |
| }, |
| { |
| "epoch": 7.336561743341404, |
| "grad_norm": 0.6717732548713684, |
| "learning_rate": 1.0056914929822248e-05, |
| "loss": 0.2076, |
| "num_input_tokens_seen": 2485680, |
| "step": 6060 |
| }, |
| { |
| "epoch": 7.342615012106537, |
| "grad_norm": 0.1972280591726303, |
| "learning_rate": 1.0014598602832995e-05, |
| "loss": 0.2043, |
| "num_input_tokens_seen": 2487824, |
| "step": 6065 |
| }, |
| { |
| "epoch": 7.348668280871671, |
| "grad_norm": 0.3708639144897461, |
| "learning_rate": 9.972349181503773e-06, |
| "loss": 0.2367, |
| "num_input_tokens_seen": 2489872, |
| "step": 6070 |
| }, |
| { |
| "epoch": 7.354721549636804, |
| "grad_norm": 0.3415837585926056, |
| "learning_rate": 9.930166854466516e-06, |
| "loss": 0.1697, |
| "num_input_tokens_seen": 2491888, |
| "step": 6075 |
| }, |
| { |
| "epoch": 7.3607748184019375, |
| "grad_norm": 0.4768062233924866, |
| "learning_rate": 9.888051810053617e-06, |
| "loss": 0.2276, |
| "num_input_tokens_seen": 2493968, |
| "step": 6080 |
| }, |
| { |
| "epoch": 7.36682808716707, |
| "grad_norm": 0.22263237833976746, |
| "learning_rate": 9.846004236297052e-06, |
| "loss": 0.1632, |
| "num_input_tokens_seen": 2495920, |
| "step": 6085 |
| }, |
| { |
| "epoch": 7.372881355932203, |
| "grad_norm": 0.5537928938865662, |
| "learning_rate": 9.804024320927604e-06, |
| "loss": 0.2295, |
| "num_input_tokens_seen": 2498000, |
| "step": 6090 |
| }, |
| { |
| "epoch": 7.378934624697337, |
| "grad_norm": 0.5018237829208374, |
| "learning_rate": 9.76211225137392e-06, |
| "loss": 0.1775, |
| "num_input_tokens_seen": 2499952, |
| "step": 6095 |
| }, |
| { |
| "epoch": 7.38498789346247, |
| "grad_norm": 0.25171011686325073, |
| "learning_rate": 9.720268214761763e-06, |
| "loss": 0.1942, |
| "num_input_tokens_seen": 2501968, |
| "step": 6100 |
| }, |
| { |
| "epoch": 7.391041162227603, |
| "grad_norm": 0.4276196360588074, |
| "learning_rate": 9.678492397913167e-06, |
| "loss": 0.1667, |
| "num_input_tokens_seen": 2503984, |
| "step": 6105 |
| }, |
| { |
| "epoch": 7.397094430992736, |
| "grad_norm": 0.3869098126888275, |
| "learning_rate": 9.636784987345554e-06, |
| "loss": 0.1765, |
| "num_input_tokens_seen": 2505968, |
| "step": 6110 |
| }, |
| { |
| "epoch": 7.403147699757869, |
| "grad_norm": 0.40317302942276, |
| "learning_rate": 9.595146169270944e-06, |
| "loss": 0.1462, |
| "num_input_tokens_seen": 2508080, |
| "step": 6115 |
| }, |
| { |
| "epoch": 7.4092009685230025, |
| "grad_norm": 0.9469394683837891, |
| "learning_rate": 9.553576129595101e-06, |
| "loss": 0.2031, |
| "num_input_tokens_seen": 2510160, |
| "step": 6120 |
| }, |
| { |
| "epoch": 7.415254237288136, |
| "grad_norm": 0.4130316376686096, |
| "learning_rate": 9.512075053916735e-06, |
| "loss": 0.181, |
| "num_input_tokens_seen": 2512304, |
| "step": 6125 |
| }, |
| { |
| "epoch": 7.421307506053269, |
| "grad_norm": 0.32896122336387634, |
| "learning_rate": 9.470643127526627e-06, |
| "loss": 0.2073, |
| "num_input_tokens_seen": 2514544, |
| "step": 6130 |
| }, |
| { |
| "epoch": 7.427360774818402, |
| "grad_norm": 0.45505499839782715, |
| "learning_rate": 9.429280535406834e-06, |
| "loss": 0.1299, |
| "num_input_tokens_seen": 2516592, |
| "step": 6135 |
| }, |
| { |
| "epoch": 7.433414043583535, |
| "grad_norm": 0.5739582777023315, |
| "learning_rate": 9.387987462229859e-06, |
| "loss": 0.21, |
| "num_input_tokens_seen": 2518512, |
| "step": 6140 |
| }, |
| { |
| "epoch": 7.439467312348668, |
| "grad_norm": 0.2781931459903717, |
| "learning_rate": 9.346764092357801e-06, |
| "loss": 0.2209, |
| "num_input_tokens_seen": 2520592, |
| "step": 6145 |
| }, |
| { |
| "epoch": 7.445520581113802, |
| "grad_norm": 0.5292943120002747, |
| "learning_rate": 9.305610609841598e-06, |
| "loss": 0.2169, |
| "num_input_tokens_seen": 2522640, |
| "step": 6150 |
| }, |
| { |
| "epoch": 7.451573849878935, |
| "grad_norm": 0.24941454827785492, |
| "learning_rate": 9.264527198420117e-06, |
| "loss": 0.1681, |
| "num_input_tokens_seen": 2524592, |
| "step": 6155 |
| }, |
| { |
| "epoch": 7.4576271186440675, |
| "grad_norm": 0.3354622423648834, |
| "learning_rate": 9.2235140415194e-06, |
| "loss": 0.1792, |
| "num_input_tokens_seen": 2526576, |
| "step": 6160 |
| }, |
| { |
| "epoch": 7.463680387409201, |
| "grad_norm": 0.2857242226600647, |
| "learning_rate": 9.182571322251796e-06, |
| "loss": 0.1592, |
| "num_input_tokens_seen": 2528656, |
| "step": 6165 |
| }, |
| { |
| "epoch": 7.469733656174334, |
| "grad_norm": 0.38037776947021484, |
| "learning_rate": 9.141699223415221e-06, |
| "loss": 0.1801, |
| "num_input_tokens_seen": 2530864, |
| "step": 6170 |
| }, |
| { |
| "epoch": 7.4757869249394675, |
| "grad_norm": 0.34874293208122253, |
| "learning_rate": 9.10089792749223e-06, |
| "loss": 0.2093, |
| "num_input_tokens_seen": 2532880, |
| "step": 6175 |
| }, |
| { |
| "epoch": 7.481840193704601, |
| "grad_norm": 0.39451298117637634, |
| "learning_rate": 9.06016761664929e-06, |
| "loss": 0.1991, |
| "num_input_tokens_seen": 2534864, |
| "step": 6180 |
| }, |
| { |
| "epoch": 7.487893462469733, |
| "grad_norm": 0.1872560977935791, |
| "learning_rate": 9.019508472735958e-06, |
| "loss": 0.206, |
| "num_input_tokens_seen": 2536912, |
| "step": 6185 |
| }, |
| { |
| "epoch": 7.493946731234867, |
| "grad_norm": 0.23219034075737, |
| "learning_rate": 8.978920677284022e-06, |
| "loss": 0.164, |
| "num_input_tokens_seen": 2538832, |
| "step": 6190 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.25058627128601074, |
| "learning_rate": 8.938404411506732e-06, |
| "loss": 0.1754, |
| "num_input_tokens_seen": 2540880, |
| "step": 6195 |
| }, |
| { |
| "epoch": 7.5, |
| "eval_loss": 0.1945834755897522, |
| "eval_runtime": 7.6621, |
| "eval_samples_per_second": 47.898, |
| "eval_steps_per_second": 12.007, |
| "num_input_tokens_seen": 2540880, |
| "step": 6195 |
| }, |
| { |
| "epoch": 7.506053268765133, |
| "grad_norm": 0.26562848687171936, |
| "learning_rate": 8.897959856297971e-06, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 2542832, |
| "step": 6200 |
| }, |
| { |
| "epoch": 7.512106537530267, |
| "grad_norm": 0.39521145820617676, |
| "learning_rate": 8.857587192231452e-06, |
| "loss": 0.2033, |
| "num_input_tokens_seen": 2544784, |
| "step": 6205 |
| }, |
| { |
| "epoch": 7.518159806295399, |
| "grad_norm": 0.28637948632240295, |
| "learning_rate": 8.817286599559932e-06, |
| "loss": 0.2455, |
| "num_input_tokens_seen": 2546832, |
| "step": 6210 |
| }, |
| { |
| "epoch": 7.5242130750605325, |
| "grad_norm": 0.33285093307495117, |
| "learning_rate": 8.777058258214377e-06, |
| "loss": 0.1732, |
| "num_input_tokens_seen": 2549008, |
| "step": 6215 |
| }, |
| { |
| "epoch": 7.530266343825666, |
| "grad_norm": 0.26155713200569153, |
| "learning_rate": 8.736902347803163e-06, |
| "loss": 0.1816, |
| "num_input_tokens_seen": 2551120, |
| "step": 6220 |
| }, |
| { |
| "epoch": 7.536319612590799, |
| "grad_norm": 0.1600845903158188, |
| "learning_rate": 8.696819047611288e-06, |
| "loss": 0.1729, |
| "num_input_tokens_seen": 2553168, |
| "step": 6225 |
| }, |
| { |
| "epoch": 7.5423728813559325, |
| "grad_norm": 0.16967473924160004, |
| "learning_rate": 8.65680853659958e-06, |
| "loss": 0.2011, |
| "num_input_tokens_seen": 2555184, |
| "step": 6230 |
| }, |
| { |
| "epoch": 7.548426150121065, |
| "grad_norm": 0.3509404957294464, |
| "learning_rate": 8.616870993403864e-06, |
| "loss": 0.1908, |
| "num_input_tokens_seen": 2557264, |
| "step": 6235 |
| }, |
| { |
| "epoch": 7.554479418886198, |
| "grad_norm": 0.4448184370994568, |
| "learning_rate": 8.577006596334191e-06, |
| "loss": 0.1522, |
| "num_input_tokens_seen": 2559248, |
| "step": 6240 |
| }, |
| { |
| "epoch": 7.560532687651332, |
| "grad_norm": 0.18600642681121826, |
| "learning_rate": 8.537215523374038e-06, |
| "loss": 0.1745, |
| "num_input_tokens_seen": 2561264, |
| "step": 6245 |
| }, |
| { |
| "epoch": 7.566585956416465, |
| "grad_norm": 0.39082223176956177, |
| "learning_rate": 8.4974979521795e-06, |
| "loss": 0.248, |
| "num_input_tokens_seen": 2563120, |
| "step": 6250 |
| }, |
| { |
| "epoch": 7.572639225181598, |
| "grad_norm": 0.16721884906291962, |
| "learning_rate": 8.45785406007852e-06, |
| "loss": 0.1841, |
| "num_input_tokens_seen": 2565104, |
| "step": 6255 |
| }, |
| { |
| "epoch": 7.578692493946731, |
| "grad_norm": 0.5195043087005615, |
| "learning_rate": 8.418284024070069e-06, |
| "loss": 0.2244, |
| "num_input_tokens_seen": 2567216, |
| "step": 6260 |
| }, |
| { |
| "epoch": 7.584745762711864, |
| "grad_norm": 0.2869345247745514, |
| "learning_rate": 8.378788020823394e-06, |
| "loss": 0.176, |
| "num_input_tokens_seen": 2569360, |
| "step": 6265 |
| }, |
| { |
| "epoch": 7.5907990314769975, |
| "grad_norm": 0.4993572533130646, |
| "learning_rate": 8.33936622667719e-06, |
| "loss": 0.1738, |
| "num_input_tokens_seen": 2571536, |
| "step": 6270 |
| }, |
| { |
| "epoch": 7.596852300242131, |
| "grad_norm": 0.47090262174606323, |
| "learning_rate": 8.300018817638825e-06, |
| "loss": 0.1931, |
| "num_input_tokens_seen": 2573648, |
| "step": 6275 |
| }, |
| { |
| "epoch": 7.602905569007264, |
| "grad_norm": 0.20473451912403107, |
| "learning_rate": 8.260745969383565e-06, |
| "loss": 0.1478, |
| "num_input_tokens_seen": 2575888, |
| "step": 6280 |
| }, |
| { |
| "epoch": 7.608958837772397, |
| "grad_norm": 0.22230201959609985, |
| "learning_rate": 8.221547857253781e-06, |
| "loss": 0.2082, |
| "num_input_tokens_seen": 2578032, |
| "step": 6285 |
| }, |
| { |
| "epoch": 7.61501210653753, |
| "grad_norm": 0.22477145493030548, |
| "learning_rate": 8.182424656258178e-06, |
| "loss": 0.1753, |
| "num_input_tokens_seen": 2580016, |
| "step": 6290 |
| }, |
| { |
| "epoch": 7.621065375302663, |
| "grad_norm": 0.2392880618572235, |
| "learning_rate": 8.143376541070993e-06, |
| "loss": 0.1815, |
| "num_input_tokens_seen": 2582192, |
| "step": 6295 |
| }, |
| { |
| "epoch": 7.627118644067797, |
| "grad_norm": 0.26627084612846375, |
| "learning_rate": 8.104403686031225e-06, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 2584208, |
| "step": 6300 |
| }, |
| { |
| "epoch": 7.63317191283293, |
| "grad_norm": 0.23821784555912018, |
| "learning_rate": 8.06550626514185e-06, |
| "loss": 0.1836, |
| "num_input_tokens_seen": 2586160, |
| "step": 6305 |
| }, |
| { |
| "epoch": 7.6392251815980625, |
| "grad_norm": 0.5925246477127075, |
| "learning_rate": 8.026684452069084e-06, |
| "loss": 0.1881, |
| "num_input_tokens_seen": 2588240, |
| "step": 6310 |
| }, |
| { |
| "epoch": 7.645278450363196, |
| "grad_norm": 0.46966490149497986, |
| "learning_rate": 7.987938420141536e-06, |
| "loss": 0.2023, |
| "num_input_tokens_seen": 2590480, |
| "step": 6315 |
| }, |
| { |
| "epoch": 7.651331719128329, |
| "grad_norm": 0.3513939678668976, |
| "learning_rate": 7.949268342349495e-06, |
| "loss": 0.1665, |
| "num_input_tokens_seen": 2592400, |
| "step": 6320 |
| }, |
| { |
| "epoch": 7.657384987893463, |
| "grad_norm": 0.5639855265617371, |
| "learning_rate": 7.910674391344129e-06, |
| "loss": 0.2155, |
| "num_input_tokens_seen": 2594352, |
| "step": 6325 |
| }, |
| { |
| "epoch": 7.663438256658596, |
| "grad_norm": 0.3428677022457123, |
| "learning_rate": 7.872156739436722e-06, |
| "loss": 0.2361, |
| "num_input_tokens_seen": 2596464, |
| "step": 6330 |
| }, |
| { |
| "epoch": 7.669491525423728, |
| "grad_norm": 0.31462541222572327, |
| "learning_rate": 7.833715558597907e-06, |
| "loss": 0.2222, |
| "num_input_tokens_seen": 2598544, |
| "step": 6335 |
| }, |
| { |
| "epoch": 7.675544794188862, |
| "grad_norm": 0.37966230511665344, |
| "learning_rate": 7.795351020456887e-06, |
| "loss": 0.204, |
| "num_input_tokens_seen": 2600656, |
| "step": 6340 |
| }, |
| { |
| "epoch": 7.681598062953995, |
| "grad_norm": 0.5934991836547852, |
| "learning_rate": 7.757063296300681e-06, |
| "loss": 0.1995, |
| "num_input_tokens_seen": 2602832, |
| "step": 6345 |
| }, |
| { |
| "epoch": 7.687651331719128, |
| "grad_norm": 0.25648924708366394, |
| "learning_rate": 7.718852557073366e-06, |
| "loss": 0.2062, |
| "num_input_tokens_seen": 2605008, |
| "step": 6350 |
| }, |
| { |
| "epoch": 7.693704600484262, |
| "grad_norm": 0.3750579357147217, |
| "learning_rate": 7.680718973375287e-06, |
| "loss": 0.2089, |
| "num_input_tokens_seen": 2607152, |
| "step": 6355 |
| }, |
| { |
| "epoch": 7.699757869249394, |
| "grad_norm": 0.5193522572517395, |
| "learning_rate": 7.642662715462315e-06, |
| "loss": 0.199, |
| "num_input_tokens_seen": 2609264, |
| "step": 6360 |
| }, |
| { |
| "epoch": 7.7058111380145276, |
| "grad_norm": 0.38370561599731445, |
| "learning_rate": 7.604683953245076e-06, |
| "loss": 0.1815, |
| "num_input_tokens_seen": 2611344, |
| "step": 6365 |
| }, |
| { |
| "epoch": 7.711864406779661, |
| "grad_norm": 0.49967658519744873, |
| "learning_rate": 7.566782856288224e-06, |
| "loss": 0.225, |
| "num_input_tokens_seen": 2613488, |
| "step": 6370 |
| }, |
| { |
| "epoch": 7.717917675544794, |
| "grad_norm": 0.8820407390594482, |
| "learning_rate": 7.5289595938096344e-06, |
| "loss": 0.2263, |
| "num_input_tokens_seen": 2615408, |
| "step": 6375 |
| }, |
| { |
| "epoch": 7.723970944309928, |
| "grad_norm": 0.2882877290248871, |
| "learning_rate": 7.4912143346796805e-06, |
| "loss": 0.1609, |
| "num_input_tokens_seen": 2617424, |
| "step": 6380 |
| }, |
| { |
| "epoch": 7.73002421307506, |
| "grad_norm": 0.2562648057937622, |
| "learning_rate": 7.4535472474204645e-06, |
| "loss": 0.186, |
| "num_input_tokens_seen": 2619312, |
| "step": 6385 |
| }, |
| { |
| "epoch": 7.736077481840193, |
| "grad_norm": 0.32210859656333923, |
| "learning_rate": 7.415958500205103e-06, |
| "loss": 0.2744, |
| "num_input_tokens_seen": 2621168, |
| "step": 6390 |
| }, |
| { |
| "epoch": 7.742130750605327, |
| "grad_norm": 0.3754838705062866, |
| "learning_rate": 7.37844826085691e-06, |
| "loss": 0.1679, |
| "num_input_tokens_seen": 2623152, |
| "step": 6395 |
| }, |
| { |
| "epoch": 7.74818401937046, |
| "grad_norm": 0.6544762253761292, |
| "learning_rate": 7.341016696848699e-06, |
| "loss": 0.174, |
| "num_input_tokens_seen": 2625328, |
| "step": 6400 |
| }, |
| { |
| "epoch": 7.754237288135593, |
| "grad_norm": 0.2997070252895355, |
| "learning_rate": 7.303663975302022e-06, |
| "loss": 0.1707, |
| "num_input_tokens_seen": 2627536, |
| "step": 6405 |
| }, |
| { |
| "epoch": 7.760290556900727, |
| "grad_norm": 0.6238000392913818, |
| "learning_rate": 7.2663902629864165e-06, |
| "loss": 0.1843, |
| "num_input_tokens_seen": 2629616, |
| "step": 6410 |
| }, |
| { |
| "epoch": 7.766343825665859, |
| "grad_norm": 0.6944513916969299, |
| "learning_rate": 7.229195726318669e-06, |
| "loss": 0.1671, |
| "num_input_tokens_seen": 2631696, |
| "step": 6415 |
| }, |
| { |
| "epoch": 7.772397094430993, |
| "grad_norm": 0.3414466679096222, |
| "learning_rate": 7.192080531362067e-06, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 2633776, |
| "step": 6420 |
| }, |
| { |
| "epoch": 7.778450363196126, |
| "grad_norm": 0.6340538263320923, |
| "learning_rate": 7.155044843825651e-06, |
| "loss": 0.1968, |
| "num_input_tokens_seen": 2635920, |
| "step": 6425 |
| }, |
| { |
| "epoch": 7.784503631961259, |
| "grad_norm": 0.16706466674804688, |
| "learning_rate": 7.118088829063504e-06, |
| "loss": 0.1469, |
| "num_input_tokens_seen": 2637936, |
| "step": 6430 |
| }, |
| { |
| "epoch": 7.790556900726393, |
| "grad_norm": 0.3441086411476135, |
| "learning_rate": 7.081212652073979e-06, |
| "loss": 0.2074, |
| "num_input_tokens_seen": 2639984, |
| "step": 6435 |
| }, |
| { |
| "epoch": 7.796610169491525, |
| "grad_norm": 0.5931211113929749, |
| "learning_rate": 7.044416477498972e-06, |
| "loss": 0.2321, |
| "num_input_tokens_seen": 2642000, |
| "step": 6440 |
| }, |
| { |
| "epoch": 7.802663438256658, |
| "grad_norm": 0.3084107041358948, |
| "learning_rate": 7.007700469623185e-06, |
| "loss": 0.1949, |
| "num_input_tokens_seen": 2644016, |
| "step": 6445 |
| }, |
| { |
| "epoch": 7.808716707021792, |
| "grad_norm": 0.23492549359798431, |
| "learning_rate": 6.971064792373427e-06, |
| "loss": 0.2085, |
| "num_input_tokens_seen": 2646000, |
| "step": 6450 |
| }, |
| { |
| "epoch": 7.814769975786925, |
| "grad_norm": 0.729832112789154, |
| "learning_rate": 6.934509609317821e-06, |
| "loss": 0.2352, |
| "num_input_tokens_seen": 2648048, |
| "step": 6455 |
| }, |
| { |
| "epoch": 7.8208232445520585, |
| "grad_norm": 0.5351848602294922, |
| "learning_rate": 6.898035083665124e-06, |
| "loss": 0.185, |
| "num_input_tokens_seen": 2650064, |
| "step": 6460 |
| }, |
| { |
| "epoch": 7.826876513317191, |
| "grad_norm": 0.5391151905059814, |
| "learning_rate": 6.861641378263964e-06, |
| "loss": 0.2177, |
| "num_input_tokens_seen": 2652016, |
| "step": 6465 |
| }, |
| { |
| "epoch": 7.832929782082324, |
| "grad_norm": 0.3528791069984436, |
| "learning_rate": 6.825328655602153e-06, |
| "loss": 0.2053, |
| "num_input_tokens_seen": 2654192, |
| "step": 6470 |
| }, |
| { |
| "epoch": 7.838983050847458, |
| "grad_norm": 0.3976289629936218, |
| "learning_rate": 6.789097077805917e-06, |
| "loss": 0.1793, |
| "num_input_tokens_seen": 2656208, |
| "step": 6475 |
| }, |
| { |
| "epoch": 7.845036319612591, |
| "grad_norm": 0.25404056906700134, |
| "learning_rate": 6.7529468066392015e-06, |
| "loss": 0.2116, |
| "num_input_tokens_seen": 2658384, |
| "step": 6480 |
| }, |
| { |
| "epoch": 7.851089588377724, |
| "grad_norm": 0.6700761914253235, |
| "learning_rate": 6.7168780035029385e-06, |
| "loss": 0.1969, |
| "num_input_tokens_seen": 2660528, |
| "step": 6485 |
| }, |
| { |
| "epoch": 7.857142857142857, |
| "grad_norm": 0.3516135513782501, |
| "learning_rate": 6.680890829434325e-06, |
| "loss": 0.22, |
| "num_input_tokens_seen": 2662640, |
| "step": 6490 |
| }, |
| { |
| "epoch": 7.86319612590799, |
| "grad_norm": 0.2062719315290451, |
| "learning_rate": 6.644985445106114e-06, |
| "loss": 0.226, |
| "num_input_tokens_seen": 2664624, |
| "step": 6495 |
| }, |
| { |
| "epoch": 7.8692493946731235, |
| "grad_norm": 0.36615464091300964, |
| "learning_rate": 6.609162010825881e-06, |
| "loss": 0.1469, |
| "num_input_tokens_seen": 2666640, |
| "step": 6500 |
| }, |
| { |
| "epoch": 7.875302663438257, |
| "grad_norm": 0.4066512882709503, |
| "learning_rate": 6.573420686535317e-06, |
| "loss": 0.1825, |
| "num_input_tokens_seen": 2668592, |
| "step": 6505 |
| }, |
| { |
| "epoch": 7.88135593220339, |
| "grad_norm": 0.16517671942710876, |
| "learning_rate": 6.537761631809533e-06, |
| "loss": 0.1821, |
| "num_input_tokens_seen": 2670704, |
| "step": 6510 |
| }, |
| { |
| "epoch": 7.8874092009685235, |
| "grad_norm": 0.24977752566337585, |
| "learning_rate": 6.502185005856312e-06, |
| "loss": 0.2068, |
| "num_input_tokens_seen": 2672752, |
| "step": 6515 |
| }, |
| { |
| "epoch": 7.893462469733656, |
| "grad_norm": 0.2774293124675751, |
| "learning_rate": 6.4666909675154155e-06, |
| "loss": 0.173, |
| "num_input_tokens_seen": 2674864, |
| "step": 6520 |
| }, |
| { |
| "epoch": 7.899515738498789, |
| "grad_norm": 0.6305215954780579, |
| "learning_rate": 6.431279675257873e-06, |
| "loss": 0.2029, |
| "num_input_tokens_seen": 2676944, |
| "step": 6525 |
| }, |
| { |
| "epoch": 7.905569007263923, |
| "grad_norm": 0.3509519398212433, |
| "learning_rate": 6.395951287185295e-06, |
| "loss": 0.2087, |
| "num_input_tokens_seen": 2679024, |
| "step": 6530 |
| }, |
| { |
| "epoch": 7.911622276029056, |
| "grad_norm": 0.3554039001464844, |
| "learning_rate": 6.360705961029126e-06, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 2680976, |
| "step": 6535 |
| }, |
| { |
| "epoch": 7.917675544794189, |
| "grad_norm": 0.18056441843509674, |
| "learning_rate": 6.325543854149968e-06, |
| "loss": 0.16, |
| "num_input_tokens_seen": 2683088, |
| "step": 6540 |
| }, |
| { |
| "epoch": 7.923728813559322, |
| "grad_norm": 0.2351856678724289, |
| "learning_rate": 6.290465123536876e-06, |
| "loss": 0.2458, |
| "num_input_tokens_seen": 2685072, |
| "step": 6545 |
| }, |
| { |
| "epoch": 7.929782082324455, |
| "grad_norm": 0.46706005930900574, |
| "learning_rate": 6.255469925806643e-06, |
| "loss": 0.1786, |
| "num_input_tokens_seen": 2687184, |
| "step": 6550 |
| }, |
| { |
| "epoch": 7.9358353510895885, |
| "grad_norm": 0.40436601638793945, |
| "learning_rate": 6.220558417203132e-06, |
| "loss": 0.1804, |
| "num_input_tokens_seen": 2689232, |
| "step": 6555 |
| }, |
| { |
| "epoch": 7.941888619854722, |
| "grad_norm": 0.32167142629623413, |
| "learning_rate": 6.185730753596539e-06, |
| "loss": 0.156, |
| "num_input_tokens_seen": 2691280, |
| "step": 6560 |
| }, |
| { |
| "epoch": 7.947941888619855, |
| "grad_norm": 0.7094486355781555, |
| "learning_rate": 6.150987090482715e-06, |
| "loss": 0.1874, |
| "num_input_tokens_seen": 2693328, |
| "step": 6565 |
| }, |
| { |
| "epoch": 7.953995157384988, |
| "grad_norm": 0.6512470245361328, |
| "learning_rate": 6.116327582982484e-06, |
| "loss": 0.1722, |
| "num_input_tokens_seen": 2695440, |
| "step": 6570 |
| }, |
| { |
| "epoch": 7.960048426150121, |
| "grad_norm": 0.4269331395626068, |
| "learning_rate": 6.0817523858409245e-06, |
| "loss": 0.1372, |
| "num_input_tokens_seen": 2697488, |
| "step": 6575 |
| }, |
| { |
| "epoch": 7.966101694915254, |
| "grad_norm": 0.16428448259830475, |
| "learning_rate": 6.047261653426708e-06, |
| "loss": 0.1753, |
| "num_input_tokens_seen": 2699504, |
| "step": 6580 |
| }, |
| { |
| "epoch": 7.972154963680388, |
| "grad_norm": 0.5245192646980286, |
| "learning_rate": 6.012855539731374e-06, |
| "loss": 0.1825, |
| "num_input_tokens_seen": 2701456, |
| "step": 6585 |
| }, |
| { |
| "epoch": 7.978208232445521, |
| "grad_norm": 0.3002568483352661, |
| "learning_rate": 5.978534198368691e-06, |
| "loss": 0.1428, |
| "num_input_tokens_seen": 2703504, |
| "step": 6590 |
| }, |
| { |
| "epoch": 7.9842615012106535, |
| "grad_norm": 0.5208486914634705, |
| "learning_rate": 5.944297782573918e-06, |
| "loss": 0.1717, |
| "num_input_tokens_seen": 2705616, |
| "step": 6595 |
| }, |
| { |
| "epoch": 7.990314769975787, |
| "grad_norm": 0.23970891535282135, |
| "learning_rate": 5.910146445203154e-06, |
| "loss": 0.1391, |
| "num_input_tokens_seen": 2707728, |
| "step": 6600 |
| }, |
| { |
| "epoch": 7.99636803874092, |
| "grad_norm": 0.4893709123134613, |
| "learning_rate": 5.876080338732643e-06, |
| "loss": 0.1757, |
| "num_input_tokens_seen": 2709776, |
| "step": 6605 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.1937568485736847, |
| "eval_runtime": 7.6786, |
| "eval_samples_per_second": 47.795, |
| "eval_steps_per_second": 11.981, |
| "num_input_tokens_seen": 2710624, |
| "step": 6608 |
| }, |
| { |
| "epoch": 8.002421307506053, |
| "grad_norm": 0.3293020725250244, |
| "learning_rate": 5.842099615258109e-06, |
| "loss": 0.2257, |
| "num_input_tokens_seen": 2711456, |
| "step": 6610 |
| }, |
| { |
| "epoch": 8.008474576271187, |
| "grad_norm": 0.16256506741046906, |
| "learning_rate": 5.808204426494054e-06, |
| "loss": 0.1804, |
| "num_input_tokens_seen": 2713536, |
| "step": 6615 |
| }, |
| { |
| "epoch": 8.01452784503632, |
| "grad_norm": 0.2330913096666336, |
| "learning_rate": 5.774394923773088e-06, |
| "loss": 0.1956, |
| "num_input_tokens_seen": 2715680, |
| "step": 6620 |
| }, |
| { |
| "epoch": 8.020581113801454, |
| "grad_norm": 0.3086908459663391, |
| "learning_rate": 5.74067125804526e-06, |
| "loss": 0.1759, |
| "num_input_tokens_seen": 2717728, |
| "step": 6625 |
| }, |
| { |
| "epoch": 8.026634382566586, |
| "grad_norm": 0.297152578830719, |
| "learning_rate": 5.70703357987738e-06, |
| "loss": 0.1842, |
| "num_input_tokens_seen": 2719712, |
| "step": 6630 |
| }, |
| { |
| "epoch": 8.032687651331718, |
| "grad_norm": 0.6419103741645813, |
| "learning_rate": 5.673482039452363e-06, |
| "loss": 0.1589, |
| "num_input_tokens_seen": 2721856, |
| "step": 6635 |
| }, |
| { |
| "epoch": 8.038740920096853, |
| "grad_norm": 0.45413094758987427, |
| "learning_rate": 5.640016786568525e-06, |
| "loss": 0.2187, |
| "num_input_tokens_seen": 2724000, |
| "step": 6640 |
| }, |
| { |
| "epoch": 8.044794188861985, |
| "grad_norm": 0.3285627067089081, |
| "learning_rate": 5.606637970638917e-06, |
| "loss": 0.1993, |
| "num_input_tokens_seen": 2725792, |
| "step": 6645 |
| }, |
| { |
| "epoch": 8.05084745762712, |
| "grad_norm": 0.183476060628891, |
| "learning_rate": 5.573345740690714e-06, |
| "loss": 0.2077, |
| "num_input_tokens_seen": 2727744, |
| "step": 6650 |
| }, |
| { |
| "epoch": 8.056900726392252, |
| "grad_norm": 0.23720000684261322, |
| "learning_rate": 5.540140245364478e-06, |
| "loss": 0.1874, |
| "num_input_tokens_seen": 2729856, |
| "step": 6655 |
| }, |
| { |
| "epoch": 8.062953995157384, |
| "grad_norm": 0.439514696598053, |
| "learning_rate": 5.5070216329135365e-06, |
| "loss": 0.206, |
| "num_input_tokens_seen": 2731872, |
| "step": 6660 |
| }, |
| { |
| "epoch": 8.069007263922519, |
| "grad_norm": 0.5628945827484131, |
| "learning_rate": 5.473990051203298e-06, |
| "loss": 0.2102, |
| "num_input_tokens_seen": 2734016, |
| "step": 6665 |
| }, |
| { |
| "epoch": 8.075060532687651, |
| "grad_norm": 0.566626787185669, |
| "learning_rate": 5.441045647710627e-06, |
| "loss": 0.1789, |
| "num_input_tokens_seen": 2736032, |
| "step": 6670 |
| }, |
| { |
| "epoch": 8.081113801452785, |
| "grad_norm": 0.5366233587265015, |
| "learning_rate": 5.408188569523137e-06, |
| "loss": 0.2214, |
| "num_input_tokens_seen": 2737952, |
| "step": 6675 |
| }, |
| { |
| "epoch": 8.087167070217918, |
| "grad_norm": 0.28939247131347656, |
| "learning_rate": 5.375418963338566e-06, |
| "loss": 0.2275, |
| "num_input_tokens_seen": 2739968, |
| "step": 6680 |
| }, |
| { |
| "epoch": 8.09322033898305, |
| "grad_norm": 0.6849366426467896, |
| "learning_rate": 5.342736975464116e-06, |
| "loss": 0.1812, |
| "num_input_tokens_seen": 2742016, |
| "step": 6685 |
| }, |
| { |
| "epoch": 8.099273607748184, |
| "grad_norm": 0.19537819921970367, |
| "learning_rate": 5.310142751815792e-06, |
| "loss": 0.1887, |
| "num_input_tokens_seen": 2744128, |
| "step": 6690 |
| }, |
| { |
| "epoch": 8.105326876513317, |
| "grad_norm": 0.5376015901565552, |
| "learning_rate": 5.277636437917769e-06, |
| "loss": 0.2059, |
| "num_input_tokens_seen": 2746112, |
| "step": 6695 |
| }, |
| { |
| "epoch": 8.111380145278451, |
| "grad_norm": 0.2035432755947113, |
| "learning_rate": 5.245218178901717e-06, |
| "loss": 0.17, |
| "num_input_tokens_seen": 2748128, |
| "step": 6700 |
| }, |
| { |
| "epoch": 8.117433414043584, |
| "grad_norm": 0.6067910194396973, |
| "learning_rate": 5.212888119506168e-06, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 2750176, |
| "step": 6705 |
| }, |
| { |
| "epoch": 8.123486682808716, |
| "grad_norm": 0.17496290802955627, |
| "learning_rate": 5.180646404075862e-06, |
| "loss": 0.192, |
| "num_input_tokens_seen": 2752256, |
| "step": 6710 |
| }, |
| { |
| "epoch": 8.12953995157385, |
| "grad_norm": 0.5829048752784729, |
| "learning_rate": 5.1484931765611286e-06, |
| "loss": 0.2313, |
| "num_input_tokens_seen": 2754208, |
| "step": 6715 |
| }, |
| { |
| "epoch": 8.135593220338983, |
| "grad_norm": 0.2960008382797241, |
| "learning_rate": 5.116428580517207e-06, |
| "loss": 0.2284, |
| "num_input_tokens_seen": 2756352, |
| "step": 6720 |
| }, |
| { |
| "epoch": 8.141646489104117, |
| "grad_norm": 0.6623200178146362, |
| "learning_rate": 5.084452759103603e-06, |
| "loss": 0.1768, |
| "num_input_tokens_seen": 2758528, |
| "step": 6725 |
| }, |
| { |
| "epoch": 8.14769975786925, |
| "grad_norm": 0.233219712972641, |
| "learning_rate": 5.052565855083511e-06, |
| "loss": 0.1774, |
| "num_input_tokens_seen": 2760608, |
| "step": 6730 |
| }, |
| { |
| "epoch": 8.153753026634382, |
| "grad_norm": 0.27998173236846924, |
| "learning_rate": 5.020768010823102e-06, |
| "loss": 0.1745, |
| "num_input_tokens_seen": 2762656, |
| "step": 6735 |
| }, |
| { |
| "epoch": 8.159806295399516, |
| "grad_norm": 0.1849880814552307, |
| "learning_rate": 4.98905936829093e-06, |
| "loss": 0.1954, |
| "num_input_tokens_seen": 2764672, |
| "step": 6740 |
| }, |
| { |
| "epoch": 8.165859564164649, |
| "grad_norm": 0.6322874426841736, |
| "learning_rate": 4.957440069057281e-06, |
| "loss": 0.2072, |
| "num_input_tokens_seen": 2766752, |
| "step": 6745 |
| }, |
| { |
| "epoch": 8.171912832929783, |
| "grad_norm": 0.34019213914871216, |
| "learning_rate": 4.92591025429357e-06, |
| "loss": 0.1741, |
| "num_input_tokens_seen": 2768736, |
| "step": 6750 |
| }, |
| { |
| "epoch": 8.177966101694915, |
| "grad_norm": 0.2495529055595398, |
| "learning_rate": 4.8944700647716616e-06, |
| "loss": 0.2333, |
| "num_input_tokens_seen": 2770752, |
| "step": 6755 |
| }, |
| { |
| "epoch": 8.184019370460048, |
| "grad_norm": 0.35278037190437317, |
| "learning_rate": 4.863119640863284e-06, |
| "loss": 0.209, |
| "num_input_tokens_seen": 2772928, |
| "step": 6760 |
| }, |
| { |
| "epoch": 8.190072639225182, |
| "grad_norm": 0.3446584641933441, |
| "learning_rate": 4.831859122539381e-06, |
| "loss": 0.2109, |
| "num_input_tokens_seen": 2774976, |
| "step": 6765 |
| }, |
| { |
| "epoch": 8.196125907990314, |
| "grad_norm": 0.37324345111846924, |
| "learning_rate": 4.800688649369489e-06, |
| "loss": 0.1989, |
| "num_input_tokens_seen": 2777152, |
| "step": 6770 |
| }, |
| { |
| "epoch": 8.202179176755449, |
| "grad_norm": 0.11549874395132065, |
| "learning_rate": 4.769608360521135e-06, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 2779296, |
| "step": 6775 |
| }, |
| { |
| "epoch": 8.208232445520581, |
| "grad_norm": 0.3937399089336395, |
| "learning_rate": 4.7386183947591815e-06, |
| "loss": 0.1826, |
| "num_input_tokens_seen": 2781376, |
| "step": 6780 |
| }, |
| { |
| "epoch": 8.214285714285714, |
| "grad_norm": 0.2797986567020416, |
| "learning_rate": 4.7077188904452255e-06, |
| "loss": 0.2112, |
| "num_input_tokens_seen": 2783424, |
| "step": 6785 |
| }, |
| { |
| "epoch": 8.220338983050848, |
| "grad_norm": 0.30990055203437805, |
| "learning_rate": 4.676909985536981e-06, |
| "loss": 0.223, |
| "num_input_tokens_seen": 2785376, |
| "step": 6790 |
| }, |
| { |
| "epoch": 8.22639225181598, |
| "grad_norm": 0.15921911597251892, |
| "learning_rate": 4.64619181758767e-06, |
| "loss": 0.2123, |
| "num_input_tokens_seen": 2787456, |
| "step": 6795 |
| }, |
| { |
| "epoch": 8.232445520581114, |
| "grad_norm": 0.40815281867980957, |
| "learning_rate": 4.615564523745391e-06, |
| "loss": 0.1582, |
| "num_input_tokens_seen": 2789568, |
| "step": 6800 |
| }, |
| { |
| "epoch": 8.238498789346247, |
| "grad_norm": 0.16876575350761414, |
| "learning_rate": 4.585028240752498e-06, |
| "loss": 0.1938, |
| "num_input_tokens_seen": 2791584, |
| "step": 6805 |
| }, |
| { |
| "epoch": 8.24455205811138, |
| "grad_norm": 0.2718143165111542, |
| "learning_rate": 4.554583104945037e-06, |
| "loss": 0.2232, |
| "num_input_tokens_seen": 2793632, |
| "step": 6810 |
| }, |
| { |
| "epoch": 8.250605326876514, |
| "grad_norm": 0.8955281376838684, |
| "learning_rate": 4.524229252252091e-06, |
| "loss": 0.2211, |
| "num_input_tokens_seen": 2795744, |
| "step": 6815 |
| }, |
| { |
| "epoch": 8.256658595641646, |
| "grad_norm": 0.17680075764656067, |
| "learning_rate": 4.493966818195191e-06, |
| "loss": 0.2047, |
| "num_input_tokens_seen": 2797696, |
| "step": 6820 |
| }, |
| { |
| "epoch": 8.26271186440678, |
| "grad_norm": 0.48575228452682495, |
| "learning_rate": 4.463795937887713e-06, |
| "loss": 0.2082, |
| "num_input_tokens_seen": 2799744, |
| "step": 6825 |
| }, |
| { |
| "epoch": 8.268765133171913, |
| "grad_norm": 0.22202694416046143, |
| "learning_rate": 4.433716746034252e-06, |
| "loss": 0.1391, |
| "num_input_tokens_seen": 2801824, |
| "step": 6830 |
| }, |
| { |
| "epoch": 8.274818401937045, |
| "grad_norm": 0.4246073365211487, |
| "learning_rate": 4.40372937693008e-06, |
| "loss": 0.2011, |
| "num_input_tokens_seen": 2803872, |
| "step": 6835 |
| }, |
| { |
| "epoch": 8.28087167070218, |
| "grad_norm": 0.8069907426834106, |
| "learning_rate": 4.3738339644604635e-06, |
| "loss": 0.2255, |
| "num_input_tokens_seen": 2805824, |
| "step": 6840 |
| }, |
| { |
| "epoch": 8.286924939467312, |
| "grad_norm": 0.37280088663101196, |
| "learning_rate": 4.344030642100133e-06, |
| "loss": 0.2127, |
| "num_input_tokens_seen": 2807712, |
| "step": 6845 |
| }, |
| { |
| "epoch": 8.292978208232446, |
| "grad_norm": 0.5240939855575562, |
| "learning_rate": 4.314319542912643e-06, |
| "loss": 0.1653, |
| "num_input_tokens_seen": 2809760, |
| "step": 6850 |
| }, |
| { |
| "epoch": 8.299031476997579, |
| "grad_norm": 0.45029082894325256, |
| "learning_rate": 4.284700799549829e-06, |
| "loss": 0.2109, |
| "num_input_tokens_seen": 2811872, |
| "step": 6855 |
| }, |
| { |
| "epoch": 8.305084745762711, |
| "grad_norm": 0.3136141002178192, |
| "learning_rate": 4.255174544251147e-06, |
| "loss": 0.2217, |
| "num_input_tokens_seen": 2813888, |
| "step": 6860 |
| }, |
| { |
| "epoch": 8.311138014527845, |
| "grad_norm": 0.18651261925697327, |
| "learning_rate": 4.225740908843146e-06, |
| "loss": 0.143, |
| "num_input_tokens_seen": 2815808, |
| "step": 6865 |
| }, |
| { |
| "epoch": 8.317191283292978, |
| "grad_norm": 0.293517142534256, |
| "learning_rate": 4.196400024738831e-06, |
| "loss": 0.183, |
| "num_input_tokens_seen": 2817760, |
| "step": 6870 |
| }, |
| { |
| "epoch": 8.323244552058112, |
| "grad_norm": 0.39754846692085266, |
| "learning_rate": 4.167152022937124e-06, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 2819776, |
| "step": 6875 |
| }, |
| { |
| "epoch": 8.329297820823244, |
| "grad_norm": 0.6498522162437439, |
| "learning_rate": 4.137997034022237e-06, |
| "loss": 0.2277, |
| "num_input_tokens_seen": 2821824, |
| "step": 6880 |
| }, |
| { |
| "epoch": 8.335351089588377, |
| "grad_norm": 0.23051509261131287, |
| "learning_rate": 4.108935188163096e-06, |
| "loss": 0.1779, |
| "num_input_tokens_seen": 2823968, |
| "step": 6885 |
| }, |
| { |
| "epoch": 8.341404358353511, |
| "grad_norm": 0.3624891936779022, |
| "learning_rate": 4.079966615112782e-06, |
| "loss": 0.1555, |
| "num_input_tokens_seen": 2825920, |
| "step": 6890 |
| }, |
| { |
| "epoch": 8.347457627118644, |
| "grad_norm": 0.23736338317394257, |
| "learning_rate": 4.05109144420795e-06, |
| "loss": 0.1753, |
| "num_input_tokens_seen": 2828224, |
| "step": 6895 |
| }, |
| { |
| "epoch": 8.353510895883778, |
| "grad_norm": 0.1986524909734726, |
| "learning_rate": 4.022309804368215e-06, |
| "loss": 0.1389, |
| "num_input_tokens_seen": 2830208, |
| "step": 6900 |
| }, |
| { |
| "epoch": 8.35956416464891, |
| "grad_norm": 0.5415716171264648, |
| "learning_rate": 3.993621824095622e-06, |
| "loss": 0.168, |
| "num_input_tokens_seen": 2832160, |
| "step": 6905 |
| }, |
| { |
| "epoch": 8.365617433414043, |
| "grad_norm": 0.30332624912261963, |
| "learning_rate": 3.965027631474036e-06, |
| "loss": 0.1837, |
| "num_input_tokens_seen": 2834208, |
| "step": 6910 |
| }, |
| { |
| "epoch": 8.371670702179177, |
| "grad_norm": 0.3517955243587494, |
| "learning_rate": 3.936527354168606e-06, |
| "loss": 0.2034, |
| "num_input_tokens_seen": 2836320, |
| "step": 6915 |
| }, |
| { |
| "epoch": 8.37772397094431, |
| "grad_norm": 0.3454847037792206, |
| "learning_rate": 3.90812111942516e-06, |
| "loss": 0.2182, |
| "num_input_tokens_seen": 2838464, |
| "step": 6920 |
| }, |
| { |
| "epoch": 8.383777239709444, |
| "grad_norm": 0.08716411143541336, |
| "learning_rate": 3.8798090540696495e-06, |
| "loss": 0.1846, |
| "num_input_tokens_seen": 2840512, |
| "step": 6925 |
| }, |
| { |
| "epoch": 8.389830508474576, |
| "grad_norm": 0.19822971522808075, |
| "learning_rate": 3.851591284507591e-06, |
| "loss": 0.1907, |
| "num_input_tokens_seen": 2842592, |
| "step": 6930 |
| }, |
| { |
| "epoch": 8.39588377723971, |
| "grad_norm": 0.3349051773548126, |
| "learning_rate": 3.82346793672351e-06, |
| "loss": 0.2163, |
| "num_input_tokens_seen": 2844576, |
| "step": 6935 |
| }, |
| { |
| "epoch": 8.401937046004843, |
| "grad_norm": 0.3251063823699951, |
| "learning_rate": 3.795439136280346e-06, |
| "loss": 0.2257, |
| "num_input_tokens_seen": 2846720, |
| "step": 6940 |
| }, |
| { |
| "epoch": 8.407990314769975, |
| "grad_norm": 0.27988186478614807, |
| "learning_rate": 3.767505008318914e-06, |
| "loss": 0.1973, |
| "num_input_tokens_seen": 2848832, |
| "step": 6945 |
| }, |
| { |
| "epoch": 8.41404358353511, |
| "grad_norm": 0.5585899353027344, |
| "learning_rate": 3.739665677557341e-06, |
| "loss": 0.1928, |
| "num_input_tokens_seen": 2850880, |
| "step": 6950 |
| }, |
| { |
| "epoch": 8.420096852300242, |
| "grad_norm": 0.2884608209133148, |
| "learning_rate": 3.711921268290533e-06, |
| "loss": 0.1702, |
| "num_input_tokens_seen": 2852896, |
| "step": 6955 |
| }, |
| { |
| "epoch": 8.426150121065376, |
| "grad_norm": 0.48848360776901245, |
| "learning_rate": 3.6842719043895748e-06, |
| "loss": 0.1945, |
| "num_input_tokens_seen": 2855008, |
| "step": 6960 |
| }, |
| { |
| "epoch": 8.432203389830509, |
| "grad_norm": 0.41199782490730286, |
| "learning_rate": 3.656717709301194e-06, |
| "loss": 0.172, |
| "num_input_tokens_seen": 2856960, |
| "step": 6965 |
| }, |
| { |
| "epoch": 8.438256658595641, |
| "grad_norm": 0.4844982326030731, |
| "learning_rate": 3.629258806047231e-06, |
| "loss": 0.2059, |
| "num_input_tokens_seen": 2859200, |
| "step": 6970 |
| }, |
| { |
| "epoch": 8.444309927360775, |
| "grad_norm": 0.5721977949142456, |
| "learning_rate": 3.60189531722408e-06, |
| "loss": 0.2026, |
| "num_input_tokens_seen": 2861280, |
| "step": 6975 |
| }, |
| { |
| "epoch": 8.450363196125908, |
| "grad_norm": 0.43065741658210754, |
| "learning_rate": 3.5746273650021228e-06, |
| "loss": 0.2241, |
| "num_input_tokens_seen": 2863392, |
| "step": 6980 |
| }, |
| { |
| "epoch": 8.456416464891042, |
| "grad_norm": 0.2157776802778244, |
| "learning_rate": 3.5474550711252026e-06, |
| "loss": 0.2135, |
| "num_input_tokens_seen": 2865344, |
| "step": 6985 |
| }, |
| { |
| "epoch": 8.462469733656174, |
| "grad_norm": 0.22446975111961365, |
| "learning_rate": 3.5203785569100674e-06, |
| "loss": 0.2619, |
| "num_input_tokens_seen": 2867424, |
| "step": 6990 |
| }, |
| { |
| "epoch": 8.468523002421307, |
| "grad_norm": 0.14394637942314148, |
| "learning_rate": 3.493397943245852e-06, |
| "loss": 0.1351, |
| "num_input_tokens_seen": 2869472, |
| "step": 6995 |
| }, |
| { |
| "epoch": 8.474576271186441, |
| "grad_norm": 0.5854501128196716, |
| "learning_rate": 3.466513350593506e-06, |
| "loss": 0.239, |
| "num_input_tokens_seen": 2871680, |
| "step": 7000 |
| }, |
| { |
| "epoch": 8.480629539951574, |
| "grad_norm": 0.37906989455223083, |
| "learning_rate": 3.439724898985278e-06, |
| "loss": 0.1856, |
| "num_input_tokens_seen": 2873760, |
| "step": 7005 |
| }, |
| { |
| "epoch": 8.486682808716708, |
| "grad_norm": 0.2611500322818756, |
| "learning_rate": 3.4130327080241636e-06, |
| "loss": 0.1877, |
| "num_input_tokens_seen": 2875712, |
| "step": 7010 |
| }, |
| { |
| "epoch": 8.49273607748184, |
| "grad_norm": 0.19859932363033295, |
| "learning_rate": 3.3864368968834074e-06, |
| "loss": 0.1947, |
| "num_input_tokens_seen": 2877696, |
| "step": 7015 |
| }, |
| { |
| "epoch": 8.498789346246973, |
| "grad_norm": 0.39439839124679565, |
| "learning_rate": 3.3599375843059193e-06, |
| "loss": 0.1737, |
| "num_input_tokens_seen": 2879680, |
| "step": 7020 |
| }, |
| { |
| "epoch": 8.5, |
| "eval_loss": 0.1940288543701172, |
| "eval_runtime": 7.6565, |
| "eval_samples_per_second": 47.933, |
| "eval_steps_per_second": 12.016, |
| "num_input_tokens_seen": 2880128, |
| "step": 7021 |
| }, |
| { |
| "epoch": 8.504842615012107, |
| "grad_norm": 0.38143688440322876, |
| "learning_rate": 3.3335348886037815e-06, |
| "loss": 0.2286, |
| "num_input_tokens_seen": 2881728, |
| "step": 7025 |
| }, |
| { |
| "epoch": 8.51089588377724, |
| "grad_norm": 0.14638325572013855, |
| "learning_rate": 3.3072289276576964e-06, |
| "loss": 0.1787, |
| "num_input_tokens_seen": 2883744, |
| "step": 7030 |
| }, |
| { |
| "epoch": 8.516949152542374, |
| "grad_norm": 0.19705192744731903, |
| "learning_rate": 3.281019818916492e-06, |
| "loss": 0.2132, |
| "num_input_tokens_seen": 2885792, |
| "step": 7035 |
| }, |
| { |
| "epoch": 8.523002421307506, |
| "grad_norm": 0.24369968473911285, |
| "learning_rate": 3.254907679396574e-06, |
| "loss": 0.177, |
| "num_input_tokens_seen": 2888000, |
| "step": 7040 |
| }, |
| { |
| "epoch": 8.529055690072639, |
| "grad_norm": 0.37336161732673645, |
| "learning_rate": 3.2288926256813846e-06, |
| "loss": 0.1394, |
| "num_input_tokens_seen": 2890016, |
| "step": 7045 |
| }, |
| { |
| "epoch": 8.535108958837773, |
| "grad_norm": 0.27337437868118286, |
| "learning_rate": 3.2029747739209247e-06, |
| "loss": 0.2018, |
| "num_input_tokens_seen": 2891968, |
| "step": 7050 |
| }, |
| { |
| "epoch": 8.541162227602905, |
| "grad_norm": 0.14469315111637115, |
| "learning_rate": 3.177154239831223e-06, |
| "loss": 0.1498, |
| "num_input_tokens_seen": 2893952, |
| "step": 7055 |
| }, |
| { |
| "epoch": 8.54721549636804, |
| "grad_norm": 0.2165355682373047, |
| "learning_rate": 3.1514311386937917e-06, |
| "loss": 0.2137, |
| "num_input_tokens_seen": 2895904, |
| "step": 7060 |
| }, |
| { |
| "epoch": 8.553268765133172, |
| "grad_norm": 0.45738929510116577, |
| "learning_rate": 3.1258055853551487e-06, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 2897952, |
| "step": 7065 |
| }, |
| { |
| "epoch": 8.559322033898304, |
| "grad_norm": 0.4926764965057373, |
| "learning_rate": 3.1002776942262696e-06, |
| "loss": 0.2078, |
| "num_input_tokens_seen": 2900064, |
| "step": 7070 |
| }, |
| { |
| "epoch": 8.565375302663439, |
| "grad_norm": 0.19983620941638947, |
| "learning_rate": 3.0748475792821197e-06, |
| "loss": 0.1755, |
| "num_input_tokens_seen": 2902432, |
| "step": 7075 |
| }, |
| { |
| "epoch": 8.571428571428571, |
| "grad_norm": 0.18823428452014923, |
| "learning_rate": 3.0495153540611e-06, |
| "loss": 0.1906, |
| "num_input_tokens_seen": 2904512, |
| "step": 7080 |
| }, |
| { |
| "epoch": 8.577481840193705, |
| "grad_norm": 0.3810865581035614, |
| "learning_rate": 3.024281131664569e-06, |
| "loss": 0.1263, |
| "num_input_tokens_seen": 2906496, |
| "step": 7085 |
| }, |
| { |
| "epoch": 8.583535108958838, |
| "grad_norm": 0.37583184242248535, |
| "learning_rate": 2.999145024756325e-06, |
| "loss": 0.1573, |
| "num_input_tokens_seen": 2908544, |
| "step": 7090 |
| }, |
| { |
| "epoch": 8.58958837772397, |
| "grad_norm": 0.24153131246566772, |
| "learning_rate": 2.9741071455621245e-06, |
| "loss": 0.1571, |
| "num_input_tokens_seen": 2910688, |
| "step": 7095 |
| }, |
| { |
| "epoch": 8.595641646489105, |
| "grad_norm": 0.5701848268508911, |
| "learning_rate": 2.9491676058691437e-06, |
| "loss": 0.1798, |
| "num_input_tokens_seen": 2912832, |
| "step": 7100 |
| }, |
| { |
| "epoch": 8.601694915254237, |
| "grad_norm": 0.4430864453315735, |
| "learning_rate": 2.924326517025508e-06, |
| "loss": 0.2256, |
| "num_input_tokens_seen": 2914816, |
| "step": 7105 |
| }, |
| { |
| "epoch": 8.607748184019371, |
| "grad_norm": 0.2909928560256958, |
| "learning_rate": 2.8995839899397915e-06, |
| "loss": 0.1824, |
| "num_input_tokens_seen": 2916928, |
| "step": 7110 |
| }, |
| { |
| "epoch": 8.613801452784504, |
| "grad_norm": 0.32852888107299805, |
| "learning_rate": 2.8749401350805115e-06, |
| "loss": 0.1699, |
| "num_input_tokens_seen": 2918912, |
| "step": 7115 |
| }, |
| { |
| "epoch": 8.619854721549636, |
| "grad_norm": 0.33807820081710815, |
| "learning_rate": 2.8503950624756415e-06, |
| "loss": 0.1821, |
| "num_input_tokens_seen": 2920896, |
| "step": 7120 |
| }, |
| { |
| "epoch": 8.62590799031477, |
| "grad_norm": 0.24896188080310822, |
| "learning_rate": 2.825948881712123e-06, |
| "loss": 0.2142, |
| "num_input_tokens_seen": 2922976, |
| "step": 7125 |
| }, |
| { |
| "epoch": 8.631961259079903, |
| "grad_norm": 0.24985922873020172, |
| "learning_rate": 2.801601701935369e-06, |
| "loss": 0.1513, |
| "num_input_tokens_seen": 2925120, |
| "step": 7130 |
| }, |
| { |
| "epoch": 8.638014527845037, |
| "grad_norm": 0.3762378394603729, |
| "learning_rate": 2.777353631848789e-06, |
| "loss": 0.2049, |
| "num_input_tokens_seen": 2927072, |
| "step": 7135 |
| }, |
| { |
| "epoch": 8.64406779661017, |
| "grad_norm": 0.1994863599538803, |
| "learning_rate": 2.7532047797132867e-06, |
| "loss": 0.1737, |
| "num_input_tokens_seen": 2929152, |
| "step": 7140 |
| }, |
| { |
| "epoch": 8.650121065375302, |
| "grad_norm": 0.39066359400749207, |
| "learning_rate": 2.7291552533467853e-06, |
| "loss": 0.1784, |
| "num_input_tokens_seen": 2931136, |
| "step": 7145 |
| }, |
| { |
| "epoch": 8.656174334140436, |
| "grad_norm": 0.2963811457157135, |
| "learning_rate": 2.7052051601237473e-06, |
| "loss": 0.1584, |
| "num_input_tokens_seen": 2933312, |
| "step": 7150 |
| }, |
| { |
| "epoch": 8.662227602905569, |
| "grad_norm": 0.34648895263671875, |
| "learning_rate": 2.681354606974698e-06, |
| "loss": 0.1861, |
| "num_input_tokens_seen": 2935328, |
| "step": 7155 |
| }, |
| { |
| "epoch": 8.668280871670703, |
| "grad_norm": 0.21150127053260803, |
| "learning_rate": 2.6576037003857414e-06, |
| "loss": 0.175, |
| "num_input_tokens_seen": 2937216, |
| "step": 7160 |
| }, |
| { |
| "epoch": 8.674334140435835, |
| "grad_norm": 0.351188063621521, |
| "learning_rate": 2.633952546398083e-06, |
| "loss": 0.1618, |
| "num_input_tokens_seen": 2939168, |
| "step": 7165 |
| }, |
| { |
| "epoch": 8.680387409200968, |
| "grad_norm": 0.2172980010509491, |
| "learning_rate": 2.6104012506075692e-06, |
| "loss": 0.1944, |
| "num_input_tokens_seen": 2941504, |
| "step": 7170 |
| }, |
| { |
| "epoch": 8.686440677966102, |
| "grad_norm": 0.21408477425575256, |
| "learning_rate": 2.5869499181641916e-06, |
| "loss": 0.1847, |
| "num_input_tokens_seen": 2943520, |
| "step": 7175 |
| }, |
| { |
| "epoch": 8.692493946731235, |
| "grad_norm": 0.5822416543960571, |
| "learning_rate": 2.5635986537716538e-06, |
| "loss": 0.2187, |
| "num_input_tokens_seen": 2945568, |
| "step": 7180 |
| }, |
| { |
| "epoch": 8.698547215496369, |
| "grad_norm": 0.30243992805480957, |
| "learning_rate": 2.540347561686873e-06, |
| "loss": 0.1481, |
| "num_input_tokens_seen": 2947488, |
| "step": 7185 |
| }, |
| { |
| "epoch": 8.704600484261501, |
| "grad_norm": 0.6463418006896973, |
| "learning_rate": 2.5171967457195216e-06, |
| "loss": 0.2356, |
| "num_input_tokens_seen": 2949504, |
| "step": 7190 |
| }, |
| { |
| "epoch": 8.710653753026634, |
| "grad_norm": 0.2906031012535095, |
| "learning_rate": 2.494146309231571e-06, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 2951552, |
| "step": 7195 |
| }, |
| { |
| "epoch": 8.716707021791768, |
| "grad_norm": 0.6156424880027771, |
| "learning_rate": 2.471196355136826e-06, |
| "loss": 0.1994, |
| "num_input_tokens_seen": 2953632, |
| "step": 7200 |
| }, |
| { |
| "epoch": 8.7227602905569, |
| "grad_norm": 0.3577008843421936, |
| "learning_rate": 2.4483469859004625e-06, |
| "loss": 0.184, |
| "num_input_tokens_seen": 2955936, |
| "step": 7205 |
| }, |
| { |
| "epoch": 8.728813559322035, |
| "grad_norm": 0.31193807721138, |
| "learning_rate": 2.425598303538576e-06, |
| "loss": 0.1894, |
| "num_input_tokens_seen": 2958048, |
| "step": 7210 |
| }, |
| { |
| "epoch": 8.734866828087167, |
| "grad_norm": 0.36252132058143616, |
| "learning_rate": 2.402950409617727e-06, |
| "loss": 0.1971, |
| "num_input_tokens_seen": 2960160, |
| "step": 7215 |
| }, |
| { |
| "epoch": 8.7409200968523, |
| "grad_norm": 0.2776913642883301, |
| "learning_rate": 2.380403405254475e-06, |
| "loss": 0.2086, |
| "num_input_tokens_seen": 2962208, |
| "step": 7220 |
| }, |
| { |
| "epoch": 8.746973365617434, |
| "grad_norm": 0.10153469443321228, |
| "learning_rate": 2.35795739111494e-06, |
| "loss": 0.2099, |
| "num_input_tokens_seen": 2964320, |
| "step": 7225 |
| }, |
| { |
| "epoch": 8.753026634382566, |
| "grad_norm": 0.27656248211860657, |
| "learning_rate": 2.335612467414344e-06, |
| "loss": 0.1681, |
| "num_input_tokens_seen": 2966272, |
| "step": 7230 |
| }, |
| { |
| "epoch": 8.7590799031477, |
| "grad_norm": 0.401172012090683, |
| "learning_rate": 2.313368733916585e-06, |
| "loss": 0.2228, |
| "num_input_tokens_seen": 2968288, |
| "step": 7235 |
| }, |
| { |
| "epoch": 8.765133171912833, |
| "grad_norm": 0.3927387595176697, |
| "learning_rate": 2.291226289933751e-06, |
| "loss": 0.1878, |
| "num_input_tokens_seen": 2970208, |
| "step": 7240 |
| }, |
| { |
| "epoch": 8.771186440677965, |
| "grad_norm": 0.3676404356956482, |
| "learning_rate": 2.2691852343257157e-06, |
| "loss": 0.1645, |
| "num_input_tokens_seen": 2972352, |
| "step": 7245 |
| }, |
| { |
| "epoch": 8.7772397094431, |
| "grad_norm": 0.17031407356262207, |
| "learning_rate": 2.2472456654996755e-06, |
| "loss": 0.1373, |
| "num_input_tokens_seen": 2974368, |
| "step": 7250 |
| }, |
| { |
| "epoch": 8.783292978208232, |
| "grad_norm": 0.3022800087928772, |
| "learning_rate": 2.2254076814097163e-06, |
| "loss": 0.2461, |
| "num_input_tokens_seen": 2976288, |
| "step": 7255 |
| }, |
| { |
| "epoch": 8.789346246973366, |
| "grad_norm": 0.32072508335113525, |
| "learning_rate": 2.203671379556388e-06, |
| "loss": 0.2015, |
| "num_input_tokens_seen": 2978240, |
| "step": 7260 |
| }, |
| { |
| "epoch": 8.795399515738499, |
| "grad_norm": 0.3579932749271393, |
| "learning_rate": 2.1820368569862444e-06, |
| "loss": 0.2484, |
| "num_input_tokens_seen": 2980256, |
| "step": 7265 |
| }, |
| { |
| "epoch": 8.801452784503631, |
| "grad_norm": 0.37986764311790466, |
| "learning_rate": 2.1605042102914227e-06, |
| "loss": 0.207, |
| "num_input_tokens_seen": 2982400, |
| "step": 7270 |
| }, |
| { |
| "epoch": 8.807506053268765, |
| "grad_norm": 0.8521299362182617, |
| "learning_rate": 2.1390735356092206e-06, |
| "loss": 0.1842, |
| "num_input_tokens_seen": 2984416, |
| "step": 7275 |
| }, |
| { |
| "epoch": 8.813559322033898, |
| "grad_norm": 0.35832566022872925, |
| "learning_rate": 2.1177449286216565e-06, |
| "loss": 0.2122, |
| "num_input_tokens_seen": 2986496, |
| "step": 7280 |
| }, |
| { |
| "epoch": 8.819612590799032, |
| "grad_norm": 0.5020646452903748, |
| "learning_rate": 2.0965184845550407e-06, |
| "loss": 0.1581, |
| "num_input_tokens_seen": 2988512, |
| "step": 7285 |
| }, |
| { |
| "epoch": 8.825665859564165, |
| "grad_norm": 0.334235280752182, |
| "learning_rate": 2.075394298179553e-06, |
| "loss": 0.1968, |
| "num_input_tokens_seen": 2990560, |
| "step": 7290 |
| }, |
| { |
| "epoch": 8.831719128329297, |
| "grad_norm": 0.22929278016090393, |
| "learning_rate": 2.0543724638088347e-06, |
| "loss": 0.1538, |
| "num_input_tokens_seen": 2992768, |
| "step": 7295 |
| }, |
| { |
| "epoch": 8.837772397094431, |
| "grad_norm": 0.18368889391422272, |
| "learning_rate": 2.0334530752995433e-06, |
| "loss": 0.1475, |
| "num_input_tokens_seen": 2994784, |
| "step": 7300 |
| }, |
| { |
| "epoch": 8.843825665859564, |
| "grad_norm": 0.3872652053833008, |
| "learning_rate": 2.01263622605094e-06, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 2996896, |
| "step": 7305 |
| }, |
| { |
| "epoch": 8.849878934624698, |
| "grad_norm": 0.17723634839057922, |
| "learning_rate": 1.991922009004485e-06, |
| "loss": 0.1762, |
| "num_input_tokens_seen": 2998976, |
| "step": 7310 |
| }, |
| { |
| "epoch": 8.85593220338983, |
| "grad_norm": 0.3536140024662018, |
| "learning_rate": 1.9713105166434042e-06, |
| "loss": 0.1576, |
| "num_input_tokens_seen": 3001184, |
| "step": 7315 |
| }, |
| { |
| "epoch": 8.861985472154963, |
| "grad_norm": 0.2743946611881256, |
| "learning_rate": 1.950801840992303e-06, |
| "loss": 0.1668, |
| "num_input_tokens_seen": 3003168, |
| "step": 7320 |
| }, |
| { |
| "epoch": 8.868038740920097, |
| "grad_norm": 0.6186208128929138, |
| "learning_rate": 1.930396073616725e-06, |
| "loss": 0.1931, |
| "num_input_tokens_seen": 3005152, |
| "step": 7325 |
| }, |
| { |
| "epoch": 8.87409200968523, |
| "grad_norm": 0.306153267621994, |
| "learning_rate": 1.9100933056227593e-06, |
| "loss": 0.2081, |
| "num_input_tokens_seen": 3007200, |
| "step": 7330 |
| }, |
| { |
| "epoch": 8.880145278450364, |
| "grad_norm": 0.2928207814693451, |
| "learning_rate": 1.8898936276566303e-06, |
| "loss": 0.1824, |
| "num_input_tokens_seen": 3009280, |
| "step": 7335 |
| }, |
| { |
| "epoch": 8.886198547215496, |
| "grad_norm": 0.46753543615341187, |
| "learning_rate": 1.8697971299043048e-06, |
| "loss": 0.2075, |
| "num_input_tokens_seen": 3011360, |
| "step": 7340 |
| }, |
| { |
| "epoch": 8.892251815980629, |
| "grad_norm": 0.22565805912017822, |
| "learning_rate": 1.8498039020910628e-06, |
| "loss": 0.2196, |
| "num_input_tokens_seen": 3013568, |
| "step": 7345 |
| }, |
| { |
| "epoch": 8.898305084745763, |
| "grad_norm": 0.42717045545578003, |
| "learning_rate": 1.8299140334811226e-06, |
| "loss": 0.1849, |
| "num_input_tokens_seen": 3015552, |
| "step": 7350 |
| }, |
| { |
| "epoch": 8.904358353510895, |
| "grad_norm": 0.5215063095092773, |
| "learning_rate": 1.8101276128772272e-06, |
| "loss": 0.2409, |
| "num_input_tokens_seen": 3017536, |
| "step": 7355 |
| }, |
| { |
| "epoch": 8.91041162227603, |
| "grad_norm": 0.49285098910331726, |
| "learning_rate": 1.7904447286202607e-06, |
| "loss": 0.2434, |
| "num_input_tokens_seen": 3019584, |
| "step": 7360 |
| }, |
| { |
| "epoch": 8.916464891041162, |
| "grad_norm": 0.3806793987751007, |
| "learning_rate": 1.7708654685888337e-06, |
| "loss": 0.1523, |
| "num_input_tokens_seen": 3021728, |
| "step": 7365 |
| }, |
| { |
| "epoch": 8.922518159806295, |
| "grad_norm": 0.5680190920829773, |
| "learning_rate": 1.7513899201989148e-06, |
| "loss": 0.1775, |
| "num_input_tokens_seen": 3023584, |
| "step": 7370 |
| }, |
| { |
| "epoch": 8.928571428571429, |
| "grad_norm": 0.360101580619812, |
| "learning_rate": 1.7320181704034237e-06, |
| "loss": 0.1497, |
| "num_input_tokens_seen": 3025600, |
| "step": 7375 |
| }, |
| { |
| "epoch": 8.934624697336561, |
| "grad_norm": 0.27625852823257446, |
| "learning_rate": 1.7127503056918542e-06, |
| "loss": 0.1891, |
| "num_input_tokens_seen": 3027680, |
| "step": 7380 |
| }, |
| { |
| "epoch": 8.940677966101696, |
| "grad_norm": 0.4202336370944977, |
| "learning_rate": 1.6935864120898704e-06, |
| "loss": 0.149, |
| "num_input_tokens_seen": 3029856, |
| "step": 7385 |
| }, |
| { |
| "epoch": 8.946731234866828, |
| "grad_norm": 0.32964271306991577, |
| "learning_rate": 1.674526575158944e-06, |
| "loss": 0.1933, |
| "num_input_tokens_seen": 3032096, |
| "step": 7390 |
| }, |
| { |
| "epoch": 8.95278450363196, |
| "grad_norm": 0.29182112216949463, |
| "learning_rate": 1.6555708799959547e-06, |
| "loss": 0.2014, |
| "num_input_tokens_seen": 3034112, |
| "step": 7395 |
| }, |
| { |
| "epoch": 8.958837772397095, |
| "grad_norm": 0.23933826386928558, |
| "learning_rate": 1.6367194112328288e-06, |
| "loss": 0.1896, |
| "num_input_tokens_seen": 3036096, |
| "step": 7400 |
| }, |
| { |
| "epoch": 8.964891041162227, |
| "grad_norm": 0.24577218294143677, |
| "learning_rate": 1.617972253036143e-06, |
| "loss": 0.1706, |
| "num_input_tokens_seen": 3038080, |
| "step": 7405 |
| }, |
| { |
| "epoch": 8.970944309927361, |
| "grad_norm": 0.5137957334518433, |
| "learning_rate": 1.5993294891067573e-06, |
| "loss": 0.178, |
| "num_input_tokens_seen": 3040064, |
| "step": 7410 |
| }, |
| { |
| "epoch": 8.976997578692494, |
| "grad_norm": 0.27902355790138245, |
| "learning_rate": 1.580791202679438e-06, |
| "loss": 0.1823, |
| "num_input_tokens_seen": 3042048, |
| "step": 7415 |
| }, |
| { |
| "epoch": 8.983050847457626, |
| "grad_norm": 0.5112169981002808, |
| "learning_rate": 1.562357476522497e-06, |
| "loss": 0.1863, |
| "num_input_tokens_seen": 3044192, |
| "step": 7420 |
| }, |
| { |
| "epoch": 8.98910411622276, |
| "grad_norm": 0.4094531834125519, |
| "learning_rate": 1.5440283929374023e-06, |
| "loss": 0.229, |
| "num_input_tokens_seen": 3046112, |
| "step": 7425 |
| }, |
| { |
| "epoch": 8.995157384987893, |
| "grad_norm": 0.752479612827301, |
| "learning_rate": 1.5258040337584322e-06, |
| "loss": 0.2554, |
| "num_input_tokens_seen": 3048000, |
| "step": 7430 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.1926165670156479, |
| "eval_runtime": 7.6496, |
| "eval_samples_per_second": 47.977, |
| "eval_steps_per_second": 12.027, |
| "num_input_tokens_seen": 3049392, |
| "step": 7434 |
| }, |
| { |
| "epoch": 9.001210653753027, |
| "grad_norm": 0.5370000004768372, |
| "learning_rate": 1.5076844803522922e-06, |
| "loss": 0.1748, |
| "num_input_tokens_seen": 3049808, |
| "step": 7435 |
| }, |
| { |
| "epoch": 9.00726392251816, |
| "grad_norm": 0.3140076994895935, |
| "learning_rate": 1.4896698136177612e-06, |
| "loss": 0.1432, |
| "num_input_tokens_seen": 3051792, |
| "step": 7440 |
| }, |
| { |
| "epoch": 9.013317191283292, |
| "grad_norm": 0.4940311312675476, |
| "learning_rate": 1.4717601139853266e-06, |
| "loss": 0.2158, |
| "num_input_tokens_seen": 3053776, |
| "step": 7445 |
| }, |
| { |
| "epoch": 9.019370460048426, |
| "grad_norm": 0.16382355988025665, |
| "learning_rate": 1.4539554614168339e-06, |
| "loss": 0.2002, |
| "num_input_tokens_seen": 3055856, |
| "step": 7450 |
| }, |
| { |
| "epoch": 9.025423728813559, |
| "grad_norm": 0.32316985726356506, |
| "learning_rate": 1.4362559354051092e-06, |
| "loss": 0.2219, |
| "num_input_tokens_seen": 3057840, |
| "step": 7455 |
| }, |
| { |
| "epoch": 9.031476997578693, |
| "grad_norm": 0.2876828908920288, |
| "learning_rate": 1.4186616149736349e-06, |
| "loss": 0.2451, |
| "num_input_tokens_seen": 3059920, |
| "step": 7460 |
| }, |
| { |
| "epoch": 9.037530266343826, |
| "grad_norm": 0.4233318567276001, |
| "learning_rate": 1.401172578676166e-06, |
| "loss": 0.1982, |
| "num_input_tokens_seen": 3061872, |
| "step": 7465 |
| }, |
| { |
| "epoch": 9.043583535108958, |
| "grad_norm": 0.5016361474990845, |
| "learning_rate": 1.383788904596403e-06, |
| "loss": 0.1715, |
| "num_input_tokens_seen": 3063888, |
| "step": 7470 |
| }, |
| { |
| "epoch": 9.049636803874092, |
| "grad_norm": 0.4053129255771637, |
| "learning_rate": 1.3665106703476178e-06, |
| "loss": 0.2163, |
| "num_input_tokens_seen": 3065872, |
| "step": 7475 |
| }, |
| { |
| "epoch": 9.055690072639225, |
| "grad_norm": 0.22209487855434418, |
| "learning_rate": 1.349337953072341e-06, |
| "loss": 0.1541, |
| "num_input_tokens_seen": 3067888, |
| "step": 7480 |
| }, |
| { |
| "epoch": 9.061743341404359, |
| "grad_norm": 0.4888889789581299, |
| "learning_rate": 1.3322708294419923e-06, |
| "loss": 0.2217, |
| "num_input_tokens_seen": 3069968, |
| "step": 7485 |
| }, |
| { |
| "epoch": 9.067796610169491, |
| "grad_norm": 0.47201064229011536, |
| "learning_rate": 1.3153093756565426e-06, |
| "loss": 0.18, |
| "num_input_tokens_seen": 3072176, |
| "step": 7490 |
| }, |
| { |
| "epoch": 9.073849878934624, |
| "grad_norm": 0.1938951015472412, |
| "learning_rate": 1.298453667444169e-06, |
| "loss": 0.2431, |
| "num_input_tokens_seen": 3074288, |
| "step": 7495 |
| }, |
| { |
| "epoch": 9.079903147699758, |
| "grad_norm": 0.6183655261993408, |
| "learning_rate": 1.281703780060947e-06, |
| "loss": 0.1891, |
| "num_input_tokens_seen": 3076304, |
| "step": 7500 |
| }, |
| { |
| "epoch": 9.08595641646489, |
| "grad_norm": 0.5016622543334961, |
| "learning_rate": 1.265059788290468e-06, |
| "loss": 0.1894, |
| "num_input_tokens_seen": 3078320, |
| "step": 7505 |
| }, |
| { |
| "epoch": 9.092009685230025, |
| "grad_norm": 0.17877379059791565, |
| "learning_rate": 1.2485217664435418e-06, |
| "loss": 0.2009, |
| "num_input_tokens_seen": 3080464, |
| "step": 7510 |
| }, |
| { |
| "epoch": 9.098062953995157, |
| "grad_norm": 0.17705337703227997, |
| "learning_rate": 1.232089788357843e-06, |
| "loss": 0.188, |
| "num_input_tokens_seen": 3082672, |
| "step": 7515 |
| }, |
| { |
| "epoch": 9.104116222760291, |
| "grad_norm": 0.5186365842819214, |
| "learning_rate": 1.2157639273975979e-06, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 3084720, |
| "step": 7520 |
| }, |
| { |
| "epoch": 9.110169491525424, |
| "grad_norm": 0.21987123787403107, |
| "learning_rate": 1.19954425645325e-06, |
| "loss": 0.1731, |
| "num_input_tokens_seen": 3086864, |
| "step": 7525 |
| }, |
| { |
| "epoch": 9.116222760290556, |
| "grad_norm": 0.4533383846282959, |
| "learning_rate": 1.183430847941125e-06, |
| "loss": 0.1842, |
| "num_input_tokens_seen": 3088880, |
| "step": 7530 |
| }, |
| { |
| "epoch": 9.12227602905569, |
| "grad_norm": 0.43044987320899963, |
| "learning_rate": 1.1674237738031223e-06, |
| "loss": 0.16, |
| "num_input_tokens_seen": 3090960, |
| "step": 7535 |
| }, |
| { |
| "epoch": 9.128329297820823, |
| "grad_norm": 0.2750639021396637, |
| "learning_rate": 1.1515231055063914e-06, |
| "loss": 0.1521, |
| "num_input_tokens_seen": 3093040, |
| "step": 7540 |
| }, |
| { |
| "epoch": 9.134382566585957, |
| "grad_norm": 0.2573758363723755, |
| "learning_rate": 1.135728914043005e-06, |
| "loss": 0.1667, |
| "num_input_tokens_seen": 3095024, |
| "step": 7545 |
| }, |
| { |
| "epoch": 9.14043583535109, |
| "grad_norm": 0.2806335389614105, |
| "learning_rate": 1.120041269929642e-06, |
| "loss": 0.2177, |
| "num_input_tokens_seen": 3097008, |
| "step": 7550 |
| }, |
| { |
| "epoch": 9.146489104116222, |
| "grad_norm": 0.19661761820316315, |
| "learning_rate": 1.1044602432072836e-06, |
| "loss": 0.1562, |
| "num_input_tokens_seen": 3099184, |
| "step": 7555 |
| }, |
| { |
| "epoch": 9.152542372881356, |
| "grad_norm": 0.40045350790023804, |
| "learning_rate": 1.0889859034408922e-06, |
| "loss": 0.2808, |
| "num_input_tokens_seen": 3101328, |
| "step": 7560 |
| }, |
| { |
| "epoch": 9.158595641646489, |
| "grad_norm": 0.4137887954711914, |
| "learning_rate": 1.0736183197191024e-06, |
| "loss": 0.2032, |
| "num_input_tokens_seen": 3103408, |
| "step": 7565 |
| }, |
| { |
| "epoch": 9.164648910411623, |
| "grad_norm": 0.34546148777008057, |
| "learning_rate": 1.0583575606539108e-06, |
| "loss": 0.2461, |
| "num_input_tokens_seen": 3105616, |
| "step": 7570 |
| }, |
| { |
| "epoch": 9.170702179176756, |
| "grad_norm": 0.24163086712360382, |
| "learning_rate": 1.0432036943803708e-06, |
| "loss": 0.164, |
| "num_input_tokens_seen": 3107536, |
| "step": 7575 |
| }, |
| { |
| "epoch": 9.176755447941888, |
| "grad_norm": 0.5265182256698608, |
| "learning_rate": 1.0281567885562947e-06, |
| "loss": 0.1851, |
| "num_input_tokens_seen": 3109648, |
| "step": 7580 |
| }, |
| { |
| "epoch": 9.182808716707022, |
| "grad_norm": 0.5361573696136475, |
| "learning_rate": 1.0132169103619444e-06, |
| "loss": 0.185, |
| "num_input_tokens_seen": 3111504, |
| "step": 7585 |
| }, |
| { |
| "epoch": 9.188861985472155, |
| "grad_norm": 0.3071657419204712, |
| "learning_rate": 9.98384126499735e-07, |
| "loss": 0.201, |
| "num_input_tokens_seen": 3113424, |
| "step": 7590 |
| }, |
| { |
| "epoch": 9.194915254237289, |
| "grad_norm": 0.6272903680801392, |
| "learning_rate": 9.836585031939154e-07, |
| "loss": 0.1906, |
| "num_input_tokens_seen": 3115504, |
| "step": 7595 |
| }, |
| { |
| "epoch": 9.200968523002421, |
| "grad_norm": 0.45092782378196716, |
| "learning_rate": 9.690401061903249e-07, |
| "loss": 0.2563, |
| "num_input_tokens_seen": 3117488, |
| "step": 7600 |
| }, |
| { |
| "epoch": 9.207021791767554, |
| "grad_norm": 0.24368023872375488, |
| "learning_rate": 9.545290007560437e-07, |
| "loss": 0.2027, |
| "num_input_tokens_seen": 3119376, |
| "step": 7605 |
| }, |
| { |
| "epoch": 9.213075060532688, |
| "grad_norm": 0.4679562747478485, |
| "learning_rate": 9.401252516791304e-07, |
| "loss": 0.1892, |
| "num_input_tokens_seen": 3121424, |
| "step": 7610 |
| }, |
| { |
| "epoch": 9.21912832929782, |
| "grad_norm": 0.21977920830249786, |
| "learning_rate": 9.258289232683321e-07, |
| "loss": 0.1643, |
| "num_input_tokens_seen": 3123504, |
| "step": 7615 |
| }, |
| { |
| "epoch": 9.225181598062955, |
| "grad_norm": 0.38903844356536865, |
| "learning_rate": 9.11640079352788e-07, |
| "loss": 0.1724, |
| "num_input_tokens_seen": 3125712, |
| "step": 7620 |
| }, |
| { |
| "epoch": 9.231234866828087, |
| "grad_norm": 0.3856162428855896, |
| "learning_rate": 8.975587832817545e-07, |
| "loss": 0.1916, |
| "num_input_tokens_seen": 3127824, |
| "step": 7625 |
| }, |
| { |
| "epoch": 9.23728813559322, |
| "grad_norm": 0.2965726852416992, |
| "learning_rate": 8.835850979243055e-07, |
| "loss": 0.2049, |
| "num_input_tokens_seen": 3129936, |
| "step": 7630 |
| }, |
| { |
| "epoch": 9.243341404358354, |
| "grad_norm": 0.2608250081539154, |
| "learning_rate": 8.697190856690685e-07, |
| "loss": 0.1771, |
| "num_input_tokens_seen": 3131984, |
| "step": 7635 |
| }, |
| { |
| "epoch": 9.249394673123486, |
| "grad_norm": 0.36251935362815857, |
| "learning_rate": 8.559608084239474e-07, |
| "loss": 0.1963, |
| "num_input_tokens_seen": 3134064, |
| "step": 7640 |
| }, |
| { |
| "epoch": 9.25544794188862, |
| "grad_norm": 0.4114110469818115, |
| "learning_rate": 8.423103276158306e-07, |
| "loss": 0.1484, |
| "num_input_tokens_seen": 3136176, |
| "step": 7645 |
| }, |
| { |
| "epoch": 9.261501210653753, |
| "grad_norm": 0.40797290205955505, |
| "learning_rate": 8.287677041903308e-07, |
| "loss": 0.1948, |
| "num_input_tokens_seen": 3138288, |
| "step": 7650 |
| }, |
| { |
| "epoch": 9.267554479418886, |
| "grad_norm": 0.5304010510444641, |
| "learning_rate": 8.15332998611501e-07, |
| "loss": 0.175, |
| "num_input_tokens_seen": 3140272, |
| "step": 7655 |
| }, |
| { |
| "epoch": 9.27360774818402, |
| "grad_norm": 0.1611151099205017, |
| "learning_rate": 8.020062708615745e-07, |
| "loss": 0.2121, |
| "num_input_tokens_seen": 3142448, |
| "step": 7660 |
| }, |
| { |
| "epoch": 9.279661016949152, |
| "grad_norm": 0.5812345743179321, |
| "learning_rate": 7.887875804406946e-07, |
| "loss": 0.211, |
| "num_input_tokens_seen": 3144528, |
| "step": 7665 |
| }, |
| { |
| "epoch": 9.285714285714286, |
| "grad_norm": 0.43123534321784973, |
| "learning_rate": 7.756769863666524e-07, |
| "loss": 0.1982, |
| "num_input_tokens_seen": 3146512, |
| "step": 7670 |
| }, |
| { |
| "epoch": 9.291767554479419, |
| "grad_norm": 0.28313106298446655, |
| "learning_rate": 7.626745471746022e-07, |
| "loss": 0.1675, |
| "num_input_tokens_seen": 3148560, |
| "step": 7675 |
| }, |
| { |
| "epoch": 9.297820823244551, |
| "grad_norm": 0.35335543751716614, |
| "learning_rate": 7.497803209168347e-07, |
| "loss": 0.1945, |
| "num_input_tokens_seen": 3150640, |
| "step": 7680 |
| }, |
| { |
| "epoch": 9.303874092009686, |
| "grad_norm": 0.22999970614910126, |
| "learning_rate": 7.369943651624938e-07, |
| "loss": 0.1879, |
| "num_input_tokens_seen": 3152688, |
| "step": 7685 |
| }, |
| { |
| "epoch": 9.309927360774818, |
| "grad_norm": 0.5014199614524841, |
| "learning_rate": 7.243167369973242e-07, |
| "loss": 0.1967, |
| "num_input_tokens_seen": 3154672, |
| "step": 7690 |
| }, |
| { |
| "epoch": 9.315980629539952, |
| "grad_norm": 0.2948695719242096, |
| "learning_rate": 7.117474930234124e-07, |
| "loss": 0.226, |
| "num_input_tokens_seen": 3156656, |
| "step": 7695 |
| }, |
| { |
| "epoch": 9.322033898305085, |
| "grad_norm": 0.34910848736763, |
| "learning_rate": 6.992866893589578e-07, |
| "loss": 0.1711, |
| "num_input_tokens_seen": 3158640, |
| "step": 7700 |
| }, |
| { |
| "epoch": 9.328087167070217, |
| "grad_norm": 0.27760210633277893, |
| "learning_rate": 6.869343816379825e-07, |
| "loss": 0.1765, |
| "num_input_tokens_seen": 3160624, |
| "step": 7705 |
| }, |
| { |
| "epoch": 9.334140435835351, |
| "grad_norm": 0.3043615221977234, |
| "learning_rate": 6.74690625010116e-07, |
| "loss": 0.1904, |
| "num_input_tokens_seen": 3162608, |
| "step": 7710 |
| }, |
| { |
| "epoch": 9.340193704600484, |
| "grad_norm": 0.33441075682640076, |
| "learning_rate": 6.625554741403333e-07, |
| "loss": 0.2033, |
| "num_input_tokens_seen": 3164560, |
| "step": 7715 |
| }, |
| { |
| "epoch": 9.346246973365618, |
| "grad_norm": 0.4149807095527649, |
| "learning_rate": 6.505289832087231e-07, |
| "loss": 0.2094, |
| "num_input_tokens_seen": 3166768, |
| "step": 7720 |
| }, |
| { |
| "epoch": 9.35230024213075, |
| "grad_norm": 0.18698649108409882, |
| "learning_rate": 6.386112059102251e-07, |
| "loss": 0.2317, |
| "num_input_tokens_seen": 3168912, |
| "step": 7725 |
| }, |
| { |
| "epoch": 9.358353510895883, |
| "grad_norm": 0.42502710223197937, |
| "learning_rate": 6.268021954544096e-07, |
| "loss": 0.1946, |
| "num_input_tokens_seen": 3170800, |
| "step": 7730 |
| }, |
| { |
| "epoch": 9.364406779661017, |
| "grad_norm": 0.22544017434120178, |
| "learning_rate": 6.15102004565235e-07, |
| "loss": 0.2103, |
| "num_input_tokens_seen": 3172784, |
| "step": 7735 |
| }, |
| { |
| "epoch": 9.37046004842615, |
| "grad_norm": 0.452474981546402, |
| "learning_rate": 6.035106854808014e-07, |
| "loss": 0.1711, |
| "num_input_tokens_seen": 3174928, |
| "step": 7740 |
| }, |
| { |
| "epoch": 9.376513317191284, |
| "grad_norm": 0.4445652961730957, |
| "learning_rate": 5.920282899531421e-07, |
| "loss": 0.2139, |
| "num_input_tokens_seen": 3176976, |
| "step": 7745 |
| }, |
| { |
| "epoch": 9.382566585956416, |
| "grad_norm": 0.25134438276290894, |
| "learning_rate": 5.806548692479624e-07, |
| "loss": 0.1928, |
| "num_input_tokens_seen": 3178896, |
| "step": 7750 |
| }, |
| { |
| "epoch": 9.388619854721549, |
| "grad_norm": 0.527720034122467, |
| "learning_rate": 5.693904741444267e-07, |
| "loss": 0.1709, |
| "num_input_tokens_seen": 3180848, |
| "step": 7755 |
| }, |
| { |
| "epoch": 9.394673123486683, |
| "grad_norm": 0.19415412843227386, |
| "learning_rate": 5.58235154934944e-07, |
| "loss": 0.1425, |
| "num_input_tokens_seen": 3182704, |
| "step": 7760 |
| }, |
| { |
| "epoch": 9.400726392251816, |
| "grad_norm": 0.248501718044281, |
| "learning_rate": 5.471889614249104e-07, |
| "loss": 0.1489, |
| "num_input_tokens_seen": 3184848, |
| "step": 7765 |
| }, |
| { |
| "epoch": 9.40677966101695, |
| "grad_norm": 0.3817324936389923, |
| "learning_rate": 5.362519429325225e-07, |
| "loss": 0.2635, |
| "num_input_tokens_seen": 3186832, |
| "step": 7770 |
| }, |
| { |
| "epoch": 9.412832929782082, |
| "grad_norm": 0.2975015938282013, |
| "learning_rate": 5.254241482885253e-07, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 3188912, |
| "step": 7775 |
| }, |
| { |
| "epoch": 9.418886198547215, |
| "grad_norm": 0.32076549530029297, |
| "learning_rate": 5.147056258360289e-07, |
| "loss": 0.1798, |
| "num_input_tokens_seen": 3191152, |
| "step": 7780 |
| }, |
| { |
| "epoch": 9.424939467312349, |
| "grad_norm": 0.17415617406368256, |
| "learning_rate": 5.040964234302559e-07, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 3193232, |
| "step": 7785 |
| }, |
| { |
| "epoch": 9.430992736077481, |
| "grad_norm": 0.46165698766708374, |
| "learning_rate": 4.935965884383525e-07, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 3195312, |
| "step": 7790 |
| }, |
| { |
| "epoch": 9.437046004842616, |
| "grad_norm": 0.2524760067462921, |
| "learning_rate": 4.832061677391697e-07, |
| "loss": 0.178, |
| "num_input_tokens_seen": 3197328, |
| "step": 7795 |
| }, |
| { |
| "epoch": 9.443099273607748, |
| "grad_norm": 0.3904651999473572, |
| "learning_rate": 4.729252077230517e-07, |
| "loss": 0.1873, |
| "num_input_tokens_seen": 3199280, |
| "step": 7800 |
| }, |
| { |
| "epoch": 9.44915254237288, |
| "grad_norm": 0.44877561926841736, |
| "learning_rate": 4.6275375429163656e-07, |
| "loss": 0.1588, |
| "num_input_tokens_seen": 3201328, |
| "step": 7805 |
| }, |
| { |
| "epoch": 9.455205811138015, |
| "grad_norm": 0.22410571575164795, |
| "learning_rate": 4.526918528576396e-07, |
| "loss": 0.1754, |
| "num_input_tokens_seen": 3203344, |
| "step": 7810 |
| }, |
| { |
| "epoch": 9.461259079903147, |
| "grad_norm": 0.19009947776794434, |
| "learning_rate": 4.427395483446617e-07, |
| "loss": 0.1817, |
| "num_input_tokens_seen": 3205488, |
| "step": 7815 |
| }, |
| { |
| "epoch": 9.467312348668282, |
| "grad_norm": 0.3117964565753937, |
| "learning_rate": 4.328968851869758e-07, |
| "loss": 0.205, |
| "num_input_tokens_seen": 3207504, |
| "step": 7820 |
| }, |
| { |
| "epoch": 9.473365617433414, |
| "grad_norm": 0.48840615153312683, |
| "learning_rate": 4.231639073293492e-07, |
| "loss": 0.1772, |
| "num_input_tokens_seen": 3209712, |
| "step": 7825 |
| }, |
| { |
| "epoch": 9.479418886198546, |
| "grad_norm": 0.23759415745735168, |
| "learning_rate": 4.13540658226827e-07, |
| "loss": 0.1878, |
| "num_input_tokens_seen": 3211728, |
| "step": 7830 |
| }, |
| { |
| "epoch": 9.48547215496368, |
| "grad_norm": 0.43204250931739807, |
| "learning_rate": 4.040271808445406e-07, |
| "loss": 0.2174, |
| "num_input_tokens_seen": 3213616, |
| "step": 7835 |
| }, |
| { |
| "epoch": 9.491525423728813, |
| "grad_norm": 0.24293774366378784, |
| "learning_rate": 3.94623517657533e-07, |
| "loss": 0.1473, |
| "num_input_tokens_seen": 3215536, |
| "step": 7840 |
| }, |
| { |
| "epoch": 9.497578692493947, |
| "grad_norm": 0.24489519000053406, |
| "learning_rate": 3.8532971065055045e-07, |
| "loss": 0.2058, |
| "num_input_tokens_seen": 3217552, |
| "step": 7845 |
| }, |
| { |
| "epoch": 9.5, |
| "eval_loss": 0.19325600564479828, |
| "eval_runtime": 7.6751, |
| "eval_samples_per_second": 47.817, |
| "eval_steps_per_second": 11.987, |
| "num_input_tokens_seen": 3218352, |
| "step": 7847 |
| }, |
| { |
| "epoch": 9.50363196125908, |
| "grad_norm": 0.316686749458313, |
| "learning_rate": 3.761458013178648e-07, |
| "loss": 0.179, |
| "num_input_tokens_seen": 3219664, |
| "step": 7850 |
| }, |
| { |
| "epoch": 9.509685230024212, |
| "grad_norm": 0.5225061178207397, |
| "learning_rate": 3.670718306630766e-07, |
| "loss": 0.1722, |
| "num_input_tokens_seen": 3221648, |
| "step": 7855 |
| }, |
| { |
| "epoch": 9.515738498789347, |
| "grad_norm": 0.3102196455001831, |
| "learning_rate": 3.5810783919895673e-07, |
| "loss": 0.1579, |
| "num_input_tokens_seen": 3223632, |
| "step": 7860 |
| }, |
| { |
| "epoch": 9.521791767554479, |
| "grad_norm": 0.4766305387020111, |
| "learning_rate": 3.4925386694723284e-07, |
| "loss": 0.2126, |
| "num_input_tokens_seen": 3225616, |
| "step": 7865 |
| }, |
| { |
| "epoch": 9.527845036319613, |
| "grad_norm": 0.4590197503566742, |
| "learning_rate": 3.405099534384393e-07, |
| "loss": 0.1998, |
| "num_input_tokens_seen": 3227728, |
| "step": 7870 |
| }, |
| { |
| "epoch": 9.533898305084746, |
| "grad_norm": 0.5207440257072449, |
| "learning_rate": 3.31876137711723e-07, |
| "loss": 0.2009, |
| "num_input_tokens_seen": 3229744, |
| "step": 7875 |
| }, |
| { |
| "epoch": 9.539951573849878, |
| "grad_norm": 0.37301743030548096, |
| "learning_rate": 3.233524583146741e-07, |
| "loss": 0.202, |
| "num_input_tokens_seen": 3231664, |
| "step": 7880 |
| }, |
| { |
| "epoch": 9.546004842615012, |
| "grad_norm": 0.19768239557743073, |
| "learning_rate": 3.149389533031566e-07, |
| "loss": 0.1636, |
| "num_input_tokens_seen": 3233712, |
| "step": 7885 |
| }, |
| { |
| "epoch": 9.552058111380145, |
| "grad_norm": 0.5510238409042358, |
| "learning_rate": 3.066356602411419e-07, |
| "loss": 0.1716, |
| "num_input_tokens_seen": 3235728, |
| "step": 7890 |
| }, |
| { |
| "epoch": 9.558111380145279, |
| "grad_norm": 0.3876868486404419, |
| "learning_rate": 2.984426162005227e-07, |
| "loss": 0.221, |
| "num_input_tokens_seen": 3237712, |
| "step": 7895 |
| }, |
| { |
| "epoch": 9.564164648910412, |
| "grad_norm": 0.6382808685302734, |
| "learning_rate": 2.903598577609717e-07, |
| "loss": 0.2248, |
| "num_input_tokens_seen": 3239664, |
| "step": 7900 |
| }, |
| { |
| "epoch": 9.570217917675544, |
| "grad_norm": 0.533298134803772, |
| "learning_rate": 2.823874210097638e-07, |
| "loss": 0.2, |
| "num_input_tokens_seen": 3241776, |
| "step": 7905 |
| }, |
| { |
| "epoch": 9.576271186440678, |
| "grad_norm": 0.6512199640274048, |
| "learning_rate": 2.745253415416177e-07, |
| "loss": 0.2053, |
| "num_input_tokens_seen": 3243824, |
| "step": 7910 |
| }, |
| { |
| "epoch": 9.58232445520581, |
| "grad_norm": 0.16263864934444427, |
| "learning_rate": 2.6677365445852976e-07, |
| "loss": 0.1827, |
| "num_input_tokens_seen": 3246000, |
| "step": 7915 |
| }, |
| { |
| "epoch": 9.588377723970945, |
| "grad_norm": 0.3981066942214966, |
| "learning_rate": 2.5913239436964054e-07, |
| "loss": 0.1771, |
| "num_input_tokens_seen": 3248112, |
| "step": 7920 |
| }, |
| { |
| "epoch": 9.594430992736077, |
| "grad_norm": 0.3139074444770813, |
| "learning_rate": 2.5160159539105443e-07, |
| "loss": 0.174, |
| "num_input_tokens_seen": 3250256, |
| "step": 7925 |
| }, |
| { |
| "epoch": 9.600484261501212, |
| "grad_norm": 0.5219495892524719, |
| "learning_rate": 2.441812911456981e-07, |
| "loss": 0.2233, |
| "num_input_tokens_seen": 3252368, |
| "step": 7930 |
| }, |
| { |
| "epoch": 9.606537530266344, |
| "grad_norm": 0.45383334159851074, |
| "learning_rate": 2.3687151476317337e-07, |
| "loss": 0.2179, |
| "num_input_tokens_seen": 3254416, |
| "step": 7935 |
| }, |
| { |
| "epoch": 9.612590799031477, |
| "grad_norm": 0.5202348232269287, |
| "learning_rate": 2.2967229887960186e-07, |
| "loss": 0.2432, |
| "num_input_tokens_seen": 3256496, |
| "step": 7940 |
| }, |
| { |
| "epoch": 9.61864406779661, |
| "grad_norm": 0.4350161850452423, |
| "learning_rate": 2.2258367563748884e-07, |
| "loss": 0.2408, |
| "num_input_tokens_seen": 3258576, |
| "step": 7945 |
| }, |
| { |
| "epoch": 9.624697336561743, |
| "grad_norm": 0.4891444146633148, |
| "learning_rate": 2.1560567668556797e-07, |
| "loss": 0.2212, |
| "num_input_tokens_seen": 3260496, |
| "step": 7950 |
| }, |
| { |
| "epoch": 9.630750605326877, |
| "grad_norm": 0.6660648584365845, |
| "learning_rate": 2.0873833317866798e-07, |
| "loss": 0.2182, |
| "num_input_tokens_seen": 3262608, |
| "step": 7955 |
| }, |
| { |
| "epoch": 9.63680387409201, |
| "grad_norm": 0.31146183609962463, |
| "learning_rate": 2.019816757775711e-07, |
| "loss": 0.1838, |
| "num_input_tokens_seen": 3264592, |
| "step": 7960 |
| }, |
| { |
| "epoch": 9.642857142857142, |
| "grad_norm": 0.14717160165309906, |
| "learning_rate": 1.9533573464888543e-07, |
| "loss": 0.1659, |
| "num_input_tokens_seen": 3266704, |
| "step": 7965 |
| }, |
| { |
| "epoch": 9.648910411622277, |
| "grad_norm": 0.2484346628189087, |
| "learning_rate": 1.8880053946488675e-07, |
| "loss": 0.2012, |
| "num_input_tokens_seen": 3268816, |
| "step": 7970 |
| }, |
| { |
| "epoch": 9.654963680387409, |
| "grad_norm": 0.39253896474838257, |
| "learning_rate": 1.8237611940341291e-07, |
| "loss": 0.2205, |
| "num_input_tokens_seen": 3270864, |
| "step": 7975 |
| }, |
| { |
| "epoch": 9.661016949152543, |
| "grad_norm": 0.42186811566352844, |
| "learning_rate": 1.760625031477142e-07, |
| "loss": 0.203, |
| "num_input_tokens_seen": 3272944, |
| "step": 7980 |
| }, |
| { |
| "epoch": 9.667070217917676, |
| "grad_norm": 0.6267507076263428, |
| "learning_rate": 1.6985971888633935e-07, |
| "loss": 0.1879, |
| "num_input_tokens_seen": 3274992, |
| "step": 7985 |
| }, |
| { |
| "epoch": 9.673123486682808, |
| "grad_norm": 0.2953133285045624, |
| "learning_rate": 1.637677943129967e-07, |
| "loss": 0.1322, |
| "num_input_tokens_seen": 3277008, |
| "step": 7990 |
| }, |
| { |
| "epoch": 9.679176755447942, |
| "grad_norm": 0.2336490899324417, |
| "learning_rate": 1.5778675662643793e-07, |
| "loss": 0.1864, |
| "num_input_tokens_seen": 3278928, |
| "step": 7995 |
| }, |
| { |
| "epoch": 9.685230024213075, |
| "grad_norm": 0.362797349691391, |
| "learning_rate": 1.5191663253034116e-07, |
| "loss": 0.2052, |
| "num_input_tokens_seen": 3280944, |
| "step": 8000 |
| }, |
| { |
| "epoch": 9.69128329297821, |
| "grad_norm": 0.3932698965072632, |
| "learning_rate": 1.461574482331779e-07, |
| "loss": 0.2065, |
| "num_input_tokens_seen": 3282960, |
| "step": 8005 |
| }, |
| { |
| "epoch": 9.697336561743342, |
| "grad_norm": 0.37164241075515747, |
| "learning_rate": 1.4050922944811305e-07, |
| "loss": 0.1541, |
| "num_input_tokens_seen": 3285008, |
| "step": 8010 |
| }, |
| { |
| "epoch": 9.703389830508474, |
| "grad_norm": 0.608330249786377, |
| "learning_rate": 1.349720013928718e-07, |
| "loss": 0.2235, |
| "num_input_tokens_seen": 3287088, |
| "step": 8015 |
| }, |
| { |
| "epoch": 9.709443099273608, |
| "grad_norm": 0.4523017108440399, |
| "learning_rate": 1.2954578878964507e-07, |
| "loss": 0.206, |
| "num_input_tokens_seen": 3289168, |
| "step": 8020 |
| }, |
| { |
| "epoch": 9.71549636803874, |
| "grad_norm": 0.2168329954147339, |
| "learning_rate": 1.2423061586496477e-07, |
| "loss": 0.1927, |
| "num_input_tokens_seen": 3291376, |
| "step": 8025 |
| }, |
| { |
| "epoch": 9.721549636803875, |
| "grad_norm": 0.38658609986305237, |
| "learning_rate": 1.1902650634960378e-07, |
| "loss": 0.19, |
| "num_input_tokens_seen": 3293360, |
| "step": 8030 |
| }, |
| { |
| "epoch": 9.727602905569007, |
| "grad_norm": 0.4904116690158844, |
| "learning_rate": 1.1393348347846777e-07, |
| "loss": 0.1966, |
| "num_input_tokens_seen": 3295344, |
| "step": 8035 |
| }, |
| { |
| "epoch": 9.73365617433414, |
| "grad_norm": 0.2101762294769287, |
| "learning_rate": 1.0895156999048972e-07, |
| "loss": 0.1413, |
| "num_input_tokens_seen": 3297392, |
| "step": 8040 |
| }, |
| { |
| "epoch": 9.739709443099274, |
| "grad_norm": 0.3092457950115204, |
| "learning_rate": 1.0408078812853273e-07, |
| "loss": 0.2276, |
| "num_input_tokens_seen": 3299376, |
| "step": 8045 |
| }, |
| { |
| "epoch": 9.745762711864407, |
| "grad_norm": 0.46512511372566223, |
| "learning_rate": 9.932115963928734e-08, |
| "loss": 0.1753, |
| "num_input_tokens_seen": 3301360, |
| "step": 8050 |
| }, |
| { |
| "epoch": 9.75181598062954, |
| "grad_norm": 0.22224070131778717, |
| "learning_rate": 9.467270577317167e-08, |
| "loss": 0.2308, |
| "num_input_tokens_seen": 3303440, |
| "step": 8055 |
| }, |
| { |
| "epoch": 9.757869249394673, |
| "grad_norm": 0.355302631855011, |
| "learning_rate": 9.013544728424528e-08, |
| "loss": 0.1754, |
| "num_input_tokens_seen": 3305552, |
| "step": 8060 |
| }, |
| { |
| "epoch": 9.763922518159806, |
| "grad_norm": 0.2829434275627136, |
| "learning_rate": 8.570940443010655e-08, |
| "loss": 0.1779, |
| "num_input_tokens_seen": 3307728, |
| "step": 8065 |
| }, |
| { |
| "epoch": 9.76997578692494, |
| "grad_norm": 0.48170995712280273, |
| "learning_rate": 8.139459697181218e-08, |
| "loss": 0.204, |
| "num_input_tokens_seen": 3309776, |
| "step": 8070 |
| }, |
| { |
| "epoch": 9.776029055690072, |
| "grad_norm": 0.2598755359649658, |
| "learning_rate": 7.719104417377443e-08, |
| "loss": 0.1489, |
| "num_input_tokens_seen": 3311760, |
| "step": 8075 |
| }, |
| { |
| "epoch": 9.782082324455207, |
| "grad_norm": 0.425149142742157, |
| "learning_rate": 7.30987648036946e-08, |
| "loss": 0.1433, |
| "num_input_tokens_seen": 3313808, |
| "step": 8080 |
| }, |
| { |
| "epoch": 9.788135593220339, |
| "grad_norm": 0.3688722848892212, |
| "learning_rate": 6.911777713246581e-08, |
| "loss": 0.1607, |
| "num_input_tokens_seen": 3315888, |
| "step": 8085 |
| }, |
| { |
| "epoch": 9.794188861985472, |
| "grad_norm": 0.2523213028907776, |
| "learning_rate": 6.524809893409256e-08, |
| "loss": 0.1933, |
| "num_input_tokens_seen": 3318000, |
| "step": 8090 |
| }, |
| { |
| "epoch": 9.800242130750606, |
| "grad_norm": 0.19854332506656647, |
| "learning_rate": 6.148974748561299e-08, |
| "loss": 0.139, |
| "num_input_tokens_seen": 3320016, |
| "step": 8095 |
| }, |
| { |
| "epoch": 9.806295399515738, |
| "grad_norm": 0.5553900003433228, |
| "learning_rate": 5.784273956702391e-08, |
| "loss": 0.1946, |
| "num_input_tokens_seen": 3322096, |
| "step": 8100 |
| }, |
| { |
| "epoch": 9.812348668280872, |
| "grad_norm": 0.35563018918037415, |
| "learning_rate": 5.4307091461205936e-08, |
| "loss": 0.1496, |
| "num_input_tokens_seen": 3324176, |
| "step": 8105 |
| }, |
| { |
| "epoch": 9.818401937046005, |
| "grad_norm": 0.42958784103393555, |
| "learning_rate": 5.08828189538485e-08, |
| "loss": 0.2244, |
| "num_input_tokens_seen": 3326320, |
| "step": 8110 |
| }, |
| { |
| "epoch": 9.824455205811137, |
| "grad_norm": 0.19304096698760986, |
| "learning_rate": 4.7569937333372115e-08, |
| "loss": 0.1776, |
| "num_input_tokens_seen": 3328464, |
| "step": 8115 |
| }, |
| { |
| "epoch": 9.830508474576272, |
| "grad_norm": 0.232134148478508, |
| "learning_rate": 4.436846139087847e-08, |
| "loss": 0.1348, |
| "num_input_tokens_seen": 3330480, |
| "step": 8120 |
| }, |
| { |
| "epoch": 9.836561743341404, |
| "grad_norm": 0.16398993134498596, |
| "learning_rate": 4.127840542006711e-08, |
| "loss": 0.2058, |
| "num_input_tokens_seen": 3332624, |
| "step": 8125 |
| }, |
| { |
| "epoch": 9.842615012106538, |
| "grad_norm": 0.288708359003067, |
| "learning_rate": 3.829978321718553e-08, |
| "loss": 0.1619, |
| "num_input_tokens_seen": 3334768, |
| "step": 8130 |
| }, |
| { |
| "epoch": 9.84866828087167, |
| "grad_norm": 0.5418060421943665, |
| "learning_rate": 3.543260808095139e-08, |
| "loss": 0.179, |
| "num_input_tokens_seen": 3336784, |
| "step": 8135 |
| }, |
| { |
| "epoch": 9.854721549636803, |
| "grad_norm": 0.22882364690303802, |
| "learning_rate": 3.267689281250541e-08, |
| "loss": 0.1607, |
| "num_input_tokens_seen": 3338832, |
| "step": 8140 |
| }, |
| { |
| "epoch": 9.860774818401937, |
| "grad_norm": 0.4315253794193268, |
| "learning_rate": 3.003264971535857e-08, |
| "loss": 0.2356, |
| "num_input_tokens_seen": 3340848, |
| "step": 8145 |
| }, |
| { |
| "epoch": 9.86682808716707, |
| "grad_norm": 0.19929257035255432, |
| "learning_rate": 2.7499890595314438e-08, |
| "loss": 0.1843, |
| "num_input_tokens_seen": 3342960, |
| "step": 8150 |
| }, |
| { |
| "epoch": 9.872881355932204, |
| "grad_norm": 0.3341890573501587, |
| "learning_rate": 2.507862676044137e-08, |
| "loss": 0.1827, |
| "num_input_tokens_seen": 3345104, |
| "step": 8155 |
| }, |
| { |
| "epoch": 9.878934624697337, |
| "grad_norm": 0.14005398750305176, |
| "learning_rate": 2.2768869021014274e-08, |
| "loss": 0.1745, |
| "num_input_tokens_seen": 3347024, |
| "step": 8160 |
| }, |
| { |
| "epoch": 9.884987893462469, |
| "grad_norm": 0.5116079449653625, |
| "learning_rate": 2.0570627689459054e-08, |
| "loss": 0.2058, |
| "num_input_tokens_seen": 3349200, |
| "step": 8165 |
| }, |
| { |
| "epoch": 9.891041162227603, |
| "grad_norm": 0.5228228569030762, |
| "learning_rate": 1.848391258031379e-08, |
| "loss": 0.1791, |
| "num_input_tokens_seen": 3351248, |
| "step": 8170 |
| }, |
| { |
| "epoch": 9.897094430992736, |
| "grad_norm": 0.6169029474258423, |
| "learning_rate": 1.6508733010184297e-08, |
| "loss": 0.1861, |
| "num_input_tokens_seen": 3353488, |
| "step": 8175 |
| }, |
| { |
| "epoch": 9.90314769975787, |
| "grad_norm": 0.3636971414089203, |
| "learning_rate": 1.4645097797694186e-08, |
| "loss": 0.1719, |
| "num_input_tokens_seen": 3355440, |
| "step": 8180 |
| }, |
| { |
| "epoch": 9.909200968523002, |
| "grad_norm": 0.5593004822731018, |
| "learning_rate": 1.2893015263459874e-08, |
| "loss": 0.2699, |
| "num_input_tokens_seen": 3357296, |
| "step": 8185 |
| }, |
| { |
| "epoch": 9.915254237288135, |
| "grad_norm": 0.20834974944591522, |
| "learning_rate": 1.125249323004618e-08, |
| "loss": 0.2053, |
| "num_input_tokens_seen": 3359280, |
| "step": 8190 |
| }, |
| { |
| "epoch": 9.92130750605327, |
| "grad_norm": 0.16997875273227692, |
| "learning_rate": 9.723539021927463e-09, |
| "loss": 0.2096, |
| "num_input_tokens_seen": 3361328, |
| "step": 8195 |
| }, |
| { |
| "epoch": 9.927360774818402, |
| "grad_norm": 0.5368923544883728, |
| "learning_rate": 8.306159465459872e-09, |
| "loss": 0.1803, |
| "num_input_tokens_seen": 3363344, |
| "step": 8200 |
| }, |
| { |
| "epoch": 9.933414043583536, |
| "grad_norm": 0.258119136095047, |
| "learning_rate": 7.00036088885081e-09, |
| "loss": 0.1881, |
| "num_input_tokens_seen": 3365296, |
| "step": 8205 |
| }, |
| { |
| "epoch": 9.939467312348668, |
| "grad_norm": 0.3872841000556946, |
| "learning_rate": 5.806149122128401e-09, |
| "loss": 0.1482, |
| "num_input_tokens_seen": 3367504, |
| "step": 8210 |
| }, |
| { |
| "epoch": 9.9455205811138, |
| "grad_norm": 0.38133659958839417, |
| "learning_rate": 4.723529497113743e-09, |
| "loss": 0.2161, |
| "num_input_tokens_seen": 3369616, |
| "step": 8215 |
| }, |
| { |
| "epoch": 9.951573849878935, |
| "grad_norm": 0.3784112334251404, |
| "learning_rate": 3.752506847407023e-09, |
| "loss": 0.1843, |
| "num_input_tokens_seen": 3371728, |
| "step": 8220 |
| }, |
| { |
| "epoch": 9.957627118644067, |
| "grad_norm": 0.41736942529678345, |
| "learning_rate": 2.8930855083542096e-09, |
| "loss": 0.2074, |
| "num_input_tokens_seen": 3373648, |
| "step": 8225 |
| }, |
| { |
| "epoch": 9.963680387409202, |
| "grad_norm": 0.3605712354183197, |
| "learning_rate": 2.145269317033183e-09, |
| "loss": 0.1793, |
| "num_input_tokens_seen": 3375664, |
| "step": 8230 |
| }, |
| { |
| "epoch": 9.969733656174334, |
| "grad_norm": 0.3409971296787262, |
| "learning_rate": 1.509061612234297e-09, |
| "loss": 0.1531, |
| "num_input_tokens_seen": 3377808, |
| "step": 8235 |
| }, |
| { |
| "epoch": 9.975786924939467, |
| "grad_norm": 0.6673860549926758, |
| "learning_rate": 9.844652344492832e-10, |
| "loss": 0.2167, |
| "num_input_tokens_seen": 3379888, |
| "step": 8240 |
| }, |
| { |
| "epoch": 9.9818401937046, |
| "grad_norm": 0.2759976387023926, |
| "learning_rate": 5.714825258545942e-10, |
| "loss": 0.216, |
| "num_input_tokens_seen": 3382064, |
| "step": 8245 |
| }, |
| { |
| "epoch": 9.987893462469733, |
| "grad_norm": 0.42253345251083374, |
| "learning_rate": 2.7011533030585347e-10, |
| "loss": 0.1486, |
| "num_input_tokens_seen": 3384144, |
| "step": 8250 |
| }, |
| { |
| "epoch": 9.993946731234868, |
| "grad_norm": 0.29823410511016846, |
| "learning_rate": 8.036499332397807e-11, |
| "loss": 0.1605, |
| "num_input_tokens_seen": 3386160, |
| "step": 8255 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.6797935366630554, |
| "learning_rate": 2.2323620896269604e-12, |
| "loss": 0.1835, |
| "num_input_tokens_seen": 3388032, |
| "step": 8260 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.1926523745059967, |
| "eval_runtime": 7.6705, |
| "eval_samples_per_second": 47.845, |
| "eval_steps_per_second": 11.994, |
| "num_input_tokens_seen": 3388032, |
| "step": 8260 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 3388032, |
| "step": 8260, |
| "total_flos": 1.5256154967795302e+17, |
| "train_loss": 0.9589975316573575, |
| "train_runtime": 1705.8594, |
| "train_samples_per_second": 19.351, |
| "train_steps_per_second": 4.842 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 8260, |
| "num_input_tokens_seen": 3388032, |
| "num_train_epochs": 10, |
| "save_steps": 413, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.5256154967795302e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|