{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.93339911198816, "eval_steps": 1000, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0493339911198816, "grad_norm": 3.404296875, "learning_rate": 1.999987141198133e-05, "loss": 1.1607, "mean_token_accuracy": 0.7450588776171208, "num_tokens": 817724.0, "step": 100 }, { "epoch": 0.0986679822397632, "grad_norm": 3.275390625, "learning_rate": 1.9998811023678148e-05, "loss": 1.0497, "mean_token_accuracy": 0.764276393353939, "num_tokens": 1635495.0, "step": 200 }, { "epoch": 0.1480019733596448, "grad_norm": 3.578125, "learning_rate": 1.999667964947103e-05, "loss": 1.0262, "mean_token_accuracy": 0.7685012140870094, "num_tokens": 2453891.0, "step": 300 }, { "epoch": 0.1973359644795264, "grad_norm": 2.833984375, "learning_rate": 1.999347751765429e-05, "loss": 1.0036, "mean_token_accuracy": 0.7728290051221848, "num_tokens": 3271228.0, "step": 400 }, { "epoch": 0.246669955599408, "grad_norm": 2.86328125, "learning_rate": 1.998920497121251e-05, "loss": 0.9978, "mean_token_accuracy": 0.77385304749012, "num_tokens": 4090128.0, "step": 500 }, { "epoch": 0.2960039467192896, "grad_norm": 2.96484375, "learning_rate": 1.998386246778381e-05, "loss": 0.9859, "mean_token_accuracy": 0.7768463261425496, "num_tokens": 4908319.0, "step": 600 }, { "epoch": 0.3453379378391712, "grad_norm": 3.03515625, "learning_rate": 1.997745057961081e-05, "loss": 0.9702, "mean_token_accuracy": 0.7799235332012177, "num_tokens": 5726519.0, "step": 700 }, { "epoch": 0.3946719289590528, "grad_norm": 2.759765625, "learning_rate": 1.9969969993479352e-05, "loss": 0.9674, "mean_token_accuracy": 0.7801838928461075, "num_tokens": 6545163.0, "step": 800 }, { "epoch": 0.4440059200789344, "grad_norm": 3.3046875, "learning_rate": 1.9961421510644935e-05, "loss": 0.9597, "mean_token_accuracy": 0.7807393775880337, "num_tokens": 7363259.0, "step": 900 }, { "epoch": 0.493339911198816, "grad_norm": 3.087890625, "learning_rate": 1.9951806046746892e-05, "loss": 0.9476, "step": 1000 }, { "epoch": 0.493339911198816, "eval_loss": 0.9531487822532654, "eval_mean_token_accuracy": 0.784176768076837, "eval_num_tokens": 8182192.0, "eval_runtime": 232.2542, "eval_samples_per_second": 69.863, "eval_steps_per_second": 8.736, "step": 1000 }, { "epoch": 0.5426739023186976, "grad_norm": 2.607421875, "learning_rate": 1.9941124631710298e-05, "loss": 0.9412, "mean_token_accuracy": 0.7848582742363215, "num_tokens": 9000632.0, "step": 1100 }, { "epoch": 0.5920078934385792, "grad_norm": 2.814453125, "learning_rate": 1.9929378409635686e-05, "loss": 0.9383, "mean_token_accuracy": 0.7856255556643009, "num_tokens": 9817706.0, "step": 1200 }, { "epoch": 0.6413418845584608, "grad_norm": 2.822265625, "learning_rate": 1.9916568638676474e-05, "loss": 0.9302, "mean_token_accuracy": 0.7883923088014125, "num_tokens": 10635361.0, "step": 1300 }, { "epoch": 0.6906758756783424, "grad_norm": 2.779296875, "learning_rate": 1.9902696690904203e-05, "loss": 0.928, "mean_token_accuracy": 0.788385953605175, "num_tokens": 11452193.0, "step": 1400 }, { "epoch": 0.740009866798224, "grad_norm": 2.681640625, "learning_rate": 1.9887764052161598e-05, "loss": 0.9187, "mean_token_accuracy": 0.7905992804467679, "num_tokens": 12270285.0, "step": 1500 }, { "epoch": 0.7893438579181056, "grad_norm": 2.6796875, "learning_rate": 1.9871772321903388e-05, "loss": 0.9069, "mean_token_accuracy": 0.7929588802158832, "num_tokens": 13088068.0, "step": 1600 }, { "epoch": 0.8386778490379871, "grad_norm": 2.859375, "learning_rate": 1.9854723213024996e-05, "loss": 0.9023, "mean_token_accuracy": 0.7939252285659313, "num_tokens": 13906129.0, "step": 1700 }, { "epoch": 0.8880118401578688, "grad_norm": 2.439453125, "learning_rate": 1.983661855167908e-05, "loss": 0.895, "mean_token_accuracy": 0.7945813001692295, "num_tokens": 14724436.0, "step": 1800 }, { "epoch": 0.9373458312777504, "grad_norm": 2.708984375, "learning_rate": 1.981746027707992e-05, "loss": 0.8826, "mean_token_accuracy": 0.7975821754336357, "num_tokens": 15542251.0, "step": 1900 }, { "epoch": 0.986679822397632, "grad_norm": 2.47265625, "learning_rate": 1.9797250441295696e-05, "loss": 0.8814, "step": 2000 }, { "epoch": 0.986679822397632, "eval_loss": 0.8888025879859924, "eval_mean_token_accuracy": 0.7973491149679554, "eval_num_tokens": 16360700.0, "eval_runtime": 230.6477, "eval_samples_per_second": 70.35, "eval_steps_per_second": 8.797, "step": 2000 }, { "epoch": 1.0360138135175136, "grad_norm": 2.748046875, "learning_rate": 1.9775991209028724e-05, "loss": 0.8088, "mean_token_accuracy": 0.8046260391920805, "num_tokens": 17176613.0, "step": 2100 }, { "epoch": 1.0853478046373952, "grad_norm": 2.654296875, "learning_rate": 1.9753684857383547e-05, "loss": 0.7927, "mean_token_accuracy": 0.813809906244278, "num_tokens": 17994211.0, "step": 2200 }, { "epoch": 1.1346817957572768, "grad_norm": 2.775390625, "learning_rate": 1.9730333775623062e-05, "loss": 0.7785, "mean_token_accuracy": 0.8165469121932983, "num_tokens": 18812403.0, "step": 2300 }, { "epoch": 1.1840157868771584, "grad_norm": 2.703125, "learning_rate": 1.9705940464912593e-05, "loss": 0.7799, "mean_token_accuracy": 0.8157392312586308, "num_tokens": 19630645.0, "step": 2400 }, { "epoch": 1.23334977799704, "grad_norm": 2.24609375, "learning_rate": 1.9680507538051982e-05, "loss": 0.7792, "mean_token_accuracy": 0.8161792799830436, "num_tokens": 20448907.0, "step": 2500 }, { "epoch": 1.2826837691169215, "grad_norm": 2.34765625, "learning_rate": 1.9654037719195753e-05, "loss": 0.7791, "mean_token_accuracy": 0.8167361421883106, "num_tokens": 21267549.0, "step": 2600 }, { "epoch": 1.3320177602368033, "grad_norm": 2.255859375, "learning_rate": 1.9626533843561284e-05, "loss": 0.7712, "mean_token_accuracy": 0.8178707587718964, "num_tokens": 22085783.0, "step": 2700 }, { "epoch": 1.3813517513566849, "grad_norm": 2.62109375, "learning_rate": 1.959799885712516e-05, "loss": 0.7802, "mean_token_accuracy": 0.8168295152485371, "num_tokens": 22903659.0, "step": 2800 }, { "epoch": 1.4306857424765664, "grad_norm": 2.36328125, "learning_rate": 1.9568435816307606e-05, "loss": 0.7786, "mean_token_accuracy": 0.817071139216423, "num_tokens": 23721992.0, "step": 2900 }, { "epoch": 1.480019733596448, "grad_norm": 2.35546875, "learning_rate": 1.9537847887645114e-05, "loss": 0.7726, "step": 3000 }, { "epoch": 1.480019733596448, "eval_loss": 0.8628731966018677, "eval_mean_token_accuracy": 0.8050215809494478, "eval_num_tokens": 24539842.0, "eval_runtime": 258.3525, "eval_samples_per_second": 62.806, "eval_steps_per_second": 7.854, "step": 3000 }, { "epoch": 1.5293537247163296, "grad_norm": 2.36328125, "learning_rate": 1.9506238347451272e-05, "loss": 0.7797, "mean_token_accuracy": 0.8179577070474625, "num_tokens": 25357355.0, "step": 3100 }, { "epoch": 1.5786877158362111, "grad_norm": 2.5078125, "learning_rate": 1.9473610581465835e-05, "loss": 0.7734, "mean_token_accuracy": 0.818172342479229, "num_tokens": 26175340.0, "step": 3200 }, { "epoch": 1.6280217069560927, "grad_norm": 2.435546875, "learning_rate": 1.943996808449207e-05, "loss": 0.7736, "mean_token_accuracy": 0.8182803516089916, "num_tokens": 26993802.0, "step": 3300 }, { "epoch": 1.6773556980759743, "grad_norm": 2.40234375, "learning_rate": 1.940531446002243e-05, "loss": 0.7706, "mean_token_accuracy": 0.8192763808369636, "num_tokens": 27811947.0, "step": 3400 }, { "epoch": 1.7266896891958559, "grad_norm": 2.5390625, "learning_rate": 1.9369653419852568e-05, "loss": 0.7673, "mean_token_accuracy": 0.8196577854454518, "num_tokens": 28629348.0, "step": 3500 }, { "epoch": 1.7760236803157374, "grad_norm": 2.453125, "learning_rate": 1.933298878368378e-05, "loss": 0.7637, "mean_token_accuracy": 0.8204946468770504, "num_tokens": 29447420.0, "step": 3600 }, { "epoch": 1.825357671435619, "grad_norm": 2.337890625, "learning_rate": 1.929532447871384e-05, "loss": 0.7558, "mean_token_accuracy": 0.8219007922708989, "num_tokens": 30265221.0, "step": 3700 }, { "epoch": 1.8746916625555008, "grad_norm": 2.48828125, "learning_rate": 1.925666453921639e-05, "loss": 0.7587, "mean_token_accuracy": 0.82237908706069, "num_tokens": 31082572.0, "step": 3800 }, { "epoch": 1.9240256536753824, "grad_norm": 2.51953125, "learning_rate": 1.9217013106108798e-05, "loss": 0.7501, "mean_token_accuracy": 0.8232213893532753, "num_tokens": 31900505.0, "step": 3900 }, { "epoch": 1.973359644795264, "grad_norm": 2.33203125, "learning_rate": 1.917637442650863e-05, "loss": 0.7557, "step": 4000 }, { "epoch": 1.973359644795264, "eval_loss": 0.8336860537528992, "eval_mean_token_accuracy": 0.8114011559420586, "eval_num_tokens": 32718869.0, "eval_runtime": 301.9223, "eval_samples_per_second": 53.742, "eval_steps_per_second": 6.72, "step": 4000 }, { "epoch": 2.0226936359151457, "grad_norm": 2.255859375, "learning_rate": 1.913475285327874e-05, "loss": 0.7076, "mean_token_accuracy": 0.826932647228241, "num_tokens": 33535218.0, "step": 4100 }, { "epoch": 2.0720276270350273, "grad_norm": 2.8046875, "learning_rate": 1.9092152844561e-05, "loss": 0.6484, "mean_token_accuracy": 0.8430611005425453, "num_tokens": 34353569.0, "step": 4200 }, { "epoch": 2.121361618154909, "grad_norm": 2.31640625, "learning_rate": 1.904857896329882e-05, "loss": 0.6535, "mean_token_accuracy": 0.8420212762057782, "num_tokens": 35171634.0, "step": 4300 }, { "epoch": 2.1706956092747904, "grad_norm": 2.474609375, "learning_rate": 1.9004035876748393e-05, "loss": 0.65, "mean_token_accuracy": 0.8425224512815476, "num_tokens": 35989510.0, "step": 4400 }, { "epoch": 2.220029600394672, "grad_norm": 2.37109375, "learning_rate": 1.8958528355978767e-05, "loss": 0.6482, "mean_token_accuracy": 0.8430168768763542, "num_tokens": 36807593.0, "step": 4500 }, { "epoch": 2.2693635915145536, "grad_norm": 2.544921875, "learning_rate": 1.8912061275360817e-05, "loss": 0.6563, "mean_token_accuracy": 0.8413836374878884, "num_tokens": 37625695.0, "step": 4600 }, { "epoch": 2.318697582634435, "grad_norm": 2.4140625, "learning_rate": 1.8864639612045153e-05, "loss": 0.6556, "mean_token_accuracy": 0.841529670804739, "num_tokens": 38443841.0, "step": 4700 }, { "epoch": 2.3680315737543167, "grad_norm": 2.556640625, "learning_rate": 1.8816268445428996e-05, "loss": 0.6495, "mean_token_accuracy": 0.8425976119935512, "num_tokens": 39262075.0, "step": 4800 }, { "epoch": 2.4173655648741983, "grad_norm": 2.5078125, "learning_rate": 1.8766952956612123e-05, "loss": 0.655, "mean_token_accuracy": 0.8413432243466378, "num_tokens": 40079953.0, "step": 4900 }, { "epoch": 2.46669955599408, "grad_norm": 2.724609375, "learning_rate": 1.8716698427841926e-05, "loss": 0.6576, "step": 5000 }, { "epoch": 2.46669955599408, "eval_loss": 0.836199939250946, "eval_mean_token_accuracy": 0.8145093770377327, "eval_num_tokens": 40898183.0, "eval_runtime": 227.0413, "eval_samples_per_second": 71.467, "eval_steps_per_second": 8.937, "step": 5000 }, { "epoch": 2.5160335471139614, "grad_norm": 2.349609375, "learning_rate": 1.8665510241947596e-05, "loss": 0.6579, "mean_token_accuracy": 0.8412897626310587, "num_tokens": 41716178.0, "step": 5100 }, { "epoch": 2.565367538233843, "grad_norm": 2.609375, "learning_rate": 1.8613393881763583e-05, "loss": 0.6552, "mean_token_accuracy": 0.8418721158802509, "num_tokens": 42533359.0, "step": 5200 }, { "epoch": 2.6147015293537246, "grad_norm": 2.529296875, "learning_rate": 1.8560354929542322e-05, "loss": 0.6504, "mean_token_accuracy": 0.8428675523400306, "num_tokens": 43352238.0, "step": 5300 }, { "epoch": 2.6640355204736066, "grad_norm": 2.5703125, "learning_rate": 1.8506399066356294e-05, "loss": 0.6562, "mean_token_accuracy": 0.8421946428716183, "num_tokens": 44170387.0, "step": 5400 }, { "epoch": 2.7133695115934877, "grad_norm": 2.55078125, "learning_rate": 1.8451532071489532e-05, "loss": 0.6579, "mean_token_accuracy": 0.8413851109147071, "num_tokens": 44988361.0, "step": 5500 }, { "epoch": 2.7627035027133697, "grad_norm": 2.5703125, "learning_rate": 1.839575982181859e-05, "loss": 0.6559, "mean_token_accuracy": 0.8422524558007717, "num_tokens": 45805576.0, "step": 5600 }, { "epoch": 2.812037493833251, "grad_norm": 2.552734375, "learning_rate": 1.8339088291183072e-05, "loss": 0.6524, "mean_token_accuracy": 0.8429344496130944, "num_tokens": 46623168.0, "step": 5700 }, { "epoch": 2.861371484953133, "grad_norm": 2.591796875, "learning_rate": 1.828152354974575e-05, "loss": 0.6572, "mean_token_accuracy": 0.841839095801115, "num_tokens": 47441605.0, "step": 5800 }, { "epoch": 2.9107054760730144, "grad_norm": 2.5625, "learning_rate": 1.8223071763342388e-05, "loss": 0.6569, "mean_token_accuracy": 0.8422958692908287, "num_tokens": 48259542.0, "step": 5900 }, { "epoch": 2.960039467192896, "grad_norm": 2.3671875, "learning_rate": 1.8163739192821325e-05, "loss": 0.6521, "step": 6000 }, { "epoch": 2.960039467192896, "eval_loss": 0.8176103830337524, "eval_mean_token_accuracy": 0.8183424575626116, "eval_num_tokens": 49077499.0, "eval_runtime": 227.0382, "eval_samples_per_second": 71.468, "eval_steps_per_second": 8.937, "step": 6000 }, { "epoch": 3.0093734583127776, "grad_norm": 2.68359375, "learning_rate": 1.8103532193372832e-05, "loss": 0.6343, "mean_token_accuracy": 0.8449597600102424, "num_tokens": 49893961.0, "step": 6100 }, { "epoch": 3.058707449432659, "grad_norm": 2.630859375, "learning_rate": 1.8042457213848448e-05, "loss": 0.5458, "mean_token_accuracy": 0.8658474875986576, "num_tokens": 50711532.0, "step": 6200 }, { "epoch": 3.1080414405525407, "grad_norm": 2.708984375, "learning_rate": 1.798052079607019e-05, "loss": 0.538, "mean_token_accuracy": 0.8663399314880371, "num_tokens": 51528714.0, "step": 6300 }, { "epoch": 3.1573754316724223, "grad_norm": 2.62890625, "learning_rate": 1.791772957412987e-05, "loss": 0.5517, "mean_token_accuracy": 0.8640760770440101, "num_tokens": 52345966.0, "step": 6400 }, { "epoch": 3.206709422792304, "grad_norm": 2.859375, "learning_rate": 1.785409027367852e-05, "loss": 0.5442, "mean_token_accuracy": 0.8657858520746231, "num_tokens": 53163793.0, "step": 6500 }, { "epoch": 3.2560434139121854, "grad_norm": 2.669921875, "learning_rate": 1.7789609711205967e-05, "loss": 0.5516, "mean_token_accuracy": 0.863896958976984, "num_tokens": 53981672.0, "step": 6600 }, { "epoch": 3.305377405032067, "grad_norm": 2.638671875, "learning_rate": 1.7724294793310742e-05, "loss": 0.552, "mean_token_accuracy": 0.8638136276602745, "num_tokens": 54800133.0, "step": 6700 }, { "epoch": 3.3547113961519486, "grad_norm": 2.669921875, "learning_rate": 1.765815251596029e-05, "loss": 0.5537, "mean_token_accuracy": 0.8634848801791668, "num_tokens": 55617519.0, "step": 6800 }, { "epoch": 3.40404538727183, "grad_norm": 2.712890625, "learning_rate": 1.7591189963741614e-05, "loss": 0.5526, "mean_token_accuracy": 0.8636865784227848, "num_tokens": 56435580.0, "step": 6900 }, { "epoch": 3.4533793783917117, "grad_norm": 2.89453125, "learning_rate": 1.7523414309102462e-05, "loss": 0.5566, "step": 7000 }, { "epoch": 3.4533793783917117, "eval_loss": 0.8430932760238647, "eval_mean_token_accuracy": 0.8191284864274312, "eval_num_tokens": 57254348.0, "eval_runtime": 227.0935, "eval_samples_per_second": 71.451, "eval_steps_per_second": 8.935, "step": 7000 }, { "epoch": 3.5027133695115937, "grad_norm": 2.541015625, "learning_rate": 1.7454832811583045e-05, "loss": 0.553, "mean_token_accuracy": 0.8629767662286758, "num_tokens": 58072994.0, "step": 7100 }, { "epoch": 3.552047360631475, "grad_norm": 2.796875, "learning_rate": 1.738545281703848e-05, "loss": 0.5539, "mean_token_accuracy": 0.8632567670941352, "num_tokens": 58890672.0, "step": 7200 }, { "epoch": 3.601381351751357, "grad_norm": 2.82421875, "learning_rate": 1.731528175685196e-05, "loss": 0.5572, "mean_token_accuracy": 0.8626312711834907, "num_tokens": 59708564.0, "step": 7300 }, { "epoch": 3.6507153428712384, "grad_norm": 2.693359375, "learning_rate": 1.7244327147138765e-05, "loss": 0.5553, "mean_token_accuracy": 0.8632141479849815, "num_tokens": 60526575.0, "step": 7400 }, { "epoch": 3.70004933399112, "grad_norm": 2.8125, "learning_rate": 1.7172596587941203e-05, "loss": 0.5573, "mean_token_accuracy": 0.8632130342721939, "num_tokens": 61345050.0, "step": 7500 }, { "epoch": 3.7493833251110016, "grad_norm": 2.921875, "learning_rate": 1.710009776241456e-05, "loss": 0.5579, "mean_token_accuracy": 0.8623562103509903, "num_tokens": 62161790.0, "step": 7600 }, { "epoch": 3.798717316230883, "grad_norm": 2.767578125, "learning_rate": 1.702683843600415e-05, "loss": 0.5567, "mean_token_accuracy": 0.8631315796077251, "num_tokens": 62980263.0, "step": 7700 }, { "epoch": 3.8480513073507647, "grad_norm": 2.58984375, "learning_rate": 1.6952826455613546e-05, "loss": 0.5595, "mean_token_accuracy": 0.862657565176487, "num_tokens": 63798977.0, "step": 7800 }, { "epoch": 3.8973852984706463, "grad_norm": 2.658203125, "learning_rate": 1.687806974876408e-05, "loss": 0.5531, "mean_token_accuracy": 0.8633909998834133, "num_tokens": 64617733.0, "step": 7900 }, { "epoch": 3.946719289590528, "grad_norm": 3.1953125, "learning_rate": 1.680257632274572e-05, "loss": 0.5546, "step": 8000 }, { "epoch": 3.946719289590528, "eval_loss": 0.8310380578041077, "eval_mean_token_accuracy": 0.8215583155107005, "eval_num_tokens": 65435962.0, "eval_runtime": 227.0562, "eval_samples_per_second": 71.462, "eval_steps_per_second": 8.936, "step": 8000 }, { "epoch": 3.9960532807104094, "grad_norm": 2.80078125, "learning_rate": 1.6726354263759423e-05, "loss": 0.5585, "mean_token_accuracy": 0.8631708553433418, "num_tokens": 66254184.0, "step": 8100 }, { "epoch": 4.0453872718302915, "grad_norm": 2.806640625, "learning_rate": 1.6649411736050957e-05, "loss": 0.4575, "mean_token_accuracy": 0.8858809275925159, "num_tokens": 67070561.0, "step": 8200 }, { "epoch": 4.094721262950173, "grad_norm": 3.056640625, "learning_rate": 1.6571756981036476e-05, "loss": 0.4536, "mean_token_accuracy": 0.8867035652697086, "num_tokens": 67889388.0, "step": 8300 }, { "epoch": 4.144055254070055, "grad_norm": 2.76171875, "learning_rate": 1.6493398316419727e-05, "loss": 0.4523, "mean_token_accuracy": 0.8867955373227596, "num_tokens": 68706133.0, "step": 8400 }, { "epoch": 4.193389245189936, "grad_norm": 3.005859375, "learning_rate": 1.641434413530116e-05, "loss": 0.458, "mean_token_accuracy": 0.8855340279638767, "num_tokens": 69523848.0, "step": 8500 }, { "epoch": 4.242723236309818, "grad_norm": 2.94140625, "learning_rate": 1.6334602905278917e-05, "loss": 0.4607, "mean_token_accuracy": 0.8852245907485485, "num_tokens": 70342124.0, "step": 8600 }, { "epoch": 4.292057227429699, "grad_norm": 2.60546875, "learning_rate": 1.6254183167541848e-05, "loss": 0.4618, "mean_token_accuracy": 0.884294263869524, "num_tokens": 71160314.0, "step": 8700 }, { "epoch": 4.341391218549581, "grad_norm": 2.896484375, "learning_rate": 1.617309353595468e-05, "loss": 0.462, "mean_token_accuracy": 0.884438044577837, "num_tokens": 71979044.0, "step": 8800 }, { "epoch": 4.390725209669462, "grad_norm": 2.900390625, "learning_rate": 1.6091342696135354e-05, "loss": 0.4601, "mean_token_accuracy": 0.8847412486374379, "num_tokens": 72797208.0, "step": 8900 }, { "epoch": 4.440059200789344, "grad_norm": 2.693359375, "learning_rate": 1.6008939404524686e-05, "loss": 0.4664, "step": 9000 }, { "epoch": 4.440059200789344, "eval_loss": 0.8724967837333679, "eval_mean_token_accuracy": 0.8213894294658396, "eval_num_tokens": 73615031.0, "eval_runtime": 227.1181, "eval_samples_per_second": 71.443, "eval_steps_per_second": 8.934, "step": 9000 }, { "epoch": 4.489393191909225, "grad_norm": 2.93359375, "learning_rate": 1.5925892487448492e-05, "loss": 0.4624, "mean_token_accuracy": 0.8841097000986338, "num_tokens": 74433117.0, "step": 9100 }, { "epoch": 4.538727183029107, "grad_norm": 3.052734375, "learning_rate": 1.584221084017215e-05, "loss": 0.4687, "mean_token_accuracy": 0.8826957462728023, "num_tokens": 75251550.0, "step": 9200 }, { "epoch": 4.588061174148988, "grad_norm": 2.978515625, "learning_rate": 1.5757903425947833e-05, "loss": 0.4656, "mean_token_accuracy": 0.8832169409096241, "num_tokens": 76069101.0, "step": 9300 }, { "epoch": 4.63739516526887, "grad_norm": 2.970703125, "learning_rate": 1.5672979275054444e-05, "loss": 0.4678, "mean_token_accuracy": 0.8832441847026348, "num_tokens": 76886994.0, "step": 9400 }, { "epoch": 4.686729156388752, "grad_norm": 2.892578125, "learning_rate": 1.5587447483830364e-05, "loss": 0.4697, "mean_token_accuracy": 0.8826014402508736, "num_tokens": 77705672.0, "step": 9500 }, { "epoch": 4.736063147508633, "grad_norm": 2.853515625, "learning_rate": 1.5501317213699145e-05, "loss": 0.4668, "mean_token_accuracy": 0.8834595142304897, "num_tokens": 78523274.0, "step": 9600 }, { "epoch": 4.785397138628515, "grad_norm": 2.931640625, "learning_rate": 1.5414597690188197e-05, "loss": 0.4741, "mean_token_accuracy": 0.8814607061445713, "num_tokens": 79339785.0, "step": 9700 }, { "epoch": 4.834731129748397, "grad_norm": 2.744140625, "learning_rate": 1.5327298201940647e-05, "loss": 0.4693, "mean_token_accuracy": 0.8828557208180428, "num_tokens": 80158261.0, "step": 9800 }, { "epoch": 4.884065120868279, "grad_norm": 2.93359375, "learning_rate": 1.523942809972041e-05, "loss": 0.4684, "mean_token_accuracy": 0.8827220787107944, "num_tokens": 80976546.0, "step": 9900 }, { "epoch": 4.93339911198816, "grad_norm": 2.7421875, "learning_rate": 1.5150996795410626e-05, "loss": 0.4701, "step": 10000 }, { "epoch": 4.93339911198816, "eval_loss": 0.8658666014671326, "eval_mean_token_accuracy": 0.8231359128002582, "eval_num_tokens": 81794889.0, "eval_runtime": 227.1166, "eval_samples_per_second": 71.443, "eval_steps_per_second": 8.934, "step": 10000 } ], "logging_steps": 100, "max_steps": 30405, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1647962034877235e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }