diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16038 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999989599547927, + "eval_steps": 1000, + "global_step": 108168, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013867269430644586, + "grad_norm": 1.8933687210083008, + "learning_rate": 2.957486136783734e-06, + "loss": 1.2241, + "step": 50 + }, + { + "epoch": 0.002773453886128917, + "grad_norm": 0.7502820491790771, + "learning_rate": 6.038200862600124e-06, + "loss": 1.0267, + "step": 100 + }, + { + "epoch": 0.004160180829193376, + "grad_norm": 0.5821689963340759, + "learning_rate": 9.118915588416513e-06, + "loss": 0.8167, + "step": 150 + }, + { + "epoch": 0.005546907772257834, + "grad_norm": 0.5138927698135376, + "learning_rate": 1.2199630314232902e-05, + "loss": 0.6408, + "step": 200 + }, + { + "epoch": 0.006933634715322293, + "grad_norm": 0.619263768196106, + "learning_rate": 1.5280345040049293e-05, + "loss": 0.5468, + "step": 250 + }, + { + "epoch": 0.008320361658386751, + "grad_norm": 0.5078439712524414, + "learning_rate": 1.836105976586568e-05, + "loss": 0.4952, + "step": 300 + }, + { + "epoch": 0.00970708860145121, + "grad_norm": 0.5653749108314514, + "learning_rate": 2.144177449168207e-05, + "loss": 0.4388, + "step": 350 + }, + { + "epoch": 0.011093815544515669, + "grad_norm": 0.6189213991165161, + "learning_rate": 2.452248921749846e-05, + "loss": 0.4232, + "step": 400 + }, + { + "epoch": 0.012480542487580126, + "grad_norm": 0.6082913875579834, + "learning_rate": 2.760320394331485e-05, + "loss": 0.401, + "step": 450 + }, + { + "epoch": 0.013867269430644586, + "grad_norm": 0.6956301331520081, + "learning_rate": 3.068391866913124e-05, + "loss": 0.3895, + "step": 500 + }, + { + "epoch": 0.015253996373709043, + "grad_norm": 0.7030412554740906, + "learning_rate": 3.3764633394947633e-05, + "loss": 0.3676, + "step": 550 + }, + { + "epoch": 0.016640723316773503, + "grad_norm": 0.6779190897941589, + "learning_rate": 3.684534812076402e-05, + "loss": 0.3653, + "step": 600 + }, + { + "epoch": 0.01802745025983796, + "grad_norm": 0.8930213451385498, + "learning_rate": 3.992606284658041e-05, + "loss": 0.3645, + "step": 650 + }, + { + "epoch": 0.01941417720290242, + "grad_norm": 0.6423994302749634, + "learning_rate": 4.30067775723968e-05, + "loss": 0.3514, + "step": 700 + }, + { + "epoch": 0.02080090414596688, + "grad_norm": 0.7728660106658936, + "learning_rate": 4.608749229821319e-05, + "loss": 0.3468, + "step": 750 + }, + { + "epoch": 0.022187631089031337, + "grad_norm": 0.7561061978340149, + "learning_rate": 4.916820702402958e-05, + "loss": 0.3499, + "step": 800 + }, + { + "epoch": 0.023574358032095795, + "grad_norm": 0.6163890957832336, + "learning_rate": 5.224892174984597e-05, + "loss": 0.3417, + "step": 850 + }, + { + "epoch": 0.024961084975160253, + "grad_norm": 0.7334563732147217, + "learning_rate": 5.532963647566236e-05, + "loss": 0.3299, + "step": 900 + }, + { + "epoch": 0.026347811918224714, + "grad_norm": 0.655237078666687, + "learning_rate": 5.841035120147874e-05, + "loss": 0.3306, + "step": 950 + }, + { + "epoch": 0.02773453886128917, + "grad_norm": 0.8147113919258118, + "learning_rate": 6.149106592729513e-05, + "loss": 0.3281, + "step": 1000 + }, + { + "epoch": 0.02773453886128917, + "eval_loss": 0.32194069027900696, + "eval_runtime": 501.2457, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 1000 + }, + { + "epoch": 0.02912126580435363, + "grad_norm": 0.6397083401679993, + "learning_rate": 6.457178065311152e-05, + "loss": 0.3204, + "step": 1050 + }, + { + "epoch": 0.030507992747418087, + "grad_norm": 0.5808627009391785, + "learning_rate": 6.765249537892791e-05, + "loss": 0.3229, + "step": 1100 + }, + { + "epoch": 0.03189471969048255, + "grad_norm": 0.6929567456245422, + "learning_rate": 7.073321010474431e-05, + "loss": 0.3148, + "step": 1150 + }, + { + "epoch": 0.033281446633547006, + "grad_norm": 0.620298445224762, + "learning_rate": 7.38139248305607e-05, + "loss": 0.32, + "step": 1200 + }, + { + "epoch": 0.034668173576611463, + "grad_norm": 0.5947968363761902, + "learning_rate": 7.689463955637708e-05, + "loss": 0.306, + "step": 1250 + }, + { + "epoch": 0.03605490051967592, + "grad_norm": 0.6097683906555176, + "learning_rate": 7.997535428219347e-05, + "loss": 0.3179, + "step": 1300 + }, + { + "epoch": 0.03744162746274038, + "grad_norm": 0.6339348554611206, + "learning_rate": 8.305606900800986e-05, + "loss": 0.3161, + "step": 1350 + }, + { + "epoch": 0.03882835440580484, + "grad_norm": 0.5278933644294739, + "learning_rate": 8.613678373382625e-05, + "loss": 0.3153, + "step": 1400 + }, + { + "epoch": 0.040215081348869294, + "grad_norm": 0.4927423894405365, + "learning_rate": 8.921749845964264e-05, + "loss": 0.3111, + "step": 1450 + }, + { + "epoch": 0.04160180829193376, + "grad_norm": 0.4745596945285797, + "learning_rate": 9.229821318545902e-05, + "loss": 0.304, + "step": 1500 + }, + { + "epoch": 0.04298853523499822, + "grad_norm": 0.6532231569290161, + "learning_rate": 9.537892791127541e-05, + "loss": 0.3084, + "step": 1550 + }, + { + "epoch": 0.044375262178062674, + "grad_norm": 0.5528659820556641, + "learning_rate": 9.84596426370918e-05, + "loss": 0.3084, + "step": 1600 + }, + { + "epoch": 0.04576198912112713, + "grad_norm": 0.45793089270591736, + "learning_rate": 0.0001015403573629082, + "loss": 0.2964, + "step": 1650 + }, + { + "epoch": 0.04714871606419159, + "grad_norm": 0.5063529014587402, + "learning_rate": 0.00010462107208872458, + "loss": 0.2924, + "step": 1700 + }, + { + "epoch": 0.04853544300725605, + "grad_norm": 0.48600247502326965, + "learning_rate": 0.00010770178681454097, + "loss": 0.2947, + "step": 1750 + }, + { + "epoch": 0.049922169950320505, + "grad_norm": 0.4872143268585205, + "learning_rate": 0.00011078250154035737, + "loss": 0.297, + "step": 1800 + }, + { + "epoch": 0.05130889689338496, + "grad_norm": 0.5091805458068848, + "learning_rate": 0.00011386321626617376, + "loss": 0.2888, + "step": 1850 + }, + { + "epoch": 0.05269562383644943, + "grad_norm": 0.41649994254112244, + "learning_rate": 0.00011694393099199015, + "loss": 0.2871, + "step": 1900 + }, + { + "epoch": 0.054082350779513885, + "grad_norm": 0.5174862146377563, + "learning_rate": 0.00012002464571780654, + "loss": 0.2922, + "step": 1950 + }, + { + "epoch": 0.05546907772257834, + "grad_norm": 0.45786553621292114, + "learning_rate": 0.00012310536044362293, + "loss": 0.2883, + "step": 2000 + }, + { + "epoch": 0.05546907772257834, + "eval_loss": 0.28488224744796753, + "eval_runtime": 500.9558, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 2000 + }, + { + "epoch": 0.0568558046656428, + "grad_norm": 0.4992533326148987, + "learning_rate": 0.00012606284658040666, + "loss": 0.3033, + "step": 2050 + }, + { + "epoch": 0.05824253160870726, + "grad_norm": 0.4205988049507141, + "learning_rate": 0.00012914356130622304, + "loss": 0.2867, + "step": 2100 + }, + { + "epoch": 0.059629258551771716, + "grad_norm": 0.4288152754306793, + "learning_rate": 0.00013222427603203944, + "loss": 0.2795, + "step": 2150 + }, + { + "epoch": 0.061015985494836174, + "grad_norm": 0.4856145977973938, + "learning_rate": 0.00013530499075785582, + "loss": 0.2833, + "step": 2200 + }, + { + "epoch": 0.06240271243790063, + "grad_norm": 0.4891654849052429, + "learning_rate": 0.00013838570548367222, + "loss": 0.2797, + "step": 2250 + }, + { + "epoch": 0.0637894393809651, + "grad_norm": 0.39899352192878723, + "learning_rate": 0.00014146642020948863, + "loss": 0.2785, + "step": 2300 + }, + { + "epoch": 0.06517616632402955, + "grad_norm": 0.3616255819797516, + "learning_rate": 0.000144547134935305, + "loss": 0.2798, + "step": 2350 + }, + { + "epoch": 0.06656289326709401, + "grad_norm": 0.3556617498397827, + "learning_rate": 0.0001476278496611214, + "loss": 0.2811, + "step": 2400 + }, + { + "epoch": 0.06794962021015846, + "grad_norm": 0.39639297127723694, + "learning_rate": 0.00015070856438693776, + "loss": 0.2813, + "step": 2450 + }, + { + "epoch": 0.06933634715322293, + "grad_norm": 0.35177573561668396, + "learning_rate": 0.00015378927911275416, + "loss": 0.2797, + "step": 2500 + }, + { + "epoch": 0.07072307409628739, + "grad_norm": 0.38610222935676575, + "learning_rate": 0.00015686999383857054, + "loss": 0.2747, + "step": 2550 + }, + { + "epoch": 0.07210980103935184, + "grad_norm": 0.36727309226989746, + "learning_rate": 0.00015995070856438694, + "loss": 0.2776, + "step": 2600 + }, + { + "epoch": 0.07349652798241631, + "grad_norm": 0.3905107378959656, + "learning_rate": 0.00016303142329020332, + "loss": 0.2772, + "step": 2650 + }, + { + "epoch": 0.07488325492548076, + "grad_norm": 0.3958912193775177, + "learning_rate": 0.00016611213801601973, + "loss": 0.2707, + "step": 2700 + }, + { + "epoch": 0.07626998186854522, + "grad_norm": 0.4029497504234314, + "learning_rate": 0.0001691928527418361, + "loss": 0.2692, + "step": 2750 + }, + { + "epoch": 0.07765670881160967, + "grad_norm": 0.3514055907726288, + "learning_rate": 0.0001722735674676525, + "loss": 0.2759, + "step": 2800 + }, + { + "epoch": 0.07904343575467414, + "grad_norm": 0.34912553429603577, + "learning_rate": 0.00017529266789895255, + "loss": 0.2793, + "step": 2850 + }, + { + "epoch": 0.08043016269773859, + "grad_norm": 0.3493233621120453, + "learning_rate": 0.00017831176833025262, + "loss": 0.2845, + "step": 2900 + }, + { + "epoch": 0.08181688964080305, + "grad_norm": 0.30080145597457886, + "learning_rate": 0.00018139248305606902, + "loss": 0.2686, + "step": 2950 + }, + { + "epoch": 0.08320361658386752, + "grad_norm": 0.3265998959541321, + "learning_rate": 0.0001844731977818854, + "loss": 0.2695, + "step": 3000 + }, + { + "epoch": 0.08320361658386752, + "eval_loss": 0.26523345708847046, + "eval_runtime": 500.4565, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 3000 + }, + { + "epoch": 0.08459034352693197, + "grad_norm": 0.29866209626197815, + "learning_rate": 0.0001875539125077018, + "loss": 0.2679, + "step": 3050 + }, + { + "epoch": 0.08597707046999643, + "grad_norm": 0.3191625475883484, + "learning_rate": 0.00019063462723351818, + "loss": 0.267, + "step": 3100 + }, + { + "epoch": 0.08736379741306088, + "grad_norm": 0.3110339939594269, + "learning_rate": 0.00019371534195933459, + "loss": 0.2658, + "step": 3150 + }, + { + "epoch": 0.08875052435612535, + "grad_norm": 0.32120850682258606, + "learning_rate": 0.00019679605668515096, + "loss": 0.2724, + "step": 3200 + }, + { + "epoch": 0.0901372512991898, + "grad_norm": 0.28446418046951294, + "learning_rate": 0.00019987677141096734, + "loss": 0.268, + "step": 3250 + }, + { + "epoch": 0.09152397824225426, + "grad_norm": 0.2722443640232086, + "learning_rate": 0.00019999989671933422, + "loss": 0.2716, + "step": 3300 + }, + { + "epoch": 0.09291070518531871, + "grad_norm": 0.31304416060447693, + "learning_rate": 0.00019999956948482068, + "loss": 0.2631, + "step": 3350 + }, + { + "epoch": 0.09429743212838318, + "grad_norm": 0.2516928017139435, + "learning_rate": 0.00019999901811788604, + "loss": 0.2647, + "step": 3400 + }, + { + "epoch": 0.09568415907144764, + "grad_norm": 0.288006067276001, + "learning_rate": 0.00019999824261976613, + "loss": 0.263, + "step": 3450 + }, + { + "epoch": 0.0970708860145121, + "grad_norm": 0.2745107114315033, + "learning_rate": 0.00019999724299219913, + "loss": 0.2642, + "step": 3500 + }, + { + "epoch": 0.09845761295757656, + "grad_norm": 2.800987720489502, + "learning_rate": 0.00019999601923742548, + "loss": 0.7176, + "step": 3550 + }, + { + "epoch": 0.09984433990064101, + "grad_norm": 0.3590925931930542, + "learning_rate": 0.00019999457135818805, + "loss": 0.3146, + "step": 3600 + }, + { + "epoch": 0.10123106684370548, + "grad_norm": 0.32617494463920593, + "learning_rate": 0.00019999289935773202, + "loss": 0.2786, + "step": 3650 + }, + { + "epoch": 0.10261779378676993, + "grad_norm": 0.3239264488220215, + "learning_rate": 0.0001999910032398049, + "loss": 0.2807, + "step": 3700 + }, + { + "epoch": 0.10400452072983439, + "grad_norm": 0.3022274076938629, + "learning_rate": 0.00019998888300865652, + "loss": 0.2758, + "step": 3750 + }, + { + "epoch": 0.10539124767289886, + "grad_norm": 0.33024862408638, + "learning_rate": 0.000199986538669039, + "loss": 0.2687, + "step": 3800 + }, + { + "epoch": 0.1067779746159633, + "grad_norm": 0.6899451017379761, + "learning_rate": 0.00019998397022620687, + "loss": 0.2699, + "step": 3850 + }, + { + "epoch": 0.10816470155902777, + "grad_norm": 0.2794604003429413, + "learning_rate": 0.0001999811776859168, + "loss": 0.2667, + "step": 3900 + }, + { + "epoch": 0.10955142850209222, + "grad_norm": 0.2764255106449127, + "learning_rate": 0.00019997816105442778, + "loss": 0.2658, + "step": 3950 + }, + { + "epoch": 0.11093815544515669, + "grad_norm": 0.43574222922325134, + "learning_rate": 0.0001999749203385012, + "loss": 0.2664, + "step": 4000 + }, + { + "epoch": 0.11093815544515669, + "eval_loss": 0.26065966486930847, + "eval_runtime": 500.842, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 4000 + }, + { + "epoch": 0.11232488238822114, + "grad_norm": 0.5340762734413147, + "learning_rate": 0.00019997145554540046, + "loss": 0.272, + "step": 4050 + }, + { + "epoch": 0.1137116093312856, + "grad_norm": 0.32403895258903503, + "learning_rate": 0.00019996776668289136, + "loss": 0.2679, + "step": 4100 + }, + { + "epoch": 0.11509833627435005, + "grad_norm": 0.2928290367126465, + "learning_rate": 0.0001999638537592419, + "loss": 0.2624, + "step": 4150 + }, + { + "epoch": 0.11648506321741452, + "grad_norm": 0.23226021230220795, + "learning_rate": 0.00019995971678322228, + "loss": 0.2557, + "step": 4200 + }, + { + "epoch": 0.11787179016047898, + "grad_norm": 0.2748055160045624, + "learning_rate": 0.00019995535576410476, + "loss": 0.2625, + "step": 4250 + }, + { + "epoch": 0.11925851710354343, + "grad_norm": 0.2713299095630646, + "learning_rate": 0.00019995077071166385, + "loss": 0.2611, + "step": 4300 + }, + { + "epoch": 0.1206452440466079, + "grad_norm": 0.24674977362155914, + "learning_rate": 0.00019994596163617624, + "loss": 0.2647, + "step": 4350 + }, + { + "epoch": 0.12203197098967235, + "grad_norm": 0.359017014503479, + "learning_rate": 0.00019994092854842065, + "loss": 0.2601, + "step": 4400 + }, + { + "epoch": 0.12341869793273681, + "grad_norm": 0.38051414489746094, + "learning_rate": 0.00019993567145967791, + "loss": 0.253, + "step": 4450 + }, + { + "epoch": 0.12480542487580126, + "grad_norm": 0.26227161288261414, + "learning_rate": 0.0001999301903817309, + "loss": 0.2584, + "step": 4500 + }, + { + "epoch": 0.12619215181886573, + "grad_norm": 0.21259668469429016, + "learning_rate": 0.00019992448532686453, + "loss": 0.2618, + "step": 4550 + }, + { + "epoch": 0.1275788787619302, + "grad_norm": 0.23226451873779297, + "learning_rate": 0.0001999185563078658, + "loss": 0.2526, + "step": 4600 + }, + { + "epoch": 0.12896560570499466, + "grad_norm": 0.24459871649742126, + "learning_rate": 0.00019991240333802352, + "loss": 0.2523, + "step": 4650 + }, + { + "epoch": 0.1303523326480591, + "grad_norm": 0.29185208678245544, + "learning_rate": 0.00019990602643112863, + "loss": 0.2546, + "step": 4700 + }, + { + "epoch": 0.13173905959112356, + "grad_norm": 0.23443324863910675, + "learning_rate": 0.00019989942560147387, + "loss": 0.2557, + "step": 4750 + }, + { + "epoch": 0.13312578653418802, + "grad_norm": 0.22915039956569672, + "learning_rate": 0.00019989260086385394, + "loss": 0.2546, + "step": 4800 + }, + { + "epoch": 0.1345125134772525, + "grad_norm": 0.2710748016834259, + "learning_rate": 0.00019988555223356531, + "loss": 0.2619, + "step": 4850 + }, + { + "epoch": 0.13589924042031692, + "grad_norm": 0.24671098589897156, + "learning_rate": 0.00019987827972640633, + "loss": 0.2594, + "step": 4900 + }, + { + "epoch": 0.1372859673633814, + "grad_norm": 0.2359282672405243, + "learning_rate": 0.00019987078335867713, + "loss": 0.2616, + "step": 4950 + }, + { + "epoch": 0.13867269430644585, + "grad_norm": 0.2197064608335495, + "learning_rate": 0.00019986306314717956, + "loss": 0.2507, + "step": 5000 + }, + { + "epoch": 0.13867269430644585, + "eval_loss": 0.25083017349243164, + "eval_runtime": 500.7995, + "eval_samples_per_second": 5.705, + "eval_steps_per_second": 5.705, + "step": 5000 + }, + { + "epoch": 0.14005942124951032, + "grad_norm": 0.2249370515346527, + "learning_rate": 0.0001998551191092172, + "loss": 0.2574, + "step": 5050 + }, + { + "epoch": 0.14144614819257478, + "grad_norm": 0.36345556378364563, + "learning_rate": 0.0001998469512625953, + "loss": 0.2493, + "step": 5100 + }, + { + "epoch": 0.14283287513563922, + "grad_norm": 0.24807791411876678, + "learning_rate": 0.00019983855962562067, + "loss": 0.2542, + "step": 5150 + }, + { + "epoch": 0.14421960207870368, + "grad_norm": 3.6125738620758057, + "learning_rate": 0.00019982994421710186, + "loss": 0.2595, + "step": 5200 + }, + { + "epoch": 0.14560632902176815, + "grad_norm": 0.4985048472881317, + "learning_rate": 0.0001998211050563488, + "loss": 0.2558, + "step": 5250 + }, + { + "epoch": 0.14699305596483261, + "grad_norm": 0.3320443332195282, + "learning_rate": 0.00019981204216317308, + "loss": 0.2545, + "step": 5300 + }, + { + "epoch": 0.14837978290789705, + "grad_norm": 0.2081877887248993, + "learning_rate": 0.00019980275555788759, + "loss": 0.2536, + "step": 5350 + }, + { + "epoch": 0.14976650985096152, + "grad_norm": 0.27258801460266113, + "learning_rate": 0.00019979324526130676, + "loss": 0.2505, + "step": 5400 + }, + { + "epoch": 0.15115323679402598, + "grad_norm": 0.23199999332427979, + "learning_rate": 0.00019978351129474632, + "loss": 0.2556, + "step": 5450 + }, + { + "epoch": 0.15253996373709044, + "grad_norm": 0.20929445326328278, + "learning_rate": 0.00019977355368002334, + "loss": 0.2486, + "step": 5500 + }, + { + "epoch": 0.1539266906801549, + "grad_norm": 0.23551955819129944, + "learning_rate": 0.00019976337243945617, + "loss": 0.2517, + "step": 5550 + }, + { + "epoch": 0.15531341762321935, + "grad_norm": 0.30231812596321106, + "learning_rate": 0.0001997529675958644, + "loss": 0.2498, + "step": 5600 + }, + { + "epoch": 0.1567001445662838, + "grad_norm": 0.24430635571479797, + "learning_rate": 0.00019974233917256865, + "loss": 0.2523, + "step": 5650 + }, + { + "epoch": 0.15808687150934828, + "grad_norm": 6.362756252288818, + "learning_rate": 0.0001997314871933909, + "loss": 0.2529, + "step": 5700 + }, + { + "epoch": 0.15947359845241274, + "grad_norm": 0.2339017242193222, + "learning_rate": 0.00019972041168265397, + "loss": 0.2524, + "step": 5750 + }, + { + "epoch": 0.16086032539547718, + "grad_norm": 0.22503100335597992, + "learning_rate": 0.0001997091126651818, + "loss": 0.251, + "step": 5800 + }, + { + "epoch": 0.16224705233854164, + "grad_norm": 0.26495125889778137, + "learning_rate": 0.00019969759016629928, + "loss": 0.2517, + "step": 5850 + }, + { + "epoch": 0.1636337792816061, + "grad_norm": 0.25339657068252563, + "learning_rate": 0.00019968584421183212, + "loss": 0.2505, + "step": 5900 + }, + { + "epoch": 0.16502050622467057, + "grad_norm": 0.20266841351985931, + "learning_rate": 0.000199673874828107, + "loss": 0.2501, + "step": 5950 + }, + { + "epoch": 0.16640723316773504, + "grad_norm": 0.19285647571086884, + "learning_rate": 0.00019966168204195125, + "loss": 0.2445, + "step": 6000 + }, + { + "epoch": 0.16640723316773504, + "eval_loss": 0.24731825292110443, + "eval_runtime": 500.9495, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 6000 + }, + { + "epoch": 0.16779396011079947, + "grad_norm": 0.2121065855026245, + "learning_rate": 0.000199649265880693, + "loss": 0.2466, + "step": 6050 + }, + { + "epoch": 0.16918068705386394, + "grad_norm": 0.2560518980026245, + "learning_rate": 0.000199636626372161, + "loss": 0.2572, + "step": 6100 + }, + { + "epoch": 0.1705674139969284, + "grad_norm": 0.22927352786064148, + "learning_rate": 0.00019962376354468466, + "loss": 0.2509, + "step": 6150 + }, + { + "epoch": 0.17195414093999287, + "grad_norm": 0.2201690673828125, + "learning_rate": 0.00019961067742709377, + "loss": 0.2501, + "step": 6200 + }, + { + "epoch": 0.1733408678830573, + "grad_norm": 0.23233374953269958, + "learning_rate": 0.0001995973680487188, + "loss": 0.2525, + "step": 6250 + }, + { + "epoch": 0.17472759482612177, + "grad_norm": 0.254256933927536, + "learning_rate": 0.00019958383543939041, + "loss": 0.2499, + "step": 6300 + }, + { + "epoch": 0.17611432176918623, + "grad_norm": 0.1754632294178009, + "learning_rate": 0.00019957007962943975, + "loss": 0.251, + "step": 6350 + }, + { + "epoch": 0.1775010487122507, + "grad_norm": 0.23628771305084229, + "learning_rate": 0.00019955610064969817, + "loss": 0.256, + "step": 6400 + }, + { + "epoch": 0.17888777565531516, + "grad_norm": 0.23698653280735016, + "learning_rate": 0.00019954189853149725, + "loss": 0.2474, + "step": 6450 + }, + { + "epoch": 0.1802745025983796, + "grad_norm": 0.27713823318481445, + "learning_rate": 0.00019952747330666867, + "loss": 0.2481, + "step": 6500 + }, + { + "epoch": 0.18166122954144406, + "grad_norm": 0.1710810512304306, + "learning_rate": 0.00019951282500754413, + "loss": 0.2564, + "step": 6550 + }, + { + "epoch": 0.18304795648450853, + "grad_norm": 0.21406157314777374, + "learning_rate": 0.00019949795366695544, + "loss": 0.2517, + "step": 6600 + }, + { + "epoch": 0.184434683427573, + "grad_norm": 0.20108449459075928, + "learning_rate": 0.00019948285931823415, + "loss": 0.2518, + "step": 6650 + }, + { + "epoch": 0.18582141037063743, + "grad_norm": 5.1352715492248535, + "learning_rate": 0.0001994675419952118, + "loss": 0.2546, + "step": 6700 + }, + { + "epoch": 0.1872081373137019, + "grad_norm": 0.22743810713291168, + "learning_rate": 0.00019945200173221962, + "loss": 0.2457, + "step": 6750 + }, + { + "epoch": 0.18859486425676636, + "grad_norm": 0.20475907623767853, + "learning_rate": 0.0001994362385640885, + "loss": 0.2529, + "step": 6800 + }, + { + "epoch": 0.18998159119983082, + "grad_norm": 0.22172316908836365, + "learning_rate": 0.000199420252526149, + "loss": 0.2554, + "step": 6850 + }, + { + "epoch": 0.1913683181428953, + "grad_norm": 2.967470407485962, + "learning_rate": 0.0001994040436542311, + "loss": 0.2555, + "step": 6900 + }, + { + "epoch": 0.19275504508595973, + "grad_norm": 0.23698735237121582, + "learning_rate": 0.00019938761198466437, + "loss": 0.2619, + "step": 6950 + }, + { + "epoch": 0.1941417720290242, + "grad_norm": 0.17891797423362732, + "learning_rate": 0.0001993709575542776, + "loss": 0.2464, + "step": 7000 + }, + { + "epoch": 0.1941417720290242, + "eval_loss": 0.24410127103328705, + "eval_runtime": 500.8833, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 7000 + }, + { + "epoch": 0.19552849897208865, + "grad_norm": 0.21030811965465546, + "learning_rate": 0.00019935408040039901, + "loss": 0.2517, + "step": 7050 + }, + { + "epoch": 0.19691522591515312, + "grad_norm": 0.1913098245859146, + "learning_rate": 0.00019933698056085586, + "loss": 0.249, + "step": 7100 + }, + { + "epoch": 0.19830195285821758, + "grad_norm": 0.2044433057308197, + "learning_rate": 0.00019931965807397465, + "loss": 0.2496, + "step": 7150 + }, + { + "epoch": 0.19968867980128202, + "grad_norm": 0.18698015809059143, + "learning_rate": 0.00019930211297858078, + "loss": 0.2537, + "step": 7200 + }, + { + "epoch": 0.20107540674434649, + "grad_norm": 0.22580522298812866, + "learning_rate": 0.00019928434531399876, + "loss": 0.2456, + "step": 7250 + }, + { + "epoch": 0.20246213368741095, + "grad_norm": 0.1749202162027359, + "learning_rate": 0.00019926635512005183, + "loss": 0.2504, + "step": 7300 + }, + { + "epoch": 0.20384886063047541, + "grad_norm": 0.2123364359140396, + "learning_rate": 0.00019924814243706197, + "loss": 0.2477, + "step": 7350 + }, + { + "epoch": 0.20523558757353985, + "grad_norm": 0.2234705090522766, + "learning_rate": 0.00019922970730584997, + "loss": 0.2457, + "step": 7400 + }, + { + "epoch": 0.20662231451660432, + "grad_norm": 0.20742256939411163, + "learning_rate": 0.00019921104976773505, + "loss": 0.249, + "step": 7450 + }, + { + "epoch": 0.20800904145966878, + "grad_norm": 0.18315458297729492, + "learning_rate": 0.000199192169864535, + "loss": 0.2459, + "step": 7500 + }, + { + "epoch": 0.20939576840273325, + "grad_norm": 0.19357183575630188, + "learning_rate": 0.000199173067638566, + "loss": 0.2439, + "step": 7550 + }, + { + "epoch": 0.2107824953457977, + "grad_norm": 0.2398926168680191, + "learning_rate": 0.00019915374313264248, + "loss": 0.2497, + "step": 7600 + }, + { + "epoch": 0.21216922228886215, + "grad_norm": 0.20313721895217896, + "learning_rate": 0.00019913419639007714, + "loss": 0.2447, + "step": 7650 + }, + { + "epoch": 0.2135559492319266, + "grad_norm": 0.17255066335201263, + "learning_rate": 0.00019911442745468075, + "loss": 0.2447, + "step": 7700 + }, + { + "epoch": 0.21494267617499108, + "grad_norm": 0.19140756130218506, + "learning_rate": 0.0001990944363707621, + "loss": 0.2383, + "step": 7750 + }, + { + "epoch": 0.21632940311805554, + "grad_norm": 0.15212053060531616, + "learning_rate": 0.00019907422318312783, + "loss": 0.2485, + "step": 7800 + }, + { + "epoch": 0.21771613006111998, + "grad_norm": 0.1841588169336319, + "learning_rate": 0.0001990537879370825, + "loss": 0.2432, + "step": 7850 + }, + { + "epoch": 0.21910285700418444, + "grad_norm": 0.2013355791568756, + "learning_rate": 0.00019903313067842833, + "loss": 0.2431, + "step": 7900 + }, + { + "epoch": 0.2204895839472489, + "grad_norm": 0.17149454355239868, + "learning_rate": 0.0001990122514534651, + "loss": 0.247, + "step": 7950 + }, + { + "epoch": 0.22187631089031337, + "grad_norm": 0.24272453784942627, + "learning_rate": 0.00019899115030899014, + "loss": 0.2468, + "step": 8000 + }, + { + "epoch": 0.22187631089031337, + "eval_loss": 0.24099861085414886, + "eval_runtime": 501.2129, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 8000 + }, + { + "epoch": 0.22326303783337784, + "grad_norm": 0.2419915497303009, + "learning_rate": 0.00019896982729229813, + "loss": 0.2454, + "step": 8050 + }, + { + "epoch": 0.22464976477644227, + "grad_norm": 0.16482336819171906, + "learning_rate": 0.0001989482824511811, + "loss": 0.2423, + "step": 8100 + }, + { + "epoch": 0.22603649171950674, + "grad_norm": 0.22351431846618652, + "learning_rate": 0.00019892651583392824, + "loss": 0.2501, + "step": 8150 + }, + { + "epoch": 0.2274232186625712, + "grad_norm": 0.19319549202919006, + "learning_rate": 0.0001989045274893258, + "loss": 0.2452, + "step": 8200 + }, + { + "epoch": 0.22880994560563567, + "grad_norm": 0.15613292157649994, + "learning_rate": 0.00019888231746665696, + "loss": 0.2428, + "step": 8250 + }, + { + "epoch": 0.2301966725487001, + "grad_norm": 0.18092665076255798, + "learning_rate": 0.00019885988581570184, + "loss": 0.2448, + "step": 8300 + }, + { + "epoch": 0.23158339949176457, + "grad_norm": 0.18928927183151245, + "learning_rate": 0.00019883723258673724, + "loss": 0.2493, + "step": 8350 + }, + { + "epoch": 0.23297012643482903, + "grad_norm": 0.19816988706588745, + "learning_rate": 0.0001988143578305366, + "loss": 0.2465, + "step": 8400 + }, + { + "epoch": 0.2343568533778935, + "grad_norm": 0.19853706657886505, + "learning_rate": 0.00019879126159836992, + "loss": 0.2443, + "step": 8450 + }, + { + "epoch": 0.23574358032095796, + "grad_norm": 0.17544203996658325, + "learning_rate": 0.00019876794394200353, + "loss": 0.2429, + "step": 8500 + }, + { + "epoch": 0.2371303072640224, + "grad_norm": 0.16583149135112762, + "learning_rate": 0.0001987444049137001, + "loss": 0.244, + "step": 8550 + }, + { + "epoch": 0.23851703420708686, + "grad_norm": 0.18239592015743256, + "learning_rate": 0.00019872064456621848, + "loss": 0.2447, + "step": 8600 + }, + { + "epoch": 0.23990376115015133, + "grad_norm": 0.15820704400539398, + "learning_rate": 0.0001986966629528135, + "loss": 0.2469, + "step": 8650 + }, + { + "epoch": 0.2412904880932158, + "grad_norm": 0.18477188050746918, + "learning_rate": 0.00019867246012723598, + "loss": 0.2407, + "step": 8700 + }, + { + "epoch": 0.24267721503628023, + "grad_norm": 0.1676979809999466, + "learning_rate": 0.0001986480361437325, + "loss": 0.2448, + "step": 8750 + }, + { + "epoch": 0.2440639419793447, + "grad_norm": 0.2173600196838379, + "learning_rate": 0.00019862339105704543, + "loss": 0.2409, + "step": 8800 + }, + { + "epoch": 0.24545066892240916, + "grad_norm": 0.17326687276363373, + "learning_rate": 0.00019859852492241256, + "loss": 0.2387, + "step": 8850 + }, + { + "epoch": 0.24683739586547362, + "grad_norm": 0.16229301691055298, + "learning_rate": 0.00019857343779556725, + "loss": 0.2467, + "step": 8900 + }, + { + "epoch": 0.2482241228085381, + "grad_norm": 0.21166543662548065, + "learning_rate": 0.0001985481297327381, + "loss": 0.2507, + "step": 8950 + }, + { + "epoch": 0.24961084975160253, + "grad_norm": 0.17892777919769287, + "learning_rate": 0.00019852260079064894, + "loss": 0.2416, + "step": 9000 + }, + { + "epoch": 0.24961084975160253, + "eval_loss": 0.23973840475082397, + "eval_runtime": 500.5349, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 9000 + }, + { + "epoch": 0.250997576694667, + "grad_norm": 0.20435132086277008, + "learning_rate": 0.00019849685102651867, + "loss": 0.2385, + "step": 9050 + }, + { + "epoch": 0.25238430363773146, + "grad_norm": 0.1890842318534851, + "learning_rate": 0.0001984708804980611, + "loss": 0.2416, + "step": 9100 + }, + { + "epoch": 0.2537710305807959, + "grad_norm": 0.18390174210071564, + "learning_rate": 0.00019844468926348482, + "loss": 0.2469, + "step": 9150 + }, + { + "epoch": 0.2551577575238604, + "grad_norm": 0.23599492013454437, + "learning_rate": 0.00019841827738149314, + "loss": 0.2417, + "step": 9200 + }, + { + "epoch": 0.25654448446692485, + "grad_norm": 0.1522965133190155, + "learning_rate": 0.00019839164491128398, + "loss": 0.2427, + "step": 9250 + }, + { + "epoch": 0.2579312114099893, + "grad_norm": 0.206534281373024, + "learning_rate": 0.00019836479191254948, + "loss": 0.2452, + "step": 9300 + }, + { + "epoch": 0.2593179383530537, + "grad_norm": 0.18928374350070953, + "learning_rate": 0.00019833771844547627, + "loss": 0.244, + "step": 9350 + }, + { + "epoch": 0.2607046652961182, + "grad_norm": 0.17130087316036224, + "learning_rate": 0.00019831042457074498, + "loss": 0.2488, + "step": 9400 + }, + { + "epoch": 0.26209139223918265, + "grad_norm": 0.17631781101226807, + "learning_rate": 0.00019828291034953033, + "loss": 0.2441, + "step": 9450 + }, + { + "epoch": 0.2634781191822471, + "grad_norm": 0.1852494180202484, + "learning_rate": 0.00019825517584350083, + "loss": 0.2414, + "step": 9500 + }, + { + "epoch": 0.2648648461253116, + "grad_norm": 0.21513506770133972, + "learning_rate": 0.0001982272211148188, + "loss": 0.2412, + "step": 9550 + }, + { + "epoch": 0.26625157306837605, + "grad_norm": 0.18172813951969147, + "learning_rate": 0.0001981990462261401, + "loss": 0.2435, + "step": 9600 + }, + { + "epoch": 0.2676383000114405, + "grad_norm": 0.1561124324798584, + "learning_rate": 0.00019817065124061407, + "loss": 0.238, + "step": 9650 + }, + { + "epoch": 0.269025026954505, + "grad_norm": 0.16663338243961334, + "learning_rate": 0.00019814203622188338, + "loss": 0.2383, + "step": 9700 + }, + { + "epoch": 0.27041175389756944, + "grad_norm": 0.17735238373279572, + "learning_rate": 0.0001981132012340838, + "loss": 0.2459, + "step": 9750 + }, + { + "epoch": 0.27179848084063385, + "grad_norm": 0.21334126591682434, + "learning_rate": 0.00019808414634184417, + "loss": 0.2425, + "step": 9800 + }, + { + "epoch": 0.2731852077836983, + "grad_norm": 0.16817434132099152, + "learning_rate": 0.00019805487161028625, + "loss": 0.2361, + "step": 9850 + }, + { + "epoch": 0.2745719347267628, + "grad_norm": 0.17149919271469116, + "learning_rate": 0.00019802537710502443, + "loss": 0.2431, + "step": 9900 + }, + { + "epoch": 0.27595866166982724, + "grad_norm": 0.1521356999874115, + "learning_rate": 0.00019799566289216576, + "loss": 0.2411, + "step": 9950 + }, + { + "epoch": 0.2773453886128917, + "grad_norm": 0.15583455562591553, + "learning_rate": 0.00019796572903830974, + "loss": 0.2388, + "step": 10000 + }, + { + "epoch": 0.2773453886128917, + "eval_loss": 0.23783154785633087, + "eval_runtime": 501.3932, + "eval_samples_per_second": 5.698, + "eval_steps_per_second": 5.698, + "step": 10000 + }, + { + "epoch": 0.2787321155559562, + "grad_norm": 0.15069644153118134, + "learning_rate": 0.00019793557561054807, + "loss": 0.245, + "step": 10050 + }, + { + "epoch": 0.28011884249902064, + "grad_norm": 0.16481320559978485, + "learning_rate": 0.0001979052026764647, + "loss": 0.2403, + "step": 10100 + }, + { + "epoch": 0.2815055694420851, + "grad_norm": 0.16549484431743622, + "learning_rate": 0.00019787461030413553, + "loss": 0.2404, + "step": 10150 + }, + { + "epoch": 0.28289229638514957, + "grad_norm": 0.1722942292690277, + "learning_rate": 0.0001978437985621282, + "loss": 0.2407, + "step": 10200 + }, + { + "epoch": 0.284279023328214, + "grad_norm": 1.554700255393982, + "learning_rate": 0.0001978127675195022, + "loss": 0.2423, + "step": 10250 + }, + { + "epoch": 0.28566575027127844, + "grad_norm": 0.18697640299797058, + "learning_rate": 0.0001977815172458084, + "loss": 0.2458, + "step": 10300 + }, + { + "epoch": 0.2870524772143429, + "grad_norm": 0.19721738994121552, + "learning_rate": 0.00019775004781108914, + "loss": 0.2423, + "step": 10350 + }, + { + "epoch": 0.28843920415740737, + "grad_norm": 0.13843601942062378, + "learning_rate": 0.00019771835928587787, + "loss": 0.249, + "step": 10400 + }, + { + "epoch": 0.28982593110047183, + "grad_norm": 0.19530989229679108, + "learning_rate": 0.0001976864517411992, + "loss": 0.2438, + "step": 10450 + }, + { + "epoch": 0.2912126580435363, + "grad_norm": 0.14896182715892792, + "learning_rate": 0.0001976543252485686, + "loss": 0.2392, + "step": 10500 + }, + { + "epoch": 0.29259938498660076, + "grad_norm": 0.1485060602426529, + "learning_rate": 0.00019762197987999223, + "loss": 0.2371, + "step": 10550 + }, + { + "epoch": 0.29398611192966523, + "grad_norm": 0.20084735751152039, + "learning_rate": 0.00019758941570796688, + "loss": 0.2461, + "step": 10600 + }, + { + "epoch": 0.2953728388727297, + "grad_norm": 0.1450163722038269, + "learning_rate": 0.0001975566328054797, + "loss": 0.2379, + "step": 10650 + }, + { + "epoch": 0.2967595658157941, + "grad_norm": 0.14225760102272034, + "learning_rate": 0.00019752363124600817, + "loss": 0.2465, + "step": 10700 + }, + { + "epoch": 0.29814629275885857, + "grad_norm": 0.182630255818367, + "learning_rate": 0.00019749041110351975, + "loss": 0.2382, + "step": 10750 + }, + { + "epoch": 0.29953301970192303, + "grad_norm": 0.18140457570552826, + "learning_rate": 0.00019745697245247194, + "loss": 0.2394, + "step": 10800 + }, + { + "epoch": 0.3009197466449875, + "grad_norm": 0.1756162941455841, + "learning_rate": 0.00019742331536781187, + "loss": 0.2377, + "step": 10850 + }, + { + "epoch": 0.30230647358805196, + "grad_norm": 0.14414621889591217, + "learning_rate": 0.0001973894399249763, + "loss": 0.2408, + "step": 10900 + }, + { + "epoch": 0.3036932005311164, + "grad_norm": 0.1697167605161667, + "learning_rate": 0.00019735534619989142, + "loss": 0.2442, + "step": 10950 + }, + { + "epoch": 0.3050799274741809, + "grad_norm": 0.15641078352928162, + "learning_rate": 0.00019732103426897265, + "loss": 0.2421, + "step": 11000 + }, + { + "epoch": 0.3050799274741809, + "eval_loss": 0.23684217035770416, + "eval_runtime": 500.474, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 11000 + }, + { + "epoch": 0.30646665441724535, + "grad_norm": 0.190172016620636, + "learning_rate": 0.00019728650420912448, + "loss": 0.2475, + "step": 11050 + }, + { + "epoch": 0.3078533813603098, + "grad_norm": 0.16632623970508575, + "learning_rate": 0.0001972517560977403, + "loss": 0.2426, + "step": 11100 + }, + { + "epoch": 0.30924010830337423, + "grad_norm": 0.16913548111915588, + "learning_rate": 0.00019721679001270226, + "loss": 0.2386, + "step": 11150 + }, + { + "epoch": 0.3106268352464387, + "grad_norm": 0.16081750392913818, + "learning_rate": 0.00019718160603238096, + "loss": 0.2358, + "step": 11200 + }, + { + "epoch": 0.31201356218950316, + "grad_norm": 0.19061852991580963, + "learning_rate": 0.00019714620423563552, + "loss": 0.238, + "step": 11250 + }, + { + "epoch": 0.3134002891325676, + "grad_norm": 0.16220314800739288, + "learning_rate": 0.00019711058470181316, + "loss": 0.2428, + "step": 11300 + }, + { + "epoch": 0.3147870160756321, + "grad_norm": 0.20064842700958252, + "learning_rate": 0.00019707474751074915, + "loss": 0.2393, + "step": 11350 + }, + { + "epoch": 0.31617374301869655, + "grad_norm": 0.14250491559505463, + "learning_rate": 0.00019703869274276657, + "loss": 0.2376, + "step": 11400 + }, + { + "epoch": 0.317560469961761, + "grad_norm": 0.18501660227775574, + "learning_rate": 0.00019700242047867623, + "loss": 0.2405, + "step": 11450 + }, + { + "epoch": 0.3189471969048255, + "grad_norm": 0.1680876910686493, + "learning_rate": 0.00019696593079977635, + "loss": 0.241, + "step": 11500 + }, + { + "epoch": 0.32033392384788995, + "grad_norm": 0.15119992196559906, + "learning_rate": 0.00019692922378785252, + "loss": 0.2371, + "step": 11550 + }, + { + "epoch": 0.32172065079095435, + "grad_norm": 0.15388673543930054, + "learning_rate": 0.0001968922995251774, + "loss": 0.2425, + "step": 11600 + }, + { + "epoch": 0.3231073777340188, + "grad_norm": 0.19946704804897308, + "learning_rate": 0.00019685515809451056, + "loss": 0.2476, + "step": 11650 + }, + { + "epoch": 0.3244941046770833, + "grad_norm": 0.17677927017211914, + "learning_rate": 0.0001968177995790984, + "loss": 0.2432, + "step": 11700 + }, + { + "epoch": 0.32588083162014775, + "grad_norm": 0.18418142199516296, + "learning_rate": 0.00019678022406267374, + "loss": 0.2387, + "step": 11750 + }, + { + "epoch": 0.3272675585632122, + "grad_norm": 0.1462264358997345, + "learning_rate": 0.00019674243162945594, + "loss": 0.2377, + "step": 11800 + }, + { + "epoch": 0.3286542855062767, + "grad_norm": 0.14166492223739624, + "learning_rate": 0.0001967044223641504, + "loss": 0.238, + "step": 11850 + }, + { + "epoch": 0.33004101244934114, + "grad_norm": 0.17436008155345917, + "learning_rate": 0.00019666619635194866, + "loss": 0.2429, + "step": 11900 + }, + { + "epoch": 0.3314277393924056, + "grad_norm": 0.15779553353786469, + "learning_rate": 0.00019662775367852787, + "loss": 0.2404, + "step": 11950 + }, + { + "epoch": 0.33281446633547007, + "grad_norm": 0.17796078324317932, + "learning_rate": 0.000196589094430051, + "loss": 0.235, + "step": 12000 + }, + { + "epoch": 0.33281446633547007, + "eval_loss": 0.235828697681427, + "eval_runtime": 500.6046, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 12000 + }, + { + "epoch": 0.3342011932785345, + "grad_norm": 0.14978894591331482, + "learning_rate": 0.0001965502186931662, + "loss": 0.2419, + "step": 12050 + }, + { + "epoch": 0.33558792022159895, + "grad_norm": 0.17456893622875214, + "learning_rate": 0.00019651112655500713, + "loss": 0.2389, + "step": 12100 + }, + { + "epoch": 0.3369746471646634, + "grad_norm": 0.1462843269109726, + "learning_rate": 0.0001964718181031922, + "loss": 0.2363, + "step": 12150 + }, + { + "epoch": 0.3383613741077279, + "grad_norm": 0.16996078193187714, + "learning_rate": 0.0001964322934258248, + "loss": 0.2404, + "step": 12200 + }, + { + "epoch": 0.33974810105079234, + "grad_norm": 0.1906641721725464, + "learning_rate": 0.00019639255261149298, + "loss": 0.2394, + "step": 12250 + }, + { + "epoch": 0.3411348279938568, + "grad_norm": 0.15007531642913818, + "learning_rate": 0.00019635259574926912, + "loss": 0.2371, + "step": 12300 + }, + { + "epoch": 0.34252155493692127, + "grad_norm": 0.18667016923427582, + "learning_rate": 0.00019631242292870993, + "loss": 0.24, + "step": 12350 + }, + { + "epoch": 0.34390828187998573, + "grad_norm": 0.1689510941505432, + "learning_rate": 0.0001962720342398561, + "loss": 0.2359, + "step": 12400 + }, + { + "epoch": 0.3452950088230502, + "grad_norm": 0.1622210294008255, + "learning_rate": 0.0001962314297732321, + "loss": 0.2405, + "step": 12450 + }, + { + "epoch": 0.3466817357661146, + "grad_norm": 0.20153377950191498, + "learning_rate": 0.0001961906096198462, + "loss": 0.2368, + "step": 12500 + }, + { + "epoch": 0.34806846270917907, + "grad_norm": 0.1634126603603363, + "learning_rate": 0.00019614957387118994, + "loss": 0.236, + "step": 12550 + }, + { + "epoch": 0.34945518965224354, + "grad_norm": 0.21276158094406128, + "learning_rate": 0.00019610832261923817, + "loss": 0.2397, + "step": 12600 + }, + { + "epoch": 0.350841916595308, + "grad_norm": 0.16108940541744232, + "learning_rate": 0.00019606685595644865, + "loss": 0.2424, + "step": 12650 + }, + { + "epoch": 0.35222864353837247, + "grad_norm": 0.20505978167057037, + "learning_rate": 0.00019602517397576205, + "loss": 0.2423, + "step": 12700 + }, + { + "epoch": 0.35361537048143693, + "grad_norm": 0.1431368589401245, + "learning_rate": 0.0001959832767706016, + "loss": 0.2353, + "step": 12750 + }, + { + "epoch": 0.3550020974245014, + "grad_norm": 0.1670791357755661, + "learning_rate": 0.00019594116443487293, + "loss": 0.2366, + "step": 12800 + }, + { + "epoch": 0.35638882436756586, + "grad_norm": 0.1353309154510498, + "learning_rate": 0.00019589883706296385, + "loss": 0.2387, + "step": 12850 + }, + { + "epoch": 0.3577755513106303, + "grad_norm": 0.16561363637447357, + "learning_rate": 0.00019585629474974415, + "loss": 0.2373, + "step": 12900 + }, + { + "epoch": 0.35916227825369473, + "grad_norm": 0.16978101432323456, + "learning_rate": 0.00019581353759056528, + "loss": 0.2383, + "step": 12950 + }, + { + "epoch": 0.3605490051967592, + "grad_norm": 0.13398033380508423, + "learning_rate": 0.0001957705656812604, + "loss": 0.2389, + "step": 13000 + }, + { + "epoch": 0.3605490051967592, + "eval_loss": 0.2349192500114441, + "eval_runtime": 500.9767, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 13000 + }, + { + "epoch": 0.36193573213982366, + "grad_norm": 0.17141664028167725, + "learning_rate": 0.00019572737911814387, + "loss": 0.2379, + "step": 13050 + }, + { + "epoch": 0.3633224590828881, + "grad_norm": 0.25635290145874023, + "learning_rate": 0.00019568397799801118, + "loss": 0.2354, + "step": 13100 + }, + { + "epoch": 0.3647091860259526, + "grad_norm": 0.19244590401649475, + "learning_rate": 0.00019564036241813876, + "loss": 0.2372, + "step": 13150 + }, + { + "epoch": 0.36609591296901706, + "grad_norm": 0.1587456613779068, + "learning_rate": 0.00019559653247628364, + "loss": 0.2399, + "step": 13200 + }, + { + "epoch": 0.3674826399120815, + "grad_norm": 0.22146746516227722, + "learning_rate": 0.0001955524882706834, + "loss": 0.2356, + "step": 13250 + }, + { + "epoch": 0.368869366855146, + "grad_norm": 0.21101641654968262, + "learning_rate": 0.0001955082299000558, + "loss": 0.2425, + "step": 13300 + }, + { + "epoch": 0.37025609379821045, + "grad_norm": 0.16459371149539948, + "learning_rate": 0.0001954637574635986, + "loss": 0.239, + "step": 13350 + }, + { + "epoch": 0.37164282074127486, + "grad_norm": 0.15547959506511688, + "learning_rate": 0.0001954190710609894, + "loss": 0.2358, + "step": 13400 + }, + { + "epoch": 0.3730295476843393, + "grad_norm": 0.1342894285917282, + "learning_rate": 0.00019537417079238534, + "loss": 0.2363, + "step": 13450 + }, + { + "epoch": 0.3744162746274038, + "grad_norm": 0.14169098436832428, + "learning_rate": 0.0001953290567584229, + "loss": 0.2355, + "step": 13500 + }, + { + "epoch": 0.37580300157046825, + "grad_norm": 0.17943793535232544, + "learning_rate": 0.00019528372906021772, + "loss": 0.2354, + "step": 13550 + }, + { + "epoch": 0.3771897285135327, + "grad_norm": 0.20254671573638916, + "learning_rate": 0.0001952381877993643, + "loss": 0.2411, + "step": 13600 + }, + { + "epoch": 0.3785764554565972, + "grad_norm": 0.1362125426530838, + "learning_rate": 0.0001951924330779358, + "loss": 0.2383, + "step": 13650 + }, + { + "epoch": 0.37996318239966165, + "grad_norm": 0.19201667606830597, + "learning_rate": 0.0001951464649984838, + "loss": 0.2398, + "step": 13700 + }, + { + "epoch": 0.3813499093427261, + "grad_norm": 0.15204668045043945, + "learning_rate": 0.0001951002836640382, + "loss": 0.2347, + "step": 13750 + }, + { + "epoch": 0.3827366362857906, + "grad_norm": 0.14426596462726593, + "learning_rate": 0.00019505388917810665, + "loss": 0.2399, + "step": 13800 + }, + { + "epoch": 0.38412336322885504, + "grad_norm": 0.1463170200586319, + "learning_rate": 0.0001950072816446748, + "loss": 0.2316, + "step": 13850 + }, + { + "epoch": 0.38551009017191945, + "grad_norm": 0.15552669763565063, + "learning_rate": 0.00019496046116820566, + "loss": 0.2354, + "step": 13900 + }, + { + "epoch": 0.3868968171149839, + "grad_norm": 0.16742919385433197, + "learning_rate": 0.00019491342785363952, + "loss": 0.2388, + "step": 13950 + }, + { + "epoch": 0.3882835440580484, + "grad_norm": 0.16111566126346588, + "learning_rate": 0.00019486618180639375, + "loss": 0.2385, + "step": 14000 + }, + { + "epoch": 0.3882835440580484, + "eval_loss": 0.23382489383220673, + "eval_runtime": 500.6533, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 14000 + }, + { + "epoch": 0.38967027100111284, + "grad_norm": 0.15741662681102753, + "learning_rate": 0.00019481872313236256, + "loss": 0.2374, + "step": 14050 + }, + { + "epoch": 0.3910569979441773, + "grad_norm": 0.15046770870685577, + "learning_rate": 0.00019477105193791664, + "loss": 0.2379, + "step": 14100 + }, + { + "epoch": 0.3924437248872418, + "grad_norm": 0.14219743013381958, + "learning_rate": 0.00019472316832990308, + "loss": 0.2434, + "step": 14150 + }, + { + "epoch": 0.39383045183030624, + "grad_norm": 0.15226851403713226, + "learning_rate": 0.000194675072415645, + "loss": 0.2427, + "step": 14200 + }, + { + "epoch": 0.3952171787733707, + "grad_norm": 0.19782114028930664, + "learning_rate": 0.00019462676430294143, + "loss": 0.2357, + "step": 14250 + }, + { + "epoch": 0.39660390571643517, + "grad_norm": 0.14243118464946747, + "learning_rate": 0.00019457824410006692, + "loss": 0.2343, + "step": 14300 + }, + { + "epoch": 0.3979906326594996, + "grad_norm": 0.22301803529262543, + "learning_rate": 0.00019452951191577155, + "loss": 0.2406, + "step": 14350 + }, + { + "epoch": 0.39937735960256404, + "grad_norm": 0.13103021681308746, + "learning_rate": 0.00019448056785928032, + "loss": 0.2398, + "step": 14400 + }, + { + "epoch": 0.4007640865456285, + "grad_norm": 0.16922806203365326, + "learning_rate": 0.00019443141204029325, + "loss": 0.2363, + "step": 14450 + }, + { + "epoch": 0.40215081348869297, + "grad_norm": 0.17801126837730408, + "learning_rate": 0.00019438204456898492, + "loss": 0.2377, + "step": 14500 + }, + { + "epoch": 0.40353754043175744, + "grad_norm": 0.14513610303401947, + "learning_rate": 0.0001943324655560043, + "loss": 0.241, + "step": 14550 + }, + { + "epoch": 0.4049242673748219, + "grad_norm": 0.14587055146694183, + "learning_rate": 0.00019428267511247457, + "loss": 0.2345, + "step": 14600 + }, + { + "epoch": 0.40631099431788636, + "grad_norm": 0.17200471460819244, + "learning_rate": 0.00019423267334999267, + "loss": 0.2345, + "step": 14650 + }, + { + "epoch": 0.40769772126095083, + "grad_norm": 0.16612234711647034, + "learning_rate": 0.00019418246038062928, + "loss": 0.235, + "step": 14700 + }, + { + "epoch": 0.4090844482040153, + "grad_norm": 0.14822156727313995, + "learning_rate": 0.00019413203631692843, + "loss": 0.2384, + "step": 14750 + }, + { + "epoch": 0.4104711751470797, + "grad_norm": 0.15960198640823364, + "learning_rate": 0.00019408140127190725, + "loss": 0.2375, + "step": 14800 + }, + { + "epoch": 0.41185790209014417, + "grad_norm": NaN, + "learning_rate": 0.00019403157434308126, + "loss": 0.233, + "step": 14850 + }, + { + "epoch": 0.41324462903320863, + "grad_norm": 0.15910230576992035, + "learning_rate": 0.00019398154500404588, + "loss": 0.2728, + "step": 14900 + }, + { + "epoch": 0.4146313559762731, + "grad_norm": 0.16004903614521027, + "learning_rate": 0.0001939302861212685, + "loss": 0.2359, + "step": 14950 + }, + { + "epoch": 0.41601808291933756, + "grad_norm": 0.1622370034456253, + "learning_rate": 0.00019387881670936035, + "loss": 0.2413, + "step": 15000 + }, + { + "epoch": 0.41601808291933756, + "eval_loss": 0.23365913331508636, + "eval_runtime": 500.916, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 15000 + }, + { + "epoch": 0.417404809862402, + "grad_norm": 0.1744803488254547, + "learning_rate": 0.00019382713688368162, + "loss": 0.2406, + "step": 15050 + }, + { + "epoch": 0.4187915368054665, + "grad_norm": 0.19140714406967163, + "learning_rate": 0.00019377524676006397, + "loss": 0.2385, + "step": 15100 + }, + { + "epoch": 0.42017826374853096, + "grad_norm": 0.14320451021194458, + "learning_rate": 0.00019372314645481052, + "loss": 0.2384, + "step": 15150 + }, + { + "epoch": 0.4215649906915954, + "grad_norm": 0.18620997667312622, + "learning_rate": 0.00019367083608469546, + "loss": 0.2343, + "step": 15200 + }, + { + "epoch": 0.42295171763465983, + "grad_norm": 0.13473859429359436, + "learning_rate": 0.00019361831576696382, + "loss": 0.2399, + "step": 15250 + }, + { + "epoch": 0.4243384445777243, + "grad_norm": 0.15213748812675476, + "learning_rate": 0.00019356558561933108, + "loss": 0.2358, + "step": 15300 + }, + { + "epoch": 0.42572517152078876, + "grad_norm": 0.16841459274291992, + "learning_rate": 0.0001935126457599832, + "loss": 0.2332, + "step": 15350 + }, + { + "epoch": 0.4271118984638532, + "grad_norm": 0.14978626370429993, + "learning_rate": 0.00019345949630757603, + "loss": 0.2382, + "step": 15400 + }, + { + "epoch": 0.4284986254069177, + "grad_norm": 0.18397267162799835, + "learning_rate": 0.00019340613738123526, + "loss": 0.2328, + "step": 15450 + }, + { + "epoch": 0.42988535234998215, + "grad_norm": 0.13535378873348236, + "learning_rate": 0.000193352569100556, + "loss": 0.2278, + "step": 15500 + }, + { + "epoch": 0.4312720792930466, + "grad_norm": 0.1288972645998001, + "learning_rate": 0.00019329879158560274, + "loss": 0.2385, + "step": 15550 + }, + { + "epoch": 0.4326588062361111, + "grad_norm": 0.1488959789276123, + "learning_rate": 0.0001932448049569088, + "loss": 0.2352, + "step": 15600 + }, + { + "epoch": 0.43404553317917555, + "grad_norm": 0.16358473896980286, + "learning_rate": 0.00019319060933547624, + "loss": 0.2362, + "step": 15650 + }, + { + "epoch": 0.43543226012223996, + "grad_norm": 0.13347339630126953, + "learning_rate": 0.00019313620484277553, + "loss": 0.2376, + "step": 15700 + }, + { + "epoch": 0.4368189870653044, + "grad_norm": 0.13555756211280823, + "learning_rate": 0.0001930815916007453, + "loss": 0.2308, + "step": 15750 + }, + { + "epoch": 0.4382057140083689, + "grad_norm": 0.13955436646938324, + "learning_rate": 0.0001930267697317921, + "loss": 0.2329, + "step": 15800 + }, + { + "epoch": 0.43959244095143335, + "grad_norm": 0.1596931517124176, + "learning_rate": 0.00019297173935879, + "loss": 0.2322, + "step": 15850 + }, + { + "epoch": 0.4409791678944978, + "grad_norm": 0.14860297739505768, + "learning_rate": 0.00019291650060508045, + "loss": 0.234, + "step": 15900 + }, + { + "epoch": 0.4423658948375623, + "grad_norm": 0.14575625956058502, + "learning_rate": 0.00019286105359447194, + "loss": 0.2362, + "step": 15950 + }, + { + "epoch": 0.44375262178062674, + "grad_norm": 0.1400967240333557, + "learning_rate": 0.00019280539845123974, + "loss": 0.2358, + "step": 16000 + }, + { + "epoch": 0.44375262178062674, + "eval_loss": 0.23256094753742218, + "eval_runtime": 500.6637, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 16000 + }, + { + "epoch": 0.4451393487236912, + "grad_norm": 0.2537101209163666, + "learning_rate": 0.00019274953530012563, + "loss": 0.2363, + "step": 16050 + }, + { + "epoch": 0.4465260756667557, + "grad_norm": 0.192925825715065, + "learning_rate": 0.0001926934642663375, + "loss": 0.2343, + "step": 16100 + }, + { + "epoch": 0.4479128026098201, + "grad_norm": 0.17011120915412903, + "learning_rate": 0.0001926371854755493, + "loss": 0.2362, + "step": 16150 + }, + { + "epoch": 0.44929952955288455, + "grad_norm": 0.1474524289369583, + "learning_rate": 0.00019258069905390065, + "loss": 0.2359, + "step": 16200 + }, + { + "epoch": 0.450686256495949, + "grad_norm": 0.15591026842594147, + "learning_rate": 0.00019252400512799643, + "loss": 0.2338, + "step": 16250 + }, + { + "epoch": 0.4520729834390135, + "grad_norm": 0.14443908631801605, + "learning_rate": 0.00019246710382490664, + "loss": 0.2421, + "step": 16300 + }, + { + "epoch": 0.45345971038207794, + "grad_norm": 0.12614597380161285, + "learning_rate": 0.00019240999527216608, + "loss": 0.2373, + "step": 16350 + }, + { + "epoch": 0.4548464373251424, + "grad_norm": 0.1438266485929489, + "learning_rate": 0.00019235267959777415, + "loss": 0.2443, + "step": 16400 + }, + { + "epoch": 0.45623316426820687, + "grad_norm": 0.14473649859428406, + "learning_rate": 0.00019229515693019436, + "loss": 0.241, + "step": 16450 + }, + { + "epoch": 0.45761989121127133, + "grad_norm": 0.13498128950595856, + "learning_rate": 0.00019223742739835423, + "loss": 0.2393, + "step": 16500 + }, + { + "epoch": 0.4590066181543358, + "grad_norm": 0.14498169720172882, + "learning_rate": 0.0001921794911316449, + "loss": 0.2363, + "step": 16550 + }, + { + "epoch": 0.4603933450974002, + "grad_norm": 0.14319288730621338, + "learning_rate": 0.00019212134825992091, + "loss": 0.2359, + "step": 16600 + }, + { + "epoch": 0.4617800720404647, + "grad_norm": 0.12314629554748535, + "learning_rate": 0.00019206299891349983, + "loss": 0.23, + "step": 16650 + }, + { + "epoch": 0.46316679898352914, + "grad_norm": 0.14780518412590027, + "learning_rate": 0.00019200444322316207, + "loss": 0.2381, + "step": 16700 + }, + { + "epoch": 0.4645535259265936, + "grad_norm": 0.1493334025144577, + "learning_rate": 0.0001919456813201504, + "loss": 0.2345, + "step": 16750 + }, + { + "epoch": 0.46594025286965807, + "grad_norm": 0.11972863227128983, + "learning_rate": 0.00019188671333616992, + "loss": 0.235, + "step": 16800 + }, + { + "epoch": 0.46732697981272253, + "grad_norm": 0.13366112112998962, + "learning_rate": 0.00019182753940338753, + "loss": 0.2306, + "step": 16850 + }, + { + "epoch": 0.468713706755787, + "grad_norm": 0.13790684938430786, + "learning_rate": 0.00019176815965443186, + "loss": 0.2366, + "step": 16900 + }, + { + "epoch": 0.47010043369885146, + "grad_norm": 0.14081595838069916, + "learning_rate": 0.0001917085742223926, + "loss": 0.2368, + "step": 16950 + }, + { + "epoch": 0.4714871606419159, + "grad_norm": 0.13987073302268982, + "learning_rate": 0.00019164878324082074, + "loss": 0.2337, + "step": 17000 + }, + { + "epoch": 0.4714871606419159, + "eval_loss": 0.2317454218864441, + "eval_runtime": 500.9301, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 17000 + }, + { + "epoch": 0.47287388758498033, + "grad_norm": 0.1430695503950119, + "learning_rate": 0.00019158878684372778, + "loss": 0.2346, + "step": 17050 + }, + { + "epoch": 0.4742606145280448, + "grad_norm": 0.14264121651649475, + "learning_rate": 0.00019152858516558564, + "loss": 0.2339, + "step": 17100 + }, + { + "epoch": 0.47564734147110926, + "grad_norm": 0.15278013050556183, + "learning_rate": 0.00019146817834132644, + "loss": 0.2333, + "step": 17150 + }, + { + "epoch": 0.47703406841417373, + "grad_norm": 0.15283286571502686, + "learning_rate": 0.000191407566506342, + "loss": 0.2323, + "step": 17200 + }, + { + "epoch": 0.4784207953572382, + "grad_norm": 0.13433212041854858, + "learning_rate": 0.00019134674979648367, + "loss": 0.2406, + "step": 17250 + }, + { + "epoch": 0.47980752230030266, + "grad_norm": 0.14129064977169037, + "learning_rate": 0.00019128572834806203, + "loss": 0.2353, + "step": 17300 + }, + { + "epoch": 0.4811942492433671, + "grad_norm": 0.14736846089363098, + "learning_rate": 0.00019122450229784653, + "loss": 0.2312, + "step": 17350 + }, + { + "epoch": 0.4825809761864316, + "grad_norm": 0.14513076841831207, + "learning_rate": 0.00019116307178306514, + "loss": 0.2358, + "step": 17400 + }, + { + "epoch": 0.48396770312949605, + "grad_norm": 0.14358818531036377, + "learning_rate": 0.0001911014369414042, + "loss": 0.2376, + "step": 17450 + }, + { + "epoch": 0.48535443007256046, + "grad_norm": 0.14574295282363892, + "learning_rate": 0.00019103959791100792, + "loss": 0.2306, + "step": 17500 + }, + { + "epoch": 0.4867411570156249, + "grad_norm": 0.1347060352563858, + "learning_rate": 0.00019097755483047827, + "loss": 0.2341, + "step": 17550 + }, + { + "epoch": 0.4881278839586894, + "grad_norm": 0.1792859435081482, + "learning_rate": 0.00019091530783887448, + "loss": 0.2392, + "step": 17600 + }, + { + "epoch": 0.48951461090175385, + "grad_norm": 0.11206398904323578, + "learning_rate": 0.00019085285707571282, + "loss": 0.236, + "step": 17650 + }, + { + "epoch": 0.4909013378448183, + "grad_norm": 0.16337329149246216, + "learning_rate": 0.0001907902026809663, + "loss": 0.239, + "step": 17700 + }, + { + "epoch": 0.4922880647878828, + "grad_norm": 0.14579764008522034, + "learning_rate": 0.0001907273447950644, + "loss": 0.2258, + "step": 17750 + }, + { + "epoch": 0.49367479173094725, + "grad_norm": 0.1381896585226059, + "learning_rate": 0.00019066428355889257, + "loss": 0.2366, + "step": 17800 + }, + { + "epoch": 0.4950615186740117, + "grad_norm": 0.13557949662208557, + "learning_rate": 0.00019060101911379208, + "loss": 0.236, + "step": 17850 + }, + { + "epoch": 0.4964482456170762, + "grad_norm": 0.13205058872699738, + "learning_rate": 0.00019053755160155974, + "loss": 0.237, + "step": 17900 + }, + { + "epoch": 0.4978349725601406, + "grad_norm": 0.1766868382692337, + "learning_rate": 0.00019047388116444735, + "loss": 0.241, + "step": 17950 + }, + { + "epoch": 0.49922169950320505, + "grad_norm": 0.1567864567041397, + "learning_rate": 0.00019041000794516171, + "loss": 0.2269, + "step": 18000 + }, + { + "epoch": 0.49922169950320505, + "eval_loss": 0.23145872354507446, + "eval_runtime": 500.5681, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 18000 + }, + { + "epoch": 0.5006084264462696, + "grad_norm": 0.13615478575229645, + "learning_rate": 0.00019034593208686396, + "loss": 0.2347, + "step": 18050 + }, + { + "epoch": 0.501995153389334, + "grad_norm": 0.13786327838897705, + "learning_rate": 0.00019028165373316948, + "loss": 0.2335, + "step": 18100 + }, + { + "epoch": 0.5033818803323985, + "grad_norm": 0.14584092795848846, + "learning_rate": 0.0001902171730281476, + "loss": 0.2392, + "step": 18150 + }, + { + "epoch": 0.5047686072754629, + "grad_norm": 0.18500222265720367, + "learning_rate": 0.000190152490116321, + "loss": 0.2336, + "step": 18200 + }, + { + "epoch": 0.5061553342185273, + "grad_norm": 0.14118489623069763, + "learning_rate": 0.0001900876051426658, + "loss": 0.2362, + "step": 18250 + }, + { + "epoch": 0.5075420611615918, + "grad_norm": 0.18030238151550293, + "learning_rate": 0.00019002251825261078, + "loss": 0.2363, + "step": 18300 + }, + { + "epoch": 0.5089287881046562, + "grad_norm": 0.1916930228471756, + "learning_rate": 0.00018995722959203745, + "loss": 0.2342, + "step": 18350 + }, + { + "epoch": 0.5103155150477208, + "grad_norm": 0.1503581702709198, + "learning_rate": 0.00018989173930727951, + "loss": 0.2365, + "step": 18400 + }, + { + "epoch": 0.5117022419907852, + "grad_norm": 0.14816977083683014, + "learning_rate": 0.0001898260475451225, + "loss": 0.2387, + "step": 18450 + }, + { + "epoch": 0.5130889689338497, + "grad_norm": 0.13476118445396423, + "learning_rate": 0.00018976015445280363, + "loss": 0.2343, + "step": 18500 + }, + { + "epoch": 0.5144756958769141, + "grad_norm": 0.17522576451301575, + "learning_rate": 0.00018969406017801127, + "loss": 0.2299, + "step": 18550 + }, + { + "epoch": 0.5158624228199786, + "grad_norm": 0.13437584042549133, + "learning_rate": 0.00018962776486888485, + "loss": 0.2342, + "step": 18600 + }, + { + "epoch": 0.517249149763043, + "grad_norm": 0.14156264066696167, + "learning_rate": 0.0001895612686740142, + "loss": 0.2363, + "step": 18650 + }, + { + "epoch": 0.5186358767061074, + "grad_norm": 0.11037924140691757, + "learning_rate": 0.00018949457174243954, + "loss": 0.2343, + "step": 18700 + }, + { + "epoch": 0.520022603649172, + "grad_norm": 0.1362009048461914, + "learning_rate": 0.00018942767422365094, + "loss": 0.2363, + "step": 18750 + }, + { + "epoch": 0.5214093305922364, + "grad_norm": 0.1261095106601715, + "learning_rate": 0.00018936057626758808, + "loss": 0.2341, + "step": 18800 + }, + { + "epoch": 0.5227960575353009, + "grad_norm": 0.13382628560066223, + "learning_rate": 0.00018929327802463987, + "loss": 0.2309, + "step": 18850 + }, + { + "epoch": 0.5241827844783653, + "grad_norm": 0.15190520882606506, + "learning_rate": 0.00018922577964564417, + "loss": 0.2338, + "step": 18900 + }, + { + "epoch": 0.5255695114214298, + "grad_norm": 0.13708838820457458, + "learning_rate": 0.00018915808128188734, + "loss": 0.2338, + "step": 18950 + }, + { + "epoch": 0.5269562383644942, + "grad_norm": 0.20378737151622772, + "learning_rate": 0.0001890901830851041, + "loss": 0.2341, + "step": 19000 + }, + { + "epoch": 0.5269562383644942, + "eval_loss": 0.23116359114646912, + "eval_runtime": 500.7638, + "eval_samples_per_second": 5.705, + "eval_steps_per_second": 5.705, + "step": 19000 + }, + { + "epoch": 0.5283429653075588, + "grad_norm": 0.17179715633392334, + "learning_rate": 0.00018902208520747685, + "loss": 0.2363, + "step": 19050 + }, + { + "epoch": 0.5297296922506232, + "grad_norm": 0.13991795480251312, + "learning_rate": 0.00018895378780163578, + "loss": 0.2308, + "step": 19100 + }, + { + "epoch": 0.5311164191936876, + "grad_norm": 0.11662200093269348, + "learning_rate": 0.0001888852910206581, + "loss": 0.2354, + "step": 19150 + }, + { + "epoch": 0.5325031461367521, + "grad_norm": 0.1577063351869583, + "learning_rate": 0.00018881659501806804, + "loss": 0.2331, + "step": 19200 + }, + { + "epoch": 0.5338898730798165, + "grad_norm": 0.14893421530723572, + "learning_rate": 0.0001887476999478362, + "loss": 0.2345, + "step": 19250 + }, + { + "epoch": 0.535276600022881, + "grad_norm": 0.14458926022052765, + "learning_rate": 0.00018867860596437946, + "loss": 0.2364, + "step": 19300 + }, + { + "epoch": 0.5366633269659454, + "grad_norm": 0.18197046220302582, + "learning_rate": 0.00018860931322256056, + "loss": 0.2316, + "step": 19350 + }, + { + "epoch": 0.53805005390901, + "grad_norm": 0.12696345150470734, + "learning_rate": 0.0001885398218776876, + "loss": 0.2288, + "step": 19400 + }, + { + "epoch": 0.5394367808520744, + "grad_norm": 0.14459608495235443, + "learning_rate": 0.00018847013208551393, + "loss": 0.2342, + "step": 19450 + }, + { + "epoch": 0.5408235077951389, + "grad_norm": 0.13681089878082275, + "learning_rate": 0.00018840024400223758, + "loss": 0.2341, + "step": 19500 + }, + { + "epoch": 0.5422102347382033, + "grad_norm": 0.1358567178249359, + "learning_rate": 0.00018833015778450113, + "loss": 0.239, + "step": 19550 + }, + { + "epoch": 0.5435969616812677, + "grad_norm": 0.1429983228445053, + "learning_rate": 0.0001882598735893912, + "loss": 0.234, + "step": 19600 + }, + { + "epoch": 0.5449836886243322, + "grad_norm": 0.15259206295013428, + "learning_rate": 0.00018818939157443806, + "loss": 0.2333, + "step": 19650 + }, + { + "epoch": 0.5463704155673966, + "grad_norm": 0.1499055027961731, + "learning_rate": 0.00018811871189761554, + "loss": 0.2335, + "step": 19700 + }, + { + "epoch": 0.5477571425104611, + "grad_norm": 0.15547756850719452, + "learning_rate": 0.0001880478347173403, + "loss": 0.2331, + "step": 19750 + }, + { + "epoch": 0.5491438694535256, + "grad_norm": 0.13615499436855316, + "learning_rate": 0.00018797676019247187, + "loss": 0.2327, + "step": 19800 + }, + { + "epoch": 0.5505305963965901, + "grad_norm": 0.15891136229038239, + "learning_rate": 0.00018790548848231188, + "loss": 0.2293, + "step": 19850 + }, + { + "epoch": 0.5519173233396545, + "grad_norm": 0.1028260812163353, + "learning_rate": 0.0001878340197466041, + "loss": 0.2337, + "step": 19900 + }, + { + "epoch": 0.553304050282719, + "grad_norm": 0.15393692255020142, + "learning_rate": 0.0001877623541455338, + "loss": 0.2332, + "step": 19950 + }, + { + "epoch": 0.5546907772257834, + "grad_norm": 0.11807084083557129, + "learning_rate": 0.0001876904918397275, + "loss": 0.2352, + "step": 20000 + }, + { + "epoch": 0.5546907772257834, + "eval_loss": 0.2310873419046402, + "eval_runtime": 501.0545, + "eval_samples_per_second": 5.702, + "eval_steps_per_second": 5.702, + "step": 20000 + }, + { + "epoch": 0.5560775041688478, + "grad_norm": 0.1603621393442154, + "learning_rate": 0.00018761843299025267, + "loss": 0.2347, + "step": 20050 + }, + { + "epoch": 0.5574642311119123, + "grad_norm": 0.14295394718647003, + "learning_rate": 0.00018754617775861718, + "loss": 0.2335, + "step": 20100 + }, + { + "epoch": 0.5588509580549768, + "grad_norm": 0.1290232539176941, + "learning_rate": 0.0001874737263067692, + "loss": 0.2337, + "step": 20150 + }, + { + "epoch": 0.5602376849980413, + "grad_norm": 0.16112935543060303, + "learning_rate": 0.00018740107879709655, + "loss": 0.2354, + "step": 20200 + }, + { + "epoch": 0.5616244119411057, + "grad_norm": 0.13674217462539673, + "learning_rate": 0.00018732823539242664, + "loss": 0.23, + "step": 20250 + }, + { + "epoch": 0.5630111388841702, + "grad_norm": 0.18549004197120667, + "learning_rate": 0.00018725519625602578, + "loss": 0.2353, + "step": 20300 + }, + { + "epoch": 0.5643978658272346, + "grad_norm": 0.13107050955295563, + "learning_rate": 0.0001871819615515991, + "loss": 0.2392, + "step": 20350 + }, + { + "epoch": 0.5657845927702991, + "grad_norm": 0.13590605556964874, + "learning_rate": 0.00018710853144329002, + "loss": 0.2347, + "step": 20400 + }, + { + "epoch": 0.5671713197133635, + "grad_norm": 0.13591018319129944, + "learning_rate": 0.0001870349060956799, + "loss": 0.229, + "step": 20450 + }, + { + "epoch": 0.568558046656428, + "grad_norm": 0.11401943862438202, + "learning_rate": 0.00018696108567378773, + "loss": 0.2326, + "step": 20500 + }, + { + "epoch": 0.5699447735994925, + "grad_norm": 0.18518146872520447, + "learning_rate": 0.00018688707034306978, + "loss": 0.2351, + "step": 20550 + }, + { + "epoch": 0.5713315005425569, + "grad_norm": 0.1642865538597107, + "learning_rate": 0.00018681286026941905, + "loss": 0.2384, + "step": 20600 + }, + { + "epoch": 0.5727182274856214, + "grad_norm": 0.133639395236969, + "learning_rate": 0.00018673845561916513, + "loss": 0.2324, + "step": 20650 + }, + { + "epoch": 0.5741049544286858, + "grad_norm": 0.120590940117836, + "learning_rate": 0.00018666385655907367, + "loss": 0.2315, + "step": 20700 + }, + { + "epoch": 0.5754916813717503, + "grad_norm": 0.15754735469818115, + "learning_rate": 0.00018658906325634604, + "loss": 0.2388, + "step": 20750 + }, + { + "epoch": 0.5768784083148147, + "grad_norm": 0.15975181758403778, + "learning_rate": 0.00018651407587861905, + "loss": 0.2376, + "step": 20800 + }, + { + "epoch": 0.5782651352578793, + "grad_norm": 0.13276700675487518, + "learning_rate": 0.0001864388945939644, + "loss": 0.2379, + "step": 20850 + }, + { + "epoch": 0.5796518622009437, + "grad_norm": 0.16388626396656036, + "learning_rate": 0.0001863635195708885, + "loss": 0.2332, + "step": 20900 + }, + { + "epoch": 0.5810385891440081, + "grad_norm": 0.18847975134849548, + "learning_rate": 0.0001862879509783319, + "loss": 0.2381, + "step": 20950 + }, + { + "epoch": 0.5824253160870726, + "grad_norm": 0.24493199586868286, + "learning_rate": 0.00018621218898566907, + "loss": 0.2328, + "step": 21000 + }, + { + "epoch": 0.5824253160870726, + "eval_loss": 0.23020677268505096, + "eval_runtime": 499.9502, + "eval_samples_per_second": 5.715, + "eval_steps_per_second": 5.715, + "step": 21000 + }, + { + "epoch": 0.583812043030137, + "grad_norm": 0.16316668689250946, + "learning_rate": 0.00018613623376270794, + "loss": 0.2429, + "step": 21050 + }, + { + "epoch": 0.5851987699732015, + "grad_norm": 0.13449080288410187, + "learning_rate": 0.0001860600854796895, + "loss": 0.2298, + "step": 21100 + }, + { + "epoch": 0.5865854969162659, + "grad_norm": 0.11589767783880234, + "learning_rate": 0.00018598374430728746, + "loss": 0.2344, + "step": 21150 + }, + { + "epoch": 0.5879722238593305, + "grad_norm": 0.11659828573465347, + "learning_rate": 0.0001859072104166079, + "loss": 0.2333, + "step": 21200 + }, + { + "epoch": 0.5893589508023949, + "grad_norm": 0.155133455991745, + "learning_rate": 0.00018583048397918884, + "loss": 0.2362, + "step": 21250 + }, + { + "epoch": 0.5907456777454594, + "grad_norm": 0.16488181054592133, + "learning_rate": 0.00018575356516699977, + "loss": 0.2334, + "step": 21300 + }, + { + "epoch": 0.5921324046885238, + "grad_norm": 0.18307441473007202, + "learning_rate": 0.0001856764541524415, + "loss": 0.2272, + "step": 21350 + }, + { + "epoch": 0.5935191316315882, + "grad_norm": 0.1316101998090744, + "learning_rate": 0.00018559915110834553, + "loss": 0.2342, + "step": 21400 + }, + { + "epoch": 0.5949058585746527, + "grad_norm": 0.1548035889863968, + "learning_rate": 0.00018552165620797382, + "loss": 0.2323, + "step": 21450 + }, + { + "epoch": 0.5962925855177171, + "grad_norm": 0.13214810192584991, + "learning_rate": 0.00018544396962501828, + "loss": 0.2319, + "step": 21500 + }, + { + "epoch": 0.5976793124607817, + "grad_norm": 0.14733006060123444, + "learning_rate": 0.00018536609153360046, + "loss": 0.237, + "step": 21550 + }, + { + "epoch": 0.5990660394038461, + "grad_norm": 0.14465801417827606, + "learning_rate": 0.0001852880221082712, + "loss": 0.2318, + "step": 21600 + }, + { + "epoch": 0.6004527663469106, + "grad_norm": 0.14646270871162415, + "learning_rate": 0.00018520976152401012, + "loss": 0.2368, + "step": 21650 + }, + { + "epoch": 0.601839493289975, + "grad_norm": 0.14174975454807281, + "learning_rate": 0.00018513130995622535, + "loss": 0.2349, + "step": 21700 + }, + { + "epoch": 0.6032262202330395, + "grad_norm": 0.12805262207984924, + "learning_rate": 0.00018505266758075302, + "loss": 0.2315, + "step": 21750 + }, + { + "epoch": 0.6046129471761039, + "grad_norm": 0.1598140299320221, + "learning_rate": 0.00018497383457385697, + "loss": 0.2332, + "step": 21800 + }, + { + "epoch": 0.6059996741191683, + "grad_norm": 0.13651584088802338, + "learning_rate": 0.00018489481111222828, + "loss": 0.2348, + "step": 21850 + }, + { + "epoch": 0.6073864010622329, + "grad_norm": 0.13091818988323212, + "learning_rate": 0.0001848155973729849, + "loss": 0.2287, + "step": 21900 + }, + { + "epoch": 0.6087731280052973, + "grad_norm": 0.17191646993160248, + "learning_rate": 0.00018473619353367128, + "loss": 0.2342, + "step": 21950 + }, + { + "epoch": 0.6101598549483618, + "grad_norm": 0.10674546658992767, + "learning_rate": 0.0001846565997722579, + "loss": 0.2309, + "step": 22000 + }, + { + "epoch": 0.6101598549483618, + "eval_loss": 0.22999995946884155, + "eval_runtime": 499.7816, + "eval_samples_per_second": 5.716, + "eval_steps_per_second": 5.716, + "step": 22000 + }, + { + "epoch": 0.6115465818914262, + "grad_norm": 0.1321185827255249, + "learning_rate": 0.000184576816267141, + "loss": 0.2347, + "step": 22050 + }, + { + "epoch": 0.6129333088344907, + "grad_norm": 0.12945061922073364, + "learning_rate": 0.00018449684319714202, + "loss": 0.2298, + "step": 22100 + }, + { + "epoch": 0.6143200357775551, + "grad_norm": 0.16403023898601532, + "learning_rate": 0.00018441668074150732, + "loss": 0.2276, + "step": 22150 + }, + { + "epoch": 0.6157067627206196, + "grad_norm": 0.14253240823745728, + "learning_rate": 0.00018433632907990775, + "loss": 0.2315, + "step": 22200 + }, + { + "epoch": 0.617093489663684, + "grad_norm": 0.1752641350030899, + "learning_rate": 0.00018425578839243814, + "loss": 0.2327, + "step": 22250 + }, + { + "epoch": 0.6184802166067485, + "grad_norm": 0.11023511737585068, + "learning_rate": 0.00018417505885961712, + "loss": 0.2341, + "step": 22300 + }, + { + "epoch": 0.619866943549813, + "grad_norm": 0.1494046449661255, + "learning_rate": 0.00018409414066238654, + "loss": 0.2307, + "step": 22350 + }, + { + "epoch": 0.6212536704928774, + "grad_norm": 0.13288947939872742, + "learning_rate": 0.00018401303398211103, + "loss": 0.2307, + "step": 22400 + }, + { + "epoch": 0.6226403974359419, + "grad_norm": 0.13972090184688568, + "learning_rate": 0.0001839317390005778, + "loss": 0.231, + "step": 22450 + }, + { + "epoch": 0.6240271243790063, + "grad_norm": 0.16141022741794586, + "learning_rate": 0.000183850255899996, + "loss": 0.2395, + "step": 22500 + }, + { + "epoch": 0.6254138513220708, + "grad_norm": 0.17160941660404205, + "learning_rate": 0.00018376858486299647, + "loss": 0.2371, + "step": 22550 + }, + { + "epoch": 0.6268005782651352, + "grad_norm": 0.13852784037590027, + "learning_rate": 0.00018368672607263132, + "loss": 0.2286, + "step": 22600 + }, + { + "epoch": 0.6281873052081998, + "grad_norm": 0.16050252318382263, + "learning_rate": 0.00018360467971237338, + "loss": 0.2345, + "step": 22650 + }, + { + "epoch": 0.6295740321512642, + "grad_norm": 0.12499688565731049, + "learning_rate": 0.0001835224459661159, + "loss": 0.232, + "step": 22700 + }, + { + "epoch": 0.6309607590943286, + "grad_norm": 0.16804257035255432, + "learning_rate": 0.00018344002501817226, + "loss": 0.2336, + "step": 22750 + }, + { + "epoch": 0.6323474860373931, + "grad_norm": 0.15330076217651367, + "learning_rate": 0.00018335741705327526, + "loss": 0.2314, + "step": 22800 + }, + { + "epoch": 0.6337342129804575, + "grad_norm": 0.12613581120967865, + "learning_rate": 0.00018327462225657692, + "loss": 0.235, + "step": 22850 + }, + { + "epoch": 0.635120939923522, + "grad_norm": 0.16671714186668396, + "learning_rate": 0.00018319164081364802, + "loss": 0.2319, + "step": 22900 + }, + { + "epoch": 0.6365076668665864, + "grad_norm": 0.11536330729722977, + "learning_rate": 0.00018310847291047776, + "loss": 0.2296, + "step": 22950 + }, + { + "epoch": 0.637894393809651, + "grad_norm": 0.1565777063369751, + "learning_rate": 0.00018302511873347305, + "loss": 0.23, + "step": 23000 + }, + { + "epoch": 0.637894393809651, + "eval_loss": 0.22944478690624237, + "eval_runtime": 500.3715, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 23000 + }, + { + "epoch": 0.6392811207527154, + "grad_norm": 0.18740278482437134, + "learning_rate": 0.00018294157846945853, + "loss": 0.2315, + "step": 23050 + }, + { + "epoch": 0.6406678476957799, + "grad_norm": 0.14261969923973083, + "learning_rate": 0.00018285785230567577, + "loss": 0.2291, + "step": 23100 + }, + { + "epoch": 0.6420545746388443, + "grad_norm": 0.16137824952602386, + "learning_rate": 0.00018277394042978307, + "loss": 0.2325, + "step": 23150 + }, + { + "epoch": 0.6434413015819087, + "grad_norm": 0.1337035894393921, + "learning_rate": 0.00018268984302985495, + "loss": 0.2322, + "step": 23200 + }, + { + "epoch": 0.6448280285249732, + "grad_norm": 0.11618442833423615, + "learning_rate": 0.0001826055602943818, + "loss": 0.2349, + "step": 23250 + }, + { + "epoch": 0.6462147554680376, + "grad_norm": 0.12656192481517792, + "learning_rate": 0.0001825210924122693, + "loss": 0.234, + "step": 23300 + }, + { + "epoch": 0.6476014824111022, + "grad_norm": 0.11272765696048737, + "learning_rate": 0.0001824364395728382, + "loss": 0.2313, + "step": 23350 + }, + { + "epoch": 0.6489882093541666, + "grad_norm": 0.13132552802562714, + "learning_rate": 0.00018235160196582384, + "loss": 0.2289, + "step": 23400 + }, + { + "epoch": 0.6503749362972311, + "grad_norm": 0.11405663937330246, + "learning_rate": 0.00018226657978137554, + "loss": 0.2356, + "step": 23450 + }, + { + "epoch": 0.6517616632402955, + "grad_norm": 0.15040431916713715, + "learning_rate": 0.00018218137321005643, + "loss": 0.2303, + "step": 23500 + }, + { + "epoch": 0.65314839018336, + "grad_norm": 0.13074640929698944, + "learning_rate": 0.00018209598244284288, + "loss": 0.2319, + "step": 23550 + }, + { + "epoch": 0.6545351171264244, + "grad_norm": 0.14512640237808228, + "learning_rate": 0.00018201040767112413, + "loss": 0.2393, + "step": 23600 + }, + { + "epoch": 0.6559218440694888, + "grad_norm": 0.10800650715827942, + "learning_rate": 0.00018192464908670176, + "loss": 0.2318, + "step": 23650 + }, + { + "epoch": 0.6573085710125534, + "grad_norm": 0.12321613729000092, + "learning_rate": 0.00018183870688178946, + "loss": 0.2331, + "step": 23700 + }, + { + "epoch": 0.6586952979556178, + "grad_norm": 0.1868344396352768, + "learning_rate": 0.00018175258124901236, + "loss": 0.2317, + "step": 23750 + }, + { + "epoch": 0.6600820248986823, + "grad_norm": 0.11993540078401566, + "learning_rate": 0.00018166627238140674, + "loss": 0.2309, + "step": 23800 + }, + { + "epoch": 0.6614687518417467, + "grad_norm": 0.11594246327877045, + "learning_rate": 0.00018157978047241962, + "loss": 0.2322, + "step": 23850 + }, + { + "epoch": 0.6628554787848112, + "grad_norm": 0.18056848645210266, + "learning_rate": 0.00018149310571590824, + "loss": 0.2335, + "step": 23900 + }, + { + "epoch": 0.6642422057278756, + "grad_norm": 0.14387637376785278, + "learning_rate": 0.00018140624830613965, + "loss": 0.2366, + "step": 23950 + }, + { + "epoch": 0.6656289326709401, + "grad_norm": 0.16983430087566376, + "learning_rate": 0.00018131920843779035, + "loss": 0.2361, + "step": 24000 + }, + { + "epoch": 0.6656289326709401, + "eval_loss": 0.22958332300186157, + "eval_runtime": 500.0504, + "eval_samples_per_second": 5.713, + "eval_steps_per_second": 5.713, + "step": 24000 + }, + { + "epoch": 0.6670156596140046, + "grad_norm": 0.13279864192008972, + "learning_rate": 0.0001812319863059457, + "loss": 0.2359, + "step": 24050 + }, + { + "epoch": 0.668402386557069, + "grad_norm": 0.11594101786613464, + "learning_rate": 0.00018114458210609962, + "loss": 0.2358, + "step": 24100 + }, + { + "epoch": 0.6697891135001335, + "grad_norm": 0.13613513112068176, + "learning_rate": 0.0001810569960341541, + "loss": 0.2278, + "step": 24150 + }, + { + "epoch": 0.6711758404431979, + "grad_norm": 0.12295212596654892, + "learning_rate": 0.00018096922828641878, + "loss": 0.2315, + "step": 24200 + }, + { + "epoch": 0.6725625673862624, + "grad_norm": 0.17889654636383057, + "learning_rate": 0.00018088127905961047, + "loss": 0.2305, + "step": 24250 + }, + { + "epoch": 0.6739492943293268, + "grad_norm": 0.16525234282016754, + "learning_rate": 0.0001807931485508528, + "loss": 0.2304, + "step": 24300 + }, + { + "epoch": 0.6753360212723913, + "grad_norm": 0.11446121335029602, + "learning_rate": 0.0001807048369576756, + "loss": 0.2333, + "step": 24350 + }, + { + "epoch": 0.6767227482154557, + "grad_norm": 0.14533396065235138, + "learning_rate": 0.00018061634447801467, + "loss": 0.2354, + "step": 24400 + }, + { + "epoch": 0.6781094751585203, + "grad_norm": 0.14825408160686493, + "learning_rate": 0.0001805276713102112, + "loss": 0.2316, + "step": 24450 + }, + { + "epoch": 0.6794962021015847, + "grad_norm": 0.148117333650589, + "learning_rate": 0.00018043881765301135, + "loss": 0.2338, + "step": 24500 + }, + { + "epoch": 0.6808829290446491, + "grad_norm": 0.10264230519533157, + "learning_rate": 0.00018034978370556583, + "loss": 0.2298, + "step": 24550 + }, + { + "epoch": 0.6822696559877136, + "grad_norm": 0.12200962007045746, + "learning_rate": 0.00018026056966742945, + "loss": 0.2284, + "step": 24600 + }, + { + "epoch": 0.683656382930778, + "grad_norm": 0.14096751809120178, + "learning_rate": 0.00018017117573856063, + "loss": 0.2333, + "step": 24650 + }, + { + "epoch": 0.6850431098738425, + "grad_norm": 0.16554249823093414, + "learning_rate": 0.00018008160211932108, + "loss": 0.2316, + "step": 24700 + }, + { + "epoch": 0.686429836816907, + "grad_norm": 0.11679153889417648, + "learning_rate": 0.0001799918490104751, + "loss": 0.2287, + "step": 24750 + }, + { + "epoch": 0.6878165637599715, + "grad_norm": 0.1387365758419037, + "learning_rate": 0.00017990191661318943, + "loss": 0.2356, + "step": 24800 + }, + { + "epoch": 0.6892032907030359, + "grad_norm": 0.1255553960800171, + "learning_rate": 0.00017981180512903255, + "loss": 0.2342, + "step": 24850 + }, + { + "epoch": 0.6905900176461004, + "grad_norm": 0.17247521877288818, + "learning_rate": 0.00017972151475997443, + "loss": 0.2303, + "step": 24900 + }, + { + "epoch": 0.6919767445891648, + "grad_norm": 0.20023292303085327, + "learning_rate": 0.0001796310457083859, + "loss": 0.2346, + "step": 24950 + }, + { + "epoch": 0.6933634715322292, + "grad_norm": 0.11909276992082596, + "learning_rate": 0.0001795403981770383, + "loss": 0.2264, + "step": 25000 + }, + { + "epoch": 0.6933634715322292, + "eval_loss": 0.2287738025188446, + "eval_runtime": 500.5021, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 25000 + }, + { + "epoch": 0.6947501984752937, + "grad_norm": 0.13509905338287354, + "learning_rate": 0.00017944957236910308, + "loss": 0.2318, + "step": 25050 + }, + { + "epoch": 0.6961369254183581, + "grad_norm": 0.15455523133277893, + "learning_rate": 0.0001793585684881511, + "loss": 0.2325, + "step": 25100 + }, + { + "epoch": 0.6975236523614227, + "grad_norm": 0.1231105625629425, + "learning_rate": 0.00017926738673815248, + "loss": 0.2303, + "step": 25150 + }, + { + "epoch": 0.6989103793044871, + "grad_norm": 0.19073975086212158, + "learning_rate": 0.00017917602732347597, + "loss": 0.2309, + "step": 25200 + }, + { + "epoch": 0.7002971062475516, + "grad_norm": 0.16656789183616638, + "learning_rate": 0.00017908449044888854, + "loss": 0.2334, + "step": 25250 + }, + { + "epoch": 0.701683833190616, + "grad_norm": 0.12732850015163422, + "learning_rate": 0.00017899277631955486, + "loss": 0.2348, + "step": 25300 + }, + { + "epoch": 0.7030705601336805, + "grad_norm": 0.20655155181884766, + "learning_rate": 0.00017890088514103692, + "loss": 0.2355, + "step": 25350 + }, + { + "epoch": 0.7044572870767449, + "grad_norm": 0.10959596931934357, + "learning_rate": 0.00017880881711929353, + "loss": 0.2304, + "step": 25400 + }, + { + "epoch": 0.7058440140198093, + "grad_norm": 0.15412519872188568, + "learning_rate": 0.00017871657246067987, + "loss": 0.2336, + "step": 25450 + }, + { + "epoch": 0.7072307409628739, + "grad_norm": 0.16455277800559998, + "learning_rate": 0.00017862415137194702, + "loss": 0.2319, + "step": 25500 + }, + { + "epoch": 0.7086174679059383, + "grad_norm": 0.1389029622077942, + "learning_rate": 0.00017853340773211896, + "loss": 0.2294, + "step": 25550 + }, + { + "epoch": 0.7100041948490028, + "grad_norm": 0.14564301073551178, + "learning_rate": 0.0001784424950430794, + "loss": 0.2326, + "step": 25600 + }, + { + "epoch": 0.7113909217920672, + "grad_norm": 0.1606937199831009, + "learning_rate": 0.00017834955293674994, + "loss": 0.23, + "step": 25650 + }, + { + "epoch": 0.7127776487351317, + "grad_norm": 0.13401974737644196, + "learning_rate": 0.00017825643522291457, + "loss": 0.2361, + "step": 25700 + }, + { + "epoch": 0.7141643756781961, + "grad_norm": 0.12457278370857239, + "learning_rate": 0.0001781631421102812, + "loss": 0.232, + "step": 25750 + }, + { + "epoch": 0.7155511026212606, + "grad_norm": 0.13395826518535614, + "learning_rate": 0.0001780696738079508, + "loss": 0.2294, + "step": 25800 + }, + { + "epoch": 0.7169378295643251, + "grad_norm": 0.13083291053771973, + "learning_rate": 0.00017797603052541704, + "loss": 0.2328, + "step": 25850 + }, + { + "epoch": 0.7183245565073895, + "grad_norm": 0.14696165919303894, + "learning_rate": 0.00017788221247256583, + "loss": 0.233, + "step": 25900 + }, + { + "epoch": 0.719711283450454, + "grad_norm": 0.1512746810913086, + "learning_rate": 0.00017778821985967467, + "loss": 0.2319, + "step": 25950 + }, + { + "epoch": 0.7210980103935184, + "grad_norm": 0.1260426789522171, + "learning_rate": 0.00017769405289741247, + "loss": 0.2341, + "step": 26000 + }, + { + "epoch": 0.7210980103935184, + "eval_loss": 0.22873948514461517, + "eval_runtime": 500.274, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 26000 + }, + { + "epoch": 0.7224847373365829, + "grad_norm": 0.1653342843055725, + "learning_rate": 0.00017759971179683875, + "loss": 0.2316, + "step": 26050 + }, + { + "epoch": 0.7238714642796473, + "grad_norm": 0.13507039844989777, + "learning_rate": 0.00017750519676940348, + "loss": 0.2357, + "step": 26100 + }, + { + "epoch": 0.7252581912227118, + "grad_norm": 0.128819540143013, + "learning_rate": 0.00017741050802694635, + "loss": 0.231, + "step": 26150 + }, + { + "epoch": 0.7266449181657763, + "grad_norm": 0.13130728900432587, + "learning_rate": 0.00017731564578169647, + "loss": 0.2305, + "step": 26200 + }, + { + "epoch": 0.7280316451088408, + "grad_norm": 0.12267379462718964, + "learning_rate": 0.0001772206102462718, + "loss": 0.2345, + "step": 26250 + }, + { + "epoch": 0.7294183720519052, + "grad_norm": 0.14595343172550201, + "learning_rate": 0.0001771254016336787, + "loss": 0.2294, + "step": 26300 + }, + { + "epoch": 0.7308050989949696, + "grad_norm": 0.13935647904872894, + "learning_rate": 0.0001770300201573114, + "loss": 0.2358, + "step": 26350 + }, + { + "epoch": 0.7321918259380341, + "grad_norm": 0.11328408867120743, + "learning_rate": 0.00017693446603095174, + "loss": 0.2339, + "step": 26400 + }, + { + "epoch": 0.7335785528810985, + "grad_norm": 0.19857367873191833, + "learning_rate": 0.00017683873946876835, + "loss": 0.2269, + "step": 26450 + }, + { + "epoch": 0.734965279824163, + "grad_norm": 0.16225670278072357, + "learning_rate": 0.00017674284068531641, + "loss": 0.2307, + "step": 26500 + }, + { + "epoch": 0.7363520067672275, + "grad_norm": 0.1412588506937027, + "learning_rate": 0.00017664676989553714, + "loss": 0.229, + "step": 26550 + }, + { + "epoch": 0.737738733710292, + "grad_norm": 0.14530161023139954, + "learning_rate": 0.00017655052731475724, + "loss": 0.2308, + "step": 26600 + }, + { + "epoch": 0.7391254606533564, + "grad_norm": 0.12190265953540802, + "learning_rate": 0.0001764541131586885, + "loss": 0.2294, + "step": 26650 + }, + { + "epoch": 0.7405121875964209, + "grad_norm": 0.13169080018997192, + "learning_rate": 0.00017635752764342717, + "loss": 0.2275, + "step": 26700 + }, + { + "epoch": 0.7418989145394853, + "grad_norm": 0.12346599251031876, + "learning_rate": 0.00017626077098545367, + "loss": 0.2326, + "step": 26750 + }, + { + "epoch": 0.7432856414825497, + "grad_norm": 0.12645727396011353, + "learning_rate": 0.00017616384340163197, + "loss": 0.2369, + "step": 26800 + }, + { + "epoch": 0.7446723684256142, + "grad_norm": 0.12523086369037628, + "learning_rate": 0.00017606674510920915, + "loss": 0.2291, + "step": 26850 + }, + { + "epoch": 0.7460590953686786, + "grad_norm": 0.14181695878505707, + "learning_rate": 0.0001759694763258149, + "loss": 0.2266, + "step": 26900 + }, + { + "epoch": 0.7474458223117432, + "grad_norm": 0.13824765384197235, + "learning_rate": 0.00017587203726946102, + "loss": 0.2281, + "step": 26950 + }, + { + "epoch": 0.7488325492548076, + "grad_norm": 0.1162494495511055, + "learning_rate": 0.000175774428158541, + "loss": 0.2326, + "step": 27000 + }, + { + "epoch": 0.7488325492548076, + "eval_loss": 0.22845527529716492, + "eval_runtime": 500.3687, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 27000 + }, + { + "epoch": 0.7502192761978721, + "grad_norm": 0.1494184285402298, + "learning_rate": 0.0001756766492118294, + "loss": 0.2335, + "step": 27050 + }, + { + "epoch": 0.7516060031409365, + "grad_norm": 0.14270345866680145, + "learning_rate": 0.00017557870064848153, + "loss": 0.2378, + "step": 27100 + }, + { + "epoch": 0.752992730084001, + "grad_norm": 0.17542113363742828, + "learning_rate": 0.0001754805826880328, + "loss": 0.2344, + "step": 27150 + }, + { + "epoch": 0.7543794570270654, + "grad_norm": 0.14542442560195923, + "learning_rate": 0.0001753822955503983, + "loss": 0.2413, + "step": 27200 + }, + { + "epoch": 0.75576618397013, + "grad_norm": 0.13541916012763977, + "learning_rate": 0.00017528383945587236, + "loss": 0.2331, + "step": 27250 + }, + { + "epoch": 0.7571529109131944, + "grad_norm": 0.1555178165435791, + "learning_rate": 0.00017518521462512796, + "loss": 0.2314, + "step": 27300 + }, + { + "epoch": 0.7585396378562588, + "grad_norm": 0.10956469923257828, + "learning_rate": 0.0001750864212792162, + "loss": 0.2312, + "step": 27350 + }, + { + "epoch": 0.7599263647993233, + "grad_norm": 0.15572619438171387, + "learning_rate": 0.00017498745963956603, + "loss": 0.2334, + "step": 27400 + }, + { + "epoch": 0.7613130917423877, + "grad_norm": 0.1467774659395218, + "learning_rate": 0.0001748883299279835, + "loss": 0.231, + "step": 27450 + }, + { + "epoch": 0.7626998186854522, + "grad_norm": 0.12245896458625793, + "learning_rate": 0.00017478903236665136, + "loss": 0.2374, + "step": 27500 + }, + { + "epoch": 0.7640865456285166, + "grad_norm": 0.10392642766237259, + "learning_rate": 0.00017468956717812864, + "loss": 0.2313, + "step": 27550 + }, + { + "epoch": 0.7654732725715812, + "grad_norm": 0.1239921823143959, + "learning_rate": 0.00017458993458534998, + "loss": 0.2349, + "step": 27600 + }, + { + "epoch": 0.7668599995146456, + "grad_norm": 0.13776883482933044, + "learning_rate": 0.00017449013481162534, + "loss": 0.2362, + "step": 27650 + }, + { + "epoch": 0.7682467264577101, + "grad_norm": 0.1389874666929245, + "learning_rate": 0.00017439016808063932, + "loss": 0.2304, + "step": 27700 + }, + { + "epoch": 0.7696334534007745, + "grad_norm": 0.11973544955253601, + "learning_rate": 0.00017429003461645072, + "loss": 0.2352, + "step": 27750 + }, + { + "epoch": 0.7710201803438389, + "grad_norm": 0.13108691573143005, + "learning_rate": 0.00017418973464349209, + "loss": 0.2311, + "step": 27800 + }, + { + "epoch": 0.7724069072869034, + "grad_norm": 0.12594327330589294, + "learning_rate": 0.00017408926838656912, + "loss": 0.2332, + "step": 27850 + }, + { + "epoch": 0.7737936342299678, + "grad_norm": 0.14845065772533417, + "learning_rate": 0.00017398863607086024, + "loss": 0.2307, + "step": 27900 + }, + { + "epoch": 0.7751803611730324, + "grad_norm": 0.11298257112503052, + "learning_rate": 0.0001738878379219161, + "loss": 0.2331, + "step": 27950 + }, + { + "epoch": 0.7765670881160968, + "grad_norm": 0.11864858120679855, + "learning_rate": 0.000173786874165659, + "loss": 0.231, + "step": 28000 + }, + { + "epoch": 0.7765670881160968, + "eval_loss": 0.22779151797294617, + "eval_runtime": 501.235, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 28000 + }, + { + "epoch": 0.7779538150591613, + "grad_norm": 0.11632022261619568, + "learning_rate": 0.00017368574502838239, + "loss": 0.229, + "step": 28050 + }, + { + "epoch": 0.7793405420022257, + "grad_norm": 0.1431494504213333, + "learning_rate": 0.00017358445073675042, + "loss": 0.2318, + "step": 28100 + }, + { + "epoch": 0.7807272689452902, + "grad_norm": 0.12157493084669113, + "learning_rate": 0.00017348299151779748, + "loss": 0.2343, + "step": 28150 + }, + { + "epoch": 0.7821139958883546, + "grad_norm": 0.11989067494869232, + "learning_rate": 0.00017338136759892752, + "loss": 0.2347, + "step": 28200 + }, + { + "epoch": 0.783500722831419, + "grad_norm": 0.12739787995815277, + "learning_rate": 0.00017327957920791365, + "loss": 0.2328, + "step": 28250 + }, + { + "epoch": 0.7848874497744835, + "grad_norm": 0.15567833185195923, + "learning_rate": 0.00017317762657289768, + "loss": 0.2297, + "step": 28300 + }, + { + "epoch": 0.786274176717548, + "grad_norm": 0.12073542922735214, + "learning_rate": 0.00017307550992238943, + "loss": 0.2296, + "step": 28350 + }, + { + "epoch": 0.7876609036606125, + "grad_norm": 0.1477758288383484, + "learning_rate": 0.0001729732294852665, + "loss": 0.2328, + "step": 28400 + }, + { + "epoch": 0.7890476306036769, + "grad_norm": 0.1612139195203781, + "learning_rate": 0.00017287078549077343, + "loss": 0.2314, + "step": 28450 + }, + { + "epoch": 0.7904343575467414, + "grad_norm": 0.15718688070774078, + "learning_rate": 0.00017276817816852145, + "loss": 0.2289, + "step": 28500 + }, + { + "epoch": 0.7918210844898058, + "grad_norm": 0.1242058202624321, + "learning_rate": 0.0001726654077484878, + "loss": 0.2301, + "step": 28550 + }, + { + "epoch": 0.7932078114328703, + "grad_norm": 0.13269132375717163, + "learning_rate": 0.0001725624744610153, + "loss": 0.2303, + "step": 28600 + }, + { + "epoch": 0.7945945383759347, + "grad_norm": 0.12394677847623825, + "learning_rate": 0.0001724593785368118, + "loss": 0.2362, + "step": 28650 + }, + { + "epoch": 0.7959812653189992, + "grad_norm": 0.1323787420988083, + "learning_rate": 0.00017235612020694978, + "loss": 0.2281, + "step": 28700 + }, + { + "epoch": 0.7973679922620637, + "grad_norm": 0.1532479077577591, + "learning_rate": 0.00017225269970286552, + "loss": 0.2321, + "step": 28750 + }, + { + "epoch": 0.7987547192051281, + "grad_norm": 0.14882826805114746, + "learning_rate": 0.00017214911725635897, + "loss": 0.2316, + "step": 28800 + }, + { + "epoch": 0.8001414461481926, + "grad_norm": 0.11855613440275192, + "learning_rate": 0.00017204537309959292, + "loss": 0.2271, + "step": 28850 + }, + { + "epoch": 0.801528173091257, + "grad_norm": 0.15302914381027222, + "learning_rate": 0.00017194146746509268, + "loss": 0.2296, + "step": 28900 + }, + { + "epoch": 0.8029149000343215, + "grad_norm": 0.11822402477264404, + "learning_rate": 0.00017183740058574547, + "loss": 0.2301, + "step": 28950 + }, + { + "epoch": 0.8043016269773859, + "grad_norm": 0.1369016021490097, + "learning_rate": 0.00017173317269479992, + "loss": 0.2291, + "step": 29000 + }, + { + "epoch": 0.8043016269773859, + "eval_loss": 0.2273886650800705, + "eval_runtime": 501.6607, + "eval_samples_per_second": 5.695, + "eval_steps_per_second": 5.695, + "step": 29000 + }, + { + "epoch": 0.8056883539204505, + "grad_norm": 0.12872962653636932, + "learning_rate": 0.00017162878402586553, + "loss": 0.2344, + "step": 29050 + }, + { + "epoch": 0.8070750808635149, + "grad_norm": 0.13491351902484894, + "learning_rate": 0.00017152423481291216, + "loss": 0.2357, + "step": 29100 + }, + { + "epoch": 0.8084618078065793, + "grad_norm": 0.12680833041667938, + "learning_rate": 0.00017141952529026945, + "loss": 0.2333, + "step": 29150 + }, + { + "epoch": 0.8098485347496438, + "grad_norm": 0.12384926527738571, + "learning_rate": 0.0001713146556926265, + "loss": 0.2421, + "step": 29200 + }, + { + "epoch": 0.8112352616927082, + "grad_norm": 0.13864979147911072, + "learning_rate": 0.00017120962625503098, + "loss": 0.2262, + "step": 29250 + }, + { + "epoch": 0.8126219886357727, + "grad_norm": 0.12703485786914825, + "learning_rate": 0.00017110443721288901, + "loss": 0.2295, + "step": 29300 + }, + { + "epoch": 0.8140087155788371, + "grad_norm": 0.12121795862913132, + "learning_rate": 0.0001709990888019643, + "loss": 0.2286, + "step": 29350 + }, + { + "epoch": 0.8153954425219017, + "grad_norm": 0.11982162296772003, + "learning_rate": 0.00017089358125837783, + "loss": 0.2286, + "step": 29400 + }, + { + "epoch": 0.8167821694649661, + "grad_norm": 0.1372060328722, + "learning_rate": 0.00017078791481860725, + "loss": 0.2244, + "step": 29450 + }, + { + "epoch": 0.8181688964080306, + "grad_norm": 0.12731321156024933, + "learning_rate": 0.0001706820897194863, + "loss": 0.2259, + "step": 29500 + }, + { + "epoch": 0.819555623351095, + "grad_norm": 0.14031195640563965, + "learning_rate": 0.00017057610619820437, + "loss": 0.2297, + "step": 29550 + }, + { + "epoch": 0.8209423502941594, + "grad_norm": 0.13404880464076996, + "learning_rate": 0.0001704699644923059, + "loss": 0.2293, + "step": 29600 + }, + { + "epoch": 0.8223290772372239, + "grad_norm": 0.12400925159454346, + "learning_rate": 0.00017036366483968987, + "loss": 0.2263, + "step": 29650 + }, + { + "epoch": 0.8237158041802883, + "grad_norm": 0.14439739286899567, + "learning_rate": 0.00017025720747860937, + "loss": 0.2272, + "step": 29700 + }, + { + "epoch": 0.8251025311233529, + "grad_norm": 0.12196583300828934, + "learning_rate": 0.00017015059264767084, + "loss": 0.2337, + "step": 29750 + }, + { + "epoch": 0.8264892580664173, + "grad_norm": 0.13919509947299957, + "learning_rate": 0.00017004382058583367, + "loss": 0.2337, + "step": 29800 + }, + { + "epoch": 0.8278759850094818, + "grad_norm": 0.11371088027954102, + "learning_rate": 0.00016993689153240978, + "loss": 0.2252, + "step": 29850 + }, + { + "epoch": 0.8292627119525462, + "grad_norm": 0.1316608041524887, + "learning_rate": 0.00016982980572706282, + "loss": 0.2281, + "step": 29900 + }, + { + "epoch": 0.8306494388956107, + "grad_norm": 0.18003039062023163, + "learning_rate": 0.00016972256340980785, + "loss": 0.2296, + "step": 29950 + }, + { + "epoch": 0.8320361658386751, + "grad_norm": 0.16534283757209778, + "learning_rate": 0.0001696151648210107, + "loss": 0.2267, + "step": 30000 + }, + { + "epoch": 0.8320361658386751, + "eval_loss": 0.22761212289333344, + "eval_runtime": 501.069, + "eval_samples_per_second": 5.702, + "eval_steps_per_second": 5.702, + "step": 30000 + }, + { + "epoch": 0.8334228927817395, + "grad_norm": 0.11093872785568237, + "learning_rate": 0.00016950761020138747, + "loss": 0.234, + "step": 30050 + }, + { + "epoch": 0.834809619724804, + "grad_norm": 0.14647316932678223, + "learning_rate": 0.00016939989979200394, + "loss": 0.232, + "step": 30100 + }, + { + "epoch": 0.8361963466678685, + "grad_norm": 0.14312680065631866, + "learning_rate": 0.00016929203383427515, + "loss": 0.2299, + "step": 30150 + }, + { + "epoch": 0.837583073610933, + "grad_norm": 0.11662258952856064, + "learning_rate": 0.00016918401256996467, + "loss": 0.2298, + "step": 30200 + }, + { + "epoch": 0.8389698005539974, + "grad_norm": 0.11783650517463684, + "learning_rate": 0.0001690758362411843, + "loss": 0.2345, + "step": 30250 + }, + { + "epoch": 0.8403565274970619, + "grad_norm": 0.12562035024166107, + "learning_rate": 0.0001689675050903932, + "loss": 0.2341, + "step": 30300 + }, + { + "epoch": 0.8417432544401263, + "grad_norm": 0.1082848459482193, + "learning_rate": 0.00016885901936039774, + "loss": 0.2298, + "step": 30350 + }, + { + "epoch": 0.8431299813831908, + "grad_norm": 0.14080305397510529, + "learning_rate": 0.0001687503792943506, + "loss": 0.2364, + "step": 30400 + }, + { + "epoch": 0.8445167083262552, + "grad_norm": 0.133138969540596, + "learning_rate": 0.00016864158513575048, + "loss": 0.2293, + "step": 30450 + }, + { + "epoch": 0.8459034352693197, + "grad_norm": 0.13258026540279388, + "learning_rate": 0.00016853263712844136, + "loss": 0.2269, + "step": 30500 + }, + { + "epoch": 0.8472901622123842, + "grad_norm": 0.12311206012964249, + "learning_rate": 0.00016842353551661216, + "loss": 0.2297, + "step": 30550 + }, + { + "epoch": 0.8486768891554486, + "grad_norm": 0.12220294028520584, + "learning_rate": 0.00016831428054479597, + "loss": 0.2301, + "step": 30600 + }, + { + "epoch": 0.8500636160985131, + "grad_norm": 0.112845279276371, + "learning_rate": 0.00016820487245786968, + "loss": 0.2295, + "step": 30650 + }, + { + "epoch": 0.8514503430415775, + "grad_norm": 0.17439040541648865, + "learning_rate": 0.0001680953115010533, + "loss": 0.2299, + "step": 30700 + }, + { + "epoch": 0.852837069984642, + "grad_norm": 0.14124707877635956, + "learning_rate": 0.0001679855979199096, + "loss": 0.228, + "step": 30750 + }, + { + "epoch": 0.8542237969277064, + "grad_norm": 0.12298920005559921, + "learning_rate": 0.00016787573196034328, + "loss": 0.2293, + "step": 30800 + }, + { + "epoch": 0.855610523870771, + "grad_norm": 0.15425720810890198, + "learning_rate": 0.0001677657138686006, + "loss": 0.2263, + "step": 30850 + }, + { + "epoch": 0.8569972508138354, + "grad_norm": 0.13903729617595673, + "learning_rate": 0.0001676555438912689, + "loss": 0.2315, + "step": 30900 + }, + { + "epoch": 0.8583839777568998, + "grad_norm": 0.1249585896730423, + "learning_rate": 0.00016754522227527589, + "loss": 0.2289, + "step": 30950 + }, + { + "epoch": 0.8597707046999643, + "grad_norm": 0.13223236799240112, + "learning_rate": 0.00016743474926788908, + "loss": 0.2303, + "step": 31000 + }, + { + "epoch": 0.8597707046999643, + "eval_loss": 0.22721892595291138, + "eval_runtime": 500.5938, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 31000 + }, + { + "epoch": 0.8611574316430287, + "grad_norm": 0.15615518391132355, + "learning_rate": 0.00016732412511671544, + "loss": 0.2306, + "step": 31050 + }, + { + "epoch": 0.8625441585860932, + "grad_norm": 0.14526858925819397, + "learning_rate": 0.0001672133500697005, + "loss": 0.2307, + "step": 31100 + }, + { + "epoch": 0.8639308855291576, + "grad_norm": 0.11307808756828308, + "learning_rate": 0.00016710242437512825, + "loss": 0.237, + "step": 31150 + }, + { + "epoch": 0.8653176124722222, + "grad_norm": 0.1289224922657013, + "learning_rate": 0.00016699134828162017, + "loss": 0.2344, + "step": 31200 + }, + { + "epoch": 0.8667043394152866, + "grad_norm": 0.1631319522857666, + "learning_rate": 0.00016688012203813486, + "loss": 0.2305, + "step": 31250 + }, + { + "epoch": 0.8680910663583511, + "grad_norm": 0.1249733492732048, + "learning_rate": 0.00016676874589396744, + "loss": 0.2301, + "step": 31300 + }, + { + "epoch": 0.8694777933014155, + "grad_norm": 0.11502408981323242, + "learning_rate": 0.00016665722009874905, + "loss": 0.2319, + "step": 31350 + }, + { + "epoch": 0.8708645202444799, + "grad_norm": 0.13455846905708313, + "learning_rate": 0.00016654554490244628, + "loss": 0.228, + "step": 31400 + }, + { + "epoch": 0.8722512471875444, + "grad_norm": 0.1758633404970169, + "learning_rate": 0.00016643372055536048, + "loss": 0.2309, + "step": 31450 + }, + { + "epoch": 0.8736379741306088, + "grad_norm": 0.11880768090486526, + "learning_rate": 0.00016632174730812734, + "loss": 0.23, + "step": 31500 + }, + { + "epoch": 0.8750247010736734, + "grad_norm": 0.13718900084495544, + "learning_rate": 0.0001662096254117163, + "loss": 0.2279, + "step": 31550 + }, + { + "epoch": 0.8764114280167378, + "grad_norm": 0.1170978993177414, + "learning_rate": 0.00016609735511743, + "loss": 0.2306, + "step": 31600 + }, + { + "epoch": 0.8777981549598023, + "grad_norm": 0.15582193434238434, + "learning_rate": 0.0001659849366769036, + "loss": 0.2312, + "step": 31650 + }, + { + "epoch": 0.8791848819028667, + "grad_norm": 0.12351904064416885, + "learning_rate": 0.00016587237034210435, + "loss": 0.2292, + "step": 31700 + }, + { + "epoch": 0.8805716088459312, + "grad_norm": 0.18479709327220917, + "learning_rate": 0.000165759656365331, + "loss": 0.2274, + "step": 31750 + }, + { + "epoch": 0.8819583357889956, + "grad_norm": 0.14211027324199677, + "learning_rate": 0.00016564679499921328, + "loss": 0.2298, + "step": 31800 + }, + { + "epoch": 0.88334506273206, + "grad_norm": 0.1540357619524002, + "learning_rate": 0.00016553378649671112, + "loss": 0.2304, + "step": 31850 + }, + { + "epoch": 0.8847317896751246, + "grad_norm": 0.12503454089164734, + "learning_rate": 0.00016542063111111427, + "loss": 0.2294, + "step": 31900 + }, + { + "epoch": 0.886118516618189, + "grad_norm": 0.13658925890922546, + "learning_rate": 0.00016530732909604177, + "loss": 0.2291, + "step": 31950 + }, + { + "epoch": 0.8875052435612535, + "grad_norm": 0.15731070935726166, + "learning_rate": 0.00016519388070544128, + "loss": 0.2322, + "step": 32000 + }, + { + "epoch": 0.8875052435612535, + "eval_loss": 0.22673186659812927, + "eval_runtime": 500.5013, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 32000 + }, + { + "epoch": 0.8888919705043179, + "grad_norm": 0.11884371191263199, + "learning_rate": 0.0001650802861935885, + "loss": 0.2312, + "step": 32050 + }, + { + "epoch": 0.8902786974473824, + "grad_norm": 0.168379008769989, + "learning_rate": 0.00016496654581508663, + "loss": 0.2312, + "step": 32100 + }, + { + "epoch": 0.8916654243904468, + "grad_norm": 0.11641304939985275, + "learning_rate": 0.00016485265982486591, + "loss": 0.2271, + "step": 32150 + }, + { + "epoch": 0.8930521513335113, + "grad_norm": 0.12015505880117416, + "learning_rate": 0.00016473862847818277, + "loss": 0.2308, + "step": 32200 + }, + { + "epoch": 0.8944388782765758, + "grad_norm": 0.17053671181201935, + "learning_rate": 0.00016462445203061957, + "loss": 0.2324, + "step": 32250 + }, + { + "epoch": 0.8958256052196402, + "grad_norm": 0.12947635352611542, + "learning_rate": 0.0001645101307380839, + "loss": 0.2318, + "step": 32300 + }, + { + "epoch": 0.8972123321627047, + "grad_norm": 0.11198735982179642, + "learning_rate": 0.00016439566485680783, + "loss": 0.23, + "step": 32350 + }, + { + "epoch": 0.8985990591057691, + "grad_norm": 0.1204909086227417, + "learning_rate": 0.00016428105464334772, + "loss": 0.23, + "step": 32400 + }, + { + "epoch": 0.8999857860488336, + "grad_norm": 0.11191330850124359, + "learning_rate": 0.00016416630035458326, + "loss": 0.2295, + "step": 32450 + }, + { + "epoch": 0.901372512991898, + "grad_norm": 0.10705868154764175, + "learning_rate": 0.00016405140224771717, + "loss": 0.2246, + "step": 32500 + }, + { + "epoch": 0.9027592399349625, + "grad_norm": 0.11882634460926056, + "learning_rate": 0.0001639363605802744, + "loss": 0.2345, + "step": 32550 + }, + { + "epoch": 0.904145966878027, + "grad_norm": 0.1181696355342865, + "learning_rate": 0.0001638211756101018, + "loss": 0.2306, + "step": 32600 + }, + { + "epoch": 0.9055326938210915, + "grad_norm": 0.1270473152399063, + "learning_rate": 0.00016370584759536734, + "loss": 0.2297, + "step": 32650 + }, + { + "epoch": 0.9069194207641559, + "grad_norm": 0.11503591388463974, + "learning_rate": 0.00016359037679455955, + "loss": 0.2292, + "step": 32700 + }, + { + "epoch": 0.9083061477072203, + "grad_norm": 0.11596430093050003, + "learning_rate": 0.0001634747634664871, + "loss": 0.2324, + "step": 32750 + }, + { + "epoch": 0.9096928746502848, + "grad_norm": 0.16631336510181427, + "learning_rate": 0.00016335900787027802, + "loss": 0.23, + "step": 32800 + }, + { + "epoch": 0.9110796015933492, + "grad_norm": 0.12083205580711365, + "learning_rate": 0.0001632431102653793, + "loss": 0.2295, + "step": 32850 + }, + { + "epoch": 0.9124663285364137, + "grad_norm": 0.1268964558839798, + "learning_rate": 0.00016312707091155609, + "loss": 0.2299, + "step": 32900 + }, + { + "epoch": 0.9138530554794781, + "grad_norm": 0.1737286001443863, + "learning_rate": 0.00016301089006889137, + "loss": 0.2291, + "step": 32950 + }, + { + "epoch": 0.9152397824225427, + "grad_norm": 0.12454930692911148, + "learning_rate": 0.00016289456799778522, + "loss": 0.2289, + "step": 33000 + }, + { + "epoch": 0.9152397824225427, + "eval_loss": 0.22642949223518372, + "eval_runtime": 500.8866, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 33000 + }, + { + "epoch": 0.9166265093656071, + "grad_norm": 0.12109609693288803, + "learning_rate": 0.00016277810495895419, + "loss": 0.2289, + "step": 33050 + }, + { + "epoch": 0.9180132363086716, + "grad_norm": 0.16857489943504333, + "learning_rate": 0.00016266150121343085, + "loss": 0.2265, + "step": 33100 + }, + { + "epoch": 0.919399963251736, + "grad_norm": 0.13193485140800476, + "learning_rate": 0.00016254475702256308, + "loss": 0.2277, + "step": 33150 + }, + { + "epoch": 0.9207866901948004, + "grad_norm": 0.13189518451690674, + "learning_rate": 0.0001624278726480137, + "loss": 0.2346, + "step": 33200 + }, + { + "epoch": 0.9221734171378649, + "grad_norm": 0.16021443903446198, + "learning_rate": 0.00016231084835175948, + "loss": 0.2273, + "step": 33250 + }, + { + "epoch": 0.9235601440809293, + "grad_norm": 0.14241939783096313, + "learning_rate": 0.00016219368439609103, + "loss": 0.236, + "step": 33300 + }, + { + "epoch": 0.9249468710239939, + "grad_norm": 0.18355390429496765, + "learning_rate": 0.0001620763810436119, + "loss": 0.2281, + "step": 33350 + }, + { + "epoch": 0.9263335979670583, + "grad_norm": 0.1321648508310318, + "learning_rate": 0.0001619612887687756, + "loss": 0.241, + "step": 33400 + }, + { + "epoch": 0.9277203249101228, + "grad_norm": 0.16118654608726501, + "learning_rate": 0.00016184371018656649, + "loss": 0.233, + "step": 33450 + }, + { + "epoch": 0.9291070518531872, + "grad_norm": 0.11974034458398819, + "learning_rate": 0.00016172599299195568, + "loss": 0.219, + "step": 33500 + }, + { + "epoch": 0.9304937787962517, + "grad_norm": 0.14652998745441437, + "learning_rate": 0.00016160813744878674, + "loss": 0.2316, + "step": 33550 + }, + { + "epoch": 0.9318805057393161, + "grad_norm": 0.09738484770059586, + "learning_rate": 0.0001614901438212133, + "loss": 0.2351, + "step": 33600 + }, + { + "epoch": 0.9332672326823805, + "grad_norm": 0.15131749212741852, + "learning_rate": 0.00016137201237369846, + "loss": 0.2281, + "step": 33650 + }, + { + "epoch": 0.9346539596254451, + "grad_norm": 0.16536715626716614, + "learning_rate": 0.00016125374337101422, + "loss": 0.2317, + "step": 33700 + }, + { + "epoch": 0.9360406865685095, + "grad_norm": 0.15788187086582184, + "learning_rate": 0.0001611353370782409, + "loss": 0.2261, + "step": 33750 + }, + { + "epoch": 0.937427413511574, + "grad_norm": 0.11554282158613205, + "learning_rate": 0.00016101679376076655, + "loss": 0.2288, + "step": 33800 + }, + { + "epoch": 0.9388141404546384, + "grad_norm": 0.1376064121723175, + "learning_rate": 0.00016089811368428633, + "loss": 0.2287, + "step": 33850 + }, + { + "epoch": 0.9402008673977029, + "grad_norm": 0.1270899623632431, + "learning_rate": 0.0001607792971148019, + "loss": 0.2232, + "step": 33900 + }, + { + "epoch": 0.9415875943407673, + "grad_norm": 0.1187126636505127, + "learning_rate": 0.00016066034431862084, + "loss": 0.2321, + "step": 33950 + }, + { + "epoch": 0.9429743212838319, + "grad_norm": 0.14895334839820862, + "learning_rate": 0.00016054125556235613, + "loss": 0.2306, + "step": 34000 + }, + { + "epoch": 0.9429743212838319, + "eval_loss": 0.22613388299942017, + "eval_runtime": 500.7207, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 34000 + }, + { + "epoch": 0.9443610482268963, + "grad_norm": 0.12219640612602234, + "learning_rate": 0.00016042203111292538, + "loss": 0.2315, + "step": 34050 + }, + { + "epoch": 0.9457477751699607, + "grad_norm": 0.1677113175392151, + "learning_rate": 0.00016030267123755038, + "loss": 0.2327, + "step": 34100 + }, + { + "epoch": 0.9471345021130252, + "grad_norm": 0.12030269205570221, + "learning_rate": 0.00016018317620375652, + "loss": 0.2282, + "step": 34150 + }, + { + "epoch": 0.9485212290560896, + "grad_norm": 0.13181360065937042, + "learning_rate": 0.00016006354627937203, + "loss": 0.2287, + "step": 34200 + }, + { + "epoch": 0.9499079559991541, + "grad_norm": 0.13087068498134613, + "learning_rate": 0.00015994378173252752, + "loss": 0.2282, + "step": 34250 + }, + { + "epoch": 0.9512946829422185, + "grad_norm": 0.14467494189739227, + "learning_rate": 0.0001598238828316553, + "loss": 0.2254, + "step": 34300 + }, + { + "epoch": 0.952681409885283, + "grad_norm": 0.14921946823596954, + "learning_rate": 0.00015970384984548885, + "loss": 0.2324, + "step": 34350 + }, + { + "epoch": 0.9540681368283475, + "grad_norm": 0.19342415034770966, + "learning_rate": 0.0001595836830430622, + "loss": 0.2342, + "step": 34400 + }, + { + "epoch": 0.955454863771412, + "grad_norm": 0.12381652742624283, + "learning_rate": 0.00015946338269370923, + "loss": 0.2262, + "step": 34450 + }, + { + "epoch": 0.9568415907144764, + "grad_norm": 0.1456434279680252, + "learning_rate": 0.00015934294906706315, + "loss": 0.2277, + "step": 34500 + }, + { + "epoch": 0.9582283176575408, + "grad_norm": 0.11485321074724197, + "learning_rate": 0.000159222382433056, + "loss": 0.2355, + "step": 34550 + }, + { + "epoch": 0.9596150446006053, + "grad_norm": 0.10027427971363068, + "learning_rate": 0.00015910168306191785, + "loss": 0.2269, + "step": 34600 + }, + { + "epoch": 0.9610017715436697, + "grad_norm": 0.16801820695400238, + "learning_rate": 0.0001589808512241763, + "loss": 0.2282, + "step": 34650 + }, + { + "epoch": 0.9623884984867342, + "grad_norm": 0.11840588599443436, + "learning_rate": 0.00015885988719065573, + "loss": 0.2304, + "step": 34700 + }, + { + "epoch": 0.9637752254297987, + "grad_norm": 0.16810324788093567, + "learning_rate": 0.00015873879123247706, + "loss": 0.231, + "step": 34750 + }, + { + "epoch": 0.9651619523728632, + "grad_norm": 0.1277480274438858, + "learning_rate": 0.0001586175636210567, + "loss": 0.2292, + "step": 34800 + }, + { + "epoch": 0.9665486793159276, + "grad_norm": 0.13225620985031128, + "learning_rate": 0.0001584962046281062, + "loss": 0.2255, + "step": 34850 + }, + { + "epoch": 0.9679354062589921, + "grad_norm": 0.14994849264621735, + "learning_rate": 0.00015837471452563159, + "loss": 0.2306, + "step": 34900 + }, + { + "epoch": 0.9693221332020565, + "grad_norm": 0.11426250636577606, + "learning_rate": 0.00015825309358593272, + "loss": 0.2311, + "step": 34950 + }, + { + "epoch": 0.9707088601451209, + "grad_norm": 0.1453811228275299, + "learning_rate": 0.00015813134208160276, + "loss": 0.2276, + "step": 35000 + }, + { + "epoch": 0.9707088601451209, + "eval_loss": 0.22605940699577332, + "eval_runtime": 500.6317, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 35000 + }, + { + "epoch": 0.9720955870881854, + "grad_norm": 0.14036044478416443, + "learning_rate": 0.0001580094602855275, + "loss": 0.2241, + "step": 35050 + }, + { + "epoch": 0.9734823140312499, + "grad_norm": 0.1456310898065567, + "learning_rate": 0.00015788744847088464, + "loss": 0.2352, + "step": 35100 + }, + { + "epoch": 0.9748690409743144, + "grad_norm": 0.1325587034225464, + "learning_rate": 0.0001577653069111435, + "loss": 0.2267, + "step": 35150 + }, + { + "epoch": 0.9762557679173788, + "grad_norm": 0.13475272059440613, + "learning_rate": 0.000157643035880064, + "loss": 0.232, + "step": 35200 + }, + { + "epoch": 0.9776424948604433, + "grad_norm": 0.13557064533233643, + "learning_rate": 0.00015752063565169645, + "loss": 0.2342, + "step": 35250 + }, + { + "epoch": 0.9790292218035077, + "grad_norm": 0.149173304438591, + "learning_rate": 0.00015739810650038054, + "loss": 0.2284, + "step": 35300 + }, + { + "epoch": 0.9804159487465722, + "grad_norm": 0.11646503955125809, + "learning_rate": 0.00015727544870074503, + "loss": 0.2259, + "step": 35350 + }, + { + "epoch": 0.9818026756896366, + "grad_norm": 0.126033216714859, + "learning_rate": 0.000157152662527707, + "loss": 0.2289, + "step": 35400 + }, + { + "epoch": 0.983189402632701, + "grad_norm": 0.17162640392780304, + "learning_rate": 0.00015702974825647123, + "loss": 0.2293, + "step": 35450 + }, + { + "epoch": 0.9845761295757656, + "grad_norm": 0.12047728151082993, + "learning_rate": 0.0001569067061625297, + "loss": 0.2265, + "step": 35500 + }, + { + "epoch": 0.98596285651883, + "grad_norm": 0.1183520033955574, + "learning_rate": 0.00015678353652166078, + "loss": 0.2272, + "step": 35550 + }, + { + "epoch": 0.9873495834618945, + "grad_norm": 0.13919849693775177, + "learning_rate": 0.00015666023960992878, + "loss": 0.2295, + "step": 35600 + }, + { + "epoch": 0.9887363104049589, + "grad_norm": 0.14626280963420868, + "learning_rate": 0.00015653681570368318, + "loss": 0.2293, + "step": 35650 + }, + { + "epoch": 0.9901230373480234, + "grad_norm": 0.11618024855852127, + "learning_rate": 0.00015641326507955823, + "loss": 0.2264, + "step": 35700 + }, + { + "epoch": 0.9915097642910878, + "grad_norm": 0.12280390411615372, + "learning_rate": 0.0001562895880144721, + "loss": 0.233, + "step": 35750 + }, + { + "epoch": 0.9928964912341524, + "grad_norm": 0.11896737664937973, + "learning_rate": 0.0001561657847856264, + "loss": 0.2276, + "step": 35800 + }, + { + "epoch": 0.9942832181772168, + "grad_norm": 0.1226055920124054, + "learning_rate": 0.0001560418556705055, + "loss": 0.2364, + "step": 35850 + }, + { + "epoch": 0.9956699451202812, + "grad_norm": 0.1566486656665802, + "learning_rate": 0.00015591780094687587, + "loss": 0.2315, + "step": 35900 + }, + { + "epoch": 0.9970566720633457, + "grad_norm": 0.12156879901885986, + "learning_rate": 0.0001557936208927856, + "loss": 0.2284, + "step": 35950 + }, + { + "epoch": 0.9984433990064101, + "grad_norm": 0.12765392661094666, + "learning_rate": 0.00015566931578656366, + "loss": 0.2319, + "step": 36000 + }, + { + "epoch": 0.9984433990064101, + "eval_loss": 0.22568126022815704, + "eval_runtime": 500.5568, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 36000 + }, + { + "epoch": 0.9998301259494746, + "grad_norm": 0.11263388395309448, + "learning_rate": 0.00015554488590681934, + "loss": 0.2249, + "step": 36050 + }, + { + "epoch": 1.0012168528925391, + "grad_norm": 0.12134028226137161, + "learning_rate": 0.00015542033153244142, + "loss": 0.2296, + "step": 36100 + }, + { + "epoch": 1.0026035798356034, + "grad_norm": 0.12478175759315491, + "learning_rate": 0.00015529565294259795, + "loss": 0.2295, + "step": 36150 + }, + { + "epoch": 1.003990306778668, + "grad_norm": 0.1091291755437851, + "learning_rate": 0.0001551708504167352, + "loss": 0.2285, + "step": 36200 + }, + { + "epoch": 1.0053770337217325, + "grad_norm": 0.11158731579780579, + "learning_rate": 0.00015504592423457733, + "loss": 0.2267, + "step": 36250 + }, + { + "epoch": 1.006763760664797, + "grad_norm": 0.17226600646972656, + "learning_rate": 0.00015492087467612562, + "loss": 0.2369, + "step": 36300 + }, + { + "epoch": 1.0081504876078613, + "grad_norm": 0.10548936575651169, + "learning_rate": 0.00015479570202165784, + "loss": 0.2257, + "step": 36350 + }, + { + "epoch": 1.0095372145509258, + "grad_norm": 0.12710842490196228, + "learning_rate": 0.0001546704065517278, + "loss": 0.2283, + "step": 36400 + }, + { + "epoch": 1.0109239414939903, + "grad_norm": 0.13734006881713867, + "learning_rate": 0.0001545449885471644, + "loss": 0.2266, + "step": 36450 + }, + { + "epoch": 1.0123106684370546, + "grad_norm": 0.14669275283813477, + "learning_rate": 0.00015441944828907124, + "loss": 0.2265, + "step": 36500 + }, + { + "epoch": 1.0136973953801192, + "grad_norm": 0.10941125452518463, + "learning_rate": 0.000154293786058826, + "loss": 0.231, + "step": 36550 + }, + { + "epoch": 1.0150841223231837, + "grad_norm": 0.12528035044670105, + "learning_rate": 0.00015416800213807972, + "loss": 0.2286, + "step": 36600 + }, + { + "epoch": 1.0164708492662482, + "grad_norm": 0.1242556944489479, + "learning_rate": 0.00015404209680875607, + "loss": 0.2277, + "step": 36650 + }, + { + "epoch": 1.0178575762093125, + "grad_norm": 0.09937360137701035, + "learning_rate": 0.000153916070353051, + "loss": 0.2247, + "step": 36700 + }, + { + "epoch": 1.019244303152377, + "grad_norm": 0.11109854280948639, + "learning_rate": 0.00015378992305343183, + "loss": 0.2248, + "step": 36750 + }, + { + "epoch": 1.0206310300954415, + "grad_norm": 0.14019356667995453, + "learning_rate": 0.00015366365519263683, + "loss": 0.2252, + "step": 36800 + }, + { + "epoch": 1.0220177570385058, + "grad_norm": 0.11496023088693619, + "learning_rate": 0.00015353979599334788, + "loss": 0.2228, + "step": 36850 + }, + { + "epoch": 1.0234044839815704, + "grad_norm": 0.15292219817638397, + "learning_rate": 0.0001534132902566159, + "loss": 0.2307, + "step": 36900 + }, + { + "epoch": 1.0247912109246349, + "grad_norm": 0.12410300970077515, + "learning_rate": 0.00015328666480286793, + "loss": 0.2263, + "step": 36950 + }, + { + "epoch": 1.0261779378676994, + "grad_norm": 0.14905387163162231, + "learning_rate": 0.00015315991991591386, + "loss": 0.2228, + "step": 37000 + }, + { + "epoch": 1.0261779378676994, + "eval_loss": 0.22574713826179504, + "eval_runtime": 500.6484, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 37000 + }, + { + "epoch": 1.0275646648107637, + "grad_norm": 0.12682612240314484, + "learning_rate": 0.0001530330558798313, + "loss": 0.2257, + "step": 37050 + }, + { + "epoch": 1.0289513917538282, + "grad_norm": 0.15558844804763794, + "learning_rate": 0.00015290607297896482, + "loss": 0.2259, + "step": 37100 + }, + { + "epoch": 1.0303381186968927, + "grad_norm": 0.16526414453983307, + "learning_rate": 0.00015277897149792562, + "loss": 0.2301, + "step": 37150 + }, + { + "epoch": 1.0317248456399573, + "grad_norm": 0.1130262240767479, + "learning_rate": 0.0001526517517215905, + "loss": 0.2244, + "step": 37200 + }, + { + "epoch": 1.0331115725830216, + "grad_norm": 0.12639841437339783, + "learning_rate": 0.00015252441393510146, + "loss": 0.2269, + "step": 37250 + }, + { + "epoch": 1.034498299526086, + "grad_norm": 0.12753638625144958, + "learning_rate": 0.000152396958423865, + "loss": 0.2277, + "step": 37300 + }, + { + "epoch": 1.0358850264691506, + "grad_norm": 0.1574636995792389, + "learning_rate": 0.00015226938547355145, + "loss": 0.2302, + "step": 37350 + }, + { + "epoch": 1.037271753412215, + "grad_norm": 0.1075245812535286, + "learning_rate": 0.0001521416953700944, + "loss": 0.2318, + "step": 37400 + }, + { + "epoch": 1.0386584803552794, + "grad_norm": 0.15765556693077087, + "learning_rate": 0.00015201388839969005, + "loss": 0.2271, + "step": 37450 + }, + { + "epoch": 1.040045207298344, + "grad_norm": 0.14305494725704193, + "learning_rate": 0.00015188596484879636, + "loss": 0.2268, + "step": 37500 + }, + { + "epoch": 1.0414319342414085, + "grad_norm": 0.14217057824134827, + "learning_rate": 0.0001517579250041328, + "loss": 0.2302, + "step": 37550 + }, + { + "epoch": 1.0428186611844728, + "grad_norm": 0.12122397124767303, + "learning_rate": 0.00015162976915267948, + "loss": 0.2264, + "step": 37600 + }, + { + "epoch": 1.0442053881275373, + "grad_norm": 0.1215621680021286, + "learning_rate": 0.00015150149758167634, + "loss": 0.2239, + "step": 37650 + }, + { + "epoch": 1.0455921150706018, + "grad_norm": 0.1759423315525055, + "learning_rate": 0.00015137311057862279, + "loss": 0.2244, + "step": 37700 + }, + { + "epoch": 1.046978842013666, + "grad_norm": 0.11546457558870316, + "learning_rate": 0.00015124460843127704, + "loss": 0.226, + "step": 37750 + }, + { + "epoch": 1.0483655689567306, + "grad_norm": 0.16507115960121155, + "learning_rate": 0.00015111599142765526, + "loss": 0.2267, + "step": 37800 + }, + { + "epoch": 1.0497522958997951, + "grad_norm": 0.15918377041816711, + "learning_rate": 0.0001509872598560311, + "loss": 0.2265, + "step": 37850 + }, + { + "epoch": 1.0511390228428596, + "grad_norm": 0.12590187788009644, + "learning_rate": 0.000150858414004935, + "loss": 0.2285, + "step": 37900 + }, + { + "epoch": 1.052525749785924, + "grad_norm": 0.11883638054132462, + "learning_rate": 0.0001507294541631535, + "loss": 0.2233, + "step": 37950 + }, + { + "epoch": 1.0539124767289885, + "grad_norm": 0.11353275179862976, + "learning_rate": 0.00015060038061972874, + "loss": 0.2238, + "step": 38000 + }, + { + "epoch": 1.0539124767289885, + "eval_loss": 0.22568707168102264, + "eval_runtime": 500.8783, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 38000 + }, + { + "epoch": 1.055299203672053, + "grad_norm": 0.1161685511469841, + "learning_rate": 0.00015047119366395757, + "loss": 0.2292, + "step": 38050 + }, + { + "epoch": 1.0566859306151175, + "grad_norm": 0.13814447820186615, + "learning_rate": 0.00015034189358539103, + "loss": 0.2251, + "step": 38100 + }, + { + "epoch": 1.0580726575581818, + "grad_norm": 0.15208768844604492, + "learning_rate": 0.00015021248067383387, + "loss": 0.2286, + "step": 38150 + }, + { + "epoch": 1.0594593845012463, + "grad_norm": 0.12832270562648773, + "learning_rate": 0.00015008295521934354, + "loss": 0.229, + "step": 38200 + }, + { + "epoch": 1.0608461114443108, + "grad_norm": 0.12442856281995773, + "learning_rate": 0.00014995331751222992, + "loss": 0.2286, + "step": 38250 + }, + { + "epoch": 1.0622328383873751, + "grad_norm": 0.14005307853221893, + "learning_rate": 0.00014982356784305428, + "loss": 0.2293, + "step": 38300 + }, + { + "epoch": 1.0636195653304397, + "grad_norm": 0.14418749511241913, + "learning_rate": 0.00014969370650262903, + "loss": 0.2328, + "step": 38350 + }, + { + "epoch": 1.0650062922735042, + "grad_norm": 0.11833231151103973, + "learning_rate": 0.00014956373378201677, + "loss": 0.2273, + "step": 38400 + }, + { + "epoch": 1.0663930192165687, + "grad_norm": 0.12782081961631775, + "learning_rate": 0.00014943364997252977, + "loss": 0.2224, + "step": 38450 + }, + { + "epoch": 1.067779746159633, + "grad_norm": 0.11903475224971771, + "learning_rate": 0.00014930345536572924, + "loss": 0.2256, + "step": 38500 + }, + { + "epoch": 1.0691664731026975, + "grad_norm": 0.17546679079532623, + "learning_rate": 0.00014917315025342483, + "loss": 0.2306, + "step": 38550 + }, + { + "epoch": 1.070553200045762, + "grad_norm": 0.16552455723285675, + "learning_rate": 0.0001490427349276737, + "loss": 0.2242, + "step": 38600 + }, + { + "epoch": 1.0719399269888266, + "grad_norm": 0.11756553500890732, + "learning_rate": 0.00014891220968078024, + "loss": 0.223, + "step": 38650 + }, + { + "epoch": 1.0733266539318909, + "grad_norm": 0.13542614877223969, + "learning_rate": 0.000148781574805295, + "loss": 0.2293, + "step": 38700 + }, + { + "epoch": 1.0747133808749554, + "grad_norm": 0.1370215266942978, + "learning_rate": 0.00014865083059401445, + "loss": 0.2291, + "step": 38750 + }, + { + "epoch": 1.07610010781802, + "grad_norm": 0.1472005844116211, + "learning_rate": 0.00014851997733997992, + "loss": 0.2272, + "step": 38800 + }, + { + "epoch": 1.0774868347610842, + "grad_norm": 0.1240694522857666, + "learning_rate": 0.00014838901533647733, + "loss": 0.2237, + "step": 38850 + }, + { + "epoch": 1.0788735617041487, + "grad_norm": 0.11901194602251053, + "learning_rate": 0.0001482579448770362, + "loss": 0.2285, + "step": 38900 + }, + { + "epoch": 1.0802602886472132, + "grad_norm": 0.2202654331922531, + "learning_rate": 0.0001481267662554292, + "loss": 0.2321, + "step": 38950 + }, + { + "epoch": 1.0816470155902778, + "grad_norm": 0.11475471407175064, + "learning_rate": 0.00014799547976567144, + "loss": 0.2296, + "step": 39000 + }, + { + "epoch": 1.0816470155902778, + "eval_loss": 0.2248746156692505, + "eval_runtime": 500.4656, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 39000 + }, + { + "epoch": 1.083033742533342, + "grad_norm": 0.1217503771185875, + "learning_rate": 0.00014786408570201975, + "loss": 0.2223, + "step": 39050 + }, + { + "epoch": 1.0844204694764066, + "grad_norm": 0.14427083730697632, + "learning_rate": 0.00014773258435897207, + "loss": 0.2279, + "step": 39100 + }, + { + "epoch": 1.085807196419471, + "grad_norm": 0.11865708976984024, + "learning_rate": 0.00014760097603126689, + "loss": 0.2295, + "step": 39150 + }, + { + "epoch": 1.0871939233625354, + "grad_norm": 0.14178717136383057, + "learning_rate": 0.0001474718963578798, + "loss": 0.2261, + "step": 39200 + }, + { + "epoch": 1.0885806503056, + "grad_norm": 0.15393276512622833, + "learning_rate": 0.0001473400770710278, + "loss": 0.2308, + "step": 39250 + }, + { + "epoch": 1.0899673772486644, + "grad_norm": 0.11602922528982162, + "learning_rate": 0.00014720815167925812, + "loss": 0.2283, + "step": 39300 + }, + { + "epoch": 1.091354104191729, + "grad_norm": 0.16645793616771698, + "learning_rate": 0.00014707612047825964, + "loss": 0.233, + "step": 39350 + }, + { + "epoch": 1.0927408311347933, + "grad_norm": 0.10213354974985123, + "learning_rate": 0.00014694398376395825, + "loss": 0.2277, + "step": 39400 + }, + { + "epoch": 1.0941275580778578, + "grad_norm": 0.11264722794294357, + "learning_rate": 0.0001468117418325166, + "loss": 0.2267, + "step": 39450 + }, + { + "epoch": 1.0955142850209223, + "grad_norm": 0.12596255540847778, + "learning_rate": 0.00014667939498033293, + "loss": 0.2226, + "step": 39500 + }, + { + "epoch": 1.0969010119639866, + "grad_norm": 0.10382383316755295, + "learning_rate": 0.0001465469435040407, + "loss": 0.2297, + "step": 39550 + }, + { + "epoch": 1.0982877389070511, + "grad_norm": 0.12972958385944366, + "learning_rate": 0.00014641438770050794, + "loss": 0.2256, + "step": 39600 + }, + { + "epoch": 1.0996744658501156, + "grad_norm": 0.13036096096038818, + "learning_rate": 0.00014628172786683641, + "loss": 0.2235, + "step": 39650 + }, + { + "epoch": 1.1010611927931802, + "grad_norm": 0.1233506128191948, + "learning_rate": 0.00014614896430036113, + "loss": 0.2243, + "step": 39700 + }, + { + "epoch": 1.1024479197362445, + "grad_norm": 0.11503315716981888, + "learning_rate": 0.00014601609729864956, + "loss": 0.2285, + "step": 39750 + }, + { + "epoch": 1.103834646679309, + "grad_norm": 0.12343501299619675, + "learning_rate": 0.000145883127159501, + "loss": 0.2272, + "step": 39800 + }, + { + "epoch": 1.1052213736223735, + "grad_norm": 0.1226864606142044, + "learning_rate": 0.00014575005418094594, + "loss": 0.2332, + "step": 39850 + }, + { + "epoch": 1.106608100565438, + "grad_norm": 0.1333167850971222, + "learning_rate": 0.00014561687866124535, + "loss": 0.2304, + "step": 39900 + }, + { + "epoch": 1.1079948275085023, + "grad_norm": 0.1088777631521225, + "learning_rate": 0.00014548360089889002, + "loss": 0.2296, + "step": 39950 + }, + { + "epoch": 1.1093815544515668, + "grad_norm": 0.11975093185901642, + "learning_rate": 0.00014535022119259994, + "loss": 0.2255, + "step": 40000 + }, + { + "epoch": 1.1093815544515668, + "eval_loss": 0.22516606748104095, + "eval_runtime": 500.4411, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 40000 + }, + { + "epoch": 1.1107682813946314, + "grad_norm": 0.19725576043128967, + "learning_rate": 0.0001452167398413235, + "loss": 0.2317, + "step": 40050 + }, + { + "epoch": 1.1121550083376956, + "grad_norm": 0.12385617196559906, + "learning_rate": 0.00014508315714423706, + "loss": 0.2269, + "step": 40100 + }, + { + "epoch": 1.1135417352807602, + "grad_norm": 0.12559738755226135, + "learning_rate": 0.000144949473400744, + "loss": 0.2295, + "step": 40150 + }, + { + "epoch": 1.1149284622238247, + "grad_norm": 0.1279434859752655, + "learning_rate": 0.0001448156889104742, + "loss": 0.2283, + "step": 40200 + }, + { + "epoch": 1.1163151891668892, + "grad_norm": 0.14756010472774506, + "learning_rate": 0.0001446818039732834, + "loss": 0.2267, + "step": 40250 + }, + { + "epoch": 1.1177019161099535, + "grad_norm": 0.11476084589958191, + "learning_rate": 0.00014454781888925238, + "loss": 0.2265, + "step": 40300 + }, + { + "epoch": 1.119088643053018, + "grad_norm": 0.12701088190078735, + "learning_rate": 0.00014441373395868653, + "loss": 0.2255, + "step": 40350 + }, + { + "epoch": 1.1204753699960825, + "grad_norm": 0.14300104975700378, + "learning_rate": 0.00014427954948211493, + "loss": 0.227, + "step": 40400 + }, + { + "epoch": 1.121862096939147, + "grad_norm": 0.11292553693056107, + "learning_rate": 0.00014414526576028973, + "loss": 0.2239, + "step": 40450 + }, + { + "epoch": 1.1232488238822114, + "grad_norm": 0.1404883861541748, + "learning_rate": 0.00014401088309418564, + "loss": 0.2234, + "step": 40500 + }, + { + "epoch": 1.1246355508252759, + "grad_norm": 0.15262041985988617, + "learning_rate": 0.00014387640178499905, + "loss": 0.2319, + "step": 40550 + }, + { + "epoch": 1.1260222777683404, + "grad_norm": 0.16456229984760284, + "learning_rate": 0.0001437418221341475, + "loss": 0.2264, + "step": 40600 + }, + { + "epoch": 1.1274090047114047, + "grad_norm": 0.12468329817056656, + "learning_rate": 0.0001436071444432689, + "loss": 0.2273, + "step": 40650 + }, + { + "epoch": 1.1287957316544692, + "grad_norm": 0.12449460476636887, + "learning_rate": 0.0001434723690142209, + "loss": 0.2333, + "step": 40700 + }, + { + "epoch": 1.1301824585975337, + "grad_norm": 0.12426210194826126, + "learning_rate": 0.0001433374961490803, + "loss": 0.2328, + "step": 40750 + }, + { + "epoch": 1.1315691855405983, + "grad_norm": 0.1501815766096115, + "learning_rate": 0.00014320252615014216, + "loss": 0.2214, + "step": 40800 + }, + { + "epoch": 1.1329559124836626, + "grad_norm": 0.15881818532943726, + "learning_rate": 0.00014306745931991932, + "loss": 0.2292, + "step": 40850 + }, + { + "epoch": 1.134342639426727, + "grad_norm": 0.12299991399049759, + "learning_rate": 0.00014293229596114163, + "loss": 0.2238, + "step": 40900 + }, + { + "epoch": 1.1357293663697916, + "grad_norm": 0.14259304106235504, + "learning_rate": 0.0001427970363767553, + "loss": 0.2291, + "step": 40950 + }, + { + "epoch": 1.137116093312856, + "grad_norm": 0.12536148726940155, + "learning_rate": 0.00014266168086992225, + "loss": 0.2252, + "step": 41000 + }, + { + "epoch": 1.137116093312856, + "eval_loss": 0.2245665341615677, + "eval_runtime": 501.2828, + "eval_samples_per_second": 5.699, + "eval_steps_per_second": 5.699, + "step": 41000 + }, + { + "epoch": 1.1385028202559204, + "grad_norm": 0.12410587817430496, + "learning_rate": 0.00014252622974401932, + "loss": 0.2268, + "step": 41050 + }, + { + "epoch": 1.139889547198985, + "grad_norm": 0.12877434492111206, + "learning_rate": 0.00014239068330263775, + "loss": 0.2258, + "step": 41100 + }, + { + "epoch": 1.1412762741420495, + "grad_norm": 0.1299249529838562, + "learning_rate": 0.00014225504184958232, + "loss": 0.2301, + "step": 41150 + }, + { + "epoch": 1.1426630010851138, + "grad_norm": 0.15234452486038208, + "learning_rate": 0.00014211930568887088, + "loss": 0.2192, + "step": 41200 + }, + { + "epoch": 1.1440497280281783, + "grad_norm": 0.12678442895412445, + "learning_rate": 0.00014198347512473343, + "loss": 0.2311, + "step": 41250 + }, + { + "epoch": 1.1454364549712428, + "grad_norm": 0.12326008826494217, + "learning_rate": 0.0001418475504616116, + "loss": 0.2318, + "step": 41300 + }, + { + "epoch": 1.146823181914307, + "grad_norm": 0.11192907392978668, + "learning_rate": 0.00014171153200415797, + "loss": 0.2232, + "step": 41350 + }, + { + "epoch": 1.1482099088573716, + "grad_norm": 0.11843819916248322, + "learning_rate": 0.00014157542005723532, + "loss": 0.2277, + "step": 41400 + }, + { + "epoch": 1.1495966358004361, + "grad_norm": 0.12903502583503723, + "learning_rate": 0.0001414419399397752, + "loss": 0.2237, + "step": 41450 + }, + { + "epoch": 1.1509833627435007, + "grad_norm": 0.13532768189907074, + "learning_rate": 0.00014130564378392948, + "loss": 0.2291, + "step": 41500 + }, + { + "epoch": 1.152370089686565, + "grad_norm": 0.11242423951625824, + "learning_rate": 0.00014116925504834574, + "loss": 0.2263, + "step": 41550 + }, + { + "epoch": 1.1537568166296295, + "grad_norm": 0.14420267939567566, + "learning_rate": 0.00014103277403871667, + "loss": 0.231, + "step": 41600 + }, + { + "epoch": 1.155143543572694, + "grad_norm": 0.11390483379364014, + "learning_rate": 0.00014089620106094174, + "loss": 0.2281, + "step": 41650 + }, + { + "epoch": 1.1565302705157583, + "grad_norm": 0.10996092855930328, + "learning_rate": 0.0001407595364211267, + "loss": 0.223, + "step": 41700 + }, + { + "epoch": 1.1579169974588228, + "grad_norm": 0.1297358274459839, + "learning_rate": 0.00014062278042558253, + "loss": 0.2251, + "step": 41750 + }, + { + "epoch": 1.1593037244018873, + "grad_norm": 0.13994191586971283, + "learning_rate": 0.00014048593338082508, + "loss": 0.2261, + "step": 41800 + }, + { + "epoch": 1.1606904513449519, + "grad_norm": 0.15100865066051483, + "learning_rate": 0.00014034899559357432, + "loss": 0.2257, + "step": 41850 + }, + { + "epoch": 1.1620771782880164, + "grad_norm": 0.1151217371225357, + "learning_rate": 0.0001402119673707535, + "loss": 0.2278, + "step": 41900 + }, + { + "epoch": 1.1634639052310807, + "grad_norm": 0.1580880582332611, + "learning_rate": 0.00014007484901948865, + "loss": 0.2247, + "step": 41950 + }, + { + "epoch": 1.1648506321741452, + "grad_norm": 0.1323232203722, + "learning_rate": 0.00013993764084710777, + "loss": 0.2229, + "step": 42000 + }, + { + "epoch": 1.1648506321741452, + "eval_loss": 0.22439424693584442, + "eval_runtime": 501.4893, + "eval_samples_per_second": 5.697, + "eval_steps_per_second": 5.697, + "step": 42000 + }, + { + "epoch": 1.1662373591172097, + "grad_norm": 0.11002755165100098, + "learning_rate": 0.00013980034316114014, + "loss": 0.2287, + "step": 42050 + }, + { + "epoch": 1.167624086060274, + "grad_norm": 0.16875265538692474, + "learning_rate": 0.00013966295626931575, + "loss": 0.2268, + "step": 42100 + }, + { + "epoch": 1.1690108130033385, + "grad_norm": 0.1291196197271347, + "learning_rate": 0.0001395254804795645, + "loss": 0.2267, + "step": 42150 + }, + { + "epoch": 1.170397539946403, + "grad_norm": 0.12030452489852905, + "learning_rate": 0.0001393879161000155, + "loss": 0.2284, + "step": 42200 + }, + { + "epoch": 1.1717842668894676, + "grad_norm": 0.1254565715789795, + "learning_rate": 0.00013925026343899644, + "loss": 0.2325, + "step": 42250 + }, + { + "epoch": 1.1731709938325319, + "grad_norm": 0.10753902792930603, + "learning_rate": 0.000139112522805033, + "loss": 0.2265, + "step": 42300 + }, + { + "epoch": 1.1745577207755964, + "grad_norm": 0.14079649746418, + "learning_rate": 0.00013897469450684783, + "loss": 0.2279, + "step": 42350 + }, + { + "epoch": 1.175944447718661, + "grad_norm": 0.13644090294837952, + "learning_rate": 0.00013883677885336013, + "loss": 0.2264, + "step": 42400 + }, + { + "epoch": 1.1773311746617252, + "grad_norm": 0.15901681780815125, + "learning_rate": 0.000138698776153685, + "loss": 0.2274, + "step": 42450 + }, + { + "epoch": 1.1787179016047897, + "grad_norm": 0.14739197492599487, + "learning_rate": 0.00013856068671713254, + "loss": 0.2223, + "step": 42500 + }, + { + "epoch": 1.1801046285478543, + "grad_norm": 0.1077587679028511, + "learning_rate": 0.00013842251085320728, + "loss": 0.2257, + "step": 42550 + }, + { + "epoch": 1.1814913554909188, + "grad_norm": 0.12596414983272552, + "learning_rate": 0.00013828424887160745, + "loss": 0.2251, + "step": 42600 + }, + { + "epoch": 1.182878082433983, + "grad_norm": 0.11234478652477264, + "learning_rate": 0.0001381459010822243, + "loss": 0.2225, + "step": 42650 + }, + { + "epoch": 1.1842648093770476, + "grad_norm": 0.11206696927547455, + "learning_rate": 0.00013800746779514143, + "loss": 0.2266, + "step": 42700 + }, + { + "epoch": 1.185651536320112, + "grad_norm": 0.10260911285877228, + "learning_rate": 0.0001378689493206341, + "loss": 0.2241, + "step": 42750 + }, + { + "epoch": 1.1870382632631764, + "grad_norm": 0.12874187529087067, + "learning_rate": 0.0001377303459691684, + "loss": 0.2277, + "step": 42800 + }, + { + "epoch": 1.188424990206241, + "grad_norm": 0.1351606696844101, + "learning_rate": 0.0001375916580514007, + "loss": 0.2268, + "step": 42850 + }, + { + "epoch": 1.1898117171493054, + "grad_norm": 0.1250632107257843, + "learning_rate": 0.000137452885878177, + "loss": 0.2265, + "step": 42900 + }, + { + "epoch": 1.19119844409237, + "grad_norm": 0.12516459822654724, + "learning_rate": 0.00013731402976053202, + "loss": 0.2256, + "step": 42950 + }, + { + "epoch": 1.1925851710354343, + "grad_norm": 0.12791725993156433, + "learning_rate": 0.00013717509000968865, + "loss": 0.2252, + "step": 43000 + }, + { + "epoch": 1.1925851710354343, + "eval_loss": 0.22418725490570068, + "eval_runtime": 501.0375, + "eval_samples_per_second": 5.702, + "eval_steps_per_second": 5.702, + "step": 43000 + }, + { + "epoch": 1.1939718979784988, + "grad_norm": 0.152371346950531, + "learning_rate": 0.00013703606693705732, + "loss": 0.2308, + "step": 43050 + }, + { + "epoch": 1.1953586249215633, + "grad_norm": 0.14723214507102966, + "learning_rate": 0.0001368969608542351, + "loss": 0.2258, + "step": 43100 + }, + { + "epoch": 1.1967453518646276, + "grad_norm": 0.1414303481578827, + "learning_rate": 0.00013675777207300524, + "loss": 0.2278, + "step": 43150 + }, + { + "epoch": 1.1981320788076921, + "grad_norm": 0.15416811406612396, + "learning_rate": 0.00013661850090533617, + "loss": 0.2324, + "step": 43200 + }, + { + "epoch": 1.1995188057507566, + "grad_norm": 0.11736203730106354, + "learning_rate": 0.00013647914766338112, + "loss": 0.2292, + "step": 43250 + }, + { + "epoch": 1.2009055326938212, + "grad_norm": 0.1547485738992691, + "learning_rate": 0.00013633971265947722, + "loss": 0.2281, + "step": 43300 + }, + { + "epoch": 1.2022922596368855, + "grad_norm": 0.15800827741622925, + "learning_rate": 0.0001362001962061449, + "loss": 0.2296, + "step": 43350 + }, + { + "epoch": 1.20367898657995, + "grad_norm": 0.15381957590579987, + "learning_rate": 0.0001360605986160871, + "loss": 0.2291, + "step": 43400 + }, + { + "epoch": 1.2050657135230145, + "grad_norm": 0.17754536867141724, + "learning_rate": 0.00013592092020218855, + "loss": 0.2285, + "step": 43450 + }, + { + "epoch": 1.2064524404660788, + "grad_norm": 0.1404140442609787, + "learning_rate": 0.0001357811612775153, + "loss": 0.2253, + "step": 43500 + }, + { + "epoch": 1.2078391674091433, + "grad_norm": 0.11709395796060562, + "learning_rate": 0.00013564132215531372, + "loss": 0.2261, + "step": 43550 + }, + { + "epoch": 1.2092258943522078, + "grad_norm": 0.11466790735721588, + "learning_rate": 0.00013550140314901, + "loss": 0.2295, + "step": 43600 + }, + { + "epoch": 1.2106126212952724, + "grad_norm": 0.14058195054531097, + "learning_rate": 0.00013536140457220933, + "loss": 0.2307, + "step": 43650 + }, + { + "epoch": 1.2119993482383369, + "grad_norm": 0.18355610966682434, + "learning_rate": 0.00013522132673869522, + "loss": 0.2283, + "step": 43700 + }, + { + "epoch": 1.2133860751814012, + "grad_norm": 0.1437745839357376, + "learning_rate": 0.00013508116996242893, + "loss": 0.2244, + "step": 43750 + }, + { + "epoch": 1.2147728021244657, + "grad_norm": 0.12281102687120438, + "learning_rate": 0.00013494093455754851, + "loss": 0.2266, + "step": 43800 + }, + { + "epoch": 1.2161595290675302, + "grad_norm": 0.15082257986068726, + "learning_rate": 0.00013480062083836842, + "loss": 0.2275, + "step": 43850 + }, + { + "epoch": 1.2175462560105945, + "grad_norm": 0.13360853493213654, + "learning_rate": 0.00013466022911937846, + "loss": 0.2293, + "step": 43900 + }, + { + "epoch": 1.218932982953659, + "grad_norm": 0.1245453953742981, + "learning_rate": 0.00013451975971524337, + "loss": 0.2252, + "step": 43950 + }, + { + "epoch": 1.2203197098967236, + "grad_norm": 0.12427138537168503, + "learning_rate": 0.00013437921294080202, + "loss": 0.2273, + "step": 44000 + }, + { + "epoch": 1.2203197098967236, + "eval_loss": 0.22416169941425323, + "eval_runtime": 501.199, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 44000 + }, + { + "epoch": 1.221706436839788, + "grad_norm": 0.13315744698047638, + "learning_rate": 0.00013423858911106664, + "loss": 0.2273, + "step": 44050 + }, + { + "epoch": 1.2230931637828524, + "grad_norm": 0.11731356382369995, + "learning_rate": 0.0001340978885412221, + "loss": 0.2284, + "step": 44100 + }, + { + "epoch": 1.224479890725917, + "grad_norm": 0.1332121342420578, + "learning_rate": 0.00013395711154662548, + "loss": 0.2311, + "step": 44150 + }, + { + "epoch": 1.2258666176689814, + "grad_norm": 0.11775799095630646, + "learning_rate": 0.00013381625844280495, + "loss": 0.2207, + "step": 44200 + }, + { + "epoch": 1.2272533446120457, + "grad_norm": 0.13608750700950623, + "learning_rate": 0.00013367532954545934, + "loss": 0.2259, + "step": 44250 + }, + { + "epoch": 1.2286400715551102, + "grad_norm": 0.11276783794164658, + "learning_rate": 0.00013353432517045739, + "loss": 0.2254, + "step": 44300 + }, + { + "epoch": 1.2300267984981748, + "grad_norm": 0.11962584406137466, + "learning_rate": 0.00013339324563383693, + "loss": 0.2231, + "step": 44350 + }, + { + "epoch": 1.2314135254412393, + "grad_norm": 0.14515165984630585, + "learning_rate": 0.0001332520912518044, + "loss": 0.2273, + "step": 44400 + }, + { + "epoch": 1.2328002523843036, + "grad_norm": 0.14967331290245056, + "learning_rate": 0.00013311086234073376, + "loss": 0.2292, + "step": 44450 + }, + { + "epoch": 1.234186979327368, + "grad_norm": 0.10794315487146378, + "learning_rate": 0.00013296955921716626, + "loss": 0.2213, + "step": 44500 + }, + { + "epoch": 1.2355737062704326, + "grad_norm": 0.1261892467737198, + "learning_rate": 0.0001328281821978093, + "loss": 0.2249, + "step": 44550 + }, + { + "epoch": 1.236960433213497, + "grad_norm": 0.16944009065628052, + "learning_rate": 0.00013268673159953608, + "loss": 0.2279, + "step": 44600 + }, + { + "epoch": 1.2383471601565614, + "grad_norm": 0.14991511404514313, + "learning_rate": 0.00013254520773938453, + "loss": 0.224, + "step": 44650 + }, + { + "epoch": 1.239733887099626, + "grad_norm": 0.16776132583618164, + "learning_rate": 0.00013240361093455686, + "loss": 0.2267, + "step": 44700 + }, + { + "epoch": 1.2411206140426905, + "grad_norm": 0.15971648693084717, + "learning_rate": 0.00013226194150241886, + "loss": 0.2269, + "step": 44750 + }, + { + "epoch": 1.2425073409857548, + "grad_norm": 0.16267691552639008, + "learning_rate": 0.00013212019976049897, + "loss": 0.2262, + "step": 44800 + }, + { + "epoch": 1.2438940679288193, + "grad_norm": 0.13528917729854584, + "learning_rate": 0.00013197838602648773, + "loss": 0.2282, + "step": 44850 + }, + { + "epoch": 1.2452807948718838, + "grad_norm": 0.13532580435276031, + "learning_rate": 0.0001318365006182371, + "loss": 0.2269, + "step": 44900 + }, + { + "epoch": 1.246667521814948, + "grad_norm": 0.15377886593341827, + "learning_rate": 0.00013169738368628263, + "loss": 0.2298, + "step": 44950 + }, + { + "epoch": 1.2480542487580126, + "grad_norm": 0.16382162272930145, + "learning_rate": 0.00013155535730139284, + "loss": 0.2301, + "step": 45000 + }, + { + "epoch": 1.2480542487580126, + "eval_loss": 0.22414694726467133, + "eval_runtime": 500.918, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 45000 + }, + { + "epoch": 1.2494409757010772, + "grad_norm": 0.13876722753047943, + "learning_rate": 0.00013141326019041228, + "loss": 0.2249, + "step": 45050 + }, + { + "epoch": 1.2508277026441417, + "grad_norm": 0.1360548585653305, + "learning_rate": 0.00013127393671013348, + "loss": 0.2255, + "step": 45100 + }, + { + "epoch": 1.2522144295872062, + "grad_norm": 0.1435881406068802, + "learning_rate": 0.00013113170050124578, + "loss": 0.2314, + "step": 45150 + }, + { + "epoch": 1.2536011565302705, + "grad_norm": 0.12622830271720886, + "learning_rate": 0.00013098939451582363, + "loss": 0.2248, + "step": 45200 + }, + { + "epoch": 1.254987883473335, + "grad_norm": 0.1429251879453659, + "learning_rate": 0.00013084701907282228, + "loss": 0.2312, + "step": 45250 + }, + { + "epoch": 1.2563746104163993, + "grad_norm": 0.12246144562959671, + "learning_rate": 0.00013070457449135262, + "loss": 0.2236, + "step": 45300 + }, + { + "epoch": 1.2577613373594638, + "grad_norm": 0.11872986704111099, + "learning_rate": 0.00013056206109068045, + "loss": 0.2263, + "step": 45350 + }, + { + "epoch": 1.2591480643025283, + "grad_norm": 0.12920017540454865, + "learning_rate": 0.00013041947919022594, + "loss": 0.2258, + "step": 45400 + }, + { + "epoch": 1.2605347912455929, + "grad_norm": 0.15954279899597168, + "learning_rate": 0.00013027682910956271, + "loss": 0.2272, + "step": 45450 + }, + { + "epoch": 1.2619215181886574, + "grad_norm": 0.16156534850597382, + "learning_rate": 0.00013013411116841723, + "loss": 0.2245, + "step": 45500 + }, + { + "epoch": 1.2633082451317217, + "grad_norm": 0.12423060089349747, + "learning_rate": 0.00012999132568666805, + "loss": 0.2271, + "step": 45550 + }, + { + "epoch": 1.2646949720747862, + "grad_norm": 0.1252107322216034, + "learning_rate": 0.0001298484729843451, + "loss": 0.2298, + "step": 45600 + }, + { + "epoch": 1.2660816990178507, + "grad_norm": 0.16947528719902039, + "learning_rate": 0.00012970555338162896, + "loss": 0.2273, + "step": 45650 + }, + { + "epoch": 1.267468425960915, + "grad_norm": 0.14459671080112457, + "learning_rate": 0.00012956256719885026, + "loss": 0.2282, + "step": 45700 + }, + { + "epoch": 1.2688551529039795, + "grad_norm": 0.1194702684879303, + "learning_rate": 0.00012941951475648866, + "loss": 0.2263, + "step": 45750 + }, + { + "epoch": 1.270241879847044, + "grad_norm": 0.12180822342634201, + "learning_rate": 0.00012927639637517249, + "loss": 0.227, + "step": 45800 + }, + { + "epoch": 1.2716286067901086, + "grad_norm": 0.14245355129241943, + "learning_rate": 0.00012913321237567783, + "loss": 0.2262, + "step": 45850 + }, + { + "epoch": 1.2730153337331729, + "grad_norm": 0.14033064246177673, + "learning_rate": 0.00012898996307892784, + "loss": 0.2249, + "step": 45900 + }, + { + "epoch": 1.2744020606762374, + "grad_norm": 0.11540055274963379, + "learning_rate": 0.00012884664880599198, + "loss": 0.2265, + "step": 45950 + }, + { + "epoch": 1.275788787619302, + "grad_norm": 0.10777000337839127, + "learning_rate": 0.00012870326987808538, + "loss": 0.2245, + "step": 46000 + }, + { + "epoch": 1.275788787619302, + "eval_loss": 0.2235965132713318, + "eval_runtime": 500.5657, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 46000 + }, + { + "epoch": 1.2771755145623662, + "grad_norm": 0.13470718264579773, + "learning_rate": 0.00012855982661656815, + "loss": 0.2226, + "step": 46050 + }, + { + "epoch": 1.2785622415054307, + "grad_norm": 0.12822124361991882, + "learning_rate": 0.0001284163193429445, + "loss": 0.2294, + "step": 46100 + }, + { + "epoch": 1.2799489684484953, + "grad_norm": 0.14290271699428558, + "learning_rate": 0.0001282727483788621, + "loss": 0.2231, + "step": 46150 + }, + { + "epoch": 1.2813356953915598, + "grad_norm": 0.13675449788570404, + "learning_rate": 0.00012812911404611144, + "loss": 0.2283, + "step": 46200 + }, + { + "epoch": 1.282722422334624, + "grad_norm": 0.16636592149734497, + "learning_rate": 0.00012798541666662506, + "loss": 0.223, + "step": 46250 + }, + { + "epoch": 1.2841091492776886, + "grad_norm": 0.12275688350200653, + "learning_rate": 0.00012784165656247665, + "loss": 0.2271, + "step": 46300 + }, + { + "epoch": 1.2854958762207531, + "grad_norm": 0.13686712086200714, + "learning_rate": 0.00012769783405588072, + "loss": 0.223, + "step": 46350 + }, + { + "epoch": 1.2868826031638174, + "grad_norm": 0.1588650494813919, + "learning_rate": 0.00012755394946919145, + "loss": 0.2296, + "step": 46400 + }, + { + "epoch": 1.288269330106882, + "grad_norm": 0.12250595539808273, + "learning_rate": 0.00012741000312490228, + "loss": 0.2257, + "step": 46450 + }, + { + "epoch": 1.2896560570499465, + "grad_norm": 0.12048181891441345, + "learning_rate": 0.00012726599534564496, + "loss": 0.2231, + "step": 46500 + }, + { + "epoch": 1.291042783993011, + "grad_norm": 0.12862320244312286, + "learning_rate": 0.00012712192645418909, + "loss": 0.2274, + "step": 46550 + }, + { + "epoch": 1.2924295109360753, + "grad_norm": 0.13872814178466797, + "learning_rate": 0.00012697779677344108, + "loss": 0.2253, + "step": 46600 + }, + { + "epoch": 1.2938162378791398, + "grad_norm": 0.15560327470302582, + "learning_rate": 0.0001268336066264437, + "loss": 0.2243, + "step": 46650 + }, + { + "epoch": 1.2952029648222043, + "grad_norm": 0.12047038972377777, + "learning_rate": 0.0001266893563363752, + "loss": 0.2222, + "step": 46700 + }, + { + "epoch": 1.2965896917652686, + "grad_norm": 0.11812040954828262, + "learning_rate": 0.00012654504622654867, + "loss": 0.2278, + "step": 46750 + }, + { + "epoch": 1.2979764187083331, + "grad_norm": 0.12126338481903076, + "learning_rate": 0.00012640067662041118, + "loss": 0.2221, + "step": 46800 + }, + { + "epoch": 1.2993631456513977, + "grad_norm": 0.12661194801330566, + "learning_rate": 0.0001262562478415433, + "loss": 0.2259, + "step": 46850 + }, + { + "epoch": 1.3007498725944622, + "grad_norm": 0.1360771805047989, + "learning_rate": 0.00012611176021365807, + "loss": 0.2266, + "step": 46900 + }, + { + "epoch": 1.3021365995375267, + "grad_norm": 0.11665530502796173, + "learning_rate": 0.0001259672140606005, + "loss": 0.2254, + "step": 46950 + }, + { + "epoch": 1.303523326480591, + "grad_norm": 0.15935631096363068, + "learning_rate": 0.00012582260970634684, + "loss": 0.2274, + "step": 47000 + }, + { + "epoch": 1.303523326480591, + "eval_loss": 0.223495215177536, + "eval_runtime": 500.5307, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 47000 + }, + { + "epoch": 1.3049100534236555, + "grad_norm": 0.11267057806253433, + "learning_rate": 0.0001256779474750037, + "loss": 0.2268, + "step": 47050 + }, + { + "epoch": 1.3062967803667198, + "grad_norm": 0.12521906197071075, + "learning_rate": 0.0001255332276908074, + "loss": 0.2288, + "step": 47100 + }, + { + "epoch": 1.3076835073097843, + "grad_norm": 0.17053671181201935, + "learning_rate": 0.00012538845067812333, + "loss": 0.2263, + "step": 47150 + }, + { + "epoch": 1.3090702342528489, + "grad_norm": 0.11105658113956451, + "learning_rate": 0.0001252436167614451, + "loss": 0.2288, + "step": 47200 + }, + { + "epoch": 1.3104569611959134, + "grad_norm": 0.11419650167226791, + "learning_rate": 0.00012509872626539388, + "loss": 0.2229, + "step": 47250 + }, + { + "epoch": 1.311843688138978, + "grad_norm": 0.12066779285669327, + "learning_rate": 0.00012495377951471766, + "loss": 0.2245, + "step": 47300 + }, + { + "epoch": 1.3132304150820422, + "grad_norm": 0.12847816944122314, + "learning_rate": 0.00012480877683429043, + "loss": 0.2239, + "step": 47350 + }, + { + "epoch": 1.3146171420251067, + "grad_norm": 0.1426202654838562, + "learning_rate": 0.00012466371854911169, + "loss": 0.22, + "step": 47400 + }, + { + "epoch": 1.3160038689681712, + "grad_norm": 0.16877411305904388, + "learning_rate": 0.00012451860498430547, + "loss": 0.2233, + "step": 47450 + }, + { + "epoch": 1.3173905959112355, + "grad_norm": 0.12421774864196777, + "learning_rate": 0.00012437343646511966, + "loss": 0.2253, + "step": 47500 + }, + { + "epoch": 1.3187773228543, + "grad_norm": 0.12950457632541656, + "learning_rate": 0.00012422821331692542, + "loss": 0.222, + "step": 47550 + }, + { + "epoch": 1.3201640497973646, + "grad_norm": 0.11517184227705002, + "learning_rate": 0.00012408293586521632, + "loss": 0.226, + "step": 47600 + }, + { + "epoch": 1.321550776740429, + "grad_norm": 0.17244720458984375, + "learning_rate": 0.0001239376044356076, + "loss": 0.2228, + "step": 47650 + }, + { + "epoch": 1.3229375036834934, + "grad_norm": 0.11641084402799606, + "learning_rate": 0.00012379221935383553, + "loss": 0.2279, + "step": 47700 + }, + { + "epoch": 1.324324230626558, + "grad_norm": 0.1356060951948166, + "learning_rate": 0.00012364678094575665, + "loss": 0.227, + "step": 47750 + }, + { + "epoch": 1.3257109575696224, + "grad_norm": 0.21041586995124817, + "learning_rate": 0.00012350128953734693, + "loss": 0.2263, + "step": 47800 + }, + { + "epoch": 1.3270976845126867, + "grad_norm": 0.14442645013332367, + "learning_rate": 0.00012335574545470124, + "loss": 0.2254, + "step": 47850 + }, + { + "epoch": 1.3284844114557512, + "grad_norm": 0.11860388517379761, + "learning_rate": 0.0001232101490240324, + "loss": 0.2296, + "step": 47900 + }, + { + "epoch": 1.3298711383988158, + "grad_norm": 0.1434333771467209, + "learning_rate": 0.0001230645005716707, + "loss": 0.2259, + "step": 47950 + }, + { + "epoch": 1.3312578653418803, + "grad_norm": 0.13076238334178925, + "learning_rate": 0.0001229188004240629, + "loss": 0.2191, + "step": 48000 + }, + { + "epoch": 1.3312578653418803, + "eval_loss": 0.22342181205749512, + "eval_runtime": 500.8608, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 48000 + }, + { + "epoch": 1.3326445922849446, + "grad_norm": 0.13471820950508118, + "learning_rate": 0.00012277304890777164, + "loss": 0.225, + "step": 48050 + }, + { + "epoch": 1.334031319228009, + "grad_norm": 0.11158140748739243, + "learning_rate": 0.00012262724634947477, + "loss": 0.2241, + "step": 48100 + }, + { + "epoch": 1.3354180461710736, + "grad_norm": 0.131271094083786, + "learning_rate": 0.00012248139307596451, + "loss": 0.2282, + "step": 48150 + }, + { + "epoch": 1.336804773114138, + "grad_norm": 0.12760807573795319, + "learning_rate": 0.00012233548941414677, + "loss": 0.2269, + "step": 48200 + }, + { + "epoch": 1.3381915000572024, + "grad_norm": 0.10696883499622345, + "learning_rate": 0.00012218953569104025, + "loss": 0.2313, + "step": 48250 + }, + { + "epoch": 1.339578227000267, + "grad_norm": 0.11438623070716858, + "learning_rate": 0.00012204353223377612, + "loss": 0.2261, + "step": 48300 + }, + { + "epoch": 1.3409649539433315, + "grad_norm": 0.1496392786502838, + "learning_rate": 0.00012189747936959677, + "loss": 0.2287, + "step": 48350 + }, + { + "epoch": 1.3423516808863958, + "grad_norm": 0.11277805268764496, + "learning_rate": 0.00012175137742585546, + "loss": 0.2226, + "step": 48400 + }, + { + "epoch": 1.3437384078294603, + "grad_norm": 0.14849400520324707, + "learning_rate": 0.00012160522673001542, + "loss": 0.2211, + "step": 48450 + }, + { + "epoch": 1.3451251347725248, + "grad_norm": 0.11590241640806198, + "learning_rate": 0.00012145902760964916, + "loss": 0.2277, + "step": 48500 + }, + { + "epoch": 1.3465118617155891, + "grad_norm": 0.12213417887687683, + "learning_rate": 0.00012131278039243772, + "loss": 0.2218, + "step": 48550 + }, + { + "epoch": 1.3478985886586536, + "grad_norm": 0.11973270773887634, + "learning_rate": 0.00012116648540616996, + "loss": 0.228, + "step": 48600 + }, + { + "epoch": 1.3492853156017182, + "grad_norm": 0.12690085172653198, + "learning_rate": 0.00012102014297874171, + "loss": 0.2243, + "step": 48650 + }, + { + "epoch": 1.3506720425447827, + "grad_norm": 0.12574374675750732, + "learning_rate": 0.00012087375343815526, + "loss": 0.2261, + "step": 48700 + }, + { + "epoch": 1.3520587694878472, + "grad_norm": 0.11568839848041534, + "learning_rate": 0.00012072731711251848, + "loss": 0.2254, + "step": 48750 + }, + { + "epoch": 1.3534454964309115, + "grad_norm": 0.1170002669095993, + "learning_rate": 0.00012058083433004403, + "loss": 0.2298, + "step": 48800 + }, + { + "epoch": 1.354832223373976, + "grad_norm": 0.1263497769832611, + "learning_rate": 0.0001204343054190487, + "loss": 0.2257, + "step": 48850 + }, + { + "epoch": 1.3562189503170403, + "grad_norm": 0.14301252365112305, + "learning_rate": 0.00012028773070795275, + "loss": 0.2253, + "step": 48900 + }, + { + "epoch": 1.3576056772601048, + "grad_norm": 0.1464497148990631, + "learning_rate": 0.000120141110525279, + "loss": 0.2256, + "step": 48950 + }, + { + "epoch": 1.3589924042031694, + "grad_norm": 0.1061500534415245, + "learning_rate": 0.00011999444519965228, + "loss": 0.2212, + "step": 49000 + }, + { + "epoch": 1.3589924042031694, + "eval_loss": 0.22327525913715363, + "eval_runtime": 501.03, + "eval_samples_per_second": 5.702, + "eval_steps_per_second": 5.702, + "step": 49000 + }, + { + "epoch": 1.3603791311462339, + "grad_norm": 0.10947709530591965, + "learning_rate": 0.00011984773505979852, + "loss": 0.2176, + "step": 49050 + }, + { + "epoch": 1.3617658580892984, + "grad_norm": 0.11933697760105133, + "learning_rate": 0.00011970098043454412, + "loss": 0.2254, + "step": 49100 + }, + { + "epoch": 1.3631525850323627, + "grad_norm": 0.11835236102342606, + "learning_rate": 0.0001195541816528152, + "loss": 0.2236, + "step": 49150 + }, + { + "epoch": 1.3645393119754272, + "grad_norm": 0.15372909605503082, + "learning_rate": 0.00011940733904363681, + "loss": 0.2225, + "step": 49200 + }, + { + "epoch": 1.3659260389184917, + "grad_norm": 0.137434184551239, + "learning_rate": 0.00011926045293613228, + "loss": 0.2249, + "step": 49250 + }, + { + "epoch": 1.367312765861556, + "grad_norm": 0.1446930319070816, + "learning_rate": 0.00011911352365952247, + "loss": 0.228, + "step": 49300 + }, + { + "epoch": 1.3686994928046206, + "grad_norm": 0.12815728783607483, + "learning_rate": 0.0001189665515431249, + "loss": 0.2256, + "step": 49350 + }, + { + "epoch": 1.370086219747685, + "grad_norm": 0.12660840153694153, + "learning_rate": 0.00011881953691635312, + "loss": 0.2235, + "step": 49400 + }, + { + "epoch": 1.3714729466907496, + "grad_norm": 0.15400618314743042, + "learning_rate": 0.00011867248010871604, + "loss": 0.2271, + "step": 49450 + }, + { + "epoch": 1.372859673633814, + "grad_norm": 0.1160617545247078, + "learning_rate": 0.00011852538144981701, + "loss": 0.222, + "step": 49500 + }, + { + "epoch": 1.3742464005768784, + "grad_norm": 0.10030169039964676, + "learning_rate": 0.0001183782412693533, + "loss": 0.227, + "step": 49550 + }, + { + "epoch": 1.375633127519943, + "grad_norm": 0.1285124272108078, + "learning_rate": 0.00011823105989711515, + "loss": 0.2227, + "step": 49600 + }, + { + "epoch": 1.3770198544630072, + "grad_norm": 0.16413910686969757, + "learning_rate": 0.00011808383766298512, + "loss": 0.2241, + "step": 49650 + }, + { + "epoch": 1.3784065814060718, + "grad_norm": 0.12963584065437317, + "learning_rate": 0.00011793657489693743, + "loss": 0.2268, + "step": 49700 + }, + { + "epoch": 1.3797933083491363, + "grad_norm": 0.11898767948150635, + "learning_rate": 0.00011778927192903709, + "loss": 0.2244, + "step": 49750 + }, + { + "epoch": 1.3811800352922008, + "grad_norm": 0.10802606493234634, + "learning_rate": 0.00011764192908943925, + "loss": 0.2227, + "step": 49800 + }, + { + "epoch": 1.382566762235265, + "grad_norm": 0.17253124713897705, + "learning_rate": 0.00011749749474137916, + "loss": 0.2278, + "step": 49850 + }, + { + "epoch": 1.3839534891783296, + "grad_norm": 0.11553626507520676, + "learning_rate": 0.00011735007393019295, + "loss": 0.2232, + "step": 49900 + }, + { + "epoch": 1.3853402161213941, + "grad_norm": 0.16757309436798096, + "learning_rate": 0.00011720261423169856, + "loss": 0.2288, + "step": 49950 + }, + { + "epoch": 1.3867269430644584, + "grad_norm": 0.17031162977218628, + "learning_rate": 0.0001170551159764024, + "loss": 0.2249, + "step": 50000 + }, + { + "epoch": 1.3867269430644584, + "eval_loss": 0.22317548096179962, + "eval_runtime": 500.2665, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 50000 + }, + { + "epoch": 1.388113670007523, + "grad_norm": 0.11588902771472931, + "learning_rate": 0.00011690757949489732, + "loss": 0.2284, + "step": 50050 + }, + { + "epoch": 1.3895003969505875, + "grad_norm": 0.12574197351932526, + "learning_rate": 0.00011676000511786185, + "loss": 0.2316, + "step": 50100 + }, + { + "epoch": 1.390887123893652, + "grad_norm": 0.11579444259405136, + "learning_rate": 0.0001166123931760594, + "loss": 0.2239, + "step": 50150 + }, + { + "epoch": 1.3922738508367163, + "grad_norm": 0.1079888865351677, + "learning_rate": 0.00011646474400033762, + "loss": 0.2256, + "step": 50200 + }, + { + "epoch": 1.3936605777797808, + "grad_norm": 0.1480822116136551, + "learning_rate": 0.00011631705792162764, + "loss": 0.2254, + "step": 50250 + }, + { + "epoch": 1.3950473047228453, + "grad_norm": 0.12621110677719116, + "learning_rate": 0.0001161693352709432, + "loss": 0.2262, + "step": 50300 + }, + { + "epoch": 1.3964340316659096, + "grad_norm": 0.1542740762233734, + "learning_rate": 0.00011602157637938016, + "loss": 0.222, + "step": 50350 + }, + { + "epoch": 1.3978207586089741, + "grad_norm": 0.11742950975894928, + "learning_rate": 0.00011587378157811545, + "loss": 0.2197, + "step": 50400 + }, + { + "epoch": 1.3992074855520387, + "grad_norm": 0.1457587331533432, + "learning_rate": 0.00011572595119840666, + "loss": 0.2236, + "step": 50450 + }, + { + "epoch": 1.4005942124951032, + "grad_norm": 0.12402471154928207, + "learning_rate": 0.00011557808557159093, + "loss": 0.2251, + "step": 50500 + }, + { + "epoch": 1.4019809394381677, + "grad_norm": 0.1243479996919632, + "learning_rate": 0.00011543018502908455, + "loss": 0.2192, + "step": 50550 + }, + { + "epoch": 1.403367666381232, + "grad_norm": 0.126913920044899, + "learning_rate": 0.00011528224990238199, + "loss": 0.2248, + "step": 50600 + }, + { + "epoch": 1.4047543933242965, + "grad_norm": 0.13005682826042175, + "learning_rate": 0.00011513428052305528, + "loss": 0.2287, + "step": 50650 + }, + { + "epoch": 1.4061411202673608, + "grad_norm": 0.12297997623682022, + "learning_rate": 0.00011498627722275319, + "loss": 0.2244, + "step": 50700 + }, + { + "epoch": 1.4075278472104253, + "grad_norm": 0.14594142138957977, + "learning_rate": 0.00011483824033320053, + "loss": 0.2262, + "step": 50750 + }, + { + "epoch": 1.4089145741534899, + "grad_norm": 0.14156201481819153, + "learning_rate": 0.00011469017018619734, + "loss": 0.2251, + "step": 50800 + }, + { + "epoch": 1.4103013010965544, + "grad_norm": 0.11722792685031891, + "learning_rate": 0.00011454206711361835, + "loss": 0.223, + "step": 50850 + }, + { + "epoch": 1.411688028039619, + "grad_norm": 0.10951201617717743, + "learning_rate": 0.0001143939314474119, + "loss": 0.2276, + "step": 50900 + }, + { + "epoch": 1.4130747549826832, + "grad_norm": 0.13954362273216248, + "learning_rate": 0.00011424576351959957, + "loss": 0.2233, + "step": 50950 + }, + { + "epoch": 1.4144614819257477, + "grad_norm": 0.12842465937137604, + "learning_rate": 0.00011409756366227509, + "loss": 0.2244, + "step": 51000 + }, + { + "epoch": 1.4144614819257477, + "eval_loss": 0.2229885309934616, + "eval_runtime": 500.7247, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 51000 + }, + { + "epoch": 1.4158482088688122, + "grad_norm": 0.11812816560268402, + "learning_rate": 0.0001139493322076038, + "loss": 0.2243, + "step": 51050 + }, + { + "epoch": 1.4172349358118765, + "grad_norm": 0.11909503489732742, + "learning_rate": 0.00011380106948782194, + "loss": 0.2241, + "step": 51100 + }, + { + "epoch": 1.418621662754941, + "grad_norm": 0.12617436051368713, + "learning_rate": 0.00011365277583523573, + "loss": 0.2247, + "step": 51150 + }, + { + "epoch": 1.4200083896980056, + "grad_norm": 0.1255272477865219, + "learning_rate": 0.00011350445158222074, + "loss": 0.2222, + "step": 51200 + }, + { + "epoch": 1.42139511664107, + "grad_norm": 0.1265612095594406, + "learning_rate": 0.00011335609706122117, + "loss": 0.2248, + "step": 51250 + }, + { + "epoch": 1.4227818435841344, + "grad_norm": 0.1541653722524643, + "learning_rate": 0.000113207712604749, + "loss": 0.2325, + "step": 51300 + }, + { + "epoch": 1.424168570527199, + "grad_norm": 0.14105264842510223, + "learning_rate": 0.00011305929854538338, + "loss": 0.2222, + "step": 51350 + }, + { + "epoch": 1.4255552974702634, + "grad_norm": 0.13859054446220398, + "learning_rate": 0.00011291085521576972, + "loss": 0.225, + "step": 51400 + }, + { + "epoch": 1.4269420244133277, + "grad_norm": 0.14842507243156433, + "learning_rate": 0.00011276238294861912, + "loss": 0.2233, + "step": 51450 + }, + { + "epoch": 1.4283287513563923, + "grad_norm": 0.1151481494307518, + "learning_rate": 0.00011261388207670747, + "loss": 0.2241, + "step": 51500 + }, + { + "epoch": 1.4297154782994568, + "grad_norm": 0.13431167602539062, + "learning_rate": 0.00011246535293287483, + "loss": 0.2259, + "step": 51550 + }, + { + "epoch": 1.4311022052425213, + "grad_norm": 0.10691921412944794, + "learning_rate": 0.0001123167958500246, + "loss": 0.2257, + "step": 51600 + }, + { + "epoch": 1.4324889321855856, + "grad_norm": 0.12425126880407333, + "learning_rate": 0.00011216821116112275, + "loss": 0.2272, + "step": 51650 + }, + { + "epoch": 1.4338756591286501, + "grad_norm": 0.1344369500875473, + "learning_rate": 0.00011201959919919722, + "loss": 0.2236, + "step": 51700 + }, + { + "epoch": 1.4352623860717146, + "grad_norm": 0.154687762260437, + "learning_rate": 0.00011187096029733704, + "loss": 0.2248, + "step": 51750 + }, + { + "epoch": 1.436649113014779, + "grad_norm": 0.14304719865322113, + "learning_rate": 0.00011172229478869158, + "loss": 0.2269, + "step": 51800 + }, + { + "epoch": 1.4380358399578435, + "grad_norm": 0.14757920801639557, + "learning_rate": 0.00011157360300646988, + "loss": 0.2224, + "step": 51850 + }, + { + "epoch": 1.439422566900908, + "grad_norm": 0.13079752027988434, + "learning_rate": 0.00011142488528393989, + "loss": 0.2249, + "step": 51900 + }, + { + "epoch": 1.4408092938439725, + "grad_norm": 0.1415751427412033, + "learning_rate": 0.00011127614195442766, + "loss": 0.2308, + "step": 51950 + }, + { + "epoch": 1.442196020787037, + "grad_norm": 0.14857710897922516, + "learning_rate": 0.00011112737335131667, + "loss": 0.2276, + "step": 52000 + }, + { + "epoch": 1.442196020787037, + "eval_loss": 0.22276601195335388, + "eval_runtime": 500.893, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 52000 + }, + { + "epoch": 1.4435827477301013, + "grad_norm": 0.12997597455978394, + "learning_rate": 0.000110978579808047, + "loss": 0.2244, + "step": 52050 + }, + { + "epoch": 1.4449694746731658, + "grad_norm": 0.13583651185035706, + "learning_rate": 0.00011082976165811469, + "loss": 0.2271, + "step": 52100 + }, + { + "epoch": 1.4463562016162301, + "grad_norm": 0.15908506512641907, + "learning_rate": 0.00011068091923507087, + "loss": 0.2276, + "step": 52150 + }, + { + "epoch": 1.4477429285592947, + "grad_norm": 0.10295715928077698, + "learning_rate": 0.00011053205287252113, + "loss": 0.2268, + "step": 52200 + }, + { + "epoch": 1.4491296555023592, + "grad_norm": 0.14841599762439728, + "learning_rate": 0.00011038316290412463, + "loss": 0.227, + "step": 52250 + }, + { + "epoch": 1.4505163824454237, + "grad_norm": 0.11808385699987411, + "learning_rate": 0.00011023722815431241, + "loss": 0.2212, + "step": 52300 + }, + { + "epoch": 1.4519031093884882, + "grad_norm": 0.14345276355743408, + "learning_rate": 0.00011008829243090724, + "loss": 0.2264, + "step": 52350 + }, + { + "epoch": 1.4532898363315525, + "grad_norm": 0.1435631662607193, + "learning_rate": 0.00010993933409627062, + "loss": 0.222, + "step": 52400 + }, + { + "epoch": 1.454676563274617, + "grad_norm": 0.10484310239553452, + "learning_rate": 0.00010979035348426798, + "loss": 0.2253, + "step": 52450 + }, + { + "epoch": 1.4560632902176813, + "grad_norm": 0.12916283309459686, + "learning_rate": 0.00010964135092881453, + "loss": 0.2261, + "step": 52500 + }, + { + "epoch": 1.4574500171607458, + "grad_norm": 0.15381525456905365, + "learning_rate": 0.00010949232676387484, + "loss": 0.2276, + "step": 52550 + }, + { + "epoch": 1.4588367441038104, + "grad_norm": 0.11151523888111115, + "learning_rate": 0.00010934328132346172, + "loss": 0.2262, + "step": 52600 + }, + { + "epoch": 1.4602234710468749, + "grad_norm": 0.11262549459934235, + "learning_rate": 0.00010919421494163582, + "loss": 0.222, + "step": 52650 + }, + { + "epoch": 1.4616101979899394, + "grad_norm": 0.12128688395023346, + "learning_rate": 0.00010904512795250468, + "loss": 0.223, + "step": 52700 + }, + { + "epoch": 1.4629969249330037, + "grad_norm": 0.12478487193584442, + "learning_rate": 0.00010889602069022198, + "loss": 0.2229, + "step": 52750 + }, + { + "epoch": 1.4643836518760682, + "grad_norm": 0.12064434587955475, + "learning_rate": 0.00010874689348898685, + "loss": 0.2216, + "step": 52800 + }, + { + "epoch": 1.4657703788191327, + "grad_norm": 0.1457664519548416, + "learning_rate": 0.00010859774668304321, + "loss": 0.225, + "step": 52850 + }, + { + "epoch": 1.467157105762197, + "grad_norm": 0.1602487713098526, + "learning_rate": 0.00010844858060667881, + "loss": 0.2242, + "step": 52900 + }, + { + "epoch": 1.4685438327052616, + "grad_norm": 0.12469302117824554, + "learning_rate": 0.00010829939559422464, + "loss": 0.2252, + "step": 52950 + }, + { + "epoch": 1.469930559648326, + "grad_norm": 0.1300072968006134, + "learning_rate": 0.00010815019198005407, + "loss": 0.2294, + "step": 53000 + }, + { + "epoch": 1.469930559648326, + "eval_loss": 0.2224954217672348, + "eval_runtime": 500.39, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 53000 + }, + { + "epoch": 1.4713172865913906, + "grad_norm": 0.11393048614263535, + "learning_rate": 0.00010800097009858226, + "loss": 0.2211, + "step": 53050 + }, + { + "epoch": 1.472704013534455, + "grad_norm": 0.11365407705307007, + "learning_rate": 0.00010785173028426525, + "loss": 0.2247, + "step": 53100 + }, + { + "epoch": 1.4740907404775194, + "grad_norm": 0.13408495485782623, + "learning_rate": 0.00010770247287159932, + "loss": 0.2243, + "step": 53150 + }, + { + "epoch": 1.475477467420584, + "grad_norm": 0.12727609276771545, + "learning_rate": 0.00010755319819512011, + "loss": 0.2314, + "step": 53200 + }, + { + "epoch": 1.4768641943636482, + "grad_norm": 0.11358857899904251, + "learning_rate": 0.00010740390658940205, + "loss": 0.224, + "step": 53250 + }, + { + "epoch": 1.4782509213067128, + "grad_norm": 0.13022546470165253, + "learning_rate": 0.00010725459838905748, + "loss": 0.2241, + "step": 53300 + }, + { + "epoch": 1.4796376482497773, + "grad_norm": 0.125900000333786, + "learning_rate": 0.00010710527392873587, + "loss": 0.2252, + "step": 53350 + }, + { + "epoch": 1.4810243751928418, + "grad_norm": 0.13671207427978516, + "learning_rate": 0.00010695593354312321, + "loss": 0.2249, + "step": 53400 + }, + { + "epoch": 1.482411102135906, + "grad_norm": 0.15761978924274445, + "learning_rate": 0.0001068065775669412, + "loss": 0.2255, + "step": 53450 + }, + { + "epoch": 1.4837978290789706, + "grad_norm": 0.1286703646183014, + "learning_rate": 0.00010665720633494641, + "loss": 0.2214, + "step": 53500 + }, + { + "epoch": 1.4851845560220351, + "grad_norm": 0.1332877278327942, + "learning_rate": 0.00010650782018192962, + "loss": 0.2226, + "step": 53550 + }, + { + "epoch": 1.4865712829650994, + "grad_norm": 0.13217736780643463, + "learning_rate": 0.00010635841944271511, + "loss": 0.2268, + "step": 53600 + }, + { + "epoch": 1.487958009908164, + "grad_norm": 0.12781362235546112, + "learning_rate": 0.0001062090044521598, + "loss": 0.2244, + "step": 53650 + }, + { + "epoch": 1.4893447368512285, + "grad_norm": 0.1170235425233841, + "learning_rate": 0.0001060595755451526, + "loss": 0.2278, + "step": 53700 + }, + { + "epoch": 1.490731463794293, + "grad_norm": 0.11187135428190231, + "learning_rate": 0.00010591013305661357, + "loss": 0.2257, + "step": 53750 + }, + { + "epoch": 1.4921181907373575, + "grad_norm": 0.11063262075185776, + "learning_rate": 0.00010576067732149315, + "loss": 0.2233, + "step": 53800 + }, + { + "epoch": 1.4935049176804218, + "grad_norm": 0.18011736869812012, + "learning_rate": 0.00010561120867477164, + "loss": 0.2236, + "step": 53850 + }, + { + "epoch": 1.4948916446234863, + "grad_norm": 0.11368495970964432, + "learning_rate": 0.00010546172745145812, + "loss": 0.2275, + "step": 53900 + }, + { + "epoch": 1.4962783715665506, + "grad_norm": 0.11830084025859833, + "learning_rate": 0.00010531223398658993, + "loss": 0.2243, + "step": 53950 + }, + { + "epoch": 1.4976650985096152, + "grad_norm": 0.129221111536026, + "learning_rate": 0.00010516272861523182, + "loss": 0.2249, + "step": 54000 + }, + { + "epoch": 1.4976650985096152, + "eval_loss": 0.222273588180542, + "eval_runtime": 500.3973, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 54000 + }, + { + "epoch": 1.4990518254526797, + "grad_norm": 0.1275608390569687, + "learning_rate": 0.00010501321167247526, + "loss": 0.2209, + "step": 54050 + }, + { + "epoch": 1.5004385523957442, + "grad_norm": 0.126673623919487, + "learning_rate": 0.0001048636834934376, + "loss": 0.2213, + "step": 54100 + }, + { + "epoch": 1.5018252793388087, + "grad_norm": 0.09999104589223862, + "learning_rate": 0.0001047141444132615, + "loss": 0.2253, + "step": 54150 + }, + { + "epoch": 1.503212006281873, + "grad_norm": 0.11277350038290024, + "learning_rate": 0.00010456459476711389, + "loss": 0.2246, + "step": 54200 + }, + { + "epoch": 1.5045987332249375, + "grad_norm": 0.1197759360074997, + "learning_rate": 0.00010441503489018545, + "loss": 0.2221, + "step": 54250 + }, + { + "epoch": 1.5059854601680018, + "grad_norm": 0.11321547627449036, + "learning_rate": 0.00010426546511768982, + "loss": 0.222, + "step": 54300 + }, + { + "epoch": 1.5073721871110664, + "grad_norm": 0.13402992486953735, + "learning_rate": 0.00010411588578486282, + "loss": 0.2201, + "step": 54350 + }, + { + "epoch": 1.5087589140541309, + "grad_norm": 0.11394736170768738, + "learning_rate": 0.00010396629722696163, + "loss": 0.2237, + "step": 54400 + }, + { + "epoch": 1.5101456409971954, + "grad_norm": 0.14211580157279968, + "learning_rate": 0.00010381669977926414, + "loss": 0.2238, + "step": 54450 + }, + { + "epoch": 1.51153236794026, + "grad_norm": 0.15154938399791718, + "learning_rate": 0.00010366709377706825, + "loss": 0.2225, + "step": 54500 + }, + { + "epoch": 1.5129190948833242, + "grad_norm": 0.14525644481182098, + "learning_rate": 0.00010351747955569088, + "loss": 0.2245, + "step": 54550 + }, + { + "epoch": 1.5143058218263887, + "grad_norm": 0.1171354353427887, + "learning_rate": 0.00010336785745046747, + "loss": 0.2209, + "step": 54600 + }, + { + "epoch": 1.515692548769453, + "grad_norm": 0.10528494417667389, + "learning_rate": 0.00010321822779675115, + "loss": 0.2247, + "step": 54650 + }, + { + "epoch": 1.5170792757125175, + "grad_norm": 0.13853302597999573, + "learning_rate": 0.00010306859092991188, + "loss": 0.2243, + "step": 54700 + }, + { + "epoch": 1.518466002655582, + "grad_norm": 0.15869873762130737, + "learning_rate": 0.00010291894718533585, + "loss": 0.2257, + "step": 54750 + }, + { + "epoch": 1.5198527295986466, + "grad_norm": 0.11144551634788513, + "learning_rate": 0.0001027692968984247, + "loss": 0.2249, + "step": 54800 + }, + { + "epoch": 1.521239456541711, + "grad_norm": 0.1125815361738205, + "learning_rate": 0.00010261964040459458, + "loss": 0.2243, + "step": 54850 + }, + { + "epoch": 1.5226261834847756, + "grad_norm": 0.11292921751737595, + "learning_rate": 0.00010246997803927576, + "loss": 0.2219, + "step": 54900 + }, + { + "epoch": 1.52401291042784, + "grad_norm": 0.1216253936290741, + "learning_rate": 0.00010232031013791152, + "loss": 0.223, + "step": 54950 + }, + { + "epoch": 1.5253996373709042, + "grad_norm": 0.12706224620342255, + "learning_rate": 0.00010217063703595761, + "loss": 0.2214, + "step": 55000 + }, + { + "epoch": 1.5253996373709042, + "eval_loss": 0.22219954431056976, + "eval_runtime": 500.7255, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 55000 + }, + { + "epoch": 1.5267863643139687, + "grad_norm": 0.12699660658836365, + "learning_rate": 0.0001020209590688814, + "loss": 0.2274, + "step": 55050 + }, + { + "epoch": 1.5281730912570333, + "grad_norm": 0.11792019754648209, + "learning_rate": 0.00010187127657216122, + "loss": 0.2263, + "step": 55100 + }, + { + "epoch": 1.5295598182000978, + "grad_norm": 0.11735875904560089, + "learning_rate": 0.00010172158988128548, + "loss": 0.224, + "step": 55150 + }, + { + "epoch": 1.5309465451431623, + "grad_norm": 0.1137237474322319, + "learning_rate": 0.00010157189933175203, + "loss": 0.2225, + "step": 55200 + }, + { + "epoch": 1.5323332720862268, + "grad_norm": 0.12405762076377869, + "learning_rate": 0.0001014222052590674, + "loss": 0.2183, + "step": 55250 + }, + { + "epoch": 1.5337199990292911, + "grad_norm": 0.1327371746301651, + "learning_rate": 0.00010127250799874596, + "loss": 0.2211, + "step": 55300 + }, + { + "epoch": 1.5351067259723556, + "grad_norm": 0.14912429451942444, + "learning_rate": 0.00010112280788630928, + "loss": 0.2236, + "step": 55350 + }, + { + "epoch": 1.53649345291542, + "grad_norm": 0.1276470422744751, + "learning_rate": 0.00010097310525728527, + "loss": 0.2264, + "step": 55400 + }, + { + "epoch": 1.5378801798584845, + "grad_norm": 0.157034233212471, + "learning_rate": 0.00010082340044720746, + "loss": 0.2252, + "step": 55450 + }, + { + "epoch": 1.539266906801549, + "grad_norm": 0.12907952070236206, + "learning_rate": 0.00010067369379161437, + "loss": 0.2252, + "step": 55500 + }, + { + "epoch": 1.5406536337446135, + "grad_norm": 0.12835729122161865, + "learning_rate": 0.00010052398562604856, + "loss": 0.2231, + "step": 55550 + }, + { + "epoch": 1.542040360687678, + "grad_norm": 0.10687188804149628, + "learning_rate": 0.00010037427628605604, + "loss": 0.2267, + "step": 55600 + }, + { + "epoch": 1.5434270876307423, + "grad_norm": 0.15926331281661987, + "learning_rate": 0.0001002245661071854, + "loss": 0.2252, + "step": 55650 + }, + { + "epoch": 1.5448138145738068, + "grad_norm": 0.1307857781648636, + "learning_rate": 0.00010007485542498716, + "loss": 0.222, + "step": 55700 + }, + { + "epoch": 1.5462005415168711, + "grad_norm": 0.13711467385292053, + "learning_rate": 9.992813879148622e-05, + "loss": 0.225, + "step": 55750 + }, + { + "epoch": 1.5475872684599357, + "grad_norm": 0.12018255889415741, + "learning_rate": 9.977842810264401e-05, + "loss": 0.2236, + "step": 55800 + }, + { + "epoch": 1.5489739954030002, + "grad_norm": 0.17128078639507294, + "learning_rate": 9.962871791041844e-05, + "loss": 0.2258, + "step": 55850 + }, + { + "epoch": 1.5503607223460647, + "grad_norm": 0.13206107914447784, + "learning_rate": 9.947900855035997e-05, + "loss": 0.2215, + "step": 55900 + }, + { + "epoch": 1.5517474492891292, + "grad_norm": 0.11228550225496292, + "learning_rate": 9.932930035801728e-05, + "loss": 0.2247, + "step": 55950 + }, + { + "epoch": 1.5531341762321935, + "grad_norm": 0.11278003454208374, + "learning_rate": 9.91795936689364e-05, + "loss": 0.2231, + "step": 56000 + }, + { + "epoch": 1.5531341762321935, + "eval_loss": 0.2218015044927597, + "eval_runtime": 500.47, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 56000 + }, + { + "epoch": 1.554520903175258, + "grad_norm": 0.12507981061935425, + "learning_rate": 9.903288289547497e-05, + "loss": 0.2251, + "step": 56050 + }, + { + "epoch": 1.5559076301183223, + "grad_norm": 0.109380342066288, + "learning_rate": 9.888318017276653e-05, + "loss": 0.2241, + "step": 56100 + }, + { + "epoch": 1.5572943570613869, + "grad_norm": 0.16204926371574402, + "learning_rate": 9.873347995322417e-05, + "loss": 0.2275, + "step": 56150 + }, + { + "epoch": 1.5586810840044514, + "grad_norm": 0.15255117416381836, + "learning_rate": 9.858378257237604e-05, + "loss": 0.2266, + "step": 56200 + }, + { + "epoch": 1.560067810947516, + "grad_norm": 0.10249169170856476, + "learning_rate": 9.843408836574402e-05, + "loss": 0.2281, + "step": 56250 + }, + { + "epoch": 1.5614545378905804, + "grad_norm": 0.11107359826564789, + "learning_rate": 9.828439766884277e-05, + "loss": 0.221, + "step": 56300 + }, + { + "epoch": 1.5628412648336447, + "grad_norm": 0.11863153427839279, + "learning_rate": 9.813471081717909e-05, + "loss": 0.2303, + "step": 56350 + }, + { + "epoch": 1.5642279917767092, + "grad_norm": 0.12891624867916107, + "learning_rate": 9.798502814625123e-05, + "loss": 0.2217, + "step": 56400 + }, + { + "epoch": 1.5656147187197735, + "grad_norm": 0.13360735774040222, + "learning_rate": 9.783534999154802e-05, + "loss": 0.2252, + "step": 56450 + }, + { + "epoch": 1.567001445662838, + "grad_norm": 0.13507115840911865, + "learning_rate": 9.768567668854817e-05, + "loss": 0.2259, + "step": 56500 + }, + { + "epoch": 1.5683881726059026, + "grad_norm": 0.11616547405719757, + "learning_rate": 9.753600857271952e-05, + "loss": 0.2235, + "step": 56550 + }, + { + "epoch": 1.569774899548967, + "grad_norm": 0.12262886762619019, + "learning_rate": 9.738634597951829e-05, + "loss": 0.2229, + "step": 56600 + }, + { + "epoch": 1.5711616264920316, + "grad_norm": 0.11454316973686218, + "learning_rate": 9.723668924438826e-05, + "loss": 0.2235, + "step": 56650 + }, + { + "epoch": 1.5725483534350961, + "grad_norm": 0.1089673787355423, + "learning_rate": 9.708703870276025e-05, + "loss": 0.2188, + "step": 56700 + }, + { + "epoch": 1.5739350803781604, + "grad_norm": 0.14894092082977295, + "learning_rate": 9.693739469005102e-05, + "loss": 0.2259, + "step": 56750 + }, + { + "epoch": 1.5753218073212247, + "grad_norm": 0.13055862486362457, + "learning_rate": 9.678775754166277e-05, + "loss": 0.2264, + "step": 56800 + }, + { + "epoch": 1.5767085342642893, + "grad_norm": 0.13678143918514252, + "learning_rate": 9.66381275929823e-05, + "loss": 0.2262, + "step": 56850 + }, + { + "epoch": 1.5780952612073538, + "grad_norm": 0.1054852306842804, + "learning_rate": 9.648850517938029e-05, + "loss": 0.224, + "step": 56900 + }, + { + "epoch": 1.5794819881504183, + "grad_norm": 0.13340288400650024, + "learning_rate": 9.633889063621053e-05, + "loss": 0.2204, + "step": 56950 + }, + { + "epoch": 1.5808687150934828, + "grad_norm": 0.1335633248090744, + "learning_rate": 9.618928429880915e-05, + "loss": 0.2255, + "step": 57000 + }, + { + "epoch": 1.5808687150934828, + "eval_loss": 0.22177007794380188, + "eval_runtime": 500.5574, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 57000 + }, + { + "epoch": 1.5822554420365473, + "grad_norm": 0.14497853815555573, + "learning_rate": 9.603968650249387e-05, + "loss": 0.2215, + "step": 57050 + }, + { + "epoch": 1.5836421689796116, + "grad_norm": 0.12169457972049713, + "learning_rate": 9.589009758256336e-05, + "loss": 0.2248, + "step": 57100 + }, + { + "epoch": 1.5850288959226762, + "grad_norm": 0.15703269839286804, + "learning_rate": 9.57405178742963e-05, + "loss": 0.2312, + "step": 57150 + }, + { + "epoch": 1.5864156228657404, + "grad_norm": 0.12093982100486755, + "learning_rate": 9.559094771295076e-05, + "loss": 0.2217, + "step": 57200 + }, + { + "epoch": 1.587802349808805, + "grad_norm": 0.11299122124910355, + "learning_rate": 9.544138743376341e-05, + "loss": 0.2225, + "step": 57250 + }, + { + "epoch": 1.5891890767518695, + "grad_norm": 0.12226125597953796, + "learning_rate": 9.529183737194875e-05, + "loss": 0.2235, + "step": 57300 + }, + { + "epoch": 1.590575803694934, + "grad_norm": 0.09305635094642639, + "learning_rate": 9.514229786269836e-05, + "loss": 0.2207, + "step": 57350 + }, + { + "epoch": 1.5919625306379985, + "grad_norm": 0.13961461186408997, + "learning_rate": 9.499276924118032e-05, + "loss": 0.2259, + "step": 57400 + }, + { + "epoch": 1.5933492575810628, + "grad_norm": 0.11887528002262115, + "learning_rate": 9.484325184253808e-05, + "loss": 0.2232, + "step": 57450 + }, + { + "epoch": 1.5947359845241273, + "grad_norm": 0.16336101293563843, + "learning_rate": 9.469374600189009e-05, + "loss": 0.2175, + "step": 57500 + }, + { + "epoch": 1.5961227114671916, + "grad_norm": 0.10530219227075577, + "learning_rate": 9.454425205432887e-05, + "loss": 0.222, + "step": 57550 + }, + { + "epoch": 1.5975094384102562, + "grad_norm": 0.10913381725549698, + "learning_rate": 9.439477033492027e-05, + "loss": 0.2242, + "step": 57600 + }, + { + "epoch": 1.5988961653533207, + "grad_norm": 0.12135528773069382, + "learning_rate": 9.424530117870271e-05, + "loss": 0.226, + "step": 57650 + }, + { + "epoch": 1.6002828922963852, + "grad_norm": 0.13996323943138123, + "learning_rate": 9.409584492068646e-05, + "loss": 0.224, + "step": 57700 + }, + { + "epoch": 1.6016696192394497, + "grad_norm": 0.12019480764865875, + "learning_rate": 9.394640189585291e-05, + "loss": 0.2251, + "step": 57750 + }, + { + "epoch": 1.603056346182514, + "grad_norm": 0.14267127215862274, + "learning_rate": 9.379697243915376e-05, + "loss": 0.2231, + "step": 57800 + }, + { + "epoch": 1.6044430731255785, + "grad_norm": 0.1371176540851593, + "learning_rate": 9.364755688551027e-05, + "loss": 0.2234, + "step": 57850 + }, + { + "epoch": 1.6058298000686428, + "grad_norm": 0.11706887930631638, + "learning_rate": 9.349815556981269e-05, + "loss": 0.2248, + "step": 57900 + }, + { + "epoch": 1.6072165270117074, + "grad_norm": 0.14251373708248138, + "learning_rate": 9.334876882691918e-05, + "loss": 0.2239, + "step": 57950 + }, + { + "epoch": 1.6086032539547719, + "grad_norm": 0.1372220814228058, + "learning_rate": 9.319939699165527e-05, + "loss": 0.2252, + "step": 58000 + }, + { + "epoch": 1.6086032539547719, + "eval_loss": 0.22149445116519928, + "eval_runtime": 500.9696, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 58000 + }, + { + "epoch": 1.6099899808978364, + "grad_norm": 0.10551794618368149, + "learning_rate": 9.305004039881319e-05, + "loss": 0.2268, + "step": 58050 + }, + { + "epoch": 1.611376707840901, + "grad_norm": 0.11267270147800446, + "learning_rate": 9.290069938315087e-05, + "loss": 0.2248, + "step": 58100 + }, + { + "epoch": 1.6127634347839652, + "grad_norm": 0.1272066831588745, + "learning_rate": 9.275137427939142e-05, + "loss": 0.2219, + "step": 58150 + }, + { + "epoch": 1.6141501617270297, + "grad_norm": 0.11539309471845627, + "learning_rate": 9.260206542222224e-05, + "loss": 0.223, + "step": 58200 + }, + { + "epoch": 1.615536888670094, + "grad_norm": 0.13046038150787354, + "learning_rate": 9.245277314629432e-05, + "loss": 0.2204, + "step": 58250 + }, + { + "epoch": 1.6169236156131586, + "grad_norm": 0.13138452172279358, + "learning_rate": 9.230349778622144e-05, + "loss": 0.2227, + "step": 58300 + }, + { + "epoch": 1.618310342556223, + "grad_norm": 0.12613414227962494, + "learning_rate": 9.215423967657963e-05, + "loss": 0.2218, + "step": 58350 + }, + { + "epoch": 1.6196970694992876, + "grad_norm": 0.11702366173267365, + "learning_rate": 9.200499915190609e-05, + "loss": 0.2256, + "step": 58400 + }, + { + "epoch": 1.6210837964423521, + "grad_norm": 0.16387322545051575, + "learning_rate": 9.185577654669866e-05, + "loss": 0.2261, + "step": 58450 + }, + { + "epoch": 1.6224705233854166, + "grad_norm": 0.1341577172279358, + "learning_rate": 9.1706572195415e-05, + "loss": 0.2225, + "step": 58500 + }, + { + "epoch": 1.623857250328481, + "grad_norm": 0.11689752340316772, + "learning_rate": 9.155738643247191e-05, + "loss": 0.2209, + "step": 58550 + }, + { + "epoch": 1.6252439772715452, + "grad_norm": 0.14046530425548553, + "learning_rate": 9.140821959224448e-05, + "loss": 0.224, + "step": 58600 + }, + { + "epoch": 1.6266307042146098, + "grad_norm": 0.13324092328548431, + "learning_rate": 9.125907200906539e-05, + "loss": 0.2229, + "step": 58650 + }, + { + "epoch": 1.6280174311576743, + "grad_norm": 0.10926762223243713, + "learning_rate": 9.110994401722413e-05, + "loss": 0.2231, + "step": 58700 + }, + { + "epoch": 1.6294041581007388, + "grad_norm": 0.144717276096344, + "learning_rate": 9.096083595096642e-05, + "loss": 0.2252, + "step": 58750 + }, + { + "epoch": 1.6307908850438033, + "grad_norm": 0.11615584790706635, + "learning_rate": 9.081174814449314e-05, + "loss": 0.2203, + "step": 58800 + }, + { + "epoch": 1.6321776119868678, + "grad_norm": 0.1326524317264557, + "learning_rate": 9.066268093195987e-05, + "loss": 0.2261, + "step": 58850 + }, + { + "epoch": 1.6335643389299321, + "grad_norm": 0.18373699486255646, + "learning_rate": 9.051363464747599e-05, + "loss": 0.2248, + "step": 58900 + }, + { + "epoch": 1.6349510658729967, + "grad_norm": 0.12923288345336914, + "learning_rate": 9.036460962510398e-05, + "loss": 0.2265, + "step": 58950 + }, + { + "epoch": 1.636337792816061, + "grad_norm": 0.12572887539863586, + "learning_rate": 9.021560619885865e-05, + "loss": 0.2273, + "step": 59000 + }, + { + "epoch": 1.636337792816061, + "eval_loss": 0.22130924463272095, + "eval_runtime": 501.6665, + "eval_samples_per_second": 5.695, + "eval_steps_per_second": 5.695, + "step": 59000 + }, + { + "epoch": 1.6377245197591255, + "grad_norm": 0.12783953547477722, + "learning_rate": 9.006662470270646e-05, + "loss": 0.2276, + "step": 59050 + }, + { + "epoch": 1.63911124670219, + "grad_norm": 0.1579081267118454, + "learning_rate": 8.991766547056464e-05, + "loss": 0.2258, + "step": 59100 + }, + { + "epoch": 1.6404979736452545, + "grad_norm": 0.13635234534740448, + "learning_rate": 8.976872883630062e-05, + "loss": 0.2187, + "step": 59150 + }, + { + "epoch": 1.641884700588319, + "grad_norm": 0.13230207562446594, + "learning_rate": 8.961981513373109e-05, + "loss": 0.22, + "step": 59200 + }, + { + "epoch": 1.6432714275313833, + "grad_norm": 0.13722644746303558, + "learning_rate": 8.947092469662137e-05, + "loss": 0.2231, + "step": 59250 + }, + { + "epoch": 1.6446581544744479, + "grad_norm": 0.10856415331363678, + "learning_rate": 8.932205785868466e-05, + "loss": 0.2182, + "step": 59300 + }, + { + "epoch": 1.6460448814175122, + "grad_norm": 0.11690490692853928, + "learning_rate": 8.91732149535812e-05, + "loss": 0.2179, + "step": 59350 + }, + { + "epoch": 1.6474316083605767, + "grad_norm": 0.12944932281970978, + "learning_rate": 8.902439631491768e-05, + "loss": 0.224, + "step": 59400 + }, + { + "epoch": 1.6488183353036412, + "grad_norm": 0.10274020582437515, + "learning_rate": 8.887560227624632e-05, + "loss": 0.2217, + "step": 59450 + }, + { + "epoch": 1.6502050622467057, + "grad_norm": 0.10526053607463837, + "learning_rate": 8.872683317106422e-05, + "loss": 0.2205, + "step": 59500 + }, + { + "epoch": 1.6515917891897702, + "grad_norm": 0.13197946548461914, + "learning_rate": 8.857808933281269e-05, + "loss": 0.2233, + "step": 59550 + }, + { + "epoch": 1.6529785161328345, + "grad_norm": 0.12400202453136444, + "learning_rate": 8.842937109487625e-05, + "loss": 0.2239, + "step": 59600 + }, + { + "epoch": 1.654365243075899, + "grad_norm": 0.13808733224868774, + "learning_rate": 8.828067879058219e-05, + "loss": 0.2239, + "step": 59650 + }, + { + "epoch": 1.6557519700189633, + "grad_norm": 0.1406649500131607, + "learning_rate": 8.813201275319957e-05, + "loss": 0.2234, + "step": 59700 + }, + { + "epoch": 1.6571386969620279, + "grad_norm": 0.13243602216243744, + "learning_rate": 8.798337331593862e-05, + "loss": 0.2255, + "step": 59750 + }, + { + "epoch": 1.6585254239050924, + "grad_norm": 0.12087827175855637, + "learning_rate": 8.783476081194993e-05, + "loss": 0.223, + "step": 59800 + }, + { + "epoch": 1.659912150848157, + "grad_norm": 0.12543760240077972, + "learning_rate": 8.768617557432374e-05, + "loss": 0.2232, + "step": 59850 + }, + { + "epoch": 1.6612988777912214, + "grad_norm": 0.11448033154010773, + "learning_rate": 8.753761793608915e-05, + "loss": 0.2214, + "step": 59900 + }, + { + "epoch": 1.662685604734286, + "grad_norm": 0.13189347088336945, + "learning_rate": 8.738908823021349e-05, + "loss": 0.2256, + "step": 59950 + }, + { + "epoch": 1.6640723316773502, + "grad_norm": 0.12486356496810913, + "learning_rate": 8.724058678960139e-05, + "loss": 0.2226, + "step": 60000 + }, + { + "epoch": 1.6640723316773502, + "eval_loss": 0.2211039811372757, + "eval_runtime": 501.6014, + "eval_samples_per_second": 5.696, + "eval_steps_per_second": 5.696, + "step": 60000 + }, + { + "epoch": 1.6654590586204145, + "grad_norm": 0.12641596794128418, + "learning_rate": 8.709211394709415e-05, + "loss": 0.2244, + "step": 60050 + }, + { + "epoch": 1.666845785563479, + "grad_norm": 0.14593897759914398, + "learning_rate": 8.694367003546897e-05, + "loss": 0.2257, + "step": 60100 + }, + { + "epoch": 1.6682325125065436, + "grad_norm": 0.11628296971321106, + "learning_rate": 8.679525538743825e-05, + "loss": 0.2253, + "step": 60150 + }, + { + "epoch": 1.669619239449608, + "grad_norm": 0.10328439623117447, + "learning_rate": 8.664687033564874e-05, + "loss": 0.2214, + "step": 60200 + }, + { + "epoch": 1.6710059663926726, + "grad_norm": 0.13650208711624146, + "learning_rate": 8.649851521268087e-05, + "loss": 0.2234, + "step": 60250 + }, + { + "epoch": 1.6723926933357371, + "grad_norm": 0.1255771368741989, + "learning_rate": 8.635019035104798e-05, + "loss": 0.2223, + "step": 60300 + }, + { + "epoch": 1.6737794202788014, + "grad_norm": 0.10108000785112381, + "learning_rate": 8.620189608319568e-05, + "loss": 0.2273, + "step": 60350 + }, + { + "epoch": 1.675166147221866, + "grad_norm": 0.11289294809103012, + "learning_rate": 8.605363274150089e-05, + "loss": 0.2252, + "step": 60400 + }, + { + "epoch": 1.6765528741649303, + "grad_norm": 0.18194100260734558, + "learning_rate": 8.590540065827126e-05, + "loss": 0.2287, + "step": 60450 + }, + { + "epoch": 1.6779396011079948, + "grad_norm": 0.1335909515619278, + "learning_rate": 8.575720016574438e-05, + "loss": 0.2223, + "step": 60500 + }, + { + "epoch": 1.6793263280510593, + "grad_norm": 0.17010542750358582, + "learning_rate": 8.561199465248794e-05, + "loss": 0.2241, + "step": 60550 + }, + { + "epoch": 1.6807130549941238, + "grad_norm": 0.14800529181957245, + "learning_rate": 8.546385768944199e-05, + "loss": 0.2253, + "step": 60600 + }, + { + "epoch": 1.6820997819371883, + "grad_norm": 0.138923779129982, + "learning_rate": 8.531575330674397e-05, + "loss": 0.2229, + "step": 60650 + }, + { + "epoch": 1.6834865088802526, + "grad_norm": 0.1428767740726471, + "learning_rate": 8.51676818363453e-05, + "loss": 0.2241, + "step": 60700 + }, + { + "epoch": 1.6848732358233172, + "grad_norm": 0.15303942561149597, + "learning_rate": 8.501964361012355e-05, + "loss": 0.2221, + "step": 60750 + }, + { + "epoch": 1.6862599627663815, + "grad_norm": 0.12015249580144882, + "learning_rate": 8.487163895988181e-05, + "loss": 0.2289, + "step": 60800 + }, + { + "epoch": 1.687646689709446, + "grad_norm": 0.1367703676223755, + "learning_rate": 8.472366821734789e-05, + "loss": 0.2264, + "step": 60850 + }, + { + "epoch": 1.6890334166525105, + "grad_norm": 0.14325445890426636, + "learning_rate": 8.457573171417366e-05, + "loss": 0.2247, + "step": 60900 + }, + { + "epoch": 1.690420143595575, + "grad_norm": 0.18521633744239807, + "learning_rate": 8.442782978193423e-05, + "loss": 0.2229, + "step": 60950 + }, + { + "epoch": 1.6918068705386395, + "grad_norm": 0.11883826553821564, + "learning_rate": 8.427996275212719e-05, + "loss": 0.2245, + "step": 61000 + }, + { + "epoch": 1.6918068705386395, + "eval_loss": 0.2209625393152237, + "eval_runtime": 501.1761, + "eval_samples_per_second": 5.701, + "eval_steps_per_second": 5.701, + "step": 61000 + }, + { + "epoch": 1.6931935974817038, + "grad_norm": 0.15255926549434662, + "learning_rate": 8.413213095617189e-05, + "loss": 0.2231, + "step": 61050 + }, + { + "epoch": 1.6945803244247684, + "grad_norm": 0.11666153371334076, + "learning_rate": 8.398433472540878e-05, + "loss": 0.2232, + "step": 61100 + }, + { + "epoch": 1.6959670513678327, + "grad_norm": 0.1510268598794937, + "learning_rate": 8.383657439109852e-05, + "loss": 0.2279, + "step": 61150 + }, + { + "epoch": 1.6973537783108972, + "grad_norm": 0.12121795862913132, + "learning_rate": 8.368885028442138e-05, + "loss": 0.218, + "step": 61200 + }, + { + "epoch": 1.6987405052539617, + "grad_norm": 0.1254698634147644, + "learning_rate": 8.354116273647637e-05, + "loss": 0.2244, + "step": 61250 + }, + { + "epoch": 1.7001272321970262, + "grad_norm": 0.1407916098833084, + "learning_rate": 8.339351207828064e-05, + "loss": 0.2235, + "step": 61300 + }, + { + "epoch": 1.7015139591400907, + "grad_norm": 0.16278910636901855, + "learning_rate": 8.324589864076858e-05, + "loss": 0.2218, + "step": 61350 + }, + { + "epoch": 1.702900686083155, + "grad_norm": 0.12832941114902496, + "learning_rate": 8.30983227547912e-05, + "loss": 0.2237, + "step": 61400 + }, + { + "epoch": 1.7042874130262196, + "grad_norm": 0.10571594536304474, + "learning_rate": 8.295078475111532e-05, + "loss": 0.2241, + "step": 61450 + }, + { + "epoch": 1.7056741399692839, + "grad_norm": 0.12712325155735016, + "learning_rate": 8.280328496042287e-05, + "loss": 0.2253, + "step": 61500 + }, + { + "epoch": 1.7070608669123484, + "grad_norm": 0.12702125310897827, + "learning_rate": 8.265582371331011e-05, + "loss": 0.2225, + "step": 61550 + }, + { + "epoch": 1.708447593855413, + "grad_norm": 0.10937677323818207, + "learning_rate": 8.250840134028694e-05, + "loss": 0.226, + "step": 61600 + }, + { + "epoch": 1.7098343207984774, + "grad_norm": 0.15084721148014069, + "learning_rate": 8.236101817177609e-05, + "loss": 0.224, + "step": 61650 + }, + { + "epoch": 1.711221047741542, + "grad_norm": 0.10948482900857925, + "learning_rate": 8.221367453811247e-05, + "loss": 0.2222, + "step": 61700 + }, + { + "epoch": 1.7126077746846065, + "grad_norm": 0.12602409720420837, + "learning_rate": 8.206637076954236e-05, + "loss": 0.2221, + "step": 61750 + }, + { + "epoch": 1.7139945016276708, + "grad_norm": 0.11353398859500885, + "learning_rate": 8.191910719622267e-05, + "loss": 0.219, + "step": 61800 + }, + { + "epoch": 1.715381228570735, + "grad_norm": 0.17882102727890015, + "learning_rate": 8.177188414822025e-05, + "loss": 0.2259, + "step": 61850 + }, + { + "epoch": 1.7167679555137996, + "grad_norm": 0.14457036554813385, + "learning_rate": 8.162470195551111e-05, + "loss": 0.2232, + "step": 61900 + }, + { + "epoch": 1.718154682456864, + "grad_norm": 0.13123807311058044, + "learning_rate": 8.147756094797964e-05, + "loss": 0.2189, + "step": 61950 + }, + { + "epoch": 1.7195414093999286, + "grad_norm": 0.13137926161289215, + "learning_rate": 8.133046145541801e-05, + "loss": 0.223, + "step": 62000 + }, + { + "epoch": 1.7195414093999286, + "eval_loss": 0.22088629007339478, + "eval_runtime": 501.7917, + "eval_samples_per_second": 5.694, + "eval_steps_per_second": 5.694, + "step": 62000 + }, + { + "epoch": 1.7209281363429931, + "grad_norm": 0.13826268911361694, + "learning_rate": 8.118340380752526e-05, + "loss": 0.2267, + "step": 62050 + }, + { + "epoch": 1.7223148632860577, + "grad_norm": 0.11485900729894638, + "learning_rate": 8.103638833390666e-05, + "loss": 0.2245, + "step": 62100 + }, + { + "epoch": 1.723701590229122, + "grad_norm": 0.14433616399765015, + "learning_rate": 8.088941536407302e-05, + "loss": 0.2254, + "step": 62150 + }, + { + "epoch": 1.7250883171721865, + "grad_norm": 0.15891560912132263, + "learning_rate": 8.07424852274398e-05, + "loss": 0.2259, + "step": 62200 + }, + { + "epoch": 1.7264750441152508, + "grad_norm": 0.15121401846408844, + "learning_rate": 8.059559825332653e-05, + "loss": 0.2238, + "step": 62250 + }, + { + "epoch": 1.7278617710583153, + "grad_norm": 0.1441587507724762, + "learning_rate": 8.044875477095589e-05, + "loss": 0.2222, + "step": 62300 + }, + { + "epoch": 1.7292484980013798, + "grad_norm": 0.1155286505818367, + "learning_rate": 8.03019551094532e-05, + "loss": 0.2264, + "step": 62350 + }, + { + "epoch": 1.7306352249444443, + "grad_norm": 0.11506728082895279, + "learning_rate": 8.01551995978455e-05, + "loss": 0.2168, + "step": 62400 + }, + { + "epoch": 1.7320219518875088, + "grad_norm": 0.11319868266582489, + "learning_rate": 8.00084885650609e-05, + "loss": 0.2199, + "step": 62450 + }, + { + "epoch": 1.7334086788305731, + "grad_norm": 0.11964423209428787, + "learning_rate": 7.986182233992773e-05, + "loss": 0.2293, + "step": 62500 + }, + { + "epoch": 1.7347954057736377, + "grad_norm": 0.14446033537387848, + "learning_rate": 7.971520125117408e-05, + "loss": 0.2209, + "step": 62550 + }, + { + "epoch": 1.736182132716702, + "grad_norm": 0.1368509829044342, + "learning_rate": 7.95686256274267e-05, + "loss": 0.2276, + "step": 62600 + }, + { + "epoch": 1.7375688596597665, + "grad_norm": 0.11722143739461899, + "learning_rate": 7.942209579721052e-05, + "loss": 0.222, + "step": 62650 + }, + { + "epoch": 1.738955586602831, + "grad_norm": 0.11798449605703354, + "learning_rate": 7.927561208894781e-05, + "loss": 0.2211, + "step": 62700 + }, + { + "epoch": 1.7403423135458955, + "grad_norm": 0.13891583681106567, + "learning_rate": 7.912917483095743e-05, + "loss": 0.2218, + "step": 62750 + }, + { + "epoch": 1.74172904048896, + "grad_norm": 0.11691787093877792, + "learning_rate": 7.898278435145419e-05, + "loss": 0.2216, + "step": 62800 + }, + { + "epoch": 1.7431157674320243, + "grad_norm": 0.1415584832429886, + "learning_rate": 7.883644097854802e-05, + "loss": 0.2185, + "step": 62850 + }, + { + "epoch": 1.7445024943750889, + "grad_norm": 0.14408282935619354, + "learning_rate": 7.869014504024328e-05, + "loss": 0.2261, + "step": 62900 + }, + { + "epoch": 1.7458892213181532, + "grad_norm": 0.11771035194396973, + "learning_rate": 7.8543896864438e-05, + "loss": 0.2207, + "step": 62950 + }, + { + "epoch": 1.7472759482612177, + "grad_norm": 0.12419497966766357, + "learning_rate": 7.839769677892322e-05, + "loss": 0.2201, + "step": 63000 + }, + { + "epoch": 1.7472759482612177, + "eval_loss": 0.22052037715911865, + "eval_runtime": 501.3675, + "eval_samples_per_second": 5.698, + "eval_steps_per_second": 5.698, + "step": 63000 + }, + { + "epoch": 1.7486626752042822, + "grad_norm": 0.13703902065753937, + "learning_rate": 7.825154511138208e-05, + "loss": 0.2231, + "step": 63050 + }, + { + "epoch": 1.7500494021473467, + "grad_norm": 0.131145641207695, + "learning_rate": 7.810544218938931e-05, + "loss": 0.2242, + "step": 63100 + }, + { + "epoch": 1.7514361290904112, + "grad_norm": 0.12763576209545135, + "learning_rate": 7.79593883404103e-05, + "loss": 0.2217, + "step": 63150 + }, + { + "epoch": 1.7528228560334755, + "grad_norm": 0.13587799668312073, + "learning_rate": 7.781338389180049e-05, + "loss": 0.2236, + "step": 63200 + }, + { + "epoch": 1.75420958297654, + "grad_norm": 0.1352778673171997, + "learning_rate": 7.766742917080461e-05, + "loss": 0.2255, + "step": 63250 + }, + { + "epoch": 1.7555963099196044, + "grad_norm": 0.13144232332706451, + "learning_rate": 7.752152450455587e-05, + "loss": 0.2224, + "step": 63300 + }, + { + "epoch": 1.7569830368626689, + "grad_norm": 0.13210804760456085, + "learning_rate": 7.737567022007541e-05, + "loss": 0.2226, + "step": 63350 + }, + { + "epoch": 1.7583697638057334, + "grad_norm": 0.12364811450242996, + "learning_rate": 7.722986664427134e-05, + "loss": 0.2266, + "step": 63400 + }, + { + "epoch": 1.759756490748798, + "grad_norm": 0.13110807538032532, + "learning_rate": 7.708411410393817e-05, + "loss": 0.2207, + "step": 63450 + }, + { + "epoch": 1.7611432176918624, + "grad_norm": 0.11447171866893768, + "learning_rate": 7.693841292575598e-05, + "loss": 0.2276, + "step": 63500 + }, + { + "epoch": 1.762529944634927, + "grad_norm": 0.13065434992313385, + "learning_rate": 7.679276343628978e-05, + "loss": 0.2215, + "step": 63550 + }, + { + "epoch": 1.7639166715779913, + "grad_norm": 0.13254573941230774, + "learning_rate": 7.664716596198869e-05, + "loss": 0.2217, + "step": 63600 + }, + { + "epoch": 1.7653033985210556, + "grad_norm": 0.12096452713012695, + "learning_rate": 7.650162082918525e-05, + "loss": 0.2226, + "step": 63650 + }, + { + "epoch": 1.76669012546412, + "grad_norm": 0.11871534585952759, + "learning_rate": 7.635612836409466e-05, + "loss": 0.2219, + "step": 63700 + }, + { + "epoch": 1.7680768524071846, + "grad_norm": 0.12863662838935852, + "learning_rate": 7.621068889281419e-05, + "loss": 0.2189, + "step": 63750 + }, + { + "epoch": 1.7694635793502491, + "grad_norm": 0.12479788810014725, + "learning_rate": 7.60653027413222e-05, + "loss": 0.224, + "step": 63800 + }, + { + "epoch": 1.7708503062933136, + "grad_norm": 0.11679526418447495, + "learning_rate": 7.591997023547763e-05, + "loss": 0.2266, + "step": 63850 + }, + { + "epoch": 1.7722370332363782, + "grad_norm": 0.13567224144935608, + "learning_rate": 7.577469170101908e-05, + "loss": 0.2249, + "step": 63900 + }, + { + "epoch": 1.7736237601794425, + "grad_norm": 0.11825359612703323, + "learning_rate": 7.562946746356432e-05, + "loss": 0.2164, + "step": 63950 + }, + { + "epoch": 1.775010487122507, + "grad_norm": 0.1773427426815033, + "learning_rate": 7.548429784860931e-05, + "loss": 0.2246, + "step": 64000 + }, + { + "epoch": 1.775010487122507, + "eval_loss": 0.22055774927139282, + "eval_runtime": 500.8502, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 64000 + }, + { + "epoch": 1.7763972140655713, + "grad_norm": 0.14357559382915497, + "learning_rate": 7.533918318152764e-05, + "loss": 0.2252, + "step": 64050 + }, + { + "epoch": 1.7777839410086358, + "grad_norm": 0.13654856383800507, + "learning_rate": 7.519412378756967e-05, + "loss": 0.2244, + "step": 64100 + }, + { + "epoch": 1.7791706679517003, + "grad_norm": 0.13761308789253235, + "learning_rate": 7.504911999186203e-05, + "loss": 0.2216, + "step": 64150 + }, + { + "epoch": 1.7805573948947648, + "grad_norm": 0.1746763437986374, + "learning_rate": 7.490707052670636e-05, + "loss": 0.2252, + "step": 64200 + }, + { + "epoch": 1.7819441218378294, + "grad_norm": 0.12090858817100525, + "learning_rate": 7.476217777423408e-05, + "loss": 0.2202, + "step": 64250 + }, + { + "epoch": 1.7833308487808937, + "grad_norm": 0.1337929368019104, + "learning_rate": 7.461734158814738e-05, + "loss": 0.2266, + "step": 64300 + }, + { + "epoch": 1.7847175757239582, + "grad_norm": 0.11186370998620987, + "learning_rate": 7.447256229307243e-05, + "loss": 0.2205, + "step": 64350 + }, + { + "epoch": 1.7861043026670225, + "grad_norm": 0.11708831042051315, + "learning_rate": 7.432784021350796e-05, + "loss": 0.2221, + "step": 64400 + }, + { + "epoch": 1.787491029610087, + "grad_norm": 0.13799749314785004, + "learning_rate": 7.418317567382446e-05, + "loss": 0.2222, + "step": 64450 + }, + { + "epoch": 1.7888777565531515, + "grad_norm": 0.12827137112617493, + "learning_rate": 7.403856899826352e-05, + "loss": 0.2204, + "step": 64500 + }, + { + "epoch": 1.790264483496216, + "grad_norm": 0.1483440101146698, + "learning_rate": 7.389402051093692e-05, + "loss": 0.2245, + "step": 64550 + }, + { + "epoch": 1.7916512104392806, + "grad_norm": 0.1483563780784607, + "learning_rate": 7.37495305358261e-05, + "loss": 0.2199, + "step": 64600 + }, + { + "epoch": 1.7930379373823448, + "grad_norm": 0.12296409904956818, + "learning_rate": 7.360509939678129e-05, + "loss": 0.2211, + "step": 64650 + }, + { + "epoch": 1.7944246643254094, + "grad_norm": 0.10277026891708374, + "learning_rate": 7.346072741752098e-05, + "loss": 0.22, + "step": 64700 + }, + { + "epoch": 1.7958113912684737, + "grad_norm": 0.11957128345966339, + "learning_rate": 7.331641492163092e-05, + "loss": 0.2218, + "step": 64750 + }, + { + "epoch": 1.7971981182115382, + "grad_norm": 0.1205180436372757, + "learning_rate": 7.317216223256362e-05, + "loss": 0.2238, + "step": 64800 + }, + { + "epoch": 1.7985848451546027, + "grad_norm": 0.12323558330535889, + "learning_rate": 7.302796967363748e-05, + "loss": 0.2238, + "step": 64850 + }, + { + "epoch": 1.7999715720976672, + "grad_norm": 0.1744375079870224, + "learning_rate": 7.288383756803618e-05, + "loss": 0.2236, + "step": 64900 + }, + { + "epoch": 1.8013582990407317, + "grad_norm": 0.11908925324678421, + "learning_rate": 7.27397662388079e-05, + "loss": 0.2199, + "step": 64950 + }, + { + "epoch": 1.802745025983796, + "grad_norm": 0.16596545279026031, + "learning_rate": 7.259575600886457e-05, + "loss": 0.2201, + "step": 65000 + }, + { + "epoch": 1.802745025983796, + "eval_loss": 0.22035908699035645, + "eval_runtime": 501.5752, + "eval_samples_per_second": 5.696, + "eval_steps_per_second": 5.696, + "step": 65000 + }, + { + "epoch": 1.8041317529268606, + "grad_norm": 0.10997340828180313, + "learning_rate": 7.245180720098122e-05, + "loss": 0.2226, + "step": 65050 + }, + { + "epoch": 1.8055184798699249, + "grad_norm": 0.13975024223327637, + "learning_rate": 7.230792013779512e-05, + "loss": 0.2227, + "step": 65100 + }, + { + "epoch": 1.8069052068129894, + "grad_norm": 0.13971950113773346, + "learning_rate": 7.216409514180532e-05, + "loss": 0.2287, + "step": 65150 + }, + { + "epoch": 1.808291933756054, + "grad_norm": 0.1235787570476532, + "learning_rate": 7.20203325353716e-05, + "loss": 0.2237, + "step": 65200 + }, + { + "epoch": 1.8096786606991184, + "grad_norm": 0.15590088069438934, + "learning_rate": 7.187663264071396e-05, + "loss": 0.222, + "step": 65250 + }, + { + "epoch": 1.811065387642183, + "grad_norm": 0.12113209813833237, + "learning_rate": 7.173299577991184e-05, + "loss": 0.2226, + "step": 65300 + }, + { + "epoch": 1.8124521145852475, + "grad_norm": 0.10521169751882553, + "learning_rate": 7.158942227490341e-05, + "loss": 0.2229, + "step": 65350 + }, + { + "epoch": 1.8138388415283118, + "grad_norm": 0.1138349398970604, + "learning_rate": 7.14459124474848e-05, + "loss": 0.2185, + "step": 65400 + }, + { + "epoch": 1.815225568471376, + "grad_norm": 0.11943230032920837, + "learning_rate": 7.130246661930945e-05, + "loss": 0.2202, + "step": 65450 + }, + { + "epoch": 1.8166122954144406, + "grad_norm": 0.12407558411359787, + "learning_rate": 7.115908511188736e-05, + "loss": 0.2209, + "step": 65500 + }, + { + "epoch": 1.817999022357505, + "grad_norm": 0.10693726688623428, + "learning_rate": 7.101576824658439e-05, + "loss": 0.2211, + "step": 65550 + }, + { + "epoch": 1.8193857493005696, + "grad_norm": 0.12432532757520676, + "learning_rate": 7.087251634462143e-05, + "loss": 0.2281, + "step": 65600 + }, + { + "epoch": 1.8207724762436341, + "grad_norm": 0.16990961134433746, + "learning_rate": 7.072932972707387e-05, + "loss": 0.2286, + "step": 65650 + }, + { + "epoch": 1.8221592031866987, + "grad_norm": 0.1222047433257103, + "learning_rate": 7.05862087148707e-05, + "loss": 0.2183, + "step": 65700 + }, + { + "epoch": 1.823545930129763, + "grad_norm": 0.13230758905410767, + "learning_rate": 7.044315362879388e-05, + "loss": 0.2244, + "step": 65750 + }, + { + "epoch": 1.8249326570728275, + "grad_norm": 0.11451806128025055, + "learning_rate": 7.030016478947762e-05, + "loss": 0.2208, + "step": 65800 + }, + { + "epoch": 1.8263193840158918, + "grad_norm": 0.1575726568698883, + "learning_rate": 7.015724251740766e-05, + "loss": 0.2233, + "step": 65850 + }, + { + "epoch": 1.8277061109589563, + "grad_norm": 0.11409944295883179, + "learning_rate": 7.001438713292047e-05, + "loss": 0.2232, + "step": 65900 + }, + { + "epoch": 1.8290928379020208, + "grad_norm": 0.1506188064813614, + "learning_rate": 6.987159895620277e-05, + "loss": 0.2272, + "step": 65950 + }, + { + "epoch": 1.8304795648450853, + "grad_norm": 0.12321347743272781, + "learning_rate": 6.972887830729048e-05, + "loss": 0.2225, + "step": 66000 + }, + { + "epoch": 1.8304795648450853, + "eval_loss": 0.22010773420333862, + "eval_runtime": 500.7778, + "eval_samples_per_second": 5.705, + "eval_steps_per_second": 5.705, + "step": 66000 + }, + { + "epoch": 1.8318662917881499, + "grad_norm": 0.10877233743667603, + "learning_rate": 6.958622550606821e-05, + "loss": 0.2208, + "step": 66050 + }, + { + "epoch": 1.8332530187312142, + "grad_norm": 0.12374629080295563, + "learning_rate": 6.944364087226851e-05, + "loss": 0.2198, + "step": 66100 + }, + { + "epoch": 1.8346397456742787, + "grad_norm": 0.12141034007072449, + "learning_rate": 6.930112472547118e-05, + "loss": 0.2202, + "step": 66150 + }, + { + "epoch": 1.836026472617343, + "grad_norm": 0.11637747287750244, + "learning_rate": 6.915867738510247e-05, + "loss": 0.2196, + "step": 66200 + }, + { + "epoch": 1.8374131995604075, + "grad_norm": 0.11352310329675674, + "learning_rate": 6.901629917043442e-05, + "loss": 0.2169, + "step": 66250 + }, + { + "epoch": 1.838799926503472, + "grad_norm": 0.13060016930103302, + "learning_rate": 6.887399040058408e-05, + "loss": 0.2259, + "step": 66300 + }, + { + "epoch": 1.8401866534465365, + "grad_norm": 0.13059192895889282, + "learning_rate": 6.873175139451306e-05, + "loss": 0.2235, + "step": 66350 + }, + { + "epoch": 1.841573380389601, + "grad_norm": 0.1436002552509308, + "learning_rate": 6.858958247102638e-05, + "loss": 0.2223, + "step": 66400 + }, + { + "epoch": 1.8429601073326654, + "grad_norm": 0.11226887255907059, + "learning_rate": 6.844748394877205e-05, + "loss": 0.2216, + "step": 66450 + }, + { + "epoch": 1.8443468342757299, + "grad_norm": 0.1210029125213623, + "learning_rate": 6.83054561462403e-05, + "loss": 0.2211, + "step": 66500 + }, + { + "epoch": 1.8457335612187942, + "grad_norm": 0.12150629609823227, + "learning_rate": 6.81634993817629e-05, + "loss": 0.2206, + "step": 66550 + }, + { + "epoch": 1.8471202881618587, + "grad_norm": 0.11927852034568787, + "learning_rate": 6.80216139735123e-05, + "loss": 0.2204, + "step": 66600 + }, + { + "epoch": 1.8485070151049232, + "grad_norm": 0.1343168169260025, + "learning_rate": 6.787980023950108e-05, + "loss": 0.2217, + "step": 66650 + }, + { + "epoch": 1.8498937420479877, + "grad_norm": 0.11947305500507355, + "learning_rate": 6.773805849758116e-05, + "loss": 0.2175, + "step": 66700 + }, + { + "epoch": 1.8512804689910523, + "grad_norm": 0.11362255364656448, + "learning_rate": 6.759638906544313e-05, + "loss": 0.2233, + "step": 66750 + }, + { + "epoch": 1.8526671959341166, + "grad_norm": 0.13201679289340973, + "learning_rate": 6.745479226061548e-05, + "loss": 0.22, + "step": 66800 + }, + { + "epoch": 1.854053922877181, + "grad_norm": 0.12694838643074036, + "learning_rate": 6.731326840046395e-05, + "loss": 0.2245, + "step": 66850 + }, + { + "epoch": 1.8554406498202454, + "grad_norm": 0.12625135481357574, + "learning_rate": 6.71718178021907e-05, + "loss": 0.226, + "step": 66900 + }, + { + "epoch": 1.85682737676331, + "grad_norm": 0.11247176676988602, + "learning_rate": 6.703044078283378e-05, + "loss": 0.2251, + "step": 66950 + }, + { + "epoch": 1.8582141037063744, + "grad_norm": 0.11113060265779495, + "learning_rate": 6.688913765926627e-05, + "loss": 0.2218, + "step": 67000 + }, + { + "epoch": 1.8582141037063744, + "eval_loss": 0.22002293169498444, + "eval_runtime": 500.6759, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 67000 + }, + { + "epoch": 1.859600830649439, + "grad_norm": 0.12426327913999557, + "learning_rate": 6.674790874819566e-05, + "loss": 0.2283, + "step": 67050 + }, + { + "epoch": 1.8609875575925034, + "grad_norm": 0.12066414952278137, + "learning_rate": 6.660675436616306e-05, + "loss": 0.2195, + "step": 67100 + }, + { + "epoch": 1.862374284535568, + "grad_norm": 0.1106274425983429, + "learning_rate": 6.646567482954262e-05, + "loss": 0.2203, + "step": 67150 + }, + { + "epoch": 1.8637610114786323, + "grad_norm": 0.15133588016033173, + "learning_rate": 6.632467045454068e-05, + "loss": 0.2222, + "step": 67200 + }, + { + "epoch": 1.8651477384216966, + "grad_norm": 0.1420869678258896, + "learning_rate": 6.618374155719507e-05, + "loss": 0.2216, + "step": 67250 + }, + { + "epoch": 1.866534465364761, + "grad_norm": 0.1180604100227356, + "learning_rate": 6.604288845337453e-05, + "loss": 0.2234, + "step": 67300 + }, + { + "epoch": 1.8679211923078256, + "grad_norm": 0.1143430843949318, + "learning_rate": 6.59021114587779e-05, + "loss": 0.2215, + "step": 67350 + }, + { + "epoch": 1.8693079192508901, + "grad_norm": 0.11402673274278641, + "learning_rate": 6.57614108889334e-05, + "loss": 0.2195, + "step": 67400 + }, + { + "epoch": 1.8706946461939546, + "grad_norm": 0.11139002442359924, + "learning_rate": 6.5620787059198e-05, + "loss": 0.2245, + "step": 67450 + }, + { + "epoch": 1.8720813731370192, + "grad_norm": 0.12388894706964493, + "learning_rate": 6.548024028475661e-05, + "loss": 0.2223, + "step": 67500 + }, + { + "epoch": 1.8734681000800835, + "grad_norm": 0.1154908537864685, + "learning_rate": 6.533977088062155e-05, + "loss": 0.2255, + "step": 67550 + }, + { + "epoch": 1.874854827023148, + "grad_norm": 0.13296931982040405, + "learning_rate": 6.519937916163161e-05, + "loss": 0.2223, + "step": 67600 + }, + { + "epoch": 1.8762415539662123, + "grad_norm": 0.1450251191854477, + "learning_rate": 6.505906544245151e-05, + "loss": 0.2235, + "step": 67650 + }, + { + "epoch": 1.8776282809092768, + "grad_norm": 0.13592007756233215, + "learning_rate": 6.491883003757108e-05, + "loss": 0.2212, + "step": 67700 + }, + { + "epoch": 1.8790150078523413, + "grad_norm": 0.12879903614521027, + "learning_rate": 6.47786732613048e-05, + "loss": 0.2212, + "step": 67750 + }, + { + "epoch": 1.8804017347954058, + "grad_norm": 0.11880529671907425, + "learning_rate": 6.463859542779072e-05, + "loss": 0.2249, + "step": 67800 + }, + { + "epoch": 1.8817884617384704, + "grad_norm": 0.09874891489744186, + "learning_rate": 6.449859685099002e-05, + "loss": 0.2205, + "step": 67850 + }, + { + "epoch": 1.8831751886815347, + "grad_norm": 0.12535731494426727, + "learning_rate": 6.43586778446863e-05, + "loss": 0.2236, + "step": 67900 + }, + { + "epoch": 1.8845619156245992, + "grad_norm": 0.10726474970579147, + "learning_rate": 6.42188387224847e-05, + "loss": 0.2228, + "step": 67950 + }, + { + "epoch": 1.8859486425676635, + "grad_norm": 0.1287633180618286, + "learning_rate": 6.407907979781145e-05, + "loss": 0.2181, + "step": 68000 + }, + { + "epoch": 1.8859486425676635, + "eval_loss": 0.2199305146932602, + "eval_runtime": 500.7534, + "eval_samples_per_second": 5.705, + "eval_steps_per_second": 5.705, + "step": 68000 + }, + { + "epoch": 1.887335369510728, + "grad_norm": 0.09685543924570084, + "learning_rate": 6.393940138391295e-05, + "loss": 0.2223, + "step": 68050 + }, + { + "epoch": 1.8887220964537925, + "grad_norm": 0.18234391510486603, + "learning_rate": 6.379980379385513e-05, + "loss": 0.2216, + "step": 68100 + }, + { + "epoch": 1.890108823396857, + "grad_norm": 0.11249957233667374, + "learning_rate": 6.366028734052279e-05, + "loss": 0.2183, + "step": 68150 + }, + { + "epoch": 1.8914955503399216, + "grad_norm": 0.14707712829113007, + "learning_rate": 6.35208523366189e-05, + "loss": 0.2239, + "step": 68200 + }, + { + "epoch": 1.8928822772829859, + "grad_norm": 0.1354626715183258, + "learning_rate": 6.338149909466387e-05, + "loss": 0.2211, + "step": 68250 + }, + { + "epoch": 1.8942690042260504, + "grad_norm": 0.11492297798395157, + "learning_rate": 6.324222792699481e-05, + "loss": 0.2233, + "step": 68300 + }, + { + "epoch": 1.8956557311691147, + "grad_norm": 0.15535590052604675, + "learning_rate": 6.310303914576487e-05, + "loss": 0.215, + "step": 68350 + }, + { + "epoch": 1.8970424581121792, + "grad_norm": 0.12155631929636002, + "learning_rate": 6.296393306294268e-05, + "loss": 0.2186, + "step": 68400 + }, + { + "epoch": 1.8984291850552437, + "grad_norm": 0.16767702996730804, + "learning_rate": 6.282490999031134e-05, + "loss": 0.2221, + "step": 68450 + }, + { + "epoch": 1.8998159119983082, + "grad_norm": 0.14361165463924408, + "learning_rate": 6.2685970239468e-05, + "loss": 0.2254, + "step": 68500 + }, + { + "epoch": 1.9012026389413728, + "grad_norm": 0.1382211595773697, + "learning_rate": 6.254711412182303e-05, + "loss": 0.2227, + "step": 68550 + }, + { + "epoch": 1.902589365884437, + "grad_norm": 0.1300189197063446, + "learning_rate": 6.240834194859931e-05, + "loss": 0.2181, + "step": 68600 + }, + { + "epoch": 1.9039760928275016, + "grad_norm": 0.1218055710196495, + "learning_rate": 6.227242696147264e-05, + "loss": 0.2234, + "step": 68650 + }, + { + "epoch": 1.9053628197705659, + "grad_norm": 0.14510662853717804, + "learning_rate": 6.213382191563584e-05, + "loss": 0.2212, + "step": 68700 + }, + { + "epoch": 1.9067495467136304, + "grad_norm": 0.1312136948108673, + "learning_rate": 6.19953017405461e-05, + "loss": 0.22, + "step": 68750 + }, + { + "epoch": 1.908136273656695, + "grad_norm": 0.1256617307662964, + "learning_rate": 6.185686674667344e-05, + "loss": 0.2247, + "step": 68800 + }, + { + "epoch": 1.9095230005997594, + "grad_norm": 0.15276198089122772, + "learning_rate": 6.171851724429687e-05, + "loss": 0.2203, + "step": 68850 + }, + { + "epoch": 1.910909727542824, + "grad_norm": 0.1289031058549881, + "learning_rate": 6.158025354350377e-05, + "loss": 0.2185, + "step": 68900 + }, + { + "epoch": 1.9122964544858885, + "grad_norm": 0.14539048075675964, + "learning_rate": 6.144207595418932e-05, + "loss": 0.2226, + "step": 68950 + }, + { + "epoch": 1.9136831814289528, + "grad_norm": 0.12622995674610138, + "learning_rate": 6.130398478605562e-05, + "loss": 0.2212, + "step": 69000 + }, + { + "epoch": 1.9136831814289528, + "eval_loss": 0.21972130239009857, + "eval_runtime": 500.141, + "eval_samples_per_second": 5.712, + "eval_steps_per_second": 5.712, + "step": 69000 + }, + { + "epoch": 1.915069908372017, + "grad_norm": 0.1135367900133133, + "learning_rate": 6.116598034861105e-05, + "loss": 0.2255, + "step": 69050 + }, + { + "epoch": 1.9164566353150816, + "grad_norm": 0.12307292968034744, + "learning_rate": 6.102806295116965e-05, + "loss": 0.2236, + "step": 69100 + }, + { + "epoch": 1.9178433622581461, + "grad_norm": 0.15067149698734283, + "learning_rate": 6.089023290285036e-05, + "loss": 0.2245, + "step": 69150 + }, + { + "epoch": 1.9192300892012106, + "grad_norm": 0.13182760775089264, + "learning_rate": 6.075249051257632e-05, + "loss": 0.2234, + "step": 69200 + }, + { + "epoch": 1.9206168161442752, + "grad_norm": 0.13820376992225647, + "learning_rate": 6.061483608907419e-05, + "loss": 0.2232, + "step": 69250 + }, + { + "epoch": 1.9220035430873397, + "grad_norm": 0.12792882323265076, + "learning_rate": 6.0477269940873505e-05, + "loss": 0.2184, + "step": 69300 + }, + { + "epoch": 1.923390270030404, + "grad_norm": 0.11212699115276337, + "learning_rate": 6.0339792376305974e-05, + "loss": 0.2206, + "step": 69350 + }, + { + "epoch": 1.9247769969734685, + "grad_norm": 0.12071092426776886, + "learning_rate": 6.020240370350465e-05, + "loss": 0.2214, + "step": 69400 + }, + { + "epoch": 1.9261637239165328, + "grad_norm": 0.14231905341148376, + "learning_rate": 6.006510423040349e-05, + "loss": 0.228, + "step": 69450 + }, + { + "epoch": 1.9275504508595973, + "grad_norm": 0.13944728672504425, + "learning_rate": 5.99278942647364e-05, + "loss": 0.2199, + "step": 69500 + }, + { + "epoch": 1.9289371778026618, + "grad_norm": 0.12536093592643738, + "learning_rate": 5.979077411403675e-05, + "loss": 0.2242, + "step": 69550 + }, + { + "epoch": 1.9303239047457263, + "grad_norm": 0.11361142992973328, + "learning_rate": 5.965374408563655e-05, + "loss": 0.2275, + "step": 69600 + }, + { + "epoch": 1.9317106316887909, + "grad_norm": 0.13786083459854126, + "learning_rate": 5.9516804486665866e-05, + "loss": 0.2276, + "step": 69650 + }, + { + "epoch": 1.9330973586318552, + "grad_norm": 0.11472176760435104, + "learning_rate": 5.9379955624052006e-05, + "loss": 0.2235, + "step": 69700 + }, + { + "epoch": 1.9344840855749197, + "grad_norm": 0.17608420550823212, + "learning_rate": 5.9243197804519036e-05, + "loss": 0.2197, + "step": 69750 + }, + { + "epoch": 1.935870812517984, + "grad_norm": 0.12841780483722687, + "learning_rate": 5.9106531334586856e-05, + "loss": 0.2228, + "step": 69800 + }, + { + "epoch": 1.9372575394610485, + "grad_norm": 0.17407533526420593, + "learning_rate": 5.8969956520570646e-05, + "loss": 0.221, + "step": 69850 + }, + { + "epoch": 1.938644266404113, + "grad_norm": 0.12569575011730194, + "learning_rate": 5.883347366858014e-05, + "loss": 0.2228, + "step": 69900 + }, + { + "epoch": 1.9400309933471775, + "grad_norm": 0.1247481182217598, + "learning_rate": 5.8697083084519025e-05, + "loss": 0.2252, + "step": 69950 + }, + { + "epoch": 1.941417720290242, + "grad_norm": 0.15431839227676392, + "learning_rate": 5.85607850740841e-05, + "loss": 0.22, + "step": 70000 + }, + { + "epoch": 1.941417720290242, + "eval_loss": 0.21946103870868683, + "eval_runtime": 500.6645, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 70000 + }, + { + "epoch": 1.9428044472333064, + "grad_norm": 0.13469503819942474, + "learning_rate": 5.842457994276473e-05, + "loss": 0.2178, + "step": 70050 + }, + { + "epoch": 1.9441911741763709, + "grad_norm": 0.103294737637043, + "learning_rate": 5.828846799584203e-05, + "loss": 0.2227, + "step": 70100 + }, + { + "epoch": 1.9455779011194352, + "grad_norm": 0.12273615598678589, + "learning_rate": 5.815244953838843e-05, + "loss": 0.2227, + "step": 70150 + }, + { + "epoch": 1.9469646280624997, + "grad_norm": 0.1339019238948822, + "learning_rate": 5.80165248752666e-05, + "loss": 0.215, + "step": 70200 + }, + { + "epoch": 1.9483513550055642, + "grad_norm": 0.10198177397251129, + "learning_rate": 5.788069431112913e-05, + "loss": 0.2222, + "step": 70250 + }, + { + "epoch": 1.9497380819486287, + "grad_norm": 0.14309309422969818, + "learning_rate": 5.7744958150417685e-05, + "loss": 0.223, + "step": 70300 + }, + { + "epoch": 1.9511248088916933, + "grad_norm": 0.11085857450962067, + "learning_rate": 5.760931669736226e-05, + "loss": 0.2246, + "step": 70350 + }, + { + "epoch": 1.9525115358347578, + "grad_norm": 0.13969945907592773, + "learning_rate": 5.7473770255980705e-05, + "loss": 0.2223, + "step": 70400 + }, + { + "epoch": 1.953898262777822, + "grad_norm": 0.14510655403137207, + "learning_rate": 5.734102721654016e-05, + "loss": 0.221, + "step": 70450 + }, + { + "epoch": 1.9552849897208864, + "grad_norm": 0.12342046946287155, + "learning_rate": 5.720566979435193e-05, + "loss": 0.2223, + "step": 70500 + }, + { + "epoch": 1.956671716663951, + "grad_norm": 0.1510363519191742, + "learning_rate": 5.707040828854496e-05, + "loss": 0.2236, + "step": 70550 + }, + { + "epoch": 1.9580584436070154, + "grad_norm": 0.1418016403913498, + "learning_rate": 5.6935243002285547e-05, + "loss": 0.2258, + "step": 70600 + }, + { + "epoch": 1.95944517055008, + "grad_norm": 0.12346237152814865, + "learning_rate": 5.680017423852406e-05, + "loss": 0.2184, + "step": 70650 + }, + { + "epoch": 1.9608318974931445, + "grad_norm": 0.1376170963048935, + "learning_rate": 5.666520229999489e-05, + "loss": 0.2241, + "step": 70700 + }, + { + "epoch": 1.962218624436209, + "grad_norm": 0.13145020604133606, + "learning_rate": 5.6530327489215084e-05, + "loss": 0.2245, + "step": 70750 + }, + { + "epoch": 1.9636053513792733, + "grad_norm": 0.14705248177051544, + "learning_rate": 5.639555010848416e-05, + "loss": 0.2217, + "step": 70800 + }, + { + "epoch": 1.9649920783223378, + "grad_norm": 0.1190200001001358, + "learning_rate": 5.6260870459883264e-05, + "loss": 0.2212, + "step": 70850 + }, + { + "epoch": 1.966378805265402, + "grad_norm": 0.1403389275074005, + "learning_rate": 5.612628884527436e-05, + "loss": 0.2227, + "step": 70900 + }, + { + "epoch": 1.9677655322084666, + "grad_norm": 0.10089599341154099, + "learning_rate": 5.5991805566299884e-05, + "loss": 0.2242, + "step": 70950 + }, + { + "epoch": 1.9691522591515311, + "grad_norm": 0.1129344254732132, + "learning_rate": 5.5857420924381665e-05, + "loss": 0.218, + "step": 71000 + }, + { + "epoch": 1.9691522591515311, + "eval_loss": 0.21950487792491913, + "eval_runtime": 500.1742, + "eval_samples_per_second": 5.712, + "eval_steps_per_second": 5.712, + "step": 71000 + }, + { + "epoch": 1.9705389860945957, + "grad_norm": 0.11886442452669144, + "learning_rate": 5.57231352207206e-05, + "loss": 0.2208, + "step": 71050 + }, + { + "epoch": 1.9719257130376602, + "grad_norm": 0.13062913715839386, + "learning_rate": 5.5588948756295787e-05, + "loss": 0.222, + "step": 71100 + }, + { + "epoch": 1.9733124399807245, + "grad_norm": 0.11810411512851715, + "learning_rate": 5.5454861831863905e-05, + "loss": 0.2198, + "step": 71150 + }, + { + "epoch": 1.974699166923789, + "grad_norm": 0.1369456797838211, + "learning_rate": 5.5320874747958475e-05, + "loss": 0.2224, + "step": 71200 + }, + { + "epoch": 1.9760858938668533, + "grad_norm": 0.1086314469575882, + "learning_rate": 5.51869878048893e-05, + "loss": 0.2207, + "step": 71250 + }, + { + "epoch": 1.9774726208099178, + "grad_norm": 0.13363894820213318, + "learning_rate": 5.5053201302741765e-05, + "loss": 0.2219, + "step": 71300 + }, + { + "epoch": 1.9788593477529823, + "grad_norm": 0.12869518995285034, + "learning_rate": 5.491951554137602e-05, + "loss": 0.223, + "step": 71350 + }, + { + "epoch": 1.9802460746960469, + "grad_norm": 0.11446097493171692, + "learning_rate": 5.478593082042655e-05, + "loss": 0.22, + "step": 71400 + }, + { + "epoch": 1.9816328016391114, + "grad_norm": 0.1284862756729126, + "learning_rate": 5.4652447439301204e-05, + "loss": 0.219, + "step": 71450 + }, + { + "epoch": 1.9830195285821757, + "grad_norm": 0.1428086757659912, + "learning_rate": 5.451906569718095e-05, + "loss": 0.2222, + "step": 71500 + }, + { + "epoch": 1.9844062555252402, + "grad_norm": 0.15157289803028107, + "learning_rate": 5.43857858930187e-05, + "loss": 0.2199, + "step": 71550 + }, + { + "epoch": 1.9857929824683045, + "grad_norm": 0.12832427024841309, + "learning_rate": 5.4252608325539066e-05, + "loss": 0.2248, + "step": 71600 + }, + { + "epoch": 1.987179709411369, + "grad_norm": 0.16493399441242218, + "learning_rate": 5.411953329323736e-05, + "loss": 0.2212, + "step": 71650 + }, + { + "epoch": 1.9885664363544335, + "grad_norm": 0.1472720503807068, + "learning_rate": 5.3986561094379226e-05, + "loss": 0.2229, + "step": 71700 + }, + { + "epoch": 1.989953163297498, + "grad_norm": 0.17510546743869781, + "learning_rate": 5.3853692026999704e-05, + "loss": 0.2217, + "step": 71750 + }, + { + "epoch": 1.9913398902405626, + "grad_norm": 0.12741264700889587, + "learning_rate": 5.372092638890274e-05, + "loss": 0.2205, + "step": 71800 + }, + { + "epoch": 1.9927266171836269, + "grad_norm": 0.13984480500221252, + "learning_rate": 5.358826447766052e-05, + "loss": 0.218, + "step": 71850 + }, + { + "epoch": 1.9941133441266914, + "grad_norm": 0.14489765465259552, + "learning_rate": 5.345570659061254e-05, + "loss": 0.2256, + "step": 71900 + }, + { + "epoch": 1.9955000710697557, + "grad_norm": 0.11793538182973862, + "learning_rate": 5.332325302486545e-05, + "loss": 0.2214, + "step": 71950 + }, + { + "epoch": 1.9968867980128202, + "grad_norm": 0.11862395703792572, + "learning_rate": 5.3190904077291794e-05, + "loss": 0.2185, + "step": 72000 + }, + { + "epoch": 1.9968867980128202, + "eval_loss": 0.21919070184230804, + "eval_runtime": 500.6392, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 72000 + }, + { + "epoch": 1.9982735249558847, + "grad_norm": 0.14340032637119293, + "learning_rate": 5.305866004452982e-05, + "loss": 0.2204, + "step": 72050 + }, + { + "epoch": 1.9996602518989492, + "grad_norm": 0.10452345758676529, + "learning_rate": 5.2926521222982494e-05, + "loss": 0.2243, + "step": 72100 + }, + { + "epoch": 2.0010469788420138, + "grad_norm": 0.1440684199333191, + "learning_rate": 5.279448790881709e-05, + "loss": 0.2194, + "step": 72150 + }, + { + "epoch": 2.0024337057850783, + "grad_norm": 0.1383608728647232, + "learning_rate": 5.2662560397964265e-05, + "loss": 0.2185, + "step": 72200 + }, + { + "epoch": 2.0038204327281424, + "grad_norm": 0.12105223536491394, + "learning_rate": 5.253073898611769e-05, + "loss": 0.2176, + "step": 72250 + }, + { + "epoch": 2.005207159671207, + "grad_norm": 0.13564659655094147, + "learning_rate": 5.239902396873312e-05, + "loss": 0.2188, + "step": 72300 + }, + { + "epoch": 2.0065938866142714, + "grad_norm": 0.10812518000602722, + "learning_rate": 5.226741564102793e-05, + "loss": 0.221, + "step": 72350 + }, + { + "epoch": 2.007980613557336, + "grad_norm": 0.1458885222673416, + "learning_rate": 5.2135914297980257e-05, + "loss": 0.222, + "step": 72400 + }, + { + "epoch": 2.0093673405004004, + "grad_norm": 0.13904157280921936, + "learning_rate": 5.2004520234328556e-05, + "loss": 0.2226, + "step": 72450 + }, + { + "epoch": 2.010754067443465, + "grad_norm": 0.12214989960193634, + "learning_rate": 5.18732337445708e-05, + "loss": 0.2179, + "step": 72500 + }, + { + "epoch": 2.0121407943865295, + "grad_norm": 0.1347937136888504, + "learning_rate": 5.1742055122963804e-05, + "loss": 0.2153, + "step": 72550 + }, + { + "epoch": 2.013527521329594, + "grad_norm": 0.17244890332221985, + "learning_rate": 5.161098466352271e-05, + "loss": 0.2201, + "step": 72600 + }, + { + "epoch": 2.014914248272658, + "grad_norm": 0.1166064664721489, + "learning_rate": 5.14800226600201e-05, + "loss": 0.2212, + "step": 72650 + }, + { + "epoch": 2.0163009752157226, + "grad_norm": 0.11647620797157288, + "learning_rate": 5.134916940598558e-05, + "loss": 0.2161, + "step": 72700 + }, + { + "epoch": 2.017687702158787, + "grad_norm": 0.133785679936409, + "learning_rate": 5.121842519470501e-05, + "loss": 0.2214, + "step": 72750 + }, + { + "epoch": 2.0190744291018516, + "grad_norm": 0.11569181829690933, + "learning_rate": 5.108779031921982e-05, + "loss": 0.2181, + "step": 72800 + }, + { + "epoch": 2.020461156044916, + "grad_norm": 0.16622135043144226, + "learning_rate": 5.095726507232631e-05, + "loss": 0.2178, + "step": 72850 + }, + { + "epoch": 2.0218478829879807, + "grad_norm": 0.13199181854724884, + "learning_rate": 5.082684974657519e-05, + "loss": 0.2218, + "step": 72900 + }, + { + "epoch": 2.023234609931045, + "grad_norm": 0.12465277314186096, + "learning_rate": 5.069654463427077e-05, + "loss": 0.2211, + "step": 72950 + }, + { + "epoch": 2.0246213368741093, + "grad_norm": 0.11640169471502304, + "learning_rate": 5.0566350027470235e-05, + "loss": 0.2189, + "step": 73000 + }, + { + "epoch": 2.0246213368741093, + "eval_loss": 0.21916010975837708, + "eval_runtime": 500.4715, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 73000 + }, + { + "epoch": 2.026008063817174, + "grad_norm": 0.13460282981395721, + "learning_rate": 5.0436266217983255e-05, + "loss": 0.2175, + "step": 73050 + }, + { + "epoch": 2.0273947907602383, + "grad_norm": 0.14442716538906097, + "learning_rate": 5.030629349737095e-05, + "loss": 0.2217, + "step": 73100 + }, + { + "epoch": 2.028781517703303, + "grad_norm": 0.12992683053016663, + "learning_rate": 5.017643215694573e-05, + "loss": 0.2183, + "step": 73150 + }, + { + "epoch": 2.0301682446463674, + "grad_norm": 0.12550853192806244, + "learning_rate": 5.00466824877701e-05, + "loss": 0.2186, + "step": 73200 + }, + { + "epoch": 2.031554971589432, + "grad_norm": 0.153733029961586, + "learning_rate": 4.9917044780656474e-05, + "loss": 0.2175, + "step": 73250 + }, + { + "epoch": 2.0329416985324964, + "grad_norm": 0.13093726336956024, + "learning_rate": 4.978751932616615e-05, + "loss": 0.2206, + "step": 73300 + }, + { + "epoch": 2.0343284254755605, + "grad_norm": 0.13688836991786957, + "learning_rate": 4.9658106414608995e-05, + "loss": 0.2213, + "step": 73350 + }, + { + "epoch": 2.035715152418625, + "grad_norm": 0.1340765506029129, + "learning_rate": 4.9528806336042475e-05, + "loss": 0.2195, + "step": 73400 + }, + { + "epoch": 2.0371018793616895, + "grad_norm": 0.13744907081127167, + "learning_rate": 4.9399619380271267e-05, + "loss": 0.2196, + "step": 73450 + }, + { + "epoch": 2.038488606304754, + "grad_norm": 0.11359039694070816, + "learning_rate": 4.927054583684647e-05, + "loss": 0.2175, + "step": 73500 + }, + { + "epoch": 2.0398753332478186, + "grad_norm": 0.1374887377023697, + "learning_rate": 4.914158599506499e-05, + "loss": 0.2191, + "step": 73550 + }, + { + "epoch": 2.041262060190883, + "grad_norm": 0.1326727718114853, + "learning_rate": 4.901274014396892e-05, + "loss": 0.2174, + "step": 73600 + }, + { + "epoch": 2.0426487871339476, + "grad_norm": 0.1448783129453659, + "learning_rate": 4.8884008572344753e-05, + "loss": 0.218, + "step": 73650 + }, + { + "epoch": 2.0440355140770117, + "grad_norm": 0.15006962418556213, + "learning_rate": 4.8755391568723e-05, + "loss": 0.2153, + "step": 73700 + }, + { + "epoch": 2.045422241020076, + "grad_norm": 0.11637504398822784, + "learning_rate": 4.862688942137723e-05, + "loss": 0.2209, + "step": 73750 + }, + { + "epoch": 2.0468089679631407, + "grad_norm": 0.15461041033267975, + "learning_rate": 4.849850241832373e-05, + "loss": 0.2218, + "step": 73800 + }, + { + "epoch": 2.0481956949062052, + "grad_norm": 0.11848367750644684, + "learning_rate": 4.837023084732056e-05, + "loss": 0.2199, + "step": 73850 + }, + { + "epoch": 2.0495824218492698, + "grad_norm": 0.1128213182091713, + "learning_rate": 4.824207499586719e-05, + "loss": 0.2235, + "step": 73900 + }, + { + "epoch": 2.0509691487923343, + "grad_norm": 0.13028693199157715, + "learning_rate": 4.811403515120364e-05, + "loss": 0.2189, + "step": 73950 + }, + { + "epoch": 2.052355875735399, + "grad_norm": 0.1149388998746872, + "learning_rate": 4.798611160031001e-05, + "loss": 0.222, + "step": 74000 + }, + { + "epoch": 2.052355875735399, + "eval_loss": 0.21903079748153687, + "eval_runtime": 500.4713, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 74000 + }, + { + "epoch": 2.0537426026784633, + "grad_norm": 0.11896523833274841, + "learning_rate": 4.7858304629905606e-05, + "loss": 0.2157, + "step": 74050 + }, + { + "epoch": 2.0551293296215274, + "grad_norm": 0.12097673863172531, + "learning_rate": 4.7730614526448546e-05, + "loss": 0.2204, + "step": 74100 + }, + { + "epoch": 2.056516056564592, + "grad_norm": 0.1441742181777954, + "learning_rate": 4.760304157613503e-05, + "loss": 0.2195, + "step": 74150 + }, + { + "epoch": 2.0579027835076564, + "grad_norm": 0.12328975647687912, + "learning_rate": 4.7475586064898545e-05, + "loss": 0.2183, + "step": 74200 + }, + { + "epoch": 2.059289510450721, + "grad_norm": 0.11700959503650665, + "learning_rate": 4.734824827840954e-05, + "loss": 0.2219, + "step": 74250 + }, + { + "epoch": 2.0606762373937855, + "grad_norm": 0.11523609608411789, + "learning_rate": 4.722102850207437e-05, + "loss": 0.2197, + "step": 74300 + }, + { + "epoch": 2.06206296433685, + "grad_norm": 0.10766173154115677, + "learning_rate": 4.70939270210352e-05, + "loss": 0.2222, + "step": 74350 + }, + { + "epoch": 2.0634496912799145, + "grad_norm": 0.1485738754272461, + "learning_rate": 4.6966944120168754e-05, + "loss": 0.2171, + "step": 74400 + }, + { + "epoch": 2.0648364182229786, + "grad_norm": 0.12434457242488861, + "learning_rate": 4.684008008408619e-05, + "loss": 0.2194, + "step": 74450 + }, + { + "epoch": 2.066223145166043, + "grad_norm": 0.11212220788002014, + "learning_rate": 4.671333519713209e-05, + "loss": 0.2244, + "step": 74500 + }, + { + "epoch": 2.0676098721091076, + "grad_norm": 0.1226951852440834, + "learning_rate": 4.658670974338409e-05, + "loss": 0.2176, + "step": 74550 + }, + { + "epoch": 2.068996599052172, + "grad_norm": 0.1348971724510193, + "learning_rate": 4.6460204006652174e-05, + "loss": 0.2161, + "step": 74600 + }, + { + "epoch": 2.0703833259952367, + "grad_norm": 0.11871132254600525, + "learning_rate": 4.633381827047782e-05, + "loss": 0.2216, + "step": 74650 + }, + { + "epoch": 2.071770052938301, + "grad_norm": 0.13671475648880005, + "learning_rate": 4.620755281813376e-05, + "loss": 0.2143, + "step": 74700 + }, + { + "epoch": 2.0731567798813657, + "grad_norm": 0.11762852221727371, + "learning_rate": 4.60814079326229e-05, + "loss": 0.2194, + "step": 74750 + }, + { + "epoch": 2.07454350682443, + "grad_norm": 0.14862984418869019, + "learning_rate": 4.59553838966782e-05, + "loss": 0.2223, + "step": 74800 + }, + { + "epoch": 2.0759302337674943, + "grad_norm": 0.16189099848270416, + "learning_rate": 4.58294809927615e-05, + "loss": 0.2203, + "step": 74850 + }, + { + "epoch": 2.077316960710559, + "grad_norm": 0.15650849044322968, + "learning_rate": 4.5703699503063294e-05, + "loss": 0.2204, + "step": 74900 + }, + { + "epoch": 2.0787036876536233, + "grad_norm": 0.1271338164806366, + "learning_rate": 4.557803970950182e-05, + "loss": 0.2179, + "step": 74950 + }, + { + "epoch": 2.080090414596688, + "grad_norm": 0.14354722201824188, + "learning_rate": 4.545250189372268e-05, + "loss": 0.2166, + "step": 75000 + }, + { + "epoch": 2.080090414596688, + "eval_loss": 0.2189180701971054, + "eval_runtime": 500.4567, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 75000 + }, + { + "epoch": 2.0814771415397524, + "grad_norm": 0.15187138319015503, + "learning_rate": 4.5327086337098056e-05, + "loss": 0.2223, + "step": 75050 + }, + { + "epoch": 2.082863868482817, + "grad_norm": 0.13435395061969757, + "learning_rate": 4.5201793320726016e-05, + "loss": 0.2206, + "step": 75100 + }, + { + "epoch": 2.084250595425881, + "grad_norm": 0.13874699175357819, + "learning_rate": 4.507662312543007e-05, + "loss": 0.2208, + "step": 75150 + }, + { + "epoch": 2.0856373223689455, + "grad_norm": 0.1275242120027542, + "learning_rate": 4.495157603175842e-05, + "loss": 0.2204, + "step": 75200 + }, + { + "epoch": 2.08702404931201, + "grad_norm": 0.17980913817882538, + "learning_rate": 4.482665231998338e-05, + "loss": 0.2254, + "step": 75250 + }, + { + "epoch": 2.0884107762550745, + "grad_norm": 0.1556035280227661, + "learning_rate": 4.470185227010064e-05, + "loss": 0.2167, + "step": 75300 + }, + { + "epoch": 2.089797503198139, + "grad_norm": 0.12536190450191498, + "learning_rate": 4.4577176161828835e-05, + "loss": 0.2204, + "step": 75350 + }, + { + "epoch": 2.0911842301412036, + "grad_norm": 0.16269785165786743, + "learning_rate": 4.445262427460868e-05, + "loss": 0.218, + "step": 75400 + }, + { + "epoch": 2.092570957084268, + "grad_norm": 0.16465267539024353, + "learning_rate": 4.4328196887602616e-05, + "loss": 0.2201, + "step": 75450 + }, + { + "epoch": 2.093957684027332, + "grad_norm": 0.12975232303142548, + "learning_rate": 4.420389427969386e-05, + "loss": 0.22, + "step": 75500 + }, + { + "epoch": 2.0953444109703967, + "grad_norm": 0.11653002351522446, + "learning_rate": 4.407971672948612e-05, + "loss": 0.2144, + "step": 75550 + }, + { + "epoch": 2.096731137913461, + "grad_norm": 0.1448681503534317, + "learning_rate": 4.3955664515302744e-05, + "loss": 0.2237, + "step": 75600 + }, + { + "epoch": 2.0981178648565257, + "grad_norm": 0.1351730227470398, + "learning_rate": 4.3831737915186144e-05, + "loss": 0.2204, + "step": 75650 + }, + { + "epoch": 2.0995045917995903, + "grad_norm": 0.12176606059074402, + "learning_rate": 4.370793720689724e-05, + "loss": 0.2219, + "step": 75700 + }, + { + "epoch": 2.100891318742655, + "grad_norm": 0.152371346950531, + "learning_rate": 4.3584262667914696e-05, + "loss": 0.2211, + "step": 75750 + }, + { + "epoch": 2.1022780456857193, + "grad_norm": 0.11460871249437332, + "learning_rate": 4.3460714575434517e-05, + "loss": 0.2199, + "step": 75800 + }, + { + "epoch": 2.1036647726287834, + "grad_norm": 0.12700140476226807, + "learning_rate": 4.3337293206369125e-05, + "loss": 0.2194, + "step": 75850 + }, + { + "epoch": 2.105051499571848, + "grad_norm": 0.15812121331691742, + "learning_rate": 4.3213998837347116e-05, + "loss": 0.2196, + "step": 75900 + }, + { + "epoch": 2.1064382265149124, + "grad_norm": 0.13531926274299622, + "learning_rate": 4.309083174471221e-05, + "loss": 0.2209, + "step": 75950 + }, + { + "epoch": 2.107824953457977, + "grad_norm": 0.13829369843006134, + "learning_rate": 4.2967792204523136e-05, + "loss": 0.2183, + "step": 76000 + }, + { + "epoch": 2.107824953457977, + "eval_loss": 0.2188321202993393, + "eval_runtime": 500.5809, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 76000 + }, + { + "epoch": 2.1092116804010415, + "grad_norm": 0.13063254952430725, + "learning_rate": 4.284488049255246e-05, + "loss": 0.2162, + "step": 76050 + }, + { + "epoch": 2.110598407344106, + "grad_norm": 0.13592955470085144, + "learning_rate": 4.272209688428646e-05, + "loss": 0.2186, + "step": 76100 + }, + { + "epoch": 2.1119851342871705, + "grad_norm": 0.1165243461728096, + "learning_rate": 4.25994416549241e-05, + "loss": 0.2158, + "step": 76150 + }, + { + "epoch": 2.113371861230235, + "grad_norm": 0.14689646661281586, + "learning_rate": 4.247691507937673e-05, + "loss": 0.2211, + "step": 76200 + }, + { + "epoch": 2.114758588173299, + "grad_norm": 0.11267469078302383, + "learning_rate": 4.235451743226737e-05, + "loss": 0.2178, + "step": 76250 + }, + { + "epoch": 2.1161453151163636, + "grad_norm": 0.15382416546344757, + "learning_rate": 4.2232248987929936e-05, + "loss": 0.2197, + "step": 76300 + }, + { + "epoch": 2.117532042059428, + "grad_norm": 0.15660510957241058, + "learning_rate": 4.2110110020408855e-05, + "loss": 0.2201, + "step": 76350 + }, + { + "epoch": 2.1189187690024927, + "grad_norm": 0.13238658010959625, + "learning_rate": 4.198810080345834e-05, + "loss": 0.2208, + "step": 76400 + }, + { + "epoch": 2.120305495945557, + "grad_norm": 0.11956652998924255, + "learning_rate": 4.186622161054181e-05, + "loss": 0.2193, + "step": 76450 + }, + { + "epoch": 2.1216922228886217, + "grad_norm": 0.12554652988910675, + "learning_rate": 4.174690641406727e-05, + "loss": 0.219, + "step": 76500 + }, + { + "epoch": 2.123078949831686, + "grad_norm": 0.13725616037845612, + "learning_rate": 4.162528547436844e-05, + "loss": 0.2175, + "step": 76550 + }, + { + "epoch": 2.1244656767747503, + "grad_norm": 0.12407530844211578, + "learning_rate": 4.1503795371893814e-05, + "loss": 0.2188, + "step": 76600 + }, + { + "epoch": 2.125852403717815, + "grad_norm": 0.1060599759221077, + "learning_rate": 4.1382436378943334e-05, + "loss": 0.219, + "step": 76650 + }, + { + "epoch": 2.1272391306608793, + "grad_norm": 0.12038925290107727, + "learning_rate": 4.126120876752295e-05, + "loss": 0.2237, + "step": 76700 + }, + { + "epoch": 2.128625857603944, + "grad_norm": 0.12510186433792114, + "learning_rate": 4.114011280934425e-05, + "loss": 0.2172, + "step": 76750 + }, + { + "epoch": 2.1300125845470084, + "grad_norm": 0.12675727903842926, + "learning_rate": 4.102156676187841e-05, + "loss": 0.2233, + "step": 76800 + }, + { + "epoch": 2.131399311490073, + "grad_norm": 0.14749465882778168, + "learning_rate": 4.090073227756616e-05, + "loss": 0.221, + "step": 76850 + }, + { + "epoch": 2.1327860384331374, + "grad_norm": 0.1391042023897171, + "learning_rate": 4.07800302544438e-05, + "loss": 0.221, + "step": 76900 + }, + { + "epoch": 2.1341727653762015, + "grad_norm": 0.10255371034145355, + "learning_rate": 4.0659460963044785e-05, + "loss": 0.2123, + "step": 76950 + }, + { + "epoch": 2.135559492319266, + "grad_norm": 0.11783240735530853, + "learning_rate": 4.0539024673605206e-05, + "loss": 0.219, + "step": 77000 + }, + { + "epoch": 2.135559492319266, + "eval_loss": 0.21858830749988556, + "eval_runtime": 500.5181, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 77000 + }, + { + "epoch": 2.1369462192623305, + "grad_norm": 0.1376359611749649, + "learning_rate": 4.041872165606292e-05, + "loss": 0.2194, + "step": 77050 + }, + { + "epoch": 2.138332946205395, + "grad_norm": 0.12311622500419617, + "learning_rate": 4.029855218005727e-05, + "loss": 0.2212, + "step": 77100 + }, + { + "epoch": 2.1397196731484596, + "grad_norm": 0.11871961504220963, + "learning_rate": 4.017851651492808e-05, + "loss": 0.2196, + "step": 77150 + }, + { + "epoch": 2.141106400091524, + "grad_norm": 0.11650574952363968, + "learning_rate": 4.005861492971541e-05, + "loss": 0.2181, + "step": 77200 + }, + { + "epoch": 2.1424931270345886, + "grad_norm": 0.13727551698684692, + "learning_rate": 3.9938847693158685e-05, + "loss": 0.2218, + "step": 77250 + }, + { + "epoch": 2.143879853977653, + "grad_norm": 0.11267198622226715, + "learning_rate": 3.981921507369629e-05, + "loss": 0.2196, + "step": 77300 + }, + { + "epoch": 2.145266580920717, + "grad_norm": 0.1510041058063507, + "learning_rate": 3.9699717339464915e-05, + "loss": 0.2168, + "step": 77350 + }, + { + "epoch": 2.1466533078637817, + "grad_norm": 0.12016372382640839, + "learning_rate": 3.95803547582988e-05, + "loss": 0.2202, + "step": 77400 + }, + { + "epoch": 2.1480400348068462, + "grad_norm": 0.13027046620845795, + "learning_rate": 3.9461127597729366e-05, + "loss": 0.2192, + "step": 77450 + }, + { + "epoch": 2.1494267617499108, + "grad_norm": 0.10763117671012878, + "learning_rate": 3.934203612498449e-05, + "loss": 0.2174, + "step": 77500 + }, + { + "epoch": 2.1508134886929753, + "grad_norm": 0.12586219608783722, + "learning_rate": 3.922308060698797e-05, + "loss": 0.2196, + "step": 77550 + }, + { + "epoch": 2.15220021563604, + "grad_norm": 0.13267678022384644, + "learning_rate": 3.910426131035876e-05, + "loss": 0.2164, + "step": 77600 + }, + { + "epoch": 2.1535869425791043, + "grad_norm": 0.14239874482154846, + "learning_rate": 3.8985578501410635e-05, + "loss": 0.2182, + "step": 77650 + }, + { + "epoch": 2.1549736695221684, + "grad_norm": 0.11145935207605362, + "learning_rate": 3.886703244615132e-05, + "loss": 0.2174, + "step": 77700 + }, + { + "epoch": 2.156360396465233, + "grad_norm": 0.14922165870666504, + "learning_rate": 3.874862341028216e-05, + "loss": 0.2225, + "step": 77750 + }, + { + "epoch": 2.1577471234082974, + "grad_norm": 0.16182860732078552, + "learning_rate": 3.863035165919735e-05, + "loss": 0.216, + "step": 77800 + }, + { + "epoch": 2.159133850351362, + "grad_norm": 0.11681864410638809, + "learning_rate": 3.85122174579833e-05, + "loss": 0.2222, + "step": 77850 + }, + { + "epoch": 2.1605205772944265, + "grad_norm": 0.11636471748352051, + "learning_rate": 3.839422107141826e-05, + "loss": 0.2156, + "step": 77900 + }, + { + "epoch": 2.161907304237491, + "grad_norm": 0.12071753293275833, + "learning_rate": 3.827636276397149e-05, + "loss": 0.2176, + "step": 77950 + }, + { + "epoch": 2.1632940311805555, + "grad_norm": 0.14000236988067627, + "learning_rate": 3.815864279980284e-05, + "loss": 0.2192, + "step": 78000 + }, + { + "epoch": 2.1632940311805555, + "eval_loss": 0.21850600838661194, + "eval_runtime": 500.629, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 78000 + }, + { + "epoch": 2.1646807581236196, + "grad_norm": 0.12112339586019516, + "learning_rate": 3.8041061442762015e-05, + "loss": 0.2194, + "step": 78050 + }, + { + "epoch": 2.166067485066684, + "grad_norm": 0.1251995861530304, + "learning_rate": 3.792361895638814e-05, + "loss": 0.2201, + "step": 78100 + }, + { + "epoch": 2.1674542120097486, + "grad_norm": 0.11946742981672287, + "learning_rate": 3.780631560390897e-05, + "loss": 0.2214, + "step": 78150 + }, + { + "epoch": 2.168840938952813, + "grad_norm": 0.11297029256820679, + "learning_rate": 3.768915164824055e-05, + "loss": 0.2169, + "step": 78200 + }, + { + "epoch": 2.1702276658958777, + "grad_norm": 0.13184936344623566, + "learning_rate": 3.7572127351986316e-05, + "loss": 0.2201, + "step": 78250 + }, + { + "epoch": 2.171614392838942, + "grad_norm": 0.14334504306316376, + "learning_rate": 3.7455242977436924e-05, + "loss": 0.2201, + "step": 78300 + }, + { + "epoch": 2.1730011197820067, + "grad_norm": 0.1557203084230423, + "learning_rate": 3.733849878656918e-05, + "loss": 0.2134, + "step": 78350 + }, + { + "epoch": 2.174387846725071, + "grad_norm": 0.16170355677604675, + "learning_rate": 3.722189504104583e-05, + "loss": 0.2184, + "step": 78400 + }, + { + "epoch": 2.1757745736681353, + "grad_norm": 0.15113234519958496, + "learning_rate": 3.7105432002214815e-05, + "loss": 0.2219, + "step": 78450 + }, + { + "epoch": 2.1771613006112, + "grad_norm": 0.14761875569820404, + "learning_rate": 3.698910993110864e-05, + "loss": 0.2217, + "step": 78500 + }, + { + "epoch": 2.1785480275542644, + "grad_norm": 0.1473364681005478, + "learning_rate": 3.6872929088443945e-05, + "loss": 0.2228, + "step": 78550 + }, + { + "epoch": 2.179934754497329, + "grad_norm": 0.11361527442932129, + "learning_rate": 3.6756889734620735e-05, + "loss": 0.2166, + "step": 78600 + }, + { + "epoch": 2.1813214814403934, + "grad_norm": 0.12869326770305634, + "learning_rate": 3.664099212972202e-05, + "loss": 0.221, + "step": 78650 + }, + { + "epoch": 2.182708208383458, + "grad_norm": 0.12593664228916168, + "learning_rate": 3.6525236533512896e-05, + "loss": 0.2193, + "step": 78700 + }, + { + "epoch": 2.184094935326522, + "grad_norm": 0.13714058697223663, + "learning_rate": 3.640962320544047e-05, + "loss": 0.22, + "step": 78750 + }, + { + "epoch": 2.1854816622695865, + "grad_norm": 0.12551981210708618, + "learning_rate": 3.6294152404632685e-05, + "loss": 0.2217, + "step": 78800 + }, + { + "epoch": 2.186868389212651, + "grad_norm": 0.139542818069458, + "learning_rate": 3.617882438989822e-05, + "loss": 0.2167, + "step": 78850 + }, + { + "epoch": 2.1882551161557156, + "grad_norm": 0.15694187581539154, + "learning_rate": 3.606363941972561e-05, + "loss": 0.2223, + "step": 78900 + }, + { + "epoch": 2.18964184309878, + "grad_norm": 0.12061590701341629, + "learning_rate": 3.5948597752282854e-05, + "loss": 0.2199, + "step": 78950 + }, + { + "epoch": 2.1910285700418446, + "grad_norm": 0.1186976209282875, + "learning_rate": 3.583369964541677e-05, + "loss": 0.221, + "step": 79000 + }, + { + "epoch": 2.1910285700418446, + "eval_loss": 0.21846945583820343, + "eval_runtime": 500.8819, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 79000 + }, + { + "epoch": 2.192415296984909, + "grad_norm": 0.14024211466312408, + "learning_rate": 3.5718945356652314e-05, + "loss": 0.2195, + "step": 79050 + }, + { + "epoch": 2.193802023927973, + "grad_norm": 0.14544863998889923, + "learning_rate": 3.560433514319217e-05, + "loss": 0.2194, + "step": 79100 + }, + { + "epoch": 2.1951887508710377, + "grad_norm": 0.14944560825824738, + "learning_rate": 3.548986926191612e-05, + "loss": 0.2171, + "step": 79150 + }, + { + "epoch": 2.1965754778141022, + "grad_norm": 0.1533380150794983, + "learning_rate": 3.537554796938044e-05, + "loss": 0.2174, + "step": 79200 + }, + { + "epoch": 2.1979622047571667, + "grad_norm": 0.1631087213754654, + "learning_rate": 3.526365362963201e-05, + "loss": 0.2197, + "step": 79250 + }, + { + "epoch": 2.1993489317002313, + "grad_norm": 0.11946437507867813, + "learning_rate": 3.514961937842551e-05, + "loss": 0.2172, + "step": 79300 + }, + { + "epoch": 2.200735658643296, + "grad_norm": 0.11075513809919357, + "learning_rate": 3.5035730478572906e-05, + "loss": 0.2198, + "step": 79350 + }, + { + "epoch": 2.2021223855863603, + "grad_norm": 0.151596337556839, + "learning_rate": 3.49219871853373e-05, + "loss": 0.2214, + "step": 79400 + }, + { + "epoch": 2.2035091125294244, + "grad_norm": 0.15046928822994232, + "learning_rate": 3.4808389753655324e-05, + "loss": 0.2189, + "step": 79450 + }, + { + "epoch": 2.204895839472489, + "grad_norm": 0.1315099596977234, + "learning_rate": 3.469493843813677e-05, + "loss": 0.2205, + "step": 79500 + }, + { + "epoch": 2.2062825664155534, + "grad_norm": 0.15399102866649628, + "learning_rate": 3.458163349306397e-05, + "loss": 0.2191, + "step": 79550 + }, + { + "epoch": 2.207669293358618, + "grad_norm": 0.1285402774810791, + "learning_rate": 3.4468475172391054e-05, + "loss": 0.2211, + "step": 79600 + }, + { + "epoch": 2.2090560203016825, + "grad_norm": 0.13388928771018982, + "learning_rate": 3.435546372974363e-05, + "loss": 0.2197, + "step": 79650 + }, + { + "epoch": 2.210442747244747, + "grad_norm": 0.13818307220935822, + "learning_rate": 3.424259941841807e-05, + "loss": 0.2174, + "step": 79700 + }, + { + "epoch": 2.2118294741878115, + "grad_norm": 0.11872979253530502, + "learning_rate": 3.4129882491381015e-05, + "loss": 0.2184, + "step": 79750 + }, + { + "epoch": 2.213216201130876, + "grad_norm": 0.156584233045578, + "learning_rate": 3.4017313201268655e-05, + "loss": 0.222, + "step": 79800 + }, + { + "epoch": 2.21460292807394, + "grad_norm": 0.13037458062171936, + "learning_rate": 3.3904891800386426e-05, + "loss": 0.2176, + "step": 79850 + }, + { + "epoch": 2.2159896550170046, + "grad_norm": 0.13034582138061523, + "learning_rate": 3.379261854070815e-05, + "loss": 0.2204, + "step": 79900 + }, + { + "epoch": 2.217376381960069, + "grad_norm": 0.14262603223323822, + "learning_rate": 3.3680493673875735e-05, + "loss": 0.2204, + "step": 79950 + }, + { + "epoch": 2.2187631089031337, + "grad_norm": 0.12774142622947693, + "learning_rate": 3.3568517451198454e-05, + "loss": 0.2186, + "step": 80000 + }, + { + "epoch": 2.2187631089031337, + "eval_loss": 0.21821601688861847, + "eval_runtime": 500.5342, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 80000 + }, + { + "epoch": 2.220149835846198, + "grad_norm": 0.12721596658229828, + "learning_rate": 3.345669012365234e-05, + "loss": 0.2158, + "step": 80050 + }, + { + "epoch": 2.2215365627892627, + "grad_norm": 0.11402986943721771, + "learning_rate": 3.334501194187981e-05, + "loss": 0.2203, + "step": 80100 + }, + { + "epoch": 2.2229232897323272, + "grad_norm": 0.15116162598133087, + "learning_rate": 3.323348315618896e-05, + "loss": 0.2191, + "step": 80150 + }, + { + "epoch": 2.2243100166753913, + "grad_norm": 0.1261793076992035, + "learning_rate": 3.312210401655306e-05, + "loss": 0.2175, + "step": 80200 + }, + { + "epoch": 2.225696743618456, + "grad_norm": 0.14110060036182404, + "learning_rate": 3.301087477260987e-05, + "loss": 0.2175, + "step": 80250 + }, + { + "epoch": 2.2270834705615203, + "grad_norm": 0.11654798686504364, + "learning_rate": 3.2899795673661335e-05, + "loss": 0.2186, + "step": 80300 + }, + { + "epoch": 2.228470197504585, + "grad_norm": 0.11238402873277664, + "learning_rate": 3.278886696867275e-05, + "loss": 0.2174, + "step": 80350 + }, + { + "epoch": 2.2298569244476494, + "grad_norm": 0.14211712777614594, + "learning_rate": 3.267808890627239e-05, + "loss": 0.2206, + "step": 80400 + }, + { + "epoch": 2.231243651390714, + "grad_norm": 0.12510523200035095, + "learning_rate": 3.256746173475088e-05, + "loss": 0.2158, + "step": 80450 + }, + { + "epoch": 2.2326303783337784, + "grad_norm": 0.11722100526094437, + "learning_rate": 3.2456985702060694e-05, + "loss": 0.2249, + "step": 80500 + }, + { + "epoch": 2.2340171052768425, + "grad_norm": 0.15641877055168152, + "learning_rate": 3.234666105581542e-05, + "loss": 0.2159, + "step": 80550 + }, + { + "epoch": 2.235403832219907, + "grad_norm": 0.15180495381355286, + "learning_rate": 3.223648804328946e-05, + "loss": 0.2197, + "step": 80600 + }, + { + "epoch": 2.2367905591629715, + "grad_norm": 0.11919309943914413, + "learning_rate": 3.212646691141736e-05, + "loss": 0.2214, + "step": 80650 + }, + { + "epoch": 2.238177286106036, + "grad_norm": 0.12934933602809906, + "learning_rate": 3.2016597906793134e-05, + "loss": 0.2177, + "step": 80700 + }, + { + "epoch": 2.2395640130491006, + "grad_norm": 0.1465185284614563, + "learning_rate": 3.1906881275669975e-05, + "loss": 0.2174, + "step": 80750 + }, + { + "epoch": 2.240950739992165, + "grad_norm": 0.14115314185619354, + "learning_rate": 3.1797317263959415e-05, + "loss": 0.2173, + "step": 80800 + }, + { + "epoch": 2.2423374669352296, + "grad_norm": 0.12480226904153824, + "learning_rate": 3.1687906117231e-05, + "loss": 0.218, + "step": 80850 + }, + { + "epoch": 2.243724193878294, + "grad_norm": 0.1279391646385193, + "learning_rate": 3.157864808071167e-05, + "loss": 0.2171, + "step": 80900 + }, + { + "epoch": 2.245110920821358, + "grad_norm": 0.16663573682308197, + "learning_rate": 3.146954339928516e-05, + "loss": 0.218, + "step": 80950 + }, + { + "epoch": 2.2464976477644227, + "grad_norm": 0.14335867762565613, + "learning_rate": 3.136059231749145e-05, + "loss": 0.2205, + "step": 81000 + }, + { + "epoch": 2.2464976477644227, + "eval_loss": 0.21820572018623352, + "eval_runtime": 500.4609, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 81000 + }, + { + "epoch": 2.2478843747074873, + "grad_norm": 0.11844130605459213, + "learning_rate": 3.12517950795263e-05, + "loss": 0.2227, + "step": 81050 + }, + { + "epoch": 2.2492711016505518, + "grad_norm": 0.13108013570308685, + "learning_rate": 3.1143151929240696e-05, + "loss": 0.2228, + "step": 81100 + }, + { + "epoch": 2.2506578285936163, + "grad_norm": 0.1188526302576065, + "learning_rate": 3.103466311014013e-05, + "loss": 0.2151, + "step": 81150 + }, + { + "epoch": 2.252044555536681, + "grad_norm": 0.15748703479766846, + "learning_rate": 3.092632886538432e-05, + "loss": 0.2161, + "step": 81200 + }, + { + "epoch": 2.2534312824797453, + "grad_norm": 0.1411810666322708, + "learning_rate": 3.0818149437786414e-05, + "loss": 0.2204, + "step": 81250 + }, + { + "epoch": 2.2548180094228094, + "grad_norm": 0.16991600394248962, + "learning_rate": 3.0710125069812724e-05, + "loss": 0.2221, + "step": 81300 + }, + { + "epoch": 2.256204736365874, + "grad_norm": 0.14075744152069092, + "learning_rate": 3.060225600358184e-05, + "loss": 0.2181, + "step": 81350 + }, + { + "epoch": 2.2575914633089385, + "grad_norm": 0.15437312424182892, + "learning_rate": 3.0494542480864418e-05, + "loss": 0.2196, + "step": 81400 + }, + { + "epoch": 2.258978190252003, + "grad_norm": 0.1334267407655716, + "learning_rate": 3.038698474308236e-05, + "loss": 0.2194, + "step": 81450 + }, + { + "epoch": 2.2603649171950675, + "grad_norm": 0.11487523466348648, + "learning_rate": 3.0279583031308524e-05, + "loss": 0.2199, + "step": 81500 + }, + { + "epoch": 2.261751644138132, + "grad_norm": 0.1350301206111908, + "learning_rate": 3.017233758626593e-05, + "loss": 0.2249, + "step": 81550 + }, + { + "epoch": 2.2631383710811965, + "grad_norm": 0.13608673214912415, + "learning_rate": 3.006524864832748e-05, + "loss": 0.2176, + "step": 81600 + }, + { + "epoch": 2.2645250980242606, + "grad_norm": 0.11125027388334274, + "learning_rate": 2.9958316457515222e-05, + "loss": 0.2203, + "step": 81650 + }, + { + "epoch": 2.265911824967325, + "grad_norm": 0.11785724014043808, + "learning_rate": 2.9851541253499894e-05, + "loss": 0.2174, + "step": 81700 + }, + { + "epoch": 2.2672985519103896, + "grad_norm": 0.14160412549972534, + "learning_rate": 2.974492327560042e-05, + "loss": 0.2173, + "step": 81750 + }, + { + "epoch": 2.268685278853454, + "grad_norm": 0.10561185330152512, + "learning_rate": 2.9638462762783215e-05, + "loss": 0.2209, + "step": 81800 + }, + { + "epoch": 2.2700720057965187, + "grad_norm": 0.12722249329090118, + "learning_rate": 2.9532159953661886e-05, + "loss": 0.2187, + "step": 81850 + }, + { + "epoch": 2.271458732739583, + "grad_norm": 0.12994156777858734, + "learning_rate": 2.9426015086496474e-05, + "loss": 0.218, + "step": 81900 + }, + { + "epoch": 2.2728454596826477, + "grad_norm": 0.13776080310344696, + "learning_rate": 2.93200283991931e-05, + "loss": 0.2197, + "step": 81950 + }, + { + "epoch": 2.274232186625712, + "grad_norm": 0.14488635957241058, + "learning_rate": 2.921420012930328e-05, + "loss": 0.2177, + "step": 82000 + }, + { + "epoch": 2.274232186625712, + "eval_loss": 0.2180478274822235, + "eval_runtime": 500.2847, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 82000 + }, + { + "epoch": 2.2756189135687763, + "grad_norm": 0.15002746880054474, + "learning_rate": 2.9108530514023512e-05, + "loss": 0.219, + "step": 82050 + }, + { + "epoch": 2.277005640511841, + "grad_norm": 0.14248649775981903, + "learning_rate": 2.9003019790194684e-05, + "loss": 0.2169, + "step": 82100 + }, + { + "epoch": 2.2783923674549054, + "grad_norm": 0.11851143091917038, + "learning_rate": 2.8897668194301598e-05, + "loss": 0.2186, + "step": 82150 + }, + { + "epoch": 2.27977909439797, + "grad_norm": 0.14575442671775818, + "learning_rate": 2.8792475962472277e-05, + "loss": 0.22, + "step": 82200 + }, + { + "epoch": 2.2811658213410344, + "grad_norm": 0.12261557579040527, + "learning_rate": 2.868744333047767e-05, + "loss": 0.2196, + "step": 82250 + }, + { + "epoch": 2.282552548284099, + "grad_norm": 0.12256131321191788, + "learning_rate": 2.8582570533731002e-05, + "loss": 0.2174, + "step": 82300 + }, + { + "epoch": 2.283939275227163, + "grad_norm": 0.13776589930057526, + "learning_rate": 2.8477857807287156e-05, + "loss": 0.2138, + "step": 82350 + }, + { + "epoch": 2.2853260021702275, + "grad_norm": 0.16855372488498688, + "learning_rate": 2.8373305385842385e-05, + "loss": 0.2188, + "step": 82400 + }, + { + "epoch": 2.286712729113292, + "grad_norm": 0.13965797424316406, + "learning_rate": 2.8268913503733498e-05, + "loss": 0.2199, + "step": 82450 + }, + { + "epoch": 2.2880994560563566, + "grad_norm": 0.1284012645483017, + "learning_rate": 2.816468239493758e-05, + "loss": 0.2178, + "step": 82500 + }, + { + "epoch": 2.289486182999421, + "grad_norm": 0.1288595348596573, + "learning_rate": 2.8060612293071363e-05, + "loss": 0.2162, + "step": 82550 + }, + { + "epoch": 2.2908729099424856, + "grad_norm": 0.1225946918129921, + "learning_rate": 2.795670343139072e-05, + "loss": 0.2135, + "step": 82600 + }, + { + "epoch": 2.29225963688555, + "grad_norm": 0.1356947422027588, + "learning_rate": 2.7852956042790023e-05, + "loss": 0.2204, + "step": 82650 + }, + { + "epoch": 2.293646363828614, + "grad_norm": 0.11796356737613678, + "learning_rate": 2.774937035980184e-05, + "loss": 0.2209, + "step": 82700 + }, + { + "epoch": 2.2950330907716787, + "grad_norm": 0.14882323145866394, + "learning_rate": 2.7648013501010216e-05, + "loss": 0.2201, + "step": 82750 + }, + { + "epoch": 2.2964198177147432, + "grad_norm": 0.1550634354352951, + "learning_rate": 2.7544748679733266e-05, + "loss": 0.2214, + "step": 82800 + }, + { + "epoch": 2.2978065446578078, + "grad_norm": 0.12448450177907944, + "learning_rate": 2.7441646254864463e-05, + "loss": 0.2206, + "step": 82850 + }, + { + "epoch": 2.2991932716008723, + "grad_norm": 0.13499294221401215, + "learning_rate": 2.7338706457490704e-05, + "loss": 0.2182, + "step": 82900 + }, + { + "epoch": 2.300579998543937, + "grad_norm": 0.1396186500787735, + "learning_rate": 2.7235929518334515e-05, + "loss": 0.2214, + "step": 82950 + }, + { + "epoch": 2.3019667254870013, + "grad_norm": 0.1256234049797058, + "learning_rate": 2.7133315667753244e-05, + "loss": 0.2177, + "step": 83000 + }, + { + "epoch": 2.3019667254870013, + "eval_loss": 0.21797436475753784, + "eval_runtime": 500.3137, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 83000 + }, + { + "epoch": 2.3033534524300654, + "grad_norm": 0.13521268963813782, + "learning_rate": 2.7030865135738836e-05, + "loss": 0.2164, + "step": 83050 + }, + { + "epoch": 2.30474017937313, + "grad_norm": 0.1525382250547409, + "learning_rate": 2.692857815191714e-05, + "loss": 0.218, + "step": 83100 + }, + { + "epoch": 2.3061269063161944, + "grad_norm": 0.11054780334234238, + "learning_rate": 2.6826454945547452e-05, + "loss": 0.2179, + "step": 83150 + }, + { + "epoch": 2.307513633259259, + "grad_norm": 0.10860705375671387, + "learning_rate": 2.6724495745521928e-05, + "loss": 0.2198, + "step": 83200 + }, + { + "epoch": 2.3089003602023235, + "grad_norm": 0.1467871069908142, + "learning_rate": 2.6622700780365195e-05, + "loss": 0.2182, + "step": 83250 + }, + { + "epoch": 2.310287087145388, + "grad_norm": 0.13562606275081635, + "learning_rate": 2.6521070278233783e-05, + "loss": 0.219, + "step": 83300 + }, + { + "epoch": 2.3116738140884525, + "grad_norm": 0.14807383716106415, + "learning_rate": 2.6419604466915526e-05, + "loss": 0.2187, + "step": 83350 + }, + { + "epoch": 2.3130605410315166, + "grad_norm": 0.13751082122325897, + "learning_rate": 2.631830357382925e-05, + "loss": 0.2183, + "step": 83400 + }, + { + "epoch": 2.314447267974581, + "grad_norm": 0.13988618552684784, + "learning_rate": 2.621716782602396e-05, + "loss": 0.2189, + "step": 83450 + }, + { + "epoch": 2.3158339949176456, + "grad_norm": 0.13154689967632294, + "learning_rate": 2.611619745017878e-05, + "loss": 0.2189, + "step": 83500 + }, + { + "epoch": 2.31722072186071, + "grad_norm": 0.11987145990133286, + "learning_rate": 2.6015392672601924e-05, + "loss": 0.2176, + "step": 83550 + }, + { + "epoch": 2.3186074488037747, + "grad_norm": 0.12398877739906311, + "learning_rate": 2.5914753719230623e-05, + "loss": 0.2197, + "step": 83600 + }, + { + "epoch": 2.319994175746839, + "grad_norm": 0.11837327480316162, + "learning_rate": 2.581428081563031e-05, + "loss": 0.2177, + "step": 83650 + }, + { + "epoch": 2.3213809026899037, + "grad_norm": 0.14284634590148926, + "learning_rate": 2.571397418699436e-05, + "loss": 0.2183, + "step": 83700 + }, + { + "epoch": 2.3227676296329682, + "grad_norm": 0.11705781519412994, + "learning_rate": 2.561383405814336e-05, + "loss": 0.2185, + "step": 83750 + }, + { + "epoch": 2.3241543565760328, + "grad_norm": 0.13301438093185425, + "learning_rate": 2.55138606535248e-05, + "loss": 0.2187, + "step": 83800 + }, + { + "epoch": 2.325541083519097, + "grad_norm": 0.15416869521141052, + "learning_rate": 2.5414054197212467e-05, + "loss": 0.2185, + "step": 83850 + }, + { + "epoch": 2.3269278104621614, + "grad_norm": 0.13096120953559875, + "learning_rate": 2.5314414912905938e-05, + "loss": 0.2181, + "step": 83900 + }, + { + "epoch": 2.328314537405226, + "grad_norm": 0.19566649198532104, + "learning_rate": 2.5214943023930137e-05, + "loss": 0.2175, + "step": 83950 + }, + { + "epoch": 2.3297012643482904, + "grad_norm": 0.1359308511018753, + "learning_rate": 2.511563875323474e-05, + "loss": 0.2185, + "step": 84000 + }, + { + "epoch": 2.3297012643482904, + "eval_loss": 0.21784663200378418, + "eval_runtime": 500.2701, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 84000 + }, + { + "epoch": 2.331087991291355, + "grad_norm": 0.12440098077058792, + "learning_rate": 2.501650232339382e-05, + "loss": 0.2178, + "step": 84050 + }, + { + "epoch": 2.3324747182344194, + "grad_norm": 0.13704413175582886, + "learning_rate": 2.4917533956605153e-05, + "loss": 0.218, + "step": 84100 + }, + { + "epoch": 2.333861445177484, + "grad_norm": 0.13410316407680511, + "learning_rate": 2.481873387468995e-05, + "loss": 0.2135, + "step": 84150 + }, + { + "epoch": 2.335248172120548, + "grad_norm": 0.13458983600139618, + "learning_rate": 2.4720102299092117e-05, + "loss": 0.2206, + "step": 84200 + }, + { + "epoch": 2.3366348990636125, + "grad_norm": 0.13513872027397156, + "learning_rate": 2.4621639450877987e-05, + "loss": 0.216, + "step": 84250 + }, + { + "epoch": 2.338021626006677, + "grad_norm": 0.12012195587158203, + "learning_rate": 2.4523345550735665e-05, + "loss": 0.2154, + "step": 84300 + }, + { + "epoch": 2.3394083529497416, + "grad_norm": 0.1352149099111557, + "learning_rate": 2.4425220818974624e-05, + "loss": 0.2162, + "step": 84350 + }, + { + "epoch": 2.340795079892806, + "grad_norm": 0.12285764515399933, + "learning_rate": 2.4327265475525097e-05, + "loss": 0.2205, + "step": 84400 + }, + { + "epoch": 2.3421818068358706, + "grad_norm": 0.12704753875732422, + "learning_rate": 2.4229479739937745e-05, + "loss": 0.2186, + "step": 84450 + }, + { + "epoch": 2.343568533778935, + "grad_norm": 0.131776824593544, + "learning_rate": 2.4131863831383062e-05, + "loss": 0.219, + "step": 84500 + }, + { + "epoch": 2.3449552607219992, + "grad_norm": 0.12816810607910156, + "learning_rate": 2.4034417968650834e-05, + "loss": 0.2227, + "step": 84550 + }, + { + "epoch": 2.3463419876650637, + "grad_norm": 0.12194739282131195, + "learning_rate": 2.3937142370149857e-05, + "loss": 0.2175, + "step": 84600 + }, + { + "epoch": 2.3477287146081283, + "grad_norm": 0.17542821168899536, + "learning_rate": 2.3840037253907098e-05, + "loss": 0.2232, + "step": 84650 + }, + { + "epoch": 2.349115441551193, + "grad_norm": 0.14499437808990479, + "learning_rate": 2.3743102837567688e-05, + "loss": 0.2222, + "step": 84700 + }, + { + "epoch": 2.3505021684942573, + "grad_norm": 0.1500397026538849, + "learning_rate": 2.364633933839391e-05, + "loss": 0.2233, + "step": 84750 + }, + { + "epoch": 2.351888895437322, + "grad_norm": 0.11950129270553589, + "learning_rate": 2.354974697326514e-05, + "loss": 0.2174, + "step": 84800 + }, + { + "epoch": 2.3532756223803863, + "grad_norm": 0.11446011066436768, + "learning_rate": 2.3453325958677053e-05, + "loss": 0.2177, + "step": 84850 + }, + { + "epoch": 2.3546623493234504, + "grad_norm": 0.14888976514339447, + "learning_rate": 2.335707651074137e-05, + "loss": 0.2172, + "step": 84900 + }, + { + "epoch": 2.356049076266515, + "grad_norm": 0.14163066446781158, + "learning_rate": 2.3260998845185254e-05, + "loss": 0.218, + "step": 84950 + }, + { + "epoch": 2.3574358032095795, + "grad_norm": 0.12244392931461334, + "learning_rate": 2.3165093177350793e-05, + "loss": 0.2175, + "step": 85000 + }, + { + "epoch": 2.3574358032095795, + "eval_loss": 0.21762743592262268, + "eval_runtime": 500.9336, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 85000 + }, + { + "epoch": 2.358822530152644, + "grad_norm": 0.15694329142570496, + "learning_rate": 2.3069359722194617e-05, + "loss": 0.222, + "step": 85050 + }, + { + "epoch": 2.3602092570957085, + "grad_norm": 0.14646472036838531, + "learning_rate": 2.2973798694287362e-05, + "loss": 0.2174, + "step": 85100 + }, + { + "epoch": 2.361595984038773, + "grad_norm": 0.1558624804019928, + "learning_rate": 2.2878410307813235e-05, + "loss": 0.2209, + "step": 85150 + }, + { + "epoch": 2.3629827109818375, + "grad_norm": 0.15354704856872559, + "learning_rate": 2.2783194776569394e-05, + "loss": 0.2182, + "step": 85200 + }, + { + "epoch": 2.3643694379249016, + "grad_norm": 0.1462877094745636, + "learning_rate": 2.2688152313965684e-05, + "loss": 0.2188, + "step": 85250 + }, + { + "epoch": 2.365756164867966, + "grad_norm": 0.15048891305923462, + "learning_rate": 2.2593283133023945e-05, + "loss": 0.2161, + "step": 85300 + }, + { + "epoch": 2.3671428918110307, + "grad_norm": 0.12739485502243042, + "learning_rate": 2.2498587446377716e-05, + "loss": 0.2173, + "step": 85350 + }, + { + "epoch": 2.368529618754095, + "grad_norm": 0.12822887301445007, + "learning_rate": 2.2404065466271673e-05, + "loss": 0.222, + "step": 85400 + }, + { + "epoch": 2.3699163456971597, + "grad_norm": 0.1442478448152542, + "learning_rate": 2.2311602660026586e-05, + "loss": 0.2203, + "step": 85450 + }, + { + "epoch": 2.371303072640224, + "grad_norm": 0.15499895811080933, + "learning_rate": 2.2217425243509928e-05, + "loss": 0.2198, + "step": 85500 + }, + { + "epoch": 2.3726897995832887, + "grad_norm": 0.12855781614780426, + "learning_rate": 2.212342216371176e-05, + "loss": 0.221, + "step": 85550 + }, + { + "epoch": 2.374076526526353, + "grad_norm": 0.13640813529491425, + "learning_rate": 2.2029593631324417e-05, + "loss": 0.2175, + "step": 85600 + }, + { + "epoch": 2.3754632534694173, + "grad_norm": 0.13200990855693817, + "learning_rate": 2.19359398566489e-05, + "loss": 0.2175, + "step": 85650 + }, + { + "epoch": 2.376849980412482, + "grad_norm": 0.14753013849258423, + "learning_rate": 2.1842461049594677e-05, + "loss": 0.2188, + "step": 85700 + }, + { + "epoch": 2.3782367073555464, + "grad_norm": 0.11965631693601608, + "learning_rate": 2.17491574196789e-05, + "loss": 0.2191, + "step": 85750 + }, + { + "epoch": 2.379623434298611, + "grad_norm": 0.11599334329366684, + "learning_rate": 2.1656029176026193e-05, + "loss": 0.2199, + "step": 85800 + }, + { + "epoch": 2.3810101612416754, + "grad_norm": 0.14910413324832916, + "learning_rate": 2.1563076527367996e-05, + "loss": 0.2175, + "step": 85850 + }, + { + "epoch": 2.38239688818474, + "grad_norm": 0.1290595680475235, + "learning_rate": 2.147029968204226e-05, + "loss": 0.2163, + "step": 85900 + }, + { + "epoch": 2.383783615127804, + "grad_norm": 0.13954681158065796, + "learning_rate": 2.1377698847992878e-05, + "loss": 0.2253, + "step": 85950 + }, + { + "epoch": 2.3851703420708685, + "grad_norm": 0.10592395067214966, + "learning_rate": 2.1285274232769194e-05, + "loss": 0.2204, + "step": 86000 + }, + { + "epoch": 2.3851703420708685, + "eval_loss": 0.2176109105348587, + "eval_runtime": 500.5123, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 86000 + }, + { + "epoch": 2.386557069013933, + "grad_norm": 0.1263597160577774, + "learning_rate": 2.1193026043525655e-05, + "loss": 0.2194, + "step": 86050 + }, + { + "epoch": 2.3879437959569976, + "grad_norm": 0.13431338965892792, + "learning_rate": 2.1100954487021252e-05, + "loss": 0.2156, + "step": 86100 + }, + { + "epoch": 2.389330522900062, + "grad_norm": 0.13657350838184357, + "learning_rate": 2.10090597696191e-05, + "loss": 0.2163, + "step": 86150 + }, + { + "epoch": 2.3907172498431266, + "grad_norm": 0.1250746250152588, + "learning_rate": 2.0917342097285897e-05, + "loss": 0.2192, + "step": 86200 + }, + { + "epoch": 2.392103976786191, + "grad_norm": 0.11769583076238632, + "learning_rate": 2.0825801675591618e-05, + "loss": 0.22, + "step": 86250 + }, + { + "epoch": 2.393490703729255, + "grad_norm": 0.1266479194164276, + "learning_rate": 2.073443870970886e-05, + "loss": 0.2205, + "step": 86300 + }, + { + "epoch": 2.3948774306723197, + "grad_norm": 0.1402258574962616, + "learning_rate": 2.0643253404412564e-05, + "loss": 0.2161, + "step": 86350 + }, + { + "epoch": 2.3962641576153842, + "grad_norm": 0.15390367805957794, + "learning_rate": 2.055224596407942e-05, + "loss": 0.2153, + "step": 86400 + }, + { + "epoch": 2.3976508845584488, + "grad_norm": 0.15276868641376495, + "learning_rate": 2.0461416592687487e-05, + "loss": 0.2193, + "step": 86450 + }, + { + "epoch": 2.3990376115015133, + "grad_norm": 0.14889566600322723, + "learning_rate": 2.0370765493815735e-05, + "loss": 0.2186, + "step": 86500 + }, + { + "epoch": 2.400424338444578, + "grad_norm": 0.12452477216720581, + "learning_rate": 2.0280292870643524e-05, + "loss": 0.2249, + "step": 86550 + }, + { + "epoch": 2.4018110653876423, + "grad_norm": 0.12419555336236954, + "learning_rate": 2.0189998925950227e-05, + "loss": 0.219, + "step": 86600 + }, + { + "epoch": 2.4031977923307064, + "grad_norm": 0.12875986099243164, + "learning_rate": 2.0099883862114688e-05, + "loss": 0.2214, + "step": 86650 + }, + { + "epoch": 2.404584519273771, + "grad_norm": 0.18010209500789642, + "learning_rate": 2.0009947881114888e-05, + "loss": 0.2176, + "step": 86700 + }, + { + "epoch": 2.4059712462168354, + "grad_norm": 0.142224982380867, + "learning_rate": 1.992019118452735e-05, + "loss": 0.2147, + "step": 86750 + }, + { + "epoch": 2.4073579731599, + "grad_norm": 0.13554586470127106, + "learning_rate": 1.9830613973526823e-05, + "loss": 0.2146, + "step": 86800 + }, + { + "epoch": 2.4087447001029645, + "grad_norm": 0.16745540499687195, + "learning_rate": 1.974121644888569e-05, + "loss": 0.2172, + "step": 86850 + }, + { + "epoch": 2.410131427046029, + "grad_norm": 0.138095885515213, + "learning_rate": 1.9651998810973737e-05, + "loss": 0.2236, + "step": 86900 + }, + { + "epoch": 2.4115181539890935, + "grad_norm": 0.12732721865177155, + "learning_rate": 1.9562961259757418e-05, + "loss": 0.2195, + "step": 86950 + }, + { + "epoch": 2.4129048809321576, + "grad_norm": 0.14291352033615112, + "learning_rate": 1.9474103994799643e-05, + "loss": 0.2199, + "step": 87000 + }, + { + "epoch": 2.4129048809321576, + "eval_loss": 0.21756121516227722, + "eval_runtime": 500.6311, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 87000 + }, + { + "epoch": 2.414291607875222, + "grad_norm": 0.15151503682136536, + "learning_rate": 1.9385427215259166e-05, + "loss": 0.22, + "step": 87050 + }, + { + "epoch": 2.4156783348182866, + "grad_norm": 0.14093713462352753, + "learning_rate": 1.9296931119890283e-05, + "loss": 0.2168, + "step": 87100 + }, + { + "epoch": 2.417065061761351, + "grad_norm": 0.15471357107162476, + "learning_rate": 1.9208615907042316e-05, + "loss": 0.2209, + "step": 87150 + }, + { + "epoch": 2.4184517887044157, + "grad_norm": 0.15164633095264435, + "learning_rate": 1.9120481774659083e-05, + "loss": 0.2207, + "step": 87200 + }, + { + "epoch": 2.41983851564748, + "grad_norm": 0.1304929554462433, + "learning_rate": 1.9032528920278625e-05, + "loss": 0.2192, + "step": 87250 + }, + { + "epoch": 2.4212252425905447, + "grad_norm": 0.1501680463552475, + "learning_rate": 1.8944757541032664e-05, + "loss": 0.2209, + "step": 87300 + }, + { + "epoch": 2.4226119695336092, + "grad_norm": 0.12184792011976242, + "learning_rate": 1.8857167833646184e-05, + "loss": 0.219, + "step": 87350 + }, + { + "epoch": 2.4239986964766738, + "grad_norm": 0.14270274341106415, + "learning_rate": 1.8769759994436896e-05, + "loss": 0.2161, + "step": 87400 + }, + { + "epoch": 2.425385423419738, + "grad_norm": 0.14052851498126984, + "learning_rate": 1.868253421931503e-05, + "loss": 0.2177, + "step": 87450 + }, + { + "epoch": 2.4267721503628024, + "grad_norm": 0.14709405601024628, + "learning_rate": 1.859549070378259e-05, + "loss": 0.218, + "step": 87500 + }, + { + "epoch": 2.428158877305867, + "grad_norm": 0.15017499029636383, + "learning_rate": 1.8508629642933207e-05, + "loss": 0.2171, + "step": 87550 + }, + { + "epoch": 2.4295456042489314, + "grad_norm": 0.11489958316087723, + "learning_rate": 1.842195123145152e-05, + "loss": 0.2208, + "step": 87600 + }, + { + "epoch": 2.430932331191996, + "grad_norm": 0.1223435029387474, + "learning_rate": 1.8335455663612744e-05, + "loss": 0.2186, + "step": 87650 + }, + { + "epoch": 2.4323190581350604, + "grad_norm": 0.10422486811876297, + "learning_rate": 1.8249143133282344e-05, + "loss": 0.2169, + "step": 87700 + }, + { + "epoch": 2.433705785078125, + "grad_norm": 0.11790075153112411, + "learning_rate": 1.8163013833915532e-05, + "loss": 0.2201, + "step": 87750 + }, + { + "epoch": 2.435092512021189, + "grad_norm": 0.12330956012010574, + "learning_rate": 1.807706795855685e-05, + "loss": 0.2181, + "step": 87800 + }, + { + "epoch": 2.4364792389642536, + "grad_norm": 0.12955226004123688, + "learning_rate": 1.7991305699839623e-05, + "loss": 0.2158, + "step": 87850 + }, + { + "epoch": 2.437865965907318, + "grad_norm": 0.11457941681146622, + "learning_rate": 1.790572724998577e-05, + "loss": 0.2155, + "step": 87900 + }, + { + "epoch": 2.4392526928503826, + "grad_norm": 0.14329944550991058, + "learning_rate": 1.782033280080513e-05, + "loss": 0.2175, + "step": 87950 + }, + { + "epoch": 2.440639419793447, + "grad_norm": 0.15041890740394592, + "learning_rate": 1.7735122543695205e-05, + "loss": 0.2199, + "step": 88000 + }, + { + "epoch": 2.440639419793447, + "eval_loss": 0.21746821701526642, + "eval_runtime": 500.6597, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 88000 + }, + { + "epoch": 2.4420261467365116, + "grad_norm": 0.13580143451690674, + "learning_rate": 1.765009666964056e-05, + "loss": 0.2163, + "step": 88050 + }, + { + "epoch": 2.443412873679576, + "grad_norm": 0.14240418374538422, + "learning_rate": 1.7565255369212662e-05, + "loss": 0.2221, + "step": 88100 + }, + { + "epoch": 2.4447996006226402, + "grad_norm": 0.13065385818481445, + "learning_rate": 1.748059883256913e-05, + "loss": 0.2226, + "step": 88150 + }, + { + "epoch": 2.4461863275657048, + "grad_norm": 0.13227535784244537, + "learning_rate": 1.7396127249453542e-05, + "loss": 0.2152, + "step": 88200 + }, + { + "epoch": 2.4475730545087693, + "grad_norm": 0.13343574106693268, + "learning_rate": 1.7311840809194934e-05, + "loss": 0.2162, + "step": 88250 + }, + { + "epoch": 2.448959781451834, + "grad_norm": 0.14660726487636566, + "learning_rate": 1.7227739700707322e-05, + "loss": 0.2183, + "step": 88300 + }, + { + "epoch": 2.4503465083948983, + "grad_norm": 0.13315744698047638, + "learning_rate": 1.7143824112489413e-05, + "loss": 0.218, + "step": 88350 + }, + { + "epoch": 2.451733235337963, + "grad_norm": 0.15965452790260315, + "learning_rate": 1.7060094232624012e-05, + "loss": 0.2169, + "step": 88400 + }, + { + "epoch": 2.4531199622810274, + "grad_norm": 0.1352396458387375, + "learning_rate": 1.6976550248777747e-05, + "loss": 0.217, + "step": 88450 + }, + { + "epoch": 2.4545066892240914, + "grad_norm": 0.16009649634361267, + "learning_rate": 1.6893192348200582e-05, + "loss": 0.2179, + "step": 88500 + }, + { + "epoch": 2.455893416167156, + "grad_norm": 0.13520574569702148, + "learning_rate": 1.6810020717725427e-05, + "loss": 0.2179, + "step": 88550 + }, + { + "epoch": 2.4572801431102205, + "grad_norm": 0.1435248851776123, + "learning_rate": 1.6727035543767634e-05, + "loss": 0.2185, + "step": 88600 + }, + { + "epoch": 2.458666870053285, + "grad_norm": 0.12575069069862366, + "learning_rate": 1.6644237012324716e-05, + "loss": 0.2224, + "step": 88650 + }, + { + "epoch": 2.4600535969963495, + "grad_norm": 0.143171489238739, + "learning_rate": 1.6561625308975782e-05, + "loss": 0.2159, + "step": 88700 + }, + { + "epoch": 2.461440323939414, + "grad_norm": 0.1348053365945816, + "learning_rate": 1.6479200618881275e-05, + "loss": 0.2171, + "step": 88750 + }, + { + "epoch": 2.4628270508824786, + "grad_norm": 0.12457796931266785, + "learning_rate": 1.639696312678245e-05, + "loss": 0.2211, + "step": 88800 + }, + { + "epoch": 2.4642137778255426, + "grad_norm": 0.1280537098646164, + "learning_rate": 1.6314913017000955e-05, + "loss": 0.218, + "step": 88850 + }, + { + "epoch": 2.465600504768607, + "grad_norm": 0.12102089077234268, + "learning_rate": 1.6233050473438483e-05, + "loss": 0.2186, + "step": 88900 + }, + { + "epoch": 2.4669872317116717, + "grad_norm": 0.15358726680278778, + "learning_rate": 1.615137567957634e-05, + "loss": 0.2192, + "step": 88950 + }, + { + "epoch": 2.468373958654736, + "grad_norm": 0.1613384634256363, + "learning_rate": 1.6069888818475022e-05, + "loss": 0.2191, + "step": 89000 + }, + { + "epoch": 2.468373958654736, + "eval_loss": 0.21739034354686737, + "eval_runtime": 500.3264, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 89000 + }, + { + "epoch": 2.4697606855978007, + "grad_norm": 0.12117033451795578, + "learning_rate": 1.5988590072773744e-05, + "loss": 0.2202, + "step": 89050 + }, + { + "epoch": 2.4711474125408652, + "grad_norm": 0.13079126179218292, + "learning_rate": 1.590747962469018e-05, + "loss": 0.2146, + "step": 89100 + }, + { + "epoch": 2.4725341394839297, + "grad_norm": 0.12103881686925888, + "learning_rate": 1.582655765601989e-05, + "loss": 0.2182, + "step": 89150 + }, + { + "epoch": 2.473920866426994, + "grad_norm": 0.12128273397684097, + "learning_rate": 1.574582434813604e-05, + "loss": 0.217, + "step": 89200 + }, + { + "epoch": 2.4753075933700583, + "grad_norm": 0.13186509907245636, + "learning_rate": 1.5665279881988946e-05, + "loss": 0.2181, + "step": 89250 + }, + { + "epoch": 2.476694320313123, + "grad_norm": 0.17148008942604065, + "learning_rate": 1.5584924438105586e-05, + "loss": 0.2235, + "step": 89300 + }, + { + "epoch": 2.4780810472561874, + "grad_norm": 0.12658308446407318, + "learning_rate": 1.550475819658942e-05, + "loss": 0.219, + "step": 89350 + }, + { + "epoch": 2.479467774199252, + "grad_norm": 0.13896940648555756, + "learning_rate": 1.5424781337119685e-05, + "loss": 0.2193, + "step": 89400 + }, + { + "epoch": 2.4808545011423164, + "grad_norm": 0.13384407758712769, + "learning_rate": 1.534658792605652e-05, + "loss": 0.2173, + "step": 89450 + }, + { + "epoch": 2.482241228085381, + "grad_norm": 0.13054698705673218, + "learning_rate": 1.526698657146697e-05, + "loss": 0.217, + "step": 89500 + }, + { + "epoch": 2.483627955028445, + "grad_norm": 0.11624244600534439, + "learning_rate": 1.5187575131849496e-05, + "loss": 0.2174, + "step": 89550 + }, + { + "epoch": 2.4850146819715095, + "grad_norm": 0.13589714467525482, + "learning_rate": 1.5109936348064579e-05, + "loss": 0.2108, + "step": 89600 + }, + { + "epoch": 2.486401408914574, + "grad_norm": 0.1458209604024887, + "learning_rate": 1.5030901464780044e-05, + "loss": 0.2138, + "step": 89650 + }, + { + "epoch": 2.4877881358576386, + "grad_norm": 0.1267354041337967, + "learning_rate": 1.4952057025613075e-05, + "loss": 0.2154, + "step": 89700 + }, + { + "epoch": 2.489174862800703, + "grad_norm": 0.13832834362983704, + "learning_rate": 1.4873403207280445e-05, + "loss": 0.2182, + "step": 89750 + }, + { + "epoch": 2.4905615897437676, + "grad_norm": 0.15721414983272552, + "learning_rate": 1.4794940186071582e-05, + "loss": 0.2181, + "step": 89800 + }, + { + "epoch": 2.491948316686832, + "grad_norm": 0.13558489084243774, + "learning_rate": 1.471666813784831e-05, + "loss": 0.2194, + "step": 89850 + }, + { + "epoch": 2.493335043629896, + "grad_norm": 0.14502158761024475, + "learning_rate": 1.4638587238044466e-05, + "loss": 0.2197, + "step": 89900 + }, + { + "epoch": 2.4947217705729607, + "grad_norm": 0.14285750687122345, + "learning_rate": 1.4560697661665346e-05, + "loss": 0.2185, + "step": 89950 + }, + { + "epoch": 2.4961084975160253, + "grad_norm": 0.14996595680713654, + "learning_rate": 1.4482999583287549e-05, + "loss": 0.2188, + "step": 90000 + }, + { + "epoch": 2.4961084975160253, + "eval_loss": 0.21735654771327972, + "eval_runtime": 500.6813, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 90000 + }, + { + "epoch": 2.49749522445909, + "grad_norm": 0.12452629953622818, + "learning_rate": 1.4405493177058382e-05, + "loss": 0.2141, + "step": 90050 + }, + { + "epoch": 2.4988819514021543, + "grad_norm": 0.11585172265768051, + "learning_rate": 1.432817861669561e-05, + "loss": 0.22, + "step": 90100 + }, + { + "epoch": 2.500268678345219, + "grad_norm": 0.1297215074300766, + "learning_rate": 1.4251056075486935e-05, + "loss": 0.217, + "step": 90150 + }, + { + "epoch": 2.5016554052882833, + "grad_norm": 0.1487419456243515, + "learning_rate": 1.4174125726289755e-05, + "loss": 0.2192, + "step": 90200 + }, + { + "epoch": 2.5030421322313474, + "grad_norm": 0.13567551970481873, + "learning_rate": 1.409738774153062e-05, + "loss": 0.2171, + "step": 90250 + }, + { + "epoch": 2.5044288591744124, + "grad_norm": 0.12301863729953766, + "learning_rate": 1.4020842293205016e-05, + "loss": 0.2144, + "step": 90300 + }, + { + "epoch": 2.5058155861174765, + "grad_norm": 0.12166890501976013, + "learning_rate": 1.394448955287685e-05, + "loss": 0.2216, + "step": 90350 + }, + { + "epoch": 2.507202313060541, + "grad_norm": 0.1442263275384903, + "learning_rate": 1.386832969167805e-05, + "loss": 0.2166, + "step": 90400 + }, + { + "epoch": 2.5085890400036055, + "grad_norm": 0.1268533319234848, + "learning_rate": 1.3792362880308374e-05, + "loss": 0.2185, + "step": 90450 + }, + { + "epoch": 2.50997576694667, + "grad_norm": 0.11250314116477966, + "learning_rate": 1.3716589289034731e-05, + "loss": 0.2174, + "step": 90500 + }, + { + "epoch": 2.5113624938897345, + "grad_norm": 0.12006182223558426, + "learning_rate": 1.3641009087691103e-05, + "loss": 0.219, + "step": 90550 + }, + { + "epoch": 2.5127492208327986, + "grad_norm": 0.15108701586723328, + "learning_rate": 1.3565622445677906e-05, + "loss": 0.2219, + "step": 90600 + }, + { + "epoch": 2.5141359477758636, + "grad_norm": 0.1432884782552719, + "learning_rate": 1.3490429531961802e-05, + "loss": 0.2158, + "step": 90650 + }, + { + "epoch": 2.5155226747189277, + "grad_norm": 0.15546324849128723, + "learning_rate": 1.3415430515075178e-05, + "loss": 0.2208, + "step": 90700 + }, + { + "epoch": 2.516909401661992, + "grad_norm": 0.126853808760643, + "learning_rate": 1.3340625563115905e-05, + "loss": 0.217, + "step": 90750 + }, + { + "epoch": 2.5182961286050567, + "grad_norm": 0.1568535566329956, + "learning_rate": 1.3266014843746832e-05, + "loss": 0.2153, + "step": 90800 + }, + { + "epoch": 2.519682855548121, + "grad_norm": 0.15077820420265198, + "learning_rate": 1.3191598524195537e-05, + "loss": 0.22, + "step": 90850 + }, + { + "epoch": 2.5210695824911857, + "grad_norm": 0.12230156362056732, + "learning_rate": 1.3117376771253775e-05, + "loss": 0.2172, + "step": 90900 + }, + { + "epoch": 2.52245630943425, + "grad_norm": 0.15573225915431976, + "learning_rate": 1.30433497512773e-05, + "loss": 0.2178, + "step": 90950 + }, + { + "epoch": 2.5238430363773148, + "grad_norm": 0.13438810408115387, + "learning_rate": 1.2969517630185401e-05, + "loss": 0.2168, + "step": 91000 + }, + { + "epoch": 2.5238430363773148, + "eval_loss": 0.21724413335323334, + "eval_runtime": 500.5096, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 91000 + }, + { + "epoch": 2.525229763320379, + "grad_norm": 0.11917885392904282, + "learning_rate": 1.2895880573460462e-05, + "loss": 0.2173, + "step": 91050 + }, + { + "epoch": 2.5266164902634434, + "grad_norm": 0.127394899725914, + "learning_rate": 1.2822438746147769e-05, + "loss": 0.2163, + "step": 91100 + }, + { + "epoch": 2.528003217206508, + "grad_norm": 0.16986310482025146, + "learning_rate": 1.2749192312854929e-05, + "loss": 0.2234, + "step": 91150 + }, + { + "epoch": 2.5293899441495724, + "grad_norm": 0.12022554129362106, + "learning_rate": 1.2676141437751665e-05, + "loss": 0.2173, + "step": 91200 + }, + { + "epoch": 2.530776671092637, + "grad_norm": 0.13442516326904297, + "learning_rate": 1.2603286284569382e-05, + "loss": 0.213, + "step": 91250 + }, + { + "epoch": 2.5321633980357015, + "grad_norm": 0.1352687031030655, + "learning_rate": 1.2530627016600826e-05, + "loss": 0.2209, + "step": 91300 + }, + { + "epoch": 2.533550124978766, + "grad_norm": 0.11729501932859421, + "learning_rate": 1.245816379669963e-05, + "loss": 0.2173, + "step": 91350 + }, + { + "epoch": 2.53493685192183, + "grad_norm": 0.13084493577480316, + "learning_rate": 1.2385896787280072e-05, + "loss": 0.2149, + "step": 91400 + }, + { + "epoch": 2.5363235788648946, + "grad_norm": 0.12369387596845627, + "learning_rate": 1.2313826150316698e-05, + "loss": 0.2196, + "step": 91450 + }, + { + "epoch": 2.537710305807959, + "grad_norm": 0.13943591713905334, + "learning_rate": 1.2241952047343796e-05, + "loss": 0.22, + "step": 91500 + }, + { + "epoch": 2.5390970327510236, + "grad_norm": 0.13443367183208466, + "learning_rate": 1.2170274639455282e-05, + "loss": 0.2201, + "step": 91550 + }, + { + "epoch": 2.540483759694088, + "grad_norm": 0.12157223373651505, + "learning_rate": 1.2098794087304088e-05, + "loss": 0.2199, + "step": 91600 + }, + { + "epoch": 2.5418704866371526, + "grad_norm": 0.13608892261981964, + "learning_rate": 1.2027510551102084e-05, + "loss": 0.2219, + "step": 91650 + }, + { + "epoch": 2.543257213580217, + "grad_norm": 0.11314541846513748, + "learning_rate": 1.1956424190619408e-05, + "loss": 0.2161, + "step": 91700 + }, + { + "epoch": 2.5446439405232812, + "grad_norm": 0.12370922416448593, + "learning_rate": 1.188553516518437e-05, + "loss": 0.2159, + "step": 91750 + }, + { + "epoch": 2.5460306674663458, + "grad_norm": 0.1209016963839531, + "learning_rate": 1.1814843633682904e-05, + "loss": 0.2144, + "step": 91800 + }, + { + "epoch": 2.5474173944094103, + "grad_norm": 0.1325283944606781, + "learning_rate": 1.174434975455837e-05, + "loss": 0.2193, + "step": 91850 + }, + { + "epoch": 2.548804121352475, + "grad_norm": 0.15919071435928345, + "learning_rate": 1.1674053685811048e-05, + "loss": 0.2152, + "step": 91900 + }, + { + "epoch": 2.5501908482955393, + "grad_norm": 0.11343590915203094, + "learning_rate": 1.1603955584997916e-05, + "loss": 0.2174, + "step": 91950 + }, + { + "epoch": 2.551577575238604, + "grad_norm": 0.13005082309246063, + "learning_rate": 1.1534055609232219e-05, + "loss": 0.218, + "step": 92000 + }, + { + "epoch": 2.551577575238604, + "eval_loss": 0.21716812252998352, + "eval_runtime": 500.3626, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 92000 + }, + { + "epoch": 2.5529643021816684, + "grad_norm": 0.11071674525737762, + "learning_rate": 1.1464353915183146e-05, + "loss": 0.2193, + "step": 92050 + }, + { + "epoch": 2.5543510291247324, + "grad_norm": 0.13590101897716522, + "learning_rate": 1.1394850659075484e-05, + "loss": 0.2188, + "step": 92100 + }, + { + "epoch": 2.555737756067797, + "grad_norm": 0.13247303664684296, + "learning_rate": 1.1325545996689192e-05, + "loss": 0.217, + "step": 92150 + }, + { + "epoch": 2.5571244830108615, + "grad_norm": 0.1305960714817047, + "learning_rate": 1.1256440083359188e-05, + "loss": 0.2148, + "step": 92200 + }, + { + "epoch": 2.558511209953926, + "grad_norm": 0.13151520490646362, + "learning_rate": 1.1187533073974855e-05, + "loss": 0.2177, + "step": 92250 + }, + { + "epoch": 2.5598979368969905, + "grad_norm": 0.13488546013832092, + "learning_rate": 1.111882512297986e-05, + "loss": 0.2202, + "step": 92300 + }, + { + "epoch": 2.561284663840055, + "grad_norm": 0.1446569859981537, + "learning_rate": 1.1050316384371617e-05, + "loss": 0.2173, + "step": 92350 + }, + { + "epoch": 2.5626713907831196, + "grad_norm": 0.1253671795129776, + "learning_rate": 1.0982007011701101e-05, + "loss": 0.2166, + "step": 92400 + }, + { + "epoch": 2.5640581177261836, + "grad_norm": 0.12608706951141357, + "learning_rate": 1.0913897158072405e-05, + "loss": 0.2158, + "step": 92450 + }, + { + "epoch": 2.565444844669248, + "grad_norm": 0.12183616310358047, + "learning_rate": 1.0845986976142497e-05, + "loss": 0.2156, + "step": 92500 + }, + { + "epoch": 2.5668315716123127, + "grad_norm": 0.12636205554008484, + "learning_rate": 1.0778276618120708e-05, + "loss": 0.2159, + "step": 92550 + }, + { + "epoch": 2.568218298555377, + "grad_norm": 0.1183227151632309, + "learning_rate": 1.0710766235768588e-05, + "loss": 0.2125, + "step": 92600 + }, + { + "epoch": 2.5696050254984417, + "grad_norm": 0.12173448503017426, + "learning_rate": 1.0643455980399453e-05, + "loss": 0.2161, + "step": 92650 + }, + { + "epoch": 2.5709917524415062, + "grad_norm": 0.1476822942495346, + "learning_rate": 1.0576346002878023e-05, + "loss": 0.2192, + "step": 92700 + }, + { + "epoch": 2.5723784793845708, + "grad_norm": 0.11871439963579178, + "learning_rate": 1.0509436453620202e-05, + "loss": 0.2204, + "step": 92750 + }, + { + "epoch": 2.573765206327635, + "grad_norm": 0.15758539736270905, + "learning_rate": 1.0442727482592596e-05, + "loss": 0.2204, + "step": 92800 + }, + { + "epoch": 2.5751519332706994, + "grad_norm": 0.13511809706687927, + "learning_rate": 1.0376219239312279e-05, + "loss": 0.2153, + "step": 92850 + }, + { + "epoch": 2.576538660213764, + "grad_norm": 0.13542090356349945, + "learning_rate": 1.0309911872846455e-05, + "loss": 0.218, + "step": 92900 + }, + { + "epoch": 2.5779253871568284, + "grad_norm": 0.15104569494724274, + "learning_rate": 1.0243805531812067e-05, + "loss": 0.2207, + "step": 92950 + }, + { + "epoch": 2.579312114099893, + "grad_norm": 0.14258405566215515, + "learning_rate": 1.017790036437547e-05, + "loss": 0.2187, + "step": 93000 + }, + { + "epoch": 2.579312114099893, + "eval_loss": 0.2171117663383484, + "eval_runtime": 500.4014, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 93000 + }, + { + "epoch": 2.5806988410429574, + "grad_norm": 0.13689257204532623, + "learning_rate": 1.0112196518252159e-05, + "loss": 0.2162, + "step": 93050 + }, + { + "epoch": 2.582085567986022, + "grad_norm": 0.1326596736907959, + "learning_rate": 1.0046694140706415e-05, + "loss": 0.2179, + "step": 93100 + }, + { + "epoch": 2.583472294929086, + "grad_norm": 0.1485782414674759, + "learning_rate": 9.981393378550896e-06, + "loss": 0.2176, + "step": 93150 + }, + { + "epoch": 2.5848590218721506, + "grad_norm": 0.10767937451601028, + "learning_rate": 9.91629437814644e-06, + "loss": 0.2224, + "step": 93200 + }, + { + "epoch": 2.586245748815215, + "grad_norm": 0.12032176554203033, + "learning_rate": 9.851397285401597e-06, + "loss": 0.2171, + "step": 93250 + }, + { + "epoch": 2.5876324757582796, + "grad_norm": 0.13087831437587738, + "learning_rate": 9.786702245772484e-06, + "loss": 0.2161, + "step": 93300 + }, + { + "epoch": 2.589019202701344, + "grad_norm": 0.15828031301498413, + "learning_rate": 9.722209404262228e-06, + "loss": 0.2203, + "step": 93350 + }, + { + "epoch": 2.5904059296444086, + "grad_norm": 0.1374100148677826, + "learning_rate": 9.65791890542087e-06, + "loss": 0.222, + "step": 93400 + }, + { + "epoch": 2.591792656587473, + "grad_norm": 0.1500755399465561, + "learning_rate": 9.593830893344824e-06, + "loss": 0.2158, + "step": 93450 + }, + { + "epoch": 2.5931793835305372, + "grad_norm": 0.14486078917980194, + "learning_rate": 9.529945511676774e-06, + "loss": 0.2222, + "step": 93500 + }, + { + "epoch": 2.594566110473602, + "grad_norm": 0.14060664176940918, + "learning_rate": 9.466262903605138e-06, + "loss": 0.2129, + "step": 93550 + }, + { + "epoch": 2.5959528374166663, + "grad_norm": 0.15706004202365875, + "learning_rate": 9.40278321186394e-06, + "loss": 0.2192, + "step": 93600 + }, + { + "epoch": 2.597339564359731, + "grad_norm": 0.16333714127540588, + "learning_rate": 9.339506578732348e-06, + "loss": 0.2146, + "step": 93650 + }, + { + "epoch": 2.5987262913027953, + "grad_norm": 0.12438689172267914, + "learning_rate": 9.276433146034425e-06, + "loss": 0.2166, + "step": 93700 + }, + { + "epoch": 2.60011301824586, + "grad_norm": 0.1589004248380661, + "learning_rate": 9.213563055138807e-06, + "loss": 0.2181, + "step": 93750 + }, + { + "epoch": 2.6014997451889244, + "grad_norm": 0.11392497271299362, + "learning_rate": 9.150896446958324e-06, + "loss": 0.2142, + "step": 93800 + }, + { + "epoch": 2.6028864721319884, + "grad_norm": 0.13960012793540955, + "learning_rate": 9.088433461949809e-06, + "loss": 0.2166, + "step": 93850 + }, + { + "epoch": 2.6042731990750534, + "grad_norm": 0.13375328481197357, + "learning_rate": 9.027417426767926e-06, + "loss": 0.2122, + "step": 93900 + }, + { + "epoch": 2.6056599260181175, + "grad_norm": 0.13815660774707794, + "learning_rate": 8.96535802822891e-06, + "loss": 0.2178, + "step": 93950 + }, + { + "epoch": 2.607046652961182, + "grad_norm": 0.12065936625003815, + "learning_rate": 8.903502668715357e-06, + "loss": 0.2213, + "step": 94000 + }, + { + "epoch": 2.607046652961182, + "eval_loss": 0.21707016229629517, + "eval_runtime": 500.3557, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 94000 + }, + { + "epoch": 2.6084333799042465, + "grad_norm": 0.12994138896465302, + "learning_rate": 8.843082508667189e-06, + "loss": 0.218, + "step": 94050 + }, + { + "epoch": 2.609820106847311, + "grad_norm": 0.13512447476387024, + "learning_rate": 8.781631554994407e-06, + "loss": 0.2198, + "step": 94100 + }, + { + "epoch": 2.6112068337903755, + "grad_norm": 0.15621940791606903, + "learning_rate": 8.720385052139468e-06, + "loss": 0.2192, + "step": 94150 + }, + { + "epoch": 2.6125935607334396, + "grad_norm": 0.13967250287532806, + "learning_rate": 8.659343137376263e-06, + "loss": 0.2193, + "step": 94200 + }, + { + "epoch": 2.6139802876765046, + "grad_norm": 0.1347551941871643, + "learning_rate": 8.598505947520063e-06, + "loss": 0.2199, + "step": 94250 + }, + { + "epoch": 2.6153670146195687, + "grad_norm": 0.1365870088338852, + "learning_rate": 8.53787361892735e-06, + "loss": 0.2209, + "step": 94300 + }, + { + "epoch": 2.616753741562633, + "grad_norm": 0.12225038558244705, + "learning_rate": 8.477446287495371e-06, + "loss": 0.2139, + "step": 94350 + }, + { + "epoch": 2.6181404685056977, + "grad_norm": 0.14266717433929443, + "learning_rate": 8.417224088662012e-06, + "loss": 0.2165, + "step": 94400 + }, + { + "epoch": 2.6195271954487622, + "grad_norm": 0.1277306079864502, + "learning_rate": 8.357207157405277e-06, + "loss": 0.2168, + "step": 94450 + }, + { + "epoch": 2.6209139223918267, + "grad_norm": 0.1608654409646988, + "learning_rate": 8.2973956282432e-06, + "loss": 0.2172, + "step": 94500 + }, + { + "epoch": 2.622300649334891, + "grad_norm": 0.12185334414243698, + "learning_rate": 8.237789635233317e-06, + "loss": 0.2178, + "step": 94550 + }, + { + "epoch": 2.623687376277956, + "grad_norm": 0.1386982798576355, + "learning_rate": 8.178389311972612e-06, + "loss": 0.2169, + "step": 94600 + }, + { + "epoch": 2.62507410322102, + "grad_norm": 0.1339062601327896, + "learning_rate": 8.119194791597006e-06, + "loss": 0.2166, + "step": 94650 + }, + { + "epoch": 2.6264608301640844, + "grad_norm": 0.15275806188583374, + "learning_rate": 8.060206206781206e-06, + "loss": 0.2182, + "step": 94700 + }, + { + "epoch": 2.627847557107149, + "grad_norm": 0.1413796991109848, + "learning_rate": 8.001423689738308e-06, + "loss": 0.2204, + "step": 94750 + }, + { + "epoch": 2.6292342840502134, + "grad_norm": 0.1281316876411438, + "learning_rate": 7.942847372219564e-06, + "loss": 0.2189, + "step": 94800 + }, + { + "epoch": 2.630621010993278, + "grad_norm": 0.14383311569690704, + "learning_rate": 7.884477385514089e-06, + "loss": 0.2203, + "step": 94850 + }, + { + "epoch": 2.6320077379363425, + "grad_norm": 0.1202847883105278, + "learning_rate": 7.826313860448454e-06, + "loss": 0.2153, + "step": 94900 + }, + { + "epoch": 2.633394464879407, + "grad_norm": 0.12072198837995529, + "learning_rate": 7.768356927386589e-06, + "loss": 0.216, + "step": 94950 + }, + { + "epoch": 2.634781191822471, + "grad_norm": 0.12037204205989838, + "learning_rate": 7.710606716229285e-06, + "loss": 0.2228, + "step": 95000 + }, + { + "epoch": 2.634781191822471, + "eval_loss": 0.21694940328598022, + "eval_runtime": 500.561, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 95000 + }, + { + "epoch": 2.6361679187655356, + "grad_norm": 0.12889185547828674, + "learning_rate": 7.653063356414081e-06, + "loss": 0.2203, + "step": 95050 + }, + { + "epoch": 2.6375546457086, + "grad_norm": 0.1323080062866211, + "learning_rate": 7.59572697691483e-06, + "loss": 0.2183, + "step": 95100 + }, + { + "epoch": 2.6389413726516646, + "grad_norm": 0.12650491297245026, + "learning_rate": 7.538597706241513e-06, + "loss": 0.2143, + "step": 95150 + }, + { + "epoch": 2.640328099594729, + "grad_norm": 0.15389423072338104, + "learning_rate": 7.481675672439903e-06, + "loss": 0.2158, + "step": 95200 + }, + { + "epoch": 2.6417148265377937, + "grad_norm": 0.14535321295261383, + "learning_rate": 7.424961003091291e-06, + "loss": 0.2153, + "step": 95250 + }, + { + "epoch": 2.643101553480858, + "grad_norm": 0.1973002701997757, + "learning_rate": 7.368453825312161e-06, + "loss": 0.2192, + "step": 95300 + }, + { + "epoch": 2.6444882804239223, + "grad_norm": 0.1256517767906189, + "learning_rate": 7.312154265753978e-06, + "loss": 0.2209, + "step": 95350 + }, + { + "epoch": 2.6458750073669868, + "grad_norm": 0.13010665774345398, + "learning_rate": 7.256062450602863e-06, + "loss": 0.2207, + "step": 95400 + }, + { + "epoch": 2.6472617343100513, + "grad_norm": 0.11461709439754486, + "learning_rate": 7.200178505579269e-06, + "loss": 0.2164, + "step": 95450 + }, + { + "epoch": 2.648648461253116, + "grad_norm": 0.14307774603366852, + "learning_rate": 7.144502555937815e-06, + "loss": 0.2221, + "step": 95500 + }, + { + "epoch": 2.6500351881961803, + "grad_norm": 0.14916153252124786, + "learning_rate": 7.0890347264668255e-06, + "loss": 0.2202, + "step": 95550 + }, + { + "epoch": 2.651421915139245, + "grad_norm": 0.10386321693658829, + "learning_rate": 7.033775141488308e-06, + "loss": 0.2192, + "step": 95600 + }, + { + "epoch": 2.6528086420823094, + "grad_norm": 0.1536647379398346, + "learning_rate": 6.9787239248573885e-06, + "loss": 0.2157, + "step": 95650 + }, + { + "epoch": 2.6541953690253735, + "grad_norm": 0.12528762221336365, + "learning_rate": 6.9238811999622565e-06, + "loss": 0.2197, + "step": 95700 + }, + { + "epoch": 2.655582095968438, + "grad_norm": 0.13676822185516357, + "learning_rate": 6.869247089723729e-06, + "loss": 0.2214, + "step": 95750 + }, + { + "epoch": 2.6569688229115025, + "grad_norm": 0.13648496568202972, + "learning_rate": 6.81482171659511e-06, + "loss": 0.2178, + "step": 95800 + }, + { + "epoch": 2.658355549854567, + "grad_norm": 0.14560841023921967, + "learning_rate": 6.760605202561832e-06, + "loss": 0.2193, + "step": 95850 + }, + { + "epoch": 2.6597422767976315, + "grad_norm": 0.14555421471595764, + "learning_rate": 6.7065976691411904e-06, + "loss": 0.2193, + "step": 95900 + }, + { + "epoch": 2.661129003740696, + "grad_norm": 0.14283110201358795, + "learning_rate": 6.652799237382112e-06, + "loss": 0.2191, + "step": 95950 + }, + { + "epoch": 2.6625157306837606, + "grad_norm": 0.19822004437446594, + "learning_rate": 6.599210027864833e-06, + "loss": 0.2208, + "step": 96000 + }, + { + "epoch": 2.6625157306837606, + "eval_loss": 0.21687686443328857, + "eval_runtime": 500.1004, + "eval_samples_per_second": 5.713, + "eval_steps_per_second": 5.713, + "step": 96000 + }, + { + "epoch": 2.6639024576268246, + "grad_norm": 0.1323961615562439, + "learning_rate": 6.545830160700695e-06, + "loss": 0.2157, + "step": 96050 + }, + { + "epoch": 2.665289184569889, + "grad_norm": 0.1361609548330307, + "learning_rate": 6.492659755531749e-06, + "loss": 0.2166, + "step": 96100 + }, + { + "epoch": 2.6666759115129537, + "grad_norm": 0.1284680813550949, + "learning_rate": 6.439698931530669e-06, + "loss": 0.2152, + "step": 96150 + }, + { + "epoch": 2.668062638456018, + "grad_norm": 0.12301739305257797, + "learning_rate": 6.386947807400323e-06, + "loss": 0.217, + "step": 96200 + }, + { + "epoch": 2.6694493653990827, + "grad_norm": 0.1513117402791977, + "learning_rate": 6.33440650137358e-06, + "loss": 0.2157, + "step": 96250 + }, + { + "epoch": 2.6708360923421472, + "grad_norm": 0.15935802459716797, + "learning_rate": 6.282075131213083e-06, + "loss": 0.2193, + "step": 96300 + }, + { + "epoch": 2.6722228192852118, + "grad_norm": 0.1329813152551651, + "learning_rate": 6.229953814210865e-06, + "loss": 0.2126, + "step": 96350 + }, + { + "epoch": 2.673609546228276, + "grad_norm": 0.14577654004096985, + "learning_rate": 6.178042667188222e-06, + "loss": 0.2135, + "step": 96400 + }, + { + "epoch": 2.6749962731713404, + "grad_norm": 0.12831629812717438, + "learning_rate": 6.126341806495361e-06, + "loss": 0.2185, + "step": 96450 + }, + { + "epoch": 2.676383000114405, + "grad_norm": 0.13574399054050446, + "learning_rate": 6.074851348011179e-06, + "loss": 0.2171, + "step": 96500 + }, + { + "epoch": 2.6777697270574694, + "grad_norm": 0.15019351243972778, + "learning_rate": 6.023571407142969e-06, + "loss": 0.2188, + "step": 96550 + }, + { + "epoch": 2.679156454000534, + "grad_norm": 0.11595098674297333, + "learning_rate": 5.972502098826216e-06, + "loss": 0.2193, + "step": 96600 + }, + { + "epoch": 2.6805431809435984, + "grad_norm": 0.11979762464761734, + "learning_rate": 5.9216435375242685e-06, + "loss": 0.2215, + "step": 96650 + }, + { + "epoch": 2.681929907886663, + "grad_norm": 0.13104109466075897, + "learning_rate": 5.870995837228166e-06, + "loss": 0.2163, + "step": 96700 + }, + { + "epoch": 2.683316634829727, + "grad_norm": 0.15309420228004456, + "learning_rate": 5.820559111456292e-06, + "loss": 0.2147, + "step": 96750 + }, + { + "epoch": 2.6847033617727916, + "grad_norm": 0.13580960035324097, + "learning_rate": 5.7703334732541855e-06, + "loss": 0.2169, + "step": 96800 + }, + { + "epoch": 2.686090088715856, + "grad_norm": 0.14888140559196472, + "learning_rate": 5.720319035194299e-06, + "loss": 0.2193, + "step": 96850 + }, + { + "epoch": 2.6874768156589206, + "grad_norm": 0.1494724303483963, + "learning_rate": 5.670515909375651e-06, + "loss": 0.2186, + "step": 96900 + }, + { + "epoch": 2.688863542601985, + "grad_norm": 0.11105342954397202, + "learning_rate": 5.6209242074237165e-06, + "loss": 0.2171, + "step": 96950 + }, + { + "epoch": 2.6902502695450496, + "grad_norm": 0.1481621414422989, + "learning_rate": 5.5715440404900175e-06, + "loss": 0.2189, + "step": 97000 + }, + { + "epoch": 2.6902502695450496, + "eval_loss": 0.21687200665473938, + "eval_runtime": 500.2659, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 97000 + }, + { + "epoch": 2.691636996488114, + "grad_norm": 0.1304822862148285, + "learning_rate": 5.522375519252021e-06, + "loss": 0.2215, + "step": 97050 + }, + { + "epoch": 2.6930237234311782, + "grad_norm": 0.12556232511997223, + "learning_rate": 5.474395813301237e-06, + "loss": 0.2177, + "step": 97100 + }, + { + "epoch": 2.694410450374243, + "grad_norm": 0.10873960703611374, + "learning_rate": 5.42564667520441e-06, + "loss": 0.2182, + "step": 97150 + }, + { + "epoch": 2.6957971773173073, + "grad_norm": 0.1206154152750969, + "learning_rate": 5.377109509807965e-06, + "loss": 0.2183, + "step": 97200 + }, + { + "epoch": 2.697183904260372, + "grad_norm": 0.12211555987596512, + "learning_rate": 5.3287844258999135e-06, + "loss": 0.2175, + "step": 97250 + }, + { + "epoch": 2.6985706312034363, + "grad_norm": 0.13117828965187073, + "learning_rate": 5.28067153179288e-06, + "loss": 0.2208, + "step": 97300 + }, + { + "epoch": 2.699957358146501, + "grad_norm": 0.12565045058727264, + "learning_rate": 5.23277093532395e-06, + "loss": 0.2154, + "step": 97350 + }, + { + "epoch": 2.7013440850895654, + "grad_norm": 0.132746160030365, + "learning_rate": 5.1850827438543305e-06, + "loss": 0.2196, + "step": 97400 + }, + { + "epoch": 2.7027308120326294, + "grad_norm": 0.12424889206886292, + "learning_rate": 5.1376070642691896e-06, + "loss": 0.2143, + "step": 97450 + }, + { + "epoch": 2.7041175389756944, + "grad_norm": 0.13726244866847992, + "learning_rate": 5.0903440029773985e-06, + "loss": 0.2176, + "step": 97500 + }, + { + "epoch": 2.7055042659187585, + "grad_norm": 0.13994988799095154, + "learning_rate": 5.043293665911219e-06, + "loss": 0.217, + "step": 97550 + }, + { + "epoch": 2.706890992861823, + "grad_norm": 0.13315385580062866, + "learning_rate": 4.996456158526197e-06, + "loss": 0.2136, + "step": 97600 + }, + { + "epoch": 2.7082777198048875, + "grad_norm": 0.12573137879371643, + "learning_rate": 4.949831585800779e-06, + "loss": 0.2172, + "step": 97650 + }, + { + "epoch": 2.709664446747952, + "grad_norm": 0.12147074192762375, + "learning_rate": 4.903420052236252e-06, + "loss": 0.2215, + "step": 97700 + }, + { + "epoch": 2.7110511736910166, + "grad_norm": 0.12491251528263092, + "learning_rate": 4.857221661856304e-06, + "loss": 0.2188, + "step": 97750 + }, + { + "epoch": 2.7124379006340806, + "grad_norm": 0.11851975321769714, + "learning_rate": 4.8112365182070075e-06, + "loss": 0.218, + "step": 97800 + }, + { + "epoch": 2.7138246275771456, + "grad_norm": 0.14051896333694458, + "learning_rate": 4.765464724356383e-06, + "loss": 0.2167, + "step": 97850 + }, + { + "epoch": 2.7152113545202097, + "grad_norm": 0.14575433731079102, + "learning_rate": 4.719906382894324e-06, + "loss": 0.2172, + "step": 97900 + }, + { + "epoch": 2.716598081463274, + "grad_norm": 0.11300890892744064, + "learning_rate": 4.674561595932259e-06, + "loss": 0.2206, + "step": 97950 + }, + { + "epoch": 2.7179848084063387, + "grad_norm": 0.13367551565170288, + "learning_rate": 4.629430465103002e-06, + "loss": 0.2165, + "step": 98000 + }, + { + "epoch": 2.7179848084063387, + "eval_loss": 0.21681541204452515, + "eval_runtime": 500.7329, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 98000 + }, + { + "epoch": 2.7193715353494032, + "grad_norm": 0.13205870985984802, + "learning_rate": 4.5845130915605165e-06, + "loss": 0.2165, + "step": 98050 + }, + { + "epoch": 2.7207582622924678, + "grad_norm": 0.13241606950759888, + "learning_rate": 4.539809575979581e-06, + "loss": 0.2193, + "step": 98100 + }, + { + "epoch": 2.722144989235532, + "grad_norm": 0.15105368196964264, + "learning_rate": 4.495320018555738e-06, + "loss": 0.2168, + "step": 98150 + }, + { + "epoch": 2.723531716178597, + "grad_norm": 0.16313302516937256, + "learning_rate": 4.451044519004921e-06, + "loss": 0.217, + "step": 98200 + }, + { + "epoch": 2.724918443121661, + "grad_norm": 0.13042916357517242, + "learning_rate": 4.406983176563329e-06, + "loss": 0.2197, + "step": 98250 + }, + { + "epoch": 2.7263051700647254, + "grad_norm": 0.11967332661151886, + "learning_rate": 4.363136089987096e-06, + "loss": 0.2216, + "step": 98300 + }, + { + "epoch": 2.72769189700779, + "grad_norm": 0.13527044653892517, + "learning_rate": 4.319503357552235e-06, + "loss": 0.2187, + "step": 98350 + }, + { + "epoch": 2.7290786239508544, + "grad_norm": 0.13896431028842926, + "learning_rate": 4.276085077054226e-06, + "loss": 0.218, + "step": 98400 + }, + { + "epoch": 2.730465350893919, + "grad_norm": 0.13394545018672943, + "learning_rate": 4.2328813458079374e-06, + "loss": 0.2211, + "step": 98450 + }, + { + "epoch": 2.7318520778369835, + "grad_norm": 0.1567400097846985, + "learning_rate": 4.189892260647388e-06, + "loss": 0.2192, + "step": 98500 + }, + { + "epoch": 2.733238804780048, + "grad_norm": 0.13998396694660187, + "learning_rate": 4.147117917925425e-06, + "loss": 0.2173, + "step": 98550 + }, + { + "epoch": 2.734625531723112, + "grad_norm": 0.11015547066926956, + "learning_rate": 4.104558413513649e-06, + "loss": 0.2178, + "step": 98600 + }, + { + "epoch": 2.7360122586661766, + "grad_norm": 0.1378924697637558, + "learning_rate": 4.062213842802121e-06, + "loss": 0.2193, + "step": 98650 + }, + { + "epoch": 2.737398985609241, + "grad_norm": 0.13427558541297913, + "learning_rate": 4.020084300699178e-06, + "loss": 0.2165, + "step": 98700 + }, + { + "epoch": 2.7387857125523056, + "grad_norm": 0.11761970818042755, + "learning_rate": 3.978169881631166e-06, + "loss": 0.2171, + "step": 98750 + }, + { + "epoch": 2.74017243949537, + "grad_norm": 0.12466447800397873, + "learning_rate": 3.936470679542292e-06, + "loss": 0.2211, + "step": 98800 + }, + { + "epoch": 2.7415591664384347, + "grad_norm": 0.15706753730773926, + "learning_rate": 3.894986787894394e-06, + "loss": 0.219, + "step": 98850 + }, + { + "epoch": 2.742945893381499, + "grad_norm": 0.14471718668937683, + "learning_rate": 3.853718299666742e-06, + "loss": 0.2158, + "step": 98900 + }, + { + "epoch": 2.7443326203245633, + "grad_norm": 0.12557634711265564, + "learning_rate": 3.812665307355745e-06, + "loss": 0.2208, + "step": 98950 + }, + { + "epoch": 2.745719347267628, + "grad_norm": 0.12567783892154694, + "learning_rate": 3.7718279029749225e-06, + "loss": 0.2175, + "step": 99000 + }, + { + "epoch": 2.745719347267628, + "eval_loss": 0.21679456532001495, + "eval_runtime": 500.082, + "eval_samples_per_second": 5.713, + "eval_steps_per_second": 5.713, + "step": 99000 + }, + { + "epoch": 2.7471060742106923, + "grad_norm": 0.13429369032382965, + "learning_rate": 3.731206178054503e-06, + "loss": 0.2174, + "step": 99050 + }, + { + "epoch": 2.748492801153757, + "grad_norm": 0.13248606026172638, + "learning_rate": 3.690800223641322e-06, + "loss": 0.2192, + "step": 99100 + }, + { + "epoch": 2.7498795280968213, + "grad_norm": 0.13042642176151276, + "learning_rate": 3.6506101302986595e-06, + "loss": 0.2166, + "step": 99150 + }, + { + "epoch": 2.751266255039886, + "grad_norm": 0.15077584981918335, + "learning_rate": 3.6106359881058815e-06, + "loss": 0.2152, + "step": 99200 + }, + { + "epoch": 2.7526529819829504, + "grad_norm": 0.13205015659332275, + "learning_rate": 3.570877886658419e-06, + "loss": 0.2196, + "step": 99250 + }, + { + "epoch": 2.7540397089260145, + "grad_norm": 0.14192979037761688, + "learning_rate": 3.531335915067424e-06, + "loss": 0.2182, + "step": 99300 + }, + { + "epoch": 2.755426435869079, + "grad_norm": 0.13009020686149597, + "learning_rate": 3.49201016195968e-06, + "loss": 0.2194, + "step": 99350 + }, + { + "epoch": 2.7568131628121435, + "grad_norm": 0.11861055344343185, + "learning_rate": 3.4529007154773142e-06, + "loss": 0.2181, + "step": 99400 + }, + { + "epoch": 2.758199889755208, + "grad_norm": 0.15699726343154907, + "learning_rate": 3.414007663277674e-06, + "loss": 0.2175, + "step": 99450 + }, + { + "epoch": 2.7595866166982725, + "grad_norm": 0.12084462493658066, + "learning_rate": 3.3753310925330516e-06, + "loss": 0.2188, + "step": 99500 + }, + { + "epoch": 2.760973343641337, + "grad_norm": 0.14574265480041504, + "learning_rate": 3.336871089930571e-06, + "loss": 0.2213, + "step": 99550 + }, + { + "epoch": 2.7623600705844016, + "grad_norm": 0.1578470766544342, + "learning_rate": 3.2986277416719227e-06, + "loss": 0.2185, + "step": 99600 + }, + { + "epoch": 2.7637467975274657, + "grad_norm": 0.1389865130186081, + "learning_rate": 3.2606011334732178e-06, + "loss": 0.2196, + "step": 99650 + }, + { + "epoch": 2.76513352447053, + "grad_norm": 0.12240829318761826, + "learning_rate": 3.222791350564802e-06, + "loss": 0.2209, + "step": 99700 + }, + { + "epoch": 2.7665202514135947, + "grad_norm": 0.13817352056503296, + "learning_rate": 3.1851984776909984e-06, + "loss": 0.2158, + "step": 99750 + }, + { + "epoch": 2.767906978356659, + "grad_norm": 0.16704636812210083, + "learning_rate": 3.1478225991099954e-06, + "loss": 0.2181, + "step": 99800 + }, + { + "epoch": 2.7692937052997237, + "grad_norm": 0.12111228704452515, + "learning_rate": 3.110663798593616e-06, + "loss": 0.2161, + "step": 99850 + }, + { + "epoch": 2.7706804322427883, + "grad_norm": 0.10894995182752609, + "learning_rate": 3.0737221594271616e-06, + "loss": 0.2138, + "step": 99900 + }, + { + "epoch": 2.772067159185853, + "grad_norm": 0.13423964381217957, + "learning_rate": 3.036997764409133e-06, + "loss": 0.2189, + "step": 99950 + }, + { + "epoch": 2.773453886128917, + "grad_norm": 0.17966866493225098, + "learning_rate": 3.000490695851188e-06, + "loss": 0.2156, + "step": 100000 + }, + { + "epoch": 2.773453886128917, + "eval_loss": 0.216772198677063, + "eval_runtime": 500.5232, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 100000 + }, + { + "epoch": 2.7748406130719814, + "grad_norm": 0.12200125306844711, + "learning_rate": 2.9642010355778403e-06, + "loss": 0.2163, + "step": 100050 + }, + { + "epoch": 2.776227340015046, + "grad_norm": 0.14171266555786133, + "learning_rate": 2.9281288649263496e-06, + "loss": 0.2199, + "step": 100100 + }, + { + "epoch": 2.7776140669581104, + "grad_norm": 0.14463447034358978, + "learning_rate": 2.8922742647464974e-06, + "loss": 0.2167, + "step": 100150 + }, + { + "epoch": 2.779000793901175, + "grad_norm": 0.14498113095760345, + "learning_rate": 2.8566373154003788e-06, + "loss": 0.2215, + "step": 100200 + }, + { + "epoch": 2.7803875208442395, + "grad_norm": 0.14089851081371307, + "learning_rate": 2.821218096762346e-06, + "loss": 0.2179, + "step": 100250 + }, + { + "epoch": 2.781774247787304, + "grad_norm": 0.1408817619085312, + "learning_rate": 2.786016688218651e-06, + "loss": 0.2221, + "step": 100300 + }, + { + "epoch": 2.783160974730368, + "grad_norm": 0.1360626369714737, + "learning_rate": 2.7510331686674383e-06, + "loss": 0.2169, + "step": 100350 + }, + { + "epoch": 2.7845477016734326, + "grad_norm": 0.12629085779190063, + "learning_rate": 2.7162676165184197e-06, + "loss": 0.2169, + "step": 100400 + }, + { + "epoch": 2.785934428616497, + "grad_norm": 0.14268292486667633, + "learning_rate": 2.68172010969282e-06, + "loss": 0.2142, + "step": 100450 + }, + { + "epoch": 2.7873211555595616, + "grad_norm": 0.1308322250843048, + "learning_rate": 2.6473907256231333e-06, + "loss": 0.2178, + "step": 100500 + }, + { + "epoch": 2.788707882502626, + "grad_norm": 0.13275696337223053, + "learning_rate": 2.6132795412529777e-06, + "loss": 0.2191, + "step": 100550 + }, + { + "epoch": 2.7900946094456907, + "grad_norm": 0.14593924582004547, + "learning_rate": 2.5793866330368954e-06, + "loss": 0.2164, + "step": 100600 + }, + { + "epoch": 2.791481336388755, + "grad_norm": 0.13292020559310913, + "learning_rate": 2.5457120769402208e-06, + "loss": 0.2161, + "step": 100650 + }, + { + "epoch": 2.7928680633318193, + "grad_norm": 0.13708774745464325, + "learning_rate": 2.5122559484388685e-06, + "loss": 0.2139, + "step": 100700 + }, + { + "epoch": 2.794254790274884, + "grad_norm": 0.1346159726381302, + "learning_rate": 2.479018322519189e-06, + "loss": 0.2186, + "step": 100750 + }, + { + "epoch": 2.7956415172179483, + "grad_norm": 0.1296459138393402, + "learning_rate": 2.4459992736778125e-06, + "loss": 0.2153, + "step": 100800 + }, + { + "epoch": 2.797028244161013, + "grad_norm": 0.15982100367546082, + "learning_rate": 2.413198875921441e-06, + "loss": 0.2162, + "step": 100850 + }, + { + "epoch": 2.7984149711040773, + "grad_norm": 0.1282055675983429, + "learning_rate": 2.3806172027667216e-06, + "loss": 0.2142, + "step": 100900 + }, + { + "epoch": 2.799801698047142, + "grad_norm": 0.12306920439004898, + "learning_rate": 2.3482543272400403e-06, + "loss": 0.2176, + "step": 100950 + }, + { + "epoch": 2.8011884249902064, + "grad_norm": 0.11848998814821243, + "learning_rate": 2.3161103218774404e-06, + "loss": 0.218, + "step": 101000 + }, + { + "epoch": 2.8011884249902064, + "eval_loss": 0.21674667298793793, + "eval_runtime": 500.3702, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 101000 + }, + { + "epoch": 2.8025751519332704, + "grad_norm": 0.1424969583749771, + "learning_rate": 2.284185258724336e-06, + "loss": 0.2143, + "step": 101050 + }, + { + "epoch": 2.8039618788763354, + "grad_norm": 0.1375187486410141, + "learning_rate": 2.2524792093354897e-06, + "loss": 0.2186, + "step": 101100 + }, + { + "epoch": 2.8053486058193995, + "grad_norm": 0.13417352735996246, + "learning_rate": 2.220992244774711e-06, + "loss": 0.2152, + "step": 101150 + }, + { + "epoch": 2.806735332762464, + "grad_norm": 0.149616077542305, + "learning_rate": 2.189724435614815e-06, + "loss": 0.2155, + "step": 101200 + }, + { + "epoch": 2.8081220597055285, + "grad_norm": 0.12678956985473633, + "learning_rate": 2.1586758519373973e-06, + "loss": 0.2163, + "step": 101250 + }, + { + "epoch": 2.809508786648593, + "grad_norm": 0.12688349187374115, + "learning_rate": 2.127846563332703e-06, + "loss": 0.219, + "step": 101300 + }, + { + "epoch": 2.8108955135916576, + "grad_norm": 0.12490526586771011, + "learning_rate": 2.097846687174676e-06, + "loss": 0.2185, + "step": 101350 + }, + { + "epoch": 2.8122822405347216, + "grad_norm": 0.11337302625179291, + "learning_rate": 2.0674518061951975e-06, + "loss": 0.2178, + "step": 101400 + }, + { + "epoch": 2.8136689674777866, + "grad_norm": 0.14391370117664337, + "learning_rate": 2.037276424751977e-06, + "loss": 0.2172, + "step": 101450 + }, + { + "epoch": 2.8150556944208507, + "grad_norm": 0.15447823703289032, + "learning_rate": 2.007320610478136e-06, + "loss": 0.2175, + "step": 101500 + }, + { + "epoch": 2.816442421363915, + "grad_norm": 0.1505586951971054, + "learning_rate": 1.977584430514623e-06, + "loss": 0.2214, + "step": 101550 + }, + { + "epoch": 2.8178291483069797, + "grad_norm": 0.1483069509267807, + "learning_rate": 1.9480679515101797e-06, + "loss": 0.2172, + "step": 101600 + }, + { + "epoch": 2.8192158752500442, + "grad_norm": 0.11997207999229431, + "learning_rate": 1.9187712396210756e-06, + "loss": 0.221, + "step": 101650 + }, + { + "epoch": 2.8206026021931088, + "grad_norm": 0.1311604231595993, + "learning_rate": 1.8896943605110185e-06, + "loss": 0.2171, + "step": 101700 + }, + { + "epoch": 2.821989329136173, + "grad_norm": 0.12765543162822723, + "learning_rate": 1.8608373793510102e-06, + "loss": 0.2168, + "step": 101750 + }, + { + "epoch": 2.823376056079238, + "grad_norm": 0.17365878820419312, + "learning_rate": 1.8322003608191696e-06, + "loss": 0.2174, + "step": 101800 + }, + { + "epoch": 2.824762783022302, + "grad_norm": 0.11939753592014313, + "learning_rate": 1.8037833691006312e-06, + "loss": 0.2167, + "step": 101850 + }, + { + "epoch": 2.8261495099653664, + "grad_norm": 0.14094018936157227, + "learning_rate": 1.7755864678873468e-06, + "loss": 0.2171, + "step": 101900 + }, + { + "epoch": 2.827536236908431, + "grad_norm": 0.1488160490989685, + "learning_rate": 1.7476097203779852e-06, + "loss": 0.2189, + "step": 101950 + }, + { + "epoch": 2.8289229638514954, + "grad_norm": 0.12787774205207825, + "learning_rate": 1.719853189277787e-06, + "loss": 0.2199, + "step": 102000 + }, + { + "epoch": 2.8289229638514954, + "eval_loss": 0.21673431992530823, + "eval_runtime": 500.5835, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 102000 + }, + { + "epoch": 2.83030969079456, + "grad_norm": 0.12656597793102264, + "learning_rate": 1.6923169367983994e-06, + "loss": 0.2173, + "step": 102050 + }, + { + "epoch": 2.8316964177376245, + "grad_norm": 0.12270744144916534, + "learning_rate": 1.6650010246577751e-06, + "loss": 0.2189, + "step": 102100 + }, + { + "epoch": 2.833083144680689, + "grad_norm": 0.13541977107524872, + "learning_rate": 1.6379055140799626e-06, + "loss": 0.2195, + "step": 102150 + }, + { + "epoch": 2.834469871623753, + "grad_norm": 0.17381368577480316, + "learning_rate": 1.6110304657950715e-06, + "loss": 0.2198, + "step": 102200 + }, + { + "epoch": 2.8358565985668176, + "grad_norm": 0.11857634037733078, + "learning_rate": 1.584375940039029e-06, + "loss": 0.2178, + "step": 102250 + }, + { + "epoch": 2.837243325509882, + "grad_norm": 0.13569362461566925, + "learning_rate": 1.557941996553558e-06, + "loss": 0.2151, + "step": 102300 + }, + { + "epoch": 2.8386300524529466, + "grad_norm": 0.1297358274459839, + "learning_rate": 1.5317286945859433e-06, + "loss": 0.2149, + "step": 102350 + }, + { + "epoch": 2.840016779396011, + "grad_norm": 0.14771656692028046, + "learning_rate": 1.505736092888932e-06, + "loss": 0.2152, + "step": 102400 + }, + { + "epoch": 2.8414035063390757, + "grad_norm": 0.11793384701013565, + "learning_rate": 1.4799642497206334e-06, + "loss": 0.2184, + "step": 102450 + }, + { + "epoch": 2.84279023328214, + "grad_norm": 0.12428101897239685, + "learning_rate": 1.454413222844353e-06, + "loss": 0.2151, + "step": 102500 + }, + { + "epoch": 2.8441769602252043, + "grad_norm": 0.14781787991523743, + "learning_rate": 1.4290830695284807e-06, + "loss": 0.2156, + "step": 102550 + }, + { + "epoch": 2.845563687168269, + "grad_norm": 0.1396929919719696, + "learning_rate": 1.4039738465463136e-06, + "loss": 0.2198, + "step": 102600 + }, + { + "epoch": 2.8469504141113333, + "grad_norm": 0.14160382747650146, + "learning_rate": 1.3790856101760452e-06, + "loss": 0.2195, + "step": 102650 + }, + { + "epoch": 2.848337141054398, + "grad_norm": 0.1488640010356903, + "learning_rate": 1.354418416200498e-06, + "loss": 0.2191, + "step": 102700 + }, + { + "epoch": 2.8497238679974624, + "grad_norm": 0.17428374290466309, + "learning_rate": 1.3299723199070802e-06, + "loss": 0.217, + "step": 102750 + }, + { + "epoch": 2.851110594940527, + "grad_norm": 0.1595117300748825, + "learning_rate": 1.3057473760876848e-06, + "loss": 0.2167, + "step": 102800 + }, + { + "epoch": 2.8524973218835914, + "grad_norm": 0.14677169919013977, + "learning_rate": 1.2817436390384796e-06, + "loss": 0.217, + "step": 102850 + }, + { + "epoch": 2.8538840488266555, + "grad_norm": 0.13991901278495789, + "learning_rate": 1.2579611625598509e-06, + "loss": 0.217, + "step": 102900 + }, + { + "epoch": 2.85527077576972, + "grad_norm": 0.13375407457351685, + "learning_rate": 1.2343999999562817e-06, + "loss": 0.2193, + "step": 102950 + }, + { + "epoch": 2.8566575027127845, + "grad_norm": 0.1192784383893013, + "learning_rate": 1.2110602040361963e-06, + "loss": 0.2198, + "step": 103000 + }, + { + "epoch": 2.8566575027127845, + "eval_loss": 0.21671663224697113, + "eval_runtime": 500.3054, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 103000 + }, + { + "epoch": 2.858044229655849, + "grad_norm": 0.13127942383289337, + "learning_rate": 1.1879418271118603e-06, + "loss": 0.2189, + "step": 103050 + }, + { + "epoch": 2.8594309565989136, + "grad_norm": 0.130979984998703, + "learning_rate": 1.165044920999292e-06, + "loss": 0.2188, + "step": 103100 + }, + { + "epoch": 2.860817683541978, + "grad_norm": 0.13157400488853455, + "learning_rate": 1.1423695370180954e-06, + "loss": 0.2198, + "step": 103150 + }, + { + "epoch": 2.8622044104850426, + "grad_norm": 0.1267186552286148, + "learning_rate": 1.1199157259913606e-06, + "loss": 0.2161, + "step": 103200 + }, + { + "epoch": 2.8635911374281067, + "grad_norm": 0.11663077026605606, + "learning_rate": 1.0976835382455975e-06, + "loss": 0.2161, + "step": 103250 + }, + { + "epoch": 2.864977864371171, + "grad_norm": 0.1856304109096527, + "learning_rate": 1.0756730236105572e-06, + "loss": 0.2205, + "step": 103300 + }, + { + "epoch": 2.8663645913142357, + "grad_norm": 0.11844471096992493, + "learning_rate": 1.0538842314191444e-06, + "loss": 0.2181, + "step": 103350 + }, + { + "epoch": 2.8677513182573002, + "grad_norm": 0.12963257730007172, + "learning_rate": 1.0323172105073164e-06, + "loss": 0.2195, + "step": 103400 + }, + { + "epoch": 2.8691380452003648, + "grad_norm": NaN, + "learning_rate": 1.0113967390973257e-06, + "loss": 0.2195, + "step": 103450 + }, + { + "epoch": 2.8705247721434293, + "grad_norm": 0.15201549232006073, + "learning_rate": 9.90268967449348e-07, + "loss": 0.2255, + "step": 103500 + }, + { + "epoch": 2.871911499086494, + "grad_norm": 0.15500032901763916, + "learning_rate": 9.69363109664001e-07, + "loss": 0.2233, + "step": 103550 + }, + { + "epoch": 2.873298226029558, + "grad_norm": 0.12956801056861877, + "learning_rate": 9.486792125983024e-07, + "loss": 0.2187, + "step": 103600 + }, + { + "epoch": 2.8746849529726224, + "grad_norm": 0.12632031738758087, + "learning_rate": 9.282173226117574e-07, + "loss": 0.2189, + "step": 103650 + }, + { + "epoch": 2.876071679915687, + "grad_norm": 0.12151964753866196, + "learning_rate": 9.079774855663026e-07, + "loss": 0.2189, + "step": 103700 + }, + { + "epoch": 2.8774584068587514, + "grad_norm": 0.1441679447889328, + "learning_rate": 8.879597468261502e-07, + "loss": 0.2161, + "step": 103750 + }, + { + "epoch": 2.878845133801816, + "grad_norm": 0.14770221710205078, + "learning_rate": 8.681641512577665e-07, + "loss": 0.2201, + "step": 103800 + }, + { + "epoch": 2.8802318607448805, + "grad_norm": 0.14278040826320648, + "learning_rate": 8.485907432296714e-07, + "loss": 0.2181, + "step": 103850 + }, + { + "epoch": 2.881618587687945, + "grad_norm": 0.12169021368026733, + "learning_rate": 8.292395666124053e-07, + "loss": 0.2158, + "step": 103900 + }, + { + "epoch": 2.883005314631009, + "grad_norm": 0.1436878740787506, + "learning_rate": 8.101106647784295e-07, + "loss": 0.2195, + "step": 103950 + }, + { + "epoch": 2.884392041574074, + "grad_norm": 0.15572527050971985, + "learning_rate": 7.912040806019816e-07, + "loss": 0.2187, + "step": 104000 + }, + { + "epoch": 2.884392041574074, + "eval_loss": 0.21670910716056824, + "eval_runtime": 500.2765, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 104000 + }, + { + "epoch": 2.885778768517138, + "grad_norm": 0.15793730318546295, + "learning_rate": 7.725198564590419e-07, + "loss": 0.2154, + "step": 104050 + }, + { + "epoch": 2.8871654954602026, + "grad_norm": 0.12704993784427643, + "learning_rate": 7.540580342272007e-07, + "loss": 0.2175, + "step": 104100 + }, + { + "epoch": 2.888552222403267, + "grad_norm": 0.1465180665254593, + "learning_rate": 7.358186552855362e-07, + "loss": 0.2154, + "step": 104150 + }, + { + "epoch": 2.8899389493463317, + "grad_norm": 0.13564006984233856, + "learning_rate": 7.178017605146137e-07, + "loss": 0.2194, + "step": 104200 + }, + { + "epoch": 2.891325676289396, + "grad_norm": 0.12637755274772644, + "learning_rate": 7.000073902962978e-07, + "loss": 0.2167, + "step": 104250 + }, + { + "epoch": 2.8927124032324603, + "grad_norm": 0.1291232705116272, + "learning_rate": 6.824355845137298e-07, + "loss": 0.2167, + "step": 104300 + }, + { + "epoch": 2.8940991301755252, + "grad_norm": 0.12603652477264404, + "learning_rate": 6.650863825511611e-07, + "loss": 0.2153, + "step": 104350 + }, + { + "epoch": 2.8954858571185893, + "grad_norm": 0.11706072837114334, + "learning_rate": 6.479598232939754e-07, + "loss": 0.2164, + "step": 104400 + }, + { + "epoch": 2.896872584061654, + "grad_norm": 0.1363930106163025, + "learning_rate": 6.310559451284892e-07, + "loss": 0.2175, + "step": 104450 + }, + { + "epoch": 2.8982593110047183, + "grad_norm": 0.12287319451570511, + "learning_rate": 6.143747859419513e-07, + "loss": 0.2166, + "step": 104500 + }, + { + "epoch": 2.899646037947783, + "grad_norm": 0.1426548957824707, + "learning_rate": 5.979163831223989e-07, + "loss": 0.2192, + "step": 104550 + }, + { + "epoch": 2.9010327648908474, + "grad_norm": 0.13957655429840088, + "learning_rate": 5.816807735586127e-07, + "loss": 0.2205, + "step": 104600 + }, + { + "epoch": 2.9024194918339115, + "grad_norm": 0.11439050734043121, + "learning_rate": 5.656679936400178e-07, + "loss": 0.2222, + "step": 104650 + }, + { + "epoch": 2.9038062187769764, + "grad_norm": 0.11388220638036728, + "learning_rate": 5.498780792565938e-07, + "loss": 0.2183, + "step": 104700 + }, + { + "epoch": 2.9051929457200405, + "grad_norm": 0.13717804849147797, + "learning_rate": 5.343110657988093e-07, + "loss": 0.2211, + "step": 104750 + }, + { + "epoch": 2.906579672663105, + "grad_norm": 0.1320059895515442, + "learning_rate": 5.189669881575432e-07, + "loss": 0.2221, + "step": 104800 + }, + { + "epoch": 2.9079663996061695, + "grad_norm": 0.16280671954154968, + "learning_rate": 5.041461175445905e-07, + "loss": 0.2166, + "step": 104850 + }, + { + "epoch": 2.909353126549234, + "grad_norm": 0.1369704157114029, + "learning_rate": 4.892435537993234e-07, + "loss": 0.2219, + "step": 104900 + }, + { + "epoch": 2.9107398534922986, + "grad_norm": 0.15099848806858063, + "learning_rate": 4.7456402688191845e-07, + "loss": 0.2212, + "step": 104950 + }, + { + "epoch": 2.9121265804353627, + "grad_norm": 0.14291705191135406, + "learning_rate": 4.601075696940793e-07, + "loss": 0.2164, + "step": 105000 + }, + { + "epoch": 2.9121265804353627, + "eval_loss": 0.21670198440551758, + "eval_runtime": 500.5524, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 105000 + }, + { + "epoch": 2.9135133073784276, + "grad_norm": 0.1311856508255005, + "learning_rate": 4.4587421463757604e-07, + "loss": 0.224, + "step": 105050 + }, + { + "epoch": 2.9149000343214917, + "grad_norm": 0.161887988448143, + "learning_rate": 4.3186399361409003e-07, + "loss": 0.2213, + "step": 105100 + }, + { + "epoch": 2.916286761264556, + "grad_norm": 0.11059953272342682, + "learning_rate": 4.1807693802521364e-07, + "loss": 0.2192, + "step": 105150 + }, + { + "epoch": 2.9176734882076207, + "grad_norm": 0.12950967252254486, + "learning_rate": 4.0451307877233947e-07, + "loss": 0.217, + "step": 105200 + }, + { + "epoch": 2.9190602151506853, + "grad_norm": 0.16520391404628754, + "learning_rate": 3.9117244625660467e-07, + "loss": 0.2232, + "step": 105250 + }, + { + "epoch": 2.9204469420937498, + "grad_norm": 0.12426646798849106, + "learning_rate": 3.780550703788355e-07, + "loss": 0.2138, + "step": 105300 + }, + { + "epoch": 2.9218336690368143, + "grad_norm": 0.13209545612335205, + "learning_rate": 3.651609805394252e-07, + "loss": 0.2167, + "step": 105350 + }, + { + "epoch": 2.923220395979879, + "grad_norm": 0.13010719418525696, + "learning_rate": 3.52490205638345e-07, + "loss": 0.2172, + "step": 105400 + }, + { + "epoch": 2.924607122922943, + "grad_norm": 0.15640610456466675, + "learning_rate": 3.4004277407502226e-07, + "loss": 0.2241, + "step": 105450 + }, + { + "epoch": 2.9259938498660074, + "grad_norm": 0.14542129635810852, + "learning_rate": 3.2781871374832907e-07, + "loss": 0.2229, + "step": 105500 + }, + { + "epoch": 2.927380576809072, + "grad_norm": 0.18153244256973267, + "learning_rate": 3.158180520564491e-07, + "loss": 0.2188, + "step": 105550 + }, + { + "epoch": 2.9287673037521365, + "grad_norm": 0.10919786989688873, + "learning_rate": 3.040408158968777e-07, + "loss": 0.2179, + "step": 105600 + }, + { + "epoch": 2.930154030695201, + "grad_norm": 0.14857454597949982, + "learning_rate": 2.9248703166633305e-07, + "loss": 0.2178, + "step": 105650 + }, + { + "epoch": 2.9315407576382655, + "grad_norm": 0.1264246106147766, + "learning_rate": 2.8115672526068947e-07, + "loss": 0.2194, + "step": 105700 + }, + { + "epoch": 2.93292748458133, + "grad_norm": 0.1416803002357483, + "learning_rate": 2.700499220749664e-07, + "loss": 0.217, + "step": 105750 + }, + { + "epoch": 2.934314211524394, + "grad_norm": 0.12332191318273544, + "learning_rate": 2.5916664700320615e-07, + "loss": 0.2174, + "step": 105800 + }, + { + "epoch": 2.9357009384674586, + "grad_norm": 0.14106211066246033, + "learning_rate": 2.4850692443847413e-07, + "loss": 0.2226, + "step": 105850 + }, + { + "epoch": 2.937087665410523, + "grad_norm": 0.11803478747606277, + "learning_rate": 2.380707782727476e-07, + "loss": 0.2179, + "step": 105900 + }, + { + "epoch": 2.9384743923535876, + "grad_norm": 0.15033772587776184, + "learning_rate": 2.278582318969269e-07, + "loss": 0.2241, + "step": 105950 + }, + { + "epoch": 2.939861119296652, + "grad_norm": 0.1364557147026062, + "learning_rate": 2.178693082007355e-07, + "loss": 0.2161, + "step": 106000 + }, + { + "epoch": 2.939861119296652, + "eval_loss": 0.21669968962669373, + "eval_runtime": 500.2368, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 106000 + }, + { + "epoch": 2.9412478462397167, + "grad_norm": 0.1393050253391266, + "learning_rate": 2.081040295726866e-07, + "loss": 0.2236, + "step": 106050 + }, + { + "epoch": 2.942634573182781, + "grad_norm": 0.1260671466588974, + "learning_rate": 1.9856241790003892e-07, + "loss": 0.2192, + "step": 106100 + }, + { + "epoch": 2.9440213001258453, + "grad_norm": 0.1376418024301529, + "learning_rate": 1.8924449456870773e-07, + "loss": 0.2175, + "step": 106150 + }, + { + "epoch": 2.94540802706891, + "grad_norm": 0.1473878175020218, + "learning_rate": 1.8015028046328707e-07, + "loss": 0.2202, + "step": 106200 + }, + { + "epoch": 2.9467947540119743, + "grad_norm": 0.12721039354801178, + "learning_rate": 1.7127979596694987e-07, + "loss": 0.2177, + "step": 106250 + }, + { + "epoch": 2.948181480955039, + "grad_norm": 0.13388779759407043, + "learning_rate": 1.626330609613924e-07, + "loss": 0.2177, + "step": 106300 + }, + { + "epoch": 2.9495682078981034, + "grad_norm": 0.1170138493180275, + "learning_rate": 1.5421009482686766e-07, + "loss": 0.2192, + "step": 106350 + }, + { + "epoch": 2.950954934841168, + "grad_norm": 0.13245512545108795, + "learning_rate": 1.460109164420187e-07, + "loss": 0.2164, + "step": 106400 + }, + { + "epoch": 2.9523416617842324, + "grad_norm": 0.11573006212711334, + "learning_rate": 1.3803554418396758e-07, + "loss": 0.2164, + "step": 106450 + }, + { + "epoch": 2.9537283887272965, + "grad_norm": 0.1261189877986908, + "learning_rate": 1.3028399592818208e-07, + "loss": 0.2161, + "step": 106500 + }, + { + "epoch": 2.955115115670361, + "grad_norm": 0.1416768878698349, + "learning_rate": 1.227562890484535e-07, + "loss": 0.2183, + "step": 106550 + }, + { + "epoch": 2.9565018426134255, + "grad_norm": 0.1263783723115921, + "learning_rate": 1.1545244041690773e-07, + "loss": 0.2154, + "step": 106600 + }, + { + "epoch": 2.95788856955649, + "grad_norm": 0.11391854286193848, + "learning_rate": 1.0837246640389432e-07, + "loss": 0.2137, + "step": 106650 + }, + { + "epoch": 2.9592752964995546, + "grad_norm": 0.12810936570167542, + "learning_rate": 1.0151638287799747e-07, + "loss": 0.2192, + "step": 106700 + }, + { + "epoch": 2.960662023442619, + "grad_norm": 0.11619652062654495, + "learning_rate": 9.488420520600283e-08, + "loss": 0.2171, + "step": 106750 + }, + { + "epoch": 2.9620487503856836, + "grad_norm": 0.1461794227361679, + "learning_rate": 8.847594825281968e-08, + "loss": 0.2173, + "step": 106800 + }, + { + "epoch": 2.9634354773287477, + "grad_norm": 0.13875767588615417, + "learning_rate": 8.229162638150323e-08, + "loss": 0.2177, + "step": 106850 + }, + { + "epoch": 2.964822204271812, + "grad_norm": 0.1234852597117424, + "learning_rate": 7.633125345317682e-08, + "loss": 0.216, + "step": 106900 + }, + { + "epoch": 2.9662089312148767, + "grad_norm": 0.12513309717178345, + "learning_rate": 7.0594842827032e-08, + "loss": 0.2188, + "step": 106950 + }, + { + "epoch": 2.9675956581579412, + "grad_norm": 0.1280387043952942, + "learning_rate": 6.508240736027294e-08, + "loss": 0.2176, + "step": 107000 + }, + { + "epoch": 2.9675956581579412, + "eval_loss": 0.2166997194290161, + "eval_runtime": 500.3572, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 107000 + }, + { + "epoch": 2.9689823851010058, + "grad_norm": 0.14746126532554626, + "learning_rate": 5.97939594081054e-08, + "loss": 0.2189, + "step": 107050 + }, + { + "epoch": 2.9703691120440703, + "grad_norm": 0.13869601488113403, + "learning_rate": 5.472951082371447e-08, + "loss": 0.2153, + "step": 107100 + }, + { + "epoch": 2.971755838987135, + "grad_norm": 0.1652906984090805, + "learning_rate": 4.9889072958220203e-08, + "loss": 0.2179, + "step": 107150 + }, + { + "epoch": 2.973142565930199, + "grad_norm": 0.15118339657783508, + "learning_rate": 4.5272656660655385e-08, + "loss": 0.2175, + "step": 107200 + }, + { + "epoch": 2.9745292928732634, + "grad_norm": 0.12483932077884674, + "learning_rate": 4.088027227795444e-08, + "loss": 0.2159, + "step": 107250 + }, + { + "epoch": 2.975916019816328, + "grad_norm": 0.12288288027048111, + "learning_rate": 3.6711929654920096e-08, + "loss": 0.2176, + "step": 107300 + }, + { + "epoch": 2.9773027467593924, + "grad_norm": 0.14908748865127563, + "learning_rate": 3.2767638134190146e-08, + "loss": 0.2165, + "step": 107350 + }, + { + "epoch": 2.978689473702457, + "grad_norm": 0.1312543898820877, + "learning_rate": 2.904740655623739e-08, + "loss": 0.2121, + "step": 107400 + }, + { + "epoch": 2.9800762006455215, + "grad_norm": 0.13569709658622742, + "learning_rate": 2.5551243259358537e-08, + "loss": 0.2172, + "step": 107450 + }, + { + "epoch": 2.981462927588586, + "grad_norm": 0.1605551540851593, + "learning_rate": 2.227915607960762e-08, + "loss": 0.2165, + "step": 107500 + }, + { + "epoch": 2.98284965453165, + "grad_norm": 0.14773668348789215, + "learning_rate": 1.9231152350829285e-08, + "loss": 0.2155, + "step": 107550 + }, + { + "epoch": 2.984236381474715, + "grad_norm": 0.12570437788963318, + "learning_rate": 1.6407238904625476e-08, + "loss": 0.2206, + "step": 107600 + }, + { + "epoch": 2.985623108417779, + "grad_norm": 0.15974928438663483, + "learning_rate": 1.380742207031105e-08, + "loss": 0.2159, + "step": 107650 + }, + { + "epoch": 2.9870098353608436, + "grad_norm": 0.14371749758720398, + "learning_rate": 1.1431707674958158e-08, + "loss": 0.2214, + "step": 107700 + }, + { + "epoch": 2.988396562303908, + "grad_norm": 0.1482096165418625, + "learning_rate": 9.28010104334076e-09, + "loss": 0.217, + "step": 107750 + }, + { + "epoch": 2.9897832892469727, + "grad_norm": 0.14239954948425293, + "learning_rate": 7.3526069979013015e-09, + "loss": 0.2184, + "step": 107800 + }, + { + "epoch": 2.991170016190037, + "grad_norm": 0.16931667923927307, + "learning_rate": 5.649229858828431e-09, + "loss": 0.2217, + "step": 107850 + }, + { + "epoch": 2.9925567431331013, + "grad_norm": 0.11450044065713882, + "learning_rate": 4.169973443945985e-09, + "loss": 0.2111, + "step": 107900 + }, + { + "epoch": 2.9939434700761662, + "grad_norm": 0.13328483700752258, + "learning_rate": 2.91484106875739e-09, + "loss": 0.2185, + "step": 107950 + }, + { + "epoch": 2.9953301970192303, + "grad_norm": 0.15810194611549377, + "learning_rate": 1.8838355464345647e-09, + "loss": 0.2183, + "step": 108000 + }, + { + "epoch": 2.9953301970192303, + "eval_loss": 0.2166995406150818, + "eval_runtime": 500.4679, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 108000 + }, + { + "epoch": 2.996716923962295, + "grad_norm": 0.16397058963775635, + "learning_rate": 1.0769591878068142e-09, + "loss": 0.2181, + "step": 108050 + }, + { + "epoch": 2.9981036509053594, + "grad_norm": 0.1313973069190979, + "learning_rate": 4.942138013608322e-10, + "loss": 0.2171, + "step": 108100 + }, + { + "epoch": 2.999490377848424, + "grad_norm": 0.13321100175380707, + "learning_rate": 1.3560069320739388e-10, + "loss": 0.2169, + "step": 108150 + } + ], + "logging_steps": 50, + "max_steps": 108168, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.9496046202925875e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}