diff --git "a/checkpoint-46000/trainer_state.json" "b/checkpoint-46000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-46000/trainer_state.json" @@ -0,0 +1,6841 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.275788787619302, + "eval_steps": 1000, + "global_step": 46000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013867269430644586, + "grad_norm": 1.8933687210083008, + "learning_rate": 2.957486136783734e-06, + "loss": 1.2241, + "step": 50 + }, + { + "epoch": 0.002773453886128917, + "grad_norm": 0.7502820491790771, + "learning_rate": 6.038200862600124e-06, + "loss": 1.0267, + "step": 100 + }, + { + "epoch": 0.004160180829193376, + "grad_norm": 0.5821689963340759, + "learning_rate": 9.118915588416513e-06, + "loss": 0.8167, + "step": 150 + }, + { + "epoch": 0.005546907772257834, + "grad_norm": 0.5138927698135376, + "learning_rate": 1.2199630314232902e-05, + "loss": 0.6408, + "step": 200 + }, + { + "epoch": 0.006933634715322293, + "grad_norm": 0.619263768196106, + "learning_rate": 1.5280345040049293e-05, + "loss": 0.5468, + "step": 250 + }, + { + "epoch": 0.008320361658386751, + "grad_norm": 0.5078439712524414, + "learning_rate": 1.836105976586568e-05, + "loss": 0.4952, + "step": 300 + }, + { + "epoch": 0.00970708860145121, + "grad_norm": 0.5653749108314514, + "learning_rate": 2.144177449168207e-05, + "loss": 0.4388, + "step": 350 + }, + { + "epoch": 0.011093815544515669, + "grad_norm": 0.6189213991165161, + "learning_rate": 2.452248921749846e-05, + "loss": 0.4232, + "step": 400 + }, + { + "epoch": 0.012480542487580126, + "grad_norm": 0.6082913875579834, + "learning_rate": 2.760320394331485e-05, + "loss": 0.401, + "step": 450 + }, + { + "epoch": 0.013867269430644586, + "grad_norm": 0.6956301331520081, + "learning_rate": 3.068391866913124e-05, + "loss": 0.3895, + "step": 500 + }, + { + "epoch": 0.015253996373709043, + "grad_norm": 0.7030412554740906, + "learning_rate": 3.3764633394947633e-05, + "loss": 0.3676, + "step": 550 + }, + { + "epoch": 0.016640723316773503, + "grad_norm": 0.6779190897941589, + "learning_rate": 3.684534812076402e-05, + "loss": 0.3653, + "step": 600 + }, + { + "epoch": 0.01802745025983796, + "grad_norm": 0.8930213451385498, + "learning_rate": 3.992606284658041e-05, + "loss": 0.3645, + "step": 650 + }, + { + "epoch": 0.01941417720290242, + "grad_norm": 0.6423994302749634, + "learning_rate": 4.30067775723968e-05, + "loss": 0.3514, + "step": 700 + }, + { + "epoch": 0.02080090414596688, + "grad_norm": 0.7728660106658936, + "learning_rate": 4.608749229821319e-05, + "loss": 0.3468, + "step": 750 + }, + { + "epoch": 0.022187631089031337, + "grad_norm": 0.7561061978340149, + "learning_rate": 4.916820702402958e-05, + "loss": 0.3499, + "step": 800 + }, + { + "epoch": 0.023574358032095795, + "grad_norm": 0.6163890957832336, + "learning_rate": 5.224892174984597e-05, + "loss": 0.3417, + "step": 850 + }, + { + "epoch": 0.024961084975160253, + "grad_norm": 0.7334563732147217, + "learning_rate": 5.532963647566236e-05, + "loss": 0.3299, + "step": 900 + }, + { + "epoch": 0.026347811918224714, + "grad_norm": 0.655237078666687, + "learning_rate": 5.841035120147874e-05, + "loss": 0.3306, + "step": 950 + }, + { + "epoch": 0.02773453886128917, + "grad_norm": 0.8147113919258118, + "learning_rate": 6.149106592729513e-05, + "loss": 0.3281, + "step": 1000 + }, + { + "epoch": 0.02773453886128917, + "eval_loss": 0.32194069027900696, + "eval_runtime": 501.2457, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 1000 + }, + { + "epoch": 0.02912126580435363, + "grad_norm": 0.6397083401679993, + "learning_rate": 6.457178065311152e-05, + "loss": 0.3204, + "step": 1050 + }, + { + "epoch": 0.030507992747418087, + "grad_norm": 0.5808627009391785, + "learning_rate": 6.765249537892791e-05, + "loss": 0.3229, + "step": 1100 + }, + { + "epoch": 0.03189471969048255, + "grad_norm": 0.6929567456245422, + "learning_rate": 7.073321010474431e-05, + "loss": 0.3148, + "step": 1150 + }, + { + "epoch": 0.033281446633547006, + "grad_norm": 0.620298445224762, + "learning_rate": 7.38139248305607e-05, + "loss": 0.32, + "step": 1200 + }, + { + "epoch": 0.034668173576611463, + "grad_norm": 0.5947968363761902, + "learning_rate": 7.689463955637708e-05, + "loss": 0.306, + "step": 1250 + }, + { + "epoch": 0.03605490051967592, + "grad_norm": 0.6097683906555176, + "learning_rate": 7.997535428219347e-05, + "loss": 0.3179, + "step": 1300 + }, + { + "epoch": 0.03744162746274038, + "grad_norm": 0.6339348554611206, + "learning_rate": 8.305606900800986e-05, + "loss": 0.3161, + "step": 1350 + }, + { + "epoch": 0.03882835440580484, + "grad_norm": 0.5278933644294739, + "learning_rate": 8.613678373382625e-05, + "loss": 0.3153, + "step": 1400 + }, + { + "epoch": 0.040215081348869294, + "grad_norm": 0.4927423894405365, + "learning_rate": 8.921749845964264e-05, + "loss": 0.3111, + "step": 1450 + }, + { + "epoch": 0.04160180829193376, + "grad_norm": 0.4745596945285797, + "learning_rate": 9.229821318545902e-05, + "loss": 0.304, + "step": 1500 + }, + { + "epoch": 0.04298853523499822, + "grad_norm": 0.6532231569290161, + "learning_rate": 9.537892791127541e-05, + "loss": 0.3084, + "step": 1550 + }, + { + "epoch": 0.044375262178062674, + "grad_norm": 0.5528659820556641, + "learning_rate": 9.84596426370918e-05, + "loss": 0.3084, + "step": 1600 + }, + { + "epoch": 0.04576198912112713, + "grad_norm": 0.45793089270591736, + "learning_rate": 0.0001015403573629082, + "loss": 0.2964, + "step": 1650 + }, + { + "epoch": 0.04714871606419159, + "grad_norm": 0.5063529014587402, + "learning_rate": 0.00010462107208872458, + "loss": 0.2924, + "step": 1700 + }, + { + "epoch": 0.04853544300725605, + "grad_norm": 0.48600247502326965, + "learning_rate": 0.00010770178681454097, + "loss": 0.2947, + "step": 1750 + }, + { + "epoch": 0.049922169950320505, + "grad_norm": 0.4872143268585205, + "learning_rate": 0.00011078250154035737, + "loss": 0.297, + "step": 1800 + }, + { + "epoch": 0.05130889689338496, + "grad_norm": 0.5091805458068848, + "learning_rate": 0.00011386321626617376, + "loss": 0.2888, + "step": 1850 + }, + { + "epoch": 0.05269562383644943, + "grad_norm": 0.41649994254112244, + "learning_rate": 0.00011694393099199015, + "loss": 0.2871, + "step": 1900 + }, + { + "epoch": 0.054082350779513885, + "grad_norm": 0.5174862146377563, + "learning_rate": 0.00012002464571780654, + "loss": 0.2922, + "step": 1950 + }, + { + "epoch": 0.05546907772257834, + "grad_norm": 0.45786553621292114, + "learning_rate": 0.00012310536044362293, + "loss": 0.2883, + "step": 2000 + }, + { + "epoch": 0.05546907772257834, + "eval_loss": 0.28488224744796753, + "eval_runtime": 500.9558, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 2000 + }, + { + "epoch": 0.0568558046656428, + "grad_norm": 0.4992533326148987, + "learning_rate": 0.00012606284658040666, + "loss": 0.3033, + "step": 2050 + }, + { + "epoch": 0.05824253160870726, + "grad_norm": 0.4205988049507141, + "learning_rate": 0.00012914356130622304, + "loss": 0.2867, + "step": 2100 + }, + { + "epoch": 0.059629258551771716, + "grad_norm": 0.4288152754306793, + "learning_rate": 0.00013222427603203944, + "loss": 0.2795, + "step": 2150 + }, + { + "epoch": 0.061015985494836174, + "grad_norm": 0.4856145977973938, + "learning_rate": 0.00013530499075785582, + "loss": 0.2833, + "step": 2200 + }, + { + "epoch": 0.06240271243790063, + "grad_norm": 0.4891654849052429, + "learning_rate": 0.00013838570548367222, + "loss": 0.2797, + "step": 2250 + }, + { + "epoch": 0.0637894393809651, + "grad_norm": 0.39899352192878723, + "learning_rate": 0.00014146642020948863, + "loss": 0.2785, + "step": 2300 + }, + { + "epoch": 0.06517616632402955, + "grad_norm": 0.3616255819797516, + "learning_rate": 0.000144547134935305, + "loss": 0.2798, + "step": 2350 + }, + { + "epoch": 0.06656289326709401, + "grad_norm": 0.3556617498397827, + "learning_rate": 0.0001476278496611214, + "loss": 0.2811, + "step": 2400 + }, + { + "epoch": 0.06794962021015846, + "grad_norm": 0.39639297127723694, + "learning_rate": 0.00015070856438693776, + "loss": 0.2813, + "step": 2450 + }, + { + "epoch": 0.06933634715322293, + "grad_norm": 0.35177573561668396, + "learning_rate": 0.00015378927911275416, + "loss": 0.2797, + "step": 2500 + }, + { + "epoch": 0.07072307409628739, + "grad_norm": 0.38610222935676575, + "learning_rate": 0.00015686999383857054, + "loss": 0.2747, + "step": 2550 + }, + { + "epoch": 0.07210980103935184, + "grad_norm": 0.36727309226989746, + "learning_rate": 0.00015995070856438694, + "loss": 0.2776, + "step": 2600 + }, + { + "epoch": 0.07349652798241631, + "grad_norm": 0.3905107378959656, + "learning_rate": 0.00016303142329020332, + "loss": 0.2772, + "step": 2650 + }, + { + "epoch": 0.07488325492548076, + "grad_norm": 0.3958912193775177, + "learning_rate": 0.00016611213801601973, + "loss": 0.2707, + "step": 2700 + }, + { + "epoch": 0.07626998186854522, + "grad_norm": 0.4029497504234314, + "learning_rate": 0.0001691928527418361, + "loss": 0.2692, + "step": 2750 + }, + { + "epoch": 0.07765670881160967, + "grad_norm": 0.3514055907726288, + "learning_rate": 0.0001722735674676525, + "loss": 0.2759, + "step": 2800 + }, + { + "epoch": 0.07904343575467414, + "grad_norm": 0.34912553429603577, + "learning_rate": 0.00017529266789895255, + "loss": 0.2793, + "step": 2850 + }, + { + "epoch": 0.08043016269773859, + "grad_norm": 0.3493233621120453, + "learning_rate": 0.00017831176833025262, + "loss": 0.2845, + "step": 2900 + }, + { + "epoch": 0.08181688964080305, + "grad_norm": 0.30080145597457886, + "learning_rate": 0.00018139248305606902, + "loss": 0.2686, + "step": 2950 + }, + { + "epoch": 0.08320361658386752, + "grad_norm": 0.3265998959541321, + "learning_rate": 0.0001844731977818854, + "loss": 0.2695, + "step": 3000 + }, + { + "epoch": 0.08320361658386752, + "eval_loss": 0.26523345708847046, + "eval_runtime": 500.4565, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 3000 + }, + { + "epoch": 0.08459034352693197, + "grad_norm": 0.29866209626197815, + "learning_rate": 0.0001875539125077018, + "loss": 0.2679, + "step": 3050 + }, + { + "epoch": 0.08597707046999643, + "grad_norm": 0.3191625475883484, + "learning_rate": 0.00019063462723351818, + "loss": 0.267, + "step": 3100 + }, + { + "epoch": 0.08736379741306088, + "grad_norm": 0.3110339939594269, + "learning_rate": 0.00019371534195933459, + "loss": 0.2658, + "step": 3150 + }, + { + "epoch": 0.08875052435612535, + "grad_norm": 0.32120850682258606, + "learning_rate": 0.00019679605668515096, + "loss": 0.2724, + "step": 3200 + }, + { + "epoch": 0.0901372512991898, + "grad_norm": 0.28446418046951294, + "learning_rate": 0.00019987677141096734, + "loss": 0.268, + "step": 3250 + }, + { + "epoch": 0.09152397824225426, + "grad_norm": 0.2722443640232086, + "learning_rate": 0.00019999989671933422, + "loss": 0.2716, + "step": 3300 + }, + { + "epoch": 0.09291070518531871, + "grad_norm": 0.31304416060447693, + "learning_rate": 0.00019999956948482068, + "loss": 0.2631, + "step": 3350 + }, + { + "epoch": 0.09429743212838318, + "grad_norm": 0.2516928017139435, + "learning_rate": 0.00019999901811788604, + "loss": 0.2647, + "step": 3400 + }, + { + "epoch": 0.09568415907144764, + "grad_norm": 0.288006067276001, + "learning_rate": 0.00019999824261976613, + "loss": 0.263, + "step": 3450 + }, + { + "epoch": 0.0970708860145121, + "grad_norm": 0.2745107114315033, + "learning_rate": 0.00019999724299219913, + "loss": 0.2642, + "step": 3500 + }, + { + "epoch": 0.09845761295757656, + "grad_norm": 2.800987720489502, + "learning_rate": 0.00019999601923742548, + "loss": 0.7176, + "step": 3550 + }, + { + "epoch": 0.09984433990064101, + "grad_norm": 0.3590925931930542, + "learning_rate": 0.00019999457135818805, + "loss": 0.3146, + "step": 3600 + }, + { + "epoch": 0.10123106684370548, + "grad_norm": 0.32617494463920593, + "learning_rate": 0.00019999289935773202, + "loss": 0.2786, + "step": 3650 + }, + { + "epoch": 0.10261779378676993, + "grad_norm": 0.3239264488220215, + "learning_rate": 0.0001999910032398049, + "loss": 0.2807, + "step": 3700 + }, + { + "epoch": 0.10400452072983439, + "grad_norm": 0.3022274076938629, + "learning_rate": 0.00019998888300865652, + "loss": 0.2758, + "step": 3750 + }, + { + "epoch": 0.10539124767289886, + "grad_norm": 0.33024862408638, + "learning_rate": 0.000199986538669039, + "loss": 0.2687, + "step": 3800 + }, + { + "epoch": 0.1067779746159633, + "grad_norm": 0.6899451017379761, + "learning_rate": 0.00019998397022620687, + "loss": 0.2699, + "step": 3850 + }, + { + "epoch": 0.10816470155902777, + "grad_norm": 0.2794604003429413, + "learning_rate": 0.0001999811776859168, + "loss": 0.2667, + "step": 3900 + }, + { + "epoch": 0.10955142850209222, + "grad_norm": 0.2764255106449127, + "learning_rate": 0.00019997816105442778, + "loss": 0.2658, + "step": 3950 + }, + { + "epoch": 0.11093815544515669, + "grad_norm": 0.43574222922325134, + "learning_rate": 0.0001999749203385012, + "loss": 0.2664, + "step": 4000 + }, + { + "epoch": 0.11093815544515669, + "eval_loss": 0.26065966486930847, + "eval_runtime": 500.842, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 4000 + }, + { + "epoch": 0.11232488238822114, + "grad_norm": 0.5340762734413147, + "learning_rate": 0.00019997145554540046, + "loss": 0.272, + "step": 4050 + }, + { + "epoch": 0.1137116093312856, + "grad_norm": 0.32403895258903503, + "learning_rate": 0.00019996776668289136, + "loss": 0.2679, + "step": 4100 + }, + { + "epoch": 0.11509833627435005, + "grad_norm": 0.2928290367126465, + "learning_rate": 0.0001999638537592419, + "loss": 0.2624, + "step": 4150 + }, + { + "epoch": 0.11648506321741452, + "grad_norm": 0.23226021230220795, + "learning_rate": 0.00019995971678322228, + "loss": 0.2557, + "step": 4200 + }, + { + "epoch": 0.11787179016047898, + "grad_norm": 0.2748055160045624, + "learning_rate": 0.00019995535576410476, + "loss": 0.2625, + "step": 4250 + }, + { + "epoch": 0.11925851710354343, + "grad_norm": 0.2713299095630646, + "learning_rate": 0.00019995077071166385, + "loss": 0.2611, + "step": 4300 + }, + { + "epoch": 0.1206452440466079, + "grad_norm": 0.24674977362155914, + "learning_rate": 0.00019994596163617624, + "loss": 0.2647, + "step": 4350 + }, + { + "epoch": 0.12203197098967235, + "grad_norm": 0.359017014503479, + "learning_rate": 0.00019994092854842065, + "loss": 0.2601, + "step": 4400 + }, + { + "epoch": 0.12341869793273681, + "grad_norm": 0.38051414489746094, + "learning_rate": 0.00019993567145967791, + "loss": 0.253, + "step": 4450 + }, + { + "epoch": 0.12480542487580126, + "grad_norm": 0.26227161288261414, + "learning_rate": 0.0001999301903817309, + "loss": 0.2584, + "step": 4500 + }, + { + "epoch": 0.12619215181886573, + "grad_norm": 0.21259668469429016, + "learning_rate": 0.00019992448532686453, + "loss": 0.2618, + "step": 4550 + }, + { + "epoch": 0.1275788787619302, + "grad_norm": 0.23226451873779297, + "learning_rate": 0.0001999185563078658, + "loss": 0.2526, + "step": 4600 + }, + { + "epoch": 0.12896560570499466, + "grad_norm": 0.24459871649742126, + "learning_rate": 0.00019991240333802352, + "loss": 0.2523, + "step": 4650 + }, + { + "epoch": 0.1303523326480591, + "grad_norm": 0.29185208678245544, + "learning_rate": 0.00019990602643112863, + "loss": 0.2546, + "step": 4700 + }, + { + "epoch": 0.13173905959112356, + "grad_norm": 0.23443324863910675, + "learning_rate": 0.00019989942560147387, + "loss": 0.2557, + "step": 4750 + }, + { + "epoch": 0.13312578653418802, + "grad_norm": 0.22915039956569672, + "learning_rate": 0.00019989260086385394, + "loss": 0.2546, + "step": 4800 + }, + { + "epoch": 0.1345125134772525, + "grad_norm": 0.2710748016834259, + "learning_rate": 0.00019988555223356531, + "loss": 0.2619, + "step": 4850 + }, + { + "epoch": 0.13589924042031692, + "grad_norm": 0.24671098589897156, + "learning_rate": 0.00019987827972640633, + "loss": 0.2594, + "step": 4900 + }, + { + "epoch": 0.1372859673633814, + "grad_norm": 0.2359282672405243, + "learning_rate": 0.00019987078335867713, + "loss": 0.2616, + "step": 4950 + }, + { + "epoch": 0.13867269430644585, + "grad_norm": 0.2197064608335495, + "learning_rate": 0.00019986306314717956, + "loss": 0.2507, + "step": 5000 + }, + { + "epoch": 0.13867269430644585, + "eval_loss": 0.25083017349243164, + "eval_runtime": 500.7995, + "eval_samples_per_second": 5.705, + "eval_steps_per_second": 5.705, + "step": 5000 + }, + { + "epoch": 0.14005942124951032, + "grad_norm": 0.2249370515346527, + "learning_rate": 0.0001998551191092172, + "loss": 0.2574, + "step": 5050 + }, + { + "epoch": 0.14144614819257478, + "grad_norm": 0.36345556378364563, + "learning_rate": 0.0001998469512625953, + "loss": 0.2493, + "step": 5100 + }, + { + "epoch": 0.14283287513563922, + "grad_norm": 0.24807791411876678, + "learning_rate": 0.00019983855962562067, + "loss": 0.2542, + "step": 5150 + }, + { + "epoch": 0.14421960207870368, + "grad_norm": 3.6125738620758057, + "learning_rate": 0.00019982994421710186, + "loss": 0.2595, + "step": 5200 + }, + { + "epoch": 0.14560632902176815, + "grad_norm": 0.4985048472881317, + "learning_rate": 0.0001998211050563488, + "loss": 0.2558, + "step": 5250 + }, + { + "epoch": 0.14699305596483261, + "grad_norm": 0.3320443332195282, + "learning_rate": 0.00019981204216317308, + "loss": 0.2545, + "step": 5300 + }, + { + "epoch": 0.14837978290789705, + "grad_norm": 0.2081877887248993, + "learning_rate": 0.00019980275555788759, + "loss": 0.2536, + "step": 5350 + }, + { + "epoch": 0.14976650985096152, + "grad_norm": 0.27258801460266113, + "learning_rate": 0.00019979324526130676, + "loss": 0.2505, + "step": 5400 + }, + { + "epoch": 0.15115323679402598, + "grad_norm": 0.23199999332427979, + "learning_rate": 0.00019978351129474632, + "loss": 0.2556, + "step": 5450 + }, + { + "epoch": 0.15253996373709044, + "grad_norm": 0.20929445326328278, + "learning_rate": 0.00019977355368002334, + "loss": 0.2486, + "step": 5500 + }, + { + "epoch": 0.1539266906801549, + "grad_norm": 0.23551955819129944, + "learning_rate": 0.00019976337243945617, + "loss": 0.2517, + "step": 5550 + }, + { + "epoch": 0.15531341762321935, + "grad_norm": 0.30231812596321106, + "learning_rate": 0.0001997529675958644, + "loss": 0.2498, + "step": 5600 + }, + { + "epoch": 0.1567001445662838, + "grad_norm": 0.24430635571479797, + "learning_rate": 0.00019974233917256865, + "loss": 0.2523, + "step": 5650 + }, + { + "epoch": 0.15808687150934828, + "grad_norm": 6.362756252288818, + "learning_rate": 0.0001997314871933909, + "loss": 0.2529, + "step": 5700 + }, + { + "epoch": 0.15947359845241274, + "grad_norm": 0.2339017242193222, + "learning_rate": 0.00019972041168265397, + "loss": 0.2524, + "step": 5750 + }, + { + "epoch": 0.16086032539547718, + "grad_norm": 0.22503100335597992, + "learning_rate": 0.0001997091126651818, + "loss": 0.251, + "step": 5800 + }, + { + "epoch": 0.16224705233854164, + "grad_norm": 0.26495125889778137, + "learning_rate": 0.00019969759016629928, + "loss": 0.2517, + "step": 5850 + }, + { + "epoch": 0.1636337792816061, + "grad_norm": 0.25339657068252563, + "learning_rate": 0.00019968584421183212, + "loss": 0.2505, + "step": 5900 + }, + { + "epoch": 0.16502050622467057, + "grad_norm": 0.20266841351985931, + "learning_rate": 0.000199673874828107, + "loss": 0.2501, + "step": 5950 + }, + { + "epoch": 0.16640723316773504, + "grad_norm": 0.19285647571086884, + "learning_rate": 0.00019966168204195125, + "loss": 0.2445, + "step": 6000 + }, + { + "epoch": 0.16640723316773504, + "eval_loss": 0.24731825292110443, + "eval_runtime": 500.9495, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 6000 + }, + { + "epoch": 0.16779396011079947, + "grad_norm": 0.2121065855026245, + "learning_rate": 0.000199649265880693, + "loss": 0.2466, + "step": 6050 + }, + { + "epoch": 0.16918068705386394, + "grad_norm": 0.2560518980026245, + "learning_rate": 0.000199636626372161, + "loss": 0.2572, + "step": 6100 + }, + { + "epoch": 0.1705674139969284, + "grad_norm": 0.22927352786064148, + "learning_rate": 0.00019962376354468466, + "loss": 0.2509, + "step": 6150 + }, + { + "epoch": 0.17195414093999287, + "grad_norm": 0.2201690673828125, + "learning_rate": 0.00019961067742709377, + "loss": 0.2501, + "step": 6200 + }, + { + "epoch": 0.1733408678830573, + "grad_norm": 0.23233374953269958, + "learning_rate": 0.0001995973680487188, + "loss": 0.2525, + "step": 6250 + }, + { + "epoch": 0.17472759482612177, + "grad_norm": 0.254256933927536, + "learning_rate": 0.00019958383543939041, + "loss": 0.2499, + "step": 6300 + }, + { + "epoch": 0.17611432176918623, + "grad_norm": 0.1754632294178009, + "learning_rate": 0.00019957007962943975, + "loss": 0.251, + "step": 6350 + }, + { + "epoch": 0.1775010487122507, + "grad_norm": 0.23628771305084229, + "learning_rate": 0.00019955610064969817, + "loss": 0.256, + "step": 6400 + }, + { + "epoch": 0.17888777565531516, + "grad_norm": 0.23698653280735016, + "learning_rate": 0.00019954189853149725, + "loss": 0.2474, + "step": 6450 + }, + { + "epoch": 0.1802745025983796, + "grad_norm": 0.27713823318481445, + "learning_rate": 0.00019952747330666867, + "loss": 0.2481, + "step": 6500 + }, + { + "epoch": 0.18166122954144406, + "grad_norm": 0.1710810512304306, + "learning_rate": 0.00019951282500754413, + "loss": 0.2564, + "step": 6550 + }, + { + "epoch": 0.18304795648450853, + "grad_norm": 0.21406157314777374, + "learning_rate": 0.00019949795366695544, + "loss": 0.2517, + "step": 6600 + }, + { + "epoch": 0.184434683427573, + "grad_norm": 0.20108449459075928, + "learning_rate": 0.00019948285931823415, + "loss": 0.2518, + "step": 6650 + }, + { + "epoch": 0.18582141037063743, + "grad_norm": 5.1352715492248535, + "learning_rate": 0.0001994675419952118, + "loss": 0.2546, + "step": 6700 + }, + { + "epoch": 0.1872081373137019, + "grad_norm": 0.22743810713291168, + "learning_rate": 0.00019945200173221962, + "loss": 0.2457, + "step": 6750 + }, + { + "epoch": 0.18859486425676636, + "grad_norm": 0.20475907623767853, + "learning_rate": 0.0001994362385640885, + "loss": 0.2529, + "step": 6800 + }, + { + "epoch": 0.18998159119983082, + "grad_norm": 0.22172316908836365, + "learning_rate": 0.000199420252526149, + "loss": 0.2554, + "step": 6850 + }, + { + "epoch": 0.1913683181428953, + "grad_norm": 2.967470407485962, + "learning_rate": 0.0001994040436542311, + "loss": 0.2555, + "step": 6900 + }, + { + "epoch": 0.19275504508595973, + "grad_norm": 0.23698735237121582, + "learning_rate": 0.00019938761198466437, + "loss": 0.2619, + "step": 6950 + }, + { + "epoch": 0.1941417720290242, + "grad_norm": 0.17891797423362732, + "learning_rate": 0.0001993709575542776, + "loss": 0.2464, + "step": 7000 + }, + { + "epoch": 0.1941417720290242, + "eval_loss": 0.24410127103328705, + "eval_runtime": 500.8833, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 7000 + }, + { + "epoch": 0.19552849897208865, + "grad_norm": 0.21030811965465546, + "learning_rate": 0.00019935408040039901, + "loss": 0.2517, + "step": 7050 + }, + { + "epoch": 0.19691522591515312, + "grad_norm": 0.1913098245859146, + "learning_rate": 0.00019933698056085586, + "loss": 0.249, + "step": 7100 + }, + { + "epoch": 0.19830195285821758, + "grad_norm": 0.2044433057308197, + "learning_rate": 0.00019931965807397465, + "loss": 0.2496, + "step": 7150 + }, + { + "epoch": 0.19968867980128202, + "grad_norm": 0.18698015809059143, + "learning_rate": 0.00019930211297858078, + "loss": 0.2537, + "step": 7200 + }, + { + "epoch": 0.20107540674434649, + "grad_norm": 0.22580522298812866, + "learning_rate": 0.00019928434531399876, + "loss": 0.2456, + "step": 7250 + }, + { + "epoch": 0.20246213368741095, + "grad_norm": 0.1749202162027359, + "learning_rate": 0.00019926635512005183, + "loss": 0.2504, + "step": 7300 + }, + { + "epoch": 0.20384886063047541, + "grad_norm": 0.2123364359140396, + "learning_rate": 0.00019924814243706197, + "loss": 0.2477, + "step": 7350 + }, + { + "epoch": 0.20523558757353985, + "grad_norm": 0.2234705090522766, + "learning_rate": 0.00019922970730584997, + "loss": 0.2457, + "step": 7400 + }, + { + "epoch": 0.20662231451660432, + "grad_norm": 0.20742256939411163, + "learning_rate": 0.00019921104976773505, + "loss": 0.249, + "step": 7450 + }, + { + "epoch": 0.20800904145966878, + "grad_norm": 0.18315458297729492, + "learning_rate": 0.000199192169864535, + "loss": 0.2459, + "step": 7500 + }, + { + "epoch": 0.20939576840273325, + "grad_norm": 0.19357183575630188, + "learning_rate": 0.000199173067638566, + "loss": 0.2439, + "step": 7550 + }, + { + "epoch": 0.2107824953457977, + "grad_norm": 0.2398926168680191, + "learning_rate": 0.00019915374313264248, + "loss": 0.2497, + "step": 7600 + }, + { + "epoch": 0.21216922228886215, + "grad_norm": 0.20313721895217896, + "learning_rate": 0.00019913419639007714, + "loss": 0.2447, + "step": 7650 + }, + { + "epoch": 0.2135559492319266, + "grad_norm": 0.17255066335201263, + "learning_rate": 0.00019911442745468075, + "loss": 0.2447, + "step": 7700 + }, + { + "epoch": 0.21494267617499108, + "grad_norm": 0.19140756130218506, + "learning_rate": 0.0001990944363707621, + "loss": 0.2383, + "step": 7750 + }, + { + "epoch": 0.21632940311805554, + "grad_norm": 0.15212053060531616, + "learning_rate": 0.00019907422318312783, + "loss": 0.2485, + "step": 7800 + }, + { + "epoch": 0.21771613006111998, + "grad_norm": 0.1841588169336319, + "learning_rate": 0.0001990537879370825, + "loss": 0.2432, + "step": 7850 + }, + { + "epoch": 0.21910285700418444, + "grad_norm": 0.2013355791568756, + "learning_rate": 0.00019903313067842833, + "loss": 0.2431, + "step": 7900 + }, + { + "epoch": 0.2204895839472489, + "grad_norm": 0.17149454355239868, + "learning_rate": 0.0001990122514534651, + "loss": 0.247, + "step": 7950 + }, + { + "epoch": 0.22187631089031337, + "grad_norm": 0.24272453784942627, + "learning_rate": 0.00019899115030899014, + "loss": 0.2468, + "step": 8000 + }, + { + "epoch": 0.22187631089031337, + "eval_loss": 0.24099861085414886, + "eval_runtime": 501.2129, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 8000 + }, + { + "epoch": 0.22326303783337784, + "grad_norm": 0.2419915497303009, + "learning_rate": 0.00019896982729229813, + "loss": 0.2454, + "step": 8050 + }, + { + "epoch": 0.22464976477644227, + "grad_norm": 0.16482336819171906, + "learning_rate": 0.0001989482824511811, + "loss": 0.2423, + "step": 8100 + }, + { + "epoch": 0.22603649171950674, + "grad_norm": 0.22351431846618652, + "learning_rate": 0.00019892651583392824, + "loss": 0.2501, + "step": 8150 + }, + { + "epoch": 0.2274232186625712, + "grad_norm": 0.19319549202919006, + "learning_rate": 0.0001989045274893258, + "loss": 0.2452, + "step": 8200 + }, + { + "epoch": 0.22880994560563567, + "grad_norm": 0.15613292157649994, + "learning_rate": 0.00019888231746665696, + "loss": 0.2428, + "step": 8250 + }, + { + "epoch": 0.2301966725487001, + "grad_norm": 0.18092665076255798, + "learning_rate": 0.00019885988581570184, + "loss": 0.2448, + "step": 8300 + }, + { + "epoch": 0.23158339949176457, + "grad_norm": 0.18928927183151245, + "learning_rate": 0.00019883723258673724, + "loss": 0.2493, + "step": 8350 + }, + { + "epoch": 0.23297012643482903, + "grad_norm": 0.19816988706588745, + "learning_rate": 0.0001988143578305366, + "loss": 0.2465, + "step": 8400 + }, + { + "epoch": 0.2343568533778935, + "grad_norm": 0.19853706657886505, + "learning_rate": 0.00019879126159836992, + "loss": 0.2443, + "step": 8450 + }, + { + "epoch": 0.23574358032095796, + "grad_norm": 0.17544203996658325, + "learning_rate": 0.00019876794394200353, + "loss": 0.2429, + "step": 8500 + }, + { + "epoch": 0.2371303072640224, + "grad_norm": 0.16583149135112762, + "learning_rate": 0.0001987444049137001, + "loss": 0.244, + "step": 8550 + }, + { + "epoch": 0.23851703420708686, + "grad_norm": 0.18239592015743256, + "learning_rate": 0.00019872064456621848, + "loss": 0.2447, + "step": 8600 + }, + { + "epoch": 0.23990376115015133, + "grad_norm": 0.15820704400539398, + "learning_rate": 0.0001986966629528135, + "loss": 0.2469, + "step": 8650 + }, + { + "epoch": 0.2412904880932158, + "grad_norm": 0.18477188050746918, + "learning_rate": 0.00019867246012723598, + "loss": 0.2407, + "step": 8700 + }, + { + "epoch": 0.24267721503628023, + "grad_norm": 0.1676979809999466, + "learning_rate": 0.0001986480361437325, + "loss": 0.2448, + "step": 8750 + }, + { + "epoch": 0.2440639419793447, + "grad_norm": 0.2173600196838379, + "learning_rate": 0.00019862339105704543, + "loss": 0.2409, + "step": 8800 + }, + { + "epoch": 0.24545066892240916, + "grad_norm": 0.17326687276363373, + "learning_rate": 0.00019859852492241256, + "loss": 0.2387, + "step": 8850 + }, + { + "epoch": 0.24683739586547362, + "grad_norm": 0.16229301691055298, + "learning_rate": 0.00019857343779556725, + "loss": 0.2467, + "step": 8900 + }, + { + "epoch": 0.2482241228085381, + "grad_norm": 0.21166543662548065, + "learning_rate": 0.0001985481297327381, + "loss": 0.2507, + "step": 8950 + }, + { + "epoch": 0.24961084975160253, + "grad_norm": 0.17892777919769287, + "learning_rate": 0.00019852260079064894, + "loss": 0.2416, + "step": 9000 + }, + { + "epoch": 0.24961084975160253, + "eval_loss": 0.23973840475082397, + "eval_runtime": 500.5349, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 9000 + }, + { + "epoch": 0.250997576694667, + "grad_norm": 0.20435132086277008, + "learning_rate": 0.00019849685102651867, + "loss": 0.2385, + "step": 9050 + }, + { + "epoch": 0.25238430363773146, + "grad_norm": 0.1890842318534851, + "learning_rate": 0.0001984708804980611, + "loss": 0.2416, + "step": 9100 + }, + { + "epoch": 0.2537710305807959, + "grad_norm": 0.18390174210071564, + "learning_rate": 0.00019844468926348482, + "loss": 0.2469, + "step": 9150 + }, + { + "epoch": 0.2551577575238604, + "grad_norm": 0.23599492013454437, + "learning_rate": 0.00019841827738149314, + "loss": 0.2417, + "step": 9200 + }, + { + "epoch": 0.25654448446692485, + "grad_norm": 0.1522965133190155, + "learning_rate": 0.00019839164491128398, + "loss": 0.2427, + "step": 9250 + }, + { + "epoch": 0.2579312114099893, + "grad_norm": 0.206534281373024, + "learning_rate": 0.00019836479191254948, + "loss": 0.2452, + "step": 9300 + }, + { + "epoch": 0.2593179383530537, + "grad_norm": 0.18928374350070953, + "learning_rate": 0.00019833771844547627, + "loss": 0.244, + "step": 9350 + }, + { + "epoch": 0.2607046652961182, + "grad_norm": 0.17130087316036224, + "learning_rate": 0.00019831042457074498, + "loss": 0.2488, + "step": 9400 + }, + { + "epoch": 0.26209139223918265, + "grad_norm": 0.17631781101226807, + "learning_rate": 0.00019828291034953033, + "loss": 0.2441, + "step": 9450 + }, + { + "epoch": 0.2634781191822471, + "grad_norm": 0.1852494180202484, + "learning_rate": 0.00019825517584350083, + "loss": 0.2414, + "step": 9500 + }, + { + "epoch": 0.2648648461253116, + "grad_norm": 0.21513506770133972, + "learning_rate": 0.0001982272211148188, + "loss": 0.2412, + "step": 9550 + }, + { + "epoch": 0.26625157306837605, + "grad_norm": 0.18172813951969147, + "learning_rate": 0.0001981990462261401, + "loss": 0.2435, + "step": 9600 + }, + { + "epoch": 0.2676383000114405, + "grad_norm": 0.1561124324798584, + "learning_rate": 0.00019817065124061407, + "loss": 0.238, + "step": 9650 + }, + { + "epoch": 0.269025026954505, + "grad_norm": 0.16663338243961334, + "learning_rate": 0.00019814203622188338, + "loss": 0.2383, + "step": 9700 + }, + { + "epoch": 0.27041175389756944, + "grad_norm": 0.17735238373279572, + "learning_rate": 0.0001981132012340838, + "loss": 0.2459, + "step": 9750 + }, + { + "epoch": 0.27179848084063385, + "grad_norm": 0.21334126591682434, + "learning_rate": 0.00019808414634184417, + "loss": 0.2425, + "step": 9800 + }, + { + "epoch": 0.2731852077836983, + "grad_norm": 0.16817434132099152, + "learning_rate": 0.00019805487161028625, + "loss": 0.2361, + "step": 9850 + }, + { + "epoch": 0.2745719347267628, + "grad_norm": 0.17149919271469116, + "learning_rate": 0.00019802537710502443, + "loss": 0.2431, + "step": 9900 + }, + { + "epoch": 0.27595866166982724, + "grad_norm": 0.1521356999874115, + "learning_rate": 0.00019799566289216576, + "loss": 0.2411, + "step": 9950 + }, + { + "epoch": 0.2773453886128917, + "grad_norm": 0.15583455562591553, + "learning_rate": 0.00019796572903830974, + "loss": 0.2388, + "step": 10000 + }, + { + "epoch": 0.2773453886128917, + "eval_loss": 0.23783154785633087, + "eval_runtime": 501.3932, + "eval_samples_per_second": 5.698, + "eval_steps_per_second": 5.698, + "step": 10000 + }, + { + "epoch": 0.2787321155559562, + "grad_norm": 0.15069644153118134, + "learning_rate": 0.00019793557561054807, + "loss": 0.245, + "step": 10050 + }, + { + "epoch": 0.28011884249902064, + "grad_norm": 0.16481320559978485, + "learning_rate": 0.0001979052026764647, + "loss": 0.2403, + "step": 10100 + }, + { + "epoch": 0.2815055694420851, + "grad_norm": 0.16549484431743622, + "learning_rate": 0.00019787461030413553, + "loss": 0.2404, + "step": 10150 + }, + { + "epoch": 0.28289229638514957, + "grad_norm": 0.1722942292690277, + "learning_rate": 0.0001978437985621282, + "loss": 0.2407, + "step": 10200 + }, + { + "epoch": 0.284279023328214, + "grad_norm": 1.554700255393982, + "learning_rate": 0.0001978127675195022, + "loss": 0.2423, + "step": 10250 + }, + { + "epoch": 0.28566575027127844, + "grad_norm": 0.18697640299797058, + "learning_rate": 0.0001977815172458084, + "loss": 0.2458, + "step": 10300 + }, + { + "epoch": 0.2870524772143429, + "grad_norm": 0.19721738994121552, + "learning_rate": 0.00019775004781108914, + "loss": 0.2423, + "step": 10350 + }, + { + "epoch": 0.28843920415740737, + "grad_norm": 0.13843601942062378, + "learning_rate": 0.00019771835928587787, + "loss": 0.249, + "step": 10400 + }, + { + "epoch": 0.28982593110047183, + "grad_norm": 0.19530989229679108, + "learning_rate": 0.0001976864517411992, + "loss": 0.2438, + "step": 10450 + }, + { + "epoch": 0.2912126580435363, + "grad_norm": 0.14896182715892792, + "learning_rate": 0.0001976543252485686, + "loss": 0.2392, + "step": 10500 + }, + { + "epoch": 0.29259938498660076, + "grad_norm": 0.1485060602426529, + "learning_rate": 0.00019762197987999223, + "loss": 0.2371, + "step": 10550 + }, + { + "epoch": 0.29398611192966523, + "grad_norm": 0.20084735751152039, + "learning_rate": 0.00019758941570796688, + "loss": 0.2461, + "step": 10600 + }, + { + "epoch": 0.2953728388727297, + "grad_norm": 0.1450163722038269, + "learning_rate": 0.0001975566328054797, + "loss": 0.2379, + "step": 10650 + }, + { + "epoch": 0.2967595658157941, + "grad_norm": 0.14225760102272034, + "learning_rate": 0.00019752363124600817, + "loss": 0.2465, + "step": 10700 + }, + { + "epoch": 0.29814629275885857, + "grad_norm": 0.182630255818367, + "learning_rate": 0.00019749041110351975, + "loss": 0.2382, + "step": 10750 + }, + { + "epoch": 0.29953301970192303, + "grad_norm": 0.18140457570552826, + "learning_rate": 0.00019745697245247194, + "loss": 0.2394, + "step": 10800 + }, + { + "epoch": 0.3009197466449875, + "grad_norm": 0.1756162941455841, + "learning_rate": 0.00019742331536781187, + "loss": 0.2377, + "step": 10850 + }, + { + "epoch": 0.30230647358805196, + "grad_norm": 0.14414621889591217, + "learning_rate": 0.0001973894399249763, + "loss": 0.2408, + "step": 10900 + }, + { + "epoch": 0.3036932005311164, + "grad_norm": 0.1697167605161667, + "learning_rate": 0.00019735534619989142, + "loss": 0.2442, + "step": 10950 + }, + { + "epoch": 0.3050799274741809, + "grad_norm": 0.15641078352928162, + "learning_rate": 0.00019732103426897265, + "loss": 0.2421, + "step": 11000 + }, + { + "epoch": 0.3050799274741809, + "eval_loss": 0.23684217035770416, + "eval_runtime": 500.474, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 11000 + }, + { + "epoch": 0.30646665441724535, + "grad_norm": 0.190172016620636, + "learning_rate": 0.00019728650420912448, + "loss": 0.2475, + "step": 11050 + }, + { + "epoch": 0.3078533813603098, + "grad_norm": 0.16632623970508575, + "learning_rate": 0.0001972517560977403, + "loss": 0.2426, + "step": 11100 + }, + { + "epoch": 0.30924010830337423, + "grad_norm": 0.16913548111915588, + "learning_rate": 0.00019721679001270226, + "loss": 0.2386, + "step": 11150 + }, + { + "epoch": 0.3106268352464387, + "grad_norm": 0.16081750392913818, + "learning_rate": 0.00019718160603238096, + "loss": 0.2358, + "step": 11200 + }, + { + "epoch": 0.31201356218950316, + "grad_norm": 0.19061852991580963, + "learning_rate": 0.00019714620423563552, + "loss": 0.238, + "step": 11250 + }, + { + "epoch": 0.3134002891325676, + "grad_norm": 0.16220314800739288, + "learning_rate": 0.00019711058470181316, + "loss": 0.2428, + "step": 11300 + }, + { + "epoch": 0.3147870160756321, + "grad_norm": 0.20064842700958252, + "learning_rate": 0.00019707474751074915, + "loss": 0.2393, + "step": 11350 + }, + { + "epoch": 0.31617374301869655, + "grad_norm": 0.14250491559505463, + "learning_rate": 0.00019703869274276657, + "loss": 0.2376, + "step": 11400 + }, + { + "epoch": 0.317560469961761, + "grad_norm": 0.18501660227775574, + "learning_rate": 0.00019700242047867623, + "loss": 0.2405, + "step": 11450 + }, + { + "epoch": 0.3189471969048255, + "grad_norm": 0.1680876910686493, + "learning_rate": 0.00019696593079977635, + "loss": 0.241, + "step": 11500 + }, + { + "epoch": 0.32033392384788995, + "grad_norm": 0.15119992196559906, + "learning_rate": 0.00019692922378785252, + "loss": 0.2371, + "step": 11550 + }, + { + "epoch": 0.32172065079095435, + "grad_norm": 0.15388673543930054, + "learning_rate": 0.0001968922995251774, + "loss": 0.2425, + "step": 11600 + }, + { + "epoch": 0.3231073777340188, + "grad_norm": 0.19946704804897308, + "learning_rate": 0.00019685515809451056, + "loss": 0.2476, + "step": 11650 + }, + { + "epoch": 0.3244941046770833, + "grad_norm": 0.17677927017211914, + "learning_rate": 0.0001968177995790984, + "loss": 0.2432, + "step": 11700 + }, + { + "epoch": 0.32588083162014775, + "grad_norm": 0.18418142199516296, + "learning_rate": 0.00019678022406267374, + "loss": 0.2387, + "step": 11750 + }, + { + "epoch": 0.3272675585632122, + "grad_norm": 0.1462264358997345, + "learning_rate": 0.00019674243162945594, + "loss": 0.2377, + "step": 11800 + }, + { + "epoch": 0.3286542855062767, + "grad_norm": 0.14166492223739624, + "learning_rate": 0.0001967044223641504, + "loss": 0.238, + "step": 11850 + }, + { + "epoch": 0.33004101244934114, + "grad_norm": 0.17436008155345917, + "learning_rate": 0.00019666619635194866, + "loss": 0.2429, + "step": 11900 + }, + { + "epoch": 0.3314277393924056, + "grad_norm": 0.15779553353786469, + "learning_rate": 0.00019662775367852787, + "loss": 0.2404, + "step": 11950 + }, + { + "epoch": 0.33281446633547007, + "grad_norm": 0.17796078324317932, + "learning_rate": 0.000196589094430051, + "loss": 0.235, + "step": 12000 + }, + { + "epoch": 0.33281446633547007, + "eval_loss": 0.235828697681427, + "eval_runtime": 500.6046, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 12000 + }, + { + "epoch": 0.3342011932785345, + "grad_norm": 0.14978894591331482, + "learning_rate": 0.0001965502186931662, + "loss": 0.2419, + "step": 12050 + }, + { + "epoch": 0.33558792022159895, + "grad_norm": 0.17456893622875214, + "learning_rate": 0.00019651112655500713, + "loss": 0.2389, + "step": 12100 + }, + { + "epoch": 0.3369746471646634, + "grad_norm": 0.1462843269109726, + "learning_rate": 0.0001964718181031922, + "loss": 0.2363, + "step": 12150 + }, + { + "epoch": 0.3383613741077279, + "grad_norm": 0.16996078193187714, + "learning_rate": 0.0001964322934258248, + "loss": 0.2404, + "step": 12200 + }, + { + "epoch": 0.33974810105079234, + "grad_norm": 0.1906641721725464, + "learning_rate": 0.00019639255261149298, + "loss": 0.2394, + "step": 12250 + }, + { + "epoch": 0.3411348279938568, + "grad_norm": 0.15007531642913818, + "learning_rate": 0.00019635259574926912, + "loss": 0.2371, + "step": 12300 + }, + { + "epoch": 0.34252155493692127, + "grad_norm": 0.18667016923427582, + "learning_rate": 0.00019631242292870993, + "loss": 0.24, + "step": 12350 + }, + { + "epoch": 0.34390828187998573, + "grad_norm": 0.1689510941505432, + "learning_rate": 0.0001962720342398561, + "loss": 0.2359, + "step": 12400 + }, + { + "epoch": 0.3452950088230502, + "grad_norm": 0.1622210294008255, + "learning_rate": 0.0001962314297732321, + "loss": 0.2405, + "step": 12450 + }, + { + "epoch": 0.3466817357661146, + "grad_norm": 0.20153377950191498, + "learning_rate": 0.0001961906096198462, + "loss": 0.2368, + "step": 12500 + }, + { + "epoch": 0.34806846270917907, + "grad_norm": 0.1634126603603363, + "learning_rate": 0.00019614957387118994, + "loss": 0.236, + "step": 12550 + }, + { + "epoch": 0.34945518965224354, + "grad_norm": 0.21276158094406128, + "learning_rate": 0.00019610832261923817, + "loss": 0.2397, + "step": 12600 + }, + { + "epoch": 0.350841916595308, + "grad_norm": 0.16108940541744232, + "learning_rate": 0.00019606685595644865, + "loss": 0.2424, + "step": 12650 + }, + { + "epoch": 0.35222864353837247, + "grad_norm": 0.20505978167057037, + "learning_rate": 0.00019602517397576205, + "loss": 0.2423, + "step": 12700 + }, + { + "epoch": 0.35361537048143693, + "grad_norm": 0.1431368589401245, + "learning_rate": 0.0001959832767706016, + "loss": 0.2353, + "step": 12750 + }, + { + "epoch": 0.3550020974245014, + "grad_norm": 0.1670791357755661, + "learning_rate": 0.00019594116443487293, + "loss": 0.2366, + "step": 12800 + }, + { + "epoch": 0.35638882436756586, + "grad_norm": 0.1353309154510498, + "learning_rate": 0.00019589883706296385, + "loss": 0.2387, + "step": 12850 + }, + { + "epoch": 0.3577755513106303, + "grad_norm": 0.16561363637447357, + "learning_rate": 0.00019585629474974415, + "loss": 0.2373, + "step": 12900 + }, + { + "epoch": 0.35916227825369473, + "grad_norm": 0.16978101432323456, + "learning_rate": 0.00019581353759056528, + "loss": 0.2383, + "step": 12950 + }, + { + "epoch": 0.3605490051967592, + "grad_norm": 0.13398033380508423, + "learning_rate": 0.0001957705656812604, + "loss": 0.2389, + "step": 13000 + }, + { + "epoch": 0.3605490051967592, + "eval_loss": 0.2349192500114441, + "eval_runtime": 500.9767, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 13000 + }, + { + "epoch": 0.36193573213982366, + "grad_norm": 0.17141664028167725, + "learning_rate": 0.00019572737911814387, + "loss": 0.2379, + "step": 13050 + }, + { + "epoch": 0.3633224590828881, + "grad_norm": 0.25635290145874023, + "learning_rate": 0.00019568397799801118, + "loss": 0.2354, + "step": 13100 + }, + { + "epoch": 0.3647091860259526, + "grad_norm": 0.19244590401649475, + "learning_rate": 0.00019564036241813876, + "loss": 0.2372, + "step": 13150 + }, + { + "epoch": 0.36609591296901706, + "grad_norm": 0.1587456613779068, + "learning_rate": 0.00019559653247628364, + "loss": 0.2399, + "step": 13200 + }, + { + "epoch": 0.3674826399120815, + "grad_norm": 0.22146746516227722, + "learning_rate": 0.0001955524882706834, + "loss": 0.2356, + "step": 13250 + }, + { + "epoch": 0.368869366855146, + "grad_norm": 0.21101641654968262, + "learning_rate": 0.0001955082299000558, + "loss": 0.2425, + "step": 13300 + }, + { + "epoch": 0.37025609379821045, + "grad_norm": 0.16459371149539948, + "learning_rate": 0.0001954637574635986, + "loss": 0.239, + "step": 13350 + }, + { + "epoch": 0.37164282074127486, + "grad_norm": 0.15547959506511688, + "learning_rate": 0.0001954190710609894, + "loss": 0.2358, + "step": 13400 + }, + { + "epoch": 0.3730295476843393, + "grad_norm": 0.1342894285917282, + "learning_rate": 0.00019537417079238534, + "loss": 0.2363, + "step": 13450 + }, + { + "epoch": 0.3744162746274038, + "grad_norm": 0.14169098436832428, + "learning_rate": 0.0001953290567584229, + "loss": 0.2355, + "step": 13500 + }, + { + "epoch": 0.37580300157046825, + "grad_norm": 0.17943793535232544, + "learning_rate": 0.00019528372906021772, + "loss": 0.2354, + "step": 13550 + }, + { + "epoch": 0.3771897285135327, + "grad_norm": 0.20254671573638916, + "learning_rate": 0.0001952381877993643, + "loss": 0.2411, + "step": 13600 + }, + { + "epoch": 0.3785764554565972, + "grad_norm": 0.1362125426530838, + "learning_rate": 0.0001951924330779358, + "loss": 0.2383, + "step": 13650 + }, + { + "epoch": 0.37996318239966165, + "grad_norm": 0.19201667606830597, + "learning_rate": 0.0001951464649984838, + "loss": 0.2398, + "step": 13700 + }, + { + "epoch": 0.3813499093427261, + "grad_norm": 0.15204668045043945, + "learning_rate": 0.0001951002836640382, + "loss": 0.2347, + "step": 13750 + }, + { + "epoch": 0.3827366362857906, + "grad_norm": 0.14426596462726593, + "learning_rate": 0.00019505388917810665, + "loss": 0.2399, + "step": 13800 + }, + { + "epoch": 0.38412336322885504, + "grad_norm": 0.1463170200586319, + "learning_rate": 0.0001950072816446748, + "loss": 0.2316, + "step": 13850 + }, + { + "epoch": 0.38551009017191945, + "grad_norm": 0.15552669763565063, + "learning_rate": 0.00019496046116820566, + "loss": 0.2354, + "step": 13900 + }, + { + "epoch": 0.3868968171149839, + "grad_norm": 0.16742919385433197, + "learning_rate": 0.00019491342785363952, + "loss": 0.2388, + "step": 13950 + }, + { + "epoch": 0.3882835440580484, + "grad_norm": 0.16111566126346588, + "learning_rate": 0.00019486618180639375, + "loss": 0.2385, + "step": 14000 + }, + { + "epoch": 0.3882835440580484, + "eval_loss": 0.23382489383220673, + "eval_runtime": 500.6533, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 14000 + }, + { + "epoch": 0.38967027100111284, + "grad_norm": 0.15741662681102753, + "learning_rate": 0.00019481872313236256, + "loss": 0.2374, + "step": 14050 + }, + { + "epoch": 0.3910569979441773, + "grad_norm": 0.15046770870685577, + "learning_rate": 0.00019477105193791664, + "loss": 0.2379, + "step": 14100 + }, + { + "epoch": 0.3924437248872418, + "grad_norm": 0.14219743013381958, + "learning_rate": 0.00019472316832990308, + "loss": 0.2434, + "step": 14150 + }, + { + "epoch": 0.39383045183030624, + "grad_norm": 0.15226851403713226, + "learning_rate": 0.000194675072415645, + "loss": 0.2427, + "step": 14200 + }, + { + "epoch": 0.3952171787733707, + "grad_norm": 0.19782114028930664, + "learning_rate": 0.00019462676430294143, + "loss": 0.2357, + "step": 14250 + }, + { + "epoch": 0.39660390571643517, + "grad_norm": 0.14243118464946747, + "learning_rate": 0.00019457824410006692, + "loss": 0.2343, + "step": 14300 + }, + { + "epoch": 0.3979906326594996, + "grad_norm": 0.22301803529262543, + "learning_rate": 0.00019452951191577155, + "loss": 0.2406, + "step": 14350 + }, + { + "epoch": 0.39937735960256404, + "grad_norm": 0.13103021681308746, + "learning_rate": 0.00019448056785928032, + "loss": 0.2398, + "step": 14400 + }, + { + "epoch": 0.4007640865456285, + "grad_norm": 0.16922806203365326, + "learning_rate": 0.00019443141204029325, + "loss": 0.2363, + "step": 14450 + }, + { + "epoch": 0.40215081348869297, + "grad_norm": 0.17801126837730408, + "learning_rate": 0.00019438204456898492, + "loss": 0.2377, + "step": 14500 + }, + { + "epoch": 0.40353754043175744, + "grad_norm": 0.14513610303401947, + "learning_rate": 0.0001943324655560043, + "loss": 0.241, + "step": 14550 + }, + { + "epoch": 0.4049242673748219, + "grad_norm": 0.14587055146694183, + "learning_rate": 0.00019428267511247457, + "loss": 0.2345, + "step": 14600 + }, + { + "epoch": 0.40631099431788636, + "grad_norm": 0.17200471460819244, + "learning_rate": 0.00019423267334999267, + "loss": 0.2345, + "step": 14650 + }, + { + "epoch": 0.40769772126095083, + "grad_norm": 0.16612234711647034, + "learning_rate": 0.00019418246038062928, + "loss": 0.235, + "step": 14700 + }, + { + "epoch": 0.4090844482040153, + "grad_norm": 0.14822156727313995, + "learning_rate": 0.00019413203631692843, + "loss": 0.2384, + "step": 14750 + }, + { + "epoch": 0.4104711751470797, + "grad_norm": 0.15960198640823364, + "learning_rate": 0.00019408140127190725, + "loss": 0.2375, + "step": 14800 + }, + { + "epoch": 0.41185790209014417, + "grad_norm": NaN, + "learning_rate": 0.00019403157434308126, + "loss": 0.233, + "step": 14850 + }, + { + "epoch": 0.41324462903320863, + "grad_norm": 0.15910230576992035, + "learning_rate": 0.00019398154500404588, + "loss": 0.2728, + "step": 14900 + }, + { + "epoch": 0.4146313559762731, + "grad_norm": 0.16004903614521027, + "learning_rate": 0.0001939302861212685, + "loss": 0.2359, + "step": 14950 + }, + { + "epoch": 0.41601808291933756, + "grad_norm": 0.1622370034456253, + "learning_rate": 0.00019387881670936035, + "loss": 0.2413, + "step": 15000 + }, + { + "epoch": 0.41601808291933756, + "eval_loss": 0.23365913331508636, + "eval_runtime": 500.916, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 15000 + }, + { + "epoch": 0.417404809862402, + "grad_norm": 0.1744803488254547, + "learning_rate": 0.00019382713688368162, + "loss": 0.2406, + "step": 15050 + }, + { + "epoch": 0.4187915368054665, + "grad_norm": 0.19140714406967163, + "learning_rate": 0.00019377524676006397, + "loss": 0.2385, + "step": 15100 + }, + { + "epoch": 0.42017826374853096, + "grad_norm": 0.14320451021194458, + "learning_rate": 0.00019372314645481052, + "loss": 0.2384, + "step": 15150 + }, + { + "epoch": 0.4215649906915954, + "grad_norm": 0.18620997667312622, + "learning_rate": 0.00019367083608469546, + "loss": 0.2343, + "step": 15200 + }, + { + "epoch": 0.42295171763465983, + "grad_norm": 0.13473859429359436, + "learning_rate": 0.00019361831576696382, + "loss": 0.2399, + "step": 15250 + }, + { + "epoch": 0.4243384445777243, + "grad_norm": 0.15213748812675476, + "learning_rate": 0.00019356558561933108, + "loss": 0.2358, + "step": 15300 + }, + { + "epoch": 0.42572517152078876, + "grad_norm": 0.16841459274291992, + "learning_rate": 0.0001935126457599832, + "loss": 0.2332, + "step": 15350 + }, + { + "epoch": 0.4271118984638532, + "grad_norm": 0.14978626370429993, + "learning_rate": 0.00019345949630757603, + "loss": 0.2382, + "step": 15400 + }, + { + "epoch": 0.4284986254069177, + "grad_norm": 0.18397267162799835, + "learning_rate": 0.00019340613738123526, + "loss": 0.2328, + "step": 15450 + }, + { + "epoch": 0.42988535234998215, + "grad_norm": 0.13535378873348236, + "learning_rate": 0.000193352569100556, + "loss": 0.2278, + "step": 15500 + }, + { + "epoch": 0.4312720792930466, + "grad_norm": 0.1288972645998001, + "learning_rate": 0.00019329879158560274, + "loss": 0.2385, + "step": 15550 + }, + { + "epoch": 0.4326588062361111, + "grad_norm": 0.1488959789276123, + "learning_rate": 0.0001932448049569088, + "loss": 0.2352, + "step": 15600 + }, + { + "epoch": 0.43404553317917555, + "grad_norm": 0.16358473896980286, + "learning_rate": 0.00019319060933547624, + "loss": 0.2362, + "step": 15650 + }, + { + "epoch": 0.43543226012223996, + "grad_norm": 0.13347339630126953, + "learning_rate": 0.00019313620484277553, + "loss": 0.2376, + "step": 15700 + }, + { + "epoch": 0.4368189870653044, + "grad_norm": 0.13555756211280823, + "learning_rate": 0.0001930815916007453, + "loss": 0.2308, + "step": 15750 + }, + { + "epoch": 0.4382057140083689, + "grad_norm": 0.13955436646938324, + "learning_rate": 0.0001930267697317921, + "loss": 0.2329, + "step": 15800 + }, + { + "epoch": 0.43959244095143335, + "grad_norm": 0.1596931517124176, + "learning_rate": 0.00019297173935879, + "loss": 0.2322, + "step": 15850 + }, + { + "epoch": 0.4409791678944978, + "grad_norm": 0.14860297739505768, + "learning_rate": 0.00019291650060508045, + "loss": 0.234, + "step": 15900 + }, + { + "epoch": 0.4423658948375623, + "grad_norm": 0.14575625956058502, + "learning_rate": 0.00019286105359447194, + "loss": 0.2362, + "step": 15950 + }, + { + "epoch": 0.44375262178062674, + "grad_norm": 0.1400967240333557, + "learning_rate": 0.00019280539845123974, + "loss": 0.2358, + "step": 16000 + }, + { + "epoch": 0.44375262178062674, + "eval_loss": 0.23256094753742218, + "eval_runtime": 500.6637, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 16000 + }, + { + "epoch": 0.4451393487236912, + "grad_norm": 0.2537101209163666, + "learning_rate": 0.00019274953530012563, + "loss": 0.2363, + "step": 16050 + }, + { + "epoch": 0.4465260756667557, + "grad_norm": 0.192925825715065, + "learning_rate": 0.0001926934642663375, + "loss": 0.2343, + "step": 16100 + }, + { + "epoch": 0.4479128026098201, + "grad_norm": 0.17011120915412903, + "learning_rate": 0.0001926371854755493, + "loss": 0.2362, + "step": 16150 + }, + { + "epoch": 0.44929952955288455, + "grad_norm": 0.1474524289369583, + "learning_rate": 0.00019258069905390065, + "loss": 0.2359, + "step": 16200 + }, + { + "epoch": 0.450686256495949, + "grad_norm": 0.15591026842594147, + "learning_rate": 0.00019252400512799643, + "loss": 0.2338, + "step": 16250 + }, + { + "epoch": 0.4520729834390135, + "grad_norm": 0.14443908631801605, + "learning_rate": 0.00019246710382490664, + "loss": 0.2421, + "step": 16300 + }, + { + "epoch": 0.45345971038207794, + "grad_norm": 0.12614597380161285, + "learning_rate": 0.00019240999527216608, + "loss": 0.2373, + "step": 16350 + }, + { + "epoch": 0.4548464373251424, + "grad_norm": 0.1438266485929489, + "learning_rate": 0.00019235267959777415, + "loss": 0.2443, + "step": 16400 + }, + { + "epoch": 0.45623316426820687, + "grad_norm": 0.14473649859428406, + "learning_rate": 0.00019229515693019436, + "loss": 0.241, + "step": 16450 + }, + { + "epoch": 0.45761989121127133, + "grad_norm": 0.13498128950595856, + "learning_rate": 0.00019223742739835423, + "loss": 0.2393, + "step": 16500 + }, + { + "epoch": 0.4590066181543358, + "grad_norm": 0.14498169720172882, + "learning_rate": 0.0001921794911316449, + "loss": 0.2363, + "step": 16550 + }, + { + "epoch": 0.4603933450974002, + "grad_norm": 0.14319288730621338, + "learning_rate": 0.00019212134825992091, + "loss": 0.2359, + "step": 16600 + }, + { + "epoch": 0.4617800720404647, + "grad_norm": 0.12314629554748535, + "learning_rate": 0.00019206299891349983, + "loss": 0.23, + "step": 16650 + }, + { + "epoch": 0.46316679898352914, + "grad_norm": 0.14780518412590027, + "learning_rate": 0.00019200444322316207, + "loss": 0.2381, + "step": 16700 + }, + { + "epoch": 0.4645535259265936, + "grad_norm": 0.1493334025144577, + "learning_rate": 0.0001919456813201504, + "loss": 0.2345, + "step": 16750 + }, + { + "epoch": 0.46594025286965807, + "grad_norm": 0.11972863227128983, + "learning_rate": 0.00019188671333616992, + "loss": 0.235, + "step": 16800 + }, + { + "epoch": 0.46732697981272253, + "grad_norm": 0.13366112112998962, + "learning_rate": 0.00019182753940338753, + "loss": 0.2306, + "step": 16850 + }, + { + "epoch": 0.468713706755787, + "grad_norm": 0.13790684938430786, + "learning_rate": 0.00019176815965443186, + "loss": 0.2366, + "step": 16900 + }, + { + "epoch": 0.47010043369885146, + "grad_norm": 0.14081595838069916, + "learning_rate": 0.0001917085742223926, + "loss": 0.2368, + "step": 16950 + }, + { + "epoch": 0.4714871606419159, + "grad_norm": 0.13987073302268982, + "learning_rate": 0.00019164878324082074, + "loss": 0.2337, + "step": 17000 + }, + { + "epoch": 0.4714871606419159, + "eval_loss": 0.2317454218864441, + "eval_runtime": 500.9301, + "eval_samples_per_second": 5.703, + "eval_steps_per_second": 5.703, + "step": 17000 + }, + { + "epoch": 0.47287388758498033, + "grad_norm": 0.1430695503950119, + "learning_rate": 0.00019158878684372778, + "loss": 0.2346, + "step": 17050 + }, + { + "epoch": 0.4742606145280448, + "grad_norm": 0.14264121651649475, + "learning_rate": 0.00019152858516558564, + "loss": 0.2339, + "step": 17100 + }, + { + "epoch": 0.47564734147110926, + "grad_norm": 0.15278013050556183, + "learning_rate": 0.00019146817834132644, + "loss": 0.2333, + "step": 17150 + }, + { + "epoch": 0.47703406841417373, + "grad_norm": 0.15283286571502686, + "learning_rate": 0.000191407566506342, + "loss": 0.2323, + "step": 17200 + }, + { + "epoch": 0.4784207953572382, + "grad_norm": 0.13433212041854858, + "learning_rate": 0.00019134674979648367, + "loss": 0.2406, + "step": 17250 + }, + { + "epoch": 0.47980752230030266, + "grad_norm": 0.14129064977169037, + "learning_rate": 0.00019128572834806203, + "loss": 0.2353, + "step": 17300 + }, + { + "epoch": 0.4811942492433671, + "grad_norm": 0.14736846089363098, + "learning_rate": 0.00019122450229784653, + "loss": 0.2312, + "step": 17350 + }, + { + "epoch": 0.4825809761864316, + "grad_norm": 0.14513076841831207, + "learning_rate": 0.00019116307178306514, + "loss": 0.2358, + "step": 17400 + }, + { + "epoch": 0.48396770312949605, + "grad_norm": 0.14358818531036377, + "learning_rate": 0.0001911014369414042, + "loss": 0.2376, + "step": 17450 + }, + { + "epoch": 0.48535443007256046, + "grad_norm": 0.14574295282363892, + "learning_rate": 0.00019103959791100792, + "loss": 0.2306, + "step": 17500 + }, + { + "epoch": 0.4867411570156249, + "grad_norm": 0.1347060352563858, + "learning_rate": 0.00019097755483047827, + "loss": 0.2341, + "step": 17550 + }, + { + "epoch": 0.4881278839586894, + "grad_norm": 0.1792859435081482, + "learning_rate": 0.00019091530783887448, + "loss": 0.2392, + "step": 17600 + }, + { + "epoch": 0.48951461090175385, + "grad_norm": 0.11206398904323578, + "learning_rate": 0.00019085285707571282, + "loss": 0.236, + "step": 17650 + }, + { + "epoch": 0.4909013378448183, + "grad_norm": 0.16337329149246216, + "learning_rate": 0.0001907902026809663, + "loss": 0.239, + "step": 17700 + }, + { + "epoch": 0.4922880647878828, + "grad_norm": 0.14579764008522034, + "learning_rate": 0.0001907273447950644, + "loss": 0.2258, + "step": 17750 + }, + { + "epoch": 0.49367479173094725, + "grad_norm": 0.1381896585226059, + "learning_rate": 0.00019066428355889257, + "loss": 0.2366, + "step": 17800 + }, + { + "epoch": 0.4950615186740117, + "grad_norm": 0.13557949662208557, + "learning_rate": 0.00019060101911379208, + "loss": 0.236, + "step": 17850 + }, + { + "epoch": 0.4964482456170762, + "grad_norm": 0.13205058872699738, + "learning_rate": 0.00019053755160155974, + "loss": 0.237, + "step": 17900 + }, + { + "epoch": 0.4978349725601406, + "grad_norm": 0.1766868382692337, + "learning_rate": 0.00019047388116444735, + "loss": 0.241, + "step": 17950 + }, + { + "epoch": 0.49922169950320505, + "grad_norm": 0.1567864567041397, + "learning_rate": 0.00019041000794516171, + "loss": 0.2269, + "step": 18000 + }, + { + "epoch": 0.49922169950320505, + "eval_loss": 0.23145872354507446, + "eval_runtime": 500.5681, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 18000 + }, + { + "epoch": 0.5006084264462696, + "grad_norm": 0.13615478575229645, + "learning_rate": 0.00019034593208686396, + "loss": 0.2347, + "step": 18050 + }, + { + "epoch": 0.501995153389334, + "grad_norm": 0.13786327838897705, + "learning_rate": 0.00019028165373316948, + "loss": 0.2335, + "step": 18100 + }, + { + "epoch": 0.5033818803323985, + "grad_norm": 0.14584092795848846, + "learning_rate": 0.0001902171730281476, + "loss": 0.2392, + "step": 18150 + }, + { + "epoch": 0.5047686072754629, + "grad_norm": 0.18500222265720367, + "learning_rate": 0.000190152490116321, + "loss": 0.2336, + "step": 18200 + }, + { + "epoch": 0.5061553342185273, + "grad_norm": 0.14118489623069763, + "learning_rate": 0.0001900876051426658, + "loss": 0.2362, + "step": 18250 + }, + { + "epoch": 0.5075420611615918, + "grad_norm": 0.18030238151550293, + "learning_rate": 0.00019002251825261078, + "loss": 0.2363, + "step": 18300 + }, + { + "epoch": 0.5089287881046562, + "grad_norm": 0.1916930228471756, + "learning_rate": 0.00018995722959203745, + "loss": 0.2342, + "step": 18350 + }, + { + "epoch": 0.5103155150477208, + "grad_norm": 0.1503581702709198, + "learning_rate": 0.00018989173930727951, + "loss": 0.2365, + "step": 18400 + }, + { + "epoch": 0.5117022419907852, + "grad_norm": 0.14816977083683014, + "learning_rate": 0.0001898260475451225, + "loss": 0.2387, + "step": 18450 + }, + { + "epoch": 0.5130889689338497, + "grad_norm": 0.13476118445396423, + "learning_rate": 0.00018976015445280363, + "loss": 0.2343, + "step": 18500 + }, + { + "epoch": 0.5144756958769141, + "grad_norm": 0.17522576451301575, + "learning_rate": 0.00018969406017801127, + "loss": 0.2299, + "step": 18550 + }, + { + "epoch": 0.5158624228199786, + "grad_norm": 0.13437584042549133, + "learning_rate": 0.00018962776486888485, + "loss": 0.2342, + "step": 18600 + }, + { + "epoch": 0.517249149763043, + "grad_norm": 0.14156264066696167, + "learning_rate": 0.0001895612686740142, + "loss": 0.2363, + "step": 18650 + }, + { + "epoch": 0.5186358767061074, + "grad_norm": 0.11037924140691757, + "learning_rate": 0.00018949457174243954, + "loss": 0.2343, + "step": 18700 + }, + { + "epoch": 0.520022603649172, + "grad_norm": 0.1362009048461914, + "learning_rate": 0.00018942767422365094, + "loss": 0.2363, + "step": 18750 + }, + { + "epoch": 0.5214093305922364, + "grad_norm": 0.1261095106601715, + "learning_rate": 0.00018936057626758808, + "loss": 0.2341, + "step": 18800 + }, + { + "epoch": 0.5227960575353009, + "grad_norm": 0.13382628560066223, + "learning_rate": 0.00018929327802463987, + "loss": 0.2309, + "step": 18850 + }, + { + "epoch": 0.5241827844783653, + "grad_norm": 0.15190520882606506, + "learning_rate": 0.00018922577964564417, + "loss": 0.2338, + "step": 18900 + }, + { + "epoch": 0.5255695114214298, + "grad_norm": 0.13708838820457458, + "learning_rate": 0.00018915808128188734, + "loss": 0.2338, + "step": 18950 + }, + { + "epoch": 0.5269562383644942, + "grad_norm": 0.20378737151622772, + "learning_rate": 0.0001890901830851041, + "loss": 0.2341, + "step": 19000 + }, + { + "epoch": 0.5269562383644942, + "eval_loss": 0.23116359114646912, + "eval_runtime": 500.7638, + "eval_samples_per_second": 5.705, + "eval_steps_per_second": 5.705, + "step": 19000 + }, + { + "epoch": 0.5283429653075588, + "grad_norm": 0.17179715633392334, + "learning_rate": 0.00018902208520747685, + "loss": 0.2363, + "step": 19050 + }, + { + "epoch": 0.5297296922506232, + "grad_norm": 0.13991795480251312, + "learning_rate": 0.00018895378780163578, + "loss": 0.2308, + "step": 19100 + }, + { + "epoch": 0.5311164191936876, + "grad_norm": 0.11662200093269348, + "learning_rate": 0.0001888852910206581, + "loss": 0.2354, + "step": 19150 + }, + { + "epoch": 0.5325031461367521, + "grad_norm": 0.1577063351869583, + "learning_rate": 0.00018881659501806804, + "loss": 0.2331, + "step": 19200 + }, + { + "epoch": 0.5338898730798165, + "grad_norm": 0.14893421530723572, + "learning_rate": 0.0001887476999478362, + "loss": 0.2345, + "step": 19250 + }, + { + "epoch": 0.535276600022881, + "grad_norm": 0.14458926022052765, + "learning_rate": 0.00018867860596437946, + "loss": 0.2364, + "step": 19300 + }, + { + "epoch": 0.5366633269659454, + "grad_norm": 0.18197046220302582, + "learning_rate": 0.00018860931322256056, + "loss": 0.2316, + "step": 19350 + }, + { + "epoch": 0.53805005390901, + "grad_norm": 0.12696345150470734, + "learning_rate": 0.0001885398218776876, + "loss": 0.2288, + "step": 19400 + }, + { + "epoch": 0.5394367808520744, + "grad_norm": 0.14459608495235443, + "learning_rate": 0.00018847013208551393, + "loss": 0.2342, + "step": 19450 + }, + { + "epoch": 0.5408235077951389, + "grad_norm": 0.13681089878082275, + "learning_rate": 0.00018840024400223758, + "loss": 0.2341, + "step": 19500 + }, + { + "epoch": 0.5422102347382033, + "grad_norm": 0.1358567178249359, + "learning_rate": 0.00018833015778450113, + "loss": 0.239, + "step": 19550 + }, + { + "epoch": 0.5435969616812677, + "grad_norm": 0.1429983228445053, + "learning_rate": 0.0001882598735893912, + "loss": 0.234, + "step": 19600 + }, + { + "epoch": 0.5449836886243322, + "grad_norm": 0.15259206295013428, + "learning_rate": 0.00018818939157443806, + "loss": 0.2333, + "step": 19650 + }, + { + "epoch": 0.5463704155673966, + "grad_norm": 0.1499055027961731, + "learning_rate": 0.00018811871189761554, + "loss": 0.2335, + "step": 19700 + }, + { + "epoch": 0.5477571425104611, + "grad_norm": 0.15547756850719452, + "learning_rate": 0.0001880478347173403, + "loss": 0.2331, + "step": 19750 + }, + { + "epoch": 0.5491438694535256, + "grad_norm": 0.13615499436855316, + "learning_rate": 0.00018797676019247187, + "loss": 0.2327, + "step": 19800 + }, + { + "epoch": 0.5505305963965901, + "grad_norm": 0.15891136229038239, + "learning_rate": 0.00018790548848231188, + "loss": 0.2293, + "step": 19850 + }, + { + "epoch": 0.5519173233396545, + "grad_norm": 0.1028260812163353, + "learning_rate": 0.0001878340197466041, + "loss": 0.2337, + "step": 19900 + }, + { + "epoch": 0.553304050282719, + "grad_norm": 0.15393692255020142, + "learning_rate": 0.0001877623541455338, + "loss": 0.2332, + "step": 19950 + }, + { + "epoch": 0.5546907772257834, + "grad_norm": 0.11807084083557129, + "learning_rate": 0.0001876904918397275, + "loss": 0.2352, + "step": 20000 + }, + { + "epoch": 0.5546907772257834, + "eval_loss": 0.2310873419046402, + "eval_runtime": 501.0545, + "eval_samples_per_second": 5.702, + "eval_steps_per_second": 5.702, + "step": 20000 + }, + { + "epoch": 0.5560775041688478, + "grad_norm": 0.1603621393442154, + "learning_rate": 0.00018761843299025267, + "loss": 0.2347, + "step": 20050 + }, + { + "epoch": 0.5574642311119123, + "grad_norm": 0.14295394718647003, + "learning_rate": 0.00018754617775861718, + "loss": 0.2335, + "step": 20100 + }, + { + "epoch": 0.5588509580549768, + "grad_norm": 0.1290232539176941, + "learning_rate": 0.0001874737263067692, + "loss": 0.2337, + "step": 20150 + }, + { + "epoch": 0.5602376849980413, + "grad_norm": 0.16112935543060303, + "learning_rate": 0.00018740107879709655, + "loss": 0.2354, + "step": 20200 + }, + { + "epoch": 0.5616244119411057, + "grad_norm": 0.13674217462539673, + "learning_rate": 0.00018732823539242664, + "loss": 0.23, + "step": 20250 + }, + { + "epoch": 0.5630111388841702, + "grad_norm": 0.18549004197120667, + "learning_rate": 0.00018725519625602578, + "loss": 0.2353, + "step": 20300 + }, + { + "epoch": 0.5643978658272346, + "grad_norm": 0.13107050955295563, + "learning_rate": 0.0001871819615515991, + "loss": 0.2392, + "step": 20350 + }, + { + "epoch": 0.5657845927702991, + "grad_norm": 0.13590605556964874, + "learning_rate": 0.00018710853144329002, + "loss": 0.2347, + "step": 20400 + }, + { + "epoch": 0.5671713197133635, + "grad_norm": 0.13591018319129944, + "learning_rate": 0.0001870349060956799, + "loss": 0.229, + "step": 20450 + }, + { + "epoch": 0.568558046656428, + "grad_norm": 0.11401943862438202, + "learning_rate": 0.00018696108567378773, + "loss": 0.2326, + "step": 20500 + }, + { + "epoch": 0.5699447735994925, + "grad_norm": 0.18518146872520447, + "learning_rate": 0.00018688707034306978, + "loss": 0.2351, + "step": 20550 + }, + { + "epoch": 0.5713315005425569, + "grad_norm": 0.1642865538597107, + "learning_rate": 0.00018681286026941905, + "loss": 0.2384, + "step": 20600 + }, + { + "epoch": 0.5727182274856214, + "grad_norm": 0.133639395236969, + "learning_rate": 0.00018673845561916513, + "loss": 0.2324, + "step": 20650 + }, + { + "epoch": 0.5741049544286858, + "grad_norm": 0.120590940117836, + "learning_rate": 0.00018666385655907367, + "loss": 0.2315, + "step": 20700 + }, + { + "epoch": 0.5754916813717503, + "grad_norm": 0.15754735469818115, + "learning_rate": 0.00018658906325634604, + "loss": 0.2388, + "step": 20750 + }, + { + "epoch": 0.5768784083148147, + "grad_norm": 0.15975181758403778, + "learning_rate": 0.00018651407587861905, + "loss": 0.2376, + "step": 20800 + }, + { + "epoch": 0.5782651352578793, + "grad_norm": 0.13276700675487518, + "learning_rate": 0.0001864388945939644, + "loss": 0.2379, + "step": 20850 + }, + { + "epoch": 0.5796518622009437, + "grad_norm": 0.16388626396656036, + "learning_rate": 0.0001863635195708885, + "loss": 0.2332, + "step": 20900 + }, + { + "epoch": 0.5810385891440081, + "grad_norm": 0.18847975134849548, + "learning_rate": 0.0001862879509783319, + "loss": 0.2381, + "step": 20950 + }, + { + "epoch": 0.5824253160870726, + "grad_norm": 0.24493199586868286, + "learning_rate": 0.00018621218898566907, + "loss": 0.2328, + "step": 21000 + }, + { + "epoch": 0.5824253160870726, + "eval_loss": 0.23020677268505096, + "eval_runtime": 499.9502, + "eval_samples_per_second": 5.715, + "eval_steps_per_second": 5.715, + "step": 21000 + }, + { + "epoch": 0.583812043030137, + "grad_norm": 0.16316668689250946, + "learning_rate": 0.00018613623376270794, + "loss": 0.2429, + "step": 21050 + }, + { + "epoch": 0.5851987699732015, + "grad_norm": 0.13449080288410187, + "learning_rate": 0.0001860600854796895, + "loss": 0.2298, + "step": 21100 + }, + { + "epoch": 0.5865854969162659, + "grad_norm": 0.11589767783880234, + "learning_rate": 0.00018598374430728746, + "loss": 0.2344, + "step": 21150 + }, + { + "epoch": 0.5879722238593305, + "grad_norm": 0.11659828573465347, + "learning_rate": 0.0001859072104166079, + "loss": 0.2333, + "step": 21200 + }, + { + "epoch": 0.5893589508023949, + "grad_norm": 0.155133455991745, + "learning_rate": 0.00018583048397918884, + "loss": 0.2362, + "step": 21250 + }, + { + "epoch": 0.5907456777454594, + "grad_norm": 0.16488181054592133, + "learning_rate": 0.00018575356516699977, + "loss": 0.2334, + "step": 21300 + }, + { + "epoch": 0.5921324046885238, + "grad_norm": 0.18307441473007202, + "learning_rate": 0.0001856764541524415, + "loss": 0.2272, + "step": 21350 + }, + { + "epoch": 0.5935191316315882, + "grad_norm": 0.1316101998090744, + "learning_rate": 0.00018559915110834553, + "loss": 0.2342, + "step": 21400 + }, + { + "epoch": 0.5949058585746527, + "grad_norm": 0.1548035889863968, + "learning_rate": 0.00018552165620797382, + "loss": 0.2323, + "step": 21450 + }, + { + "epoch": 0.5962925855177171, + "grad_norm": 0.13214810192584991, + "learning_rate": 0.00018544396962501828, + "loss": 0.2319, + "step": 21500 + }, + { + "epoch": 0.5976793124607817, + "grad_norm": 0.14733006060123444, + "learning_rate": 0.00018536609153360046, + "loss": 0.237, + "step": 21550 + }, + { + "epoch": 0.5990660394038461, + "grad_norm": 0.14465801417827606, + "learning_rate": 0.0001852880221082712, + "loss": 0.2318, + "step": 21600 + }, + { + "epoch": 0.6004527663469106, + "grad_norm": 0.14646270871162415, + "learning_rate": 0.00018520976152401012, + "loss": 0.2368, + "step": 21650 + }, + { + "epoch": 0.601839493289975, + "grad_norm": 0.14174975454807281, + "learning_rate": 0.00018513130995622535, + "loss": 0.2349, + "step": 21700 + }, + { + "epoch": 0.6032262202330395, + "grad_norm": 0.12805262207984924, + "learning_rate": 0.00018505266758075302, + "loss": 0.2315, + "step": 21750 + }, + { + "epoch": 0.6046129471761039, + "grad_norm": 0.1598140299320221, + "learning_rate": 0.00018497383457385697, + "loss": 0.2332, + "step": 21800 + }, + { + "epoch": 0.6059996741191683, + "grad_norm": 0.13651584088802338, + "learning_rate": 0.00018489481111222828, + "loss": 0.2348, + "step": 21850 + }, + { + "epoch": 0.6073864010622329, + "grad_norm": 0.13091818988323212, + "learning_rate": 0.0001848155973729849, + "loss": 0.2287, + "step": 21900 + }, + { + "epoch": 0.6087731280052973, + "grad_norm": 0.17191646993160248, + "learning_rate": 0.00018473619353367128, + "loss": 0.2342, + "step": 21950 + }, + { + "epoch": 0.6101598549483618, + "grad_norm": 0.10674546658992767, + "learning_rate": 0.0001846565997722579, + "loss": 0.2309, + "step": 22000 + }, + { + "epoch": 0.6101598549483618, + "eval_loss": 0.22999995946884155, + "eval_runtime": 499.7816, + "eval_samples_per_second": 5.716, + "eval_steps_per_second": 5.716, + "step": 22000 + }, + { + "epoch": 0.6115465818914262, + "grad_norm": 0.1321185827255249, + "learning_rate": 0.000184576816267141, + "loss": 0.2347, + "step": 22050 + }, + { + "epoch": 0.6129333088344907, + "grad_norm": 0.12945061922073364, + "learning_rate": 0.00018449684319714202, + "loss": 0.2298, + "step": 22100 + }, + { + "epoch": 0.6143200357775551, + "grad_norm": 0.16403023898601532, + "learning_rate": 0.00018441668074150732, + "loss": 0.2276, + "step": 22150 + }, + { + "epoch": 0.6157067627206196, + "grad_norm": 0.14253240823745728, + "learning_rate": 0.00018433632907990775, + "loss": 0.2315, + "step": 22200 + }, + { + "epoch": 0.617093489663684, + "grad_norm": 0.1752641350030899, + "learning_rate": 0.00018425578839243814, + "loss": 0.2327, + "step": 22250 + }, + { + "epoch": 0.6184802166067485, + "grad_norm": 0.11023511737585068, + "learning_rate": 0.00018417505885961712, + "loss": 0.2341, + "step": 22300 + }, + { + "epoch": 0.619866943549813, + "grad_norm": 0.1494046449661255, + "learning_rate": 0.00018409414066238654, + "loss": 0.2307, + "step": 22350 + }, + { + "epoch": 0.6212536704928774, + "grad_norm": 0.13288947939872742, + "learning_rate": 0.00018401303398211103, + "loss": 0.2307, + "step": 22400 + }, + { + "epoch": 0.6226403974359419, + "grad_norm": 0.13972090184688568, + "learning_rate": 0.0001839317390005778, + "loss": 0.231, + "step": 22450 + }, + { + "epoch": 0.6240271243790063, + "grad_norm": 0.16141022741794586, + "learning_rate": 0.000183850255899996, + "loss": 0.2395, + "step": 22500 + }, + { + "epoch": 0.6254138513220708, + "grad_norm": 0.17160941660404205, + "learning_rate": 0.00018376858486299647, + "loss": 0.2371, + "step": 22550 + }, + { + "epoch": 0.6268005782651352, + "grad_norm": 0.13852784037590027, + "learning_rate": 0.00018368672607263132, + "loss": 0.2286, + "step": 22600 + }, + { + "epoch": 0.6281873052081998, + "grad_norm": 0.16050252318382263, + "learning_rate": 0.00018360467971237338, + "loss": 0.2345, + "step": 22650 + }, + { + "epoch": 0.6295740321512642, + "grad_norm": 0.12499688565731049, + "learning_rate": 0.0001835224459661159, + "loss": 0.232, + "step": 22700 + }, + { + "epoch": 0.6309607590943286, + "grad_norm": 0.16804257035255432, + "learning_rate": 0.00018344002501817226, + "loss": 0.2336, + "step": 22750 + }, + { + "epoch": 0.6323474860373931, + "grad_norm": 0.15330076217651367, + "learning_rate": 0.00018335741705327526, + "loss": 0.2314, + "step": 22800 + }, + { + "epoch": 0.6337342129804575, + "grad_norm": 0.12613581120967865, + "learning_rate": 0.00018327462225657692, + "loss": 0.235, + "step": 22850 + }, + { + "epoch": 0.635120939923522, + "grad_norm": 0.16671714186668396, + "learning_rate": 0.00018319164081364802, + "loss": 0.2319, + "step": 22900 + }, + { + "epoch": 0.6365076668665864, + "grad_norm": 0.11536330729722977, + "learning_rate": 0.00018310847291047776, + "loss": 0.2296, + "step": 22950 + }, + { + "epoch": 0.637894393809651, + "grad_norm": 0.1565777063369751, + "learning_rate": 0.00018302511873347305, + "loss": 0.23, + "step": 23000 + }, + { + "epoch": 0.637894393809651, + "eval_loss": 0.22944478690624237, + "eval_runtime": 500.3715, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 23000 + }, + { + "epoch": 0.6392811207527154, + "grad_norm": 0.18740278482437134, + "learning_rate": 0.00018294157846945853, + "loss": 0.2315, + "step": 23050 + }, + { + "epoch": 0.6406678476957799, + "grad_norm": 0.14261969923973083, + "learning_rate": 0.00018285785230567577, + "loss": 0.2291, + "step": 23100 + }, + { + "epoch": 0.6420545746388443, + "grad_norm": 0.16137824952602386, + "learning_rate": 0.00018277394042978307, + "loss": 0.2325, + "step": 23150 + }, + { + "epoch": 0.6434413015819087, + "grad_norm": 0.1337035894393921, + "learning_rate": 0.00018268984302985495, + "loss": 0.2322, + "step": 23200 + }, + { + "epoch": 0.6448280285249732, + "grad_norm": 0.11618442833423615, + "learning_rate": 0.0001826055602943818, + "loss": 0.2349, + "step": 23250 + }, + { + "epoch": 0.6462147554680376, + "grad_norm": 0.12656192481517792, + "learning_rate": 0.0001825210924122693, + "loss": 0.234, + "step": 23300 + }, + { + "epoch": 0.6476014824111022, + "grad_norm": 0.11272765696048737, + "learning_rate": 0.0001824364395728382, + "loss": 0.2313, + "step": 23350 + }, + { + "epoch": 0.6489882093541666, + "grad_norm": 0.13132552802562714, + "learning_rate": 0.00018235160196582384, + "loss": 0.2289, + "step": 23400 + }, + { + "epoch": 0.6503749362972311, + "grad_norm": 0.11405663937330246, + "learning_rate": 0.00018226657978137554, + "loss": 0.2356, + "step": 23450 + }, + { + "epoch": 0.6517616632402955, + "grad_norm": 0.15040431916713715, + "learning_rate": 0.00018218137321005643, + "loss": 0.2303, + "step": 23500 + }, + { + "epoch": 0.65314839018336, + "grad_norm": 0.13074640929698944, + "learning_rate": 0.00018209598244284288, + "loss": 0.2319, + "step": 23550 + }, + { + "epoch": 0.6545351171264244, + "grad_norm": 0.14512640237808228, + "learning_rate": 0.00018201040767112413, + "loss": 0.2393, + "step": 23600 + }, + { + "epoch": 0.6559218440694888, + "grad_norm": 0.10800650715827942, + "learning_rate": 0.00018192464908670176, + "loss": 0.2318, + "step": 23650 + }, + { + "epoch": 0.6573085710125534, + "grad_norm": 0.12321613729000092, + "learning_rate": 0.00018183870688178946, + "loss": 0.2331, + "step": 23700 + }, + { + "epoch": 0.6586952979556178, + "grad_norm": 0.1868344396352768, + "learning_rate": 0.00018175258124901236, + "loss": 0.2317, + "step": 23750 + }, + { + "epoch": 0.6600820248986823, + "grad_norm": 0.11993540078401566, + "learning_rate": 0.00018166627238140674, + "loss": 0.2309, + "step": 23800 + }, + { + "epoch": 0.6614687518417467, + "grad_norm": 0.11594246327877045, + "learning_rate": 0.00018157978047241962, + "loss": 0.2322, + "step": 23850 + }, + { + "epoch": 0.6628554787848112, + "grad_norm": 0.18056848645210266, + "learning_rate": 0.00018149310571590824, + "loss": 0.2335, + "step": 23900 + }, + { + "epoch": 0.6642422057278756, + "grad_norm": 0.14387637376785278, + "learning_rate": 0.00018140624830613965, + "loss": 0.2366, + "step": 23950 + }, + { + "epoch": 0.6656289326709401, + "grad_norm": 0.16983430087566376, + "learning_rate": 0.00018131920843779035, + "loss": 0.2361, + "step": 24000 + }, + { + "epoch": 0.6656289326709401, + "eval_loss": 0.22958332300186157, + "eval_runtime": 500.0504, + "eval_samples_per_second": 5.713, + "eval_steps_per_second": 5.713, + "step": 24000 + }, + { + "epoch": 0.6670156596140046, + "grad_norm": 0.13279864192008972, + "learning_rate": 0.0001812319863059457, + "loss": 0.2359, + "step": 24050 + }, + { + "epoch": 0.668402386557069, + "grad_norm": 0.11594101786613464, + "learning_rate": 0.00018114458210609962, + "loss": 0.2358, + "step": 24100 + }, + { + "epoch": 0.6697891135001335, + "grad_norm": 0.13613513112068176, + "learning_rate": 0.0001810569960341541, + "loss": 0.2278, + "step": 24150 + }, + { + "epoch": 0.6711758404431979, + "grad_norm": 0.12295212596654892, + "learning_rate": 0.00018096922828641878, + "loss": 0.2315, + "step": 24200 + }, + { + "epoch": 0.6725625673862624, + "grad_norm": 0.17889654636383057, + "learning_rate": 0.00018088127905961047, + "loss": 0.2305, + "step": 24250 + }, + { + "epoch": 0.6739492943293268, + "grad_norm": 0.16525234282016754, + "learning_rate": 0.0001807931485508528, + "loss": 0.2304, + "step": 24300 + }, + { + "epoch": 0.6753360212723913, + "grad_norm": 0.11446121335029602, + "learning_rate": 0.0001807048369576756, + "loss": 0.2333, + "step": 24350 + }, + { + "epoch": 0.6767227482154557, + "grad_norm": 0.14533396065235138, + "learning_rate": 0.00018061634447801467, + "loss": 0.2354, + "step": 24400 + }, + { + "epoch": 0.6781094751585203, + "grad_norm": 0.14825408160686493, + "learning_rate": 0.0001805276713102112, + "loss": 0.2316, + "step": 24450 + }, + { + "epoch": 0.6794962021015847, + "grad_norm": 0.148117333650589, + "learning_rate": 0.00018043881765301135, + "loss": 0.2338, + "step": 24500 + }, + { + "epoch": 0.6808829290446491, + "grad_norm": 0.10264230519533157, + "learning_rate": 0.00018034978370556583, + "loss": 0.2298, + "step": 24550 + }, + { + "epoch": 0.6822696559877136, + "grad_norm": 0.12200962007045746, + "learning_rate": 0.00018026056966742945, + "loss": 0.2284, + "step": 24600 + }, + { + "epoch": 0.683656382930778, + "grad_norm": 0.14096751809120178, + "learning_rate": 0.00018017117573856063, + "loss": 0.2333, + "step": 24650 + }, + { + "epoch": 0.6850431098738425, + "grad_norm": 0.16554249823093414, + "learning_rate": 0.00018008160211932108, + "loss": 0.2316, + "step": 24700 + }, + { + "epoch": 0.686429836816907, + "grad_norm": 0.11679153889417648, + "learning_rate": 0.0001799918490104751, + "loss": 0.2287, + "step": 24750 + }, + { + "epoch": 0.6878165637599715, + "grad_norm": 0.1387365758419037, + "learning_rate": 0.00017990191661318943, + "loss": 0.2356, + "step": 24800 + }, + { + "epoch": 0.6892032907030359, + "grad_norm": 0.1255553960800171, + "learning_rate": 0.00017981180512903255, + "loss": 0.2342, + "step": 24850 + }, + { + "epoch": 0.6905900176461004, + "grad_norm": 0.17247521877288818, + "learning_rate": 0.00017972151475997443, + "loss": 0.2303, + "step": 24900 + }, + { + "epoch": 0.6919767445891648, + "grad_norm": 0.20023292303085327, + "learning_rate": 0.0001796310457083859, + "loss": 0.2346, + "step": 24950 + }, + { + "epoch": 0.6933634715322292, + "grad_norm": 0.11909276992082596, + "learning_rate": 0.0001795403981770383, + "loss": 0.2264, + "step": 25000 + }, + { + "epoch": 0.6933634715322292, + "eval_loss": 0.2287738025188446, + "eval_runtime": 500.5021, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 25000 + }, + { + "epoch": 0.6947501984752937, + "grad_norm": 0.13509905338287354, + "learning_rate": 0.00017944957236910308, + "loss": 0.2318, + "step": 25050 + }, + { + "epoch": 0.6961369254183581, + "grad_norm": 0.15455523133277893, + "learning_rate": 0.0001793585684881511, + "loss": 0.2325, + "step": 25100 + }, + { + "epoch": 0.6975236523614227, + "grad_norm": 0.1231105625629425, + "learning_rate": 0.00017926738673815248, + "loss": 0.2303, + "step": 25150 + }, + { + "epoch": 0.6989103793044871, + "grad_norm": 0.19073975086212158, + "learning_rate": 0.00017917602732347597, + "loss": 0.2309, + "step": 25200 + }, + { + "epoch": 0.7002971062475516, + "grad_norm": 0.16656789183616638, + "learning_rate": 0.00017908449044888854, + "loss": 0.2334, + "step": 25250 + }, + { + "epoch": 0.701683833190616, + "grad_norm": 0.12732850015163422, + "learning_rate": 0.00017899277631955486, + "loss": 0.2348, + "step": 25300 + }, + { + "epoch": 0.7030705601336805, + "grad_norm": 0.20655155181884766, + "learning_rate": 0.00017890088514103692, + "loss": 0.2355, + "step": 25350 + }, + { + "epoch": 0.7044572870767449, + "grad_norm": 0.10959596931934357, + "learning_rate": 0.00017880881711929353, + "loss": 0.2304, + "step": 25400 + }, + { + "epoch": 0.7058440140198093, + "grad_norm": 0.15412519872188568, + "learning_rate": 0.00017871657246067987, + "loss": 0.2336, + "step": 25450 + }, + { + "epoch": 0.7072307409628739, + "grad_norm": 0.16455277800559998, + "learning_rate": 0.00017862415137194702, + "loss": 0.2319, + "step": 25500 + }, + { + "epoch": 0.7086174679059383, + "grad_norm": 0.1389029622077942, + "learning_rate": 0.00017853340773211896, + "loss": 0.2294, + "step": 25550 + }, + { + "epoch": 0.7100041948490028, + "grad_norm": 0.14564301073551178, + "learning_rate": 0.0001784424950430794, + "loss": 0.2326, + "step": 25600 + }, + { + "epoch": 0.7113909217920672, + "grad_norm": 0.1606937199831009, + "learning_rate": 0.00017834955293674994, + "loss": 0.23, + "step": 25650 + }, + { + "epoch": 0.7127776487351317, + "grad_norm": 0.13401974737644196, + "learning_rate": 0.00017825643522291457, + "loss": 0.2361, + "step": 25700 + }, + { + "epoch": 0.7141643756781961, + "grad_norm": 0.12457278370857239, + "learning_rate": 0.0001781631421102812, + "loss": 0.232, + "step": 25750 + }, + { + "epoch": 0.7155511026212606, + "grad_norm": 0.13395826518535614, + "learning_rate": 0.0001780696738079508, + "loss": 0.2294, + "step": 25800 + }, + { + "epoch": 0.7169378295643251, + "grad_norm": 0.13083291053771973, + "learning_rate": 0.00017797603052541704, + "loss": 0.2328, + "step": 25850 + }, + { + "epoch": 0.7183245565073895, + "grad_norm": 0.14696165919303894, + "learning_rate": 0.00017788221247256583, + "loss": 0.233, + "step": 25900 + }, + { + "epoch": 0.719711283450454, + "grad_norm": 0.1512746810913086, + "learning_rate": 0.00017778821985967467, + "loss": 0.2319, + "step": 25950 + }, + { + "epoch": 0.7210980103935184, + "grad_norm": 0.1260426789522171, + "learning_rate": 0.00017769405289741247, + "loss": 0.2341, + "step": 26000 + }, + { + "epoch": 0.7210980103935184, + "eval_loss": 0.22873948514461517, + "eval_runtime": 500.274, + "eval_samples_per_second": 5.711, + "eval_steps_per_second": 5.711, + "step": 26000 + }, + { + "epoch": 0.7224847373365829, + "grad_norm": 0.1653342843055725, + "learning_rate": 0.00017759971179683875, + "loss": 0.2316, + "step": 26050 + }, + { + "epoch": 0.7238714642796473, + "grad_norm": 0.13507039844989777, + "learning_rate": 0.00017750519676940348, + "loss": 0.2357, + "step": 26100 + }, + { + "epoch": 0.7252581912227118, + "grad_norm": 0.128819540143013, + "learning_rate": 0.00017741050802694635, + "loss": 0.231, + "step": 26150 + }, + { + "epoch": 0.7266449181657763, + "grad_norm": 0.13130728900432587, + "learning_rate": 0.00017731564578169647, + "loss": 0.2305, + "step": 26200 + }, + { + "epoch": 0.7280316451088408, + "grad_norm": 0.12267379462718964, + "learning_rate": 0.0001772206102462718, + "loss": 0.2345, + "step": 26250 + }, + { + "epoch": 0.7294183720519052, + "grad_norm": 0.14595343172550201, + "learning_rate": 0.0001771254016336787, + "loss": 0.2294, + "step": 26300 + }, + { + "epoch": 0.7308050989949696, + "grad_norm": 0.13935647904872894, + "learning_rate": 0.0001770300201573114, + "loss": 0.2358, + "step": 26350 + }, + { + "epoch": 0.7321918259380341, + "grad_norm": 0.11328408867120743, + "learning_rate": 0.00017693446603095174, + "loss": 0.2339, + "step": 26400 + }, + { + "epoch": 0.7335785528810985, + "grad_norm": 0.19857367873191833, + "learning_rate": 0.00017683873946876835, + "loss": 0.2269, + "step": 26450 + }, + { + "epoch": 0.734965279824163, + "grad_norm": 0.16225670278072357, + "learning_rate": 0.00017674284068531641, + "loss": 0.2307, + "step": 26500 + }, + { + "epoch": 0.7363520067672275, + "grad_norm": 0.1412588506937027, + "learning_rate": 0.00017664676989553714, + "loss": 0.229, + "step": 26550 + }, + { + "epoch": 0.737738733710292, + "grad_norm": 0.14530161023139954, + "learning_rate": 0.00017655052731475724, + "loss": 0.2308, + "step": 26600 + }, + { + "epoch": 0.7391254606533564, + "grad_norm": 0.12190265953540802, + "learning_rate": 0.0001764541131586885, + "loss": 0.2294, + "step": 26650 + }, + { + "epoch": 0.7405121875964209, + "grad_norm": 0.13169080018997192, + "learning_rate": 0.00017635752764342717, + "loss": 0.2275, + "step": 26700 + }, + { + "epoch": 0.7418989145394853, + "grad_norm": 0.12346599251031876, + "learning_rate": 0.00017626077098545367, + "loss": 0.2326, + "step": 26750 + }, + { + "epoch": 0.7432856414825497, + "grad_norm": 0.12645727396011353, + "learning_rate": 0.00017616384340163197, + "loss": 0.2369, + "step": 26800 + }, + { + "epoch": 0.7446723684256142, + "grad_norm": 0.12523086369037628, + "learning_rate": 0.00017606674510920915, + "loss": 0.2291, + "step": 26850 + }, + { + "epoch": 0.7460590953686786, + "grad_norm": 0.14181695878505707, + "learning_rate": 0.0001759694763258149, + "loss": 0.2266, + "step": 26900 + }, + { + "epoch": 0.7474458223117432, + "grad_norm": 0.13824765384197235, + "learning_rate": 0.00017587203726946102, + "loss": 0.2281, + "step": 26950 + }, + { + "epoch": 0.7488325492548076, + "grad_norm": 0.1162494495511055, + "learning_rate": 0.000175774428158541, + "loss": 0.2326, + "step": 27000 + }, + { + "epoch": 0.7488325492548076, + "eval_loss": 0.22845527529716492, + "eval_runtime": 500.3687, + "eval_samples_per_second": 5.71, + "eval_steps_per_second": 5.71, + "step": 27000 + }, + { + "epoch": 0.7502192761978721, + "grad_norm": 0.1494184285402298, + "learning_rate": 0.0001756766492118294, + "loss": 0.2335, + "step": 27050 + }, + { + "epoch": 0.7516060031409365, + "grad_norm": 0.14270345866680145, + "learning_rate": 0.00017557870064848153, + "loss": 0.2378, + "step": 27100 + }, + { + "epoch": 0.752992730084001, + "grad_norm": 0.17542113363742828, + "learning_rate": 0.0001754805826880328, + "loss": 0.2344, + "step": 27150 + }, + { + "epoch": 0.7543794570270654, + "grad_norm": 0.14542442560195923, + "learning_rate": 0.0001753822955503983, + "loss": 0.2413, + "step": 27200 + }, + { + "epoch": 0.75576618397013, + "grad_norm": 0.13541916012763977, + "learning_rate": 0.00017528383945587236, + "loss": 0.2331, + "step": 27250 + }, + { + "epoch": 0.7571529109131944, + "grad_norm": 0.1555178165435791, + "learning_rate": 0.00017518521462512796, + "loss": 0.2314, + "step": 27300 + }, + { + "epoch": 0.7585396378562588, + "grad_norm": 0.10956469923257828, + "learning_rate": 0.0001750864212792162, + "loss": 0.2312, + "step": 27350 + }, + { + "epoch": 0.7599263647993233, + "grad_norm": 0.15572619438171387, + "learning_rate": 0.00017498745963956603, + "loss": 0.2334, + "step": 27400 + }, + { + "epoch": 0.7613130917423877, + "grad_norm": 0.1467774659395218, + "learning_rate": 0.0001748883299279835, + "loss": 0.231, + "step": 27450 + }, + { + "epoch": 0.7626998186854522, + "grad_norm": 0.12245896458625793, + "learning_rate": 0.00017478903236665136, + "loss": 0.2374, + "step": 27500 + }, + { + "epoch": 0.7640865456285166, + "grad_norm": 0.10392642766237259, + "learning_rate": 0.00017468956717812864, + "loss": 0.2313, + "step": 27550 + }, + { + "epoch": 0.7654732725715812, + "grad_norm": 0.1239921823143959, + "learning_rate": 0.00017458993458534998, + "loss": 0.2349, + "step": 27600 + }, + { + "epoch": 0.7668599995146456, + "grad_norm": 0.13776883482933044, + "learning_rate": 0.00017449013481162534, + "loss": 0.2362, + "step": 27650 + }, + { + "epoch": 0.7682467264577101, + "grad_norm": 0.1389874666929245, + "learning_rate": 0.00017439016808063932, + "loss": 0.2304, + "step": 27700 + }, + { + "epoch": 0.7696334534007745, + "grad_norm": 0.11973544955253601, + "learning_rate": 0.00017429003461645072, + "loss": 0.2352, + "step": 27750 + }, + { + "epoch": 0.7710201803438389, + "grad_norm": 0.13108691573143005, + "learning_rate": 0.00017418973464349209, + "loss": 0.2311, + "step": 27800 + }, + { + "epoch": 0.7724069072869034, + "grad_norm": 0.12594327330589294, + "learning_rate": 0.00017408926838656912, + "loss": 0.2332, + "step": 27850 + }, + { + "epoch": 0.7737936342299678, + "grad_norm": 0.14845065772533417, + "learning_rate": 0.00017398863607086024, + "loss": 0.2307, + "step": 27900 + }, + { + "epoch": 0.7751803611730324, + "grad_norm": 0.11298257112503052, + "learning_rate": 0.0001738878379219161, + "loss": 0.2331, + "step": 27950 + }, + { + "epoch": 0.7765670881160968, + "grad_norm": 0.11864858120679855, + "learning_rate": 0.000173786874165659, + "loss": 0.231, + "step": 28000 + }, + { + "epoch": 0.7765670881160968, + "eval_loss": 0.22779151797294617, + "eval_runtime": 501.235, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 28000 + }, + { + "epoch": 0.7779538150591613, + "grad_norm": 0.11632022261619568, + "learning_rate": 0.00017368574502838239, + "loss": 0.229, + "step": 28050 + }, + { + "epoch": 0.7793405420022257, + "grad_norm": 0.1431494504213333, + "learning_rate": 0.00017358445073675042, + "loss": 0.2318, + "step": 28100 + }, + { + "epoch": 0.7807272689452902, + "grad_norm": 0.12157493084669113, + "learning_rate": 0.00017348299151779748, + "loss": 0.2343, + "step": 28150 + }, + { + "epoch": 0.7821139958883546, + "grad_norm": 0.11989067494869232, + "learning_rate": 0.00017338136759892752, + "loss": 0.2347, + "step": 28200 + }, + { + "epoch": 0.783500722831419, + "grad_norm": 0.12739787995815277, + "learning_rate": 0.00017327957920791365, + "loss": 0.2328, + "step": 28250 + }, + { + "epoch": 0.7848874497744835, + "grad_norm": 0.15567833185195923, + "learning_rate": 0.00017317762657289768, + "loss": 0.2297, + "step": 28300 + }, + { + "epoch": 0.786274176717548, + "grad_norm": 0.12073542922735214, + "learning_rate": 0.00017307550992238943, + "loss": 0.2296, + "step": 28350 + }, + { + "epoch": 0.7876609036606125, + "grad_norm": 0.1477758288383484, + "learning_rate": 0.0001729732294852665, + "loss": 0.2328, + "step": 28400 + }, + { + "epoch": 0.7890476306036769, + "grad_norm": 0.1612139195203781, + "learning_rate": 0.00017287078549077343, + "loss": 0.2314, + "step": 28450 + }, + { + "epoch": 0.7904343575467414, + "grad_norm": 0.15718688070774078, + "learning_rate": 0.00017276817816852145, + "loss": 0.2289, + "step": 28500 + }, + { + "epoch": 0.7918210844898058, + "grad_norm": 0.1242058202624321, + "learning_rate": 0.0001726654077484878, + "loss": 0.2301, + "step": 28550 + }, + { + "epoch": 0.7932078114328703, + "grad_norm": 0.13269132375717163, + "learning_rate": 0.0001725624744610153, + "loss": 0.2303, + "step": 28600 + }, + { + "epoch": 0.7945945383759347, + "grad_norm": 0.12394677847623825, + "learning_rate": 0.0001724593785368118, + "loss": 0.2362, + "step": 28650 + }, + { + "epoch": 0.7959812653189992, + "grad_norm": 0.1323787420988083, + "learning_rate": 0.00017235612020694978, + "loss": 0.2281, + "step": 28700 + }, + { + "epoch": 0.7973679922620637, + "grad_norm": 0.1532479077577591, + "learning_rate": 0.00017225269970286552, + "loss": 0.2321, + "step": 28750 + }, + { + "epoch": 0.7987547192051281, + "grad_norm": 0.14882826805114746, + "learning_rate": 0.00017214911725635897, + "loss": 0.2316, + "step": 28800 + }, + { + "epoch": 0.8001414461481926, + "grad_norm": 0.11855613440275192, + "learning_rate": 0.00017204537309959292, + "loss": 0.2271, + "step": 28850 + }, + { + "epoch": 0.801528173091257, + "grad_norm": 0.15302914381027222, + "learning_rate": 0.00017194146746509268, + "loss": 0.2296, + "step": 28900 + }, + { + "epoch": 0.8029149000343215, + "grad_norm": 0.11822402477264404, + "learning_rate": 0.00017183740058574547, + "loss": 0.2301, + "step": 28950 + }, + { + "epoch": 0.8043016269773859, + "grad_norm": 0.1369016021490097, + "learning_rate": 0.00017173317269479992, + "loss": 0.2291, + "step": 29000 + }, + { + "epoch": 0.8043016269773859, + "eval_loss": 0.2273886650800705, + "eval_runtime": 501.6607, + "eval_samples_per_second": 5.695, + "eval_steps_per_second": 5.695, + "step": 29000 + }, + { + "epoch": 0.8056883539204505, + "grad_norm": 0.12872962653636932, + "learning_rate": 0.00017162878402586553, + "loss": 0.2344, + "step": 29050 + }, + { + "epoch": 0.8070750808635149, + "grad_norm": 0.13491351902484894, + "learning_rate": 0.00017152423481291216, + "loss": 0.2357, + "step": 29100 + }, + { + "epoch": 0.8084618078065793, + "grad_norm": 0.12680833041667938, + "learning_rate": 0.00017141952529026945, + "loss": 0.2333, + "step": 29150 + }, + { + "epoch": 0.8098485347496438, + "grad_norm": 0.12384926527738571, + "learning_rate": 0.0001713146556926265, + "loss": 0.2421, + "step": 29200 + }, + { + "epoch": 0.8112352616927082, + "grad_norm": 0.13864979147911072, + "learning_rate": 0.00017120962625503098, + "loss": 0.2262, + "step": 29250 + }, + { + "epoch": 0.8126219886357727, + "grad_norm": 0.12703485786914825, + "learning_rate": 0.00017110443721288901, + "loss": 0.2295, + "step": 29300 + }, + { + "epoch": 0.8140087155788371, + "grad_norm": 0.12121795862913132, + "learning_rate": 0.0001709990888019643, + "loss": 0.2286, + "step": 29350 + }, + { + "epoch": 0.8153954425219017, + "grad_norm": 0.11982162296772003, + "learning_rate": 0.00017089358125837783, + "loss": 0.2286, + "step": 29400 + }, + { + "epoch": 0.8167821694649661, + "grad_norm": 0.1372060328722, + "learning_rate": 0.00017078791481860725, + "loss": 0.2244, + "step": 29450 + }, + { + "epoch": 0.8181688964080306, + "grad_norm": 0.12731321156024933, + "learning_rate": 0.0001706820897194863, + "loss": 0.2259, + "step": 29500 + }, + { + "epoch": 0.819555623351095, + "grad_norm": 0.14031195640563965, + "learning_rate": 0.00017057610619820437, + "loss": 0.2297, + "step": 29550 + }, + { + "epoch": 0.8209423502941594, + "grad_norm": 0.13404880464076996, + "learning_rate": 0.0001704699644923059, + "loss": 0.2293, + "step": 29600 + }, + { + "epoch": 0.8223290772372239, + "grad_norm": 0.12400925159454346, + "learning_rate": 0.00017036366483968987, + "loss": 0.2263, + "step": 29650 + }, + { + "epoch": 0.8237158041802883, + "grad_norm": 0.14439739286899567, + "learning_rate": 0.00017025720747860937, + "loss": 0.2272, + "step": 29700 + }, + { + "epoch": 0.8251025311233529, + "grad_norm": 0.12196583300828934, + "learning_rate": 0.00017015059264767084, + "loss": 0.2337, + "step": 29750 + }, + { + "epoch": 0.8264892580664173, + "grad_norm": 0.13919509947299957, + "learning_rate": 0.00017004382058583367, + "loss": 0.2337, + "step": 29800 + }, + { + "epoch": 0.8278759850094818, + "grad_norm": 0.11371088027954102, + "learning_rate": 0.00016993689153240978, + "loss": 0.2252, + "step": 29850 + }, + { + "epoch": 0.8292627119525462, + "grad_norm": 0.1316608041524887, + "learning_rate": 0.00016982980572706282, + "loss": 0.2281, + "step": 29900 + }, + { + "epoch": 0.8306494388956107, + "grad_norm": 0.18003039062023163, + "learning_rate": 0.00016972256340980785, + "loss": 0.2296, + "step": 29950 + }, + { + "epoch": 0.8320361658386751, + "grad_norm": 0.16534283757209778, + "learning_rate": 0.0001696151648210107, + "loss": 0.2267, + "step": 30000 + }, + { + "epoch": 0.8320361658386751, + "eval_loss": 0.22761212289333344, + "eval_runtime": 501.069, + "eval_samples_per_second": 5.702, + "eval_steps_per_second": 5.702, + "step": 30000 + }, + { + "epoch": 0.8334228927817395, + "grad_norm": 0.11093872785568237, + "learning_rate": 0.00016950761020138747, + "loss": 0.234, + "step": 30050 + }, + { + "epoch": 0.834809619724804, + "grad_norm": 0.14647316932678223, + "learning_rate": 0.00016939989979200394, + "loss": 0.232, + "step": 30100 + }, + { + "epoch": 0.8361963466678685, + "grad_norm": 0.14312680065631866, + "learning_rate": 0.00016929203383427515, + "loss": 0.2299, + "step": 30150 + }, + { + "epoch": 0.837583073610933, + "grad_norm": 0.11662258952856064, + "learning_rate": 0.00016918401256996467, + "loss": 0.2298, + "step": 30200 + }, + { + "epoch": 0.8389698005539974, + "grad_norm": 0.11783650517463684, + "learning_rate": 0.0001690758362411843, + "loss": 0.2345, + "step": 30250 + }, + { + "epoch": 0.8403565274970619, + "grad_norm": 0.12562035024166107, + "learning_rate": 0.0001689675050903932, + "loss": 0.2341, + "step": 30300 + }, + { + "epoch": 0.8417432544401263, + "grad_norm": 0.1082848459482193, + "learning_rate": 0.00016885901936039774, + "loss": 0.2298, + "step": 30350 + }, + { + "epoch": 0.8431299813831908, + "grad_norm": 0.14080305397510529, + "learning_rate": 0.0001687503792943506, + "loss": 0.2364, + "step": 30400 + }, + { + "epoch": 0.8445167083262552, + "grad_norm": 0.133138969540596, + "learning_rate": 0.00016864158513575048, + "loss": 0.2293, + "step": 30450 + }, + { + "epoch": 0.8459034352693197, + "grad_norm": 0.13258026540279388, + "learning_rate": 0.00016853263712844136, + "loss": 0.2269, + "step": 30500 + }, + { + "epoch": 0.8472901622123842, + "grad_norm": 0.12311206012964249, + "learning_rate": 0.00016842353551661216, + "loss": 0.2297, + "step": 30550 + }, + { + "epoch": 0.8486768891554486, + "grad_norm": 0.12220294028520584, + "learning_rate": 0.00016831428054479597, + "loss": 0.2301, + "step": 30600 + }, + { + "epoch": 0.8500636160985131, + "grad_norm": 0.112845279276371, + "learning_rate": 0.00016820487245786968, + "loss": 0.2295, + "step": 30650 + }, + { + "epoch": 0.8514503430415775, + "grad_norm": 0.17439040541648865, + "learning_rate": 0.0001680953115010533, + "loss": 0.2299, + "step": 30700 + }, + { + "epoch": 0.852837069984642, + "grad_norm": 0.14124707877635956, + "learning_rate": 0.0001679855979199096, + "loss": 0.228, + "step": 30750 + }, + { + "epoch": 0.8542237969277064, + "grad_norm": 0.12298920005559921, + "learning_rate": 0.00016787573196034328, + "loss": 0.2293, + "step": 30800 + }, + { + "epoch": 0.855610523870771, + "grad_norm": 0.15425720810890198, + "learning_rate": 0.0001677657138686006, + "loss": 0.2263, + "step": 30850 + }, + { + "epoch": 0.8569972508138354, + "grad_norm": 0.13903729617595673, + "learning_rate": 0.0001676555438912689, + "loss": 0.2315, + "step": 30900 + }, + { + "epoch": 0.8583839777568998, + "grad_norm": 0.1249585896730423, + "learning_rate": 0.00016754522227527589, + "loss": 0.2289, + "step": 30950 + }, + { + "epoch": 0.8597707046999643, + "grad_norm": 0.13223236799240112, + "learning_rate": 0.00016743474926788908, + "loss": 0.2303, + "step": 31000 + }, + { + "epoch": 0.8597707046999643, + "eval_loss": 0.22721892595291138, + "eval_runtime": 500.5938, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 31000 + }, + { + "epoch": 0.8611574316430287, + "grad_norm": 0.15615518391132355, + "learning_rate": 0.00016732412511671544, + "loss": 0.2306, + "step": 31050 + }, + { + "epoch": 0.8625441585860932, + "grad_norm": 0.14526858925819397, + "learning_rate": 0.0001672133500697005, + "loss": 0.2307, + "step": 31100 + }, + { + "epoch": 0.8639308855291576, + "grad_norm": 0.11307808756828308, + "learning_rate": 0.00016710242437512825, + "loss": 0.237, + "step": 31150 + }, + { + "epoch": 0.8653176124722222, + "grad_norm": 0.1289224922657013, + "learning_rate": 0.00016699134828162017, + "loss": 0.2344, + "step": 31200 + }, + { + "epoch": 0.8667043394152866, + "grad_norm": 0.1631319522857666, + "learning_rate": 0.00016688012203813486, + "loss": 0.2305, + "step": 31250 + }, + { + "epoch": 0.8680910663583511, + "grad_norm": 0.1249733492732048, + "learning_rate": 0.00016676874589396744, + "loss": 0.2301, + "step": 31300 + }, + { + "epoch": 0.8694777933014155, + "grad_norm": 0.11502408981323242, + "learning_rate": 0.00016665722009874905, + "loss": 0.2319, + "step": 31350 + }, + { + "epoch": 0.8708645202444799, + "grad_norm": 0.13455846905708313, + "learning_rate": 0.00016654554490244628, + "loss": 0.228, + "step": 31400 + }, + { + "epoch": 0.8722512471875444, + "grad_norm": 0.1758633404970169, + "learning_rate": 0.00016643372055536048, + "loss": 0.2309, + "step": 31450 + }, + { + "epoch": 0.8736379741306088, + "grad_norm": 0.11880768090486526, + "learning_rate": 0.00016632174730812734, + "loss": 0.23, + "step": 31500 + }, + { + "epoch": 0.8750247010736734, + "grad_norm": 0.13718900084495544, + "learning_rate": 0.0001662096254117163, + "loss": 0.2279, + "step": 31550 + }, + { + "epoch": 0.8764114280167378, + "grad_norm": 0.1170978993177414, + "learning_rate": 0.00016609735511743, + "loss": 0.2306, + "step": 31600 + }, + { + "epoch": 0.8777981549598023, + "grad_norm": 0.15582193434238434, + "learning_rate": 0.0001659849366769036, + "loss": 0.2312, + "step": 31650 + }, + { + "epoch": 0.8791848819028667, + "grad_norm": 0.12351904064416885, + "learning_rate": 0.00016587237034210435, + "loss": 0.2292, + "step": 31700 + }, + { + "epoch": 0.8805716088459312, + "grad_norm": 0.18479709327220917, + "learning_rate": 0.000165759656365331, + "loss": 0.2274, + "step": 31750 + }, + { + "epoch": 0.8819583357889956, + "grad_norm": 0.14211027324199677, + "learning_rate": 0.00016564679499921328, + "loss": 0.2298, + "step": 31800 + }, + { + "epoch": 0.88334506273206, + "grad_norm": 0.1540357619524002, + "learning_rate": 0.00016553378649671112, + "loss": 0.2304, + "step": 31850 + }, + { + "epoch": 0.8847317896751246, + "grad_norm": 0.12503454089164734, + "learning_rate": 0.00016542063111111427, + "loss": 0.2294, + "step": 31900 + }, + { + "epoch": 0.886118516618189, + "grad_norm": 0.13658925890922546, + "learning_rate": 0.00016530732909604177, + "loss": 0.2291, + "step": 31950 + }, + { + "epoch": 0.8875052435612535, + "grad_norm": 0.15731070935726166, + "learning_rate": 0.00016519388070544128, + "loss": 0.2322, + "step": 32000 + }, + { + "epoch": 0.8875052435612535, + "eval_loss": 0.22673186659812927, + "eval_runtime": 500.5013, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 32000 + }, + { + "epoch": 0.8888919705043179, + "grad_norm": 0.11884371191263199, + "learning_rate": 0.0001650802861935885, + "loss": 0.2312, + "step": 32050 + }, + { + "epoch": 0.8902786974473824, + "grad_norm": 0.168379008769989, + "learning_rate": 0.00016496654581508663, + "loss": 0.2312, + "step": 32100 + }, + { + "epoch": 0.8916654243904468, + "grad_norm": 0.11641304939985275, + "learning_rate": 0.00016485265982486591, + "loss": 0.2271, + "step": 32150 + }, + { + "epoch": 0.8930521513335113, + "grad_norm": 0.12015505880117416, + "learning_rate": 0.00016473862847818277, + "loss": 0.2308, + "step": 32200 + }, + { + "epoch": 0.8944388782765758, + "grad_norm": 0.17053671181201935, + "learning_rate": 0.00016462445203061957, + "loss": 0.2324, + "step": 32250 + }, + { + "epoch": 0.8958256052196402, + "grad_norm": 0.12947635352611542, + "learning_rate": 0.0001645101307380839, + "loss": 0.2318, + "step": 32300 + }, + { + "epoch": 0.8972123321627047, + "grad_norm": 0.11198735982179642, + "learning_rate": 0.00016439566485680783, + "loss": 0.23, + "step": 32350 + }, + { + "epoch": 0.8985990591057691, + "grad_norm": 0.1204909086227417, + "learning_rate": 0.00016428105464334772, + "loss": 0.23, + "step": 32400 + }, + { + "epoch": 0.8999857860488336, + "grad_norm": 0.11191330850124359, + "learning_rate": 0.00016416630035458326, + "loss": 0.2295, + "step": 32450 + }, + { + "epoch": 0.901372512991898, + "grad_norm": 0.10705868154764175, + "learning_rate": 0.00016405140224771717, + "loss": 0.2246, + "step": 32500 + }, + { + "epoch": 0.9027592399349625, + "grad_norm": 0.11882634460926056, + "learning_rate": 0.0001639363605802744, + "loss": 0.2345, + "step": 32550 + }, + { + "epoch": 0.904145966878027, + "grad_norm": 0.1181696355342865, + "learning_rate": 0.0001638211756101018, + "loss": 0.2306, + "step": 32600 + }, + { + "epoch": 0.9055326938210915, + "grad_norm": 0.1270473152399063, + "learning_rate": 0.00016370584759536734, + "loss": 0.2297, + "step": 32650 + }, + { + "epoch": 0.9069194207641559, + "grad_norm": 0.11503591388463974, + "learning_rate": 0.00016359037679455955, + "loss": 0.2292, + "step": 32700 + }, + { + "epoch": 0.9083061477072203, + "grad_norm": 0.11596430093050003, + "learning_rate": 0.0001634747634664871, + "loss": 0.2324, + "step": 32750 + }, + { + "epoch": 0.9096928746502848, + "grad_norm": 0.16631336510181427, + "learning_rate": 0.00016335900787027802, + "loss": 0.23, + "step": 32800 + }, + { + "epoch": 0.9110796015933492, + "grad_norm": 0.12083205580711365, + "learning_rate": 0.0001632431102653793, + "loss": 0.2295, + "step": 32850 + }, + { + "epoch": 0.9124663285364137, + "grad_norm": 0.1268964558839798, + "learning_rate": 0.00016312707091155609, + "loss": 0.2299, + "step": 32900 + }, + { + "epoch": 0.9138530554794781, + "grad_norm": 0.1737286001443863, + "learning_rate": 0.00016301089006889137, + "loss": 0.2291, + "step": 32950 + }, + { + "epoch": 0.9152397824225427, + "grad_norm": 0.12454930692911148, + "learning_rate": 0.00016289456799778522, + "loss": 0.2289, + "step": 33000 + }, + { + "epoch": 0.9152397824225427, + "eval_loss": 0.22642949223518372, + "eval_runtime": 500.8866, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 33000 + }, + { + "epoch": 0.9166265093656071, + "grad_norm": 0.12109609693288803, + "learning_rate": 0.00016277810495895419, + "loss": 0.2289, + "step": 33050 + }, + { + "epoch": 0.9180132363086716, + "grad_norm": 0.16857489943504333, + "learning_rate": 0.00016266150121343085, + "loss": 0.2265, + "step": 33100 + }, + { + "epoch": 0.919399963251736, + "grad_norm": 0.13193485140800476, + "learning_rate": 0.00016254475702256308, + "loss": 0.2277, + "step": 33150 + }, + { + "epoch": 0.9207866901948004, + "grad_norm": 0.13189518451690674, + "learning_rate": 0.0001624278726480137, + "loss": 0.2346, + "step": 33200 + }, + { + "epoch": 0.9221734171378649, + "grad_norm": 0.16021443903446198, + "learning_rate": 0.00016231084835175948, + "loss": 0.2273, + "step": 33250 + }, + { + "epoch": 0.9235601440809293, + "grad_norm": 0.14241939783096313, + "learning_rate": 0.00016219368439609103, + "loss": 0.236, + "step": 33300 + }, + { + "epoch": 0.9249468710239939, + "grad_norm": 0.18355390429496765, + "learning_rate": 0.0001620763810436119, + "loss": 0.2281, + "step": 33350 + }, + { + "epoch": 0.9263335979670583, + "grad_norm": 0.1321648508310318, + "learning_rate": 0.0001619612887687756, + "loss": 0.241, + "step": 33400 + }, + { + "epoch": 0.9277203249101228, + "grad_norm": 0.16118654608726501, + "learning_rate": 0.00016184371018656649, + "loss": 0.233, + "step": 33450 + }, + { + "epoch": 0.9291070518531872, + "grad_norm": 0.11974034458398819, + "learning_rate": 0.00016172599299195568, + "loss": 0.219, + "step": 33500 + }, + { + "epoch": 0.9304937787962517, + "grad_norm": 0.14652998745441437, + "learning_rate": 0.00016160813744878674, + "loss": 0.2316, + "step": 33550 + }, + { + "epoch": 0.9318805057393161, + "grad_norm": 0.09738484770059586, + "learning_rate": 0.0001614901438212133, + "loss": 0.2351, + "step": 33600 + }, + { + "epoch": 0.9332672326823805, + "grad_norm": 0.15131749212741852, + "learning_rate": 0.00016137201237369846, + "loss": 0.2281, + "step": 33650 + }, + { + "epoch": 0.9346539596254451, + "grad_norm": 0.16536715626716614, + "learning_rate": 0.00016125374337101422, + "loss": 0.2317, + "step": 33700 + }, + { + "epoch": 0.9360406865685095, + "grad_norm": 0.15788187086582184, + "learning_rate": 0.0001611353370782409, + "loss": 0.2261, + "step": 33750 + }, + { + "epoch": 0.937427413511574, + "grad_norm": 0.11554282158613205, + "learning_rate": 0.00016101679376076655, + "loss": 0.2288, + "step": 33800 + }, + { + "epoch": 0.9388141404546384, + "grad_norm": 0.1376064121723175, + "learning_rate": 0.00016089811368428633, + "loss": 0.2287, + "step": 33850 + }, + { + "epoch": 0.9402008673977029, + "grad_norm": 0.1270899623632431, + "learning_rate": 0.0001607792971148019, + "loss": 0.2232, + "step": 33900 + }, + { + "epoch": 0.9415875943407673, + "grad_norm": 0.1187126636505127, + "learning_rate": 0.00016066034431862084, + "loss": 0.2321, + "step": 33950 + }, + { + "epoch": 0.9429743212838319, + "grad_norm": 0.14895334839820862, + "learning_rate": 0.00016054125556235613, + "loss": 0.2306, + "step": 34000 + }, + { + "epoch": 0.9429743212838319, + "eval_loss": 0.22613388299942017, + "eval_runtime": 500.7207, + "eval_samples_per_second": 5.706, + "eval_steps_per_second": 5.706, + "step": 34000 + }, + { + "epoch": 0.9443610482268963, + "grad_norm": 0.12219640612602234, + "learning_rate": 0.00016042203111292538, + "loss": 0.2315, + "step": 34050 + }, + { + "epoch": 0.9457477751699607, + "grad_norm": 0.1677113175392151, + "learning_rate": 0.00016030267123755038, + "loss": 0.2327, + "step": 34100 + }, + { + "epoch": 0.9471345021130252, + "grad_norm": 0.12030269205570221, + "learning_rate": 0.00016018317620375652, + "loss": 0.2282, + "step": 34150 + }, + { + "epoch": 0.9485212290560896, + "grad_norm": 0.13181360065937042, + "learning_rate": 0.00016006354627937203, + "loss": 0.2287, + "step": 34200 + }, + { + "epoch": 0.9499079559991541, + "grad_norm": 0.13087068498134613, + "learning_rate": 0.00015994378173252752, + "loss": 0.2282, + "step": 34250 + }, + { + "epoch": 0.9512946829422185, + "grad_norm": 0.14467494189739227, + "learning_rate": 0.0001598238828316553, + "loss": 0.2254, + "step": 34300 + }, + { + "epoch": 0.952681409885283, + "grad_norm": 0.14921946823596954, + "learning_rate": 0.00015970384984548885, + "loss": 0.2324, + "step": 34350 + }, + { + "epoch": 0.9540681368283475, + "grad_norm": 0.19342415034770966, + "learning_rate": 0.0001595836830430622, + "loss": 0.2342, + "step": 34400 + }, + { + "epoch": 0.955454863771412, + "grad_norm": 0.12381652742624283, + "learning_rate": 0.00015946338269370923, + "loss": 0.2262, + "step": 34450 + }, + { + "epoch": 0.9568415907144764, + "grad_norm": 0.1456434279680252, + "learning_rate": 0.00015934294906706315, + "loss": 0.2277, + "step": 34500 + }, + { + "epoch": 0.9582283176575408, + "grad_norm": 0.11485321074724197, + "learning_rate": 0.000159222382433056, + "loss": 0.2355, + "step": 34550 + }, + { + "epoch": 0.9596150446006053, + "grad_norm": 0.10027427971363068, + "learning_rate": 0.00015910168306191785, + "loss": 0.2269, + "step": 34600 + }, + { + "epoch": 0.9610017715436697, + "grad_norm": 0.16801820695400238, + "learning_rate": 0.0001589808512241763, + "loss": 0.2282, + "step": 34650 + }, + { + "epoch": 0.9623884984867342, + "grad_norm": 0.11840588599443436, + "learning_rate": 0.00015885988719065573, + "loss": 0.2304, + "step": 34700 + }, + { + "epoch": 0.9637752254297987, + "grad_norm": 0.16810324788093567, + "learning_rate": 0.00015873879123247706, + "loss": 0.231, + "step": 34750 + }, + { + "epoch": 0.9651619523728632, + "grad_norm": 0.1277480274438858, + "learning_rate": 0.0001586175636210567, + "loss": 0.2292, + "step": 34800 + }, + { + "epoch": 0.9665486793159276, + "grad_norm": 0.13225620985031128, + "learning_rate": 0.0001584962046281062, + "loss": 0.2255, + "step": 34850 + }, + { + "epoch": 0.9679354062589921, + "grad_norm": 0.14994849264621735, + "learning_rate": 0.00015837471452563159, + "loss": 0.2306, + "step": 34900 + }, + { + "epoch": 0.9693221332020565, + "grad_norm": 0.11426250636577606, + "learning_rate": 0.00015825309358593272, + "loss": 0.2311, + "step": 34950 + }, + { + "epoch": 0.9707088601451209, + "grad_norm": 0.1453811228275299, + "learning_rate": 0.00015813134208160276, + "loss": 0.2276, + "step": 35000 + }, + { + "epoch": 0.9707088601451209, + "eval_loss": 0.22605940699577332, + "eval_runtime": 500.6317, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 35000 + }, + { + "epoch": 0.9720955870881854, + "grad_norm": 0.14036044478416443, + "learning_rate": 0.0001580094602855275, + "loss": 0.2241, + "step": 35050 + }, + { + "epoch": 0.9734823140312499, + "grad_norm": 0.1456310898065567, + "learning_rate": 0.00015788744847088464, + "loss": 0.2352, + "step": 35100 + }, + { + "epoch": 0.9748690409743144, + "grad_norm": 0.1325587034225464, + "learning_rate": 0.0001577653069111435, + "loss": 0.2267, + "step": 35150 + }, + { + "epoch": 0.9762557679173788, + "grad_norm": 0.13475272059440613, + "learning_rate": 0.000157643035880064, + "loss": 0.232, + "step": 35200 + }, + { + "epoch": 0.9776424948604433, + "grad_norm": 0.13557064533233643, + "learning_rate": 0.00015752063565169645, + "loss": 0.2342, + "step": 35250 + }, + { + "epoch": 0.9790292218035077, + "grad_norm": 0.149173304438591, + "learning_rate": 0.00015739810650038054, + "loss": 0.2284, + "step": 35300 + }, + { + "epoch": 0.9804159487465722, + "grad_norm": 0.11646503955125809, + "learning_rate": 0.00015727544870074503, + "loss": 0.2259, + "step": 35350 + }, + { + "epoch": 0.9818026756896366, + "grad_norm": 0.126033216714859, + "learning_rate": 0.000157152662527707, + "loss": 0.2289, + "step": 35400 + }, + { + "epoch": 0.983189402632701, + "grad_norm": 0.17162640392780304, + "learning_rate": 0.00015702974825647123, + "loss": 0.2293, + "step": 35450 + }, + { + "epoch": 0.9845761295757656, + "grad_norm": 0.12047728151082993, + "learning_rate": 0.0001569067061625297, + "loss": 0.2265, + "step": 35500 + }, + { + "epoch": 0.98596285651883, + "grad_norm": 0.1183520033955574, + "learning_rate": 0.00015678353652166078, + "loss": 0.2272, + "step": 35550 + }, + { + "epoch": 0.9873495834618945, + "grad_norm": 0.13919849693775177, + "learning_rate": 0.00015666023960992878, + "loss": 0.2295, + "step": 35600 + }, + { + "epoch": 0.9887363104049589, + "grad_norm": 0.14626280963420868, + "learning_rate": 0.00015653681570368318, + "loss": 0.2293, + "step": 35650 + }, + { + "epoch": 0.9901230373480234, + "grad_norm": 0.11618024855852127, + "learning_rate": 0.00015641326507955823, + "loss": 0.2264, + "step": 35700 + }, + { + "epoch": 0.9915097642910878, + "grad_norm": 0.12280390411615372, + "learning_rate": 0.0001562895880144721, + "loss": 0.233, + "step": 35750 + }, + { + "epoch": 0.9928964912341524, + "grad_norm": 0.11896737664937973, + "learning_rate": 0.0001561657847856264, + "loss": 0.2276, + "step": 35800 + }, + { + "epoch": 0.9942832181772168, + "grad_norm": 0.1226055920124054, + "learning_rate": 0.0001560418556705055, + "loss": 0.2364, + "step": 35850 + }, + { + "epoch": 0.9956699451202812, + "grad_norm": 0.1566486656665802, + "learning_rate": 0.00015591780094687587, + "loss": 0.2315, + "step": 35900 + }, + { + "epoch": 0.9970566720633457, + "grad_norm": 0.12156879901885986, + "learning_rate": 0.0001557936208927856, + "loss": 0.2284, + "step": 35950 + }, + { + "epoch": 0.9984433990064101, + "grad_norm": 0.12765392661094666, + "learning_rate": 0.00015566931578656366, + "loss": 0.2319, + "step": 36000 + }, + { + "epoch": 0.9984433990064101, + "eval_loss": 0.22568126022815704, + "eval_runtime": 500.5568, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 36000 + }, + { + "epoch": 0.9998301259494746, + "grad_norm": 0.11263388395309448, + "learning_rate": 0.00015554488590681934, + "loss": 0.2249, + "step": 36050 + }, + { + "epoch": 1.0012168528925391, + "grad_norm": 0.12134028226137161, + "learning_rate": 0.00015542033153244142, + "loss": 0.2296, + "step": 36100 + }, + { + "epoch": 1.0026035798356034, + "grad_norm": 0.12478175759315491, + "learning_rate": 0.00015529565294259795, + "loss": 0.2295, + "step": 36150 + }, + { + "epoch": 1.003990306778668, + "grad_norm": 0.1091291755437851, + "learning_rate": 0.0001551708504167352, + "loss": 0.2285, + "step": 36200 + }, + { + "epoch": 1.0053770337217325, + "grad_norm": 0.11158731579780579, + "learning_rate": 0.00015504592423457733, + "loss": 0.2267, + "step": 36250 + }, + { + "epoch": 1.006763760664797, + "grad_norm": 0.17226600646972656, + "learning_rate": 0.00015492087467612562, + "loss": 0.2369, + "step": 36300 + }, + { + "epoch": 1.0081504876078613, + "grad_norm": 0.10548936575651169, + "learning_rate": 0.00015479570202165784, + "loss": 0.2257, + "step": 36350 + }, + { + "epoch": 1.0095372145509258, + "grad_norm": 0.12710842490196228, + "learning_rate": 0.0001546704065517278, + "loss": 0.2283, + "step": 36400 + }, + { + "epoch": 1.0109239414939903, + "grad_norm": 0.13734006881713867, + "learning_rate": 0.0001545449885471644, + "loss": 0.2266, + "step": 36450 + }, + { + "epoch": 1.0123106684370546, + "grad_norm": 0.14669275283813477, + "learning_rate": 0.00015441944828907124, + "loss": 0.2265, + "step": 36500 + }, + { + "epoch": 1.0136973953801192, + "grad_norm": 0.10941125452518463, + "learning_rate": 0.000154293786058826, + "loss": 0.231, + "step": 36550 + }, + { + "epoch": 1.0150841223231837, + "grad_norm": 0.12528035044670105, + "learning_rate": 0.00015416800213807972, + "loss": 0.2286, + "step": 36600 + }, + { + "epoch": 1.0164708492662482, + "grad_norm": 0.1242556944489479, + "learning_rate": 0.00015404209680875607, + "loss": 0.2277, + "step": 36650 + }, + { + "epoch": 1.0178575762093125, + "grad_norm": 0.09937360137701035, + "learning_rate": 0.000153916070353051, + "loss": 0.2247, + "step": 36700 + }, + { + "epoch": 1.019244303152377, + "grad_norm": 0.11109854280948639, + "learning_rate": 0.00015378992305343183, + "loss": 0.2248, + "step": 36750 + }, + { + "epoch": 1.0206310300954415, + "grad_norm": 0.14019356667995453, + "learning_rate": 0.00015366365519263683, + "loss": 0.2252, + "step": 36800 + }, + { + "epoch": 1.0220177570385058, + "grad_norm": 0.11496023088693619, + "learning_rate": 0.00015353979599334788, + "loss": 0.2228, + "step": 36850 + }, + { + "epoch": 1.0234044839815704, + "grad_norm": 0.15292219817638397, + "learning_rate": 0.0001534132902566159, + "loss": 0.2307, + "step": 36900 + }, + { + "epoch": 1.0247912109246349, + "grad_norm": 0.12410300970077515, + "learning_rate": 0.00015328666480286793, + "loss": 0.2263, + "step": 36950 + }, + { + "epoch": 1.0261779378676994, + "grad_norm": 0.14905387163162231, + "learning_rate": 0.00015315991991591386, + "loss": 0.2228, + "step": 37000 + }, + { + "epoch": 1.0261779378676994, + "eval_loss": 0.22574713826179504, + "eval_runtime": 500.6484, + "eval_samples_per_second": 5.707, + "eval_steps_per_second": 5.707, + "step": 37000 + }, + { + "epoch": 1.0275646648107637, + "grad_norm": 0.12682612240314484, + "learning_rate": 0.0001530330558798313, + "loss": 0.2257, + "step": 37050 + }, + { + "epoch": 1.0289513917538282, + "grad_norm": 0.15558844804763794, + "learning_rate": 0.00015290607297896482, + "loss": 0.2259, + "step": 37100 + }, + { + "epoch": 1.0303381186968927, + "grad_norm": 0.16526414453983307, + "learning_rate": 0.00015277897149792562, + "loss": 0.2301, + "step": 37150 + }, + { + "epoch": 1.0317248456399573, + "grad_norm": 0.1130262240767479, + "learning_rate": 0.0001526517517215905, + "loss": 0.2244, + "step": 37200 + }, + { + "epoch": 1.0331115725830216, + "grad_norm": 0.12639841437339783, + "learning_rate": 0.00015252441393510146, + "loss": 0.2269, + "step": 37250 + }, + { + "epoch": 1.034498299526086, + "grad_norm": 0.12753638625144958, + "learning_rate": 0.000152396958423865, + "loss": 0.2277, + "step": 37300 + }, + { + "epoch": 1.0358850264691506, + "grad_norm": 0.1574636995792389, + "learning_rate": 0.00015226938547355145, + "loss": 0.2302, + "step": 37350 + }, + { + "epoch": 1.037271753412215, + "grad_norm": 0.1075245812535286, + "learning_rate": 0.0001521416953700944, + "loss": 0.2318, + "step": 37400 + }, + { + "epoch": 1.0386584803552794, + "grad_norm": 0.15765556693077087, + "learning_rate": 0.00015201388839969005, + "loss": 0.2271, + "step": 37450 + }, + { + "epoch": 1.040045207298344, + "grad_norm": 0.14305494725704193, + "learning_rate": 0.00015188596484879636, + "loss": 0.2268, + "step": 37500 + }, + { + "epoch": 1.0414319342414085, + "grad_norm": 0.14217057824134827, + "learning_rate": 0.0001517579250041328, + "loss": 0.2302, + "step": 37550 + }, + { + "epoch": 1.0428186611844728, + "grad_norm": 0.12122397124767303, + "learning_rate": 0.00015162976915267948, + "loss": 0.2264, + "step": 37600 + }, + { + "epoch": 1.0442053881275373, + "grad_norm": 0.1215621680021286, + "learning_rate": 0.00015150149758167634, + "loss": 0.2239, + "step": 37650 + }, + { + "epoch": 1.0455921150706018, + "grad_norm": 0.1759423315525055, + "learning_rate": 0.00015137311057862279, + "loss": 0.2244, + "step": 37700 + }, + { + "epoch": 1.046978842013666, + "grad_norm": 0.11546457558870316, + "learning_rate": 0.00015124460843127704, + "loss": 0.226, + "step": 37750 + }, + { + "epoch": 1.0483655689567306, + "grad_norm": 0.16507115960121155, + "learning_rate": 0.00015111599142765526, + "loss": 0.2267, + "step": 37800 + }, + { + "epoch": 1.0497522958997951, + "grad_norm": 0.15918377041816711, + "learning_rate": 0.0001509872598560311, + "loss": 0.2265, + "step": 37850 + }, + { + "epoch": 1.0511390228428596, + "grad_norm": 0.12590187788009644, + "learning_rate": 0.000150858414004935, + "loss": 0.2285, + "step": 37900 + }, + { + "epoch": 1.052525749785924, + "grad_norm": 0.11883638054132462, + "learning_rate": 0.0001507294541631535, + "loss": 0.2233, + "step": 37950 + }, + { + "epoch": 1.0539124767289885, + "grad_norm": 0.11353275179862976, + "learning_rate": 0.00015060038061972874, + "loss": 0.2238, + "step": 38000 + }, + { + "epoch": 1.0539124767289885, + "eval_loss": 0.22568707168102264, + "eval_runtime": 500.8783, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 38000 + }, + { + "epoch": 1.055299203672053, + "grad_norm": 0.1161685511469841, + "learning_rate": 0.00015047119366395757, + "loss": 0.2292, + "step": 38050 + }, + { + "epoch": 1.0566859306151175, + "grad_norm": 0.13814447820186615, + "learning_rate": 0.00015034189358539103, + "loss": 0.2251, + "step": 38100 + }, + { + "epoch": 1.0580726575581818, + "grad_norm": 0.15208768844604492, + "learning_rate": 0.00015021248067383387, + "loss": 0.2286, + "step": 38150 + }, + { + "epoch": 1.0594593845012463, + "grad_norm": 0.12832270562648773, + "learning_rate": 0.00015008295521934354, + "loss": 0.229, + "step": 38200 + }, + { + "epoch": 1.0608461114443108, + "grad_norm": 0.12442856281995773, + "learning_rate": 0.00014995331751222992, + "loss": 0.2286, + "step": 38250 + }, + { + "epoch": 1.0622328383873751, + "grad_norm": 0.14005307853221893, + "learning_rate": 0.00014982356784305428, + "loss": 0.2293, + "step": 38300 + }, + { + "epoch": 1.0636195653304397, + "grad_norm": 0.14418749511241913, + "learning_rate": 0.00014969370650262903, + "loss": 0.2328, + "step": 38350 + }, + { + "epoch": 1.0650062922735042, + "grad_norm": 0.11833231151103973, + "learning_rate": 0.00014956373378201677, + "loss": 0.2273, + "step": 38400 + }, + { + "epoch": 1.0663930192165687, + "grad_norm": 0.12782081961631775, + "learning_rate": 0.00014943364997252977, + "loss": 0.2224, + "step": 38450 + }, + { + "epoch": 1.067779746159633, + "grad_norm": 0.11903475224971771, + "learning_rate": 0.00014930345536572924, + "loss": 0.2256, + "step": 38500 + }, + { + "epoch": 1.0691664731026975, + "grad_norm": 0.17546679079532623, + "learning_rate": 0.00014917315025342483, + "loss": 0.2306, + "step": 38550 + }, + { + "epoch": 1.070553200045762, + "grad_norm": 0.16552455723285675, + "learning_rate": 0.0001490427349276737, + "loss": 0.2242, + "step": 38600 + }, + { + "epoch": 1.0719399269888266, + "grad_norm": 0.11756553500890732, + "learning_rate": 0.00014891220968078024, + "loss": 0.223, + "step": 38650 + }, + { + "epoch": 1.0733266539318909, + "grad_norm": 0.13542614877223969, + "learning_rate": 0.000148781574805295, + "loss": 0.2293, + "step": 38700 + }, + { + "epoch": 1.0747133808749554, + "grad_norm": 0.1370215266942978, + "learning_rate": 0.00014865083059401445, + "loss": 0.2291, + "step": 38750 + }, + { + "epoch": 1.07610010781802, + "grad_norm": 0.1472005844116211, + "learning_rate": 0.00014851997733997992, + "loss": 0.2272, + "step": 38800 + }, + { + "epoch": 1.0774868347610842, + "grad_norm": 0.1240694522857666, + "learning_rate": 0.00014838901533647733, + "loss": 0.2237, + "step": 38850 + }, + { + "epoch": 1.0788735617041487, + "grad_norm": 0.11901194602251053, + "learning_rate": 0.0001482579448770362, + "loss": 0.2285, + "step": 38900 + }, + { + "epoch": 1.0802602886472132, + "grad_norm": 0.2202654331922531, + "learning_rate": 0.0001481267662554292, + "loss": 0.2321, + "step": 38950 + }, + { + "epoch": 1.0816470155902778, + "grad_norm": 0.11475471407175064, + "learning_rate": 0.00014799547976567144, + "loss": 0.2296, + "step": 39000 + }, + { + "epoch": 1.0816470155902778, + "eval_loss": 0.2248746156692505, + "eval_runtime": 500.4656, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 39000 + }, + { + "epoch": 1.083033742533342, + "grad_norm": 0.1217503771185875, + "learning_rate": 0.00014786408570201975, + "loss": 0.2223, + "step": 39050 + }, + { + "epoch": 1.0844204694764066, + "grad_norm": 0.14427083730697632, + "learning_rate": 0.00014773258435897207, + "loss": 0.2279, + "step": 39100 + }, + { + "epoch": 1.085807196419471, + "grad_norm": 0.11865708976984024, + "learning_rate": 0.00014760097603126689, + "loss": 0.2295, + "step": 39150 + }, + { + "epoch": 1.0871939233625354, + "grad_norm": 0.14178717136383057, + "learning_rate": 0.0001474718963578798, + "loss": 0.2261, + "step": 39200 + }, + { + "epoch": 1.0885806503056, + "grad_norm": 0.15393276512622833, + "learning_rate": 0.0001473400770710278, + "loss": 0.2308, + "step": 39250 + }, + { + "epoch": 1.0899673772486644, + "grad_norm": 0.11602922528982162, + "learning_rate": 0.00014720815167925812, + "loss": 0.2283, + "step": 39300 + }, + { + "epoch": 1.091354104191729, + "grad_norm": 0.16645793616771698, + "learning_rate": 0.00014707612047825964, + "loss": 0.233, + "step": 39350 + }, + { + "epoch": 1.0927408311347933, + "grad_norm": 0.10213354974985123, + "learning_rate": 0.00014694398376395825, + "loss": 0.2277, + "step": 39400 + }, + { + "epoch": 1.0941275580778578, + "grad_norm": 0.11264722794294357, + "learning_rate": 0.0001468117418325166, + "loss": 0.2267, + "step": 39450 + }, + { + "epoch": 1.0955142850209223, + "grad_norm": 0.12596255540847778, + "learning_rate": 0.00014667939498033293, + "loss": 0.2226, + "step": 39500 + }, + { + "epoch": 1.0969010119639866, + "grad_norm": 0.10382383316755295, + "learning_rate": 0.0001465469435040407, + "loss": 0.2297, + "step": 39550 + }, + { + "epoch": 1.0982877389070511, + "grad_norm": 0.12972958385944366, + "learning_rate": 0.00014641438770050794, + "loss": 0.2256, + "step": 39600 + }, + { + "epoch": 1.0996744658501156, + "grad_norm": 0.13036096096038818, + "learning_rate": 0.00014628172786683641, + "loss": 0.2235, + "step": 39650 + }, + { + "epoch": 1.1010611927931802, + "grad_norm": 0.1233506128191948, + "learning_rate": 0.00014614896430036113, + "loss": 0.2243, + "step": 39700 + }, + { + "epoch": 1.1024479197362445, + "grad_norm": 0.11503315716981888, + "learning_rate": 0.00014601609729864956, + "loss": 0.2285, + "step": 39750 + }, + { + "epoch": 1.103834646679309, + "grad_norm": 0.12343501299619675, + "learning_rate": 0.000145883127159501, + "loss": 0.2272, + "step": 39800 + }, + { + "epoch": 1.1052213736223735, + "grad_norm": 0.1226864606142044, + "learning_rate": 0.00014575005418094594, + "loss": 0.2332, + "step": 39850 + }, + { + "epoch": 1.106608100565438, + "grad_norm": 0.1333167850971222, + "learning_rate": 0.00014561687866124535, + "loss": 0.2304, + "step": 39900 + }, + { + "epoch": 1.1079948275085023, + "grad_norm": 0.1088777631521225, + "learning_rate": 0.00014548360089889002, + "loss": 0.2296, + "step": 39950 + }, + { + "epoch": 1.1093815544515668, + "grad_norm": 0.11975093185901642, + "learning_rate": 0.00014535022119259994, + "loss": 0.2255, + "step": 40000 + }, + { + "epoch": 1.1093815544515668, + "eval_loss": 0.22516606748104095, + "eval_runtime": 500.4411, + "eval_samples_per_second": 5.709, + "eval_steps_per_second": 5.709, + "step": 40000 + }, + { + "epoch": 1.1107682813946314, + "grad_norm": 0.19725576043128967, + "learning_rate": 0.0001452167398413235, + "loss": 0.2317, + "step": 40050 + }, + { + "epoch": 1.1121550083376956, + "grad_norm": 0.12385617196559906, + "learning_rate": 0.00014508315714423706, + "loss": 0.2269, + "step": 40100 + }, + { + "epoch": 1.1135417352807602, + "grad_norm": 0.12559738755226135, + "learning_rate": 0.000144949473400744, + "loss": 0.2295, + "step": 40150 + }, + { + "epoch": 1.1149284622238247, + "grad_norm": 0.1279434859752655, + "learning_rate": 0.0001448156889104742, + "loss": 0.2283, + "step": 40200 + }, + { + "epoch": 1.1163151891668892, + "grad_norm": 0.14756010472774506, + "learning_rate": 0.0001446818039732834, + "loss": 0.2267, + "step": 40250 + }, + { + "epoch": 1.1177019161099535, + "grad_norm": 0.11476084589958191, + "learning_rate": 0.00014454781888925238, + "loss": 0.2265, + "step": 40300 + }, + { + "epoch": 1.119088643053018, + "grad_norm": 0.12701088190078735, + "learning_rate": 0.00014441373395868653, + "loss": 0.2255, + "step": 40350 + }, + { + "epoch": 1.1204753699960825, + "grad_norm": 0.14300104975700378, + "learning_rate": 0.00014427954948211493, + "loss": 0.227, + "step": 40400 + }, + { + "epoch": 1.121862096939147, + "grad_norm": 0.11292553693056107, + "learning_rate": 0.00014414526576028973, + "loss": 0.2239, + "step": 40450 + }, + { + "epoch": 1.1232488238822114, + "grad_norm": 0.1404883861541748, + "learning_rate": 0.00014401088309418564, + "loss": 0.2234, + "step": 40500 + }, + { + "epoch": 1.1246355508252759, + "grad_norm": 0.15262041985988617, + "learning_rate": 0.00014387640178499905, + "loss": 0.2319, + "step": 40550 + }, + { + "epoch": 1.1260222777683404, + "grad_norm": 0.16456229984760284, + "learning_rate": 0.0001437418221341475, + "loss": 0.2264, + "step": 40600 + }, + { + "epoch": 1.1274090047114047, + "grad_norm": 0.12468329817056656, + "learning_rate": 0.0001436071444432689, + "loss": 0.2273, + "step": 40650 + }, + { + "epoch": 1.1287957316544692, + "grad_norm": 0.12449460476636887, + "learning_rate": 0.0001434723690142209, + "loss": 0.2333, + "step": 40700 + }, + { + "epoch": 1.1301824585975337, + "grad_norm": 0.12426210194826126, + "learning_rate": 0.0001433374961490803, + "loss": 0.2328, + "step": 40750 + }, + { + "epoch": 1.1315691855405983, + "grad_norm": 0.1501815766096115, + "learning_rate": 0.00014320252615014216, + "loss": 0.2214, + "step": 40800 + }, + { + "epoch": 1.1329559124836626, + "grad_norm": 0.15881818532943726, + "learning_rate": 0.00014306745931991932, + "loss": 0.2292, + "step": 40850 + }, + { + "epoch": 1.134342639426727, + "grad_norm": 0.12299991399049759, + "learning_rate": 0.00014293229596114163, + "loss": 0.2238, + "step": 40900 + }, + { + "epoch": 1.1357293663697916, + "grad_norm": 0.14259304106235504, + "learning_rate": 0.0001427970363767553, + "loss": 0.2291, + "step": 40950 + }, + { + "epoch": 1.137116093312856, + "grad_norm": 0.12536148726940155, + "learning_rate": 0.00014266168086992225, + "loss": 0.2252, + "step": 41000 + }, + { + "epoch": 1.137116093312856, + "eval_loss": 0.2245665341615677, + "eval_runtime": 501.2828, + "eval_samples_per_second": 5.699, + "eval_steps_per_second": 5.699, + "step": 41000 + }, + { + "epoch": 1.1385028202559204, + "grad_norm": 0.12410587817430496, + "learning_rate": 0.00014252622974401932, + "loss": 0.2268, + "step": 41050 + }, + { + "epoch": 1.139889547198985, + "grad_norm": 0.12877434492111206, + "learning_rate": 0.00014239068330263775, + "loss": 0.2258, + "step": 41100 + }, + { + "epoch": 1.1412762741420495, + "grad_norm": 0.1299249529838562, + "learning_rate": 0.00014225504184958232, + "loss": 0.2301, + "step": 41150 + }, + { + "epoch": 1.1426630010851138, + "grad_norm": 0.15234452486038208, + "learning_rate": 0.00014211930568887088, + "loss": 0.2192, + "step": 41200 + }, + { + "epoch": 1.1440497280281783, + "grad_norm": 0.12678442895412445, + "learning_rate": 0.00014198347512473343, + "loss": 0.2311, + "step": 41250 + }, + { + "epoch": 1.1454364549712428, + "grad_norm": 0.12326008826494217, + "learning_rate": 0.0001418475504616116, + "loss": 0.2318, + "step": 41300 + }, + { + "epoch": 1.146823181914307, + "grad_norm": 0.11192907392978668, + "learning_rate": 0.00014171153200415797, + "loss": 0.2232, + "step": 41350 + }, + { + "epoch": 1.1482099088573716, + "grad_norm": 0.11843819916248322, + "learning_rate": 0.00014157542005723532, + "loss": 0.2277, + "step": 41400 + }, + { + "epoch": 1.1495966358004361, + "grad_norm": 0.12903502583503723, + "learning_rate": 0.0001414419399397752, + "loss": 0.2237, + "step": 41450 + }, + { + "epoch": 1.1509833627435007, + "grad_norm": 0.13532768189907074, + "learning_rate": 0.00014130564378392948, + "loss": 0.2291, + "step": 41500 + }, + { + "epoch": 1.152370089686565, + "grad_norm": 0.11242423951625824, + "learning_rate": 0.00014116925504834574, + "loss": 0.2263, + "step": 41550 + }, + { + "epoch": 1.1537568166296295, + "grad_norm": 0.14420267939567566, + "learning_rate": 0.00014103277403871667, + "loss": 0.231, + "step": 41600 + }, + { + "epoch": 1.155143543572694, + "grad_norm": 0.11390483379364014, + "learning_rate": 0.00014089620106094174, + "loss": 0.2281, + "step": 41650 + }, + { + "epoch": 1.1565302705157583, + "grad_norm": 0.10996092855930328, + "learning_rate": 0.0001407595364211267, + "loss": 0.223, + "step": 41700 + }, + { + "epoch": 1.1579169974588228, + "grad_norm": 0.1297358274459839, + "learning_rate": 0.00014062278042558253, + "loss": 0.2251, + "step": 41750 + }, + { + "epoch": 1.1593037244018873, + "grad_norm": 0.13994191586971283, + "learning_rate": 0.00014048593338082508, + "loss": 0.2261, + "step": 41800 + }, + { + "epoch": 1.1606904513449519, + "grad_norm": 0.15100865066051483, + "learning_rate": 0.00014034899559357432, + "loss": 0.2257, + "step": 41850 + }, + { + "epoch": 1.1620771782880164, + "grad_norm": 0.1151217371225357, + "learning_rate": 0.0001402119673707535, + "loss": 0.2278, + "step": 41900 + }, + { + "epoch": 1.1634639052310807, + "grad_norm": 0.1580880582332611, + "learning_rate": 0.00014007484901948865, + "loss": 0.2247, + "step": 41950 + }, + { + "epoch": 1.1648506321741452, + "grad_norm": 0.1323232203722, + "learning_rate": 0.00013993764084710777, + "loss": 0.2229, + "step": 42000 + }, + { + "epoch": 1.1648506321741452, + "eval_loss": 0.22439424693584442, + "eval_runtime": 501.4893, + "eval_samples_per_second": 5.697, + "eval_steps_per_second": 5.697, + "step": 42000 + }, + { + "epoch": 1.1662373591172097, + "grad_norm": 0.11002755165100098, + "learning_rate": 0.00013980034316114014, + "loss": 0.2287, + "step": 42050 + }, + { + "epoch": 1.167624086060274, + "grad_norm": 0.16875265538692474, + "learning_rate": 0.00013966295626931575, + "loss": 0.2268, + "step": 42100 + }, + { + "epoch": 1.1690108130033385, + "grad_norm": 0.1291196197271347, + "learning_rate": 0.0001395254804795645, + "loss": 0.2267, + "step": 42150 + }, + { + "epoch": 1.170397539946403, + "grad_norm": 0.12030452489852905, + "learning_rate": 0.0001393879161000155, + "loss": 0.2284, + "step": 42200 + }, + { + "epoch": 1.1717842668894676, + "grad_norm": 0.1254565715789795, + "learning_rate": 0.00013925026343899644, + "loss": 0.2325, + "step": 42250 + }, + { + "epoch": 1.1731709938325319, + "grad_norm": 0.10753902792930603, + "learning_rate": 0.000139112522805033, + "loss": 0.2265, + "step": 42300 + }, + { + "epoch": 1.1745577207755964, + "grad_norm": 0.14079649746418, + "learning_rate": 0.00013897469450684783, + "loss": 0.2279, + "step": 42350 + }, + { + "epoch": 1.175944447718661, + "grad_norm": 0.13644090294837952, + "learning_rate": 0.00013883677885336013, + "loss": 0.2264, + "step": 42400 + }, + { + "epoch": 1.1773311746617252, + "grad_norm": 0.15901681780815125, + "learning_rate": 0.000138698776153685, + "loss": 0.2274, + "step": 42450 + }, + { + "epoch": 1.1787179016047897, + "grad_norm": 0.14739197492599487, + "learning_rate": 0.00013856068671713254, + "loss": 0.2223, + "step": 42500 + }, + { + "epoch": 1.1801046285478543, + "grad_norm": 0.1077587679028511, + "learning_rate": 0.00013842251085320728, + "loss": 0.2257, + "step": 42550 + }, + { + "epoch": 1.1814913554909188, + "grad_norm": 0.12596414983272552, + "learning_rate": 0.00013828424887160745, + "loss": 0.2251, + "step": 42600 + }, + { + "epoch": 1.182878082433983, + "grad_norm": 0.11234478652477264, + "learning_rate": 0.0001381459010822243, + "loss": 0.2225, + "step": 42650 + }, + { + "epoch": 1.1842648093770476, + "grad_norm": 0.11206696927547455, + "learning_rate": 0.00013800746779514143, + "loss": 0.2266, + "step": 42700 + }, + { + "epoch": 1.185651536320112, + "grad_norm": 0.10260911285877228, + "learning_rate": 0.0001378689493206341, + "loss": 0.2241, + "step": 42750 + }, + { + "epoch": 1.1870382632631764, + "grad_norm": 0.12874187529087067, + "learning_rate": 0.0001377303459691684, + "loss": 0.2277, + "step": 42800 + }, + { + "epoch": 1.188424990206241, + "grad_norm": 0.1351606696844101, + "learning_rate": 0.0001375916580514007, + "loss": 0.2268, + "step": 42850 + }, + { + "epoch": 1.1898117171493054, + "grad_norm": 0.1250632107257843, + "learning_rate": 0.000137452885878177, + "loss": 0.2265, + "step": 42900 + }, + { + "epoch": 1.19119844409237, + "grad_norm": 0.12516459822654724, + "learning_rate": 0.00013731402976053202, + "loss": 0.2256, + "step": 42950 + }, + { + "epoch": 1.1925851710354343, + "grad_norm": 0.12791725993156433, + "learning_rate": 0.00013717509000968865, + "loss": 0.2252, + "step": 43000 + }, + { + "epoch": 1.1925851710354343, + "eval_loss": 0.22418725490570068, + "eval_runtime": 501.0375, + "eval_samples_per_second": 5.702, + "eval_steps_per_second": 5.702, + "step": 43000 + }, + { + "epoch": 1.1939718979784988, + "grad_norm": 0.152371346950531, + "learning_rate": 0.00013703606693705732, + "loss": 0.2308, + "step": 43050 + }, + { + "epoch": 1.1953586249215633, + "grad_norm": 0.14723214507102966, + "learning_rate": 0.0001368969608542351, + "loss": 0.2258, + "step": 43100 + }, + { + "epoch": 1.1967453518646276, + "grad_norm": 0.1414303481578827, + "learning_rate": 0.00013675777207300524, + "loss": 0.2278, + "step": 43150 + }, + { + "epoch": 1.1981320788076921, + "grad_norm": 0.15416811406612396, + "learning_rate": 0.00013661850090533617, + "loss": 0.2324, + "step": 43200 + }, + { + "epoch": 1.1995188057507566, + "grad_norm": 0.11736203730106354, + "learning_rate": 0.00013647914766338112, + "loss": 0.2292, + "step": 43250 + }, + { + "epoch": 1.2009055326938212, + "grad_norm": 0.1547485738992691, + "learning_rate": 0.00013633971265947722, + "loss": 0.2281, + "step": 43300 + }, + { + "epoch": 1.2022922596368855, + "grad_norm": 0.15800827741622925, + "learning_rate": 0.0001362001962061449, + "loss": 0.2296, + "step": 43350 + }, + { + "epoch": 1.20367898657995, + "grad_norm": 0.15381957590579987, + "learning_rate": 0.0001360605986160871, + "loss": 0.2291, + "step": 43400 + }, + { + "epoch": 1.2050657135230145, + "grad_norm": 0.17754536867141724, + "learning_rate": 0.00013592092020218855, + "loss": 0.2285, + "step": 43450 + }, + { + "epoch": 1.2064524404660788, + "grad_norm": 0.1404140442609787, + "learning_rate": 0.0001357811612775153, + "loss": 0.2253, + "step": 43500 + }, + { + "epoch": 1.2078391674091433, + "grad_norm": 0.11709395796060562, + "learning_rate": 0.00013564132215531372, + "loss": 0.2261, + "step": 43550 + }, + { + "epoch": 1.2092258943522078, + "grad_norm": 0.11466790735721588, + "learning_rate": 0.00013550140314901, + "loss": 0.2295, + "step": 43600 + }, + { + "epoch": 1.2106126212952724, + "grad_norm": 0.14058195054531097, + "learning_rate": 0.00013536140457220933, + "loss": 0.2307, + "step": 43650 + }, + { + "epoch": 1.2119993482383369, + "grad_norm": 0.18355610966682434, + "learning_rate": 0.00013522132673869522, + "loss": 0.2283, + "step": 43700 + }, + { + "epoch": 1.2133860751814012, + "grad_norm": 0.1437745839357376, + "learning_rate": 0.00013508116996242893, + "loss": 0.2244, + "step": 43750 + }, + { + "epoch": 1.2147728021244657, + "grad_norm": 0.12281102687120438, + "learning_rate": 0.00013494093455754851, + "loss": 0.2266, + "step": 43800 + }, + { + "epoch": 1.2161595290675302, + "grad_norm": 0.15082257986068726, + "learning_rate": 0.00013480062083836842, + "loss": 0.2275, + "step": 43850 + }, + { + "epoch": 1.2175462560105945, + "grad_norm": 0.13360853493213654, + "learning_rate": 0.00013466022911937846, + "loss": 0.2293, + "step": 43900 + }, + { + "epoch": 1.218932982953659, + "grad_norm": 0.1245453953742981, + "learning_rate": 0.00013451975971524337, + "loss": 0.2252, + "step": 43950 + }, + { + "epoch": 1.2203197098967236, + "grad_norm": 0.12427138537168503, + "learning_rate": 0.00013437921294080202, + "loss": 0.2273, + "step": 44000 + }, + { + "epoch": 1.2203197098967236, + "eval_loss": 0.22416169941425323, + "eval_runtime": 501.199, + "eval_samples_per_second": 5.7, + "eval_steps_per_second": 5.7, + "step": 44000 + }, + { + "epoch": 1.221706436839788, + "grad_norm": 0.13315744698047638, + "learning_rate": 0.00013423858911106664, + "loss": 0.2273, + "step": 44050 + }, + { + "epoch": 1.2230931637828524, + "grad_norm": 0.11731356382369995, + "learning_rate": 0.0001340978885412221, + "loss": 0.2284, + "step": 44100 + }, + { + "epoch": 1.224479890725917, + "grad_norm": 0.1332121342420578, + "learning_rate": 0.00013395711154662548, + "loss": 0.2311, + "step": 44150 + }, + { + "epoch": 1.2258666176689814, + "grad_norm": 0.11775799095630646, + "learning_rate": 0.00013381625844280495, + "loss": 0.2207, + "step": 44200 + }, + { + "epoch": 1.2272533446120457, + "grad_norm": 0.13608750700950623, + "learning_rate": 0.00013367532954545934, + "loss": 0.2259, + "step": 44250 + }, + { + "epoch": 1.2286400715551102, + "grad_norm": 0.11276783794164658, + "learning_rate": 0.00013353432517045739, + "loss": 0.2254, + "step": 44300 + }, + { + "epoch": 1.2300267984981748, + "grad_norm": 0.11962584406137466, + "learning_rate": 0.00013339324563383693, + "loss": 0.2231, + "step": 44350 + }, + { + "epoch": 1.2314135254412393, + "grad_norm": 0.14515165984630585, + "learning_rate": 0.0001332520912518044, + "loss": 0.2273, + "step": 44400 + }, + { + "epoch": 1.2328002523843036, + "grad_norm": 0.14967331290245056, + "learning_rate": 0.00013311086234073376, + "loss": 0.2292, + "step": 44450 + }, + { + "epoch": 1.234186979327368, + "grad_norm": 0.10794315487146378, + "learning_rate": 0.00013296955921716626, + "loss": 0.2213, + "step": 44500 + }, + { + "epoch": 1.2355737062704326, + "grad_norm": 0.1261892467737198, + "learning_rate": 0.0001328281821978093, + "loss": 0.2249, + "step": 44550 + }, + { + "epoch": 1.236960433213497, + "grad_norm": 0.16944009065628052, + "learning_rate": 0.00013268673159953608, + "loss": 0.2279, + "step": 44600 + }, + { + "epoch": 1.2383471601565614, + "grad_norm": 0.14991511404514313, + "learning_rate": 0.00013254520773938453, + "loss": 0.224, + "step": 44650 + }, + { + "epoch": 1.239733887099626, + "grad_norm": 0.16776132583618164, + "learning_rate": 0.00013240361093455686, + "loss": 0.2267, + "step": 44700 + }, + { + "epoch": 1.2411206140426905, + "grad_norm": 0.15971648693084717, + "learning_rate": 0.00013226194150241886, + "loss": 0.2269, + "step": 44750 + }, + { + "epoch": 1.2425073409857548, + "grad_norm": 0.16267691552639008, + "learning_rate": 0.00013212019976049897, + "loss": 0.2262, + "step": 44800 + }, + { + "epoch": 1.2438940679288193, + "grad_norm": 0.13528917729854584, + "learning_rate": 0.00013197838602648773, + "loss": 0.2282, + "step": 44850 + }, + { + "epoch": 1.2452807948718838, + "grad_norm": 0.13532580435276031, + "learning_rate": 0.0001318365006182371, + "loss": 0.2269, + "step": 44900 + }, + { + "epoch": 1.246667521814948, + "grad_norm": 0.15377886593341827, + "learning_rate": 0.00013169738368628263, + "loss": 0.2298, + "step": 44950 + }, + { + "epoch": 1.2480542487580126, + "grad_norm": 0.16382162272930145, + "learning_rate": 0.00013155535730139284, + "loss": 0.2301, + "step": 45000 + }, + { + "epoch": 1.2480542487580126, + "eval_loss": 0.22414694726467133, + "eval_runtime": 500.918, + "eval_samples_per_second": 5.704, + "eval_steps_per_second": 5.704, + "step": 45000 + }, + { + "epoch": 1.2494409757010772, + "grad_norm": 0.13876722753047943, + "learning_rate": 0.00013141326019041228, + "loss": 0.2249, + "step": 45050 + }, + { + "epoch": 1.2508277026441417, + "grad_norm": 0.1360548585653305, + "learning_rate": 0.00013127393671013348, + "loss": 0.2255, + "step": 45100 + }, + { + "epoch": 1.2522144295872062, + "grad_norm": 0.1435881406068802, + "learning_rate": 0.00013113170050124578, + "loss": 0.2314, + "step": 45150 + }, + { + "epoch": 1.2536011565302705, + "grad_norm": 0.12622830271720886, + "learning_rate": 0.00013098939451582363, + "loss": 0.2248, + "step": 45200 + }, + { + "epoch": 1.254987883473335, + "grad_norm": 0.1429251879453659, + "learning_rate": 0.00013084701907282228, + "loss": 0.2312, + "step": 45250 + }, + { + "epoch": 1.2563746104163993, + "grad_norm": 0.12246144562959671, + "learning_rate": 0.00013070457449135262, + "loss": 0.2236, + "step": 45300 + }, + { + "epoch": 1.2577613373594638, + "grad_norm": 0.11872986704111099, + "learning_rate": 0.00013056206109068045, + "loss": 0.2263, + "step": 45350 + }, + { + "epoch": 1.2591480643025283, + "grad_norm": 0.12920017540454865, + "learning_rate": 0.00013041947919022594, + "loss": 0.2258, + "step": 45400 + }, + { + "epoch": 1.2605347912455929, + "grad_norm": 0.15954279899597168, + "learning_rate": 0.00013027682910956271, + "loss": 0.2272, + "step": 45450 + }, + { + "epoch": 1.2619215181886574, + "grad_norm": 0.16156534850597382, + "learning_rate": 0.00013013411116841723, + "loss": 0.2245, + "step": 45500 + }, + { + "epoch": 1.2633082451317217, + "grad_norm": 0.12423060089349747, + "learning_rate": 0.00012999132568666805, + "loss": 0.2271, + "step": 45550 + }, + { + "epoch": 1.2646949720747862, + "grad_norm": 0.1252107322216034, + "learning_rate": 0.0001298484729843451, + "loss": 0.2298, + "step": 45600 + }, + { + "epoch": 1.2660816990178507, + "grad_norm": 0.16947528719902039, + "learning_rate": 0.00012970555338162896, + "loss": 0.2273, + "step": 45650 + }, + { + "epoch": 1.267468425960915, + "grad_norm": 0.14459671080112457, + "learning_rate": 0.00012956256719885026, + "loss": 0.2282, + "step": 45700 + }, + { + "epoch": 1.2688551529039795, + "grad_norm": 0.1194702684879303, + "learning_rate": 0.00012941951475648866, + "loss": 0.2263, + "step": 45750 + }, + { + "epoch": 1.270241879847044, + "grad_norm": 0.12180822342634201, + "learning_rate": 0.00012927639637517249, + "loss": 0.227, + "step": 45800 + }, + { + "epoch": 1.2716286067901086, + "grad_norm": 0.14245355129241943, + "learning_rate": 0.00012913321237567783, + "loss": 0.2262, + "step": 45850 + }, + { + "epoch": 1.2730153337331729, + "grad_norm": 0.14033064246177673, + "learning_rate": 0.00012898996307892784, + "loss": 0.2249, + "step": 45900 + }, + { + "epoch": 1.2744020606762374, + "grad_norm": 0.11540055274963379, + "learning_rate": 0.00012884664880599198, + "loss": 0.2265, + "step": 45950 + }, + { + "epoch": 1.275788787619302, + "grad_norm": 0.10777000337839127, + "learning_rate": 0.00012870326987808538, + "loss": 0.2245, + "step": 46000 + }, + { + "epoch": 1.275788787619302, + "eval_loss": 0.2235965132713318, + "eval_runtime": 500.5657, + "eval_samples_per_second": 5.708, + "eval_steps_per_second": 5.708, + "step": 46000 + } + ], + "logging_steps": 50, + "max_steps": 108168, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.2909744594944e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}