diff --git "a/checkpoint-80000/trainer_state.json" "b/checkpoint-80000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-80000/trainer_state.json" @@ -0,0 +1,6274 @@ +{ + "best_global_step": 80000, + "best_metric": 2.1577999591827393, + "best_model_checkpoint": "models/plausibert/checkpoint-80000", + "epoch": 75.47180938900684, + "eval_steps": 1000, + "global_step": 80000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.09436187780136825, + "grad_norm": 3.896272659301758, + "learning_rate": 9.9e-07, + "loss": 10.1635, + "step": 100 + }, + { + "epoch": 0.1887237556027365, + "grad_norm": 2.5898056030273438, + "learning_rate": 1.99e-06, + "loss": 9.1407, + "step": 200 + }, + { + "epoch": 0.28308563340410475, + "grad_norm": 2.2082550525665283, + "learning_rate": 2.99e-06, + "loss": 8.6837, + "step": 300 + }, + { + "epoch": 0.377447511205473, + "grad_norm": 2.3000612258911133, + "learning_rate": 3.99e-06, + "loss": 8.2813, + "step": 400 + }, + { + "epoch": 0.4718093890068412, + "grad_norm": 1.8331196308135986, + "learning_rate": 4.9900000000000005e-06, + "loss": 7.8228, + "step": 500 + }, + { + "epoch": 0.5661712668082095, + "grad_norm": 1.509745717048645, + "learning_rate": 5.99e-06, + "loss": 7.3049, + "step": 600 + }, + { + "epoch": 0.6605331446095777, + "grad_norm": 1.0882461071014404, + "learning_rate": 6.990000000000001e-06, + "loss": 6.8271, + "step": 700 + }, + { + "epoch": 0.754895022410946, + "grad_norm": 0.9488278031349182, + "learning_rate": 7.99e-06, + "loss": 6.4755, + "step": 800 + }, + { + "epoch": 0.8492569002123143, + "grad_norm": 0.7304624319076538, + "learning_rate": 8.99e-06, + "loss": 6.2671, + "step": 900 + }, + { + "epoch": 0.9436187780136824, + "grad_norm": 0.7234325408935547, + "learning_rate": 9.990000000000001e-06, + "loss": 6.1688, + "step": 1000 + }, + { + "epoch": 0.9436187780136824, + "eval_loss": 6.0837483406066895, + "eval_runtime": 74.3964, + "eval_samples_per_second": 202.577, + "eval_steps_per_second": 6.331, + "step": 1000 + }, + { + "epoch": 1.0377447511205473, + "grad_norm": 0.5878735780715942, + "learning_rate": 1.099e-05, + "loss": 6.0804, + "step": 1100 + }, + { + "epoch": 1.1321066289219155, + "grad_norm": 1.0044981241226196, + "learning_rate": 1.199e-05, + "loss": 6.0195, + "step": 1200 + }, + { + "epoch": 1.226468506723284, + "grad_norm": 0.6754953265190125, + "learning_rate": 1.299e-05, + "loss": 5.9978, + "step": 1300 + }, + { + "epoch": 1.320830384524652, + "grad_norm": 0.7222411036491394, + "learning_rate": 1.399e-05, + "loss": 5.9548, + "step": 1400 + }, + { + "epoch": 1.4151922623260202, + "grad_norm": 0.6526289582252502, + "learning_rate": 1.499e-05, + "loss": 5.9321, + "step": 1500 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 0.7330458164215088, + "learning_rate": 1.599e-05, + "loss": 5.8904, + "step": 1600 + }, + { + "epoch": 1.6039160179287568, + "grad_norm": 0.7171738743782043, + "learning_rate": 1.699e-05, + "loss": 5.887, + "step": 1700 + }, + { + "epoch": 1.698277895730125, + "grad_norm": 0.6907537579536438, + "learning_rate": 1.7990000000000002e-05, + "loss": 5.8503, + "step": 1800 + }, + { + "epoch": 1.7926397735314934, + "grad_norm": 1.0631508827209473, + "learning_rate": 1.8990000000000003e-05, + "loss": 5.8309, + "step": 1900 + }, + { + "epoch": 1.8870016513328616, + "grad_norm": 0.5577560663223267, + "learning_rate": 1.999e-05, + "loss": 5.8133, + "step": 2000 + }, + { + "epoch": 1.8870016513328616, + "eval_loss": 5.77741003036499, + "eval_runtime": 74.4232, + "eval_samples_per_second": 202.504, + "eval_steps_per_second": 6.329, + "step": 2000 + }, + { + "epoch": 1.9813635291342298, + "grad_norm": 0.878653347492218, + "learning_rate": 2.099e-05, + "loss": 5.7944, + "step": 2100 + }, + { + "epoch": 2.0754895022410946, + "grad_norm": 0.5987277626991272, + "learning_rate": 2.199e-05, + "loss": 5.7819, + "step": 2200 + }, + { + "epoch": 2.169851380042463, + "grad_norm": 0.6282809972763062, + "learning_rate": 2.2990000000000002e-05, + "loss": 5.7676, + "step": 2300 + }, + { + "epoch": 2.264213257843831, + "grad_norm": 0.7545515298843384, + "learning_rate": 2.3990000000000002e-05, + "loss": 5.7626, + "step": 2400 + }, + { + "epoch": 2.358575135645199, + "grad_norm": 0.9032400846481323, + "learning_rate": 2.4990000000000003e-05, + "loss": 5.7559, + "step": 2500 + }, + { + "epoch": 2.452937013446568, + "grad_norm": 0.6327532529830933, + "learning_rate": 2.5990000000000004e-05, + "loss": 5.7318, + "step": 2600 + }, + { + "epoch": 2.547298891247936, + "grad_norm": 0.6848271489143372, + "learning_rate": 2.6989999999999997e-05, + "loss": 5.7179, + "step": 2700 + }, + { + "epoch": 2.641660769049304, + "grad_norm": 0.5881490707397461, + "learning_rate": 2.7989999999999998e-05, + "loss": 5.7122, + "step": 2800 + }, + { + "epoch": 2.7360226468506723, + "grad_norm": 1.0944511890411377, + "learning_rate": 2.8990000000000002e-05, + "loss": 5.7164, + "step": 2900 + }, + { + "epoch": 2.8303845246520405, + "grad_norm": 0.9962751865386963, + "learning_rate": 2.9990000000000003e-05, + "loss": 5.6835, + "step": 3000 + }, + { + "epoch": 2.8303845246520405, + "eval_loss": 5.670976161956787, + "eval_runtime": 74.4573, + "eval_samples_per_second": 202.411, + "eval_steps_per_second": 6.326, + "step": 3000 + }, + { + "epoch": 2.9247464024534087, + "grad_norm": 0.8781231641769409, + "learning_rate": 3.099e-05, + "loss": 5.6835, + "step": 3100 + }, + { + "epoch": 3.0188723755602735, + "grad_norm": 0.806686282157898, + "learning_rate": 3.1990000000000004e-05, + "loss": 5.6689, + "step": 3200 + }, + { + "epoch": 3.1132342533616417, + "grad_norm": 0.8316852450370789, + "learning_rate": 3.299e-05, + "loss": 5.6723, + "step": 3300 + }, + { + "epoch": 3.2075961311630103, + "grad_norm": 0.82398521900177, + "learning_rate": 3.399e-05, + "loss": 5.6536, + "step": 3400 + }, + { + "epoch": 3.3019580089643785, + "grad_norm": 0.9566921591758728, + "learning_rate": 3.499e-05, + "loss": 5.6482, + "step": 3500 + }, + { + "epoch": 3.3963198867657467, + "grad_norm": 0.8480503559112549, + "learning_rate": 3.599e-05, + "loss": 5.6511, + "step": 3600 + }, + { + "epoch": 3.490681764567115, + "grad_norm": 0.9649183750152588, + "learning_rate": 3.699e-05, + "loss": 5.644, + "step": 3700 + }, + { + "epoch": 3.585043642368483, + "grad_norm": 0.870224118232727, + "learning_rate": 3.799e-05, + "loss": 5.6256, + "step": 3800 + }, + { + "epoch": 3.6794055201698512, + "grad_norm": 0.6461514830589294, + "learning_rate": 3.8990000000000004e-05, + "loss": 5.6282, + "step": 3900 + }, + { + "epoch": 3.77376739797122, + "grad_norm": 0.7242473363876343, + "learning_rate": 3.999e-05, + "loss": 5.627, + "step": 4000 + }, + { + "epoch": 3.77376739797122, + "eval_loss": 5.605582237243652, + "eval_runtime": 74.4202, + "eval_samples_per_second": 202.512, + "eval_steps_per_second": 6.329, + "step": 4000 + }, + { + "epoch": 3.868129275772588, + "grad_norm": 0.6954628825187683, + "learning_rate": 4.099e-05, + "loss": 5.6102, + "step": 4100 + }, + { + "epoch": 3.962491153573956, + "grad_norm": 0.676533579826355, + "learning_rate": 4.199e-05, + "loss": 5.6129, + "step": 4200 + }, + { + "epoch": 4.056617126680821, + "grad_norm": 0.7012454271316528, + "learning_rate": 4.299e-05, + "loss": 5.6066, + "step": 4300 + }, + { + "epoch": 4.150979004482189, + "grad_norm": 1.0515376329421997, + "learning_rate": 4.3990000000000004e-05, + "loss": 5.5951, + "step": 4400 + }, + { + "epoch": 4.245340882283557, + "grad_norm": 0.7566595077514648, + "learning_rate": 4.499e-05, + "loss": 5.5894, + "step": 4500 + }, + { + "epoch": 4.339702760084926, + "grad_norm": 0.8826500177383423, + "learning_rate": 4.599e-05, + "loss": 5.5889, + "step": 4600 + }, + { + "epoch": 4.434064637886294, + "grad_norm": 1.1490343809127808, + "learning_rate": 4.699e-05, + "loss": 5.5867, + "step": 4700 + }, + { + "epoch": 4.528426515687662, + "grad_norm": 0.8356620669364929, + "learning_rate": 4.799e-05, + "loss": 5.571, + "step": 4800 + }, + { + "epoch": 4.622788393489031, + "grad_norm": 0.7320058345794678, + "learning_rate": 4.8990000000000004e-05, + "loss": 5.5733, + "step": 4900 + }, + { + "epoch": 4.717150271290398, + "grad_norm": 0.9290647506713867, + "learning_rate": 4.999e-05, + "loss": 5.5623, + "step": 5000 + }, + { + "epoch": 4.717150271290398, + "eval_loss": 5.556262016296387, + "eval_runtime": 74.4326, + "eval_samples_per_second": 202.479, + "eval_steps_per_second": 6.328, + "step": 5000 + }, + { + "epoch": 4.811512149091767, + "grad_norm": 0.733899712562561, + "learning_rate": 5.0990000000000005e-05, + "loss": 5.5642, + "step": 5100 + }, + { + "epoch": 4.905874026893136, + "grad_norm": 0.7362884879112244, + "learning_rate": 5.199000000000001e-05, + "loss": 5.5571, + "step": 5200 + }, + { + "epoch": 5.0, + "grad_norm": 0.9703426957130432, + "learning_rate": 5.2990000000000006e-05, + "loss": 5.5584, + "step": 5300 + }, + { + "epoch": 5.094361877801369, + "grad_norm": 0.8319281339645386, + "learning_rate": 5.399000000000001e-05, + "loss": 5.5516, + "step": 5400 + }, + { + "epoch": 5.188723755602736, + "grad_norm": 0.9939006567001343, + "learning_rate": 5.499000000000001e-05, + "loss": 5.535, + "step": 5500 + }, + { + "epoch": 5.283085633404105, + "grad_norm": 1.1493581533432007, + "learning_rate": 5.599e-05, + "loss": 5.5439, + "step": 5600 + }, + { + "epoch": 5.377447511205473, + "grad_norm": 0.9831592440605164, + "learning_rate": 5.699e-05, + "loss": 5.5357, + "step": 5700 + }, + { + "epoch": 5.471809389006841, + "grad_norm": 0.8419109582901001, + "learning_rate": 5.799e-05, + "loss": 5.5192, + "step": 5800 + }, + { + "epoch": 5.56617126680821, + "grad_norm": 0.803837239742279, + "learning_rate": 5.899e-05, + "loss": 5.5352, + "step": 5900 + }, + { + "epoch": 5.660533144609578, + "grad_norm": 0.8247868418693542, + "learning_rate": 5.999e-05, + "loss": 5.5243, + "step": 6000 + }, + { + "epoch": 5.660533144609578, + "eval_loss": 5.519050598144531, + "eval_runtime": 74.4404, + "eval_samples_per_second": 202.457, + "eval_steps_per_second": 6.327, + "step": 6000 + }, + { + "epoch": 5.754895022410946, + "grad_norm": 0.8315047025680542, + "learning_rate": 6.0990000000000004e-05, + "loss": 5.5147, + "step": 6100 + }, + { + "epoch": 5.849256900212314, + "grad_norm": 0.7833096385002136, + "learning_rate": 6.199000000000001e-05, + "loss": 5.523, + "step": 6200 + }, + { + "epoch": 5.943618778013683, + "grad_norm": 0.8719848990440369, + "learning_rate": 6.299e-05, + "loss": 5.5146, + "step": 6300 + }, + { + "epoch": 6.037744751120547, + "grad_norm": 0.8079792857170105, + "learning_rate": 6.399e-05, + "loss": 5.5031, + "step": 6400 + }, + { + "epoch": 6.132106628921916, + "grad_norm": 0.9179410338401794, + "learning_rate": 6.499000000000001e-05, + "loss": 5.5075, + "step": 6500 + }, + { + "epoch": 6.2264685067232834, + "grad_norm": 0.9099860191345215, + "learning_rate": 6.599000000000001e-05, + "loss": 5.4979, + "step": 6600 + }, + { + "epoch": 6.320830384524652, + "grad_norm": 0.7887560129165649, + "learning_rate": 6.699000000000001e-05, + "loss": 5.4848, + "step": 6700 + }, + { + "epoch": 6.415192262326021, + "grad_norm": 0.9273598194122314, + "learning_rate": 6.799e-05, + "loss": 5.4865, + "step": 6800 + }, + { + "epoch": 6.509554140127388, + "grad_norm": 0.8765608072280884, + "learning_rate": 6.899e-05, + "loss": 5.4878, + "step": 6900 + }, + { + "epoch": 6.603916017928757, + "grad_norm": 0.8114970922470093, + "learning_rate": 6.999e-05, + "loss": 5.4844, + "step": 7000 + }, + { + "epoch": 6.603916017928757, + "eval_loss": 5.4852294921875, + "eval_runtime": 74.3954, + "eval_samples_per_second": 202.58, + "eval_steps_per_second": 6.331, + "step": 7000 + }, + { + "epoch": 6.698277895730125, + "grad_norm": 0.8651418089866638, + "learning_rate": 7.099e-05, + "loss": 5.4928, + "step": 7100 + }, + { + "epoch": 6.792639773531493, + "grad_norm": 1.0564604997634888, + "learning_rate": 7.199000000000001e-05, + "loss": 5.4697, + "step": 7200 + }, + { + "epoch": 6.887001651332861, + "grad_norm": 0.6973365545272827, + "learning_rate": 7.299e-05, + "loss": 5.477, + "step": 7300 + }, + { + "epoch": 6.98136352913423, + "grad_norm": 0.9218101501464844, + "learning_rate": 7.399e-05, + "loss": 5.482, + "step": 7400 + }, + { + "epoch": 7.075489502241094, + "grad_norm": 0.8261534571647644, + "learning_rate": 7.499e-05, + "loss": 5.4594, + "step": 7500 + }, + { + "epoch": 7.169851380042463, + "grad_norm": 0.9930619597434998, + "learning_rate": 7.599000000000001e-05, + "loss": 5.4673, + "step": 7600 + }, + { + "epoch": 7.264213257843831, + "grad_norm": 0.9928425550460815, + "learning_rate": 7.699e-05, + "loss": 5.4517, + "step": 7700 + }, + { + "epoch": 7.358575135645199, + "grad_norm": 0.9189549088478088, + "learning_rate": 7.799e-05, + "loss": 5.4587, + "step": 7800 + }, + { + "epoch": 7.452937013446568, + "grad_norm": 0.8727918267250061, + "learning_rate": 7.899000000000001e-05, + "loss": 5.4462, + "step": 7900 + }, + { + "epoch": 7.5472988912479355, + "grad_norm": 0.87702476978302, + "learning_rate": 7.999000000000001e-05, + "loss": 5.4608, + "step": 8000 + }, + { + "epoch": 7.5472988912479355, + "eval_loss": 5.4536590576171875, + "eval_runtime": 74.428, + "eval_samples_per_second": 202.491, + "eval_steps_per_second": 6.328, + "step": 8000 + }, + { + "epoch": 7.641660769049304, + "grad_norm": 0.9891802668571472, + "learning_rate": 8.099e-05, + "loss": 5.4534, + "step": 8100 + }, + { + "epoch": 7.736022646850673, + "grad_norm": 1.0268616676330566, + "learning_rate": 8.199e-05, + "loss": 5.4537, + "step": 8200 + }, + { + "epoch": 7.8303845246520405, + "grad_norm": 0.7097076177597046, + "learning_rate": 8.299e-05, + "loss": 5.4557, + "step": 8300 + }, + { + "epoch": 7.924746402453409, + "grad_norm": 0.955984354019165, + "learning_rate": 8.399e-05, + "loss": 5.4478, + "step": 8400 + }, + { + "epoch": 8.018872375560274, + "grad_norm": 1.1078083515167236, + "learning_rate": 8.499e-05, + "loss": 5.4363, + "step": 8500 + }, + { + "epoch": 8.113234253361641, + "grad_norm": 0.8345474004745483, + "learning_rate": 8.599000000000001e-05, + "loss": 5.4326, + "step": 8600 + }, + { + "epoch": 8.20759613116301, + "grad_norm": 0.8543062210083008, + "learning_rate": 8.699e-05, + "loss": 5.4211, + "step": 8700 + }, + { + "epoch": 8.301958008964379, + "grad_norm": 0.9149999022483826, + "learning_rate": 8.799e-05, + "loss": 5.4342, + "step": 8800 + }, + { + "epoch": 8.396319886765747, + "grad_norm": 0.8864068984985352, + "learning_rate": 8.899e-05, + "loss": 5.4274, + "step": 8900 + }, + { + "epoch": 8.490681764567114, + "grad_norm": 0.8433997631072998, + "learning_rate": 8.999000000000001e-05, + "loss": 5.4176, + "step": 9000 + }, + { + "epoch": 8.490681764567114, + "eval_loss": 5.430926322937012, + "eval_runtime": 74.4314, + "eval_samples_per_second": 202.482, + "eval_steps_per_second": 6.328, + "step": 9000 + }, + { + "epoch": 8.585043642368483, + "grad_norm": 0.9321457147598267, + "learning_rate": 9.099000000000001e-05, + "loss": 5.4309, + "step": 9100 + }, + { + "epoch": 8.679405520169851, + "grad_norm": 0.9454299211502075, + "learning_rate": 9.199e-05, + "loss": 5.4222, + "step": 9200 + }, + { + "epoch": 8.77376739797122, + "grad_norm": 0.9724736213684082, + "learning_rate": 9.299e-05, + "loss": 5.4294, + "step": 9300 + }, + { + "epoch": 8.868129275772588, + "grad_norm": 0.8900998830795288, + "learning_rate": 9.399e-05, + "loss": 5.4178, + "step": 9400 + }, + { + "epoch": 8.962491153573955, + "grad_norm": 0.8408654928207397, + "learning_rate": 9.499e-05, + "loss": 5.4098, + "step": 9500 + }, + { + "epoch": 9.056617126680822, + "grad_norm": 0.9535470008850098, + "learning_rate": 9.599000000000001e-05, + "loss": 5.4211, + "step": 9600 + }, + { + "epoch": 9.150979004482188, + "grad_norm": 0.9209726452827454, + "learning_rate": 9.699e-05, + "loss": 5.406, + "step": 9700 + }, + { + "epoch": 9.245340882283557, + "grad_norm": 0.9139648079872131, + "learning_rate": 9.799e-05, + "loss": 5.4004, + "step": 9800 + }, + { + "epoch": 9.339702760084926, + "grad_norm": 0.9717345833778381, + "learning_rate": 9.899e-05, + "loss": 5.4054, + "step": 9900 + }, + { + "epoch": 9.434064637886294, + "grad_norm": 0.9255397319793701, + "learning_rate": 9.999000000000001e-05, + "loss": 5.4013, + "step": 10000 + }, + { + "epoch": 9.434064637886294, + "eval_loss": 5.412718296051025, + "eval_runtime": 74.3953, + "eval_samples_per_second": 202.58, + "eval_steps_per_second": 6.331, + "step": 10000 + }, + { + "epoch": 9.528426515687663, + "grad_norm": 1.028997540473938, + "learning_rate": 9.999970144476398e-05, + "loss": 5.4009, + "step": 10100 + }, + { + "epoch": 9.62278839348903, + "grad_norm": 0.9851202964782715, + "learning_rate": 9.999879368940656e-05, + "loss": 5.4014, + "step": 10200 + }, + { + "epoch": 9.717150271290398, + "grad_norm": 0.8858544230461121, + "learning_rate": 9.999727671452668e-05, + "loss": 5.3941, + "step": 10300 + }, + { + "epoch": 9.811512149091767, + "grad_norm": 1.0239341259002686, + "learning_rate": 9.999515053860821e-05, + "loss": 5.4031, + "step": 10400 + }, + { + "epoch": 9.905874026893136, + "grad_norm": 1.0142488479614258, + "learning_rate": 9.999241518755793e-05, + "loss": 5.395, + "step": 10500 + }, + { + "epoch": 10.0, + "grad_norm": 0.9302240014076233, + "learning_rate": 9.998907069470524e-05, + "loss": 5.3871, + "step": 10600 + }, + { + "epoch": 10.094361877801369, + "grad_norm": 0.9354774951934814, + "learning_rate": 9.998511710080171e-05, + "loss": 5.3863, + "step": 10700 + }, + { + "epoch": 10.188723755602737, + "grad_norm": 1.126571774482727, + "learning_rate": 9.998055445402067e-05, + "loss": 5.377, + "step": 10800 + }, + { + "epoch": 10.283085633404104, + "grad_norm": 0.9135729670524597, + "learning_rate": 9.997538280995651e-05, + "loss": 5.3809, + "step": 10900 + }, + { + "epoch": 10.377447511205473, + "grad_norm": 0.8609702587127686, + "learning_rate": 9.996960223162406e-05, + "loss": 5.3784, + "step": 11000 + }, + { + "epoch": 10.377447511205473, + "eval_loss": 5.395565986633301, + "eval_runtime": 74.4304, + "eval_samples_per_second": 202.484, + "eval_steps_per_second": 6.328, + "step": 11000 + }, + { + "epoch": 10.471809389006841, + "grad_norm": 1.0586097240447998, + "learning_rate": 9.996321278945788e-05, + "loss": 5.3636, + "step": 11100 + }, + { + "epoch": 10.56617126680821, + "grad_norm": 0.8425832390785217, + "learning_rate": 9.995621456131128e-05, + "loss": 5.3767, + "step": 11200 + }, + { + "epoch": 10.660533144609577, + "grad_norm": 0.9150360822677612, + "learning_rate": 9.994860763245549e-05, + "loss": 5.3744, + "step": 11300 + }, + { + "epoch": 10.754895022410945, + "grad_norm": 0.9344813227653503, + "learning_rate": 9.99403920955785e-05, + "loss": 5.3904, + "step": 11400 + }, + { + "epoch": 10.849256900212314, + "grad_norm": 1.030868411064148, + "learning_rate": 9.993156805078405e-05, + "loss": 5.3683, + "step": 11500 + }, + { + "epoch": 10.943618778013683, + "grad_norm": 0.9607158899307251, + "learning_rate": 9.992213560559034e-05, + "loss": 5.3726, + "step": 11600 + }, + { + "epoch": 11.037744751120547, + "grad_norm": 0.9345623254776001, + "learning_rate": 9.991209487492876e-05, + "loss": 5.3647, + "step": 11700 + }, + { + "epoch": 11.132106628921916, + "grad_norm": 0.8988705277442932, + "learning_rate": 9.990144598114242e-05, + "loss": 5.3618, + "step": 11800 + }, + { + "epoch": 11.226468506723284, + "grad_norm": 1.0196607112884521, + "learning_rate": 9.989018905398473e-05, + "loss": 5.364, + "step": 11900 + }, + { + "epoch": 11.320830384524651, + "grad_norm": 0.8753230571746826, + "learning_rate": 9.98783242306178e-05, + "loss": 5.3542, + "step": 12000 + }, + { + "epoch": 11.320830384524651, + "eval_loss": 5.378592491149902, + "eval_runtime": 74.4306, + "eval_samples_per_second": 202.484, + "eval_steps_per_second": 6.328, + "step": 12000 + }, + { + "epoch": 11.41519226232602, + "grad_norm": 0.8527357578277588, + "learning_rate": 9.986585165561076e-05, + "loss": 5.3541, + "step": 12100 + }, + { + "epoch": 11.509554140127388, + "grad_norm": 0.8356689214706421, + "learning_rate": 9.9852771480938e-05, + "loss": 5.3532, + "step": 12200 + }, + { + "epoch": 11.603916017928757, + "grad_norm": 0.8830206990242004, + "learning_rate": 9.983908386597732e-05, + "loss": 5.3614, + "step": 12300 + }, + { + "epoch": 11.698277895730126, + "grad_norm": 0.937129557132721, + "learning_rate": 9.9824788977508e-05, + "loss": 5.3556, + "step": 12400 + }, + { + "epoch": 11.792639773531493, + "grad_norm": 0.8641858100891113, + "learning_rate": 9.980988698970872e-05, + "loss": 5.3481, + "step": 12500 + }, + { + "epoch": 11.887001651332861, + "grad_norm": 0.9982609152793884, + "learning_rate": 9.979437808415552e-05, + "loss": 5.348, + "step": 12600 + }, + { + "epoch": 11.98136352913423, + "grad_norm": 0.9662423133850098, + "learning_rate": 9.977826244981952e-05, + "loss": 5.3448, + "step": 12700 + }, + { + "epoch": 12.075489502241094, + "grad_norm": 0.991277277469635, + "learning_rate": 9.976154028306461e-05, + "loss": 5.3463, + "step": 12800 + }, + { + "epoch": 12.169851380042463, + "grad_norm": 0.841627299785614, + "learning_rate": 9.974421178764515e-05, + "loss": 5.3306, + "step": 12900 + }, + { + "epoch": 12.264213257843831, + "grad_norm": 0.886860191822052, + "learning_rate": 9.972627717470337e-05, + "loss": 5.3562, + "step": 13000 + }, + { + "epoch": 12.264213257843831, + "eval_loss": 5.368251800537109, + "eval_runtime": 74.4164, + "eval_samples_per_second": 202.522, + "eval_steps_per_second": 6.329, + "step": 13000 + }, + { + "epoch": 12.3585751356452, + "grad_norm": 0.9396846294403076, + "learning_rate": 9.970773666276686e-05, + "loss": 5.3461, + "step": 13100 + }, + { + "epoch": 12.452937013446567, + "grad_norm": 0.9581524729728699, + "learning_rate": 9.968859047774595e-05, + "loss": 5.3421, + "step": 13200 + }, + { + "epoch": 12.547298891247936, + "grad_norm": 0.9131600856781006, + "learning_rate": 9.966883885293081e-05, + "loss": 5.3472, + "step": 13300 + }, + { + "epoch": 12.641660769049304, + "grad_norm": 0.9555962085723877, + "learning_rate": 9.964848202898879e-05, + "loss": 5.3453, + "step": 13400 + }, + { + "epoch": 12.736022646850673, + "grad_norm": 0.8874040842056274, + "learning_rate": 9.962752025396133e-05, + "loss": 5.3367, + "step": 13500 + }, + { + "epoch": 12.830384524652041, + "grad_norm": 1.0667670965194702, + "learning_rate": 9.96059537832611e-05, + "loss": 5.3221, + "step": 13600 + }, + { + "epoch": 12.924746402453408, + "grad_norm": 1.00221848487854, + "learning_rate": 9.958378287966868e-05, + "loss": 5.3402, + "step": 13700 + }, + { + "epoch": 13.018872375560274, + "grad_norm": 1.027253270149231, + "learning_rate": 9.956100781332958e-05, + "loss": 5.3324, + "step": 13800 + }, + { + "epoch": 13.113234253361641, + "grad_norm": 1.002406120300293, + "learning_rate": 9.953762886175075e-05, + "loss": 5.326, + "step": 13900 + }, + { + "epoch": 13.20759613116301, + "grad_norm": 0.8681198954582214, + "learning_rate": 9.951364630979738e-05, + "loss": 5.3311, + "step": 14000 + }, + { + "epoch": 13.20759613116301, + "eval_loss": 5.359684944152832, + "eval_runtime": 74.4319, + "eval_samples_per_second": 202.48, + "eval_steps_per_second": 6.328, + "step": 14000 + }, + { + "epoch": 13.301958008964379, + "grad_norm": 0.8745918273925781, + "learning_rate": 9.948906044968926e-05, + "loss": 5.3234, + "step": 14100 + }, + { + "epoch": 13.396319886765747, + "grad_norm": 0.8706562519073486, + "learning_rate": 9.946387158099738e-05, + "loss": 5.327, + "step": 14200 + }, + { + "epoch": 13.490681764567114, + "grad_norm": 0.9299585819244385, + "learning_rate": 9.943808001064013e-05, + "loss": 5.3347, + "step": 14300 + }, + { + "epoch": 13.585043642368483, + "grad_norm": 0.9840534925460815, + "learning_rate": 9.941168605287965e-05, + "loss": 5.3222, + "step": 14400 + }, + { + "epoch": 13.679405520169851, + "grad_norm": 0.8881776332855225, + "learning_rate": 9.938469002931798e-05, + "loss": 5.3211, + "step": 14500 + }, + { + "epoch": 13.77376739797122, + "grad_norm": 1.0282058715820312, + "learning_rate": 9.935709226889319e-05, + "loss": 5.3328, + "step": 14600 + }, + { + "epoch": 13.868129275772588, + "grad_norm": 0.9646194577217102, + "learning_rate": 9.932889310787522e-05, + "loss": 5.3127, + "step": 14700 + }, + { + "epoch": 13.962491153573955, + "grad_norm": 0.9002189636230469, + "learning_rate": 9.9300092889862e-05, + "loss": 5.3208, + "step": 14800 + }, + { + "epoch": 14.056617126680822, + "grad_norm": 0.9314839839935303, + "learning_rate": 9.927069196577507e-05, + "loss": 5.3141, + "step": 14900 + }, + { + "epoch": 14.150979004482188, + "grad_norm": 0.9567928314208984, + "learning_rate": 9.924069069385543e-05, + "loss": 5.3125, + "step": 15000 + }, + { + "epoch": 14.150979004482188, + "eval_loss": 5.350492477416992, + "eval_runtime": 74.4863, + "eval_samples_per_second": 202.333, + "eval_steps_per_second": 6.323, + "step": 15000 + }, + { + "epoch": 14.245340882283557, + "grad_norm": 0.9932417273521423, + "learning_rate": 9.921008943965908e-05, + "loss": 5.3151, + "step": 15100 + }, + { + "epoch": 14.339702760084926, + "grad_norm": 1.0583364963531494, + "learning_rate": 9.917888857605268e-05, + "loss": 5.3138, + "step": 15200 + }, + { + "epoch": 14.434064637886294, + "grad_norm": 0.9595634341239929, + "learning_rate": 9.91470884832089e-05, + "loss": 5.2995, + "step": 15300 + }, + { + "epoch": 14.528426515687663, + "grad_norm": 1.0111640691757202, + "learning_rate": 9.911468954860181e-05, + "loss": 5.3113, + "step": 15400 + }, + { + "epoch": 14.62278839348903, + "grad_norm": 0.872369647026062, + "learning_rate": 9.908169216700223e-05, + "loss": 5.3142, + "step": 15500 + }, + { + "epoch": 14.717150271290398, + "grad_norm": 1.0529162883758545, + "learning_rate": 9.904809674047284e-05, + "loss": 5.325, + "step": 15600 + }, + { + "epoch": 14.811512149091767, + "grad_norm": 0.9173269271850586, + "learning_rate": 9.90139036783633e-05, + "loss": 5.307, + "step": 15700 + }, + { + "epoch": 14.905874026893136, + "grad_norm": 0.8593319058418274, + "learning_rate": 9.897911339730527e-05, + "loss": 5.3098, + "step": 15800 + }, + { + "epoch": 15.0, + "grad_norm": 1.0840786695480347, + "learning_rate": 9.894372632120738e-05, + "loss": 5.3069, + "step": 15900 + }, + { + "epoch": 15.094361877801369, + "grad_norm": 0.9976385235786438, + "learning_rate": 9.890774288124996e-05, + "loss": 5.301, + "step": 16000 + }, + { + "epoch": 15.094361877801369, + "eval_loss": 5.339163303375244, + "eval_runtime": 74.4349, + "eval_samples_per_second": 202.472, + "eval_steps_per_second": 6.328, + "step": 16000 + }, + { + "epoch": 15.188723755602737, + "grad_norm": 0.8718565702438354, + "learning_rate": 9.887116351587985e-05, + "loss": 5.3039, + "step": 16100 + }, + { + "epoch": 15.283085633404104, + "grad_norm": 0.942323625087738, + "learning_rate": 9.883398867080513e-05, + "loss": 5.2966, + "step": 16200 + }, + { + "epoch": 15.377447511205473, + "grad_norm": 0.9328644871711731, + "learning_rate": 9.87962187989895e-05, + "loss": 5.2976, + "step": 16300 + }, + { + "epoch": 15.471809389006841, + "grad_norm": 0.9764389395713806, + "learning_rate": 9.875785436064697e-05, + "loss": 5.3005, + "step": 16400 + }, + { + "epoch": 15.56617126680821, + "grad_norm": 0.951077401638031, + "learning_rate": 9.871889582323609e-05, + "loss": 5.2958, + "step": 16500 + }, + { + "epoch": 15.660533144609577, + "grad_norm": 0.8876622915267944, + "learning_rate": 9.867934366145435e-05, + "loss": 5.3006, + "step": 16600 + }, + { + "epoch": 15.754895022410945, + "grad_norm": 1.1068483591079712, + "learning_rate": 9.863919835723236e-05, + "loss": 5.3, + "step": 16700 + }, + { + "epoch": 15.849256900212314, + "grad_norm": 0.9473031163215637, + "learning_rate": 9.859846039972798e-05, + "loss": 5.2966, + "step": 16800 + }, + { + "epoch": 15.943618778013683, + "grad_norm": 1.0127873420715332, + "learning_rate": 9.855713028532036e-05, + "loss": 5.2963, + "step": 16900 + }, + { + "epoch": 16.03774475112055, + "grad_norm": 0.8792197108268738, + "learning_rate": 9.851520851760394e-05, + "loss": 5.2898, + "step": 17000 + }, + { + "epoch": 16.03774475112055, + "eval_loss": 5.332190036773682, + "eval_runtime": 74.4273, + "eval_samples_per_second": 202.493, + "eval_steps_per_second": 6.328, + "step": 17000 + }, + { + "epoch": 16.132106628921914, + "grad_norm": 0.9135002493858337, + "learning_rate": 9.847269560738218e-05, + "loss": 5.2841, + "step": 17100 + }, + { + "epoch": 16.226468506723283, + "grad_norm": 0.8058662414550781, + "learning_rate": 9.842959207266149e-05, + "loss": 5.2849, + "step": 17200 + }, + { + "epoch": 16.32083038452465, + "grad_norm": 0.919384777545929, + "learning_rate": 9.838589843864484e-05, + "loss": 5.2938, + "step": 17300 + }, + { + "epoch": 16.41519226232602, + "grad_norm": 0.9282925128936768, + "learning_rate": 9.834161523772539e-05, + "loss": 5.2806, + "step": 17400 + }, + { + "epoch": 16.50955414012739, + "grad_norm": 0.8830996155738831, + "learning_rate": 9.829674300947993e-05, + "loss": 5.2875, + "step": 17500 + }, + { + "epoch": 16.603916017928757, + "grad_norm": 0.8510077595710754, + "learning_rate": 9.825128230066244e-05, + "loss": 5.296, + "step": 17600 + }, + { + "epoch": 16.698277895730126, + "grad_norm": 0.9782845377922058, + "learning_rate": 9.82052336651973e-05, + "loss": 5.2783, + "step": 17700 + }, + { + "epoch": 16.792639773531494, + "grad_norm": 0.9722960591316223, + "learning_rate": 9.815859766417257e-05, + "loss": 5.2933, + "step": 17800 + }, + { + "epoch": 16.887001651332863, + "grad_norm": 0.9551956653594971, + "learning_rate": 9.811137486583324e-05, + "loss": 5.2802, + "step": 17900 + }, + { + "epoch": 16.981363529134228, + "grad_norm": 0.9742000699043274, + "learning_rate": 9.806356584557419e-05, + "loss": 5.296, + "step": 18000 + }, + { + "epoch": 16.981363529134228, + "eval_loss": 5.326197147369385, + "eval_runtime": 74.3924, + "eval_samples_per_second": 202.588, + "eval_steps_per_second": 6.331, + "step": 18000 + }, + { + "epoch": 17.075489502241094, + "grad_norm": 0.8250226974487305, + "learning_rate": 9.801517118593327e-05, + "loss": 5.283, + "step": 18100 + }, + { + "epoch": 17.169851380042463, + "grad_norm": 0.9234561324119568, + "learning_rate": 9.796619147658408e-05, + "loss": 5.2766, + "step": 18200 + }, + { + "epoch": 17.26421325784383, + "grad_norm": 0.920927107334137, + "learning_rate": 9.791662731432898e-05, + "loss": 5.2737, + "step": 18300 + }, + { + "epoch": 17.3585751356452, + "grad_norm": 0.9160899519920349, + "learning_rate": 9.78664793030916e-05, + "loss": 5.2762, + "step": 18400 + }, + { + "epoch": 17.45293701344657, + "grad_norm": 0.9943878054618835, + "learning_rate": 9.781574805390967e-05, + "loss": 5.2694, + "step": 18500 + }, + { + "epoch": 17.547298891247937, + "grad_norm": 0.9411502480506897, + "learning_rate": 9.776443418492744e-05, + "loss": 5.2783, + "step": 18600 + }, + { + "epoch": 17.641660769049302, + "grad_norm": 0.9028970003128052, + "learning_rate": 9.771253832138819e-05, + "loss": 5.2812, + "step": 18700 + }, + { + "epoch": 17.73602264685067, + "grad_norm": 0.930030345916748, + "learning_rate": 9.766006109562664e-05, + "loss": 5.279, + "step": 18800 + }, + { + "epoch": 17.83038452465204, + "grad_norm": 1.0116451978683472, + "learning_rate": 9.760700314706125e-05, + "loss": 5.2862, + "step": 18900 + }, + { + "epoch": 17.924746402453408, + "grad_norm": 0.8852808475494385, + "learning_rate": 9.755336512218638e-05, + "loss": 5.2797, + "step": 19000 + }, + { + "epoch": 17.924746402453408, + "eval_loss": 5.320762634277344, + "eval_runtime": 74.4346, + "eval_samples_per_second": 202.473, + "eval_steps_per_second": 6.328, + "step": 19000 + }, + { + "epoch": 18.018872375560274, + "grad_norm": 0.9398846626281738, + "learning_rate": 9.749914767456441e-05, + "loss": 5.2747, + "step": 19100 + }, + { + "epoch": 18.113234253361643, + "grad_norm": 0.8749862909317017, + "learning_rate": 9.744435146481785e-05, + "loss": 5.2593, + "step": 19200 + }, + { + "epoch": 18.20759613116301, + "grad_norm": 0.812777578830719, + "learning_rate": 9.738897716062121e-05, + "loss": 5.2673, + "step": 19300 + }, + { + "epoch": 18.301958008964377, + "grad_norm": 0.9763257503509521, + "learning_rate": 9.733302543669291e-05, + "loss": 5.2755, + "step": 19400 + }, + { + "epoch": 18.396319886765745, + "grad_norm": 0.9894685745239258, + "learning_rate": 9.727649697478708e-05, + "loss": 5.2701, + "step": 19500 + }, + { + "epoch": 18.490681764567114, + "grad_norm": 1.0243197679519653, + "learning_rate": 9.721939246368515e-05, + "loss": 5.2652, + "step": 19600 + }, + { + "epoch": 18.585043642368483, + "grad_norm": 0.9538172483444214, + "learning_rate": 9.716171259918758e-05, + "loss": 5.2685, + "step": 19700 + }, + { + "epoch": 18.67940552016985, + "grad_norm": 1.0511356592178345, + "learning_rate": 9.710345808410532e-05, + "loss": 5.2751, + "step": 19800 + }, + { + "epoch": 18.77376739797122, + "grad_norm": 0.8636530637741089, + "learning_rate": 9.704462962825124e-05, + "loss": 5.2657, + "step": 19900 + }, + { + "epoch": 18.86812927577259, + "grad_norm": 0.820347785949707, + "learning_rate": 9.698522794843154e-05, + "loss": 5.2805, + "step": 20000 + }, + { + "epoch": 18.86812927577259, + "eval_loss": 5.320885181427002, + "eval_runtime": 74.4058, + "eval_samples_per_second": 202.551, + "eval_steps_per_second": 6.33, + "step": 20000 + }, + { + "epoch": 18.962491153573957, + "grad_norm": 0.9779840707778931, + "learning_rate": 9.692525376843691e-05, + "loss": 5.2646, + "step": 20100 + }, + { + "epoch": 19.05661712668082, + "grad_norm": 0.883474588394165, + "learning_rate": 9.686470781903383e-05, + "loss": 5.2645, + "step": 20200 + }, + { + "epoch": 19.15097900448219, + "grad_norm": 1.031437873840332, + "learning_rate": 9.680359083795557e-05, + "loss": 5.2595, + "step": 20300 + }, + { + "epoch": 19.245340882283557, + "grad_norm": 0.8232062458992004, + "learning_rate": 9.674190356989325e-05, + "loss": 5.2606, + "step": 20400 + }, + { + "epoch": 19.339702760084926, + "grad_norm": 0.8686699271202087, + "learning_rate": 9.66796467664868e-05, + "loss": 5.2575, + "step": 20500 + }, + { + "epoch": 19.434064637886294, + "grad_norm": 0.8924238085746765, + "learning_rate": 9.661682118631568e-05, + "loss": 5.2488, + "step": 20600 + }, + { + "epoch": 19.528426515687663, + "grad_norm": 0.9776726365089417, + "learning_rate": 9.655342759488979e-05, + "loss": 5.264, + "step": 20700 + }, + { + "epoch": 19.62278839348903, + "grad_norm": 0.9295623302459717, + "learning_rate": 9.648946676464002e-05, + "loss": 5.2589, + "step": 20800 + }, + { + "epoch": 19.7171502712904, + "grad_norm": 0.9422546625137329, + "learning_rate": 9.642493947490889e-05, + "loss": 5.2616, + "step": 20900 + }, + { + "epoch": 19.81151214909177, + "grad_norm": 1.0146008729934692, + "learning_rate": 9.635984651194109e-05, + "loss": 5.2631, + "step": 21000 + }, + { + "epoch": 19.81151214909177, + "eval_loss": 5.316196918487549, + "eval_runtime": 74.4315, + "eval_samples_per_second": 202.481, + "eval_steps_per_second": 6.328, + "step": 21000 + }, + { + "epoch": 19.905874026893134, + "grad_norm": 0.8792014718055725, + "learning_rate": 9.629418866887381e-05, + "loss": 5.2685, + "step": 21100 + }, + { + "epoch": 20.0, + "grad_norm": 1.1166847944259644, + "learning_rate": 9.622796674572716e-05, + "loss": 5.2622, + "step": 21200 + }, + { + "epoch": 20.09436187780137, + "grad_norm": 0.888293445110321, + "learning_rate": 9.616118154939436e-05, + "loss": 5.2506, + "step": 21300 + }, + { + "epoch": 20.188723755602737, + "grad_norm": 0.9711266756057739, + "learning_rate": 9.609383389363198e-05, + "loss": 5.2503, + "step": 21400 + }, + { + "epoch": 20.283085633404106, + "grad_norm": 0.9572346806526184, + "learning_rate": 9.602592459904993e-05, + "loss": 5.2483, + "step": 21500 + }, + { + "epoch": 20.377447511205474, + "grad_norm": 0.8501202464103699, + "learning_rate": 9.595745449310152e-05, + "loss": 5.2526, + "step": 21600 + }, + { + "epoch": 20.47180938900684, + "grad_norm": 0.9165418744087219, + "learning_rate": 9.588842441007342e-05, + "loss": 5.2609, + "step": 21700 + }, + { + "epoch": 20.566171266808208, + "grad_norm": 0.8789685964584351, + "learning_rate": 9.581883519107538e-05, + "loss": 5.2498, + "step": 21800 + }, + { + "epoch": 20.660533144609577, + "grad_norm": 0.9526987671852112, + "learning_rate": 9.574868768403007e-05, + "loss": 5.2476, + "step": 21900 + }, + { + "epoch": 20.754895022410945, + "grad_norm": 0.8855116963386536, + "learning_rate": 9.567798274366273e-05, + "loss": 5.2582, + "step": 22000 + }, + { + "epoch": 20.754895022410945, + "eval_loss": 5.307859897613525, + "eval_runtime": 74.4524, + "eval_samples_per_second": 202.425, + "eval_steps_per_second": 6.326, + "step": 22000 + }, + { + "epoch": 20.849256900212314, + "grad_norm": 0.821835994720459, + "learning_rate": 9.560672123149077e-05, + "loss": 5.2586, + "step": 22100 + }, + { + "epoch": 20.943618778013683, + "grad_norm": 1.0579211711883545, + "learning_rate": 9.55349040158132e-05, + "loss": 5.2541, + "step": 22200 + }, + { + "epoch": 21.03774475112055, + "grad_norm": 1.004662275314331, + "learning_rate": 9.546253197170015e-05, + "loss": 5.2489, + "step": 22300 + }, + { + "epoch": 21.132106628921914, + "grad_norm": 0.9038196802139282, + "learning_rate": 9.538960598098211e-05, + "loss": 5.2426, + "step": 22400 + }, + { + "epoch": 21.226468506723283, + "grad_norm": 0.8407565951347351, + "learning_rate": 9.531612693223928e-05, + "loss": 5.2479, + "step": 22500 + }, + { + "epoch": 21.32083038452465, + "grad_norm": 1.034454107284546, + "learning_rate": 9.524209572079068e-05, + "loss": 5.2437, + "step": 22600 + }, + { + "epoch": 21.41519226232602, + "grad_norm": 0.9391712546348572, + "learning_rate": 9.516751324868326e-05, + "loss": 5.2527, + "step": 22700 + }, + { + "epoch": 21.50955414012739, + "grad_norm": 0.9005853533744812, + "learning_rate": 9.509238042468091e-05, + "loss": 5.2437, + "step": 22800 + }, + { + "epoch": 21.603916017928757, + "grad_norm": 0.8823351860046387, + "learning_rate": 9.501669816425337e-05, + "loss": 5.2555, + "step": 22900 + }, + { + "epoch": 21.698277895730126, + "grad_norm": 0.9329714179039001, + "learning_rate": 9.494046738956508e-05, + "loss": 5.237, + "step": 23000 + }, + { + "epoch": 21.698277895730126, + "eval_loss": 5.306467056274414, + "eval_runtime": 74.4426, + "eval_samples_per_second": 202.451, + "eval_steps_per_second": 6.327, + "step": 23000 + }, + { + "epoch": 21.792639773531494, + "grad_norm": 0.8710659742355347, + "learning_rate": 9.486368902946402e-05, + "loss": 5.2476, + "step": 23100 + }, + { + "epoch": 21.887001651332863, + "grad_norm": 0.9512269496917725, + "learning_rate": 9.478636401947026e-05, + "loss": 5.2428, + "step": 23200 + }, + { + "epoch": 21.981363529134228, + "grad_norm": 1.0027644634246826, + "learning_rate": 9.47084933017646e-05, + "loss": 5.2451, + "step": 23300 + }, + { + "epoch": 22.075489502241094, + "grad_norm": 0.9528675079345703, + "learning_rate": 9.463007782517723e-05, + "loss": 5.2369, + "step": 23400 + }, + { + "epoch": 22.169851380042463, + "grad_norm": 0.9792280197143555, + "learning_rate": 9.455111854517595e-05, + "loss": 5.229, + "step": 23500 + }, + { + "epoch": 22.26421325784383, + "grad_norm": 0.8722760081291199, + "learning_rate": 9.447161642385467e-05, + "loss": 5.2364, + "step": 23600 + }, + { + "epoch": 22.3585751356452, + "grad_norm": 0.9419755935668945, + "learning_rate": 9.439157242992164e-05, + "loss": 5.2363, + "step": 23700 + }, + { + "epoch": 22.45293701344657, + "grad_norm": 0.8856399059295654, + "learning_rate": 9.43109875386877e-05, + "loss": 5.2399, + "step": 23800 + }, + { + "epoch": 22.547298891247937, + "grad_norm": 0.9981026649475098, + "learning_rate": 9.422986273205429e-05, + "loss": 5.2433, + "step": 23900 + }, + { + "epoch": 22.641660769049302, + "grad_norm": 0.9505060911178589, + "learning_rate": 9.414819899850158e-05, + "loss": 5.2481, + "step": 24000 + }, + { + "epoch": 22.641660769049302, + "eval_loss": 5.302011489868164, + "eval_runtime": 74.4717, + "eval_samples_per_second": 202.372, + "eval_steps_per_second": 6.325, + "step": 24000 + }, + { + "epoch": 22.73602264685067, + "grad_norm": 0.8508577942848206, + "learning_rate": 9.40659973330764e-05, + "loss": 5.2348, + "step": 24100 + }, + { + "epoch": 22.83038452465204, + "grad_norm": 0.9357001781463623, + "learning_rate": 9.398325873738007e-05, + "loss": 5.2411, + "step": 24200 + }, + { + "epoch": 22.924746402453408, + "grad_norm": 1.0099997520446777, + "learning_rate": 9.389998421955632e-05, + "loss": 5.2383, + "step": 24300 + }, + { + "epoch": 23.018872375560274, + "grad_norm": 0.9384352564811707, + "learning_rate": 9.381617479427885e-05, + "loss": 5.2366, + "step": 24400 + }, + { + "epoch": 23.113234253361643, + "grad_norm": 1.0836660861968994, + "learning_rate": 9.373183148273905e-05, + "loss": 5.2278, + "step": 24500 + }, + { + "epoch": 23.20759613116301, + "grad_norm": 0.9002297520637512, + "learning_rate": 9.364695531263354e-05, + "loss": 5.229, + "step": 24600 + }, + { + "epoch": 23.301958008964377, + "grad_norm": 0.9582403302192688, + "learning_rate": 9.35615473181517e-05, + "loss": 5.2378, + "step": 24700 + }, + { + "epoch": 23.396319886765745, + "grad_norm": 0.9208775162696838, + "learning_rate": 9.347560853996298e-05, + "loss": 5.2292, + "step": 24800 + }, + { + "epoch": 23.490681764567114, + "grad_norm": 0.9435613751411438, + "learning_rate": 9.338914002520426e-05, + "loss": 5.2313, + "step": 24900 + }, + { + "epoch": 23.585043642368483, + "grad_norm": 0.8621221780776978, + "learning_rate": 9.330214282746712e-05, + "loss": 5.2405, + "step": 25000 + }, + { + "epoch": 23.585043642368483, + "eval_loss": 5.301399230957031, + "eval_runtime": 74.4481, + "eval_samples_per_second": 202.436, + "eval_steps_per_second": 6.327, + "step": 25000 + }, + { + "epoch": 23.67940552016985, + "grad_norm": 0.9942018389701843, + "learning_rate": 9.321461800678494e-05, + "loss": 5.2424, + "step": 25100 + }, + { + "epoch": 23.77376739797122, + "grad_norm": 0.8850314021110535, + "learning_rate": 9.312656662962004e-05, + "loss": 5.2341, + "step": 25200 + }, + { + "epoch": 23.86812927577259, + "grad_norm": 0.8494738936424255, + "learning_rate": 9.30379897688507e-05, + "loss": 5.2369, + "step": 25300 + }, + { + "epoch": 23.962491153573957, + "grad_norm": 0.8826471567153931, + "learning_rate": 9.294888850375796e-05, + "loss": 5.2394, + "step": 25400 + }, + { + "epoch": 24.05661712668082, + "grad_norm": 0.9835422039031982, + "learning_rate": 9.285926392001265e-05, + "loss": 5.2274, + "step": 25500 + }, + { + "epoch": 24.15097900448219, + "grad_norm": 0.9036953449249268, + "learning_rate": 9.276911710966205e-05, + "loss": 5.2244, + "step": 25600 + }, + { + "epoch": 24.245340882283557, + "grad_norm": 0.916057288646698, + "learning_rate": 9.267844917111657e-05, + "loss": 5.2275, + "step": 25700 + }, + { + "epoch": 24.339702760084926, + "grad_norm": 0.8915982246398926, + "learning_rate": 9.258726120913643e-05, + "loss": 5.232, + "step": 25800 + }, + { + "epoch": 24.434064637886294, + "grad_norm": 1.0098341703414917, + "learning_rate": 9.249555433481819e-05, + "loss": 5.2243, + "step": 25900 + }, + { + "epoch": 24.528426515687663, + "grad_norm": 0.9247532486915588, + "learning_rate": 9.240332966558116e-05, + "loss": 5.2289, + "step": 26000 + }, + { + "epoch": 24.528426515687663, + "eval_loss": 5.294754505157471, + "eval_runtime": 74.4828, + "eval_samples_per_second": 202.342, + "eval_steps_per_second": 6.324, + "step": 26000 + }, + { + "epoch": 24.62278839348903, + "grad_norm": 0.9268798828125, + "learning_rate": 9.231058832515383e-05, + "loss": 5.2264, + "step": 26100 + }, + { + "epoch": 24.7171502712904, + "grad_norm": 0.9089873433113098, + "learning_rate": 9.221733144356015e-05, + "loss": 5.2231, + "step": 26200 + }, + { + "epoch": 24.81151214909177, + "grad_norm": 0.9429314136505127, + "learning_rate": 9.212356015710581e-05, + "loss": 5.2244, + "step": 26300 + }, + { + "epoch": 24.905874026893134, + "grad_norm": 0.9047618508338928, + "learning_rate": 9.202927560836436e-05, + "loss": 5.2291, + "step": 26400 + }, + { + "epoch": 25.0, + "grad_norm": 1.1686391830444336, + "learning_rate": 9.193447894616324e-05, + "loss": 5.2308, + "step": 26500 + }, + { + "epoch": 25.09436187780137, + "grad_norm": 0.9237032532691956, + "learning_rate": 9.183917132556987e-05, + "loss": 5.2191, + "step": 26600 + }, + { + "epoch": 25.188723755602737, + "grad_norm": 0.8873036503791809, + "learning_rate": 9.174335390787754e-05, + "loss": 5.2141, + "step": 26700 + }, + { + "epoch": 25.283085633404106, + "grad_norm": 0.964063286781311, + "learning_rate": 9.164702786059125e-05, + "loss": 5.2186, + "step": 26800 + }, + { + "epoch": 25.377447511205474, + "grad_norm": 0.8931275010108948, + "learning_rate": 9.155019435741348e-05, + "loss": 5.2149, + "step": 26900 + }, + { + "epoch": 25.47180938900684, + "grad_norm": 0.9907437562942505, + "learning_rate": 9.14528545782299e-05, + "loss": 5.2133, + "step": 27000 + }, + { + "epoch": 25.47180938900684, + "eval_loss": 5.286930084228516, + "eval_runtime": 74.443, + "eval_samples_per_second": 202.45, + "eval_steps_per_second": 6.327, + "step": 27000 + }, + { + "epoch": 25.566171266808208, + "grad_norm": 0.9528295397758484, + "learning_rate": 9.135500970909501e-05, + "loss": 5.2296, + "step": 27100 + }, + { + "epoch": 25.660533144609577, + "grad_norm": 0.939414918422699, + "learning_rate": 9.125666094221766e-05, + "loss": 5.2269, + "step": 27200 + }, + { + "epoch": 25.754895022410945, + "grad_norm": 0.9641194343566895, + "learning_rate": 9.115780947594654e-05, + "loss": 5.2203, + "step": 27300 + }, + { + "epoch": 25.849256900212314, + "grad_norm": 0.9802380204200745, + "learning_rate": 9.105845651475556e-05, + "loss": 5.2267, + "step": 27400 + }, + { + "epoch": 25.943618778013683, + "grad_norm": 0.8933775424957275, + "learning_rate": 9.09586032692292e-05, + "loss": 5.2196, + "step": 27500 + }, + { + "epoch": 26.03774475112055, + "grad_norm": 0.846247136592865, + "learning_rate": 9.085825095604778e-05, + "loss": 5.2226, + "step": 27600 + }, + { + "epoch": 26.132106628921914, + "grad_norm": 0.9873145222663879, + "learning_rate": 9.075740079797253e-05, + "loss": 5.2118, + "step": 27700 + }, + { + "epoch": 26.226468506723283, + "grad_norm": 0.9664213061332703, + "learning_rate": 9.06560540238308e-05, + "loss": 5.2127, + "step": 27800 + }, + { + "epoch": 26.32083038452465, + "grad_norm": 0.902673065662384, + "learning_rate": 9.055421186850104e-05, + "loss": 5.222, + "step": 27900 + }, + { + "epoch": 26.41519226232602, + "grad_norm": 1.0140800476074219, + "learning_rate": 9.045187557289783e-05, + "loss": 5.2122, + "step": 28000 + }, + { + "epoch": 26.41519226232602, + "eval_loss": 5.291759490966797, + "eval_runtime": 74.4092, + "eval_samples_per_second": 202.542, + "eval_steps_per_second": 6.33, + "step": 28000 + }, + { + "epoch": 26.50955414012739, + "grad_norm": 0.9105413556098938, + "learning_rate": 9.034904638395656e-05, + "loss": 5.2119, + "step": 28100 + }, + { + "epoch": 26.603916017928757, + "grad_norm": 0.8379244208335876, + "learning_rate": 9.024572555461852e-05, + "loss": 5.2099, + "step": 28200 + }, + { + "epoch": 26.698277895730126, + "grad_norm": 0.9082247614860535, + "learning_rate": 9.014191434381535e-05, + "loss": 5.2044, + "step": 28300 + }, + { + "epoch": 26.792639773531494, + "grad_norm": 0.9054832458496094, + "learning_rate": 9.003761401645393e-05, + "loss": 5.2102, + "step": 28400 + }, + { + "epoch": 26.887001651332863, + "grad_norm": 0.8595414757728577, + "learning_rate": 8.99328258434008e-05, + "loss": 5.2244, + "step": 28500 + }, + { + "epoch": 26.981363529134228, + "grad_norm": 0.9551483988761902, + "learning_rate": 8.982755110146681e-05, + "loss": 5.2267, + "step": 28600 + }, + { + "epoch": 27.075489502241094, + "grad_norm": 0.9670704007148743, + "learning_rate": 8.972179107339148e-05, + "loss": 5.2109, + "step": 28700 + }, + { + "epoch": 27.169851380042463, + "grad_norm": 0.9536374807357788, + "learning_rate": 8.961554704782731e-05, + "loss": 5.197, + "step": 28800 + }, + { + "epoch": 27.26421325784383, + "grad_norm": 0.9553530812263489, + "learning_rate": 8.95088203193243e-05, + "loss": 5.2085, + "step": 28900 + }, + { + "epoch": 27.3585751356452, + "grad_norm": 0.9752506017684937, + "learning_rate": 8.940161218831391e-05, + "loss": 5.2078, + "step": 29000 + }, + { + "epoch": 27.3585751356452, + "eval_loss": 5.286397457122803, + "eval_runtime": 74.4089, + "eval_samples_per_second": 202.543, + "eval_steps_per_second": 6.33, + "step": 29000 + }, + { + "epoch": 27.45293701344657, + "grad_norm": 0.9288462400436401, + "learning_rate": 8.929392396109341e-05, + "loss": 5.2063, + "step": 29100 + }, + { + "epoch": 27.547298891247937, + "grad_norm": 0.9132038950920105, + "learning_rate": 8.918575694980983e-05, + "loss": 5.217, + "step": 29200 + }, + { + "epoch": 27.641660769049302, + "grad_norm": 1.0119094848632812, + "learning_rate": 8.90771124724441e-05, + "loss": 5.2139, + "step": 29300 + }, + { + "epoch": 27.73602264685067, + "grad_norm": 0.9567475318908691, + "learning_rate": 8.896799185279487e-05, + "loss": 5.2077, + "step": 29400 + }, + { + "epoch": 27.83038452465204, + "grad_norm": 0.8996191620826721, + "learning_rate": 8.885839642046249e-05, + "loss": 5.2139, + "step": 29500 + }, + { + "epoch": 27.924746402453408, + "grad_norm": 0.942906379699707, + "learning_rate": 8.874832751083266e-05, + "loss": 5.219, + "step": 29600 + }, + { + "epoch": 28.018872375560274, + "grad_norm": 0.8692095279693604, + "learning_rate": 8.863778646506035e-05, + "loss": 5.2142, + "step": 29700 + }, + { + "epoch": 28.113234253361643, + "grad_norm": 0.9690754413604736, + "learning_rate": 8.85267746300533e-05, + "loss": 5.2075, + "step": 29800 + }, + { + "epoch": 28.20759613116301, + "grad_norm": 0.9451592564582825, + "learning_rate": 8.841529335845569e-05, + "loss": 5.2045, + "step": 29900 + }, + { + "epoch": 28.301958008964377, + "grad_norm": 0.8922234177589417, + "learning_rate": 8.830334400863164e-05, + "loss": 5.2029, + "step": 30000 + }, + { + "epoch": 28.301958008964377, + "eval_loss": 5.287634372711182, + "eval_runtime": 74.4172, + "eval_samples_per_second": 202.52, + "eval_steps_per_second": 6.329, + "step": 30000 + }, + { + "epoch": 28.396319886765745, + "grad_norm": 0.9044151306152344, + "learning_rate": 8.819092794464863e-05, + "loss": 5.2027, + "step": 30100 + }, + { + "epoch": 28.490681764567114, + "grad_norm": 0.8109731078147888, + "learning_rate": 8.807804653626095e-05, + "loss": 5.2031, + "step": 30200 + }, + { + "epoch": 28.585043642368483, + "grad_norm": 0.9353645443916321, + "learning_rate": 8.796470115889292e-05, + "loss": 5.208, + "step": 30300 + }, + { + "epoch": 28.67940552016985, + "grad_norm": 0.956208348274231, + "learning_rate": 8.785089319362221e-05, + "loss": 5.2047, + "step": 30400 + }, + { + "epoch": 28.77376739797122, + "grad_norm": 0.799325704574585, + "learning_rate": 8.773662402716294e-05, + "loss": 5.2049, + "step": 30500 + }, + { + "epoch": 28.86812927577259, + "grad_norm": 0.8981509804725647, + "learning_rate": 8.762189505184885e-05, + "loss": 5.2081, + "step": 30600 + }, + { + "epoch": 28.962491153573957, + "grad_norm": 0.9017074704170227, + "learning_rate": 8.75067076656163e-05, + "loss": 5.1975, + "step": 30700 + }, + { + "epoch": 29.05661712668082, + "grad_norm": 0.888663649559021, + "learning_rate": 8.739106327198724e-05, + "loss": 5.1941, + "step": 30800 + }, + { + "epoch": 29.15097900448219, + "grad_norm": 0.900802731513977, + "learning_rate": 8.727496328005211e-05, + "loss": 5.192, + "step": 30900 + }, + { + "epoch": 29.245340882283557, + "grad_norm": 0.9100425839424133, + "learning_rate": 8.715840910445267e-05, + "loss": 5.2038, + "step": 31000 + }, + { + "epoch": 29.245340882283557, + "eval_loss": 5.285486221313477, + "eval_runtime": 74.4352, + "eval_samples_per_second": 202.471, + "eval_steps_per_second": 6.328, + "step": 31000 + }, + { + "epoch": 29.339702760084926, + "grad_norm": 0.8852345943450928, + "learning_rate": 8.704140216536478e-05, + "loss": 5.2091, + "step": 31100 + }, + { + "epoch": 29.434064637886294, + "grad_norm": 0.9929203391075134, + "learning_rate": 8.692394388848107e-05, + "loss": 5.2047, + "step": 31200 + }, + { + "epoch": 29.528426515687663, + "grad_norm": 0.9344567656517029, + "learning_rate": 8.680603570499354e-05, + "loss": 5.1932, + "step": 31300 + }, + { + "epoch": 29.62278839348903, + "grad_norm": 0.8371131420135498, + "learning_rate": 8.668767905157625e-05, + "loss": 5.199, + "step": 31400 + }, + { + "epoch": 29.7171502712904, + "grad_norm": 0.9597654342651367, + "learning_rate": 8.656887537036762e-05, + "loss": 5.1964, + "step": 31500 + }, + { + "epoch": 29.81151214909177, + "grad_norm": 0.9672077894210815, + "learning_rate": 8.644962610895303e-05, + "loss": 5.2033, + "step": 31600 + }, + { + "epoch": 29.905874026893134, + "grad_norm": 0.9107534289360046, + "learning_rate": 8.63299327203471e-05, + "loss": 5.198, + "step": 31700 + }, + { + "epoch": 30.0, + "grad_norm": 1.1500047445297241, + "learning_rate": 8.620979666297603e-05, + "loss": 5.2054, + "step": 31800 + }, + { + "epoch": 30.09436187780137, + "grad_norm": 0.8798314929008484, + "learning_rate": 8.608921940065973e-05, + "loss": 5.1939, + "step": 31900 + }, + { + "epoch": 30.188723755602737, + "grad_norm": 0.9284983277320862, + "learning_rate": 8.596820240259408e-05, + "loss": 5.1908, + "step": 32000 + }, + { + "epoch": 30.188723755602737, + "eval_loss": 5.2796783447265625, + "eval_runtime": 74.4825, + "eval_samples_per_second": 202.343, + "eval_steps_per_second": 6.324, + "step": 32000 + }, + { + "epoch": 30.283085633404106, + "grad_norm": 0.9129772782325745, + "learning_rate": 8.584674714333303e-05, + "loss": 5.1838, + "step": 32100 + }, + { + "epoch": 30.377447511205474, + "grad_norm": 0.902010977268219, + "learning_rate": 8.57248551027706e-05, + "loss": 5.197, + "step": 32200 + }, + { + "epoch": 30.47180938900684, + "grad_norm": 0.877648651599884, + "learning_rate": 8.56025277661228e-05, + "loss": 5.1797, + "step": 32300 + }, + { + "epoch": 30.566171266808208, + "grad_norm": 0.9197627305984497, + "learning_rate": 8.547976662390964e-05, + "loss": 5.1965, + "step": 32400 + }, + { + "epoch": 30.660533144609577, + "grad_norm": 0.9505943655967712, + "learning_rate": 8.535657317193692e-05, + "loss": 5.2051, + "step": 32500 + }, + { + "epoch": 30.754895022410945, + "grad_norm": 0.9619107246398926, + "learning_rate": 8.523294891127794e-05, + "loss": 5.1884, + "step": 32600 + }, + { + "epoch": 30.849256900212314, + "grad_norm": 0.9347026944160461, + "learning_rate": 8.510889534825532e-05, + "loss": 5.1951, + "step": 32700 + }, + { + "epoch": 30.943618778013683, + "grad_norm": 0.9318591952323914, + "learning_rate": 8.498441399442258e-05, + "loss": 5.2003, + "step": 32800 + }, + { + "epoch": 31.03774475112055, + "grad_norm": 0.9306088089942932, + "learning_rate": 8.485950636654572e-05, + "loss": 5.1937, + "step": 32900 + }, + { + "epoch": 31.132106628921914, + "grad_norm": 0.9033887386322021, + "learning_rate": 8.473417398658476e-05, + "loss": 5.1805, + "step": 33000 + }, + { + "epoch": 31.132106628921914, + "eval_loss": 5.279585361480713, + "eval_runtime": 74.456, + "eval_samples_per_second": 202.415, + "eval_steps_per_second": 6.326, + "step": 33000 + }, + { + "epoch": 31.226468506723283, + "grad_norm": 0.873508095741272, + "learning_rate": 8.460841838167523e-05, + "loss": 5.1898, + "step": 33100 + }, + { + "epoch": 31.32083038452465, + "grad_norm": 0.9387214183807373, + "learning_rate": 8.448224108410947e-05, + "loss": 5.186, + "step": 33200 + }, + { + "epoch": 31.41519226232602, + "grad_norm": 0.9126655459403992, + "learning_rate": 8.435564363131803e-05, + "loss": 5.1895, + "step": 33300 + }, + { + "epoch": 31.50955414012739, + "grad_norm": 0.9194200038909912, + "learning_rate": 8.422862756585091e-05, + "loss": 5.1815, + "step": 33400 + }, + { + "epoch": 31.603916017928757, + "grad_norm": 0.9167635440826416, + "learning_rate": 8.41011944353588e-05, + "loss": 5.1931, + "step": 33500 + }, + { + "epoch": 31.698277895730126, + "grad_norm": 0.9253891110420227, + "learning_rate": 8.397334579257418e-05, + "loss": 5.1931, + "step": 33600 + }, + { + "epoch": 31.792639773531494, + "grad_norm": 0.890615701675415, + "learning_rate": 8.384508319529242e-05, + "loss": 5.1991, + "step": 33700 + }, + { + "epoch": 31.887001651332863, + "grad_norm": 0.9657667875289917, + "learning_rate": 8.371640820635278e-05, + "loss": 5.1965, + "step": 33800 + }, + { + "epoch": 31.981363529134228, + "grad_norm": 1.0598945617675781, + "learning_rate": 8.358732239361938e-05, + "loss": 5.1899, + "step": 33900 + }, + { + "epoch": 32.0754895022411, + "grad_norm": 1.0309218168258667, + "learning_rate": 8.345782732996215e-05, + "loss": 5.1919, + "step": 34000 + }, + { + "epoch": 32.0754895022411, + "eval_loss": 5.279961109161377, + "eval_runtime": 74.46, + "eval_samples_per_second": 202.404, + "eval_steps_per_second": 6.326, + "step": 34000 + }, + { + "epoch": 32.169851380042466, + "grad_norm": 0.9428073167800903, + "learning_rate": 8.332792459323753e-05, + "loss": 5.1871, + "step": 34100 + }, + { + "epoch": 32.26421325784383, + "grad_norm": 0.8995642066001892, + "learning_rate": 8.31976157662694e-05, + "loss": 5.1898, + "step": 34200 + }, + { + "epoch": 32.3585751356452, + "grad_norm": 0.9593998193740845, + "learning_rate": 8.30669024368297e-05, + "loss": 5.1826, + "step": 34300 + }, + { + "epoch": 32.452937013446565, + "grad_norm": 0.9802656769752502, + "learning_rate": 8.293578619761906e-05, + "loss": 5.1928, + "step": 34400 + }, + { + "epoch": 32.547298891247934, + "grad_norm": 0.9958524703979492, + "learning_rate": 8.280426864624753e-05, + "loss": 5.1782, + "step": 34500 + }, + { + "epoch": 32.6416607690493, + "grad_norm": 0.9810296297073364, + "learning_rate": 8.267235138521492e-05, + "loss": 5.1867, + "step": 34600 + }, + { + "epoch": 32.73602264685067, + "grad_norm": 0.9276301264762878, + "learning_rate": 8.254003602189146e-05, + "loss": 5.1786, + "step": 34700 + }, + { + "epoch": 32.83038452465204, + "grad_norm": 0.913356363773346, + "learning_rate": 8.240732416849807e-05, + "loss": 5.1913, + "step": 34800 + }, + { + "epoch": 32.92474640245341, + "grad_norm": 0.9506176114082336, + "learning_rate": 8.227421744208683e-05, + "loss": 5.1852, + "step": 34900 + }, + { + "epoch": 33.01887237556027, + "grad_norm": 0.9290357232093811, + "learning_rate": 8.214071746452117e-05, + "loss": 5.1822, + "step": 35000 + }, + { + "epoch": 33.01887237556027, + "eval_loss": 5.276256084442139, + "eval_runtime": 74.4707, + "eval_samples_per_second": 202.375, + "eval_steps_per_second": 6.325, + "step": 35000 + }, + { + "epoch": 33.11323425336164, + "grad_norm": 0.9290668368339539, + "learning_rate": 8.200682586245621e-05, + "loss": 5.1844, + "step": 35100 + }, + { + "epoch": 33.20759613116301, + "grad_norm": 0.9466826319694519, + "learning_rate": 8.187254426731884e-05, + "loss": 5.1794, + "step": 35200 + }, + { + "epoch": 33.30195800896438, + "grad_norm": 1.0010716915130615, + "learning_rate": 8.173787431528794e-05, + "loss": 5.1817, + "step": 35300 + }, + { + "epoch": 33.396319886765745, + "grad_norm": 0.9706283807754517, + "learning_rate": 8.160281764727436e-05, + "loss": 5.1868, + "step": 35400 + }, + { + "epoch": 33.490681764567114, + "grad_norm": 0.9433771371841431, + "learning_rate": 8.146737590890101e-05, + "loss": 5.1779, + "step": 35500 + }, + { + "epoch": 33.58504364236848, + "grad_norm": 0.9575541615486145, + "learning_rate": 8.133155075048269e-05, + "loss": 5.1879, + "step": 35600 + }, + { + "epoch": 33.67940552016985, + "grad_norm": 0.9401003122329712, + "learning_rate": 8.119534382700613e-05, + "loss": 5.1768, + "step": 35700 + }, + { + "epoch": 33.77376739797122, + "grad_norm": 0.9027467370033264, + "learning_rate": 8.105875679810968e-05, + "loss": 5.1786, + "step": 35800 + }, + { + "epoch": 33.86812927577259, + "grad_norm": 0.9518372416496277, + "learning_rate": 8.092179132806317e-05, + "loss": 5.1776, + "step": 35900 + }, + { + "epoch": 33.96249115357396, + "grad_norm": 0.9268783926963806, + "learning_rate": 8.078444908574767e-05, + "loss": 5.1755, + "step": 36000 + }, + { + "epoch": 33.96249115357396, + "eval_loss": 5.270759105682373, + "eval_runtime": 74.4429, + "eval_samples_per_second": 202.45, + "eval_steps_per_second": 6.327, + "step": 36000 + }, + { + "epoch": 34.05661712668082, + "grad_norm": 0.8898890614509583, + "learning_rate": 8.064673174463505e-05, + "loss": 5.1774, + "step": 36100 + }, + { + "epoch": 34.15097900448219, + "grad_norm": 0.9448688626289368, + "learning_rate": 8.050864098276762e-05, + "loss": 5.1719, + "step": 36200 + }, + { + "epoch": 34.24534088228356, + "grad_norm": 0.9344834685325623, + "learning_rate": 8.037017848273776e-05, + "loss": 5.1745, + "step": 36300 + }, + { + "epoch": 34.339702760084926, + "grad_norm": 1.0767810344696045, + "learning_rate": 8.023134593166734e-05, + "loss": 5.155, + "step": 36400 + }, + { + "epoch": 34.434064637886294, + "grad_norm": 1.1843194961547852, + "learning_rate": 8.009214502118718e-05, + "loss": 5.108, + "step": 36500 + }, + { + "epoch": 34.52842651568766, + "grad_norm": 1.1245286464691162, + "learning_rate": 7.995257744741642e-05, + "loss": 5.0859, + "step": 36600 + }, + { + "epoch": 34.62278839348903, + "grad_norm": 1.4160962104797363, + "learning_rate": 7.981264491094192e-05, + "loss": 5.068, + "step": 36700 + }, + { + "epoch": 34.7171502712904, + "grad_norm": 1.0435899496078491, + "learning_rate": 7.967234911679749e-05, + "loss": 5.0546, + "step": 36800 + }, + { + "epoch": 34.81151214909177, + "grad_norm": 1.2017834186553955, + "learning_rate": 7.953169177444309e-05, + "loss": 5.0384, + "step": 36900 + }, + { + "epoch": 34.90587402689314, + "grad_norm": 1.2122361660003662, + "learning_rate": 7.939067459774405e-05, + "loss": 5.0225, + "step": 37000 + }, + { + "epoch": 34.90587402689314, + "eval_loss": 5.060994625091553, + "eval_runtime": 74.4648, + "eval_samples_per_second": 202.391, + "eval_steps_per_second": 6.325, + "step": 37000 + }, + { + "epoch": 35.0, + "grad_norm": 1.66022527217865, + "learning_rate": 7.924929930495018e-05, + "loss": 4.9977, + "step": 37100 + }, + { + "epoch": 35.09436187780137, + "grad_norm": 1.222360610961914, + "learning_rate": 7.910756761867479e-05, + "loss": 4.9734, + "step": 37200 + }, + { + "epoch": 35.18872375560274, + "grad_norm": 1.9777710437774658, + "learning_rate": 7.896548126587374e-05, + "loss": 4.9506, + "step": 37300 + }, + { + "epoch": 35.283085633404106, + "grad_norm": 1.3450212478637695, + "learning_rate": 7.882304197782443e-05, + "loss": 4.9493, + "step": 37400 + }, + { + "epoch": 35.377447511205474, + "grad_norm": 1.3580466508865356, + "learning_rate": 7.86802514901046e-05, + "loss": 4.9329, + "step": 37500 + }, + { + "epoch": 35.47180938900684, + "grad_norm": 1.2852448225021362, + "learning_rate": 7.853711154257133e-05, + "loss": 4.9236, + "step": 37600 + }, + { + "epoch": 35.56617126680821, + "grad_norm": 1.6216447353363037, + "learning_rate": 7.839362387933965e-05, + "loss": 4.9067, + "step": 37700 + }, + { + "epoch": 35.66053314460958, + "grad_norm": 1.6207915544509888, + "learning_rate": 7.824979024876149e-05, + "loss": 4.875, + "step": 37800 + }, + { + "epoch": 35.75489502241095, + "grad_norm": 1.4836673736572266, + "learning_rate": 7.810561240340424e-05, + "loss": 4.8447, + "step": 37900 + }, + { + "epoch": 35.84925690021232, + "grad_norm": 1.5666450262069702, + "learning_rate": 7.796109210002945e-05, + "loss": 4.7756, + "step": 38000 + }, + { + "epoch": 35.84925690021232, + "eval_loss": 4.731563091278076, + "eval_runtime": 74.4706, + "eval_samples_per_second": 202.375, + "eval_steps_per_second": 6.325, + "step": 38000 + }, + { + "epoch": 35.94361877801368, + "grad_norm": 2.0269269943237305, + "learning_rate": 7.781623109957139e-05, + "loss": 4.698, + "step": 38100 + }, + { + "epoch": 36.03774475112055, + "grad_norm": 3.2447714805603027, + "learning_rate": 7.767103116711566e-05, + "loss": 4.6137, + "step": 38200 + }, + { + "epoch": 36.13210662892192, + "grad_norm": 2.3450400829315186, + "learning_rate": 7.752549407187761e-05, + "loss": 4.5311, + "step": 38300 + }, + { + "epoch": 36.226468506723286, + "grad_norm": 2.9792938232421875, + "learning_rate": 7.73796215871808e-05, + "loss": 4.467, + "step": 38400 + }, + { + "epoch": 36.320830384524655, + "grad_norm": 2.1335771083831787, + "learning_rate": 7.723341549043543e-05, + "loss": 4.4239, + "step": 38500 + }, + { + "epoch": 36.41519226232602, + "grad_norm": 2.37923526763916, + "learning_rate": 7.708687756311666e-05, + "loss": 4.3596, + "step": 38600 + }, + { + "epoch": 36.50955414012739, + "grad_norm": 2.716700553894043, + "learning_rate": 7.694000959074288e-05, + "loss": 4.3063, + "step": 38700 + }, + { + "epoch": 36.60391601792875, + "grad_norm": 1.805467963218689, + "learning_rate": 7.679281336285398e-05, + "loss": 4.2623, + "step": 38800 + }, + { + "epoch": 36.69827789573012, + "grad_norm": 2.4338560104370117, + "learning_rate": 7.664529067298954e-05, + "loss": 4.2091, + "step": 38900 + }, + { + "epoch": 36.79263977353149, + "grad_norm": 1.896016001701355, + "learning_rate": 7.649744331866702e-05, + "loss": 4.1527, + "step": 39000 + }, + { + "epoch": 36.79263977353149, + "eval_loss": 4.014511585235596, + "eval_runtime": 74.4548, + "eval_samples_per_second": 202.418, + "eval_steps_per_second": 6.326, + "step": 39000 + }, + { + "epoch": 36.88700165133286, + "grad_norm": 2.2716317176818848, + "learning_rate": 7.634927310135972e-05, + "loss": 4.0514, + "step": 39100 + }, + { + "epoch": 36.98136352913423, + "grad_norm": 1.6702080965042114, + "learning_rate": 7.620078182647502e-05, + "loss": 3.9428, + "step": 39200 + }, + { + "epoch": 37.0754895022411, + "grad_norm": 1.7880933284759521, + "learning_rate": 7.605197130333222e-05, + "loss": 3.8424, + "step": 39300 + }, + { + "epoch": 37.169851380042466, + "grad_norm": 1.788912296295166, + "learning_rate": 7.590284334514057e-05, + "loss": 3.732, + "step": 39400 + }, + { + "epoch": 37.26421325784383, + "grad_norm": 1.85847806930542, + "learning_rate": 7.575339976897722e-05, + "loss": 3.6599, + "step": 39500 + }, + { + "epoch": 37.3585751356452, + "grad_norm": 1.744305968284607, + "learning_rate": 7.560364239576496e-05, + "loss": 3.5681, + "step": 39600 + }, + { + "epoch": 37.452937013446565, + "grad_norm": 1.8111028671264648, + "learning_rate": 7.545357305025013e-05, + "loss": 3.5125, + "step": 39700 + }, + { + "epoch": 37.547298891247934, + "grad_norm": 1.7670716047286987, + "learning_rate": 7.530319356098033e-05, + "loss": 3.4513, + "step": 39800 + }, + { + "epoch": 37.6416607690493, + "grad_norm": 1.5341713428497314, + "learning_rate": 7.51525057602822e-05, + "loss": 3.4147, + "step": 39900 + }, + { + "epoch": 37.73602264685067, + "grad_norm": 1.5770525932312012, + "learning_rate": 7.500151148423902e-05, + "loss": 3.3725, + "step": 40000 + }, + { + "epoch": 37.73602264685067, + "eval_loss": 3.2892656326293945, + "eval_runtime": 74.4068, + "eval_samples_per_second": 202.549, + "eval_steps_per_second": 6.33, + "step": 40000 + }, + { + "epoch": 37.83038452465204, + "grad_norm": 1.4793765544891357, + "learning_rate": 7.485021257266841e-05, + "loss": 3.3299, + "step": 40100 + }, + { + "epoch": 37.92474640245341, + "grad_norm": 1.519507646560669, + "learning_rate": 7.469861086909983e-05, + "loss": 3.3008, + "step": 40200 + }, + { + "epoch": 38.01887237556027, + "grad_norm": 1.6102982759475708, + "learning_rate": 7.454670822075225e-05, + "loss": 3.2762, + "step": 40300 + }, + { + "epoch": 38.11323425336164, + "grad_norm": 1.501295804977417, + "learning_rate": 7.439450647851145e-05, + "loss": 3.2414, + "step": 40400 + }, + { + "epoch": 38.20759613116301, + "grad_norm": 1.3700292110443115, + "learning_rate": 7.424200749690763e-05, + "loss": 3.2046, + "step": 40500 + }, + { + "epoch": 38.30195800896438, + "grad_norm": 1.5433059930801392, + "learning_rate": 7.40892131340928e-05, + "loss": 3.1777, + "step": 40600 + }, + { + "epoch": 38.396319886765745, + "grad_norm": 1.410462498664856, + "learning_rate": 7.393612525181801e-05, + "loss": 3.1619, + "step": 40700 + }, + { + "epoch": 38.490681764567114, + "grad_norm": 1.526921272277832, + "learning_rate": 7.37827457154108e-05, + "loss": 3.1474, + "step": 40800 + }, + { + "epoch": 38.58504364236848, + "grad_norm": 1.4830299615859985, + "learning_rate": 7.362907639375244e-05, + "loss": 3.1138, + "step": 40900 + }, + { + "epoch": 38.67940552016985, + "grad_norm": 1.3642972707748413, + "learning_rate": 7.347511915925512e-05, + "loss": 3.0895, + "step": 41000 + }, + { + "epoch": 38.67940552016985, + "eval_loss": 3.0351099967956543, + "eval_runtime": 74.4301, + "eval_samples_per_second": 202.485, + "eval_steps_per_second": 6.328, + "step": 41000 + }, + { + "epoch": 38.77376739797122, + "grad_norm": 1.26517653465271, + "learning_rate": 7.33208758878391e-05, + "loss": 3.0726, + "step": 41100 + }, + { + "epoch": 38.86812927577259, + "grad_norm": 1.4588021039962769, + "learning_rate": 7.316634845891003e-05, + "loss": 3.0682, + "step": 41200 + }, + { + "epoch": 38.96249115357396, + "grad_norm": 1.3476386070251465, + "learning_rate": 7.301153875533583e-05, + "loss": 3.0388, + "step": 41300 + }, + { + "epoch": 39.05661712668082, + "grad_norm": 1.5515798330307007, + "learning_rate": 7.28564486634239e-05, + "loss": 3.0181, + "step": 41400 + }, + { + "epoch": 39.15097900448219, + "grad_norm": 1.422370195388794, + "learning_rate": 7.270108007289807e-05, + "loss": 2.9994, + "step": 41500 + }, + { + "epoch": 39.24534088228356, + "grad_norm": 1.347652792930603, + "learning_rate": 7.254543487687558e-05, + "loss": 2.9834, + "step": 41600 + }, + { + "epoch": 39.339702760084926, + "grad_norm": 1.3476914167404175, + "learning_rate": 7.2389514971844e-05, + "loss": 2.9631, + "step": 41700 + }, + { + "epoch": 39.434064637886294, + "grad_norm": 1.3662831783294678, + "learning_rate": 7.22333222576382e-05, + "loss": 2.9611, + "step": 41800 + }, + { + "epoch": 39.52842651568766, + "grad_norm": 1.3495389223098755, + "learning_rate": 7.207685863741711e-05, + "loss": 2.9458, + "step": 41900 + }, + { + "epoch": 39.62278839348903, + "grad_norm": 1.4348385334014893, + "learning_rate": 7.192012601764053e-05, + "loss": 2.9351, + "step": 42000 + }, + { + "epoch": 39.62278839348903, + "eval_loss": 2.878948450088501, + "eval_runtime": 74.419, + "eval_samples_per_second": 202.516, + "eval_steps_per_second": 6.329, + "step": 42000 + }, + { + "epoch": 39.7171502712904, + "grad_norm": 1.299336314201355, + "learning_rate": 7.1763126308046e-05, + "loss": 2.9141, + "step": 42100 + }, + { + "epoch": 39.81151214909177, + "grad_norm": 1.3337122201919556, + "learning_rate": 7.160586142162544e-05, + "loss": 2.8945, + "step": 42200 + }, + { + "epoch": 39.90587402689314, + "grad_norm": 1.3183374404907227, + "learning_rate": 7.144833327460186e-05, + "loss": 2.8872, + "step": 42300 + }, + { + "epoch": 40.0, + "grad_norm": 1.5197392702102661, + "learning_rate": 7.129054378640599e-05, + "loss": 2.8863, + "step": 42400 + }, + { + "epoch": 40.09436187780137, + "grad_norm": 1.358708381652832, + "learning_rate": 7.1132494879653e-05, + "loss": 2.854, + "step": 42500 + }, + { + "epoch": 40.18872375560274, + "grad_norm": 1.3579105138778687, + "learning_rate": 7.097418848011888e-05, + "loss": 2.8579, + "step": 42600 + }, + { + "epoch": 40.283085633404106, + "grad_norm": 1.2787528038024902, + "learning_rate": 7.081562651671719e-05, + "loss": 2.8426, + "step": 42700 + }, + { + "epoch": 40.377447511205474, + "grad_norm": 1.3120508193969727, + "learning_rate": 7.065681092147542e-05, + "loss": 2.8356, + "step": 42800 + }, + { + "epoch": 40.47180938900684, + "grad_norm": 1.29078209400177, + "learning_rate": 7.049774362951144e-05, + "loss": 2.8274, + "step": 42900 + }, + { + "epoch": 40.56617126680821, + "grad_norm": 1.2766653299331665, + "learning_rate": 7.033842657901005e-05, + "loss": 2.8066, + "step": 43000 + }, + { + "epoch": 40.56617126680821, + "eval_loss": 2.7730472087860107, + "eval_runtime": 74.4694, + "eval_samples_per_second": 202.378, + "eval_steps_per_second": 6.325, + "step": 43000 + }, + { + "epoch": 40.66053314460958, + "grad_norm": 1.3247416019439697, + "learning_rate": 7.017886171119917e-05, + "loss": 2.7981, + "step": 43100 + }, + { + "epoch": 40.75489502241095, + "grad_norm": 1.271553635597229, + "learning_rate": 7.001905097032644e-05, + "loss": 2.794, + "step": 43200 + }, + { + "epoch": 40.84925690021232, + "grad_norm": 1.228617787361145, + "learning_rate": 6.985899630363526e-05, + "loss": 2.7911, + "step": 43300 + }, + { + "epoch": 40.94361877801368, + "grad_norm": 1.2349623441696167, + "learning_rate": 6.969869966134123e-05, + "loss": 2.7901, + "step": 43400 + }, + { + "epoch": 41.03774475112055, + "grad_norm": 1.2398724555969238, + "learning_rate": 6.953816299660834e-05, + "loss": 2.7727, + "step": 43500 + }, + { + "epoch": 41.13210662892192, + "grad_norm": 1.2596232891082764, + "learning_rate": 6.937738826552524e-05, + "loss": 2.7543, + "step": 43600 + }, + { + "epoch": 41.226468506723286, + "grad_norm": 1.349109172821045, + "learning_rate": 6.921637742708123e-05, + "loss": 2.7419, + "step": 43700 + }, + { + "epoch": 41.320830384524655, + "grad_norm": 1.1863901615142822, + "learning_rate": 6.905513244314259e-05, + "loss": 2.7433, + "step": 43800 + }, + { + "epoch": 41.41519226232602, + "grad_norm": 1.2365715503692627, + "learning_rate": 6.889365527842857e-05, + "loss": 2.7391, + "step": 43900 + }, + { + "epoch": 41.50955414012739, + "grad_norm": 1.2725337743759155, + "learning_rate": 6.873194790048746e-05, + "loss": 2.7429, + "step": 44000 + }, + { + "epoch": 41.50955414012739, + "eval_loss": 2.692702054977417, + "eval_runtime": 74.4255, + "eval_samples_per_second": 202.498, + "eval_steps_per_second": 6.328, + "step": 44000 + }, + { + "epoch": 41.60391601792875, + "grad_norm": 1.2004797458648682, + "learning_rate": 6.857001227967263e-05, + "loss": 2.727, + "step": 44100 + }, + { + "epoch": 41.69827789573012, + "grad_norm": 1.2816728353500366, + "learning_rate": 6.84078503891185e-05, + "loss": 2.7233, + "step": 44200 + }, + { + "epoch": 41.79263977353149, + "grad_norm": 1.2110925912857056, + "learning_rate": 6.824546420471653e-05, + "loss": 2.7166, + "step": 44300 + }, + { + "epoch": 41.88700165133286, + "grad_norm": 1.2935187816619873, + "learning_rate": 6.808285570509117e-05, + "loss": 2.7073, + "step": 44400 + }, + { + "epoch": 41.98136352913423, + "grad_norm": 1.2628765106201172, + "learning_rate": 6.792002687157564e-05, + "loss": 2.6934, + "step": 44500 + }, + { + "epoch": 42.0754895022411, + "grad_norm": 1.2200449705123901, + "learning_rate": 6.775697968818788e-05, + "loss": 2.6895, + "step": 44600 + }, + { + "epoch": 42.169851380042466, + "grad_norm": 1.1676024198532104, + "learning_rate": 6.759371614160639e-05, + "loss": 2.6743, + "step": 44700 + }, + { + "epoch": 42.26421325784383, + "grad_norm": 1.1828370094299316, + "learning_rate": 6.743023822114596e-05, + "loss": 2.6713, + "step": 44800 + }, + { + "epoch": 42.3585751356452, + "grad_norm": 1.2179129123687744, + "learning_rate": 6.726654791873343e-05, + "loss": 2.6621, + "step": 44900 + }, + { + "epoch": 42.452937013446565, + "grad_norm": 1.232735514640808, + "learning_rate": 6.710264722888352e-05, + "loss": 2.6762, + "step": 45000 + }, + { + "epoch": 42.452937013446565, + "eval_loss": 2.6310536861419678, + "eval_runtime": 74.4881, + "eval_samples_per_second": 202.328, + "eval_steps_per_second": 6.323, + "step": 45000 + }, + { + "epoch": 42.547298891247934, + "grad_norm": 1.1474517583847046, + "learning_rate": 6.693853814867439e-05, + "loss": 2.6642, + "step": 45100 + }, + { + "epoch": 42.6416607690493, + "grad_norm": 1.2410264015197754, + "learning_rate": 6.677422267772338e-05, + "loss": 2.6602, + "step": 45200 + }, + { + "epoch": 42.73602264685067, + "grad_norm": 1.3303461074829102, + "learning_rate": 6.660970281816269e-05, + "loss": 2.665, + "step": 45300 + }, + { + "epoch": 42.83038452465204, + "grad_norm": 1.1773780584335327, + "learning_rate": 6.644498057461485e-05, + "loss": 2.6539, + "step": 45400 + }, + { + "epoch": 42.92474640245341, + "grad_norm": 1.2169513702392578, + "learning_rate": 6.628005795416842e-05, + "loss": 2.6352, + "step": 45500 + }, + { + "epoch": 43.01887237556027, + "grad_norm": 1.1680723428726196, + "learning_rate": 6.611493696635351e-05, + "loss": 2.6335, + "step": 45600 + }, + { + "epoch": 43.11323425336164, + "grad_norm": 1.14376699924469, + "learning_rate": 6.594961962311722e-05, + "loss": 2.614, + "step": 45700 + }, + { + "epoch": 43.20759613116301, + "grad_norm": 1.1640697717666626, + "learning_rate": 6.578410793879921e-05, + "loss": 2.6253, + "step": 45800 + }, + { + "epoch": 43.30195800896438, + "grad_norm": 1.2426387071609497, + "learning_rate": 6.561840393010713e-05, + "loss": 2.6163, + "step": 45900 + }, + { + "epoch": 43.396319886765745, + "grad_norm": 1.1884350776672363, + "learning_rate": 6.545250961609202e-05, + "loss": 2.6201, + "step": 46000 + }, + { + "epoch": 43.396319886765745, + "eval_loss": 2.5862319469451904, + "eval_runtime": 74.4528, + "eval_samples_per_second": 202.423, + "eval_steps_per_second": 6.326, + "step": 46000 + }, + { + "epoch": 43.490681764567114, + "grad_norm": 1.2242307662963867, + "learning_rate": 6.528642701812378e-05, + "loss": 2.6032, + "step": 46100 + }, + { + "epoch": 43.58504364236848, + "grad_norm": 1.1581670045852661, + "learning_rate": 6.51201581598664e-05, + "loss": 2.6036, + "step": 46200 + }, + { + "epoch": 43.67940552016985, + "grad_norm": 1.1338080167770386, + "learning_rate": 6.49537050672535e-05, + "loss": 2.6141, + "step": 46300 + }, + { + "epoch": 43.77376739797122, + "grad_norm": 1.2403849363327026, + "learning_rate": 6.478706976846344e-05, + "loss": 2.5999, + "step": 46400 + }, + { + "epoch": 43.86812927577259, + "grad_norm": 1.2618863582611084, + "learning_rate": 6.462025429389475e-05, + "loss": 2.5949, + "step": 46500 + }, + { + "epoch": 43.96249115357396, + "grad_norm": 1.1581882238388062, + "learning_rate": 6.445326067614139e-05, + "loss": 2.5872, + "step": 46600 + }, + { + "epoch": 44.05661712668082, + "grad_norm": 1.1863130331039429, + "learning_rate": 6.428609094996785e-05, + "loss": 2.5819, + "step": 46700 + }, + { + "epoch": 44.15097900448219, + "grad_norm": 1.1343599557876587, + "learning_rate": 6.411874715228447e-05, + "loss": 2.5694, + "step": 46800 + }, + { + "epoch": 44.24534088228356, + "grad_norm": 1.1974676847457886, + "learning_rate": 6.395123132212268e-05, + "loss": 2.5648, + "step": 46900 + }, + { + "epoch": 44.339702760084926, + "grad_norm": 1.151482343673706, + "learning_rate": 6.378354550060997e-05, + "loss": 2.5672, + "step": 47000 + }, + { + "epoch": 44.339702760084926, + "eval_loss": 2.544923782348633, + "eval_runtime": 74.4495, + "eval_samples_per_second": 202.432, + "eval_steps_per_second": 6.326, + "step": 47000 + }, + { + "epoch": 44.434064637886294, + "grad_norm": 1.144623875617981, + "learning_rate": 6.361569173094515e-05, + "loss": 2.5668, + "step": 47100 + }, + { + "epoch": 44.52842651568766, + "grad_norm": 1.1448100805282593, + "learning_rate": 6.344767205837345e-05, + "loss": 2.5664, + "step": 47200 + }, + { + "epoch": 44.62278839348903, + "grad_norm": 1.1526908874511719, + "learning_rate": 6.327948853016153e-05, + "loss": 2.5536, + "step": 47300 + }, + { + "epoch": 44.7171502712904, + "grad_norm": 1.1646764278411865, + "learning_rate": 6.311114319557261e-05, + "loss": 2.5634, + "step": 47400 + }, + { + "epoch": 44.81151214909177, + "grad_norm": 1.1626538038253784, + "learning_rate": 6.29426381058415e-05, + "loss": 2.5495, + "step": 47500 + }, + { + "epoch": 44.90587402689314, + "grad_norm": 1.1385385990142822, + "learning_rate": 6.277397531414951e-05, + "loss": 2.5466, + "step": 47600 + }, + { + "epoch": 45.0, + "grad_norm": 1.4365875720977783, + "learning_rate": 6.260515687559953e-05, + "loss": 2.5465, + "step": 47700 + }, + { + "epoch": 45.09436187780137, + "grad_norm": 1.1712549924850464, + "learning_rate": 6.243618484719098e-05, + "loss": 2.534, + "step": 47800 + }, + { + "epoch": 45.18872375560274, + "grad_norm": 1.1397508382797241, + "learning_rate": 6.226706128779468e-05, + "loss": 2.5308, + "step": 47900 + }, + { + "epoch": 45.283085633404106, + "grad_norm": 1.0902845859527588, + "learning_rate": 6.209778825812784e-05, + "loss": 2.5285, + "step": 48000 + }, + { + "epoch": 45.283085633404106, + "eval_loss": 2.5119264125823975, + "eval_runtime": 74.4894, + "eval_samples_per_second": 202.324, + "eval_steps_per_second": 6.323, + "step": 48000 + }, + { + "epoch": 45.377447511205474, + "grad_norm": 1.174816370010376, + "learning_rate": 6.19283678207289e-05, + "loss": 2.5255, + "step": 48100 + }, + { + "epoch": 45.47180938900684, + "grad_norm": 1.1556349992752075, + "learning_rate": 6.175880203993243e-05, + "loss": 2.5208, + "step": 48200 + }, + { + "epoch": 45.56617126680821, + "grad_norm": 1.1801139116287231, + "learning_rate": 6.158909298184395e-05, + "loss": 2.5132, + "step": 48300 + }, + { + "epoch": 45.66053314460958, + "grad_norm": 1.0904967784881592, + "learning_rate": 6.14192427143148e-05, + "loss": 2.531, + "step": 48400 + }, + { + "epoch": 45.75489502241095, + "grad_norm": 1.1581918001174927, + "learning_rate": 6.124925330691685e-05, + "loss": 2.5225, + "step": 48500 + }, + { + "epoch": 45.84925690021232, + "grad_norm": 1.130543828010559, + "learning_rate": 6.107912683091741e-05, + "loss": 2.501, + "step": 48600 + }, + { + "epoch": 45.94361877801368, + "grad_norm": 1.114476203918457, + "learning_rate": 6.0908865359253886e-05, + "loss": 2.5157, + "step": 48700 + }, + { + "epoch": 46.03774475112055, + "grad_norm": 1.150086760520935, + "learning_rate": 6.07384709665086e-05, + "loss": 2.501, + "step": 48800 + }, + { + "epoch": 46.13210662892192, + "grad_norm": 1.1558949947357178, + "learning_rate": 6.0567945728883435e-05, + "loss": 2.4929, + "step": 48900 + }, + { + "epoch": 46.226468506723286, + "grad_norm": 1.1270654201507568, + "learning_rate": 6.03972917241746e-05, + "loss": 2.4942, + "step": 49000 + }, + { + "epoch": 46.226468506723286, + "eval_loss": 2.4788951873779297, + "eval_runtime": 74.4909, + "eval_samples_per_second": 202.32, + "eval_steps_per_second": 6.323, + "step": 49000 + }, + { + "epoch": 46.320830384524655, + "grad_norm": 1.1517997980117798, + "learning_rate": 6.02265110317473e-05, + "loss": 2.5006, + "step": 49100 + }, + { + "epoch": 46.41519226232602, + "grad_norm": 1.0966933965682983, + "learning_rate": 6.005560573251037e-05, + "loss": 2.4881, + "step": 49200 + }, + { + "epoch": 46.50955414012739, + "grad_norm": 1.147829532623291, + "learning_rate": 5.9884577908890926e-05, + "loss": 2.4907, + "step": 49300 + }, + { + "epoch": 46.60391601792875, + "grad_norm": 1.1360352039337158, + "learning_rate": 5.971342964480906e-05, + "loss": 2.4895, + "step": 49400 + }, + { + "epoch": 46.69827789573012, + "grad_norm": 1.1289290189743042, + "learning_rate": 5.954216302565235e-05, + "loss": 2.4808, + "step": 49500 + }, + { + "epoch": 46.79263977353149, + "grad_norm": 1.0954254865646362, + "learning_rate": 5.9370780138250484e-05, + "loss": 2.4772, + "step": 49600 + }, + { + "epoch": 46.88700165133286, + "grad_norm": 1.141011118888855, + "learning_rate": 5.9199283070849875e-05, + "loss": 2.4766, + "step": 49700 + }, + { + "epoch": 46.98136352913423, + "grad_norm": 1.0591996908187866, + "learning_rate": 5.9027673913088165e-05, + "loss": 2.4805, + "step": 49800 + }, + { + "epoch": 47.0754895022411, + "grad_norm": 1.0702204704284668, + "learning_rate": 5.885595475596878e-05, + "loss": 2.4638, + "step": 49900 + }, + { + "epoch": 47.169851380042466, + "grad_norm": 1.1920737028121948, + "learning_rate": 5.868412769183547e-05, + "loss": 2.4721, + "step": 50000 + }, + { + "epoch": 47.169851380042466, + "eval_loss": 2.450434923171997, + "eval_runtime": 74.4123, + "eval_samples_per_second": 202.534, + "eval_steps_per_second": 6.33, + "step": 50000 + }, + { + "epoch": 47.26421325784383, + "grad_norm": 1.0812413692474365, + "learning_rate": 5.8512194814346775e-05, + "loss": 2.4477, + "step": 50100 + }, + { + "epoch": 47.3585751356452, + "grad_norm": 1.0759973526000977, + "learning_rate": 5.8340158218450555e-05, + "loss": 2.454, + "step": 50200 + }, + { + "epoch": 47.452937013446565, + "grad_norm": 1.1221734285354614, + "learning_rate": 5.8168020000358435e-05, + "loss": 2.4554, + "step": 50300 + }, + { + "epoch": 47.547298891247934, + "grad_norm": 1.0968127250671387, + "learning_rate": 5.799578225752028e-05, + "loss": 2.4586, + "step": 50400 + }, + { + "epoch": 47.6416607690493, + "grad_norm": 1.1258801221847534, + "learning_rate": 5.7823447088598624e-05, + "loss": 2.462, + "step": 50500 + }, + { + "epoch": 47.73602264685067, + "grad_norm": 1.1324305534362793, + "learning_rate": 5.765101659344313e-05, + "loss": 2.4518, + "step": 50600 + }, + { + "epoch": 47.83038452465204, + "grad_norm": 1.1313503980636597, + "learning_rate": 5.747849287306496e-05, + "loss": 2.4556, + "step": 50700 + }, + { + "epoch": 47.92474640245341, + "grad_norm": 1.088899850845337, + "learning_rate": 5.730587802961119e-05, + "loss": 2.4553, + "step": 50800 + }, + { + "epoch": 48.01887237556027, + "grad_norm": 1.1050726175308228, + "learning_rate": 5.7133174166339245e-05, + "loss": 2.4449, + "step": 50900 + }, + { + "epoch": 48.11323425336164, + "grad_norm": 1.2336230278015137, + "learning_rate": 5.696038338759117e-05, + "loss": 2.4343, + "step": 51000 + }, + { + "epoch": 48.11323425336164, + "eval_loss": 2.423373222351074, + "eval_runtime": 74.4359, + "eval_samples_per_second": 202.47, + "eval_steps_per_second": 6.328, + "step": 51000 + }, + { + "epoch": 48.20759613116301, + "grad_norm": 1.10746169090271, + "learning_rate": 5.678750779876807e-05, + "loss": 2.4329, + "step": 51100 + }, + { + "epoch": 48.30195800896438, + "grad_norm": 1.1919947862625122, + "learning_rate": 5.661454950630445e-05, + "loss": 2.429, + "step": 51200 + }, + { + "epoch": 48.396319886765745, + "grad_norm": 1.0968834161758423, + "learning_rate": 5.6441510617642526e-05, + "loss": 2.4384, + "step": 51300 + }, + { + "epoch": 48.490681764567114, + "grad_norm": 1.112357497215271, + "learning_rate": 5.626839324120654e-05, + "loss": 2.4317, + "step": 51400 + }, + { + "epoch": 48.58504364236848, + "grad_norm": 1.0847004652023315, + "learning_rate": 5.609519948637708e-05, + "loss": 2.4362, + "step": 51500 + }, + { + "epoch": 48.67940552016985, + "grad_norm": 1.1191656589508057, + "learning_rate": 5.592193146346543e-05, + "loss": 2.4201, + "step": 51600 + }, + { + "epoch": 48.77376739797122, + "grad_norm": 1.1359890699386597, + "learning_rate": 5.5748591283687725e-05, + "loss": 2.4272, + "step": 51700 + }, + { + "epoch": 48.86812927577259, + "grad_norm": 1.1025972366333008, + "learning_rate": 5.557518105913939e-05, + "loss": 2.4205, + "step": 51800 + }, + { + "epoch": 48.96249115357396, + "grad_norm": 1.1198244094848633, + "learning_rate": 5.540170290276927e-05, + "loss": 2.4185, + "step": 51900 + }, + { + "epoch": 49.05661712668082, + "grad_norm": 1.0897448062896729, + "learning_rate": 5.5228158928353944e-05, + "loss": 2.4059, + "step": 52000 + }, + { + "epoch": 49.05661712668082, + "eval_loss": 2.4050960540771484, + "eval_runtime": 74.4452, + "eval_samples_per_second": 202.444, + "eval_steps_per_second": 6.327, + "step": 52000 + }, + { + "epoch": 49.15097900448219, + "grad_norm": 1.1507387161254883, + "learning_rate": 5.5054551250471985e-05, + "loss": 2.4074, + "step": 52100 + }, + { + "epoch": 49.24534088228356, + "grad_norm": 1.1095491647720337, + "learning_rate": 5.488088198447816e-05, + "loss": 2.4071, + "step": 52200 + }, + { + "epoch": 49.339702760084926, + "grad_norm": 1.0501704216003418, + "learning_rate": 5.470715324647766e-05, + "loss": 2.4103, + "step": 52300 + }, + { + "epoch": 49.434064637886294, + "grad_norm": 1.1576809883117676, + "learning_rate": 5.453336715330034e-05, + "loss": 2.4132, + "step": 52400 + }, + { + "epoch": 49.52842651568766, + "grad_norm": 1.136675238609314, + "learning_rate": 5.43595258224749e-05, + "loss": 2.4065, + "step": 52500 + }, + { + "epoch": 49.62278839348903, + "grad_norm": 1.1476448774337769, + "learning_rate": 5.4185631372203106e-05, + "loss": 2.4027, + "step": 52600 + }, + { + "epoch": 49.7171502712904, + "grad_norm": 1.0723971128463745, + "learning_rate": 5.401168592133394e-05, + "loss": 2.3952, + "step": 52700 + }, + { + "epoch": 49.81151214909177, + "grad_norm": 1.1071858406066895, + "learning_rate": 5.3837691589337833e-05, + "loss": 2.3957, + "step": 52800 + }, + { + "epoch": 49.90587402689314, + "grad_norm": 1.0979355573654175, + "learning_rate": 5.3663650496280814e-05, + "loss": 2.4061, + "step": 52900 + }, + { + "epoch": 50.0, + "grad_norm": 1.390066385269165, + "learning_rate": 5.348956476279867e-05, + "loss": 2.3988, + "step": 53000 + }, + { + "epoch": 50.0, + "eval_loss": 2.382521152496338, + "eval_runtime": 74.4633, + "eval_samples_per_second": 202.395, + "eval_steps_per_second": 6.325, + "step": 53000 + }, + { + "epoch": 50.09436187780137, + "grad_norm": 1.0886965990066528, + "learning_rate": 5.331543651007114e-05, + "loss": 2.3889, + "step": 53100 + }, + { + "epoch": 50.18872375560274, + "grad_norm": 1.1360448598861694, + "learning_rate": 5.314126785979601e-05, + "loss": 2.373, + "step": 53200 + }, + { + "epoch": 50.283085633404106, + "grad_norm": 1.1651926040649414, + "learning_rate": 5.296706093416334e-05, + "loss": 2.3825, + "step": 53300 + }, + { + "epoch": 50.377447511205474, + "grad_norm": 1.0949503183364868, + "learning_rate": 5.2792817855829534e-05, + "loss": 2.3856, + "step": 53400 + }, + { + "epoch": 50.47180938900684, + "grad_norm": 1.0986069440841675, + "learning_rate": 5.261854074789151e-05, + "loss": 2.3846, + "step": 53500 + }, + { + "epoch": 50.56617126680821, + "grad_norm": 1.0695897340774536, + "learning_rate": 5.244423173386084e-05, + "loss": 2.385, + "step": 53600 + }, + { + "epoch": 50.66053314460958, + "grad_norm": 1.084670066833496, + "learning_rate": 5.226989293763784e-05, + "loss": 2.3788, + "step": 53700 + }, + { + "epoch": 50.75489502241095, + "grad_norm": 1.0809028148651123, + "learning_rate": 5.2095526483485736e-05, + "loss": 2.381, + "step": 53800 + }, + { + "epoch": 50.84925690021232, + "grad_norm": 1.0939772129058838, + "learning_rate": 5.192113449600473e-05, + "loss": 2.3798, + "step": 53900 + }, + { + "epoch": 50.94361877801368, + "grad_norm": 1.149915099143982, + "learning_rate": 5.1746719100106164e-05, + "loss": 2.3664, + "step": 54000 + }, + { + "epoch": 50.94361877801368, + "eval_loss": 2.366403102874756, + "eval_runtime": 74.4788, + "eval_samples_per_second": 202.353, + "eval_steps_per_second": 6.324, + "step": 54000 + }, + { + "epoch": 51.03774475112055, + "grad_norm": 1.0868983268737793, + "learning_rate": 5.1572282420986615e-05, + "loss": 2.3738, + "step": 54100 + }, + { + "epoch": 51.13210662892192, + "grad_norm": 1.0766066312789917, + "learning_rate": 5.139782658410193e-05, + "loss": 2.3648, + "step": 54200 + }, + { + "epoch": 51.226468506723286, + "grad_norm": 1.1454092264175415, + "learning_rate": 5.122335371514144e-05, + "loss": 2.3695, + "step": 54300 + }, + { + "epoch": 51.320830384524655, + "grad_norm": 1.0294060707092285, + "learning_rate": 5.1048865940002e-05, + "loss": 2.3615, + "step": 54400 + }, + { + "epoch": 51.41519226232602, + "grad_norm": 1.1249427795410156, + "learning_rate": 5.0874365384762093e-05, + "loss": 2.3652, + "step": 54500 + }, + { + "epoch": 51.50955414012739, + "grad_norm": 1.0959460735321045, + "learning_rate": 5.069985417565589e-05, + "loss": 2.3635, + "step": 54600 + }, + { + "epoch": 51.60391601792875, + "grad_norm": 1.0896023511886597, + "learning_rate": 5.0525334439047435e-05, + "loss": 2.3554, + "step": 54700 + }, + { + "epoch": 51.69827789573012, + "grad_norm": 1.1385838985443115, + "learning_rate": 5.035080830140462e-05, + "loss": 2.3591, + "step": 54800 + }, + { + "epoch": 51.79263977353149, + "grad_norm": 1.1226471662521362, + "learning_rate": 5.017627788927336e-05, + "loss": 2.3575, + "step": 54900 + }, + { + "epoch": 51.88700165133286, + "grad_norm": 1.1267681121826172, + "learning_rate": 5.000174532925165e-05, + "loss": 2.3521, + "step": 55000 + }, + { + "epoch": 51.88700165133286, + "eval_loss": 2.3515806198120117, + "eval_runtime": 74.4619, + "eval_samples_per_second": 202.399, + "eval_steps_per_second": 6.325, + "step": 55000 + }, + { + "epoch": 51.98136352913423, + "grad_norm": 1.100253701210022, + "learning_rate": 4.982721274796365e-05, + "loss": 2.3561, + "step": 55100 + }, + { + "epoch": 52.0754895022411, + "grad_norm": 1.0981378555297852, + "learning_rate": 4.9652682272033776e-05, + "loss": 2.3454, + "step": 55200 + }, + { + "epoch": 52.169851380042466, + "grad_norm": 1.14932382106781, + "learning_rate": 4.947815602806083e-05, + "loss": 2.3457, + "step": 55300 + }, + { + "epoch": 52.26421325784383, + "grad_norm": 1.112889051437378, + "learning_rate": 4.9303636142592005e-05, + "loss": 2.3483, + "step": 55400 + }, + { + "epoch": 52.3585751356452, + "grad_norm": 1.1395831108093262, + "learning_rate": 4.912912474209699e-05, + "loss": 2.3499, + "step": 55500 + }, + { + "epoch": 52.452937013446565, + "grad_norm": 1.0740474462509155, + "learning_rate": 4.8954623952942196e-05, + "loss": 2.337, + "step": 55600 + }, + { + "epoch": 52.547298891247934, + "grad_norm": 1.0660241842269897, + "learning_rate": 4.878013590136461e-05, + "loss": 2.3385, + "step": 55700 + }, + { + "epoch": 52.6416607690493, + "grad_norm": 1.138458490371704, + "learning_rate": 4.860566271344612e-05, + "loss": 2.3486, + "step": 55800 + }, + { + "epoch": 52.73602264685067, + "grad_norm": 1.1012473106384277, + "learning_rate": 4.8431206515087425e-05, + "loss": 2.3311, + "step": 55900 + }, + { + "epoch": 52.83038452465204, + "grad_norm": 1.0752781629562378, + "learning_rate": 4.825676943198228e-05, + "loss": 2.3411, + "step": 56000 + }, + { + "epoch": 52.83038452465204, + "eval_loss": 2.3332431316375732, + "eval_runtime": 74.4827, + "eval_samples_per_second": 202.342, + "eval_steps_per_second": 6.324, + "step": 56000 + }, + { + "epoch": 52.92474640245341, + "grad_norm": 1.1414703130722046, + "learning_rate": 4.808235358959146e-05, + "loss": 2.3436, + "step": 56100 + }, + { + "epoch": 53.01887237556027, + "grad_norm": 1.0539082288742065, + "learning_rate": 4.790796111311697e-05, + "loss": 2.3465, + "step": 56200 + }, + { + "epoch": 53.11323425336164, + "grad_norm": 1.176776647567749, + "learning_rate": 4.773359412747614e-05, + "loss": 2.3285, + "step": 56300 + }, + { + "epoch": 53.20759613116301, + "grad_norm": 1.0964272022247314, + "learning_rate": 4.75592547572756e-05, + "loss": 2.3213, + "step": 56400 + }, + { + "epoch": 53.30195800896438, + "grad_norm": 1.0305556058883667, + "learning_rate": 4.738494512678562e-05, + "loss": 2.3247, + "step": 56500 + }, + { + "epoch": 53.396319886765745, + "grad_norm": 1.0632230043411255, + "learning_rate": 4.7210667359913984e-05, + "loss": 2.3313, + "step": 56600 + }, + { + "epoch": 53.490681764567114, + "grad_norm": 1.098626732826233, + "learning_rate": 4.7036423580180325e-05, + "loss": 2.3334, + "step": 56700 + }, + { + "epoch": 53.58504364236848, + "grad_norm": 1.1028692722320557, + "learning_rate": 4.6862215910690103e-05, + "loss": 2.3131, + "step": 56800 + }, + { + "epoch": 53.67940552016985, + "grad_norm": 1.1090211868286133, + "learning_rate": 4.668804647410876e-05, + "loss": 2.3232, + "step": 56900 + }, + { + "epoch": 53.77376739797122, + "grad_norm": 1.062287449836731, + "learning_rate": 4.6513917392635945e-05, + "loss": 2.3135, + "step": 57000 + }, + { + "epoch": 53.77376739797122, + "eval_loss": 2.322282314300537, + "eval_runtime": 74.4526, + "eval_samples_per_second": 202.424, + "eval_steps_per_second": 6.326, + "step": 57000 + }, + { + "epoch": 53.86812927577259, + "grad_norm": 1.0684916973114014, + "learning_rate": 4.6339830787979574e-05, + "loss": 2.3177, + "step": 57100 + }, + { + "epoch": 53.96249115357396, + "grad_norm": 1.158130407333374, + "learning_rate": 4.616578878132996e-05, + "loss": 2.3185, + "step": 57200 + }, + { + "epoch": 54.05661712668082, + "grad_norm": 1.0510168075561523, + "learning_rate": 4.5991793493334035e-05, + "loss": 2.3252, + "step": 57300 + }, + { + "epoch": 54.15097900448219, + "grad_norm": 1.1067076921463013, + "learning_rate": 4.58178470440695e-05, + "loss": 2.3158, + "step": 57400 + }, + { + "epoch": 54.24534088228356, + "grad_norm": 1.0520684719085693, + "learning_rate": 4.564395155301891e-05, + "loss": 2.3024, + "step": 57500 + }, + { + "epoch": 54.339702760084926, + "grad_norm": 1.1140958070755005, + "learning_rate": 4.5470109139043984e-05, + "loss": 2.3164, + "step": 57600 + }, + { + "epoch": 54.434064637886294, + "grad_norm": 1.0808565616607666, + "learning_rate": 4.529632192035965e-05, + "loss": 2.3067, + "step": 57700 + }, + { + "epoch": 54.52842651568766, + "grad_norm": 1.1175216436386108, + "learning_rate": 4.51225920145083e-05, + "loss": 2.3154, + "step": 57800 + }, + { + "epoch": 54.62278839348903, + "grad_norm": 1.0674301385879517, + "learning_rate": 4.494892153833406e-05, + "loss": 2.3054, + "step": 57900 + }, + { + "epoch": 54.7171502712904, + "grad_norm": 1.0364117622375488, + "learning_rate": 4.477531260795683e-05, + "loss": 2.3042, + "step": 58000 + }, + { + "epoch": 54.7171502712904, + "eval_loss": 2.3081283569335938, + "eval_runtime": 74.4358, + "eval_samples_per_second": 202.47, + "eval_steps_per_second": 6.328, + "step": 58000 + }, + { + "epoch": 54.81151214909177, + "grad_norm": 1.1012537479400635, + "learning_rate": 4.460176733874668e-05, + "loss": 2.2988, + "step": 58100 + }, + { + "epoch": 54.90587402689314, + "grad_norm": 1.0269622802734375, + "learning_rate": 4.442828784529791e-05, + "loss": 2.3111, + "step": 58200 + }, + { + "epoch": 55.0, + "grad_norm": 1.3486961126327515, + "learning_rate": 4.4254876241403444e-05, + "loss": 2.3058, + "step": 58300 + }, + { + "epoch": 55.09436187780137, + "grad_norm": 1.0656137466430664, + "learning_rate": 4.4081534640028924e-05, + "loss": 2.2969, + "step": 58400 + }, + { + "epoch": 55.18872375560274, + "grad_norm": 1.0997986793518066, + "learning_rate": 4.390826515328704e-05, + "loss": 2.2923, + "step": 58500 + }, + { + "epoch": 55.283085633404106, + "grad_norm": 1.061912178993225, + "learning_rate": 4.373506989241186e-05, + "loss": 2.2893, + "step": 58600 + }, + { + "epoch": 55.377447511205474, + "grad_norm": 1.12472403049469, + "learning_rate": 4.356195096773292e-05, + "loss": 2.2975, + "step": 58700 + }, + { + "epoch": 55.47180938900684, + "grad_norm": 1.0722994804382324, + "learning_rate": 4.338891048864973e-05, + "loss": 2.2996, + "step": 58800 + }, + { + "epoch": 55.56617126680821, + "grad_norm": 1.0319920778274536, + "learning_rate": 4.321595056360589e-05, + "loss": 2.2927, + "step": 58900 + }, + { + "epoch": 55.66053314460958, + "grad_norm": 1.0819764137268066, + "learning_rate": 4.304307330006352e-05, + "loss": 2.2922, + "step": 59000 + }, + { + "epoch": 55.66053314460958, + "eval_loss": 2.297449827194214, + "eval_runtime": 74.4766, + "eval_samples_per_second": 202.359, + "eval_steps_per_second": 6.324, + "step": 59000 + }, + { + "epoch": 55.75489502241095, + "grad_norm": 1.0915340185165405, + "learning_rate": 4.2870280804477525e-05, + "loss": 2.2955, + "step": 59100 + }, + { + "epoch": 55.84925690021232, + "grad_norm": 1.072871208190918, + "learning_rate": 4.2697575182269924e-05, + "loss": 2.2896, + "step": 59200 + }, + { + "epoch": 55.94361877801368, + "grad_norm": 1.1387864351272583, + "learning_rate": 4.2524958537804226e-05, + "loss": 2.2905, + "step": 59300 + }, + { + "epoch": 56.03774475112055, + "grad_norm": 1.0957666635513306, + "learning_rate": 4.235243297435975e-05, + "loss": 2.2877, + "step": 59400 + }, + { + "epoch": 56.13210662892192, + "grad_norm": 1.1053986549377441, + "learning_rate": 4.2180000594106076e-05, + "loss": 2.2804, + "step": 59500 + }, + { + "epoch": 56.226468506723286, + "grad_norm": 1.094224214553833, + "learning_rate": 4.200766349807731e-05, + "loss": 2.2813, + "step": 59600 + }, + { + "epoch": 56.320830384524655, + "grad_norm": 1.0825510025024414, + "learning_rate": 4.18354237861466e-05, + "loss": 2.2922, + "step": 59700 + }, + { + "epoch": 56.41519226232602, + "grad_norm": 1.0940465927124023, + "learning_rate": 4.1663283557000455e-05, + "loss": 2.2865, + "step": 59800 + }, + { + "epoch": 56.50955414012739, + "grad_norm": 1.053289771080017, + "learning_rate": 4.1491244908113266e-05, + "loss": 2.2806, + "step": 59900 + }, + { + "epoch": 56.60391601792875, + "grad_norm": 1.0980818271636963, + "learning_rate": 4.1319309935721695e-05, + "loss": 2.2792, + "step": 60000 + }, + { + "epoch": 56.60391601792875, + "eval_loss": 2.285236120223999, + "eval_runtime": 74.4453, + "eval_samples_per_second": 202.444, + "eval_steps_per_second": 6.327, + "step": 60000 + }, + { + "epoch": 56.69827789573012, + "grad_norm": 1.0499736070632935, + "learning_rate": 4.114748073479907e-05, + "loss": 2.2796, + "step": 60100 + }, + { + "epoch": 56.79263977353149, + "grad_norm": 1.0926116704940796, + "learning_rate": 4.097575939903003e-05, + "loss": 2.2816, + "step": 60200 + }, + { + "epoch": 56.88700165133286, + "grad_norm": 1.1019150018692017, + "learning_rate": 4.080414802078481e-05, + "loss": 2.2815, + "step": 60300 + }, + { + "epoch": 56.98136352913423, + "grad_norm": 1.0841056108474731, + "learning_rate": 4.063264869109395e-05, + "loss": 2.277, + "step": 60400 + }, + { + "epoch": 57.0754895022411, + "grad_norm": 1.0328353643417358, + "learning_rate": 4.046126349962261e-05, + "loss": 2.2802, + "step": 60500 + }, + { + "epoch": 57.169851380042466, + "grad_norm": 1.1018388271331787, + "learning_rate": 4.0289994534645305e-05, + "loss": 2.2737, + "step": 60600 + }, + { + "epoch": 57.26421325784383, + "grad_norm": 1.0932639837265015, + "learning_rate": 4.01188438830203e-05, + "loss": 2.2687, + "step": 60700 + }, + { + "epoch": 57.3585751356452, + "grad_norm": 1.0744473934173584, + "learning_rate": 3.994781363016427e-05, + "loss": 2.2597, + "step": 60800 + }, + { + "epoch": 57.452937013446565, + "grad_norm": 1.1107820272445679, + "learning_rate": 3.977690586002688e-05, + "loss": 2.2658, + "step": 60900 + }, + { + "epoch": 57.547298891247934, + "grad_norm": 1.0874978303909302, + "learning_rate": 3.9606122655065365e-05, + "loss": 2.2689, + "step": 61000 + }, + { + "epoch": 57.547298891247934, + "eval_loss": 2.2746315002441406, + "eval_runtime": 74.4452, + "eval_samples_per_second": 202.444, + "eval_steps_per_second": 6.327, + "step": 61000 + }, + { + "epoch": 57.6416607690493, + "grad_norm": 1.073294758796692, + "learning_rate": 3.943546609621921e-05, + "loss": 2.2597, + "step": 61100 + }, + { + "epoch": 57.73602264685067, + "grad_norm": 1.0983116626739502, + "learning_rate": 3.926493826288469e-05, + "loss": 2.2643, + "step": 61200 + }, + { + "epoch": 57.83038452465204, + "grad_norm": 1.0410650968551636, + "learning_rate": 3.909454123288968e-05, + "loss": 2.2739, + "step": 61300 + }, + { + "epoch": 57.92474640245341, + "grad_norm": 1.1241108179092407, + "learning_rate": 3.892427708246818e-05, + "loss": 2.2623, + "step": 61400 + }, + { + "epoch": 58.01887237556027, + "grad_norm": 1.1001108884811401, + "learning_rate": 3.875414788623515e-05, + "loss": 2.2569, + "step": 61500 + }, + { + "epoch": 58.11323425336164, + "grad_norm": 1.0980916023254395, + "learning_rate": 3.858415571716116e-05, + "loss": 2.2532, + "step": 61600 + }, + { + "epoch": 58.20759613116301, + "grad_norm": 1.1283693313598633, + "learning_rate": 3.8414302646547114e-05, + "loss": 2.2601, + "step": 61700 + }, + { + "epoch": 58.30195800896438, + "grad_norm": 1.0919461250305176, + "learning_rate": 3.824459074399911e-05, + "loss": 2.2589, + "step": 61800 + }, + { + "epoch": 58.396319886765745, + "grad_norm": 1.0695630311965942, + "learning_rate": 3.8075022077403095e-05, + "loss": 2.2421, + "step": 61900 + }, + { + "epoch": 58.490681764567114, + "grad_norm": 1.106446385383606, + "learning_rate": 3.790559871289979e-05, + "loss": 2.2574, + "step": 62000 + }, + { + "epoch": 58.490681764567114, + "eval_loss": 2.263988494873047, + "eval_runtime": 74.4531, + "eval_samples_per_second": 202.423, + "eval_steps_per_second": 6.326, + "step": 62000 + }, + { + "epoch": 58.58504364236848, + "grad_norm": 1.0683228969573975, + "learning_rate": 3.77363227148594e-05, + "loss": 2.2527, + "step": 62100 + }, + { + "epoch": 58.67940552016985, + "grad_norm": 1.1563806533813477, + "learning_rate": 3.756719614585656e-05, + "loss": 2.2533, + "step": 62200 + }, + { + "epoch": 58.77376739797122, + "grad_norm": 1.0460565090179443, + "learning_rate": 3.739822106664513e-05, + "loss": 2.2522, + "step": 62300 + }, + { + "epoch": 58.86812927577259, + "grad_norm": 1.110073208808899, + "learning_rate": 3.7229399536133106e-05, + "loss": 2.2516, + "step": 62400 + }, + { + "epoch": 58.96249115357396, + "grad_norm": 1.0516654253005981, + "learning_rate": 3.706073361135759e-05, + "loss": 2.2577, + "step": 62500 + }, + { + "epoch": 59.05661712668082, + "grad_norm": 1.1236464977264404, + "learning_rate": 3.6892225347459624e-05, + "loss": 2.2567, + "step": 62600 + }, + { + "epoch": 59.15097900448219, + "grad_norm": 1.089566946029663, + "learning_rate": 3.672387679765925e-05, + "loss": 2.2364, + "step": 62700 + }, + { + "epoch": 59.24534088228356, + "grad_norm": 1.051033854484558, + "learning_rate": 3.65556900132304e-05, + "loss": 2.2515, + "step": 62800 + }, + { + "epoch": 59.339702760084926, + "grad_norm": 1.058039903640747, + "learning_rate": 3.638766704347598e-05, + "loss": 2.2509, + "step": 62900 + }, + { + "epoch": 59.434064637886294, + "grad_norm": 1.0932092666625977, + "learning_rate": 3.621980993570283e-05, + "loss": 2.2361, + "step": 63000 + }, + { + "epoch": 59.434064637886294, + "eval_loss": 2.256521701812744, + "eval_runtime": 74.4604, + "eval_samples_per_second": 202.403, + "eval_steps_per_second": 6.326, + "step": 63000 + }, + { + "epoch": 59.52842651568766, + "grad_norm": 1.0966417789459229, + "learning_rate": 3.605212073519687e-05, + "loss": 2.2486, + "step": 63100 + }, + { + "epoch": 59.62278839348903, + "grad_norm": 1.0995506048202515, + "learning_rate": 3.588460148519808e-05, + "loss": 2.2425, + "step": 63200 + }, + { + "epoch": 59.7171502712904, + "grad_norm": 1.0373094081878662, + "learning_rate": 3.5717254226875605e-05, + "loss": 2.2398, + "step": 63300 + }, + { + "epoch": 59.81151214909177, + "grad_norm": 1.1059927940368652, + "learning_rate": 3.555008099930305e-05, + "loss": 2.2465, + "step": 63400 + }, + { + "epoch": 59.90587402689314, + "grad_norm": 1.104933261871338, + "learning_rate": 3.5383083839433385e-05, + "loss": 2.2386, + "step": 63500 + }, + { + "epoch": 60.0, + "grad_norm": 1.4146462678909302, + "learning_rate": 3.521626478207432e-05, + "loss": 2.2446, + "step": 63600 + }, + { + "epoch": 60.09436187780137, + "grad_norm": 1.073795199394226, + "learning_rate": 3.504962585986342e-05, + "loss": 2.2379, + "step": 63700 + }, + { + "epoch": 60.18872375560274, + "grad_norm": 1.1478886604309082, + "learning_rate": 3.488316910324338e-05, + "loss": 2.2373, + "step": 63800 + }, + { + "epoch": 60.283085633404106, + "grad_norm": 1.112461805343628, + "learning_rate": 3.471689654043724e-05, + "loss": 2.2395, + "step": 63900 + }, + { + "epoch": 60.377447511205474, + "grad_norm": 1.0776383876800537, + "learning_rate": 3.455081019742368e-05, + "loss": 2.2296, + "step": 64000 + }, + { + "epoch": 60.377447511205474, + "eval_loss": 2.2467448711395264, + "eval_runtime": 74.4598, + "eval_samples_per_second": 202.404, + "eval_steps_per_second": 6.326, + "step": 64000 + }, + { + "epoch": 60.47180938900684, + "grad_norm": 1.0833407640457153, + "learning_rate": 3.438491209791242e-05, + "loss": 2.2313, + "step": 64100 + }, + { + "epoch": 60.56617126680821, + "grad_norm": 1.1021618843078613, + "learning_rate": 3.42192042633194e-05, + "loss": 2.2326, + "step": 64200 + }, + { + "epoch": 60.66053314460958, + "grad_norm": 1.0688246488571167, + "learning_rate": 3.405368871274234e-05, + "loss": 2.2187, + "step": 64300 + }, + { + "epoch": 60.75489502241095, + "grad_norm": 1.0851497650146484, + "learning_rate": 3.3888367462935946e-05, + "loss": 2.2369, + "step": 64400 + }, + { + "epoch": 60.84925690021232, + "grad_norm": 1.0779963731765747, + "learning_rate": 3.3723242528287515e-05, + "loss": 2.2317, + "step": 64500 + }, + { + "epoch": 60.94361877801368, + "grad_norm": 1.113876461982727, + "learning_rate": 3.355831592079223e-05, + "loss": 2.2348, + "step": 64600 + }, + { + "epoch": 61.03774475112055, + "grad_norm": 1.0656490325927734, + "learning_rate": 3.3393589650028766e-05, + "loss": 2.2316, + "step": 64700 + }, + { + "epoch": 61.13210662892192, + "grad_norm": 1.1197242736816406, + "learning_rate": 3.322906572313477e-05, + "loss": 2.2145, + "step": 64800 + }, + { + "epoch": 61.226468506723286, + "grad_norm": 1.1261602640151978, + "learning_rate": 3.306474614478234e-05, + "loss": 2.2289, + "step": 64900 + }, + { + "epoch": 61.320830384524655, + "grad_norm": 1.1310150623321533, + "learning_rate": 3.2900632917153705e-05, + "loss": 2.2245, + "step": 65000 + }, + { + "epoch": 61.320830384524655, + "eval_loss": 2.2367091178894043, + "eval_runtime": 74.4387, + "eval_samples_per_second": 202.462, + "eval_steps_per_second": 6.327, + "step": 65000 + }, + { + "epoch": 61.41519226232602, + "grad_norm": 1.089492678642273, + "learning_rate": 3.273672803991673e-05, + "loss": 2.2191, + "step": 65100 + }, + { + "epoch": 61.50955414012739, + "grad_norm": 1.0696125030517578, + "learning_rate": 3.257303351020066e-05, + "loss": 2.2212, + "step": 65200 + }, + { + "epoch": 61.60391601792875, + "grad_norm": 1.116721749305725, + "learning_rate": 3.240955132257162e-05, + "loss": 2.2255, + "step": 65300 + }, + { + "epoch": 61.69827789573012, + "grad_norm": 1.1005743741989136, + "learning_rate": 3.224628346900853e-05, + "loss": 2.2215, + "step": 65400 + }, + { + "epoch": 61.79263977353149, + "grad_norm": 1.0481823682785034, + "learning_rate": 3.208323193887863e-05, + "loss": 2.2267, + "step": 65500 + }, + { + "epoch": 61.88700165133286, + "grad_norm": 1.0808483362197876, + "learning_rate": 3.192039871891336e-05, + "loss": 2.2148, + "step": 65600 + }, + { + "epoch": 61.98136352913423, + "grad_norm": 1.1039003133773804, + "learning_rate": 3.1757785793184144e-05, + "loss": 2.2253, + "step": 65700 + }, + { + "epoch": 62.0754895022411, + "grad_norm": 1.0520274639129639, + "learning_rate": 3.159539514307812e-05, + "loss": 2.2174, + "step": 65800 + }, + { + "epoch": 62.169851380042466, + "grad_norm": 1.1201003789901733, + "learning_rate": 3.143322874727417e-05, + "loss": 2.2051, + "step": 65900 + }, + { + "epoch": 62.26421325784383, + "grad_norm": 1.1086193323135376, + "learning_rate": 3.1271288581718586e-05, + "loss": 2.2222, + "step": 66000 + }, + { + "epoch": 62.26421325784383, + "eval_loss": 2.226095676422119, + "eval_runtime": 74.4774, + "eval_samples_per_second": 202.357, + "eval_steps_per_second": 6.324, + "step": 66000 + }, + { + "epoch": 62.3585751356452, + "grad_norm": 1.087605357170105, + "learning_rate": 3.1109576619601245e-05, + "loss": 2.2157, + "step": 66100 + }, + { + "epoch": 62.452937013446565, + "grad_norm": 1.1039416790008545, + "learning_rate": 3.0948094831331334e-05, + "loss": 2.2156, + "step": 66200 + }, + { + "epoch": 62.547298891247934, + "grad_norm": 1.0668971538543701, + "learning_rate": 3.078684518451346e-05, + "loss": 2.2123, + "step": 66300 + }, + { + "epoch": 62.6416607690493, + "grad_norm": 1.1140378713607788, + "learning_rate": 3.062582964392373e-05, + "loss": 2.2119, + "step": 66400 + }, + { + "epoch": 62.73602264685067, + "grad_norm": 1.0850133895874023, + "learning_rate": 3.0465050171485677e-05, + "loss": 2.2167, + "step": 66500 + }, + { + "epoch": 62.83038452465204, + "grad_norm": 1.1161259412765503, + "learning_rate": 3.0304508726246428e-05, + "loss": 2.2059, + "step": 66600 + }, + { + "epoch": 62.92474640245341, + "grad_norm": 1.0978820323944092, + "learning_rate": 3.0144207264352814e-05, + "loss": 2.2089, + "step": 66700 + }, + { + "epoch": 63.01887237556027, + "grad_norm": 1.107197880744934, + "learning_rate": 2.99841477390276e-05, + "loss": 2.2076, + "step": 66800 + }, + { + "epoch": 63.11323425336164, + "grad_norm": 1.0904182195663452, + "learning_rate": 2.982433210054557e-05, + "loss": 2.1997, + "step": 66900 + }, + { + "epoch": 63.20759613116301, + "grad_norm": 1.082038402557373, + "learning_rate": 2.9664762296209824e-05, + "loss": 2.2159, + "step": 67000 + }, + { + "epoch": 63.20759613116301, + "eval_loss": 2.2268171310424805, + "eval_runtime": 74.4765, + "eval_samples_per_second": 202.359, + "eval_steps_per_second": 6.324, + "step": 67000 + }, + { + "epoch": 63.30195800896438, + "grad_norm": 1.0698933601379395, + "learning_rate": 2.9505440270328112e-05, + "loss": 2.2073, + "step": 67100 + }, + { + "epoch": 63.396319886765745, + "grad_norm": 1.1355483531951904, + "learning_rate": 2.9346367964188992e-05, + "loss": 2.2007, + "step": 67200 + }, + { + "epoch": 63.490681764567114, + "grad_norm": 1.0896204710006714, + "learning_rate": 2.918754731603835e-05, + "loss": 2.2074, + "step": 67300 + }, + { + "epoch": 63.58504364236848, + "grad_norm": 1.0855309963226318, + "learning_rate": 2.9028980261055637e-05, + "loss": 2.2171, + "step": 67400 + }, + { + "epoch": 63.67940552016985, + "grad_norm": 1.0763812065124512, + "learning_rate": 2.8870668731330426e-05, + "loss": 2.2012, + "step": 67500 + }, + { + "epoch": 63.77376739797122, + "grad_norm": 1.0958815813064575, + "learning_rate": 2.8712614655838683e-05, + "loss": 2.2026, + "step": 67600 + }, + { + "epoch": 63.86812927577259, + "grad_norm": 1.1061776876449585, + "learning_rate": 2.8554819960419493e-05, + "loss": 2.1959, + "step": 67700 + }, + { + "epoch": 63.96249115357396, + "grad_norm": 1.0832457542419434, + "learning_rate": 2.8397286567751397e-05, + "loss": 2.1905, + "step": 67800 + }, + { + "epoch": 64.05661712668082, + "grad_norm": 1.0960924625396729, + "learning_rate": 2.824001639732905e-05, + "loss": 2.1935, + "step": 67900 + }, + { + "epoch": 64.1509790044822, + "grad_norm": 1.039017677307129, + "learning_rate": 2.8083011365439892e-05, + "loss": 2.1978, + "step": 68000 + }, + { + "epoch": 64.1509790044822, + "eval_loss": 2.213329315185547, + "eval_runtime": 74.4287, + "eval_samples_per_second": 202.489, + "eval_steps_per_second": 6.328, + "step": 68000 + }, + { + "epoch": 64.24534088228356, + "grad_norm": 1.052176594734192, + "learning_rate": 2.792627338514065e-05, + "loss": 2.1899, + "step": 68100 + }, + { + "epoch": 64.33970276008493, + "grad_norm": 1.1135153770446777, + "learning_rate": 2.7769804366234187e-05, + "loss": 2.1966, + "step": 68200 + }, + { + "epoch": 64.4340646378863, + "grad_norm": 1.0758830308914185, + "learning_rate": 2.7613606215246067e-05, + "loss": 2.2014, + "step": 68300 + }, + { + "epoch": 64.52842651568766, + "grad_norm": 1.0544888973236084, + "learning_rate": 2.7457680835401533e-05, + "loss": 2.1979, + "step": 68400 + }, + { + "epoch": 64.62278839348903, + "grad_norm": 1.0929769277572632, + "learning_rate": 2.730203012660209e-05, + "loss": 2.1956, + "step": 68500 + }, + { + "epoch": 64.7171502712904, + "grad_norm": 1.10369074344635, + "learning_rate": 2.714665598540249e-05, + "loss": 2.1912, + "step": 68600 + }, + { + "epoch": 64.81151214909177, + "grad_norm": 1.0878170728683472, + "learning_rate": 2.699156030498764e-05, + "loss": 2.1846, + "step": 68700 + }, + { + "epoch": 64.90587402689313, + "grad_norm": 1.1195570230484009, + "learning_rate": 2.6836744975149463e-05, + "loss": 2.1976, + "step": 68800 + }, + { + "epoch": 65.0, + "grad_norm": 1.416730523109436, + "learning_rate": 2.6682211882263873e-05, + "loss": 2.2005, + "step": 68900 + }, + { + "epoch": 65.09436187780136, + "grad_norm": 1.0857688188552856, + "learning_rate": 2.6527962909267856e-05, + "loss": 2.1879, + "step": 69000 + }, + { + "epoch": 65.09436187780136, + "eval_loss": 2.2124412059783936, + "eval_runtime": 74.4579, + "eval_samples_per_second": 202.41, + "eval_steps_per_second": 6.326, + "step": 69000 + }, + { + "epoch": 65.18872375560274, + "grad_norm": 1.096017837524414, + "learning_rate": 2.637399993563648e-05, + "loss": 2.1833, + "step": 69100 + }, + { + "epoch": 65.2830856334041, + "grad_norm": 1.0832054615020752, + "learning_rate": 2.6220324837359956e-05, + "loss": 2.193, + "step": 69200 + }, + { + "epoch": 65.37744751120547, + "grad_norm": 1.1171544790267944, + "learning_rate": 2.6066939486920904e-05, + "loss": 2.187, + "step": 69300 + }, + { + "epoch": 65.47180938900684, + "grad_norm": 1.0824764966964722, + "learning_rate": 2.5913845753271393e-05, + "loss": 2.1909, + "step": 69400 + }, + { + "epoch": 65.56617126680821, + "grad_norm": 1.1256800889968872, + "learning_rate": 2.5761045501810222e-05, + "loss": 2.1933, + "step": 69500 + }, + { + "epoch": 65.66053314460957, + "grad_norm": 1.117341160774231, + "learning_rate": 2.560854059436029e-05, + "loss": 2.189, + "step": 69600 + }, + { + "epoch": 65.75489502241095, + "grad_norm": 1.0918456315994263, + "learning_rate": 2.5456332889145718e-05, + "loss": 2.1832, + "step": 69700 + }, + { + "epoch": 65.84925690021231, + "grad_norm": 1.0791367292404175, + "learning_rate": 2.530442424076941e-05, + "loss": 2.1835, + "step": 69800 + }, + { + "epoch": 65.94361877801369, + "grad_norm": 1.0746275186538696, + "learning_rate": 2.5152816500190253e-05, + "loss": 2.1803, + "step": 69900 + }, + { + "epoch": 66.03774475112054, + "grad_norm": 1.0764819383621216, + "learning_rate": 2.500151151470077e-05, + "loss": 2.1937, + "step": 70000 + }, + { + "epoch": 66.03774475112054, + "eval_loss": 2.2022976875305176, + "eval_runtime": 74.4488, + "eval_samples_per_second": 202.434, + "eval_steps_per_second": 6.326, + "step": 70000 + }, + { + "epoch": 66.13210662892192, + "grad_norm": 1.081490397453308, + "learning_rate": 2.4850511127904437e-05, + "loss": 2.1802, + "step": 70100 + }, + { + "epoch": 66.22646850672328, + "grad_norm": 1.111242413520813, + "learning_rate": 2.469981717969329e-05, + "loss": 2.1761, + "step": 70200 + }, + { + "epoch": 66.32083038452465, + "grad_norm": 1.1172429323196411, + "learning_rate": 2.4549431506225586e-05, + "loss": 2.1866, + "step": 70300 + }, + { + "epoch": 66.41519226232602, + "grad_norm": 1.1029917001724243, + "learning_rate": 2.4399355939903245e-05, + "loss": 2.1849, + "step": 70400 + }, + { + "epoch": 66.50955414012739, + "grad_norm": 1.1302073001861572, + "learning_rate": 2.4249592309349728e-05, + "loss": 2.1863, + "step": 70500 + }, + { + "epoch": 66.60391601792875, + "grad_norm": 1.1342121362686157, + "learning_rate": 2.410014243938757e-05, + "loss": 2.1741, + "step": 70600 + }, + { + "epoch": 66.69827789573013, + "grad_norm": 1.1101330518722534, + "learning_rate": 2.3951008151016285e-05, + "loss": 2.1791, + "step": 70700 + }, + { + "epoch": 66.79263977353149, + "grad_norm": 1.1350260972976685, + "learning_rate": 2.380219126139014e-05, + "loss": 2.1803, + "step": 70800 + }, + { + "epoch": 66.88700165133287, + "grad_norm": 1.0897001028060913, + "learning_rate": 2.3653693583795932e-05, + "loss": 2.171, + "step": 70900 + }, + { + "epoch": 66.98136352913423, + "grad_norm": 1.1115797758102417, + "learning_rate": 2.3505516927631037e-05, + "loss": 2.1757, + "step": 71000 + }, + { + "epoch": 66.98136352913423, + "eval_loss": 2.197711706161499, + "eval_runtime": 74.4988, + "eval_samples_per_second": 202.299, + "eval_steps_per_second": 6.322, + "step": 71000 + }, + { + "epoch": 67.0754895022411, + "grad_norm": 1.1194738149642944, + "learning_rate": 2.3357663098381217e-05, + "loss": 2.1772, + "step": 71100 + }, + { + "epoch": 67.16985138004246, + "grad_norm": 1.0791974067687988, + "learning_rate": 2.3210133897598744e-05, + "loss": 2.1742, + "step": 71200 + }, + { + "epoch": 67.26421325784383, + "grad_norm": 1.1039748191833496, + "learning_rate": 2.3062931122880348e-05, + "loss": 2.17, + "step": 71300 + }, + { + "epoch": 67.3585751356452, + "grad_norm": 1.0685378313064575, + "learning_rate": 2.2916056567845418e-05, + "loss": 2.1741, + "step": 71400 + }, + { + "epoch": 67.45293701344657, + "grad_norm": 1.154982328414917, + "learning_rate": 2.276951202211402e-05, + "loss": 2.1699, + "step": 71500 + }, + { + "epoch": 67.54729889124793, + "grad_norm": 1.0707484483718872, + "learning_rate": 2.262329927128523e-05, + "loss": 2.1746, + "step": 71600 + }, + { + "epoch": 67.64166076904931, + "grad_norm": 1.0761550664901733, + "learning_rate": 2.2477420096915257e-05, + "loss": 2.1778, + "step": 71700 + }, + { + "epoch": 67.73602264685067, + "grad_norm": 1.1259441375732422, + "learning_rate": 2.2331876276495796e-05, + "loss": 2.1711, + "step": 71800 + }, + { + "epoch": 67.83038452465205, + "grad_norm": 1.1506456136703491, + "learning_rate": 2.218666958343239e-05, + "loss": 2.1731, + "step": 71900 + }, + { + "epoch": 67.92474640245341, + "grad_norm": 1.1114802360534668, + "learning_rate": 2.2041801787022742e-05, + "loss": 2.1747, + "step": 72000 + }, + { + "epoch": 67.92474640245341, + "eval_loss": 2.1910886764526367, + "eval_runtime": 74.4667, + "eval_samples_per_second": 202.386, + "eval_steps_per_second": 6.325, + "step": 72000 + }, + { + "epoch": 68.01887237556028, + "grad_norm": 1.0863537788391113, + "learning_rate": 2.189727465243527e-05, + "loss": 2.1734, + "step": 72100 + }, + { + "epoch": 68.11323425336164, + "grad_norm": 1.1258056163787842, + "learning_rate": 2.1753089940687456e-05, + "loss": 2.1707, + "step": 72200 + }, + { + "epoch": 68.20759613116302, + "grad_norm": 1.1622432470321655, + "learning_rate": 2.1609249408624547e-05, + "loss": 2.1651, + "step": 72300 + }, + { + "epoch": 68.30195800896438, + "grad_norm": 1.0975794792175293, + "learning_rate": 2.1465754808898007e-05, + "loss": 2.1642, + "step": 72400 + }, + { + "epoch": 68.39631988676575, + "grad_norm": 1.0919430255889893, + "learning_rate": 2.132260788994428e-05, + "loss": 2.1662, + "step": 72500 + }, + { + "epoch": 68.49068176456711, + "grad_norm": 1.0746418237686157, + "learning_rate": 2.1179810395963363e-05, + "loss": 2.1791, + "step": 72600 + }, + { + "epoch": 68.58504364236849, + "grad_norm": 1.0876717567443848, + "learning_rate": 2.1037364066897674e-05, + "loss": 2.1635, + "step": 72700 + }, + { + "epoch": 68.67940552016985, + "grad_norm": 1.1063474416732788, + "learning_rate": 2.0895270638410797e-05, + "loss": 2.1609, + "step": 72800 + }, + { + "epoch": 68.77376739797121, + "grad_norm": 1.1409770250320435, + "learning_rate": 2.0753531841866282e-05, + "loss": 2.1673, + "step": 72900 + }, + { + "epoch": 68.86812927577259, + "grad_norm": 1.0966746807098389, + "learning_rate": 2.0612149404306664e-05, + "loss": 2.1635, + "step": 73000 + }, + { + "epoch": 68.86812927577259, + "eval_loss": 2.1918320655822754, + "eval_runtime": 74.4341, + "eval_samples_per_second": 202.475, + "eval_steps_per_second": 6.328, + "step": 73000 + }, + { + "epoch": 68.96249115357395, + "grad_norm": 1.1267166137695312, + "learning_rate": 2.0471125048432317e-05, + "loss": 2.1717, + "step": 73100 + }, + { + "epoch": 69.05661712668082, + "grad_norm": 1.0646253824234009, + "learning_rate": 2.033046049258049e-05, + "loss": 2.1562, + "step": 73200 + }, + { + "epoch": 69.1509790044822, + "grad_norm": 1.1580528020858765, + "learning_rate": 2.0190157450704443e-05, + "loss": 2.1547, + "step": 73300 + }, + { + "epoch": 69.24534088228356, + "grad_norm": 1.0343353748321533, + "learning_rate": 2.005021763235243e-05, + "loss": 2.1636, + "step": 73400 + }, + { + "epoch": 69.33970276008493, + "grad_norm": 1.0552024841308594, + "learning_rate": 1.9910642742647013e-05, + "loss": 2.1634, + "step": 73500 + }, + { + "epoch": 69.4340646378863, + "grad_norm": 1.071376919746399, + "learning_rate": 1.9771434482264132e-05, + "loss": 2.1661, + "step": 73600 + }, + { + "epoch": 69.52842651568766, + "grad_norm": 1.0911625623703003, + "learning_rate": 1.963259454741255e-05, + "loss": 2.1656, + "step": 73700 + }, + { + "epoch": 69.62278839348903, + "grad_norm": 1.0990372896194458, + "learning_rate": 1.949412462981302e-05, + "loss": 2.1524, + "step": 73800 + }, + { + "epoch": 69.7171502712904, + "grad_norm": 1.093256950378418, + "learning_rate": 1.935602641667783e-05, + "loss": 2.1594, + "step": 73900 + }, + { + "epoch": 69.81151214909177, + "grad_norm": 1.104551911354065, + "learning_rate": 1.9218301590690103e-05, + "loss": 2.1617, + "step": 74000 + }, + { + "epoch": 69.81151214909177, + "eval_loss": 2.179980993270874, + "eval_runtime": 74.4861, + "eval_samples_per_second": 202.333, + "eval_steps_per_second": 6.323, + "step": 74000 + }, + { + "epoch": 69.90587402689313, + "grad_norm": 1.1245603561401367, + "learning_rate": 1.9080951829983358e-05, + "loss": 2.1602, + "step": 74100 + }, + { + "epoch": 70.0, + "grad_norm": 1.3696141242980957, + "learning_rate": 1.894397880812113e-05, + "loss": 2.1683, + "step": 74200 + }, + { + "epoch": 70.09436187780136, + "grad_norm": 1.111162543296814, + "learning_rate": 1.8807384194076426e-05, + "loss": 2.1507, + "step": 74300 + }, + { + "epoch": 70.18872375560274, + "grad_norm": 1.0995087623596191, + "learning_rate": 1.8671169652211524e-05, + "loss": 2.1524, + "step": 74400 + }, + { + "epoch": 70.2830856334041, + "grad_norm": 1.139926791191101, + "learning_rate": 1.8535336842257657e-05, + "loss": 2.1656, + "step": 74500 + }, + { + "epoch": 70.37744751120547, + "grad_norm": 1.114806056022644, + "learning_rate": 1.8399887419294696e-05, + "loss": 2.1443, + "step": 74600 + }, + { + "epoch": 70.47180938900684, + "grad_norm": 1.0854475498199463, + "learning_rate": 1.826482303373117e-05, + "loss": 2.1584, + "step": 74700 + }, + { + "epoch": 70.56617126680821, + "grad_norm": 1.087457299232483, + "learning_rate": 1.8130145331283948e-05, + "loss": 2.1424, + "step": 74800 + }, + { + "epoch": 70.66053314460957, + "grad_norm": 1.1035183668136597, + "learning_rate": 1.799585595295837e-05, + "loss": 2.1533, + "step": 74900 + }, + { + "epoch": 70.75489502241095, + "grad_norm": 1.1080502271652222, + "learning_rate": 1.786195653502809e-05, + "loss": 2.1583, + "step": 75000 + }, + { + "epoch": 70.75489502241095, + "eval_loss": 2.1788103580474854, + "eval_runtime": 74.4505, + "eval_samples_per_second": 202.43, + "eval_steps_per_second": 6.326, + "step": 75000 + }, + { + "epoch": 70.84925690021231, + "grad_norm": 1.12478506565094, + "learning_rate": 1.7728448709015304e-05, + "loss": 2.1595, + "step": 75100 + }, + { + "epoch": 70.94361877801369, + "grad_norm": 1.0864694118499756, + "learning_rate": 1.7595334101670703e-05, + "loss": 2.1529, + "step": 75200 + }, + { + "epoch": 71.03774475112054, + "grad_norm": 1.1255881786346436, + "learning_rate": 1.7462614334953798e-05, + "loss": 2.162, + "step": 75300 + }, + { + "epoch": 71.13210662892192, + "grad_norm": 1.1375105381011963, + "learning_rate": 1.733029102601303e-05, + "loss": 2.1481, + "step": 75400 + }, + { + "epoch": 71.22646850672328, + "grad_norm": 1.1133302450180054, + "learning_rate": 1.71983657871662e-05, + "loss": 2.1476, + "step": 75500 + }, + { + "epoch": 71.32083038452465, + "grad_norm": 1.090040683746338, + "learning_rate": 1.706684022588068e-05, + "loss": 2.153, + "step": 75600 + }, + { + "epoch": 71.41519226232602, + "grad_norm": 1.110629916191101, + "learning_rate": 1.6935715944753928e-05, + "loss": 2.147, + "step": 75700 + }, + { + "epoch": 71.50955414012739, + "grad_norm": 1.1086583137512207, + "learning_rate": 1.6804994541493953e-05, + "loss": 2.1422, + "step": 75800 + }, + { + "epoch": 71.60391601792875, + "grad_norm": 1.1097800731658936, + "learning_rate": 1.6674677608899763e-05, + "loss": 2.1469, + "step": 75900 + }, + { + "epoch": 71.69827789573013, + "grad_norm": 1.1248265504837036, + "learning_rate": 1.6544766734842093e-05, + "loss": 2.153, + "step": 76000 + }, + { + "epoch": 71.69827789573013, + "eval_loss": 2.1701321601867676, + "eval_runtime": 74.4907, + "eval_samples_per_second": 202.321, + "eval_steps_per_second": 6.323, + "step": 76000 + }, + { + "epoch": 71.79263977353149, + "grad_norm": 1.0759069919586182, + "learning_rate": 1.641526350224392e-05, + "loss": 2.1415, + "step": 76100 + }, + { + "epoch": 71.88700165133287, + "grad_norm": 1.1383440494537354, + "learning_rate": 1.628616948906129e-05, + "loss": 2.146, + "step": 76200 + }, + { + "epoch": 71.98136352913423, + "grad_norm": 1.1579523086547852, + "learning_rate": 1.615748626826398e-05, + "loss": 2.1467, + "step": 76300 + }, + { + "epoch": 72.0754895022411, + "grad_norm": 1.0908124446868896, + "learning_rate": 1.602921540781645e-05, + "loss": 2.1503, + "step": 76400 + }, + { + "epoch": 72.16985138004246, + "grad_norm": 1.1176483631134033, + "learning_rate": 1.5901358470658667e-05, + "loss": 2.1398, + "step": 76500 + }, + { + "epoch": 72.26421325784383, + "grad_norm": 1.0756431818008423, + "learning_rate": 1.5773917014687024e-05, + "loss": 2.1512, + "step": 76600 + }, + { + "epoch": 72.3585751356452, + "grad_norm": 1.0899015665054321, + "learning_rate": 1.5646892592735478e-05, + "loss": 2.1345, + "step": 76700 + }, + { + "epoch": 72.45293701344657, + "grad_norm": 1.118424892425537, + "learning_rate": 1.55202867525565e-05, + "loss": 2.1433, + "step": 76800 + }, + { + "epoch": 72.54729889124793, + "grad_norm": 1.1094263792037964, + "learning_rate": 1.5394101036802316e-05, + "loss": 2.1485, + "step": 76900 + }, + { + "epoch": 72.64166076904931, + "grad_norm": 1.0856695175170898, + "learning_rate": 1.5268336983006048e-05, + "loss": 2.1478, + "step": 77000 + }, + { + "epoch": 72.64166076904931, + "eval_loss": 2.1706361770629883, + "eval_runtime": 74.46, + "eval_samples_per_second": 202.404, + "eval_steps_per_second": 6.326, + "step": 77000 + }, + { + "epoch": 72.73602264685067, + "grad_norm": 1.095223069190979, + "learning_rate": 1.514299612356298e-05, + "loss": 2.1441, + "step": 77100 + }, + { + "epoch": 72.83038452465205, + "grad_norm": 1.1260662078857422, + "learning_rate": 1.5018079985711963e-05, + "loss": 2.1505, + "step": 77200 + }, + { + "epoch": 72.92474640245341, + "grad_norm": 1.1021870374679565, + "learning_rate": 1.4893590091516686e-05, + "loss": 2.1412, + "step": 77300 + }, + { + "epoch": 73.01887237556028, + "grad_norm": 1.0860086679458618, + "learning_rate": 1.4769527957847246e-05, + "loss": 2.1347, + "step": 77400 + }, + { + "epoch": 73.11323425336164, + "grad_norm": 1.0829050540924072, + "learning_rate": 1.4645895096361568e-05, + "loss": 2.135, + "step": 77500 + }, + { + "epoch": 73.20759613116302, + "grad_norm": 1.0621412992477417, + "learning_rate": 1.4522693013487077e-05, + "loss": 2.1357, + "step": 77600 + }, + { + "epoch": 73.30195800896438, + "grad_norm": 1.0788235664367676, + "learning_rate": 1.439992321040225e-05, + "loss": 2.143, + "step": 77700 + }, + { + "epoch": 73.39631988676575, + "grad_norm": 1.1159855127334595, + "learning_rate": 1.427758718301842e-05, + "loss": 2.1329, + "step": 77800 + }, + { + "epoch": 73.49068176456711, + "grad_norm": 1.1019457578659058, + "learning_rate": 1.4155686421961456e-05, + "loss": 2.1427, + "step": 77900 + }, + { + "epoch": 73.58504364236849, + "grad_norm": 1.1145848035812378, + "learning_rate": 1.4034222412553655e-05, + "loss": 2.1413, + "step": 78000 + }, + { + "epoch": 73.58504364236849, + "eval_loss": 2.165809154510498, + "eval_runtime": 74.4896, + "eval_samples_per_second": 202.324, + "eval_steps_per_second": 6.323, + "step": 78000 + }, + { + "epoch": 73.67940552016985, + "grad_norm": 1.0918891429901123, + "learning_rate": 1.3913196634795644e-05, + "loss": 2.1466, + "step": 78100 + }, + { + "epoch": 73.77376739797121, + "grad_norm": 1.0411089658737183, + "learning_rate": 1.3792610563348352e-05, + "loss": 2.1351, + "step": 78200 + }, + { + "epoch": 73.86812927577259, + "grad_norm": 1.089095950126648, + "learning_rate": 1.3672465667514977e-05, + "loss": 2.1422, + "step": 78300 + }, + { + "epoch": 73.96249115357395, + "grad_norm": 1.1013556718826294, + "learning_rate": 1.3552763411223173e-05, + "loss": 2.1326, + "step": 78400 + }, + { + "epoch": 74.05661712668082, + "grad_norm": 1.0978729724884033, + "learning_rate": 1.3433505253007172e-05, + "loss": 2.1358, + "step": 78500 + }, + { + "epoch": 74.1509790044822, + "grad_norm": 1.1121299266815186, + "learning_rate": 1.3314692645989978e-05, + "loss": 2.1314, + "step": 78600 + }, + { + "epoch": 74.24534088228356, + "grad_norm": 1.087158441543579, + "learning_rate": 1.3196327037865701e-05, + "loss": 2.1449, + "step": 78700 + }, + { + "epoch": 74.33970276008493, + "grad_norm": 1.1194955110549927, + "learning_rate": 1.3078409870881952e-05, + "loss": 2.1358, + "step": 78800 + }, + { + "epoch": 74.4340646378863, + "grad_norm": 1.1199134588241577, + "learning_rate": 1.2960942581822166e-05, + "loss": 2.1389, + "step": 78900 + }, + { + "epoch": 74.52842651568766, + "grad_norm": 1.0960466861724854, + "learning_rate": 1.2843926601988227e-05, + "loss": 2.1328, + "step": 79000 + }, + { + "epoch": 74.52842651568766, + "eval_loss": 2.165520429611206, + "eval_runtime": 74.4987, + "eval_samples_per_second": 202.299, + "eval_steps_per_second": 6.322, + "step": 79000 + }, + { + "epoch": 74.62278839348903, + "grad_norm": 1.1289753913879395, + "learning_rate": 1.272736335718288e-05, + "loss": 2.1309, + "step": 79100 + }, + { + "epoch": 74.7171502712904, + "grad_norm": 1.109093427658081, + "learning_rate": 1.2611254267692518e-05, + "loss": 2.1349, + "step": 79200 + }, + { + "epoch": 74.81151214909177, + "grad_norm": 1.1008985042572021, + "learning_rate": 1.2495600748269732e-05, + "loss": 2.1366, + "step": 79300 + }, + { + "epoch": 74.90587402689313, + "grad_norm": 1.0962027311325073, + "learning_rate": 1.2380404208116148e-05, + "loss": 2.1223, + "step": 79400 + }, + { + "epoch": 75.0, + "grad_norm": 1.3603086471557617, + "learning_rate": 1.2265666050865283e-05, + "loss": 2.1387, + "step": 79500 + }, + { + "epoch": 75.09436187780136, + "grad_norm": 1.0967055559158325, + "learning_rate": 1.215138767456534e-05, + "loss": 2.1333, + "step": 79600 + }, + { + "epoch": 75.18872375560274, + "grad_norm": 1.1414368152618408, + "learning_rate": 1.2037570471662307e-05, + "loss": 2.1335, + "step": 79700 + }, + { + "epoch": 75.2830856334041, + "grad_norm": 1.0993640422821045, + "learning_rate": 1.1924215828982842e-05, + "loss": 2.1304, + "step": 79800 + }, + { + "epoch": 75.37744751120547, + "grad_norm": 1.0678730010986328, + "learning_rate": 1.1811325127717544e-05, + "loss": 2.1344, + "step": 79900 + }, + { + "epoch": 75.47180938900684, + "grad_norm": 1.0797874927520752, + "learning_rate": 1.169889974340393e-05, + "loss": 2.1234, + "step": 80000 + }, + { + "epoch": 75.47180938900684, + "eval_loss": 2.1577999591827393, + "eval_runtime": 74.4676, + "eval_samples_per_second": 202.383, + "eval_steps_per_second": 6.325, + "step": 80000 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 95, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6943289668288e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}