diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5569 @@ +{ + "best_metric": 3.2710256576538086, + "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_6910/checkpoint-30000", + "epoch": 20.0, + "eval_steps": 1000, + "global_step": 37100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.026954177897574125, + "grad_norm": 8.296930313110352, + "learning_rate": 0.000276, + "loss": 44.82, + "step": 50 + }, + { + "epoch": 0.05390835579514825, + "grad_norm": 11.741185188293457, + "learning_rate": 0.0005759999999999999, + "loss": 34.5172, + "step": 100 + }, + { + "epoch": 0.08086253369272237, + "grad_norm": 9.069363594055176, + "learning_rate": 0.000599254054054054, + "loss": 31.9796, + "step": 150 + }, + { + "epoch": 0.1078167115902965, + "grad_norm": 8.488612174987793, + "learning_rate": 0.0005984432432432432, + "loss": 30.1663, + "step": 200 + }, + { + "epoch": 0.1347708894878706, + "grad_norm": 3.753924608230591, + "learning_rate": 0.0005976324324324324, + "loss": 29.1509, + "step": 250 + }, + { + "epoch": 0.16172506738544473, + "grad_norm": 5.127024173736572, + "learning_rate": 0.0005968216216216216, + "loss": 28.2834, + "step": 300 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 8.49015998840332, + "learning_rate": 0.0005960108108108108, + "loss": 27.4926, + "step": 350 + }, + { + "epoch": 0.215633423180593, + "grad_norm": 5.718660831451416, + "learning_rate": 0.0005951999999999999, + "loss": 26.72, + "step": 400 + }, + { + "epoch": 0.24258760107816713, + "grad_norm": 4.080533027648926, + "learning_rate": 0.0005943891891891891, + "loss": 26.0067, + "step": 450 + }, + { + "epoch": 0.2695417789757412, + "grad_norm": 4.703437805175781, + "learning_rate": 0.0005935783783783783, + "loss": 25.2921, + "step": 500 + }, + { + "epoch": 0.29649595687331537, + "grad_norm": 4.108513355255127, + "learning_rate": 0.0005927675675675675, + "loss": 24.7137, + "step": 550 + }, + { + "epoch": 0.32345013477088946, + "grad_norm": 3.727118968963623, + "learning_rate": 0.0005919567567567567, + "loss": 24.2733, + "step": 600 + }, + { + "epoch": 0.3504043126684636, + "grad_norm": 7.353233814239502, + "learning_rate": 0.0005911459459459459, + "loss": 23.7954, + "step": 650 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 6.545207977294922, + "learning_rate": 0.0005903351351351351, + "loss": 23.4134, + "step": 700 + }, + { + "epoch": 0.40431266846361186, + "grad_norm": 4.7703776359558105, + "learning_rate": 0.0005895243243243242, + "loss": 23.0314, + "step": 750 + }, + { + "epoch": 0.431266846361186, + "grad_norm": 5.744662761688232, + "learning_rate": 0.0005887135135135134, + "loss": 22.6726, + "step": 800 + }, + { + "epoch": 0.4582210242587601, + "grad_norm": 4.685823440551758, + "learning_rate": 0.0005879027027027026, + "loss": 22.3204, + "step": 850 + }, + { + "epoch": 0.48517520215633425, + "grad_norm": 4.1006999015808105, + "learning_rate": 0.0005870918918918918, + "loss": 22.0621, + "step": 900 + }, + { + "epoch": 0.5121293800539084, + "grad_norm": 4.774529457092285, + "learning_rate": 0.000586281081081081, + "loss": 21.7707, + "step": 950 + }, + { + "epoch": 0.5390835579514824, + "grad_norm": 6.737414836883545, + "learning_rate": 0.0005854702702702703, + "loss": 21.4198, + "step": 1000 + }, + { + "epoch": 0.5390835579514824, + "eval_accuracy": 0.29736351021476004, + "eval_loss": 4.217971324920654, + "eval_runtime": 184.1069, + "eval_samples_per_second": 97.834, + "eval_steps_per_second": 6.116, + "step": 1000 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 3.665602445602417, + "learning_rate": 0.0005846594594594594, + "loss": 21.278, + "step": 1050 + }, + { + "epoch": 0.5929919137466307, + "grad_norm": 4.725819110870361, + "learning_rate": 0.0005838486486486486, + "loss": 21.0817, + "step": 1100 + }, + { + "epoch": 0.6199460916442049, + "grad_norm": 2.7761611938476562, + "learning_rate": 0.0005830378378378377, + "loss": 20.9031, + "step": 1150 + }, + { + "epoch": 0.6469002695417789, + "grad_norm": 3.616361379623413, + "learning_rate": 0.0005822270270270269, + "loss": 20.7398, + "step": 1200 + }, + { + "epoch": 0.6738544474393531, + "grad_norm": 3.293269395828247, + "learning_rate": 0.0005814162162162161, + "loss": 20.6258, + "step": 1250 + }, + { + "epoch": 0.7008086253369272, + "grad_norm": 3.5149900913238525, + "learning_rate": 0.0005806054054054054, + "loss": 20.5009, + "step": 1300 + }, + { + "epoch": 0.7277628032345014, + "grad_norm": 3.3984060287475586, + "learning_rate": 0.0005797945945945946, + "loss": 20.3903, + "step": 1350 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 3.0169084072113037, + "learning_rate": 0.0005789837837837838, + "loss": 20.2629, + "step": 1400 + }, + { + "epoch": 0.7816711590296496, + "grad_norm": 3.1698763370513916, + "learning_rate": 0.0005781729729729729, + "loss": 20.1576, + "step": 1450 + }, + { + "epoch": 0.8086253369272237, + "grad_norm": 2.82014536857605, + "learning_rate": 0.0005773621621621621, + "loss": 20.074, + "step": 1500 + }, + { + "epoch": 0.8355795148247979, + "grad_norm": 3.329674482345581, + "learning_rate": 0.0005765513513513512, + "loss": 19.9387, + "step": 1550 + }, + { + "epoch": 0.862533692722372, + "grad_norm": 3.1232547760009766, + "learning_rate": 0.0005757405405405405, + "loss": 19.8412, + "step": 1600 + }, + { + "epoch": 0.889487870619946, + "grad_norm": 3.4831297397613525, + "learning_rate": 0.0005749297297297297, + "loss": 19.7664, + "step": 1650 + }, + { + "epoch": 0.9164420485175202, + "grad_norm": 3.227527618408203, + "learning_rate": 0.0005741189189189189, + "loss": 19.7121, + "step": 1700 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 2.779829740524292, + "learning_rate": 0.0005733081081081081, + "loss": 19.6274, + "step": 1750 + }, + { + "epoch": 0.9703504043126685, + "grad_norm": 2.9527933597564697, + "learning_rate": 0.0005724972972972973, + "loss": 19.5337, + "step": 1800 + }, + { + "epoch": 0.9973045822102425, + "grad_norm": 3.239156484603882, + "learning_rate": 0.0005716864864864864, + "loss": 19.4532, + "step": 1850 + }, + { + "epoch": 1.0242587601078168, + "grad_norm": 2.773244857788086, + "learning_rate": 0.0005708756756756756, + "loss": 19.1807, + "step": 1900 + }, + { + "epoch": 1.0512129380053907, + "grad_norm": 3.259617328643799, + "learning_rate": 0.0005700648648648648, + "loss": 19.1206, + "step": 1950 + }, + { + "epoch": 1.0781671159029649, + "grad_norm": 3.177616834640503, + "learning_rate": 0.000569254054054054, + "loss": 19.135, + "step": 2000 + }, + { + "epoch": 1.0781671159029649, + "eval_accuracy": 0.3386392111716781, + "eval_loss": 3.795865297317505, + "eval_runtime": 182.4488, + "eval_samples_per_second": 98.724, + "eval_steps_per_second": 6.172, + "step": 2000 + }, + { + "epoch": 1.105121293800539, + "grad_norm": 2.7435619831085205, + "learning_rate": 0.0005684432432432432, + "loss": 19.0787, + "step": 2050 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 2.1673481464385986, + "learning_rate": 0.0005676324324324324, + "loss": 19.0351, + "step": 2100 + }, + { + "epoch": 1.1590296495956873, + "grad_norm": 2.312748670578003, + "learning_rate": 0.0005668216216216216, + "loss": 18.9722, + "step": 2150 + }, + { + "epoch": 1.1859838274932615, + "grad_norm": 2.636430501937866, + "learning_rate": 0.0005660108108108108, + "loss": 18.9062, + "step": 2200 + }, + { + "epoch": 1.2129380053908356, + "grad_norm": 2.2114217281341553, + "learning_rate": 0.0005652, + "loss": 18.8991, + "step": 2250 + }, + { + "epoch": 1.2398921832884098, + "grad_norm": 2.093247652053833, + "learning_rate": 0.0005643891891891892, + "loss": 18.8394, + "step": 2300 + }, + { + "epoch": 1.266846361185984, + "grad_norm": 1.8652660846710205, + "learning_rate": 0.0005635783783783783, + "loss": 18.7991, + "step": 2350 + }, + { + "epoch": 1.2938005390835579, + "grad_norm": 2.7426042556762695, + "learning_rate": 0.0005627675675675675, + "loss": 18.8135, + "step": 2400 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 2.2966415882110596, + "learning_rate": 0.0005619567567567567, + "loss": 18.7167, + "step": 2450 + }, + { + "epoch": 1.3477088948787062, + "grad_norm": 2.343276023864746, + "learning_rate": 0.0005611459459459459, + "loss": 18.711, + "step": 2500 + }, + { + "epoch": 1.3746630727762803, + "grad_norm": 2.7991366386413574, + "learning_rate": 0.0005603351351351351, + "loss": 18.6614, + "step": 2550 + }, + { + "epoch": 1.4016172506738545, + "grad_norm": 2.226125717163086, + "learning_rate": 0.0005595243243243243, + "loss": 18.6536, + "step": 2600 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 2.165311098098755, + "learning_rate": 0.0005587135135135135, + "loss": 18.5774, + "step": 2650 + }, + { + "epoch": 1.4555256064690028, + "grad_norm": 2.3691980838775635, + "learning_rate": 0.0005579027027027027, + "loss": 18.5688, + "step": 2700 + }, + { + "epoch": 1.482479784366577, + "grad_norm": 2.169313669204712, + "learning_rate": 0.0005570918918918918, + "loss": 18.5461, + "step": 2750 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 1.9214165210723877, + "learning_rate": 0.000556281081081081, + "loss": 18.5095, + "step": 2800 + }, + { + "epoch": 1.536388140161725, + "grad_norm": 2.0874290466308594, + "learning_rate": 0.0005554702702702702, + "loss": 18.5082, + "step": 2850 + }, + { + "epoch": 1.5633423180592994, + "grad_norm": 2.0279717445373535, + "learning_rate": 0.0005546594594594594, + "loss": 18.4448, + "step": 2900 + }, + { + "epoch": 1.5902964959568733, + "grad_norm": 1.8327255249023438, + "learning_rate": 0.0005538486486486486, + "loss": 18.4225, + "step": 2950 + }, + { + "epoch": 1.6172506738544474, + "grad_norm": 2.1154870986938477, + "learning_rate": 0.0005530378378378378, + "loss": 18.3835, + "step": 3000 + }, + { + "epoch": 1.6172506738544474, + "eval_accuracy": 0.353348908946547, + "eval_loss": 3.639807939529419, + "eval_runtime": 179.8343, + "eval_samples_per_second": 100.159, + "eval_steps_per_second": 6.261, + "step": 3000 + }, + { + "epoch": 1.6442048517520216, + "grad_norm": 1.7121838331222534, + "learning_rate": 0.000552227027027027, + "loss": 18.3714, + "step": 3050 + }, + { + "epoch": 1.6711590296495957, + "grad_norm": 1.950905680656433, + "learning_rate": 0.0005514162162162161, + "loss": 18.3744, + "step": 3100 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 1.8936762809753418, + "learning_rate": 0.0005506054054054053, + "loss": 18.3124, + "step": 3150 + }, + { + "epoch": 1.7250673854447438, + "grad_norm": 2.1329703330993652, + "learning_rate": 0.0005497945945945945, + "loss": 18.2764, + "step": 3200 + }, + { + "epoch": 1.7520215633423182, + "grad_norm": 1.8802683353424072, + "learning_rate": 0.0005489837837837837, + "loss": 18.2335, + "step": 3250 + }, + { + "epoch": 1.778975741239892, + "grad_norm": 1.9462956190109253, + "learning_rate": 0.0005481729729729729, + "loss": 18.2418, + "step": 3300 + }, + { + "epoch": 1.8059299191374663, + "grad_norm": 1.771466851234436, + "learning_rate": 0.0005473621621621621, + "loss": 18.1747, + "step": 3350 + }, + { + "epoch": 1.8328840970350404, + "grad_norm": 1.7610357999801636, + "learning_rate": 0.0005465513513513514, + "loss": 18.1823, + "step": 3400 + }, + { + "epoch": 1.8598382749326146, + "grad_norm": 1.9204082489013672, + "learning_rate": 0.0005457405405405405, + "loss": 18.106, + "step": 3450 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 2.1296563148498535, + "learning_rate": 0.0005449297297297296, + "loss": 18.1452, + "step": 3500 + }, + { + "epoch": 1.9137466307277629, + "grad_norm": 2.0194156169891357, + "learning_rate": 0.0005441189189189188, + "loss": 18.1186, + "step": 3550 + }, + { + "epoch": 1.940700808625337, + "grad_norm": 2.01413631439209, + "learning_rate": 0.000543308108108108, + "loss": 18.0921, + "step": 3600 + }, + { + "epoch": 1.967654986522911, + "grad_norm": 1.983556866645813, + "learning_rate": 0.0005424972972972972, + "loss": 18.0646, + "step": 3650 + }, + { + "epoch": 1.9946091644204853, + "grad_norm": 1.752462387084961, + "learning_rate": 0.0005416864864864865, + "loss": 18.0691, + "step": 3700 + }, + { + "epoch": 2.0215633423180592, + "grad_norm": 1.784954309463501, + "learning_rate": 0.0005408756756756757, + "loss": 17.6995, + "step": 3750 + }, + { + "epoch": 2.0485175202156336, + "grad_norm": 1.8124125003814697, + "learning_rate": 0.0005400648648648649, + "loss": 17.6682, + "step": 3800 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 1.8763898611068726, + "learning_rate": 0.000539254054054054, + "loss": 17.6483, + "step": 3850 + }, + { + "epoch": 2.1024258760107815, + "grad_norm": 2.0498359203338623, + "learning_rate": 0.0005384432432432431, + "loss": 17.6595, + "step": 3900 + }, + { + "epoch": 2.129380053908356, + "grad_norm": 2.153012990951538, + "learning_rate": 0.0005376324324324323, + "loss": 17.6197, + "step": 3950 + }, + { + "epoch": 2.1563342318059298, + "grad_norm": 1.7588380575180054, + "learning_rate": 0.0005368216216216215, + "loss": 17.6178, + "step": 4000 + }, + { + "epoch": 2.1563342318059298, + "eval_accuracy": 0.36234291294388216, + "eval_loss": 3.5501065254211426, + "eval_runtime": 179.4768, + "eval_samples_per_second": 100.358, + "eval_steps_per_second": 6.274, + "step": 4000 + }, + { + "epoch": 2.183288409703504, + "grad_norm": 1.8876287937164307, + "learning_rate": 0.0005360108108108108, + "loss": 17.5874, + "step": 4050 + }, + { + "epoch": 2.210242587601078, + "grad_norm": 1.8467564582824707, + "learning_rate": 0.0005352, + "loss": 17.6174, + "step": 4100 + }, + { + "epoch": 2.2371967654986524, + "grad_norm": 1.6003799438476562, + "learning_rate": 0.0005343891891891892, + "loss": 17.6635, + "step": 4150 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 1.851845145225525, + "learning_rate": 0.0005335783783783784, + "loss": 17.5831, + "step": 4200 + }, + { + "epoch": 2.2911051212938007, + "grad_norm": 1.834288477897644, + "learning_rate": 0.0005327675675675675, + "loss": 17.6081, + "step": 4250 + }, + { + "epoch": 2.3180592991913747, + "grad_norm": 1.741883397102356, + "learning_rate": 0.0005319567567567566, + "loss": 17.5878, + "step": 4300 + }, + { + "epoch": 2.3450134770889486, + "grad_norm": 2.0378310680389404, + "learning_rate": 0.0005311459459459459, + "loss": 17.5712, + "step": 4350 + }, + { + "epoch": 2.371967654986523, + "grad_norm": 1.803305983543396, + "learning_rate": 0.0005303351351351351, + "loss": 17.5698, + "step": 4400 + }, + { + "epoch": 2.398921832884097, + "grad_norm": 1.83433198928833, + "learning_rate": 0.0005295243243243243, + "loss": 17.5659, + "step": 4450 + }, + { + "epoch": 2.4258760107816713, + "grad_norm": 1.8511073589324951, + "learning_rate": 0.0005287135135135135, + "loss": 17.5527, + "step": 4500 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 1.9463386535644531, + "learning_rate": 0.0005279027027027027, + "loss": 17.5418, + "step": 4550 + }, + { + "epoch": 2.4797843665768196, + "grad_norm": 1.6681801080703735, + "learning_rate": 0.0005270918918918919, + "loss": 17.5826, + "step": 4600 + }, + { + "epoch": 2.5067385444743935, + "grad_norm": 1.8517515659332275, + "learning_rate": 0.000526281081081081, + "loss": 17.5022, + "step": 4650 + }, + { + "epoch": 2.533692722371968, + "grad_norm": 1.640101671218872, + "learning_rate": 0.0005254702702702702, + "loss": 17.5124, + "step": 4700 + }, + { + "epoch": 2.560646900269542, + "grad_norm": 1.7293671369552612, + "learning_rate": 0.0005246594594594594, + "loss": 17.5609, + "step": 4750 + }, + { + "epoch": 2.5876010781671157, + "grad_norm": 1.7008033990859985, + "learning_rate": 0.0005238486486486486, + "loss": 17.5315, + "step": 4800 + }, + { + "epoch": 2.61455525606469, + "grad_norm": 1.646754503250122, + "learning_rate": 0.0005230378378378378, + "loss": 17.4806, + "step": 4850 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 1.6908066272735596, + "learning_rate": 0.000522227027027027, + "loss": 17.5326, + "step": 4900 + }, + { + "epoch": 2.6684636118598384, + "grad_norm": 1.6482735872268677, + "learning_rate": 0.0005214162162162162, + "loss": 17.4713, + "step": 4950 + }, + { + "epoch": 2.6954177897574123, + "grad_norm": 1.7912228107452393, + "learning_rate": 0.0005206054054054054, + "loss": 17.4994, + "step": 5000 + }, + { + "epoch": 2.6954177897574123, + "eval_accuracy": 0.36892821615335375, + "eval_loss": 3.4884941577911377, + "eval_runtime": 179.4879, + "eval_samples_per_second": 100.352, + "eval_steps_per_second": 6.273, + "step": 5000 + }, + { + "epoch": 2.7223719676549867, + "grad_norm": 1.4762933254241943, + "learning_rate": 0.0005197945945945946, + "loss": 17.4871, + "step": 5050 + }, + { + "epoch": 2.7493261455525606, + "grad_norm": 1.655876636505127, + "learning_rate": 0.0005189837837837837, + "loss": 17.4543, + "step": 5100 + }, + { + "epoch": 2.776280323450135, + "grad_norm": 1.5306168794631958, + "learning_rate": 0.0005181729729729729, + "loss": 17.4413, + "step": 5150 + }, + { + "epoch": 2.803234501347709, + "grad_norm": 1.7954528331756592, + "learning_rate": 0.0005173621621621621, + "loss": 17.3965, + "step": 5200 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 1.7866185903549194, + "learning_rate": 0.0005165513513513513, + "loss": 17.4073, + "step": 5250 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.7150987386703491, + "learning_rate": 0.0005157405405405405, + "loss": 17.4216, + "step": 5300 + }, + { + "epoch": 2.884097035040431, + "grad_norm": 1.7467018365859985, + "learning_rate": 0.0005149297297297297, + "loss": 17.3859, + "step": 5350 + }, + { + "epoch": 2.9110512129380055, + "grad_norm": 1.8833541870117188, + "learning_rate": 0.0005141189189189189, + "loss": 17.3729, + "step": 5400 + }, + { + "epoch": 2.9380053908355794, + "grad_norm": 1.8507732152938843, + "learning_rate": 0.000513308108108108, + "loss": 17.3817, + "step": 5450 + }, + { + "epoch": 2.964959568733154, + "grad_norm": 1.6320147514343262, + "learning_rate": 0.0005124972972972972, + "loss": 17.3824, + "step": 5500 + }, + { + "epoch": 2.9919137466307277, + "grad_norm": 1.676006555557251, + "learning_rate": 0.0005116864864864864, + "loss": 17.3853, + "step": 5550 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 1.6603399515151978, + "learning_rate": 0.0005108756756756756, + "loss": 17.0273, + "step": 5600 + }, + { + "epoch": 3.045822102425876, + "grad_norm": 1.6063870191574097, + "learning_rate": 0.0005100648648648648, + "loss": 16.9003, + "step": 5650 + }, + { + "epoch": 3.07277628032345, + "grad_norm": 1.7241891622543335, + "learning_rate": 0.000509254054054054, + "loss": 16.8804, + "step": 5700 + }, + { + "epoch": 3.0997304582210243, + "grad_norm": 1.7697309255599976, + "learning_rate": 0.0005084432432432432, + "loss": 16.9783, + "step": 5750 + }, + { + "epoch": 3.1266846361185983, + "grad_norm": 1.793578863143921, + "learning_rate": 0.0005076324324324324, + "loss": 16.9443, + "step": 5800 + }, + { + "epoch": 3.1536388140161726, + "grad_norm": 1.615790843963623, + "learning_rate": 0.0005068216216216217, + "loss": 16.9679, + "step": 5850 + }, + { + "epoch": 3.1805929919137466, + "grad_norm": 1.551426887512207, + "learning_rate": 0.0005060108108108107, + "loss": 16.9544, + "step": 5900 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 1.5588505268096924, + "learning_rate": 0.0005051999999999999, + "loss": 16.9667, + "step": 5950 + }, + { + "epoch": 3.234501347708895, + "grad_norm": 1.5159425735473633, + "learning_rate": 0.0005043891891891891, + "loss": 16.9783, + "step": 6000 + }, + { + "epoch": 3.234501347708895, + "eval_accuracy": 0.3732482324243068, + "eval_loss": 3.449159860610962, + "eval_runtime": 179.7805, + "eval_samples_per_second": 100.189, + "eval_steps_per_second": 6.263, + "step": 6000 + }, + { + "epoch": 3.2614555256064692, + "grad_norm": 1.9565342664718628, + "learning_rate": 0.0005035783783783783, + "loss": 17.0079, + "step": 6050 + }, + { + "epoch": 3.288409703504043, + "grad_norm": 1.7779537439346313, + "learning_rate": 0.0005027675675675675, + "loss": 16.9515, + "step": 6100 + }, + { + "epoch": 3.315363881401617, + "grad_norm": 1.6085190773010254, + "learning_rate": 0.0005019567567567568, + "loss": 17.0043, + "step": 6150 + }, + { + "epoch": 3.3423180592991915, + "grad_norm": 1.5527286529541016, + "learning_rate": 0.000501145945945946, + "loss": 16.9876, + "step": 6200 + }, + { + "epoch": 3.3692722371967654, + "grad_norm": 1.6721524000167847, + "learning_rate": 0.0005003351351351352, + "loss": 17.0027, + "step": 6250 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 1.5313137769699097, + "learning_rate": 0.0004995243243243242, + "loss": 17.0007, + "step": 6300 + }, + { + "epoch": 3.4231805929919137, + "grad_norm": 1.630896806716919, + "learning_rate": 0.0004987135135135134, + "loss": 16.9898, + "step": 6350 + }, + { + "epoch": 3.450134770889488, + "grad_norm": 1.446964979171753, + "learning_rate": 0.0004979027027027026, + "loss": 17.0057, + "step": 6400 + }, + { + "epoch": 3.477088948787062, + "grad_norm": 1.7500450611114502, + "learning_rate": 0.0004970918918918919, + "loss": 16.9788, + "step": 6450 + }, + { + "epoch": 3.5040431266846364, + "grad_norm": 1.640507459640503, + "learning_rate": 0.0004962810810810811, + "loss": 16.9703, + "step": 6500 + }, + { + "epoch": 3.5309973045822103, + "grad_norm": 1.54189133644104, + "learning_rate": 0.0004954702702702703, + "loss": 16.9812, + "step": 6550 + }, + { + "epoch": 3.557951482479784, + "grad_norm": 1.5099515914916992, + "learning_rate": 0.0004946594594594595, + "loss": 16.9529, + "step": 6600 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 1.5417841672897339, + "learning_rate": 0.0004938486486486486, + "loss": 16.9888, + "step": 6650 + }, + { + "epoch": 3.6118598382749325, + "grad_norm": 1.670837640762329, + "learning_rate": 0.0004930378378378377, + "loss": 16.9116, + "step": 6700 + }, + { + "epoch": 3.638814016172507, + "grad_norm": 1.6270161867141724, + "learning_rate": 0.0004922270270270269, + "loss": 16.9732, + "step": 6750 + }, + { + "epoch": 3.665768194070081, + "grad_norm": 1.6039149761199951, + "learning_rate": 0.0004914162162162162, + "loss": 16.9689, + "step": 6800 + }, + { + "epoch": 3.6927223719676547, + "grad_norm": 1.5096150636672974, + "learning_rate": 0.0004906054054054054, + "loss": 16.984, + "step": 6850 + }, + { + "epoch": 3.719676549865229, + "grad_norm": 1.5823392868041992, + "learning_rate": 0.0004897945945945946, + "loss": 16.9951, + "step": 6900 + }, + { + "epoch": 3.7466307277628035, + "grad_norm": 1.5228644609451294, + "learning_rate": 0.0004889837837837838, + "loss": 16.9585, + "step": 6950 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 1.5134717226028442, + "learning_rate": 0.00048817297297297296, + "loss": 16.9938, + "step": 7000 + }, + { + "epoch": 3.7735849056603774, + "eval_accuracy": 0.3765476201340876, + "eval_loss": 3.413353681564331, + "eval_runtime": 183.6576, + "eval_samples_per_second": 98.074, + "eval_steps_per_second": 6.131, + "step": 7000 + }, + { + "epoch": 3.8005390835579513, + "grad_norm": 1.524044156074524, + "learning_rate": 0.00048736216216216214, + "loss": 16.9498, + "step": 7050 + }, + { + "epoch": 3.8274932614555257, + "grad_norm": 1.5392922163009644, + "learning_rate": 0.0004865513513513513, + "loss": 16.9014, + "step": 7100 + }, + { + "epoch": 3.8544474393530996, + "grad_norm": 1.5926542282104492, + "learning_rate": 0.00048574054054054046, + "loss": 16.9565, + "step": 7150 + }, + { + "epoch": 3.881401617250674, + "grad_norm": 1.546481966972351, + "learning_rate": 0.00048492972972972965, + "loss": 16.9432, + "step": 7200 + }, + { + "epoch": 3.908355795148248, + "grad_norm": 1.512964129447937, + "learning_rate": 0.0004841189189189189, + "loss": 16.9287, + "step": 7250 + }, + { + "epoch": 3.935309973045822, + "grad_norm": 1.6536102294921875, + "learning_rate": 0.0004833081081081081, + "loss": 16.8681, + "step": 7300 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 1.5245862007141113, + "learning_rate": 0.00048249729729729727, + "loss": 16.9772, + "step": 7350 + }, + { + "epoch": 3.9892183288409706, + "grad_norm": 1.6729086637496948, + "learning_rate": 0.00048168648648648645, + "loss": 16.9178, + "step": 7400 + }, + { + "epoch": 4.0161725067385445, + "grad_norm": 1.7279188632965088, + "learning_rate": 0.00048087567567567564, + "loss": 16.6391, + "step": 7450 + }, + { + "epoch": 4.0431266846361185, + "grad_norm": 1.4739909172058105, + "learning_rate": 0.0004800648648648648, + "loss": 16.469, + "step": 7500 + }, + { + "epoch": 4.070080862533692, + "grad_norm": 1.708237648010254, + "learning_rate": 0.000479254054054054, + "loss": 16.4581, + "step": 7550 + }, + { + "epoch": 4.097035040431267, + "grad_norm": 1.564699649810791, + "learning_rate": 0.0004784432432432432, + "loss": 16.5009, + "step": 7600 + }, + { + "epoch": 4.123989218328841, + "grad_norm": 1.540399193763733, + "learning_rate": 0.0004776324324324324, + "loss": 16.5215, + "step": 7650 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 1.4679442644119263, + "learning_rate": 0.0004768216216216216, + "loss": 16.5071, + "step": 7700 + }, + { + "epoch": 4.177897574123989, + "grad_norm": 1.6866313219070435, + "learning_rate": 0.00047601081081081076, + "loss": 16.5778, + "step": 7750 + }, + { + "epoch": 4.204851752021563, + "grad_norm": 1.5899074077606201, + "learning_rate": 0.0004752, + "loss": 16.5755, + "step": 7800 + }, + { + "epoch": 4.231805929919138, + "grad_norm": 1.5043095350265503, + "learning_rate": 0.0004743891891891892, + "loss": 16.5264, + "step": 7850 + }, + { + "epoch": 4.258760107816712, + "grad_norm": 1.496482253074646, + "learning_rate": 0.0004735783783783783, + "loss": 16.5654, + "step": 7900 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 1.5001450777053833, + "learning_rate": 0.0004727675675675675, + "loss": 16.6061, + "step": 7950 + }, + { + "epoch": 4.3126684636118595, + "grad_norm": 1.57184898853302, + "learning_rate": 0.0004719567567567567, + "loss": 16.5676, + "step": 8000 + }, + { + "epoch": 4.3126684636118595, + "eval_accuracy": 0.37935440300073925, + "eval_loss": 3.3927013874053955, + "eval_runtime": 183.3993, + "eval_samples_per_second": 98.212, + "eval_steps_per_second": 6.14, + "step": 8000 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 1.6450532674789429, + "learning_rate": 0.0004711459459459459, + "loss": 16.5523, + "step": 8050 + }, + { + "epoch": 4.366576819407008, + "grad_norm": 1.6096779108047485, + "learning_rate": 0.0004703351351351351, + "loss": 16.6157, + "step": 8100 + }, + { + "epoch": 4.393530997304582, + "grad_norm": 1.5400686264038086, + "learning_rate": 0.0004695243243243243, + "loss": 16.5685, + "step": 8150 + }, + { + "epoch": 4.420485175202156, + "grad_norm": 1.546473503112793, + "learning_rate": 0.0004687135135135135, + "loss": 16.5769, + "step": 8200 + }, + { + "epoch": 4.44743935309973, + "grad_norm": 1.4172500371932983, + "learning_rate": 0.0004679027027027027, + "loss": 16.6169, + "step": 8250 + }, + { + "epoch": 4.474393530997305, + "grad_norm": 1.545454740524292, + "learning_rate": 0.0004670918918918918, + "loss": 16.614, + "step": 8300 + }, + { + "epoch": 4.501347708894879, + "grad_norm": 1.4791597127914429, + "learning_rate": 0.000466281081081081, + "loss": 16.6006, + "step": 8350 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 1.5513116121292114, + "learning_rate": 0.0004654702702702702, + "loss": 16.6113, + "step": 8400 + }, + { + "epoch": 4.555256064690027, + "grad_norm": 1.5687803030014038, + "learning_rate": 0.00046465945945945944, + "loss": 16.6075, + "step": 8450 + }, + { + "epoch": 4.5822102425876015, + "grad_norm": 1.5172430276870728, + "learning_rate": 0.0004638486486486486, + "loss": 16.5888, + "step": 8500 + }, + { + "epoch": 4.609164420485175, + "grad_norm": 1.5713242292404175, + "learning_rate": 0.0004630378378378378, + "loss": 16.5863, + "step": 8550 + }, + { + "epoch": 4.636118598382749, + "grad_norm": 1.5270150899887085, + "learning_rate": 0.000462227027027027, + "loss": 16.6009, + "step": 8600 + }, + { + "epoch": 4.663072776280323, + "grad_norm": 1.6063131093978882, + "learning_rate": 0.0004614162162162162, + "loss": 16.6324, + "step": 8650 + }, + { + "epoch": 4.690026954177897, + "grad_norm": 1.5824447870254517, + "learning_rate": 0.0004606054054054053, + "loss": 16.6239, + "step": 8700 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 1.4564862251281738, + "learning_rate": 0.00045979459459459456, + "loss": 16.5982, + "step": 8750 + }, + { + "epoch": 4.743935309973046, + "grad_norm": 1.5050652027130127, + "learning_rate": 0.00045898378378378375, + "loss": 16.5798, + "step": 8800 + }, + { + "epoch": 4.77088948787062, + "grad_norm": 1.4407291412353516, + "learning_rate": 0.00045817297297297293, + "loss": 16.6124, + "step": 8850 + }, + { + "epoch": 4.797843665768194, + "grad_norm": 1.6084630489349365, + "learning_rate": 0.0004573621621621621, + "loss": 16.5858, + "step": 8900 + }, + { + "epoch": 4.824797843665769, + "grad_norm": 1.5382035970687866, + "learning_rate": 0.0004565513513513513, + "loss": 16.5519, + "step": 8950 + }, + { + "epoch": 4.8517520215633425, + "grad_norm": 1.5417052507400513, + "learning_rate": 0.00045574054054054055, + "loss": 16.623, + "step": 9000 + }, + { + "epoch": 4.8517520215633425, + "eval_accuracy": 0.3819663820553638, + "eval_loss": 3.366989850997925, + "eval_runtime": 183.4323, + "eval_samples_per_second": 98.194, + "eval_steps_per_second": 6.139, + "step": 9000 + }, + { + "epoch": 4.878706199460916, + "grad_norm": 1.4613817930221558, + "learning_rate": 0.00045492972972972973, + "loss": 16.6013, + "step": 9050 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 1.5268949270248413, + "learning_rate": 0.00045411891891891887, + "loss": 16.5575, + "step": 9100 + }, + { + "epoch": 4.932614555256064, + "grad_norm": 1.5352474451065063, + "learning_rate": 0.00045330810810810805, + "loss": 16.5424, + "step": 9150 + }, + { + "epoch": 4.959568733153639, + "grad_norm": 1.538854718208313, + "learning_rate": 0.00045249729729729724, + "loss": 16.5602, + "step": 9200 + }, + { + "epoch": 4.986522911051213, + "grad_norm": 1.450854778289795, + "learning_rate": 0.00045168648648648643, + "loss": 16.6054, + "step": 9250 + }, + { + "epoch": 5.013477088948787, + "grad_norm": 1.4519016742706299, + "learning_rate": 0.0004508756756756756, + "loss": 16.3572, + "step": 9300 + }, + { + "epoch": 5.040431266846361, + "grad_norm": 1.5664149522781372, + "learning_rate": 0.00045006486486486486, + "loss": 16.1281, + "step": 9350 + }, + { + "epoch": 5.067385444743936, + "grad_norm": 1.5345157384872437, + "learning_rate": 0.00044925405405405404, + "loss": 16.1432, + "step": 9400 + }, + { + "epoch": 5.09433962264151, + "grad_norm": 1.5099859237670898, + "learning_rate": 0.00044844324324324323, + "loss": 16.1704, + "step": 9450 + }, + { + "epoch": 5.121293800539084, + "grad_norm": 1.5200412273406982, + "learning_rate": 0.00044763243243243236, + "loss": 16.1838, + "step": 9500 + }, + { + "epoch": 5.1482479784366575, + "grad_norm": 1.4238975048065186, + "learning_rate": 0.00044682162162162155, + "loss": 16.1983, + "step": 9550 + }, + { + "epoch": 5.175202156334231, + "grad_norm": 1.560627818107605, + "learning_rate": 0.00044601081081081074, + "loss": 16.2052, + "step": 9600 + }, + { + "epoch": 5.202156334231806, + "grad_norm": 1.5408422946929932, + "learning_rate": 0.0004452, + "loss": 16.2084, + "step": 9650 + }, + { + "epoch": 5.22911051212938, + "grad_norm": 1.4684669971466064, + "learning_rate": 0.00044438918918918917, + "loss": 16.2638, + "step": 9700 + }, + { + "epoch": 5.256064690026954, + "grad_norm": 1.408656120300293, + "learning_rate": 0.00044357837837837835, + "loss": 16.2383, + "step": 9750 + }, + { + "epoch": 5.283018867924528, + "grad_norm": 1.4579087495803833, + "learning_rate": 0.00044276756756756754, + "loss": 16.2952, + "step": 9800 + }, + { + "epoch": 5.309973045822103, + "grad_norm": 1.5319942235946655, + "learning_rate": 0.00044195675675675673, + "loss": 16.2531, + "step": 9850 + }, + { + "epoch": 5.336927223719677, + "grad_norm": 1.6440680027008057, + "learning_rate": 0.00044114594594594586, + "loss": 16.2352, + "step": 9900 + }, + { + "epoch": 5.363881401617251, + "grad_norm": 1.5315848588943481, + "learning_rate": 0.0004403351351351351, + "loss": 16.2721, + "step": 9950 + }, + { + "epoch": 5.390835579514825, + "grad_norm": 1.5545754432678223, + "learning_rate": 0.0004395243243243243, + "loss": 16.268, + "step": 10000 + }, + { + "epoch": 5.390835579514825, + "eval_accuracy": 0.38350960199180106, + "eval_loss": 3.3579905033111572, + "eval_runtime": 183.2768, + "eval_samples_per_second": 98.278, + "eval_steps_per_second": 6.144, + "step": 10000 + }, + { + "epoch": 5.4177897574123985, + "grad_norm": 1.5124561786651611, + "learning_rate": 0.0004387135135135135, + "loss": 16.2922, + "step": 10050 + }, + { + "epoch": 5.444743935309973, + "grad_norm": 1.5418624877929688, + "learning_rate": 0.00043790270270270266, + "loss": 16.3084, + "step": 10100 + }, + { + "epoch": 5.471698113207547, + "grad_norm": 1.556877851486206, + "learning_rate": 0.00043709189189189185, + "loss": 16.2663, + "step": 10150 + }, + { + "epoch": 5.498652291105121, + "grad_norm": 1.4898020029067993, + "learning_rate": 0.00043628108108108104, + "loss": 16.3106, + "step": 10200 + }, + { + "epoch": 5.525606469002695, + "grad_norm": 1.553478717803955, + "learning_rate": 0.0004354702702702703, + "loss": 16.2869, + "step": 10250 + }, + { + "epoch": 5.55256064690027, + "grad_norm": 1.482102870941162, + "learning_rate": 0.0004346594594594594, + "loss": 16.2996, + "step": 10300 + }, + { + "epoch": 5.579514824797844, + "grad_norm": 1.4998143911361694, + "learning_rate": 0.0004338486486486486, + "loss": 16.3236, + "step": 10350 + }, + { + "epoch": 5.606469002695418, + "grad_norm": 1.3639193773269653, + "learning_rate": 0.0004330378378378378, + "loss": 16.3022, + "step": 10400 + }, + { + "epoch": 5.633423180592992, + "grad_norm": 1.5346744060516357, + "learning_rate": 0.00043222702702702697, + "loss": 16.3686, + "step": 10450 + }, + { + "epoch": 5.660377358490566, + "grad_norm": 1.5229798555374146, + "learning_rate": 0.00043141621621621616, + "loss": 16.339, + "step": 10500 + }, + { + "epoch": 5.6873315363881405, + "grad_norm": 1.6072639226913452, + "learning_rate": 0.0004306054054054054, + "loss": 16.3501, + "step": 10550 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 1.42572021484375, + "learning_rate": 0.0004297945945945946, + "loss": 16.3437, + "step": 10600 + }, + { + "epoch": 5.741239892183288, + "grad_norm": 1.48056960105896, + "learning_rate": 0.0004289837837837838, + "loss": 16.3218, + "step": 10650 + }, + { + "epoch": 5.768194070080862, + "grad_norm": 1.6410192251205444, + "learning_rate": 0.0004281729729729729, + "loss": 16.301, + "step": 10700 + }, + { + "epoch": 5.795148247978437, + "grad_norm": 1.4807794094085693, + "learning_rate": 0.0004273621621621621, + "loss": 16.2993, + "step": 10750 + }, + { + "epoch": 5.822102425876011, + "grad_norm": 1.5122348070144653, + "learning_rate": 0.0004265513513513513, + "loss": 16.3178, + "step": 10800 + }, + { + "epoch": 5.849056603773585, + "grad_norm": 1.4910032749176025, + "learning_rate": 0.0004257405405405405, + "loss": 16.3674, + "step": 10850 + }, + { + "epoch": 5.876010781671159, + "grad_norm": 1.449251413345337, + "learning_rate": 0.0004249297297297297, + "loss": 16.3492, + "step": 10900 + }, + { + "epoch": 5.902964959568733, + "grad_norm": 1.4819191694259644, + "learning_rate": 0.0004241189189189189, + "loss": 16.3509, + "step": 10950 + }, + { + "epoch": 5.929919137466308, + "grad_norm": 1.4536081552505493, + "learning_rate": 0.0004233081081081081, + "loss": 16.3595, + "step": 11000 + }, + { + "epoch": 5.929919137466308, + "eval_accuracy": 0.3857778223954198, + "eval_loss": 3.337559938430786, + "eval_runtime": 183.0486, + "eval_samples_per_second": 98.4, + "eval_steps_per_second": 6.151, + "step": 11000 + }, + { + "epoch": 5.9568733153638815, + "grad_norm": 1.6155593395233154, + "learning_rate": 0.00042249729729729727, + "loss": 16.3047, + "step": 11050 + }, + { + "epoch": 5.9838274932614555, + "grad_norm": 1.4833221435546875, + "learning_rate": 0.0004216864864864864, + "loss": 16.3674, + "step": 11100 + }, + { + "epoch": 6.010781671159029, + "grad_norm": 1.4600006341934204, + "learning_rate": 0.0004208756756756756, + "loss": 16.168, + "step": 11150 + }, + { + "epoch": 6.037735849056604, + "grad_norm": 1.5126168727874756, + "learning_rate": 0.00042006486486486483, + "loss": 15.8955, + "step": 11200 + }, + { + "epoch": 6.064690026954178, + "grad_norm": 1.5764695405960083, + "learning_rate": 0.000419254054054054, + "loss": 15.8718, + "step": 11250 + }, + { + "epoch": 6.091644204851752, + "grad_norm": 1.6239944696426392, + "learning_rate": 0.0004184432432432432, + "loss": 15.8961, + "step": 11300 + }, + { + "epoch": 6.118598382749326, + "grad_norm": 1.5485799312591553, + "learning_rate": 0.0004176324324324324, + "loss": 15.9413, + "step": 11350 + }, + { + "epoch": 6.1455525606469, + "grad_norm": 1.5715981721878052, + "learning_rate": 0.0004168216216216216, + "loss": 15.9672, + "step": 11400 + }, + { + "epoch": 6.172506738544475, + "grad_norm": 1.5469777584075928, + "learning_rate": 0.0004160108108108108, + "loss": 15.9807, + "step": 11450 + }, + { + "epoch": 6.199460916442049, + "grad_norm": 1.4775831699371338, + "learning_rate": 0.00041519999999999995, + "loss": 16.0002, + "step": 11500 + }, + { + "epoch": 6.226415094339623, + "grad_norm": 1.545695424079895, + "learning_rate": 0.00041438918918918914, + "loss": 16.028, + "step": 11550 + }, + { + "epoch": 6.2533692722371965, + "grad_norm": 1.4563783407211304, + "learning_rate": 0.00041357837837837833, + "loss": 16.0287, + "step": 11600 + }, + { + "epoch": 6.280323450134771, + "grad_norm": 1.4710512161254883, + "learning_rate": 0.0004127675675675675, + "loss": 16.0216, + "step": 11650 + }, + { + "epoch": 6.307277628032345, + "grad_norm": 1.4847939014434814, + "learning_rate": 0.0004119567567567567, + "loss": 16.0212, + "step": 11700 + }, + { + "epoch": 6.334231805929919, + "grad_norm": 1.4909292459487915, + "learning_rate": 0.00041114594594594594, + "loss": 16.0188, + "step": 11750 + }, + { + "epoch": 6.361185983827493, + "grad_norm": 1.4875826835632324, + "learning_rate": 0.00041033513513513513, + "loss": 16.0297, + "step": 11800 + }, + { + "epoch": 6.388140161725067, + "grad_norm": 1.471168041229248, + "learning_rate": 0.0004095243243243243, + "loss": 16.0682, + "step": 11850 + }, + { + "epoch": 6.415094339622642, + "grad_norm": 1.4281312227249146, + "learning_rate": 0.00040871351351351345, + "loss": 16.0746, + "step": 11900 + }, + { + "epoch": 6.442048517520216, + "grad_norm": 1.4156763553619385, + "learning_rate": 0.00040790270270270264, + "loss": 16.0395, + "step": 11950 + }, + { + "epoch": 6.46900269541779, + "grad_norm": 1.5514986515045166, + "learning_rate": 0.0004070918918918918, + "loss": 16.0848, + "step": 12000 + }, + { + "epoch": 6.46900269541779, + "eval_accuracy": 0.38665525440095816, + "eval_loss": 3.334074020385742, + "eval_runtime": 183.4671, + "eval_samples_per_second": 98.176, + "eval_steps_per_second": 6.137, + "step": 12000 + }, + { + "epoch": 6.495956873315364, + "grad_norm": 1.50123929977417, + "learning_rate": 0.000406281081081081, + "loss": 16.0249, + "step": 12050 + }, + { + "epoch": 6.5229110512129385, + "grad_norm": 1.5448544025421143, + "learning_rate": 0.00040547027027027025, + "loss": 16.055, + "step": 12100 + }, + { + "epoch": 6.549865229110512, + "grad_norm": 1.4838643074035645, + "learning_rate": 0.00040465945945945944, + "loss": 16.0818, + "step": 12150 + }, + { + "epoch": 6.576819407008086, + "grad_norm": 1.5144941806793213, + "learning_rate": 0.00040384864864864863, + "loss": 16.0781, + "step": 12200 + }, + { + "epoch": 6.60377358490566, + "grad_norm": 1.5219935178756714, + "learning_rate": 0.0004030378378378378, + "loss": 16.1189, + "step": 12250 + }, + { + "epoch": 6.630727762803234, + "grad_norm": 1.5085090398788452, + "learning_rate": 0.00040222702702702695, + "loss": 16.0967, + "step": 12300 + }, + { + "epoch": 6.657681940700809, + "grad_norm": 1.3718434572219849, + "learning_rate": 0.00040141621621621614, + "loss": 16.063, + "step": 12350 + }, + { + "epoch": 6.684636118598383, + "grad_norm": 1.4403940439224243, + "learning_rate": 0.0004006054054054054, + "loss": 16.0543, + "step": 12400 + }, + { + "epoch": 6.711590296495957, + "grad_norm": 1.4802608489990234, + "learning_rate": 0.00039979459459459456, + "loss": 16.0814, + "step": 12450 + }, + { + "epoch": 6.738544474393531, + "grad_norm": 1.5132447481155396, + "learning_rate": 0.00039898378378378375, + "loss": 16.0892, + "step": 12500 + }, + { + "epoch": 6.765498652291106, + "grad_norm": 1.5833419561386108, + "learning_rate": 0.00039817297297297294, + "loss": 16.1005, + "step": 12550 + }, + { + "epoch": 6.7924528301886795, + "grad_norm": 1.4338456392288208, + "learning_rate": 0.0003973621621621621, + "loss": 16.1218, + "step": 12600 + }, + { + "epoch": 6.819407008086253, + "grad_norm": 1.5977522134780884, + "learning_rate": 0.00039655135135135137, + "loss": 16.1079, + "step": 12650 + }, + { + "epoch": 6.846361185983827, + "grad_norm": 1.5367788076400757, + "learning_rate": 0.0003957405405405405, + "loss": 16.0879, + "step": 12700 + }, + { + "epoch": 6.873315363881401, + "grad_norm": 1.3250383138656616, + "learning_rate": 0.0003949297297297297, + "loss": 16.1008, + "step": 12750 + }, + { + "epoch": 6.900269541778976, + "grad_norm": 1.6440327167510986, + "learning_rate": 0.00039411891891891887, + "loss": 16.1511, + "step": 12800 + }, + { + "epoch": 6.92722371967655, + "grad_norm": 1.5747231245040894, + "learning_rate": 0.00039330810810810806, + "loss": 16.101, + "step": 12850 + }, + { + "epoch": 6.954177897574124, + "grad_norm": 1.536387324333191, + "learning_rate": 0.00039249729729729725, + "loss": 16.1572, + "step": 12900 + }, + { + "epoch": 6.981132075471698, + "grad_norm": 1.4417625665664673, + "learning_rate": 0.0003916864864864865, + "loss": 16.1219, + "step": 12950 + }, + { + "epoch": 7.008086253369272, + "grad_norm": 1.502989649772644, + "learning_rate": 0.0003908756756756757, + "loss": 15.9683, + "step": 13000 + }, + { + "epoch": 7.008086253369272, + "eval_accuracy": 0.387830161497032, + "eval_loss": 3.324005365371704, + "eval_runtime": 183.0109, + "eval_samples_per_second": 98.42, + "eval_steps_per_second": 6.153, + "step": 13000 + }, + { + "epoch": 7.035040431266847, + "grad_norm": 1.597719430923462, + "learning_rate": 0.00039006486486486486, + "loss": 15.6299, + "step": 13050 + }, + { + "epoch": 7.061994609164421, + "grad_norm": 1.4646968841552734, + "learning_rate": 0.000389254054054054, + "loss": 15.6497, + "step": 13100 + }, + { + "epoch": 7.0889487870619945, + "grad_norm": 1.446700930595398, + "learning_rate": 0.0003884432432432432, + "loss": 15.6701, + "step": 13150 + }, + { + "epoch": 7.115902964959568, + "grad_norm": 1.491384744644165, + "learning_rate": 0.00038763243243243237, + "loss": 15.7227, + "step": 13200 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 1.5890462398529053, + "learning_rate": 0.00038682162162162156, + "loss": 15.7391, + "step": 13250 + }, + { + "epoch": 7.169811320754717, + "grad_norm": 1.4061617851257324, + "learning_rate": 0.0003860108108108108, + "loss": 15.7767, + "step": 13300 + }, + { + "epoch": 7.196765498652291, + "grad_norm": 1.529066562652588, + "learning_rate": 0.0003852, + "loss": 15.7941, + "step": 13350 + }, + { + "epoch": 7.223719676549865, + "grad_norm": 1.6018165349960327, + "learning_rate": 0.00038438918918918917, + "loss": 15.8158, + "step": 13400 + }, + { + "epoch": 7.250673854447439, + "grad_norm": 1.4185563325881958, + "learning_rate": 0.00038357837837837836, + "loss": 15.813, + "step": 13450 + }, + { + "epoch": 7.277628032345014, + "grad_norm": 1.3626817464828491, + "learning_rate": 0.0003827675675675675, + "loss": 15.7934, + "step": 13500 + }, + { + "epoch": 7.304582210242588, + "grad_norm": 1.5028986930847168, + "learning_rate": 0.0003819567567567567, + "loss": 15.8513, + "step": 13550 + }, + { + "epoch": 7.331536388140162, + "grad_norm": 1.55814790725708, + "learning_rate": 0.0003811459459459459, + "loss": 15.8259, + "step": 13600 + }, + { + "epoch": 7.3584905660377355, + "grad_norm": 1.5069361925125122, + "learning_rate": 0.0003803351351351351, + "loss": 15.8, + "step": 13650 + }, + { + "epoch": 7.38544474393531, + "grad_norm": 1.5427130460739136, + "learning_rate": 0.0003795243243243243, + "loss": 15.8371, + "step": 13700 + }, + { + "epoch": 7.412398921832884, + "grad_norm": 1.6525657176971436, + "learning_rate": 0.0003787135135135135, + "loss": 15.8431, + "step": 13750 + }, + { + "epoch": 7.439353099730458, + "grad_norm": 1.4115492105484009, + "learning_rate": 0.00037790270270270267, + "loss": 15.8876, + "step": 13800 + }, + { + "epoch": 7.466307277628032, + "grad_norm": 1.4626988172531128, + "learning_rate": 0.0003770918918918919, + "loss": 15.8694, + "step": 13850 + }, + { + "epoch": 7.493261455525606, + "grad_norm": 1.446192979812622, + "learning_rate": 0.000376281081081081, + "loss": 15.8921, + "step": 13900 + }, + { + "epoch": 7.520215633423181, + "grad_norm": 1.570351004600525, + "learning_rate": 0.00037547027027027023, + "loss": 15.8805, + "step": 13950 + }, + { + "epoch": 7.547169811320755, + "grad_norm": 1.4021625518798828, + "learning_rate": 0.0003746594594594594, + "loss": 15.9139, + "step": 14000 + }, + { + "epoch": 7.547169811320755, + "eval_accuracy": 0.38896671625309154, + "eval_loss": 3.3171634674072266, + "eval_runtime": 183.4636, + "eval_samples_per_second": 98.178, + "eval_steps_per_second": 6.137, + "step": 14000 + }, + { + "epoch": 7.574123989218329, + "grad_norm": 1.5256940126419067, + "learning_rate": 0.0003738486486486486, + "loss": 15.844, + "step": 14050 + }, + { + "epoch": 7.601078167115903, + "grad_norm": 1.5494848489761353, + "learning_rate": 0.0003730378378378378, + "loss": 15.8598, + "step": 14100 + }, + { + "epoch": 7.628032345013477, + "grad_norm": 1.4843692779541016, + "learning_rate": 0.000372227027027027, + "loss": 15.8758, + "step": 14150 + }, + { + "epoch": 7.654986522911051, + "grad_norm": 1.4996877908706665, + "learning_rate": 0.0003714162162162162, + "loss": 15.9204, + "step": 14200 + }, + { + "epoch": 7.681940700808625, + "grad_norm": 1.4797159433364868, + "learning_rate": 0.0003706054054054054, + "loss": 15.9329, + "step": 14250 + }, + { + "epoch": 7.708894878706199, + "grad_norm": 1.4859975576400757, + "learning_rate": 0.0003697945945945946, + "loss": 15.9547, + "step": 14300 + }, + { + "epoch": 7.735849056603773, + "grad_norm": 1.5032356977462769, + "learning_rate": 0.0003689837837837837, + "loss": 15.9601, + "step": 14350 + }, + { + "epoch": 7.762803234501348, + "grad_norm": 1.4542951583862305, + "learning_rate": 0.0003681729729729729, + "loss": 15.8923, + "step": 14400 + }, + { + "epoch": 7.789757412398922, + "grad_norm": 1.4771339893341064, + "learning_rate": 0.0003673621621621621, + "loss": 15.9012, + "step": 14450 + }, + { + "epoch": 7.816711590296496, + "grad_norm": 1.440063714981079, + "learning_rate": 0.00036655135135135134, + "loss": 15.9271, + "step": 14500 + }, + { + "epoch": 7.84366576819407, + "grad_norm": 1.5838462114334106, + "learning_rate": 0.00036574054054054053, + "loss": 15.931, + "step": 14550 + }, + { + "epoch": 7.870619946091644, + "grad_norm": 1.4777487516403198, + "learning_rate": 0.0003649297297297297, + "loss": 15.9197, + "step": 14600 + }, + { + "epoch": 7.8975741239892185, + "grad_norm": 1.497223973274231, + "learning_rate": 0.0003641189189189189, + "loss": 15.9195, + "step": 14650 + }, + { + "epoch": 7.9245283018867925, + "grad_norm": 1.3650174140930176, + "learning_rate": 0.0003633081081081081, + "loss": 15.9278, + "step": 14700 + }, + { + "epoch": 7.951482479784366, + "grad_norm": 1.4432156085968018, + "learning_rate": 0.0003624972972972972, + "loss": 15.973, + "step": 14750 + }, + { + "epoch": 7.97843665768194, + "grad_norm": 1.8180121183395386, + "learning_rate": 0.00036168648648648646, + "loss": 15.9264, + "step": 14800 + }, + { + "epoch": 8.005390835579515, + "grad_norm": 1.5880870819091797, + "learning_rate": 0.00036087567567567565, + "loss": 15.8274, + "step": 14850 + }, + { + "epoch": 8.032345013477089, + "grad_norm": 1.5161232948303223, + "learning_rate": 0.00036006486486486484, + "loss": 15.4744, + "step": 14900 + }, + { + "epoch": 8.059299191374663, + "grad_norm": 1.5016933679580688, + "learning_rate": 0.000359254054054054, + "loss": 15.5281, + "step": 14950 + }, + { + "epoch": 8.086253369272237, + "grad_norm": 1.4770663976669312, + "learning_rate": 0.0003584432432432432, + "loss": 15.5172, + "step": 15000 + }, + { + "epoch": 8.086253369272237, + "eval_accuracy": 0.38934643701328925, + "eval_loss": 3.314887285232544, + "eval_runtime": 182.5262, + "eval_samples_per_second": 98.682, + "eval_steps_per_second": 6.169, + "step": 15000 + }, + { + "epoch": 8.11320754716981, + "grad_norm": 1.571670413017273, + "learning_rate": 0.0003576324324324324, + "loss": 15.5307, + "step": 15050 + }, + { + "epoch": 8.140161725067385, + "grad_norm": 1.412194013595581, + "learning_rate": 0.00035682162162162164, + "loss": 15.5542, + "step": 15100 + }, + { + "epoch": 8.167115902964959, + "grad_norm": 1.5343204736709595, + "learning_rate": 0.0003560108108108108, + "loss": 15.5985, + "step": 15150 + }, + { + "epoch": 8.194070080862534, + "grad_norm": 1.4480562210083008, + "learning_rate": 0.00035519999999999996, + "loss": 15.6071, + "step": 15200 + }, + { + "epoch": 8.221024258760108, + "grad_norm": 1.4651609659194946, + "learning_rate": 0.00035438918918918915, + "loss": 15.6506, + "step": 15250 + }, + { + "epoch": 8.247978436657682, + "grad_norm": 1.5500885248184204, + "learning_rate": 0.00035357837837837833, + "loss": 15.6198, + "step": 15300 + }, + { + "epoch": 8.274932614555256, + "grad_norm": 1.457398533821106, + "learning_rate": 0.0003527675675675675, + "loss": 15.6218, + "step": 15350 + }, + { + "epoch": 8.30188679245283, + "grad_norm": 1.457667350769043, + "learning_rate": 0.00035195675675675676, + "loss": 15.6641, + "step": 15400 + }, + { + "epoch": 8.328840970350404, + "grad_norm": 1.4689645767211914, + "learning_rate": 0.0003511621621621621, + "loss": 15.6165, + "step": 15450 + }, + { + "epoch": 8.355795148247978, + "grad_norm": 1.434046983718872, + "learning_rate": 0.0003503513513513513, + "loss": 15.6322, + "step": 15500 + }, + { + "epoch": 8.382749326145552, + "grad_norm": 1.5870915651321411, + "learning_rate": 0.0003495405405405405, + "loss": 15.6622, + "step": 15550 + }, + { + "epoch": 8.409703504043126, + "grad_norm": 1.5007177591323853, + "learning_rate": 0.0003487297297297297, + "loss": 15.6481, + "step": 15600 + }, + { + "epoch": 8.436657681940702, + "grad_norm": 1.6240930557250977, + "learning_rate": 0.0003479189189189189, + "loss": 15.6799, + "step": 15650 + }, + { + "epoch": 8.463611859838275, + "grad_norm": 1.5161348581314087, + "learning_rate": 0.0003471081081081081, + "loss": 15.7144, + "step": 15700 + }, + { + "epoch": 8.49056603773585, + "grad_norm": 1.4712945222854614, + "learning_rate": 0.0003462972972972973, + "loss": 15.7262, + "step": 15750 + }, + { + "epoch": 8.517520215633423, + "grad_norm": 1.4982839822769165, + "learning_rate": 0.00034548648648648645, + "loss": 15.6994, + "step": 15800 + }, + { + "epoch": 8.544474393530997, + "grad_norm": 1.5805519819259644, + "learning_rate": 0.00034467567567567564, + "loss": 15.6952, + "step": 15850 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 1.38987398147583, + "learning_rate": 0.0003438648648648648, + "loss": 15.7478, + "step": 15900 + }, + { + "epoch": 8.598382749326145, + "grad_norm": 1.5049515962600708, + "learning_rate": 0.000343054054054054, + "loss": 15.7328, + "step": 15950 + }, + { + "epoch": 8.625336927223719, + "grad_norm": 1.4291828870773315, + "learning_rate": 0.0003422432432432432, + "loss": 15.7165, + "step": 16000 + }, + { + "epoch": 8.625336927223719, + "eval_accuracy": 0.39062988231807194, + "eval_loss": 3.300835609436035, + "eval_runtime": 182.7741, + "eval_samples_per_second": 98.548, + "eval_steps_per_second": 6.161, + "step": 16000 + }, + { + "epoch": 8.652291105121293, + "grad_norm": 1.511680006980896, + "learning_rate": 0.00034143243243243244, + "loss": 15.7899, + "step": 16050 + }, + { + "epoch": 8.679245283018869, + "grad_norm": 1.4265791177749634, + "learning_rate": 0.0003406216216216216, + "loss": 15.7614, + "step": 16100 + }, + { + "epoch": 8.706199460916443, + "grad_norm": 1.4805567264556885, + "learning_rate": 0.0003398108108108108, + "loss": 15.7489, + "step": 16150 + }, + { + "epoch": 8.733153638814017, + "grad_norm": 1.4429032802581787, + "learning_rate": 0.00033899999999999995, + "loss": 15.7055, + "step": 16200 + }, + { + "epoch": 8.76010781671159, + "grad_norm": 1.5268951654434204, + "learning_rate": 0.00033818918918918913, + "loss": 15.7464, + "step": 16250 + }, + { + "epoch": 8.787061994609164, + "grad_norm": 1.5547935962677002, + "learning_rate": 0.0003373783783783783, + "loss": 15.7365, + "step": 16300 + }, + { + "epoch": 8.814016172506738, + "grad_norm": 1.6019865274429321, + "learning_rate": 0.0003365675675675675, + "loss": 15.749, + "step": 16350 + }, + { + "epoch": 8.840970350404312, + "grad_norm": 1.576707124710083, + "learning_rate": 0.00033575675675675675, + "loss": 15.7725, + "step": 16400 + }, + { + "epoch": 8.867924528301886, + "grad_norm": 1.544109582901001, + "learning_rate": 0.00033494594594594594, + "loss": 15.7704, + "step": 16450 + }, + { + "epoch": 8.89487870619946, + "grad_norm": 1.5295876264572144, + "learning_rate": 0.0003341351351351351, + "loss": 15.7795, + "step": 16500 + }, + { + "epoch": 8.921832884097036, + "grad_norm": 1.4180580377578735, + "learning_rate": 0.0003333243243243243, + "loss": 15.7632, + "step": 16550 + }, + { + "epoch": 8.94878706199461, + "grad_norm": 1.4332422018051147, + "learning_rate": 0.00033251351351351344, + "loss": 15.7384, + "step": 16600 + }, + { + "epoch": 8.975741239892184, + "grad_norm": 1.5407085418701172, + "learning_rate": 0.00033170270270270263, + "loss": 15.7877, + "step": 16650 + }, + { + "epoch": 9.002695417789758, + "grad_norm": 1.6193679571151733, + "learning_rate": 0.00033089189189189187, + "loss": 15.7523, + "step": 16700 + }, + { + "epoch": 9.029649595687331, + "grad_norm": 1.5814419984817505, + "learning_rate": 0.00033008108108108106, + "loss": 15.3115, + "step": 16750 + }, + { + "epoch": 9.056603773584905, + "grad_norm": 1.523964524269104, + "learning_rate": 0.00032927027027027024, + "loss": 15.3445, + "step": 16800 + }, + { + "epoch": 9.08355795148248, + "grad_norm": 1.4931163787841797, + "learning_rate": 0.00032845945945945943, + "loss": 15.3418, + "step": 16850 + }, + { + "epoch": 9.110512129380053, + "grad_norm": 1.443772792816162, + "learning_rate": 0.0003276486486486486, + "loss": 15.415, + "step": 16900 + }, + { + "epoch": 9.137466307277627, + "grad_norm": 1.5072815418243408, + "learning_rate": 0.00032683783783783786, + "loss": 15.464, + "step": 16950 + }, + { + "epoch": 9.164420485175203, + "grad_norm": 1.5310089588165283, + "learning_rate": 0.00032602702702702694, + "loss": 15.4386, + "step": 17000 + }, + { + "epoch": 9.164420485175203, + "eval_accuracy": 0.39104013284468325, + "eval_loss": 3.3030104637145996, + "eval_runtime": 182.8107, + "eval_samples_per_second": 98.528, + "eval_steps_per_second": 6.159, + "step": 17000 + }, + { + "epoch": 9.191374663072777, + "grad_norm": 1.5395615100860596, + "learning_rate": 0.0003252162162162162, + "loss": 15.4729, + "step": 17050 + }, + { + "epoch": 9.21832884097035, + "grad_norm": 1.4140956401824951, + "learning_rate": 0.00032440540540540537, + "loss": 15.5002, + "step": 17100 + }, + { + "epoch": 9.245283018867925, + "grad_norm": 1.447230577468872, + "learning_rate": 0.00032359459459459455, + "loss": 15.4569, + "step": 17150 + }, + { + "epoch": 9.272237196765499, + "grad_norm": 1.4686763286590576, + "learning_rate": 0.00032278378378378374, + "loss": 15.4912, + "step": 17200 + }, + { + "epoch": 9.299191374663073, + "grad_norm": 1.5187294483184814, + "learning_rate": 0.00032197297297297293, + "loss": 15.4744, + "step": 17250 + }, + { + "epoch": 9.326145552560646, + "grad_norm": 1.6057308912277222, + "learning_rate": 0.00032116216216216217, + "loss": 15.4729, + "step": 17300 + }, + { + "epoch": 9.35309973045822, + "grad_norm": 1.606978416442871, + "learning_rate": 0.00032035135135135136, + "loss": 15.4983, + "step": 17350 + }, + { + "epoch": 9.380053908355794, + "grad_norm": 1.5377631187438965, + "learning_rate": 0.0003195405405405405, + "loss": 15.5095, + "step": 17400 + }, + { + "epoch": 9.40700808625337, + "grad_norm": 1.4473826885223389, + "learning_rate": 0.0003187297297297297, + "loss": 15.5087, + "step": 17450 + }, + { + "epoch": 9.433962264150944, + "grad_norm": 1.4930968284606934, + "learning_rate": 0.00031791891891891886, + "loss": 15.5576, + "step": 17500 + }, + { + "epoch": 9.460916442048518, + "grad_norm": 1.4278721809387207, + "learning_rate": 0.00031710810810810805, + "loss": 15.493, + "step": 17550 + }, + { + "epoch": 9.487870619946092, + "grad_norm": 1.4520282745361328, + "learning_rate": 0.0003162972972972973, + "loss": 15.5464, + "step": 17600 + }, + { + "epoch": 9.514824797843666, + "grad_norm": 1.5626846551895142, + "learning_rate": 0.0003154864864864865, + "loss": 15.5072, + "step": 17650 + }, + { + "epoch": 9.54177897574124, + "grad_norm": 1.4320954084396362, + "learning_rate": 0.00031467567567567567, + "loss": 15.5693, + "step": 17700 + }, + { + "epoch": 9.568733153638814, + "grad_norm": 1.511391282081604, + "learning_rate": 0.00031386486486486485, + "loss": 15.5792, + "step": 17750 + }, + { + "epoch": 9.595687331536388, + "grad_norm": 1.561791181564331, + "learning_rate": 0.00031305405405405404, + "loss": 15.6085, + "step": 17800 + }, + { + "epoch": 9.622641509433961, + "grad_norm": 1.5459599494934082, + "learning_rate": 0.0003122432432432432, + "loss": 15.5633, + "step": 17850 + }, + { + "epoch": 9.649595687331537, + "grad_norm": 1.5561130046844482, + "learning_rate": 0.0003114324324324324, + "loss": 15.5673, + "step": 17900 + }, + { + "epoch": 9.676549865229111, + "grad_norm": 1.5501707792282104, + "learning_rate": 0.0003106216216216216, + "loss": 15.6434, + "step": 17950 + }, + { + "epoch": 9.703504043126685, + "grad_norm": 1.5101507902145386, + "learning_rate": 0.0003098108108108108, + "loss": 15.6082, + "step": 18000 + }, + { + "epoch": 9.703504043126685, + "eval_accuracy": 0.39216082515983036, + "eval_loss": 3.289290189743042, + "eval_runtime": 183.0971, + "eval_samples_per_second": 98.374, + "eval_steps_per_second": 6.15, + "step": 18000 + }, + { + "epoch": 9.730458221024259, + "grad_norm": 1.5629507303237915, + "learning_rate": 0.000309, + "loss": 15.5575, + "step": 18050 + }, + { + "epoch": 9.757412398921833, + "grad_norm": 1.4830306768417358, + "learning_rate": 0.00030818918918918916, + "loss": 15.6156, + "step": 18100 + }, + { + "epoch": 9.784366576819407, + "grad_norm": 1.5295294523239136, + "learning_rate": 0.00030737837837837835, + "loss": 15.5877, + "step": 18150 + }, + { + "epoch": 9.81132075471698, + "grad_norm": 1.597743272781372, + "learning_rate": 0.0003065675675675676, + "loss": 15.6086, + "step": 18200 + }, + { + "epoch": 9.838274932614555, + "grad_norm": 1.510759949684143, + "learning_rate": 0.0003057567567567567, + "loss": 15.6034, + "step": 18250 + }, + { + "epoch": 9.865229110512129, + "grad_norm": 1.4675997495651245, + "learning_rate": 0.0003049459459459459, + "loss": 15.6475, + "step": 18300 + }, + { + "epoch": 9.892183288409704, + "grad_norm": 1.540273666381836, + "learning_rate": 0.0003041351351351351, + "loss": 15.6564, + "step": 18350 + }, + { + "epoch": 9.919137466307278, + "grad_norm": 1.4667071104049683, + "learning_rate": 0.0003033243243243243, + "loss": 15.5939, + "step": 18400 + }, + { + "epoch": 9.946091644204852, + "grad_norm": 1.4966003894805908, + "learning_rate": 0.00030251351351351347, + "loss": 15.603, + "step": 18450 + }, + { + "epoch": 9.973045822102426, + "grad_norm": 1.6007273197174072, + "learning_rate": 0.0003017027027027027, + "loss": 15.6328, + "step": 18500 + }, + { + "epoch": 10.0, + "grad_norm": 2.202394485473633, + "learning_rate": 0.0003008918918918919, + "loss": 15.623, + "step": 18550 + }, + { + "epoch": 10.026954177897574, + "grad_norm": 1.4779033660888672, + "learning_rate": 0.0003000810810810811, + "loss": 15.1902, + "step": 18600 + }, + { + "epoch": 10.053908355795148, + "grad_norm": 1.5733294486999512, + "learning_rate": 0.0002992702702702703, + "loss": 15.2046, + "step": 18650 + }, + { + "epoch": 10.080862533692722, + "grad_norm": 1.5170866250991821, + "learning_rate": 0.00029845945945945946, + "loss": 15.2229, + "step": 18700 + }, + { + "epoch": 10.107816711590296, + "grad_norm": 1.6261968612670898, + "learning_rate": 0.0002976486486486486, + "loss": 15.2227, + "step": 18750 + }, + { + "epoch": 10.134770889487871, + "grad_norm": 1.468407154083252, + "learning_rate": 0.00029683783783783784, + "loss": 15.3124, + "step": 18800 + }, + { + "epoch": 10.161725067385445, + "grad_norm": 1.5138444900512695, + "learning_rate": 0.000296027027027027, + "loss": 15.3033, + "step": 18850 + }, + { + "epoch": 10.18867924528302, + "grad_norm": 1.4424211978912354, + "learning_rate": 0.0002952162162162162, + "loss": 15.2999, + "step": 18900 + }, + { + "epoch": 10.215633423180593, + "grad_norm": 1.5810476541519165, + "learning_rate": 0.0002944054054054054, + "loss": 15.317, + "step": 18950 + }, + { + "epoch": 10.242587601078167, + "grad_norm": 1.403139591217041, + "learning_rate": 0.0002935945945945946, + "loss": 15.3025, + "step": 19000 + }, + { + "epoch": 10.242587601078167, + "eval_accuracy": 0.39254825984677316, + "eval_loss": 3.2937002182006836, + "eval_runtime": 182.7882, + "eval_samples_per_second": 98.54, + "eval_steps_per_second": 6.16, + "step": 19000 + }, + { + "epoch": 10.269541778975741, + "grad_norm": 1.54314386844635, + "learning_rate": 0.00029278378378378377, + "loss": 15.3189, + "step": 19050 + }, + { + "epoch": 10.296495956873315, + "grad_norm": 1.4563575983047485, + "learning_rate": 0.00029197297297297296, + "loss": 15.3523, + "step": 19100 + }, + { + "epoch": 10.323450134770889, + "grad_norm": 1.448618769645691, + "learning_rate": 0.00029116216216216215, + "loss": 15.3741, + "step": 19150 + }, + { + "epoch": 10.350404312668463, + "grad_norm": 1.5495871305465698, + "learning_rate": 0.00029035135135135133, + "loss": 15.3719, + "step": 19200 + }, + { + "epoch": 10.377358490566039, + "grad_norm": 1.5040892362594604, + "learning_rate": 0.0002895405405405405, + "loss": 15.3823, + "step": 19250 + }, + { + "epoch": 10.404312668463612, + "grad_norm": 1.4656543731689453, + "learning_rate": 0.0002887297297297297, + "loss": 15.3649, + "step": 19300 + }, + { + "epoch": 10.431266846361186, + "grad_norm": 1.5206462144851685, + "learning_rate": 0.0002879189189189189, + "loss": 15.4259, + "step": 19350 + }, + { + "epoch": 10.45822102425876, + "grad_norm": 1.6246916055679321, + "learning_rate": 0.0002871081081081081, + "loss": 15.4196, + "step": 19400 + }, + { + "epoch": 10.485175202156334, + "grad_norm": 1.4970046281814575, + "learning_rate": 0.00028631351351351346, + "loss": 15.4277, + "step": 19450 + }, + { + "epoch": 10.512129380053908, + "grad_norm": 1.501575231552124, + "learning_rate": 0.0002855027027027027, + "loss": 15.3925, + "step": 19500 + }, + { + "epoch": 10.539083557951482, + "grad_norm": 1.5174660682678223, + "learning_rate": 0.0002846918918918919, + "loss": 15.4452, + "step": 19550 + }, + { + "epoch": 10.566037735849056, + "grad_norm": 1.4966565370559692, + "learning_rate": 0.000283881081081081, + "loss": 15.4459, + "step": 19600 + }, + { + "epoch": 10.59299191374663, + "grad_norm": 1.4583094120025635, + "learning_rate": 0.00028307027027027026, + "loss": 15.4223, + "step": 19650 + }, + { + "epoch": 10.619946091644206, + "grad_norm": 1.563902735710144, + "learning_rate": 0.00028225945945945945, + "loss": 15.4786, + "step": 19700 + }, + { + "epoch": 10.64690026954178, + "grad_norm": 1.456220030784607, + "learning_rate": 0.00028144864864864863, + "loss": 15.4053, + "step": 19750 + }, + { + "epoch": 10.673854447439354, + "grad_norm": 1.4244598150253296, + "learning_rate": 0.000280654054054054, + "loss": 15.4523, + "step": 19800 + }, + { + "epoch": 10.700808625336927, + "grad_norm": 1.6043477058410645, + "learning_rate": 0.00027984324324324325, + "loss": 15.4974, + "step": 19850 + }, + { + "epoch": 10.727762803234501, + "grad_norm": 1.5195151567459106, + "learning_rate": 0.0002790324324324324, + "loss": 15.4587, + "step": 19900 + }, + { + "epoch": 10.754716981132075, + "grad_norm": 1.4490087032318115, + "learning_rate": 0.0002782216216216216, + "loss": 15.4793, + "step": 19950 + }, + { + "epoch": 10.78167115902965, + "grad_norm": 1.4315603971481323, + "learning_rate": 0.0002774108108108108, + "loss": 15.4821, + "step": 20000 + }, + { + "epoch": 10.78167115902965, + "eval_accuracy": 0.3933073754266019, + "eval_loss": 3.2815403938293457, + "eval_runtime": 182.6356, + "eval_samples_per_second": 98.623, + "eval_steps_per_second": 6.165, + "step": 20000 + }, + { + "epoch": 10.808625336927223, + "grad_norm": 1.5369899272918701, + "learning_rate": 0.0002766, + "loss": 15.4788, + "step": 20050 + }, + { + "epoch": 10.835579514824797, + "grad_norm": 1.517513394355774, + "learning_rate": 0.00027578918918918913, + "loss": 15.497, + "step": 20100 + }, + { + "epoch": 10.862533692722373, + "grad_norm": 1.4283959865570068, + "learning_rate": 0.0002749783783783784, + "loss": 15.4969, + "step": 20150 + }, + { + "epoch": 10.889487870619947, + "grad_norm": 1.5472018718719482, + "learning_rate": 0.00027416756756756756, + "loss": 15.4344, + "step": 20200 + }, + { + "epoch": 10.91644204851752, + "grad_norm": 1.4912693500518799, + "learning_rate": 0.00027335675675675675, + "loss": 15.4865, + "step": 20250 + }, + { + "epoch": 10.943396226415095, + "grad_norm": 1.49799644947052, + "learning_rate": 0.0002725459459459459, + "loss": 15.4216, + "step": 20300 + }, + { + "epoch": 10.970350404312669, + "grad_norm": 1.434645652770996, + "learning_rate": 0.0002717351351351351, + "loss": 15.5138, + "step": 20350 + }, + { + "epoch": 10.997304582210242, + "grad_norm": 1.4343045949935913, + "learning_rate": 0.0002709243243243243, + "loss": 15.4722, + "step": 20400 + }, + { + "epoch": 11.024258760107816, + "grad_norm": 1.537205696105957, + "learning_rate": 0.0002701135135135135, + "loss": 15.1417, + "step": 20450 + }, + { + "epoch": 11.05121293800539, + "grad_norm": 1.5110833644866943, + "learning_rate": 0.0002693027027027027, + "loss": 15.0783, + "step": 20500 + }, + { + "epoch": 11.078167115902964, + "grad_norm": 1.4787135124206543, + "learning_rate": 0.00026849189189189187, + "loss": 15.0941, + "step": 20550 + }, + { + "epoch": 11.10512129380054, + "grad_norm": 1.525460124015808, + "learning_rate": 0.00026768108108108106, + "loss": 15.1286, + "step": 20600 + }, + { + "epoch": 11.132075471698114, + "grad_norm": 1.5442931652069092, + "learning_rate": 0.00026687027027027025, + "loss": 15.1526, + "step": 20650 + }, + { + "epoch": 11.159029649595688, + "grad_norm": 1.5607445240020752, + "learning_rate": 0.00026605945945945943, + "loss": 15.1498, + "step": 20700 + }, + { + "epoch": 11.185983827493262, + "grad_norm": 1.588240146636963, + "learning_rate": 0.0002652486486486486, + "loss": 15.2072, + "step": 20750 + }, + { + "epoch": 11.212938005390836, + "grad_norm": 1.4865835905075073, + "learning_rate": 0.0002644378378378378, + "loss": 15.1718, + "step": 20800 + }, + { + "epoch": 11.23989218328841, + "grad_norm": 1.5161540508270264, + "learning_rate": 0.000263627027027027, + "loss": 15.2105, + "step": 20850 + }, + { + "epoch": 11.266846361185983, + "grad_norm": 1.5474003553390503, + "learning_rate": 0.0002628162162162162, + "loss": 15.2186, + "step": 20900 + }, + { + "epoch": 11.293800539083557, + "grad_norm": 1.5761972665786743, + "learning_rate": 0.00026200540540540537, + "loss": 15.2277, + "step": 20950 + }, + { + "epoch": 11.320754716981131, + "grad_norm": 1.513260841369629, + "learning_rate": 0.00026119459459459456, + "loss": 15.1951, + "step": 21000 + }, + { + "epoch": 11.320754716981131, + "eval_accuracy": 0.39346752089170384, + "eval_loss": 3.285801410675049, + "eval_runtime": 182.9596, + "eval_samples_per_second": 98.448, + "eval_steps_per_second": 6.154, + "step": 21000 + }, + { + "epoch": 11.347708894878707, + "grad_norm": 1.5032051801681519, + "learning_rate": 0.0002603837837837838, + "loss": 15.2475, + "step": 21050 + }, + { + "epoch": 11.374663072776281, + "grad_norm": 1.6460040807724, + "learning_rate": 0.00025957297297297293, + "loss": 15.2432, + "step": 21100 + }, + { + "epoch": 11.401617250673855, + "grad_norm": 1.6060497760772705, + "learning_rate": 0.0002587621621621621, + "loss": 15.2565, + "step": 21150 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 1.4609140157699585, + "learning_rate": 0.00025795135135135136, + "loss": 15.2865, + "step": 21200 + }, + { + "epoch": 11.455525606469003, + "grad_norm": 1.5956884622573853, + "learning_rate": 0.00025714054054054054, + "loss": 15.2832, + "step": 21250 + }, + { + "epoch": 11.482479784366577, + "grad_norm": 1.6202877759933472, + "learning_rate": 0.0002563297297297297, + "loss": 15.2709, + "step": 21300 + }, + { + "epoch": 11.50943396226415, + "grad_norm": 1.5032237768173218, + "learning_rate": 0.00025551891891891886, + "loss": 15.2972, + "step": 21350 + }, + { + "epoch": 11.536388140161725, + "grad_norm": 1.5729618072509766, + "learning_rate": 0.0002547081081081081, + "loss": 15.3052, + "step": 21400 + }, + { + "epoch": 11.563342318059298, + "grad_norm": 1.5918726921081543, + "learning_rate": 0.0002538972972972973, + "loss": 15.3656, + "step": 21450 + }, + { + "epoch": 11.590296495956874, + "grad_norm": 1.525824785232544, + "learning_rate": 0.0002530864864864864, + "loss": 15.2748, + "step": 21500 + }, + { + "epoch": 11.617250673854448, + "grad_norm": 1.4660574197769165, + "learning_rate": 0.00025227567567567567, + "loss": 15.3363, + "step": 21550 + }, + { + "epoch": 11.644204851752022, + "grad_norm": 1.5372847318649292, + "learning_rate": 0.00025146486486486485, + "loss": 15.293, + "step": 21600 + }, + { + "epoch": 11.671159029649596, + "grad_norm": 1.4234718084335327, + "learning_rate": 0.00025065405405405404, + "loss": 15.308, + "step": 21650 + }, + { + "epoch": 11.69811320754717, + "grad_norm": 1.3779908418655396, + "learning_rate": 0.00024984324324324323, + "loss": 15.329, + "step": 21700 + }, + { + "epoch": 11.725067385444744, + "grad_norm": 1.5348743200302124, + "learning_rate": 0.0002490324324324324, + "loss": 15.3045, + "step": 21750 + }, + { + "epoch": 11.752021563342318, + "grad_norm": 1.5083909034729004, + "learning_rate": 0.0002482216216216216, + "loss": 15.3429, + "step": 21800 + }, + { + "epoch": 11.778975741239892, + "grad_norm": 1.4889590740203857, + "learning_rate": 0.0002474108108108108, + "loss": 15.3267, + "step": 21850 + }, + { + "epoch": 11.805929919137466, + "grad_norm": 1.5269172191619873, + "learning_rate": 0.0002466, + "loss": 15.3488, + "step": 21900 + }, + { + "epoch": 11.832884097035041, + "grad_norm": 1.5171701908111572, + "learning_rate": 0.00024578918918918916, + "loss": 15.4014, + "step": 21950 + }, + { + "epoch": 11.859838274932615, + "grad_norm": 1.5388139486312866, + "learning_rate": 0.00024497837837837835, + "loss": 15.3396, + "step": 22000 + }, + { + "epoch": 11.859838274932615, + "eval_accuracy": 0.3947092458039498, + "eval_loss": 3.2750062942504883, + "eval_runtime": 182.5517, + "eval_samples_per_second": 98.668, + "eval_steps_per_second": 6.168, + "step": 22000 + }, + { + "epoch": 11.88679245283019, + "grad_norm": 1.6454235315322876, + "learning_rate": 0.00024416756756756754, + "loss": 15.3485, + "step": 22050 + }, + { + "epoch": 11.913746630727763, + "grad_norm": 1.439768671989441, + "learning_rate": 0.00024335675675675673, + "loss": 15.3763, + "step": 22100 + }, + { + "epoch": 11.940700808625337, + "grad_norm": 1.5389134883880615, + "learning_rate": 0.0002425459459459459, + "loss": 15.3453, + "step": 22150 + }, + { + "epoch": 11.967654986522911, + "grad_norm": 1.624977469444275, + "learning_rate": 0.00024173513513513513, + "loss": 15.362, + "step": 22200 + }, + { + "epoch": 11.994609164420485, + "grad_norm": 1.4279649257659912, + "learning_rate": 0.0002409243243243243, + "loss": 15.3672, + "step": 22250 + }, + { + "epoch": 12.021563342318059, + "grad_norm": 1.496293544769287, + "learning_rate": 0.00024012972972972972, + "loss": 15.0297, + "step": 22300 + }, + { + "epoch": 12.048517520215633, + "grad_norm": 1.4558320045471191, + "learning_rate": 0.0002393189189189189, + "loss": 14.9965, + "step": 22350 + }, + { + "epoch": 12.075471698113208, + "grad_norm": 1.4995672702789307, + "learning_rate": 0.00023850810810810806, + "loss": 15.0038, + "step": 22400 + }, + { + "epoch": 12.102425876010782, + "grad_norm": 1.54188072681427, + "learning_rate": 0.00023769729729729728, + "loss": 15.046, + "step": 22450 + }, + { + "epoch": 12.129380053908356, + "grad_norm": 1.5988883972167969, + "learning_rate": 0.00023688648648648647, + "loss": 14.9948, + "step": 22500 + }, + { + "epoch": 12.15633423180593, + "grad_norm": 1.6539379358291626, + "learning_rate": 0.00023607567567567568, + "loss": 15.0689, + "step": 22550 + }, + { + "epoch": 12.183288409703504, + "grad_norm": 1.5286942720413208, + "learning_rate": 0.00023526486486486484, + "loss": 15.0572, + "step": 22600 + }, + { + "epoch": 12.210242587601078, + "grad_norm": 1.512137532234192, + "learning_rate": 0.00023445405405405403, + "loss": 15.0635, + "step": 22650 + }, + { + "epoch": 12.237196765498652, + "grad_norm": 1.6056221723556519, + "learning_rate": 0.00023364324324324321, + "loss": 15.084, + "step": 22700 + }, + { + "epoch": 12.264150943396226, + "grad_norm": 1.5421860218048096, + "learning_rate": 0.00023283243243243243, + "loss": 15.1055, + "step": 22750 + }, + { + "epoch": 12.2911051212938, + "grad_norm": 1.4648276567459106, + "learning_rate": 0.0002320216216216216, + "loss": 15.1192, + "step": 22800 + }, + { + "epoch": 12.318059299191376, + "grad_norm": 1.506853461265564, + "learning_rate": 0.00023121081081081078, + "loss": 15.1146, + "step": 22850 + }, + { + "epoch": 12.34501347708895, + "grad_norm": 1.4823317527770996, + "learning_rate": 0.0002304, + "loss": 15.13, + "step": 22900 + }, + { + "epoch": 12.371967654986523, + "grad_norm": 1.5213923454284668, + "learning_rate": 0.00022958918918918918, + "loss": 15.121, + "step": 22950 + }, + { + "epoch": 12.398921832884097, + "grad_norm": 1.553358793258667, + "learning_rate": 0.00022877837837837834, + "loss": 15.155, + "step": 23000 + }, + { + "epoch": 12.398921832884097, + "eval_accuracy": 0.39454377664292517, + "eval_loss": 3.280299186706543, + "eval_runtime": 182.8131, + "eval_samples_per_second": 98.527, + "eval_steps_per_second": 6.159, + "step": 23000 + }, + { + "epoch": 12.425876010781671, + "grad_norm": 1.6617335081100464, + "learning_rate": 0.00022796756756756755, + "loss": 15.129, + "step": 23050 + }, + { + "epoch": 12.452830188679245, + "grad_norm": 1.5141173601150513, + "learning_rate": 0.00022715675675675674, + "loss": 15.1529, + "step": 23100 + }, + { + "epoch": 12.479784366576819, + "grad_norm": 1.6448322534561157, + "learning_rate": 0.00022634594594594595, + "loss": 15.1113, + "step": 23150 + }, + { + "epoch": 12.506738544474393, + "grad_norm": 1.4851939678192139, + "learning_rate": 0.0002255351351351351, + "loss": 15.168, + "step": 23200 + }, + { + "epoch": 12.533692722371967, + "grad_norm": 1.6145434379577637, + "learning_rate": 0.0002247243243243243, + "loss": 15.2239, + "step": 23250 + }, + { + "epoch": 12.560646900269543, + "grad_norm": 1.4803235530853271, + "learning_rate": 0.00022391351351351349, + "loss": 15.1449, + "step": 23300 + }, + { + "epoch": 12.587601078167117, + "grad_norm": 1.469650149345398, + "learning_rate": 0.0002231027027027027, + "loss": 15.1656, + "step": 23350 + }, + { + "epoch": 12.61455525606469, + "grad_norm": 1.5211254358291626, + "learning_rate": 0.00022229189189189186, + "loss": 15.2007, + "step": 23400 + }, + { + "epoch": 12.641509433962264, + "grad_norm": 1.5244157314300537, + "learning_rate": 0.00022148108108108105, + "loss": 15.2137, + "step": 23450 + }, + { + "epoch": 12.668463611859838, + "grad_norm": 1.5942649841308594, + "learning_rate": 0.00022067027027027026, + "loss": 15.1884, + "step": 23500 + }, + { + "epoch": 12.695417789757412, + "grad_norm": 1.4888988733291626, + "learning_rate": 0.00021985945945945945, + "loss": 15.2281, + "step": 23550 + }, + { + "epoch": 12.722371967654986, + "grad_norm": 1.5697029829025269, + "learning_rate": 0.0002190486486486486, + "loss": 15.2005, + "step": 23600 + }, + { + "epoch": 12.74932614555256, + "grad_norm": 1.4188930988311768, + "learning_rate": 0.00021823783783783782, + "loss": 15.2384, + "step": 23650 + }, + { + "epoch": 12.776280323450134, + "grad_norm": 1.516971468925476, + "learning_rate": 0.000217427027027027, + "loss": 15.2365, + "step": 23700 + }, + { + "epoch": 12.80323450134771, + "grad_norm": 1.4627147912979126, + "learning_rate": 0.0002166162162162162, + "loss": 15.2397, + "step": 23750 + }, + { + "epoch": 12.830188679245284, + "grad_norm": 1.5219035148620605, + "learning_rate": 0.00021580540540540538, + "loss": 15.2338, + "step": 23800 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 1.5930004119873047, + "learning_rate": 0.00021499459459459457, + "loss": 15.1914, + "step": 23850 + }, + { + "epoch": 12.884097035040432, + "grad_norm": 1.6950451135635376, + "learning_rate": 0.00021418378378378376, + "loss": 15.2401, + "step": 23900 + }, + { + "epoch": 12.911051212938006, + "grad_norm": 1.4628324508666992, + "learning_rate": 0.00021337297297297297, + "loss": 15.2338, + "step": 23950 + }, + { + "epoch": 12.93800539083558, + "grad_norm": 1.5722259283065796, + "learning_rate": 0.00021256216216216213, + "loss": 15.2463, + "step": 24000 + }, + { + "epoch": 12.93800539083558, + "eval_accuracy": 0.3955899372151551, + "eval_loss": 3.267047166824341, + "eval_runtime": 182.8794, + "eval_samples_per_second": 98.491, + "eval_steps_per_second": 6.157, + "step": 24000 + }, + { + "epoch": 12.964959568733153, + "grad_norm": 1.5204074382781982, + "learning_rate": 0.00021175135135135132, + "loss": 15.2522, + "step": 24050 + }, + { + "epoch": 12.991913746630727, + "grad_norm": 1.557715892791748, + "learning_rate": 0.00021094054054054053, + "loss": 15.2408, + "step": 24100 + }, + { + "epoch": 13.018867924528301, + "grad_norm": 1.4913039207458496, + "learning_rate": 0.00021012972972972972, + "loss": 14.9635, + "step": 24150 + }, + { + "epoch": 13.045822102425875, + "grad_norm": 1.510380744934082, + "learning_rate": 0.00020931891891891888, + "loss": 14.8479, + "step": 24200 + }, + { + "epoch": 13.07277628032345, + "grad_norm": 1.5142250061035156, + "learning_rate": 0.0002085081081081081, + "loss": 14.9016, + "step": 24250 + }, + { + "epoch": 13.099730458221025, + "grad_norm": 1.5466632843017578, + "learning_rate": 0.00020769729729729728, + "loss": 14.8995, + "step": 24300 + }, + { + "epoch": 13.126684636118599, + "grad_norm": 1.5550708770751953, + "learning_rate": 0.00020688648648648647, + "loss": 14.9194, + "step": 24350 + }, + { + "epoch": 13.153638814016173, + "grad_norm": 1.6031028032302856, + "learning_rate": 0.00020607567567567566, + "loss": 14.9441, + "step": 24400 + }, + { + "epoch": 13.180592991913747, + "grad_norm": 1.5483187437057495, + "learning_rate": 0.00020526486486486484, + "loss": 14.9451, + "step": 24450 + }, + { + "epoch": 13.20754716981132, + "grad_norm": 1.516534686088562, + "learning_rate": 0.00020445405405405403, + "loss": 14.9854, + "step": 24500 + }, + { + "epoch": 13.234501347708894, + "grad_norm": 1.5218985080718994, + "learning_rate": 0.00020364324324324324, + "loss": 15.0095, + "step": 24550 + }, + { + "epoch": 13.261455525606468, + "grad_norm": 1.6100311279296875, + "learning_rate": 0.0002028324324324324, + "loss": 14.9844, + "step": 24600 + }, + { + "epoch": 13.288409703504042, + "grad_norm": 1.5485996007919312, + "learning_rate": 0.0002020216216216216, + "loss": 15.007, + "step": 24650 + }, + { + "epoch": 13.315363881401618, + "grad_norm": 1.6040297746658325, + "learning_rate": 0.0002012108108108108, + "loss": 14.998, + "step": 24700 + }, + { + "epoch": 13.342318059299192, + "grad_norm": 1.5006548166275024, + "learning_rate": 0.0002004, + "loss": 15.019, + "step": 24750 + }, + { + "epoch": 13.369272237196766, + "grad_norm": 1.5284956693649292, + "learning_rate": 0.00019958918918918915, + "loss": 15.0376, + "step": 24800 + }, + { + "epoch": 13.39622641509434, + "grad_norm": 1.576499104499817, + "learning_rate": 0.00019877837837837837, + "loss": 15.0245, + "step": 24850 + }, + { + "epoch": 13.423180592991914, + "grad_norm": 1.5415410995483398, + "learning_rate": 0.00019796756756756755, + "loss": 15.0273, + "step": 24900 + }, + { + "epoch": 13.450134770889488, + "grad_norm": 1.5509371757507324, + "learning_rate": 0.00019715675675675674, + "loss": 15.04, + "step": 24950 + }, + { + "epoch": 13.477088948787062, + "grad_norm": 1.5401982069015503, + "learning_rate": 0.00019634594594594593, + "loss": 15.0772, + "step": 25000 + }, + { + "epoch": 13.477088948787062, + "eval_accuracy": 0.39549693550679194, + "eval_loss": 3.273712635040283, + "eval_runtime": 183.4928, + "eval_samples_per_second": 98.162, + "eval_steps_per_second": 6.136, + "step": 25000 + }, + { + "epoch": 13.504043126684635, + "grad_norm": 1.4463169574737549, + "learning_rate": 0.00019553513513513511, + "loss": 15.0517, + "step": 25050 + }, + { + "epoch": 13.530997304582211, + "grad_norm": 1.5428580045700073, + "learning_rate": 0.0001947243243243243, + "loss": 15.081, + "step": 25100 + }, + { + "epoch": 13.557951482479785, + "grad_norm": 1.4901021718978882, + "learning_rate": 0.00019391351351351352, + "loss": 15.0555, + "step": 25150 + }, + { + "epoch": 13.584905660377359, + "grad_norm": 1.4984480142593384, + "learning_rate": 0.00019310270270270268, + "loss": 15.0935, + "step": 25200 + }, + { + "epoch": 13.611859838274933, + "grad_norm": 1.5072040557861328, + "learning_rate": 0.00019229189189189186, + "loss": 15.0852, + "step": 25250 + }, + { + "epoch": 13.638814016172507, + "grad_norm": 1.5810104608535767, + "learning_rate": 0.00019148108108108108, + "loss": 15.0621, + "step": 25300 + }, + { + "epoch": 13.66576819407008, + "grad_norm": 1.5162920951843262, + "learning_rate": 0.00019067027027027026, + "loss": 15.0682, + "step": 25350 + }, + { + "epoch": 13.692722371967655, + "grad_norm": 1.6046135425567627, + "learning_rate": 0.00018985945945945942, + "loss": 15.1206, + "step": 25400 + }, + { + "epoch": 13.719676549865229, + "grad_norm": 1.5759787559509277, + "learning_rate": 0.00018904864864864864, + "loss": 15.0669, + "step": 25450 + }, + { + "epoch": 13.746630727762803, + "grad_norm": 1.5286028385162354, + "learning_rate": 0.00018823783783783783, + "loss": 15.0961, + "step": 25500 + }, + { + "epoch": 13.773584905660378, + "grad_norm": 1.506309151649475, + "learning_rate": 0.000187427027027027, + "loss": 15.141, + "step": 25550 + }, + { + "epoch": 13.800539083557952, + "grad_norm": 1.6738903522491455, + "learning_rate": 0.00018661621621621617, + "loss": 15.1184, + "step": 25600 + }, + { + "epoch": 13.827493261455526, + "grad_norm": 1.4504151344299316, + "learning_rate": 0.00018580540540540539, + "loss": 15.1011, + "step": 25650 + }, + { + "epoch": 13.8544474393531, + "grad_norm": 1.4721909761428833, + "learning_rate": 0.00018499459459459457, + "loss": 15.1252, + "step": 25700 + }, + { + "epoch": 13.881401617250674, + "grad_norm": 1.5899988412857056, + "learning_rate": 0.0001841837837837838, + "loss": 15.0654, + "step": 25750 + }, + { + "epoch": 13.908355795148248, + "grad_norm": 1.5403255224227905, + "learning_rate": 0.00018337297297297295, + "loss": 15.1186, + "step": 25800 + }, + { + "epoch": 13.935309973045822, + "grad_norm": 1.5205345153808594, + "learning_rate": 0.00018256216216216213, + "loss": 15.1377, + "step": 25850 + }, + { + "epoch": 13.962264150943396, + "grad_norm": 1.550255298614502, + "learning_rate": 0.00018175135135135135, + "loss": 15.1167, + "step": 25900 + }, + { + "epoch": 13.98921832884097, + "grad_norm": 1.5580016374588013, + "learning_rate": 0.00018094054054054054, + "loss": 15.1598, + "step": 25950 + }, + { + "epoch": 14.016172506738544, + "grad_norm": 1.486087441444397, + "learning_rate": 0.0001801297297297297, + "loss": 14.8798, + "step": 26000 + }, + { + "epoch": 14.016172506738544, + "eval_accuracy": 0.395819942608385, + "eval_loss": 3.2739694118499756, + "eval_runtime": 183.4607, + "eval_samples_per_second": 98.179, + "eval_steps_per_second": 6.138, + "step": 26000 + }, + { + "epoch": 14.04312668463612, + "grad_norm": 1.5213587284088135, + "learning_rate": 0.0001793189189189189, + "loss": 14.7999, + "step": 26050 + }, + { + "epoch": 14.070080862533693, + "grad_norm": 1.5840412378311157, + "learning_rate": 0.0001785081081081081, + "loss": 14.8034, + "step": 26100 + }, + { + "epoch": 14.097035040431267, + "grad_norm": 1.479959487915039, + "learning_rate": 0.00017769729729729728, + "loss": 14.8473, + "step": 26150 + }, + { + "epoch": 14.123989218328841, + "grad_norm": 1.521485447883606, + "learning_rate": 0.00017688648648648644, + "loss": 14.8314, + "step": 26200 + }, + { + "epoch": 14.150943396226415, + "grad_norm": 1.5491563081741333, + "learning_rate": 0.00017607567567567566, + "loss": 14.8476, + "step": 26250 + }, + { + "epoch": 14.177897574123989, + "grad_norm": 1.4828426837921143, + "learning_rate": 0.00017528108108108106, + "loss": 14.8784, + "step": 26300 + }, + { + "epoch": 14.204851752021563, + "grad_norm": 1.517014980316162, + "learning_rate": 0.00017447027027027025, + "loss": 14.8559, + "step": 26350 + }, + { + "epoch": 14.231805929919137, + "grad_norm": 1.5260605812072754, + "learning_rate": 0.00017365945945945944, + "loss": 14.8986, + "step": 26400 + }, + { + "epoch": 14.25876010781671, + "grad_norm": 1.5517903566360474, + "learning_rate": 0.00017284864864864865, + "loss": 14.9058, + "step": 26450 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 1.5963462591171265, + "learning_rate": 0.0001720378378378378, + "loss": 14.8757, + "step": 26500 + }, + { + "epoch": 14.31266846361186, + "grad_norm": 1.4833232164382935, + "learning_rate": 0.000171227027027027, + "loss": 14.9005, + "step": 26550 + }, + { + "epoch": 14.339622641509434, + "grad_norm": 1.5324000120162964, + "learning_rate": 0.0001704162162162162, + "loss": 14.8942, + "step": 26600 + }, + { + "epoch": 14.366576819407008, + "grad_norm": 1.5217413902282715, + "learning_rate": 0.0001696054054054054, + "loss": 14.9365, + "step": 26650 + }, + { + "epoch": 14.393530997304582, + "grad_norm": 1.6108272075653076, + "learning_rate": 0.00016879459459459456, + "loss": 14.9405, + "step": 26700 + }, + { + "epoch": 14.420485175202156, + "grad_norm": 1.646351933479309, + "learning_rate": 0.00016798378378378377, + "loss": 14.9517, + "step": 26750 + }, + { + "epoch": 14.44743935309973, + "grad_norm": 1.5736359357833862, + "learning_rate": 0.00016717297297297296, + "loss": 14.9074, + "step": 26800 + }, + { + "epoch": 14.474393530997304, + "grad_norm": 1.5562492609024048, + "learning_rate": 0.00016636216216216215, + "loss": 14.8835, + "step": 26850 + }, + { + "epoch": 14.501347708894878, + "grad_norm": 1.602283239364624, + "learning_rate": 0.00016555135135135133, + "loss": 14.8741, + "step": 26900 + }, + { + "epoch": 14.528301886792454, + "grad_norm": 1.5761126279830933, + "learning_rate": 0.00016474054054054052, + "loss": 14.9874, + "step": 26950 + }, + { + "epoch": 14.555256064690028, + "grad_norm": 1.5599428415298462, + "learning_rate": 0.0001639297297297297, + "loss": 14.9491, + "step": 27000 + }, + { + "epoch": 14.555256064690028, + "eval_accuracy": 0.3961279564439102, + "eval_loss": 3.270308256149292, + "eval_runtime": 183.109, + "eval_samples_per_second": 98.368, + "eval_steps_per_second": 6.149, + "step": 27000 + }, + { + "epoch": 14.582210242587601, + "grad_norm": 1.5091289281845093, + "learning_rate": 0.00016311891891891892, + "loss": 14.9643, + "step": 27050 + }, + { + "epoch": 14.609164420485175, + "grad_norm": 1.554219126701355, + "learning_rate": 0.00016232432432432433, + "loss": 14.947, + "step": 27100 + }, + { + "epoch": 14.63611859838275, + "grad_norm": 1.587387204170227, + "learning_rate": 0.00016151351351351351, + "loss": 14.9556, + "step": 27150 + }, + { + "epoch": 14.663072776280323, + "grad_norm": 1.4709123373031616, + "learning_rate": 0.00016070270270270267, + "loss": 14.9691, + "step": 27200 + }, + { + "epoch": 14.690026954177897, + "grad_norm": 1.5789484977722168, + "learning_rate": 0.0001598918918918919, + "loss": 15.0007, + "step": 27250 + }, + { + "epoch": 14.716981132075471, + "grad_norm": 1.5269670486450195, + "learning_rate": 0.00015908108108108108, + "loss": 14.9889, + "step": 27300 + }, + { + "epoch": 14.743935309973045, + "grad_norm": 1.4882417917251587, + "learning_rate": 0.00015827027027027026, + "loss": 14.9787, + "step": 27350 + }, + { + "epoch": 14.77088948787062, + "grad_norm": 1.5664522647857666, + "learning_rate": 0.00015745945945945945, + "loss": 14.9789, + "step": 27400 + }, + { + "epoch": 14.797843665768195, + "grad_norm": 1.5020064115524292, + "learning_rate": 0.00015664864864864864, + "loss": 14.9879, + "step": 27450 + }, + { + "epoch": 14.824797843665769, + "grad_norm": 1.5945509672164917, + "learning_rate": 0.00015583783783783782, + "loss": 15.0118, + "step": 27500 + }, + { + "epoch": 14.851752021563343, + "grad_norm": 1.5526251792907715, + "learning_rate": 0.00015502702702702704, + "loss": 15.0042, + "step": 27550 + }, + { + "epoch": 14.878706199460916, + "grad_norm": 1.5196871757507324, + "learning_rate": 0.0001542162162162162, + "loss": 15.0268, + "step": 27600 + }, + { + "epoch": 14.90566037735849, + "grad_norm": 1.4666787385940552, + "learning_rate": 0.00015340540540540538, + "loss": 15.0046, + "step": 27650 + }, + { + "epoch": 14.932614555256064, + "grad_norm": 1.455553412437439, + "learning_rate": 0.0001525945945945946, + "loss": 14.9991, + "step": 27700 + }, + { + "epoch": 14.959568733153638, + "grad_norm": 1.4995278120040894, + "learning_rate": 0.00015178378378378379, + "loss": 15.0387, + "step": 27750 + }, + { + "epoch": 14.986522911051212, + "grad_norm": 1.6195017099380493, + "learning_rate": 0.00015097297297297295, + "loss": 15.0153, + "step": 27800 + }, + { + "epoch": 15.013477088948788, + "grad_norm": 1.591896653175354, + "learning_rate": 0.00015016216216216216, + "loss": 14.8525, + "step": 27850 + }, + { + "epoch": 15.040431266846362, + "grad_norm": 1.5427701473236084, + "learning_rate": 0.00014935135135135135, + "loss": 14.6664, + "step": 27900 + }, + { + "epoch": 15.067385444743936, + "grad_norm": 1.6039804220199585, + "learning_rate": 0.00014854054054054053, + "loss": 14.7522, + "step": 27950 + }, + { + "epoch": 15.09433962264151, + "grad_norm": 1.5312916040420532, + "learning_rate": 0.00014772972972972972, + "loss": 14.7585, + "step": 28000 + }, + { + "epoch": 15.09433962264151, + "eval_accuracy": 0.3963390572842719, + "eval_loss": 3.2742018699645996, + "eval_runtime": 183.4262, + "eval_samples_per_second": 98.198, + "eval_steps_per_second": 6.139, + "step": 28000 + }, + { + "epoch": 15.121293800539084, + "grad_norm": 1.5751817226409912, + "learning_rate": 0.0001469189189189189, + "loss": 14.7446, + "step": 28050 + }, + { + "epoch": 15.148247978436657, + "grad_norm": 1.5158833265304565, + "learning_rate": 0.0001461081081081081, + "loss": 14.7897, + "step": 28100 + }, + { + "epoch": 15.175202156334231, + "grad_norm": 1.548401951789856, + "learning_rate": 0.00014529729729729728, + "loss": 14.7723, + "step": 28150 + }, + { + "epoch": 15.202156334231805, + "grad_norm": 1.5945206880569458, + "learning_rate": 0.00014448648648648647, + "loss": 14.7545, + "step": 28200 + }, + { + "epoch": 15.22911051212938, + "grad_norm": 1.6147938966751099, + "learning_rate": 0.00014367567567567566, + "loss": 14.7821, + "step": 28250 + }, + { + "epoch": 15.256064690026955, + "grad_norm": 1.600216269493103, + "learning_rate": 0.00014286486486486487, + "loss": 14.8026, + "step": 28300 + }, + { + "epoch": 15.283018867924529, + "grad_norm": 1.5041214227676392, + "learning_rate": 0.00014205405405405403, + "loss": 14.7822, + "step": 28350 + }, + { + "epoch": 15.309973045822103, + "grad_norm": 1.5731170177459717, + "learning_rate": 0.00014124324324324325, + "loss": 14.8014, + "step": 28400 + }, + { + "epoch": 15.336927223719677, + "grad_norm": 1.62449312210083, + "learning_rate": 0.0001404324324324324, + "loss": 14.8132, + "step": 28450 + }, + { + "epoch": 15.36388140161725, + "grad_norm": 1.5179616212844849, + "learning_rate": 0.00013962162162162162, + "loss": 14.8027, + "step": 28500 + }, + { + "epoch": 15.390835579514825, + "grad_norm": 1.5233399868011475, + "learning_rate": 0.0001388108108108108, + "loss": 14.8032, + "step": 28550 + }, + { + "epoch": 15.417789757412399, + "grad_norm": 1.5631145238876343, + "learning_rate": 0.000138, + "loss": 14.8132, + "step": 28600 + }, + { + "epoch": 15.444743935309972, + "grad_norm": 1.4974826574325562, + "learning_rate": 0.00013718918918918918, + "loss": 14.8246, + "step": 28650 + }, + { + "epoch": 15.471698113207546, + "grad_norm": 1.56992769241333, + "learning_rate": 0.00013637837837837837, + "loss": 14.8093, + "step": 28700 + }, + { + "epoch": 15.498652291105122, + "grad_norm": 1.5067832469940186, + "learning_rate": 0.00013556756756756755, + "loss": 14.8734, + "step": 28750 + }, + { + "epoch": 15.525606469002696, + "grad_norm": 1.5053755044937134, + "learning_rate": 0.00013475675675675674, + "loss": 14.8705, + "step": 28800 + }, + { + "epoch": 15.55256064690027, + "grad_norm": 1.4808902740478516, + "learning_rate": 0.00013394594594594593, + "loss": 14.8348, + "step": 28850 + }, + { + "epoch": 15.579514824797844, + "grad_norm": 1.5790108442306519, + "learning_rate": 0.00013313513513513512, + "loss": 14.8726, + "step": 28900 + }, + { + "epoch": 15.606469002695418, + "grad_norm": 1.5821738243103027, + "learning_rate": 0.0001323243243243243, + "loss": 14.8672, + "step": 28950 + }, + { + "epoch": 15.633423180592992, + "grad_norm": 1.5624059438705444, + "learning_rate": 0.00013151351351351352, + "loss": 14.8196, + "step": 29000 + }, + { + "epoch": 15.633423180592992, + "eval_accuracy": 0.3969098878634074, + "eval_loss": 3.2676687240600586, + "eval_runtime": 182.4618, + "eval_samples_per_second": 98.717, + "eval_steps_per_second": 6.171, + "step": 29000 + }, + { + "epoch": 15.660377358490566, + "grad_norm": 1.6494839191436768, + "learning_rate": 0.00013070270270270268, + "loss": 14.8677, + "step": 29050 + }, + { + "epoch": 15.68733153638814, + "grad_norm": 1.5024958848953247, + "learning_rate": 0.0001298918918918919, + "loss": 14.884, + "step": 29100 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 1.5223402976989746, + "learning_rate": 0.00012908108108108108, + "loss": 14.8573, + "step": 29150 + }, + { + "epoch": 15.74123989218329, + "grad_norm": 1.4910407066345215, + "learning_rate": 0.00012827027027027027, + "loss": 14.8876, + "step": 29200 + }, + { + "epoch": 15.768194070080863, + "grad_norm": 1.6085453033447266, + "learning_rate": 0.00012745945945945945, + "loss": 14.8961, + "step": 29250 + }, + { + "epoch": 15.795148247978437, + "grad_norm": 1.5478426218032837, + "learning_rate": 0.00012664864864864864, + "loss": 14.8845, + "step": 29300 + }, + { + "epoch": 15.822102425876011, + "grad_norm": 1.5788655281066895, + "learning_rate": 0.00012583783783783783, + "loss": 14.9066, + "step": 29350 + }, + { + "epoch": 15.849056603773585, + "grad_norm": 1.5746803283691406, + "learning_rate": 0.000125027027027027, + "loss": 14.8779, + "step": 29400 + }, + { + "epoch": 15.876010781671159, + "grad_norm": 1.4738293886184692, + "learning_rate": 0.0001242162162162162, + "loss": 14.9064, + "step": 29450 + }, + { + "epoch": 15.902964959568733, + "grad_norm": 1.6745882034301758, + "learning_rate": 0.0001234054054054054, + "loss": 14.9054, + "step": 29500 + }, + { + "epoch": 15.929919137466307, + "grad_norm": 1.5883915424346924, + "learning_rate": 0.00012259459459459457, + "loss": 14.9191, + "step": 29550 + }, + { + "epoch": 15.95687331536388, + "grad_norm": 1.6155942678451538, + "learning_rate": 0.00012178378378378378, + "loss": 14.8815, + "step": 29600 + }, + { + "epoch": 15.983827493261456, + "grad_norm": 1.5333513021469116, + "learning_rate": 0.00012098918918918918, + "loss": 14.9168, + "step": 29650 + }, + { + "epoch": 16.01078167115903, + "grad_norm": 1.5406397581100464, + "learning_rate": 0.00012017837837837838, + "loss": 14.7804, + "step": 29700 + }, + { + "epoch": 16.037735849056602, + "grad_norm": 1.56869637966156, + "learning_rate": 0.00011936756756756755, + "loss": 14.621, + "step": 29750 + }, + { + "epoch": 16.064690026954178, + "grad_norm": 1.6393052339553833, + "learning_rate": 0.00011855675675675675, + "loss": 14.5929, + "step": 29800 + }, + { + "epoch": 16.09164420485175, + "grad_norm": 1.5278005599975586, + "learning_rate": 0.00011774594594594594, + "loss": 14.6855, + "step": 29850 + }, + { + "epoch": 16.118598382749326, + "grad_norm": 1.549673080444336, + "learning_rate": 0.00011693513513513513, + "loss": 14.6718, + "step": 29900 + }, + { + "epoch": 16.1455525606469, + "grad_norm": 1.5019139051437378, + "learning_rate": 0.00011612432432432432, + "loss": 14.6548, + "step": 29950 + }, + { + "epoch": 16.172506738544474, + "grad_norm": 1.6385548114776611, + "learning_rate": 0.00011531351351351352, + "loss": 14.6783, + "step": 30000 + }, + { + "epoch": 16.172506738544474, + "eval_accuracy": 0.39686979717370413, + "eval_loss": 3.2710256576538086, + "eval_runtime": 182.5158, + "eval_samples_per_second": 98.687, + "eval_steps_per_second": 6.169, + "step": 30000 + }, + { + "epoch": 16.19946091644205, + "grad_norm": 1.5987632274627686, + "learning_rate": 0.00011450270270270269, + "loss": 14.6573, + "step": 30050 + }, + { + "epoch": 16.22641509433962, + "grad_norm": 1.6917481422424316, + "learning_rate": 0.00011369189189189189, + "loss": 14.664, + "step": 30100 + }, + { + "epoch": 16.253369272237197, + "grad_norm": 1.5871925354003906, + "learning_rate": 0.00011288108108108108, + "loss": 14.6668, + "step": 30150 + }, + { + "epoch": 16.28032345013477, + "grad_norm": 1.5470457077026367, + "learning_rate": 0.00011207027027027026, + "loss": 14.7022, + "step": 30200 + }, + { + "epoch": 16.307277628032345, + "grad_norm": 1.6101864576339722, + "learning_rate": 0.00011125945945945945, + "loss": 14.702, + "step": 30250 + }, + { + "epoch": 16.334231805929917, + "grad_norm": 1.5155388116836548, + "learning_rate": 0.00011044864864864865, + "loss": 14.7447, + "step": 30300 + }, + { + "epoch": 16.361185983827493, + "grad_norm": 1.5606392621994019, + "learning_rate": 0.00010963783783783783, + "loss": 14.7262, + "step": 30350 + }, + { + "epoch": 16.38814016172507, + "grad_norm": 1.6367034912109375, + "learning_rate": 0.00010882702702702703, + "loss": 14.7292, + "step": 30400 + }, + { + "epoch": 16.41509433962264, + "grad_norm": 1.5408399105072021, + "learning_rate": 0.0001080162162162162, + "loss": 14.7541, + "step": 30450 + }, + { + "epoch": 16.442048517520217, + "grad_norm": 1.5572620630264282, + "learning_rate": 0.0001072054054054054, + "loss": 14.7008, + "step": 30500 + }, + { + "epoch": 16.46900269541779, + "grad_norm": 1.5497262477874756, + "learning_rate": 0.00010639459459459459, + "loss": 14.7683, + "step": 30550 + }, + { + "epoch": 16.495956873315365, + "grad_norm": 1.5502400398254395, + "learning_rate": 0.00010558378378378379, + "loss": 14.7466, + "step": 30600 + }, + { + "epoch": 16.522911051212937, + "grad_norm": 1.6329463720321655, + "learning_rate": 0.00010477297297297296, + "loss": 14.7568, + "step": 30650 + }, + { + "epoch": 16.549865229110512, + "grad_norm": 1.5791313648223877, + "learning_rate": 0.00010396216216216216, + "loss": 14.7672, + "step": 30700 + }, + { + "epoch": 16.576819407008085, + "grad_norm": 1.6621166467666626, + "learning_rate": 0.00010315135135135134, + "loss": 14.7865, + "step": 30750 + }, + { + "epoch": 16.60377358490566, + "grad_norm": 1.5358580350875854, + "learning_rate": 0.00010234054054054054, + "loss": 14.7586, + "step": 30800 + }, + { + "epoch": 16.630727762803236, + "grad_norm": 1.5281518697738647, + "learning_rate": 0.00010152972972972972, + "loss": 14.7566, + "step": 30850 + }, + { + "epoch": 16.657681940700808, + "grad_norm": 1.5487334728240967, + "learning_rate": 0.00010071891891891891, + "loss": 14.8048, + "step": 30900 + }, + { + "epoch": 16.684636118598384, + "grad_norm": 1.5345453023910522, + "learning_rate": 9.99081081081081e-05, + "loss": 14.7579, + "step": 30950 + }, + { + "epoch": 16.711590296495956, + "grad_norm": 1.5981355905532837, + "learning_rate": 9.90972972972973e-05, + "loss": 14.7716, + "step": 31000 + }, + { + "epoch": 16.711590296495956, + "eval_accuracy": 0.39735642643977725, + "eval_loss": 3.2666032314300537, + "eval_runtime": 182.2937, + "eval_samples_per_second": 98.808, + "eval_steps_per_second": 6.177, + "step": 31000 + }, + { + "epoch": 16.73854447439353, + "grad_norm": 1.5877584218978882, + "learning_rate": 9.828648648648647e-05, + "loss": 14.7698, + "step": 31050 + }, + { + "epoch": 16.765498652291104, + "grad_norm": 1.5693833827972412, + "learning_rate": 9.747567567567567e-05, + "loss": 14.8035, + "step": 31100 + }, + { + "epoch": 16.79245283018868, + "grad_norm": 1.564299464225769, + "learning_rate": 9.666486486486486e-05, + "loss": 14.8016, + "step": 31150 + }, + { + "epoch": 16.81940700808625, + "grad_norm": 1.6126294136047363, + "learning_rate": 9.585405405405405e-05, + "loss": 14.7674, + "step": 31200 + }, + { + "epoch": 16.846361185983827, + "grad_norm": 1.5660778284072876, + "learning_rate": 9.504324324324323e-05, + "loss": 14.7822, + "step": 31250 + }, + { + "epoch": 16.873315363881403, + "grad_norm": 1.559880018234253, + "learning_rate": 9.423243243243243e-05, + "loss": 14.7547, + "step": 31300 + }, + { + "epoch": 16.900269541778975, + "grad_norm": 1.5699021816253662, + "learning_rate": 9.342162162162161e-05, + "loss": 14.7827, + "step": 31350 + }, + { + "epoch": 16.92722371967655, + "grad_norm": 1.5942411422729492, + "learning_rate": 9.261081081081081e-05, + "loss": 14.8039, + "step": 31400 + }, + { + "epoch": 16.954177897574123, + "grad_norm": 1.544398307800293, + "learning_rate": 9.18e-05, + "loss": 14.7749, + "step": 31450 + }, + { + "epoch": 16.9811320754717, + "grad_norm": 1.6123785972595215, + "learning_rate": 9.098918918918918e-05, + "loss": 14.7925, + "step": 31500 + }, + { + "epoch": 17.00808625336927, + "grad_norm": 1.5455232858657837, + "learning_rate": 9.017837837837837e-05, + "loss": 14.689, + "step": 31550 + }, + { + "epoch": 17.035040431266847, + "grad_norm": 1.584010362625122, + "learning_rate": 8.936756756756757e-05, + "loss": 14.5752, + "step": 31600 + }, + { + "epoch": 17.06199460916442, + "grad_norm": 1.6022430658340454, + "learning_rate": 8.855675675675674e-05, + "loss": 14.5822, + "step": 31650 + }, + { + "epoch": 17.088948787061994, + "grad_norm": 1.4971483945846558, + "learning_rate": 8.774594594594594e-05, + "loss": 14.5793, + "step": 31700 + }, + { + "epoch": 17.11590296495957, + "grad_norm": 1.5255413055419922, + "learning_rate": 8.693513513513513e-05, + "loss": 14.5842, + "step": 31750 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 1.5918787717819214, + "learning_rate": 8.612432432432432e-05, + "loss": 14.5711, + "step": 31800 + }, + { + "epoch": 17.169811320754718, + "grad_norm": 1.5889939069747925, + "learning_rate": 8.53135135135135e-05, + "loss": 14.6055, + "step": 31850 + }, + { + "epoch": 17.19676549865229, + "grad_norm": 1.6539576053619385, + "learning_rate": 8.45027027027027e-05, + "loss": 14.6048, + "step": 31900 + }, + { + "epoch": 17.223719676549866, + "grad_norm": 1.5412706136703491, + "learning_rate": 8.369189189189188e-05, + "loss": 14.5751, + "step": 31950 + }, + { + "epoch": 17.250673854447438, + "grad_norm": 1.5189770460128784, + "learning_rate": 8.288108108108108e-05, + "loss": 14.6427, + "step": 32000 + }, + { + "epoch": 17.250673854447438, + "eval_accuracy": 0.3972496265807574, + "eval_loss": 3.2697131633758545, + "eval_runtime": 179.3647, + "eval_samples_per_second": 100.421, + "eval_steps_per_second": 6.278, + "step": 32000 + }, + { + "epoch": 17.277628032345014, + "grad_norm": 1.5873239040374756, + "learning_rate": 8.207027027027027e-05, + "loss": 14.6094, + "step": 32050 + }, + { + "epoch": 17.304582210242586, + "grad_norm": 1.645750641822815, + "learning_rate": 8.125945945945945e-05, + "loss": 14.6167, + "step": 32100 + }, + { + "epoch": 17.33153638814016, + "grad_norm": 1.5786947011947632, + "learning_rate": 8.044864864864864e-05, + "loss": 14.6306, + "step": 32150 + }, + { + "epoch": 17.358490566037737, + "grad_norm": 1.5392863750457764, + "learning_rate": 7.963783783783784e-05, + "loss": 14.622, + "step": 32200 + }, + { + "epoch": 17.38544474393531, + "grad_norm": 1.5654475688934326, + "learning_rate": 7.882702702702702e-05, + "loss": 14.5784, + "step": 32250 + }, + { + "epoch": 17.412398921832885, + "grad_norm": 1.5552312135696411, + "learning_rate": 7.801621621621622e-05, + "loss": 14.6352, + "step": 32300 + }, + { + "epoch": 17.439353099730457, + "grad_norm": 1.5792616605758667, + "learning_rate": 7.722162162162162e-05, + "loss": 14.6571, + "step": 32350 + }, + { + "epoch": 17.466307277628033, + "grad_norm": 1.5422114133834839, + "learning_rate": 7.64108108108108e-05, + "loss": 14.6243, + "step": 32400 + }, + { + "epoch": 17.493261455525605, + "grad_norm": 1.5832560062408447, + "learning_rate": 7.56e-05, + "loss": 14.6572, + "step": 32450 + }, + { + "epoch": 17.52021563342318, + "grad_norm": 1.5866824388504028, + "learning_rate": 7.478918918918918e-05, + "loss": 14.6522, + "step": 32500 + }, + { + "epoch": 17.547169811320753, + "grad_norm": 1.533219337463379, + "learning_rate": 7.397837837837837e-05, + "loss": 14.6272, + "step": 32550 + }, + { + "epoch": 17.57412398921833, + "grad_norm": 1.584763526916504, + "learning_rate": 7.316756756756756e-05, + "loss": 14.6932, + "step": 32600 + }, + { + "epoch": 17.601078167115904, + "grad_norm": 1.5591111183166504, + "learning_rate": 7.235675675675676e-05, + "loss": 14.6659, + "step": 32650 + }, + { + "epoch": 17.628032345013477, + "grad_norm": 1.5653057098388672, + "learning_rate": 7.154594594594594e-05, + "loss": 14.6659, + "step": 32700 + }, + { + "epoch": 17.654986522911052, + "grad_norm": 1.4933594465255737, + "learning_rate": 7.073513513513513e-05, + "loss": 14.6884, + "step": 32750 + }, + { + "epoch": 17.681940700808624, + "grad_norm": 1.5485045909881592, + "learning_rate": 6.992432432432432e-05, + "loss": 14.6379, + "step": 32800 + }, + { + "epoch": 17.7088948787062, + "grad_norm": 1.5517247915267944, + "learning_rate": 6.91135135135135e-05, + "loss": 14.6793, + "step": 32850 + }, + { + "epoch": 17.735849056603772, + "grad_norm": 1.654420256614685, + "learning_rate": 6.830270270270269e-05, + "loss": 14.6845, + "step": 32900 + }, + { + "epoch": 17.762803234501348, + "grad_norm": 1.5648823976516724, + "learning_rate": 6.749189189189189e-05, + "loss": 14.6606, + "step": 32950 + }, + { + "epoch": 17.78975741239892, + "grad_norm": 1.5435584783554077, + "learning_rate": 6.668108108108108e-05, + "loss": 14.6409, + "step": 33000 + }, + { + "epoch": 17.78975741239892, + "eval_accuracy": 0.39795865596017094, + "eval_loss": 3.263080358505249, + "eval_runtime": 180.8635, + "eval_samples_per_second": 99.589, + "eval_steps_per_second": 6.226, + "step": 33000 + }, + { + "epoch": 17.816711590296496, + "grad_norm": 1.5425926446914673, + "learning_rate": 6.587027027027027e-05, + "loss": 14.7093, + "step": 33050 + }, + { + "epoch": 17.84366576819407, + "grad_norm": 1.5393097400665283, + "learning_rate": 6.505945945945945e-05, + "loss": 14.6421, + "step": 33100 + }, + { + "epoch": 17.870619946091644, + "grad_norm": 1.5586802959442139, + "learning_rate": 6.424864864864864e-05, + "loss": 14.6901, + "step": 33150 + }, + { + "epoch": 17.89757412398922, + "grad_norm": 1.6345175504684448, + "learning_rate": 6.343783783783783e-05, + "loss": 14.6739, + "step": 33200 + }, + { + "epoch": 17.92452830188679, + "grad_norm": 1.6852161884307861, + "learning_rate": 6.262702702702703e-05, + "loss": 14.7087, + "step": 33250 + }, + { + "epoch": 17.951482479784367, + "grad_norm": 1.5203602313995361, + "learning_rate": 6.181621621621622e-05, + "loss": 14.6883, + "step": 33300 + }, + { + "epoch": 17.97843665768194, + "grad_norm": 1.5950666666030884, + "learning_rate": 6.10054054054054e-05, + "loss": 14.6901, + "step": 33350 + }, + { + "epoch": 18.005390835579515, + "grad_norm": 1.5396767854690552, + "learning_rate": 6.019459459459459e-05, + "loss": 14.6374, + "step": 33400 + }, + { + "epoch": 18.032345013477087, + "grad_norm": 1.512335181236267, + "learning_rate": 5.9383783783783776e-05, + "loss": 14.4547, + "step": 33450 + }, + { + "epoch": 18.059299191374663, + "grad_norm": 1.6069568395614624, + "learning_rate": 5.857297297297297e-05, + "loss": 14.5278, + "step": 33500 + }, + { + "epoch": 18.08625336927224, + "grad_norm": 1.5221787691116333, + "learning_rate": 5.776216216216216e-05, + "loss": 14.5299, + "step": 33550 + }, + { + "epoch": 18.11320754716981, + "grad_norm": 1.5961333513259888, + "learning_rate": 5.6951351351351344e-05, + "loss": 14.5246, + "step": 33600 + }, + { + "epoch": 18.140161725067387, + "grad_norm": 1.5239356756210327, + "learning_rate": 5.614054054054054e-05, + "loss": 14.5502, + "step": 33650 + }, + { + "epoch": 18.16711590296496, + "grad_norm": 1.6545807123184204, + "learning_rate": 5.5329729729729725e-05, + "loss": 14.5051, + "step": 33700 + }, + { + "epoch": 18.194070080862534, + "grad_norm": 1.5053555965423584, + "learning_rate": 5.451891891891891e-05, + "loss": 14.5202, + "step": 33750 + }, + { + "epoch": 18.221024258760107, + "grad_norm": 1.5080546140670776, + "learning_rate": 5.37081081081081e-05, + "loss": 14.5393, + "step": 33800 + }, + { + "epoch": 18.247978436657682, + "grad_norm": 1.6003968715667725, + "learning_rate": 5.289729729729729e-05, + "loss": 14.5344, + "step": 33850 + }, + { + "epoch": 18.274932614555254, + "grad_norm": 1.5318918228149414, + "learning_rate": 5.208648648648648e-05, + "loss": 14.5318, + "step": 33900 + }, + { + "epoch": 18.30188679245283, + "grad_norm": 1.556462049484253, + "learning_rate": 5.127567567567567e-05, + "loss": 14.5708, + "step": 33950 + }, + { + "epoch": 18.328840970350406, + "grad_norm": 1.5578322410583496, + "learning_rate": 5.046486486486486e-05, + "loss": 14.5329, + "step": 34000 + }, + { + "epoch": 18.328840970350406, + "eval_accuracy": 0.39798331879638404, + "eval_loss": 3.2663168907165527, + "eval_runtime": 181.0745, + "eval_samples_per_second": 99.473, + "eval_steps_per_second": 6.218, + "step": 34000 + }, + { + "epoch": 18.355795148247978, + "grad_norm": 1.5834003686904907, + "learning_rate": 4.965405405405405e-05, + "loss": 14.5322, + "step": 34050 + }, + { + "epoch": 18.382749326145554, + "grad_norm": 1.5400707721710205, + "learning_rate": 4.8843243243243235e-05, + "loss": 14.56, + "step": 34100 + }, + { + "epoch": 18.409703504043126, + "grad_norm": 1.5586832761764526, + "learning_rate": 4.803243243243243e-05, + "loss": 14.5639, + "step": 34150 + }, + { + "epoch": 18.4366576819407, + "grad_norm": 1.521950364112854, + "learning_rate": 4.7221621621621616e-05, + "loss": 14.5583, + "step": 34200 + }, + { + "epoch": 18.463611859838274, + "grad_norm": 1.5423915386199951, + "learning_rate": 4.64108108108108e-05, + "loss": 14.5764, + "step": 34250 + }, + { + "epoch": 18.49056603773585, + "grad_norm": 1.524101734161377, + "learning_rate": 4.56e-05, + "loss": 14.511, + "step": 34300 + }, + { + "epoch": 18.51752021563342, + "grad_norm": 1.5166869163513184, + "learning_rate": 4.4789189189189184e-05, + "loss": 14.5173, + "step": 34350 + }, + { + "epoch": 18.544474393530997, + "grad_norm": 1.61070716381073, + "learning_rate": 4.397837837837837e-05, + "loss": 14.5917, + "step": 34400 + }, + { + "epoch": 18.571428571428573, + "grad_norm": 1.5960681438446045, + "learning_rate": 4.3167567567567565e-05, + "loss": 14.5383, + "step": 34450 + }, + { + "epoch": 18.598382749326145, + "grad_norm": 1.4851629734039307, + "learning_rate": 4.235675675675675e-05, + "loss": 14.5736, + "step": 34500 + }, + { + "epoch": 18.62533692722372, + "grad_norm": 1.5458905696868896, + "learning_rate": 4.154594594594594e-05, + "loss": 14.563, + "step": 34550 + }, + { + "epoch": 18.652291105121293, + "grad_norm": 1.500968098640442, + "learning_rate": 4.073513513513513e-05, + "loss": 14.5723, + "step": 34600 + }, + { + "epoch": 18.67924528301887, + "grad_norm": 1.5670329332351685, + "learning_rate": 3.992432432432432e-05, + "loss": 14.5681, + "step": 34650 + }, + { + "epoch": 18.70619946091644, + "grad_norm": 1.5421439409255981, + "learning_rate": 3.911351351351351e-05, + "loss": 14.5949, + "step": 34700 + }, + { + "epoch": 18.733153638814017, + "grad_norm": 1.541306734085083, + "learning_rate": 3.83027027027027e-05, + "loss": 14.5743, + "step": 34750 + }, + { + "epoch": 18.76010781671159, + "grad_norm": 1.5344128608703613, + "learning_rate": 3.749189189189189e-05, + "loss": 14.5806, + "step": 34800 + }, + { + "epoch": 18.787061994609164, + "grad_norm": 1.51657235622406, + "learning_rate": 3.6681081081081075e-05, + "loss": 14.5895, + "step": 34850 + }, + { + "epoch": 18.81401617250674, + "grad_norm": 1.5216045379638672, + "learning_rate": 3.587027027027026e-05, + "loss": 14.5862, + "step": 34900 + }, + { + "epoch": 18.840970350404312, + "grad_norm": 1.561399221420288, + "learning_rate": 3.5059459459459456e-05, + "loss": 14.588, + "step": 34950 + }, + { + "epoch": 18.867924528301888, + "grad_norm": 1.6107219457626343, + "learning_rate": 3.424864864864864e-05, + "loss": 14.5702, + "step": 35000 + }, + { + "epoch": 18.867924528301888, + "eval_accuracy": 0.3983890061550617, + "eval_loss": 3.261690855026245, + "eval_runtime": 180.7732, + "eval_samples_per_second": 99.639, + "eval_steps_per_second": 6.229, + "step": 35000 + }, + { + "epoch": 18.89487870619946, + "grad_norm": 1.5193698406219482, + "learning_rate": 3.343783783783783e-05, + "loss": 14.5882, + "step": 35050 + }, + { + "epoch": 18.921832884097036, + "grad_norm": 1.5151571035385132, + "learning_rate": 3.2627027027027024e-05, + "loss": 14.5474, + "step": 35100 + }, + { + "epoch": 18.948787061994608, + "grad_norm": 1.5188689231872559, + "learning_rate": 3.181621621621621e-05, + "loss": 14.5617, + "step": 35150 + }, + { + "epoch": 18.975741239892184, + "grad_norm": 1.5117980241775513, + "learning_rate": 3.10054054054054e-05, + "loss": 14.5587, + "step": 35200 + }, + { + "epoch": 19.002695417789756, + "grad_norm": 1.5636892318725586, + "learning_rate": 3.019459459459459e-05, + "loss": 14.5439, + "step": 35250 + }, + { + "epoch": 19.02964959568733, + "grad_norm": 1.5492565631866455, + "learning_rate": 2.938378378378378e-05, + "loss": 14.4387, + "step": 35300 + }, + { + "epoch": 19.056603773584907, + "grad_norm": 1.5516005754470825, + "learning_rate": 2.857297297297297e-05, + "loss": 14.4445, + "step": 35350 + }, + { + "epoch": 19.08355795148248, + "grad_norm": 1.533972978591919, + "learning_rate": 2.7762162162162163e-05, + "loss": 14.4359, + "step": 35400 + }, + { + "epoch": 19.110512129380055, + "grad_norm": 1.507752537727356, + "learning_rate": 2.695135135135135e-05, + "loss": 14.4215, + "step": 35450 + }, + { + "epoch": 19.137466307277627, + "grad_norm": 1.4865872859954834, + "learning_rate": 2.614054054054054e-05, + "loss": 14.4713, + "step": 35500 + }, + { + "epoch": 19.164420485175203, + "grad_norm": 1.5423104763031006, + "learning_rate": 2.5329729729729728e-05, + "loss": 14.4748, + "step": 35550 + }, + { + "epoch": 19.191374663072775, + "grad_norm": 1.5166065692901611, + "learning_rate": 2.451891891891892e-05, + "loss": 14.5064, + "step": 35600 + }, + { + "epoch": 19.21832884097035, + "grad_norm": 1.5488274097442627, + "learning_rate": 2.370810810810811e-05, + "loss": 14.497, + "step": 35650 + }, + { + "epoch": 19.245283018867923, + "grad_norm": 1.519723892211914, + "learning_rate": 2.2897297297297296e-05, + "loss": 14.4739, + "step": 35700 + }, + { + "epoch": 19.2722371967655, + "grad_norm": 1.5654048919677734, + "learning_rate": 2.2086486486486486e-05, + "loss": 14.4639, + "step": 35750 + }, + { + "epoch": 19.299191374663074, + "grad_norm": 1.5737123489379883, + "learning_rate": 2.1275675675675677e-05, + "loss": 14.4619, + "step": 35800 + }, + { + "epoch": 19.326145552560646, + "grad_norm": 1.5194604396820068, + "learning_rate": 2.0464864864864864e-05, + "loss": 14.4379, + "step": 35850 + }, + { + "epoch": 19.353099730458222, + "grad_norm": 1.4980615377426147, + "learning_rate": 1.9654054054054054e-05, + "loss": 14.4757, + "step": 35900 + }, + { + "epoch": 19.380053908355794, + "grad_norm": 1.5187228918075562, + "learning_rate": 1.8843243243243245e-05, + "loss": 14.4836, + "step": 35950 + }, + { + "epoch": 19.40700808625337, + "grad_norm": 1.5278230905532837, + "learning_rate": 1.8032432432432432e-05, + "loss": 14.4615, + "step": 36000 + }, + { + "epoch": 19.40700808625337, + "eval_accuracy": 0.39837216589244917, + "eval_loss": 3.264392614364624, + "eval_runtime": 180.6409, + "eval_samples_per_second": 99.712, + "eval_steps_per_second": 6.233, + "step": 36000 + }, + { + "epoch": 19.433962264150942, + "grad_norm": 1.5677344799041748, + "learning_rate": 1.722162162162162e-05, + "loss": 14.4584, + "step": 36050 + }, + { + "epoch": 19.460916442048518, + "grad_norm": 1.4985554218292236, + "learning_rate": 1.641081081081081e-05, + "loss": 14.4745, + "step": 36100 + }, + { + "epoch": 19.48787061994609, + "grad_norm": 1.5670796632766724, + "learning_rate": 1.5599999999999996e-05, + "loss": 14.4912, + "step": 36150 + }, + { + "epoch": 19.514824797843666, + "grad_norm": 1.5281639099121094, + "learning_rate": 1.4789189189189187e-05, + "loss": 14.4717, + "step": 36200 + }, + { + "epoch": 19.54177897574124, + "grad_norm": 1.5401979684829712, + "learning_rate": 1.3978378378378376e-05, + "loss": 14.4799, + "step": 36250 + }, + { + "epoch": 19.568733153638814, + "grad_norm": 1.5297380685806274, + "learning_rate": 1.3167567567567566e-05, + "loss": 14.4374, + "step": 36300 + }, + { + "epoch": 19.59568733153639, + "grad_norm": 1.5379348993301392, + "learning_rate": 1.2372972972972972e-05, + "loss": 14.4731, + "step": 36350 + }, + { + "epoch": 19.62264150943396, + "grad_norm": 1.5073282718658447, + "learning_rate": 1.156216216216216e-05, + "loss": 14.444, + "step": 36400 + }, + { + "epoch": 19.649595687331537, + "grad_norm": 1.4987374544143677, + "learning_rate": 1.0751351351351351e-05, + "loss": 14.5121, + "step": 36450 + }, + { + "epoch": 19.67654986522911, + "grad_norm": 1.527362585067749, + "learning_rate": 9.94054054054054e-06, + "loss": 14.5012, + "step": 36500 + }, + { + "epoch": 19.703504043126685, + "grad_norm": 1.567726492881775, + "learning_rate": 9.129729729729729e-06, + "loss": 14.4594, + "step": 36550 + }, + { + "epoch": 19.730458221024257, + "grad_norm": 1.5044466257095337, + "learning_rate": 8.318918918918918e-06, + "loss": 14.4537, + "step": 36600 + }, + { + "epoch": 19.757412398921833, + "grad_norm": 1.486252784729004, + "learning_rate": 7.508108108108107e-06, + "loss": 14.5124, + "step": 36650 + }, + { + "epoch": 19.78436657681941, + "grad_norm": 1.5210126638412476, + "learning_rate": 6.697297297297297e-06, + "loss": 14.4657, + "step": 36700 + }, + { + "epoch": 19.81132075471698, + "grad_norm": 1.5458662509918213, + "learning_rate": 5.8864864864864855e-06, + "loss": 14.4687, + "step": 36750 + }, + { + "epoch": 19.838274932614556, + "grad_norm": 1.511208415031433, + "learning_rate": 5.075675675675675e-06, + "loss": 14.4791, + "step": 36800 + }, + { + "epoch": 19.86522911051213, + "grad_norm": 1.5122674703598022, + "learning_rate": 4.264864864864865e-06, + "loss": 14.4729, + "step": 36850 + }, + { + "epoch": 19.892183288409704, + "grad_norm": 1.492714762687683, + "learning_rate": 3.454054054054054e-06, + "loss": 14.4866, + "step": 36900 + }, + { + "epoch": 19.919137466307276, + "grad_norm": 1.4897505044937134, + "learning_rate": 2.643243243243243e-06, + "loss": 14.4836, + "step": 36950 + }, + { + "epoch": 19.946091644204852, + "grad_norm": 1.5580779314041138, + "learning_rate": 1.8324324324324325e-06, + "loss": 14.4647, + "step": 37000 + }, + { + "epoch": 19.946091644204852, + "eval_accuracy": 0.3985308989484288, + "eval_loss": 3.2621402740478516, + "eval_runtime": 180.8079, + "eval_samples_per_second": 99.62, + "eval_steps_per_second": 6.228, + "step": 37000 + }, + { + "epoch": 19.973045822102424, + "grad_norm": 1.5771269798278809, + "learning_rate": 1.0216216216216215e-06, + "loss": 14.4453, + "step": 37050 + }, + { + "epoch": 20.0, + "grad_norm": 2.5758163928985596, + "learning_rate": 2.108108108108108e-07, + "loss": 14.4773, + "step": 37100 + }, + { + "epoch": 20.0, + "step": 37100, + "total_flos": 1.55087795257344e+18, + "train_loss": 16.0859583358559, + "train_runtime": 127733.4325, + "train_samples_per_second": 46.467, + "train_steps_per_second": 0.29 + } + ], + "logging_steps": 50, + "max_steps": 37100, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.55087795257344e+18, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}