| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.999882294088221, | |
| "eval_steps": 500, | |
| "global_step": 67964, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005885295588970956, | |
| "grad_norm": 9.0, | |
| "learning_rate": 2.9411764705882354e-05, | |
| "loss": 0.6747, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.011770591177941912, | |
| "grad_norm": 15.9375, | |
| "learning_rate": 5.882352941176471e-05, | |
| "loss": 0.6583, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01765588676691287, | |
| "grad_norm": 11.375, | |
| "learning_rate": 8.823529411764706e-05, | |
| "loss": 0.6365, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.023541182355883823, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.00011764705882352942, | |
| "loss": 0.6143, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.02942647794485478, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 0.00014705882352941178, | |
| "loss": 0.617, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03531177353382574, | |
| "grad_norm": 34.25, | |
| "learning_rate": 0.00017647058823529413, | |
| "loss": 0.6062, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04119706912279669, | |
| "grad_norm": 7.09375, | |
| "learning_rate": 0.00019999995639803067, | |
| "loss": 0.594, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.047082364711767646, | |
| "grad_norm": 26.125, | |
| "learning_rate": 0.00019999843033309612, | |
| "loss": 0.6075, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.05296766030073861, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 0.00019999472420771699, | |
| "loss": 0.5744, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.05885295588970956, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 0.00019998883810269034, | |
| "loss": 0.553, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06473825147868052, | |
| "grad_norm": 24.0, | |
| "learning_rate": 0.00019998077214633883, | |
| "loss": 0.5583, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.07062354706765148, | |
| "grad_norm": 7.15625, | |
| "learning_rate": 0.00019997052651450793, | |
| "loss": 0.5831, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.07650884265662243, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 0.00019995810143056216, | |
| "loss": 0.5741, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.08239413824559338, | |
| "grad_norm": 11.375, | |
| "learning_rate": 0.00019994349716538005, | |
| "loss": 0.5671, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.08827943383456434, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 0.00019992671403734846, | |
| "loss": 0.5546, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.09416472942353529, | |
| "grad_norm": 25.0, | |
| "learning_rate": 0.00019990775241235544, | |
| "loss": 0.572, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.10005002501250625, | |
| "grad_norm": 22.5, | |
| "learning_rate": 0.00019988661270378238, | |
| "loss": 0.5811, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.10593532060147721, | |
| "grad_norm": 24.5, | |
| "learning_rate": 0.0001998632953724949, | |
| "loss": 0.5737, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.11182061619044817, | |
| "grad_norm": 6.125, | |
| "learning_rate": 0.00019983780092683296, | |
| "loss": 0.5448, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.11770591177941912, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 0.00019981012992259953, | |
| "loss": 0.5646, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.12359120736839008, | |
| "grad_norm": 18.5, | |
| "learning_rate": 0.00019978028296304876, | |
| "loss": 0.5626, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.12947650295736104, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 0.0001997482606988726, | |
| "loss": 0.5608, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.13536179854633199, | |
| "grad_norm": 22.0, | |
| "learning_rate": 0.00019971406382818672, | |
| "loss": 0.5623, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.14124709413530295, | |
| "grad_norm": 31.875, | |
| "learning_rate": 0.00019967769309651529, | |
| "loss": 0.5606, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.1471323897242739, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 0.00019963914929677467, | |
| "loss": 0.5523, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.15301768531324486, | |
| "grad_norm": 10.125, | |
| "learning_rate": 0.0001995984332692562, | |
| "loss": 0.5691, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.1589029809022158, | |
| "grad_norm": 17.25, | |
| "learning_rate": 0.00019955554590160782, | |
| "loss": 0.5424, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.16478827649118677, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 0.0001995104881288147, | |
| "loss": 0.548, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.17067357208015774, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 0.00019946326093317902, | |
| "loss": 0.5425, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.17655886766912868, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 0.0001994138653442983, | |
| "loss": 0.5699, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.18244416325809965, | |
| "grad_norm": 17.125, | |
| "learning_rate": 0.00019936230243904315, | |
| "loss": 0.5564, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.18832945884707059, | |
| "grad_norm": 12.75, | |
| "learning_rate": 0.0001993085733415337, | |
| "loss": 0.5614, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.19421475443604155, | |
| "grad_norm": 20.0, | |
| "learning_rate": 0.0001992526792231152, | |
| "loss": 0.5386, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.2001000500250125, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 0.00019919462130233226, | |
| "loss": 0.5378, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.20598534561398346, | |
| "grad_norm": 8.875, | |
| "learning_rate": 0.00019913440084490255, | |
| "loss": 0.5493, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.21187064120295443, | |
| "grad_norm": 27.25, | |
| "learning_rate": 0.00019907201916368906, | |
| "loss": 0.543, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.21775593679192537, | |
| "grad_norm": 6.46875, | |
| "learning_rate": 0.0001990074776186715, | |
| "loss": 0.5458, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.22364123238089634, | |
| "grad_norm": 8.75, | |
| "learning_rate": 0.00019894077761691662, | |
| "loss": 0.55, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.22952652796986728, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 0.0001988719206125476, | |
| "loss": 0.536, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.23541182355883825, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 0.00019880090810671237, | |
| "loss": 0.5348, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.24129711914780919, | |
| "grad_norm": 31.5, | |
| "learning_rate": 0.00019872774164755072, | |
| "loss": 0.5406, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.24718241473678015, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 0.00019865242283016076, | |
| "loss": 0.5374, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.2530677103257511, | |
| "grad_norm": 24.375, | |
| "learning_rate": 0.00019857495329656398, | |
| "loss": 0.5356, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.2589530059147221, | |
| "grad_norm": 9.5, | |
| "learning_rate": 0.00019849533473566955, | |
| "loss": 0.5555, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.26483830150369303, | |
| "grad_norm": 14.5625, | |
| "learning_rate": 0.00019841356888323749, | |
| "loss": 0.5165, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.27072359709266397, | |
| "grad_norm": 20.5, | |
| "learning_rate": 0.00019832965752184084, | |
| "loss": 0.5487, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.2766088926816349, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 0.0001982436024808266, | |
| "loss": 0.5347, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.2824941882706059, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 0.00019815540563627616, | |
| "loss": 0.5398, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.28837948385957685, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 0.0001980650689109643, | |
| "loss": 0.5359, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.2942647794485478, | |
| "grad_norm": 26.125, | |
| "learning_rate": 0.00019797259427431705, | |
| "loss": 0.5547, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.3001500750375188, | |
| "grad_norm": 18.75, | |
| "learning_rate": 0.0001978779837423691, | |
| "loss": 0.5648, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.3060353706264897, | |
| "grad_norm": 13.0, | |
| "learning_rate": 0.00019778123937771953, | |
| "loss": 0.5182, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.31192066621546066, | |
| "grad_norm": 22.375, | |
| "learning_rate": 0.00019768236328948717, | |
| "loss": 0.5277, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.3178059618044316, | |
| "grad_norm": 44.5, | |
| "learning_rate": 0.00019758135763326426, | |
| "loss": 0.5348, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.3236912573934026, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 0.0001974782246110698, | |
| "loss": 0.5295, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.32957655298237354, | |
| "grad_norm": 6.21875, | |
| "learning_rate": 0.00019737296647130123, | |
| "loss": 0.5472, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.3354618485713445, | |
| "grad_norm": 5.25, | |
| "learning_rate": 0.00019726558550868571, | |
| "loss": 0.5379, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.3413471441603155, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 0.00019715608406422984, | |
| "loss": 0.5282, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.3472324397492864, | |
| "grad_norm": 7.25, | |
| "learning_rate": 0.00019704446452516874, | |
| "loss": 0.5334, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.35311773533825735, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 0.00019693072932491405, | |
| "loss": 0.5487, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.3590030309272283, | |
| "grad_norm": 29.25, | |
| "learning_rate": 0.00019681488094300083, | |
| "loss": 0.5688, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.3648883265161993, | |
| "grad_norm": 11.625, | |
| "learning_rate": 0.00019669692190503343, | |
| "loss": 0.5565, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.37077362210517023, | |
| "grad_norm": 17.125, | |
| "learning_rate": 0.0001965768547826306, | |
| "loss": 0.5318, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.37665891769414117, | |
| "grad_norm": 9.125, | |
| "learning_rate": 0.00019645468219336922, | |
| "loss": 0.5443, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.38254421328311217, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 0.0001963304068007274, | |
| "loss": 0.5574, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.3884295088720831, | |
| "grad_norm": 13.5, | |
| "learning_rate": 0.00019620403131402633, | |
| "loss": 0.5513, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.39431480446105405, | |
| "grad_norm": 27.125, | |
| "learning_rate": 0.00019607555848837128, | |
| "loss": 0.5087, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.400200100050025, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 0.00019594499112459148, | |
| "loss": 0.5271, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.406085395638996, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 0.00019581233206917903, | |
| "loss": 0.5398, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.4119706912279669, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 0.00019567758421422694, | |
| "loss": 0.5233, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.41785598681693786, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 0.000195540750497366, | |
| "loss": 0.5258, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.42374128240590886, | |
| "grad_norm": 10.625, | |
| "learning_rate": 0.00019540183390170075, | |
| "loss": 0.5381, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.4296265779948798, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 0.00019526083745574453, | |
| "loss": 0.5478, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.43551187358385074, | |
| "grad_norm": 44.0, | |
| "learning_rate": 0.00019511776423335327, | |
| "loss": 0.5132, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.4413971691728217, | |
| "grad_norm": 32.75, | |
| "learning_rate": 0.00019497261735365872, | |
| "loss": 0.5271, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.4472824647617927, | |
| "grad_norm": 12.125, | |
| "learning_rate": 0.00019482539998100023, | |
| "loss": 0.5463, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.4531677603507636, | |
| "grad_norm": 18.125, | |
| "learning_rate": 0.00019467611532485588, | |
| "loss": 0.5315, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.45905305593973456, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 0.00019452476663977248, | |
| "loss": 0.5388, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.46493835152870555, | |
| "grad_norm": 21.75, | |
| "learning_rate": 0.00019437135722529471, | |
| "loss": 0.5212, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.4708236471176765, | |
| "grad_norm": 19.625, | |
| "learning_rate": 0.00019421589042589295, | |
| "loss": 0.5573, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.47670894270664743, | |
| "grad_norm": 19.0, | |
| "learning_rate": 0.00019405836963089066, | |
| "loss": 0.5358, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.48259423829561837, | |
| "grad_norm": 31.375, | |
| "learning_rate": 0.00019389879827439024, | |
| "loss": 0.5375, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.48847953388458937, | |
| "grad_norm": 17.375, | |
| "learning_rate": 0.00019373717983519833, | |
| "loss": 0.5458, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.4943648294735603, | |
| "grad_norm": 19.75, | |
| "learning_rate": 0.00019357351783674996, | |
| "loss": 0.5391, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.5002501250625313, | |
| "grad_norm": 13.625, | |
| "learning_rate": 0.00019340781584703155, | |
| "loss": 0.5328, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.5061354206515022, | |
| "grad_norm": 28.5, | |
| "learning_rate": 0.00019324007747850334, | |
| "loss": 0.5214, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.5120207162404732, | |
| "grad_norm": 12.875, | |
| "learning_rate": 0.0001930703063880206, | |
| "loss": 0.5446, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.5179060118294442, | |
| "grad_norm": 18.5, | |
| "learning_rate": 0.00019289850627675378, | |
| "loss": 0.5198, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.5237913074184151, | |
| "grad_norm": 15.375, | |
| "learning_rate": 0.000192724680890108, | |
| "loss": 0.5411, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.5296766030073861, | |
| "grad_norm": 15.8125, | |
| "learning_rate": 0.00019254883401764115, | |
| "loss": 0.529, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.5355618985963571, | |
| "grad_norm": 14.8125, | |
| "learning_rate": 0.00019237096949298156, | |
| "loss": 0.5224, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.5414471941853279, | |
| "grad_norm": 19.875, | |
| "learning_rate": 0.00019219109119374426, | |
| "loss": 0.5383, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.5473324897742989, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 0.0001920092030414464, | |
| "loss": 0.5381, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.5532177853632698, | |
| "grad_norm": 5.5, | |
| "learning_rate": 0.00019182530900142198, | |
| "loss": 0.5447, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.5591030809522408, | |
| "grad_norm": 4.5, | |
| "learning_rate": 0.00019163941308273502, | |
| "loss": 0.5341, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.5649883765412118, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 0.00019145151933809264, | |
| "loss": 0.5411, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.5708736721301827, | |
| "grad_norm": 33.25, | |
| "learning_rate": 0.00019126163186375633, | |
| "loss": 0.5389, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.5767589677191537, | |
| "grad_norm": 16.875, | |
| "learning_rate": 0.0001910697547994527, | |
| "loss": 0.5181, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.5826442633081247, | |
| "grad_norm": 9.625, | |
| "learning_rate": 0.0001908758923282835, | |
| "loss": 0.5404, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.5885295588970956, | |
| "grad_norm": 6.125, | |
| "learning_rate": 0.00019068004867663408, | |
| "loss": 0.543, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5944148544860666, | |
| "grad_norm": 21.0, | |
| "learning_rate": 0.00019048222811408137, | |
| "loss": 0.541, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.6003001500750376, | |
| "grad_norm": 15.875, | |
| "learning_rate": 0.00019028243495330103, | |
| "loss": 0.5135, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.6061854456640084, | |
| "grad_norm": 17.5, | |
| "learning_rate": 0.00019008067354997298, | |
| "loss": 0.5297, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.6120707412529794, | |
| "grad_norm": 6.125, | |
| "learning_rate": 0.0001898769483026869, | |
| "loss": 0.5354, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.6179560368419504, | |
| "grad_norm": 22.375, | |
| "learning_rate": 0.000189671263652846, | |
| "loss": 0.5245, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.6238413324309213, | |
| "grad_norm": 5.25, | |
| "learning_rate": 0.00018946362408457036, | |
| "loss": 0.5313, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.6297266280198923, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 0.0001892540341245991, | |
| "loss": 0.527, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.6356119236088632, | |
| "grad_norm": 5.5, | |
| "learning_rate": 0.0001890424983421918, | |
| "loss": 0.53, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.6414972191978342, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 0.00018882902134902872, | |
| "loss": 0.5174, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.6473825147868052, | |
| "grad_norm": 25.125, | |
| "learning_rate": 0.00018861360779911048, | |
| "loss": 0.5373, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.6532678103757761, | |
| "grad_norm": 9.25, | |
| "learning_rate": 0.00018839626238865628, | |
| "loss": 0.5373, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.6591531059647471, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 0.00018817698985600193, | |
| "loss": 0.5436, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.6650384015537181, | |
| "grad_norm": 14.875, | |
| "learning_rate": 0.00018795579498149612, | |
| "loss": 0.5331, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.670923697142689, | |
| "grad_norm": 4.875, | |
| "learning_rate": 0.00018773268258739654, | |
| "loss": 0.5337, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.67680899273166, | |
| "grad_norm": 27.625, | |
| "learning_rate": 0.0001875076575377646, | |
| "loss": 0.5097, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.682694288320631, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 0.00018728072473835942, | |
| "loss": 0.5335, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.6885795839096018, | |
| "grad_norm": 12.625, | |
| "learning_rate": 0.00018705188913653082, | |
| "loss": 0.5152, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.6944648794985728, | |
| "grad_norm": 22.25, | |
| "learning_rate": 0.00018682115572111156, | |
| "loss": 0.525, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.7003501750875438, | |
| "grad_norm": 11.75, | |
| "learning_rate": 0.00018658852952230853, | |
| "loss": 0.5222, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.7062354706765147, | |
| "grad_norm": 5.125, | |
| "learning_rate": 0.00018635401561159306, | |
| "loss": 0.5197, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.7121207662654857, | |
| "grad_norm": 15.375, | |
| "learning_rate": 0.0001861176191015904, | |
| "loss": 0.5207, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.7180060618544566, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 0.00018587934514596824, | |
| "loss": 0.5436, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.7238913574434276, | |
| "grad_norm": 31.875, | |
| "learning_rate": 0.00018563919893932443, | |
| "loss": 0.5142, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.7297766530323986, | |
| "grad_norm": 10.625, | |
| "learning_rate": 0.0001853971857170736, | |
| "loss": 0.5215, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.7356619486213695, | |
| "grad_norm": 15.6875, | |
| "learning_rate": 0.00018515331075533303, | |
| "loss": 0.5603, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.7415472442103405, | |
| "grad_norm": 12.375, | |
| "learning_rate": 0.0001849075793708078, | |
| "loss": 0.5134, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.7474325397993115, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 0.00018465999692067472, | |
| "loss": 0.5178, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.7533178353882823, | |
| "grad_norm": 6.0625, | |
| "learning_rate": 0.00018441056880246555, | |
| "loss": 0.5182, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.7592031309772533, | |
| "grad_norm": 15.625, | |
| "learning_rate": 0.00018415930045394944, | |
| "loss": 0.5231, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.7650884265662243, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 0.00018390619735301418, | |
| "loss": 0.5019, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.7709737221551952, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 0.000183651265017547, | |
| "loss": 0.5298, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.7768590177441662, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 0.00018339450900531413, | |
| "loss": 0.5156, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.7827443133331372, | |
| "grad_norm": 23.875, | |
| "learning_rate": 0.00018313593491383975, | |
| "loss": 0.5479, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.7886296089221081, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 0.00018287554838028377, | |
| "loss": 0.5341, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.7945149045110791, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 0.00018261335508131912, | |
| "loss": 0.5373, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.80040020010005, | |
| "grad_norm": 24.0, | |
| "learning_rate": 0.00018234936073300797, | |
| "loss": 0.5329, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.806285495689021, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 0.00018208357109067698, | |
| "loss": 0.5316, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.812170791277992, | |
| "grad_norm": 16.875, | |
| "learning_rate": 0.00018181599194879198, | |
| "loss": 0.5425, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.8180560868669629, | |
| "grad_norm": 17.375, | |
| "learning_rate": 0.00018154662914083157, | |
| "loss": 0.5318, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.8239413824559338, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 0.0001812754885391599, | |
| "loss": 0.5286, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.8298266780449048, | |
| "grad_norm": 21.625, | |
| "learning_rate": 0.00018100257605489884, | |
| "loss": 0.5256, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.8357119736338757, | |
| "grad_norm": 15.0, | |
| "learning_rate": 0.00018072789763779888, | |
| "loss": 0.5261, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.8415972692228467, | |
| "grad_norm": 9.5, | |
| "learning_rate": 0.0001804514592761095, | |
| "loss": 0.5353, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.8474825648118177, | |
| "grad_norm": 16.125, | |
| "learning_rate": 0.0001801732669964487, | |
| "loss": 0.5156, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.8533678604007886, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 0.00017989332686367155, | |
| "loss": 0.5343, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.8592531559897596, | |
| "grad_norm": 27.125, | |
| "learning_rate": 0.0001796116449807379, | |
| "loss": 0.5218, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.8651384515787306, | |
| "grad_norm": 20.125, | |
| "learning_rate": 0.00017932822748857946, | |
| "loss": 0.5111, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.8710237471677015, | |
| "grad_norm": 18.625, | |
| "learning_rate": 0.0001790430805659659, | |
| "loss": 0.5327, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.8769090427566725, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 0.00017875621042937002, | |
| "loss": 0.5096, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.8827943383456434, | |
| "grad_norm": 10.25, | |
| "learning_rate": 0.0001784676233328324, | |
| "loss": 0.5091, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.8886796339346144, | |
| "grad_norm": 18.5, | |
| "learning_rate": 0.0001781773255678249, | |
| "loss": 0.5177, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.8945649295235854, | |
| "grad_norm": 8.5, | |
| "learning_rate": 0.00017788532346311366, | |
| "loss": 0.5353, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.9004502251125562, | |
| "grad_norm": 31.0, | |
| "learning_rate": 0.00017759162338462092, | |
| "loss": 0.5387, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.9063355207015272, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.00017729623173528641, | |
| "loss": 0.5059, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.9122208162904982, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 0.00017699915495492783, | |
| "loss": 0.5403, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.9181061118794691, | |
| "grad_norm": 16.625, | |
| "learning_rate": 0.0001767003995201001, | |
| "loss": 0.5228, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.9239914074684401, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 0.00017639997194395456, | |
| "loss": 0.5305, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.9298767030574111, | |
| "grad_norm": 15.5625, | |
| "learning_rate": 0.0001760978787760968, | |
| "loss": 0.5179, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.935761998646382, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 0.00017579412660244378, | |
| "loss": 0.5253, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.941647294235353, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 0.0001754887220450805, | |
| "loss": 0.5034, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.947532589824324, | |
| "grad_norm": 13.125, | |
| "learning_rate": 0.00017518167176211542, | |
| "loss": 0.4989, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.9534178854132949, | |
| "grad_norm": 17.0, | |
| "learning_rate": 0.00017487298244753534, | |
| "loss": 0.5341, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.9593031810022659, | |
| "grad_norm": 14.6875, | |
| "learning_rate": 0.00017456266083105956, | |
| "loss": 0.4969, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.9651884765912367, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 0.00017425071367799307, | |
| "loss": 0.5237, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.9710737721802077, | |
| "grad_norm": 16.375, | |
| "learning_rate": 0.00017393714778907914, | |
| "loss": 0.5359, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.9769590677691787, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 0.00017362197000035093, | |
| "loss": 0.5218, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.9828443633581496, | |
| "grad_norm": 20.25, | |
| "learning_rate": 0.00017330518718298264, | |
| "loss": 0.5275, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.9887296589471206, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 0.00017298680624313958, | |
| "loss": 0.5268, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.9946149545360916, | |
| "grad_norm": 16.375, | |
| "learning_rate": 0.0001726668341218276, | |
| "loss": 0.5311, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.0005002501250626, | |
| "grad_norm": 9.75, | |
| "learning_rate": 0.00017234527779474184, | |
| "loss": 0.5364, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.0063855457140334, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 0.00017202214427211468, | |
| "loss": 0.5141, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.0122708413030044, | |
| "grad_norm": 10.625, | |
| "learning_rate": 0.0001716974405985628, | |
| "loss": 0.5321, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.0181561368919754, | |
| "grad_norm": 8.75, | |
| "learning_rate": 0.0001713711738529336, | |
| "loss": 0.5292, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.0240414324809464, | |
| "grad_norm": 9.0, | |
| "learning_rate": 0.00017104335114815104, | |
| "loss": 0.5249, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.0299267280699174, | |
| "grad_norm": 20.0, | |
| "learning_rate": 0.00017071397963106045, | |
| "loss": 0.5342, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.0358120236588884, | |
| "grad_norm": 36.0, | |
| "learning_rate": 0.00017038306648227262, | |
| "loss": 0.481, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.0416973192478591, | |
| "grad_norm": 23.0, | |
| "learning_rate": 0.00017005061891600751, | |
| "loss": 0.5246, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.0475826148368301, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 0.00016971664417993676, | |
| "loss": 0.5121, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.0534679104258011, | |
| "grad_norm": 18.375, | |
| "learning_rate": 0.00016938114955502578, | |
| "loss": 0.518, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.0593532060147721, | |
| "grad_norm": 20.875, | |
| "learning_rate": 0.00016904414235537497, | |
| "loss": 0.5402, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.0652385016037431, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 0.00016870562992806035, | |
| "loss": 0.5306, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.0711237971927141, | |
| "grad_norm": 10.625, | |
| "learning_rate": 0.00016836561965297324, | |
| "loss": 0.5452, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.0770090927816849, | |
| "grad_norm": 24.0, | |
| "learning_rate": 0.00016802411894265953, | |
| "loss": 0.5258, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.0828943883706559, | |
| "grad_norm": 24.0, | |
| "learning_rate": 0.00016768113524215798, | |
| "loss": 0.4995, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.0887796839596269, | |
| "grad_norm": 22.75, | |
| "learning_rate": 0.00016733667602883797, | |
| "loss": 0.4998, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.0946649795485979, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 0.00016699074881223636, | |
| "loss": 0.5308, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.1005502751375689, | |
| "grad_norm": 23.625, | |
| "learning_rate": 0.000166643361133894, | |
| "loss": 0.516, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.1064355707265396, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 0.00016629452056719118, | |
| "loss": 0.5127, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.1123208663155106, | |
| "grad_norm": 21.5, | |
| "learning_rate": 0.00016594423471718236, | |
| "loss": 0.5072, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.1182061619044816, | |
| "grad_norm": 10.0, | |
| "learning_rate": 0.0001655925112204308, | |
| "loss": 0.536, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.1240914574934526, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 0.00016523935774484158, | |
| "loss": 0.5184, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.1299767530824236, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.00016488478198949485, | |
| "loss": 0.5186, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.1358620486713944, | |
| "grad_norm": 20.625, | |
| "learning_rate": 0.0001645287916844777, | |
| "loss": 0.5418, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.1417473442603654, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 0.00016417139459071577, | |
| "loss": 0.5054, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.1476326398493364, | |
| "grad_norm": 9.125, | |
| "learning_rate": 0.00016381259849980405, | |
| "loss": 0.4923, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.1535179354383074, | |
| "grad_norm": 6.8125, | |
| "learning_rate": 0.000163452411233837, | |
| "loss": 0.5182, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.1594032310272784, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 0.00016309084064523792, | |
| "loss": 0.5142, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.1652885266162494, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 0.000162727894616588, | |
| "loss": 0.5055, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.1711738222052204, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 0.0001623635810604542, | |
| "loss": 0.5187, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.1770591177941911, | |
| "grad_norm": 27.875, | |
| "learning_rate": 0.00016199790791921693, | |
| "loss": 0.4999, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.1829444133831621, | |
| "grad_norm": 23.0, | |
| "learning_rate": 0.00016163088316489683, | |
| "loss": 0.5208, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.1888297089721331, | |
| "grad_norm": 18.25, | |
| "learning_rate": 0.00016126251479898097, | |
| "loss": 0.5397, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.1947150045611041, | |
| "grad_norm": 8.25, | |
| "learning_rate": 0.0001608928108522485, | |
| "loss": 0.5105, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.2006003001500751, | |
| "grad_norm": 11.75, | |
| "learning_rate": 0.00016052177938459539, | |
| "loss": 0.5218, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.206485595739046, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 0.00016014942848485887, | |
| "loss": 0.5323, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.212370891328017, | |
| "grad_norm": 8.375, | |
| "learning_rate": 0.0001597757662706411, | |
| "loss": 0.5348, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.218256186916988, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 0.00015940080088813193, | |
| "loss": 0.5107, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.2241414825059589, | |
| "grad_norm": 8.25, | |
| "learning_rate": 0.00015902454051193183, | |
| "loss": 0.5125, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.2300267780949299, | |
| "grad_norm": 12.875, | |
| "learning_rate": 0.0001586469933448731, | |
| "loss": 0.5284, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.2359120736839007, | |
| "grad_norm": 14.8125, | |
| "learning_rate": 0.00015826816761784138, | |
| "loss": 0.5262, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.2417973692728717, | |
| "grad_norm": 13.625, | |
| "learning_rate": 0.0001578880715895962, | |
| "loss": 0.5188, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.2476826648618426, | |
| "grad_norm": 19.375, | |
| "learning_rate": 0.00015750671354659073, | |
| "loss": 0.5328, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.2535679604508136, | |
| "grad_norm": 14.0625, | |
| "learning_rate": 0.00015712410180279132, | |
| "loss": 0.5384, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.2594532560397846, | |
| "grad_norm": 20.75, | |
| "learning_rate": 0.0001567402446994962, | |
| "loss": 0.5175, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.2653385516287556, | |
| "grad_norm": 13.0, | |
| "learning_rate": 0.0001563551506051536, | |
| "loss": 0.5308, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.2712238472177266, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.00015596882791517932, | |
| "loss": 0.5445, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.2771091428066974, | |
| "grad_norm": 17.625, | |
| "learning_rate": 0.00015558128505177373, | |
| "loss": 0.5321, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.2829944383956684, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 0.0001551925304637381, | |
| "loss": 0.5123, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.2888797339846394, | |
| "grad_norm": 11.625, | |
| "learning_rate": 0.00015480257262629046, | |
| "loss": 0.5374, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.2947650295736104, | |
| "grad_norm": 7.34375, | |
| "learning_rate": 0.00015441142004088082, | |
| "loss": 0.5317, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.3006503251625814, | |
| "grad_norm": 19.25, | |
| "learning_rate": 0.00015401908123500587, | |
| "loss": 0.5192, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.3065356207515522, | |
| "grad_norm": 5.25, | |
| "learning_rate": 0.00015362556476202294, | |
| "loss": 0.5218, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.3124209163405232, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 0.00015323087920096363, | |
| "loss": 0.5554, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.3183062119294942, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 0.00015283503315634687, | |
| "loss": 0.5106, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.3241915075184651, | |
| "grad_norm": 20.125, | |
| "learning_rate": 0.00015243803525799115, | |
| "loss": 0.5166, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.3300768031074361, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 0.00015203989416082643, | |
| "loss": 0.5285, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.335962098696407, | |
| "grad_norm": 29.25, | |
| "learning_rate": 0.00015164061854470556, | |
| "loss": 0.5226, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.341847394285378, | |
| "grad_norm": 8.375, | |
| "learning_rate": 0.0001512402171142149, | |
| "loss": 0.5403, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.347732689874349, | |
| "grad_norm": 8.875, | |
| "learning_rate": 0.00015083869859848473, | |
| "loss": 0.5459, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.35361798546332, | |
| "grad_norm": 19.5, | |
| "learning_rate": 0.00015043607175099877, | |
| "loss": 0.5232, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.359503281052291, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 0.00015003234534940343, | |
| "loss": 0.5384, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.3653885766412617, | |
| "grad_norm": 11.875, | |
| "learning_rate": 0.00014962752819531647, | |
| "loss": 0.5146, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.371273872230233, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 0.00014922162911413505, | |
| "loss": 0.5263, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.3771591678192037, | |
| "grad_norm": 6.75, | |
| "learning_rate": 0.00014881465695484338, | |
| "loss": 0.5244, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.3830444634081747, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 0.0001484066205898198, | |
| "loss": 0.5228, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.3889297589971457, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 0.0001479975289146434, | |
| "loss": 0.5346, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.3948150545861167, | |
| "grad_norm": 21.5, | |
| "learning_rate": 0.00014758739084789983, | |
| "loss": 0.5081, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.4007003501750876, | |
| "grad_norm": 19.875, | |
| "learning_rate": 0.0001471762153309873, | |
| "loss": 0.5265, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.4065856457640584, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 0.00014676401132792131, | |
| "loss": 0.5238, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.4124709413530294, | |
| "grad_norm": 7.96875, | |
| "learning_rate": 0.00014635078782513928, | |
| "loss": 0.5243, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.4183562369420004, | |
| "grad_norm": 18.375, | |
| "learning_rate": 0.0001459365538313048, | |
| "loss": 0.519, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.4242415325309714, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 0.00014552131837711107, | |
| "loss": 0.5035, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.4301268281199424, | |
| "grad_norm": 12.625, | |
| "learning_rate": 0.00014510509051508406, | |
| "loss": 0.5155, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.4360121237089132, | |
| "grad_norm": 23.125, | |
| "learning_rate": 0.00014468787931938516, | |
| "loss": 0.5307, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.4418974192978842, | |
| "grad_norm": 6.625, | |
| "learning_rate": 0.00014426969388561345, | |
| "loss": 0.5463, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.4477827148868552, | |
| "grad_norm": 11.75, | |
| "learning_rate": 0.0001438505433306072, | |
| "loss": 0.5078, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.4536680104758262, | |
| "grad_norm": 17.125, | |
| "learning_rate": 0.00014343043679224533, | |
| "loss": 0.5224, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.4595533060647972, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 0.00014300938342924803, | |
| "loss": 0.515, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.465438601653768, | |
| "grad_norm": 5.375, | |
| "learning_rate": 0.00014258739242097726, | |
| "loss": 0.5313, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.4713238972427392, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 0.0001421644729672364, | |
| "loss": 0.5191, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.47720919283171, | |
| "grad_norm": 9.125, | |
| "learning_rate": 0.00014174063428807, | |
| "loss": 0.5358, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.483094488420681, | |
| "grad_norm": 18.625, | |
| "learning_rate": 0.00014131588562356243, | |
| "loss": 0.5256, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.488979784009652, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 0.00014089023623363667, | |
| "loss": 0.5414, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.494865079598623, | |
| "grad_norm": 36.5, | |
| "learning_rate": 0.00014046369539785233, | |
| "loss": 0.526, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.500750375187594, | |
| "grad_norm": 14.5, | |
| "learning_rate": 0.00014003627241520347, | |
| "loss": 0.5072, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.5066356707765647, | |
| "grad_norm": 7.96875, | |
| "learning_rate": 0.0001396079766039157, | |
| "loss": 0.5244, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.5125209663655357, | |
| "grad_norm": 14.0625, | |
| "learning_rate": 0.00013917881730124315, | |
| "loss": 0.5159, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.5184062619545067, | |
| "grad_norm": 10.25, | |
| "learning_rate": 0.0001387488038632649, | |
| "loss": 0.5111, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.5242915575434777, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 0.00013831794566468097, | |
| "loss": 0.5254, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.5301768531324487, | |
| "grad_norm": 23.625, | |
| "learning_rate": 0.00013788625209860793, | |
| "loss": 0.5248, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.5360621487214194, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 0.00013745373257637418, | |
| "loss": 0.5324, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 1.5419474443103904, | |
| "grad_norm": 14.875, | |
| "learning_rate": 0.00013702039652731482, | |
| "loss": 0.5062, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 1.5478327398993614, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 0.00013658625339856587, | |
| "loss": 0.5304, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 1.5537180354883324, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 0.0001361513126548585, | |
| "loss": 0.5169, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 1.5596033310773034, | |
| "grad_norm": 17.0, | |
| "learning_rate": 0.0001357155837783127, | |
| "loss": 0.5242, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.5654886266662742, | |
| "grad_norm": 10.625, | |
| "learning_rate": 0.00013527907626823048, | |
| "loss": 0.5312, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 1.5713739222552454, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 0.00013484179964088873, | |
| "loss": 0.5313, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 1.5772592178442162, | |
| "grad_norm": 6.71875, | |
| "learning_rate": 0.00013440376342933188, | |
| "loss": 0.5317, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 1.5831445134331872, | |
| "grad_norm": 7.34375, | |
| "learning_rate": 0.00013396497718316406, | |
| "loss": 0.5358, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 1.5890298090221582, | |
| "grad_norm": 16.25, | |
| "learning_rate": 0.00013352545046834075, | |
| "loss": 0.4916, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.594915104611129, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 0.00013308519286696043, | |
| "loss": 0.4964, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 1.6008004002001002, | |
| "grad_norm": 5.6875, | |
| "learning_rate": 0.00013264421397705557, | |
| "loss": 0.5129, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 1.606685695789071, | |
| "grad_norm": 18.125, | |
| "learning_rate": 0.0001322025234123835, | |
| "loss": 0.5137, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 1.612570991378042, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 0.0001317601308022165, | |
| "loss": 0.5186, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 1.618456286967013, | |
| "grad_norm": 22.625, | |
| "learning_rate": 0.0001313170457911324, | |
| "loss": 0.5108, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.6243415825559837, | |
| "grad_norm": 7.125, | |
| "learning_rate": 0.00013087327803880383, | |
| "loss": 0.522, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 1.630226878144955, | |
| "grad_norm": 25.125, | |
| "learning_rate": 0.0001304288372197879, | |
| "loss": 0.5084, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 1.6361121737339257, | |
| "grad_norm": 27.5, | |
| "learning_rate": 0.00012998373302331516, | |
| "loss": 0.5356, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 1.6419974693228967, | |
| "grad_norm": 6.8125, | |
| "learning_rate": 0.0001295379751530785, | |
| "loss": 0.522, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 1.6478827649118677, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 0.00012909157332702145, | |
| "loss": 0.5182, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.6537680605008387, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 0.00012864453727712638, | |
| "loss": 0.5054, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 1.6596533560898097, | |
| "grad_norm": 5.65625, | |
| "learning_rate": 0.00012819687674920234, | |
| "loss": 0.5319, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 1.6655386516787805, | |
| "grad_norm": 19.125, | |
| "learning_rate": 0.0001277486015026727, | |
| "loss": 0.5084, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 1.6714239472677517, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 0.00012729972131036212, | |
| "loss": 0.5115, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 1.6773092428567224, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 0.0001268502459582838, | |
| "loss": 0.5298, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.6831945384456934, | |
| "grad_norm": 16.0, | |
| "learning_rate": 0.00012640018524542583, | |
| "loss": 0.5167, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 1.6890798340346644, | |
| "grad_norm": 32.5, | |
| "learning_rate": 0.0001259495489835378, | |
| "loss": 0.4973, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 1.6949651296236352, | |
| "grad_norm": 20.875, | |
| "learning_rate": 0.00012549834699691686, | |
| "loss": 0.5206, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 1.7008504252126064, | |
| "grad_norm": 17.125, | |
| "learning_rate": 0.00012504658912219346, | |
| "loss": 0.5083, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 1.7067357208015772, | |
| "grad_norm": 22.875, | |
| "learning_rate": 0.00012459428520811687, | |
| "loss": 0.501, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.7126210163905482, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 0.00012414144511534064, | |
| "loss": 0.5043, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 1.7185063119795192, | |
| "grad_norm": 17.625, | |
| "learning_rate": 0.00012368807871620743, | |
| "loss": 0.5342, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 1.72439160756849, | |
| "grad_norm": 11.75, | |
| "learning_rate": 0.00012323419589453394, | |
| "loss": 0.5153, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 1.7302769031574612, | |
| "grad_norm": 18.625, | |
| "learning_rate": 0.00012277980654539533, | |
| "loss": 0.5525, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 1.736162198746432, | |
| "grad_norm": 5.125, | |
| "learning_rate": 0.0001223249205749096, | |
| "loss": 0.5195, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.742047494335403, | |
| "grad_norm": 19.0, | |
| "learning_rate": 0.0001218695479000215, | |
| "loss": 0.5024, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 1.747932789924374, | |
| "grad_norm": 6.0, | |
| "learning_rate": 0.0001214136984482864, | |
| "loss": 0.5058, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 1.753818085513345, | |
| "grad_norm": 23.875, | |
| "learning_rate": 0.00012095738215765391, | |
| "loss": 0.5097, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 1.759703381102316, | |
| "grad_norm": 8.625, | |
| "learning_rate": 0.0001205006089762511, | |
| "loss": 0.5282, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 1.7655886766912867, | |
| "grad_norm": 25.875, | |
| "learning_rate": 0.00012004338886216578, | |
| "loss": 0.508, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.771473972280258, | |
| "grad_norm": 8.75, | |
| "learning_rate": 0.0001195857317832292, | |
| "loss": 0.5232, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 1.7773592678692287, | |
| "grad_norm": 18.625, | |
| "learning_rate": 0.00011912764771679898, | |
| "loss": 0.5227, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 1.7832445634581997, | |
| "grad_norm": 14.875, | |
| "learning_rate": 0.00011866914664954139, | |
| "loss": 0.5093, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 1.7891298590471707, | |
| "grad_norm": 7.96875, | |
| "learning_rate": 0.00011821023857721371, | |
| "loss": 0.5307, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 1.7950151546361415, | |
| "grad_norm": 7.375, | |
| "learning_rate": 0.00011775093350444637, | |
| "loss": 0.5205, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.8009004502251127, | |
| "grad_norm": 12.25, | |
| "learning_rate": 0.00011729124144452477, | |
| "loss": 0.5136, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 1.8067857458140835, | |
| "grad_norm": 26.125, | |
| "learning_rate": 0.00011683117241917095, | |
| "loss": 0.4868, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 1.8126710414030545, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 0.00011637073645832516, | |
| "loss": 0.5018, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 1.8185563369920255, | |
| "grad_norm": 20.5, | |
| "learning_rate": 0.00011590994359992731, | |
| "loss": 0.5079, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 1.8244416325809962, | |
| "grad_norm": 9.25, | |
| "learning_rate": 0.00011544880388969783, | |
| "loss": 0.546, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.8303269281699674, | |
| "grad_norm": 6.3125, | |
| "learning_rate": 0.000114987327380919, | |
| "loss": 0.5261, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 1.8362122237589382, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.00011452552413421558, | |
| "loss": 0.5218, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.8420975193479092, | |
| "grad_norm": 14.3125, | |
| "learning_rate": 0.0001140634042173354, | |
| "loss": 0.534, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 1.8479828149368802, | |
| "grad_norm": 14.625, | |
| "learning_rate": 0.00011360097770493024, | |
| "loss": 0.5182, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 1.8538681105258512, | |
| "grad_norm": 7.34375, | |
| "learning_rate": 0.00011313825467833574, | |
| "loss": 0.5025, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.8597534061148222, | |
| "grad_norm": 19.625, | |
| "learning_rate": 0.00011267524522535198, | |
| "loss": 0.507, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.865638701703793, | |
| "grad_norm": 19.0, | |
| "learning_rate": 0.00011221195944002332, | |
| "loss": 0.5229, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 1.871523997292764, | |
| "grad_norm": 23.625, | |
| "learning_rate": 0.00011174840742241844, | |
| "loss": 0.5209, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 1.877409292881735, | |
| "grad_norm": 12.5, | |
| "learning_rate": 0.00011128459927841013, | |
| "loss": 0.5025, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 1.883294588470706, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 0.00011082054511945501, | |
| "loss": 0.5267, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.889179884059677, | |
| "grad_norm": 14.0, | |
| "learning_rate": 0.00011035625506237304, | |
| "loss": 0.5225, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 1.8950651796486477, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 0.00010989173922912696, | |
| "loss": 0.514, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 1.900950475237619, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 0.00010942700774660173, | |
| "loss": 0.5344, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 1.9068357708265897, | |
| "grad_norm": 21.0, | |
| "learning_rate": 0.00010896207074638356, | |
| "loss": 0.5109, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 1.9127210664155607, | |
| "grad_norm": 15.125, | |
| "learning_rate": 0.0001084969383645392, | |
| "loss": 0.5147, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.9186063620045317, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 0.00010803162074139487, | |
| "loss": 0.5041, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 1.9244916575935025, | |
| "grad_norm": 14.875, | |
| "learning_rate": 0.00010756612802131528, | |
| "loss": 0.5334, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 1.9303769531824737, | |
| "grad_norm": 9.625, | |
| "learning_rate": 0.00010710047035248235, | |
| "loss": 0.4981, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 1.9362622487714445, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 0.00010663465788667406, | |
| "loss": 0.5252, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 1.9421475443604155, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 0.0001061687007790432, | |
| "loss": 0.5196, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.9480328399493865, | |
| "grad_norm": 21.125, | |
| "learning_rate": 0.00010570260918789578, | |
| "loss": 0.5056, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 1.9539181355383572, | |
| "grad_norm": 6.0625, | |
| "learning_rate": 0.00010523639327446968, | |
| "loss": 0.5173, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 1.9598034311273285, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 0.00010477006320271317, | |
| "loss": 0.4972, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 1.9656887267162992, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 0.00010430362913906327, | |
| "loss": 0.5204, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 1.9715740223052702, | |
| "grad_norm": 42.5, | |
| "learning_rate": 0.00010383710125222412, | |
| "loss": 0.522, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.9774593178942412, | |
| "grad_norm": 16.375, | |
| "learning_rate": 0.00010337048971294529, | |
| "loss": 0.538, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 1.9833446134832122, | |
| "grad_norm": 15.0, | |
| "learning_rate": 0.00010290380469380005, | |
| "loss": 0.5178, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 1.9892299090721832, | |
| "grad_norm": 15.25, | |
| "learning_rate": 0.00010243705636896361, | |
| "loss": 0.544, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 1.995115204661154, | |
| "grad_norm": 14.4375, | |
| "learning_rate": 0.00010197025491399128, | |
| "loss": 0.4892, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 2.001000500250125, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 0.00010150341050559669, | |
| "loss": 0.5086, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.006885795839096, | |
| "grad_norm": 15.1875, | |
| "learning_rate": 0.00010103653332142988, | |
| "loss": 0.4967, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 2.0127710914280668, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 0.00010056963353985544, | |
| "loss": 0.5222, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 2.018656387017038, | |
| "grad_norm": 9.0, | |
| "learning_rate": 0.00010010272133973058, | |
| "loss": 0.5374, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 2.0245416826060088, | |
| "grad_norm": 14.6875, | |
| "learning_rate": 9.963580690018327e-05, | |
| "loss": 0.5077, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 2.03042697819498, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 9.916890040039031e-05, | |
| "loss": 0.5286, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.0363122737839507, | |
| "grad_norm": 18.125, | |
| "learning_rate": 9.870201201935538e-05, | |
| "loss": 0.5236, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 2.042197569372922, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 9.823515193568715e-05, | |
| "loss": 0.5196, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 2.0480828649618927, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 9.776833032737742e-05, | |
| "loss": 0.5108, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 2.0539681605508635, | |
| "grad_norm": 16.625, | |
| "learning_rate": 9.730155737157916e-05, | |
| "loss": 0.5166, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 2.0598534561398347, | |
| "grad_norm": 4.625, | |
| "learning_rate": 9.683484324438467e-05, | |
| "loss": 0.512, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.0657387517288055, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 9.636819812060377e-05, | |
| "loss": 0.5163, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 2.0716240473177767, | |
| "grad_norm": 9.875, | |
| "learning_rate": 9.590163217354184e-05, | |
| "loss": 0.5038, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 2.0775093429067475, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 9.543515557477826e-05, | |
| "loss": 0.511, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 2.0833946384957183, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 9.496877849394444e-05, | |
| "loss": 0.498, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 2.0892799340846895, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 9.450251109850225e-05, | |
| "loss": 0.5318, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.0951652296736603, | |
| "grad_norm": 21.625, | |
| "learning_rate": 9.40363635535223e-05, | |
| "loss": 0.5205, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 2.1010505252626315, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 9.357034602146232e-05, | |
| "loss": 0.5164, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 2.1069358208516022, | |
| "grad_norm": 24.875, | |
| "learning_rate": 9.310446866194571e-05, | |
| "loss": 0.5349, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 2.112821116440573, | |
| "grad_norm": 17.125, | |
| "learning_rate": 9.263874163153992e-05, | |
| "loss": 0.5042, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 2.1187064120295442, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 9.217317508353507e-05, | |
| "loss": 0.4948, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.124591707618515, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 9.170777916772265e-05, | |
| "loss": 0.5195, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 2.1304770032074862, | |
| "grad_norm": 7.75, | |
| "learning_rate": 9.124256403017419e-05, | |
| "loss": 0.5179, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 2.136362298796457, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 9.077753981302009e-05, | |
| "loss": 0.4938, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 2.1422475943854282, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 9.031271665422849e-05, | |
| "loss": 0.5449, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 2.148132889974399, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 8.984810468738427e-05, | |
| "loss": 0.5127, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.1540181855633698, | |
| "grad_norm": 9.5, | |
| "learning_rate": 8.938371404146812e-05, | |
| "loss": 0.5085, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 2.159903481152341, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 8.891955484063576e-05, | |
| "loss": 0.5424, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 2.1657887767413118, | |
| "grad_norm": 6.28125, | |
| "learning_rate": 8.845563720399716e-05, | |
| "loss": 0.513, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 2.171674072330283, | |
| "grad_norm": 22.625, | |
| "learning_rate": 8.799197124539595e-05, | |
| "loss": 0.5128, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 2.1775593679192538, | |
| "grad_norm": 17.0, | |
| "learning_rate": 8.752856707318896e-05, | |
| "loss": 0.5216, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.1834446635082245, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 8.706543479002584e-05, | |
| "loss": 0.5186, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 2.1893299590971957, | |
| "grad_norm": 19.5, | |
| "learning_rate": 8.660258449262878e-05, | |
| "loss": 0.5274, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 2.1952152546861665, | |
| "grad_norm": 15.1875, | |
| "learning_rate": 8.614002627157239e-05, | |
| "loss": 0.5017, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 2.2011005502751377, | |
| "grad_norm": 13.0, | |
| "learning_rate": 8.56777702110638e-05, | |
| "loss": 0.5044, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 2.2069858458641085, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 8.521582638872273e-05, | |
| "loss": 0.5191, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.2128711414530793, | |
| "grad_norm": 7.625, | |
| "learning_rate": 8.475420487536179e-05, | |
| "loss": 0.5101, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 2.2187564370420505, | |
| "grad_norm": 13.75, | |
| "learning_rate": 8.429291573476699e-05, | |
| "loss": 0.5029, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 2.2246417326310213, | |
| "grad_norm": 14.625, | |
| "learning_rate": 8.383196902347823e-05, | |
| "loss": 0.5132, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 2.2305270282199925, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 8.337137479057019e-05, | |
| "loss": 0.516, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 2.2364123238089633, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 8.291114307743317e-05, | |
| "loss": 0.5114, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.2422976193979345, | |
| "grad_norm": 12.875, | |
| "learning_rate": 8.24512839175542e-05, | |
| "loss": 0.5025, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 2.2481829149869053, | |
| "grad_norm": 12.0, | |
| "learning_rate": 8.199180733629826e-05, | |
| "loss": 0.5121, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 2.254068210575876, | |
| "grad_norm": 18.125, | |
| "learning_rate": 8.153272335068982e-05, | |
| "loss": 0.5347, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 2.2599535061648472, | |
| "grad_norm": 5.9375, | |
| "learning_rate": 8.107404196919436e-05, | |
| "loss": 0.5165, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 2.265838801753818, | |
| "grad_norm": 12.375, | |
| "learning_rate": 8.061577319150016e-05, | |
| "loss": 0.5022, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.271724097342789, | |
| "grad_norm": 9.375, | |
| "learning_rate": 8.015792700830044e-05, | |
| "loss": 0.5203, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 2.27760939293176, | |
| "grad_norm": 10.5, | |
| "learning_rate": 7.97005134010754e-05, | |
| "loss": 0.5199, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 2.283494688520731, | |
| "grad_norm": 12.625, | |
| "learning_rate": 7.924354234187466e-05, | |
| "loss": 0.5376, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 2.289379984109702, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 7.878702379309991e-05, | |
| "loss": 0.5228, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 2.2952652796986728, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 7.833096770728772e-05, | |
| "loss": 0.5474, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.301150575287644, | |
| "grad_norm": 17.0, | |
| "learning_rate": 7.787538402689245e-05, | |
| "loss": 0.511, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 2.3070358708766148, | |
| "grad_norm": 18.5, | |
| "learning_rate": 7.742028268406961e-05, | |
| "loss": 0.5169, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 2.3129211664655855, | |
| "grad_norm": 16.5, | |
| "learning_rate": 7.69656736004593e-05, | |
| "loss": 0.5148, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 2.3188064620545568, | |
| "grad_norm": 19.125, | |
| "learning_rate": 7.651156668696989e-05, | |
| "loss": 0.5257, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 2.3246917576435275, | |
| "grad_norm": 5.875, | |
| "learning_rate": 7.6057971843562e-05, | |
| "loss": 0.515, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 2.3305770532324988, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 7.560489895903258e-05, | |
| "loss": 0.4958, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 2.3364623488214695, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 7.515235791079943e-05, | |
| "loss": 0.5117, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 2.3423476444104407, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 7.470035856468578e-05, | |
| "loss": 0.53, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 2.3482329399994115, | |
| "grad_norm": 14.6875, | |
| "learning_rate": 7.424891077470529e-05, | |
| "loss": 0.5052, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 2.3541182355883823, | |
| "grad_norm": 11.625, | |
| "learning_rate": 7.379802438284711e-05, | |
| "loss": 0.5239, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.3600035311773535, | |
| "grad_norm": 17.625, | |
| "learning_rate": 7.334770921886143e-05, | |
| "loss": 0.5232, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 2.3658888267663243, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 7.28979751000451e-05, | |
| "loss": 0.5145, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 2.371774122355295, | |
| "grad_norm": 20.875, | |
| "learning_rate": 7.244883183102769e-05, | |
| "loss": 0.4999, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 2.3776594179442663, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 7.200028920355759e-05, | |
| "loss": 0.5153, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 2.383544713533237, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 7.155235699628871e-05, | |
| "loss": 0.4802, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.3894300091222083, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 7.110504497456725e-05, | |
| "loss": 0.4936, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 2.395315304711179, | |
| "grad_norm": 13.125, | |
| "learning_rate": 7.065836289021866e-05, | |
| "loss": 0.5239, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 2.4012006003001503, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 7.021232048133527e-05, | |
| "loss": 0.5074, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 2.407085895889121, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 6.976692747206385e-05, | |
| "loss": 0.5173, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 2.412971191478092, | |
| "grad_norm": 18.875, | |
| "learning_rate": 6.932219357239363e-05, | |
| "loss": 0.5261, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.418856487067063, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 6.887812847794458e-05, | |
| "loss": 0.5115, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 2.424741782656034, | |
| "grad_norm": 14.625, | |
| "learning_rate": 6.843474186975617e-05, | |
| "loss": 0.5039, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 2.430627078245005, | |
| "grad_norm": 21.375, | |
| "learning_rate": 6.799204341407619e-05, | |
| "loss": 0.525, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 2.436512373833976, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 6.755004276215004e-05, | |
| "loss": 0.4939, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 2.442397669422947, | |
| "grad_norm": 11.125, | |
| "learning_rate": 6.710874955001035e-05, | |
| "loss": 0.5271, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.4482829650119178, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 6.666817339826692e-05, | |
| "loss": 0.4943, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 2.4541682606008886, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 6.622832391189689e-05, | |
| "loss": 0.5258, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 2.4600535561898598, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 6.57892106800355e-05, | |
| "loss": 0.5169, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 2.4659388517788305, | |
| "grad_norm": 9.375, | |
| "learning_rate": 6.535084327576683e-05, | |
| "loss": 0.4939, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 2.4718241473678013, | |
| "grad_norm": 13.1875, | |
| "learning_rate": 6.49132312559153e-05, | |
| "loss": 0.5034, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.4777094429567725, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 6.447638416083717e-05, | |
| "loss": 0.5401, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 2.4835947385457433, | |
| "grad_norm": 9.25, | |
| "learning_rate": 6.404031151421274e-05, | |
| "loss": 0.5167, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 2.4894800341347145, | |
| "grad_norm": 6.5, | |
| "learning_rate": 6.360502282283845e-05, | |
| "loss": 0.5173, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 2.4953653297236853, | |
| "grad_norm": 11.375, | |
| "learning_rate": 6.317052757641985e-05, | |
| "loss": 0.499, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 2.5012506253126565, | |
| "grad_norm": 14.8125, | |
| "learning_rate": 6.273683524736463e-05, | |
| "loss": 0.5147, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.5071359209016273, | |
| "grad_norm": 31.75, | |
| "learning_rate": 6.230395529057611e-05, | |
| "loss": 0.5131, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 2.513021216490598, | |
| "grad_norm": 13.375, | |
| "learning_rate": 6.187189714324713e-05, | |
| "loss": 0.5048, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 2.5189065120795693, | |
| "grad_norm": 10.25, | |
| "learning_rate": 6.144067022465433e-05, | |
| "loss": 0.5142, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 2.52479180766854, | |
| "grad_norm": 21.0, | |
| "learning_rate": 6.1010283935952726e-05, | |
| "loss": 0.5437, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 2.5306771032575113, | |
| "grad_norm": 27.25, | |
| "learning_rate": 6.058074765997088e-05, | |
| "loss": 0.5261, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.536562398846482, | |
| "grad_norm": 14.0, | |
| "learning_rate": 6.0152070761006175e-05, | |
| "loss": 0.5375, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 2.5424476944354533, | |
| "grad_norm": 12.875, | |
| "learning_rate": 5.972426258462083e-05, | |
| "loss": 0.5182, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 2.548332990024424, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 5.929733245743809e-05, | |
| "loss": 0.5061, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 2.554218285613395, | |
| "grad_norm": 14.6875, | |
| "learning_rate": 5.887128968693887e-05, | |
| "loss": 0.4996, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 2.560103581202366, | |
| "grad_norm": 24.0, | |
| "learning_rate": 5.8446143561258885e-05, | |
| "loss": 0.5035, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.565988876791337, | |
| "grad_norm": 13.875, | |
| "learning_rate": 5.8021903348986115e-05, | |
| "loss": 0.5101, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 2.5718741723803076, | |
| "grad_norm": 17.875, | |
| "learning_rate": 5.75985782989588e-05, | |
| "loss": 0.5041, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 2.577759467969279, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 5.71761776400638e-05, | |
| "loss": 0.5179, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 2.5836447635582496, | |
| "grad_norm": 8.75, | |
| "learning_rate": 5.6754710581035364e-05, | |
| "loss": 0.5118, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 2.589530059147221, | |
| "grad_norm": 7.3125, | |
| "learning_rate": 5.633418631025431e-05, | |
| "loss": 0.5191, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.5954153547361916, | |
| "grad_norm": 19.375, | |
| "learning_rate": 5.5914613995547805e-05, | |
| "loss": 0.511, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 2.6013006503251628, | |
| "grad_norm": 6.75, | |
| "learning_rate": 5.549600278398959e-05, | |
| "loss": 0.4941, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 2.6071859459141336, | |
| "grad_norm": 24.375, | |
| "learning_rate": 5.507836180170023e-05, | |
| "loss": 0.5151, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 2.6130712415031043, | |
| "grad_norm": 5.5, | |
| "learning_rate": 5.466170015364863e-05, | |
| "loss": 0.5241, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 2.6189565370920755, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 5.424602692345304e-05, | |
| "loss": 0.5163, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 2.6248418326810463, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 5.3831351173183455e-05, | |
| "loss": 0.5091, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 2.630727128270017, | |
| "grad_norm": 9.125, | |
| "learning_rate": 5.341768194316374e-05, | |
| "loss": 0.5196, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 2.6366124238589883, | |
| "grad_norm": 21.75, | |
| "learning_rate": 5.300502825177469e-05, | |
| "loss": 0.5248, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 2.6424977194479595, | |
| "grad_norm": 14.0, | |
| "learning_rate": 5.259339909525749e-05, | |
| "loss": 0.524, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 2.6483830150369303, | |
| "grad_norm": 28.0, | |
| "learning_rate": 5.2182803447517314e-05, | |
| "loss": 0.4982, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.654268310625901, | |
| "grad_norm": 6.15625, | |
| "learning_rate": 5.1773250259928077e-05, | |
| "loss": 0.5137, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 2.6601536062148723, | |
| "grad_norm": 4.9375, | |
| "learning_rate": 5.136474846113688e-05, | |
| "loss": 0.5293, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 2.666038901803843, | |
| "grad_norm": 7.375, | |
| "learning_rate": 5.09573069568697e-05, | |
| "loss": 0.5154, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 2.671924197392814, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 5.055093462973706e-05, | |
| "loss": 0.5202, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 2.677809492981785, | |
| "grad_norm": 18.25, | |
| "learning_rate": 5.014564033904029e-05, | |
| "loss": 0.5225, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.683694788570756, | |
| "grad_norm": 8.5, | |
| "learning_rate": 4.97414329205787e-05, | |
| "loss": 0.5142, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 2.689580084159727, | |
| "grad_norm": 6.21875, | |
| "learning_rate": 4.933832118645656e-05, | |
| "loss": 0.5356, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 2.695465379748698, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 4.893631392489137e-05, | |
| "loss": 0.5121, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 2.701350675337669, | |
| "grad_norm": 7.59375, | |
| "learning_rate": 4.853541990002195e-05, | |
| "loss": 0.5437, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 2.70723597092664, | |
| "grad_norm": 16.625, | |
| "learning_rate": 4.8135647851717516e-05, | |
| "loss": 0.5347, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.7131212665156106, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 4.7737006495387216e-05, | |
| "loss": 0.5152, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 2.719006562104582, | |
| "grad_norm": 32.5, | |
| "learning_rate": 4.7339504521789935e-05, | |
| "loss": 0.4914, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 2.7248918576935526, | |
| "grad_norm": 30.625, | |
| "learning_rate": 4.694315059684507e-05, | |
| "loss": 0.5021, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 2.7307771532825234, | |
| "grad_norm": 14.5625, | |
| "learning_rate": 4.65479533614433e-05, | |
| "loss": 0.5113, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 2.7366624488714946, | |
| "grad_norm": 13.5, | |
| "learning_rate": 4.6153921431258554e-05, | |
| "loss": 0.5169, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 2.742547744460466, | |
| "grad_norm": 18.25, | |
| "learning_rate": 4.576106339655984e-05, | |
| "loss": 0.5086, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 2.7484330400494366, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 4.536938782202431e-05, | |
| "loss": 0.5176, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 2.7543183356384073, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 4.4978903246550195e-05, | |
| "loss": 0.5146, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 2.7602036312273786, | |
| "grad_norm": 14.25, | |
| "learning_rate": 4.4589618183070844e-05, | |
| "loss": 0.5207, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 2.7660889268163493, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 4.42015411183693e-05, | |
| "loss": 0.5122, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.77197422240532, | |
| "grad_norm": 18.25, | |
| "learning_rate": 4.381468051289283e-05, | |
| "loss": 0.5176, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 2.7778595179942913, | |
| "grad_norm": 8.875, | |
| "learning_rate": 4.342904480056893e-05, | |
| "loss": 0.4933, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 2.783744813583262, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 4.304464238862115e-05, | |
| "loss": 0.5001, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 2.7896301091722333, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 4.266148165738593e-05, | |
| "loss": 0.5163, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 2.795515404761204, | |
| "grad_norm": 5.25, | |
| "learning_rate": 4.227957096013e-05, | |
| "loss": 0.5061, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 2.8014007003501753, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 4.1898918622868025e-05, | |
| "loss": 0.5097, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 2.807285995939146, | |
| "grad_norm": 16.125, | |
| "learning_rate": 4.1519532944181374e-05, | |
| "loss": 0.5171, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 2.813171291528117, | |
| "grad_norm": 13.375, | |
| "learning_rate": 4.1141422195036904e-05, | |
| "loss": 0.5217, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 2.819056587117088, | |
| "grad_norm": 8.375, | |
| "learning_rate": 4.0764594618606975e-05, | |
| "loss": 0.5038, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 2.824941882706059, | |
| "grad_norm": 11.75, | |
| "learning_rate": 4.038905843008943e-05, | |
| "loss": 0.4968, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.8308271782950296, | |
| "grad_norm": 15.5625, | |
| "learning_rate": 4.001482181652865e-05, | |
| "loss": 0.5336, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 2.836712473884001, | |
| "grad_norm": 6.28125, | |
| "learning_rate": 3.964189293663715e-05, | |
| "loss": 0.5185, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 2.842597769472972, | |
| "grad_norm": 5.84375, | |
| "learning_rate": 3.9270279920617456e-05, | |
| "loss": 0.501, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 2.848483065061943, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 3.889999086998519e-05, | |
| "loss": 0.5302, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 2.8543683606509136, | |
| "grad_norm": 16.375, | |
| "learning_rate": 3.853103385739213e-05, | |
| "loss": 0.5224, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 2.860253656239885, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 3.8163416926450436e-05, | |
| "loss": 0.5142, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 2.8661389518288556, | |
| "grad_norm": 6.71875, | |
| "learning_rate": 3.7797148091557244e-05, | |
| "loss": 0.5233, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 2.8720242474178264, | |
| "grad_norm": 15.25, | |
| "learning_rate": 3.743223533771982e-05, | |
| "loss": 0.5433, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 2.8779095430067976, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 3.706868662038172e-05, | |
| "loss": 0.5114, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 2.8837948385957684, | |
| "grad_norm": 5.65625, | |
| "learning_rate": 3.670650986524905e-05, | |
| "loss": 0.515, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.8896801341847396, | |
| "grad_norm": 7.09375, | |
| "learning_rate": 3.634571296811801e-05, | |
| "loss": 0.5299, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 2.8955654297737103, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 3.5986303794702445e-05, | |
| "loss": 0.5259, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 2.9014507253626816, | |
| "grad_norm": 12.375, | |
| "learning_rate": 3.5628290180462556e-05, | |
| "loss": 0.5327, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 2.9073360209516523, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 3.527167993043411e-05, | |
| "loss": 0.5047, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 2.913221316540623, | |
| "grad_norm": 18.875, | |
| "learning_rate": 3.4916480819058074e-05, | |
| "loss": 0.5137, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 2.9191066121295943, | |
| "grad_norm": 17.0, | |
| "learning_rate": 3.4562700590011384e-05, | |
| "loss": 0.5224, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 2.924991907718565, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 3.4210346956037894e-05, | |
| "loss": 0.5242, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 2.930877203307536, | |
| "grad_norm": 6.375, | |
| "learning_rate": 3.385942759878042e-05, | |
| "loss": 0.5102, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 2.936762498896507, | |
| "grad_norm": 12.125, | |
| "learning_rate": 3.35099501686131e-05, | |
| "loss": 0.49, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 2.9426477944854783, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 3.316192228447479e-05, | |
| "loss": 0.5086, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.948533090074449, | |
| "grad_norm": 8.25, | |
| "learning_rate": 3.281535153370278e-05, | |
| "loss": 0.5013, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 2.95441838566342, | |
| "grad_norm": 6.65625, | |
| "learning_rate": 3.2470245471867536e-05, | |
| "loss": 0.5204, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 2.960303681252391, | |
| "grad_norm": 8.125, | |
| "learning_rate": 3.212661162260794e-05, | |
| "loss": 0.4943, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 2.966188976841362, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 3.1784457477467135e-05, | |
| "loss": 0.5172, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 2.9720742724303326, | |
| "grad_norm": 7.09375, | |
| "learning_rate": 3.144379049572945e-05, | |
| "loss": 0.5017, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 2.977959568019304, | |
| "grad_norm": 14.9375, | |
| "learning_rate": 3.110461810425754e-05, | |
| "loss": 0.4932, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 2.9838448636082746, | |
| "grad_norm": 18.625, | |
| "learning_rate": 3.076694769733061e-05, | |
| "loss": 0.5163, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 2.989730159197246, | |
| "grad_norm": 29.875, | |
| "learning_rate": 3.043078663648322e-05, | |
| "loss": 0.523, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 2.9956154547862166, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 3.0096142250344683e-05, | |
| "loss": 0.4909, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 3.0015007503751874, | |
| "grad_norm": 15.6875, | |
| "learning_rate": 2.976302183447944e-05, | |
| "loss": 0.5244, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 3.0073860459641586, | |
| "grad_norm": 23.75, | |
| "learning_rate": 2.9431432651227876e-05, | |
| "loss": 0.5018, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 3.0132713415531294, | |
| "grad_norm": 17.375, | |
| "learning_rate": 2.9101381929548122e-05, | |
| "loss": 0.5074, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 3.0191566371421006, | |
| "grad_norm": 14.0, | |
| "learning_rate": 2.8772876864858333e-05, | |
| "loss": 0.5075, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 3.0250419327310714, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 2.844592461887987e-05, | |
| "loss": 0.5093, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 3.0309272283200426, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 2.812053231948125e-05, | |
| "loss": 0.5173, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 3.0368125239090134, | |
| "grad_norm": 24.375, | |
| "learning_rate": 2.7796707060522588e-05, | |
| "loss": 0.5349, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 3.042697819497984, | |
| "grad_norm": 16.5, | |
| "learning_rate": 2.747445590170109e-05, | |
| "loss": 0.5164, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 3.0485831150869553, | |
| "grad_norm": 16.5, | |
| "learning_rate": 2.715378586839713e-05, | |
| "loss": 0.5046, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 3.054468410675926, | |
| "grad_norm": 9.25, | |
| "learning_rate": 2.6834703951520913e-05, | |
| "loss": 0.5054, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 3.0603537062648973, | |
| "grad_norm": 18.0, | |
| "learning_rate": 2.651721710736036e-05, | |
| "loss": 0.5007, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 3.066239001853868, | |
| "grad_norm": 10.25, | |
| "learning_rate": 2.6201332257429156e-05, | |
| "loss": 0.5306, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 3.072124297442839, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 2.5887056288316125e-05, | |
| "loss": 0.5168, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 3.07800959303181, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 2.5574396051534832e-05, | |
| "loss": 0.5217, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 3.083894888620781, | |
| "grad_norm": 17.625, | |
| "learning_rate": 2.526335836337449e-05, | |
| "loss": 0.4916, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 3.089780184209752, | |
| "grad_norm": 10.75, | |
| "learning_rate": 2.4953950004751105e-05, | |
| "loss": 0.5206, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 3.095665479798723, | |
| "grad_norm": 19.0, | |
| "learning_rate": 2.464617772105977e-05, | |
| "loss": 0.5269, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 3.1015507753876936, | |
| "grad_norm": 12.625, | |
| "learning_rate": 2.434004822202769e-05, | |
| "loss": 0.5039, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 3.107436070976665, | |
| "grad_norm": 30.25, | |
| "learning_rate": 2.403556818156767e-05, | |
| "loss": 0.5176, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 3.1133213665656356, | |
| "grad_norm": 7.875, | |
| "learning_rate": 2.3732744237632885e-05, | |
| "loss": 0.4943, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 3.119206662154607, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 2.3431582992071932e-05, | |
| "loss": 0.4948, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 3.1250919577435776, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 2.3132091010485103e-05, | |
| "loss": 0.5129, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 3.1309772533325484, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 2.283427482208107e-05, | |
| "loss": 0.5268, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 3.1368625489215196, | |
| "grad_norm": 5.6875, | |
| "learning_rate": 2.2538140919534678e-05, | |
| "loss": 0.5075, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 3.1427478445104904, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 2.2243695758845374e-05, | |
| "loss": 0.5011, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 3.1486331400994616, | |
| "grad_norm": 6.21875, | |
| "learning_rate": 2.195094575919634e-05, | |
| "loss": 0.5118, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 3.1545184356884324, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 2.1659897302814747e-05, | |
| "loss": 0.5277, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 3.1604037312774036, | |
| "grad_norm": 6.8125, | |
| "learning_rate": 2.1370556734832427e-05, | |
| "loss": 0.5392, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 3.1662890268663744, | |
| "grad_norm": 23.25, | |
| "learning_rate": 2.1082930363147714e-05, | |
| "loss": 0.5214, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 3.172174322455345, | |
| "grad_norm": 14.25, | |
| "learning_rate": 2.0797024458287752e-05, | |
| "loss": 0.5209, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 3.1780596180443164, | |
| "grad_norm": 14.0, | |
| "learning_rate": 2.0512845253271895e-05, | |
| "loss": 0.5026, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 3.183944913633287, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 2.0230398943475905e-05, | |
| "loss": 0.5209, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 3.1898302092222584, | |
| "grad_norm": 9.625, | |
| "learning_rate": 1.994969168649663e-05, | |
| "loss": 0.5195, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 3.195715504811229, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 1.967072960201808e-05, | |
| "loss": 0.5069, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 3.2016008004002, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 1.939351877167771e-05, | |
| "loss": 0.5104, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 3.207486095989171, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 1.9118065238934103e-05, | |
| "loss": 0.4954, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 3.213371391578142, | |
| "grad_norm": 10.875, | |
| "learning_rate": 1.884437500893499e-05, | |
| "loss": 0.5009, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 3.219256687167113, | |
| "grad_norm": 6.25, | |
| "learning_rate": 1.8572454048386455e-05, | |
| "loss": 0.5053, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 3.225141982756084, | |
| "grad_norm": 6.125, | |
| "learning_rate": 1.8302308285422908e-05, | |
| "loss": 0.5228, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 3.2310272783450547, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 1.8033943609477632e-05, | |
| "loss": 0.5134, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 3.236912573934026, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 1.7767365871154717e-05, | |
| "loss": 0.5123, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 3.2427978695229966, | |
| "grad_norm": 20.375, | |
| "learning_rate": 1.750258088210116e-05, | |
| "loss": 0.5023, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 3.248683165111968, | |
| "grad_norm": 5.5, | |
| "learning_rate": 1.7239594414880356e-05, | |
| "loss": 0.5162, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 3.2545684607009386, | |
| "grad_norm": 6.0625, | |
| "learning_rate": 1.6978412202846294e-05, | |
| "loss": 0.5163, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 3.26045375628991, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.6719039940018388e-05, | |
| "loss": 0.5008, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 3.2663390518788806, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 1.6461483280957568e-05, | |
| "loss": 0.5165, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 3.2722243474678514, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.620574784064275e-05, | |
| "loss": 0.5062, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 3.2781096430568226, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 1.5951839194348683e-05, | |
| "loss": 0.5227, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 3.2839949386457934, | |
| "grad_norm": 18.625, | |
| "learning_rate": 1.5699762877524193e-05, | |
| "loss": 0.5, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 3.2898802342347646, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.5449524385671588e-05, | |
| "loss": 0.5159, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 3.2957655298237354, | |
| "grad_norm": 5.9375, | |
| "learning_rate": 1.5201129174226936e-05, | |
| "loss": 0.513, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 3.3016508254127066, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 1.4954582658440919e-05, | |
| "loss": 0.5171, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 3.3075361210016774, | |
| "grad_norm": 18.375, | |
| "learning_rate": 1.4709890213261047e-05, | |
| "loss": 0.5302, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 3.313421416590648, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.4467057173214194e-05, | |
| "loss": 0.4993, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 3.3193067121796194, | |
| "grad_norm": 13.3125, | |
| "learning_rate": 1.4226088832290574e-05, | |
| "loss": 0.5359, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 3.32519200776859, | |
| "grad_norm": 18.5, | |
| "learning_rate": 1.3986990443828074e-05, | |
| "loss": 0.5267, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 3.331077303357561, | |
| "grad_norm": 7.78125, | |
| "learning_rate": 1.3749767220397935e-05, | |
| "loss": 0.5227, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 3.336962598946532, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 1.3514424333691011e-05, | |
| "loss": 0.5096, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 3.342847894535503, | |
| "grad_norm": 10.375, | |
| "learning_rate": 1.328096691440498e-05, | |
| "loss": 0.4976, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 3.348733190124474, | |
| "grad_norm": 12.875, | |
| "learning_rate": 1.304940005213262e-05, | |
| "loss": 0.5155, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 3.354618485713445, | |
| "grad_norm": 7.375, | |
| "learning_rate": 1.2819728795250708e-05, | |
| "loss": 0.5168, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 3.360503781302416, | |
| "grad_norm": 10.125, | |
| "learning_rate": 1.2591958150810102e-05, | |
| "loss": 0.5212, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 3.366389076891387, | |
| "grad_norm": 13.75, | |
| "learning_rate": 1.2366093084426433e-05, | |
| "loss": 0.5127, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 3.3722743724803577, | |
| "grad_norm": 10.25, | |
| "learning_rate": 1.2142138520171965e-05, | |
| "loss": 0.5413, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 3.378159668069329, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.1920099340468227e-05, | |
| "loss": 0.5217, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 3.3840449636582997, | |
| "grad_norm": 16.75, | |
| "learning_rate": 1.1699980385979504e-05, | |
| "loss": 0.4949, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 3.389930259247271, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 1.1481786455507415e-05, | |
| "loss": 0.4959, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 3.3958155548362416, | |
| "grad_norm": 7.5, | |
| "learning_rate": 1.1265522305886156e-05, | |
| "loss": 0.5145, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 3.4017008504252124, | |
| "grad_norm": 23.875, | |
| "learning_rate": 1.1051192651878938e-05, | |
| "loss": 0.5159, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 3.4075861460141836, | |
| "grad_norm": 9.125, | |
| "learning_rate": 1.0838802166075123e-05, | |
| "loss": 0.5329, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 3.4134714416031544, | |
| "grad_norm": 8.25, | |
| "learning_rate": 1.0628355478788321e-05, | |
| "loss": 0.4948, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 3.4193567371921256, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 1.0419857177955562e-05, | |
| "loss": 0.508, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 3.4252420327810964, | |
| "grad_norm": 10.5, | |
| "learning_rate": 1.0213311809037173e-05, | |
| "loss": 0.5162, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 3.431127328370067, | |
| "grad_norm": 16.0, | |
| "learning_rate": 1.0008723874917747e-05, | |
| "loss": 0.5129, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 3.4370126239590384, | |
| "grad_norm": 10.625, | |
| "learning_rate": 9.806097835807903e-06, | |
| "loss": 0.5129, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 3.442897919548009, | |
| "grad_norm": 26.375, | |
| "learning_rate": 9.605438109147068e-06, | |
| "loss": 0.5151, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 3.4487832151369804, | |
| "grad_norm": 11.25, | |
| "learning_rate": 9.406749069507303e-06, | |
| "loss": 0.515, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 3.454668510725951, | |
| "grad_norm": 13.25, | |
| "learning_rate": 9.210035048497722e-06, | |
| "loss": 0.5047, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 3.4605538063149224, | |
| "grad_norm": 5.375, | |
| "learning_rate": 9.015300334670219e-06, | |
| "loss": 0.5125, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 3.466439101903893, | |
| "grad_norm": 11.75, | |
| "learning_rate": 8.822549173425876e-06, | |
| "loss": 0.5258, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 3.472324397492864, | |
| "grad_norm": 6.9375, | |
| "learning_rate": 8.631785766922507e-06, | |
| "loss": 0.5084, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 3.478209693081835, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 8.443014273982953e-06, | |
| "loss": 0.5027, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 3.484094988670806, | |
| "grad_norm": 17.75, | |
| "learning_rate": 8.256238810004424e-06, | |
| "loss": 0.5255, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 3.489980284259777, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 8.071463446868899e-06, | |
| "loss": 0.5119, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 3.495865579848748, | |
| "grad_norm": 28.375, | |
| "learning_rate": 7.888692212854165e-06, | |
| "loss": 0.507, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 3.501750875437719, | |
| "grad_norm": 19.875, | |
| "learning_rate": 7.707929092546185e-06, | |
| "loss": 0.5097, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 3.50763617102669, | |
| "grad_norm": 8.0, | |
| "learning_rate": 7.52917802675206e-06, | |
| "loss": 0.5138, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 3.5135214666156607, | |
| "grad_norm": 11.5, | |
| "learning_rate": 7.352442912414259e-06, | |
| "loss": 0.5213, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 3.519406762204632, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 7.1777276025256075e-06, | |
| "loss": 0.4977, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 3.5252920577936027, | |
| "grad_norm": 10.625, | |
| "learning_rate": 7.005035906045199e-06, | |
| "loss": 0.5094, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 3.5311773533825734, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 6.834371587815547e-06, | |
| "loss": 0.5202, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.5370626489715447, | |
| "grad_norm": 21.125, | |
| "learning_rate": 6.665738368480301e-06, | |
| "loss": 0.5069, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 3.5429479445605154, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 6.4991399244033306e-06, | |
| "loss": 0.5218, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 3.5488332401494866, | |
| "grad_norm": 6.875, | |
| "learning_rate": 6.334579887588377e-06, | |
| "loss": 0.5049, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 3.5547185357384574, | |
| "grad_norm": 12.0, | |
| "learning_rate": 6.172061845600053e-06, | |
| "loss": 0.5291, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 3.5606038313274286, | |
| "grad_norm": 36.5, | |
| "learning_rate": 6.011589341485524e-06, | |
| "loss": 0.5136, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 3.5664891269163994, | |
| "grad_norm": 27.375, | |
| "learning_rate": 5.8531658736972524e-06, | |
| "loss": 0.5103, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 3.57237442250537, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 5.696794896016866e-06, | |
| "loss": 0.5087, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 3.5782597180943414, | |
| "grad_norm": 24.75, | |
| "learning_rate": 5.542479817479651e-06, | |
| "loss": 0.5077, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 3.584145013683312, | |
| "grad_norm": 22.875, | |
| "learning_rate": 5.390224002300437e-06, | |
| "loss": 0.5295, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 3.590030309272283, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 5.240030769800108e-06, | |
| "loss": 0.52, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.595915604861254, | |
| "grad_norm": 27.75, | |
| "learning_rate": 5.091903394333331e-06, | |
| "loss": 0.5079, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 3.6018009004502254, | |
| "grad_norm": 14.25, | |
| "learning_rate": 4.945845105217117e-06, | |
| "loss": 0.5164, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 3.607686196039196, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 4.801859086660387e-06, | |
| "loss": 0.5226, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 3.613571491628167, | |
| "grad_norm": 38.25, | |
| "learning_rate": 4.659948477694709e-06, | |
| "loss": 0.5266, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 3.619456787217138, | |
| "grad_norm": 17.0, | |
| "learning_rate": 4.520116372105665e-06, | |
| "loss": 0.5286, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 3.625342082806109, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 4.382365818365552e-06, | |
| "loss": 0.4915, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 3.6312273783950797, | |
| "grad_norm": 26.375, | |
| "learning_rate": 4.246699819566824e-06, | |
| "loss": 0.5006, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 3.637112673984051, | |
| "grad_norm": 6.0, | |
| "learning_rate": 4.1131213333566846e-06, | |
| "loss": 0.5007, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 3.6429979695730217, | |
| "grad_norm": 5.34375, | |
| "learning_rate": 3.981633271872598e-06, | |
| "loss": 0.5202, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 3.648883265161993, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 3.852238501678751e-06, | |
| "loss": 0.5159, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 3.6547685607509637, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 3.7249398437036454e-06, | |
| "loss": 0.511, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 3.660653856339935, | |
| "grad_norm": 27.125, | |
| "learning_rate": 3.5997400731785258e-06, | |
| "loss": 0.5217, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 3.6665391519289057, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 3.4766419195769285e-06, | |
| "loss": 0.5074, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 3.6724244475178764, | |
| "grad_norm": 14.875, | |
| "learning_rate": 3.355648066555117e-06, | |
| "loss": 0.5022, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 3.6783097431068477, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 3.236761151893608e-06, | |
| "loss": 0.501, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 3.6841950386958184, | |
| "grad_norm": 36.5, | |
| "learning_rate": 3.119983767439705e-06, | |
| "loss": 0.5139, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 3.690080334284789, | |
| "grad_norm": 25.0, | |
| "learning_rate": 3.005318459050932e-06, | |
| "loss": 0.5286, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 3.6959656298737604, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 2.892767726539569e-06, | |
| "loss": 0.524, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 3.701850925462731, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 2.7823340236181162e-06, | |
| "loss": 0.5196, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 3.7077362210517024, | |
| "grad_norm": 19.625, | |
| "learning_rate": 2.674019757845847e-06, | |
| "loss": 0.5073, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 3.713621516640673, | |
| "grad_norm": 16.375, | |
| "learning_rate": 2.567827290576297e-06, | |
| "loss": 0.5043, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 3.7195068122296444, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 2.463758936905758e-06, | |
| "loss": 0.5134, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 3.725392107818615, | |
| "grad_norm": 16.125, | |
| "learning_rate": 2.3618169656228873e-06, | |
| "loss": 0.5175, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 3.731277403407586, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 2.2620035991591238e-06, | |
| "loss": 0.5269, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 3.737162698996557, | |
| "grad_norm": 19.75, | |
| "learning_rate": 2.1643210135403825e-06, | |
| "loss": 0.5021, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 3.743047994585528, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 2.06877133833947e-06, | |
| "loss": 0.5249, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 3.748933290174499, | |
| "grad_norm": 17.625, | |
| "learning_rate": 1.97535665662979e-06, | |
| "loss": 0.5282, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 3.75481858576347, | |
| "grad_norm": 13.875, | |
| "learning_rate": 1.8840790049398095e-06, | |
| "loss": 0.5088, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 3.760703881352441, | |
| "grad_norm": 14.3125, | |
| "learning_rate": 1.7949403732087311e-06, | |
| "loss": 0.5365, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 3.766589176941412, | |
| "grad_norm": 19.875, | |
| "learning_rate": 1.7079427047431485e-06, | |
| "loss": 0.5084, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 3.7724744725303827, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 1.6230878961745577e-06, | |
| "loss": 0.5067, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 3.778359768119354, | |
| "grad_norm": 14.125, | |
| "learning_rate": 1.5403777974181354e-06, | |
| "loss": 0.5016, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 3.7842450637083247, | |
| "grad_norm": 9.0, | |
| "learning_rate": 1.4598142116323156e-06, | |
| "loss": 0.5285, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 3.7901303592972955, | |
| "grad_norm": 22.25, | |
| "learning_rate": 1.3813988951795421e-06, | |
| "loss": 0.5291, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 3.7960156548862667, | |
| "grad_norm": 7.15625, | |
| "learning_rate": 1.3051335575879341e-06, | |
| "loss": 0.4998, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 3.8019009504752375, | |
| "grad_norm": 21.0, | |
| "learning_rate": 1.23101986151406e-06, | |
| "loss": 0.5114, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 3.8077862460642087, | |
| "grad_norm": 18.25, | |
| "learning_rate": 1.1590594227066542e-06, | |
| "loss": 0.5212, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 3.8136715416531795, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.0892538099714023e-06, | |
| "loss": 0.5245, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 3.8195568372421507, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 1.0216045451367452e-06, | |
| "loss": 0.5021, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 3.8254421328311214, | |
| "grad_norm": 5.875, | |
| "learning_rate": 9.561131030206837e-07, | |
| "loss": 0.5257, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.831327428420092, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 8.927809113986607e-07, | |
| "loss": 0.5224, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 3.8372127240090634, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 8.316093509724066e-07, | |
| "loss": 0.5038, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 3.843098019598034, | |
| "grad_norm": 5.625, | |
| "learning_rate": 7.725997553398534e-07, | |
| "loss": 0.5153, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 3.8489833151870054, | |
| "grad_norm": 11.9375, | |
| "learning_rate": 7.157534109660358e-07, | |
| "loss": 0.4947, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 3.854868610775976, | |
| "grad_norm": 18.25, | |
| "learning_rate": 6.610715571550796e-07, | |
| "loss": 0.4974, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 3.8607539063649474, | |
| "grad_norm": 12.125, | |
| "learning_rate": 6.085553860231685e-07, | |
| "loss": 0.498, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 3.866639201953918, | |
| "grad_norm": 15.125, | |
| "learning_rate": 5.582060424725421e-07, | |
| "loss": 0.5182, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 3.872524497542889, | |
| "grad_norm": 17.375, | |
| "learning_rate": 5.100246241665496e-07, | |
| "loss": 0.5096, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 3.87840979313186, | |
| "grad_norm": 10.875, | |
| "learning_rate": 4.640121815057241e-07, | |
| "loss": 0.537, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 3.884295088720831, | |
| "grad_norm": 13.75, | |
| "learning_rate": 4.201697176048791e-07, | |
| "loss": 0.5069, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 3.8901803843098017, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 3.7849818827121465e-07, | |
| "loss": 0.5089, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 3.896065679898773, | |
| "grad_norm": 17.125, | |
| "learning_rate": 3.38998501983534e-07, | |
| "loss": 0.5166, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 3.9019509754877437, | |
| "grad_norm": 14.8125, | |
| "learning_rate": 3.0167151987238187e-07, | |
| "loss": 0.5002, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 3.907836271076715, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 2.665180557013147e-07, | |
| "loss": 0.5074, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 3.9137215666656857, | |
| "grad_norm": 11.125, | |
| "learning_rate": 2.3353887584911528e-07, | |
| "loss": 0.5059, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 3.919606862254657, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 2.0273469929313893e-07, | |
| "loss": 0.5305, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 3.9254921578436277, | |
| "grad_norm": 20.5, | |
| "learning_rate": 1.7410619759358204e-07, | |
| "loss": 0.5114, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 3.9313774534325985, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 1.4765399487889352e-07, | |
| "loss": 0.5084, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 3.9372627490215697, | |
| "grad_norm": 13.375, | |
| "learning_rate": 1.2337866783211915e-07, | |
| "loss": 0.5048, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 3.9431480446105405, | |
| "grad_norm": 16.125, | |
| "learning_rate": 1.012807456783782e-07, | |
| "loss": 0.5414, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 3.9490333401995112, | |
| "grad_norm": 16.25, | |
| "learning_rate": 8.136071017330604e-08, | |
| "loss": 0.5128, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 3.9549186357884825, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 6.361899559250705e-08, | |
| "loss": 0.5239, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 3.9608039313774537, | |
| "grad_norm": 20.625, | |
| "learning_rate": 4.8055988722162106e-08, | |
| "loss": 0.508, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 3.9666892269664245, | |
| "grad_norm": 9.25, | |
| "learning_rate": 3.467202885056864e-08, | |
| "loss": 0.5171, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 3.9725745225553952, | |
| "grad_norm": 8.25, | |
| "learning_rate": 2.346740776070222e-08, | |
| "loss": 0.5199, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 3.9784598181443664, | |
| "grad_norm": 11.5, | |
| "learning_rate": 1.4442369723932648e-08, | |
| "loss": 0.4939, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 3.984345113733337, | |
| "grad_norm": 21.0, | |
| "learning_rate": 7.597111494606069e-09, | |
| "loss": 0.5275, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 3.990230409322308, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 2.9317823058483405e-09, | |
| "loss": 0.5191, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 3.996115704911279, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 4.464838662454618e-10, | |
| "loss": 0.5112, | |
| "step": 67900 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 67964, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 4000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.755619563970458e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |