diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,7214 +1,122 @@ { - "best_global_step": 5000, - "best_metric": 0.047866612672805786, - "best_model_checkpoint": "hieptt/vietnamese-correction-finetuning/checkpoint-5000", - "epoch": 381.6793893129771, - "eval_steps": 5000, - "global_step": 100000, + "best_global_step": 1000, + "best_metric": 1.550229787826538, + "best_model_checkpoint": "hieptt/vietnamese-correction-finetuning/checkpoint-1000", + "epoch": 0.02686222365487415, + "eval_steps": 1000, + "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.3816793893129771, - "grad_norm": 0.7569677233695984, - "learning_rate": 9.990100000000001e-06, - "loss": 0.0719, + "epoch": 0.002686222365487415, + "grad_norm": 0.9133111238479614, + "learning_rate": 4.950000000000001e-06, + "loss": 3.3195, "step": 100 }, { - "epoch": 0.7633587786259542, - "grad_norm": 0.7828747630119324, - "learning_rate": 9.9801e-06, - "loss": 0.0433, + "epoch": 0.00537244473097483, + "grad_norm": 0.7488560080528259, + "learning_rate": 9.950000000000001e-06, + "loss": 1.7797, "step": 200 }, { - "epoch": 1.1450381679389312, - "grad_norm": 0.5225523114204407, - "learning_rate": 9.9701e-06, - "loss": 0.0334, + "epoch": 0.008058667096462245, + "grad_norm": 0.6220685839653015, + "learning_rate": 1.4950000000000001e-05, + "loss": 1.691, "step": 300 }, { - "epoch": 1.5267175572519083, - "grad_norm": 0.4187374413013458, - "learning_rate": 9.9601e-06, - "loss": 0.0222, + "epoch": 0.01074488946194966, + "grad_norm": 0.5194090008735657, + "learning_rate": 1.995e-05, + "loss": 1.6429, "step": 400 }, { - "epoch": 1.9083969465648853, - "grad_norm": 0.6059781312942505, - "learning_rate": 9.9501e-06, - "loss": 0.0213, + "epoch": 0.013431111827437075, + "grad_norm": 0.5367034673690796, + "learning_rate": 2.495e-05, + "loss": 1.6223, "step": 500 }, { - "epoch": 2.2900763358778624, - "grad_norm": 0.25024721026420593, - "learning_rate": 9.9401e-06, - "loss": 0.0159, + "epoch": 0.01611733419292449, + "grad_norm": 0.49578285217285156, + "learning_rate": 2.995e-05, + "loss": 1.6053, "step": 600 }, { - "epoch": 2.67175572519084, - "grad_norm": 0.5520693063735962, - "learning_rate": 9.9301e-06, - "loss": 0.0127, + "epoch": 0.018803556558411906, + "grad_norm": 0.6656105518341064, + "learning_rate": 3.495e-05, + "loss": 1.5915, "step": 700 }, { - "epoch": 3.053435114503817, - "grad_norm": 0.3793954849243164, - "learning_rate": 9.9201e-06, - "loss": 0.0134, + "epoch": 0.02148977892389932, + "grad_norm": 0.4488033950328827, + "learning_rate": 3.995e-05, + "loss": 1.5812, "step": 800 }, { - "epoch": 3.435114503816794, - "grad_norm": 0.26582738757133484, - "learning_rate": 9.9101e-06, - "loss": 0.0086, + "epoch": 0.024176001289386736, + "grad_norm": 0.5082846283912659, + "learning_rate": 4.495e-05, + "loss": 1.573, "step": 900 }, { - "epoch": 3.816793893129771, - "grad_norm": 0.6121692657470703, - "learning_rate": 9.9001e-06, - "loss": 0.0079, + "epoch": 0.02686222365487415, + "grad_norm": 0.5444520711898804, + "learning_rate": 4.995e-05, + "loss": 1.5654, "step": 1000 }, { - "epoch": 4.198473282442748, - "grad_norm": 0.24214144051074982, - "learning_rate": 9.8901e-06, - "loss": 0.0067, - "step": 1100 - }, - { - "epoch": 4.580152671755725, - "grad_norm": 0.28713083267211914, - "learning_rate": 9.880100000000002e-06, - "loss": 0.0056, - "step": 1200 - }, - { - "epoch": 4.961832061068702, - "grad_norm": 0.1301882117986679, - "learning_rate": 9.8701e-06, - "loss": 0.0054, - "step": 1300 - }, - { - "epoch": 5.34351145038168, - "grad_norm": 0.33141568303108215, - "learning_rate": 9.8601e-06, - "loss": 0.0046, - "step": 1400 - }, - { - "epoch": 5.7251908396946565, - "grad_norm": 0.23512718081474304, - "learning_rate": 9.8501e-06, - "loss": 0.0038, - "step": 1500 - }, - { - "epoch": 6.106870229007634, - "grad_norm": 0.08219015598297119, - "learning_rate": 9.840100000000001e-06, - "loss": 0.0038, - "step": 1600 - }, - { - "epoch": 6.488549618320611, - "grad_norm": 0.6633931994438171, - "learning_rate": 9.830100000000001e-06, - "loss": 0.0035, - "step": 1700 - }, - { - "epoch": 6.870229007633588, - "grad_norm": 0.2726692259311676, - "learning_rate": 9.820100000000001e-06, - "loss": 0.0033, - "step": 1800 - }, - { - "epoch": 7.251908396946565, - "grad_norm": 0.2931516766548157, - "learning_rate": 9.810100000000001e-06, - "loss": 0.0034, - "step": 1900 - }, - { - "epoch": 7.633587786259542, - "grad_norm": 0.09338831901550293, - "learning_rate": 9.800100000000001e-06, - "loss": 0.0035, - "step": 2000 - }, - { - "epoch": 8.01526717557252, - "grad_norm": 0.2765468657016754, - "learning_rate": 9.790100000000001e-06, - "loss": 0.003, - "step": 2100 - }, - { - "epoch": 8.396946564885496, - "grad_norm": 0.07544845342636108, - "learning_rate": 9.780100000000001e-06, - "loss": 0.0025, - "step": 2200 - }, - { - "epoch": 8.778625954198473, - "grad_norm": 0.03525325655937195, - "learning_rate": 9.770100000000002e-06, - "loss": 0.0022, - "step": 2300 - }, - { - "epoch": 9.16030534351145, - "grad_norm": 0.05003313347697258, - "learning_rate": 9.760100000000002e-06, - "loss": 0.0024, - "step": 2400 - }, - { - "epoch": 9.541984732824428, - "grad_norm": 0.13240310549736023, - "learning_rate": 9.750100000000002e-06, - "loss": 0.0031, - "step": 2500 - }, - { - "epoch": 9.923664122137405, - "grad_norm": 0.35928553342819214, - "learning_rate": 9.740100000000002e-06, - "loss": 0.0021, - "step": 2600 - }, - { - "epoch": 10.305343511450381, - "grad_norm": 0.1162368506193161, - "learning_rate": 9.730100000000002e-06, - "loss": 0.0014, - "step": 2700 - }, - { - "epoch": 10.68702290076336, - "grad_norm": 0.48981210589408875, - "learning_rate": 9.720100000000002e-06, - "loss": 0.0017, - "step": 2800 - }, - { - "epoch": 11.068702290076336, - "grad_norm": 0.08054999262094498, - "learning_rate": 9.7101e-06, - "loss": 0.0019, - "step": 2900 - }, - { - "epoch": 11.450381679389313, - "grad_norm": 0.02346479892730713, - "learning_rate": 9.7001e-06, - "loss": 0.0016, - "step": 3000 - }, - { - "epoch": 11.83206106870229, - "grad_norm": 0.04746849089860916, - "learning_rate": 9.6901e-06, - "loss": 0.0016, - "step": 3100 - }, - { - "epoch": 12.213740458015268, - "grad_norm": 0.24276158213615417, - "learning_rate": 9.6801e-06, - "loss": 0.0021, - "step": 3200 - }, - { - "epoch": 12.595419847328245, - "grad_norm": 0.09096894413232803, - "learning_rate": 9.6701e-06, - "loss": 0.0013, - "step": 3300 - }, - { - "epoch": 12.977099236641221, - "grad_norm": 0.04415480047464371, - "learning_rate": 9.660100000000001e-06, - "loss": 0.0018, - "step": 3400 - }, - { - "epoch": 13.358778625954198, - "grad_norm": 0.16314099729061127, - "learning_rate": 9.650100000000001e-06, - "loss": 0.0014, - "step": 3500 - }, - { - "epoch": 13.740458015267176, - "grad_norm": 0.3408973515033722, - "learning_rate": 9.640100000000001e-06, - "loss": 0.0017, - "step": 3600 - }, - { - "epoch": 14.122137404580153, - "grad_norm": 0.08021173626184464, - "learning_rate": 9.630100000000001e-06, - "loss": 0.0016, - "step": 3700 - }, - { - "epoch": 14.50381679389313, - "grad_norm": 0.05408208817243576, - "learning_rate": 9.620100000000001e-06, - "loss": 0.0015, - "step": 3800 - }, - { - "epoch": 14.885496183206106, - "grad_norm": 0.09434926509857178, - "learning_rate": 9.610100000000001e-06, - "loss": 0.0021, - "step": 3900 - }, - { - "epoch": 15.267175572519085, - "grad_norm": 0.08686945587396622, - "learning_rate": 9.600100000000001e-06, - "loss": 0.0015, - "step": 4000 - }, - { - "epoch": 15.648854961832061, - "grad_norm": 0.42048439383506775, - "learning_rate": 9.590100000000002e-06, - "loss": 0.0014, - "step": 4100 - }, - { - "epoch": 16.03053435114504, - "grad_norm": 0.02438484877347946, - "learning_rate": 9.580100000000002e-06, - "loss": 0.0015, - "step": 4200 - }, - { - "epoch": 16.412213740458014, - "grad_norm": 0.02097316086292267, - "learning_rate": 9.570100000000002e-06, - "loss": 0.0015, - "step": 4300 - }, - { - "epoch": 16.793893129770993, - "grad_norm": 0.06143781170248985, - "learning_rate": 9.560100000000002e-06, - "loss": 0.0012, - "step": 4400 - }, - { - "epoch": 17.17557251908397, - "grad_norm": 0.0502663180232048, - "learning_rate": 9.5501e-06, - "loss": 0.0014, - "step": 4500 - }, - { - "epoch": 17.557251908396946, - "grad_norm": 0.16805320978164673, - "learning_rate": 9.5401e-06, - "loss": 0.0013, - "step": 4600 - }, - { - "epoch": 17.938931297709924, - "grad_norm": 0.27268025279045105, - "learning_rate": 9.5301e-06, - "loss": 0.0017, - "step": 4700 - }, - { - "epoch": 18.3206106870229, - "grad_norm": 0.45031046867370605, - "learning_rate": 9.5201e-06, - "loss": 0.0017, - "step": 4800 - }, - { - "epoch": 18.702290076335878, - "grad_norm": 0.0784195140004158, - "learning_rate": 9.5101e-06, - "loss": 0.0016, - "step": 4900 - }, - { - "epoch": 19.083969465648856, - "grad_norm": 0.20674675703048706, - "learning_rate": 9.5001e-06, - "loss": 0.0015, - "step": 5000 - }, - { - "epoch": 19.083969465648856, - "eval_loss": 0.047866612672805786, - "eval_runtime": 13.3454, - "eval_sacrebleu": 98.48652363960265, - "eval_samples_per_second": 70.062, - "eval_steps_per_second": 1.124, - "step": 5000 - }, - { - "epoch": 19.46564885496183, - "grad_norm": 0.03532819449901581, - "learning_rate": 9.4901e-06, - "loss": 0.0014, - "step": 5100 - }, - { - "epoch": 19.84732824427481, - "grad_norm": 0.17402729392051697, - "learning_rate": 9.480100000000001e-06, - "loss": 0.001, - "step": 5200 - }, - { - "epoch": 20.229007633587788, - "grad_norm": 0.02294261008501053, - "learning_rate": 9.470100000000001e-06, - "loss": 0.0011, - "step": 5300 - }, - { - "epoch": 20.610687022900763, - "grad_norm": 0.25006458163261414, - "learning_rate": 9.460100000000001e-06, - "loss": 0.0013, - "step": 5400 - }, - { - "epoch": 20.99236641221374, - "grad_norm": 0.04103191941976547, - "learning_rate": 9.450100000000001e-06, - "loss": 0.0014, - "step": 5500 - }, - { - "epoch": 21.374045801526716, - "grad_norm": 0.010699858888983727, - "learning_rate": 9.440100000000001e-06, - "loss": 0.001, - "step": 5600 - }, - { - "epoch": 21.755725190839694, - "grad_norm": 0.021322397515177727, - "learning_rate": 9.430100000000001e-06, - "loss": 0.0015, - "step": 5700 - }, - { - "epoch": 22.137404580152673, - "grad_norm": 0.19421952962875366, - "learning_rate": 9.420100000000002e-06, - "loss": 0.0013, - "step": 5800 - }, - { - "epoch": 22.519083969465647, - "grad_norm": 0.2574252486228943, - "learning_rate": 9.410100000000002e-06, - "loss": 0.0014, - "step": 5900 - }, - { - "epoch": 22.900763358778626, - "grad_norm": 0.5455259680747986, - "learning_rate": 9.400100000000002e-06, - "loss": 0.0014, - "step": 6000 - }, - { - "epoch": 23.282442748091604, - "grad_norm": 0.1753765344619751, - "learning_rate": 9.3901e-06, - "loss": 0.0012, - "step": 6100 - }, - { - "epoch": 23.66412213740458, - "grad_norm": 0.03236771747469902, - "learning_rate": 9.3801e-06, - "loss": 0.0008, - "step": 6200 - }, - { - "epoch": 24.045801526717558, - "grad_norm": 0.008935701102018356, - "learning_rate": 9.3701e-06, - "loss": 0.0015, - "step": 6300 - }, - { - "epoch": 24.427480916030536, - "grad_norm": 0.01094586681574583, - "learning_rate": 9.3601e-06, - "loss": 0.001, - "step": 6400 - }, - { - "epoch": 24.80916030534351, - "grad_norm": 0.06931117922067642, - "learning_rate": 9.3501e-06, - "loss": 0.0013, - "step": 6500 - }, - { - "epoch": 25.19083969465649, - "grad_norm": 0.1000668928027153, - "learning_rate": 9.3401e-06, - "loss": 0.0007, - "step": 6600 - }, - { - "epoch": 25.572519083969464, - "grad_norm": 0.07587302476167679, - "learning_rate": 9.3301e-06, - "loss": 0.0006, - "step": 6700 - }, - { - "epoch": 25.954198473282442, - "grad_norm": 0.5648608207702637, - "learning_rate": 9.3201e-06, - "loss": 0.001, - "step": 6800 - }, - { - "epoch": 26.33587786259542, - "grad_norm": 0.03543608635663986, - "learning_rate": 9.310100000000001e-06, - "loss": 0.0012, - "step": 6900 - }, - { - "epoch": 26.717557251908396, - "grad_norm": 0.032472752034664154, - "learning_rate": 9.300100000000001e-06, - "loss": 0.0007, - "step": 7000 - }, - { - "epoch": 27.099236641221374, - "grad_norm": 0.031916968524456024, - "learning_rate": 9.290100000000001e-06, - "loss": 0.0007, - "step": 7100 - }, - { - "epoch": 27.480916030534353, - "grad_norm": 0.05069916322827339, - "learning_rate": 9.280100000000001e-06, - "loss": 0.0008, - "step": 7200 - }, - { - "epoch": 27.862595419847327, - "grad_norm": 0.02230975404381752, - "learning_rate": 9.270100000000001e-06, - "loss": 0.0007, - "step": 7300 - }, - { - "epoch": 28.244274809160306, - "grad_norm": 0.04306759685277939, - "learning_rate": 9.260100000000001e-06, - "loss": 0.0011, - "step": 7400 - }, - { - "epoch": 28.625954198473284, - "grad_norm": 0.15488262474536896, - "learning_rate": 9.250100000000001e-06, - "loss": 0.0007, - "step": 7500 - }, - { - "epoch": 29.00763358778626, - "grad_norm": 0.2574034631252289, - "learning_rate": 9.240100000000002e-06, - "loss": 0.0014, - "step": 7600 - }, - { - "epoch": 29.389312977099237, - "grad_norm": 0.012467756867408752, - "learning_rate": 9.2301e-06, - "loss": 0.0013, - "step": 7700 - }, - { - "epoch": 29.770992366412212, - "grad_norm": 0.46930211782455444, - "learning_rate": 9.2201e-06, - "loss": 0.0007, - "step": 7800 - }, - { - "epoch": 30.15267175572519, - "grad_norm": 0.015927040949463844, - "learning_rate": 9.2101e-06, - "loss": 0.0019, - "step": 7900 - }, - { - "epoch": 30.53435114503817, - "grad_norm": 0.00500341085717082, - "learning_rate": 9.2001e-06, - "loss": 0.0006, - "step": 8000 - }, - { - "epoch": 30.916030534351144, - "grad_norm": 0.30345290899276733, - "learning_rate": 9.1901e-06, - "loss": 0.0015, - "step": 8100 - }, - { - "epoch": 31.297709923664122, - "grad_norm": 0.07467725872993469, - "learning_rate": 9.1801e-06, - "loss": 0.004, - "step": 8200 - }, - { - "epoch": 31.6793893129771, - "grad_norm": 0.08250703662633896, - "learning_rate": 9.1701e-06, - "loss": 0.0007, - "step": 8300 - }, - { - "epoch": 32.06106870229008, - "grad_norm": 0.025153586640954018, - "learning_rate": 9.1601e-06, - "loss": 0.0009, - "step": 8400 - }, - { - "epoch": 32.44274809160305, - "grad_norm": 0.2508690059185028, - "learning_rate": 9.1501e-06, - "loss": 0.0008, - "step": 8500 - }, - { - "epoch": 32.82442748091603, - "grad_norm": 0.008141218684613705, - "learning_rate": 9.1401e-06, - "loss": 0.001, - "step": 8600 - }, - { - "epoch": 33.20610687022901, - "grad_norm": 0.1588882952928543, - "learning_rate": 9.130100000000001e-06, - "loss": 0.0007, - "step": 8700 - }, - { - "epoch": 33.587786259541986, - "grad_norm": 0.12282819300889969, - "learning_rate": 9.120100000000001e-06, - "loss": 0.0008, - "step": 8800 - }, - { - "epoch": 33.969465648854964, - "grad_norm": 0.28291991353034973, - "learning_rate": 9.110100000000001e-06, - "loss": 0.0007, - "step": 8900 - }, - { - "epoch": 34.35114503816794, - "grad_norm": 0.006024655885994434, - "learning_rate": 9.100100000000001e-06, - "loss": 0.0008, - "step": 9000 - }, - { - "epoch": 34.732824427480914, - "grad_norm": 0.2243477702140808, - "learning_rate": 9.090100000000001e-06, - "loss": 0.0008, - "step": 9100 - }, - { - "epoch": 35.11450381679389, - "grad_norm": 0.05475495010614395, - "learning_rate": 9.080100000000001e-06, - "loss": 0.0007, - "step": 9200 - }, - { - "epoch": 35.49618320610687, - "grad_norm": 0.0677318200469017, - "learning_rate": 9.0701e-06, - "loss": 0.0006, - "step": 9300 - }, - { - "epoch": 35.87786259541985, - "grad_norm": 0.026140468195080757, - "learning_rate": 9.0601e-06, - "loss": 0.0005, - "step": 9400 - }, - { - "epoch": 36.25954198473283, - "grad_norm": 0.10136673599481583, - "learning_rate": 9.0501e-06, - "loss": 0.0007, - "step": 9500 - }, - { - "epoch": 36.6412213740458, - "grad_norm": 0.006094355136156082, - "learning_rate": 9.0401e-06, - "loss": 0.0005, - "step": 9600 - }, - { - "epoch": 37.02290076335878, - "grad_norm": 0.01774986833333969, - "learning_rate": 9.0301e-06, - "loss": 0.001, - "step": 9700 - }, - { - "epoch": 37.404580152671755, - "grad_norm": 0.2515736520290375, - "learning_rate": 9.0201e-06, - "loss": 0.0009, - "step": 9800 - }, - { - "epoch": 37.786259541984734, - "grad_norm": 0.012614740990102291, - "learning_rate": 9.0101e-06, - "loss": 0.0007, - "step": 9900 - }, - { - "epoch": 38.16793893129771, - "grad_norm": 0.021107325330376625, - "learning_rate": 9.0001e-06, - "loss": 0.0009, - "step": 10000 - }, - { - "epoch": 38.16793893129771, - "eval_loss": 0.05267899110913277, - "eval_runtime": 12.1264, - "eval_sacrebleu": 98.4236226120961, - "eval_samples_per_second": 77.105, - "eval_steps_per_second": 1.237, - "step": 10000 - }, - { - "epoch": 38.54961832061069, - "grad_norm": 0.06139297038316727, - "learning_rate": 8.9901e-06, - "loss": 0.0007, - "step": 10100 - }, - { - "epoch": 38.93129770992366, - "grad_norm": 0.35389262437820435, - "learning_rate": 8.9801e-06, - "loss": 0.001, - "step": 10200 - }, - { - "epoch": 39.31297709923664, - "grad_norm": 0.008090398274362087, - "learning_rate": 8.9701e-06, - "loss": 0.0005, - "step": 10300 - }, - { - "epoch": 39.69465648854962, - "grad_norm": 0.010346139781177044, - "learning_rate": 8.9601e-06, - "loss": 0.0011, - "step": 10400 - }, - { - "epoch": 40.0763358778626, - "grad_norm": 0.022665904834866524, - "learning_rate": 8.950100000000001e-06, - "loss": 0.0006, - "step": 10500 - }, - { - "epoch": 40.458015267175576, - "grad_norm": 0.004521017894148827, - "learning_rate": 8.940100000000001e-06, - "loss": 0.0006, - "step": 10600 - }, - { - "epoch": 40.83969465648855, - "grad_norm": 0.154352605342865, - "learning_rate": 8.930100000000001e-06, - "loss": 0.0009, - "step": 10700 - }, - { - "epoch": 41.221374045801525, - "grad_norm": 0.01481171976774931, - "learning_rate": 8.920100000000001e-06, - "loss": 0.0009, - "step": 10800 - }, - { - "epoch": 41.603053435114504, - "grad_norm": 0.08903075754642487, - "learning_rate": 8.9101e-06, - "loss": 0.0008, - "step": 10900 - }, - { - "epoch": 41.98473282442748, - "grad_norm": 0.11959940940141678, - "learning_rate": 8.9001e-06, - "loss": 0.0009, - "step": 11000 - }, - { - "epoch": 42.36641221374046, - "grad_norm": 0.021108567714691162, - "learning_rate": 8.8901e-06, - "loss": 0.0004, - "step": 11100 - }, - { - "epoch": 42.74809160305343, - "grad_norm": 0.056824758648872375, - "learning_rate": 8.8801e-06, - "loss": 0.0004, - "step": 11200 - }, - { - "epoch": 43.12977099236641, - "grad_norm": 0.050357330590486526, - "learning_rate": 8.8701e-06, - "loss": 0.0004, - "step": 11300 - }, - { - "epoch": 43.51145038167939, - "grad_norm": 0.007239227648824453, - "learning_rate": 8.8601e-06, - "loss": 0.0004, - "step": 11400 - }, - { - "epoch": 43.89312977099237, - "grad_norm": 0.011060679331421852, - "learning_rate": 8.8501e-06, - "loss": 0.0006, - "step": 11500 - }, - { - "epoch": 44.274809160305345, - "grad_norm": 0.013822151347994804, - "learning_rate": 8.8401e-06, - "loss": 0.0004, - "step": 11600 - }, - { - "epoch": 44.656488549618324, - "grad_norm": 0.22547906637191772, - "learning_rate": 8.8301e-06, - "loss": 0.0008, - "step": 11700 - }, - { - "epoch": 45.038167938931295, - "grad_norm": 0.33633965253829956, - "learning_rate": 8.8201e-06, - "loss": 0.0006, - "step": 11800 - }, - { - "epoch": 45.41984732824427, - "grad_norm": 0.06807000190019608, - "learning_rate": 8.8101e-06, - "loss": 0.0006, - "step": 11900 - }, - { - "epoch": 45.80152671755725, - "grad_norm": 0.16109545528888702, - "learning_rate": 8.8001e-06, - "loss": 0.0006, - "step": 12000 - }, - { - "epoch": 46.18320610687023, - "grad_norm": 0.028740419074892998, - "learning_rate": 8.7901e-06, - "loss": 0.0007, - "step": 12100 - }, - { - "epoch": 46.56488549618321, - "grad_norm": 0.12738975882530212, - "learning_rate": 8.780100000000001e-06, - "loss": 0.0004, - "step": 12200 - }, - { - "epoch": 46.94656488549618, - "grad_norm": 0.012924674898386002, - "learning_rate": 8.770100000000001e-06, - "loss": 0.0008, - "step": 12300 - }, - { - "epoch": 47.32824427480916, - "grad_norm": 0.1647537797689438, - "learning_rate": 8.760100000000001e-06, - "loss": 0.0009, - "step": 12400 - }, - { - "epoch": 47.70992366412214, - "grad_norm": 0.012098666280508041, - "learning_rate": 8.7501e-06, - "loss": 0.0006, - "step": 12500 - }, - { - "epoch": 48.091603053435115, - "grad_norm": 0.13420873880386353, - "learning_rate": 8.7401e-06, - "loss": 0.0008, - "step": 12600 - }, - { - "epoch": 48.47328244274809, - "grad_norm": 0.02436097338795662, - "learning_rate": 8.7301e-06, - "loss": 0.001, - "step": 12700 - }, - { - "epoch": 48.85496183206107, - "grad_norm": 0.004736614413559437, - "learning_rate": 8.7201e-06, - "loss": 0.0005, - "step": 12800 - }, - { - "epoch": 49.23664122137404, - "grad_norm": 0.025153541937470436, - "learning_rate": 8.7101e-06, - "loss": 0.0004, - "step": 12900 - }, - { - "epoch": 49.61832061068702, - "grad_norm": 0.0038359477184712887, - "learning_rate": 8.7001e-06, - "loss": 0.0004, - "step": 13000 - }, - { - "epoch": 50.0, - "grad_norm": 0.03686102107167244, - "learning_rate": 8.6901e-06, - "loss": 0.0005, - "step": 13100 - }, - { - "epoch": 50.38167938931298, - "grad_norm": 0.0028796440456062555, - "learning_rate": 8.6801e-06, - "loss": 0.0007, - "step": 13200 - }, - { - "epoch": 50.76335877862596, - "grad_norm": 0.01179441250860691, - "learning_rate": 8.6701e-06, - "loss": 0.0004, - "step": 13300 - }, - { - "epoch": 51.14503816793893, - "grad_norm": 0.016312118619680405, - "learning_rate": 8.6601e-06, - "loss": 0.0007, - "step": 13400 - }, - { - "epoch": 51.52671755725191, - "grad_norm": 0.002660544356331229, - "learning_rate": 8.6501e-06, - "loss": 0.0007, - "step": 13500 - }, - { - "epoch": 51.908396946564885, - "grad_norm": 0.001922117662616074, - "learning_rate": 8.6401e-06, - "loss": 0.0003, - "step": 13600 - }, - { - "epoch": 52.29007633587786, - "grad_norm": 0.040188491344451904, - "learning_rate": 8.6301e-06, - "loss": 0.0004, - "step": 13700 - }, - { - "epoch": 52.67175572519084, - "grad_norm": 0.00425290409475565, - "learning_rate": 8.6201e-06, - "loss": 0.0004, - "step": 13800 - }, - { - "epoch": 53.05343511450382, - "grad_norm": 0.12379986047744751, - "learning_rate": 8.6101e-06, - "loss": 0.0005, - "step": 13900 - }, - { - "epoch": 53.43511450381679, - "grad_norm": 0.010486041195690632, - "learning_rate": 8.600100000000001e-06, - "loss": 0.0005, - "step": 14000 - }, - { - "epoch": 53.81679389312977, - "grad_norm": 0.1387258619070053, - "learning_rate": 8.590100000000001e-06, - "loss": 0.0003, - "step": 14100 - }, - { - "epoch": 54.19847328244275, - "grad_norm": 0.010977004654705524, - "learning_rate": 8.580100000000001e-06, - "loss": 0.0004, - "step": 14200 - }, - { - "epoch": 54.58015267175573, - "grad_norm": 0.2865349054336548, - "learning_rate": 8.570100000000001e-06, - "loss": 0.0005, - "step": 14300 - }, - { - "epoch": 54.961832061068705, - "grad_norm": 0.011387079954147339, - "learning_rate": 8.560100000000001e-06, - "loss": 0.0006, - "step": 14400 - }, - { - "epoch": 55.343511450381676, - "grad_norm": 0.06159939244389534, - "learning_rate": 8.550100000000001e-06, - "loss": 0.0011, - "step": 14500 - }, - { - "epoch": 55.725190839694655, - "grad_norm": 0.04341690614819527, - "learning_rate": 8.540100000000001e-06, - "loss": 0.0005, - "step": 14600 - }, - { - "epoch": 56.10687022900763, - "grad_norm": 0.0031388052739202976, - "learning_rate": 8.530100000000002e-06, - "loss": 0.0003, - "step": 14700 - }, - { - "epoch": 56.48854961832061, - "grad_norm": 0.3307536840438843, - "learning_rate": 8.520100000000002e-06, - "loss": 0.0004, - "step": 14800 - }, - { - "epoch": 56.87022900763359, - "grad_norm": 0.005132837221026421, - "learning_rate": 8.510100000000002e-06, - "loss": 0.0005, - "step": 14900 - }, - { - "epoch": 57.25190839694657, - "grad_norm": 0.07858143001794815, - "learning_rate": 8.500100000000002e-06, - "loss": 0.0004, - "step": 15000 - }, - { - "epoch": 57.25190839694657, - "eval_loss": 0.05650029703974724, - "eval_runtime": 12.4083, - "eval_sacrebleu": 98.48679103916423, - "eval_samples_per_second": 75.353, - "eval_steps_per_second": 1.209, - "step": 15000 - }, - { - "epoch": 57.63358778625954, - "grad_norm": 0.2156197428703308, - "learning_rate": 8.490100000000002e-06, - "loss": 0.0008, - "step": 15100 - }, - { - "epoch": 58.01526717557252, - "grad_norm": 0.01222946122288704, - "learning_rate": 8.4801e-06, - "loss": 0.0006, - "step": 15200 - }, - { - "epoch": 58.396946564885496, - "grad_norm": 1.4871057271957397, - "learning_rate": 8.4701e-06, - "loss": 0.0004, - "step": 15300 - }, - { - "epoch": 58.778625954198475, - "grad_norm": 0.010478519834578037, - "learning_rate": 8.4601e-06, - "loss": 0.0007, - "step": 15400 - }, - { - "epoch": 59.16030534351145, - "grad_norm": 0.022521814331412315, - "learning_rate": 8.4501e-06, - "loss": 0.0006, - "step": 15500 - }, - { - "epoch": 59.541984732824424, - "grad_norm": 0.008455158211290836, - "learning_rate": 8.4401e-06, - "loss": 0.0006, - "step": 15600 - }, - { - "epoch": 59.9236641221374, - "grad_norm": 0.016218015924096107, - "learning_rate": 8.4301e-06, - "loss": 0.0002, - "step": 15700 - }, - { - "epoch": 60.30534351145038, - "grad_norm": 0.008867577649652958, - "learning_rate": 8.420100000000001e-06, - "loss": 0.0005, - "step": 15800 - }, - { - "epoch": 60.68702290076336, - "grad_norm": 0.010618796572089195, - "learning_rate": 8.410100000000001e-06, - "loss": 0.0004, - "step": 15900 - }, - { - "epoch": 61.06870229007634, - "grad_norm": 0.2865886986255646, - "learning_rate": 8.400100000000001e-06, - "loss": 0.0007, - "step": 16000 - }, - { - "epoch": 61.45038167938931, - "grad_norm": 0.053436312824487686, - "learning_rate": 8.390100000000001e-06, - "loss": 0.0004, - "step": 16100 - }, - { - "epoch": 61.83206106870229, - "grad_norm": 0.24601472914218903, - "learning_rate": 8.380100000000001e-06, - "loss": 0.0007, - "step": 16200 - }, - { - "epoch": 62.213740458015266, - "grad_norm": 0.01396193914115429, - "learning_rate": 8.370100000000001e-06, - "loss": 0.0006, - "step": 16300 - }, - { - "epoch": 62.595419847328245, - "grad_norm": 0.05170602351427078, - "learning_rate": 8.360100000000001e-06, - "loss": 0.0004, - "step": 16400 - }, - { - "epoch": 62.97709923664122, - "grad_norm": 0.02577635832130909, - "learning_rate": 8.350100000000002e-06, - "loss": 0.0005, - "step": 16500 - }, - { - "epoch": 63.3587786259542, - "grad_norm": 0.010901645757257938, - "learning_rate": 8.340100000000002e-06, - "loss": 0.0007, - "step": 16600 - }, - { - "epoch": 63.74045801526717, - "grad_norm": 0.002284079324454069, - "learning_rate": 8.330100000000002e-06, - "loss": 0.0003, - "step": 16700 - }, - { - "epoch": 64.12213740458016, - "grad_norm": 0.0018485253676772118, - "learning_rate": 8.3201e-06, - "loss": 0.0004, - "step": 16800 - }, - { - "epoch": 64.50381679389314, - "grad_norm": 0.0015872870571911335, - "learning_rate": 8.3101e-06, - "loss": 0.0005, - "step": 16900 - }, - { - "epoch": 64.8854961832061, - "grad_norm": 0.1890452653169632, - "learning_rate": 8.3001e-06, - "loss": 0.0005, - "step": 17000 - }, - { - "epoch": 65.26717557251908, - "grad_norm": 0.4248383641242981, - "learning_rate": 8.2901e-06, - "loss": 0.0007, - "step": 17100 - }, - { - "epoch": 65.64885496183206, - "grad_norm": 0.019861843436956406, - "learning_rate": 8.2801e-06, - "loss": 0.0003, - "step": 17200 - }, - { - "epoch": 66.03053435114504, - "grad_norm": 0.3097754418849945, - "learning_rate": 8.2701e-06, - "loss": 0.0005, - "step": 17300 - }, - { - "epoch": 66.41221374045801, - "grad_norm": 0.010541427880525589, - "learning_rate": 8.2601e-06, - "loss": 0.0005, - "step": 17400 - }, - { - "epoch": 66.79389312977099, - "grad_norm": 0.35622137784957886, - "learning_rate": 8.250100000000001e-06, - "loss": 0.0004, - "step": 17500 - }, - { - "epoch": 67.17557251908397, - "grad_norm": 0.16328565776348114, - "learning_rate": 8.240100000000001e-06, - "loss": 0.0003, - "step": 17600 - }, - { - "epoch": 67.55725190839695, - "grad_norm": 0.008192314766347408, - "learning_rate": 8.230100000000001e-06, - "loss": 0.0005, - "step": 17700 - }, - { - "epoch": 67.93893129770993, - "grad_norm": 0.060217149555683136, - "learning_rate": 8.220100000000001e-06, - "loss": 0.0003, - "step": 17800 - }, - { - "epoch": 68.3206106870229, - "grad_norm": 0.07561459392309189, - "learning_rate": 8.210100000000001e-06, - "loss": 0.0002, - "step": 17900 - }, - { - "epoch": 68.70229007633588, - "grad_norm": 0.0563255250453949, - "learning_rate": 8.200100000000001e-06, - "loss": 0.0008, - "step": 18000 - }, - { - "epoch": 69.08396946564885, - "grad_norm": 0.3279189169406891, - "learning_rate": 8.190100000000001e-06, - "loss": 0.0007, - "step": 18100 - }, - { - "epoch": 69.46564885496183, - "grad_norm": 0.9007174372673035, - "learning_rate": 8.180100000000002e-06, - "loss": 0.0003, - "step": 18200 - }, - { - "epoch": 69.8473282442748, - "grad_norm": 0.010992350056767464, - "learning_rate": 8.170100000000002e-06, - "loss": 0.0008, - "step": 18300 - }, - { - "epoch": 70.22900763358778, - "grad_norm": 0.00460466556251049, - "learning_rate": 8.1601e-06, - "loss": 0.0003, - "step": 18400 - }, - { - "epoch": 70.61068702290076, - "grad_norm": 0.4365985691547394, - "learning_rate": 8.1501e-06, - "loss": 0.0007, - "step": 18500 - }, - { - "epoch": 70.99236641221374, - "grad_norm": 0.08442062139511108, - "learning_rate": 8.1401e-06, - "loss": 0.0004, - "step": 18600 - }, - { - "epoch": 71.37404580152672, - "grad_norm": 0.01071433536708355, - "learning_rate": 8.1301e-06, - "loss": 0.0002, - "step": 18700 - }, - { - "epoch": 71.7557251908397, - "grad_norm": 0.006448898930102587, - "learning_rate": 8.1201e-06, - "loss": 0.0003, - "step": 18800 - }, - { - "epoch": 72.13740458015268, - "grad_norm": 0.004096378572285175, - "learning_rate": 8.1101e-06, - "loss": 0.0003, - "step": 18900 - }, - { - "epoch": 72.51908396946565, - "grad_norm": 0.017368216067552567, - "learning_rate": 8.1001e-06, - "loss": 0.0002, - "step": 19000 - }, - { - "epoch": 72.90076335877862, - "grad_norm": 0.01318287756294012, - "learning_rate": 8.0901e-06, - "loss": 0.0003, - "step": 19100 - }, - { - "epoch": 73.2824427480916, - "grad_norm": 0.04333416000008583, - "learning_rate": 8.0801e-06, - "loss": 0.0006, - "step": 19200 - }, - { - "epoch": 73.66412213740458, - "grad_norm": 0.1838267594575882, - "learning_rate": 8.070100000000001e-06, - "loss": 0.0004, - "step": 19300 - }, - { - "epoch": 74.04580152671755, - "grad_norm": 0.030327429994940758, - "learning_rate": 8.060100000000001e-06, - "loss": 0.0005, - "step": 19400 - }, - { - "epoch": 74.42748091603053, - "grad_norm": 0.0024014883674681187, - "learning_rate": 8.050100000000001e-06, - "loss": 0.0005, - "step": 19500 - }, - { - "epoch": 74.80916030534351, - "grad_norm": 0.0008008808363229036, - "learning_rate": 8.040100000000001e-06, - "loss": 0.0004, - "step": 19600 - }, - { - "epoch": 75.19083969465649, - "grad_norm": 0.003488209331408143, - "learning_rate": 8.030100000000001e-06, - "loss": 0.0003, - "step": 19700 - }, - { - "epoch": 75.57251908396947, - "grad_norm": 0.025449158623814583, - "learning_rate": 8.020100000000001e-06, - "loss": 0.0003, - "step": 19800 - }, - { - "epoch": 75.95419847328245, - "grad_norm": 0.016767608001828194, - "learning_rate": 8.010100000000001e-06, - "loss": 0.0004, - "step": 19900 - }, - { - "epoch": 76.33587786259542, - "grad_norm": 0.021313291043043137, - "learning_rate": 8.0001e-06, - "loss": 0.0005, - "step": 20000 - }, - { - "epoch": 76.33587786259542, - "eval_loss": 0.05921128764748573, - "eval_runtime": 15.5824, - "eval_sacrebleu": 98.44904998453309, - "eval_samples_per_second": 60.004, - "eval_steps_per_second": 0.963, - "step": 20000 - }, - { - "epoch": 76.7175572519084, - "grad_norm": 0.002728424733504653, - "learning_rate": 7.9901e-06, - "loss": 0.0004, - "step": 20100 - }, - { - "epoch": 77.09923664122137, - "grad_norm": 0.0016456434968858957, - "learning_rate": 7.9801e-06, - "loss": 0.0004, - "step": 20200 - }, - { - "epoch": 77.48091603053435, - "grad_norm": 0.011593434028327465, - "learning_rate": 7.9701e-06, - "loss": 0.0003, - "step": 20300 - }, - { - "epoch": 77.86259541984732, - "grad_norm": 0.05794886499643326, - "learning_rate": 7.9601e-06, - "loss": 0.0004, - "step": 20400 - }, - { - "epoch": 78.2442748091603, - "grad_norm": 0.13322757184505463, - "learning_rate": 7.9501e-06, - "loss": 0.0003, - "step": 20500 - }, - { - "epoch": 78.62595419847328, - "grad_norm": 0.0018421830609440804, - "learning_rate": 7.9401e-06, - "loss": 0.0006, - "step": 20600 - }, - { - "epoch": 79.00763358778626, - "grad_norm": 0.03198527544736862, - "learning_rate": 7.9301e-06, - "loss": 0.0004, - "step": 20700 - }, - { - "epoch": 79.38931297709924, - "grad_norm": 0.003445269539952278, - "learning_rate": 7.9201e-06, - "loss": 0.0003, - "step": 20800 - }, - { - "epoch": 79.77099236641222, - "grad_norm": 0.012325005605816841, - "learning_rate": 7.9101e-06, - "loss": 0.0007, - "step": 20900 - }, - { - "epoch": 80.1526717557252, - "grad_norm": 0.011424236930906773, - "learning_rate": 7.9001e-06, - "loss": 0.0002, - "step": 21000 - }, - { - "epoch": 80.53435114503817, - "grad_norm": 0.008460204117000103, - "learning_rate": 7.890100000000001e-06, - "loss": 0.0001, - "step": 21100 - }, - { - "epoch": 80.91603053435115, - "grad_norm": 0.005678200162947178, - "learning_rate": 7.880100000000001e-06, - "loss": 0.0005, - "step": 21200 - }, - { - "epoch": 81.29770992366412, - "grad_norm": 0.704746425151825, - "learning_rate": 7.870100000000001e-06, - "loss": 0.0003, - "step": 21300 - }, - { - "epoch": 81.6793893129771, - "grad_norm": 0.009600157849490643, - "learning_rate": 7.860100000000001e-06, - "loss": 0.0004, - "step": 21400 - }, - { - "epoch": 82.06106870229007, - "grad_norm": 0.060164470225572586, - "learning_rate": 7.850100000000001e-06, - "loss": 0.0003, - "step": 21500 - }, - { - "epoch": 82.44274809160305, - "grad_norm": 0.010458818636834621, - "learning_rate": 7.8401e-06, - "loss": 0.0006, - "step": 21600 - }, - { - "epoch": 82.82442748091603, - "grad_norm": 0.021136565133929253, - "learning_rate": 7.8301e-06, - "loss": 0.0005, - "step": 21700 - }, - { - "epoch": 83.20610687022901, - "grad_norm": 0.003838100703433156, - "learning_rate": 7.8201e-06, - "loss": 0.0003, - "step": 21800 - }, - { - "epoch": 83.58778625954199, - "grad_norm": 0.006365123670548201, - "learning_rate": 7.8101e-06, - "loss": 0.0007, - "step": 21900 - }, - { - "epoch": 83.96946564885496, - "grad_norm": 0.006348173134028912, - "learning_rate": 7.8001e-06, - "loss": 0.0003, - "step": 22000 - }, - { - "epoch": 84.35114503816794, - "grad_norm": 0.08426064997911453, - "learning_rate": 7.7901e-06, - "loss": 0.0003, - "step": 22100 - }, - { - "epoch": 84.73282442748092, - "grad_norm": 0.1465039700269699, - "learning_rate": 7.7801e-06, - "loss": 0.0004, - "step": 22200 - }, - { - "epoch": 85.1145038167939, - "grad_norm": 0.16135551035404205, - "learning_rate": 7.7701e-06, - "loss": 0.0003, - "step": 22300 - }, - { - "epoch": 85.49618320610686, - "grad_norm": 0.04432014003396034, - "learning_rate": 7.7601e-06, - "loss": 0.0008, - "step": 22400 - }, - { - "epoch": 85.87786259541984, - "grad_norm": 0.023115672171115875, - "learning_rate": 7.7501e-06, - "loss": 0.0007, - "step": 22500 - }, - { - "epoch": 86.25954198473282, - "grad_norm": 0.008059334009885788, - "learning_rate": 7.7401e-06, - "loss": 0.0003, - "step": 22600 - }, - { - "epoch": 86.6412213740458, - "grad_norm": 0.011501766741275787, - "learning_rate": 7.7301e-06, - "loss": 0.0006, - "step": 22700 - }, - { - "epoch": 87.02290076335878, - "grad_norm": 0.04729965701699257, - "learning_rate": 7.7201e-06, - "loss": 0.0005, - "step": 22800 - }, - { - "epoch": 87.40458015267176, - "grad_norm": 0.0022480455227196217, - "learning_rate": 7.710100000000001e-06, - "loss": 0.0003, - "step": 22900 - }, - { - "epoch": 87.78625954198473, - "grad_norm": 0.15582217276096344, - "learning_rate": 7.700100000000001e-06, - "loss": 0.0001, - "step": 23000 - }, - { - "epoch": 88.16793893129771, - "grad_norm": 0.18330171704292297, - "learning_rate": 7.690100000000001e-06, - "loss": 0.0003, - "step": 23100 - }, - { - "epoch": 88.54961832061069, - "grad_norm": 0.052398666739463806, - "learning_rate": 7.6801e-06, - "loss": 0.0002, - "step": 23200 - }, - { - "epoch": 88.93129770992367, - "grad_norm": 0.003869857406243682, - "learning_rate": 7.6701e-06, - "loss": 0.0002, - "step": 23300 - }, - { - "epoch": 89.31297709923665, - "grad_norm": 0.08946622908115387, - "learning_rate": 7.6601e-06, - "loss": 0.0002, - "step": 23400 - }, - { - "epoch": 89.69465648854961, - "grad_norm": 0.002906553214415908, - "learning_rate": 7.6501e-06, - "loss": 0.0002, - "step": 23500 - }, - { - "epoch": 90.07633587786259, - "grad_norm": 0.05098455026745796, - "learning_rate": 7.6401e-06, - "loss": 0.0002, - "step": 23600 - }, - { - "epoch": 90.45801526717557, - "grad_norm": 0.009720506146550179, - "learning_rate": 7.6301e-06, - "loss": 0.0002, - "step": 23700 - }, - { - "epoch": 90.83969465648855, - "grad_norm": 0.001824849401600659, - "learning_rate": 7.6201e-06, - "loss": 0.0003, - "step": 23800 - }, - { - "epoch": 91.22137404580153, - "grad_norm": 0.04330907762050629, - "learning_rate": 7.6101e-06, - "loss": 0.0001, - "step": 23900 - }, - { - "epoch": 91.6030534351145, - "grad_norm": 0.010192295536398888, - "learning_rate": 7.6001e-06, - "loss": 0.0002, - "step": 24000 - }, - { - "epoch": 91.98473282442748, - "grad_norm": 0.0016511849826201797, - "learning_rate": 7.5901e-06, - "loss": 0.0003, - "step": 24100 - }, - { - "epoch": 92.36641221374046, - "grad_norm": 0.0017004406545311213, - "learning_rate": 7.5801000000000005e-06, - "loss": 0.0003, - "step": 24200 - }, - { - "epoch": 92.74809160305344, - "grad_norm": 0.04440297558903694, - "learning_rate": 7.570100000000001e-06, - "loss": 0.0001, - "step": 24300 - }, - { - "epoch": 93.12977099236642, - "grad_norm": 0.005172837525606155, - "learning_rate": 7.5601e-06, - "loss": 0.0004, - "step": 24400 - }, - { - "epoch": 93.5114503816794, - "grad_norm": 0.01078664418309927, - "learning_rate": 7.5501e-06, - "loss": 0.0004, - "step": 24500 - }, - { - "epoch": 93.89312977099236, - "grad_norm": 0.3066396415233612, - "learning_rate": 7.5401e-06, - "loss": 0.0004, - "step": 24600 - }, - { - "epoch": 94.27480916030534, - "grad_norm": 0.0036018453538417816, - "learning_rate": 7.5301e-06, - "loss": 0.0006, - "step": 24700 - }, - { - "epoch": 94.65648854961832, - "grad_norm": 0.017255930230021477, - "learning_rate": 7.5201e-06, - "loss": 0.0003, - "step": 24800 - }, - { - "epoch": 95.0381679389313, - "grad_norm": 0.0082321772351861, - "learning_rate": 7.5101e-06, - "loss": 0.0004, - "step": 24900 - }, - { - "epoch": 95.41984732824427, - "grad_norm": 0.06229685619473457, - "learning_rate": 7.5001e-06, - "loss": 0.0001, - "step": 25000 - }, - { - "epoch": 95.41984732824427, - "eval_loss": 0.06352647393941879, - "eval_runtime": 12.6475, - "eval_sacrebleu": 98.44692377328674, - "eval_samples_per_second": 73.928, - "eval_steps_per_second": 1.186, - "step": 25000 - }, - { - "epoch": 95.80152671755725, - "grad_norm": 0.0039802235551178455, - "learning_rate": 7.4901000000000005e-06, - "loss": 0.0005, - "step": 25100 - }, - { - "epoch": 96.18320610687023, - "grad_norm": 0.005411222577095032, - "learning_rate": 7.4801e-06, - "loss": 0.0003, - "step": 25200 - }, - { - "epoch": 96.56488549618321, - "grad_norm": 0.010220557451248169, - "learning_rate": 7.4701e-06, - "loss": 0.0002, - "step": 25300 - }, - { - "epoch": 96.94656488549619, - "grad_norm": 0.005120398942381144, - "learning_rate": 7.4601e-06, - "loss": 0.0002, - "step": 25400 - }, - { - "epoch": 97.32824427480917, - "grad_norm": 0.007523237727582455, - "learning_rate": 7.4501e-06, - "loss": 0.0004, - "step": 25500 - }, - { - "epoch": 97.70992366412214, - "grad_norm": 0.009140445850789547, - "learning_rate": 7.4401e-06, - "loss": 0.0002, - "step": 25600 - }, - { - "epoch": 98.09160305343511, - "grad_norm": 0.05134638398885727, - "learning_rate": 7.4301e-06, - "loss": 0.0002, - "step": 25700 - }, - { - "epoch": 98.47328244274809, - "grad_norm": 0.018913347274065018, - "learning_rate": 7.4201e-06, - "loss": 0.0002, - "step": 25800 - }, - { - "epoch": 98.85496183206106, - "grad_norm": 0.015694592148065567, - "learning_rate": 7.4101000000000004e-06, - "loss": 0.0002, - "step": 25900 - }, - { - "epoch": 99.23664122137404, - "grad_norm": 0.44005635380744934, - "learning_rate": 7.4001e-06, - "loss": 0.0005, - "step": 26000 - }, - { - "epoch": 99.61832061068702, - "grad_norm": 0.41379034519195557, - "learning_rate": 7.3901e-06, - "loss": 0.0003, - "step": 26100 - }, - { - "epoch": 100.0, - "grad_norm": 0.01757127046585083, - "learning_rate": 7.3801000000000016e-06, - "loss": 0.0002, - "step": 26200 - }, - { - "epoch": 100.38167938931298, - "grad_norm": 0.020249033346772194, - "learning_rate": 7.370100000000001e-06, - "loss": 0.0001, - "step": 26300 - }, - { - "epoch": 100.76335877862596, - "grad_norm": 0.0012605002848431468, - "learning_rate": 7.360100000000001e-06, - "loss": 0.0004, - "step": 26400 - }, - { - "epoch": 101.14503816793894, - "grad_norm": 0.0009935126872733235, - "learning_rate": 7.350100000000001e-06, - "loss": 0.0003, - "step": 26500 - }, - { - "epoch": 101.52671755725191, - "grad_norm": 0.010652081109583378, - "learning_rate": 7.340100000000001e-06, - "loss": 0.0003, - "step": 26600 - }, - { - "epoch": 101.90839694656489, - "grad_norm": 0.007697584573179483, - "learning_rate": 7.330100000000001e-06, - "loss": 0.0002, - "step": 26700 - }, - { - "epoch": 102.29007633587786, - "grad_norm": 0.005309663712978363, - "learning_rate": 7.320100000000001e-06, - "loss": 0.0001, - "step": 26800 - }, - { - "epoch": 102.67175572519083, - "grad_norm": 0.0019546588882803917, - "learning_rate": 7.310100000000001e-06, - "loss": 0.0002, - "step": 26900 - }, - { - "epoch": 103.05343511450381, - "grad_norm": 0.006668527144938707, - "learning_rate": 7.3001000000000015e-06, - "loss": 0.0005, - "step": 27000 - }, - { - "epoch": 103.43511450381679, - "grad_norm": 0.009568038396537304, - "learning_rate": 7.290100000000001e-06, - "loss": 0.0002, - "step": 27100 - }, - { - "epoch": 103.81679389312977, - "grad_norm": 0.007557023782283068, - "learning_rate": 7.280100000000001e-06, - "loss": 0.0004, - "step": 27200 - }, - { - "epoch": 104.19847328244275, - "grad_norm": 0.003133823163807392, - "learning_rate": 7.270100000000001e-06, - "loss": 0.0001, - "step": 27300 - }, - { - "epoch": 104.58015267175573, - "grad_norm": 0.0025260956026613712, - "learning_rate": 7.260100000000001e-06, - "loss": 0.0005, - "step": 27400 - }, - { - "epoch": 104.9618320610687, - "grad_norm": 0.005094371736049652, - "learning_rate": 7.250100000000001e-06, - "loss": 0.0001, - "step": 27500 - }, - { - "epoch": 105.34351145038168, - "grad_norm": 0.0021035203244537115, - "learning_rate": 7.240100000000001e-06, - "loss": 0.0002, - "step": 27600 - }, - { - "epoch": 105.72519083969466, - "grad_norm": 0.01144137978553772, - "learning_rate": 7.230100000000001e-06, - "loss": 0.0002, - "step": 27700 - }, - { - "epoch": 106.10687022900764, - "grad_norm": 0.2611069977283478, - "learning_rate": 7.220100000000001e-06, - "loss": 0.0001, - "step": 27800 - }, - { - "epoch": 106.4885496183206, - "grad_norm": 0.005099534057080746, - "learning_rate": 7.210100000000001e-06, - "loss": 0.0003, - "step": 27900 - }, - { - "epoch": 106.87022900763358, - "grad_norm": 0.0016690207412466407, - "learning_rate": 7.200100000000001e-06, - "loss": 0.0002, - "step": 28000 - }, - { - "epoch": 107.25190839694656, - "grad_norm": 2.1230013370513916, - "learning_rate": 7.190100000000001e-06, - "loss": 0.0002, - "step": 28100 - }, - { - "epoch": 107.63358778625954, - "grad_norm": 0.01016434095799923, - "learning_rate": 7.180100000000001e-06, - "loss": 0.0004, - "step": 28200 - }, - { - "epoch": 108.01526717557252, - "grad_norm": 0.002499851631000638, - "learning_rate": 7.170100000000001e-06, - "loss": 0.0003, - "step": 28300 - }, - { - "epoch": 108.3969465648855, - "grad_norm": 0.025029512122273445, - "learning_rate": 7.160100000000001e-06, - "loss": 0.0002, - "step": 28400 - }, - { - "epoch": 108.77862595419847, - "grad_norm": 0.006664194632321596, - "learning_rate": 7.150100000000001e-06, - "loss": 0.0005, - "step": 28500 - }, - { - "epoch": 109.16030534351145, - "grad_norm": 0.09196371585130692, - "learning_rate": 7.140100000000001e-06, - "loss": 0.0003, - "step": 28600 - }, - { - "epoch": 109.54198473282443, - "grad_norm": 0.005001279059797525, - "learning_rate": 7.1301000000000006e-06, - "loss": 0.0001, - "step": 28700 - }, - { - "epoch": 109.92366412213741, - "grad_norm": 0.08114538341760635, - "learning_rate": 7.120100000000001e-06, - "loss": 0.0001, - "step": 28800 - }, - { - "epoch": 110.30534351145039, - "grad_norm": 0.029932040721178055, - "learning_rate": 7.110100000000001e-06, - "loss": 0.0002, - "step": 28900 - }, - { - "epoch": 110.68702290076335, - "grad_norm": 0.19053128361701965, - "learning_rate": 7.100100000000001e-06, - "loss": 0.0002, - "step": 29000 - }, - { - "epoch": 111.06870229007633, - "grad_norm": 0.0010097407503053546, - "learning_rate": 7.090100000000001e-06, - "loss": 0.0003, - "step": 29100 - }, - { - "epoch": 111.45038167938931, - "grad_norm": 0.06168140470981598, - "learning_rate": 7.080100000000001e-06, - "loss": 0.0001, - "step": 29200 - }, - { - "epoch": 111.83206106870229, - "grad_norm": 0.0871201604604721, - "learning_rate": 7.070100000000001e-06, - "loss": 0.0004, - "step": 29300 - }, - { - "epoch": 112.21374045801527, - "grad_norm": 0.0031851527746766806, - "learning_rate": 7.060100000000001e-06, - "loss": 0.0002, - "step": 29400 - }, - { - "epoch": 112.59541984732824, - "grad_norm": 0.15080685913562775, - "learning_rate": 7.0501000000000005e-06, - "loss": 0.0001, - "step": 29500 - }, - { - "epoch": 112.97709923664122, - "grad_norm": 0.001597443362697959, - "learning_rate": 7.040100000000001e-06, - "loss": 0.0002, - "step": 29600 - }, - { - "epoch": 113.3587786259542, - "grad_norm": 0.0068435585126280785, - "learning_rate": 7.030100000000001e-06, - "loss": 0.0002, - "step": 29700 - }, - { - "epoch": 113.74045801526718, - "grad_norm": 0.00722590833902359, - "learning_rate": 7.020100000000001e-06, - "loss": 0.0001, - "step": 29800 - }, - { - "epoch": 114.12213740458016, - "grad_norm": 0.0014269723324105144, - "learning_rate": 7.010100000000001e-06, - "loss": 0.0006, - "step": 29900 - }, - { - "epoch": 114.50381679389314, - "grad_norm": 0.017447682097554207, - "learning_rate": 7.000100000000001e-06, - "loss": 0.0005, - "step": 30000 - }, - { - "epoch": 114.50381679389314, - "eval_loss": 0.061828721314668655, - "eval_runtime": 13.4007, - "eval_sacrebleu": 98.40366922270569, - "eval_samples_per_second": 69.772, - "eval_steps_per_second": 1.119, - "step": 30000 - }, - { - "epoch": 114.8854961832061, - "grad_norm": 0.01795717515051365, - "learning_rate": 6.990100000000001e-06, - "loss": 0.0002, - "step": 30100 - }, - { - "epoch": 115.26717557251908, - "grad_norm": 0.0028230492025613785, - "learning_rate": 6.980100000000001e-06, - "loss": 0.0004, - "step": 30200 - }, - { - "epoch": 115.64885496183206, - "grad_norm": 0.0036361112724989653, - "learning_rate": 6.9701e-06, - "loss": 0.0001, - "step": 30300 - }, - { - "epoch": 116.03053435114504, - "grad_norm": 0.008832822553813457, - "learning_rate": 6.9601000000000005e-06, - "loss": 0.0001, - "step": 30400 - }, - { - "epoch": 116.41221374045801, - "grad_norm": 0.008196801878511906, - "learning_rate": 6.950100000000001e-06, - "loss": 0.0002, - "step": 30500 - }, - { - "epoch": 116.79389312977099, - "grad_norm": 0.012250511907041073, - "learning_rate": 6.940100000000001e-06, - "loss": 0.0001, - "step": 30600 - }, - { - "epoch": 117.17557251908397, - "grad_norm": 0.007089771796017885, - "learning_rate": 6.930100000000001e-06, - "loss": 0.0001, - "step": 30700 - }, - { - "epoch": 117.55725190839695, - "grad_norm": 0.04049292579293251, - "learning_rate": 6.920100000000001e-06, - "loss": 0.0001, - "step": 30800 - }, - { - "epoch": 117.93893129770993, - "grad_norm": 0.004635801538825035, - "learning_rate": 6.910100000000001e-06, - "loss": 0.0004, - "step": 30900 - }, - { - "epoch": 118.3206106870229, - "grad_norm": 0.05304088443517685, - "learning_rate": 6.900100000000001e-06, - "loss": 0.0002, - "step": 31000 - }, - { - "epoch": 118.70229007633588, - "grad_norm": 0.33209460973739624, - "learning_rate": 6.8901e-06, - "loss": 0.0003, - "step": 31100 - }, - { - "epoch": 119.08396946564885, - "grad_norm": 0.002151914406567812, - "learning_rate": 6.8801e-06, - "loss": 0.0004, - "step": 31200 - }, - { - "epoch": 119.46564885496183, - "grad_norm": 0.037561312317848206, - "learning_rate": 6.8701000000000005e-06, - "loss": 0.0001, - "step": 31300 - }, - { - "epoch": 119.8473282442748, - "grad_norm": 0.03673085942864418, - "learning_rate": 6.860100000000001e-06, - "loss": 0.0001, - "step": 31400 - }, - { - "epoch": 120.22900763358778, - "grad_norm": 0.0021637221798300743, - "learning_rate": 6.850100000000001e-06, - "loss": 0.0003, - "step": 31500 - }, - { - "epoch": 120.61068702290076, - "grad_norm": 0.01722230762243271, - "learning_rate": 6.840100000000001e-06, - "loss": 0.0003, - "step": 31600 - }, - { - "epoch": 120.99236641221374, - "grad_norm": 0.0077673690393567085, - "learning_rate": 6.830100000000001e-06, - "loss": 0.0001, - "step": 31700 - }, - { - "epoch": 121.37404580152672, - "grad_norm": 0.001041444600559771, - "learning_rate": 6.820100000000001e-06, - "loss": 0.0004, - "step": 31800 - }, - { - "epoch": 121.7557251908397, - "grad_norm": 0.019560877233743668, - "learning_rate": 6.8101e-06, - "loss": 0.0003, - "step": 31900 - }, - { - "epoch": 122.13740458015268, - "grad_norm": 0.06604505330324173, - "learning_rate": 6.8001e-06, - "loss": 0.0001, - "step": 32000 - }, - { - "epoch": 122.51908396946565, - "grad_norm": 0.01230511162430048, - "learning_rate": 6.7901000000000004e-06, - "loss": 0.0004, - "step": 32100 - }, - { - "epoch": 122.90076335877862, - "grad_norm": 0.017727281898260117, - "learning_rate": 6.7801000000000005e-06, - "loss": 0.0002, - "step": 32200 - }, - { - "epoch": 123.2824427480916, - "grad_norm": 0.022407682612538338, - "learning_rate": 6.770100000000001e-06, - "loss": 0.0004, - "step": 32300 - }, - { - "epoch": 123.66412213740458, - "grad_norm": 0.03721390292048454, - "learning_rate": 6.760100000000001e-06, - "loss": 0.0001, - "step": 32400 - }, - { - "epoch": 124.04580152671755, - "grad_norm": 0.0007419243338517845, - "learning_rate": 6.750100000000001e-06, - "loss": 0.0001, - "step": 32500 - }, - { - "epoch": 124.42748091603053, - "grad_norm": 0.01308010146021843, - "learning_rate": 6.740100000000001e-06, - "loss": 0.0002, - "step": 32600 - }, - { - "epoch": 124.80916030534351, - "grad_norm": 0.0005770212155766785, - "learning_rate": 6.7301e-06, - "loss": 0.0003, - "step": 32700 - }, - { - "epoch": 125.19083969465649, - "grad_norm": 0.0034893909469246864, - "learning_rate": 6.7201e-06, - "loss": 0.0002, - "step": 32800 - }, - { - "epoch": 125.57251908396947, - "grad_norm": 0.009305262938141823, - "learning_rate": 6.7101e-06, - "loss": 0.0002, - "step": 32900 - }, - { - "epoch": 125.95419847328245, - "grad_norm": 0.000643310253508389, - "learning_rate": 6.7001000000000004e-06, - "loss": 0.0003, - "step": 33000 - }, - { - "epoch": 126.33587786259542, - "grad_norm": 0.014226247556507587, - "learning_rate": 6.6901000000000005e-06, - "loss": 0.0002, - "step": 33100 - }, - { - "epoch": 126.7175572519084, - "grad_norm": 0.0034275040961802006, - "learning_rate": 6.680100000000001e-06, - "loss": 0.0001, - "step": 33200 - }, - { - "epoch": 127.09923664122137, - "grad_norm": 0.0015430613420903683, - "learning_rate": 6.670100000000001e-06, - "loss": 0.0001, - "step": 33300 - }, - { - "epoch": 127.48091603053435, - "grad_norm": 0.18752576410770416, - "learning_rate": 6.660100000000001e-06, - "loss": 0.0002, - "step": 33400 - }, - { - "epoch": 127.86259541984732, - "grad_norm": 0.0032555214129388332, - "learning_rate": 6.6501e-06, - "loss": 0.0002, - "step": 33500 - }, - { - "epoch": 128.24427480916032, - "grad_norm": 0.002811311511322856, - "learning_rate": 6.6401e-06, - "loss": 0.0004, - "step": 33600 - }, - { - "epoch": 128.6259541984733, - "grad_norm": 0.00043868483044207096, - "learning_rate": 6.6301e-06, - "loss": 0.0003, - "step": 33700 - }, - { - "epoch": 129.00763358778627, - "grad_norm": 0.12992912530899048, - "learning_rate": 6.6201e-06, - "loss": 0.0001, - "step": 33800 - }, - { - "epoch": 129.38931297709922, - "grad_norm": 0.07658559828996658, - "learning_rate": 6.6101000000000005e-06, - "loss": 0.0004, - "step": 33900 - }, - { - "epoch": 129.7709923664122, - "grad_norm": 0.0007361789466813207, - "learning_rate": 6.6001000000000006e-06, - "loss": 0.0002, - "step": 34000 - }, - { - "epoch": 130.15267175572518, - "grad_norm": 0.053547557443380356, - "learning_rate": 6.590100000000001e-06, - "loss": 0.0001, - "step": 34100 - }, - { - "epoch": 130.53435114503816, - "grad_norm": 0.007902882993221283, - "learning_rate": 6.580100000000001e-06, - "loss": 0.0001, - "step": 34200 - }, - { - "epoch": 130.91603053435114, - "grad_norm": 0.023441769182682037, - "learning_rate": 6.5701e-06, - "loss": 0.0002, - "step": 34300 - }, - { - "epoch": 131.29770992366412, - "grad_norm": 0.00034708293969742954, - "learning_rate": 6.5601e-06, - "loss": 0.0001, - "step": 34400 - }, - { - "epoch": 131.6793893129771, - "grad_norm": 0.0018309177830815315, - "learning_rate": 6.5501e-06, - "loss": 0.0002, - "step": 34500 - }, - { - "epoch": 132.06106870229007, - "grad_norm": 0.0067507303319871426, - "learning_rate": 6.5401e-06, - "loss": 0.0, - "step": 34600 - }, - { - "epoch": 132.44274809160305, - "grad_norm": 0.0032187646720558405, - "learning_rate": 6.5301e-06, - "loss": 0.0001, - "step": 34700 - }, - { - "epoch": 132.82442748091603, - "grad_norm": 0.00039488039328716695, - "learning_rate": 6.5201000000000005e-06, - "loss": 0.0002, - "step": 34800 - }, - { - "epoch": 133.206106870229, - "grad_norm": 0.007006525062024593, - "learning_rate": 6.5101000000000006e-06, - "loss": 0.0, - "step": 34900 - }, - { - "epoch": 133.58778625954199, - "grad_norm": 0.0017857562052085996, - "learning_rate": 6.500100000000001e-06, - "loss": 0.0002, - "step": 35000 - }, - { - "epoch": 133.58778625954199, - "eval_loss": 0.06826449930667877, - "eval_runtime": 12.8263, - "eval_sacrebleu": 98.27040237882402, - "eval_samples_per_second": 72.897, - "eval_steps_per_second": 1.169, - "step": 35000 - }, - { - "epoch": 133.96946564885496, - "grad_norm": 0.00031158479396253824, - "learning_rate": 6.4901e-06, - "loss": 0.0002, - "step": 35100 - }, - { - "epoch": 134.35114503816794, - "grad_norm": 0.0029920360539108515, - "learning_rate": 6.4801e-06, - "loss": 0.0002, - "step": 35200 - }, - { - "epoch": 134.73282442748092, - "grad_norm": 0.0013086318504065275, - "learning_rate": 6.4701e-06, - "loss": 0.0001, - "step": 35300 - }, - { - "epoch": 135.1145038167939, - "grad_norm": 0.07492779195308685, - "learning_rate": 6.4601e-06, - "loss": 0.0001, - "step": 35400 - }, - { - "epoch": 135.49618320610688, - "grad_norm": 0.0020922007970511913, - "learning_rate": 6.4501e-06, - "loss": 0.0004, - "step": 35500 - }, - { - "epoch": 135.87786259541986, - "grad_norm": 0.0005448561278171837, - "learning_rate": 6.4401e-06, - "loss": 0.0001, - "step": 35600 - }, - { - "epoch": 136.25954198473283, - "grad_norm": 0.11809820681810379, - "learning_rate": 6.4301000000000005e-06, - "loss": 0.0001, - "step": 35700 - }, - { - "epoch": 136.6412213740458, - "grad_norm": 0.017143191769719124, - "learning_rate": 6.420100000000001e-06, - "loss": 0.0001, - "step": 35800 - }, - { - "epoch": 137.0229007633588, - "grad_norm": 0.007829481735825539, - "learning_rate": 6.4101e-06, - "loss": 0.0001, - "step": 35900 - }, - { - "epoch": 137.40458015267177, - "grad_norm": 0.7016693353652954, - "learning_rate": 6.4001e-06, - "loss": 0.0002, - "step": 36000 - }, - { - "epoch": 137.78625954198472, - "grad_norm": 0.004672779235988855, - "learning_rate": 6.3901e-06, - "loss": 0.0001, - "step": 36100 - }, - { - "epoch": 138.1679389312977, - "grad_norm": 0.026437992230057716, - "learning_rate": 6.3801e-06, - "loss": 0.0003, - "step": 36200 - }, - { - "epoch": 138.54961832061068, - "grad_norm": 0.0021988858934491873, - "learning_rate": 6.3701e-06, - "loss": 0.0005, - "step": 36300 - }, - { - "epoch": 138.93129770992365, - "grad_norm": 0.0026272800751030445, - "learning_rate": 6.3601e-06, - "loss": 0.0002, - "step": 36400 - }, - { - "epoch": 139.31297709923663, - "grad_norm": 0.0005723762442357838, - "learning_rate": 6.3501e-06, - "loss": 0.0006, - "step": 36500 - }, - { - "epoch": 139.6946564885496, - "grad_norm": 0.004545825533568859, - "learning_rate": 6.3401000000000005e-06, - "loss": 0.0002, - "step": 36600 - }, - { - "epoch": 140.0763358778626, - "grad_norm": 0.002721506869420409, - "learning_rate": 6.3301e-06, - "loss": 0.0003, - "step": 36700 - }, - { - "epoch": 140.45801526717557, - "grad_norm": 0.004180505871772766, - "learning_rate": 6.3201e-06, - "loss": 0.0001, - "step": 36800 - }, - { - "epoch": 140.83969465648855, - "grad_norm": 0.186207577586174, - "learning_rate": 6.3101e-06, - "loss": 0.0002, - "step": 36900 - }, - { - "epoch": 141.22137404580153, - "grad_norm": 0.0017115280497819185, - "learning_rate": 6.3001e-06, - "loss": 0.0003, - "step": 37000 - }, - { - "epoch": 141.6030534351145, - "grad_norm": 0.0008541855495423079, - "learning_rate": 6.2901e-06, - "loss": 0.0001, - "step": 37100 - }, - { - "epoch": 141.98473282442748, - "grad_norm": 0.002355042612180114, - "learning_rate": 6.2801e-06, - "loss": 0.0003, - "step": 37200 - }, - { - "epoch": 142.36641221374046, - "grad_norm": 0.013107040897011757, - "learning_rate": 6.2701e-06, - "loss": 0.0001, - "step": 37300 - }, - { - "epoch": 142.74809160305344, - "grad_norm": 0.0009587877430021763, - "learning_rate": 6.2601e-06, - "loss": 0.0003, - "step": 37400 - }, - { - "epoch": 143.12977099236642, - "grad_norm": 0.001087266020476818, - "learning_rate": 6.2501e-06, - "loss": 0.0003, - "step": 37500 - }, - { - "epoch": 143.5114503816794, - "grad_norm": 0.008276865817606449, - "learning_rate": 6.2401e-06, - "loss": 0.0002, - "step": 37600 - }, - { - "epoch": 143.89312977099237, - "grad_norm": 0.026819543913006783, - "learning_rate": 6.2301e-06, - "loss": 0.0001, - "step": 37700 - }, - { - "epoch": 144.27480916030535, - "grad_norm": 0.028448406606912613, - "learning_rate": 6.2201e-06, - "loss": 0.0003, - "step": 37800 - }, - { - "epoch": 144.65648854961833, - "grad_norm": 0.0173990149050951, - "learning_rate": 6.2101e-06, - "loss": 0.0002, - "step": 37900 - }, - { - "epoch": 145.0381679389313, - "grad_norm": 0.0007025286322459579, - "learning_rate": 6.2001e-06, - "loss": 0.0001, - "step": 38000 - }, - { - "epoch": 145.4198473282443, - "grad_norm": 0.07833188027143478, - "learning_rate": 6.1901e-06, - "loss": 0.0006, - "step": 38100 - }, - { - "epoch": 145.80152671755727, - "grad_norm": 0.024744229391217232, - "learning_rate": 6.1801e-06, - "loss": 0.0005, - "step": 38200 - }, - { - "epoch": 146.18320610687022, - "grad_norm": 0.003040940035134554, - "learning_rate": 6.1701e-06, - "loss": 0.0001, - "step": 38300 - }, - { - "epoch": 146.5648854961832, - "grad_norm": 0.0012232158333063126, - "learning_rate": 6.1601e-06, - "loss": 0.0002, - "step": 38400 - }, - { - "epoch": 146.94656488549617, - "grad_norm": 0.026980726048350334, - "learning_rate": 6.1501e-06, - "loss": 0.0001, - "step": 38500 - }, - { - "epoch": 147.32824427480915, - "grad_norm": 0.026128340512514114, - "learning_rate": 6.1401e-06, - "loss": 0.0001, - "step": 38600 - }, - { - "epoch": 147.70992366412213, - "grad_norm": 0.034830208867788315, - "learning_rate": 6.130100000000001e-06, - "loss": 0.0001, - "step": 38700 - }, - { - "epoch": 148.0916030534351, - "grad_norm": 0.0024085906334221363, - "learning_rate": 6.120100000000001e-06, - "loss": 0.0001, - "step": 38800 - }, - { - "epoch": 148.4732824427481, - "grad_norm": 0.016766740009188652, - "learning_rate": 6.110100000000001e-06, - "loss": 0.0001, - "step": 38900 - }, - { - "epoch": 148.85496183206106, - "grad_norm": 0.001144357374869287, - "learning_rate": 6.100100000000001e-06, - "loss": 0.0002, - "step": 39000 - }, - { - "epoch": 149.23664122137404, - "grad_norm": 0.001168467104434967, - "learning_rate": 6.090100000000001e-06, - "loss": 0.0001, - "step": 39100 - }, - { - "epoch": 149.61832061068702, - "grad_norm": 0.0004284460737835616, - "learning_rate": 6.080100000000001e-06, - "loss": 0.0001, - "step": 39200 - }, - { - "epoch": 150.0, - "grad_norm": 0.0004981591482646763, - "learning_rate": 6.070100000000001e-06, - "loss": 0.0001, - "step": 39300 - }, - { - "epoch": 150.38167938931298, - "grad_norm": 0.0008872957550920546, - "learning_rate": 6.060100000000001e-06, - "loss": 0.0001, - "step": 39400 - }, - { - "epoch": 150.76335877862596, - "grad_norm": 0.009406983852386475, - "learning_rate": 6.050100000000001e-06, - "loss": 0.0001, - "step": 39500 - }, - { - "epoch": 151.14503816793894, - "grad_norm": 0.012963274493813515, - "learning_rate": 6.040100000000001e-06, - "loss": 0.0, - "step": 39600 - }, - { - "epoch": 151.5267175572519, - "grad_norm": 0.003453217213973403, - "learning_rate": 6.030100000000001e-06, - "loss": 0.0001, - "step": 39700 - }, - { - "epoch": 151.9083969465649, - "grad_norm": 0.0005413664039224386, - "learning_rate": 6.020100000000001e-06, - "loss": 0.0001, - "step": 39800 - }, - { - "epoch": 152.29007633587787, - "grad_norm": 0.011132200248539448, - "learning_rate": 6.010100000000001e-06, - "loss": 0.0001, - "step": 39900 - }, - { - "epoch": 152.67175572519085, - "grad_norm": 0.10698964446783066, - "learning_rate": 6.000100000000001e-06, - "loss": 0.0001, - "step": 40000 - }, - { - "epoch": 152.67175572519085, - "eval_loss": 0.06923888623714447, - "eval_runtime": 12.8684, - "eval_sacrebleu": 98.3530073096174, - "eval_samples_per_second": 72.659, - "eval_steps_per_second": 1.166, - "step": 40000 - }, - { - "epoch": 153.05343511450383, - "grad_norm": 0.00023354697623290122, - "learning_rate": 5.990100000000001e-06, - "loss": 0.0001, - "step": 40100 - }, - { - "epoch": 153.4351145038168, - "grad_norm": 0.0008447120781056583, - "learning_rate": 5.9801000000000006e-06, - "loss": 0.0002, - "step": 40200 - }, - { - "epoch": 153.81679389312978, - "grad_norm": 0.0020947318989783525, - "learning_rate": 5.970100000000001e-06, - "loss": 0.0001, - "step": 40300 - }, - { - "epoch": 154.19847328244273, - "grad_norm": 0.1252506524324417, - "learning_rate": 5.960100000000001e-06, - "loss": 0.0002, - "step": 40400 - }, - { - "epoch": 154.5801526717557, - "grad_norm": 0.0017271244432777166, - "learning_rate": 5.950100000000001e-06, - "loss": 0.0002, - "step": 40500 - }, - { - "epoch": 154.9618320610687, - "grad_norm": 0.0011080058757215738, - "learning_rate": 5.940100000000001e-06, - "loss": 0.0002, - "step": 40600 - }, - { - "epoch": 155.34351145038167, - "grad_norm": 0.2149689793586731, - "learning_rate": 5.930100000000001e-06, - "loss": 0.0001, - "step": 40700 - }, - { - "epoch": 155.72519083969465, - "grad_norm": 0.0002960397396236658, - "learning_rate": 5.920100000000001e-06, - "loss": 0.0001, - "step": 40800 - }, - { - "epoch": 156.10687022900763, - "grad_norm": 0.0021136582363396883, - "learning_rate": 5.910100000000001e-06, - "loss": 0.0004, - "step": 40900 - }, - { - "epoch": 156.4885496183206, - "grad_norm": 0.0002561765140853822, - "learning_rate": 5.9001000000000005e-06, - "loss": 0.0001, - "step": 41000 - }, - { - "epoch": 156.87022900763358, - "grad_norm": 0.00465123075991869, - "learning_rate": 5.890100000000001e-06, - "loss": 0.0002, - "step": 41100 - }, - { - "epoch": 157.25190839694656, - "grad_norm": 0.03916554898023605, - "learning_rate": 5.880100000000001e-06, - "loss": 0.0004, - "step": 41200 - }, - { - "epoch": 157.63358778625954, - "grad_norm": 0.026591990143060684, - "learning_rate": 5.870100000000001e-06, - "loss": 0.0001, - "step": 41300 - }, - { - "epoch": 158.01526717557252, - "grad_norm": 0.005483448039740324, - "learning_rate": 5.860100000000001e-06, - "loss": 0.0003, - "step": 41400 - }, - { - "epoch": 158.3969465648855, - "grad_norm": 0.0037258814554661512, - "learning_rate": 5.850100000000001e-06, - "loss": 0.0001, - "step": 41500 - }, - { - "epoch": 158.77862595419847, - "grad_norm": 0.0006981261540204287, - "learning_rate": 5.840100000000001e-06, - "loss": 0.0001, - "step": 41600 - }, - { - "epoch": 159.16030534351145, - "grad_norm": 0.0013413112610578537, - "learning_rate": 5.830100000000001e-06, - "loss": 0.0, - "step": 41700 - }, - { - "epoch": 159.54198473282443, - "grad_norm": 0.0013540409272536635, - "learning_rate": 5.8201e-06, - "loss": 0.0001, - "step": 41800 - }, - { - "epoch": 159.9236641221374, - "grad_norm": 0.08201773464679718, - "learning_rate": 5.8101000000000005e-06, - "loss": 0.0001, - "step": 41900 - }, - { - "epoch": 160.3053435114504, - "grad_norm": 0.001905901008285582, - "learning_rate": 5.800100000000001e-06, - "loss": 0.0, - "step": 42000 - }, - { - "epoch": 160.68702290076337, - "grad_norm": 0.0014741268241778016, - "learning_rate": 5.790100000000001e-06, - "loss": 0.0, - "step": 42100 - }, - { - "epoch": 161.06870229007635, - "grad_norm": 0.0289907306432724, - "learning_rate": 5.780100000000001e-06, - "loss": 0.0, - "step": 42200 - }, - { - "epoch": 161.45038167938932, - "grad_norm": 0.0030206867959350348, - "learning_rate": 5.770100000000001e-06, - "loss": 0.0001, - "step": 42300 - }, - { - "epoch": 161.8320610687023, - "grad_norm": 0.006713017355650663, - "learning_rate": 5.760100000000001e-06, - "loss": 0.0001, - "step": 42400 - }, - { - "epoch": 162.21374045801528, - "grad_norm": 0.020501023158431053, - "learning_rate": 5.750100000000001e-06, - "loss": 0.0001, - "step": 42500 - }, - { - "epoch": 162.59541984732823, - "grad_norm": 0.06237759441137314, - "learning_rate": 5.7401e-06, - "loss": 0.0005, - "step": 42600 - }, - { - "epoch": 162.9770992366412, - "grad_norm": 0.00038810563273727894, - "learning_rate": 5.7301e-06, - "loss": 0.0001, - "step": 42700 - }, - { - "epoch": 163.3587786259542, - "grad_norm": 0.000870977935846895, - "learning_rate": 5.7201000000000005e-06, - "loss": 0.0004, - "step": 42800 - }, - { - "epoch": 163.74045801526717, - "grad_norm": 0.016740718856453896, - "learning_rate": 5.710100000000001e-06, - "loss": 0.0001, - "step": 42900 - }, - { - "epoch": 164.12213740458014, - "grad_norm": 0.009314488619565964, - "learning_rate": 5.700100000000001e-06, - "loss": 0.0001, - "step": 43000 - }, - { - "epoch": 164.50381679389312, - "grad_norm": 0.004396924749016762, - "learning_rate": 5.690100000000001e-06, - "loss": 0.0001, - "step": 43100 - }, - { - "epoch": 164.8854961832061, - "grad_norm": 0.0006507146172225475, - "learning_rate": 5.680100000000001e-06, - "loss": 0.0001, - "step": 43200 - }, - { - "epoch": 165.26717557251908, - "grad_norm": 0.0880076214671135, - "learning_rate": 5.670100000000001e-06, - "loss": 0.0001, - "step": 43300 - }, - { - "epoch": 165.64885496183206, - "grad_norm": 0.002911583986133337, - "learning_rate": 5.6601e-06, - "loss": 0.0002, - "step": 43400 - }, - { - "epoch": 166.03053435114504, - "grad_norm": 0.0006464822799898684, - "learning_rate": 5.6501e-06, - "loss": 0.0001, - "step": 43500 - }, - { - "epoch": 166.41221374045801, - "grad_norm": 0.0007006984669715166, - "learning_rate": 5.6401000000000004e-06, - "loss": 0.0, - "step": 43600 - }, - { - "epoch": 166.793893129771, - "grad_norm": 0.019798092544078827, - "learning_rate": 5.6301000000000005e-06, - "loss": 0.0001, - "step": 43700 - }, - { - "epoch": 167.17557251908397, - "grad_norm": 0.0017021212261170149, - "learning_rate": 5.620100000000001e-06, - "loss": 0.0001, - "step": 43800 - }, - { - "epoch": 167.55725190839695, - "grad_norm": 0.005481039173901081, - "learning_rate": 5.610100000000001e-06, - "loss": 0.0001, - "step": 43900 - }, - { - "epoch": 167.93893129770993, - "grad_norm": 0.02881401963531971, - "learning_rate": 5.600100000000001e-06, - "loss": 0.0, - "step": 44000 - }, - { - "epoch": 168.3206106870229, - "grad_norm": 0.0013997695641592145, - "learning_rate": 5.590100000000001e-06, - "loss": 0.0003, - "step": 44100 - }, - { - "epoch": 168.70229007633588, - "grad_norm": 0.0005554063245654106, - "learning_rate": 5.5801e-06, - "loss": 0.0001, - "step": 44200 - }, - { - "epoch": 169.08396946564886, - "grad_norm": 0.005088876932859421, - "learning_rate": 5.5701e-06, - "loss": 0.0003, - "step": 44300 - }, - { - "epoch": 169.46564885496184, - "grad_norm": 0.000671022164169699, - "learning_rate": 5.5601e-06, - "loss": 0.0001, - "step": 44400 - }, - { - "epoch": 169.84732824427482, - "grad_norm": 0.0006424835883080959, - "learning_rate": 5.5501000000000004e-06, - "loss": 0.0007, - "step": 44500 - }, - { - "epoch": 170.2290076335878, - "grad_norm": 0.005833714734762907, - "learning_rate": 5.5401000000000005e-06, - "loss": 0.0001, - "step": 44600 - }, - { - "epoch": 170.61068702290078, - "grad_norm": 0.0008527796599082649, - "learning_rate": 5.530100000000001e-06, - "loss": 0.0001, - "step": 44700 - }, - { - "epoch": 170.99236641221373, - "grad_norm": 0.00201510451734066, - "learning_rate": 5.520100000000001e-06, - "loss": 0.0002, - "step": 44800 - }, - { - "epoch": 171.3740458015267, - "grad_norm": 0.017467355355620384, - "learning_rate": 5.510100000000001e-06, - "loss": 0.0001, - "step": 44900 - }, - { - "epoch": 171.75572519083968, - "grad_norm": 0.0023401184007525444, - "learning_rate": 5.5001e-06, - "loss": 0.0001, - "step": 45000 - }, - { - "epoch": 171.75572519083968, - "eval_loss": 0.06445743143558502, - "eval_runtime": 13.0732, - "eval_sacrebleu": 98.4299149528277, - "eval_samples_per_second": 71.52, - "eval_steps_per_second": 1.147, - "step": 45000 - }, - { - "epoch": 172.13740458015266, - "grad_norm": 0.0017037112265825272, - "learning_rate": 5.4901e-06, - "loss": 0.0001, - "step": 45100 - }, - { - "epoch": 172.51908396946564, - "grad_norm": 0.0043260930106043816, - "learning_rate": 5.4801e-06, - "loss": 0.0001, - "step": 45200 - }, - { - "epoch": 172.90076335877862, - "grad_norm": 0.002439398318529129, - "learning_rate": 5.4701e-06, - "loss": 0.0, - "step": 45300 - }, - { - "epoch": 173.2824427480916, - "grad_norm": 0.00044149241875857115, - "learning_rate": 5.4601000000000005e-06, - "loss": 0.0003, - "step": 45400 - }, - { - "epoch": 173.66412213740458, - "grad_norm": 0.001301961368881166, - "learning_rate": 5.4501000000000006e-06, - "loss": 0.0001, - "step": 45500 - }, - { - "epoch": 174.04580152671755, - "grad_norm": 0.011845661327242851, - "learning_rate": 5.440100000000001e-06, - "loss": 0.0001, - "step": 45600 - }, - { - "epoch": 174.42748091603053, - "grad_norm": 0.0079120434820652, - "learning_rate": 5.430100000000001e-06, - "loss": 0.0001, - "step": 45700 - }, - { - "epoch": 174.8091603053435, - "grad_norm": 0.0017839899519458413, - "learning_rate": 5.4201e-06, - "loss": 0.0001, - "step": 45800 - }, - { - "epoch": 175.1908396946565, - "grad_norm": 0.0012268917635083199, - "learning_rate": 5.4101e-06, - "loss": 0.0, - "step": 45900 - }, - { - "epoch": 175.57251908396947, - "grad_norm": 0.0093336571007967, - "learning_rate": 5.4001e-06, - "loss": 0.0, - "step": 46000 - }, - { - "epoch": 175.95419847328245, - "grad_norm": 0.0017296276055276394, - "learning_rate": 5.3901e-06, - "loss": 0.0002, - "step": 46100 - }, - { - "epoch": 176.33587786259542, - "grad_norm": 0.014460418373346329, - "learning_rate": 5.3801e-06, - "loss": 0.0, - "step": 46200 - }, - { - "epoch": 176.7175572519084, - "grad_norm": 0.0021138014271855354, - "learning_rate": 5.3701000000000005e-06, - "loss": 0.0001, - "step": 46300 - }, - { - "epoch": 177.09923664122138, - "grad_norm": 0.0008265760843642056, - "learning_rate": 5.3601000000000006e-06, - "loss": 0.0001, - "step": 46400 - }, - { - "epoch": 177.48091603053436, - "grad_norm": 0.15832078456878662, - "learning_rate": 5.350100000000001e-06, - "loss": 0.0002, - "step": 46500 - }, - { - "epoch": 177.86259541984734, - "grad_norm": 0.006225129589438438, - "learning_rate": 5.3401e-06, - "loss": 0.0001, - "step": 46600 - }, - { - "epoch": 178.24427480916032, - "grad_norm": 0.001128816744312644, - "learning_rate": 5.3301e-06, - "loss": 0.0001, - "step": 46700 - }, - { - "epoch": 178.6259541984733, - "grad_norm": 0.0014230897650122643, - "learning_rate": 5.3201e-06, - "loss": 0.0003, - "step": 46800 - }, - { - "epoch": 179.00763358778627, - "grad_norm": 0.018984733149409294, - "learning_rate": 5.3101e-06, - "loss": 0.0002, - "step": 46900 - }, - { - "epoch": 179.38931297709922, - "grad_norm": 0.020820150151848793, - "learning_rate": 5.3001e-06, - "loss": 0.0003, - "step": 47000 - }, - { - "epoch": 179.7709923664122, - "grad_norm": 0.0034320326521992683, - "learning_rate": 5.2901e-06, - "loss": 0.0001, - "step": 47100 - }, - { - "epoch": 180.15267175572518, - "grad_norm": 0.0023775645531713963, - "learning_rate": 5.2801000000000005e-06, - "loss": 0.0001, - "step": 47200 - }, - { - "epoch": 180.53435114503816, - "grad_norm": 0.01248265616595745, - "learning_rate": 5.270100000000001e-06, - "loss": 0.0001, - "step": 47300 - }, - { - "epoch": 180.91603053435114, - "grad_norm": 0.0006067880894988775, - "learning_rate": 5.2601e-06, - "loss": 0.0001, - "step": 47400 - }, - { - "epoch": 181.29770992366412, - "grad_norm": 0.0020774889271706343, - "learning_rate": 5.2501e-06, - "loss": 0.0001, - "step": 47500 - }, - { - "epoch": 181.6793893129771, - "grad_norm": 0.00026692217215895653, - "learning_rate": 5.2401e-06, - "loss": 0.0001, - "step": 47600 - }, - { - "epoch": 182.06106870229007, - "grad_norm": 0.0009136300650425255, - "learning_rate": 5.2301e-06, - "loss": 0.0, - "step": 47700 - }, - { - "epoch": 182.44274809160305, - "grad_norm": 0.0006349982577376068, - "learning_rate": 5.2201e-06, - "loss": 0.0001, - "step": 47800 - }, - { - "epoch": 182.82442748091603, - "grad_norm": 0.0762195810675621, - "learning_rate": 5.2101e-06, - "loss": 0.0001, - "step": 47900 - }, - { - "epoch": 183.206106870229, - "grad_norm": 0.0009825810557231307, - "learning_rate": 5.2001e-06, - "loss": 0.0001, - "step": 48000 - }, - { - "epoch": 183.58778625954199, - "grad_norm": 0.0038229688070714474, - "learning_rate": 5.1901000000000005e-06, - "loss": 0.0, - "step": 48100 - }, - { - "epoch": 183.96946564885496, - "grad_norm": 0.00385545426979661, - "learning_rate": 5.1801e-06, - "loss": 0.0, - "step": 48200 - }, - { - "epoch": 184.35114503816794, - "grad_norm": 0.003457268001511693, - "learning_rate": 5.1701e-06, - "loss": 0.0, - "step": 48300 - }, - { - "epoch": 184.73282442748092, - "grad_norm": 0.00014962915156502277, - "learning_rate": 5.1601e-06, - "loss": 0.0, - "step": 48400 - }, - { - "epoch": 185.1145038167939, - "grad_norm": 0.0014973586658015847, - "learning_rate": 5.1501e-06, - "loss": 0.0001, - "step": 48500 - }, - { - "epoch": 185.49618320610688, - "grad_norm": 0.003571637673303485, - "learning_rate": 5.1401e-06, - "loss": 0.0, - "step": 48600 - }, - { - "epoch": 185.87786259541986, - "grad_norm": 0.0005043320124968886, - "learning_rate": 5.1301e-06, - "loss": 0.0, - "step": 48700 - }, - { - "epoch": 186.25954198473283, - "grad_norm": 0.00036755617475137115, - "learning_rate": 5.1201e-06, - "loss": 0.0004, - "step": 48800 - }, - { - "epoch": 186.6412213740458, - "grad_norm": 0.0004503914969973266, - "learning_rate": 5.1101e-06, - "loss": 0.0002, - "step": 48900 - }, - { - "epoch": 187.0229007633588, - "grad_norm": 0.0016016371082514524, - "learning_rate": 5.1001e-06, - "loss": 0.0, - "step": 49000 - }, - { - "epoch": 187.40458015267177, - "grad_norm": 0.0015420691343024373, - "learning_rate": 5.0901e-06, - "loss": 0.0001, - "step": 49100 - }, - { - "epoch": 187.78625954198472, - "grad_norm": 0.00416839262470603, - "learning_rate": 5.0801e-06, - "loss": 0.0001, - "step": 49200 - }, - { - "epoch": 188.1679389312977, - "grad_norm": 0.0014447234570980072, - "learning_rate": 5.0701e-06, - "loss": 0.0, - "step": 49300 - }, - { - "epoch": 188.54961832061068, - "grad_norm": 0.026771286502480507, - "learning_rate": 5.0601e-06, - "loss": 0.0002, - "step": 49400 - }, - { - "epoch": 188.93129770992365, - "grad_norm": 0.0045920079573988914, - "learning_rate": 5.0501e-06, - "loss": 0.0, - "step": 49500 - }, - { - "epoch": 189.31297709923663, - "grad_norm": 0.0009305253042839468, - "learning_rate": 5.0401e-06, - "loss": 0.0001, - "step": 49600 - }, - { - "epoch": 189.6946564885496, - "grad_norm": 0.002638279926031828, - "learning_rate": 5.0301e-06, - "loss": 0.0002, - "step": 49700 - }, - { - "epoch": 190.0763358778626, - "grad_norm": 0.00749228848144412, - "learning_rate": 5.0201e-06, - "loss": 0.0, - "step": 49800 - }, - { - "epoch": 190.45801526717557, - "grad_norm": 0.0015944631304591894, - "learning_rate": 5.0101e-06, - "loss": 0.0001, - "step": 49900 - }, - { - "epoch": 190.83969465648855, - "grad_norm": 0.0011049050372093916, - "learning_rate": 5.0001e-06, - "loss": 0.0002, - "step": 50000 - }, - { - "epoch": 190.83969465648855, - "eval_loss": 0.0705500915646553, - "eval_runtime": 15.1552, - "eval_sacrebleu": 98.38156965327812, - "eval_samples_per_second": 61.695, - "eval_steps_per_second": 0.99, - "step": 50000 - }, - { - "epoch": 191.22137404580153, - "grad_norm": 0.022949527949094772, - "learning_rate": 4.990100000000001e-06, - "loss": 0.0004, - "step": 50100 - }, - { - "epoch": 191.6030534351145, - "grad_norm": 0.003185077803209424, - "learning_rate": 4.980100000000001e-06, - "loss": 0.0001, - "step": 50200 - }, - { - "epoch": 191.98473282442748, - "grad_norm": 0.002439779695123434, - "learning_rate": 4.9701e-06, - "loss": 0.0, - "step": 50300 - }, - { - "epoch": 192.36641221374046, - "grad_norm": 0.0016462886705994606, - "learning_rate": 4.9601e-06, - "loss": 0.0001, - "step": 50400 - }, - { - "epoch": 192.74809160305344, - "grad_norm": 0.0029389767441898584, - "learning_rate": 4.9501e-06, - "loss": 0.0, - "step": 50500 - }, - { - "epoch": 193.12977099236642, - "grad_norm": 0.001626283978112042, - "learning_rate": 4.9401e-06, - "loss": 0.0, - "step": 50600 - }, - { - "epoch": 193.5114503816794, - "grad_norm": 0.00018432780052535236, - "learning_rate": 4.9301000000000005e-06, - "loss": 0.0001, - "step": 50700 - }, - { - "epoch": 193.89312977099237, - "grad_norm": 0.00217829761095345, - "learning_rate": 4.9201000000000005e-06, - "loss": 0.0001, - "step": 50800 - }, - { - "epoch": 194.27480916030535, - "grad_norm": 0.014322876930236816, - "learning_rate": 4.910100000000001e-06, - "loss": 0.0, - "step": 50900 - }, - { - "epoch": 194.65648854961833, - "grad_norm": 0.0013682694407179952, - "learning_rate": 4.900100000000001e-06, - "loss": 0.0001, - "step": 51000 - }, - { - "epoch": 195.0381679389313, - "grad_norm": 0.00011015631025657058, - "learning_rate": 4.8901e-06, - "loss": 0.0, - "step": 51100 - }, - { - "epoch": 195.4198473282443, - "grad_norm": 0.00016530833090655506, - "learning_rate": 4.8801e-06, - "loss": 0.0, - "step": 51200 - }, - { - "epoch": 195.80152671755727, - "grad_norm": 0.0008822011877782643, - "learning_rate": 4.8701e-06, - "loss": 0.0001, - "step": 51300 - }, - { - "epoch": 196.18320610687022, - "grad_norm": 0.14383438229560852, - "learning_rate": 4.8601e-06, - "loss": 0.0003, - "step": 51400 - }, - { - "epoch": 196.5648854961832, - "grad_norm": 0.0025567635893821716, - "learning_rate": 4.8501e-06, - "loss": 0.0001, - "step": 51500 - }, - { - "epoch": 196.94656488549617, - "grad_norm": 0.008415630087256432, - "learning_rate": 4.8401000000000005e-06, - "loss": 0.0001, - "step": 51600 - }, - { - "epoch": 197.32824427480915, - "grad_norm": 0.004247774835675955, - "learning_rate": 4.8301000000000006e-06, - "loss": 0.0001, - "step": 51700 - }, - { - "epoch": 197.70992366412213, - "grad_norm": 0.003051754320040345, - "learning_rate": 4.820100000000001e-06, - "loss": 0.0002, - "step": 51800 - }, - { - "epoch": 198.0916030534351, - "grad_norm": 0.0067289709113538265, - "learning_rate": 4.8101e-06, - "loss": 0.0001, - "step": 51900 - }, - { - "epoch": 198.4732824427481, - "grad_norm": 0.0005421788082458079, - "learning_rate": 4.8001e-06, - "loss": 0.0001, - "step": 52000 - }, - { - "epoch": 198.85496183206106, - "grad_norm": 0.0004354407137725502, - "learning_rate": 4.7901e-06, - "loss": 0.0001, - "step": 52100 - }, - { - "epoch": 199.23664122137404, - "grad_norm": 0.0007408035453408957, - "learning_rate": 4.7801e-06, - "loss": 0.0001, - "step": 52200 - }, - { - "epoch": 199.61832061068702, - "grad_norm": 0.0008267344092018902, - "learning_rate": 4.7701e-06, - "loss": 0.0001, - "step": 52300 - }, - { - "epoch": 200.0, - "grad_norm": 0.005437543150037527, - "learning_rate": 4.7601e-06, - "loss": 0.0003, - "step": 52400 - }, - { - "epoch": 200.38167938931298, - "grad_norm": 0.002480014692991972, - "learning_rate": 4.7501000000000005e-06, - "loss": 0.0002, - "step": 52500 - }, - { - "epoch": 200.76335877862596, - "grad_norm": 0.016808325424790382, - "learning_rate": 4.740100000000001e-06, - "loss": 0.0001, - "step": 52600 - }, - { - "epoch": 201.14503816793894, - "grad_norm": 0.0003072597901336849, - "learning_rate": 4.7301e-06, - "loss": 0.0002, - "step": 52700 - }, - { - "epoch": 201.5267175572519, - "grad_norm": 0.021030226722359657, - "learning_rate": 4.7201e-06, - "loss": 0.0001, - "step": 52800 - }, - { - "epoch": 201.9083969465649, - "grad_norm": 0.0023340117186307907, - "learning_rate": 4.7101e-06, - "loss": 0.0, - "step": 52900 - }, - { - "epoch": 202.29007633587787, - "grad_norm": 0.00035964843118563294, - "learning_rate": 4.7001e-06, - "loss": 0.0, - "step": 53000 - }, - { - "epoch": 202.67175572519085, - "grad_norm": 0.0027809382881969213, - "learning_rate": 4.6901e-06, - "loss": 0.0001, - "step": 53100 - }, - { - "epoch": 203.05343511450383, - "grad_norm": 0.001012068591080606, - "learning_rate": 4.6801e-06, - "loss": 0.0002, - "step": 53200 - }, - { - "epoch": 203.4351145038168, - "grad_norm": 0.0007995408959686756, - "learning_rate": 4.6701e-06, - "loss": 0.0001, - "step": 53300 - }, - { - "epoch": 203.81679389312978, - "grad_norm": 0.0035254317335784435, - "learning_rate": 4.6601000000000005e-06, - "loss": 0.0001, - "step": 53400 - }, - { - "epoch": 204.19847328244273, - "grad_norm": 0.001446934649720788, - "learning_rate": 4.650100000000001e-06, - "loss": 0.0, - "step": 53500 - }, - { - "epoch": 204.5801526717557, - "grad_norm": 0.0006400212878361344, - "learning_rate": 4.6401e-06, - "loss": 0.0001, - "step": 53600 - }, - { - "epoch": 204.9618320610687, - "grad_norm": 0.004127305466681719, - "learning_rate": 4.6301e-06, - "loss": 0.0001, - "step": 53700 - }, - { - "epoch": 205.34351145038167, - "grad_norm": 0.2322733849287033, - "learning_rate": 4.6201e-06, - "loss": 0.0002, - "step": 53800 - }, - { - "epoch": 205.72519083969465, - "grad_norm": 0.0006699136574752629, - "learning_rate": 4.6101e-06, - "loss": 0.0001, - "step": 53900 - }, - { - "epoch": 206.10687022900763, - "grad_norm": 0.0013658641837537289, - "learning_rate": 4.6001e-06, - "loss": 0.0001, - "step": 54000 - }, - { - "epoch": 206.4885496183206, - "grad_norm": 0.0002463155542500317, - "learning_rate": 4.5901e-06, - "loss": 0.0003, - "step": 54100 - }, - { - "epoch": 206.87022900763358, - "grad_norm": 0.0011533570941537619, - "learning_rate": 4.5801e-06, - "loss": 0.0001, - "step": 54200 - }, - { - "epoch": 207.25190839694656, - "grad_norm": 0.0003880669828504324, - "learning_rate": 4.5701000000000005e-06, - "loss": 0.0, - "step": 54300 - }, - { - "epoch": 207.63358778625954, - "grad_norm": 0.0005348866106942296, - "learning_rate": 4.560100000000001e-06, - "loss": 0.0001, - "step": 54400 - }, - { - "epoch": 208.01526717557252, - "grad_norm": 0.0036084686871618032, - "learning_rate": 4.550100000000001e-06, - "loss": 0.0001, - "step": 54500 - }, - { - "epoch": 208.3969465648855, - "grad_norm": 0.002091924659907818, - "learning_rate": 4.540100000000001e-06, - "loss": 0.0002, - "step": 54600 - }, - { - "epoch": 208.77862595419847, - "grad_norm": 0.009517332538962364, - "learning_rate": 4.530100000000001e-06, - "loss": 0.0001, - "step": 54700 - }, - { - "epoch": 209.16030534351145, - "grad_norm": 0.0017961460398510098, - "learning_rate": 4.520100000000001e-06, - "loss": 0.0001, - "step": 54800 - }, - { - "epoch": 209.54198473282443, - "grad_norm": 0.011542879045009613, - "learning_rate": 4.5101e-06, - "loss": 0.0001, - "step": 54900 - }, - { - "epoch": 209.9236641221374, - "grad_norm": 0.0007961526280269027, - "learning_rate": 4.5001e-06, - "loss": 0.0002, - "step": 55000 - }, - { - "epoch": 209.9236641221374, - "eval_loss": 0.06830338388681412, - "eval_runtime": 14.8884, - "eval_sacrebleu": 98.3817390052901, - "eval_samples_per_second": 62.801, - "eval_steps_per_second": 1.007, - "step": 55000 - }, - { - "epoch": 210.3053435114504, - "grad_norm": 0.0003626318066380918, - "learning_rate": 4.4901000000000004e-06, - "loss": 0.0001, - "step": 55100 - }, - { - "epoch": 210.68702290076337, - "grad_norm": 0.003250488545745611, - "learning_rate": 4.4801000000000005e-06, - "loss": 0.0001, - "step": 55200 - }, - { - "epoch": 211.06870229007635, - "grad_norm": 0.00048593009705655277, - "learning_rate": 4.470100000000001e-06, - "loss": 0.0, - "step": 55300 - }, - { - "epoch": 211.45038167938932, - "grad_norm": 0.013868845999240875, - "learning_rate": 4.460100000000001e-06, - "loss": 0.0, - "step": 55400 - }, - { - "epoch": 211.8320610687023, - "grad_norm": 0.011143862269818783, - "learning_rate": 4.450100000000001e-06, - "loss": 0.0, - "step": 55500 - }, - { - "epoch": 212.21374045801528, - "grad_norm": 0.00046847719931975007, - "learning_rate": 4.440100000000001e-06, - "loss": 0.0001, - "step": 55600 - }, - { - "epoch": 212.59541984732823, - "grad_norm": 0.0004869813856203109, - "learning_rate": 4.4301e-06, - "loss": 0.0, - "step": 55700 - }, - { - "epoch": 212.9770992366412, - "grad_norm": 0.00048074036021716893, - "learning_rate": 4.4201e-06, - "loss": 0.0001, - "step": 55800 - }, - { - "epoch": 213.3587786259542, - "grad_norm": 0.0026560707483440638, - "learning_rate": 4.4101e-06, - "loss": 0.0, - "step": 55900 - }, - { - "epoch": 213.74045801526717, - "grad_norm": 0.0003701593668665737, - "learning_rate": 4.4001000000000004e-06, - "loss": 0.0, - "step": 56000 - }, - { - "epoch": 214.12213740458014, - "grad_norm": 0.02120647206902504, - "learning_rate": 4.3901000000000005e-06, - "loss": 0.0, - "step": 56100 - }, - { - "epoch": 214.50381679389312, - "grad_norm": 0.0014282125048339367, - "learning_rate": 4.380100000000001e-06, - "loss": 0.0, - "step": 56200 - }, - { - "epoch": 214.8854961832061, - "grad_norm": 0.006585704628378153, - "learning_rate": 4.370100000000001e-06, - "loss": 0.0001, - "step": 56300 - }, - { - "epoch": 215.26717557251908, - "grad_norm": 0.001468599890358746, - "learning_rate": 4.360100000000001e-06, - "loss": 0.0, - "step": 56400 - }, - { - "epoch": 215.64885496183206, - "grad_norm": 0.00032429711427539587, - "learning_rate": 4.3501e-06, - "loss": 0.0001, - "step": 56500 - }, - { - "epoch": 216.03053435114504, - "grad_norm": 0.0006425382453016937, - "learning_rate": 4.3401e-06, - "loss": 0.0003, - "step": 56600 - }, - { - "epoch": 216.41221374045801, - "grad_norm": 0.001136511447839439, - "learning_rate": 4.3301e-06, - "loss": 0.0004, - "step": 56700 - }, - { - "epoch": 216.793893129771, - "grad_norm": 0.0066289640963077545, - "learning_rate": 4.3201e-06, - "loss": 0.0003, - "step": 56800 - }, - { - "epoch": 217.17557251908397, - "grad_norm": 0.0006471850210800767, - "learning_rate": 4.3101000000000005e-06, - "loss": 0.0, - "step": 56900 - }, - { - "epoch": 217.55725190839695, - "grad_norm": 0.0006165873492136598, - "learning_rate": 4.3001000000000006e-06, - "loss": 0.0002, - "step": 57000 - }, - { - "epoch": 217.93893129770993, - "grad_norm": 0.00486396811902523, - "learning_rate": 4.290100000000001e-06, - "loss": 0.0001, - "step": 57100 - }, - { - "epoch": 218.3206106870229, - "grad_norm": 0.006084302440285683, - "learning_rate": 4.280100000000001e-06, - "loss": 0.0006, - "step": 57200 - }, - { - "epoch": 218.70229007633588, - "grad_norm": 0.0007263757870532572, - "learning_rate": 4.2701e-06, - "loss": 0.0, - "step": 57300 - }, - { - "epoch": 219.08396946564886, - "grad_norm": 0.0010548994177952409, - "learning_rate": 4.2601e-06, - "loss": 0.0, - "step": 57400 - }, - { - "epoch": 219.46564885496184, - "grad_norm": 0.0008174892282113433, - "learning_rate": 4.2501e-06, - "loss": 0.0001, - "step": 57500 - }, - { - "epoch": 219.84732824427482, - "grad_norm": 0.003784140106290579, - "learning_rate": 4.2401e-06, - "loss": 0.0, - "step": 57600 - }, - { - "epoch": 220.2290076335878, - "grad_norm": 0.1712358146905899, - "learning_rate": 4.2301e-06, - "loss": 0.0002, - "step": 57700 - }, - { - "epoch": 220.61068702290078, - "grad_norm": 0.0009685051045380533, - "learning_rate": 4.2201000000000005e-06, - "loss": 0.0003, - "step": 57800 - }, - { - "epoch": 220.99236641221373, - "grad_norm": 0.0002811609301716089, - "learning_rate": 4.2101000000000006e-06, - "loss": 0.0, - "step": 57900 - }, - { - "epoch": 221.3740458015267, - "grad_norm": 0.0013500307686626911, - "learning_rate": 4.200100000000001e-06, - "loss": 0.0001, - "step": 58000 - }, - { - "epoch": 221.75572519083968, - "grad_norm": 0.0026434820611029863, - "learning_rate": 4.1901e-06, - "loss": 0.0, - "step": 58100 - }, - { - "epoch": 222.13740458015266, - "grad_norm": 0.00022301946592051536, - "learning_rate": 4.1801e-06, - "loss": 0.0, - "step": 58200 - }, - { - "epoch": 222.51908396946564, - "grad_norm": 0.000502895622048527, - "learning_rate": 4.1701e-06, - "loss": 0.0, - "step": 58300 - }, - { - "epoch": 222.90076335877862, - "grad_norm": 0.00026090064784511924, - "learning_rate": 4.1601e-06, - "loss": 0.0001, - "step": 58400 - }, - { - "epoch": 223.2824427480916, - "grad_norm": 0.000342659855959937, - "learning_rate": 4.1501e-06, - "loss": 0.0, - "step": 58500 - }, - { - "epoch": 223.66412213740458, - "grad_norm": 0.0007804030901752412, - "learning_rate": 4.1401e-06, - "loss": 0.0001, - "step": 58600 - }, - { - "epoch": 224.04580152671755, - "grad_norm": 0.0002739455958362669, - "learning_rate": 4.1301000000000005e-06, - "loss": 0.0002, - "step": 58700 - }, - { - "epoch": 224.42748091603053, - "grad_norm": 0.04318670928478241, - "learning_rate": 4.120100000000001e-06, - "loss": 0.0003, - "step": 58800 - }, - { - "epoch": 224.8091603053435, - "grad_norm": 0.005310933105647564, - "learning_rate": 4.1101e-06, - "loss": 0.0001, - "step": 58900 - }, - { - "epoch": 225.1908396946565, - "grad_norm": 0.0049896519631147385, - "learning_rate": 4.1001e-06, - "loss": 0.0, - "step": 59000 - }, - { - "epoch": 225.57251908396947, - "grad_norm": 0.015347431413829327, - "learning_rate": 4.0901e-06, - "loss": 0.0002, - "step": 59100 - }, - { - "epoch": 225.95419847328245, - "grad_norm": 0.0010005142539739609, - "learning_rate": 4.0801e-06, - "loss": 0.0001, - "step": 59200 - }, - { - "epoch": 226.33587786259542, - "grad_norm": 0.0176298376172781, - "learning_rate": 4.0701e-06, - "loss": 0.0001, - "step": 59300 - }, - { - "epoch": 226.7175572519084, - "grad_norm": 0.0006241757655516267, - "learning_rate": 4.0601e-06, - "loss": 0.0002, - "step": 59400 - }, - { - "epoch": 227.09923664122138, - "grad_norm": 0.050072286278009415, - "learning_rate": 4.0501e-06, - "loss": 0.0, - "step": 59500 - }, - { - "epoch": 227.48091603053436, - "grad_norm": 0.007551845163106918, - "learning_rate": 4.0401000000000005e-06, - "loss": 0.0001, - "step": 59600 - }, - { - "epoch": 227.86259541984734, - "grad_norm": 0.0006704423576593399, - "learning_rate": 4.0301e-06, - "loss": 0.0, - "step": 59700 - }, - { - "epoch": 228.24427480916032, - "grad_norm": 0.0001167279333458282, - "learning_rate": 4.0201e-06, - "loss": 0.0, - "step": 59800 - }, - { - "epoch": 228.6259541984733, - "grad_norm": 0.0003406234900467098, - "learning_rate": 4.0101e-06, - "loss": 0.0, - "step": 59900 - }, - { - "epoch": 229.00763358778627, - "grad_norm": 0.004332631826400757, - "learning_rate": 4.0001e-06, - "loss": 0.0001, - "step": 60000 - }, - { - "epoch": 229.00763358778627, - "eval_loss": 0.06924522668123245, - "eval_runtime": 12.7194, - "eval_sacrebleu": 98.42462230912712, - "eval_samples_per_second": 73.51, - "eval_steps_per_second": 1.179, - "step": 60000 - }, - { - "epoch": 229.38931297709922, - "grad_norm": 0.0004383054911158979, - "learning_rate": 3.9901e-06, - "loss": 0.0, - "step": 60100 - }, - { - "epoch": 229.7709923664122, - "grad_norm": 0.0049883900210261345, - "learning_rate": 3.9801e-06, - "loss": 0.0, - "step": 60200 - }, - { - "epoch": 230.15267175572518, - "grad_norm": 0.0003295672358945012, - "learning_rate": 3.9701e-06, - "loss": 0.0, - "step": 60300 - }, - { - "epoch": 230.53435114503816, - "grad_norm": 0.0025957634206861258, - "learning_rate": 3.9601e-06, - "loss": 0.0001, - "step": 60400 - }, - { - "epoch": 230.91603053435114, - "grad_norm": 0.0013411330292001367, - "learning_rate": 3.9501e-06, - "loss": 0.0, - "step": 60500 - }, - { - "epoch": 231.29770992366412, - "grad_norm": 0.0009917329298332334, - "learning_rate": 3.940100000000001e-06, - "loss": 0.0, - "step": 60600 - }, - { - "epoch": 231.6793893129771, - "grad_norm": 0.002210022648796439, - "learning_rate": 3.930100000000001e-06, - "loss": 0.0001, - "step": 60700 - }, - { - "epoch": 232.06106870229007, - "grad_norm": 0.0004792156396433711, - "learning_rate": 3.920100000000001e-06, - "loss": 0.0002, - "step": 60800 - }, - { - "epoch": 232.44274809160305, - "grad_norm": 0.0035985433496534824, - "learning_rate": 3.910100000000001e-06, - "loss": 0.0004, - "step": 60900 - }, - { - "epoch": 232.82442748091603, - "grad_norm": 0.00014447036664932966, - "learning_rate": 3.9001e-06, - "loss": 0.0001, - "step": 61000 - }, - { - "epoch": 233.206106870229, - "grad_norm": 0.005730424541980028, - "learning_rate": 3.8901e-06, - "loss": 0.0, - "step": 61100 - }, - { - "epoch": 233.58778625954199, - "grad_norm": 0.0038998997770249844, - "learning_rate": 3.8801e-06, - "loss": 0.0001, - "step": 61200 - }, - { - "epoch": 233.96946564885496, - "grad_norm": 0.0002460220130160451, - "learning_rate": 3.8701000000000004e-06, - "loss": 0.0, - "step": 61300 - }, - { - "epoch": 234.35114503816794, - "grad_norm": 0.008543968200683594, - "learning_rate": 3.8601000000000005e-06, - "loss": 0.0, - "step": 61400 - }, - { - "epoch": 234.73282442748092, - "grad_norm": 0.0008228663355112076, - "learning_rate": 3.850100000000001e-06, - "loss": 0.0, - "step": 61500 - }, - { - "epoch": 235.1145038167939, - "grad_norm": 0.0032458484638482332, - "learning_rate": 3.840100000000001e-06, - "loss": 0.0001, - "step": 61600 - }, - { - "epoch": 235.49618320610688, - "grad_norm": 0.0015793191269040108, - "learning_rate": 3.830100000000001e-06, - "loss": 0.0, - "step": 61700 - }, - { - "epoch": 235.87786259541986, - "grad_norm": 0.0018890851642936468, - "learning_rate": 3.8201e-06, - "loss": 0.0, - "step": 61800 - }, - { - "epoch": 236.25954198473283, - "grad_norm": 0.0028850152157247066, - "learning_rate": 3.8101000000000006e-06, - "loss": 0.0, - "step": 61900 - }, - { - "epoch": 236.6412213740458, - "grad_norm": 0.0002459301322232932, - "learning_rate": 3.8001000000000003e-06, - "loss": 0.0001, - "step": 62000 - }, - { - "epoch": 237.0229007633588, - "grad_norm": 0.0008328225230798125, - "learning_rate": 3.7901000000000004e-06, - "loss": 0.0, - "step": 62100 - }, - { - "epoch": 237.40458015267177, - "grad_norm": 0.0006181095377542078, - "learning_rate": 3.7801000000000005e-06, - "loss": 0.0, - "step": 62200 - }, - { - "epoch": 237.78625954198472, - "grad_norm": 0.0003671936865430325, - "learning_rate": 3.7701000000000005e-06, - "loss": 0.0002, - "step": 62300 - }, - { - "epoch": 238.1679389312977, - "grad_norm": 0.0005335118039511144, - "learning_rate": 3.7601000000000002e-06, - "loss": 0.0001, - "step": 62400 - }, - { - "epoch": 238.54961832061068, - "grad_norm": 0.001004107529297471, - "learning_rate": 3.7501000000000003e-06, - "loss": 0.0001, - "step": 62500 - }, - { - "epoch": 238.93129770992365, - "grad_norm": 0.001058288267813623, - "learning_rate": 3.7401000000000004e-06, - "loss": 0.0002, - "step": 62600 - }, - { - "epoch": 239.31297709923663, - "grad_norm": 0.0020912198815494776, - "learning_rate": 3.7301000000000005e-06, - "loss": 0.0001, - "step": 62700 - }, - { - "epoch": 239.6946564885496, - "grad_norm": 0.0014922478003427386, - "learning_rate": 3.7201e-06, - "loss": 0.0001, - "step": 62800 - }, - { - "epoch": 240.0763358778626, - "grad_norm": 0.0019878807943314314, - "learning_rate": 3.7101000000000003e-06, - "loss": 0.0001, - "step": 62900 - }, - { - "epoch": 240.45801526717557, - "grad_norm": 0.0008230987004935741, - "learning_rate": 3.7001000000000004e-06, - "loss": 0.0, - "step": 63000 - }, - { - "epoch": 240.83969465648855, - "grad_norm": 0.0016020069597288966, - "learning_rate": 3.6901000000000005e-06, - "loss": 0.0, - "step": 63100 - }, - { - "epoch": 241.22137404580153, - "grad_norm": 0.001682333997450769, - "learning_rate": 3.6801e-06, - "loss": 0.0, - "step": 63200 - }, - { - "epoch": 241.6030534351145, - "grad_norm": 0.0016903354553505778, - "learning_rate": 3.6701000000000002e-06, - "loss": 0.0001, - "step": 63300 - }, - { - "epoch": 241.98473282442748, - "grad_norm": 0.008831867948174477, - "learning_rate": 3.6601000000000003e-06, - "loss": 0.0, - "step": 63400 - }, - { - "epoch": 242.36641221374046, - "grad_norm": 0.0007798579172231257, - "learning_rate": 3.6501000000000004e-06, - "loss": 0.0001, - "step": 63500 - }, - { - "epoch": 242.74809160305344, - "grad_norm": 0.3243200182914734, - "learning_rate": 3.6401e-06, - "loss": 0.0001, - "step": 63600 - }, - { - "epoch": 243.12977099236642, - "grad_norm": 0.0005018337978981435, - "learning_rate": 3.6301e-06, - "loss": 0.0003, - "step": 63700 - }, - { - "epoch": 243.5114503816794, - "grad_norm": 0.003775683930143714, - "learning_rate": 3.6201000000000003e-06, - "loss": 0.0, - "step": 63800 - }, - { - "epoch": 243.89312977099237, - "grad_norm": 0.0006570370751433074, - "learning_rate": 3.6101000000000004e-06, - "loss": 0.0, - "step": 63900 - }, - { - "epoch": 244.27480916030535, - "grad_norm": 0.0004511134757194668, - "learning_rate": 3.6001e-06, - "loss": 0.0, - "step": 64000 - }, - { - "epoch": 244.65648854961833, - "grad_norm": 0.00034218482323922217, - "learning_rate": 3.5901e-06, - "loss": 0.0002, - "step": 64100 - }, - { - "epoch": 245.0381679389313, - "grad_norm": 0.0013360466109588742, - "learning_rate": 3.5801000000000002e-06, - "loss": 0.0, - "step": 64200 - }, - { - "epoch": 245.4198473282443, - "grad_norm": 0.000908435438759625, - "learning_rate": 3.5701000000000003e-06, - "loss": 0.0, - "step": 64300 - }, - { - "epoch": 245.80152671755727, - "grad_norm": 0.0036820757668465376, - "learning_rate": 3.5601e-06, - "loss": 0.0001, - "step": 64400 - }, - { - "epoch": 246.18320610687022, - "grad_norm": 0.0042778379283845425, - "learning_rate": 3.5501e-06, - "loss": 0.0001, - "step": 64500 - }, - { - "epoch": 246.5648854961832, - "grad_norm": 0.0006720293895341456, - "learning_rate": 3.5401e-06, - "loss": 0.0, - "step": 64600 - }, - { - "epoch": 246.94656488549617, - "grad_norm": 0.004132702946662903, - "learning_rate": 3.5301000000000003e-06, - "loss": 0.0, - "step": 64700 - }, - { - "epoch": 247.32824427480915, - "grad_norm": 0.1539987325668335, - "learning_rate": 3.5201e-06, - "loss": 0.0, - "step": 64800 - }, - { - "epoch": 247.70992366412213, - "grad_norm": 0.0009854933014139533, - "learning_rate": 3.5101e-06, - "loss": 0.0, - "step": 64900 - }, - { - "epoch": 248.0916030534351, - "grad_norm": 0.0006776836817152798, - "learning_rate": 3.5001e-06, - "loss": 0.0, - "step": 65000 - }, - { - "epoch": 248.0916030534351, - "eval_loss": 0.07642494887113571, - "eval_runtime": 11.0441, - "eval_sacrebleu": 98.22997236483829, - "eval_samples_per_second": 84.661, - "eval_steps_per_second": 1.358, - "step": 65000 - }, - { - "epoch": 248.4732824427481, - "grad_norm": 0.0011846576817333698, - "learning_rate": 3.4901000000000003e-06, - "loss": 0.0, - "step": 65100 - }, - { - "epoch": 248.85496183206106, - "grad_norm": 0.00015981722390279174, - "learning_rate": 3.4801e-06, - "loss": 0.0001, - "step": 65200 - }, - { - "epoch": 249.23664122137404, - "grad_norm": 0.0009182260255329311, - "learning_rate": 3.4701e-06, - "loss": 0.0003, - "step": 65300 - }, - { - "epoch": 249.61832061068702, - "grad_norm": 0.002005082555115223, - "learning_rate": 3.4601e-06, - "loss": 0.0001, - "step": 65400 - }, - { - "epoch": 250.0, - "grad_norm": 0.0008901826804503798, - "learning_rate": 3.4501000000000002e-06, - "loss": 0.0, - "step": 65500 - }, - { - "epoch": 250.38167938931298, - "grad_norm": 0.006282407324761152, - "learning_rate": 3.4401000000000003e-06, - "loss": 0.0, - "step": 65600 - }, - { - "epoch": 250.76335877862596, - "grad_norm": 0.0006946716457605362, - "learning_rate": 3.4301e-06, - "loss": 0.0001, - "step": 65700 - }, - { - "epoch": 251.14503816793894, - "grad_norm": 0.0009743022383190691, - "learning_rate": 3.4201e-06, - "loss": 0.0, - "step": 65800 - }, - { - "epoch": 251.5267175572519, - "grad_norm": 0.00014440107042901218, - "learning_rate": 3.4101e-06, - "loss": 0.0, - "step": 65900 - }, - { - "epoch": 251.9083969465649, - "grad_norm": 0.00038495694752782583, - "learning_rate": 3.4001000000000003e-06, - "loss": 0.0, - "step": 66000 - }, - { - "epoch": 252.29007633587787, - "grad_norm": 0.000925507047213614, - "learning_rate": 3.3901e-06, - "loss": 0.0, - "step": 66100 - }, - { - "epoch": 252.67175572519085, - "grad_norm": 0.0489259772002697, - "learning_rate": 3.3801e-06, - "loss": 0.0, - "step": 66200 - }, - { - "epoch": 253.05343511450383, - "grad_norm": 0.003963008988648653, - "learning_rate": 3.3701e-06, - "loss": 0.0, - "step": 66300 - }, - { - "epoch": 253.4351145038168, - "grad_norm": 0.0001968510332517326, - "learning_rate": 3.3601000000000002e-06, - "loss": 0.0, - "step": 66400 - }, - { - "epoch": 253.81679389312978, - "grad_norm": 0.0004394342831801623, - "learning_rate": 3.3501e-06, - "loss": 0.0003, - "step": 66500 - }, - { - "epoch": 254.19847328244273, - "grad_norm": 0.001822329475544393, - "learning_rate": 3.3401e-06, - "loss": 0.0, - "step": 66600 - }, - { - "epoch": 254.5801526717557, - "grad_norm": 0.011056671850383282, - "learning_rate": 3.3301e-06, - "loss": 0.0001, - "step": 66700 - }, - { - "epoch": 254.9618320610687, - "grad_norm": 0.0009788586758077145, - "learning_rate": 3.3201000000000006e-06, - "loss": 0.0, - "step": 66800 - }, - { - "epoch": 255.34351145038167, - "grad_norm": 0.0007667080499231815, - "learning_rate": 3.3101000000000007e-06, - "loss": 0.0, - "step": 66900 - }, - { - "epoch": 255.72519083969465, - "grad_norm": 0.0012655084719881415, - "learning_rate": 3.3001000000000004e-06, - "loss": 0.0001, - "step": 67000 - }, - { - "epoch": 256.10687022900765, - "grad_norm": 0.451745867729187, - "learning_rate": 3.2901000000000005e-06, - "loss": 0.0001, - "step": 67100 - }, - { - "epoch": 256.48854961832063, - "grad_norm": 0.0015185383381322026, - "learning_rate": 3.2801000000000006e-06, - "loss": 0.0, - "step": 67200 - }, - { - "epoch": 256.8702290076336, - "grad_norm": 0.0013192964252084494, - "learning_rate": 3.2701000000000007e-06, - "loss": 0.0002, - "step": 67300 - }, - { - "epoch": 257.2519083969466, - "grad_norm": 0.0006594723672606051, - "learning_rate": 3.2601000000000003e-06, - "loss": 0.0, - "step": 67400 - }, - { - "epoch": 257.63358778625957, - "grad_norm": 0.009008731693029404, - "learning_rate": 3.2501000000000004e-06, - "loss": 0.0, - "step": 67500 - }, - { - "epoch": 258.01526717557255, - "grad_norm": 0.0003555931325536221, - "learning_rate": 3.2401000000000005e-06, - "loss": 0.0, - "step": 67600 - }, - { - "epoch": 258.39694656488547, - "grad_norm": 0.0016379636945202947, - "learning_rate": 3.2301000000000006e-06, - "loss": 0.0, - "step": 67700 - }, - { - "epoch": 258.77862595419845, - "grad_norm": 0.0021731907036155462, - "learning_rate": 3.2201000000000003e-06, - "loss": 0.0, - "step": 67800 - }, - { - "epoch": 259.1603053435114, - "grad_norm": 0.0007260444108396769, - "learning_rate": 3.2101000000000004e-06, - "loss": 0.0, - "step": 67900 - }, - { - "epoch": 259.5419847328244, - "grad_norm": 0.00023493407934438437, - "learning_rate": 3.2001000000000005e-06, - "loss": 0.0, - "step": 68000 - }, - { - "epoch": 259.9236641221374, - "grad_norm": 0.0003064965712837875, - "learning_rate": 3.1901000000000006e-06, - "loss": 0.0001, - "step": 68100 - }, - { - "epoch": 260.30534351145036, - "grad_norm": 0.0002717080351430923, - "learning_rate": 3.1801000000000003e-06, - "loss": 0.0, - "step": 68200 - }, - { - "epoch": 260.68702290076334, - "grad_norm": 0.000287896313238889, - "learning_rate": 3.1701000000000004e-06, - "loss": 0.0, - "step": 68300 - }, - { - "epoch": 261.0687022900763, - "grad_norm": 0.00011128895857837051, - "learning_rate": 3.1601000000000005e-06, - "loss": 0.0, - "step": 68400 - }, - { - "epoch": 261.4503816793893, - "grad_norm": 9.249433787772432e-05, - "learning_rate": 3.1501000000000006e-06, - "loss": 0.0, - "step": 68500 - }, - { - "epoch": 261.8320610687023, - "grad_norm": 0.003091628197580576, - "learning_rate": 3.1401000000000002e-06, - "loss": 0.0, - "step": 68600 - }, - { - "epoch": 262.21374045801525, - "grad_norm": 0.0001517763448646292, - "learning_rate": 3.1301000000000003e-06, - "loss": 0.0001, - "step": 68700 - }, - { - "epoch": 262.59541984732823, - "grad_norm": 0.0050222245045006275, - "learning_rate": 3.1201000000000004e-06, - "loss": 0.0001, - "step": 68800 - }, - { - "epoch": 262.9770992366412, - "grad_norm": 0.0002638766309246421, - "learning_rate": 3.1101000000000005e-06, - "loss": 0.0, - "step": 68900 - }, - { - "epoch": 263.3587786259542, - "grad_norm": 0.00038631295319646597, - "learning_rate": 3.1001e-06, - "loss": 0.0001, - "step": 69000 - }, - { - "epoch": 263.74045801526717, - "grad_norm": 0.0008003843831829727, - "learning_rate": 3.0901000000000003e-06, - "loss": 0.0001, - "step": 69100 - }, - { - "epoch": 264.12213740458014, - "grad_norm": 0.00026344929938204587, - "learning_rate": 3.0801000000000004e-06, - "loss": 0.0001, - "step": 69200 - }, - { - "epoch": 264.5038167938931, - "grad_norm": 0.0016943742521107197, - "learning_rate": 3.0701000000000005e-06, - "loss": 0.0, - "step": 69300 - }, - { - "epoch": 264.8854961832061, - "grad_norm": 0.0016541230725124478, - "learning_rate": 3.0601e-06, - "loss": 0.0001, - "step": 69400 - }, - { - "epoch": 265.2671755725191, - "grad_norm": 0.00018329459999222308, - "learning_rate": 3.0501000000000002e-06, - "loss": 0.0, - "step": 69500 - }, - { - "epoch": 265.64885496183206, - "grad_norm": 0.0006810303893871605, - "learning_rate": 3.0401000000000003e-06, - "loss": 0.0001, - "step": 69600 - }, - { - "epoch": 266.03053435114504, - "grad_norm": 0.00019303402223158628, - "learning_rate": 3.0301000000000004e-06, - "loss": 0.0, - "step": 69700 - }, - { - "epoch": 266.412213740458, - "grad_norm": 0.0006043753819540143, - "learning_rate": 3.0201e-06, - "loss": 0.0, - "step": 69800 - }, - { - "epoch": 266.793893129771, - "grad_norm": 0.10896377265453339, - "learning_rate": 3.0101e-06, - "loss": 0.0001, - "step": 69900 - }, - { - "epoch": 267.17557251908397, - "grad_norm": 0.0005371817969717085, - "learning_rate": 3.0001000000000003e-06, - "loss": 0.0001, - "step": 70000 - }, - { - "epoch": 267.17557251908397, - "eval_loss": 0.0706414207816124, - "eval_runtime": 12.7866, - "eval_sacrebleu": 98.3507650037767, - "eval_samples_per_second": 73.124, - "eval_steps_per_second": 1.173, - "step": 70000 - }, - { - "epoch": 267.55725190839695, - "grad_norm": 0.002264066133648157, - "learning_rate": 2.9901000000000004e-06, - "loss": 0.0, - "step": 70100 - }, - { - "epoch": 267.9389312977099, - "grad_norm": 0.00045830843737348914, - "learning_rate": 2.9801e-06, - "loss": 0.0001, - "step": 70200 - }, - { - "epoch": 268.3206106870229, - "grad_norm": 0.00502806156873703, - "learning_rate": 2.9701e-06, - "loss": 0.0, - "step": 70300 - }, - { - "epoch": 268.7022900763359, - "grad_norm": 0.00045506758033297956, - "learning_rate": 2.9601000000000003e-06, - "loss": 0.0, - "step": 70400 - }, - { - "epoch": 269.08396946564886, - "grad_norm": 0.0007213895441964269, - "learning_rate": 2.9501000000000004e-06, - "loss": 0.0001, - "step": 70500 - }, - { - "epoch": 269.46564885496184, - "grad_norm": 0.006971609313040972, - "learning_rate": 2.9401e-06, - "loss": 0.0, - "step": 70600 - }, - { - "epoch": 269.8473282442748, - "grad_norm": 0.007343415170907974, - "learning_rate": 2.9301e-06, - "loss": 0.0, - "step": 70700 - }, - { - "epoch": 270.2290076335878, - "grad_norm": 0.00012149077520007268, - "learning_rate": 2.9201000000000002e-06, - "loss": 0.0, - "step": 70800 - }, - { - "epoch": 270.6106870229008, - "grad_norm": 0.0004891341086477041, - "learning_rate": 2.9101000000000003e-06, - "loss": 0.0, - "step": 70900 - }, - { - "epoch": 270.99236641221376, - "grad_norm": 0.0004915940226055682, - "learning_rate": 2.9001e-06, - "loss": 0.0, - "step": 71000 - }, - { - "epoch": 271.37404580152673, - "grad_norm": 0.0003145417431369424, - "learning_rate": 2.8901e-06, - "loss": 0.0, - "step": 71100 - }, - { - "epoch": 271.7557251908397, - "grad_norm": 0.001674189232289791, - "learning_rate": 2.8801e-06, - "loss": 0.0, - "step": 71200 - }, - { - "epoch": 272.1374045801527, - "grad_norm": 0.000954287766944617, - "learning_rate": 2.8701000000000003e-06, - "loss": 0.0, - "step": 71300 - }, - { - "epoch": 272.51908396946567, - "grad_norm": 0.0003251973248552531, - "learning_rate": 2.8601e-06, - "loss": 0.0, - "step": 71400 - }, - { - "epoch": 272.90076335877865, - "grad_norm": 0.0010594518389552832, - "learning_rate": 2.8501e-06, - "loss": 0.0, - "step": 71500 - }, - { - "epoch": 273.2824427480916, - "grad_norm": 0.00048636249266564846, - "learning_rate": 2.8401e-06, - "loss": 0.0, - "step": 71600 - }, - { - "epoch": 273.6641221374046, - "grad_norm": 0.027219722047448158, - "learning_rate": 2.8301000000000002e-06, - "loss": 0.0001, - "step": 71700 - }, - { - "epoch": 274.0458015267176, - "grad_norm": 0.0011948850005865097, - "learning_rate": 2.8201e-06, - "loss": 0.0001, - "step": 71800 - }, - { - "epoch": 274.42748091603056, - "grad_norm": 0.0034863767214119434, - "learning_rate": 2.8101e-06, - "loss": 0.0, - "step": 71900 - }, - { - "epoch": 274.80916030534354, - "grad_norm": 0.0019288210896775126, - "learning_rate": 2.8001e-06, - "loss": 0.0, - "step": 72000 - }, - { - "epoch": 275.19083969465646, - "grad_norm": 0.0003933673142455518, - "learning_rate": 2.7901e-06, - "loss": 0.0, - "step": 72100 - }, - { - "epoch": 275.57251908396944, - "grad_norm": 0.0006691425805911422, - "learning_rate": 2.7801e-06, - "loss": 0.0, - "step": 72200 - }, - { - "epoch": 275.9541984732824, - "grad_norm": 0.0018643882358446717, - "learning_rate": 2.7701e-06, - "loss": 0.0, - "step": 72300 - }, - { - "epoch": 276.3358778625954, - "grad_norm": 0.0004697064869105816, - "learning_rate": 2.7601e-06, - "loss": 0.0, - "step": 72400 - }, - { - "epoch": 276.7175572519084, - "grad_norm": 0.0003101348993368447, - "learning_rate": 2.7501e-06, - "loss": 0.0, - "step": 72500 - }, - { - "epoch": 277.09923664122135, - "grad_norm": 0.000497120781801641, - "learning_rate": 2.7401e-06, - "loss": 0.0, - "step": 72600 - }, - { - "epoch": 277.48091603053433, - "grad_norm": 0.003131396370008588, - "learning_rate": 2.7301e-06, - "loss": 0.0, - "step": 72700 - }, - { - "epoch": 277.8625954198473, - "grad_norm": 8.452180190943182e-05, - "learning_rate": 2.7201e-06, - "loss": 0.0001, - "step": 72800 - }, - { - "epoch": 278.2442748091603, - "grad_norm": 0.00016723430599085987, - "learning_rate": 2.7101e-06, - "loss": 0.0, - "step": 72900 - }, - { - "epoch": 278.62595419847327, - "grad_norm": 6.192157889017835e-05, - "learning_rate": 2.7000999999999998e-06, - "loss": 0.0, - "step": 73000 - }, - { - "epoch": 279.00763358778624, - "grad_norm": 0.00033697186154313385, - "learning_rate": 2.6901000000000003e-06, - "loss": 0.0, - "step": 73100 - }, - { - "epoch": 279.3893129770992, - "grad_norm": 0.0007250922499224544, - "learning_rate": 2.6801000000000004e-06, - "loss": 0.0, - "step": 73200 - }, - { - "epoch": 279.7709923664122, - "grad_norm": 0.00043798386468552053, - "learning_rate": 2.6701000000000005e-06, - "loss": 0.0, - "step": 73300 - }, - { - "epoch": 280.1526717557252, - "grad_norm": 0.0003070870880037546, - "learning_rate": 2.6601000000000006e-06, - "loss": 0.0, - "step": 73400 - }, - { - "epoch": 280.53435114503816, - "grad_norm": 0.0007616638322360814, - "learning_rate": 2.6501000000000003e-06, - "loss": 0.0, - "step": 73500 - }, - { - "epoch": 280.91603053435114, - "grad_norm": 0.0008846410200931132, - "learning_rate": 2.6401000000000004e-06, - "loss": 0.0001, - "step": 73600 - }, - { - "epoch": 281.2977099236641, - "grad_norm": 0.00019850108947139233, - "learning_rate": 2.6301000000000004e-06, - "loss": 0.0001, - "step": 73700 - }, - { - "epoch": 281.6793893129771, - "grad_norm": 0.00022825045743957162, - "learning_rate": 2.6201000000000005e-06, - "loss": 0.0, - "step": 73800 - }, - { - "epoch": 282.0610687022901, - "grad_norm": 0.0006464041071012616, - "learning_rate": 2.6101000000000002e-06, - "loss": 0.0, - "step": 73900 - }, - { - "epoch": 282.44274809160305, - "grad_norm": 0.00015778695524204522, - "learning_rate": 2.6001000000000003e-06, - "loss": 0.0, - "step": 74000 - }, - { - "epoch": 282.82442748091603, - "grad_norm": 0.00011923769488930702, - "learning_rate": 2.5901000000000004e-06, - "loss": 0.0001, - "step": 74100 - }, - { - "epoch": 283.206106870229, - "grad_norm": 0.0009114159620366991, - "learning_rate": 2.5801000000000005e-06, - "loss": 0.0, - "step": 74200 - }, - { - "epoch": 283.587786259542, - "grad_norm": 0.07821295410394669, - "learning_rate": 2.5701e-06, - "loss": 0.0001, - "step": 74300 - }, - { - "epoch": 283.96946564885496, - "grad_norm": 0.0005942827556282282, - "learning_rate": 2.5601000000000003e-06, - "loss": 0.0, - "step": 74400 - }, - { - "epoch": 284.35114503816794, - "grad_norm": 0.0003344318247400224, - "learning_rate": 2.5501000000000004e-06, - "loss": 0.0, - "step": 74500 - }, - { - "epoch": 284.7328244274809, - "grad_norm": 0.0004239541303832084, - "learning_rate": 2.5401000000000005e-06, - "loss": 0.0, - "step": 74600 - }, - { - "epoch": 285.1145038167939, - "grad_norm": 0.0004080179496668279, - "learning_rate": 2.5301e-06, - "loss": 0.0002, - "step": 74700 - }, - { - "epoch": 285.4961832061069, - "grad_norm": 0.0002883638080675155, - "learning_rate": 2.5201000000000002e-06, - "loss": 0.0, - "step": 74800 - }, - { - "epoch": 285.87786259541986, - "grad_norm": 0.003567567327991128, - "learning_rate": 2.5101000000000003e-06, - "loss": 0.0, - "step": 74900 - }, - { - "epoch": 286.25954198473283, - "grad_norm": 0.000484933378174901, - "learning_rate": 2.5001000000000004e-06, - "loss": 0.0, - "step": 75000 - }, - { - "epoch": 286.25954198473283, - "eval_loss": 0.07114613801240921, - "eval_runtime": 11.8397, - "eval_sacrebleu": 98.3796929328225, - "eval_samples_per_second": 78.972, - "eval_steps_per_second": 1.267, - "step": 75000 - }, - { - "epoch": 286.6412213740458, - "grad_norm": 0.00425668666139245, - "learning_rate": 2.4901e-06, - "loss": 0.0, - "step": 75100 - }, - { - "epoch": 287.0229007633588, - "grad_norm": 0.0001475068274885416, - "learning_rate": 2.4801e-06, - "loss": 0.0, - "step": 75200 - }, - { - "epoch": 287.40458015267177, - "grad_norm": 0.00040500349132344127, - "learning_rate": 2.4701000000000003e-06, - "loss": 0.0, - "step": 75300 - }, - { - "epoch": 287.78625954198475, - "grad_norm": 0.0004649657057598233, - "learning_rate": 2.4601000000000004e-06, - "loss": 0.0, - "step": 75400 - }, - { - "epoch": 288.1679389312977, - "grad_norm": 0.0006429302738979459, - "learning_rate": 2.4501e-06, - "loss": 0.0, - "step": 75500 - }, - { - "epoch": 288.5496183206107, - "grad_norm": 0.0009631992434151471, - "learning_rate": 2.4401e-06, - "loss": 0.0001, - "step": 75600 - }, - { - "epoch": 288.9312977099237, - "grad_norm": 0.00040664331754669547, - "learning_rate": 2.4301000000000002e-06, - "loss": 0.0, - "step": 75700 - }, - { - "epoch": 289.31297709923666, - "grad_norm": 0.0003451923839747906, - "learning_rate": 2.4201000000000003e-06, - "loss": 0.0, - "step": 75800 - }, - { - "epoch": 289.69465648854964, - "grad_norm": 0.0022604241967201233, - "learning_rate": 2.4101e-06, - "loss": 0.0, - "step": 75900 - }, - { - "epoch": 290.0763358778626, - "grad_norm": 0.0002524483425077051, - "learning_rate": 2.4001e-06, - "loss": 0.0, - "step": 76000 - }, - { - "epoch": 290.4580152671756, - "grad_norm": 0.000300365878501907, - "learning_rate": 2.3901e-06, - "loss": 0.0, - "step": 76100 - }, - { - "epoch": 290.8396946564886, - "grad_norm": 0.0015298264333978295, - "learning_rate": 2.3801000000000003e-06, - "loss": 0.0, - "step": 76200 - }, - { - "epoch": 291.22137404580155, - "grad_norm": 0.00014980745618231595, - "learning_rate": 2.3701e-06, - "loss": 0.0, - "step": 76300 - }, - { - "epoch": 291.60305343511453, - "grad_norm": 7.667691534152254e-05, - "learning_rate": 2.3601e-06, - "loss": 0.0, - "step": 76400 - }, - { - "epoch": 291.98473282442745, - "grad_norm": 9.649044659454376e-05, - "learning_rate": 2.3501e-06, - "loss": 0.0, - "step": 76500 - }, - { - "epoch": 292.36641221374043, - "grad_norm": 7.835325232008472e-05, - "learning_rate": 2.3401000000000003e-06, - "loss": 0.0, - "step": 76600 - }, - { - "epoch": 292.7480916030534, - "grad_norm": 0.0035747180227190256, - "learning_rate": 2.3301e-06, - "loss": 0.0, - "step": 76700 - }, - { - "epoch": 293.1297709923664, - "grad_norm": 0.0004929218557663262, - "learning_rate": 2.3201e-06, - "loss": 0.0, - "step": 76800 - }, - { - "epoch": 293.51145038167937, - "grad_norm": 0.0008625462069176137, - "learning_rate": 2.3101e-06, - "loss": 0.0, - "step": 76900 - }, - { - "epoch": 293.89312977099235, - "grad_norm": 7.546912820544094e-05, - "learning_rate": 2.3001000000000002e-06, - "loss": 0.0, - "step": 77000 - }, - { - "epoch": 294.2748091603053, - "grad_norm": 0.0001980727829504758, - "learning_rate": 2.2901e-06, - "loss": 0.0, - "step": 77100 - }, - { - "epoch": 294.6564885496183, - "grad_norm": 6.45111795165576e-05, - "learning_rate": 2.2801e-06, - "loss": 0.0, - "step": 77200 - }, - { - "epoch": 295.0381679389313, - "grad_norm": 0.00047169000026769936, - "learning_rate": 2.2701e-06, - "loss": 0.0002, - "step": 77300 - }, - { - "epoch": 295.41984732824426, - "grad_norm": 0.0001602059928700328, - "learning_rate": 2.2601e-06, - "loss": 0.0, - "step": 77400 - }, - { - "epoch": 295.80152671755724, - "grad_norm": 0.0016317664412781596, - "learning_rate": 2.2501e-06, - "loss": 0.0, - "step": 77500 - }, - { - "epoch": 296.1832061068702, - "grad_norm": 0.0005564046441577375, - "learning_rate": 2.2401e-06, - "loss": 0.0, - "step": 77600 - }, - { - "epoch": 296.5648854961832, - "grad_norm": 9.140535985352471e-05, - "learning_rate": 2.2301000000000005e-06, - "loss": 0.0, - "step": 77700 - }, - { - "epoch": 296.9465648854962, - "grad_norm": 0.0004671975621022284, - "learning_rate": 2.2201e-06, - "loss": 0.0, - "step": 77800 - }, - { - "epoch": 297.32824427480915, - "grad_norm": 0.00019703837460838258, - "learning_rate": 2.2101000000000002e-06, - "loss": 0.0, - "step": 77900 - }, - { - "epoch": 297.70992366412213, - "grad_norm": 0.0004748985811602324, - "learning_rate": 2.2001000000000003e-06, - "loss": 0.0, - "step": 78000 - }, - { - "epoch": 298.0916030534351, - "grad_norm": 0.00036091229412704706, - "learning_rate": 2.1901000000000004e-06, - "loss": 0.0001, - "step": 78100 - }, - { - "epoch": 298.4732824427481, - "grad_norm": 0.0005071760970167816, - "learning_rate": 2.1801e-06, - "loss": 0.0, - "step": 78200 - }, - { - "epoch": 298.85496183206106, - "grad_norm": 7.702062430325896e-05, - "learning_rate": 2.1701e-06, - "loss": 0.0, - "step": 78300 - }, - { - "epoch": 299.23664122137404, - "grad_norm": 0.0013380494201555848, - "learning_rate": 2.1601000000000003e-06, - "loss": 0.0, - "step": 78400 - }, - { - "epoch": 299.618320610687, - "grad_norm": 0.0025339992716908455, - "learning_rate": 2.1501000000000004e-06, - "loss": 0.0, - "step": 78500 - }, - { - "epoch": 300.0, - "grad_norm": 0.02043747343122959, - "learning_rate": 2.1401e-06, - "loss": 0.0, - "step": 78600 - }, - { - "epoch": 300.381679389313, - "grad_norm": 0.00019974283350165933, - "learning_rate": 2.1301e-06, - "loss": 0.0, - "step": 78700 - }, - { - "epoch": 300.76335877862596, - "grad_norm": 0.000294814701192081, - "learning_rate": 2.1201000000000003e-06, - "loss": 0.0, - "step": 78800 - }, - { - "epoch": 301.14503816793894, - "grad_norm": 0.0004061469517182559, - "learning_rate": 2.1101000000000003e-06, - "loss": 0.0001, - "step": 78900 - }, - { - "epoch": 301.5267175572519, - "grad_norm": 0.00013555915211327374, - "learning_rate": 2.1001e-06, - "loss": 0.0, - "step": 79000 - }, - { - "epoch": 301.9083969465649, - "grad_norm": 0.0008066002628766, - "learning_rate": 2.0901e-06, - "loss": 0.0, - "step": 79100 - }, - { - "epoch": 302.29007633587787, - "grad_norm": 0.0002148078492609784, - "learning_rate": 2.0801e-06, - "loss": 0.0, - "step": 79200 - }, - { - "epoch": 302.67175572519085, - "grad_norm": 9.223959204973653e-05, - "learning_rate": 2.0701000000000003e-06, - "loss": 0.0, - "step": 79300 - }, - { - "epoch": 303.0534351145038, - "grad_norm": 0.00018763703701552004, - "learning_rate": 2.0601e-06, - "loss": 0.0, - "step": 79400 - }, - { - "epoch": 303.4351145038168, - "grad_norm": 0.007028752006590366, - "learning_rate": 2.0501e-06, - "loss": 0.0, - "step": 79500 - }, - { - "epoch": 303.8167938931298, - "grad_norm": 8.952162170317024e-05, - "learning_rate": 2.0401e-06, - "loss": 0.0, - "step": 79600 - }, - { - "epoch": 304.19847328244276, - "grad_norm": 0.0006351001793518662, - "learning_rate": 2.0301000000000003e-06, - "loss": 0.0, - "step": 79700 - }, - { - "epoch": 304.58015267175574, - "grad_norm": 0.00011554649972822517, - "learning_rate": 2.0201e-06, - "loss": 0.0, - "step": 79800 - }, - { - "epoch": 304.9618320610687, - "grad_norm": 0.0008121778373606503, - "learning_rate": 2.0101e-06, - "loss": 0.0, - "step": 79900 - }, - { - "epoch": 305.3435114503817, - "grad_norm": 4.551075107883662e-05, - "learning_rate": 2.0001e-06, - "loss": 0.0, - "step": 80000 - }, - { - "epoch": 305.3435114503817, - "eval_loss": 0.07268395274877548, - "eval_runtime": 12.8326, - "eval_sacrebleu": 98.41475055057425, - "eval_samples_per_second": 72.861, - "eval_steps_per_second": 1.169, - "step": 80000 - }, - { - "epoch": 305.7251908396947, - "grad_norm": 0.0005101580754853785, - "learning_rate": 1.9901000000000002e-06, - "loss": 0.0002, - "step": 80100 - }, - { - "epoch": 306.10687022900765, - "grad_norm": 0.00010200974065810442, - "learning_rate": 1.9801e-06, - "loss": 0.0, - "step": 80200 - }, - { - "epoch": 306.48854961832063, - "grad_norm": 0.0026822371874004602, - "learning_rate": 1.9701e-06, - "loss": 0.0, - "step": 80300 - }, - { - "epoch": 306.8702290076336, - "grad_norm": 0.00022087556135375053, - "learning_rate": 1.9601e-06, - "loss": 0.0, - "step": 80400 - }, - { - "epoch": 307.2519083969466, - "grad_norm": 0.005319487303495407, - "learning_rate": 1.9501e-06, - "loss": 0.0, - "step": 80500 - }, - { - "epoch": 307.63358778625957, - "grad_norm": 0.002392655238509178, - "learning_rate": 1.9401e-06, - "loss": 0.0, - "step": 80600 - }, - { - "epoch": 308.01526717557255, - "grad_norm": 7.424175419146195e-05, - "learning_rate": 1.9301e-06, - "loss": 0.0, - "step": 80700 - }, - { - "epoch": 308.39694656488547, - "grad_norm": 0.0009263260290026665, - "learning_rate": 1.9201e-06, - "loss": 0.0, - "step": 80800 - }, - { - "epoch": 308.77862595419845, - "grad_norm": 0.0012055652914568782, - "learning_rate": 1.9101e-06, - "loss": 0.0, - "step": 80900 - }, - { - "epoch": 309.1603053435114, - "grad_norm": 0.001190527924336493, - "learning_rate": 1.9001000000000002e-06, - "loss": 0.0, - "step": 81000 - }, - { - "epoch": 309.5419847328244, - "grad_norm": 0.00010035101149696857, - "learning_rate": 1.8901000000000003e-06, - "loss": 0.0, - "step": 81100 - }, - { - "epoch": 309.9236641221374, - "grad_norm": 0.00019223205163143575, - "learning_rate": 1.8801000000000002e-06, - "loss": 0.0, - "step": 81200 - }, - { - "epoch": 310.30534351145036, - "grad_norm": 0.00016080672503449023, - "learning_rate": 1.8701000000000003e-06, - "loss": 0.0, - "step": 81300 - }, - { - "epoch": 310.68702290076334, - "grad_norm": 0.0001242541620740667, - "learning_rate": 1.8601000000000002e-06, - "loss": 0.0, - "step": 81400 - }, - { - "epoch": 311.0687022900763, - "grad_norm": 0.0003964394563809037, - "learning_rate": 1.8501000000000003e-06, - "loss": 0.0, - "step": 81500 - }, - { - "epoch": 311.4503816793893, - "grad_norm": 0.00014384974201675504, - "learning_rate": 1.8401000000000002e-06, - "loss": 0.0, - "step": 81600 - }, - { - "epoch": 311.8320610687023, - "grad_norm": 0.000176993606146425, - "learning_rate": 1.8301000000000003e-06, - "loss": 0.0001, - "step": 81700 - }, - { - "epoch": 312.21374045801525, - "grad_norm": 7.356610149145126e-05, - "learning_rate": 1.8201000000000002e-06, - "loss": 0.0, - "step": 81800 - }, - { - "epoch": 312.59541984732823, - "grad_norm": 3.0061322831897996e-05, - "learning_rate": 1.8101000000000003e-06, - "loss": 0.0, - "step": 81900 - }, - { - "epoch": 312.9770992366412, - "grad_norm": 0.00023035002232063562, - "learning_rate": 1.8001000000000001e-06, - "loss": 0.0, - "step": 82000 - }, - { - "epoch": 313.3587786259542, - "grad_norm": 0.0003503488260321319, - "learning_rate": 1.7901000000000002e-06, - "loss": 0.0, - "step": 82100 - }, - { - "epoch": 313.74045801526717, - "grad_norm": 4.8290072300005704e-05, - "learning_rate": 1.7801000000000001e-06, - "loss": 0.0002, - "step": 82200 - }, - { - "epoch": 314.12213740458014, - "grad_norm": 0.002625842113047838, - "learning_rate": 1.7701000000000002e-06, - "loss": 0.0, - "step": 82300 - }, - { - "epoch": 314.5038167938931, - "grad_norm": 0.00023868166317697614, - "learning_rate": 1.7601e-06, - "loss": 0.0001, - "step": 82400 - }, - { - "epoch": 314.8854961832061, - "grad_norm": 0.0006999046890996397, - "learning_rate": 1.7501000000000002e-06, - "loss": 0.0, - "step": 82500 - }, - { - "epoch": 315.2671755725191, - "grad_norm": 0.00010039243352366611, - "learning_rate": 1.7401e-06, - "loss": 0.0001, - "step": 82600 - }, - { - "epoch": 315.64885496183206, - "grad_norm": 0.00040519831236451864, - "learning_rate": 1.7301000000000002e-06, - "loss": 0.0, - "step": 82700 - }, - { - "epoch": 316.03053435114504, - "grad_norm": 9.347263403469697e-05, - "learning_rate": 1.7201e-06, - "loss": 0.0, - "step": 82800 - }, - { - "epoch": 316.412213740458, - "grad_norm": 0.0009526308858767152, - "learning_rate": 1.7101000000000002e-06, - "loss": 0.0, - "step": 82900 - }, - { - "epoch": 316.793893129771, - "grad_norm": 0.00015173423162195832, - "learning_rate": 1.7001e-06, - "loss": 0.0, - "step": 83000 - }, - { - "epoch": 317.17557251908397, - "grad_norm": 0.0003836711111944169, - "learning_rate": 1.6901000000000001e-06, - "loss": 0.0, - "step": 83100 - }, - { - "epoch": 317.55725190839695, - "grad_norm": 0.00022331967193167657, - "learning_rate": 1.6801e-06, - "loss": 0.0, - "step": 83200 - }, - { - "epoch": 317.9389312977099, - "grad_norm": 0.00014383331290446222, - "learning_rate": 1.6701000000000001e-06, - "loss": 0.0, - "step": 83300 - }, - { - "epoch": 318.3206106870229, - "grad_norm": 0.000119604985229671, - "learning_rate": 1.6601e-06, - "loss": 0.0, - "step": 83400 - }, - { - "epoch": 318.7022900763359, - "grad_norm": 0.00012338584929239005, - "learning_rate": 1.6501e-06, - "loss": 0.0, - "step": 83500 - }, - { - "epoch": 319.08396946564886, - "grad_norm": 0.0006752696353942156, - "learning_rate": 1.6401e-06, - "loss": 0.0, - "step": 83600 - }, - { - "epoch": 319.46564885496184, - "grad_norm": 0.000791439728345722, - "learning_rate": 1.6301e-06, - "loss": 0.0, - "step": 83700 - }, - { - "epoch": 319.8473282442748, - "grad_norm": 0.0007236794335767627, - "learning_rate": 1.6201e-06, - "loss": 0.0, - "step": 83800 - }, - { - "epoch": 320.2290076335878, - "grad_norm": 0.0007148933946155012, - "learning_rate": 1.6101e-06, - "loss": 0.0, - "step": 83900 - }, - { - "epoch": 320.6106870229008, - "grad_norm": 0.002037597121670842, - "learning_rate": 1.6001000000000004e-06, - "loss": 0.0, - "step": 84000 - }, - { - "epoch": 320.99236641221376, - "grad_norm": 0.0002546102332416922, - "learning_rate": 1.5901000000000002e-06, - "loss": 0.0, - "step": 84100 - }, - { - "epoch": 321.37404580152673, - "grad_norm": 0.030380288138985634, - "learning_rate": 1.5801000000000003e-06, - "loss": 0.0, - "step": 84200 - }, - { - "epoch": 321.7557251908397, - "grad_norm": 0.0003261581587139517, - "learning_rate": 1.5701000000000002e-06, - "loss": 0.0003, - "step": 84300 - }, - { - "epoch": 322.1374045801527, - "grad_norm": 0.00011467019066913053, - "learning_rate": 1.5601000000000003e-06, - "loss": 0.0, - "step": 84400 - }, - { - "epoch": 322.51908396946567, - "grad_norm": 0.00027519240393303335, - "learning_rate": 1.5501000000000002e-06, - "loss": 0.0, - "step": 84500 - }, - { - "epoch": 322.90076335877865, - "grad_norm": 0.00020273112750146538, - "learning_rate": 1.5401000000000003e-06, - "loss": 0.0, - "step": 84600 - }, - { - "epoch": 323.2824427480916, - "grad_norm": 9.298837539972737e-05, - "learning_rate": 1.5301000000000002e-06, - "loss": 0.0, - "step": 84700 - }, - { - "epoch": 323.6641221374046, - "grad_norm": 0.0004691576468758285, - "learning_rate": 1.5201000000000003e-06, - "loss": 0.0, - "step": 84800 - }, - { - "epoch": 324.0458015267176, - "grad_norm": 0.00035024865064769983, - "learning_rate": 1.5101000000000002e-06, - "loss": 0.0, - "step": 84900 - }, - { - "epoch": 324.42748091603056, - "grad_norm": 0.0001968619180843234, - "learning_rate": 1.5001000000000003e-06, - "loss": 0.0, - "step": 85000 - }, - { - "epoch": 324.42748091603056, - "eval_loss": 0.07435577362775803, - "eval_runtime": 12.7045, - "eval_sacrebleu": 98.37224586357964, - "eval_samples_per_second": 73.596, - "eval_steps_per_second": 1.181, - "step": 85000 - }, - { - "epoch": 324.80916030534354, - "grad_norm": 0.00010008819663198665, - "learning_rate": 1.4901000000000001e-06, - "loss": 0.0001, - "step": 85100 - }, - { - "epoch": 325.19083969465646, - "grad_norm": 0.01809822954237461, - "learning_rate": 1.4801000000000002e-06, - "loss": 0.0001, - "step": 85200 - }, - { - "epoch": 325.57251908396944, - "grad_norm": 0.0018511655507609248, - "learning_rate": 1.4701000000000001e-06, - "loss": 0.0002, - "step": 85300 - }, - { - "epoch": 325.9541984732824, - "grad_norm": 0.0013542008819058537, - "learning_rate": 1.4601000000000002e-06, - "loss": 0.0, - "step": 85400 - }, - { - "epoch": 326.3358778625954, - "grad_norm": 0.00041448758565820754, - "learning_rate": 1.4501e-06, - "loss": 0.0, - "step": 85500 - }, - { - "epoch": 326.7175572519084, - "grad_norm": 0.000122825222206302, - "learning_rate": 1.4401000000000002e-06, - "loss": 0.0, - "step": 85600 - }, - { - "epoch": 327.09923664122135, - "grad_norm": 5.630105079035275e-05, - "learning_rate": 1.4301e-06, - "loss": 0.0, - "step": 85700 - }, - { - "epoch": 327.48091603053433, - "grad_norm": 9.909773507388309e-05, - "learning_rate": 1.4201000000000002e-06, - "loss": 0.0, - "step": 85800 - }, - { - "epoch": 327.8625954198473, - "grad_norm": 0.00021611254487652332, - "learning_rate": 1.4101e-06, - "loss": 0.0, - "step": 85900 - }, - { - "epoch": 328.2442748091603, - "grad_norm": 0.0004478337650652975, - "learning_rate": 1.4001000000000002e-06, - "loss": 0.0, - "step": 86000 - }, - { - "epoch": 328.62595419847327, - "grad_norm": 0.00014239976007957011, - "learning_rate": 1.3901e-06, - "loss": 0.0, - "step": 86100 - }, - { - "epoch": 329.00763358778624, - "grad_norm": 0.0001868542021838948, - "learning_rate": 1.3801000000000001e-06, - "loss": 0.0, - "step": 86200 - }, - { - "epoch": 329.3893129770992, - "grad_norm": 3.0120936571620405e-05, - "learning_rate": 1.3701e-06, - "loss": 0.0, - "step": 86300 - }, - { - "epoch": 329.7709923664122, - "grad_norm": 9.835998935159296e-05, - "learning_rate": 1.3601000000000001e-06, - "loss": 0.0, - "step": 86400 - }, - { - "epoch": 330.1526717557252, - "grad_norm": 0.0001276729308301583, - "learning_rate": 1.3501e-06, - "loss": 0.0, - "step": 86500 - }, - { - "epoch": 330.53435114503816, - "grad_norm": 0.00018047768389806151, - "learning_rate": 1.3401e-06, - "loss": 0.0, - "step": 86600 - }, - { - "epoch": 330.91603053435114, - "grad_norm": 0.000446511636255309, - "learning_rate": 1.3301e-06, - "loss": 0.0, - "step": 86700 - }, - { - "epoch": 331.2977099236641, - "grad_norm": 0.00010882138303713873, - "learning_rate": 1.3201e-06, - "loss": 0.0, - "step": 86800 - }, - { - "epoch": 331.6793893129771, - "grad_norm": 0.00011000638187397271, - "learning_rate": 1.3101e-06, - "loss": 0.0, - "step": 86900 - }, - { - "epoch": 332.0610687022901, - "grad_norm": 0.0008669011294841766, - "learning_rate": 1.3001e-06, - "loss": 0.0, - "step": 87000 - }, - { - "epoch": 332.44274809160305, - "grad_norm": 0.00014212194946594536, - "learning_rate": 1.2901000000000002e-06, - "loss": 0.0, - "step": 87100 - }, - { - "epoch": 332.82442748091603, - "grad_norm": 0.004548476077616215, - "learning_rate": 1.2801000000000002e-06, - "loss": 0.0, - "step": 87200 - }, - { - "epoch": 333.206106870229, - "grad_norm": 0.000261221342952922, - "learning_rate": 1.2701000000000001e-06, - "loss": 0.0002, - "step": 87300 - }, - { - "epoch": 333.587786259542, - "grad_norm": 4.875214654020965e-05, - "learning_rate": 1.2601000000000002e-06, - "loss": 0.0001, - "step": 87400 - }, - { - "epoch": 333.96946564885496, - "grad_norm": 0.0002320551866432652, - "learning_rate": 1.2501000000000001e-06, - "loss": 0.0, - "step": 87500 - }, - { - "epoch": 334.35114503816794, - "grad_norm": 0.0005810837028548121, - "learning_rate": 1.2401e-06, - "loss": 0.0, - "step": 87600 - }, - { - "epoch": 334.7328244274809, - "grad_norm": 7.942037336761132e-05, - "learning_rate": 1.2301e-06, - "loss": 0.0001, - "step": 87700 - }, - { - "epoch": 335.1145038167939, - "grad_norm": 0.0020920836832374334, - "learning_rate": 1.2201e-06, - "loss": 0.0, - "step": 87800 - }, - { - "epoch": 335.4961832061069, - "grad_norm": 0.000243748290813528, - "learning_rate": 1.2101e-06, - "loss": 0.0, - "step": 87900 - }, - { - "epoch": 335.87786259541986, - "grad_norm": 0.00022767962946090847, - "learning_rate": 1.2001000000000002e-06, - "loss": 0.0, - "step": 88000 - }, - { - "epoch": 336.25954198473283, - "grad_norm": 0.00018849635671358556, - "learning_rate": 1.1901e-06, - "loss": 0.0, - "step": 88100 - }, - { - "epoch": 336.6412213740458, - "grad_norm": 0.0013850952964276075, - "learning_rate": 1.1801000000000001e-06, - "loss": 0.0, - "step": 88200 - }, - { - "epoch": 337.0229007633588, - "grad_norm": 0.00010375339479651302, - "learning_rate": 1.1701e-06, - "loss": 0.0, - "step": 88300 - }, - { - "epoch": 337.40458015267177, - "grad_norm": 0.00021955980628263205, - "learning_rate": 1.1601000000000001e-06, - "loss": 0.0, - "step": 88400 - }, - { - "epoch": 337.78625954198475, - "grad_norm": 0.00013619363016914576, - "learning_rate": 1.1501e-06, - "loss": 0.0, - "step": 88500 - }, - { - "epoch": 338.1679389312977, - "grad_norm": 0.00012820912525057793, - "learning_rate": 1.1401e-06, - "loss": 0.0, - "step": 88600 - }, - { - "epoch": 338.5496183206107, - "grad_norm": 8.258437446784228e-05, - "learning_rate": 1.1301e-06, - "loss": 0.0, - "step": 88700 - }, - { - "epoch": 338.9312977099237, - "grad_norm": 0.00011626326886471361, - "learning_rate": 1.1201e-06, - "loss": 0.0, - "step": 88800 - }, - { - "epoch": 339.31297709923666, - "grad_norm": 0.00023723025515209883, - "learning_rate": 1.1101e-06, - "loss": 0.0, - "step": 88900 - }, - { - "epoch": 339.69465648854964, - "grad_norm": 0.0001802946353564039, - "learning_rate": 1.1001e-06, - "loss": 0.0, - "step": 89000 - }, - { - "epoch": 340.0763358778626, - "grad_norm": 0.00044537289068102837, - "learning_rate": 1.0901e-06, - "loss": 0.0001, - "step": 89100 - }, - { - "epoch": 340.4580152671756, - "grad_norm": 0.00014735484728589654, - "learning_rate": 1.0801e-06, - "loss": 0.0, - "step": 89200 - }, - { - "epoch": 340.8396946564886, - "grad_norm": 0.00035787318483926356, - "learning_rate": 1.0701e-06, - "loss": 0.0, - "step": 89300 - }, - { - "epoch": 341.22137404580155, - "grad_norm": 0.00013804069021716714, - "learning_rate": 1.0601e-06, - "loss": 0.0, - "step": 89400 - }, - { - "epoch": 341.60305343511453, - "grad_norm": 0.0003113312122877687, - "learning_rate": 1.0501000000000001e-06, - "loss": 0.0, - "step": 89500 - }, - { - "epoch": 341.98473282442745, - "grad_norm": 0.0010811437387019396, - "learning_rate": 1.0401000000000002e-06, - "loss": 0.0, - "step": 89600 - }, - { - "epoch": 342.36641221374043, - "grad_norm": 0.0008038708474487066, - "learning_rate": 1.0301e-06, - "loss": 0.0, - "step": 89700 - }, - { - "epoch": 342.7480916030534, - "grad_norm": 0.00013997878704685718, - "learning_rate": 1.0201000000000002e-06, - "loss": 0.0, - "step": 89800 - }, - { - "epoch": 343.1297709923664, - "grad_norm": 0.0071556600742042065, - "learning_rate": 1.0101e-06, - "loss": 0.0, - "step": 89900 - }, - { - "epoch": 343.51145038167937, - "grad_norm": 0.0001866053498815745, - "learning_rate": 1.0001000000000002e-06, - "loss": 0.0, - "step": 90000 - }, - { - "epoch": 343.51145038167937, - "eval_loss": 0.07229577004909515, - "eval_runtime": 12.258, - "eval_sacrebleu": 98.36601855157615, - "eval_samples_per_second": 76.277, - "eval_steps_per_second": 1.224, - "step": 90000 - }, - { - "epoch": 343.89312977099235, - "grad_norm": 0.00019708641048055142, - "learning_rate": 9.901e-07, - "loss": 0.0, - "step": 90100 - }, - { - "epoch": 344.2748091603053, - "grad_norm": 0.00095375906676054, - "learning_rate": 9.801000000000002e-07, - "loss": 0.0, - "step": 90200 - }, - { - "epoch": 344.6564885496183, - "grad_norm": 5.31758923898451e-05, - "learning_rate": 9.701e-07, - "loss": 0.0, - "step": 90300 - }, - { - "epoch": 345.0381679389313, - "grad_norm": 0.0016230475157499313, - "learning_rate": 9.601000000000001e-07, - "loss": 0.0, - "step": 90400 - }, - { - "epoch": 345.41984732824426, - "grad_norm": 0.00027962654712609947, - "learning_rate": 9.501e-07, - "loss": 0.0, - "step": 90500 - }, - { - "epoch": 345.80152671755724, - "grad_norm": 0.0010087329428642988, - "learning_rate": 9.401e-07, - "loss": 0.0, - "step": 90600 - }, - { - "epoch": 346.1832061068702, - "grad_norm": 0.0010386575013399124, - "learning_rate": 9.301e-07, - "loss": 0.0, - "step": 90700 - }, - { - "epoch": 346.5648854961832, - "grad_norm": 0.00026174180675297976, - "learning_rate": 9.201e-07, - "loss": 0.0, - "step": 90800 - }, - { - "epoch": 346.9465648854962, - "grad_norm": 0.0002297286264365539, - "learning_rate": 9.101e-07, - "loss": 0.0, - "step": 90900 - }, - { - "epoch": 347.32824427480915, - "grad_norm": 4.946750414092094e-05, - "learning_rate": 9.001000000000002e-07, - "loss": 0.0, - "step": 91000 - }, - { - "epoch": 347.70992366412213, - "grad_norm": 0.001794499228708446, - "learning_rate": 8.901000000000002e-07, - "loss": 0.0, - "step": 91100 - }, - { - "epoch": 348.0916030534351, - "grad_norm": 0.00028683498385362327, - "learning_rate": 8.801000000000002e-07, - "loss": 0.0, - "step": 91200 - }, - { - "epoch": 348.4732824427481, - "grad_norm": 0.0002974801172968, - "learning_rate": 8.701000000000001e-07, - "loss": 0.0, - "step": 91300 - }, - { - "epoch": 348.85496183206106, - "grad_norm": 0.00011121420538984239, - "learning_rate": 8.601000000000001e-07, - "loss": 0.0, - "step": 91400 - }, - { - "epoch": 349.23664122137404, - "grad_norm": 0.00012437546683941036, - "learning_rate": 8.501000000000001e-07, - "loss": 0.0, - "step": 91500 - }, - { - "epoch": 349.618320610687, - "grad_norm": 0.0032250406220555305, - "learning_rate": 8.401000000000001e-07, - "loss": 0.0, - "step": 91600 - }, - { - "epoch": 350.0, - "grad_norm": 9.277357457904145e-05, - "learning_rate": 8.301000000000001e-07, - "loss": 0.0, - "step": 91700 - }, - { - "epoch": 350.381679389313, - "grad_norm": 0.0005177999846637249, - "learning_rate": 8.201000000000001e-07, - "loss": 0.0, - "step": 91800 - }, - { - "epoch": 350.76335877862596, - "grad_norm": 8.722688653506339e-05, - "learning_rate": 8.101000000000001e-07, - "loss": 0.0, - "step": 91900 - }, - { - "epoch": 351.14503816793894, - "grad_norm": 3.59059231413994e-05, - "learning_rate": 8.001000000000001e-07, - "loss": 0.0, - "step": 92000 - }, - { - "epoch": 351.5267175572519, - "grad_norm": 0.00034865588531829417, - "learning_rate": 7.901000000000001e-07, - "loss": 0.0, - "step": 92100 - }, - { - "epoch": 351.9083969465649, - "grad_norm": 0.013772457838058472, - "learning_rate": 7.801000000000001e-07, - "loss": 0.0, - "step": 92200 - }, - { - "epoch": 352.29007633587787, - "grad_norm": 0.000194388281670399, - "learning_rate": 7.701e-07, - "loss": 0.0, - "step": 92300 - }, - { - "epoch": 352.67175572519085, - "grad_norm": 0.00030055633396841586, - "learning_rate": 7.601e-07, - "loss": 0.0, - "step": 92400 - }, - { - "epoch": 353.0534351145038, - "grad_norm": 0.0001172588235931471, - "learning_rate": 7.501e-07, - "loss": 0.0, - "step": 92500 - }, - { - "epoch": 353.4351145038168, - "grad_norm": 0.00011405652185203508, - "learning_rate": 7.401000000000001e-07, - "loss": 0.0, - "step": 92600 - }, - { - "epoch": 353.8167938931298, - "grad_norm": 6.664705870207399e-05, - "learning_rate": 7.301000000000001e-07, - "loss": 0.0, - "step": 92700 - }, - { - "epoch": 354.19847328244276, - "grad_norm": 0.0002055191871477291, - "learning_rate": 7.201000000000001e-07, - "loss": 0.0001, - "step": 92800 - }, - { - "epoch": 354.58015267175574, - "grad_norm": 0.0005788641283288598, - "learning_rate": 7.101000000000001e-07, - "loss": 0.0, - "step": 92900 - }, - { - "epoch": 354.9618320610687, - "grad_norm": 0.0004745021287817508, - "learning_rate": 7.001000000000001e-07, - "loss": 0.0, - "step": 93000 - }, - { - "epoch": 355.3435114503817, - "grad_norm": 0.00022238130623009056, - "learning_rate": 6.901000000000001e-07, - "loss": 0.0, - "step": 93100 - }, - { - "epoch": 355.7251908396947, - "grad_norm": 5.5835618695709854e-05, - "learning_rate": 6.801000000000001e-07, - "loss": 0.0, - "step": 93200 - }, - { - "epoch": 356.10687022900765, - "grad_norm": 0.0005699278553947806, - "learning_rate": 6.701e-07, - "loss": 0.0, - "step": 93300 - }, - { - "epoch": 356.48854961832063, - "grad_norm": 0.00013945194950792938, - "learning_rate": 6.601e-07, - "loss": 0.0, - "step": 93400 - }, - { - "epoch": 356.8702290076336, - "grad_norm": 0.00012388595496304333, - "learning_rate": 6.501e-07, - "loss": 0.0, - "step": 93500 - }, - { - "epoch": 357.2519083969466, - "grad_norm": 6.234057946130633e-05, - "learning_rate": 6.401e-07, - "loss": 0.0, - "step": 93600 - }, - { - "epoch": 357.63358778625957, - "grad_norm": 0.00012075301492586732, - "learning_rate": 6.301e-07, - "loss": 0.0, - "step": 93700 - }, - { - "epoch": 358.01526717557255, - "grad_norm": 0.0002147838968085125, - "learning_rate": 6.201000000000001e-07, - "loss": 0.0, - "step": 93800 - }, - { - "epoch": 358.39694656488547, - "grad_norm": 7.291266956599429e-05, - "learning_rate": 6.101000000000001e-07, - "loss": 0.0, - "step": 93900 - }, - { - "epoch": 358.77862595419845, - "grad_norm": 0.0001753059623297304, - "learning_rate": 6.001000000000001e-07, - "loss": 0.0, - "step": 94000 - }, - { - "epoch": 359.1603053435114, - "grad_norm": 0.021427402272820473, - "learning_rate": 5.901000000000001e-07, - "loss": 0.0, - "step": 94100 - }, - { - "epoch": 359.5419847328244, - "grad_norm": 3.56302443833556e-05, - "learning_rate": 5.801000000000001e-07, - "loss": 0.0, - "step": 94200 - }, - { - "epoch": 359.9236641221374, - "grad_norm": 0.0004847725504077971, - "learning_rate": 5.701000000000001e-07, - "loss": 0.0001, - "step": 94300 - }, - { - "epoch": 360.30534351145036, - "grad_norm": 0.0005537783144973218, - "learning_rate": 5.601e-07, - "loss": 0.0, - "step": 94400 - }, - { - "epoch": 360.68702290076334, - "grad_norm": 8.484358113491908e-05, - "learning_rate": 5.501e-07, - "loss": 0.0, - "step": 94500 - }, - { - "epoch": 361.0687022900763, - "grad_norm": 0.003139751497656107, - "learning_rate": 5.401e-07, - "loss": 0.0, - "step": 94600 - }, - { - "epoch": 361.4503816793893, - "grad_norm": 0.003733165329322219, - "learning_rate": 5.301e-07, - "loss": 0.0, - "step": 94700 - }, - { - "epoch": 361.8320610687023, - "grad_norm": 0.00021900184219703078, - "learning_rate": 5.201e-07, - "loss": 0.0, - "step": 94800 - }, - { - "epoch": 362.21374045801525, - "grad_norm": 0.00019141900702379644, - "learning_rate": 5.101e-07, - "loss": 0.0, - "step": 94900 - }, - { - "epoch": 362.59541984732823, - "grad_norm": 0.00020785974629689008, - "learning_rate": 5.001e-07, - "loss": 0.0, - "step": 95000 - }, - { - "epoch": 362.59541984732823, - "eval_loss": 0.07603045552968979, - "eval_runtime": 12.2727, - "eval_sacrebleu": 98.31584294549747, - "eval_samples_per_second": 76.185, - "eval_steps_per_second": 1.222, - "step": 95000 - }, - { - "epoch": 362.9770992366412, - "grad_norm": 0.0004615593352355063, - "learning_rate": 4.901e-07, - "loss": 0.0, - "step": 95100 - }, - { - "epoch": 363.3587786259542, - "grad_norm": 6.932980613783002e-05, - "learning_rate": 4.801e-07, - "loss": 0.0, - "step": 95200 - }, - { - "epoch": 363.74045801526717, - "grad_norm": 8.285464718937874e-05, - "learning_rate": 4.7010000000000006e-07, - "loss": 0.0, - "step": 95300 - }, - { - "epoch": 364.12213740458014, - "grad_norm": 9.155921725323424e-05, - "learning_rate": 4.6010000000000005e-07, - "loss": 0.0, - "step": 95400 - }, - { - "epoch": 364.5038167938931, - "grad_norm": 0.00012713043543044478, - "learning_rate": 4.5010000000000004e-07, - "loss": 0.0, - "step": 95500 - }, - { - "epoch": 364.8854961832061, - "grad_norm": 3.598256080294959e-05, - "learning_rate": 4.4010000000000003e-07, - "loss": 0.0, - "step": 95600 - }, - { - "epoch": 365.2671755725191, - "grad_norm": 0.0001501008082414046, - "learning_rate": 4.301e-07, - "loss": 0.0, - "step": 95700 - }, - { - "epoch": 365.64885496183206, - "grad_norm": 9.735300409374759e-05, - "learning_rate": 4.201e-07, - "loss": 0.0, - "step": 95800 - }, - { - "epoch": 366.03053435114504, - "grad_norm": 0.0005442662513814867, - "learning_rate": 4.101e-07, - "loss": 0.0, - "step": 95900 - }, - { - "epoch": 366.412213740458, - "grad_norm": 5.740653068642132e-05, - "learning_rate": 4.001e-07, - "loss": 0.0, - "step": 96000 - }, - { - "epoch": 366.793893129771, - "grad_norm": 0.00012417425750754774, - "learning_rate": 3.901000000000001e-07, - "loss": 0.0, - "step": 96100 - }, - { - "epoch": 367.17557251908397, - "grad_norm": 3.415640821913257e-05, - "learning_rate": 3.801000000000001e-07, - "loss": 0.0, - "step": 96200 - }, - { - "epoch": 367.55725190839695, - "grad_norm": 0.0008031368488445878, - "learning_rate": 3.7010000000000006e-07, - "loss": 0.0, - "step": 96300 - }, - { - "epoch": 367.9389312977099, - "grad_norm": 3.9686976379016414e-05, - "learning_rate": 3.6010000000000005e-07, - "loss": 0.0, - "step": 96400 - }, - { - "epoch": 368.3206106870229, - "grad_norm": 0.0001224290463142097, - "learning_rate": 3.5010000000000004e-07, - "loss": 0.0, - "step": 96500 - }, - { - "epoch": 368.7022900763359, - "grad_norm": 0.0076124235056340694, - "learning_rate": 3.4010000000000003e-07, - "loss": 0.0, - "step": 96600 - }, - { - "epoch": 369.08396946564886, - "grad_norm": 0.00038530558231286705, - "learning_rate": 3.301e-07, - "loss": 0.0, - "step": 96700 - }, - { - "epoch": 369.46564885496184, - "grad_norm": 0.0021456016693264246, - "learning_rate": 3.201e-07, - "loss": 0.0, - "step": 96800 - }, - { - "epoch": 369.8473282442748, - "grad_norm": 0.00017911805480252951, - "learning_rate": 3.101e-07, - "loss": 0.0, - "step": 96900 - }, - { - "epoch": 370.2290076335878, - "grad_norm": 0.00015663144586142153, - "learning_rate": 3.001e-07, - "loss": 0.0, - "step": 97000 - }, - { - "epoch": 370.6106870229008, - "grad_norm": 0.006977499928325415, - "learning_rate": 2.9010000000000004e-07, - "loss": 0.0, - "step": 97100 - }, - { - "epoch": 370.99236641221376, - "grad_norm": 0.0003379581612534821, - "learning_rate": 2.8010000000000003e-07, - "loss": 0.0, - "step": 97200 - }, - { - "epoch": 371.37404580152673, - "grad_norm": 0.0040053073316812515, - "learning_rate": 2.701e-07, - "loss": 0.0, - "step": 97300 - }, - { - "epoch": 371.7557251908397, - "grad_norm": 5.939768743701279e-05, - "learning_rate": 2.601e-07, - "loss": 0.0, - "step": 97400 - }, - { - "epoch": 372.1374045801527, - "grad_norm": 0.00012827917817048728, - "learning_rate": 2.5010000000000005e-07, - "loss": 0.0, - "step": 97500 - }, - { - "epoch": 372.51908396946567, - "grad_norm": 7.440439367201179e-05, - "learning_rate": 2.4010000000000004e-07, - "loss": 0.0, - "step": 97600 - }, - { - "epoch": 372.90076335877865, - "grad_norm": 0.000359352066880092, - "learning_rate": 2.301e-07, - "loss": 0.0, - "step": 97700 - }, - { - "epoch": 373.2824427480916, - "grad_norm": 1.8802778868121095e-05, - "learning_rate": 2.201e-07, - "loss": 0.0, - "step": 97800 - }, - { - "epoch": 373.6641221374046, - "grad_norm": 0.0007017580792307854, - "learning_rate": 2.1010000000000004e-07, - "loss": 0.0, - "step": 97900 - }, - { - "epoch": 374.0458015267176, - "grad_norm": 0.0001738242426654324, - "learning_rate": 2.0010000000000002e-07, - "loss": 0.0, - "step": 98000 - }, - { - "epoch": 374.42748091603056, - "grad_norm": 7.502128573833033e-05, - "learning_rate": 1.9010000000000001e-07, - "loss": 0.0001, - "step": 98100 - }, - { - "epoch": 374.80916030534354, - "grad_norm": 6.759719690307975e-05, - "learning_rate": 1.8010000000000003e-07, - "loss": 0.0, - "step": 98200 - }, - { - "epoch": 375.19083969465646, - "grad_norm": 0.0006756431539542973, - "learning_rate": 1.7010000000000002e-07, - "loss": 0.0, - "step": 98300 - }, - { - "epoch": 375.57251908396944, - "grad_norm": 0.00027672393480315804, - "learning_rate": 1.601e-07, - "loss": 0.0, - "step": 98400 - }, - { - "epoch": 375.9541984732824, - "grad_norm": 0.00042677694000303745, - "learning_rate": 1.5010000000000003e-07, - "loss": 0.0, - "step": 98500 - }, - { - "epoch": 376.3358778625954, - "grad_norm": 0.00014421867672353983, - "learning_rate": 1.4010000000000002e-07, - "loss": 0.0, - "step": 98600 - }, - { - "epoch": 376.7175572519084, - "grad_norm": 6.646148540312424e-05, - "learning_rate": 1.301e-07, - "loss": 0.0, - "step": 98700 - }, - { - "epoch": 377.09923664122135, - "grad_norm": 4.318665378377773e-05, - "learning_rate": 1.201e-07, - "loss": 0.0, - "step": 98800 - }, - { - "epoch": 377.48091603053433, - "grad_norm": 7.311939407372847e-05, - "learning_rate": 1.1010000000000001e-07, - "loss": 0.0, - "step": 98900 - }, - { - "epoch": 377.8625954198473, - "grad_norm": 4.21335716964677e-05, - "learning_rate": 1.001e-07, - "loss": 0.0, - "step": 99000 - }, - { - "epoch": 378.2442748091603, - "grad_norm": 0.00013328553177416325, - "learning_rate": 9.010000000000002e-08, - "loss": 0.0, - "step": 99100 - }, - { - "epoch": 378.62595419847327, - "grad_norm": 0.0005755685269832611, - "learning_rate": 8.010000000000001e-08, - "loss": 0.0, - "step": 99200 - }, - { - "epoch": 379.00763358778624, - "grad_norm": 0.005619984585791826, - "learning_rate": 7.01e-08, - "loss": 0.0, - "step": 99300 - }, - { - "epoch": 379.3893129770992, - "grad_norm": 0.0007815620047040284, - "learning_rate": 6.01e-08, - "loss": 0.0, - "step": 99400 - }, - { - "epoch": 379.7709923664122, - "grad_norm": 0.00047278162674047053, - "learning_rate": 5.01e-08, - "loss": 0.0, - "step": 99500 - }, - { - "epoch": 380.1526717557252, - "grad_norm": 0.00017434738401789218, - "learning_rate": 4.01e-08, - "loss": 0.0, - "step": 99600 - }, - { - "epoch": 380.53435114503816, - "grad_norm": 0.00015467384946532547, - "learning_rate": 3.0100000000000005e-08, - "loss": 0.0001, - "step": 99700 - }, - { - "epoch": 380.91603053435114, - "grad_norm": 0.000859471969306469, - "learning_rate": 2.01e-08, - "loss": 0.0, - "step": 99800 - }, - { - "epoch": 381.2977099236641, - "grad_norm": 0.00020486314315348864, - "learning_rate": 1.0100000000000001e-08, - "loss": 0.0, - "step": 99900 - }, - { - "epoch": 381.6793893129771, - "grad_norm": 0.00019715058442670852, - "learning_rate": 1.0000000000000002e-10, - "loss": 0.0, - "step": 100000 - }, - { - "epoch": 381.6793893129771, - "eval_loss": 0.07631514966487885, - "eval_runtime": 12.5973, - "eval_sacrebleu": 98.33262341854797, - "eval_samples_per_second": 74.222, - "eval_steps_per_second": 1.191, - "step": 100000 + "epoch": 0.02686222365487415, + "eval_loss": 1.550229787826538, + "eval_runtime": 3482.8166, + "eval_sacrebleu": 96.21455916515954, + "eval_samples_per_second": 76.009, + "eval_steps_per_second": 0.594, + "step": 1000 } ], "logging_steps": 100, - "max_steps": 100000, + "max_steps": 50000, "num_input_tokens_seen": 0, - "num_train_epochs": 382, - "save_steps": 5000, + "num_train_epochs": 2, + "save_steps": 1000, "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0001 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": true + "should_training_stop": false }, "attributes": {} } }, - "total_flos": 4.651917417256059e+17, - "train_batch_size": 32, + "total_flos": 1.6604548281925632e+16, + "train_batch_size": 64, "trial_name": null, "trial_params": null }