| { | |
| "best_metric": 0.10325244069099426, | |
| "best_model_checkpoint": "./fine-tuned/checkpoint-12500", | |
| "epoch": 2.195486080618249, | |
| "eval_steps": 100, | |
| "global_step": 12500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008781944322472996, | |
| "grad_norm": 212427.96875, | |
| "learning_rate": 2.9934129632882487e-05, | |
| "loss": 0.5421, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.017563888644945992, | |
| "grad_norm": 15316.291015625, | |
| "learning_rate": 2.9868259265764974e-05, | |
| "loss": 0.1903, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.017563888644945992, | |
| "eval_loss": 0.16095133125782013, | |
| "eval_runtime": 175.5949, | |
| "eval_samples_per_second": 25.399, | |
| "eval_steps_per_second": 3.178, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.026345832967418988, | |
| "grad_norm": 21344.13671875, | |
| "learning_rate": 2.980238889864746e-05, | |
| "loss": 0.1742, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.035127777289891984, | |
| "grad_norm": 26603.357421875, | |
| "learning_rate": 2.973651853152995e-05, | |
| "loss": 0.164, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.035127777289891984, | |
| "eval_loss": 0.14671418070793152, | |
| "eval_runtime": 175.3478, | |
| "eval_samples_per_second": 25.435, | |
| "eval_steps_per_second": 3.182, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.04390972161236498, | |
| "grad_norm": 18468.01953125, | |
| "learning_rate": 2.9670648164412437e-05, | |
| "loss": 0.1697, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.052691665934837977, | |
| "grad_norm": 15799.6875, | |
| "learning_rate": 2.9604777797294924e-05, | |
| "loss": 0.161, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.052691665934837977, | |
| "eval_loss": 0.14008501172065735, | |
| "eval_runtime": 175.2345, | |
| "eval_samples_per_second": 25.452, | |
| "eval_steps_per_second": 3.184, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.06147361025731097, | |
| "grad_norm": 17163.763671875, | |
| "learning_rate": 2.953890743017741e-05, | |
| "loss": 0.1634, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.07025555457978397, | |
| "grad_norm": 17603.025390625, | |
| "learning_rate": 2.94730370630599e-05, | |
| "loss": 0.1543, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07025555457978397, | |
| "eval_loss": 0.13591521978378296, | |
| "eval_runtime": 175.0506, | |
| "eval_samples_per_second": 25.478, | |
| "eval_steps_per_second": 3.188, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07903749890225696, | |
| "grad_norm": 12623.9189453125, | |
| "learning_rate": 2.9407166695942387e-05, | |
| "loss": 0.1417, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.08781944322472995, | |
| "grad_norm": 14828.5, | |
| "learning_rate": 2.9341296328824874e-05, | |
| "loss": 0.1403, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08781944322472995, | |
| "eval_loss": 0.13329531252384186, | |
| "eval_runtime": 175.1721, | |
| "eval_samples_per_second": 25.461, | |
| "eval_steps_per_second": 3.185, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09660138754720295, | |
| "grad_norm": 16192.8515625, | |
| "learning_rate": 2.927542596170736e-05, | |
| "loss": 0.1444, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.10538333186967595, | |
| "grad_norm": 20510.47265625, | |
| "learning_rate": 2.9209555594589847e-05, | |
| "loss": 0.1466, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.10538333186967595, | |
| "eval_loss": 0.1307835429906845, | |
| "eval_runtime": 175.06, | |
| "eval_samples_per_second": 25.477, | |
| "eval_steps_per_second": 3.187, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.11416527619214895, | |
| "grad_norm": 10555.8408203125, | |
| "learning_rate": 2.9143685227472337e-05, | |
| "loss": 0.1472, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.12294722051462194, | |
| "grad_norm": 12451.990234375, | |
| "learning_rate": 2.907781486035482e-05, | |
| "loss": 0.1415, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.12294722051462194, | |
| "eval_loss": 0.1288571059703827, | |
| "eval_runtime": 175.1799, | |
| "eval_samples_per_second": 25.46, | |
| "eval_steps_per_second": 3.185, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.13172916483709493, | |
| "grad_norm": 11173.96875, | |
| "learning_rate": 2.901194449323731e-05, | |
| "loss": 0.1368, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.14051110915956794, | |
| "grad_norm": 47561.75, | |
| "learning_rate": 2.8946074126119797e-05, | |
| "loss": 0.1399, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.14051110915956794, | |
| "eval_loss": 0.12726937234401703, | |
| "eval_runtime": 175.2229, | |
| "eval_samples_per_second": 25.453, | |
| "eval_steps_per_second": 3.185, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.14929305348204092, | |
| "grad_norm": 11766.6767578125, | |
| "learning_rate": 2.8880203759002283e-05, | |
| "loss": 0.1433, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.15807499780451392, | |
| "grad_norm": 14977.416015625, | |
| "learning_rate": 2.881433339188477e-05, | |
| "loss": 0.1371, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.15807499780451392, | |
| "eval_loss": 0.12529444694519043, | |
| "eval_runtime": 174.8253, | |
| "eval_samples_per_second": 25.511, | |
| "eval_steps_per_second": 3.192, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1668569421269869, | |
| "grad_norm": 11109.173828125, | |
| "learning_rate": 2.874846302476726e-05, | |
| "loss": 0.1292, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.1756388864494599, | |
| "grad_norm": 9897.7958984375, | |
| "learning_rate": 2.8682592657649747e-05, | |
| "loss": 0.1351, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1756388864494599, | |
| "eval_loss": 0.12485189735889435, | |
| "eval_runtime": 174.8115, | |
| "eval_samples_per_second": 25.513, | |
| "eval_steps_per_second": 3.192, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.18442083077193291, | |
| "grad_norm": 20060.55859375, | |
| "learning_rate": 2.8616722290532233e-05, | |
| "loss": 0.1303, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.1932027750944059, | |
| "grad_norm": 10244.4052734375, | |
| "learning_rate": 2.855085192341472e-05, | |
| "loss": 0.1413, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.1932027750944059, | |
| "eval_loss": 0.12359971553087234, | |
| "eval_runtime": 175.122, | |
| "eval_samples_per_second": 25.468, | |
| "eval_steps_per_second": 3.186, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.2019847194168789, | |
| "grad_norm": 36993.25, | |
| "learning_rate": 2.848498155629721e-05, | |
| "loss": 0.1275, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.2107666637393519, | |
| "grad_norm": 11102.2646484375, | |
| "learning_rate": 2.8419111189179697e-05, | |
| "loss": 0.1377, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.2107666637393519, | |
| "eval_loss": 0.12276890873908997, | |
| "eval_runtime": 175.1309, | |
| "eval_samples_per_second": 25.467, | |
| "eval_steps_per_second": 3.186, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.21954860806182488, | |
| "grad_norm": 10398.369140625, | |
| "learning_rate": 2.835324082206218e-05, | |
| "loss": 0.1356, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.2283305523842979, | |
| "grad_norm": 14664.177734375, | |
| "learning_rate": 2.828737045494467e-05, | |
| "loss": 0.1309, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.2283305523842979, | |
| "eval_loss": 0.1219501867890358, | |
| "eval_runtime": 174.8703, | |
| "eval_samples_per_second": 25.505, | |
| "eval_steps_per_second": 3.191, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.23711249670677087, | |
| "grad_norm": 9694.1875, | |
| "learning_rate": 2.8221500087827156e-05, | |
| "loss": 0.1271, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.24589444102924388, | |
| "grad_norm": 17376.810546875, | |
| "learning_rate": 2.8155629720709643e-05, | |
| "loss": 0.1434, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.24589444102924388, | |
| "eval_loss": 0.12065327912569046, | |
| "eval_runtime": 174.9734, | |
| "eval_samples_per_second": 25.49, | |
| "eval_steps_per_second": 3.189, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.2546763853517169, | |
| "grad_norm": 13443.2255859375, | |
| "learning_rate": 2.808975935359213e-05, | |
| "loss": 0.1383, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.26345832967418986, | |
| "grad_norm": 10927.8994140625, | |
| "learning_rate": 2.802388898647462e-05, | |
| "loss": 0.125, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.26345832967418986, | |
| "eval_loss": 0.11999432742595673, | |
| "eval_runtime": 174.9084, | |
| "eval_samples_per_second": 25.499, | |
| "eval_steps_per_second": 3.19, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.27224027399666284, | |
| "grad_norm": 9734.189453125, | |
| "learning_rate": 2.7958018619357106e-05, | |
| "loss": 0.1315, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.2810222183191359, | |
| "grad_norm": 11625.3203125, | |
| "learning_rate": 2.7892148252239593e-05, | |
| "loss": 0.1376, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2810222183191359, | |
| "eval_loss": 0.11961102485656738, | |
| "eval_runtime": 175.2567, | |
| "eval_samples_per_second": 25.448, | |
| "eval_steps_per_second": 3.184, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.28980416264160885, | |
| "grad_norm": 10136.365234375, | |
| "learning_rate": 2.782627788512208e-05, | |
| "loss": 0.1245, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.29858610696408183, | |
| "grad_norm": 9877.0, | |
| "learning_rate": 2.776040751800457e-05, | |
| "loss": 0.1366, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.29858610696408183, | |
| "eval_loss": 0.11874815076589584, | |
| "eval_runtime": 175.0761, | |
| "eval_samples_per_second": 25.475, | |
| "eval_steps_per_second": 3.187, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.30736805128655487, | |
| "grad_norm": 13225.7958984375, | |
| "learning_rate": 2.7694537150887056e-05, | |
| "loss": 0.1309, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.31614999560902785, | |
| "grad_norm": 21314.796875, | |
| "learning_rate": 2.762866678376954e-05, | |
| "loss": 0.1257, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.31614999560902785, | |
| "eval_loss": 0.11850052326917648, | |
| "eval_runtime": 174.9576, | |
| "eval_samples_per_second": 25.492, | |
| "eval_steps_per_second": 3.189, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.3249319399315008, | |
| "grad_norm": 11650.2890625, | |
| "learning_rate": 2.756279641665203e-05, | |
| "loss": 0.1339, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.3337138842539738, | |
| "grad_norm": 11948.90625, | |
| "learning_rate": 2.7496926049534516e-05, | |
| "loss": 0.1239, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.3337138842539738, | |
| "eval_loss": 0.1181233748793602, | |
| "eval_runtime": 174.7123, | |
| "eval_samples_per_second": 25.528, | |
| "eval_steps_per_second": 3.194, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.34249582857644684, | |
| "grad_norm": 13416.5234375, | |
| "learning_rate": 2.7431055682417006e-05, | |
| "loss": 0.128, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.3512777728989198, | |
| "grad_norm": 19758.439453125, | |
| "learning_rate": 2.736518531529949e-05, | |
| "loss": 0.1281, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3512777728989198, | |
| "eval_loss": 0.1172366663813591, | |
| "eval_runtime": 175.0951, | |
| "eval_samples_per_second": 25.472, | |
| "eval_steps_per_second": 3.187, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3600597172213928, | |
| "grad_norm": 9282.8935546875, | |
| "learning_rate": 2.729931494818198e-05, | |
| "loss": 0.1247, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.36884166154386583, | |
| "grad_norm": 13347.216796875, | |
| "learning_rate": 2.7233444581064466e-05, | |
| "loss": 0.1256, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.36884166154386583, | |
| "eval_loss": 0.11689984053373337, | |
| "eval_runtime": 175.1847, | |
| "eval_samples_per_second": 25.459, | |
| "eval_steps_per_second": 3.185, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3776236058663388, | |
| "grad_norm": 9560.6240234375, | |
| "learning_rate": 2.7167574213946952e-05, | |
| "loss": 0.1345, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.3864055501888118, | |
| "grad_norm": 11137.072265625, | |
| "learning_rate": 2.710170384682944e-05, | |
| "loss": 0.1173, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3864055501888118, | |
| "eval_loss": 0.11689111590385437, | |
| "eval_runtime": 174.8217, | |
| "eval_samples_per_second": 25.512, | |
| "eval_steps_per_second": 3.192, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3951874945112848, | |
| "grad_norm": 8313.1650390625, | |
| "learning_rate": 2.703583347971193e-05, | |
| "loss": 0.1214, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.4039694388337578, | |
| "grad_norm": 14535.1669921875, | |
| "learning_rate": 2.6969963112594416e-05, | |
| "loss": 0.1291, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.4039694388337578, | |
| "eval_loss": 0.11614538729190826, | |
| "eval_runtime": 174.9499, | |
| "eval_samples_per_second": 25.493, | |
| "eval_steps_per_second": 3.189, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.4127513831562308, | |
| "grad_norm": 14502.2197265625, | |
| "learning_rate": 2.6904092745476902e-05, | |
| "loss": 0.1223, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.4215333274787038, | |
| "grad_norm": 13744.8466796875, | |
| "learning_rate": 2.683822237835939e-05, | |
| "loss": 0.1277, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4215333274787038, | |
| "eval_loss": 0.11536002904176712, | |
| "eval_runtime": 175.288, | |
| "eval_samples_per_second": 25.444, | |
| "eval_steps_per_second": 3.183, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4303152718011768, | |
| "grad_norm": 10811.90625, | |
| "learning_rate": 2.6772352011241876e-05, | |
| "loss": 0.1234, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.43909721612364977, | |
| "grad_norm": 10062.30859375, | |
| "learning_rate": 2.6706481644124366e-05, | |
| "loss": 0.1217, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.43909721612364977, | |
| "eval_loss": 0.11480703204870224, | |
| "eval_runtime": 175.079, | |
| "eval_samples_per_second": 25.474, | |
| "eval_steps_per_second": 3.187, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.44787916044612275, | |
| "grad_norm": 9090.5810546875, | |
| "learning_rate": 2.664061127700685e-05, | |
| "loss": 0.1183, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.4566611047685958, | |
| "grad_norm": 13992.6572265625, | |
| "learning_rate": 2.657474090988934e-05, | |
| "loss": 0.1204, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4566611047685958, | |
| "eval_loss": 0.11466159671545029, | |
| "eval_runtime": 175.0561, | |
| "eval_samples_per_second": 25.478, | |
| "eval_steps_per_second": 3.188, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.46544304909106876, | |
| "grad_norm": 10754.3505859375, | |
| "learning_rate": 2.6508870542771825e-05, | |
| "loss": 0.1233, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.47422499341354174, | |
| "grad_norm": 10475.4765625, | |
| "learning_rate": 2.6443000175654315e-05, | |
| "loss": 0.1226, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.47422499341354174, | |
| "eval_loss": 0.11424204707145691, | |
| "eval_runtime": 174.9273, | |
| "eval_samples_per_second": 25.496, | |
| "eval_steps_per_second": 3.19, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4830069377360148, | |
| "grad_norm": 16770.1015625, | |
| "learning_rate": 2.63771298085368e-05, | |
| "loss": 0.1331, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.49178888205848775, | |
| "grad_norm": 193283.25, | |
| "learning_rate": 2.631125944141929e-05, | |
| "loss": 0.1193, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.49178888205848775, | |
| "eval_loss": 0.11391730606555939, | |
| "eval_runtime": 175.0052, | |
| "eval_samples_per_second": 25.485, | |
| "eval_steps_per_second": 3.188, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.5005708263809607, | |
| "grad_norm": 8650.1865234375, | |
| "learning_rate": 2.6245389074301775e-05, | |
| "loss": 0.1307, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.5093527707034338, | |
| "grad_norm": 11343.3427734375, | |
| "learning_rate": 2.6179518707184262e-05, | |
| "loss": 0.1173, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5093527707034338, | |
| "eval_loss": 0.11382684111595154, | |
| "eval_runtime": 175.1185, | |
| "eval_samples_per_second": 25.468, | |
| "eval_steps_per_second": 3.186, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5181347150259067, | |
| "grad_norm": 9844.486328125, | |
| "learning_rate": 2.611364834006675e-05, | |
| "loss": 0.1229, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.5269166593483797, | |
| "grad_norm": 8915.7255859375, | |
| "learning_rate": 2.6047777972949235e-05, | |
| "loss": 0.125, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5269166593483797, | |
| "eval_loss": 0.11349175125360489, | |
| "eval_runtime": 175.2134, | |
| "eval_samples_per_second": 25.455, | |
| "eval_steps_per_second": 3.185, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5356986036708528, | |
| "grad_norm": 9072.705078125, | |
| "learning_rate": 2.5981907605831725e-05, | |
| "loss": 0.1249, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.5444805479933257, | |
| "grad_norm": 11936.0400390625, | |
| "learning_rate": 2.591603723871421e-05, | |
| "loss": 0.1205, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5444805479933257, | |
| "eval_loss": 0.11320458352565765, | |
| "eval_runtime": 174.8633, | |
| "eval_samples_per_second": 25.506, | |
| "eval_steps_per_second": 3.191, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5532624923157987, | |
| "grad_norm": 11488.6748046875, | |
| "learning_rate": 2.58501668715967e-05, | |
| "loss": 0.1202, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.5620444366382717, | |
| "grad_norm": 12126.583984375, | |
| "learning_rate": 2.5784296504479185e-05, | |
| "loss": 0.12, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5620444366382717, | |
| "eval_loss": 0.11316446959972382, | |
| "eval_runtime": 174.862, | |
| "eval_samples_per_second": 25.506, | |
| "eval_steps_per_second": 3.191, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5708263809607447, | |
| "grad_norm": 14138.3876953125, | |
| "learning_rate": 2.5718426137361675e-05, | |
| "loss": 0.1272, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.5796083252832177, | |
| "grad_norm": 6703.96728515625, | |
| "learning_rate": 2.5652555770244158e-05, | |
| "loss": 0.1245, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5796083252832177, | |
| "eval_loss": 0.11248895525932312, | |
| "eval_runtime": 175.0938, | |
| "eval_samples_per_second": 25.472, | |
| "eval_steps_per_second": 3.187, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5883902696056907, | |
| "grad_norm": 14042.6962890625, | |
| "learning_rate": 2.5586685403126648e-05, | |
| "loss": 0.1135, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5971722139281637, | |
| "grad_norm": 11223.4375, | |
| "learning_rate": 2.5520815036009135e-05, | |
| "loss": 0.1192, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5971722139281637, | |
| "eval_loss": 0.11230960488319397, | |
| "eval_runtime": 175.0029, | |
| "eval_samples_per_second": 25.485, | |
| "eval_steps_per_second": 3.189, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.6059541582506367, | |
| "grad_norm": 10186.056640625, | |
| "learning_rate": 2.5454944668891625e-05, | |
| "loss": 0.1133, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.6147361025731097, | |
| "grad_norm": 11724.2939453125, | |
| "learning_rate": 2.5389074301774108e-05, | |
| "loss": 0.1191, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.6147361025731097, | |
| "eval_loss": 0.11222900450229645, | |
| "eval_runtime": 174.8917, | |
| "eval_samples_per_second": 25.501, | |
| "eval_steps_per_second": 3.191, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.6235180468955827, | |
| "grad_norm": 13531.3447265625, | |
| "learning_rate": 2.5323203934656598e-05, | |
| "loss": 0.1178, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.6322999912180557, | |
| "grad_norm": 10695.3486328125, | |
| "learning_rate": 2.5257333567539085e-05, | |
| "loss": 0.1175, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.6322999912180557, | |
| "eval_loss": 0.11206092685461044, | |
| "eval_runtime": 174.9921, | |
| "eval_samples_per_second": 25.487, | |
| "eval_steps_per_second": 3.189, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.6410819355405287, | |
| "grad_norm": 9945.763671875, | |
| "learning_rate": 2.5191463200421568e-05, | |
| "loss": 0.1122, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.6498638798630016, | |
| "grad_norm": 21472.962890625, | |
| "learning_rate": 2.5125592833304058e-05, | |
| "loss": 0.1179, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6498638798630016, | |
| "eval_loss": 0.11144611984491348, | |
| "eval_runtime": 174.9363, | |
| "eval_samples_per_second": 25.495, | |
| "eval_steps_per_second": 3.19, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6586458241854747, | |
| "grad_norm": 10160.6298828125, | |
| "learning_rate": 2.5059722466186545e-05, | |
| "loss": 0.1148, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.6674277685079476, | |
| "grad_norm": 13687.66015625, | |
| "learning_rate": 2.4993852099069035e-05, | |
| "loss": 0.1166, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6674277685079476, | |
| "eval_loss": 0.1111949160695076, | |
| "eval_runtime": 174.9122, | |
| "eval_samples_per_second": 25.499, | |
| "eval_steps_per_second": 3.19, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6762097128304206, | |
| "grad_norm": 9961.3818359375, | |
| "learning_rate": 2.4927981731951518e-05, | |
| "loss": 0.12, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.6849916571528937, | |
| "grad_norm": 11952.0546875, | |
| "learning_rate": 2.4862111364834008e-05, | |
| "loss": 0.1227, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6849916571528937, | |
| "eval_loss": 0.11115138977766037, | |
| "eval_runtime": 175.0741, | |
| "eval_samples_per_second": 25.475, | |
| "eval_steps_per_second": 3.187, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6937736014753666, | |
| "grad_norm": 7658.37255859375, | |
| "learning_rate": 2.4796240997716494e-05, | |
| "loss": 0.1178, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.7025555457978396, | |
| "grad_norm": 9078.0087890625, | |
| "learning_rate": 2.4730370630598984e-05, | |
| "loss": 0.1079, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7025555457978396, | |
| "eval_loss": 0.11094118654727936, | |
| "eval_runtime": 174.8603, | |
| "eval_samples_per_second": 25.506, | |
| "eval_steps_per_second": 3.191, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7113374901203127, | |
| "grad_norm": 10273.943359375, | |
| "learning_rate": 2.4664500263481468e-05, | |
| "loss": 0.1122, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.7201194344427856, | |
| "grad_norm": 9615.3408203125, | |
| "learning_rate": 2.4598629896363958e-05, | |
| "loss": 0.1178, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.7201194344427856, | |
| "eval_loss": 0.11066293716430664, | |
| "eval_runtime": 176.4782, | |
| "eval_samples_per_second": 25.272, | |
| "eval_steps_per_second": 3.162, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.7289013787652586, | |
| "grad_norm": 9801.9638671875, | |
| "learning_rate": 2.4532759529246444e-05, | |
| "loss": 0.1235, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.7376833230877317, | |
| "grad_norm": 14902.2216796875, | |
| "learning_rate": 2.446688916212893e-05, | |
| "loss": 0.1178, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7376833230877317, | |
| "eval_loss": 0.1100853979587555, | |
| "eval_runtime": 176.5072, | |
| "eval_samples_per_second": 25.268, | |
| "eval_steps_per_second": 3.161, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7464652674102046, | |
| "grad_norm": 11338.7451171875, | |
| "learning_rate": 2.4401018795011417e-05, | |
| "loss": 0.1139, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.7552472117326776, | |
| "grad_norm": 27772.08203125, | |
| "learning_rate": 2.4335148427893904e-05, | |
| "loss": 0.1165, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7552472117326776, | |
| "eval_loss": 0.11026333272457123, | |
| "eval_runtime": 176.5703, | |
| "eval_samples_per_second": 25.259, | |
| "eval_steps_per_second": 3.16, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7640291560551506, | |
| "grad_norm": 15169.2919921875, | |
| "learning_rate": 2.4269278060776394e-05, | |
| "loss": 0.1223, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.7728111003776236, | |
| "grad_norm": 9459.4482421875, | |
| "learning_rate": 2.4203407693658877e-05, | |
| "loss": 0.1139, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7728111003776236, | |
| "eval_loss": 0.11010286957025528, | |
| "eval_runtime": 176.8628, | |
| "eval_samples_per_second": 25.217, | |
| "eval_steps_per_second": 3.155, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7815930447000966, | |
| "grad_norm": 10122.501953125, | |
| "learning_rate": 2.4137537326541367e-05, | |
| "loss": 0.1143, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.7903749890225696, | |
| "grad_norm": 8584.2705078125, | |
| "learning_rate": 2.4071666959423854e-05, | |
| "loss": 0.1104, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7903749890225696, | |
| "eval_loss": 0.11007058620452881, | |
| "eval_runtime": 176.6787, | |
| "eval_samples_per_second": 25.244, | |
| "eval_steps_per_second": 3.158, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.7991569333450426, | |
| "grad_norm": 18708.171875, | |
| "learning_rate": 2.4005796592306344e-05, | |
| "loss": 0.1165, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.8079388776675156, | |
| "grad_norm": 7859.576171875, | |
| "learning_rate": 2.3939926225188827e-05, | |
| "loss": 0.1147, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.8079388776675156, | |
| "eval_loss": 0.10995937138795853, | |
| "eval_runtime": 176.8489, | |
| "eval_samples_per_second": 25.219, | |
| "eval_steps_per_second": 3.155, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.8167208219899886, | |
| "grad_norm": 27483.9140625, | |
| "learning_rate": 2.3874055858071317e-05, | |
| "loss": 0.1224, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.8255027663124616, | |
| "grad_norm": 8125.94580078125, | |
| "learning_rate": 2.3808185490953804e-05, | |
| "loss": 0.1112, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.8255027663124616, | |
| "eval_loss": 0.10972581803798676, | |
| "eval_runtime": 176.1825, | |
| "eval_samples_per_second": 25.315, | |
| "eval_steps_per_second": 3.167, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.8342847106349346, | |
| "grad_norm": 20998.330078125, | |
| "learning_rate": 2.3742315123836294e-05, | |
| "loss": 0.1213, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.8430666549574076, | |
| "grad_norm": 7832.6513671875, | |
| "learning_rate": 2.3676444756718777e-05, | |
| "loss": 0.1163, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.8430666549574076, | |
| "eval_loss": 0.10929498076438904, | |
| "eval_runtime": 175.008, | |
| "eval_samples_per_second": 25.485, | |
| "eval_steps_per_second": 3.188, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.8518485992798805, | |
| "grad_norm": 10396.4267578125, | |
| "learning_rate": 2.3610574389601264e-05, | |
| "loss": 0.1056, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.8606305436023536, | |
| "grad_norm": 10345.142578125, | |
| "learning_rate": 2.3544704022483754e-05, | |
| "loss": 0.1131, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.8606305436023536, | |
| "eval_loss": 0.10942210257053375, | |
| "eval_runtime": 175.0447, | |
| "eval_samples_per_second": 25.479, | |
| "eval_steps_per_second": 3.188, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.8694124879248265, | |
| "grad_norm": 7617.70703125, | |
| "learning_rate": 2.3478833655366237e-05, | |
| "loss": 0.1033, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.8781944322472995, | |
| "grad_norm": 8898.314453125, | |
| "learning_rate": 2.3412963288248727e-05, | |
| "loss": 0.1168, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8781944322472995, | |
| "eval_loss": 0.10880845785140991, | |
| "eval_runtime": 175.2118, | |
| "eval_samples_per_second": 25.455, | |
| "eval_steps_per_second": 3.185, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8869763765697726, | |
| "grad_norm": 12492.4248046875, | |
| "learning_rate": 2.3347092921131214e-05, | |
| "loss": 0.1151, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.8957583208922455, | |
| "grad_norm": 10120.4833984375, | |
| "learning_rate": 2.3281222554013704e-05, | |
| "loss": 0.1111, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.8957583208922455, | |
| "eval_loss": 0.10875380039215088, | |
| "eval_runtime": 175.9563, | |
| "eval_samples_per_second": 25.347, | |
| "eval_steps_per_second": 3.171, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.9045402652147185, | |
| "grad_norm": 6685.44287109375, | |
| "learning_rate": 2.3215352186896187e-05, | |
| "loss": 0.1114, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.9133222095371916, | |
| "grad_norm": 7174.9296875, | |
| "learning_rate": 2.3149481819778677e-05, | |
| "loss": 0.1212, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.9133222095371916, | |
| "eval_loss": 0.10833785682916641, | |
| "eval_runtime": 176.2427, | |
| "eval_samples_per_second": 25.306, | |
| "eval_steps_per_second": 3.166, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.9221041538596645, | |
| "grad_norm": 12618.1103515625, | |
| "learning_rate": 2.3083611452661163e-05, | |
| "loss": 0.1043, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.9308860981821375, | |
| "grad_norm": 12976.857421875, | |
| "learning_rate": 2.3017741085543653e-05, | |
| "loss": 0.1183, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.9308860981821375, | |
| "eval_loss": 0.10824151337146759, | |
| "eval_runtime": 176.4342, | |
| "eval_samples_per_second": 25.279, | |
| "eval_steps_per_second": 3.163, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.9396680425046106, | |
| "grad_norm": 14113.46875, | |
| "learning_rate": 2.2951870718426137e-05, | |
| "loss": 0.1166, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.9484499868270835, | |
| "grad_norm": 14832.5478515625, | |
| "learning_rate": 2.2886000351308623e-05, | |
| "loss": 0.122, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9484499868270835, | |
| "eval_loss": 0.10793043673038483, | |
| "eval_runtime": 175.8031, | |
| "eval_samples_per_second": 25.369, | |
| "eval_steps_per_second": 3.174, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9572319311495565, | |
| "grad_norm": 11024.25390625, | |
| "learning_rate": 2.2820129984191113e-05, | |
| "loss": 0.1156, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.9660138754720295, | |
| "grad_norm": 13093.2275390625, | |
| "learning_rate": 2.27542596170736e-05, | |
| "loss": 0.1154, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9660138754720295, | |
| "eval_loss": 0.10771273821592331, | |
| "eval_runtime": 176.0575, | |
| "eval_samples_per_second": 25.333, | |
| "eval_steps_per_second": 3.169, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9747958197945025, | |
| "grad_norm": 18551.283203125, | |
| "learning_rate": 2.2688389249956087e-05, | |
| "loss": 0.1112, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.9835777641169755, | |
| "grad_norm": 16615.962890625, | |
| "learning_rate": 2.2622518882838573e-05, | |
| "loss": 0.1103, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.9835777641169755, | |
| "eval_loss": 0.10789492726325989, | |
| "eval_runtime": 176.2712, | |
| "eval_samples_per_second": 25.302, | |
| "eval_steps_per_second": 3.166, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.9923597084394485, | |
| "grad_norm": 10686.6455078125, | |
| "learning_rate": 2.2556648515721063e-05, | |
| "loss": 0.1134, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.0011416527619215, | |
| "grad_norm": 8200.20703125, | |
| "learning_rate": 2.2490778148603546e-05, | |
| "loss": 0.1085, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.0011416527619215, | |
| "eval_loss": 0.10795657336711884, | |
| "eval_runtime": 175.8416, | |
| "eval_samples_per_second": 25.364, | |
| "eval_steps_per_second": 3.173, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.0099235970843945, | |
| "grad_norm": 12689.986328125, | |
| "learning_rate": 2.2424907781486036e-05, | |
| "loss": 0.1051, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.0187055414068675, | |
| "grad_norm": 14943.79296875, | |
| "learning_rate": 2.2359037414368523e-05, | |
| "loss": 0.1086, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.0187055414068675, | |
| "eval_loss": 0.10745207220315933, | |
| "eval_runtime": 176.5973, | |
| "eval_samples_per_second": 25.255, | |
| "eval_steps_per_second": 3.16, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.0274874857293406, | |
| "grad_norm": 9522.888671875, | |
| "learning_rate": 2.2293167047251013e-05, | |
| "loss": 0.1163, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.0362694300518134, | |
| "grad_norm": 10599.6796875, | |
| "learning_rate": 2.2227296680133496e-05, | |
| "loss": 0.1048, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.0362694300518134, | |
| "eval_loss": 0.10774970054626465, | |
| "eval_runtime": 175.7784, | |
| "eval_samples_per_second": 25.373, | |
| "eval_steps_per_second": 3.174, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.0450513743742864, | |
| "grad_norm": 11812.033203125, | |
| "learning_rate": 2.2161426313015986e-05, | |
| "loss": 0.1103, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.0538333186967594, | |
| "grad_norm": 14652.2265625, | |
| "learning_rate": 2.2095555945898473e-05, | |
| "loss": 0.1059, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.0538333186967594, | |
| "eval_loss": 0.10774527490139008, | |
| "eval_runtime": 175.7637, | |
| "eval_samples_per_second": 25.375, | |
| "eval_steps_per_second": 3.175, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.0626152630192325, | |
| "grad_norm": 12538.1015625, | |
| "learning_rate": 2.202968557878096e-05, | |
| "loss": 0.102, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.0713972073417055, | |
| "grad_norm": 12819.8525390625, | |
| "learning_rate": 2.1963815211663446e-05, | |
| "loss": 0.1066, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.0713972073417055, | |
| "eval_loss": 0.1075778678059578, | |
| "eval_runtime": 176.0587, | |
| "eval_samples_per_second": 25.332, | |
| "eval_steps_per_second": 3.169, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.0801791516641785, | |
| "grad_norm": 12523.4482421875, | |
| "learning_rate": 2.1897944844545933e-05, | |
| "loss": 0.1052, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.0889610959866514, | |
| "grad_norm": 12486.875, | |
| "learning_rate": 2.1832074477428423e-05, | |
| "loss": 0.1186, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.0889610959866514, | |
| "eval_loss": 0.10704567283391953, | |
| "eval_runtime": 176.4421, | |
| "eval_samples_per_second": 25.277, | |
| "eval_steps_per_second": 3.163, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.0977430403091244, | |
| "grad_norm": 9313.5810546875, | |
| "learning_rate": 2.176620411031091e-05, | |
| "loss": 0.1096, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.1065249846315974, | |
| "grad_norm": 11183.017578125, | |
| "learning_rate": 2.1700333743193396e-05, | |
| "loss": 0.0989, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.1065249846315974, | |
| "eval_loss": 0.10751615464687347, | |
| "eval_runtime": 176.3617, | |
| "eval_samples_per_second": 25.289, | |
| "eval_steps_per_second": 3.164, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.1153069289540705, | |
| "grad_norm": 6714.72705078125, | |
| "learning_rate": 2.1634463376075883e-05, | |
| "loss": 0.1104, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.1240888732765435, | |
| "grad_norm": 10059.513671875, | |
| "learning_rate": 2.1568593008958373e-05, | |
| "loss": 0.1057, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.1240888732765435, | |
| "eval_loss": 0.10703834146261215, | |
| "eval_runtime": 176.3406, | |
| "eval_samples_per_second": 25.292, | |
| "eval_steps_per_second": 3.164, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.1328708175990165, | |
| "grad_norm": 11930.1201171875, | |
| "learning_rate": 2.1502722641840856e-05, | |
| "loss": 0.1128, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.1416527619214893, | |
| "grad_norm": 18080.43359375, | |
| "learning_rate": 2.1436852274723346e-05, | |
| "loss": 0.1043, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.1416527619214893, | |
| "eval_loss": 0.10702774673700333, | |
| "eval_runtime": 176.6705, | |
| "eval_samples_per_second": 25.245, | |
| "eval_steps_per_second": 3.158, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.1504347062439624, | |
| "grad_norm": 9161.2529296875, | |
| "learning_rate": 2.1370981907605832e-05, | |
| "loss": 0.1037, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.1592166505664354, | |
| "grad_norm": 9676.08203125, | |
| "learning_rate": 2.130511154048832e-05, | |
| "loss": 0.1113, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.1592166505664354, | |
| "eval_loss": 0.10694678127765656, | |
| "eval_runtime": 176.4583, | |
| "eval_samples_per_second": 25.275, | |
| "eval_steps_per_second": 3.162, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.1679985948889084, | |
| "grad_norm": 10895.8876953125, | |
| "learning_rate": 2.1239241173370806e-05, | |
| "loss": 0.1029, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.1767805392113815, | |
| "grad_norm": 10269.076171875, | |
| "learning_rate": 2.1173370806253292e-05, | |
| "loss": 0.1086, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.1767805392113815, | |
| "eval_loss": 0.10676951706409454, | |
| "eval_runtime": 176.5328, | |
| "eval_samples_per_second": 25.264, | |
| "eval_steps_per_second": 3.161, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.1855624835338543, | |
| "grad_norm": 9631.9169921875, | |
| "learning_rate": 2.1107500439135782e-05, | |
| "loss": 0.111, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.1943444278563273, | |
| "grad_norm": 8764.7451171875, | |
| "learning_rate": 2.104163007201827e-05, | |
| "loss": 0.1066, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.1943444278563273, | |
| "eval_loss": 0.10679937154054642, | |
| "eval_runtime": 176.7169, | |
| "eval_samples_per_second": 25.238, | |
| "eval_steps_per_second": 3.158, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.2031263721788004, | |
| "grad_norm": 12220.5439453125, | |
| "learning_rate": 2.0975759704900756e-05, | |
| "loss": 0.1106, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.2119083165012734, | |
| "grad_norm": 13508.0205078125, | |
| "learning_rate": 2.0909889337783242e-05, | |
| "loss": 0.1115, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.2119083165012734, | |
| "eval_loss": 0.10662820190191269, | |
| "eval_runtime": 176.4182, | |
| "eval_samples_per_second": 25.281, | |
| "eval_steps_per_second": 3.163, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.2206902608237464, | |
| "grad_norm": 11431.49609375, | |
| "learning_rate": 2.0844018970665732e-05, | |
| "loss": 0.1042, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.2294722051462195, | |
| "grad_norm": 8914.7119140625, | |
| "learning_rate": 2.077814860354822e-05, | |
| "loss": 0.1037, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.2294722051462195, | |
| "eval_loss": 0.10637149214744568, | |
| "eval_runtime": 176.33, | |
| "eval_samples_per_second": 25.293, | |
| "eval_steps_per_second": 3.165, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.2382541494686923, | |
| "grad_norm": 32394.365234375, | |
| "learning_rate": 2.0712278236430705e-05, | |
| "loss": 0.1082, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.2470360937911653, | |
| "grad_norm": 13041.302734375, | |
| "learning_rate": 2.0646407869313192e-05, | |
| "loss": 0.1118, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.2470360937911653, | |
| "eval_loss": 0.10657413303852081, | |
| "eval_runtime": 176.3827, | |
| "eval_samples_per_second": 25.286, | |
| "eval_steps_per_second": 3.164, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.2558180381136383, | |
| "grad_norm": 9132.3310546875, | |
| "learning_rate": 2.0580537502195682e-05, | |
| "loss": 0.103, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.2645999824361114, | |
| "grad_norm": 18166.306640625, | |
| "learning_rate": 2.0514667135078165e-05, | |
| "loss": 0.1108, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.2645999824361114, | |
| "eval_loss": 0.10623560100793839, | |
| "eval_runtime": 176.2458, | |
| "eval_samples_per_second": 25.306, | |
| "eval_steps_per_second": 3.166, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.2733819267585844, | |
| "grad_norm": 12996.111328125, | |
| "learning_rate": 2.0448796767960652e-05, | |
| "loss": 0.1019, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.2821638710810572, | |
| "grad_norm": 9010.8212890625, | |
| "learning_rate": 2.0382926400843142e-05, | |
| "loss": 0.1074, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.2821638710810572, | |
| "eval_loss": 0.10622620582580566, | |
| "eval_runtime": 176.3743, | |
| "eval_samples_per_second": 25.287, | |
| "eval_steps_per_second": 3.164, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.2909458154035303, | |
| "grad_norm": 14462.72265625, | |
| "learning_rate": 2.031705603372563e-05, | |
| "loss": 0.1104, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.2997277597260033, | |
| "grad_norm": 9121.498046875, | |
| "learning_rate": 2.0251185666608115e-05, | |
| "loss": 0.1141, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.2997277597260033, | |
| "eval_loss": 0.10618162155151367, | |
| "eval_runtime": 176.2451, | |
| "eval_samples_per_second": 25.306, | |
| "eval_steps_per_second": 3.166, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.3085097040484763, | |
| "grad_norm": 13231.1484375, | |
| "learning_rate": 2.0185315299490602e-05, | |
| "loss": 0.1106, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.3172916483709494, | |
| "grad_norm": 8816.541015625, | |
| "learning_rate": 2.0119444932373092e-05, | |
| "loss": 0.1065, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.3172916483709494, | |
| "eval_loss": 0.1062735840678215, | |
| "eval_runtime": 176.3513, | |
| "eval_samples_per_second": 25.29, | |
| "eval_steps_per_second": 3.164, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.3260735926934224, | |
| "grad_norm": 9924.986328125, | |
| "learning_rate": 2.005357456525558e-05, | |
| "loss": 0.1018, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.3348555370158954, | |
| "grad_norm": 14466.806640625, | |
| "learning_rate": 1.9987704198138065e-05, | |
| "loss": 0.1065, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.3348555370158954, | |
| "eval_loss": 0.105972521007061, | |
| "eval_runtime": 176.3022, | |
| "eval_samples_per_second": 25.297, | |
| "eval_steps_per_second": 3.165, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.3436374813383682, | |
| "grad_norm": 13860.5234375, | |
| "learning_rate": 1.992183383102055e-05, | |
| "loss": 0.1096, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.3524194256608413, | |
| "grad_norm": 9354.7333984375, | |
| "learning_rate": 1.985596346390304e-05, | |
| "loss": 0.1014, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.3524194256608413, | |
| "eval_loss": 0.10594488680362701, | |
| "eval_runtime": 176.4418, | |
| "eval_samples_per_second": 25.277, | |
| "eval_steps_per_second": 3.163, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.3612013699833143, | |
| "grad_norm": 9179.8173828125, | |
| "learning_rate": 1.9790093096785525e-05, | |
| "loss": 0.0998, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.3699833143057873, | |
| "grad_norm": 6730.38134765625, | |
| "learning_rate": 1.972422272966801e-05, | |
| "loss": 0.1029, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.3699833143057873, | |
| "eval_loss": 0.10599970072507858, | |
| "eval_runtime": 176.3495, | |
| "eval_samples_per_second": 25.291, | |
| "eval_steps_per_second": 3.164, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.3787652586282602, | |
| "grad_norm": 9362.3427734375, | |
| "learning_rate": 1.96583523625505e-05, | |
| "loss": 0.1092, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.3875472029507332, | |
| "grad_norm": 10184.53125, | |
| "learning_rate": 1.9592481995432988e-05, | |
| "loss": 0.1058, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.3875472029507332, | |
| "eval_loss": 0.1059907078742981, | |
| "eval_runtime": 176.2866, | |
| "eval_samples_per_second": 25.3, | |
| "eval_steps_per_second": 3.165, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.3963291472732062, | |
| "grad_norm": 11201.806640625, | |
| "learning_rate": 1.9526611628315475e-05, | |
| "loss": 0.1143, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.4051110915956793, | |
| "grad_norm": 6871.8662109375, | |
| "learning_rate": 1.946074126119796e-05, | |
| "loss": 0.1009, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.4051110915956793, | |
| "eval_loss": 0.10532288253307343, | |
| "eval_runtime": 176.6701, | |
| "eval_samples_per_second": 25.245, | |
| "eval_steps_per_second": 3.158, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.4138930359181523, | |
| "grad_norm": 9399.228515625, | |
| "learning_rate": 1.939487089408045e-05, | |
| "loss": 0.1049, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.4226749802406253, | |
| "grad_norm": 19030.01953125, | |
| "learning_rate": 1.9329000526962938e-05, | |
| "loss": 0.1074, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.4226749802406253, | |
| "eval_loss": 0.10543525218963623, | |
| "eval_runtime": 176.5483, | |
| "eval_samples_per_second": 25.262, | |
| "eval_steps_per_second": 3.161, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.4314569245630984, | |
| "grad_norm": 11648.6005859375, | |
| "learning_rate": 1.9263130159845425e-05, | |
| "loss": 0.0996, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.4402388688855714, | |
| "grad_norm": 7726.56494140625, | |
| "learning_rate": 1.919725979272791e-05, | |
| "loss": 0.104, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.4402388688855714, | |
| "eval_loss": 0.10535960644483566, | |
| "eval_runtime": 176.4434, | |
| "eval_samples_per_second": 25.277, | |
| "eval_steps_per_second": 3.162, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.4490208132080442, | |
| "grad_norm": 9487.5029296875, | |
| "learning_rate": 1.91313894256104e-05, | |
| "loss": 0.1026, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.4578027575305172, | |
| "grad_norm": 8190.107421875, | |
| "learning_rate": 1.9065519058492888e-05, | |
| "loss": 0.1058, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.4578027575305172, | |
| "eval_loss": 0.10518208146095276, | |
| "eval_runtime": 175.766, | |
| "eval_samples_per_second": 25.375, | |
| "eval_steps_per_second": 3.175, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.4665847018529903, | |
| "grad_norm": 15317.1435546875, | |
| "learning_rate": 1.8999648691375374e-05, | |
| "loss": 0.1107, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.4753666461754633, | |
| "grad_norm": 8872.7119140625, | |
| "learning_rate": 1.893377832425786e-05, | |
| "loss": 0.1039, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.4753666461754633, | |
| "eval_loss": 0.10524547100067139, | |
| "eval_runtime": 175.2642, | |
| "eval_samples_per_second": 25.447, | |
| "eval_steps_per_second": 3.184, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.4841485904979361, | |
| "grad_norm": 12482.90234375, | |
| "learning_rate": 1.8867907957140348e-05, | |
| "loss": 0.1038, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.4929305348204092, | |
| "grad_norm": 12325.9970703125, | |
| "learning_rate": 1.8802037590022834e-05, | |
| "loss": 0.1113, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.4929305348204092, | |
| "eval_loss": 0.10507776588201523, | |
| "eval_runtime": 175.5813, | |
| "eval_samples_per_second": 25.401, | |
| "eval_steps_per_second": 3.178, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.5017124791428822, | |
| "grad_norm": 9354.1494140625, | |
| "learning_rate": 1.873616722290532e-05, | |
| "loss": 0.1087, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.5104944234653552, | |
| "grad_norm": 171124.34375, | |
| "learning_rate": 1.867029685578781e-05, | |
| "loss": 0.1106, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.5104944234653552, | |
| "eval_loss": 0.10511680692434311, | |
| "eval_runtime": 175.6013, | |
| "eval_samples_per_second": 25.398, | |
| "eval_steps_per_second": 3.178, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.5192763677878283, | |
| "grad_norm": 10542.4892578125, | |
| "learning_rate": 1.8604426488670297e-05, | |
| "loss": 0.1042, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.5280583121103013, | |
| "grad_norm": 8730.197265625, | |
| "learning_rate": 1.8538556121552784e-05, | |
| "loss": 0.1028, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.5280583121103013, | |
| "eval_loss": 0.10496073216199875, | |
| "eval_runtime": 175.5259, | |
| "eval_samples_per_second": 25.409, | |
| "eval_steps_per_second": 3.179, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.5368402564327743, | |
| "grad_norm": 22947.765625, | |
| "learning_rate": 1.847268575443527e-05, | |
| "loss": 0.1106, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.5456222007552474, | |
| "grad_norm": 12794.203125, | |
| "learning_rate": 1.840681538731776e-05, | |
| "loss": 0.1027, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.5456222007552474, | |
| "eval_loss": 0.10489310324192047, | |
| "eval_runtime": 175.3848, | |
| "eval_samples_per_second": 25.43, | |
| "eval_steps_per_second": 3.182, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.5544041450777202, | |
| "grad_norm": 9543.232421875, | |
| "learning_rate": 1.8340945020200247e-05, | |
| "loss": 0.107, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.5631860894001932, | |
| "grad_norm": 7341.599609375, | |
| "learning_rate": 1.8275074653082734e-05, | |
| "loss": 0.0986, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.5631860894001932, | |
| "eval_loss": 0.10493362694978714, | |
| "eval_runtime": 175.8527, | |
| "eval_samples_per_second": 25.362, | |
| "eval_steps_per_second": 3.173, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.5719680337226662, | |
| "grad_norm": 9900.4501953125, | |
| "learning_rate": 1.820920428596522e-05, | |
| "loss": 0.1101, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.580749978045139, | |
| "grad_norm": 9512.732421875, | |
| "learning_rate": 1.8143333918847707e-05, | |
| "loss": 0.1014, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.580749978045139, | |
| "eval_loss": 0.10463293641805649, | |
| "eval_runtime": 175.3499, | |
| "eval_samples_per_second": 25.435, | |
| "eval_steps_per_second": 3.182, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.589531922367612, | |
| "grad_norm": 9556.822265625, | |
| "learning_rate": 1.8077463551730197e-05, | |
| "loss": 0.1015, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.5983138666900851, | |
| "grad_norm": 9589.861328125, | |
| "learning_rate": 1.801159318461268e-05, | |
| "loss": 0.1054, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.5983138666900851, | |
| "eval_loss": 0.10453452169895172, | |
| "eval_runtime": 175.1783, | |
| "eval_samples_per_second": 25.46, | |
| "eval_steps_per_second": 3.185, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.6070958110125582, | |
| "grad_norm": 10552.5419921875, | |
| "learning_rate": 1.794572281749517e-05, | |
| "loss": 0.109, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.6158777553350312, | |
| "grad_norm": 14104.455078125, | |
| "learning_rate": 1.7879852450377657e-05, | |
| "loss": 0.1037, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.6158777553350312, | |
| "eval_loss": 0.10470784455537796, | |
| "eval_runtime": 175.1306, | |
| "eval_samples_per_second": 25.467, | |
| "eval_steps_per_second": 3.186, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.6246596996575042, | |
| "grad_norm": 10874.2314453125, | |
| "learning_rate": 1.7813982083260144e-05, | |
| "loss": 0.1034, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.6334416439799773, | |
| "grad_norm": 10729.14453125, | |
| "learning_rate": 1.774811171614263e-05, | |
| "loss": 0.1063, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.6334416439799773, | |
| "eval_loss": 0.10444886237382889, | |
| "eval_runtime": 175.0317, | |
| "eval_samples_per_second": 25.481, | |
| "eval_steps_per_second": 3.188, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.6422235883024503, | |
| "grad_norm": 13631.76953125, | |
| "learning_rate": 1.768224134902512e-05, | |
| "loss": 0.1051, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.6510055326249231, | |
| "grad_norm": 10832.173828125, | |
| "learning_rate": 1.7616370981907607e-05, | |
| "loss": 0.1073, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.6510055326249231, | |
| "eval_loss": 0.1044657751917839, | |
| "eval_runtime": 175.9228, | |
| "eval_samples_per_second": 25.352, | |
| "eval_steps_per_second": 3.172, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.6597874769473961, | |
| "grad_norm": 12738.5595703125, | |
| "learning_rate": 1.7550500614790094e-05, | |
| "loss": 0.1032, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.6685694212698692, | |
| "grad_norm": 11901.1611328125, | |
| "learning_rate": 1.748463024767258e-05, | |
| "loss": 0.1033, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.6685694212698692, | |
| "eval_loss": 0.10441263765096664, | |
| "eval_runtime": 176.1189, | |
| "eval_samples_per_second": 25.324, | |
| "eval_steps_per_second": 3.168, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.677351365592342, | |
| "grad_norm": 11122.455078125, | |
| "learning_rate": 1.741875988055507e-05, | |
| "loss": 0.1077, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.686133309914815, | |
| "grad_norm": 11424.095703125, | |
| "learning_rate": 1.7352889513437557e-05, | |
| "loss": 0.1009, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.686133309914815, | |
| "eval_loss": 0.10410206019878387, | |
| "eval_runtime": 175.8515, | |
| "eval_samples_per_second": 25.362, | |
| "eval_steps_per_second": 3.173, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.694915254237288, | |
| "grad_norm": 12446.83203125, | |
| "learning_rate": 1.728701914632004e-05, | |
| "loss": 0.0972, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.703697198559761, | |
| "grad_norm": 6987.2763671875, | |
| "learning_rate": 1.722114877920253e-05, | |
| "loss": 0.1053, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.703697198559761, | |
| "eval_loss": 0.10402592271566391, | |
| "eval_runtime": 176.2025, | |
| "eval_samples_per_second": 25.312, | |
| "eval_steps_per_second": 3.167, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.7124791428822341, | |
| "grad_norm": 12312.5595703125, | |
| "learning_rate": 1.7155278412085017e-05, | |
| "loss": 0.1049, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.7212610872047072, | |
| "grad_norm": 12246.89453125, | |
| "learning_rate": 1.7089408044967507e-05, | |
| "loss": 0.1031, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.7212610872047072, | |
| "eval_loss": 0.10409308969974518, | |
| "eval_runtime": 176.2211, | |
| "eval_samples_per_second": 25.309, | |
| "eval_steps_per_second": 3.166, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.7300430315271802, | |
| "grad_norm": 15511.779296875, | |
| "learning_rate": 1.702353767784999e-05, | |
| "loss": 0.0989, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.7388249758496532, | |
| "grad_norm": 11267.55859375, | |
| "learning_rate": 1.695766731073248e-05, | |
| "loss": 0.1051, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.7388249758496532, | |
| "eval_loss": 0.10400799661874771, | |
| "eval_runtime": 175.5444, | |
| "eval_samples_per_second": 25.407, | |
| "eval_steps_per_second": 3.179, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.7476069201721263, | |
| "grad_norm": 13126.3349609375, | |
| "learning_rate": 1.6891796943614966e-05, | |
| "loss": 0.1057, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.756388864494599, | |
| "grad_norm": 11091.8955078125, | |
| "learning_rate": 1.6825926576497453e-05, | |
| "loss": 0.1087, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.756388864494599, | |
| "eval_loss": 0.10402125865221024, | |
| "eval_runtime": 175.4715, | |
| "eval_samples_per_second": 25.417, | |
| "eval_steps_per_second": 3.18, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.7651708088170721, | |
| "grad_norm": 11781.5146484375, | |
| "learning_rate": 1.676005620937994e-05, | |
| "loss": 0.1053, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.7739527531395451, | |
| "grad_norm": 9562.85546875, | |
| "learning_rate": 1.669418584226243e-05, | |
| "loss": 0.102, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.7739527531395451, | |
| "eval_loss": 0.10393357276916504, | |
| "eval_runtime": 175.3102, | |
| "eval_samples_per_second": 25.441, | |
| "eval_steps_per_second": 3.183, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.782734697462018, | |
| "grad_norm": 11889.0390625, | |
| "learning_rate": 1.6628315475144916e-05, | |
| "loss": 0.1068, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.791516641784491, | |
| "grad_norm": 9615.0771484375, | |
| "learning_rate": 1.65624451080274e-05, | |
| "loss": 0.1006, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.791516641784491, | |
| "eval_loss": 0.10415999591350555, | |
| "eval_runtime": 175.3578, | |
| "eval_samples_per_second": 25.434, | |
| "eval_steps_per_second": 3.182, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.800298586106964, | |
| "grad_norm": 11045.595703125, | |
| "learning_rate": 1.649657474090989e-05, | |
| "loss": 0.1099, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.809080530429437, | |
| "grad_norm": 13070.5498046875, | |
| "learning_rate": 1.6430704373792376e-05, | |
| "loss": 0.1113, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.809080530429437, | |
| "eval_loss": 0.10374840348958969, | |
| "eval_runtime": 175.4676, | |
| "eval_samples_per_second": 25.418, | |
| "eval_steps_per_second": 3.18, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.81786247475191, | |
| "grad_norm": 7781.587890625, | |
| "learning_rate": 1.6364834006674866e-05, | |
| "loss": 0.1021, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.8266444190743831, | |
| "grad_norm": 8406.8466796875, | |
| "learning_rate": 1.629896363955735e-05, | |
| "loss": 0.1003, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.8266444190743831, | |
| "eval_loss": 0.10402803868055344, | |
| "eval_runtime": 175.179, | |
| "eval_samples_per_second": 25.46, | |
| "eval_steps_per_second": 3.185, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.8354263633968562, | |
| "grad_norm": 12187.5849609375, | |
| "learning_rate": 1.623309327243984e-05, | |
| "loss": 0.1022, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.8442083077193292, | |
| "grad_norm": 8159.96435546875, | |
| "learning_rate": 1.6167222905322326e-05, | |
| "loss": 0.1066, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.8442083077193292, | |
| "eval_loss": 0.10364160686731339, | |
| "eval_runtime": 175.267, | |
| "eval_samples_per_second": 25.447, | |
| "eval_steps_per_second": 3.184, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.852990252041802, | |
| "grad_norm": 11498.6494140625, | |
| "learning_rate": 1.6101352538204813e-05, | |
| "loss": 0.1037, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.861772196364275, | |
| "grad_norm": 11206.7646484375, | |
| "learning_rate": 1.60354821710873e-05, | |
| "loss": 0.1041, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.861772196364275, | |
| "eval_loss": 0.10378584265708923, | |
| "eval_runtime": 175.3196, | |
| "eval_samples_per_second": 25.439, | |
| "eval_steps_per_second": 3.183, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.870554140686748, | |
| "grad_norm": 13556.3330078125, | |
| "learning_rate": 1.596961180396979e-05, | |
| "loss": 0.0996, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.879336085009221, | |
| "grad_norm": 7290.71240234375, | |
| "learning_rate": 1.5903741436852276e-05, | |
| "loss": 0.1069, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.879336085009221, | |
| "eval_loss": 0.10361269861459732, | |
| "eval_runtime": 175.3025, | |
| "eval_samples_per_second": 25.442, | |
| "eval_steps_per_second": 3.183, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.888118029331694, | |
| "grad_norm": 10433.58203125, | |
| "learning_rate": 1.5837871069734763e-05, | |
| "loss": 0.1039, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.896899973654167, | |
| "grad_norm": 7497.98193359375, | |
| "learning_rate": 1.577200070261725e-05, | |
| "loss": 0.0992, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.896899973654167, | |
| "eval_loss": 0.10356434434652328, | |
| "eval_runtime": 175.2851, | |
| "eval_samples_per_second": 25.444, | |
| "eval_steps_per_second": 3.183, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.90568191797664, | |
| "grad_norm": 7763.9208984375, | |
| "learning_rate": 1.5706130335499736e-05, | |
| "loss": 0.0997, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.914463862299113, | |
| "grad_norm": 8243.4501953125, | |
| "learning_rate": 1.5640259968382226e-05, | |
| "loss": 0.1053, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.914463862299113, | |
| "eval_loss": 0.10348707437515259, | |
| "eval_runtime": 175.331, | |
| "eval_samples_per_second": 25.438, | |
| "eval_steps_per_second": 3.183, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.923245806621586, | |
| "grad_norm": 10855.509765625, | |
| "learning_rate": 1.557438960126471e-05, | |
| "loss": 0.1072, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.932027750944059, | |
| "grad_norm": 11178.33203125, | |
| "learning_rate": 1.55085192341472e-05, | |
| "loss": 0.1074, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.932027750944059, | |
| "eval_loss": 0.1035689190030098, | |
| "eval_runtime": 175.1907, | |
| "eval_samples_per_second": 25.458, | |
| "eval_steps_per_second": 3.185, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.9408096952665321, | |
| "grad_norm": 15381.65625, | |
| "learning_rate": 1.5442648867029686e-05, | |
| "loss": 0.104, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.9495916395890052, | |
| "grad_norm": 11851.7646484375, | |
| "learning_rate": 1.5376778499912176e-05, | |
| "loss": 0.1021, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.9495916395890052, | |
| "eval_loss": 0.1033787652850151, | |
| "eval_runtime": 175.4375, | |
| "eval_samples_per_second": 25.422, | |
| "eval_steps_per_second": 3.181, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.958373583911478, | |
| "grad_norm": 10821.587890625, | |
| "learning_rate": 1.531090813279466e-05, | |
| "loss": 0.105, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.967155528233951, | |
| "grad_norm": 10268.7001953125, | |
| "learning_rate": 1.5245037765677149e-05, | |
| "loss": 0.0995, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.967155528233951, | |
| "eval_loss": 0.1031695231795311, | |
| "eval_runtime": 175.3915, | |
| "eval_samples_per_second": 25.429, | |
| "eval_steps_per_second": 3.181, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.975937472556424, | |
| "grad_norm": 15241.69140625, | |
| "learning_rate": 1.5179167398559635e-05, | |
| "loss": 0.1001, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.9847194168788969, | |
| "grad_norm": 9984.0908203125, | |
| "learning_rate": 1.5113297031442124e-05, | |
| "loss": 0.1051, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.9847194168788969, | |
| "eval_loss": 0.10308495908975601, | |
| "eval_runtime": 175.2586, | |
| "eval_samples_per_second": 25.448, | |
| "eval_steps_per_second": 3.184, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.99350136120137, | |
| "grad_norm": 10116.9619140625, | |
| "learning_rate": 1.5047426664324609e-05, | |
| "loss": 0.102, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 2.002283305523843, | |
| "grad_norm": 13065.00390625, | |
| "learning_rate": 1.4981556297207097e-05, | |
| "loss": 0.1029, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 2.002283305523843, | |
| "eval_loss": 0.10323852300643921, | |
| "eval_runtime": 175.2291, | |
| "eval_samples_per_second": 25.452, | |
| "eval_steps_per_second": 3.184, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 2.011065249846316, | |
| "grad_norm": 10087.5390625, | |
| "learning_rate": 1.4915685930089584e-05, | |
| "loss": 0.1009, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 2.019847194168789, | |
| "grad_norm": 9332.802734375, | |
| "learning_rate": 1.4849815562972072e-05, | |
| "loss": 0.093, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.019847194168789, | |
| "eval_loss": 0.10356967151165009, | |
| "eval_runtime": 175.1976, | |
| "eval_samples_per_second": 25.457, | |
| "eval_steps_per_second": 3.185, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.028629138491262, | |
| "grad_norm": 8534.212890625, | |
| "learning_rate": 1.4783945195854559e-05, | |
| "loss": 0.0969, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 2.037411082813735, | |
| "grad_norm": 10853.4990234375, | |
| "learning_rate": 1.4718074828737047e-05, | |
| "loss": 0.1016, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.037411082813735, | |
| "eval_loss": 0.10368319600820541, | |
| "eval_runtime": 175.2089, | |
| "eval_samples_per_second": 25.455, | |
| "eval_steps_per_second": 3.185, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.046193027136208, | |
| "grad_norm": 10575.98828125, | |
| "learning_rate": 1.4652204461619534e-05, | |
| "loss": 0.0993, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 2.054974971458681, | |
| "grad_norm": 17123.625, | |
| "learning_rate": 1.458633409450202e-05, | |
| "loss": 0.1011, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 2.054974971458681, | |
| "eval_loss": 0.10356248915195465, | |
| "eval_runtime": 175.2585, | |
| "eval_samples_per_second": 25.448, | |
| "eval_steps_per_second": 3.184, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 2.0637569157811537, | |
| "grad_norm": 13095.1728515625, | |
| "learning_rate": 1.4520463727384507e-05, | |
| "loss": 0.0956, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 2.0725388601036268, | |
| "grad_norm": 11280.3291015625, | |
| "learning_rate": 1.4454593360266995e-05, | |
| "loss": 0.0969, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 2.0725388601036268, | |
| "eval_loss": 0.10367120802402496, | |
| "eval_runtime": 175.3325, | |
| "eval_samples_per_second": 25.437, | |
| "eval_steps_per_second": 3.183, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 2.0813208044261, | |
| "grad_norm": 8861.095703125, | |
| "learning_rate": 1.4388722993149482e-05, | |
| "loss": 0.0971, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 2.090102748748573, | |
| "grad_norm": 15480.5634765625, | |
| "learning_rate": 1.432285262603197e-05, | |
| "loss": 0.0977, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 2.090102748748573, | |
| "eval_loss": 0.1037474200129509, | |
| "eval_runtime": 175.4966, | |
| "eval_samples_per_second": 25.414, | |
| "eval_steps_per_second": 3.18, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 2.098884693071046, | |
| "grad_norm": 10802.611328125, | |
| "learning_rate": 1.4256982258914457e-05, | |
| "loss": 0.0979, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 2.107666637393519, | |
| "grad_norm": 7810.14111328125, | |
| "learning_rate": 1.4191111891796943e-05, | |
| "loss": 0.0963, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.107666637393519, | |
| "eval_loss": 0.10353059321641922, | |
| "eval_runtime": 175.3863, | |
| "eval_samples_per_second": 25.43, | |
| "eval_steps_per_second": 3.182, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.116448581715992, | |
| "grad_norm": 10527.5751953125, | |
| "learning_rate": 1.4125241524679432e-05, | |
| "loss": 0.0997, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 2.125230526038465, | |
| "grad_norm": 12505.5380859375, | |
| "learning_rate": 1.4059371157561918e-05, | |
| "loss": 0.0981, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.125230526038465, | |
| "eval_loss": 0.10333307832479477, | |
| "eval_runtime": 175.2906, | |
| "eval_samples_per_second": 25.443, | |
| "eval_steps_per_second": 3.183, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.134012470360938, | |
| "grad_norm": 9851.1923828125, | |
| "learning_rate": 1.3993500790444406e-05, | |
| "loss": 0.1006, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 2.142794414683411, | |
| "grad_norm": 9354.9697265625, | |
| "learning_rate": 1.3927630423326893e-05, | |
| "loss": 0.1032, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.142794414683411, | |
| "eval_loss": 0.10333764553070068, | |
| "eval_runtime": 175.147, | |
| "eval_samples_per_second": 25.464, | |
| "eval_steps_per_second": 3.186, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.151576359005884, | |
| "grad_norm": 7880.865234375, | |
| "learning_rate": 1.3861760056209381e-05, | |
| "loss": 0.0957, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 2.160358303328357, | |
| "grad_norm": 17636.8515625, | |
| "learning_rate": 1.3795889689091866e-05, | |
| "loss": 0.0952, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.160358303328357, | |
| "eval_loss": 0.10335990786552429, | |
| "eval_runtime": 175.3211, | |
| "eval_samples_per_second": 25.439, | |
| "eval_steps_per_second": 3.183, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.1691402476508297, | |
| "grad_norm": 15586.3701171875, | |
| "learning_rate": 1.3730019321974355e-05, | |
| "loss": 0.098, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 2.1779221919733027, | |
| "grad_norm": 11448.01953125, | |
| "learning_rate": 1.3664148954856841e-05, | |
| "loss": 0.0942, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.1779221919733027, | |
| "eval_loss": 0.10320650041103363, | |
| "eval_runtime": 175.2718, | |
| "eval_samples_per_second": 25.446, | |
| "eval_steps_per_second": 3.184, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.1867041362957758, | |
| "grad_norm": 13402.5732421875, | |
| "learning_rate": 1.359827858773933e-05, | |
| "loss": 0.1016, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 2.195486080618249, | |
| "grad_norm": 7308.1123046875, | |
| "learning_rate": 1.3532408220621816e-05, | |
| "loss": 0.0978, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.195486080618249, | |
| "eval_loss": 0.10325244069099426, | |
| "eval_runtime": 175.4131, | |
| "eval_samples_per_second": 25.426, | |
| "eval_steps_per_second": 3.181, | |
| "step": 12500 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 22772, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 6.089152635076608e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |