| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.666666666666667, | |
| "eval_steps": 10, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.13, | |
| "learning_rate": 4.9198396793587176e-05, | |
| "loss": 1.4613, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "eval_loss": 1.749655842781067, | |
| "eval_runtime": 9.1057, | |
| "eval_samples_per_second": 0.988, | |
| "eval_steps_per_second": 0.22, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "learning_rate": 4.829659318637275e-05, | |
| "loss": 0.8073, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "eval_loss": 1.7181354761123657, | |
| "eval_runtime": 9.0405, | |
| "eval_samples_per_second": 0.996, | |
| "eval_steps_per_second": 0.221, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "learning_rate": 4.7294589178356715e-05, | |
| "loss": 0.5993, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 1.6926982402801514, | |
| "eval_runtime": 9.0635, | |
| "eval_samples_per_second": 0.993, | |
| "eval_steps_per_second": 0.221, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "learning_rate": 4.629258517034069e-05, | |
| "loss": 0.5535, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "eval_loss": 1.5878939628601074, | |
| "eval_runtime": 9.0858, | |
| "eval_samples_per_second": 0.991, | |
| "eval_steps_per_second": 0.22, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "learning_rate": 4.529058116232465e-05, | |
| "loss": 0.4914, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "eval_loss": 1.6263775825500488, | |
| "eval_runtime": 9.0441, | |
| "eval_samples_per_second": 0.995, | |
| "eval_steps_per_second": 0.221, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "learning_rate": 4.428857715430862e-05, | |
| "loss": 0.3532, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 1.5376472473144531, | |
| "eval_runtime": 9.0097, | |
| "eval_samples_per_second": 0.999, | |
| "eval_steps_per_second": 0.222, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "learning_rate": 4.3286573146292584e-05, | |
| "loss": 0.3079, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "eval_loss": 1.50021231174469, | |
| "eval_runtime": 9.0392, | |
| "eval_samples_per_second": 0.996, | |
| "eval_steps_per_second": 0.221, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "learning_rate": 4.228456913827655e-05, | |
| "loss": 0.32, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "eval_loss": 1.4650906324386597, | |
| "eval_runtime": 9.0603, | |
| "eval_samples_per_second": 0.993, | |
| "eval_steps_per_second": 0.221, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "learning_rate": 4.128256513026052e-05, | |
| "loss": 0.22, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_loss": 1.5176193714141846, | |
| "eval_runtime": 9.0417, | |
| "eval_samples_per_second": 0.995, | |
| "eval_steps_per_second": 0.221, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "learning_rate": 4.0280561122244495e-05, | |
| "loss": 0.2341, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "eval_loss": 1.4975649118423462, | |
| "eval_runtime": 9.0642, | |
| "eval_samples_per_second": 0.993, | |
| "eval_steps_per_second": 0.221, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "learning_rate": 3.927855711422846e-05, | |
| "loss": 0.2391, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "eval_loss": 1.7257053852081299, | |
| "eval_runtime": 8.9861, | |
| "eval_samples_per_second": 1.002, | |
| "eval_steps_per_second": 0.223, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "learning_rate": 3.8276553106212426e-05, | |
| "loss": 0.1884, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 1.639228105545044, | |
| "eval_runtime": 9.0813, | |
| "eval_samples_per_second": 0.991, | |
| "eval_steps_per_second": 0.22, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "learning_rate": 3.727454909819639e-05, | |
| "loss": 0.1881, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "eval_loss": 1.5887799263000488, | |
| "eval_runtime": 9.08, | |
| "eval_samples_per_second": 0.991, | |
| "eval_steps_per_second": 0.22, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "learning_rate": 3.627254509018036e-05, | |
| "loss": 0.1314, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "eval_loss": 1.6493072509765625, | |
| "eval_runtime": 9.0781, | |
| "eval_samples_per_second": 0.991, | |
| "eval_steps_per_second": 0.22, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "learning_rate": 3.527054108216433e-05, | |
| "loss": 0.1524, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.6057649850845337, | |
| "eval_runtime": 9.0159, | |
| "eval_samples_per_second": 0.998, | |
| "eval_steps_per_second": 0.222, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "learning_rate": 3.42685370741483e-05, | |
| "loss": 0.1036, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "eval_loss": 1.6697440147399902, | |
| "eval_runtime": 8.9746, | |
| "eval_samples_per_second": 1.003, | |
| "eval_steps_per_second": 0.223, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "learning_rate": 3.326653306613227e-05, | |
| "loss": 0.1176, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "eval_loss": 1.6768882274627686, | |
| "eval_runtime": 9.0646, | |
| "eval_samples_per_second": 0.993, | |
| "eval_steps_per_second": 0.221, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "learning_rate": 3.2264529058116233e-05, | |
| "loss": 0.1375, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 1.617688775062561, | |
| "eval_runtime": 9.0499, | |
| "eval_samples_per_second": 0.994, | |
| "eval_steps_per_second": 0.221, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "learning_rate": 3.12625250501002e-05, | |
| "loss": 0.1038, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "eval_loss": 1.6981985569000244, | |
| "eval_runtime": 9.0561, | |
| "eval_samples_per_second": 0.994, | |
| "eval_steps_per_second": 0.221, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "learning_rate": 3.026052104208417e-05, | |
| "loss": 0.1068, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "eval_loss": 1.7827256917953491, | |
| "eval_runtime": 9.0294, | |
| "eval_samples_per_second": 0.997, | |
| "eval_steps_per_second": 0.221, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "learning_rate": 2.925851703406814e-05, | |
| "loss": 0.1229, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_loss": 1.6968226432800293, | |
| "eval_runtime": 9.039, | |
| "eval_samples_per_second": 0.996, | |
| "eval_steps_per_second": 0.221, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "learning_rate": 2.8256513026052106e-05, | |
| "loss": 0.142, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "eval_loss": 1.7085565328598022, | |
| "eval_runtime": 9.0746, | |
| "eval_samples_per_second": 0.992, | |
| "eval_steps_per_second": 0.22, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "learning_rate": 2.7254509018036072e-05, | |
| "loss": 0.0868, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "eval_loss": 1.814492106437683, | |
| "eval_runtime": 9.0805, | |
| "eval_samples_per_second": 0.991, | |
| "eval_steps_per_second": 0.22, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "learning_rate": 2.625250501002004e-05, | |
| "loss": 0.0772, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_loss": 1.916466236114502, | |
| "eval_runtime": 9.072, | |
| "eval_samples_per_second": 0.992, | |
| "eval_steps_per_second": 0.22, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "learning_rate": 2.5250501002004006e-05, | |
| "loss": 0.0644, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "eval_loss": 1.8792904615402222, | |
| "eval_runtime": 9.0616, | |
| "eval_samples_per_second": 0.993, | |
| "eval_steps_per_second": 0.221, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "learning_rate": 2.4248496993987975e-05, | |
| "loss": 0.0886, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "eval_loss": 1.7695975303649902, | |
| "eval_runtime": 9.0109, | |
| "eval_samples_per_second": 0.999, | |
| "eval_steps_per_second": 0.222, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "learning_rate": 2.3246492985971944e-05, | |
| "loss": 0.0807, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "eval_loss": 1.7431364059448242, | |
| "eval_runtime": 9.0009, | |
| "eval_samples_per_second": 1.0, | |
| "eval_steps_per_second": 0.222, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "learning_rate": 2.2244488977955913e-05, | |
| "loss": 0.0873, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "eval_loss": 1.8270233869552612, | |
| "eval_runtime": 9.0218, | |
| "eval_samples_per_second": 0.998, | |
| "eval_steps_per_second": 0.222, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "learning_rate": 2.124248496993988e-05, | |
| "loss": 0.0704, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "eval_loss": 1.8161801099777222, | |
| "eval_runtime": 9.0309, | |
| "eval_samples_per_second": 0.997, | |
| "eval_steps_per_second": 0.221, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "learning_rate": 2.0240480961923848e-05, | |
| "loss": 0.0729, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.8546494245529175, | |
| "eval_runtime": 9.0378, | |
| "eval_samples_per_second": 0.996, | |
| "eval_steps_per_second": 0.221, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "learning_rate": 1.9238476953907817e-05, | |
| "loss": 0.063, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "eval_loss": 1.9666298627853394, | |
| "eval_runtime": 9.008, | |
| "eval_samples_per_second": 0.999, | |
| "eval_steps_per_second": 0.222, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "learning_rate": 1.8236472945891783e-05, | |
| "loss": 0.0541, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "eval_loss": 1.9620505571365356, | |
| "eval_runtime": 9.0643, | |
| "eval_samples_per_second": 0.993, | |
| "eval_steps_per_second": 0.221, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "learning_rate": 1.7234468937875752e-05, | |
| "loss": 0.0648, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "eval_loss": 1.962950348854065, | |
| "eval_runtime": 9.0685, | |
| "eval_samples_per_second": 0.992, | |
| "eval_steps_per_second": 0.221, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "learning_rate": 1.623246492985972e-05, | |
| "loss": 0.0631, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "eval_loss": 1.9116477966308594, | |
| "eval_runtime": 9.0789, | |
| "eval_samples_per_second": 0.991, | |
| "eval_steps_per_second": 0.22, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "learning_rate": 1.523046092184369e-05, | |
| "loss": 0.0545, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "eval_loss": 1.9107557535171509, | |
| "eval_runtime": 9.0172, | |
| "eval_samples_per_second": 0.998, | |
| "eval_steps_per_second": 0.222, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "learning_rate": 1.4228456913827657e-05, | |
| "loss": 0.0622, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "eval_loss": 1.965379238128662, | |
| "eval_runtime": 8.9497, | |
| "eval_samples_per_second": 1.006, | |
| "eval_steps_per_second": 0.223, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "learning_rate": 1.3226452905811623e-05, | |
| "loss": 0.0523, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "eval_loss": 2.0281553268432617, | |
| "eval_runtime": 9.0358, | |
| "eval_samples_per_second": 0.996, | |
| "eval_steps_per_second": 0.221, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 5.07, | |
| "learning_rate": 1.2224448897795592e-05, | |
| "loss": 0.0479, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 5.07, | |
| "eval_loss": 2.065614938735962, | |
| "eval_runtime": 9.0421, | |
| "eval_samples_per_second": 0.995, | |
| "eval_steps_per_second": 0.221, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "learning_rate": 1.122244488977956e-05, | |
| "loss": 0.0373, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "eval_loss": 2.0752174854278564, | |
| "eval_runtime": 9.0688, | |
| "eval_samples_per_second": 0.992, | |
| "eval_steps_per_second": 0.221, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.33, | |
| "learning_rate": 1.0220440881763528e-05, | |
| "loss": 0.0406, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.33, | |
| "eval_loss": 2.0857064723968506, | |
| "eval_runtime": 9.0511, | |
| "eval_samples_per_second": 0.994, | |
| "eval_steps_per_second": 0.221, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.47, | |
| "learning_rate": 9.218436873747496e-06, | |
| "loss": 0.0463, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 5.47, | |
| "eval_loss": 2.118208885192871, | |
| "eval_runtime": 9.0064, | |
| "eval_samples_per_second": 0.999, | |
| "eval_steps_per_second": 0.222, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "learning_rate": 8.216432865731463e-06, | |
| "loss": 0.0433, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "eval_loss": 2.1232292652130127, | |
| "eval_runtime": 9.0773, | |
| "eval_samples_per_second": 0.991, | |
| "eval_steps_per_second": 0.22, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.73, | |
| "learning_rate": 7.214428857715432e-06, | |
| "loss": 0.0425, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 5.73, | |
| "eval_loss": 2.124908924102783, | |
| "eval_runtime": 9.0585, | |
| "eval_samples_per_second": 0.994, | |
| "eval_steps_per_second": 0.221, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 5.87, | |
| "learning_rate": 6.212424849699399e-06, | |
| "loss": 0.0413, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 5.87, | |
| "eval_loss": 2.125481605529785, | |
| "eval_runtime": 9.0338, | |
| "eval_samples_per_second": 0.996, | |
| "eval_steps_per_second": 0.221, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "learning_rate": 5.2104208416833665e-06, | |
| "loss": 0.0579, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 2.1075000762939453, | |
| "eval_runtime": 9.0414, | |
| "eval_samples_per_second": 0.995, | |
| "eval_steps_per_second": 0.221, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 6.13, | |
| "learning_rate": 4.208416833667335e-06, | |
| "loss": 0.0326, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 6.13, | |
| "eval_loss": 2.112290382385254, | |
| "eval_runtime": 8.9941, | |
| "eval_samples_per_second": 1.001, | |
| "eval_steps_per_second": 0.222, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 6.27, | |
| "learning_rate": 3.2064128256513024e-06, | |
| "loss": 0.0378, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 6.27, | |
| "eval_loss": 2.1354782581329346, | |
| "eval_runtime": 9.0542, | |
| "eval_samples_per_second": 0.994, | |
| "eval_steps_per_second": 0.221, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "learning_rate": 2.2044088176352706e-06, | |
| "loss": 0.0384, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "eval_loss": 2.1528055667877197, | |
| "eval_runtime": 9.0296, | |
| "eval_samples_per_second": 0.997, | |
| "eval_steps_per_second": 0.221, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 6.53, | |
| "learning_rate": 1.2024048096192386e-06, | |
| "loss": 0.0328, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 6.53, | |
| "eval_loss": 2.162899971008301, | |
| "eval_runtime": 9.0663, | |
| "eval_samples_per_second": 0.993, | |
| "eval_steps_per_second": 0.221, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 6.67, | |
| "learning_rate": 2.004008016032064e-07, | |
| "loss": 0.0356, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 6.67, | |
| "eval_loss": 2.1663544178009033, | |
| "eval_runtime": 9.0766, | |
| "eval_samples_per_second": 0.992, | |
| "eval_steps_per_second": 0.22, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 7, | |
| "save_steps": 50, | |
| "total_flos": 3.411755398324224e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |