{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 100,
  "global_step": 496,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004032258064516129,
      "grad_norm": 215.24710014620507,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 1.6159,
      "step": 1
    },
    {
      "epoch": 0.020161290322580645,
      "grad_norm": 312.9268127676858,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.4896,
      "step": 5
    },
    {
      "epoch": 0.04032258064516129,
      "grad_norm": 22.150019622688806,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.4524,
      "step": 10
    },
    {
      "epoch": 0.06048387096774194,
      "grad_norm": 6.1881183003709035,
      "learning_rate": 6e-06,
      "loss": 1.2634,
      "step": 15
    },
    {
      "epoch": 0.08064516129032258,
      "grad_norm": 4.300533001219337,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.1845,
      "step": 20
    },
    {
      "epoch": 0.10080645161290322,
      "grad_norm": 3.355604785739065,
      "learning_rate": 1e-05,
      "loss": 1.1828,
      "step": 25
    },
    {
      "epoch": 0.12096774193548387,
      "grad_norm": 3.1071264175265796,
      "learning_rate": 1.2e-05,
      "loss": 1.1377,
      "step": 30
    },
    {
      "epoch": 0.14112903225806453,
      "grad_norm": 3.739675462647032,
      "learning_rate": 1.4e-05,
      "loss": 1.1406,
      "step": 35
    },
    {
      "epoch": 0.16129032258064516,
      "grad_norm": 3.5497603101186597,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.1189,
      "step": 40
    },
    {
      "epoch": 0.1814516129032258,
      "grad_norm": 3.459217793654087,
      "learning_rate": 1.8e-05,
      "loss": 1.1015,
      "step": 45
    },
    {
      "epoch": 0.20161290322580644,
      "grad_norm": 2.5061828560304673,
      "learning_rate": 2e-05,
      "loss": 1.1509,
      "step": 50
    },
    {
      "epoch": 0.2217741935483871,
      "grad_norm": 3.071258628815723,
      "learning_rate": 1.999379852284651e-05,
      "loss": 1.1428,
      "step": 55
    },
    {
      "epoch": 0.24193548387096775,
      "grad_norm": 2.1526518734405915,
      "learning_rate": 1.9975201783049804e-05,
      "loss": 1.0857,
      "step": 60
    },
    {
      "epoch": 0.2620967741935484,
      "grad_norm": 3.1257781855467806,
      "learning_rate": 1.9944232846061284e-05,
      "loss": 1.1007,
      "step": 65
    },
    {
      "epoch": 0.28225806451612906,
      "grad_norm": 2.650156490646313,
      "learning_rate": 1.9900930122511993e-05,
      "loss": 1.1763,
      "step": 70
    },
    {
      "epoch": 0.3024193548387097,
      "grad_norm": 2.222582022247095,
      "learning_rate": 1.984534732057208e-05,
      "loss": 1.0812,
      "step": 75
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 3.755219640137959,
      "learning_rate": 1.977755337933682e-05,
      "loss": 1.1358,
      "step": 80
    },
    {
      "epoch": 0.34274193548387094,
      "grad_norm": 2.9861330009858578,
      "learning_rate": 1.9697632383321755e-05,
      "loss": 1.0906,
      "step": 85
    },
    {
      "epoch": 0.3629032258064516,
      "grad_norm": 4.930667988082775,
      "learning_rate": 1.960568345817306e-05,
      "loss": 1.0903,
      "step": 90
    },
    {
      "epoch": 0.38306451612903225,
      "grad_norm": 2.2881396839609565,
      "learning_rate": 1.9501820647722458e-05,
      "loss": 1.1276,
      "step": 95
    },
    {
      "epoch": 0.4032258064516129,
      "grad_norm": 2.699315706389293,
      "learning_rate": 1.9386172772539162e-05,
      "loss": 1.0391,
      "step": 100
    },
    {
      "epoch": 0.4032258064516129,
      "eval_loss": 1.1010785102844238,
      "eval_runtime": 5.7903,
      "eval_samples_per_second": 68.563,
      "eval_steps_per_second": 2.245,
      "step": 100
    },
    {
      "epoch": 0.42338709677419356,
      "grad_norm": 2.3951607960351318,
      "learning_rate": 1.925888327015434e-05,
      "loss": 1.1294,
      "step": 105
    },
    {
      "epoch": 0.4435483870967742,
      "grad_norm": 2.590506218859771,
      "learning_rate": 1.9120110017156172e-05,
      "loss": 1.1039,
      "step": 110
    },
    {
      "epoch": 0.4637096774193548,
      "grad_norm": 2.3642899736042855,
      "learning_rate": 1.8970025133376252e-05,
      "loss": 1.0845,
      "step": 115
    },
    {
      "epoch": 0.4838709677419355,
      "grad_norm": 2.4194055228848623,
      "learning_rate": 1.8808814768410157e-05,
      "loss": 1.1614,
      "step": 120
    },
    {
      "epoch": 0.5040322580645161,
      "grad_norm": 2.3158075822843545,
      "learning_rate": 1.8636678870736928e-05,
      "loss": 1.0667,
      "step": 125
    },
    {
      "epoch": 0.5241935483870968,
      "grad_norm": 2.2181248045354165,
      "learning_rate": 1.8453830939723913e-05,
      "loss": 1.0932,
      "step": 130
    },
    {
      "epoch": 0.5443548387096774,
      "grad_norm": 2.4950408748993653,
      "learning_rate": 1.826049776082446e-05,
      "loss": 1.102,
      "step": 135
    },
    {
      "epoch": 0.5645161290322581,
      "grad_norm": 2.9826288289990774,
      "learning_rate": 1.8056919124296957e-05,
      "loss": 1.1153,
      "step": 140
    },
    {
      "epoch": 0.5846774193548387,
      "grad_norm": 4.789513209353997,
      "learning_rate": 1.784334752779408e-05,
      "loss": 1.0892,
      "step": 145
    },
    {
      "epoch": 0.6048387096774194,
      "grad_norm": 2.153748931520763,
      "learning_rate": 1.76200478631911e-05,
      "loss": 1.0512,
      "step": 150
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.1449515128315584,
      "learning_rate": 1.7387297088041696e-05,
      "loss": 1.0708,
      "step": 155
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 2.568672641778878,
      "learning_rate": 1.714538388206878e-05,
      "loss": 1.1534,
      "step": 160
    },
    {
      "epoch": 0.6653225806451613,
      "grad_norm": 2.4066286973710818,
      "learning_rate": 1.6894608289116344e-05,
      "loss": 1.0782,
      "step": 165
    },
    {
      "epoch": 0.6854838709677419,
      "grad_norm": 2.469873074784075,
      "learning_rate": 1.663528134500646e-05,
      "loss": 1.1579,
      "step": 170
    },
    {
      "epoch": 0.7056451612903226,
      "grad_norm": 2.546868439730841,
      "learning_rate": 1.6367724691762967e-05,
      "loss": 1.0727,
      "step": 175
    },
    {
      "epoch": 0.7258064516129032,
      "grad_norm": 2.610421464248938,
      "learning_rate": 1.609227017868033e-05,
      "loss": 1.0766,
      "step": 180
    },
    {
      "epoch": 0.7459677419354839,
      "grad_norm": 2.4332563313436486,
      "learning_rate": 1.5809259450732495e-05,
      "loss": 1.1252,
      "step": 185
    },
    {
      "epoch": 0.7661290322580645,
      "grad_norm": 2.273728572874247,
      "learning_rate": 1.551904352483217e-05,
      "loss": 1.1049,
      "step": 190
    },
    {
      "epoch": 0.7862903225806451,
      "grad_norm": 1.9228388068167408,
      "learning_rate": 1.5221982354466172e-05,
      "loss": 1.0661,
      "step": 195
    },
    {
      "epoch": 0.8064516129032258,
      "grad_norm": 2.1988965883386573,
      "learning_rate": 1.4918444383246738e-05,
      "loss": 1.1256,
      "step": 200
    },
    {
      "epoch": 0.8064516129032258,
      "eval_loss": 1.0620791912078857,
      "eval_runtime": 5.8005,
      "eval_samples_per_second": 68.443,
      "eval_steps_per_second": 2.241,
      "step": 200
    },
    {
      "epoch": 0.8266129032258065,
      "grad_norm": 2.4001670804799273,
      "learning_rate": 1.460880608793262e-05,
      "loss": 1.069,
      "step": 205
    },
    {
      "epoch": 0.8467741935483871,
      "grad_norm": 2.361908189355991,
      "learning_rate": 1.4293451511486658e-05,
      "loss": 1.0566,
      "step": 210
    },
    {
      "epoch": 0.8669354838709677,
      "grad_norm": 2.1713973708275955,
      "learning_rate": 1.3972771786749074e-05,
      "loss": 1.0593,
      "step": 215
    },
    {
      "epoch": 0.8870967741935484,
      "grad_norm": 2.0277729000613403,
      "learning_rate": 1.3647164651317178e-05,
      "loss": 1.0812,
      "step": 220
    },
    {
      "epoch": 0.907258064516129,
      "grad_norm": 2.4350034774265326,
      "learning_rate": 1.3317033954233246e-05,
      "loss": 1.1498,
      "step": 225
    },
    {
      "epoch": 0.9274193548387096,
      "grad_norm": 2.359441580453533,
      "learning_rate": 1.2982789155092407e-05,
      "loss": 1.1063,
      "step": 230
    },
    {
      "epoch": 0.9475806451612904,
      "grad_norm": 2.337686204567925,
      "learning_rate": 1.264484481619177e-05,
      "loss": 1.0411,
      "step": 235
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 2.1962971145230297,
      "learning_rate": 1.23036200883507e-05,
      "loss": 1.0567,
      "step": 240
    },
    {
      "epoch": 0.9879032258064516,
      "grad_norm": 2.8727159078532507,
      "learning_rate": 1.1959538191039986e-05,
      "loss": 1.09,
      "step": 245
    },
    {
      "epoch": 1.0080645161290323,
      "grad_norm": 3.2425805939567804,
      "learning_rate": 1.1613025887464642e-05,
      "loss": 0.8776,
      "step": 250
    },
    {
      "epoch": 1.028225806451613,
      "grad_norm": 3.0688033959726506,
      "learning_rate": 1.1264512955251479e-05,
      "loss": 0.5624,
      "step": 255
    },
    {
      "epoch": 1.0483870967741935,
      "grad_norm": 2.310668387893831,
      "learning_rate": 1.0914431653397856e-05,
      "loss": 0.5702,
      "step": 260
    },
    {
      "epoch": 1.0685483870967742,
      "grad_norm": 1.810288404740645,
      "learning_rate": 1.056321618614284e-05,
      "loss": 0.5516,
      "step": 265
    },
    {
      "epoch": 1.0887096774193548,
      "grad_norm": 2.029102262007128,
      "learning_rate": 1.0211302164425657e-05,
      "loss": 0.492,
      "step": 270
    },
    {
      "epoch": 1.1088709677419355,
      "grad_norm": 2.1849097454980977,
      "learning_rate": 9.859126065599435e-06,
      "loss": 0.5087,
      "step": 275
    },
    {
      "epoch": 1.129032258064516,
      "grad_norm": 2.0350424139394456,
      "learning_rate": 9.507124692070356e-06,
      "loss": 0.5514,
      "step": 280
    },
    {
      "epoch": 1.1491935483870968,
      "grad_norm": 2.1657967134210643,
      "learning_rate": 9.155734629533612e-06,
      "loss": 0.4923,
      "step": 285
    },
    {
      "epoch": 1.1693548387096775,
      "grad_norm": 2.3019047292629065,
      "learning_rate": 8.805391705478149e-06,
      "loss": 0.5185,
      "step": 290
    },
    {
      "epoch": 1.189516129032258,
      "grad_norm": 2.342813194456275,
      "learning_rate": 8.456530448631856e-06,
      "loss": 0.5622,
      "step": 295
    },
    {
      "epoch": 1.2096774193548387,
      "grad_norm": 2.4435792193657373,
      "learning_rate": 8.10958355001755e-06,
      "loss": 0.5037,
      "step": 300
    },
    {
      "epoch": 1.2096774193548387,
      "eval_loss": 1.0970051288604736,
      "eval_runtime": 5.7929,
      "eval_samples_per_second": 68.532,
      "eval_steps_per_second": 2.244,
      "step": 300
    },
    {
      "epoch": 1.2298387096774193,
      "grad_norm": 2.0093621809495943,
      "learning_rate": 7.764981326288273e-06,
      "loss": 0.5044,
      "step": 305
    },
    {
      "epoch": 1.25,
      "grad_norm": 2.217076831970701,
      "learning_rate": 7.423151186007527e-06,
      "loss": 0.4857,
      "step": 310
    },
    {
      "epoch": 1.2701612903225805,
      "grad_norm": 2.711127350014508,
      "learning_rate": 7.084517099536378e-06,
      "loss": 0.5131,
      "step": 315
    },
    {
      "epoch": 1.2903225806451613,
      "grad_norm": 2.2254535872260925,
      "learning_rate": 6.749499073184957e-06,
      "loss": 0.5338,
      "step": 320
    },
    {
      "epoch": 1.310483870967742,
      "grad_norm": 2.438095643634648,
      "learning_rate": 6.418512628280544e-06,
      "loss": 0.5052,
      "step": 325
    },
    {
      "epoch": 1.3306451612903225,
      "grad_norm": 2.1227128290496045,
      "learning_rate": 6.09196828579838e-06,
      "loss": 0.4932,
      "step": 330
    },
    {
      "epoch": 1.3508064516129032,
      "grad_norm": 2.157875906897965,
      "learning_rate": 5.7702710571943695e-06,
      "loss": 0.488,
      "step": 335
    },
    {
      "epoch": 1.370967741935484,
      "grad_norm": 2.1989233445083176,
      "learning_rate": 5.453819942071212e-06,
      "loss": 0.4638,
      "step": 340
    },
    {
      "epoch": 1.3911290322580645,
      "grad_norm": 1.7426003114674993,
      "learning_rate": 5.1430074333010346e-06,
      "loss": 0.5005,
      "step": 345
    },
    {
      "epoch": 1.4112903225806452,
      "grad_norm": 2.4552831246664453,
      "learning_rate": 4.838219030218274e-06,
      "loss": 0.4814,
      "step": 350
    },
    {
      "epoch": 1.4314516129032258,
      "grad_norm": 2.3809447566869113,
      "learning_rate": 4.5398327604866056e-06,
      "loss": 0.4956,
      "step": 355
    },
    {
      "epoch": 1.4516129032258065,
      "grad_norm": 1.9931477191927476,
      "learning_rate": 4.248218711232952e-06,
      "loss": 0.5095,
      "step": 360
    },
    {
      "epoch": 1.471774193548387,
      "grad_norm": 2.0758952034120104,
      "learning_rate": 3.963738570030135e-06,
      "loss": 0.5031,
      "step": 365
    },
    {
      "epoch": 1.4919354838709677,
      "grad_norm": 2.113141005154832,
      "learning_rate": 3.6867451762974117e-06,
      "loss": 0.4858,
      "step": 370
    },
    {
      "epoch": 1.5120967741935485,
      "grad_norm": 2.033150384960035,
      "learning_rate": 3.417582083675365e-06,
      "loss": 0.4838,
      "step": 375
    },
    {
      "epoch": 1.532258064516129,
      "grad_norm": 2.6578869400642984,
      "learning_rate": 3.1565831339178844e-06,
      "loss": 0.4981,
      "step": 380
    },
    {
      "epoch": 1.5524193548387095,
      "grad_norm": 2.4259980364912557,
      "learning_rate": 2.9040720428297754e-06,
      "loss": 0.4953,
      "step": 385
    },
    {
      "epoch": 1.5725806451612905,
      "grad_norm": 2.188171842523612,
      "learning_rate": 2.6603619987635087e-06,
      "loss": 0.4886,
      "step": 390
    },
    {
      "epoch": 1.592741935483871,
      "grad_norm": 2.4509085938481645,
      "learning_rate": 2.4257552741731593e-06,
      "loss": 0.4919,
      "step": 395
    },
    {
      "epoch": 1.6129032258064515,
      "grad_norm": 2.057354914567607,
      "learning_rate": 2.200542850707247e-06,
      "loss": 0.5057,
      "step": 400
    },
    {
      "epoch": 1.6129032258064515,
      "eval_loss": 1.0593500137329102,
      "eval_runtime": 5.7784,
      "eval_samples_per_second": 68.704,
      "eval_steps_per_second": 2.25,
      "step": 400
    },
    {
      "epoch": 1.6330645161290323,
      "grad_norm": 1.797397264265373,
      "learning_rate": 1.985004058305535e-06,
      "loss": 0.4598,
      "step": 405
    },
    {
      "epoch": 1.653225806451613,
      "grad_norm": 2.243456315367938,
      "learning_rate": 1.7794062287473734e-06,
      "loss": 0.5226,
      "step": 410
    },
    {
      "epoch": 1.6733870967741935,
      "grad_norm": 2.220033645347878,
      "learning_rate": 1.5840043640813274e-06,
      "loss": 0.5404,
      "step": 415
    },
    {
      "epoch": 1.6935483870967742,
      "grad_norm": 1.9945052495277809,
      "learning_rate": 1.3990408203472938e-06,
      "loss": 0.4725,
      "step": 420
    },
    {
      "epoch": 1.713709677419355,
      "grad_norm": 2.2666732532368217,
      "learning_rate": 1.2247450069834077e-06,
      "loss": 0.4773,
      "step": 425
    },
    {
      "epoch": 1.7338709677419355,
      "grad_norm": 1.907392372458416,
      "learning_rate": 1.061333102290576e-06,
      "loss": 0.4776,
      "step": 430
    },
    {
      "epoch": 1.754032258064516,
      "grad_norm": 2.6145484574587083,
      "learning_rate": 9.090077853075119e-07,
      "loss": 0.4864,
      "step": 435
    },
    {
      "epoch": 1.7741935483870968,
      "grad_norm": 2.06555720808374,
      "learning_rate": 7.679579844288509e-07,
      "loss": 0.4692,
      "step": 440
    },
    {
      "epoch": 1.7943548387096775,
      "grad_norm": 2.1245728247554982,
      "learning_rate": 6.383586430781196e-07,
      "loss": 0.5071,
      "step": 445
    },
    {
      "epoch": 1.814516129032258,
      "grad_norm": 2.0064654819537298,
      "learning_rate": 5.203705027262185e-07,
      "loss": 0.4659,
      "step": 450
    },
    {
      "epoch": 1.8346774193548387,
      "grad_norm": 2.1497980743373906,
      "learning_rate": 4.141399035245053e-07,
      "loss": 0.5124,
      "step": 455
    },
    {
      "epoch": 1.8548387096774195,
      "grad_norm": 2.1598292791262064,
      "learning_rate": 3.197986027997657e-07,
      "loss": 0.4742,
      "step": 460
    },
    {
      "epoch": 1.875,
      "grad_norm": 2.01644524585016,
      "learning_rate": 2.3746361163621723e-07,
      "loss": 0.4833,
      "step": 465
    },
    {
      "epoch": 1.8951612903225805,
      "grad_norm": 1.9818198911248883,
      "learning_rate": 1.6723704974718758e-07,
      "loss": 0.5176,
      "step": 470
    },
    {
      "epoch": 1.9153225806451613,
      "grad_norm": 2.0731118119143126,
      "learning_rate": 1.0920601881650006e-07,
      "loss": 0.4895,
      "step": 475
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 1.9849538964737026,
      "learning_rate": 6.344249446665673e-08,
      "loss": 0.5093,
      "step": 480
    },
    {
      "epoch": 1.9556451612903225,
      "grad_norm": 1.8778229972783114,
      "learning_rate": 3.0003236987802276e-08,
      "loss": 0.4298,
      "step": 485
    },
    {
      "epoch": 1.9758064516129032,
      "grad_norm": 2.0981658218579873,
      "learning_rate": 8.929720938193331e-09,
      "loss": 0.524,
      "step": 490
    },
    {
      "epoch": 1.995967741935484,
      "grad_norm": 2.1714963758794936,
      "learning_rate": 2.48083703494606e-10,
      "loss": 0.523,
      "step": 495
    },
    {
      "epoch": 2.0,
      "step": 496,
      "total_flos": 20768208814080.0,
      "train_loss": 0.8128850824169574,
      "train_runtime": 1218.4539,
      "train_samples_per_second": 13.005,
      "train_steps_per_second": 0.407
    }
  ],
  "logging_steps": 5,
  "max_steps": 496,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 20768208814080.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}