{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03681207436039021, "eval_steps": 20, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012270691453463403, "grad_norm": 0.5895100831985474, "learning_rate": 0.00019993864026834657, "loss": 2.5283, "step": 20 }, { "epoch": 0.0012270691453463403, "eval_loss": 2.1053755283355713, "eval_runtime": 20.2067, "eval_samples_per_second": 4.949, "eval_steps_per_second": 0.643, "step": 20 }, { "epoch": 0.0024541382906926807, "grad_norm": 0.5672204494476318, "learning_rate": 0.00019985682729280866, "loss": 2.0202, "step": 40 }, { "epoch": 0.0024541382906926807, "eval_loss": 1.9263768196105957, "eval_runtime": 19.6426, "eval_samples_per_second": 5.091, "eval_steps_per_second": 0.662, "step": 40 }, { "epoch": 0.003681207436039021, "grad_norm": 0.8112285733222961, "learning_rate": 0.00019977501431727072, "loss": 1.9429, "step": 60 }, { "epoch": 0.003681207436039021, "eval_loss": 1.8300584554672241, "eval_runtime": 19.6764, "eval_samples_per_second": 5.082, "eval_steps_per_second": 0.661, "step": 60 }, { "epoch": 0.004908276581385361, "grad_norm": 0.7514470219612122, "learning_rate": 0.0001996932013417328, "loss": 1.8545, "step": 80 }, { "epoch": 0.004908276581385361, "eval_loss": 1.7825714349746704, "eval_runtime": 19.7307, "eval_samples_per_second": 5.068, "eval_steps_per_second": 0.659, "step": 80 }, { "epoch": 0.006135345726731701, "grad_norm": 0.6156793832778931, "learning_rate": 0.00019961138836619487, "loss": 1.8015, "step": 100 }, { "epoch": 0.006135345726731701, "eval_loss": 1.7314132452011108, "eval_runtime": 19.6578, "eval_samples_per_second": 5.087, "eval_steps_per_second": 0.661, "step": 100 }, { "epoch": 0.007362414872078042, "grad_norm": 0.8181987404823303, "learning_rate": 0.000199529575390657, "loss": 1.7025, "step": 120 }, { "epoch": 0.007362414872078042, "eval_loss": 1.6928819417953491, "eval_runtime": 19.5713, "eval_samples_per_second": 5.11, "eval_steps_per_second": 0.664, "step": 120 }, { "epoch": 0.008589484017424381, "grad_norm": 0.6988233327865601, "learning_rate": 0.00019944776241511905, "loss": 1.7335, "step": 140 }, { "epoch": 0.008589484017424381, "eval_loss": 1.6641780138015747, "eval_runtime": 19.6822, "eval_samples_per_second": 5.081, "eval_steps_per_second": 0.66, "step": 140 }, { "epoch": 0.009816553162770723, "grad_norm": 0.623275637626648, "learning_rate": 0.00019936594943958114, "loss": 1.6708, "step": 160 }, { "epoch": 0.009816553162770723, "eval_loss": 1.629647970199585, "eval_runtime": 19.5744, "eval_samples_per_second": 5.109, "eval_steps_per_second": 0.664, "step": 160 }, { "epoch": 0.011043622308117063, "grad_norm": 0.6912758350372314, "learning_rate": 0.0001992841364640432, "loss": 1.7161, "step": 180 }, { "epoch": 0.011043622308117063, "eval_loss": 1.6033389568328857, "eval_runtime": 19.6839, "eval_samples_per_second": 5.08, "eval_steps_per_second": 0.66, "step": 180 }, { "epoch": 0.012270691453463402, "grad_norm": 0.6177836656570435, "learning_rate": 0.0001992023234885053, "loss": 1.7165, "step": 200 }, { "epoch": 0.012270691453463402, "eval_loss": 1.5879381895065308, "eval_runtime": 19.7091, "eval_samples_per_second": 5.074, "eval_steps_per_second": 0.66, "step": 200 }, { "epoch": 0.013497760598809742, "grad_norm": 0.8630465269088745, "learning_rate": 0.00019912051051296735, "loss": 1.64, "step": 220 }, { "epoch": 0.013497760598809742, "eval_loss": 1.5652003288269043, "eval_runtime": 19.9102, "eval_samples_per_second": 5.023, "eval_steps_per_second": 0.653, "step": 220 }, { "epoch": 0.014724829744156084, "grad_norm": 0.7266297936439514, "learning_rate": 0.00019903869753742944, "loss": 1.6705, "step": 240 }, { "epoch": 0.014724829744156084, "eval_loss": 1.5418590307235718, "eval_runtime": 19.7135, "eval_samples_per_second": 5.073, "eval_steps_per_second": 0.659, "step": 240 }, { "epoch": 0.015951898889502422, "grad_norm": 0.7300752997398376, "learning_rate": 0.00019895688456189153, "loss": 1.669, "step": 260 }, { "epoch": 0.015951898889502422, "eval_loss": 1.5231231451034546, "eval_runtime": 19.522, "eval_samples_per_second": 5.122, "eval_steps_per_second": 0.666, "step": 260 }, { "epoch": 0.017178968034848762, "grad_norm": 0.7053245306015015, "learning_rate": 0.00019887507158635362, "loss": 1.6513, "step": 280 }, { "epoch": 0.017178968034848762, "eval_loss": 1.514104962348938, "eval_runtime": 19.6874, "eval_samples_per_second": 5.079, "eval_steps_per_second": 0.66, "step": 280 }, { "epoch": 0.018406037180195105, "grad_norm": 0.8148968815803528, "learning_rate": 0.00019879325861081568, "loss": 1.5712, "step": 300 }, { "epoch": 0.018406037180195105, "eval_loss": 1.4980120658874512, "eval_runtime": 19.7232, "eval_samples_per_second": 5.07, "eval_steps_per_second": 0.659, "step": 300 }, { "epoch": 0.019633106325541445, "grad_norm": 0.5613670349121094, "learning_rate": 0.00019871144563527777, "loss": 1.5492, "step": 320 }, { "epoch": 0.019633106325541445, "eval_loss": 1.4802027940750122, "eval_runtime": 19.7147, "eval_samples_per_second": 5.072, "eval_steps_per_second": 0.659, "step": 320 }, { "epoch": 0.020860175470887785, "grad_norm": 0.7558555603027344, "learning_rate": 0.00019862963265973983, "loss": 1.6268, "step": 340 }, { "epoch": 0.020860175470887785, "eval_loss": 1.4685406684875488, "eval_runtime": 19.7336, "eval_samples_per_second": 5.068, "eval_steps_per_second": 0.659, "step": 340 }, { "epoch": 0.022087244616234125, "grad_norm": 0.6657942533493042, "learning_rate": 0.00019854781968420192, "loss": 1.5955, "step": 360 }, { "epoch": 0.022087244616234125, "eval_loss": 1.4536309242248535, "eval_runtime": 19.6042, "eval_samples_per_second": 5.101, "eval_steps_per_second": 0.663, "step": 360 }, { "epoch": 0.023314313761580465, "grad_norm": 0.8438799977302551, "learning_rate": 0.00019846600670866399, "loss": 1.5271, "step": 380 }, { "epoch": 0.023314313761580465, "eval_loss": 1.4461709260940552, "eval_runtime": 19.7178, "eval_samples_per_second": 5.072, "eval_steps_per_second": 0.659, "step": 380 }, { "epoch": 0.024541382906926805, "grad_norm": 0.6734594702720642, "learning_rate": 0.0001983841937331261, "loss": 1.4713, "step": 400 }, { "epoch": 0.024541382906926805, "eval_loss": 1.4335358142852783, "eval_runtime": 19.7501, "eval_samples_per_second": 5.063, "eval_steps_per_second": 0.658, "step": 400 }, { "epoch": 0.025768452052273145, "grad_norm": 0.8461142778396606, "learning_rate": 0.00019830238075758816, "loss": 1.5175, "step": 420 }, { "epoch": 0.025768452052273145, "eval_loss": 1.4290988445281982, "eval_runtime": 19.703, "eval_samples_per_second": 5.075, "eval_steps_per_second": 0.66, "step": 420 }, { "epoch": 0.026995521197619485, "grad_norm": 0.7308184504508972, "learning_rate": 0.00019822056778205025, "loss": 1.4878, "step": 440 }, { "epoch": 0.026995521197619485, "eval_loss": 1.4188473224639893, "eval_runtime": 19.6677, "eval_samples_per_second": 5.084, "eval_steps_per_second": 0.661, "step": 440 }, { "epoch": 0.028222590342965825, "grad_norm": 0.7773933410644531, "learning_rate": 0.00019813875480651232, "loss": 1.5046, "step": 460 }, { "epoch": 0.028222590342965825, "eval_loss": 1.4094576835632324, "eval_runtime": 19.6378, "eval_samples_per_second": 5.092, "eval_steps_per_second": 0.662, "step": 460 }, { "epoch": 0.029449659488312168, "grad_norm": 0.6018341779708862, "learning_rate": 0.0001980569418309744, "loss": 1.508, "step": 480 }, { "epoch": 0.029449659488312168, "eval_loss": 1.396529197692871, "eval_runtime": 19.7008, "eval_samples_per_second": 5.076, "eval_steps_per_second": 0.66, "step": 480 }, { "epoch": 0.030676728633658508, "grad_norm": 0.6028321981430054, "learning_rate": 0.00019797512885543647, "loss": 1.5018, "step": 500 }, { "epoch": 0.030676728633658508, "eval_loss": 1.3898202180862427, "eval_runtime": 19.7395, "eval_samples_per_second": 5.066, "eval_steps_per_second": 0.659, "step": 500 }, { "epoch": 0.031903797779004844, "grad_norm": 0.7919607162475586, "learning_rate": 0.00019789331587989856, "loss": 1.4158, "step": 520 }, { "epoch": 0.031903797779004844, "eval_loss": 1.385123610496521, "eval_runtime": 19.7083, "eval_samples_per_second": 5.074, "eval_steps_per_second": 0.66, "step": 520 }, { "epoch": 0.03313086692435119, "grad_norm": 0.7193537354469299, "learning_rate": 0.00019781150290436065, "loss": 1.4829, "step": 540 }, { "epoch": 0.03313086692435119, "eval_loss": 1.3717445135116577, "eval_runtime": 19.7284, "eval_samples_per_second": 5.069, "eval_steps_per_second": 0.659, "step": 540 }, { "epoch": 0.034357936069697524, "grad_norm": 0.623745322227478, "learning_rate": 0.00019772968992882274, "loss": 1.5216, "step": 560 }, { "epoch": 0.034357936069697524, "eval_loss": 1.3708571195602417, "eval_runtime": 19.6215, "eval_samples_per_second": 5.096, "eval_steps_per_second": 0.663, "step": 560 }, { "epoch": 0.03558500521504387, "grad_norm": 0.7613083124160767, "learning_rate": 0.0001976478769532848, "loss": 1.4677, "step": 580 }, { "epoch": 0.03558500521504387, "eval_loss": 1.3612563610076904, "eval_runtime": 19.7315, "eval_samples_per_second": 5.068, "eval_steps_per_second": 0.659, "step": 580 }, { "epoch": 0.03681207436039021, "grad_norm": 0.6662244200706482, "learning_rate": 0.00019756606397774689, "loss": 1.4336, "step": 600 }, { "epoch": 0.03681207436039021, "eval_loss": 1.3519067764282227, "eval_runtime": 19.6923, "eval_samples_per_second": 5.078, "eval_steps_per_second": 0.66, "step": 600 } ], "logging_steps": 20, "max_steps": 48897, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "total_flos": 3.892256129028096e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }