{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.03681207436039021,
  "eval_steps": 20,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012270691453463403,
      "grad_norm": 0.5895100831985474,
      "learning_rate": 0.00019993864026834657,
      "loss": 2.5283,
      "step": 20
    },
    {
      "epoch": 0.0012270691453463403,
      "eval_loss": 2.1053755283355713,
      "eval_runtime": 20.2067,
      "eval_samples_per_second": 4.949,
      "eval_steps_per_second": 0.643,
      "step": 20
    },
    {
      "epoch": 0.0024541382906926807,
      "grad_norm": 0.5672204494476318,
      "learning_rate": 0.00019985682729280866,
      "loss": 2.0202,
      "step": 40
    },
    {
      "epoch": 0.0024541382906926807,
      "eval_loss": 1.9263768196105957,
      "eval_runtime": 19.6426,
      "eval_samples_per_second": 5.091,
      "eval_steps_per_second": 0.662,
      "step": 40
    },
    {
      "epoch": 0.003681207436039021,
      "grad_norm": 0.8112285733222961,
      "learning_rate": 0.00019977501431727072,
      "loss": 1.9429,
      "step": 60
    },
    {
      "epoch": 0.003681207436039021,
      "eval_loss": 1.8300584554672241,
      "eval_runtime": 19.6764,
      "eval_samples_per_second": 5.082,
      "eval_steps_per_second": 0.661,
      "step": 60
    },
    {
      "epoch": 0.004908276581385361,
      "grad_norm": 0.7514470219612122,
      "learning_rate": 0.0001996932013417328,
      "loss": 1.8545,
      "step": 80
    },
    {
      "epoch": 0.004908276581385361,
      "eval_loss": 1.7825714349746704,
      "eval_runtime": 19.7307,
      "eval_samples_per_second": 5.068,
      "eval_steps_per_second": 0.659,
      "step": 80
    },
    {
      "epoch": 0.006135345726731701,
      "grad_norm": 0.6156793832778931,
      "learning_rate": 0.00019961138836619487,
      "loss": 1.8015,
      "step": 100
    },
    {
      "epoch": 0.006135345726731701,
      "eval_loss": 1.7314132452011108,
      "eval_runtime": 19.6578,
      "eval_samples_per_second": 5.087,
      "eval_steps_per_second": 0.661,
      "step": 100
    },
    {
      "epoch": 0.007362414872078042,
      "grad_norm": 0.8181987404823303,
      "learning_rate": 0.000199529575390657,
      "loss": 1.7025,
      "step": 120
    },
    {
      "epoch": 0.007362414872078042,
      "eval_loss": 1.6928819417953491,
      "eval_runtime": 19.5713,
      "eval_samples_per_second": 5.11,
      "eval_steps_per_second": 0.664,
      "step": 120
    },
    {
      "epoch": 0.008589484017424381,
      "grad_norm": 0.6988233327865601,
      "learning_rate": 0.00019944776241511905,
      "loss": 1.7335,
      "step": 140
    },
    {
      "epoch": 0.008589484017424381,
      "eval_loss": 1.6641780138015747,
      "eval_runtime": 19.6822,
      "eval_samples_per_second": 5.081,
      "eval_steps_per_second": 0.66,
      "step": 140
    },
    {
      "epoch": 0.009816553162770723,
      "grad_norm": 0.623275637626648,
      "learning_rate": 0.00019936594943958114,
      "loss": 1.6708,
      "step": 160
    },
    {
      "epoch": 0.009816553162770723,
      "eval_loss": 1.629647970199585,
      "eval_runtime": 19.5744,
      "eval_samples_per_second": 5.109,
      "eval_steps_per_second": 0.664,
      "step": 160
    },
    {
      "epoch": 0.011043622308117063,
      "grad_norm": 0.6912758350372314,
      "learning_rate": 0.0001992841364640432,
      "loss": 1.7161,
      "step": 180
    },
    {
      "epoch": 0.011043622308117063,
      "eval_loss": 1.6033389568328857,
      "eval_runtime": 19.6839,
      "eval_samples_per_second": 5.08,
      "eval_steps_per_second": 0.66,
      "step": 180
    },
    {
      "epoch": 0.012270691453463402,
      "grad_norm": 0.6177836656570435,
      "learning_rate": 0.0001992023234885053,
      "loss": 1.7165,
      "step": 200
    },
    {
      "epoch": 0.012270691453463402,
      "eval_loss": 1.5879381895065308,
      "eval_runtime": 19.7091,
      "eval_samples_per_second": 5.074,
      "eval_steps_per_second": 0.66,
      "step": 200
    },
    {
      "epoch": 0.013497760598809742,
      "grad_norm": 0.8630465269088745,
      "learning_rate": 0.00019912051051296735,
      "loss": 1.64,
      "step": 220
    },
    {
      "epoch": 0.013497760598809742,
      "eval_loss": 1.5652003288269043,
      "eval_runtime": 19.9102,
      "eval_samples_per_second": 5.023,
      "eval_steps_per_second": 0.653,
      "step": 220
    },
    {
      "epoch": 0.014724829744156084,
      "grad_norm": 0.7266297936439514,
      "learning_rate": 0.00019903869753742944,
      "loss": 1.6705,
      "step": 240
    },
    {
      "epoch": 0.014724829744156084,
      "eval_loss": 1.5418590307235718,
      "eval_runtime": 19.7135,
      "eval_samples_per_second": 5.073,
      "eval_steps_per_second": 0.659,
      "step": 240
    },
    {
      "epoch": 0.015951898889502422,
      "grad_norm": 0.7300752997398376,
      "learning_rate": 0.00019895688456189153,
      "loss": 1.669,
      "step": 260
    },
    {
      "epoch": 0.015951898889502422,
      "eval_loss": 1.5231231451034546,
      "eval_runtime": 19.522,
      "eval_samples_per_second": 5.122,
      "eval_steps_per_second": 0.666,
      "step": 260
    },
    {
      "epoch": 0.017178968034848762,
      "grad_norm": 0.7053245306015015,
      "learning_rate": 0.00019887507158635362,
      "loss": 1.6513,
      "step": 280
    },
    {
      "epoch": 0.017178968034848762,
      "eval_loss": 1.514104962348938,
      "eval_runtime": 19.6874,
      "eval_samples_per_second": 5.079,
      "eval_steps_per_second": 0.66,
      "step": 280
    },
    {
      "epoch": 0.018406037180195105,
      "grad_norm": 0.8148968815803528,
      "learning_rate": 0.00019879325861081568,
      "loss": 1.5712,
      "step": 300
    },
    {
      "epoch": 0.018406037180195105,
      "eval_loss": 1.4980120658874512,
      "eval_runtime": 19.7232,
      "eval_samples_per_second": 5.07,
      "eval_steps_per_second": 0.659,
      "step": 300
    },
    {
      "epoch": 0.019633106325541445,
      "grad_norm": 0.5613670349121094,
      "learning_rate": 0.00019871144563527777,
      "loss": 1.5492,
      "step": 320
    },
    {
      "epoch": 0.019633106325541445,
      "eval_loss": 1.4802027940750122,
      "eval_runtime": 19.7147,
      "eval_samples_per_second": 5.072,
      "eval_steps_per_second": 0.659,
      "step": 320
    },
    {
      "epoch": 0.020860175470887785,
      "grad_norm": 0.7558555603027344,
      "learning_rate": 0.00019862963265973983,
      "loss": 1.6268,
      "step": 340
    },
    {
      "epoch": 0.020860175470887785,
      "eval_loss": 1.4685406684875488,
      "eval_runtime": 19.7336,
      "eval_samples_per_second": 5.068,
      "eval_steps_per_second": 0.659,
      "step": 340
    },
    {
      "epoch": 0.022087244616234125,
      "grad_norm": 0.6657942533493042,
      "learning_rate": 0.00019854781968420192,
      "loss": 1.5955,
      "step": 360
    },
    {
      "epoch": 0.022087244616234125,
      "eval_loss": 1.4536309242248535,
      "eval_runtime": 19.6042,
      "eval_samples_per_second": 5.101,
      "eval_steps_per_second": 0.663,
      "step": 360
    },
    {
      "epoch": 0.023314313761580465,
      "grad_norm": 0.8438799977302551,
      "learning_rate": 0.00019846600670866399,
      "loss": 1.5271,
      "step": 380
    },
    {
      "epoch": 0.023314313761580465,
      "eval_loss": 1.4461709260940552,
      "eval_runtime": 19.7178,
      "eval_samples_per_second": 5.072,
      "eval_steps_per_second": 0.659,
      "step": 380
    },
    {
      "epoch": 0.024541382906926805,
      "grad_norm": 0.6734594702720642,
      "learning_rate": 0.0001983841937331261,
      "loss": 1.4713,
      "step": 400
    },
    {
      "epoch": 0.024541382906926805,
      "eval_loss": 1.4335358142852783,
      "eval_runtime": 19.7501,
      "eval_samples_per_second": 5.063,
      "eval_steps_per_second": 0.658,
      "step": 400
    },
    {
      "epoch": 0.025768452052273145,
      "grad_norm": 0.8461142778396606,
      "learning_rate": 0.00019830238075758816,
      "loss": 1.5175,
      "step": 420
    },
    {
      "epoch": 0.025768452052273145,
      "eval_loss": 1.4290988445281982,
      "eval_runtime": 19.703,
      "eval_samples_per_second": 5.075,
      "eval_steps_per_second": 0.66,
      "step": 420
    },
    {
      "epoch": 0.026995521197619485,
      "grad_norm": 0.7308184504508972,
      "learning_rate": 0.00019822056778205025,
      "loss": 1.4878,
      "step": 440
    },
    {
      "epoch": 0.026995521197619485,
      "eval_loss": 1.4188473224639893,
      "eval_runtime": 19.6677,
      "eval_samples_per_second": 5.084,
      "eval_steps_per_second": 0.661,
      "step": 440
    },
    {
      "epoch": 0.028222590342965825,
      "grad_norm": 0.7773933410644531,
      "learning_rate": 0.00019813875480651232,
      "loss": 1.5046,
      "step": 460
    },
    {
      "epoch": 0.028222590342965825,
      "eval_loss": 1.4094576835632324,
      "eval_runtime": 19.6378,
      "eval_samples_per_second": 5.092,
      "eval_steps_per_second": 0.662,
      "step": 460
    },
    {
      "epoch": 0.029449659488312168,
      "grad_norm": 0.6018341779708862,
      "learning_rate": 0.0001980569418309744,
      "loss": 1.508,
      "step": 480
    },
    {
      "epoch": 0.029449659488312168,
      "eval_loss": 1.396529197692871,
      "eval_runtime": 19.7008,
      "eval_samples_per_second": 5.076,
      "eval_steps_per_second": 0.66,
      "step": 480
    },
    {
      "epoch": 0.030676728633658508,
      "grad_norm": 0.6028321981430054,
      "learning_rate": 0.00019797512885543647,
      "loss": 1.5018,
      "step": 500
    },
    {
      "epoch": 0.030676728633658508,
      "eval_loss": 1.3898202180862427,
      "eval_runtime": 19.7395,
      "eval_samples_per_second": 5.066,
      "eval_steps_per_second": 0.659,
      "step": 500
    },
    {
      "epoch": 0.031903797779004844,
      "grad_norm": 0.7919607162475586,
      "learning_rate": 0.00019789331587989856,
      "loss": 1.4158,
      "step": 520
    },
    {
      "epoch": 0.031903797779004844,
      "eval_loss": 1.385123610496521,
      "eval_runtime": 19.7083,
      "eval_samples_per_second": 5.074,
      "eval_steps_per_second": 0.66,
      "step": 520
    },
    {
      "epoch": 0.03313086692435119,
      "grad_norm": 0.7193537354469299,
      "learning_rate": 0.00019781150290436065,
      "loss": 1.4829,
      "step": 540
    },
    {
      "epoch": 0.03313086692435119,
      "eval_loss": 1.3717445135116577,
      "eval_runtime": 19.7284,
      "eval_samples_per_second": 5.069,
      "eval_steps_per_second": 0.659,
      "step": 540
    },
    {
      "epoch": 0.034357936069697524,
      "grad_norm": 0.623745322227478,
      "learning_rate": 0.00019772968992882274,
      "loss": 1.5216,
      "step": 560
    },
    {
      "epoch": 0.034357936069697524,
      "eval_loss": 1.3708571195602417,
      "eval_runtime": 19.6215,
      "eval_samples_per_second": 5.096,
      "eval_steps_per_second": 0.663,
      "step": 560
    },
    {
      "epoch": 0.03558500521504387,
      "grad_norm": 0.7613083124160767,
      "learning_rate": 0.0001976478769532848,
      "loss": 1.4677,
      "step": 580
    },
    {
      "epoch": 0.03558500521504387,
      "eval_loss": 1.3612563610076904,
      "eval_runtime": 19.7315,
      "eval_samples_per_second": 5.068,
      "eval_steps_per_second": 0.659,
      "step": 580
    },
    {
      "epoch": 0.03681207436039021,
      "grad_norm": 0.6662244200706482,
      "learning_rate": 0.00019756606397774689,
      "loss": 1.4336,
      "step": 600
    },
    {
      "epoch": 0.03681207436039021,
      "eval_loss": 1.3519067764282227,
      "eval_runtime": 19.6923,
      "eval_samples_per_second": 5.078,
      "eval_steps_per_second": 0.66,
      "step": 600
    }
  ],
  "logging_steps": 20,
  "max_steps": 48897,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "total_flos": 3.892256129028096e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}