{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999625818521983, "eval_steps": 500, "global_step": 668, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029934518241347054, "grad_norm": 1.3387344639766428, "learning_rate": 5e-06, "loss": 0.7726, "step": 10 }, { "epoch": 0.05986903648269411, "grad_norm": 0.7768397662301797, "learning_rate": 5e-06, "loss": 0.6941, "step": 20 }, { "epoch": 0.08980355472404115, "grad_norm": 0.707220098018675, "learning_rate": 5e-06, "loss": 0.669, "step": 30 }, { "epoch": 0.11973807296538821, "grad_norm": 0.6673639225008394, "learning_rate": 5e-06, "loss": 0.6651, "step": 40 }, { "epoch": 0.14967259120673526, "grad_norm": 0.931884409826821, "learning_rate": 5e-06, "loss": 0.6563, "step": 50 }, { "epoch": 0.1796071094480823, "grad_norm": 0.7248290568232586, "learning_rate": 5e-06, "loss": 0.6486, "step": 60 }, { "epoch": 0.20954162768942938, "grad_norm": 0.7317021024341339, "learning_rate": 5e-06, "loss": 0.6456, "step": 70 }, { "epoch": 0.23947614593077643, "grad_norm": 0.7878073380412045, "learning_rate": 5e-06, "loss": 0.647, "step": 80 }, { "epoch": 0.2694106641721235, "grad_norm": 0.9477711469098681, "learning_rate": 5e-06, "loss": 0.638, "step": 90 }, { "epoch": 0.2993451824134705, "grad_norm": 0.7824185434647374, "learning_rate": 5e-06, "loss": 0.6395, "step": 100 }, { "epoch": 0.3292797006548176, "grad_norm": 0.7691665877451451, "learning_rate": 5e-06, "loss": 0.6352, "step": 110 }, { "epoch": 0.3592142188961646, "grad_norm": 0.85227640579011, "learning_rate": 5e-06, "loss": 0.6344, "step": 120 }, { "epoch": 0.3891487371375117, "grad_norm": 0.7349060200879011, "learning_rate": 5e-06, "loss": 0.6343, "step": 130 }, { "epoch": 0.41908325537885877, "grad_norm": 0.7948303412589806, "learning_rate": 5e-06, "loss": 0.628, "step": 140 }, { "epoch": 0.4490177736202058, "grad_norm": 0.6403554998368226, "learning_rate": 5e-06, "loss": 0.6296, "step": 150 }, { "epoch": 0.47895229186155286, "grad_norm": 0.6579875234047781, "learning_rate": 5e-06, "loss": 0.6286, "step": 160 }, { "epoch": 0.5088868101028999, "grad_norm": 0.6635988202382727, "learning_rate": 5e-06, "loss": 0.6228, "step": 170 }, { "epoch": 0.538821328344247, "grad_norm": 0.6766189913752628, "learning_rate": 5e-06, "loss": 0.6242, "step": 180 }, { "epoch": 0.568755846585594, "grad_norm": 0.7350593528596518, "learning_rate": 5e-06, "loss": 0.6221, "step": 190 }, { "epoch": 0.598690364826941, "grad_norm": 0.7042346638489535, "learning_rate": 5e-06, "loss": 0.6299, "step": 200 }, { "epoch": 0.6286248830682881, "grad_norm": 0.7310808118757001, "learning_rate": 5e-06, "loss": 0.6238, "step": 210 }, { "epoch": 0.6585594013096352, "grad_norm": 0.915027325723907, "learning_rate": 5e-06, "loss": 0.622, "step": 220 }, { "epoch": 0.6884939195509823, "grad_norm": 0.7146070049162939, "learning_rate": 5e-06, "loss": 0.6245, "step": 230 }, { "epoch": 0.7184284377923292, "grad_norm": 0.6682750595418552, "learning_rate": 5e-06, "loss": 0.6234, "step": 240 }, { "epoch": 0.7483629560336763, "grad_norm": 0.6075185838844738, "learning_rate": 5e-06, "loss": 0.6199, "step": 250 }, { "epoch": 0.7782974742750234, "grad_norm": 0.6614005815451841, "learning_rate": 5e-06, "loss": 0.6186, "step": 260 }, { "epoch": 0.8082319925163705, "grad_norm": 0.7431404889886319, "learning_rate": 5e-06, "loss": 0.6261, "step": 270 }, { "epoch": 0.8381665107577175, "grad_norm": 0.6659051677024307, "learning_rate": 5e-06, "loss": 0.6163, "step": 280 }, { "epoch": 0.8681010289990645, "grad_norm": 0.761848043492867, "learning_rate": 5e-06, "loss": 0.6131, "step": 290 }, { "epoch": 0.8980355472404116, "grad_norm": 0.7790121000018401, "learning_rate": 5e-06, "loss": 0.6145, "step": 300 }, { "epoch": 0.9279700654817586, "grad_norm": 0.700122187157476, "learning_rate": 5e-06, "loss": 0.6131, "step": 310 }, { "epoch": 0.9579045837231057, "grad_norm": 0.6439594670749554, "learning_rate": 5e-06, "loss": 0.6152, "step": 320 }, { "epoch": 0.9878391019644528, "grad_norm": 0.9505341757424101, "learning_rate": 5e-06, "loss": 0.6082, "step": 330 }, { "epoch": 0.9998129092609915, "eval_loss": 0.619162917137146, "eval_runtime": 515.644, "eval_samples_per_second": 17.456, "eval_steps_per_second": 0.547, "step": 334 }, { "epoch": 1.0177736202057999, "grad_norm": 1.0078817159156914, "learning_rate": 5e-06, "loss": 0.6383, "step": 340 }, { "epoch": 1.047708138447147, "grad_norm": 0.9933360913511419, "learning_rate": 5e-06, "loss": 0.5523, "step": 350 }, { "epoch": 1.077642656688494, "grad_norm": 0.7284101974086702, "learning_rate": 5e-06, "loss": 0.5478, "step": 360 }, { "epoch": 1.1075771749298409, "grad_norm": 0.8279225421843056, "learning_rate": 5e-06, "loss": 0.5481, "step": 370 }, { "epoch": 1.137511693171188, "grad_norm": 0.7632668730408979, "learning_rate": 5e-06, "loss": 0.5484, "step": 380 }, { "epoch": 1.167446211412535, "grad_norm": 0.6939352572415957, "learning_rate": 5e-06, "loss": 0.5501, "step": 390 }, { "epoch": 1.197380729653882, "grad_norm": 0.702668433385052, "learning_rate": 5e-06, "loss": 0.5501, "step": 400 }, { "epoch": 1.2273152478952292, "grad_norm": 0.6478605176631643, "learning_rate": 5e-06, "loss": 0.5524, "step": 410 }, { "epoch": 1.2572497661365762, "grad_norm": 0.7369685315266656, "learning_rate": 5e-06, "loss": 0.5567, "step": 420 }, { "epoch": 1.2871842843779233, "grad_norm": 0.6753735200396593, "learning_rate": 5e-06, "loss": 0.5514, "step": 430 }, { "epoch": 1.3171188026192704, "grad_norm": 0.6984955164701175, "learning_rate": 5e-06, "loss": 0.5534, "step": 440 }, { "epoch": 1.3470533208606175, "grad_norm": 0.6720392032007318, "learning_rate": 5e-06, "loss": 0.5475, "step": 450 }, { "epoch": 1.3769878391019645, "grad_norm": 0.6876544426761866, "learning_rate": 5e-06, "loss": 0.5581, "step": 460 }, { "epoch": 1.4069223573433116, "grad_norm": 0.7559712718072091, "learning_rate": 5e-06, "loss": 0.5573, "step": 470 }, { "epoch": 1.4368568755846587, "grad_norm": 0.7543483123276029, "learning_rate": 5e-06, "loss": 0.5592, "step": 480 }, { "epoch": 1.4667913938260055, "grad_norm": 0.670230242807614, "learning_rate": 5e-06, "loss": 0.5587, "step": 490 }, { "epoch": 1.4967259120673526, "grad_norm": 0.793302321148842, "learning_rate": 5e-06, "loss": 0.5549, "step": 500 }, { "epoch": 1.5266604303086997, "grad_norm": 0.6636753856100917, "learning_rate": 5e-06, "loss": 0.5577, "step": 510 }, { "epoch": 1.5565949485500468, "grad_norm": 0.6821579825224984, "learning_rate": 5e-06, "loss": 0.5557, "step": 520 }, { "epoch": 1.5865294667913938, "grad_norm": 0.7552762189905546, "learning_rate": 5e-06, "loss": 0.5563, "step": 530 }, { "epoch": 1.616463985032741, "grad_norm": 0.6729733301722461, "learning_rate": 5e-06, "loss": 0.5599, "step": 540 }, { "epoch": 1.646398503274088, "grad_norm": 0.631104770625896, "learning_rate": 5e-06, "loss": 0.5593, "step": 550 }, { "epoch": 1.6763330215154348, "grad_norm": 0.6474672113131026, "learning_rate": 5e-06, "loss": 0.5499, "step": 560 }, { "epoch": 1.706267539756782, "grad_norm": 0.8000532776371828, "learning_rate": 5e-06, "loss": 0.5517, "step": 570 }, { "epoch": 1.736202057998129, "grad_norm": 0.7694340841619645, "learning_rate": 5e-06, "loss": 0.5529, "step": 580 }, { "epoch": 1.766136576239476, "grad_norm": 0.6621277045994409, "learning_rate": 5e-06, "loss": 0.5558, "step": 590 }, { "epoch": 1.7960710944808231, "grad_norm": 0.715383547152723, "learning_rate": 5e-06, "loss": 0.5565, "step": 600 }, { "epoch": 1.8260056127221702, "grad_norm": 0.625130733885464, "learning_rate": 5e-06, "loss": 0.5542, "step": 610 }, { "epoch": 1.8559401309635173, "grad_norm": 0.6823486917509712, "learning_rate": 5e-06, "loss": 0.5524, "step": 620 }, { "epoch": 1.8858746492048644, "grad_norm": 0.6881561256441558, "learning_rate": 5e-06, "loss": 0.5542, "step": 630 }, { "epoch": 1.9158091674462114, "grad_norm": 0.7622942086873891, "learning_rate": 5e-06, "loss": 0.5557, "step": 640 }, { "epoch": 1.9457436856875585, "grad_norm": 0.665050257763723, "learning_rate": 5e-06, "loss": 0.5539, "step": 650 }, { "epoch": 1.9756782039289056, "grad_norm": 0.7758483897762515, "learning_rate": 5e-06, "loss": 0.5587, "step": 660 }, { "epoch": 1.999625818521983, "eval_loss": 0.6155834197998047, "eval_runtime": 517.7627, "eval_samples_per_second": 17.384, "eval_steps_per_second": 0.545, "step": 668 }, { "epoch": 1.999625818521983, "step": 668, "total_flos": 2545315932536832.0, "train_loss": 0.5962966223676761, "train_runtime": 60653.8853, "train_samples_per_second": 5.639, "train_steps_per_second": 0.011 } ], "logging_steps": 10, "max_steps": 668, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2545315932536832.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }