{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.999625818521983,
  "eval_steps": 500,
  "global_step": 668,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.029934518241347054,
      "grad_norm": 1.3387344639766428,
      "learning_rate": 5e-06,
      "loss": 0.7726,
      "step": 10
    },
    {
      "epoch": 0.05986903648269411,
      "grad_norm": 0.7768397662301797,
      "learning_rate": 5e-06,
      "loss": 0.6941,
      "step": 20
    },
    {
      "epoch": 0.08980355472404115,
      "grad_norm": 0.707220098018675,
      "learning_rate": 5e-06,
      "loss": 0.669,
      "step": 30
    },
    {
      "epoch": 0.11973807296538821,
      "grad_norm": 0.6673639225008394,
      "learning_rate": 5e-06,
      "loss": 0.6651,
      "step": 40
    },
    {
      "epoch": 0.14967259120673526,
      "grad_norm": 0.931884409826821,
      "learning_rate": 5e-06,
      "loss": 0.6563,
      "step": 50
    },
    {
      "epoch": 0.1796071094480823,
      "grad_norm": 0.7248290568232586,
      "learning_rate": 5e-06,
      "loss": 0.6486,
      "step": 60
    },
    {
      "epoch": 0.20954162768942938,
      "grad_norm": 0.7317021024341339,
      "learning_rate": 5e-06,
      "loss": 0.6456,
      "step": 70
    },
    {
      "epoch": 0.23947614593077643,
      "grad_norm": 0.7878073380412045,
      "learning_rate": 5e-06,
      "loss": 0.647,
      "step": 80
    },
    {
      "epoch": 0.2694106641721235,
      "grad_norm": 0.9477711469098681,
      "learning_rate": 5e-06,
      "loss": 0.638,
      "step": 90
    },
    {
      "epoch": 0.2993451824134705,
      "grad_norm": 0.7824185434647374,
      "learning_rate": 5e-06,
      "loss": 0.6395,
      "step": 100
    },
    {
      "epoch": 0.3292797006548176,
      "grad_norm": 0.7691665877451451,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 110
    },
    {
      "epoch": 0.3592142188961646,
      "grad_norm": 0.85227640579011,
      "learning_rate": 5e-06,
      "loss": 0.6344,
      "step": 120
    },
    {
      "epoch": 0.3891487371375117,
      "grad_norm": 0.7349060200879011,
      "learning_rate": 5e-06,
      "loss": 0.6343,
      "step": 130
    },
    {
      "epoch": 0.41908325537885877,
      "grad_norm": 0.7948303412589806,
      "learning_rate": 5e-06,
      "loss": 0.628,
      "step": 140
    },
    {
      "epoch": 0.4490177736202058,
      "grad_norm": 0.6403554998368226,
      "learning_rate": 5e-06,
      "loss": 0.6296,
      "step": 150
    },
    {
      "epoch": 0.47895229186155286,
      "grad_norm": 0.6579875234047781,
      "learning_rate": 5e-06,
      "loss": 0.6286,
      "step": 160
    },
    {
      "epoch": 0.5088868101028999,
      "grad_norm": 0.6635988202382727,
      "learning_rate": 5e-06,
      "loss": 0.6228,
      "step": 170
    },
    {
      "epoch": 0.538821328344247,
      "grad_norm": 0.6766189913752628,
      "learning_rate": 5e-06,
      "loss": 0.6242,
      "step": 180
    },
    {
      "epoch": 0.568755846585594,
      "grad_norm": 0.7350593528596518,
      "learning_rate": 5e-06,
      "loss": 0.6221,
      "step": 190
    },
    {
      "epoch": 0.598690364826941,
      "grad_norm": 0.7042346638489535,
      "learning_rate": 5e-06,
      "loss": 0.6299,
      "step": 200
    },
    {
      "epoch": 0.6286248830682881,
      "grad_norm": 0.7310808118757001,
      "learning_rate": 5e-06,
      "loss": 0.6238,
      "step": 210
    },
    {
      "epoch": 0.6585594013096352,
      "grad_norm": 0.915027325723907,
      "learning_rate": 5e-06,
      "loss": 0.622,
      "step": 220
    },
    {
      "epoch": 0.6884939195509823,
      "grad_norm": 0.7146070049162939,
      "learning_rate": 5e-06,
      "loss": 0.6245,
      "step": 230
    },
    {
      "epoch": 0.7184284377923292,
      "grad_norm": 0.6682750595418552,
      "learning_rate": 5e-06,
      "loss": 0.6234,
      "step": 240
    },
    {
      "epoch": 0.7483629560336763,
      "grad_norm": 0.6075185838844738,
      "learning_rate": 5e-06,
      "loss": 0.6199,
      "step": 250
    },
    {
      "epoch": 0.7782974742750234,
      "grad_norm": 0.6614005815451841,
      "learning_rate": 5e-06,
      "loss": 0.6186,
      "step": 260
    },
    {
      "epoch": 0.8082319925163705,
      "grad_norm": 0.7431404889886319,
      "learning_rate": 5e-06,
      "loss": 0.6261,
      "step": 270
    },
    {
      "epoch": 0.8381665107577175,
      "grad_norm": 0.6659051677024307,
      "learning_rate": 5e-06,
      "loss": 0.6163,
      "step": 280
    },
    {
      "epoch": 0.8681010289990645,
      "grad_norm": 0.761848043492867,
      "learning_rate": 5e-06,
      "loss": 0.6131,
      "step": 290
    },
    {
      "epoch": 0.8980355472404116,
      "grad_norm": 0.7790121000018401,
      "learning_rate": 5e-06,
      "loss": 0.6145,
      "step": 300
    },
    {
      "epoch": 0.9279700654817586,
      "grad_norm": 0.700122187157476,
      "learning_rate": 5e-06,
      "loss": 0.6131,
      "step": 310
    },
    {
      "epoch": 0.9579045837231057,
      "grad_norm": 0.6439594670749554,
      "learning_rate": 5e-06,
      "loss": 0.6152,
      "step": 320
    },
    {
      "epoch": 0.9878391019644528,
      "grad_norm": 0.9505341757424101,
      "learning_rate": 5e-06,
      "loss": 0.6082,
      "step": 330
    },
    {
      "epoch": 0.9998129092609915,
      "eval_loss": 0.619162917137146,
      "eval_runtime": 515.644,
      "eval_samples_per_second": 17.456,
      "eval_steps_per_second": 0.547,
      "step": 334
    },
    {
      "epoch": 1.0177736202057999,
      "grad_norm": 1.0078817159156914,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 340
    },
    {
      "epoch": 1.047708138447147,
      "grad_norm": 0.9933360913511419,
      "learning_rate": 5e-06,
      "loss": 0.5523,
      "step": 350
    },
    {
      "epoch": 1.077642656688494,
      "grad_norm": 0.7284101974086702,
      "learning_rate": 5e-06,
      "loss": 0.5478,
      "step": 360
    },
    {
      "epoch": 1.1075771749298409,
      "grad_norm": 0.8279225421843056,
      "learning_rate": 5e-06,
      "loss": 0.5481,
      "step": 370
    },
    {
      "epoch": 1.137511693171188,
      "grad_norm": 0.7632668730408979,
      "learning_rate": 5e-06,
      "loss": 0.5484,
      "step": 380
    },
    {
      "epoch": 1.167446211412535,
      "grad_norm": 0.6939352572415957,
      "learning_rate": 5e-06,
      "loss": 0.5501,
      "step": 390
    },
    {
      "epoch": 1.197380729653882,
      "grad_norm": 0.702668433385052,
      "learning_rate": 5e-06,
      "loss": 0.5501,
      "step": 400
    },
    {
      "epoch": 1.2273152478952292,
      "grad_norm": 0.6478605176631643,
      "learning_rate": 5e-06,
      "loss": 0.5524,
      "step": 410
    },
    {
      "epoch": 1.2572497661365762,
      "grad_norm": 0.7369685315266656,
      "learning_rate": 5e-06,
      "loss": 0.5567,
      "step": 420
    },
    {
      "epoch": 1.2871842843779233,
      "grad_norm": 0.6753735200396593,
      "learning_rate": 5e-06,
      "loss": 0.5514,
      "step": 430
    },
    {
      "epoch": 1.3171188026192704,
      "grad_norm": 0.6984955164701175,
      "learning_rate": 5e-06,
      "loss": 0.5534,
      "step": 440
    },
    {
      "epoch": 1.3470533208606175,
      "grad_norm": 0.6720392032007318,
      "learning_rate": 5e-06,
      "loss": 0.5475,
      "step": 450
    },
    {
      "epoch": 1.3769878391019645,
      "grad_norm": 0.6876544426761866,
      "learning_rate": 5e-06,
      "loss": 0.5581,
      "step": 460
    },
    {
      "epoch": 1.4069223573433116,
      "grad_norm": 0.7559712718072091,
      "learning_rate": 5e-06,
      "loss": 0.5573,
      "step": 470
    },
    {
      "epoch": 1.4368568755846587,
      "grad_norm": 0.7543483123276029,
      "learning_rate": 5e-06,
      "loss": 0.5592,
      "step": 480
    },
    {
      "epoch": 1.4667913938260055,
      "grad_norm": 0.670230242807614,
      "learning_rate": 5e-06,
      "loss": 0.5587,
      "step": 490
    },
    {
      "epoch": 1.4967259120673526,
      "grad_norm": 0.793302321148842,
      "learning_rate": 5e-06,
      "loss": 0.5549,
      "step": 500
    },
    {
      "epoch": 1.5266604303086997,
      "grad_norm": 0.6636753856100917,
      "learning_rate": 5e-06,
      "loss": 0.5577,
      "step": 510
    },
    {
      "epoch": 1.5565949485500468,
      "grad_norm": 0.6821579825224984,
      "learning_rate": 5e-06,
      "loss": 0.5557,
      "step": 520
    },
    {
      "epoch": 1.5865294667913938,
      "grad_norm": 0.7552762189905546,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 530
    },
    {
      "epoch": 1.616463985032741,
      "grad_norm": 0.6729733301722461,
      "learning_rate": 5e-06,
      "loss": 0.5599,
      "step": 540
    },
    {
      "epoch": 1.646398503274088,
      "grad_norm": 0.631104770625896,
      "learning_rate": 5e-06,
      "loss": 0.5593,
      "step": 550
    },
    {
      "epoch": 1.6763330215154348,
      "grad_norm": 0.6474672113131026,
      "learning_rate": 5e-06,
      "loss": 0.5499,
      "step": 560
    },
    {
      "epoch": 1.706267539756782,
      "grad_norm": 0.8000532776371828,
      "learning_rate": 5e-06,
      "loss": 0.5517,
      "step": 570
    },
    {
      "epoch": 1.736202057998129,
      "grad_norm": 0.7694340841619645,
      "learning_rate": 5e-06,
      "loss": 0.5529,
      "step": 580
    },
    {
      "epoch": 1.766136576239476,
      "grad_norm": 0.6621277045994409,
      "learning_rate": 5e-06,
      "loss": 0.5558,
      "step": 590
    },
    {
      "epoch": 1.7960710944808231,
      "grad_norm": 0.715383547152723,
      "learning_rate": 5e-06,
      "loss": 0.5565,
      "step": 600
    },
    {
      "epoch": 1.8260056127221702,
      "grad_norm": 0.625130733885464,
      "learning_rate": 5e-06,
      "loss": 0.5542,
      "step": 610
    },
    {
      "epoch": 1.8559401309635173,
      "grad_norm": 0.6823486917509712,
      "learning_rate": 5e-06,
      "loss": 0.5524,
      "step": 620
    },
    {
      "epoch": 1.8858746492048644,
      "grad_norm": 0.6881561256441558,
      "learning_rate": 5e-06,
      "loss": 0.5542,
      "step": 630
    },
    {
      "epoch": 1.9158091674462114,
      "grad_norm": 0.7622942086873891,
      "learning_rate": 5e-06,
      "loss": 0.5557,
      "step": 640
    },
    {
      "epoch": 1.9457436856875585,
      "grad_norm": 0.665050257763723,
      "learning_rate": 5e-06,
      "loss": 0.5539,
      "step": 650
    },
    {
      "epoch": 1.9756782039289056,
      "grad_norm": 0.7758483897762515,
      "learning_rate": 5e-06,
      "loss": 0.5587,
      "step": 660
    },
    {
      "epoch": 1.999625818521983,
      "eval_loss": 0.6155834197998047,
      "eval_runtime": 517.7627,
      "eval_samples_per_second": 17.384,
      "eval_steps_per_second": 0.545,
      "step": 668
    },
    {
      "epoch": 1.999625818521983,
      "step": 668,
      "total_flos": 2545315932536832.0,
      "train_loss": 0.5962966223676761,
      "train_runtime": 60653.8853,
      "train_samples_per_second": 5.639,
      "train_steps_per_second": 0.011
    }
  ],
  "logging_steps": 10,
  "max_steps": 668,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2545315932536832.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}