| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 50, |
| "global_step": 448, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0447427293064877, |
| "grad_norm": 3.599489450454712, |
| "learning_rate": 4e-07, |
| "loss": 1.8696, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0894854586129754, |
| "grad_norm": 5.350953578948975, |
| "learning_rate": 8.444444444444444e-07, |
| "loss": 1.7847, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.1342281879194631, |
| "grad_norm": 6.310288429260254, |
| "learning_rate": 1.2888888888888889e-06, |
| "loss": 1.7585, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1789709172259508, |
| "grad_norm": 3.629427194595337, |
| "learning_rate": 1.7333333333333334e-06, |
| "loss": 1.5704, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.22371364653243847, |
| "grad_norm": 2.6975929737091064, |
| "learning_rate": 1.999513878924193e-06, |
| "loss": 1.6308, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.22371364653243847, |
| "eval_loss": 1.5776370763778687, |
| "eval_runtime": 14.1686, |
| "eval_samples_per_second": 13.198, |
| "eval_steps_per_second": 6.634, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2684563758389262, |
| "grad_norm": 1.9512403011322021, |
| "learning_rate": 1.994050443200529e-06, |
| "loss": 1.4086, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.3131991051454139, |
| "grad_norm": 1.2651467323303223, |
| "learning_rate": 1.9825492157072085e-06, |
| "loss": 1.3447, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3579418344519016, |
| "grad_norm": 0.7104034423828125, |
| "learning_rate": 1.96508005408292e-06, |
| "loss": 1.1841, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.40268456375838924, |
| "grad_norm": 0.6381909251213074, |
| "learning_rate": 1.9417490647742737e-06, |
| "loss": 1.1744, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.44742729306487694, |
| "grad_norm": 0.6013015508651733, |
| "learning_rate": 1.9126979585527774e-06, |
| "loss": 1.288, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.44742729306487694, |
| "eval_loss": 1.3293598890304565, |
| "eval_runtime": 14.151, |
| "eval_samples_per_second": 13.215, |
| "eval_steps_per_second": 6.643, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.49217002237136465, |
| "grad_norm": 0.5303295254707336, |
| "learning_rate": 1.878103189773686e-06, |
| "loss": 1.259, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5369127516778524, |
| "grad_norm": 0.423528790473938, |
| "learning_rate": 1.8381748846047758e-06, |
| "loss": 1.1353, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5816554809843401, |
| "grad_norm": 0.5351876616477966, |
| "learning_rate": 1.7931555647349358e-06, |
| "loss": 1.2594, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6263982102908278, |
| "grad_norm": 0.4135826528072357, |
| "learning_rate": 1.7433186743146559e-06, |
| "loss": 1.125, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6711409395973155, |
| "grad_norm": 0.6532828211784363, |
| "learning_rate": 1.6889669190756866e-06, |
| "loss": 1.2624, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6711409395973155, |
| "eval_loss": 1.258256196975708, |
| "eval_runtime": 14.4513, |
| "eval_samples_per_second": 12.94, |
| "eval_steps_per_second": 6.505, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.7158836689038032, |
| "grad_norm": 0.5233774781227112, |
| "learning_rate": 1.6304304277179263e-06, |
| "loss": 1.1791, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.7606263982102909, |
| "grad_norm": 0.5228462815284729, |
| "learning_rate": 1.5680647467311555e-06, |
| "loss": 1.1385, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.8053691275167785, |
| "grad_norm": 0.5080588459968567, |
| "learning_rate": 1.5022486808309168e-06, |
| "loss": 1.086, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.8501118568232662, |
| "grad_norm": 0.4504516124725342, |
| "learning_rate": 1.4333819921255834e-06, |
| "loss": 1.2652, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8948545861297539, |
| "grad_norm": 0.3919249475002289, |
| "learning_rate": 1.3618829719897156e-06, |
| "loss": 1.0429, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8948545861297539, |
| "eval_loss": 1.2074676752090454, |
| "eval_runtime": 14.3686, |
| "eval_samples_per_second": 13.014, |
| "eval_steps_per_second": 6.542, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9395973154362416, |
| "grad_norm": 0.5481575131416321, |
| "learning_rate": 1.2881859003919686e-06, |
| "loss": 1.1509, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9843400447427293, |
| "grad_norm": 0.486341655254364, |
| "learning_rate": 1.2127384081094166e-06, |
| "loss": 1.1126, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.0268456375838926, |
| "grad_norm": 0.4611228108406067, |
| "learning_rate": 1.1359987578500148e-06, |
| "loss": 1.0522, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.0715883668903803, |
| "grad_norm": 0.5306360125541687, |
| "learning_rate": 1.0584330607974673e-06, |
| "loss": 1.1516, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.116331096196868, |
| "grad_norm": 0.44238486886024475, |
| "learning_rate": 9.805124454850148e-07, |
| "loss": 1.0329, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.116331096196868, |
| "eval_loss": 1.169242262840271, |
| "eval_runtime": 14.3254, |
| "eval_samples_per_second": 13.054, |
| "eval_steps_per_second": 6.562, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.1610738255033557, |
| "grad_norm": 0.46727898716926575, |
| "learning_rate": 9.027101961941923e-07, |
| "loss": 0.9688, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.2058165548098434, |
| "grad_norm": 0.45750078558921814, |
| "learning_rate": 8.254988782597293e-07, |
| "loss": 1.1761, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.250559284116331, |
| "grad_norm": 0.39371541142463684, |
| "learning_rate": 7.493474677412793e-07, |
| "loss": 1.0729, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.2953020134228188, |
| "grad_norm": 0.5865498781204224, |
| "learning_rate": 6.747185028961523e-07, |
| "loss": 1.1015, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.3400447427293065, |
| "grad_norm": 0.43303340673446655, |
| "learning_rate": 6.020652747548007e-07, |
| "loss": 1.1735, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.3400447427293065, |
| "eval_loss": 1.1453677415847778, |
| "eval_runtime": 14.3243, |
| "eval_samples_per_second": 13.055, |
| "eval_steps_per_second": 6.562, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.3847874720357942, |
| "grad_norm": 0.5659552812576294, |
| "learning_rate": 5.31829073863304e-07, |
| "loss": 1.0261, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.429530201342282, |
| "grad_norm": 0.5933849811553955, |
| "learning_rate": 4.644365099159442e-07, |
| "loss": 1.0097, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.4742729306487696, |
| "grad_norm": 0.4996931552886963, |
| "learning_rate": 4.002969205582313e-07, |
| "loss": 1.0907, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.5190156599552571, |
| "grad_norm": 0.6406461000442505, |
| "learning_rate": 3.3979988509912437e-07, |
| "loss": 1.1422, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.563758389261745, |
| "grad_norm": 0.5414783358573914, |
| "learning_rate": 2.833128582339887e-07, |
| "loss": 0.9546, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.563758389261745, |
| "eval_loss": 1.1337639093399048, |
| "eval_runtime": 14.3758, |
| "eval_samples_per_second": 13.008, |
| "eval_steps_per_second": 6.539, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.6085011185682325, |
| "grad_norm": 0.5735450983047485, |
| "learning_rate": 2.3117893815088062e-07, |
| "loss": 1.0688, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.6532438478747205, |
| "grad_norm": 0.5685056447982788, |
| "learning_rate": 1.8371478257652906e-07, |
| "loss": 0.998, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.697986577181208, |
| "grad_norm": 0.585393488407135, |
| "learning_rate": 1.4120868541980025e-07, |
| "loss": 1.1943, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.7427293064876959, |
| "grad_norm": 0.36039209365844727, |
| "learning_rate": 1.0391882569497757e-07, |
| "loss": 0.976, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.7874720357941833, |
| "grad_norm": 0.5943603515625, |
| "learning_rate": 7.207169936076973e-08, |
| "loss": 1.0498, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.7874720357941833, |
| "eval_loss": 1.129286527633667, |
| "eval_runtime": 14.2371, |
| "eval_samples_per_second": 13.135, |
| "eval_steps_per_second": 6.602, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.8322147651006713, |
| "grad_norm": 0.5557639598846436, |
| "learning_rate": 4.586074359995118e-08, |
| "loss": 1.0416, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.8769574944071588, |
| "grad_norm": 0.6227531433105469, |
| "learning_rate": 2.544516189565482e-08, |
| "loss": 1.1418, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.9217002237136467, |
| "grad_norm": 0.5767893195152283, |
| "learning_rate": 1.094895704072707e-08, |
| "loss": 0.9902, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.9664429530201342, |
| "grad_norm": 0.5177537798881531, |
| "learning_rate": 2.460177953573339e-09, |
| "loss": 1.0165, |
| "step": 440 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 448, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.075251445555814e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|