{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 301,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0033222591362126247,
      "grad_norm": 50.78891372680664,
      "learning_rate": 6.451612903225806e-06,
      "loss": 20.4635,
      "step": 1
    },
    {
      "epoch": 0.016611295681063124,
      "grad_norm": 50.44277572631836,
      "learning_rate": 3.2258064516129034e-05,
      "loss": 20.8825,
      "step": 5
    },
    {
      "epoch": 0.03322259136212625,
      "grad_norm": 21.746339797973633,
      "learning_rate": 6.451612903225807e-05,
      "loss": 18.4425,
      "step": 10
    },
    {
      "epoch": 0.04983388704318937,
      "grad_norm": 8.766548156738281,
      "learning_rate": 9.677419354838711e-05,
      "loss": 16.2833,
      "step": 15
    },
    {
      "epoch": 0.0664451827242525,
      "grad_norm": 6.890467643737793,
      "learning_rate": 0.00012903225806451613,
      "loss": 14.6695,
      "step": 20
    },
    {
      "epoch": 0.08305647840531562,
      "grad_norm": 3.6492936611175537,
      "learning_rate": 0.00016129032258064516,
      "loss": 13.4213,
      "step": 25
    },
    {
      "epoch": 0.09966777408637874,
      "grad_norm": 3.308103561401367,
      "learning_rate": 0.00019354838709677422,
      "loss": 12.7739,
      "step": 30
    },
    {
      "epoch": 0.11627906976744186,
      "grad_norm": 6.093023777008057,
      "learning_rate": 0.0001998917111338525,
      "loss": 11.916,
      "step": 35
    },
    {
      "epoch": 0.132890365448505,
      "grad_norm": 11.974358558654785,
      "learning_rate": 0.00019945218953682734,
      "loss": 11.0787,
      "step": 40
    },
    {
      "epoch": 0.14950166112956811,
      "grad_norm": 20.57221221923828,
      "learning_rate": 0.00019867615321125795,
      "loss": 9.0421,
      "step": 45
    },
    {
      "epoch": 0.16611295681063123,
      "grad_norm": 25.553680419921875,
      "learning_rate": 0.00019756622801842143,
      "loss": 6.0272,
      "step": 50
    },
    {
      "epoch": 0.18272425249169436,
      "grad_norm": 7.413240432739258,
      "learning_rate": 0.0001961261695938319,
      "loss": 3.1666,
      "step": 55
    },
    {
      "epoch": 0.19933554817275748,
      "grad_norm": 6.828798294067383,
      "learning_rate": 0.00019436085063935835,
      "loss": 2.2143,
      "step": 60
    },
    {
      "epoch": 0.2159468438538206,
      "grad_norm": 4.01786470413208,
      "learning_rate": 0.00019227624443554425,
      "loss": 1.9151,
      "step": 65
    },
    {
      "epoch": 0.23255813953488372,
      "grad_norm": 2.935255765914917,
      "learning_rate": 0.0001898794046299167,
      "loss": 1.6848,
      "step": 70
    },
    {
      "epoch": 0.24916943521594684,
      "grad_norm": 1.8950200080871582,
      "learning_rate": 0.00018717844136967624,
      "loss": 1.5136,
      "step": 75
    },
    {
      "epoch": 0.26578073089701,
      "grad_norm": 0.5441950559616089,
      "learning_rate": 0.00018418249385952575,
      "loss": 1.4275,
      "step": 80
    },
    {
      "epoch": 0.2823920265780731,
      "grad_norm": 0.7461830377578735,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.3593,
      "step": 85
    },
    {
      "epoch": 0.29900332225913623,
      "grad_norm": 0.5518223643302917,
      "learning_rate": 0.0001773471592733964,
      "loss": 1.2975,
      "step": 90
    },
    {
      "epoch": 0.31561461794019935,
      "grad_norm": 0.4475560486316681,
      "learning_rate": 0.0001735309008059829,
      "loss": 1.2742,
      "step": 95
    },
    {
      "epoch": 0.33222591362126247,
      "grad_norm": 0.6053428053855896,
      "learning_rate": 0.00016946583704589973,
      "loss": 1.2478,
      "step": 100
    },
    {
      "epoch": 0.3488372093023256,
      "grad_norm": 0.7606108784675598,
      "learning_rate": 0.00016516572288214552,
      "loss": 1.2128,
      "step": 105
    },
    {
      "epoch": 0.3654485049833887,
      "grad_norm": 0.4854182004928589,
      "learning_rate": 0.00016064510853988138,
      "loss": 1.2048,
      "step": 110
    },
    {
      "epoch": 0.38205980066445183,
      "grad_norm": 1.0294697284698486,
      "learning_rate": 0.0001559192903470747,
      "loss": 1.1671,
      "step": 115
    },
    {
      "epoch": 0.39867109634551495,
      "grad_norm": 0.45866039395332336,
      "learning_rate": 0.00015100425897656753,
      "loss": 1.1478,
      "step": 120
    },
    {
      "epoch": 0.4152823920265781,
      "grad_norm": 0.8675262331962585,
      "learning_rate": 0.00014591664533870118,
      "loss": 1.13,
      "step": 125
    },
    {
      "epoch": 0.4318936877076412,
      "grad_norm": 0.4108262062072754,
      "learning_rate": 0.00014067366430758004,
      "loss": 1.1204,
      "step": 130
    },
    {
      "epoch": 0.4485049833887043,
      "grad_norm": 0.5368378758430481,
      "learning_rate": 0.00013529305647138687,
      "loss": 1.115,
      "step": 135
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 0.7188141942024231,
      "learning_rate": 0.0001297930281038482,
      "loss": 1.1142,
      "step": 140
    },
    {
      "epoch": 0.48172757475083056,
      "grad_norm": 0.922228217124939,
      "learning_rate": 0.00012419218955996676,
      "loss": 1.098,
      "step": 145
    },
    {
      "epoch": 0.4983388704318937,
      "grad_norm": 0.645747721195221,
      "learning_rate": 0.00011850949230447145,
      "loss": 1.0962,
      "step": 150
    },
    {
      "epoch": 0.5149501661129569,
      "grad_norm": 0.8197333812713623,
      "learning_rate": 0.00011276416478605949,
      "loss": 1.0953,
      "step": 155
    },
    {
      "epoch": 0.53156146179402,
      "grad_norm": 0.6454654932022095,
      "learning_rate": 0.00010697564737441252,
      "loss": 1.0993,
      "step": 160
    },
    {
      "epoch": 0.5481727574750831,
      "grad_norm": 1.0970648527145386,
      "learning_rate": 0.00010116352658013973,
      "loss": 1.0815,
      "step": 165
    },
    {
      "epoch": 0.5647840531561462,
      "grad_norm": 0.427190899848938,
      "learning_rate": 9.534746878022534e-05,
      "loss": 1.0915,
      "step": 170
    },
    {
      "epoch": 0.5813953488372093,
      "grad_norm": 0.40045276284217834,
      "learning_rate": 8.954715367323468e-05,
      "loss": 1.0804,
      "step": 175
    },
    {
      "epoch": 0.5980066445182725,
      "grad_norm": 0.8688796758651733,
      "learning_rate": 8.378220768944327e-05,
      "loss": 1.0561,
      "step": 180
    },
    {
      "epoch": 0.6146179401993356,
      "grad_norm": 0.6383819580078125,
      "learning_rate": 7.807213758120966e-05,
      "loss": 1.0619,
      "step": 185
    },
    {
      "epoch": 0.6312292358803987,
      "grad_norm": 0.624631404876709,
      "learning_rate": 7.243626441830009e-05,
      "loss": 1.0538,
      "step": 190
    },
    {
      "epoch": 0.6478405315614618,
      "grad_norm": 0.43649202585220337,
      "learning_rate": 6.68936582115042e-05,
      "loss": 1.0518,
      "step": 195
    },
    {
      "epoch": 0.6644518272425249,
      "grad_norm": 0.7624740600585938,
      "learning_rate": 6.146307338575519e-05,
      "loss": 1.0323,
      "step": 200
    },
    {
      "epoch": 0.6810631229235881,
      "grad_norm": 0.5153250098228455,
      "learning_rate": 5.616288532109225e-05,
      "loss": 1.0534,
      "step": 205
    },
    {
      "epoch": 0.6976744186046512,
      "grad_norm": 0.5389082431793213,
      "learning_rate": 5.101102817619131e-05,
      "loss": 1.0411,
      "step": 210
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.539851725101471,
      "learning_rate": 4.6024934204848745e-05,
      "loss": 1.0291,
      "step": 215
    },
    {
      "epoch": 0.7308970099667774,
      "grad_norm": 0.5766161680221558,
      "learning_rate": 4.12214747707527e-05,
      "loss": 1.0463,
      "step": 220
    },
    {
      "epoch": 0.7475083056478405,
      "grad_norm": 0.4088208079338074,
      "learning_rate": 3.661690326012897e-05,
      "loss": 1.0435,
      "step": 225
    },
    {
      "epoch": 0.7641196013289037,
      "grad_norm": 0.5515505075454712,
      "learning_rate": 3.222680008542678e-05,
      "loss": 1.0276,
      "step": 230
    },
    {
      "epoch": 0.7807308970099668,
      "grad_norm": 0.3717176914215088,
      "learning_rate": 2.8066019966134904e-05,
      "loss": 1.0297,
      "step": 235
    },
    {
      "epoch": 0.7973421926910299,
      "grad_norm": 0.42352986335754395,
      "learning_rate": 2.4148641665113113e-05,
      "loss": 1.0233,
      "step": 240
    },
    {
      "epoch": 0.813953488372093,
      "grad_norm": 0.5951583385467529,
      "learning_rate": 2.0487920350515212e-05,
      "loss": 1.0306,
      "step": 245
    },
    {
      "epoch": 0.8305647840531561,
      "grad_norm": 0.4466446042060852,
      "learning_rate": 1.7096242744495837e-05,
      "loss": 1.031,
      "step": 250
    },
    {
      "epoch": 0.8471760797342193,
      "grad_norm": 0.4255996644496918,
      "learning_rate": 1.3985085210463477e-05,
      "loss": 1.0327,
      "step": 255
    },
    {
      "epoch": 0.8637873754152824,
      "grad_norm": 0.4321277141571045,
      "learning_rate": 1.116497492069961e-05,
      "loss": 1.0231,
      "step": 260
    },
    {
      "epoch": 0.8803986710963455,
      "grad_norm": 0.47173938155174255,
      "learning_rate": 8.645454235739903e-06,
      "loss": 1.0402,
      "step": 265
    },
    {
      "epoch": 0.8970099667774086,
      "grad_norm": 0.46528318524360657,
      "learning_rate": 6.435048416046863e-06,
      "loss": 1.03,
      "step": 270
    },
    {
      "epoch": 0.9136212624584718,
      "grad_norm": 0.5179603099822998,
      "learning_rate": 4.541236775226809e-06,
      "loss": 1.025,
      "step": 275
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 0.4209299385547638,
      "learning_rate": 2.970427372400353e-06,
      "loss": 1.0186,
      "step": 280
    },
    {
      "epoch": 0.946843853820598,
      "grad_norm": 0.37040743231773376,
      "learning_rate": 1.7279353293586765e-06,
      "loss": 1.024,
      "step": 285
    },
    {
      "epoch": 0.9634551495016611,
      "grad_norm": 0.37352651357650757,
      "learning_rate": 8.17964845873831e-07,
      "loss": 1.0293,
      "step": 290
    },
    {
      "epoch": 0.9800664451827242,
      "grad_norm": 0.41717350482940674,
      "learning_rate": 2.4359497401758024e-07,
      "loss": 1.0289,
      "step": 295
    },
    {
      "epoch": 0.9966777408637874,
      "grad_norm": 0.497938334941864,
      "learning_rate": 6.769199623779532e-09,
      "loss": 1.0272,
      "step": 300
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.7453839778900146,
      "eval_runtime": 0.4824,
      "eval_samples_per_second": 20.73,
      "eval_steps_per_second": 2.073,
      "step": 301
    },
    {
      "epoch": 1.0,
      "step": 301,
      "total_flos": 9.178029324081562e+17,
      "train_loss": 3.2307936707604368,
      "train_runtime": 823.5769,
      "train_samples_per_second": 46.729,
      "train_steps_per_second": 0.365
    }
  ],
  "logging_steps": 5,
  "max_steps": 301,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.178029324081562e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}