{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 50, "global_step": 744, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026899798251513115, "grad_norm": 29.746873221105215, "learning_rate": 1.894736842105263e-06, "loss": 1.8652, "step": 10 }, { "epoch": 0.05379959650302623, "grad_norm": 2.6569803505324483, "learning_rate": 4e-06, "loss": 0.9876, "step": 20 }, { "epoch": 0.08069939475453934, "grad_norm": 1.866327801527443, "learning_rate": 6.105263157894737e-06, "loss": 0.7748, "step": 30 }, { "epoch": 0.10759919300605246, "grad_norm": 1.5350711741145153, "learning_rate": 7.999960397771768e-06, "loss": 0.7036, "step": 40 }, { "epoch": 0.13449899125756556, "grad_norm": 1.4235434017069168, "learning_rate": 7.995209079154332e-06, "loss": 0.6701, "step": 50 }, { "epoch": 0.13449899125756556, "eval_loss": 0.6571411490440369, "eval_runtime": 20.3665, "eval_samples_per_second": 23.863, "eval_steps_per_second": 0.786, "step": 50 }, { "epoch": 0.16139878950907868, "grad_norm": 1.5470132855221335, "learning_rate": 7.982548093693699e-06, "loss": 0.6427, "step": 60 }, { "epoch": 0.1882985877605918, "grad_norm": 1.4003108383520437, "learning_rate": 7.962002507456483e-06, "loss": 0.6309, "step": 70 }, { "epoch": 0.21519838601210492, "grad_norm": 1.411199665593455, "learning_rate": 7.933612996347003e-06, "loss": 0.6348, "step": 80 }, { "epoch": 0.242098184263618, "grad_norm": 1.3778290968625495, "learning_rate": 7.897435765577615e-06, "loss": 0.6165, "step": 90 }, { "epoch": 0.26899798251513113, "grad_norm": 1.4467043447342405, "learning_rate": 7.853542438394323e-06, "loss": 0.5984, "step": 100 }, { "epoch": 0.26899798251513113, "eval_loss": 0.6147489547729492, "eval_runtime": 20.4764, "eval_samples_per_second": 23.735, "eval_steps_per_second": 0.781, "step": 100 }, { "epoch": 0.29589778076664425, "grad_norm": 1.4148215321149555, "learning_rate": 7.802019914277922e-06, "loss": 0.6115, "step": 110 }, { "epoch": 0.32279757901815737, "grad_norm": 1.566581827874729, "learning_rate": 7.742970196901463e-06, "loss": 0.6062, "step": 120 }, { "epoch": 0.3496973772696705, "grad_norm": 1.3254884536596998, "learning_rate": 7.676510192184609e-06, "loss": 0.6006, "step": 130 }, { "epoch": 0.3765971755211836, "grad_norm": 1.447517875694622, "learning_rate": 7.602771476844694e-06, "loss": 0.5898, "step": 140 }, { "epoch": 0.4034969737726967, "grad_norm": 1.3445376199324792, "learning_rate": 7.5219000379027296e-06, "loss": 0.5934, "step": 150 }, { "epoch": 0.4034969737726967, "eval_loss": 0.5962715148925781, "eval_runtime": 20.1716, "eval_samples_per_second": 24.093, "eval_steps_per_second": 0.793, "step": 150 }, { "epoch": 0.43039677202420984, "grad_norm": 1.3246907469950613, "learning_rate": 7.434055983660057e-06, "loss": 0.5993, "step": 160 }, { "epoch": 0.45729657027572296, "grad_norm": 1.3467258206153316, "learning_rate": 7.339413226717854e-06, "loss": 0.5915, "step": 170 }, { "epoch": 0.484196368527236, "grad_norm": 1.458985018698563, "learning_rate": 7.23815913966707e-06, "loss": 0.5881, "step": 180 }, { "epoch": 0.5110961667787491, "grad_norm": 1.2485290337068158, "learning_rate": 7.130494184130416e-06, "loss": 0.5887, "step": 190 }, { "epoch": 0.5379959650302623, "grad_norm": 1.3124694415167568, "learning_rate": 7.016631513890864e-06, "loss": 0.5855, "step": 200 }, { "epoch": 0.5379959650302623, "eval_loss": 0.5851770043373108, "eval_runtime": 20.3049, "eval_samples_per_second": 23.935, "eval_steps_per_second": 0.788, "step": 200 }, { "epoch": 0.5648957632817754, "grad_norm": 1.4701321084658303, "learning_rate": 6.896796552892348e-06, "loss": 0.5833, "step": 210 }, { "epoch": 0.5917955615332885, "grad_norm": 1.4050016198469426, "learning_rate": 6.771226548948162e-06, "loss": 0.5812, "step": 220 }, { "epoch": 0.6186953597848016, "grad_norm": 1.4295194453088238, "learning_rate": 6.64017010404058e-06, "loss": 0.5822, "step": 230 }, { "epoch": 0.6455951580363147, "grad_norm": 7.967159671444082, "learning_rate": 6.503886682141661e-06, "loss": 0.573, "step": 240 }, { "epoch": 0.6724949562878278, "grad_norm": 1.2442332016904551, "learning_rate": 6.3626460955295895e-06, "loss": 0.5837, "step": 250 }, { "epoch": 0.6724949562878278, "eval_loss": 0.5775083303451538, "eval_runtime": 20.6713, "eval_samples_per_second": 23.511, "eval_steps_per_second": 0.774, "step": 250 }, { "epoch": 0.699394754539341, "grad_norm": 1.3850365164976395, "learning_rate": 6.2167279706175765e-06, "loss": 0.5726, "step": 260 }, { "epoch": 0.7262945527908541, "grad_norm": 1.2304207907101277, "learning_rate": 6.066421194352859e-06, "loss": 0.5565, "step": 270 }, { "epoch": 0.7531943510423672, "grad_norm": 1.2714930735972703, "learning_rate": 5.912023342281789e-06, "loss": 0.5641, "step": 280 }, { "epoch": 0.7800941492938803, "grad_norm": 1.460856311527336, "learning_rate": 5.753840089413357e-06, "loss": 0.566, "step": 290 }, { "epoch": 0.8069939475453934, "grad_norm": 1.5514244928702674, "learning_rate": 5.592184605047483e-06, "loss": 0.557, "step": 300 }, { "epoch": 0.8069939475453934, "eval_loss": 0.5692653656005859, "eval_runtime": 20.3766, "eval_samples_per_second": 23.851, "eval_steps_per_second": 0.785, "step": 300 }, { "epoch": 0.8338937457969066, "grad_norm": 1.360092781672721, "learning_rate": 5.427376932766216e-06, "loss": 0.5657, "step": 310 }, { "epoch": 0.8607935440484197, "grad_norm": 1.5946830149551399, "learning_rate": 5.259743356815289e-06, "loss": 0.574, "step": 320 }, { "epoch": 0.8876933422999328, "grad_norm": 1.2977931263056453, "learning_rate": 5.089615756130505e-06, "loss": 0.5549, "step": 330 }, { "epoch": 0.9145931405514459, "grad_norm": 1.7677476514254604, "learning_rate": 4.917330947287818e-06, "loss": 0.5546, "step": 340 }, { "epoch": 0.9414929388029589, "grad_norm": 1.365159498913211, "learning_rate": 4.743230017677918e-06, "loss": 0.5525, "step": 350 }, { "epoch": 0.9414929388029589, "eval_loss": 0.5623395442962646, "eval_runtime": 20.4098, "eval_samples_per_second": 23.812, "eval_steps_per_second": 0.784, "step": 350 }, { "epoch": 0.968392737054472, "grad_norm": 1.4020192853941864, "learning_rate": 4.567657650225538e-06, "loss": 0.5491, "step": 360 }, { "epoch": 0.9952925353059852, "grad_norm": 1.2810961882383074, "learning_rate": 4.390961440990333e-06, "loss": 0.5617, "step": 370 }, { "epoch": 1.0215198386012105, "grad_norm": 1.4422549713674222, "learning_rate": 4.213491211000394e-06, "loss": 0.4705, "step": 380 }, { "epoch": 1.0484196368527237, "grad_norm": 1.339607087215748, "learning_rate": 4.035598313680784e-06, "loss": 0.472, "step": 390 }, { "epoch": 1.0753194351042368, "grad_norm": 1.3004229760969053, "learning_rate": 3.8576349392482585e-06, "loss": 0.4579, "step": 400 }, { "epoch": 1.0753194351042368, "eval_loss": 0.5656763315200806, "eval_runtime": 20.3701, "eval_samples_per_second": 23.858, "eval_steps_per_second": 0.785, "step": 400 }, { "epoch": 1.10221923335575, "grad_norm": 1.2705614543341326, "learning_rate": 3.67995341744931e-06, "loss": 0.4475, "step": 410 }, { "epoch": 1.129119031607263, "grad_norm": 1.3779121525385867, "learning_rate": 3.5029055200219857e-06, "loss": 0.4596, "step": 420 }, { "epoch": 1.1560188298587761, "grad_norm": 1.3453138554439108, "learning_rate": 3.326841764262423e-06, "loss": 0.4427, "step": 430 }, { "epoch": 1.1829186281102892, "grad_norm": 1.3661390821897004, "learning_rate": 3.1521107190749343e-06, "loss": 0.4531, "step": 440 }, { "epoch": 1.2098184263618024, "grad_norm": 1.283068671688067, "learning_rate": 2.9790583148794834e-06, "loss": 0.4454, "step": 450 }, { "epoch": 1.2098184263618024, "eval_loss": 0.5635669231414795, "eval_runtime": 20.4014, "eval_samples_per_second": 23.822, "eval_steps_per_second": 0.784, "step": 450 }, { "epoch": 1.2367182246133155, "grad_norm": 1.387077223080061, "learning_rate": 2.808027158742806e-06, "loss": 0.4504, "step": 460 }, { "epoch": 1.2636180228648284, "grad_norm": 1.3934948532441245, "learning_rate": 2.6393558560890605e-06, "loss": 0.4372, "step": 470 }, { "epoch": 1.2905178211163415, "grad_norm": 1.3508965770850627, "learning_rate": 2.4733783403328845e-06, "loss": 0.4459, "step": 480 }, { "epoch": 1.3174176193678546, "grad_norm": 1.3563554813682979, "learning_rate": 2.3104232117620433e-06, "loss": 0.4511, "step": 490 }, { "epoch": 1.3443174176193677, "grad_norm": 1.2997155659311739, "learning_rate": 2.150813086978535e-06, "loss": 0.443, "step": 500 }, { "epoch": 1.3443174176193677, "eval_loss": 0.5615652799606323, "eval_runtime": 20.1025, "eval_samples_per_second": 24.176, "eval_steps_per_second": 0.796, "step": 500 }, { "epoch": 1.3712172158708809, "grad_norm": 1.2836148832407157, "learning_rate": 1.9948639601861227e-06, "loss": 0.4511, "step": 510 }, { "epoch": 1.398117014122394, "grad_norm": 1.3107604919378975, "learning_rate": 1.8428845775888169e-06, "loss": 0.4412, "step": 520 }, { "epoch": 1.425016812373907, "grad_norm": 1.317635662287699, "learning_rate": 1.6951758261388555e-06, "loss": 0.4506, "step": 530 }, { "epoch": 1.4519166106254202, "grad_norm": 1.2801225745819735, "learning_rate": 1.5520301378443373e-06, "loss": 0.4421, "step": 540 }, { "epoch": 1.4788164088769333, "grad_norm": 1.3520095118040596, "learning_rate": 1.4137309108158554e-06, "loss": 0.4443, "step": 550 }, { "epoch": 1.4788164088769333, "eval_loss": 0.5601685643196106, "eval_runtime": 20.2436, "eval_samples_per_second": 24.008, "eval_steps_per_second": 0.79, "step": 550 }, { "epoch": 1.5057162071284464, "grad_norm": 1.3661516182508175, "learning_rate": 1.2805519481983216e-06, "loss": 0.4397, "step": 560 }, { "epoch": 1.5326160053799596, "grad_norm": 1.2619493983661074, "learning_rate": 1.1527569160988053e-06, "loss": 0.4517, "step": 570 }, { "epoch": 1.5595158036314727, "grad_norm": 1.3136850406090343, "learning_rate": 1.0305988215835468e-06, "loss": 0.4459, "step": 580 }, { "epoch": 1.5864156018829858, "grad_norm": 1.3675820589439671, "learning_rate": 9.143195117776081e-07, "loss": 0.4367, "step": 590 }, { "epoch": 1.613315400134499, "grad_norm": 1.2781582138814331, "learning_rate": 8.041491950588457e-07, "loss": 0.4422, "step": 600 }, { "epoch": 1.613315400134499, "eval_loss": 0.5570077896118164, "eval_runtime": 20.3295, "eval_samples_per_second": 23.906, "eval_steps_per_second": 0.787, "step": 600 }, { "epoch": 1.640215198386012, "grad_norm": 1.272071588739883, "learning_rate": 7.003059852941429e-07, "loss": 0.4483, "step": 610 }, { "epoch": 1.6671149966375252, "grad_norm": 1.4222402008140032, "learning_rate": 6.029954700201938e-07, "loss": 0.427, "step": 620 }, { "epoch": 1.6940147948890383, "grad_norm": 1.2265154550516224, "learning_rate": 5.124103034237804e-07, "loss": 0.4319, "step": 630 }, { "epoch": 1.7209145931405514, "grad_norm": 1.2705738648920262, "learning_rate": 4.2872982492732256e-07, "loss": 0.4433, "step": 640 }, { "epoch": 1.7478143913920645, "grad_norm": 1.3331198275976817, "learning_rate": 3.521197041348576e-07, "loss": 0.4462, "step": 650 }, { "epoch": 1.7478143913920645, "eval_loss": 0.5559241771697998, "eval_runtime": 20.3664, "eval_samples_per_second": 23.863, "eval_steps_per_second": 0.786, "step": 650 }, { "epoch": 1.7747141896435776, "grad_norm": 1.2371719113362185, "learning_rate": 2.827316128413475e-07, "loss": 0.4447, "step": 660 }, { "epoch": 1.8016139878950908, "grad_norm": 1.244858103875357, "learning_rate": 2.2070292475468677e-07, "loss": 0.4358, "step": 670 }, { "epoch": 1.8285137861466039, "grad_norm": 1.2807189158760035, "learning_rate": 1.6615644352488923e-07, "loss": 0.4503, "step": 680 }, { "epoch": 1.855413584398117, "grad_norm": 1.3405147314829868, "learning_rate": 1.1920015961889785e-07, "loss": 0.4375, "step": 690 }, { "epoch": 1.88231338264963, "grad_norm": 1.3156533030443653, "learning_rate": 7.992703652236122e-08, "loss": 0.4404, "step": 700 }, { "epoch": 1.88231338264963, "eval_loss": 0.554787814617157, "eval_runtime": 20.3816, "eval_samples_per_second": 23.845, "eval_steps_per_second": 0.785, "step": 700 }, { "epoch": 1.9092131809011432, "grad_norm": 1.3136336467093748, "learning_rate": 4.8414826691641985e-08, "loss": 0.4459, "step": 710 }, { "epoch": 1.9361129791526563, "grad_norm": 1.2848602231721438, "learning_rate": 2.4725917620438408e-08, "loss": 0.4293, "step": 720 }, { "epoch": 1.9630127774041695, "grad_norm": 1.284534380656877, "learning_rate": 8.907208325779069e-09, "loss": 0.4366, "step": 730 }, { "epoch": 1.9899125756556826, "grad_norm": 1.3165574333322239, "learning_rate": 9.900164979099735e-10, "loss": 0.4422, "step": 740 }, { "epoch": 2.0, "step": 744, "total_flos": 77049352159232.0, "train_loss": 0.5424991769175376, "train_runtime": 9734.9137, "train_samples_per_second": 4.887, "train_steps_per_second": 0.076 } ], "logging_steps": 10, "max_steps": 744, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 77049352159232.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }