{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 50,
  "global_step": 744,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.026899798251513115,
      "grad_norm": 29.746873221105215,
      "learning_rate": 1.894736842105263e-06,
      "loss": 1.8652,
      "step": 10
    },
    {
      "epoch": 0.05379959650302623,
      "grad_norm": 2.6569803505324483,
      "learning_rate": 4e-06,
      "loss": 0.9876,
      "step": 20
    },
    {
      "epoch": 0.08069939475453934,
      "grad_norm": 1.866327801527443,
      "learning_rate": 6.105263157894737e-06,
      "loss": 0.7748,
      "step": 30
    },
    {
      "epoch": 0.10759919300605246,
      "grad_norm": 1.5350711741145153,
      "learning_rate": 7.999960397771768e-06,
      "loss": 0.7036,
      "step": 40
    },
    {
      "epoch": 0.13449899125756556,
      "grad_norm": 1.4235434017069168,
      "learning_rate": 7.995209079154332e-06,
      "loss": 0.6701,
      "step": 50
    },
    {
      "epoch": 0.13449899125756556,
      "eval_loss": 0.6571411490440369,
      "eval_runtime": 20.3665,
      "eval_samples_per_second": 23.863,
      "eval_steps_per_second": 0.786,
      "step": 50
    },
    {
      "epoch": 0.16139878950907868,
      "grad_norm": 1.5470132855221335,
      "learning_rate": 7.982548093693699e-06,
      "loss": 0.6427,
      "step": 60
    },
    {
      "epoch": 0.1882985877605918,
      "grad_norm": 1.4003108383520437,
      "learning_rate": 7.962002507456483e-06,
      "loss": 0.6309,
      "step": 70
    },
    {
      "epoch": 0.21519838601210492,
      "grad_norm": 1.411199665593455,
      "learning_rate": 7.933612996347003e-06,
      "loss": 0.6348,
      "step": 80
    },
    {
      "epoch": 0.242098184263618,
      "grad_norm": 1.3778290968625495,
      "learning_rate": 7.897435765577615e-06,
      "loss": 0.6165,
      "step": 90
    },
    {
      "epoch": 0.26899798251513113,
      "grad_norm": 1.4467043447342405,
      "learning_rate": 7.853542438394323e-06,
      "loss": 0.5984,
      "step": 100
    },
    {
      "epoch": 0.26899798251513113,
      "eval_loss": 0.6147489547729492,
      "eval_runtime": 20.4764,
      "eval_samples_per_second": 23.735,
      "eval_steps_per_second": 0.781,
      "step": 100
    },
    {
      "epoch": 0.29589778076664425,
      "grad_norm": 1.4148215321149555,
      "learning_rate": 7.802019914277922e-06,
      "loss": 0.6115,
      "step": 110
    },
    {
      "epoch": 0.32279757901815737,
      "grad_norm": 1.566581827874729,
      "learning_rate": 7.742970196901463e-06,
      "loss": 0.6062,
      "step": 120
    },
    {
      "epoch": 0.3496973772696705,
      "grad_norm": 1.3254884536596998,
      "learning_rate": 7.676510192184609e-06,
      "loss": 0.6006,
      "step": 130
    },
    {
      "epoch": 0.3765971755211836,
      "grad_norm": 1.447517875694622,
      "learning_rate": 7.602771476844694e-06,
      "loss": 0.5898,
      "step": 140
    },
    {
      "epoch": 0.4034969737726967,
      "grad_norm": 1.3445376199324792,
      "learning_rate": 7.5219000379027296e-06,
      "loss": 0.5934,
      "step": 150
    },
    {
      "epoch": 0.4034969737726967,
      "eval_loss": 0.5962715148925781,
      "eval_runtime": 20.1716,
      "eval_samples_per_second": 24.093,
      "eval_steps_per_second": 0.793,
      "step": 150
    },
    {
      "epoch": 0.43039677202420984,
      "grad_norm": 1.3246907469950613,
      "learning_rate": 7.434055983660057e-06,
      "loss": 0.5993,
      "step": 160
    },
    {
      "epoch": 0.45729657027572296,
      "grad_norm": 1.3467258206153316,
      "learning_rate": 7.339413226717854e-06,
      "loss": 0.5915,
      "step": 170
    },
    {
      "epoch": 0.484196368527236,
      "grad_norm": 1.458985018698563,
      "learning_rate": 7.23815913966707e-06,
      "loss": 0.5881,
      "step": 180
    },
    {
      "epoch": 0.5110961667787491,
      "grad_norm": 1.2485290337068158,
      "learning_rate": 7.130494184130416e-06,
      "loss": 0.5887,
      "step": 190
    },
    {
      "epoch": 0.5379959650302623,
      "grad_norm": 1.3124694415167568,
      "learning_rate": 7.016631513890864e-06,
      "loss": 0.5855,
      "step": 200
    },
    {
      "epoch": 0.5379959650302623,
      "eval_loss": 0.5851770043373108,
      "eval_runtime": 20.3049,
      "eval_samples_per_second": 23.935,
      "eval_steps_per_second": 0.788,
      "step": 200
    },
    {
      "epoch": 0.5648957632817754,
      "grad_norm": 1.4701321084658303,
      "learning_rate": 6.896796552892348e-06,
      "loss": 0.5833,
      "step": 210
    },
    {
      "epoch": 0.5917955615332885,
      "grad_norm": 1.4050016198469426,
      "learning_rate": 6.771226548948162e-06,
      "loss": 0.5812,
      "step": 220
    },
    {
      "epoch": 0.6186953597848016,
      "grad_norm": 1.4295194453088238,
      "learning_rate": 6.64017010404058e-06,
      "loss": 0.5822,
      "step": 230
    },
    {
      "epoch": 0.6455951580363147,
      "grad_norm": 7.967159671444082,
      "learning_rate": 6.503886682141661e-06,
      "loss": 0.573,
      "step": 240
    },
    {
      "epoch": 0.6724949562878278,
      "grad_norm": 1.2442332016904551,
      "learning_rate": 6.3626460955295895e-06,
      "loss": 0.5837,
      "step": 250
    },
    {
      "epoch": 0.6724949562878278,
      "eval_loss": 0.5775083303451538,
      "eval_runtime": 20.6713,
      "eval_samples_per_second": 23.511,
      "eval_steps_per_second": 0.774,
      "step": 250
    },
    {
      "epoch": 0.699394754539341,
      "grad_norm": 1.3850365164976395,
      "learning_rate": 6.2167279706175765e-06,
      "loss": 0.5726,
      "step": 260
    },
    {
      "epoch": 0.7262945527908541,
      "grad_norm": 1.2304207907101277,
      "learning_rate": 6.066421194352859e-06,
      "loss": 0.5565,
      "step": 270
    },
    {
      "epoch": 0.7531943510423672,
      "grad_norm": 1.2714930735972703,
      "learning_rate": 5.912023342281789e-06,
      "loss": 0.5641,
      "step": 280
    },
    {
      "epoch": 0.7800941492938803,
      "grad_norm": 1.460856311527336,
      "learning_rate": 5.753840089413357e-06,
      "loss": 0.566,
      "step": 290
    },
    {
      "epoch": 0.8069939475453934,
      "grad_norm": 1.5514244928702674,
      "learning_rate": 5.592184605047483e-06,
      "loss": 0.557,
      "step": 300
    },
    {
      "epoch": 0.8069939475453934,
      "eval_loss": 0.5692653656005859,
      "eval_runtime": 20.3766,
      "eval_samples_per_second": 23.851,
      "eval_steps_per_second": 0.785,
      "step": 300
    },
    {
      "epoch": 0.8338937457969066,
      "grad_norm": 1.360092781672721,
      "learning_rate": 5.427376932766216e-06,
      "loss": 0.5657,
      "step": 310
    },
    {
      "epoch": 0.8607935440484197,
      "grad_norm": 1.5946830149551399,
      "learning_rate": 5.259743356815289e-06,
      "loss": 0.574,
      "step": 320
    },
    {
      "epoch": 0.8876933422999328,
      "grad_norm": 1.2977931263056453,
      "learning_rate": 5.089615756130505e-06,
      "loss": 0.5549,
      "step": 330
    },
    {
      "epoch": 0.9145931405514459,
      "grad_norm": 1.7677476514254604,
      "learning_rate": 4.917330947287818e-06,
      "loss": 0.5546,
      "step": 340
    },
    {
      "epoch": 0.9414929388029589,
      "grad_norm": 1.365159498913211,
      "learning_rate": 4.743230017677918e-06,
      "loss": 0.5525,
      "step": 350
    },
    {
      "epoch": 0.9414929388029589,
      "eval_loss": 0.5623395442962646,
      "eval_runtime": 20.4098,
      "eval_samples_per_second": 23.812,
      "eval_steps_per_second": 0.784,
      "step": 350
    },
    {
      "epoch": 0.968392737054472,
      "grad_norm": 1.4020192853941864,
      "learning_rate": 4.567657650225538e-06,
      "loss": 0.5491,
      "step": 360
    },
    {
      "epoch": 0.9952925353059852,
      "grad_norm": 1.2810961882383074,
      "learning_rate": 4.390961440990333e-06,
      "loss": 0.5617,
      "step": 370
    },
    {
      "epoch": 1.0215198386012105,
      "grad_norm": 1.4422549713674222,
      "learning_rate": 4.213491211000394e-06,
      "loss": 0.4705,
      "step": 380
    },
    {
      "epoch": 1.0484196368527237,
      "grad_norm": 1.339607087215748,
      "learning_rate": 4.035598313680784e-06,
      "loss": 0.472,
      "step": 390
    },
    {
      "epoch": 1.0753194351042368,
      "grad_norm": 1.3004229760969053,
      "learning_rate": 3.8576349392482585e-06,
      "loss": 0.4579,
      "step": 400
    },
    {
      "epoch": 1.0753194351042368,
      "eval_loss": 0.5656763315200806,
      "eval_runtime": 20.3701,
      "eval_samples_per_second": 23.858,
      "eval_steps_per_second": 0.785,
      "step": 400
    },
    {
      "epoch": 1.10221923335575,
      "grad_norm": 1.2705614543341326,
      "learning_rate": 3.67995341744931e-06,
      "loss": 0.4475,
      "step": 410
    },
    {
      "epoch": 1.129119031607263,
      "grad_norm": 1.3779121525385867,
      "learning_rate": 3.5029055200219857e-06,
      "loss": 0.4596,
      "step": 420
    },
    {
      "epoch": 1.1560188298587761,
      "grad_norm": 1.3453138554439108,
      "learning_rate": 3.326841764262423e-06,
      "loss": 0.4427,
      "step": 430
    },
    {
      "epoch": 1.1829186281102892,
      "grad_norm": 1.3661390821897004,
      "learning_rate": 3.1521107190749343e-06,
      "loss": 0.4531,
      "step": 440
    },
    {
      "epoch": 1.2098184263618024,
      "grad_norm": 1.283068671688067,
      "learning_rate": 2.9790583148794834e-06,
      "loss": 0.4454,
      "step": 450
    },
    {
      "epoch": 1.2098184263618024,
      "eval_loss": 0.5635669231414795,
      "eval_runtime": 20.4014,
      "eval_samples_per_second": 23.822,
      "eval_steps_per_second": 0.784,
      "step": 450
    },
    {
      "epoch": 1.2367182246133155,
      "grad_norm": 1.387077223080061,
      "learning_rate": 2.808027158742806e-06,
      "loss": 0.4504,
      "step": 460
    },
    {
      "epoch": 1.2636180228648284,
      "grad_norm": 1.3934948532441245,
      "learning_rate": 2.6393558560890605e-06,
      "loss": 0.4372,
      "step": 470
    },
    {
      "epoch": 1.2905178211163415,
      "grad_norm": 1.3508965770850627,
      "learning_rate": 2.4733783403328845e-06,
      "loss": 0.4459,
      "step": 480
    },
    {
      "epoch": 1.3174176193678546,
      "grad_norm": 1.3563554813682979,
      "learning_rate": 2.3104232117620433e-06,
      "loss": 0.4511,
      "step": 490
    },
    {
      "epoch": 1.3443174176193677,
      "grad_norm": 1.2997155659311739,
      "learning_rate": 2.150813086978535e-06,
      "loss": 0.443,
      "step": 500
    },
    {
      "epoch": 1.3443174176193677,
      "eval_loss": 0.5615652799606323,
      "eval_runtime": 20.1025,
      "eval_samples_per_second": 24.176,
      "eval_steps_per_second": 0.796,
      "step": 500
    },
    {
      "epoch": 1.3712172158708809,
      "grad_norm": 1.2836148832407157,
      "learning_rate": 1.9948639601861227e-06,
      "loss": 0.4511,
      "step": 510
    },
    {
      "epoch": 1.398117014122394,
      "grad_norm": 1.3107604919378975,
      "learning_rate": 1.8428845775888169e-06,
      "loss": 0.4412,
      "step": 520
    },
    {
      "epoch": 1.425016812373907,
      "grad_norm": 1.317635662287699,
      "learning_rate": 1.6951758261388555e-06,
      "loss": 0.4506,
      "step": 530
    },
    {
      "epoch": 1.4519166106254202,
      "grad_norm": 1.2801225745819735,
      "learning_rate": 1.5520301378443373e-06,
      "loss": 0.4421,
      "step": 540
    },
    {
      "epoch": 1.4788164088769333,
      "grad_norm": 1.3520095118040596,
      "learning_rate": 1.4137309108158554e-06,
      "loss": 0.4443,
      "step": 550
    },
    {
      "epoch": 1.4788164088769333,
      "eval_loss": 0.5601685643196106,
      "eval_runtime": 20.2436,
      "eval_samples_per_second": 24.008,
      "eval_steps_per_second": 0.79,
      "step": 550
    },
    {
      "epoch": 1.5057162071284464,
      "grad_norm": 1.3661516182508175,
      "learning_rate": 1.2805519481983216e-06,
      "loss": 0.4397,
      "step": 560
    },
    {
      "epoch": 1.5326160053799596,
      "grad_norm": 1.2619493983661074,
      "learning_rate": 1.1527569160988053e-06,
      "loss": 0.4517,
      "step": 570
    },
    {
      "epoch": 1.5595158036314727,
      "grad_norm": 1.3136850406090343,
      "learning_rate": 1.0305988215835468e-06,
      "loss": 0.4459,
      "step": 580
    },
    {
      "epoch": 1.5864156018829858,
      "grad_norm": 1.3675820589439671,
      "learning_rate": 9.143195117776081e-07,
      "loss": 0.4367,
      "step": 590
    },
    {
      "epoch": 1.613315400134499,
      "grad_norm": 1.2781582138814331,
      "learning_rate": 8.041491950588457e-07,
      "loss": 0.4422,
      "step": 600
    },
    {
      "epoch": 1.613315400134499,
      "eval_loss": 0.5570077896118164,
      "eval_runtime": 20.3295,
      "eval_samples_per_second": 23.906,
      "eval_steps_per_second": 0.787,
      "step": 600
    },
    {
      "epoch": 1.640215198386012,
      "grad_norm": 1.272071588739883,
      "learning_rate": 7.003059852941429e-07,
      "loss": 0.4483,
      "step": 610
    },
    {
      "epoch": 1.6671149966375252,
      "grad_norm": 1.4222402008140032,
      "learning_rate": 6.029954700201938e-07,
      "loss": 0.427,
      "step": 620
    },
    {
      "epoch": 1.6940147948890383,
      "grad_norm": 1.2265154550516224,
      "learning_rate": 5.124103034237804e-07,
      "loss": 0.4319,
      "step": 630
    },
    {
      "epoch": 1.7209145931405514,
      "grad_norm": 1.2705738648920262,
      "learning_rate": 4.2872982492732256e-07,
      "loss": 0.4433,
      "step": 640
    },
    {
      "epoch": 1.7478143913920645,
      "grad_norm": 1.3331198275976817,
      "learning_rate": 3.521197041348576e-07,
      "loss": 0.4462,
      "step": 650
    },
    {
      "epoch": 1.7478143913920645,
      "eval_loss": 0.5559241771697998,
      "eval_runtime": 20.3664,
      "eval_samples_per_second": 23.863,
      "eval_steps_per_second": 0.786,
      "step": 650
    },
    {
      "epoch": 1.7747141896435776,
      "grad_norm": 1.2371719113362185,
      "learning_rate": 2.827316128413475e-07,
      "loss": 0.4447,
      "step": 660
    },
    {
      "epoch": 1.8016139878950908,
      "grad_norm": 1.244858103875357,
      "learning_rate": 2.2070292475468677e-07,
      "loss": 0.4358,
      "step": 670
    },
    {
      "epoch": 1.8285137861466039,
      "grad_norm": 1.2807189158760035,
      "learning_rate": 1.6615644352488923e-07,
      "loss": 0.4503,
      "step": 680
    },
    {
      "epoch": 1.855413584398117,
      "grad_norm": 1.3405147314829868,
      "learning_rate": 1.1920015961889785e-07,
      "loss": 0.4375,
      "step": 690
    },
    {
      "epoch": 1.88231338264963,
      "grad_norm": 1.3156533030443653,
      "learning_rate": 7.992703652236122e-08,
      "loss": 0.4404,
      "step": 700
    },
    {
      "epoch": 1.88231338264963,
      "eval_loss": 0.554787814617157,
      "eval_runtime": 20.3816,
      "eval_samples_per_second": 23.845,
      "eval_steps_per_second": 0.785,
      "step": 700
    },
    {
      "epoch": 1.9092131809011432,
      "grad_norm": 1.3136336467093748,
      "learning_rate": 4.8414826691641985e-08,
      "loss": 0.4459,
      "step": 710
    },
    {
      "epoch": 1.9361129791526563,
      "grad_norm": 1.2848602231721438,
      "learning_rate": 2.4725917620438408e-08,
      "loss": 0.4293,
      "step": 720
    },
    {
      "epoch": 1.9630127774041695,
      "grad_norm": 1.284534380656877,
      "learning_rate": 8.907208325779069e-09,
      "loss": 0.4366,
      "step": 730
    },
    {
      "epoch": 1.9899125756556826,
      "grad_norm": 1.3165574333322239,
      "learning_rate": 9.900164979099735e-10,
      "loss": 0.4422,
      "step": 740
    },
    {
      "epoch": 2.0,
      "step": 744,
      "total_flos": 77049352159232.0,
      "train_loss": 0.5424991769175376,
      "train_runtime": 9734.9137,
      "train_samples_per_second": 4.887,
      "train_steps_per_second": 0.076
    }
  ],
  "logging_steps": 10,
  "max_steps": 744,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 77049352159232.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}