{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.864,
  "eval_steps": 54,
  "global_step": 648,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013333333333333334,
      "grad_norm": 3.1677727699279785,
      "learning_rate": 1.323529411764706e-05,
      "loss": 2.432534408569336,
      "step": 10
    },
    {
      "epoch": 0.02666666666666667,
      "grad_norm": 2.737276077270508,
      "learning_rate": 2.7941176470588236e-05,
      "loss": 1.499803352355957,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.322286367416382,
      "learning_rate": 4.2647058823529415e-05,
      "loss": 0.958259391784668,
      "step": 30
    },
    {
      "epoch": 0.05333333333333334,
      "grad_norm": 1.6951706409454346,
      "learning_rate": 5.735294117647059e-05,
      "loss": 0.6128009796142578,
      "step": 40
    },
    {
      "epoch": 0.06666666666666667,
      "grad_norm": 1.5513038635253906,
      "learning_rate": 7.205882352941177e-05,
      "loss": 0.5434298992156983,
      "step": 50
    },
    {
      "epoch": 0.072,
      "eval_loss": 0.4876534342765808,
      "eval_runtime": 46.2236,
      "eval_samples_per_second": 116.802,
      "eval_steps_per_second": 3.656,
      "step": 54
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.2719606161117554,
      "learning_rate": 8.676470588235295e-05,
      "loss": 0.49590306282043456,
      "step": 60
    },
    {
      "epoch": 0.09333333333333334,
      "grad_norm": 1.4482001066207886,
      "learning_rate": 9.995417048579285e-05,
      "loss": 0.4701418876647949,
      "step": 70
    },
    {
      "epoch": 0.10666666666666667,
      "grad_norm": 0.9346111416816711,
      "learning_rate": 9.949587534372136e-05,
      "loss": 0.42320098876953127,
      "step": 80
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.961360514163971,
      "learning_rate": 9.903758020164987e-05,
      "loss": 0.4108582973480225,
      "step": 90
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 1.0673013925552368,
      "learning_rate": 9.857928505957838e-05,
      "loss": 0.4146592617034912,
      "step": 100
    },
    {
      "epoch": 0.144,
      "eval_loss": 0.39331719279289246,
      "eval_runtime": 45.2237,
      "eval_samples_per_second": 119.384,
      "eval_steps_per_second": 3.737,
      "step": 108
    },
    {
      "epoch": 0.14666666666666667,
      "grad_norm": 0.9028648138046265,
      "learning_rate": 9.812098991750688e-05,
      "loss": 0.395797872543335,
      "step": 110
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.9546720385551453,
      "learning_rate": 9.766269477543539e-05,
      "loss": 0.3759224653244019,
      "step": 120
    },
    {
      "epoch": 0.17333333333333334,
      "grad_norm": 1.0134034156799316,
      "learning_rate": 9.720439963336389e-05,
      "loss": 0.3860164642333984,
      "step": 130
    },
    {
      "epoch": 0.18666666666666668,
      "grad_norm": 1.0793383121490479,
      "learning_rate": 9.67461044912924e-05,
      "loss": 0.37697725296020507,
      "step": 140
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8906319737434387,
      "learning_rate": 9.62878093492209e-05,
      "loss": 0.38399662971496584,
      "step": 150
    },
    {
      "epoch": 0.21333333333333335,
      "grad_norm": 0.9087033867835999,
      "learning_rate": 9.58295142071494e-05,
      "loss": 0.35976552963256836,
      "step": 160
    },
    {
      "epoch": 0.216,
      "eval_loss": 0.36686545610427856,
      "eval_runtime": 45.33,
      "eval_samples_per_second": 119.104,
      "eval_steps_per_second": 3.728,
      "step": 162
    },
    {
      "epoch": 0.22666666666666666,
      "grad_norm": 0.8666670918464661,
      "learning_rate": 9.53712190650779e-05,
      "loss": 0.3579233169555664,
      "step": 170
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.8580495715141296,
      "learning_rate": 9.491292392300642e-05,
      "loss": 0.34284372329711915,
      "step": 180
    },
    {
      "epoch": 0.25333333333333335,
      "grad_norm": 0.7784494757652283,
      "learning_rate": 9.445462878093493e-05,
      "loss": 0.36840295791625977,
      "step": 190
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 0.8598802089691162,
      "learning_rate": 9.399633363886343e-05,
      "loss": 0.3700442314147949,
      "step": 200
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.8933939337730408,
      "learning_rate": 9.353803849679193e-05,
      "loss": 0.36236522197723386,
      "step": 210
    },
    {
      "epoch": 0.288,
      "eval_loss": 0.346111536026001,
      "eval_runtime": 45.7874,
      "eval_samples_per_second": 117.914,
      "eval_steps_per_second": 3.691,
      "step": 216
    },
    {
      "epoch": 0.29333333333333333,
      "grad_norm": 1.05680251121521,
      "learning_rate": 9.307974335472044e-05,
      "loss": 0.35799968242645264,
      "step": 220
    },
    {
      "epoch": 0.30666666666666664,
      "grad_norm": 0.8355916142463684,
      "learning_rate": 9.262144821264895e-05,
      "loss": 0.35074672698974607,
      "step": 230
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.9081747531890869,
      "learning_rate": 9.216315307057746e-05,
      "loss": 0.3546539068222046,
      "step": 240
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.9269344210624695,
      "learning_rate": 9.170485792850596e-05,
      "loss": 0.3317249774932861,
      "step": 250
    },
    {
      "epoch": 0.3466666666666667,
      "grad_norm": 0.7904302477836609,
      "learning_rate": 9.124656278643447e-05,
      "loss": 0.3290853023529053,
      "step": 260
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.7114660143852234,
      "learning_rate": 9.078826764436298e-05,
      "loss": 0.322760272026062,
      "step": 270
    },
    {
      "epoch": 0.36,
      "eval_loss": 0.33535903692245483,
      "eval_runtime": 45.3828,
      "eval_samples_per_second": 118.966,
      "eval_steps_per_second": 3.724,
      "step": 270
    },
    {
      "epoch": 0.37333333333333335,
      "grad_norm": 0.8438096046447754,
      "learning_rate": 9.032997250229149e-05,
      "loss": 0.34818062782287595,
      "step": 280
    },
    {
      "epoch": 0.38666666666666666,
      "grad_norm": 0.7516797780990601,
      "learning_rate": 8.987167736021999e-05,
      "loss": 0.31596965789794923,
      "step": 290
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.8765379786491394,
      "learning_rate": 8.94133822181485e-05,
      "loss": 0.33709211349487306,
      "step": 300
    },
    {
      "epoch": 0.41333333333333333,
      "grad_norm": 0.8902734518051147,
      "learning_rate": 8.8955087076077e-05,
      "loss": 0.330603814125061,
      "step": 310
    },
    {
      "epoch": 0.4266666666666667,
      "grad_norm": 0.8690024018287659,
      "learning_rate": 8.84967919340055e-05,
      "loss": 0.32671523094177246,
      "step": 320
    },
    {
      "epoch": 0.432,
      "eval_loss": 0.33184683322906494,
      "eval_runtime": 45.3915,
      "eval_samples_per_second": 118.943,
      "eval_steps_per_second": 3.723,
      "step": 324
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.8143032193183899,
      "learning_rate": 8.8038496791934e-05,
      "loss": 0.3163719177246094,
      "step": 330
    },
    {
      "epoch": 0.4533333333333333,
      "grad_norm": 0.7447159290313721,
      "learning_rate": 8.758020164986251e-05,
      "loss": 0.3183164119720459,
      "step": 340
    },
    {
      "epoch": 0.4666666666666667,
      "grad_norm": 0.7337270379066467,
      "learning_rate": 8.712190650779101e-05,
      "loss": 0.3220224857330322,
      "step": 350
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.6819722056388855,
      "learning_rate": 8.666361136571953e-05,
      "loss": 0.3218740940093994,
      "step": 360
    },
    {
      "epoch": 0.49333333333333335,
      "grad_norm": 0.7921403646469116,
      "learning_rate": 8.620531622364803e-05,
      "loss": 0.32049055099487306,
      "step": 370
    },
    {
      "epoch": 0.504,
      "eval_loss": 0.3204575181007385,
      "eval_runtime": 45.5527,
      "eval_samples_per_second": 118.522,
      "eval_steps_per_second": 3.71,
      "step": 378
    },
    {
      "epoch": 0.5066666666666667,
      "grad_norm": 0.7351377010345459,
      "learning_rate": 8.574702108157654e-05,
      "loss": 0.3107901573181152,
      "step": 380
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.800105094909668,
      "learning_rate": 8.528872593950504e-05,
      "loss": 0.3166258096694946,
      "step": 390
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 0.683992326259613,
      "learning_rate": 8.483043079743355e-05,
      "loss": 0.31937375068664553,
      "step": 400
    },
    {
      "epoch": 0.5466666666666666,
      "grad_norm": 0.7428257465362549,
      "learning_rate": 8.437213565536206e-05,
      "loss": 0.3019124984741211,
      "step": 410
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.7256530523300171,
      "learning_rate": 8.391384051329057e-05,
      "loss": 0.3088233232498169,
      "step": 420
    },
    {
      "epoch": 0.5733333333333334,
      "grad_norm": 0.8287463784217834,
      "learning_rate": 8.345554537121907e-05,
      "loss": 0.3135841369628906,
      "step": 430
    },
    {
      "epoch": 0.576,
      "eval_loss": 0.3146475553512573,
      "eval_runtime": 45.5431,
      "eval_samples_per_second": 118.547,
      "eval_steps_per_second": 3.711,
      "step": 432
    },
    {
      "epoch": 0.5866666666666667,
      "grad_norm": 0.7201558947563171,
      "learning_rate": 8.299725022914757e-05,
      "loss": 0.33163981437683104,
      "step": 440
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.7666788697242737,
      "learning_rate": 8.253895508707609e-05,
      "loss": 0.3103063106536865,
      "step": 450
    },
    {
      "epoch": 0.6133333333333333,
      "grad_norm": 0.7993035912513733,
      "learning_rate": 8.20806599450046e-05,
      "loss": 0.3024377584457397,
      "step": 460
    },
    {
      "epoch": 0.6266666666666667,
      "grad_norm": 0.7305812835693359,
      "learning_rate": 8.16223648029331e-05,
      "loss": 0.30060317516326907,
      "step": 470
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.7218968868255615,
      "learning_rate": 8.11640696608616e-05,
      "loss": 0.30545334815979003,
      "step": 480
    },
    {
      "epoch": 0.648,
      "eval_loss": 0.30640655755996704,
      "eval_runtime": 45.3513,
      "eval_samples_per_second": 119.048,
      "eval_steps_per_second": 3.726,
      "step": 486
    },
    {
      "epoch": 0.6533333333333333,
      "grad_norm": 0.7409582734107971,
      "learning_rate": 8.07057745187901e-05,
      "loss": 0.31014978885650635,
      "step": 490
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.8251257538795471,
      "learning_rate": 8.024747937671861e-05,
      "loss": 0.2921054124832153,
      "step": 500
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.8492790460586548,
      "learning_rate": 7.978918423464711e-05,
      "loss": 0.2973308801651001,
      "step": 510
    },
    {
      "epoch": 0.6933333333333334,
      "grad_norm": 0.9368008971214294,
      "learning_rate": 7.933088909257562e-05,
      "loss": 0.2915247917175293,
      "step": 520
    },
    {
      "epoch": 0.7066666666666667,
      "grad_norm": 0.7164352536201477,
      "learning_rate": 7.887259395050412e-05,
      "loss": 0.2985499382019043,
      "step": 530
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.7182020545005798,
      "learning_rate": 7.841429880843263e-05,
      "loss": 0.29260077476501467,
      "step": 540
    },
    {
      "epoch": 0.72,
      "eval_loss": 0.301230788230896,
      "eval_runtime": 45.2631,
      "eval_samples_per_second": 119.281,
      "eval_steps_per_second": 3.734,
      "step": 540
    },
    {
      "epoch": 0.7333333333333333,
      "grad_norm": 0.667168140411377,
      "learning_rate": 7.795600366636114e-05,
      "loss": 0.2892777442932129,
      "step": 550
    },
    {
      "epoch": 0.7466666666666667,
      "grad_norm": 0.8839274048805237,
      "learning_rate": 7.749770852428965e-05,
      "loss": 0.29940755367279054,
      "step": 560
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.7022384405136108,
      "learning_rate": 7.703941338221815e-05,
      "loss": 0.2847739696502686,
      "step": 570
    },
    {
      "epoch": 0.7733333333333333,
      "grad_norm": 0.7643616199493408,
      "learning_rate": 7.658111824014665e-05,
      "loss": 0.3103649139404297,
      "step": 580
    },
    {
      "epoch": 0.7866666666666666,
      "grad_norm": 0.8885356187820435,
      "learning_rate": 7.612282309807517e-05,
      "loss": 0.2783879518508911,
      "step": 590
    },
    {
      "epoch": 0.792,
      "eval_loss": 0.2951120138168335,
      "eval_runtime": 45.5971,
      "eval_samples_per_second": 118.407,
      "eval_steps_per_second": 3.706,
      "step": 594
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7724995017051697,
      "learning_rate": 7.566452795600368e-05,
      "loss": 0.30985236167907715,
      "step": 600
    },
    {
      "epoch": 0.8133333333333334,
      "grad_norm": 0.9214587807655334,
      "learning_rate": 7.520623281393218e-05,
      "loss": 0.28750762939453123,
      "step": 610
    },
    {
      "epoch": 0.8266666666666667,
      "grad_norm": 0.8178621530532837,
      "learning_rate": 7.474793767186068e-05,
      "loss": 0.3045247793197632,
      "step": 620
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.6371078491210938,
      "learning_rate": 7.428964252978919e-05,
      "loss": 0.2850461006164551,
      "step": 630
    },
    {
      "epoch": 0.8533333333333334,
      "grad_norm": 0.6749277710914612,
      "learning_rate": 7.383134738771769e-05,
      "loss": 0.29334354400634766,
      "step": 640
    },
    {
      "epoch": 0.864,
      "eval_loss": 0.29118046164512634,
      "eval_runtime": 45.5175,
      "eval_samples_per_second": 118.614,
      "eval_steps_per_second": 3.713,
      "step": 648
    }
  ],
  "logging_steps": 10,
  "max_steps": 2250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 324,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.141916003015066e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}